Prerequisite: Custom Serialization (the Writable Interface)
In Hadoop, data travels across the network and is persisted to disk, so it must be serializable. Instead of Java's built-in Serializable, Hadoop defines its own lighter-weight Writable interface.
Core Methods of the Writable Interface

| Method | Purpose |
|---|---|
| `write(DataOutput out)` | Serialization: writes the object's fields to the output stream |
| `readFields(DataInput in)` | Deserialization: reads the object's fields from the input stream |
Implementation Rules

- Consistent order: serialization and deserialization must write and read the fields in exactly the same order
- No-arg constructor: a no-argument constructor is required (Hadoop instantiates the object via reflection before calling readFields)
- toString(): recommended to override, so the job's output files are readable

All three rules appear in the minimal sketch below.
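The sketch uses a hypothetical two-field class, ScoreWritable, which is not part of the exercises that follow; it exists only to make the rules concrete:

```java
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Hypothetical two-field Writable, shown only to illustrate the three rules
public class ScoreWritable implements Writable {
    private long id;
    private double score;

    public ScoreWritable() { } // rule 2: no-arg constructor for Hadoop's reflection

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(id);       // rule 1: the field order written here...
        out.writeDouble(score);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readLong();      // ...must match the order read here exactly
        score = in.readDouble();
    }

    @Override
    public String toString() {   // rule 3: tab-separated, matching Hadoop's default text output
        return id + "\t" + score;
    }
}
```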
Exercise 4: TopN Sorting (SortTop10)
Problem Statement
Find the largest N numbers in a large collection (TopN). This is a frequent interview question; it tests your ability to do a global sort in MapReduce.
Input data (numbers.txt):
34 12 89 45 67 23 91 56 78 90 11 2 87 43 65 33 99 54 76 88
15 44 77 92 38 61 29 83 50 72 5 96 41 63 31 85 48 70 18 95
Expected output (Top 10):
99
96
95
92
91
90
89
88
87
85
Core Idea

Plan 1: single-Reduce global sort (fine for modest data volumes)

Map phase: emit each number as a key
Shuffle phase: MapReduce sorts by key automatically (ascending by default)
Reduce phase: emit only N records; because the default order is ascending, the largest keys arrive last, so you either plug in a descending comparator or buffer the records and emit the final N in cleanup

Plan 2: distributed TopN across multiple Maps (suited to massive data)

Map phase: each Map task maintains a TreeSet holding only its local TopN
Shuffle phase: only the TopN records from each Map are transferred
Reduce phase: merge all the local TopN sets and take the global TopN (this requires a single Reduce task, which is Hadoop's default)

This article uses Plan 2, which is the better fit for big-data workloads. The standalone snippet below shows the bounded-TreeSet trick it relies on.
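Before the Hadoop code, here is the eviction pattern in plain Java; the class TopNDemo and its sample values are illustrative only, with topN shrunk to 3 to keep the demo short:

```java
import java.util.Comparator;
import java.util.TreeSet;

// Standalone demo of the bounded descending TreeSet used by the Mapper below
public class TopNDemo {
    public static void main(String[] args) {
        int topN = 3;
        TreeSet<Integer> top = new TreeSet<>(Comparator.reverseOrder());
        for (int x : new int[]{34, 12, 89, 45, 67, 23, 91}) {
            top.add(x);
            if (top.size() > topN) {
                top.pollLast(); // evict the current smallest
            }
        }
        System.out.println(top); // prints [91, 89, 67]
    }
}
```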
Code Walkthrough
1. Mapper class (local TopN)
```java
package com.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.TreeSet;

/**
 * SortTopN Mapper - each Map task computes a local TopN.
 * The TreeSet keeps itself sorted and retains only the N largest numbers.
 */
public class SortMapper extends Mapper<LongWritable, Text, IntWritable, NullWritable> {

    // Descending TreeSet that holds at most the local TopN
    // (o2 - o1 can overflow for extreme values; Comparator.reverseOrder() is the safer choice)
    TreeSet<Integer> treeSet = new TreeSet<>((o1, o2) -> o2 - o1);

    // How many numbers to keep
    private static final int TOP_N = 10;

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the line on whitespace
        String[] nums = value.toString().trim().split("\\s+");
        for (String num : nums) {
            int x = Integer.parseInt(num);
            treeSet.add(x);
            // Keep only TopN: once the set exceeds N, evict the smallest
            if (treeSet.size() > TOP_N) {
                treeSet.pollLast(); // the last element is the smallest, given descending order
            }
        }
    }

    /**
     * cleanup runs once, when the Map task finishes:
     * emit this task's local TopN.
     */
    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        for (Integer x : treeSet) {
            context.write(new IntWritable(x), NullWritable.get());
        }
    }
}
```
2. Reducer class (global TopN)
```java
package com.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.TreeSet;

/**
 * SortTopN Reducer - merges every Map's local TopN into the global TopN.
 * Note: a TreeSet discards duplicate values; if equal numbers must be
 * counted separately, switch to a PriorityQueue.
 */
public class SortReducer extends Reducer<IntWritable, NullWritable, IntWritable, NullWritable> {

    TreeSet<Integer> treeSet = new TreeSet<>((o1, o2) -> o2 - o1);
    private static final int TOP_N = 10;

    @Override
    protected void reduce(IntWritable key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // Every value for a key is NullWritable; only the key matters
        treeSet.add(key.get());
        // Keep only the global TopN
        if (treeSet.size() > TOP_N) {
            treeSet.pollLast();
        }
    }

    /**
     * cleanup runs once, when the Reduce task finishes:
     * emit the global TopN (already in descending order).
     */
    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        for (Integer x : treeSet) {
            context.write(new IntWritable(x), NullWritable.get());
        }
    }
}
```
3. Driver class
```java
package com.driver;

import com.mapper.SortMapper;
import com.reducer.SortReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SortDriver {
    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        // 1. Load the configuration and create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "SortTopN");

        // 2. Associate this Driver's jar
        job.setJarByClass(SortDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);

        // 4. Mapper output key/value types
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 5. Final output key/value types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // The global TopN depends on a single Reduce task; 1 is Hadoop's
        // default, but setting it explicitly documents the requirement
        job.setNumReduceTasks(1);

        // 6. Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
Key Concept: the cleanup() Method

| Method | When It Runs | Typical Use |
|---|---|---|
| `setup()` | Once, when the Map/Reduce task starts | Initialize resources (e.g. database connections) |
| `map()`/`reduce()` | Once per record | Core business logic |
| `cleanup()` | Once, when the Map/Reduce task ends | Release resources, emit aggregated results |

TopN has to lean on cleanup(): only once the task has seen all of its input can the TreeSet's contents be emitted.
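As a quick orientation, here is a skeleton showing where the three hooks sit; the class LifecycleMapper and its type parameters are illustrative, and Reducer exposes the same three hooks:

```java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Skeleton of the task lifecycle
public class LifecycleMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void setup(Context context) {
        // runs once per task, before the first record: open connections, read config
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // runs once per input record: the core business logic
    }

    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        // runs once per task, after the last record: emit summaries, release resources
    }
}
```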
Running on the Cluster

```bash
# Stage the input data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /sort/input
[atguigu@hadoop102 ~]$ hadoop fs -put numbers.txt /sort/input/

# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.SortDriver /sort/input /sort/output

# Inspect the result
[atguigu@hadoop102 ~]$ hadoop fs -cat /sort/output/part-r-00000
```
Exercise 5: Per-Phone Traffic Statistics (PhoneFlow)
Problem Statement
For every phone number, compute its upstream traffic, downstream traffic, and total traffic. This is a classic enterprise big-data scenario (think of a carrier's traffic-billing system).
Input data (phone_data.txt, tab-separated; the URL field may be absent):
1 13726230503 192.196.100.1 www.atguigu.com 2481 24681 200
2 13826544101 192.196.100.2 264 0 200
3 13926435656 192.196.100.3 132 1512 200
4 13926251106 192.196.100.4 240 0 200
5 13726230503 192.196.100.5 2481 24681 200
6 13826544101 192.196.100.6 264 0 200
7 13926435656 192.196.100.7 132 1512 200
8 13726230503 192.196.100.8 2481 24681 200
Field descriptions:

| Field Index | Meaning |
|---|---|
| 0 | Record number |
| 1 | Phone number |
| 2 | IP address |
| 3 | URL (may be absent) |
| 4 | Upstream traffic |
| 5 | Downstream traffic |
| 6 | HTTP status code |
Expected output:
13726230503 7443 74043 81486
13826544101 528 0 528
13926435656 264 3024 3288
13926251106 240 0 240
Core Idea

Map phase: extract the phone number plus the upstream and downstream traffic, and emit <phone, PhoneBean>
Shuffle phase: records with the same phone number are grouped into <phone, [Bean1, Bean2, ...]>
Reduce phase: iterate over the bean list, sum the upstream and downstream traffic, and compute the total

For example, 13726230503 appears three times with 2481 up and 24681 down each time, which yields the 7443 / 74043 / 81486 row in the expected output.
Code Walkthrough
1. Custom Bean class (implements Writable)
```java
package com;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Traffic-statistics bean.
 * Implements Writable so Hadoop can serialize it.
 * The implicit no-arg constructor satisfies Hadoop's requirement; if you
 * ever add a parameterized constructor, keep an explicit no-arg one too.
 */
public class PhoneBean implements Writable {

    private long upflow;   // upstream traffic
    private long downflow; // downstream traffic
    private long sumflow;  // total traffic

    // ========== Getters/Setters ==========
    public long getUpflow() {
        return upflow;
    }

    public void setUpflow(long upflow) {
        this.upflow = upflow;
    }

    public long getDownflow() {
        return downflow;
    }

    public void setDownflow(long downflow) {
        this.downflow = downflow;
    }

    public long getSumflow() {
        return sumflow;
    }

    public void setSumflow(long sumflow) {
        this.sumflow = sumflow;
    }

    /**
     * Serialization: write the fields to the output stream.
     * The order must match readFields exactly!
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upflow);
        dataOutput.writeLong(downflow);
        dataOutput.writeLong(sumflow);
    }

    /**
     * Deserialization: read the fields from the input stream.
     * The order must match write exactly!
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        upflow = dataInput.readLong();
        downflow = dataInput.readLong();
        sumflow = dataInput.readLong();
    }

    /**
     * Override toString so the output files are readable.
     */
    @Override
    public String toString() {
        return upflow + "\t" + downflow + "\t" + sumflow;
    }
}
```
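To convince yourself that write and readFields mirror each other, a quick in-memory round trip is enough; the class PhoneBeanRoundTrip below is a throwaway test harness, not part of the job:

```java
package com;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Throwaway harness: serialize a PhoneBean and read it back
public class PhoneBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        PhoneBean original = new PhoneBean();
        original.setUpflow(2481);
        original.setDownflow(24681);
        original.setSumflow(2481 + 24681);

        // write() into an in-memory buffer
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Hadoop would create the object via the no-arg constructor,
        // then call readFields() -- same field order as write()
        PhoneBean restored = new PhoneBean();
        restored.readFields(new DataInputStream(
                new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(restored); // expected: 2481, 24681, 27162, tab-separated
    }
}
```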
2. Mapper class
```java
package com.mapper;

import com.PhoneBean;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * PhoneFlow Mapper
 * Input:  KEYIN=LongWritable, VALUEIN=Text (one traffic record)
 * Output: KEYOUT=Text (phone number), VALUEOUT=PhoneBean (traffic bean)
 */
public class PhoneMapper extends Mapper<LongWritable, Text, Text, PhoneBean> {

    Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split on tabs
        String[] words = value.toString().split("\t");
        // The phone number is at index 1; because the URL field may be
        // absent, the traffic fields are addressed from the end of the record
        String phone = words[1];
        long up = Long.parseLong(words[words.length - 3]);   // upstream traffic
        long down = Long.parseLong(words[words.length - 2]); // downstream traffic

        // Build the bean
        PhoneBean bean = new PhoneBean();
        bean.setUpflow(up);
        bean.setDownflow(down);
        bean.setSumflow(up + down);

        text.set(phone);
        context.write(text, bean);
    }
}
```
3. Reducer class
```java
package com.reducer;

import com.PhoneBean;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * PhoneFlow Reducer
 * Sums all traffic records for the same phone number.
 */
public class PhoneReducer extends Reducer<Text, PhoneBean, Text, PhoneBean> {

    // Reusing one bean across calls is safe: context.write serializes it immediately
    PhoneBean bean = new PhoneBean();

    @Override
    protected void reduce(Text key, Iterable<PhoneBean> values, Context context)
            throws IOException, InterruptedException {
        long sumup = 0;
        long sumdown = 0;
        // Accumulate traffic across all records for this phone number
        for (PhoneBean x : values) {
            sumup += x.getUpflow();
            sumdown += x.getDownflow();
        }
        // Store the totals
        bean.setUpflow(sumup);
        bean.setDownflow(sumdown);
        bean.setSumflow(sumup + sumdown);
        context.write(key, bean);
    }
}
```
4. Driver class
```java
package com.driver;

import com.PhoneBean;
import com.mapper.PhoneMapper;
import com.reducer.PhoneReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class PhoneDriver {
    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        // 1. Load the configuration and create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "PhoneFlow");

        // 2. Associate this Driver's jar
        job.setJarByClass(PhoneDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(PhoneMapper.class);
        job.setReducerClass(PhoneReducer.class);

        // 4. Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PhoneBean.class);

        // 5. Final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(PhoneBean.class);

        // 6. Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
Running on the Cluster

```bash
# Stage the input data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /phone/input
[atguigu@hadoop102 ~]$ hadoop fs -put phone_data.txt /phone/input/

# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.PhoneDriver /phone/input /phone/output

# Inspect the result
[atguigu@hadoop102 ~]$ hadoop fs -cat /phone/output/part-r-00000
```