Prerequisites
Maven dependency configuration (pom.xml)
xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>hadoop-examples</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Hadoop client dependency -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!-- Unit testing -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <!-- Logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.30</version>
        </dependency>
    </dependencies>
</project>
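The hadoop-client version declared here (3.1.3) should match the Hadoop version installed on your cluster. Building the project with the standard `mvn clean package` command produces the hadoop-examples-1.0-SNAPSHOT.jar that is used in the submission commands later in this article.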
Hadoop serialization types quick reference
| Hadoop type | Java equivalent | Notes |
|---|---|---|
| Text | String | variable-length UTF-8 text |
| LongWritable | long | long integer |
| IntWritable | int | integer |
| NullWritable | (none) | empty placeholder, carries no data |
| Writable | custom class | interface to implement for custom serializable types |
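The last row deserves a concrete illustration. Below is a minimal sketch of a custom type implementing the Writable interface; the class and field names (OrderWritable, userId, amount) are made up for this example and are not used elsewhere in this article. If such a type were used as a map output key, it would additionally need to implement WritableComparable so the shuffle can sort it.
java
package com.model;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Hypothetical custom value type: one order record (userId, amount).
 */
public class OrderWritable implements Writable {
    private String userId;
    private double amount;

    // A no-argument constructor is required so the framework can create instances via reflection
    public OrderWritable() { }

    public OrderWritable(String userId, double amount) {
        this.userId = userId;
        this.amount = amount;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize the fields in a fixed order
        out.writeUTF(userId);
        out.writeDouble(amount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialize in exactly the same order as write()
        userId = in.readUTF();
        amount = in.readDouble();
    }

    @Override
    public String toString() {
        // toString() controls how the value is rendered by TextOutputFormat
        return userId + "\t" + amount;
    }
}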
Problem 1: Word Count (WordCount)
Problem description
Count how many times each word appears in the input file. This is the "Hello World" of MapReduce.
Input data (word.txt):
hadoop hdfs yarn mapreduce
hadoop spark flink kafka
hive hbase sqoop flume
hadoop spark hive pig
java python scala sql
Expected output:
flink 1
flume 1
hadoop 3
hbase 1
hdfs 1
hive 2
java 1
kafka 1
mapreduce 1
pig 1
python 1
scala 1
spark 2
sql 1
sqoop 1
yarn 1
Core idea
Map phase: split each line on spaces and emit <word, 1>
Shuffle phase: values for the same word are grouped, giving <word, [1,1,1,...]>
Reduce phase: sum the values and emit <word, total count>
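For example, the first input line produces <hadoop,1>, <hdfs,1>, <yarn,1> and <mapreduce,1>. After the shuffle, the reducer receives <hadoop, [1,1,1]> (one 1 from each of lines 1, 2 and 4) and emits <hadoop, 3>; because the shuffle also sorts keys, the final output comes out in alphabetical order.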
Code implementation
1. Mapper class
java
package com.mapper;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;

/**
 * WordCount Mapper
 * Input:  KEYIN = LongWritable (byte offset of the line), VALUEIN = Text (one line of text)
 * Output: KEYOUT = Text (word), VALUEOUT = LongWritable (1)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Reuse output objects to reduce GC pressure
    private Text outKey = new Text();
    private final static LongWritable outValue = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get one line of input
        String line = value.toString();
        // Split on spaces with Hadoop's StringUtils.split (a plain character split, no regex overhead)
        String[] words = StringUtils.split(line, ' ');
        // Emit <word, 1> for every non-empty token
        for (String word : words) {
            if (word != null && !word.trim().isEmpty()) {
                outKey.set(word.trim());
                context.write(outKey, outValue);
            }
        }
    }
}
2. Reducer class
java
package com.reducer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WordCount Reducer
 * Input:  KEYIN = Text (word), VALUEIN = Iterable<LongWritable> ([1,1,1,...])
 * Output: KEYOUT = Text (word), VALUEOUT = LongWritable (total count)
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    private LongWritable outValue = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum up the occurrences of this word
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        outValue.set(sum);
        context.write(key, outValue);
    }
}
3. Driver class
java
package com.driver;

import com.mapper.WordCountMapper;
import com.reducer.WordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * WordCount Driver
 * Configures and submits the MapReduce job.
 */
public class WordCountDriver {

    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        // Validate arguments
        if (args.length != 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(-1);
        }

        // 1. Get the configuration and the Job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "WordCount");

        // 2. Set the jar by class (required when running on the cluster)
        job.setJarByClass(WordCountDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
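One practical note: FileOutputFormat will refuse to start the job if the output directory already exists. If you want repeated runs to succeed without deleting the directory by hand, a small helper like the sketch below could be called from the Driver before waitForCompletion(); the class and method names (OutputCleaner, deleteIfExists) are made up for this illustration and are not part of the code above.
java
package com.driver;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

/**
 * Hypothetical helper: remove an existing output directory so a job can be re-run.
 * Example call from a Driver, before submitting the job:
 *   OutputCleaner.deleteIfExists(conf, new Path(args[1]));
 */
public class OutputCleaner {

    public static void deleteIfExists(Configuration conf, Path outputPath) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);   // true = recursive delete
        }
    }
}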
Running on the cluster
bash
# Prepare the input data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /wordcount/input
[atguigu@hadoop102 ~]$ hadoop fs -put word.txt /wordcount/input/
# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.WordCountDriver /wordcount/input /wordcount/output
# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /wordcount/output/part-r-00000
Problem 2: Deduplication (Unique)
Problem description
Remove duplicate lines from the input file, keeping each distinct value once. This is a classic MapReduce pattern that exploits the sorting and grouping done automatically during the shuffle.
Input data (unique.txt):
apple
banana
apple
cherry
banana
date
elderberry
cherry
fig
apple
Expected output:
apple
banana
cherry
date
elderberry
fig
Core idea
Map phase: emit each line as the key with a placeholder value (the code below uses IntWritable(1); NullWritable would work just as well)
Shuffle phase: identical keys are grouped together, so duplicates collapse into a single group
Reduce phase: write each key directly, with NullWritable as the value
💡 Tip: the shuffle deduplicates for free, because reduce() is called exactly once per distinct key.
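As a variant on this idea, here is a hedged sketch of a Mapper that emits NullWritable instead of IntWritable(1) as the value, which makes the map output slightly smaller; the class name UniqueNullValueMapper is made up for this illustration. If you use it, the Reducer's VALUEIN and the Driver's setMapOutputValueClass(...) must be changed to NullWritable as well.
java
package com.mapper;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Alternative dedup Mapper: the value carries no payload at all.
 */
public class UniqueNullValueMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        outKey.set(value.toString().trim());
        // NullWritable is a singleton that serializes to zero bytes
        context.write(outKey, NullWritable.get());
    }
}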
Code implementation
1. Mapper class
java
package com.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Unique Mapper - deduplication
 * Emits each line as the key and relies on the shuffle to group duplicates.
 */
public class UniqueMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    IntWritable num = new IntWritable(1);
    Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Trim whitespace from both ends of the line
        String newValue = value.toString().trim();
        text.set(newValue);
        // Emit <line, 1>; the value is ignored in the Reducer
        context.write(text, num);
    }
}
2. Reducer class
java
package com.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Unique Reducer - writes each key once with an empty value.
 * The shuffle has already grouped duplicates, so each distinct key arrives exactly once.
 */
public class UniqueReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Write the key only; NullWritable carries no payload
        context.write(key, NullWritable.get());
    }
}
3. Driver class
java
package com.driver;

import com.mapper.UniqueMapper;
import com.reducer.UniqueReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class UniqueDriver {

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        // 1. Get the configuration and the Job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Unique");

        // 2. Set the jar containing this Driver
        job.setJarByClass(UniqueDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(UniqueMapper.class);
        job.setReducerClass(UniqueReducer.class);

        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
Running on the cluster
bash
# Prepare the data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /unique/input
[atguigu@hadoop102 ~]$ hadoop fs -put unique.txt /unique/input/
# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.UniqueDriver /unique/input /unique/output
# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /unique/output/part-r-00000
Problem 3: Count by Date (CountByDate)
Problem description
Count the number of records for each date in a log file. This models an e-commerce scenario: counting the number of orders placed per day.
Input data (orders.txt):
user001|2024-01-01|100.50
user002|2024-01-01|200.00
user003|2024-01-02|150.00
user004|2024-01-01|80.00
user005|2024-01-03|300.00
user006|2024-01-02|120.00
user007|2024-01-01|50.00
user008|2024-01-03|210.00
user009|2024-01-02|90.00
user010|2024-01-01|180.00
Expected output:
2024-01-01 5
2024-01-02 3
2024-01-03 2
Core idea
Map phase: split on "|", take the date as the key, and emit <date, 1>
Shuffle phase: records with the same date are grouped
Reduce phase: sum the counts and emit <date, record count>
Code implementation
1. Mapper class
java
package com.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * CountByDate Mapper - count records per date
 * Input line format: user_id|date|amount
 */
public class CountByDateMapper extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split on "|" (escaped because split() takes a regex); limit -1 keeps empty fields
        String[] data = value.toString().split("\\|", -1);
        // Extract the date field (second field, index 1)
        word.set(data[1]);
        // Emit <date, 1>
        context.write(word, one);
    }
}
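A note on the split("\\|", -1) call above: String.split takes a regular expression, and | is a regex metacharacter (alternation), so it must be escaped as \\|; the -1 limit keeps trailing empty fields instead of silently dropping them. The small standalone class below (SplitDemo, made up for this illustration) shows the difference:
java
public class SplitDemo {

    public static void main(String[] args) {
        String line = "user001|2024-01-01|100.50";

        // Unescaped: "|" is a regex alternation that matches the empty string,
        // so the line is chopped into individual characters.
        String[] wrong = line.split("|");
        System.out.println(wrong.length);           // one element per character: "u", "s", "e", ...

        // Escaped: "\\|" matches the literal pipe; -1 keeps trailing empty fields.
        String[] right = line.split("\\|", -1);
        System.out.println(right.length);           // 3 -> "user001", "2024-01-01", "100.50"
        System.out.println(right[1]);               // 2024-01-01
    }
}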
2. Reducer class
java
package com.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * CountByDate Reducer - sums the counts for each date
 */
public class CountByDateReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
3. Driver class
java
package com.driver;

import com.mapper.CountByDateMapper;
import com.reducer.CountByDateReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class CountByDateDriver {

    public static void main(String[] args) throws Exception {
        // 1. Get the configuration and the Job object
        Configuration conf = new Configuration();
        // Parse generic Hadoop options from the command line
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: CountByDateDriver <in> [<in>...] <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "CountByDate");

        // 2. Set the jar containing this Driver
        job.setJarByClass(CountByDateDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(CountByDateMapper.class);
        // Set a Combiner to pre-aggregate on the map side and shrink the shuffle
        job.setCombinerClass(CountByDateReducer.class);
        job.setReducerClass(CountByDateReducer.class);

        // 4. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Set the input paths (multiple inputs are supported)
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }

        // 6. Set the output path
        FileOutputFormat.setOutputPath(job,
                new Path(otherArgs[otherArgs.length - 1]));

        // 7. Submit the job
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
💡 Combiner optimization: this job calls job.setCombinerClass(CountByDateReducer.class), so partial sums are computed on the map side before the shuffle, noticeably reducing the amount of data sent over the network.
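Reusing the Reducer as the Combiner is only correct here because summing counts is commutative and associative and because the Combiner's input and output types are both (Text, IntWritable). For instance, if the sample file is read by a single map task, its five <2024-01-01, 1> pairs can be pre-combined into one <2024-01-01, 5> before the shuffle without changing the final result; an operation such as averaging would not survive this kind of map-side pre-aggregation, so it could not simply reuse its Reducer as a Combiner.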
Running on the cluster
bash
# Prepare the data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /countdate/input
[atguigu@hadoop102 ~]$ hadoop fs -put orders.txt /countdate/input/
# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.CountByDateDriver /countdate/input /countdate/output
# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /countdate/output/part-r-00000
If this article helped you, likes, bookmarks and comments are very welcome; feel free to leave questions in the comments section.