Classic MapReduce Exercises [Part 1]

Prerequisites

Maven dependency configuration (pom.xml)

```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>hadoop-examples</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Hadoop client dependency -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!-- Unit testing -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <!-- Logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.30</version>
        </dependency>
    </dependencies>
</project>
```

Hadoop serialization types at a glance

| Hadoop type | Java equivalent | Notes |
| --- | --- | --- |
| Text | String | Variable-length text |
| LongWritable | long | Long integer |
| IntWritable | int | Integer |
| NullWritable | null | Empty value (serializes to nothing) |
| Writable | custom | Interface for custom serialization types |
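The last row is easiest to see with an example. A custom type only has to implement Writable's two methods. Below is a minimal sketch of a hypothetical OrderWritable (the class name and fields are made up for illustration and are not used in the exercises); the one rule is that readFields must read fields in exactly the order write wrote them.

```java
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/** Hypothetical custom type: a date plus an amount, serialized by Hadoop. */
public class OrderWritable implements Writable {

    private String date;
    private double amount;

    // Hadoop requires a no-arg constructor for deserialization
    public OrderWritable() {}

    public OrderWritable(String date, double amount) {
        this.date = date;
        this.amount = amount;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Field order here must match readFields exactly
        out.writeUTF(date);
        out.writeDouble(amount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        date = in.readUTF();
        amount = in.readDouble();
    }
}
```

If the type were also used as a key, it would implement WritableComparable instead, adding a compareTo method for the Shuffle's sort.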

Exercise 1: Word Count (WordCount)

Problem description

Count the number of times each word appears in the input file. This is the "Hello World" program of MapReduce.

Input data (word.txt):

```
hadoop hdfs yarn mapreduce
hadoop spark flink kafka
hive hbase sqoop flume
hadoop spark hive pig
java python scala sql
```

Expected output:

```
flink	1
flume	1
hadoop	3
hbase	1
hdfs	1
hive	2
java	1
kafka	1
mapreduce	1
pig	1
python	1
scala	1
spark	2
sqoop	1
sql	1
yarn	1
```

Core idea

- Map phase: split each line on spaces and emit <word, 1>
- Shuffle phase: group identical words, producing <word, [1,1,1,...]>
- Reduce phase: sum the values and emit <word, total count>
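Before diving into the Hadoop classes, here is a plain-Java sketch of the same data flow (illustration only, no Hadoop APIs involved); a TreeMap stands in for the sorted grouping that the Shuffle performs.

```java
import java.util.Map;
import java.util.TreeMap;

public class WordCountFlowDemo {
    public static void main(String[] args) {
        String[] lines = {
            "hadoop hdfs yarn mapreduce",
            "hadoop spark flink kafka"
        };

        // Map + Shuffle: emit <word, 1> and group by key; TreeMap keeps keys
        // sorted, mirroring the sorted output of a real MapReduce job
        Map<String, Long> grouped = new TreeMap<>();
        for (String line : lines) {
            for (String word : line.split(" ")) {
                // Reduce: summing while grouping is equivalent to
                // summing the [1,1,...] list per key afterwards
                grouped.merge(word, 1L, Long::sum);
            }
        }

        grouped.forEach((word, count) -> System.out.println(word + "\t" + count));
    }
}
```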

Code implementation

1. The Mapper class
```java
package com.mapper;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;

/**
 * WordCount Mapper
 * Input:  KEYIN=LongWritable (byte offset of the line), VALUEIN=Text (one line of text)
 * Output: KEYOUT=Text (word), VALUEOUT=LongWritable (the constant 1)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    
    // Reuse output objects to reduce GC pressure
    private Text outKey = new Text();
    private final static LongWritable outValue = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) 
            throws IOException, InterruptedException {
        
        // Get one line of input
        String line = value.toString();
        
        // Split on spaces with Hadoop's StringUtils.split (avoids the regex overhead of String.split)
        String[] words = StringUtils.split(line, ' ');
        
        // Emit <word, 1> for every non-empty word
        for (String word : words) {
            if (word != null && !word.trim().isEmpty()) {
                outKey.set(word.trim());
                context.write(outKey, outValue);
            }
        }
    }
}
```
2. The Reducer class
```java
package com.reducer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WordCount Reducer
 * Input:  KEYIN=Text (word), VALUEIN=Iterable<LongWritable> ([1,1,1,...])
 * Output: KEYOUT=Text (word), VALUEOUT=LongWritable (total count)
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    
    private LongWritable outValue = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) 
            throws IOException, InterruptedException {
        
        // Sum the occurrences of this word
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        
        outValue.set(sum);
        context.write(key, outValue);
    }
}
```
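One classic Reducer pitfall is worth flagging here: Hadoop reuses the same Writable instance while iterating over values, so caching references to it silently corrupts your data. The hypothetical sketch below (not part of this exercise) shows the safe copy-out pattern.

```java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/** Hypothetical sketch of the value-reuse pitfall; not used by the exercises. */
public class CopyOutReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        List<LongWritable> copies = new ArrayList<>();
        for (LongWritable value : values) {
            // copies.add(value) would be wrong: every element would alias the one
            // reused instance and end up holding only the last value seen
            copies.add(new LongWritable(value.get()));
        }
        context.write(key, new LongWritable(copies.size()));
    }
}
```

The exercises in this post only sum primitives inside the loop, so they are unaffected; the copy is needed only when values must outlive the iteration.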
3. The Driver class
```java
package com.driver;

import com.mapper.WordCountMapper;
import com.reducer.WordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * WordCount Driver
 * Configures and submits the MapReduce job
 */
public class WordCountDriver {

    public static void main(String[] args) throws IOException, 
            ClassNotFoundException, InterruptedException {
        
        // Validate arguments
        if (args.length != 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(-1);
        }

        // 1. Create the Configuration and Job objects
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "WordCount");
        
        // 2. Set the jar by class (required when running on a cluster)
        job.setJarByClass(WordCountDriver.class);
        
        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        
        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        
        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        
        // 7. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```

Running on the cluster

```bash
# Prepare the input data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /wordcount/input
[atguigu@hadoop102 ~]$ hadoop fs -put word.txt /wordcount/input/

# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.WordCountDriver /wordcount/input /wordcount/output

# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /wordcount/output/part-r-00000
```

Exercise 2: Deduplication (Unique)

Problem description

Remove duplicate lines from the input file, keeping only unique values. This is a classic MapReduce application: it relies on the sorting and grouping that the Shuffle phase performs automatically.

Input data (unique.txt):

```
apple
banana
apple
cherry
banana
date
elderberry
cherry
fig
apple
```

Expected output:

```
apple
banana
cherry
date
elderberry
fig
```

Core idea

- Map phase: emit the line content as the key, with a throwaway value (the code below uses IntWritable(1); NullWritable would work equally well)
- Shuffle phase: identical keys are grouped automatically, so each distinct line survives exactly once
- Reduce phase: emit the key directly, with NullWritable as the value

💡 Tip: MapReduce's Shuffle mechanism deduplicates for free, since reduce() runs exactly once per distinct key.

Code implementation

1. The Mapper class
```java
package com.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Unique Mapper: deduplication
 * Emits each line as the key and lets the Shuffle deduplicate
 */
public class UniqueMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    
    private final IntWritable num = new IntWritable(1);
    private final Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) 
            throws IOException, InterruptedException {
        
        // Trim whitespace from both ends of the line
        String newValue = value.toString().trim();
        text.set(newValue);
        
        // Emit <line, 1>; the value is ignored in the Reducer
        context.write(text, num);
    }
}
```
2. The Reducer class
```java
package com.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Unique Reducer: emits the key with an empty value
 * Because the Shuffle has already deduplicated, each key arrives exactly once
 */
public class UniqueReducer extends Reducer<Text, IntWritable, Text, NullWritable> {
    
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) 
            throws IOException, InterruptedException {
        
        // Emit the key directly; NullWritable writes nothing for the value
        context.write(key, NullWritable.get());
    }
}
```
3. The Driver class
```java
package com.driver;

import com.mapper.UniqueMapper;
import com.reducer.UniqueReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class UniqueDriver {
    
    public static void main(String[] args) throws IOException, 
            InterruptedException, ClassNotFoundException {
        
        // 1. Create the Configuration and Job objects
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Unique");

        // 2. Set the jar by class
        job.setJarByClass(UniqueDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(UniqueMapper.class);
        job.setReducerClass(UniqueReducer.class);

        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
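Since the Reducer ignores the values anyway, a leaner variant is possible. The sketch below is an alternative, not the version wired into the Driver above: it emits NullWritable from the map side, so values contribute zero bytes to the Shuffle. The Driver would then call job.setMapOutputValueClass(NullWritable.class), and the Reducer's VALUEIN would change to NullWritable accordingly.

```java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/** Sketch: dedup mapper that carries no value at all through the Shuffle. */
public class UniqueNullMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private final Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        text.set(value.toString().trim());
        // NullWritable serializes to zero bytes, so only keys cross the network
        context.write(text, NullWritable.get());
    }
}
```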

Running on the cluster

```bash
# Prepare the data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /unique/input
[atguigu@hadoop102 ~]$ hadoop fs -put unique.txt /unique/input/

# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.UniqueDriver /unique/input /unique/output

# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /unique/output/part-r-00000
```

Exercise 3: Counting by Date (CountByDate)

Problem description

Count the number of records for each date in a log file. This models an e-commerce scenario: computing daily order volume.

Input data (orders.txt):

```
user001|2024-01-01|100.50
user002|2024-01-01|200.00
user003|2024-01-02|150.00
user004|2024-01-01|80.00
user005|2024-01-03|300.00
user006|2024-01-02|120.00
user007|2024-01-01|50.00
user008|2024-01-03|210.00
user009|2024-01-02|90.00
user010|2024-01-01|180.00
```

Expected output:

```
2024-01-01	4
2024-01-02	3
2024-01-03	2
```

Core idea

- Map phase: split on "|", extract the date as the key, and emit <date, 1>
- Shuffle phase: records with the same date are grouped
- Reduce phase: sum the counts and emit <date, record count>
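One parsing detail deserves attention before the code: String.split takes a regular expression, and a bare "|" means alternation in regex, so the pipe has to be escaped as "\\|". A standalone check (illustration only, not part of the job):

```java
import java.util.Arrays;

public class SplitDemo {
    public static void main(String[] args) {
        String line = "user001|2024-01-01|100.50";

        // An unescaped "|" is regex alternation that matches the empty string,
        // so the line is split between every character
        System.out.println(Arrays.toString(line.split("|", -1)));

        // Escaped "\\|" splits on the literal pipe; -1 keeps trailing empty fields
        System.out.println(Arrays.toString(line.split("\\|", -1)));
        // => [user001, 2024-01-01, 100.50]
    }
}
```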

Code implementation

1. The Mapper class
```java
package com.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * CountByDate Mapper: count records by date
 * Input format: user_id|date|amount
 */
public class CountByDateMapper extends Mapper<Object, Text, Text, IntWritable> {
    
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context) 
            throws IOException, InterruptedException {
        
        // 按 "|" 切分,-1表示保留空字符串
        String[] data = value.toString().split("\\|", -1);
        
        // Extract the date (second field, index 1)
        word.set(data[1]);
        
        // Emit <date, 1>
        context.write(word, one);
    }
}
```
2. The Reducer class
```java
package com.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * CountByDate Reducer: sums the per-date counts
 */
public class CountByDateReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) 
            throws IOException, InterruptedException {
        
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        
        result.set(sum);
        context.write(key, result);
    }
}
```
3. The Driver class
```java
package com.driver;

import com.mapper.CountByDateMapper;
import com.reducer.CountByDateReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class CountByDateDriver {
    
    public static void main(String[] args) throws Exception {
        
        // 1. Create the Configuration object
        Configuration conf = new Configuration();
        
        // Parse generic command-line options
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: CountByDateDriver <in> [<in>...] <out>");
            System.exit(2);
        }
        
        Job job = Job.getInstance(conf, "CountByDate");

        // 2. Set the jar by class
        job.setJarByClass(CountByDateDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(CountByDateMapper.class);
        // Set a Combiner to pre-aggregate on the map side and shrink the Shuffle
        job.setCombinerClass(CountByDateReducer.class);
        job.setReducerClass(CountByDateReducer.class);

        // 4. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Set the input paths (multiple inputs supported)
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        
        // 6. Set the output path
        FileOutputFormat.setOutputPath(job, 
                new Path(otherArgs[otherArgs.length - 1]));

        // 7. Submit the job
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```

💡 Combiner optimization: this job sets job.setCombinerClass(CountByDateReducer.class) so partial counts are summed on the map side, significantly reducing network traffic during the Shuffle. Reusing the Reducer as the Combiner is safe here because summation is associative and commutative, and the Reducer's input and output types match.
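Why is reusing the Reducer safe? Because counting is built from sums, and partial sums recombine into the same global sum. An aggregate like the average does not have this property. A plain-Java illustration (not part of the job):

```java
public class CombinerSafetyDemo {
    public static void main(String[] args) {
        // Order amounts seen by two different mappers
        double[] mapper1 = {100.50, 200.00, 80.00};
        double[] mapper2 = {50.00, 180.00};

        // Counting (the CountByDate case): combining partial counts
        // gives the same answer as one global count, so Reducer == Combiner is safe
        int viaPartials = mapper1.length + mapper2.length;  // 3 + 2 = 5
        System.out.println("count via partials: " + viaPartials);

        // Averaging is NOT safe: the average of per-mapper averages differs
        // from the global average when the groups have different sizes
        double avg1 = (100.50 + 200.00 + 80.00) / 3;                         // ~126.83
        double avg2 = (50.00 + 180.00) / 2;                                  // 115.0
        double avgOfAvgs = (avg1 + avg2) / 2;                                // ~120.92
        double globalAvg = (100.50 + 200.00 + 80.00 + 50.00 + 180.00) / 5;   // 122.1
        System.out.println(avgOfAvgs + " != " + globalAvg);
    }
}
```

An average job would need a separate Combiner that emits (sum, count) pairs instead of reusing its Reducer.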

Running on the cluster

```bash
# Prepare the data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /countdate/input
[atguigu@hadoop102 ~]$ hadoop fs -put orders.txt /countdate/input/

# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.CountByDateDriver /countdate/input /countdate/output

# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /countdate/output/part-r-00000
```

If this article helped you, a like, bookmark, or comment is appreciated! Feel free to leave questions in the comments.
