Classic MapReduce Exercises [Part 1]

Prerequisites

Maven dependency configuration (pom.xml)

```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>hadoop-examples</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Hadoop client dependency -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!-- Unit testing -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <!-- Logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.30</version>
        </dependency>
    </dependencies>
</project>
```

Hadoop serialization types at a glance

| Hadoop type | Java equivalent | Notes |
| --- | --- | --- |
| Text | String | Variable-length text |
| LongWritable | long | Long integer |
| IntWritable | int | Integer |
| NullWritable | null | Empty value (serializes to nothing) |
| Writable | custom | Interface for custom serialization types |
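The last row is easiest to see with an example. A custom type only has to implement Writable's two methods. Below is a minimal sketch of a hypothetical OrderWritable (the class name and fields are made up for illustration and are not used in the exercises); the one rule is that readFields must read fields in exactly the order write wrote them.

```java
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/** Hypothetical custom type: a date plus an amount, serialized by Hadoop. */
public class OrderWritable implements Writable {

    private String date;
    private double amount;

    // Hadoop requires a no-arg constructor for deserialization
    public OrderWritable() {}

    public OrderWritable(String date, double amount) {
        this.date = date;
        this.amount = amount;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Field order here must match readFields exactly
        out.writeUTF(date);
        out.writeDouble(amount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        date = in.readUTF();
        amount = in.readDouble();
    }
}
```

If the type were also used as a key, it would implement WritableComparable instead, adding a compareTo method for the Shuffle's sort.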

Exercise 1: Word Count (WordCount)

Problem description

Count the number of times each word appears in the input file. This is the "Hello World" program of MapReduce.

Input data (word.txt):

```
hadoop hdfs yarn mapreduce
hadoop spark flink kafka
hive hbase sqoop flume
hadoop spark hive pig
java python scala sql
```

Expected output:

```
flink	1
flume	1
hadoop	3
hbase	1
hdfs	1
hive	2
java	1
kafka	1
mapreduce	1
pig	1
python	1
scala	1
spark	2
sqoop	1
sql	1
yarn	1
```

Core idea

- Map phase: split each line on spaces and emit <word, 1>
- Shuffle phase: group identical words, producing <word, [1,1,1,...]>
- Reduce phase: sum the values and emit <word, total count>
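Before diving into the Hadoop classes, here is a plain-Java sketch of the same data flow (illustration only, no Hadoop APIs involved); a TreeMap stands in for the sorted grouping that the Shuffle performs.

```java
import java.util.Map;
import java.util.TreeMap;

public class WordCountFlowDemo {
    public static void main(String[] args) {
        String[] lines = {
            "hadoop hdfs yarn mapreduce",
            "hadoop spark flink kafka"
        };

        // Map + Shuffle: emit <word, 1> and group by key; TreeMap keeps keys
        // sorted, mirroring the sorted output of a real MapReduce job
        Map<String, Long> grouped = new TreeMap<>();
        for (String line : lines) {
            for (String word : line.split(" ")) {
                // Reduce: summing while grouping is equivalent to
                // summing the [1,1,...] list per key afterwards
                grouped.merge(word, 1L, Long::sum);
            }
        }

        grouped.forEach((word, count) -> System.out.println(word + "\t" + count));
    }
}
```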

Code implementation

1. The Mapper class
```java
package com.mapper;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;

/**
 * WordCount Mapper
 * Input:  KEYIN=LongWritable (byte offset of the line), VALUEIN=Text (one line of text)
 * Output: KEYOUT=Text (word), VALUEOUT=LongWritable (the constant 1)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    
    // Reuse output objects to reduce GC pressure
    private Text outKey = new Text();
    private final static LongWritable outValue = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) 
            throws IOException, InterruptedException {
        
        // Get one line of input
        String line = value.toString();
        
        // Split on spaces with Hadoop's StringUtils.split (avoids the regex overhead of String.split)
        String[] words = StringUtils.split(line, ' ');
        
        // Emit <word, 1> for every non-empty word
        for (String word : words) {
            if (word != null && !word.trim().isEmpty()) {
                outKey.set(word.trim());
                context.write(outKey, outValue);
            }
        }
    }
}
```
2. The Reducer class
```java
package com.reducer;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WordCount Reducer
 * Input:  KEYIN=Text (word), VALUEIN=Iterable<LongWritable> ([1,1,1,...])
 * Output: KEYOUT=Text (word), VALUEOUT=LongWritable (total count)
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    
    private LongWritable outValue = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) 
            throws IOException, InterruptedException {
        
        // Sum the occurrences of this word
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        
        outValue.set(sum);
        context.write(key, outValue);
    }
}
```
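One classic Reducer pitfall is worth flagging here: Hadoop reuses the same Writable instance while iterating over values, so caching references to it silently corrupts your data. The hypothetical sketch below (not part of this exercise) shows the safe copy-out pattern.

```java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/** Hypothetical sketch of the value-reuse pitfall; not used by the exercises. */
public class CopyOutReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        List<LongWritable> copies = new ArrayList<>();
        for (LongWritable value : values) {
            // copies.add(value) would be wrong: every element would alias the one
            // reused instance and end up holding only the last value seen
            copies.add(new LongWritable(value.get()));
        }
        context.write(key, new LongWritable(copies.size()));
    }
}
```

The exercises in this post only sum primitives inside the loop, so they are unaffected; the copy is needed only when values must outlive the iteration.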
3. The Driver class
```java
package com.driver;

import com.mapper.WordCountMapper;
import com.reducer.WordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * WordCount Driver
 * Configures and submits the MapReduce job
 */
public class WordCountDriver {

    public static void main(String[] args) throws IOException, 
            ClassNotFoundException, InterruptedException {
        
        // Validate arguments
        if (args.length != 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(-1);
        }

        // 1. Create the Configuration and Job objects
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "WordCount");
        
        // 2. Set the jar by class (required when running on a cluster)
        job.setJarByClass(WordCountDriver.class);
        
        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        
        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        
        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        
        // 7. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```

Running on the cluster

```bash
# Prepare the input data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /wordcount/input
[atguigu@hadoop102 ~]$ hadoop fs -put word.txt /wordcount/input/

# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.WordCountDriver /wordcount/input /wordcount/output

# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /wordcount/output/part-r-00000
```

Exercise 2: Deduplication (Unique)

Problem description

Remove duplicate lines from the input file, keeping only unique values. This is a classic MapReduce application: it relies on the sorting and grouping that the Shuffle phase performs automatically.

Input data (unique.txt):

```
apple
banana
apple
cherry
banana
date
elderberry
cherry
fig
apple
```

Expected output:

```
apple
banana
cherry
date
elderberry
fig
```

Core idea

- Map phase: emit the line content as the key, with a throwaway value (the code below uses IntWritable(1); NullWritable would work equally well)
- Shuffle phase: identical keys are grouped automatically, so each distinct line survives exactly once
- Reduce phase: emit the key directly, with NullWritable as the value

💡 Tip: MapReduce's Shuffle mechanism deduplicates for free, since reduce() runs exactly once per distinct key.

Code implementation

1. The Mapper class
```java
package com.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Unique Mapper: deduplication
 * Emits each line as the key and lets the Shuffle deduplicate
 */
public class UniqueMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    
    private final IntWritable num = new IntWritable(1);
    private final Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) 
            throws IOException, InterruptedException {
        
        // Trim whitespace from both ends of the line
        String newValue = value.toString().trim();
        text.set(newValue);
        
        // Emit <line, 1>; the value is ignored in the Reducer
        context.write(text, num);
    }
}
```
2. The Reducer class
```java
package com.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Unique Reducer: emits the key with an empty value
 * Because the Shuffle has already deduplicated, each key arrives exactly once
 */
public class UniqueReducer extends Reducer<Text, IntWritable, Text, NullWritable> {
    
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) 
            throws IOException, InterruptedException {
        
        // Emit the key directly; NullWritable writes nothing for the value
        context.write(key, NullWritable.get());
    }
}
```
3. The Driver class
```java
package com.driver;

import com.mapper.UniqueMapper;
import com.reducer.UniqueReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class UniqueDriver {
    
    public static void main(String[] args) throws IOException, 
            InterruptedException, ClassNotFoundException {
        
        // 1. Create the Configuration and Job objects
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Unique");

        // 2. Set the jar by class
        job.setJarByClass(UniqueDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(UniqueMapper.class);
        job.setReducerClass(UniqueReducer.class);

        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
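Since the Reducer ignores the values anyway, a leaner variant is possible. The sketch below is an alternative, not the version wired into the Driver above: it emits NullWritable from the map side, so values contribute zero bytes to the Shuffle. The Driver would then call job.setMapOutputValueClass(NullWritable.class), and the Reducer's VALUEIN would change to NullWritable accordingly.

```java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/** Sketch: dedup mapper that carries no value at all through the Shuffle. */
public class UniqueNullMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private final Text text = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        text.set(value.toString().trim());
        // NullWritable serializes to zero bytes, so only keys cross the network
        context.write(text, NullWritable.get());
    }
}
```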

Running on the cluster

```bash
# Prepare the data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /unique/input
[atguigu@hadoop102 ~]$ hadoop fs -put unique.txt /unique/input/

# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.UniqueDriver /unique/input /unique/output

# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /unique/output/part-r-00000
```

Exercise 3: Counting by Date (CountByDate)

Problem description

Count the number of records for each date in a log file. This models an e-commerce scenario: computing daily order volume.

Input data (orders.txt):

```
user001|2024-01-01|100.50
user002|2024-01-01|200.00
user003|2024-01-02|150.00
user004|2024-01-01|80.00
user005|2024-01-03|300.00
user006|2024-01-02|120.00
user007|2024-01-01|50.00
user008|2024-01-03|210.00
user009|2024-01-02|90.00
user010|2024-01-01|180.00
```

Expected output:

```
2024-01-01	4
2024-01-02	3
2024-01-03	2
```

Core idea

- Map phase: split on "|", extract the date as the key, and emit <date, 1>
- Shuffle phase: records with the same date are grouped
- Reduce phase: sum the counts and emit <date, record count>
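One parsing detail deserves attention before the code: String.split takes a regular expression, and a bare "|" means alternation in regex, so the pipe has to be escaped as "\\|". A standalone check (illustration only, not part of the job):

```java
import java.util.Arrays;

public class SplitDemo {
    public static void main(String[] args) {
        String line = "user001|2024-01-01|100.50";

        // An unescaped "|" is regex alternation that matches the empty string,
        // so the line is split between every character
        System.out.println(Arrays.toString(line.split("|", -1)));

        // Escaped "\\|" splits on the literal pipe; -1 keeps trailing empty fields
        System.out.println(Arrays.toString(line.split("\\|", -1)));
        // => [user001, 2024-01-01, 100.50]
    }
}
```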

Code implementation

1. The Mapper class
```java
package com.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * CountByDate Mapper: count records by date
 * Input format: user_id|date|amount
 */
public class CountByDateMapper extends Mapper<Object, Text, Text, IntWritable> {
    
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context) 
            throws IOException, InterruptedException {
        
        // 按 "|" 切分,-1表示保留空字符串
        String[] data = value.toString().split("\\|", -1);
        
        // Extract the date (second field, index 1)
        word.set(data[1]);
        
        // Emit <date, 1>
        context.write(word, one);
    }
}
```
2. The Reducer class
```java
package com.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * CountByDate Reducer: sums the per-date counts
 */
public class CountByDateReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) 
            throws IOException, InterruptedException {
        
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        
        result.set(sum);
        context.write(key, result);
    }
}
```
3. The Driver class
```java
package com.driver;

import com.mapper.CountByDateMapper;
import com.reducer.CountByDateReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class CountByDateDriver {
    
    public static void main(String[] args) throws Exception {
        
        // 1. Create the Configuration object
        Configuration conf = new Configuration();
        
        // Parse generic command-line options
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: CountByDateDriver <in> [<in>...] <out>");
            System.exit(2);
        }
        
        Job job = Job.getInstance(conf, "CountByDate");

        // 2. Set the jar by class
        job.setJarByClass(CountByDateDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(CountByDateMapper.class);
        // Set a Combiner to pre-aggregate on the map side and shrink the Shuffle
        job.setCombinerClass(CountByDateReducer.class);
        job.setReducerClass(CountByDateReducer.class);

        // 4. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Set the input paths (multiple inputs supported)
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        
        // 6. Set the output path
        FileOutputFormat.setOutputPath(job, 
                new Path(otherArgs[otherArgs.length - 1]));

        // 7. Submit the job
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```

💡 Combiner optimization: this job sets job.setCombinerClass(CountByDateReducer.class) so partial counts are summed on the map side, significantly reducing network traffic during the Shuffle. Reusing the Reducer as the Combiner is safe here because summation is associative and commutative, and the Reducer's input and output types match.
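Why is reusing the Reducer safe? Because counting is built from sums, and partial sums recombine into the same global sum. An aggregate like the average does not have this property. A plain-Java illustration (not part of the job):

```java
public class CombinerSafetyDemo {
    public static void main(String[] args) {
        // Order amounts seen by two different mappers
        double[] mapper1 = {100.50, 200.00, 80.00};
        double[] mapper2 = {50.00, 180.00};

        // Counting (the CountByDate case): combining partial counts
        // gives the same answer as one global count, so Reducer == Combiner is safe
        int viaPartials = mapper1.length + mapper2.length;  // 3 + 2 = 5
        System.out.println("count via partials: " + viaPartials);

        // Averaging is NOT safe: the average of per-mapper averages differs
        // from the global average when the groups have different sizes
        double avg1 = (100.50 + 200.00 + 80.00) / 3;                         // ~126.83
        double avg2 = (50.00 + 180.00) / 2;                                  // 115.0
        double avgOfAvgs = (avg1 + avg2) / 2;                                // ~120.92
        double globalAvg = (100.50 + 200.00 + 80.00 + 50.00 + 180.00) / 5;   // 122.1
        System.out.println(avgOfAvgs + " != " + globalAvg);
    }
}
```

An average job would need a separate Combiner that emits (sum, count) pairs instead of reusing its Reducer.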

Running on the cluster

```bash
# Prepare the data
[atguigu@hadoop102 ~]$ hadoop fs -mkdir -p /countdate/input
[atguigu@hadoop102 ~]$ hadoop fs -put orders.txt /countdate/input/

# Submit the job
[atguigu@hadoop102 ~]$ hadoop jar hadoop-examples-1.0-SNAPSHOT.jar com.driver.CountByDateDriver /countdate/input /countdate/output

# View the results
[atguigu@hadoop102 ~]$ hadoop fs -cat /countdate/output/part-r-00000
```

If this article helped you, a like, bookmark, or comment is appreciated! Feel free to leave questions in the comments.
