MapReduce_Writable序列化

使用序列化封装对象

将输入的csv按照员工号拆分成每个员工,每个员工存储为员工对象

数据处理过程

employee_noheader.csv

bash 复制代码
1,ZhangSan,101,5000
2,LiSi,102,6000
3,WangWu,101,5500
4,ZhaoLiu,103,7000
5,SunQi,102,6500
  1. pom.xml
bash 复制代码
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hadoop</groupId>
    <artifactId>Mapreduce_Writable</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>Mapreduce_Writable</name>
    <description>wunaiieq</description>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!--版本控制-->
        <hadoop.version>2.7.3</hadoop.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

    </dependencies>
    <!--构建配置-->
    <build>
        <plugins>
            <plugin>
                <!--声明-->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <!--具体配置-->
                <configuration>
                    <archive>
                        <manifest>
                            <!--jar包的执行入口-->
                            <mainClass>com.hadoop.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <!--描述符,此处为预定义的,表示创建一个包含项目所有依赖的可执行 JAR 文件;
                        允许自定义生成jar文件内容-->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!--执行配置-->
                <executions>
                    <execution>
                        <!--执行配置ID,可修改-->
                        <id>make-assembly</id>
                        <!--执行的生命周期-->
                        <phase>package</phase>
                        <goals>
                            <!--执行的目标,single表示创建一个分发包-->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
  1. main
bash 复制代码
package com.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Job job =  Job.getInstance(new Configuration());
        job.setJarByClass(Main.class);

        //map
        job.setMapperClass(Map_1.class);
        job.setMapOutputKeyClass(IntWritable.class);//k2
        job.setMapOutputValueClass(Employee.class);//v2

        //任务输出
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Employee.class);

        //输入和输出
        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        //执行
        job.waitForCompletion(true);
    }
}
  1. Map_1
bash 复制代码
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
//1,ZhangSan,101,5000
public class Map_1 extends Mapper<LongWritable, Text, IntWritable, Employee> {
    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        //获取数据
        String data = v1.toString();
        //分词
        String[] words =data.split(",");

        Employee e=new Employee();

        //设置v2的输出内容(输出内容为对象e,这里的区别是每个对象不同,以下为属性设置)
        e.setId(Integer.parseInt(words[0]));
        e.setName(words[1]);
        e.setDepartment_id(Integer.parseInt(words[2]));
        e.setSalary(Integer.parseInt(words[3]));

        context.write(new IntWritable(e.getId()),e);

    }
}
  1. Employee
bash 复制代码
package com.hadoop;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

//1,ZhangSan,101,5000
public class Employee implements Writable {
    private int id;
    private String name;
    private int department_id;
    private int salary;

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getDepartment_id() {
        return department_id;
    }

    public void setDepartment_id(int department_id) {
        this.department_id = department_id;
    }

    public int getSalary() {
        return salary;
    }

    public void setSalary(int salary) {
        this.salary = salary;
    }

    //toString方法,用于查看

    @Override
    public String toString() {
        return "Employee{" +
                "id=" + id +
                ", name='" + name + '\'' +
                ", department_id=" + department_id +
                ", salary=" + salary +
                '}';
    }
    //序列化和反序列化过程需要保持一致
    @Override
    public void write(DataOutput output) throws IOException {
        //实现序列化的过程,输出到文件
        output.writeInt(this.id);
        output.writeUTF(this.name);
        output.writeInt(this.department_id);
        output.writeInt(this.salary);
    }
    @Override
    public void readFields(DataInput input) throws IOException {
        //实现反序列化的过程,从文件读取
        this.id=input.readInt();
        this.name=input.readUTF();
        this.department_id=input.readInt();
        this.salary=input.readInt();
    }
}
  1. 打jar包+部署+运行(部分内容已省略)

运行

bash 复制代码
hadoop jar Mapreduce_Writable.jar /input/employee_noheader.csv /output/Writable
  1. 效果
相关推荐
lili-felicity17 分钟前
CANN异步推理实战:从Stream管理到流水线优化
大数据·人工智能
2501_933670791 小时前
2026 高职大数据专业考什么证书对就业有帮助?
大数据
xiaobaibai1531 小时前
营销自动化终极形态:AdAgent 自主闭环工作流全解析
大数据·人工智能·自动化
星辰_mya1 小时前
Elasticsearch更新了分词器之后
大数据·elasticsearch·搜索引擎
xiaobaibai1531 小时前
决策引擎深度拆解:AdAgent 用 CoT+RL 实现营销自主化决策
大数据·人工智能
悟纤1 小时前
学习与专注音乐流派 (Study & Focus Music):AI 音乐创作终极指南 | Suno高级篇 | 第33篇
大数据·人工智能·深度学习·学习·suno·suno api
ESBK20251 小时前
第四届移动互联网、云计算与信息安全国际会议(MICCIS 2026)二轮征稿启动,诚邀全球学者共赴学术盛宴
大数据·网络·物联网·网络安全·云计算·密码学·信息与通信
Elastic 中国社区官方博客2 小时前
Elasticsearch:Workflows 介绍 - 9.3
大数据·数据库·人工智能·elasticsearch·ai·全文检索
B站_计算机毕业设计之家2 小时前
豆瓣电影推荐系统 | Python Django Echarts构建个性化影视推荐平台 大数据 毕业设计源码 (建议收藏)✅
大数据·python·机器学习·django·毕业设计·echarts·推荐算法
莽撞的大地瓜2 小时前
洞察,始于一目了然——让舆情数据自己“说话”
大数据·网络·数据分析