Mapreduce_partition分区入门

分区

读取输入的 CSV 文件,将每一行解析为一个员工对象(Employee),再按员工所属的部门号把记录写入不同的分区(即不同的输出文件)。注意:Employee 作为 Map 的输出值必须实现 Hadoop 的 Writable 接口,其代码未在下文列出。

  1. pom
xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Mapreduce_partition example: declares the Hadoop
     dependencies and packages the job as a jar-with-dependencies whose
     manifest entry point is com.hadoop.Main. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hadoop</groupId>
    <artifactId>Mapreduce_partition</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>Mapreduce_partition</name>
    <description>wunaiieq</description>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Single place to pin the Hadoop version referenced by every dependency below -->
        <hadoop.version>2.7.3</hadoop.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

    </dependencies>
    <!-- Build configuration -->
    <build>
        <plugins>
            <plugin>
                <!-- Plugin coordinates -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <!-- Plugin settings -->
                <configuration>
                    <archive>
                        <manifest>
                            <!-- Entry point written into the jar manifest -->
                            <mainClass>com.hadoop.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <!-- Predefined descriptor: builds an executable JAR that bundles all project
                        dependencies; a custom descriptor could be used instead to control the JAR contents -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!-- Execution binding -->
                <executions>
                    <execution>
                        <!-- Execution id; may be renamed -->
                        <id>make-assembly</id>
                        <!-- Lifecycle phase the goal is bound to -->
                        <phase>package</phase>
                        <goals>
                            <!-- Goal to run; "single" creates one distribution package -->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
  2. main
java
package com.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import java.io.IOException;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the department-partitioning job.
 *
 * <p>Wires {@code Map_1}, the {@code partition} partitioner and {@code Reduce_1}
 * into a single job with three reduce tasks, reads the input from {@code args[0]}
 * and writes the output to {@code args[1]}.
 */
public class Main {
    /**
     * Configures and runs the job, then exits the JVM with a status that reflects
     * the job outcome.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     * @throws ClassNotFoundException if a job class cannot be resolved on submission
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            System.err.println("Usage: Main <input path> <output path>");
            System.exit(2);
            return;
        }

        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Main.class);

        // Map stage: emits (department id, employee) pairs.
        job.setMapperClass(Map_1.class);
        job.setMapOutputKeyClass(IntWritable.class); // k2
        job.setMapOutputValueClass(Employee.class);  // v2

        // Partitioning rule for the map output.
        job.setPartitionerClass(partition.class);
        // Three reduce tasks; this count is what the partitioner receives as its "num" argument.
        job.setNumReduceTasks(3);

        // Reduce stage and final output types.
        job.setReducerClass(Reduce_1.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Employee.class);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job result to the caller: 0 on success, 1 on failure.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
  3. Map_1
java
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
//1,ZhangSan,101,5000
/**
 * Parses one CSV line of the form {@code id,name,departmentId,salary}
 * (for example {@code 1,ZhangSan,101,5000}) into an {@code Employee} and emits
 * it keyed by its department id.
 *
 * <p>Blank lines are skipped. Lines with fewer than four fields or with a
 * non-numeric id, department id or salary are skipped and counted under the
 * {@code Map_1 / MALFORMED_RECORDS} job counter rather than failing the task.
 */
public class Map_1 extends Mapper<LongWritable, Text, IntWritable, Employee> {
    /** Minimum number of comma-separated fields a usable record must have. */
    private static final int FIELD_COUNT = 4;

    /** Output key, reused across records to avoid one allocation per input line. */
    private final IntWritable departmentKey = new IntWritable();

    /**
     * Converts a single input line into a (department id, employee) pair.
     *
     * @param k1      input key supplied by the framework; not used
     * @param v1      one line of the CSV input
     * @param context used to emit the output pair and to update the malformed-record counter
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        String data = v1.toString();
        if (data.trim().isEmpty()) {
            // Blank line: nothing to parse.
            return;
        }

        String[] words = data.split(",");
        if (words.length < FIELD_COUNT) {
            context.getCounter("Map_1", "MALFORMED_RECORDS").increment(1);
            return;
        }

        Employee e = new Employee();
        try {
            // Numeric fields are trimmed so padding or a trailing '\r' from
            // CRLF-terminated files does not break parsing.
            e.setId(Integer.parseInt(words[0].trim()));
            e.setName(words[1]);
            e.setDepartment_id(Integer.parseInt(words[2].trim()));
            e.setSalary(Integer.parseInt(words[3].trim()));
        } catch (NumberFormatException ex) {
            context.getCounter("Map_1", "MALFORMED_RECORDS").increment(1);
            return;
        }

        departmentKey.set(e.getDepartment_id());
        context.write(departmentKey, e);
    }
}
  4. Reduce_1
java
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Pass-through reducer: every employee received for a key is written back out
 * under that same key, unchanged.
 */
public class Reduce_1 extends Reducer<IntWritable,Employee,IntWritable,Employee> {
    /**
     * Emits each value of the group paired with the group's key.
     *
     * @param k3      key of the current group
     * @param v3      employees grouped under {@code k3}
     * @param context used to emit the output pairs
     * @throws IOException          if writing an output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(IntWritable k3, Iterable<Employee> v3, Context context)
            throws IOException, InterruptedException {
        java.util.Iterator<Employee> employees = v3.iterator();
        while (employees.hasNext()) {
            Employee current = employees.next();
            context.write(k3, current);
        }
    }
}
  5. partition
java
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

//分区规则,根据map输出
/**
 * Partitioning rule for the map output, driven by the employee's department id.
 *
 * <p>Department 101 maps to {@code 1 % num}, department 102 to {@code 2 % num},
 * and every other department to {@code 3 % num}.
 */
public class partition extends Partitioner<IntWritable,Employee> {
    /**
     * Chooses the partition for one map output record.
     *
     * @param k2  map output key; not consulted by this rule
     * @param v2  map output value whose department id selects the partition
     * @param num number of partitions
     * @return the partition index computed from the department id
     */
    @Override
    public int getPartition(IntWritable k2, Employee v2, int num) {
        final int slot;
        switch (v2.getDepartment_id()) {
            case 101:
                slot = 1;
                break;
            case 102:
                slot = 2;
                break;
            default:
                slot = 3;
                break;
        }
        // The modulo keeps the index within [0, num).
        return slot % num;
    }
}
  6. 效果
    输出日志,显示4个输出文件

    dfs输出的文件目录

    存储效果
相关推荐
Java 第一深情2 小时前
零基础入门Flink,掌握基本使用方法
大数据·flink·实时计算
MXsoft6182 小时前
华为服务器(iBMC)硬件监控指标解读
大数据·运维·数据库
PersistJiao3 小时前
Spark 分布式计算中网络传输和序列化的关系(二)
大数据·网络·spark·序列化·分布式计算
九河云3 小时前
如何对AWS进行节省
大数据·云计算·aws
FreeIPCC4 小时前
谈一下开源生态对 AI人工智能大模型的促进作用
大数据·人工智能·机器人·开源
梦幻通灵4 小时前
ES分词环境实战
大数据·elasticsearch·搜索引擎
Elastic 中国社区官方博客4 小时前
Elasticsearch 中的热点以及如何使用 AutoOps 解决它们
大数据·运维·elasticsearch·搜索引擎·全文检索
天冬忘忧5 小时前
Kafka 工作流程解析:从 Broker 工作原理、节点的服役、退役、副本的生成到数据存储与读写优化
大数据·分布式·kafka
sevevty-seven6 小时前
幻读是什么?用什么隔离级别可以防止幻读
大数据·sql
Yz98767 小时前
hive复杂数据类型Array & Map & Struct & 炸裂函数explode
大数据·数据库·数据仓库·hive·hadoop·数据库开发·big data