Mapreduce_partition分区入门

分区

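将输入的 CSV 文件按行解析:每一行对应一名员工(格式为"员工号,姓名,部门号,工资"),封装为一个 Employee 对象,然后按员工所属的部门号写入不同的分区(每个分区对应一个输出文件)。注意:Employee 作为 Map 的输出值类型,需要实现 Hadoop 的 Writable 接口,本文未列出该类的代码。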

  1. pom
xml 复制代码
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Mapreduce_partition example: declares the Hadoop dependencies and
     packages an executable jar-with-dependencies during the package phase. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hadoop</groupId>
    <artifactId>Mapreduce_partition</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>Mapreduce_partition</name>
    <description>wunaiieq</description>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Single Hadoop version shared by every Hadoop dependency below -->
        <hadoop.version>2.7.3</hadoop.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

    </dependencies>
    <!-- Build configuration -->
    <build>
        <plugins>
            <plugin>
                <!-- Plugin coordinates -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <!-- Plugin configuration -->
                <configuration>
                    <archive>
                        <manifest>
                            <!-- Entry point (main class) written into the jar manifest -->
                            <mainClass>com.hadoop.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <!-- Predefined descriptor: builds an executable jar that bundles all of the
                        project's dependencies; a custom descriptor could control the jar contents instead -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!-- Execution configuration -->
                <executions>
                    <execution>
                        <!-- Execution id; may be changed freely -->
                        <id>make-assembly</id>
                        <!-- Lifecycle phase this execution is bound to -->
                        <phase>package</phase>
                        <goals>
                            <!-- Goal to run; "single" creates one distribution package -->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
  1. main
java 复制代码
package com.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import java.io.IOException;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Job driver for the partitioning example.
 *
 * <p>Wires {@code Map_1}, the custom {@code partition} partitioner and {@code Reduce_1}
 * into a single MapReduce job that reads from {@code args[0]} and writes to {@code args[1]}.
 */
public class Main {
    /**
     * Configures and runs the job, then terminates the JVM with the job's status.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: hadoop jar <jar> <input path> <output path>");
            System.exit(2);
        }

        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Main.class);

        // Map stage: k2 = department id, v2 = employee record.
        job.setMapperClass(Map_1.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Employee.class);

        // Custom partitioning rule.
        job.setPartitionerClass(partition.class);
        // Number of reduce tasks; this value is what the partitioner receives as "num".
        job.setNumReduceTasks(3);

        // Reduce stage.
        job.setReducerClass(Reduce_1.class);

        // Final output types.
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Employee.class);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job result as the process exit code so that callers
        // (shell scripts, schedulers) can detect a failed job.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
  1. Map_1
java 复制代码
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
//1,ZhangSan,101,5000
/**
 * Mapper that turns one CSV line into an {@code Employee} keyed by department id.
 *
 * <p>Expected line layout: {@code id,name,department_id,salary},
 * e.g. {@code 1,ZhangSan,101,5000}.
 *
 * <p>Blank lines are skipped. Lines with fewer than four fields or with a non-numeric
 * id / department id / salary are skipped as well and counted under the
 * {@code Map_1 / MALFORMED_RECORDS} counter, so that a single bad row (for example a
 * header line) does not fail the whole task.
 */
public class Map_1 extends Mapper<LongWritable, Text, IntWritable, Employee> {
    /** Minimum number of comma-separated fields a record must have. */
    private static final int REQUIRED_FIELDS = 4;
    private static final String COUNTER_GROUP = "Map_1";
    private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

    /**
     * Parses one input line and emits {@code (department_id, employee)}.
     *
     * @param k1      input key supplied by the input format (not used)
     * @param v1      one line of the CSV input
     * @param context task context used to emit the record and update counters
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        String data = v1.toString();
        if (data.trim().isEmpty()) {
            // Nothing to parse, e.g. a trailing blank line at the end of the file.
            return;
        }

        String[] words = data.split(",");
        if (words.length < REQUIRED_FIELDS) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        Employee e = new Employee();
        try {
            // Numeric fields are trimmed so that "1, ZhangSan, 101, 5000" style rows still parse.
            e.setId(Integer.parseInt(words[0].trim()));
            e.setName(words[1]);
            e.setDepartment_id(Integer.parseInt(words[2].trim()));
            e.setSalary(Integer.parseInt(words[3].trim()));
        } catch (NumberFormatException malformed) {
            // Non-numeric field (header row, corrupt data): record it and move on.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        context.write(new IntWritable(e.getDepartment_id()), e);
    }
}
  1. Reduce_1
java 复制代码
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Pass-through reducer: every employee received for a key is written out
 * unchanged under that same key.
 */
public class Reduce_1 extends Reducer<IntWritable,Employee,IntWritable,Employee> {
    /**
     * Emits each value of the group as {@code (k3, employee)}.
     *
     * @param k3      key of the current group
     * @param v3      employees grouped under {@code k3}
     * @param context task context used to emit the records
     * @throws IOException          if writing an output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(IntWritable k3, Iterable<Employee> v3,Context context)
            throws IOException, InterruptedException {
        java.util.Iterator<Employee> employees = v3.iterator();
        while (employees.hasNext()) {
            // Write immediately; the value is not retained past this iteration.
            context.write(k3, employees.next());
        }
    }
}
  1. partition
java 复制代码
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

//分区规则,根据map输出
/**
 * Partitioning rule applied to the map output: routes each employee to a
 * partition chosen from the employee's department id.
 */
public class partition extends Partitioner<IntWritable,Employee> {
    /**
     * Selects the partition for one map output record.
     *
     * <p>Department 101 maps to slot 1, department 102 to slot 2 and every other
     * department to slot 3; the slot is then reduced modulo {@code num}
     * (with three partitions this yields 1, 2 and 0 respectively).
     *
     * @param k2  map output key (not consulted; the department id is read from {@code v2})
     * @param v2  map output value, the employee record
     * @param num number of partitions
     * @return the partition index for this record
     */
    @Override
    public int getPartition(IntWritable k2, Employee v2, int num) {
        final int slot;
        switch (v2.getDepartment_id()) {
            case 101:
                slot = 1;
                break;
            case 102:
                slot = 2;
                break;
            default:
                slot = 3;
                break;
        }
        return slot % num;
    }
}
  1. 效果
    输出日志,显示4个输出文件(3 个 reduce 任务对应 3 个分区文件 part-r-00000 ~ part-r-00002,另有 1 个 _SUCCESS 标记文件)

    dfs输出的文件目录

    存储效果
相关推荐
Lill_bin5 小时前
深入理解ElasticSearch集群:架构、高可用性与数据一致性
大数据·分布式·elasticsearch·搜索引擎·zookeeper·架构·全文检索
涛思数据(TDengine)6 小时前
TDengine 与 SCADA 强强联合:提升工业数据管理的效率与精准
大数据·时序数据库·tdengine
isNotNullX7 小时前
如何用SQL Server和Oracle进行数据同步?
大数据·数据库·sql·oracle
RwTo8 小时前
Elasticsearch 聚合搜索
大数据·elasticsearch·搜索引擎·全文检索
isNotNullX8 小时前
HBase在大数据实时处理中的角色
大数据·数据库·hbase
白总Server8 小时前
MySQL在大数据场景应用
大数据·开发语言·数据库·后端·mysql·golang·php
求学小火龙9 小时前
ElasticSearch介绍+使用
java·大数据·elasticsearch
檀越剑指大厂9 小时前
【Elasticsearch系列六】系统命令API
大数据·elasticsearch·搜索引擎
数据运营新视界9 小时前
你知道企业架构中核心的4大架构联系和不同吗?
大数据·架构
h177113472059 小时前
基于区块链的相亲交易系统源码解析
大数据·人工智能·安全·系统架构·交友