Mapreduce_partition分区入门

分区

将输入的csv按照员工号拆分成每个员工,每个员工存储为员工对象,之后按每个员工的不同部门存储

  1. pom
bash 复制代码
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hadoop</groupId>
    <artifactId>Mapreduce_partition</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>Mapreduce_partition</name>
    <description>wunaiieq</description>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!--版本控制-->
        <hadoop.version>2.7.3</hadoop.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

    </dependencies>
    <!--构建配置-->
    <build>
        <plugins>
            <plugin>
                <!--声明-->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <!--具体配置-->
                <configuration>
                    <archive>
                        <manifest>
                            <!--jar包的执行入口-->
                            <mainClass>com.hadoop.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <!--描述符,此处为预定义的,表示创建一个包含项目所有依赖的可执行 JAR 文件;
                        允许自定义生成jar文件内容-->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!--执行配置-->
                <executions>
                    <execution>
                        <!--执行配置ID,可修改-->
                        <id>make-assembly</id>
                        <!--执行的生命周期-->
                        <phase>package</phase>
                        <goals>
                            <!--执行的目标,single表示创建一个分发包-->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
  1. main
bash 复制代码
package com.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import java.io.IOException;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Job job =  Job.getInstance(new Configuration());
        job.setJarByClass(Main.class);

        //map
        job.setMapperClass(Map_1.class);
        job.setMapOutputKeyClass(IntWritable.class);//k2
        job.setMapOutputValueClass(Employee.class);//v2

        //指定分区规则
        job.setPartitionerClass(partition.class);
        //分区个数,此处的形参3传递给partition中的num
        job.setNumReduceTasks(3);
        //Reduce
        job.setReducerClass(Reduce_1.class);
        //输出
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Employee.class);

        //输入和输出
        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        //执行
        job.waitForCompletion(true);
    }
}
  1. Map_1
bash 复制代码
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
//1,ZhangSan,101,5000
public class Map_1 extends Mapper<LongWritable, Text, IntWritable, Employee> {
    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        //获取数据
        String data = v1.toString();
        //分词
        String[] words =data.split(",");

        Employee e=new Employee();

        //设置v2的输出内容(输出内容为对象e,这里的区别是每个对象不同,以下为属性设置)
        e.setId(Integer.parseInt(words[0]));
        e.setName(words[1]);
        e.setDepartment_id(Integer.parseInt(words[2]));
        e.setSalary(Integer.parseInt(words[3]));

        context.write(new IntWritable(e.getDepartment_id()),e);

    }
}
  1. Reduce_1
bash 复制代码
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class Reduce_1 extends Reducer<IntWritable,Employee,IntWritable,Employee> {
    @Override
    protected void reduce(IntWritable k3, Iterable<Employee> v3,Context context)
            throws IOException, InterruptedException {
        for (Employee e:v3){
            context.write(k3,e);
        }

    }
}
  1. partition
bash 复制代码
package com.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

//分区规则,根据map输出
public class partition extends Partitioner<IntWritable,Employee> {
    //k2,v2,分区个数
    @Override
    public int getPartition(IntWritable k2, Employee v2, int num) {
        int department_Id= v2.getDepartment_id();
        //按照部门号存储不同分区
        if (department_Id==101){
            return 1%num;
        }else if (department_Id ==102){
            return 2%num;
        }else {
            return 3%num;
        }

    }
}
  1. 效果
    输出日志,显示4个输出文件

    dfs输出的文件目录

    存储效果
相关推荐
小白学大数据3 分钟前
如何使用Selenium处理JavaScript动态加载的内容?
大数据·javascript·爬虫·selenium·测试工具
15年网络推广青哥15 分钟前
国际抖音TikTok矩阵运营的关键要素有哪些?
大数据·人工智能·矩阵
节点。csn1 小时前
Hadoop yarn安装
大数据·hadoop·分布式
arnold661 小时前
探索 ElasticSearch:性能优化之道
大数据·elasticsearch·性能优化
NiNg_1_2342 小时前
基于Hadoop的数据清洗
大数据·hadoop·分布式
成长的小牛2333 小时前
es使用knn向量检索中numCandidates和k应该如何配比更合适
大数据·elasticsearch·搜索引擎
goTsHgo4 小时前
在 Spark 上实现 Graph Embedding
大数据·spark·embedding
程序猿小柒4 小时前
【Spark】Spark SQL执行计划-精简版
大数据·sql·spark
隔着天花板看星星4 小时前
Spark-Streaming集成Kafka
大数据·分布式·中间件·spark·kafka
奥顺4 小时前
PHPUnit使用指南:编写高效的单元测试
大数据·mysql·开源·php