HBase整合MapReduce案例2:HBase数据下载至HDFS中——wordcount

目录

整合结构

与案例1的结构差不多,只是HBase移动到开头(作为输入),后面跟随MR程序。

因此输入的K1、V1需要做相应修改:K1为ImmutableBytesWritable(行键),V1为Result(该行的数据)。

准备

  1. 在HBASE中创建表,并写入数据
bash 复制代码
# Create table "sentence" in namespace "wunaiieq" with a single column family "colf".
# NOTE(review): presumably the namespace must already exist (create_namespace "wunaiieq") — confirm.
create "wunaiieq:sentence","colf"
  2. 系统文件上传(将本地文件按行写入HBase表)

datain3.java

bash 复制代码
package org.wunaiieq.hbase2hdfs;

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.wunaiieq.HBaseConnection;
import org.wunaiieq.HbaseDML;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/**
 * Loads a local text file into the HBase table {@code wunaiieq:sentence}, one row per line.
 *
 * <p>Each line is stored in column {@code colf:line}. The row key is a running counter
 * starting at 1, encoded with {@link Bytes#toBytes(int)}.
 */
public class datain3 {
    /** Shared HBase connection taken from the project's {@code HBaseConnection} helper. */
    public static Connection connection = HBaseConnection.connection;

    /**
     * Reads {@code /opt/module/jar/data.txt} line by line and writes every line as one Put.
     *
     * @param args unused
     * @throws IOException if the file cannot be read or a write to HBase fails
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources closes the reader and the table even when a read or a put throws.
        // NOTE(review): the shared connection is not closed here — presumably HBaseConnection
        // owns its lifecycle; confirm.
        try (BufferedReader bufferedReader = new BufferedReader(
                     new FileReader("/opt/module/jar/data.txt"));
             Table table = connection.getTable(TableName.valueOf("wunaiieq", "sentence"))) {
            int rowkey = 1;
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                Put put = new Put(Bytes.toBytes(rowkey));
                put.addColumn(Bytes.toBytes("colf"), Bytes.toBytes("line"), Bytes.toBytes(line));
                table.put(put);
                rowkey++;
            }
        }
    }
}

数据下载

pom.xml

bash 复制代码
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.hbase</groupId>
    <artifactId>hbase2hdfs</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>3.1.3</hadoop.version>
        <hbase.version>2.2.3</hbase.version>
    </properties>

    <dependencies>
        <!-- Hadoop Dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- HBase Dependencies -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>${hbase.version}</version>
        </dependency>

        <!-- Other Dependencies -->
        <dependency>
            <groupId>com.google.protobuf</groupId>
            <artifactId>protobuf-java</artifactId>
            <version>3.19.1</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <!-- JUnit is only needed for tests: pin an explicit version (the RELEASE
             meta-version makes builds non-reproducible) and use test scope so it
             is not bundled into the jar-with-dependencies. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.2</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <!-- Plugin coordinates -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <!-- Plugin configuration -->
                <configuration>
                    <archive>
                        <manifest>
                            <!-- Entry point (main class) of the executable jar -->
                            <mainClass>org.wunaiieq.hbase2hdfs.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <!-- Descriptor: this predefined one builds an executable JAR that bundles
                        all project dependencies; a custom descriptor can tailor the jar contents -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!-- Execution configuration -->
                <executions>
                    <execution>
                        <!-- Execution id; may be changed -->
                        <id>make-assembly</id>
                        <!-- Lifecycle phase this execution is bound to -->
                        <phase>package</phase>
                        <goals>
                            <!-- Goal to run; "single" builds one distribution package -->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>

Main.java

bash 复制代码
package org.wunaiieq.hbase2hdfs;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the word count job that reads sentences from the HBase table
 * {@code wunaiieq:sentence} and writes the per-word counts to an HDFS directory.
 */
public class Main {
    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the HDFS output directory
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 1) {
            System.err.println("Usage: hadoop jar <jar> <hdfs-output-dir>");
            System.exit(2);
        }

        // Configuration files are picked up from the resources directory (classpath).
        Job job = Job.getInstance(new Configuration());
        // Class used to locate the job jar.
        job.setJarByClass(Main.class);

        // The scan can be narrowed here (column family, columns, row filters, ...).
        Scan scan = new Scan();
        TableMapReduceUtil.initTableMapperJob(
                "wunaiieq:sentence",               // input table
                scan,
                org.wunaiieq.hbase2hdfs.Map.class, // mapper class
                Text.class,                        // K2
                IntWritable.class,                 // V2
                job,
                false                              // addDependencyJars flag
        );

        job.setReducerClass(org.wunaiieq.hbase2hdfs.Reduce.class);
        job.setOutputKeyClass(Text.class);         // K3
        job.setOutputValueClass(IntWritable.class); // V3

        // Output path is supplied on the command line.
        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        // Propagate the job result so shell scripts and schedulers can detect failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Reduce.java

bash 复制代码
package org.wunaiieq.hbase2hdfs;

import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Sums the counts emitted for each word and writes one (word, total) pair per key.
 *
 * <p>Type arguments: K3 = Text, V3 = IntWritable, K4 = Text, V4 = IntWritable.
 */
public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    /** Reused output value holding the total for the current key. */
    private final IntWritable total = new IntWritable();

    @Override
    protected void reduce(Text k3, Iterable<IntWritable> v3, Context context)
            throws IOException, InterruptedException {
        int running = 0;
        for (IntWritable count : v3) {
            running = running + count.get();
        }
        total.set(running);
        // The incoming key is emitted unchanged as the output key.
        context.write(k3, total);
    }
}

Map.java

bash 复制代码
package org.wunaiieq.hbase2hdfs;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
/**
 * Splits the sentence stored in {@code colf:line} of each HBase row into words and
 * emits (word, 1) for every word.
 *
 * <p>The input types are fixed by the {@code map} signature (K1 = ImmutableBytesWritable,
 * V1 = Result); the type arguments below are the output types: K2 = Text, V2 = IntWritable.
 */
public class Map extends TableMapper<Text, IntWritable> {
    /** Column family and qualifier of the sentence cell, encoded once instead of per row. */
    private static final byte[] FAMILY = Bytes.toBytes("colf");
    private static final byte[] QUALIFIER = Bytes.toBytes("line");

    private Text k2 = new Text();
    private IntWritable v2 = new IntWritable(1);

    /**
     * @param k1      row key of the current row
     * @param v1      cells of the current row
     * @param context used to emit (word, 1) pairs
     */
    @Override
    protected void map(ImmutableBytesWritable k1, Result v1, Context context)
            throws IOException, InterruptedException {
        // Debug trace of the row key on standard output.
        System.out.println("k1:"+k1.toString());
        // Read colf:line of the current row.
        byte[] data = v1.getValue(FAMILY, QUALIFIER);
        if (data == null) {
            // Row has no colf:line cell: nothing to count (previously a NullPointerException).
            return;
        }
        String line = Bytes.toString(data);
        for (String word : line.split("\\s+")) {
            // Leading whitespace still yields one empty token; never count "" as a word.
            if (word.isEmpty()) {
                continue;
            }
            k2.set(word);
            context.write(k2, v2);
        }
    }
}

操作

打包上传至linux系统中

bash 复制代码
# Run the assembled fat jar; the single argument is the HDFS output directory (read by Main as args[0]).
hadoop jar hbase2hdfs-1.0-SNAPSHOT-jar-with-dependencies.jar /output/test

检查文件

bash 复制代码
# Print the result file part-r-00000 from the output directory /output/test.
hdfs dfs -cat /output/test/part-r-00000

总结

没什么特殊点,记录下这两个案例即可;在MR程序中替换掉对应的Mapper和Reducer就行。

相关推荐
WL_Aurora1 天前
MapReduce【Shuffle-Combiner】
大数据·mapreduce
zhojiew2 天前
在AWS中国区的EMR集群中实现基于向量语义搜索的HBase运维诊断系统
运维·hbase·aws
早川9193 天前
Hbase、MySQL和Redis区别
redis·mysql·hbase
Volunteer Technology4 天前
MapReduce使用与原理(一)
大数据·eclipse·mapreduce
Volunteer Technology4 天前
MapReduce使用与原理 (二)
大数据·mapreduce
Volunteer Technology5 天前
MapReduce 介绍
大数据·mapreduce
lifewange6 天前
HBase 增删改查(CRUD)完整操作指南
数据库·python·hbase
卷毛迷你猪6 天前
快速实验篇(A1)干旱气象数据上传至HDFS
大数据·hadoop·hdfs
小的~~7 天前
CentOS7安装CDH6.3.2
hive·hdfs·kafka
开开心心就好7 天前
支持添加网址的资源快速打开工具
人工智能·学习·游戏·音视频·hbase·语音识别·storm