HBase + MapReduce Integration, Case 2: Downloading HBase Data to HDFS (WordCount)

Integration Structure

The structure is roughly the same as in Case 1, except that HBase moves to the start of the pipeline as the data source, followed by the MR program.

Consequently, the input K1 and V1 the mapper receives change: instead of a byte offset and a line of text, they become the HBase rowkey and the row's Result.
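
For reference, a minimal sketch of what that change looks like (the class name here is illustrative, not part of the project):

java
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// TableMapper<Text, IntWritable> is shorthand for
// Mapper<ImmutableBytesWritable, Result, Text, IntWritable>:
// K1 (the rowkey bytes) and V1 (the whole row) are fixed by HBase,
// so only the output types K2/V2 are declared.
public class SignatureSketch extends TableMapper<Text, IntWritable> {
    @Override
    protected void map(ImmutableBytesWritable k1, Result v1, Context context) {
        // k1 = rowkey of the current row, v1 = all of its cells
    }
}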

Preparation

  1. Create the table in HBase and write data into it (the wunaiieq namespace must exist before a table can be created in it)

bash
create_namespace 'wunaiieq'
create 'wunaiieq:sentence', 'colf'

  2. Load a local text file into the table (datain3.java below)

datain3.java

java
package org.wunaiieq.hbase2hdfs;

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.wunaiieq.HBaseConnection;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class datain3 {
    public static Connection connection = HBaseConnection.connection;

    public static void main(String[] args) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(
                new FileReader("/opt/module/jar/data.txt")
        );
        String line = null;
        Table table = connection.getTable(TableName.valueOf("wunaiieq", "sentence"));
        int rowkey = 1;
        // one line of text per row: the rowkey is an incrementing int,
        // and the line goes into the colf:line cell
        while ((line = bufferedReader.readLine()) != null) {
            Put put = new Put(Bytes.toBytes(rowkey));
            put.addColumn(Bytes.toBytes("colf"), Bytes.toBytes("line"), Bytes.toBytes(line));
            table.put(put);
            rowkey++;
        }
        bufferedReader.close();
        table.close();
    }
}
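
To spot-check the load, here is a small read-back sketch; it reuses the project's HBaseConnection helper class shown above, and the CheckRow class name is only illustrative:

java
package org.wunaiieq.hbase2hdfs;

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.wunaiieq.HBaseConnection;

public class CheckRow {
    public static void main(String[] args) throws Exception {
        Table table = HBaseConnection.connection
                .getTable(TableName.valueOf("wunaiieq", "sentence"));
        // rowkeys were written as 4-byte ints, so fetch row 1 the same way
        Result r = table.get(new Get(Bytes.toBytes(1)));
        System.out.println(Bytes.toString(
                r.getValue(Bytes.toBytes("colf"), Bytes.toBytes("line"))));
        table.close();
    }
}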

Data Download

pom.xml

xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.hbase</groupId>
    <artifactId>hbase2hdfs</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>3.1.3</hadoop.version>
        <hbase.version>2.2.3</hbase.version>
    </properties>

    <dependencies>
        <!-- Hadoop Dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- HBase Dependencies -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>${hbase.version}</version>
        </dependency>

        <!-- Other Dependencies -->
        <dependency>
            <groupId>com.google.protobuf</groupId>
            <artifactId>protobuf-java</artifactId>
            <version>3.19.1</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.2</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <!-- Plugin declaration -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <!-- Plugin configuration -->
                <configuration>
                    <archive>
                        <manifest>
                            <!-- Entry point of the executable JAR -->
                            <mainClass>org.wunaiieq.hbase2hdfs.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <!-- Predefined descriptor: build a single executable JAR
                             that bundles all project dependencies -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <!-- Execution configuration -->
                <executions>
                    <execution>
                        <!-- Execution ID; can be renamed -->
                        <id>make-assembly</id>
                        <!-- Lifecycle phase this execution binds to -->
                        <phase>package</phase>
                        <goals>
                            <!-- Goal to run: "single" assembles the distribution JAR -->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
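
(Note: hadoop-client is an aggregator artifact that already pulls in hadoop-common, hadoop-hdfs and hadoop-mapreduce-client-core transitively, so several of the explicit Hadoop entries above are redundant and could be trimmed.)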

Main.java

java
package org.wunaiieq.hbase2hdfs;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
    public static void main(String[] args) throws Exception {
        // configuration files are read from the resources directory
        Job job = Job.getInstance(new Configuration());
        // entry class
        job.setJarByClass(Main.class);
        Scan scan = new Scan();
        TableMapReduceUtil.initTableMapperJob(
                "wunaiieq:sentence",               // table name
                scan,                              // the scan can restrict the input here: column families, columns, row filters, etc.
                org.wunaiieq.hbase2hdfs.Map.class, // mapper class
                Text.class,                        // K2
                IntWritable.class,                 // V2
                job,
                false                              // addDependencyJars: the fat JAR already bundles the HBase classes
        );
        job.setOutputKeyClass(Text.class);          // K4 (final output key)
        job.setOutputValueClass(IntWritable.class); // V4 (final output value)
        job.setReducerClass(org.wunaiieq.hbase2hdfs.Reduce.class);
        // the output path is taken from the command line
        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
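
Since the Scan object is handed to initTableMapperJob, it is the natural place to narrow what each mapper receives. A hedged sketch of common settings (the values are examples, not tuned for this job; Bytes is org.apache.hadoop.hbase.util.Bytes):

java
// in Main, before initTableMapperJob:
Scan scan = new Scan();
scan.addColumn(Bytes.toBytes("colf"), Bytes.toBytes("line")); // fetch only the cell the mapper uses
scan.setCaching(500);        // rows per RPC; fewer round trips on large tables
scan.setCacheBlocks(false);  // usually disabled for full-table MR scans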

Reduce.java

java
package org.wunaiieq.hbase2hdfs;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

//                                   K3     V3         K4    V4
public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable v4 = new IntWritable();

    @Override
    protected void reduce(Text k3, Iterable<IntWritable> v3, Context context) throws IOException, InterruptedException {
        // sum all the 1s emitted for this word
        int sum = 0;
        for (IntWritable v30 : v3) {
            sum += v30.get();
        }
        v4.set(sum);
        // K4 is the word itself, unchanged from K3
        context.write(k3, v4);
    }
}
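
Because this reduce is a plain sum (commutative and associative), the same class can optionally double as a combiner to pre-aggregate on the map side and shrink shuffle traffic. This is an optional addition, not part of the original job setup:

java
// optional, in Main.java next to setReducerClass:
job.setCombinerClass(org.wunaiieq.hbase2hdfs.Reduce.class);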

Map.java

java
package org.wunaiieq.hbase2hdfs;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.io.IOException;

// TableMapper fixes the input types (K1 = ImmutableBytesWritable rowkey, V1 = Result row);
// the generic parameters below are the output types K2 and V2
public class Map extends TableMapper<Text, IntWritable> {
    private Text k2 = new Text();
    private IntWritable v2 = new IntWritable(1);

    @Override
    protected void map(ImmutableBytesWritable k1, Result v1, Context context) throws IOException, InterruptedException {
        // debug: the rowkeys were written as ints, so decode them the same way
        System.out.println("k1: " + Bytes.toInt(k1.get(), k1.getOffset()));
        // read the colf:line cell of the current row
        byte[] data = v1.getValue(Bytes.toBytes("colf"), Bytes.toBytes("line"));
        if (data == null) {
            return; // skip rows that lack the expected cell
        }
        String line = Bytes.toString(data);
        // split on runs of whitespace so repeated spaces do not produce empty words
        String[] words = line.split("\\s+");
        for (String word : words) {
            k2.set(word);
            context.write(k2, v2);
        }
    }
}

Running the Job

Package the project and upload the JAR to the Linux system, then run:
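
(With the assembly plugin configured in pom.xml above, running mvn clean package produces target/hbase2hdfs-1.0-SNAPSHOT-jar-with-dependencies.jar, which is the JAR used below.)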

bash
hadoop jar hbase2hdfs-1.0-SNAPSHOT-jar-with-dependencies.jar /output/test
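
The single argument /output/test is the HDFS output directory (args[0] in Main); it must not already exist, or FileOutputFormat will fail the job. Also note that initTableMapperJob was called with addDependencyJars=false, which only works here because the fat JAR bundles the HBase classes; with a thin JAR you would pass true instead.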

Check the output file

bash
hdfs dfs -cat /output/test/part-r-00000

Summary

Nothing special to call out; it is enough to record these two cases. To reuse the pattern, you only need to swap in the matching Mapper and Reducer in the MR program.
