Integration structure
The overall structure is close to Case 1, except that HBase moves to the front of the pipeline as the data source, followed by the MR program.
As a result, the K1 and V1 the mapper receives change accordingly; see the sketch right after this paragraph.
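For reference, a text-input job hands the mapper a byte offset and a line of text as K1/V1, while a TableMapper gets the HBase rowkey and the whole row. A minimal sketch of the two signatures (the class names here are illustrative, not from the original project):
java
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

//text input: K1 = byte offset of the line, V1 = the line itself
class FileWordMapper extends Mapper<LongWritable, Text, Text, IntWritable> { }

//HBase input: K1 = rowkey, V1 = the whole row; TableMapper<K2, V2> is simply
//Mapper<ImmutableBytesWritable, Result, K2, V2> with K1/V1 fixed by HBase
class TableWordMapper extends TableMapper<Text, IntWritable> { }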
Preparation
- Create the table in HBase and write data into it
bash
create "wunaiieq:sentence","colf"
- Load the contents of a local file into the table
datain3.java
java
package org.wunaiieq.hbase2hdfs;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.wunaiieq.HBaseConnection;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
public class datain3 {
    //reuse the shared connection held by the helper class
    public static Connection connection = HBaseConnection.connection;
    public static void main(String[] args) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(
                new FileReader("/opt/module/jar/data.txt")
        );
        String line = null;
        Table table = connection.getTable(TableName.valueOf("wunaiieq", "sentence"));
        //use an increasing integer as the rowkey, one row per line of the file
        int rowkey = 1;
        while ((line = bufferedReader.readLine()) != null) {
            Put put = new Put(Bytes.toBytes(rowkey));
            //write the whole line into colf:line
            put.addColumn(Bytes.toBytes("colf"), Bytes.toBytes("line"), Bytes.toBytes(line));
            table.put(put);
            rowkey++;
        }
        //release resources
        table.close();
        bufferedReader.close();
    }
}
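The HBaseConnection helper class imported above is not shown in this post; a minimal sketch of what it is assumed to look like (a single shared Connection built from the hbase-site.xml on the classpath):
java
package org.wunaiieq;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import java.io.IOException;

public class HBaseConnection {
    //one process-wide connection, created from hbase-site.xml on the classpath
    public static Connection connection;
    static {
        try {
            Configuration conf = HBaseConfiguration.create();
            connection = ConnectionFactory.createConnection(conf);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}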
Data download (HBase to HDFS)
pom.xml
xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.hbase</groupId>
<artifactId>hbase2hdfs</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.version>3.1.3</hadoop.version>
<hbase.version>2.2.3</hbase.version>
</properties>
<dependencies>
<!-- Hadoop Dependencies -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-api</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-streaming</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- HBase Dependencies -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-mapreduce</artifactId>
<version>${hbase.version}</version>
</dependency>
<!-- Other Dependencies -->
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>3.19.1</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
<scope>compile</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<!-- plugin declaration -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<!-- plugin configuration -->
<configuration>
<archive>
<manifest>
<!-- entry point (main class) of the executable jar -->
<mainClass>org.wunaiieq.hbase2hdfs.Main</mainClass>
</manifest>
</archive>
<descriptorRefs>
<!-- predefined descriptor: build an executable JAR that bundles all project dependencies;
custom descriptors can be used to control what goes into the jar -->
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<!-- execution configuration -->
<executions>
<execution>
<!-- execution id, can be changed -->
<id>make-assembly</id>
<!-- lifecycle phase this execution is bound to -->
<phase>package</phase>
<goals>
<!-- goal to run; "single" builds the assembly -->
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
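With the assembly plugin bound to the package phase, the fat jar comes out of a regular Maven build; the jar name follows from the artifactId and version declared above:
bash
mvn clean package
# produces target/hbase2hdfs-1.0-SNAPSHOT-jar-with-dependencies.jar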
Main.java
java
package org.wunaiieq.hbase2hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Main {
public static void main(String[] args) throws Exception {
        //configuration files (e.g. hbase-site.xml) are placed under the resources directory
        Job job = Job.getInstance(new Configuration());
        //entry class of the jar
        job.setJarByClass(Main.class);
        Scan scan = new Scan();
        TableMapReduceUtil.initTableMapperJob(
                "wunaiieq:sentence",                //table name
                scan,                               //the scan can narrow the input here: column families, columns, row filters, etc. (see the sketch after this class)
                org.wunaiieq.hbase2hdfs.Map.class,  //mapper class
                Text.class,                         //K2
                IntWritable.class,                  //V2
                job,
                false
        );
        job.setOutputKeyClass(Text.class);          //K3
        job.setOutputValueClass(IntWritable.class); //V3
        job.setReducerClass(org.wunaiieq.hbase2hdfs.Reduce.class);
        //the output path is supplied on the command line
        FileOutputFormat.setOutputPath(job, new Path(args[0]));
job.waitForCompletion(true);
}
}
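As noted in the initTableMapperJob call, the Scan object is where the HBase input can be narrowed before the job starts. A small illustrative sketch; the settings below are assumptions for this example, not part of the original job:
java
//extra import needed in Main.java for this snippet:
//import org.apache.hadoop.hbase.util.Bytes;
Scan scan = new Scan();
scan.addColumn(Bytes.toBytes("colf"), Bytes.toBytes("line")); //only read colf:line
scan.setCaching(500);        //rows fetched per RPC from the region server
scan.setCacheBlocks(false);  //usually disabled for MR scans to avoid polluting the block cache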
Reduce.java
java
package org.wunaiieq.hbase2hdfs;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
// K3 V3 K4 V4
public class Reduce extends Reducer<Text,IntWritable,Text,IntWritable>{
private IntWritable v4 =new IntWritable();
private Text k4 =new Text();
@Override
    protected void reduce(Text k3, Iterable<IntWritable> v3, Context context) throws IOException, InterruptedException {
        //sum up all the 1s emitted for this word
        int sum = 0;
        for (IntWritable v30 : v3) {
            sum += v30.get();
        }
        v4.set(sum);
        k4 = k3;
        context.write(k4, v4);
}
}
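Since this reduce logic is a plain associative sum, the same class could also be registered as a combiner to shrink the shuffle; this is an optional tweak, not something the original job does:
java
//optional, would go in Main.java before waitForCompletion
job.setCombinerClass(org.wunaiieq.hbase2hdfs.Reduce.class);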
Map.java
java
package org.wunaiieq.hbase2hdfs;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import java.io.IOException;
//K1 = ImmutableBytesWritable (rowkey), V1 = Result (the row); the generics below are K2, V2
public class Map extends TableMapper<Text,IntWritable> {
private Text k2=new Text();
private IntWritable v2 =new IntWritable(1);
@Override
    protected void map(ImmutableBytesWritable k1, Result v1, Context context) throws IOException, InterruptedException {
        System.out.println("k1:" + k1.toString());
        //read the colf:line cell of the current row
        byte[] data = v1.getValue(Bytes.toBytes("colf"), Bytes.toBytes("line"));
        //skip rows that do not have this cell
        if (data == null) {
            return;
        }
        String line = Bytes.toString(data);
        String[] words = line.split(" ");
        for (String word : words) {
            k2.set(word);
            context.write(k2, v2);
        }
    }
}
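The k1 parameter is the rowkey. Since datain3 wrote the rowkeys as 4-byte integers, printing it with toString() only shows raw bytes; if the numeric rowkey is ever needed inside map(), it could be decoded like this (illustrative, not required by the job):
java
//decode the 4-byte integer rowkey written by datain3
int row = Bytes.toInt(k1.copyBytes());
System.out.println("rowkey: " + row);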
Running the job
Package the project and upload the jar to the Linux machine, then submit it:
bash
hadoop jar hbase2hdfs-1.0-SNAPSHOT-jar-with-dependencies.jar /output/test
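FileOutputFormat refuses to run if the output directory already exists, so when rerunning the job the old directory has to be removed first (same path as in the example above):
bash
hdfs dfs -rm -r /output/test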
Check the output files
bash
hdfs dfs -cat /output/test/part-r-00000
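To see which part files the job actually produced before cat-ing one of them:
bash
hdfs dfs -ls /output/test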
Summary
Nothing special here; keeping these two cases on record is enough. The only change needed is to swap in the matching Mapper and Reducer in the MR program.