259. Java: Consume Data from Kafka, Parse It into Individual Records, and Write Them to Another Kafka Topic (Regular JSON)

## I. Purpose

Some of the data types arrive at a 1-second frequency, so the data volume is very large. Storing the complete JSON in Hive and parsing it there, especially on a single-node deployment, is far too slow to meet the business requirements.

A Flume interceptor is not well suited to this kind of transformation either, so the approach here is plain Java: consume the data from Kafka topic A, parse out the fields, and write the flattened records to Kafka topic B.

## II. Raw Data Format

The JSON itself is fairly ordinary: an object containing an array.

```
{
    "deviceNo": "39",
    "sourceDeviceType": null,
    "sn": null,
    "model": null,
    "createTime": "2024-09-03 14:10:00",
    "data": {
        "cycle": 300,
        "evaluationList": [{
            "laneNo": 1,
            "laneType": null,
            "volume": 3,
            "queueLenMax": 11.43,
            "sampleNum": 0,
            "stopAvg": 0.54,
            "delayAvg": 0.0,
            "passRate": 0.0,
            "travelDist": 140.0,
            "travelTimeAvg": 0.0
        }, {
            "laneNo": 2,
            "laneType": null,
            "volume": 7,
            "queueLenMax": 23.18,
            "sampleNum": 0,
            "stopAvg": 0.47,
            "delayAvg": 10.57,
            "passRate": 0.0,
            "travelDist": 140.0,
            "travelTimeAvg": 0.0
        }, {
            "laneNo": 3,
            "laneType": null,
            "volume": 9,
            "queueLenMax": 11.54,
            "sampleNum": 0,
            "stopAvg": 0.18,
            "delayAvg": 9.67,
            "passRate": 0.0,
            "travelDist": 140.0,
            "travelTimeAvg": 0.0
        }, {
            "laneNo": 4,
            "laneType": null,
            "volume": 6,
            "queueLenMax": 11.36,
            "sampleNum": 0,
            "stopAvg": 0.27,
            "delayAvg": 6.83,
            "passRate": 0.0,
            "travelDist": 140.0,
            "travelTimeAvg": 0.0
        }]
    }
}
```

## III. Java Code

```
package com.kgc;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

public class KafkaKafkaEvaluation {
    // Kafka producer configuration
    private static Properties producerProps() {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.0.70:9092");
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        props.put(ProducerConfig.ACKS_CONFIG, "-1");
        props.put(ProducerConfig.RETRIES_CONFIG, "3");
        props.put(ProducerConfig.BATCH_SIZE_CONFIG, "16384");
        props.put(ProducerConfig.LINGER_MS_CONFIG, "1");
        props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, "33554432");
        return props;
    }

    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.0.70:9092");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        prop.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000");
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        // Every consuming application must define its own, distinct group id
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, "evaluation_group");

        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
        consumer.subscribe(Collections.singleton("topic_internal_data_evaluation"));
        ObjectMapper mapper = new ObjectMapper();

        // Initialize the Kafka producer
        KafkaProducer<String, String> producer = new KafkaProducer<>(producerProps());

        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(1000));
            for (ConsumerRecord<String, String> record : records) {
                try {
                    JsonNode rootNode = mapper.readTree(record.value());

                    System.out.println("原始数据"+rootNode);

                    String device_no = rootNode.get("deviceNo").asText();
                    String source_device_type = rootNode.get("sourceDeviceType").asText();
                    String sn = rootNode.get("sn").asText();
                    String model = rootNode.get("model").asText();
                    String create_time = rootNode.get("createTime").asText();
                    String cycle = rootNode.get("data").get("cycle").asText();

                    JsonNode evaluationList = rootNode.get("data").get("evaluationList");
                    for (JsonNode evaluationItem : evaluationList) {
                        String lane_no = evaluationItem.get("laneNo").asText();
                        String lane_type = evaluationItem.get("laneType").asText();
                        String volume = evaluationItem.get("volume").asText();
                        String queue_len_max = evaluationItem.get("queueLenMax").asText();
                        String sample_num = evaluationItem.get("sampleNum").asText();
                        String stop_avg = evaluationItem.get("stopAvg").asText();
                        String delay_avg = evaluationItem.get("delayAvg").asText();
                        String pass_rate = evaluationItem.get("passRate").asText();
                        String travel_dist = evaluationItem.get("travelDist").asText();
                        String travel_time_avg = evaluationItem.get("travelTimeAvg").asText();

                        // Build one CSV line; the column order must match the ODS-layer table schema
                        String outputLine = String.format("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s",
                                device_no, source_device_type, sn, model, create_time, cycle, lane_no, lane_type,
                                volume, queue_len_max, sample_num, stop_avg, delay_avg, pass_rate, travel_dist, travel_time_avg);

                        // Send the flattened record to the target Kafka topic
                        ProducerRecord<String, String> producerRecord = new ProducerRecord<>("topic_db_data_evaluation", record.key(), outputLine);
                        producer.send(producerRecord, (RecordMetadata metadata, Exception e) -> {
                            if (e != null) {
                                e.printStackTrace();
                            } else {
                                System.out.println("The offset of the record we just sent is: " + metadata.offset());
                            }
                        });
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            consumer.commitAsync();
        }
    }

}
```
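The class above needs only the Kafka client and Jackson libraries on the classpath. If the project is built with Maven, a dependency sketch like the following should be enough; the version numbers are illustrative assumptions, so use whatever matches your broker and JDK:

```
<!-- Kafka client used by KafkaConsumer/KafkaProducer above -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>3.0.0</version> <!-- assumed version -->
</dependency>
<!-- Jackson, used for ObjectMapper/JsonNode -->
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.13.0</version> <!-- assumed version -->
</dependency>
```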

A few notes on the code:

1. All servers in this setup use the IP 192.168.0.70.

2. Source Kafka topic (consumed): topic_internal_data_evaluation

3. Target Kafka topic (produced): topic_db_data_evaluation

4. Note: the field order of the output line must match the column order of the ODS-layer table, as shown in the example below.
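As an illustration of point 4, the first evaluationList element of the sample record from Section II should come out roughly as the line below. JSON null values become the literal string "null", because Jackson's NullNode.asText() renders them that way:

```
39,null,null,null,2024-09-03 14:10:00,300,1,null,3,11.43,0,0.54,0.0,0.0,140.0,0.0
```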

## IV. Start a Console Consumer on the Target Topic topic_db_data_evaluation

```
[root@localhost bin]# ./kafka-console-consumer.sh --bootstrap-server 192.168.0.70:9092 --topic topic_db_data_evaluation --from-beginning
```

## V. Run and Test

### 1. Start the Program

![](https://i-blog.csdnimg.cn/direct/c87f6243d8514eadad2a7e31088c5274.png)

### 2. Consumer Output

![](https://i-blog.csdnimg.cn/direct/a0bf54da8efd44f6b2a117f30be26afa.png)

After that, Flume can simply pick up the flattened records and write them to HDFS; the only remaining change is the ODS-layer table structure.

## VI. New ODS-Layer Table Schema

```
create external table if not exists hurys_dc_ods.ods_evaluation(
    device_no          string    COMMENT 'device number',
    source_device_type string    COMMENT 'device type',
    sn                 string    COMMENT 'device serial number',
    model              string    COMMENT 'device model',
    create_time        timestamp COMMENT 'creation time',
    cycle              int       COMMENT 'evaluation data cycle',
    lane_no            int       COMMENT 'lane number',
    lane_type          int       COMMENT 'lane type 0: channelized 1: approach 2: exit 3: departure 4: left-turn waiting area 5: through waiting area 6: dedicated right-turn lane 99: undefined',
    volume             int       COMMENT 'volume crossing the stop line in the lane (vehicles)',
    queue_len_max      float     COMMENT 'maximum queue length in the lane (m)',
    sample_num         int       COMMENT 'sample size used to compute the evaluation data',
    stop_avg           float     COMMENT 'average number of stops in the lane',
    delay_avg          float     COMMENT 'average delay in the lane (s)',
    pass_rate          float     COMMENT 'one-pass rate in the lane',
    travel_dist        float     COMMENT 'detected travel distance in the lane (m)',
    travel_time_avg    float     COMMENT 'average travel time in the lane'
)
comment 'external table for evaluation data, statically partitioned'
partitioned by (day string)
row format delimited fields terminated by ','
stored as SequenceFile;
```

## VII. Flume Collection Configuration File

![](https://i-blog.csdnimg.cn/direct/c7ba042aa26c4e23979fe7d5a9e90b2a.png)

(A sketch of what such a configuration might look like is included at the end of this post.)

## VIII. Run the Flume Job, Then Check the HDFS Files and the ODS Table Data

```
-- refresh the table partitions
msck repair table ods_evaluation;

-- list the table partitions
show partitions hurys_dc_ods.ods_evaluation;

-- query the table data
select * from hurys_dc_ods.ods_evaluation
where day='2024-09-03';
```

Done. With this in place there is no longer any need to parse JSON inside Hive!
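As a reference for Section VII, where the Flume configuration is shown only as a screenshot, below is a minimal, hedged sketch of a Flume agent that reads the flattened records from topic_db_data_evaluation and writes them to the HDFS directory backing the ODS table. The agent and component names, channel sizing, HDFS path, and consumer group id are assumptions for illustration, not the author's actual configuration:

```
# Sketch only: Kafka source -> memory channel -> HDFS sink
a1.sources  = r1
a1.channels = c1
a1.sinks    = k1

a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers = 192.168.0.70:9092
a1.sources.r1.kafka.topics = topic_db_data_evaluation
a1.sources.r1.kafka.consumer.group.id = flume_evaluation_group   # assumed group id
a1.sources.r1.channels = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 1000

a1.sinks.k1.type = hdfs
# assumed warehouse path; it must match the location and day= partition layout of ods_evaluation
a1.sinks.k1.hdfs.path = hdfs://192.168.0.70:8020/user/hive/warehouse/hurys_dc_ods.db/ods_evaluation/day=%Y-%m-%d
a1.sinks.k1.hdfs.fileType = SequenceFile
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.rollInterval = 300
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.channel = c1
```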
