259. Java: Consume Kafka Data, Parse It into Individual Records, and Write Them to Another Kafka Topic (Regular JSON)

I. Purpose

Some of the data types arrive at a frequency of 1 s, so the data volume is very large. Parsing the complete JSON in Hive, especially in a single-node environment, is therefore far too slow to meet the business requirements.

Flume interceptors are not well suited to this kind of transformation either, so the only practical option is Java: consume the data from Kafka topic A, parse out the individual fields, and write the flattened records to Kafka topic B.

II. Raw Data Format

The JSON format is fairly regular: an object containing an array.

```json
{
    "deviceNo": "39",
    "sourceDeviceType": null,
    "sn": null,
    "model": null,
    "createTime": "2024-09-03 14:10:00",
    "data": {
        "cycle": 300,
        "evaluationList": [{
            "laneNo": 1,
            "laneType": null,
            "volume": 3,
            "queueLenMax": 11.43,
            "sampleNum": 0,
            "stopAvg": 0.54,
            "delayAvg": 0.0,
            "passRate": 0.0,
            "travelDist": 140.0,
            "travelTimeAvg": 0.0
        },
        {
            "laneNo": 2,
            "laneType": null,
            "volume": 7,
            "queueLenMax": 23.18,
            "sampleNum": 0,
            "stopAvg": 0.47,
            "delayAvg": 10.57,
            "passRate": 0.0,
            "travelDist": 140.0,
            "travelTimeAvg": 0.0
        },
        {
            "laneNo": 3,
            "laneType": null,
            "volume": 9,
            "queueLenMax": 11.54,
            "sampleNum": 0,
            "stopAvg": 0.18,
            "delayAvg": 9.67,
            "passRate": 0.0,
            "travelDist": 140.0,
            "travelTimeAvg": 0.0
        },
        {
            "laneNo": 4,
            "laneType": null,
            "volume": 6,
            "queueLenMax": 11.36,
            "sampleNum": 0,
            "stopAvg": 0.27,
            "delayAvg": 6.83,
            "passRate": 0.0,
            "travelDist": 140.0,
            "travelTimeAvg": 0.0
        }]
    }
}
```
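Note that several fields in this payload are explicit JSON nulls (sourceDeviceType, sn, model, laneType). The code in the next section reads every field with Jackson's `JsonNode.asText()`, which turns an explicit JSON null into the string "null" instead of throwing; only a key that is missing entirely would make `get()` return Java null and cause a NullPointerException. A small standalone sketch of that behaviour (not part of the original post):

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class JsonNullDemo {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        JsonNode node = mapper.readTree("{\"sn\": null, \"model\": \"X1\"}");

        System.out.println(node.get("sn").asText());        // "null" -> explicit JSON null becomes the text "null"
        System.out.println(node.get("model").asText());     // "X1"
        System.out.println(node.path("missing").asText());  // ""     -> absent key, path() returns a MissingNode
        // node.get("missing") would return Java null, so chaining .asText() on it would throw a NullPointerException
    }
}
```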

III. Java Code

```java
package com.kgc;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

public class KafkaKafkaEvaluation {
    // Kafka producer configuration
    private static Properties producerProps() {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.0.70:9092");
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        props.put(ProducerConfig.ACKS_CONFIG, "-1");
        props.put(ProducerConfig.RETRIES_CONFIG, "3");
        props.put(ProducerConfig.BATCH_SIZE_CONFIG, "16384");
        props.put(ProducerConfig.LINGER_MS_CONFIG, "1");
        props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, "33554432");
        return props;
    }

    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.0.70:9092");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        prop.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000");
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        // Each consumer application should use its own group ID
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, "evaluation_group");

        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
        consumer.subscribe(Collections.singleton("topic_internal_data_evaluation"));
        ObjectMapper mapper = new ObjectMapper();

        // Initialize the Kafka producer
        KafkaProducer<String, String> producer = new KafkaProducer<>(producerProps());

        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(1000));
            for (ConsumerRecord<String, String> record : records) {
                try {
                    JsonNode rootNode = mapper.readTree(record.value());

                    System.out.println("原始数据"+rootNode);

                    String device_no = rootNode.get("deviceNo").asText();
                    String source_device_type = rootNode.get("sourceDeviceType").asText();
                    String sn = rootNode.get("sn").asText();
                    String model = rootNode.get("model").asText();
                    String create_time = rootNode.get("createTime").asText();
                    String cycle = rootNode.get("data").get("cycle").asText();

                    JsonNode evaluationList = rootNode.get("data").get("evaluationList");
                    for (JsonNode evaluationItem : evaluationList) {
                        String lane_no = evaluationItem.get("laneNo").asText();
                        String lane_type = evaluationItem.get("laneType").asText();
                        String volume = evaluationItem.get("volume").asText();
                        String queue_len_max = evaluationItem.get("queueLenMax").asText();
                        String sample_num = evaluationItem.get("sampleNum").asText();
                        String stop_avg = evaluationItem.get("stopAvg").asText();
                        String delay_avg = evaluationItem.get("delayAvg").asText();
                        String pass_rate = evaluationItem.get("passRate").asText();
                        String travel_dist = evaluationItem.get("travelDist").asText();
                        String travel_time_avg = evaluationItem.get("travelTimeAvg").asText();

                        String outputLine = String.format("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s",
                                device_no, source_device_type, sn, model, create_time, cycle,lane_no, lane_type,
                                volume,queue_len_max,sample_num,stop_avg,delay_avg,pass_rate,travel_dist,travel_time_avg);

                        // Send the flattened record to the target Kafka topic
                        ProducerRecord<String, String> producerRecord = new ProducerRecord<>("topic_db_data_evaluation", record.key(), outputLine);
                        producer.send(producerRecord, (RecordMetadata metadata, Exception e) -> {
                            if (e != null) {
                                e.printStackTrace();
                            } else {
                                System.out.println("The offset of the record we just sent is: " + metadata.offset());
                            }
                        });
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            consumer.commitAsync();
        }
    }

}
```
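The listing commits offsets asynchronously right after each poll and never closes the clients, which is acceptable for a long-running job. If you want buffered sends flushed before their offsets are committed, and both clients closed cleanly on shutdown, the sketch below shows one way to rework the tail of the loop. It reuses the `consumer` and `producer` variables from the listing above; the structure and names are my own suggestion, not part of the original code.

```java
// A minimal hardening sketch (assumes the consumer/producer set up in the listing above).
final Thread mainThread = Thread.currentThread();
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
    consumer.wakeup();   // makes the blocking poll() throw WakeupException in the main thread
    try { mainThread.join(); } catch (InterruptedException ignored) { }
}));

try {
    while (true) {
        ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(1000));
        // ... parse each record and call producer.send(...) exactly as in the listing above ...
        producer.flush();        // wait until all buffered sends are acknowledged
        consumer.commitSync();   // only then commit the consumed offsets
    }
} catch (org.apache.kafka.common.errors.WakeupException e) {
    // expected during shutdown, nothing to do
} finally {
    producer.close();
    consumer.close();
}
```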

1. The server IP is 192.168.0.70 throughout.

2. Kafka topic consumed (source): topic_internal_data_evaluation

3. Kafka topic produced to (target): topic_db_data_evaluation

4. Note: the field order must match the column order of the ODS-layer table!
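
For example, feeding the sample payload from section II through the code above should produce four lines on topic_db_data_evaluation, one per lane; the line for lane 1 would look like this (reconstructed from the sample data, not captured from a live run):

```
39,null,null,null,2024-09-03 14:10:00,300,1,null,3,11.43,0,0.54,0.0,0.0,140.0,0.0
```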

IV. Start a Consumer for the Kafka Topic topic_db_data_evaluation

```bash
[root@localhost bin]# ./kafka-console-consumer.sh --bootstrap-server 192.168.0.70:9092 --topic topic_db_data_evaluation --from-beginning
```

## V. Running the Test

### 1. Start the project

![](https://i-blog.csdnimg.cn/direct/c87f6243d8514eadad2a7e31088c5274.png)

### 2. Consumer output

![](https://i-blog.csdnimg.cn/direct/a0bf54da8efd44f6b2a117f30be26afa.png)

After that, the data only needs to be collected by Flume and written to HDFS; the ODS-layer table schema has to be adjusted accordingly.

## VI. New ODS-Layer Table Schema

```sql
create external table if not exists hurys_dc_ods.ods_evaluation(
    device_no          string    COMMENT 'device number',
    source_device_type string    COMMENT 'device type',
    sn                 string    COMMENT 'device serial number',
    model              string    COMMENT 'device model',
    create_time        timestamp COMMENT 'creation time',
    cycle              int       COMMENT 'evaluation data cycle',
    lane_no            int       COMMENT 'lane number',
    lane_type          int       COMMENT 'lane type 0: channelized 1: approach 2: exit 3: departure 4: left-turn waiting area 5: through waiting area 6: dedicated right-turn lane 99: undefined lane',
    volume             int       COMMENT 'volume crossing the stop line in the lane (vehicles)',
    queue_len_max      float     COMMENT 'maximum queue length in the lane (m)',
    sample_num         int       COMMENT 'sample size used to compute the evaluation data',
    stop_avg           float     COMMENT 'average number of stops in the lane',
    delay_avg          float     COMMENT 'average delay in the lane (s)',
    pass_rate          float     COMMENT 'first-pass rate in the lane',
    travel_dist        float     COMMENT 'detected travel distance in the lane (m)',
    travel_time_avg    float     COMMENT 'average travel time in the lane'
)
comment 'evaluation data external table, static partitioning'
partitioned by (day string)
row format delimited fields terminated by ','
stored as SequenceFile;
```

## VII. Flume Collection Configuration File

![](https://i-blog.csdnimg.cn/direct/c7ba042aa26c4e23979fe7d5a9e90b2a.png)

(A sketch of such an agent configuration appears at the end of this post.)

## VIII. Run the Flume Job and Check the HDFS Files and ODS Table Data

```sql
-- refresh the table partitions
msck repair table ods_evaluation;
-- view the table partitions
show partitions hurys_dc_ods.ods_evaluation;
-- view the table data
select * from hurys_dc_ods.ods_evaluation where day='2024-09-03';
```

Done! This way there is no longer any need to parse the JSON in Hive.
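Since the Flume configuration in section VII only survives as a screenshot, here is a minimal sketch of what a Kafka-source to HDFS-sink agent for this pipeline could look like. The agent and component names, channel sizing, roll settings, and the HDFS path are placeholders of mine, not values taken from the original configuration:

```properties
# Hypothetical agent layout: Kafka source -> memory channel -> HDFS sink
a1.sources  = r1
a1.channels = c1
a1.sinks    = k1

a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers = 192.168.0.70:9092
a1.sources.r1.kafka.topics = topic_db_data_evaluation
a1.sources.r1.kafka.consumer.group.id = flume_evaluation_group
a1.sources.r1.batchSize = 1000
a1.sources.r1.batchDurationMillis = 2000
a1.sources.r1.channels = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 1000

a1.sinks.k1.type = hdfs
# placeholder path; must point at the ods_evaluation table location with a day= partition directory
a1.sinks.k1.hdfs.path = hdfs://namenode:8020/warehouse/hurys_dc_ods.db/ods_evaluation/day=%Y-%m-%d
a1.sinks.k1.hdfs.fileType = SequenceFile
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.rollInterval = 300
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.channel = c1
```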
