尚硅谷大数据项目【电商数仓6.0】-Datax-6

安装

sh 复制代码
tar -zxvf datax.tar.gz -C /opt/module/

测试

sh 复制代码
python /opt/module/datax/bin/datax.py /opt/module/datax/job/job.json

MySQLToHDFS

根据官方文档编写配置文件 /opt/module/datax/job/base_province.json

json 复制代码
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "column": [
                            "id",
                            "name",
                            "region_id",
                            "area_code",
                            "iso_code",
                            "iso_3166_2",
                            "create_time",
                            "operate_time"
                        ],
                        "where": "id>=3",
                        "connection": [
                            {
                                "jdbcUrl": [
                                    "jdbc:mysql://hadoop102:3306/gmall?useUnicode=true&allowPublicKeyRetrieval=true&characterEncoding=utf-8"
                                ],
                                "table": [
                                    "base_province"
                                ]
                            }
                        ],
                        "password": "000000",
                        "splitPk": "",
                        "username": "root"
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "column": [
                            {
                                "name": "id",
                                "type": "bigint"
                            },
                            {
                                "name": "name",
                                "type": "string"
                            },
                            {
                                "name": "region_id",
                                "type": "string"
                            },
                            {
                                "name": "area_code",
                                "type": "string"
                            },
                            {
                                "name": "iso_code",
                                "type": "string"
                            },
                            {
                                "name": "iso_3166_2",
                                "type": "string"
                            },
                            {
                                "name": "create_time",
                                "type": "string"
                            },
                            {
                                "name": "operate_time",
                                "type": "string"
                            }
                        ],
                        "compress": "gzip",
                        "defaultFS": "hdfs://hadoop102:8020",
                        "fieldDelimiter": "\t",
                        "fileName": "base_province",
                        "fileType": "text",
                        "path": "/base_province",
                        "writeMode": "append"
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}

确保 HDFS 目标路径存在（hdfswriter 不会自动创建目录，路径不存在时任务会报错）

sh 复制代码
hadoop fs -mkdir /base_province
sh 复制代码
cd /opt/module/datax
python bin/datax.py job/base_province.json 
sh 复制代码
hadoop fs -cat /base_province/* | zcat

使用 querySql（用自定义 SQL 代替 column、table、where 配置）

json 复制代码
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "connection": [
                            {
                                "jdbcUrl": [
                                    "jdbc:mysql://hadoop102:3306/gmall?useUnicode=true&allowPublicKeyRetrieval=true&characterEncoding=utf-8"
                                ],
                                "querySql": [
                                    "select id,name,region_id,area_code,iso_code,iso_3166_2,create_time,operate_time from base_province where id>=3"
                                ]
                            }
                        ],
                        "password": "000000",
                        "username": "root"
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "column": [
                            {
                                "name": "id",
                                "type": "bigint"
                            },
                            {
                                "name": "name",
                                "type": "string"
                            },
                            {
                                "name": "region_id",
                                "type": "string"
                            },
                            {
                                "name": "area_code",
                                "type": "string"
                            },
                            {
                                "name": "iso_code",
                                "type": "string"
                            },
                            {
                                "name": "iso_3166_2",
                                "type": "string"
                            },
                            {
                                "name": "create_time",
                                "type": "string"
                            },
                            {
                                "name": "operate_time",
                                "type": "string"
                            }
                        ],
                        "compress": "gzip",
                        "defaultFS": "hdfs://hadoop102:8020",
                        "fieldDelimiter": "\t",
                        "fileName": "base_province",
                        "fileType": "text",
                        "path": "/base_province",
                        "writeMode": "append"
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}

传时间参数（命令行用 -p"-Ddt=日期" 传入，JSON 中用 ${dt} 引用；运行前需先创建对应目录：hadoop fs -mkdir /base_province/2022-06-08）

sh 复制代码
python bin/datax.py -p"-Ddt=2022-06-08" job/base_province.json
json 复制代码
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "connection": [
                            {
                                "jdbcUrl": [
                                    "jdbc:mysql://hadoop102:3306/gmall?useUnicode=true&allowPublicKeyRetrieval=true&characterEncoding=utf-8"
                                ],
                                "querySql": [
                                    "select id,name,region_id,area_code,iso_code,iso_3166_2,create_time,operate_time from base_province where id>=3"
                                ]
                            }
                        ],
                        "password": "000000",
                        "username": "root"
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "column": [
                            {
                                "name": "id",
                                "type": "bigint"
                            },
                            {
                                "name": "name",
                                "type": "string"
                            },
                            {
                                "name": "region_id",
                                "type": "string"
                            },
                            {
                                "name": "area_code",
                                "type": "string"
                            },
                            {
                                "name": "iso_code",
                                "type": "string"
                            },
                            {
                                "name": "iso_3166_2",
                                "type": "string"
                            },
                            {
                                "name": "create_time",
                                "type": "string"
                            },
                            {
                                "name": "operate_time",
                                "type": "string"
                            }
                        ],
                        "compress": "gzip",
                        "defaultFS": "hdfs://hadoop102:8020",
                        "fieldDelimiter": "\t",
                        "fileName": "base_province",
                        "fileType": "text",
                        "path": "/base_province/${dt}",
                        "writeMode": "append"
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}

HDFSToMySQL

json 复制代码
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "hdfsreader",
                    "parameter": {
                        "defaultFS": "hdfs://hadoop102:8020",
                        "path": "/base_province",
                        "column": [
                            "*"
                        ],
                        "fileType": "text",
                        "compress": "gzip",
                        "encoding": "UTF-8",
                        "nullFormat": "\\N",
                        "fieldDelimiter": "\t"
                    }
                },
                "writer": {
                    "name": "mysqlwriter",
                    "parameter": {
                        "username": "root",
                        "password": "000000",
                        "connection": [
                            {
                                "table": [
                                    "test_province"
                                ],
                                "jdbcUrl": "jdbc:mysql://hadoop102:3306/gmall?useUnicode=true&allowPublicKeyRetrieval=true&characterEncoding=utf-8"
                            }
                        ],
                        "column": [
                            "id",
                            "name",
                            "region_id",
                            "area_code",
                            "iso_code",
                            "iso_3166_2",
                            "create_time",
                            "operate_time"
                        ],
                        "writeMode": "replace"
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}

在MySQL中创建gmall.test_province表

sql 复制代码
DROP TABLE
IF
	EXISTS `test_province`;
CREATE TABLE `test_province` (
	`id` BIGINT ( 20 ) NOT NULL,
	`name` VARCHAR ( 20 ) CHARACTER 
	SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
	`region_id` VARCHAR ( 20 ) CHARACTER 
	SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
	`area_code` VARCHAR ( 20 ) CHARACTER 
	SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
	`iso_code` VARCHAR ( 20 ) CHARACTER 
	SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
	`iso_3166_2` VARCHAR ( 20 ) CHARACTER 
	SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
	`create_time` VARCHAR ( 20 ) CHARACTER 
	SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
	`operate_time` VARCHAR ( 20 ) CHARACTER 
	SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
	PRIMARY KEY ( `id` ) 
) ENGINE = INNODB CHARACTER 
SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
sh 复制代码
python bin/datax.py job/test_province.json 

DataX配置生成器

类似代码生成器，用于自动生成 DataX 作业对应的 JSON 配置文件

相关推荐
bigdata-rookie1 天前
Spark 部署模式
大数据·分布式·spark
芝麻开门-新起点1 天前
贝壳GIS数据存储与房屋3D展示技术解析
大数据
玖日大大1 天前
Gemini 3 全维度技术解析:从认知到落地实战指南
大数据
little_xianzhong1 天前
把一个本地项目导入gitee创建的仓库中
大数据·elasticsearch·gitee
金融小师妹1 天前
基于机器学习框架的上周行情复盘:非农数据与美联储政策信号的AI驱动解析
大数据·人工智能·深度学习·1024程序员节
Leo.yuan1 天前
2小时,我搭了一套物流分析看板
大数据·人工智能·金融·企业数字化·现金流
sheji34161 天前
【开题答辩全过程】以 基于Spark的药品库存可视化分析系统为例,包含答辩的问题和答案
大数据·分布式·spark
larance1 天前
spark-submit 常用方式
大数据·spark
Ace_31750887761 天前
微店商品详情接口深度挖掘:从多接口联动到数据全息重构
大数据·python·重构