Data integration platform: using DataX to sync Hive data to MySQL (all columns and specified columns)

1. Python script

Input parameters:

target_database: target MySQL database

target_table: target MySQL table

target_columns: columns to export (MySQL column names, or all for every column)

target_positions: 0-based positions of the corresponding columns in the Hive table
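Assuming the script below is saved as ~/bin/test.py (as in the shell script in section 2), it might be invoked as in the following sketch; the database, table, and column names are placeholders. `-c all` exports every column of the MySQL table, while a comma-separated column list must be paired with the matching Hive column positions passed through `-p`.

```bash
# Hypothetical invocations; database, table, and column names are placeholders.
# Export every column:
python ~/bin/test.py -d test_db -t test_table -c all
# Export only id, name, and amount, located at Hive positions 0, 1 and 3:
python ~/bin/test.py -d test_db -t test_table -c id,name,amount -p 0,1,3
```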

```python
# coding=utf-8
import json
import getopt
import os
import sys
import MySQLdb

# MySQL connection settings; adjust to match your environment
mysql_host = "hadoop102"
mysql_port = "3306"
mysql_user = "root"
mysql_passwd = "xx"

# HDFS NameNode settings; adjust to match your environment
hdfs_nn_host = "mycluster"
hdfs_nn_port = "8020"


def get_connection():
    return MySQLdb.connect(host=mysql_host, port=int(mysql_port), user=mysql_user, passwd=mysql_passwd)


def get_mysql_meta(database, table, columns):
    connection = get_connection()
    cursor = connection.cursor()
    if columns == 'all':
        # 'all' means every column of the table is wanted
        sql = "SELECT COLUMN_NAME, DATA_TYPE FROM information_schema.COLUMNS WHERE TABLE_SCHEMA='%s' AND TABLE_NAME='%s' ORDER BY ORDINAL_POSITION" % (database, table)
    else:
        # A specific column list was passed in;
        # wrap each column name in single quotes for the IN clause
        columns = ', '.join("'%s'" % col.strip() for col in columns.split(','))
        sql = "SELECT COLUMN_NAME, DATA_TYPE FROM information_schema.COLUMNS WHERE TABLE_SCHEMA='%s' AND TABLE_NAME='%s' AND COLUMN_NAME IN (%s) ORDER BY ORDINAL_POSITION" % (
            database, table, columns)
    cursor.execute(sql)
    fetchall = cursor.fetchall()
    # print(fetchall)
    cursor.close()
    connection.close()
    return fetchall


def get_mysql_columns(database, table, target_columns):
    # Wrap in list() so the result stays JSON serializable under Python 3 as well
    return list(map(lambda x: x[0], get_mysql_meta(database, table, target_columns)))


def get_hive_columns(database, table, target_columns, target_positions):
    def type_mapping(mysql_type):
        mappings = {
            "bigint": "bigint",
            "int": "bigint",
            "smallint": "bigint",
            "tinyint": "bigint",
            "mediumint": "bigint",
            "decimal": "string",
            "double": "double",
            "float": "float",
            "binary": "string",
            "char": "string",
            "varchar": "string",
            "datetime": "string",
            "time": "string",
            "timestamp": "string",
            "date": "string",
            "text": "string",
            "bit": "string",
        }
        return mappings[mysql_type]

    meta = get_mysql_meta(database, table, target_columns)

    if target_columns == 'all':
        # All columns: describe each Hive column by name and mapped type
        return list(map(lambda x: {"name": x[0], "type": type_mapping(x[1].lower())}, meta))
    else:
        # Specified columns: address each Hive column by its 0-based position in the file
        positions = list(map(int, target_positions.split(',')))
        return [{"index": positions[i], "type": type_mapping(x[1].lower())}
                for i, x in enumerate(meta)]


def generate_json(target_database, target_table, target_columns, target_positions):
    print(get_hive_columns(target_database, target_table, target_columns, target_positions))
    if target_columns == 'all':
        # hdfsreader reads every column when the column list is ["*"]
        target_columns_hive = ["*"]
    else:
        target_columns_hive = get_hive_columns(target_database, target_table, target_columns, target_positions)
    job = {
        "job": {
            "setting": {
                "speed": {
                    "channel": 15
                },
                "errorLimit": {
                    "record": 0,
                    "percentage": 0.02
                }
            },
            "content": [{
                "reader": {
                    "name": "hdfsreader",
                    "batchSize": "8192",
                    "batchByteSize": "33554432",
                    "parameter": {
                        "path": "${exportdir}",
                        "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port,
                        "column": target_columns_hive,
                        "fileType": "orc",
                        "encoding": "UTF-8",
                        "fieldDelimiter": u"\u0001",
                        "nullFormat": "\\N"
                    }
                },
                "writer": {
                    "name": "mysqlwriter",
                    "batchSize": "8192",
                    "batchByteSize": "33554432",
                    "parameter": {
                        "writeMode": "replace",
                        "username": mysql_user,
                        "password": mysql_passwd,
                        "column": get_mysql_columns(target_database, target_table, target_columns),
                        "connection": [
                            {
                                "jdbcUrl":
                                    "jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + target_database + "?useUnicode=true&characterEncoding=utf-8&useSSL=false",
                                "table": [target_table]
                            }
                        ]
                    }
                }
            }]
        }
    }

    output_path = "/opt/module/datax/job/export/" + target_database
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    with open(os.path.join(output_path, ".".join([target_database, target_table, "json"])), "w") as f:
        json.dump(job, f)


def main(args):
    target_database = ""
    target_table = ""
    target_columns = ""  # empty by default, i.e. no column list was specified
    target_positions = ""

    options, arguments = getopt.getopt(args, 'p:d:t:c:', ['positions=', 'targetdb=', 'targettbl=', 'columns='])
    for opt_name, opt_value in options:
        if opt_name in ('-d', '--targetdb'):
            target_database = opt_value
        if opt_name in ('-t', '--targettbl'):
            target_table = opt_value
        if opt_name in ('-c', '--columns'):
            target_columns = opt_value
        if opt_name in ('-p', '--positions'):
            target_positions = opt_value
    print(target_database, target_table, target_columns, target_positions)
    generate_json(target_database, target_table, target_columns, target_positions)


if __name__ == '__main__':
    main(sys.argv[1:])
```

2. Shell script

```bash
#!/bin/bash
# Export every column of table `table` in database `db`
python ~/bin/test.py -d db -t table -c all
# For specific columns, pass the column list and the matching Hive positions instead, e.g.:
# python ~/bin/test.py -d db -t table -c kunnr,name1,sort2,addrnumber,country,state -p 0,1,2,3,4,5
```
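The generated job JSON refers to the HDFS directory as ${exportdir}, so the actual path has to be supplied when the job is launched. A minimal sketch, assuming DataX is installed under /opt/module/datax and using a placeholder warehouse path:

```bash
# -p "-Dexportdir=..." substitutes the ${exportdir} variable inside the job JSON.
# Point it at the HDFS directory of the Hive table (or partition) to export; the path below is a placeholder.
python /opt/module/datax/bin/datax.py \
    -p "-Dexportdir=/warehouse/db/tmp/table_export" \
    /opt/module/datax/job/export/db/db.table.json
```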