canal_json_to_doris

Syncing with Flink was consuming too many resources, so I wrote a sync program in Python instead: it reads canal-json change events from Kafka and writes them to Doris through Stream Load.
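A minimal sketch of the canal-json events the script expects (the values here are made up; the code only reads `database`, `table`, `type`, `ts`, `data`, and, for updates, `old`, which carries the changed columns of the pre-image rows in the same order as `data`):

```json
{
  "data": [{ "testid": "1", "name": "hello" }],
  "database": "test_canal",
  "table": "testpk",
  "type": "INSERT",
  "ts": 1700000000123,
  "old": null,
  "isDdl": false,
  "pkNames": ["testid"]
}
```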

```python
# -*- coding: utf-8 -*-
import json
import time
import pymysql
import requests
from kafka import KafkaConsumer
import threading
from queue import Queue
from datetime import datetime

## A single topic is used to sync all tables.
## Important: do not declare columns NOT NULL on the target tables!
## nohup python -u canal_doris.py > doris.log 2>&1 &
## -u disables output buffering
## The topic is best configured with a single partition.

##### Configuration start
tjtimesecond = 30  ## stats logging interval, in seconds
## Doris settings
tourl = "192.168.150.25"
toport = 9030
touser = "root"
topassword = ""
todatabase = "ods"
topic = [
    'sync_other_all'
]
setfilename = "canal_doris_set.json"
##### Configuration end
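## Note: besides the values above, the Kafka brokers (below) and the Doris BE
## stream-load address hard-coded in doris_put_append/doris_put_delete
## (http://192.168.150.24:8040) also need to match your environment.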



## Initialization
tj = {}     ## per-table sync statistics
datas = {}  ## per-table buffered rows waiting to be flushed
lasttjtime = 0
queue = Queue()
consumer = KafkaConsumer(
    group_id='ggg',
    enable_auto_commit=False,
    auto_offset_reset='earliest',
    bootstrap_servers=['zoo1:9092', 'zoo2:9092', 'zoo3:9092'],
    max_poll_records=500
)
consumer.subscribe(topics=topic)
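## Note: offsets are never committed (enable_auto_commit=False and no manual
## commit), so on restart the group re-reads the topic from the earliest offset;
## the per-table 'fromtime' filter below bounds how much history gets replayed.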

## Table settings

with open(setfilename, 'r') as file:
    content = file.read()
tableset = json.loads(content)

## End of initialization




def sendweixinerror(msg):
    ## Push an alert message to a WeCom (企业微信) webhook
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=www'
    payload = {"msgtype": "text", "text": {"content": str(msg)}}
    headers = {"Content-Type": "application/json"}
    requests.post(url=url, data=json.dumps(payload), headers=headers)


def execDoris(tourl, toport, touser, topassword, todatabase, sql):
    ## Execute a SQL statement against Doris over the MySQL protocol
    try:
        db = pymysql.connect(host=tourl, port=toport, user=touser, password=topassword, database=todatabase)
        cursor = db.cursor()
        cursor.execute(query=sql)
        res = cursor.fetchall()
        cursor.close()
        db.close()
        return res
    except Exception as e:
        print(e)
    return ""

## Initialize per-table start times and statistics
for k in tableset:
    x = tableset[k]
    if 'tsStartBeforeHoursFromnow' in x:
        time0 = time.time() - x['tsStartBeforeHoursFromnow'] * 3600
        x['fromtime'] = time0 * 1000  ## ms
    else:
        x['fromtime'] = 0
    tj[k] = {
        "count": 0,
        "tsdate": "",
        "partinfo": {},
        "ts": "",
        "sendcount": 0
    }
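
## Example: with "tsStartBeforeHoursFromnow": 24 in the config, 'fromtime'
## becomes the epoch-millisecond timestamp of 24 hours ago, and any binlog
## event older than that is skipped in kafkaProduce().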

def doris_put_delete2(setkey, db, table, data0):
    ## Fallback: delete the rows with a SQL DELETE when Stream Load fails
    wheresql = ""
    for one in data0:
        cond = ""
        for k in one:
            cond += k + "='" + one[k] + "' and "
        wheresql += "(" + cond[:-5] + ") or "
    wheresql = wheresql[:-4]
    sql = " delete from `" + db + "`.`" + table + "` where " + wheresql
    print(sql)
    try:
        rs = execDoris(tourl, toport, touser, topassword, todatabase, sql)
    except Exception as e:
        sendweixinerror(db + table + " sync error 2, updates for this table stopped")
        tableset[setkey]['ifstop'] = True
        tj[setkey]['ifstop'] = True


def doris_put_delete(setkey, db, table, data0):
    ## Delete rows through Doris Stream Load with merge_type=DELETE;
    ## fall back to a SQL DELETE if the load is rejected.
    try:
        response = requests.put(url="http://192.168.150.24:8040/api/" + db + "/" + table + "/_stream_load",
                                data=json.dumps(data0),
                                headers={
                                    "format": "json",
                                    "strip_outer_array": "true",
                                    "Expect": "100-continue",
                                    "merge_type": "DELETE"
                                },
                                auth=('root', ''), allow_redirects=False, verify=False)
        if 'ErrorURL' in response.text:
            print("delete " + db + table)
            sendweixinerror(db + table + " sync delete error 1, retrying via SQL delete")
            print(response.text)
            print(json.dumps(data0))
            doris_put_delete2(setkey, db, table, data0)
    except Exception as e:
        print("delete " + db + table)
        sendweixinerror(db + table + " sync delete error 1, retrying via SQL delete: " + str(e))
        print(json.dumps(data0))
        doris_put_delete2(setkey, db, table, data0)


def doris_put_append(setkey, db, table, data0):
    ## Upsert rows through Doris Stream Load with merge_type=APPEND;
    ## on failure, alert and stop syncing this table.
    try:
        response = requests.put(url="http://192.168.150.24:8040/api/" + db + "/" + table + "/_stream_load",
                                data=json.dumps(data0),
                                headers={
                                    "format": "json",
                                    "strip_outer_array": "true",
                                    "Expect": "100-continue",
                                    "merge_type": "APPEND"
                                },
                                auth=('root', ''), allow_redirects=False, verify=False)
        if 'ErrorURL' in response.text:
            print("append: " + db + table)
            sendweixinerror(db + table + " sync error, updates for this table stopped")
            print(response.text)
            print(json.dumps(data0))
            tableset[setkey]['ifstop'] = True
    except Exception as e:
        print("append: " + db + table)
        sendweixinerror(db + table + " sync error, updates for this table stopped: " + str(e))
        print(json.dumps(data0))
        tableset[setkey]['ifstop'] = True
        tj[setkey]['ifstop'] = True


def doris_put_truncate(db, table):
    print(" truncate table `" + db + "`.`" + table + "` ")
    execDoris(tourl, toport, touser, topassword, todatabase, " truncate table `" + db + "`.`" + table + "` ")


def post2dorislist(dkey, rows):
    ## Flush buffered rows for one table: split them into append/delete batches
    ## and hand the batches to the writer thread via the queue.
    tableset[dkey]['lastsend'] = time.time()
    if tableset[dkey]['ifstop'] == True:
        return
    db = tableset[dkey]['db']
    table = tableset[dkey]['table']
    if len(rows) > 0:
        deletedata = []
        appenddata = []
        partinfo = {}
        ts = 0
        for rkey in rows:
            row = rows[rkey]
            if row['type'] == 'insert' or row['type'] == 'update':
                appenddata.append(changeRowData(dkey, row["data"]))
            else:
                deletedata.append(changeRowData(dkey, row["data"]))
            if row['partition'] not in partinfo:
                partinfo[row['partition']] = {}
            partinfo[row['partition']]['offset'] = row['offset']
            if row['ts'] > ts:
                ts = row['ts']
        if len(deletedata) > 0:
            queue.put({"setkey": dkey, "type": "delete", "db": db, "table": table, "data": deletedata})
            tj[dkey]['count'] += len(deletedata)
            tj[dkey]['sendcount'] += 1
        if len(appenddata) > 0:
            queue.put({"setkey": dkey, "type": "append", "db": db, "table": table, "data": appenddata})
            tj[dkey]['count'] += len(appenddata)
            tj[dkey]['sendcount'] += 1
        if len(partinfo) > 0:
            tj[dkey]['partinfo'].update(partinfo)
        if ts > 0:
            tj[dkey]['ts'] = ts
            tj[dkey]['tsdate'] = str(datetime.fromtimestamp(ts / 1000))
        datas[dkey]['rows'] = {}
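
## Flushing is triggered two ways: kafkaProduce() calls post2dorislist() once a
## table's buffer exceeds its 'maxnum' rows, and again whenever 'maxsecond'
## seconds have passed since the last send, whatever the buffer size.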

def printLog():
    ## Every tjtimesecond seconds: dump per-table stats and hot-reload any
    ## newly added tables from the config file.
    global lasttjtime
    while True:
        time.sleep(1)
        if time.time() - lasttjtime > tjtimesecond:
            try:
                with open(setfilename, 'r') as file:
                    content = file.read()
                tableset0 = json.loads(content)
                ifchange = False
                for x in tableset0:
                    if x not in tableset:
                        tableset[x] = tableset0[x]
                        ifchange = True
                if ifchange == True:
                    print(str(datetime.now()) + ":-----tableset changed")
                    print(str(datetime.now()) + ":" + json.dumps(tableset))
            except Exception as e:
                print(e)
            lasttjtime = time.time()
            print(str(datetime.now()) + ":" + json.dumps(tj))

def getkeyd(dataone, keylist):
    ## Extract the primary-key columns from a row; returns None if any is missing
    keys = {}
    for k in keylist:
        if k in dataone:
            keys[k] =dataone[k]
        else:
            return None
    return keys
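## Example: getkeyd({"testid": "7", "v": "a"}, ["testid"]) returns {"testid": "7"};
## getkeyd({"v": "a"}, ["testid"]) returns None.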

def changeRowData(dkey, rowdata):
    ## Rename columns according to the table's 'difffields' mapping
    try:
        if 'difffields' not in tableset[dkey]:
            return rowdata
        difffields = tableset[dkey]['difffields']
        if len(difffields) == 0:
            return rowdata
        for k in difffields:
            if k in rowdata:
                data = rowdata[k]
                del rowdata[k]
                rowdata[difffields[k]] = data
        return rowdata
    except Exception as e:
        sendweixinerror(dkey + " sync paused")
        print(e)
        print(rowdata)
        return rowdata
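
## Example (hypothetical mapping): with 'difffields' set to {"desc": "description"},
## changeRowData('test_canal.testpk', {"testid": "1", "desc": "x"}) returns
## {"testid": "1", "description": "x"} -- the source column is renamed before
## the row is shipped to Doris.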
## Messages are fetched in batches with consumer.poll()



def kafkaProduce():
    ## Main loop: poll canal-json messages from Kafka, buffer rows per table,
    ## and flush a table once its buffer passes maxnum rows or maxsecond seconds.
    print("start!")
    while True:
        time.sleep(0.001)
        messages = consumer.poll(timeout_ms=500)
        for topic_partition, message_list in messages.items():
            for message in message_list:
                jsonob = json.loads(message.value)
                datakey = jsonob['database'] + '.' + jsonob['table']
                if datakey not in tableset:
                    continue
                fromtime = tableset[datakey]['fromtime']
                if jsonob['ts'] < fromtime:
                    continue
                maxnum = tableset[datakey]['maxnum']
                tabletype0 = tableset[datakey]['tabletype']
                keyf = tableset[datakey]['pknames']
                if tableset[datakey]['ifstop'] == True:  ## skip tables whose sync has been stopped
                    continue
                if datakey not in datas:
                    datas[datakey] = {"lasttime": time.time(), "rows": {}}
                if jsonob['type'] == 'TRUNCATE':
                    db = tableset[datakey]['db']
                    table = tableset[datakey]['table']
                    doris_put_truncate(db, table)
                if jsonob['type'] == 'DELETE':
                    for dataone in jsonob['data']:
                        keys = getkeyd(dataone, keyf)
                        if keys is None:
                            continue
                        keyd = ",".join(keys.values())
                        datas[datakey]['rows'][keyd] = {"type": "delete", "ts": jsonob['ts'], "data": keys, "partition": topic_partition.partition, "offset": message.offset}
                if jsonob['type'] == 'INSERT':
                    for dataone in jsonob['data']:
                        keys = getkeyd(dataone, keyf)
                        if keys is None:
                            continue
                        keyd = ",".join(keys.values())
                        datas[datakey]['rows'][keyd] = {"type": "insert", "ts": jsonob['ts'], "data": dataone, "partition": topic_partition.partition, "offset": message.offset}
                if jsonob['type'] == 'UPDATE':
                    for key, dataone in enumerate(jsonob['data']):
                        keys = getkeyd(dataone, keyf)
                        if keys is None:
                            continue
                        keyd = ",".join(keys.values())
                        olddata = jsonob['old'][key]
                        okeyd = None
                        okeys = getkeyd(olddata, keyf)
                        if okeys is not None:
                            okeyd = ",".join(okeys.values())
                        ## canal's 'old' array only carries changed columns, so okeys is
                        ## None when the primary key did not change. On a unique-key table
                        ## that case is a plain upsert; otherwise delete the old row and
                        ## insert the new one.
                        if tabletype0 == 'unique' and (okeys is None or okeyd == keyd):
                            datas[datakey]['rows'][keyd] = {"type": "update", "ts": jsonob['ts'], "data": dataone, "partition": topic_partition.partition, "offset": message.offset}
                        else:
                            if okeys is not None:
                                datas[datakey]['rows'][okeyd] = {"type": "delete", "ts": jsonob['ts'], "data": okeys, "partition": topic_partition.partition, "offset": message.offset}
                            datas[datakey]['rows'][keyd] = {"type": "insert", "ts": jsonob['ts'], "data": dataone, "partition": topic_partition.partition, "offset": message.offset}
                if len(datas[datakey]['rows']) > maxnum:
                    post2dorislist(datakey, datas[datakey]['rows'])
        for dkey in datas:
            lastsend = tableset[dkey]['lastsend']
            maxsecond = tableset[dkey]['maxsecond']
            if time.time() - lastsend > maxsecond:
                post2dorislist(dkey, datas[dkey]['rows'])

def consumerThread(queue):
    ## Writer thread: drain batches from the queue and push them to Doris
    while True:
        item = queue.get()
        if item['type'] == "delete":
            doris_put_delete(item['setkey'], item['db'], item['table'], item['data'])
        if item['type'] == "append":
            doris_put_append(item['setkey'], item['db'], item['table'], item['data'])
        queue.task_done()

consumer_thread = threading.Thread(target=consumerThread, args=(queue,))
producer_thread = threading.Thread(target=kafkaProduce, args=())
log_thread = threading.Thread(target=printLog, args=())  ## stats logging

producer_thread.daemon = True
consumer_thread.daemon = True
log_thread.daemon = True

producer_thread.start()
consumer_thread.start()
log_thread.start()

## The worker loops never return, so getting past a join means that thread died
## (e.g. crashed on an unhandled exception); alert via the WeCom webhook.
producer_thread.join()
sendweixinerror("producer sync stopped")
consumer_thread.join()
sendweixinerror("consumer sync stopped")
log_thread.join()
print("stop at " + str(datetime.now()))
sendweixinerror("sync interrupted")
```

The companion table-config file `canal_doris_set.json` looks like this (one entry per `database.table` source):

```json
{
  "test_canal.testpk": {
    "difffields": {},
    "lastsend": 0,
    "ifstop": false,
    "maxnum": 10000,
    "maxsecond": 5,
    "table": "testpk",
    "db": "ods",
    "tabletype": "unique",
    "pknames": [
      "testid"
    ]
  }
}
```
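
For reference, a minimal sketch of a matching Doris table for the `test_canal.testpk` entry above, created through the script's own `execDoris` helper. Only the key column comes from `pknames`; the other column, the types, and the bucket count are assumptions, and per the note at the top of the script the non-key columns stay nullable:

```python
## Hypothetical DDL sketch: a Unique Key table matching the example config.
## Only `testid` is taken from the config; everything else is illustrative.
ddl = """
CREATE TABLE IF NOT EXISTS `ods`.`testpk` (
    `testid` BIGINT,
    `name`   VARCHAR(255) NULL
)
UNIQUE KEY(`testid`)
DISTRIBUTED BY HASH(`testid`) BUCKETS 8
PROPERTIES ("replication_num" = "1")
"""
execDoris(tourl, toport, touser, topassword, todatabase, ddl)
```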