ES海量数据更新及导入导出备份

一、根据查询条件更新字段

python 复制代码
from elasticsearch import Elasticsearch
import redis
import json

# Replace the username, password and Elasticsearch server address below.
username = 'elastic'
# NOTE(review): trailing space in the password looks accidental — confirm.
password = 'password '
es_host = 'https://127.0.0.2:30674'

# Pass the username and password when instantiating the Elasticsearch client.
es = Elasticsearch(
    hosts=[es_host],
    basic_auth=(username, password),
    verify_certs=False
    # If Elasticsearch is secured with SSL you can also add the parameters below:
    # use_ssl=True,
    # verify_certs=True,
    # ca_certs='/path/to/ca/cert',
)

# Use the Elasticsearch instance for operations, e.g. search:
# print(es.cluster.state())

# response = es.search(index="remote_statistics_202409", query={"match_all": {}})
# print(response)

# Connect to Redis (db 11 holds the correction records read below).
# #es = Elasticsearch("http://localhost:9200")
r = redis.StrictRedis(host='10.7.9.13', port=32197, db=11)

# Build a flat lookup map from every Redis hash: the outer Redis key
# (presumably a device/remote id — confirm) concatenated with the hash
# field (an epoch-millisecond timestamp) maps to the decoded JSON record.
# Records whose Latitude starts with "0." are skipped (presumably
# invalid/near-zero coordinates — confirm with the data owner).
my_map = {}
for redis_key in r.keys():
    print(redis_key)

    for field, raw in r.hgetall(redis_key).items():
        record = json.loads(raw)
        latitude = record.get("Latitude")
        # Guard against records with no Latitude at all: the unguarded
        # version raised AttributeError on None.startswith("0.").
        if latitude is not None and not latitude.startswith("0."):
            # json.loads(field) parses the bytes field into its numeric
            # timestamp; str() of it matches the painless-side key format.
            my_map[redis_key.decode('utf-8') + str(json.loads(field))] = record

    print(len(my_map))

# Painless script for update_by_query, built once after the loop (it does
# not depend on any per-key state, only on the finished my_map):
#   1. round each document's createAt to the nearest half-minute
#      (seconds <= 15 -> :00, > 45 -> next minute, else :30) and store
#      the rounded value in create_lltime;
#   2. look up "<RemoteId><rounded ts>" in params.new_value and, when a
#      record is found, overwrite Latitude/Longitude and stash the whole
#      record under "rrc".
script = {
    "source": "def create_at=ctx._source.createAt;def minutes = Math.floor(create_at / 60000);"
              "def left = Math.floor(create_at % 60000 / 1000);if(left<=15) {left=0} else if(left >45){ left=60} else {left=30} def form_time= minutes * 60000 + left * 1000;"
              "ctx._source['create_lltime'] = form_time;def key = ctx._source.RemoteId + (Long)form_time ;def rru=  params.new_value[key];"
              "if(rru !=null) {ctx._source['Latitude']=rru['Latitude'];ctx._source['Longitude']=rru['Longitude'];ctx._source['rrc'] = rru}",
    "params": {
        "new_value": my_map
    },
    "lang": "painless"
}
#
# # 更新查询
# Run the correction as an asynchronous, sliced update-by-query over the
# whole index: wait_for_completion=False returns a task descriptor
# immediately, slices="auto" parallelizes across shards, and
# conflicts="proceed" skips version conflicts instead of aborting.
ret = es.update_by_query(
    index="remote_statistics_202406",
    script=script,
    slices="auto",
    wait_for_completion=False,
    conflicts="proceed"
)

# With wait_for_completion=False this prints the task id, not the final
# result; monitor progress through the Tasks API.
print(ret)

二、ES数据批量导出

python 复制代码
import json
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from datetime import datetime
import urllib3
urllib3.disable_warnings()
# Export window boundaries (epoch milliseconds) and the source index.
start_date = 1735689600000
end_date = 1738368000000
#index_name = 'remote_statistics_202410'
index_name = 'remote_statistics_202501'

# Replace the username, password and Elasticsearch server address below.
username = 'elastic'
password = 'password'
es_host = 'https://127.0.0.1:32293'

# Pass the username and password when instantiating the Elasticsearch client.
es = Elasticsearch(
    hosts=[es_host],
    basic_auth=(username, password),
    verify_certs=False
    # If Elasticsearch is secured with SSL you can also add the parameters below:
    # use_ssl=True,
    # verify_certs=True,
    # ca_certs='/path/to/ca/cert',
)



print("----------start---------------")

def fetch_data(start_time, end_time):
    """Scroll all documents whose createAt lies in [start_time, end_time).

    Both bounds are epoch milliseconds.  Returns the lazy iterator
    produced by helpers.scan; documents are fetched from Elasticsearch
    as the caller consumes it.
    """
    time_filter = {
        "query": {
            "range": {
                "createAt": {"gte": start_time, "lt": end_time}
            }
        }
    }
    return helpers.scan(es, body=time_filter, index=index_name)


if __name__ == "__main__":
    step = 60 * 60 * 1000

    for i in range(start_date, end_date+8*step, step):
        date = datetime.fromtimestamp(i / 1000)
        print("**********************************************************")
        print(date)
        print("**********************************************************")

        ret = fetch_data(i, i + step)
        count = 0
        with open(str(i) + '.json', 'w') as f:
            for doc in ret:
                f.write(json.dumps(doc['_source']) + '\n')
                count = count + 1
        print(count)

三、ES数据批量导入

python 复制代码
import json
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import os
import urllib3
urllib3.disable_warnings()
# Target index for the bulk import.
#index_name = 'remote_statistics_202410'
index_name = 'network_statistics_202410'

# Replace the username, password and Elasticsearch server address below.
username = 'elastic'
# NOTE(review): trailing space in the password looks accidental — confirm.
password = 'password '
es_host = 'https://127.0.0.1:32067'

# Pass the username and password when instantiating the Elasticsearch client.
es = Elasticsearch(
    hosts=[es_host],
    basic_auth=(username, password),
    verify_certs=False
    # If Elasticsearch is secured with SSL you can also add the parameters below:
    # use_ssl=True,
    # verify_certs=True,
    # ca_certs='/path/to/ca/cert',
)

def bulk_index_file(idx_name, file_path):
    """Bulk-index every *.json file (one JSON document per line) into idx_name.

    idx_name  -- target Elasticsearch index.
    file_path -- directory to scan for .json files; when None (the only
                 value the existing caller passes) falls back to the
                 current working directory.  The previous implementation
                 ignored this argument entirely.

    Failures are best-effort by design: an error in one file is printed
    and the remaining files are still processed.
    """
    directory = file_path if file_path else os.getcwd()

    for file_name in os.listdir(directory):
        if not file_name.endswith(".json"):
            continue
        # Join against the scanned directory so files are found even
        # when it differs from the process working directory.
        with open(os.path.join(directory, file_name), 'r') as fh:
            try:
                print(file_name)
                # Lazy generator: one bulk action per line, so the whole
                # file is never held in memory at once.
                actions = (json.loads(line) for line in fh)
                helpers.bulk(es, actions, index=idx_name)
            except Exception as e:
                # Deliberate best-effort: report and continue with the
                # next file rather than aborting the whole import.
                print("error-----------------------")
                print(e)
                print(file_name)


# Entry point: index every .json file in the current directory.
bulk_index_file(index_name, None)
相关推荐
黑客笔记33 分钟前
攻防世界-XCTF-Web安全最佳刷题路线
大数据·安全·web安全
软件测试小仙女42 分钟前
鸿蒙APP测试实战:从HDC命令到专项测试
大数据·软件测试·数据库·人工智能·测试工具·华为·harmonyos
Elastic 中国社区官方博客1 小时前
Elastic 获得 AWS 教育 ISV 合作伙伴资质,进一步增强教育解决方案产品组合
大数据·人工智能·elasticsearch·搜索引擎·云计算·全文检索·aws
反向跟单策略2 小时前
期货反向跟单运营逻辑推导思路
大数据·人工智能·数据分析·区块链
Tom Boom2 小时前
Git常用命令完全指南:从入门到精通
大数据·git·elasticsearch·docker·自动化测试框架
不吃饭的猪3 小时前
记一次spark在docker本地启动报错
大数据·docker·spark
欧亚学术4 小时前
计算机网络领域所有CCF-A/B/C类期刊汇总!
大数据·计算机网络·计算机·论文·sci·期刊·发表
江瀚视野4 小时前
虎扑正式易主,迅雷完成收购会带来什么变化?
大数据·区块链
星星点点洲4 小时前
【Elasticsearch】 查询优化方式
elasticsearch·搜索引擎
QYR_115 小时前
宠物车载安全座椅市场报告:解读行业趋势与投资前景
大数据·人工智能