ES海量数据更新及导入导出备份

一、根据查询条件更新字段

python 复制代码
from elasticsearch import Elasticsearch
import redis
import json

# --- Connection settings: replace username/password/host with real values ---
username = 'elastic'
password = 'password '
es_host = 'https://127.0.0.2:30674'

# Elasticsearch client with basic auth; certificate verification disabled
# (self-signed cluster cert, presumably — confirm before production use).
es = Elasticsearch(
    hosts=[es_host],
    basic_auth=(username, password),
    verify_certs=False,
    # For a cluster with a proper CA, use instead:
    # verify_certs=True,
    # ca_certs='/path/to/ca/cert',
)

# Redis holds the correction data: one hash per key; hash fields are
# millisecond timestamps and values are JSON blobs carrying at least
# Latitude/Longitude (observed from the accesses below).
r = redis.StrictRedis(host='10.7.9.13', port=32197, db=11)

keys = r.keys()

# Build a flat lookup table {redis_key + timestamp: record} from every
# Redis hash. Records whose Latitude starts with "0." are filtered out
# (treated as bogus coordinates).
my_map = dict()
for redis_key in keys:
    fields = r.hgetall(redis_key)
    print(redis_key)

    for ts_field, raw_value in fields.items():
        record = json.loads(raw_value)
        latitude = record.get("Latitude")
        # Guard: the original code called .startswith() directly and
        # crashed with AttributeError whenever Latitude was absent.
        if latitude is not None and not latitude.startswith("0."):
            # Composite key "<redis-key><rounded-timestamp-ms>"; must match
            # the key rebuilt by the painless script on the ES side.
            my_map[redis_key.decode('utf-8') + str(json.loads(ts_field))] = record

    print(len(my_map))

# Painless script executed server-side by update_by_query. For each doc it:
#   1. rounds createAt down to the minute, then snaps the seconds to a
#      30 s bucket (<=15s -> :00, >45s -> next minute, else :30),
#   2. stores the snapped time in create_lltime,
#   3. looks up RemoteId + snapped-time in params.new_value and, on a hit,
#      overwrites Latitude/Longitude and attaches the full record as 'rrc'.
# Built once, after the Redis scan — the original rebuilt it on every loop
# iteration even though it is only consumed below.
script = {
    "source": (
        "def create_at=ctx._source.createAt;def minutes = Math.floor(create_at / 60000);"
        "def left = Math.floor(create_at % 60000 / 1000);if(left<=15) {left=0} else if(left >45){ left=60} else {left=30} def form_time= minutes * 60000 + left * 1000;"
        "ctx._source['create_lltime'] = form_time;def key = ctx._source.RemoteId + (Long)form_time ;def rru=  params.new_value[key];"
        "if(rru !=null) {ctx._source['Latitude']=rru['Latitude'];ctx._source['Longitude']=rru['Longitude'];ctx._source['rrc'] = rru}"
    ),
    "params": {
        "new_value": my_map
    },
    "lang": "painless"
}

# Fire-and-forget bulk update: auto-sliced for parallelism, version
# conflicts skipped, and wait_for_completion=False returns a task id
# immediately instead of blocking until the reindex finishes.
ret = es.update_by_query(
    index="remote_statistics_202406",
    script=script,
    slices="auto",
    wait_for_completion=False,
    conflicts="proceed"
)

print(ret)

二、ES数据批量导出

python 复制代码
import json
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from datetime import datetime
import urllib3
urllib3.disable_warnings()
# Export window in epoch milliseconds
# (1735689600000 = 2025-01-01 00:00 UTC, 1738368000000 = 2025-02-01 00:00 UTC).
start_date = 1735689600000
end_date = 1738368000000
#index_name = 'remote_statistics_202410'
index_name = 'remote_statistics_202501'

# Replace the username, password and Elasticsearch host below.
username = 'elastic'
password = 'password'
es_host = 'https://127.0.0.1:32293'

# Elasticsearch client with basic auth; certificate verification disabled.
es = Elasticsearch(
    hosts=[es_host],
    basic_auth=(username, password),
    verify_certs=False
    # For an SSL-secured cluster the following can be added instead:
    # use_ssl=True,
    # verify_certs=True,
    # ca_certs='/path/to/ca/cert',
)



print("----------start---------------")

def fetch_data(start_time, end_time):
    """Stream all documents whose createAt lies in [start_time, end_time).

    Uses helpers.scan (the scroll API) so results are yielded lazily
    instead of being loaded into memory at once.
    """
    time_filter = {
        "query": {
            "range": {
                "createAt": {
                    "gte": start_time,
                    "lt": end_time
                }
            }
        }
    }
    return helpers.scan(es, body=time_filter, index=index_name)


if __name__ == "__main__":
    # Export the index in one-hour windows; each window is written to
    # "<window-start-ms>.json" as newline-delimited JSON (one doc per line).
    step = 60 * 60 * 1000  # one hour in milliseconds

    # The range runs 8 hours past end_date — presumably slack to catch
    # documents stamped just after the nominal cut-off; TODO confirm.
    for window_start in range(start_date, end_date + 8 * step, step):
        window_date = datetime.fromtimestamp(window_start / 1000)
        print("**********************************************************")
        print(window_date)
        print("**********************************************************")

        docs = fetch_data(window_start, window_start + step)
        count = 0
        # Explicit UTF-8 so the export files are portable across machines.
        with open(f"{window_start}.json", 'w', encoding='utf-8') as out:
            for doc in docs:
                out.write(json.dumps(doc['_source']) + '\n')
                count += 1
        print(count)

三、ES数据批量导入

python 复制代码
import json
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import os
import urllib3
urllib3.disable_warnings()
# Target index for the bulk import.
#index_name = 'remote_statistics_202410'
index_name = 'network_statistics_202410'

# Replace the username, password and Elasticsearch host below.
username = 'elastic'
# NOTE(review): trailing space in the password looks accidental — confirm
# it matches the real credential before running.
password = 'password '
es_host = 'https://127.0.0.1:32067'

# Elasticsearch client with basic auth; certificate verification disabled.
es = Elasticsearch(
    hosts=[es_host],
    basic_auth=(username, password),
    verify_certs=False
    # For an SSL-secured cluster the following can be added instead:
    # use_ssl=True,
    # verify_certs=True,
    # ca_certs='/path/to/ca/cert',
)

def bulk_index_file(idx_name, file_path):
    """Bulk-index newline-delimited JSON files into Elasticsearch.

    Args:
        idx_name:  target index name.
        file_path: optional path of a single .json file to import. When
                   None (the historical behavior — the original ignored
                   this parameter entirely), every *.json file in the
                   current working directory is imported.

    Each line of a file must be one JSON document. A file that fails to
    parse or index is reported and skipped; the remaining files continue.
    """
    if file_path:
        json_files = [file_path]
    else:
        # Backward-compatible default: scan the working directory.
        current_dir = os.getcwd()
        json_files = [
            os.path.join(current_dir, name)
            for name in os.listdir(current_dir)
            if name.endswith(".json")
        ]

    for json_file in json_files:
        try:
            print(json_file)
            # open() now lives inside the try so unreadable files are
            # reported like any other per-file failure; explicit UTF-8.
            with open(json_file, 'r', encoding='utf-8') as fh:
                actions = (json.loads(line) for line in fh)
                helpers.bulk(es, actions, index=idx_name)
        except Exception as e:
            # Best-effort import: report the failure and move on.
            print("error-----------------------")
            print(e)
            print(json_file)


# Entry point: import every *.json file in the current working directory
# into the configured index (file_path argument is None here).
bulk_index_file(index_name, None)
相关推荐
Edingbrugh.南空34 分钟前
Flink Postgres CDC 环境配置与验证
大数据·flink
isNotNullX1 小时前
什么是数据分析?常见方法全解析
大数据·数据库·数据仓库·人工智能·数据分析
小袁拒绝摆烂1 小时前
ElasticSearch快速入门-1
大数据·elasticsearch·搜索引擎
点控云2 小时前
智能私域运营中枢:从客户视角看 SCRM 的体验革新与价值重构
大数据·人工智能·科技·重构·外呼系统·呼叫中心
zkmall3 小时前
企业电商解决方案哪家好?ZKmall模块商城全渠道支持 + 定制化服务更省心
大数据·运维·重构·架构·开源
随缘而动,随遇而安7 小时前
第八十八篇 大数据中的递归算法:从俄罗斯套娃到分布式计算的奇妙之旅
大数据·数据结构·算法
GISer_Jing8 小时前
Git协作开发:feature分支、拉取最新并合并
大数据·git·elasticsearch
IT_10249 小时前
Spring Boot项目开发实战销售管理系统——系统设计!
大数据·spring boot·后端
一只鹿鹿鹿11 小时前
信息化项目验收,软件工程评审和检查表单
大数据·人工智能·后端·智慧城市·软件工程
聚铭网络12 小时前
案例精选 | 某省级税务局AI大数据日志审计中台应用实践
大数据·人工智能·web安全