Python脚本之操作Elasticsearch【一】

本文为博主原创,未经授权,严禁转载及使用。

本文链接:https://blog.csdn.net/zyooooxie/article/details/109588072

前面刚写了《requests 发请求操作 Elasticsearch - Search》(https://blog.csdn.net/zyooooxie/article/details/123730279),这次再来分享一下如何使用 elasticsearch 库。

【实际这篇博客推迟发布N个月】

个人博客:https://blog.csdn.net/zyooooxie

【以下所有内容仅为个人项目经历,如有不同,纯属正常】

Python Client

https://www.elastic.co/guide/en/elasticsearch/client/index.html

我使用的版本是 7.17.0:

https://pypi.org/project/elasticsearch/7.17.0/

https://www.elastic.co/guide/en/elasticsearch/client/python-api/7.17/overview.html

https://elasticsearch-py.readthedocs.io/en/v7.17.0/index.html

python 复制代码
"""
@blog: https://blog.csdn.net/zyooooxie
@qq: 153132336
@email: zyooooxie@gmail.com
"""


import time
import traceback
import sys
import json
import string
import math
import random
from typing import Optional, Union, List, Any
from user_log import Log

from elasticsearch import Elasticsearch
from elasticsearch.helpers import BulkIndexError

# Example host specifications in the shapes used by this script:
# a single URL string ...
gl_es_host_new = 'http://1.1.1.1:1111'
# ... or a list of URL strings ...
gl_es_host_new_2 = ['http://1.1.1.1:1111', 'http://2.2.2.2:2222']

# ... or a list of {'host', 'port'} dicts.
# ``port`` needs to be an int.
gl_es_host_new_3 = [{'host': '2.2.2.2', 'port': 2222}]
gl_es_host_new_4 = [{'host': '2.2.2.2', 'port': 2222}, {'host': '1.1.1.1', 'port': 1111}]

# (username, password) tuple; presumably what gets passed as ``auth`` to
# connect_es_client -- placeholder values, confirm against the caller.
gl_es_auth = ('es_username', 'es_password')

# Document type name (see the notes on type removal below).
gl_type = '_doc'

# Sample search options: page size, offset and a descending sort on a placeholder field.
gl_search_dict = {'size': 100, 'from': 0, "sort": {"xxxXXX": {"order": "desc"}}}


# pip install elasticsearch==7.17.0
# https://pypi.org/project/elasticsearch/7.17.0/
# https://www.elastic.co/guide/en/elasticsearch/reference/7.17/docs.html

# https://elasticsearch-py.readthedocs.io/en/v7.17.0/api.html

# Using doc_type is discouraged [Specifying types in requests is deprecated]
# https://www.elastic.co/guide/en/elasticsearch/reference/7.17/removal-of-types.html

# Note that in 7.0, _doc is a permanent part of the path, and represents the endpoint name rather than the document type.
# In Elasticsearch 7.0, each API will support typeless requests, and specifying a type will produce a deprecation warning.

搜索

python 复制代码
"""
@blog: https://blog.csdn.net/zyooooxie
@qq: 153132336
@email: zyooooxie@gmail.com
"""


def connect_es_client(hosts: Union[str, list], auth: tuple):
    """
    Build an Elasticsearch client for the given hosts and log the connection.

    The keyword names follow the elasticsearch-py 7.x constructor, which is
    the version this file installs (7.17.0). The 8.x-style names
    ``sniff_on_node_failure`` / ``request_timeout`` are not recognised by the
    7.x client constructor and would be silently ignored there, leaving
    sniff-on-failure disabled and the default timeout in effect.

    :param hosts: a URL string, a list of URL strings, or a list of
        ``{'host': ..., 'port': ...}`` dicts.
    :param auth: ``(username, password)`` tuple used for HTTP authentication.
    :return: the constructed ``Elasticsearch`` client.
    """
    client = Elasticsearch(hosts,
                           sniff_on_start=True,  # sniff before doing anything
                           sniff_on_connection_fail=True,  # refresh nodes after a node fails to respond
                           timeout=60,  # global per-connection timeout, in seconds
                           http_auth=auth)  # HTTP authentication uses the http_auth parameter by passing in a username and password within a tuple

    # NOTE(review): logged at error level although it is informational --
    # presumably to make it stand out in the log; confirm before changing.
    Log.error('连接-{}'.format(client))

    return client


def close_es_client(client: Elasticsearch):
    """
    Close the given Elasticsearch client and log the disconnection.

    :param client: the client to close.
    :return: None
    """
    client.close()

    # Logged at error level, like the matching message in connect_es_client.
    Log.error('断开连接')


def _es_search(index_str: str, client: Elasticsearch,
               size_: int = 10000, from_: int = 0,
               sort_: Union[str, dict] = {"seq": {"order": "desc"}},
               get_more_10000: bool = False,
               **kwargs):
    """
    Run one search against ``index_str`` and return the list of hits,
    optionally handing over to ``_search_10000`` to fetch the full result set.

    :param index_str: name of the index to search.
    :param client: Elasticsearch client used for the request.
    :param size_: ``size`` passed to ``client.search``.
    :param from_: ``from_`` passed to ``client.search``.
    :param sort_: sort specification; per the original author's note, use
        {"seq": {"order": "desc"}} when passing ``query`` and 'seq:desc' when
        passing ``body``.
    :param get_more_10000: whether to retrieve more than 10000 documents.
    :param kwargs: extra arguments forwarded to ``client.search``; passing
        ``body`` is discouraged; pass nothing to query everything.
    :return: ``None`` when the index does not exist, otherwise the value
        returned by ``_search_10000``.
    :raises AssertionError: when ``size_ + from_`` exceeds 10000.
    """
    # NOTE(review): the default for ``sort_`` is a dict shared across calls;
    # this function only forwards it to client.search and never mutates it.

    # When the index does not exist, the return value is None
    if not client.indices.exists(index=index_str):
        return None

    # from + size must be less than or equal to: [10000]
    assert size_ + from_ <= 10000

    # # ✅ New usage:
    # es.search(query={...})
    #
    # # ❌ Deprecated usage:
    # es.search(body={"query": {...}})

    # Logs every argument (the only locals bound at this point).
    Log.debug(locals())

    # search() from: Defaults to 0.   size: Defaults to 10.
    # To fetch everything in one go, size defaults here to the maximum 10000 and from to 0.
    res = client.search(index=index_str, size=size_, from_=from_, sort=sort_, **kwargs)

    total = res.get('hits').get('total').get('value')
    Log.info(f'total:{total}')

    hits_len = len(res.get('hits').get('hits'))
    Log.info(f'hits有:{hits_len}条')

    # The locals() snapshot carries 'index_str' and 'kwargs', which the
    # scroll/scan/limit helpers read back via locals_.get(...); do not rename
    # those names without updating the helpers.
    result = _search_10000(hits_len=hits_len, first_search_result=res, locals_=locals(),
                           client=client, first_search_size=size_, get_more_10000=get_more_10000)
    # Log the last (up to) ten entries as a sample, then the overall count.
    Log.info(result[-10:])
    Log.info(f'search返回的结果有:{len(result)}条')

    return result


def _search_10000(client: Elasticsearch, hits_len: int, first_search_result: dict, locals_: dict,
                  first_search_size: int,
                  get_more_10000: bool = False):
    """
    Decide whether the first search page is enough or a full fetch is needed.

    :param client: Elasticsearch client, used only when a full fetch is made.
    :param hits_len: number of hits contained in the first search response.
    :param first_search_result: the first search response.
    :param locals_: dict handed on to the full-fetch helper.
    :param first_search_size: ``size`` that was requested in the first search.
    :param get_more_10000: whether the caller asked for the complete result set.
    :return: the first page's hits (``[]`` when it is empty), or the result
        of the full fetch.
    """
    # A full fetch happens only when it was requested AND the first page came
    # back full (so more documents may exist).
    page_was_full = hits_len >= first_search_size
    if get_more_10000 and page_was_full:
        return __search_10000_get_result(client=client, locals_=locals_)

    if not hits_len:
        return []

    return first_search_result.get('hits').get('hits')


def __search_10000_get_result(client: Elasticsearch, locals_: dict):
    """
    Fetch the complete result set with one randomly chosen strategy.

    ``random.getrandbits(2)`` yields 0-3 and selects:
    0 -> scroll, cross-checked against scan;
    1 -> scroll only;
    2 -> scan only;
    3 -> seq-range paging, cross-checked against scan (not recommended).

    :param client: Elasticsearch client forwarded to the strategy helpers.
    :param locals_: dict forwarded unchanged to the strategy helpers.
    :return: the hit list produced by the chosen strategy; for the
        cross-checked strategies the hits have 'sort' and '_score' removed.
    """
    from xxx_use.common_functions import compare_dict_key

    def fetch_and_cross_check(primary_fetch):
        # Primary strategy first, then scan as the reference result.
        primary_hits = primary_fetch(client=client, locals_=locals_)
        reference_hits = __scan(client=client, locals_=locals_)

        # The two result sets frequently differ only in their sort values,
        # so those fields are dropped before comparing.
        primary_hits = __change_before_compare(primary_hits)
        reference_hits = __change_before_compare(reference_hits)

        # Compare in both directions.
        compare_dict_key(primary_hits, reference_hits, assert_0=True)
        compare_dict_key(reference_hits, primary_hits, assert_0=True)

        return primary_hits

    strategy = random.getrandbits(2)
    Log.info(strategy)

    if strategy == 1:
        Log.info('scroll')
        return __scroll(client=client, locals_=locals_)

    if strategy == 2:
        Log.info('scan')
        return __scan(client=client, locals_=locals_)

    if strategy == 0:
        Log.info('scroll + scan')
        return fetch_and_cross_check(__scroll)

    # Not recommended: relies on every document carrying a ``seq`` field.
    Log.info('指定seq范围 【自己造的假数据 确保 每条都有seq】')
    return fetch_and_cross_check(__limit)


def __change_before_compare(result_list: list):
    """
    Normalise hits in place before scroll/scan results are compared.

    Removes the 'sort' and '_score' keys from every hit when present. Each
    hit carries a ``_score`` measuring how well the document matches the
    query, and by default results come back ordered by ``_score`` descending.

    :param result_list: list of hit dicts; modified in place.
    :return: the same ``result_list`` object.
    """
    for hit in result_list:
        for volatile_key in ('sort', '_score'):
            # A default is supplied so a missing key does not raise.
            hit.pop(volatile_key, '不存在key')

    return result_list


def __scan(client: Elasticsearch, locals_: dict):
    """
    Collect every hit for the search using ``elasticsearch.helpers.scan``.

    See https://elasticsearch-py.readthedocs.io/en/v7.17.0/helpers.html#scan

    :param client: Elasticsearch client handed to ``scan``.
    :param locals_: dict providing 'index_str' (the index) and 'kwargs'
        (used as the search body).
    :return: list of all hits yielded by ``scan``.
    """
    from elasticsearch.helpers import scan

    # ``query`` must be the body for the search() api, e.g.
    # query={"query": {"match": {"blog": "zyooooxie"}}}
    # Any additional keyword arguments will be passed to the initial search() call.
    hits_iter = scan(client=client,
                     index=locals_.get('index_str'),
                     query=locals_.get('kwargs'),
                     size=5000,
                     scroll="3m")

    # Logged before consumption: shows the iterator object and its type.
    Log.info(f'{hits_iter}, {type(hits_iter)}')

    collected = list(hits_iter)
    Log.info(len(collected))

    return collected


def __scroll(client: Elasticsearch, locals_: dict):
    """
    Collect every hit for the search by paging through the scroll API.

    See https://elasticsearch-py.readthedocs.io/en/v7.17.0/api.html#elasticsearch.Elasticsearch.scroll

    The scroll context is cleared in a ``finally`` block, so it is released
    even if a scroll request raises or the final count check fails.

    :param client: Elasticsearch client used for search/scroll/clear_scroll.
    :param locals_: dict providing 'index_str' (the index) and 'kwargs'
        (whose 'query' entry, if any, is used as the query).
    :return: list of all hits gathered across the scroll pages.
    :raises AssertionError: when the number of collected hits differs from
        the total reported by the first response.
    """
    scroll_time = '3m'
    search_res = client.search(index=locals_.get('index_str'), scroll=scroll_time,
                               query=locals_.get('kwargs').get('query'),
                               size=5000,
                               sort=['_doc'])
    scroll_id = search_res.get('_scroll_id')
    Log.info(scroll_id)

    total = search_res.get('hits').get('total').get('value')
    Log.info(f'总共有{total}条')

    # Copy so the first response's own hit list is not grown in place.
    res = list(search_res.get('hits').get('hits'))

    try:
        while True:
            scroll_res = client.scroll(scroll_id=scroll_id, scroll=scroll_time)

            # Always continue with the id from the latest response.
            scroll_id = scroll_res.get('_scroll_id')

            data = scroll_res.get('hits').get('hits')
            if not data:
                # An empty page marks the end of the scroll.
                break

            res.extend(data)
    finally:
        # Search contexts are removed automatically once the scroll timeout
        # has been exceeded; clear explicitly with the clear-scroll API so the
        # context does not linger until then.
        clear_res = client.clear_scroll(scroll_id=scroll_id)
        Log.info(clear_res)

    assert total == len(res)

    return res


def __limit(client: Elasticsearch, locals_: dict):
    """
    Collect hits by querying consecutive ``seq`` ranges, one search per range.

    The range ``(limit_size * i, limit_size * (i + 1)]`` is searched for each
    ``i`` until the maximum ``seq`` (from ``get_seq_max``) is covered. The
    caller's query, when one was supplied, is combined with the range filter
    in a ``bool.must``; when no query was supplied, only the range filter is
    used (a ``None`` entry inside ``must`` is not a valid clause).

    :param client: Elasticsearch client used for the searches.
    :param locals_: dict providing 'index_str' (the index) and 'kwargs'
        (whose 'query' entry, if any, is used as the base query).
    :return: list of hits gathered from all range searches.
    :raises AssertionError: if ``limit_size`` exceeds ``search_size``.
    """
    seq_max: int = get_seq_max(client=client, index_str=locals_.get('index_str'))

    query = locals_.get('kwargs').get('query')

    search_size = 10000  # ``size`` for each search: the maximum allowed
    limit_size = 5000  # width of the seq window fetched per search
    assert limit_size <= search_size

    res = list()

    for i in range(math.ceil(seq_max / limit_size)):
        # Window is exclusive at the lower bound, inclusive at the upper: gt / lte.
        seq_window = {'range': {'seq': {'gt': limit_size * i, 'lte': limit_size * (i + 1)}}}

        # Only include the caller's query when there is one.
        must_clauses = [seq_window] if query is None else [query, seq_window]
        query_new = {'bool': {'must': must_clauses}}

        search_res = client.search(index=locals_.get('index_str'),
                                   query=query_new,
                                   size=search_size)
        data = search_res.get('hits').get('hits')

        res.extend(data)

    Log.info(len(res))

    return res

本文链接:https://blog.csdn.net/zyooooxie/article/details/109588072

个人博客 https://blog.csdn.net/zyooooxie

相关推荐
数据智能老司机3 小时前
精通 Python 设计模式——分布式系统模式
python·设计模式·架构
MacroZheng3 小时前
横空出世!MyBatis-Plus 同款 ES ORM 框架,用起来够优雅!
java·后端·elasticsearch
数据智能老司机4 小时前
精通 Python 设计模式——并发与异步模式
python·设计模式·编程语言
数据智能老司机4 小时前
精通 Python 设计模式——测试模式
python·设计模式·架构
数据智能老司机4 小时前
精通 Python 设计模式——性能模式
python·设计模式·架构
c8i4 小时前
drf初步梳理
python·django
每日AI新事件4 小时前
python的异步函数
python
这里有鱼汤5 小时前
miniQMT下载历史行情数据太慢怎么办?一招提速10倍!
前端·python
databook14 小时前
Manim实现脉冲闪烁特效
后端·python·动效
程序设计实验室15 小时前
2025年了,在 Django 之外,Python Web 框架还能怎么选?
python