Elasticsearch 如何使用 Bulk API 批量导入数据

复制代码

# Create the index "my_index" with a single primary shard, no replicas and a
# 30-second refresh interval. "age" is mapped as integer, "name" as text.
curl --request PUT \
  --header "Content-Type: application/json" \
  --data '
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "refresh_interval": "30s"
  },
  "mappings": {
    "properties": {
      "age": {
        "type": "integer"
      },
      "name": {
        "type": "text"
      }
    }
  }
}' \
  "http://localhost:9200/my_index"

将以下内容保存为 bulk.json（每行一个 JSON 对象，文件必须以换行符结尾）：

{"index": {"_index": "my_index", "_id": "1"}}
{"name": "张三", "age": 25}
{"index": {"_index": "my_index", "_id": "2"}}
{"name": "李四", "age": 30}
{"index": {"_index": "my_index", "_id": "3"}}
{"name": "王五", "age": 35}

复制代码

# Submit the newline-delimited actions stored in bulk.json to the Bulk API.
# --data-binary sends the file as-is, so the line breaks are preserved.
curl --request POST \
  --header "Content-Type: application/json" \
  --data-binary @bulk.json \
  "http://localhost:9200/_bulk"

复制代码

# The index was created with refresh_interval=30s, so documents that were just
# indexed may not be visible to _count yet. Force a refresh first so the
# reported count includes them.
curl -X POST "http://localhost:9200/my_index/_refresh"
curl -X GET "http://localhost:9200/my_index/_count"

bash 复制代码

#!/bin/bash
#
# Stream a newline-delimited file to the Elasticsearch Bulk API in batches.
#
# Non-empty lines of SOURCE_FILE are collected BATCH_LINES at a time into a
# private temporary file and POSTed to ES_URL. A batch counts as successful
# only when the HTTP status is 200 AND the response body does not report
# "errors": true. Every batch is attempted even if an earlier one failed.
#
# Exit status: 0 when every batch succeeded, 1 on a pre-flight error or when
# at least one batch failed.

# Configuration
SOURCE_FILE="processed_output.json"  # source file, one JSON object per line
ES_URL="localhost:9200/_bulk"        # Bulk API endpoint
# Number of lines sent per request.
# NOTE(review): assumes the source holds action/source line pairs, in which
# case this must stay even so a pair is never split across two requests --
# confirm against the actual format of SOURCE_FILE.
BATCH_LINES=2

# Pre-flight checks
if [[ ! -f "$SOURCE_FILE" ]]; then
    echo "错误：源文件 $SOURCE_FILE 不存在！" >&2
    exit 1
fi

if ! command -v curl > /dev/null 2>&1; then
    echo "错误：curl未安装，请先安装curl！" >&2
    exit 1
fi

# A zero or non-numeric batch size would make the batching comparison
# meaningless, so reject it up front.
if ! [[ "$BATCH_LINES" =~ ^[1-9][0-9]*$ ]]; then
    echo "错误：BATCH_LINES 必须是正整数！" >&2
    exit 1
fi

# Keep scratch files in a private directory instead of fixed names in the
# working directory, so an existing bulk.json is never overwritten. The trap
# removes the directory on every exit path.
work_dir=$(mktemp -d) || {
    echo "错误：无法创建临时目录！" >&2
    exit 1
}
trap 'rm -rf -- "$work_dir"' EXIT
TEMP_BULK_FILE="$work_dir/bulk.json"       # lines of the batch being built
response_file="$work_dir/es_response.tmp"  # body of the latest response

#######################################
# POST the current contents of TEMP_BULK_FILE to the Bulk API and report
# the outcome on stdout.
# Globals:   ES_URL, TEMP_BULK_FILE, response_file, line_count (all read)
# Arguments: $1 - batch kind used in the header line ("批量" or "剩余")
#            $2 - prefix for the result line ("" or "剩余数据")
# Returns:   0 on success; 1 when the HTTP status is not 200 or the
#            response body reports "errors": true
#######################################
send_batch() {
    local kind=$1
    local prefix=$2
    local http_code
    local response_content

    echo "=== 发送${kind}数据（行数：$line_count）==="

    # Start from an empty response file: if curl cannot connect it may write
    # nothing, and the previous batch's response must not be shown instead.
    : > "$response_file"

    # With -o the body goes to the file, so stdout carries only the status
    # code produced by -w ("000" when no HTTP response was received).
    http_code=$(curl -s -X POST "$ES_URL" \
        -H "Content-Type: application/json" \
        --data-binary "@$TEMP_BULK_FILE" \
        -w "%{http_code}" -o "$response_file")
    response_content=$(cat "$response_file")

    if [[ "$http_code" != "200" ]]; then
        echo "❌ ${prefix}请求失败，HTTP状态码：$http_code"
        echo "错误响应：$response_content"
        return 1
    fi

    # The Bulk API answers 200 even when individual items fail; the top-level
    # "errors" flag is the only indication of that.
    if grep -Eq '"errors"[[:space:]]*:[[:space:]]*true' "$response_file"; then
        echo "❌ ${prefix}请求失败，HTTP状态码：$http_code（部分文档写入失败）"
        echo "错误响应：$response_content"
        return 1
    fi

    echo "✅ ${prefix}请求成功，HTTP状态码：$http_code"
    echo "响应内容：$response_content"
    return 0
}

line_count=0       # lines accumulated in the current batch
failed_batches=0   # number of batches that did not succeed
: > "$TEMP_BULK_FILE"

# The "|| [[ -n ... ]]" part keeps a final line that lacks a trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip empty lines.
    if [[ -z "$line" ]]; then
        continue
    fi

    printf '%s\n' "$line" >> "$TEMP_BULK_FILE"
    line_count=$((line_count + 1))

    # Flush once the batch is full.
    if (( line_count == BATCH_LINES )); then
        # Optional: replace the counter update with "exit 1" to stop at the
        # first failed batch.
        send_batch "批量" "" || failed_batches=$((failed_batches + 1))

        line_count=0
        : > "$TEMP_BULK_FILE"

        # Optional: throttle to reduce load on Elasticsearch.
        # sleep 0.5
    fi
done < "$SOURCE_FILE"

# Send whatever is left when the line count is not a multiple of BATCH_LINES.
if (( line_count > 0 )); then
    send_batch "剩余" "剩余数据" || failed_batches=$((failed_batches + 1))
fi

echo "=== 所有数据处理完成 ==="

if (( failed_batches > 0 )); then
    echo "警告：共有 $failed_batches 个批次请求失败！" >&2
    exit 1
fi
exit 0