使用 Python 脚本将 JSON 文件转换为 Bulk API 格式

输入 JSON 格式(每行一个 JSON 对象):

```
# tail -1 documents-181998.json
{"@timestamp": 894225595, "clientip":"199.185.0.0", "request": "GET /images/home_intro.anim.gif HTTP/1.0", "status": 200, "size": 60349}
```

输出 JSON 格式(每条记录对应两行:index 元数据行和文档数据行):

```
# tail -2 processed_output.json
{"index": {"_index": "documents-181998", "_id": "2708746"}}
{"@timestamp": 894225595, "clientip": "199.185.0.0", "request": "GET /images/home_intro.anim.gif HTTP/1.0", "status": 200, "size": 60349}
```

Python 脚本:

python 复制代码
import json

def process_json_file(input_file='documents-181998.json',
                      output_file='processed_output.json',
                      index_name='documents-181998'):
    """
    Convert a line-delimited JSON file into bulk-API action/document pairs.

    Every non-blank input line is parsed as JSON. For each parsed object two
    lines are written to the output file: an ``index`` action line holding
    ``_index`` and a sequential ``_id`` (a string, starting at "1"), followed
    by a document line restricted to the fields ``@timestamp``, ``clientip``,
    ``request``, ``status`` and ``size``. A field missing from the input is
    written as ``null``.

    Lines that fail to parse, or whose JSON value is not an object, are
    reported on stdout and skipped; they write nothing and do not consume an
    ID. Errors are printed to stdout rather than raised, and the function
    returns ``None``.

    Args:
        input_file: Path of the input file, one JSON object per line.
        output_file: Path of the output file; it is opened in write mode and
            therefore overwritten.
        index_name: Value written to ``_index`` in every action line.
    """
    # Fields copied from each input record into the document line, in order.
    fields = ('@timestamp', 'clientip', 'request', 'status', 'size')

    try:
        with open(input_file, 'r', encoding='utf-8') as infile, \
             open(output_file, 'w', encoding='utf-8') as outfile:

            id_num = 1  # next document ID to assign

            for line_num, line in enumerate(infile, 1):
                stripped = line.strip()
                # Skip blank lines
                if not stripped:
                    continue

                try:
                    json_data = json.loads(stripped)

                    # Build BOTH lines before writing anything. If the record
                    # is unusable (e.g. a JSON array has no .get), the error
                    # is raised here and no orphaned action line reaches the
                    # output, which would break the action/document pairing.
                    index_line = json.dumps(
                        {'index': {'_index': index_name, '_id': str(id_num)}},
                        ensure_ascii=False,
                    )
                    data_line = json.dumps(
                        {field: json_data.get(field) for field in fields},
                        ensure_ascii=False,
                    )

                    # Emit the pair with a single write so the two lines
                    # always travel together.
                    outfile.write(index_line + '\n' + data_line + '\n')

                    # Only successfully written records consume an ID.
                    id_num += 1

                except json.JSONDecodeError as e:
                    print(f"警告: 第 {line_num} 行JSON解析失败: {e}")
                    continue
                except Exception as e:
                    # Best-effort conversion: report the bad line and go on.
                    print(f"警告: 处理第 {line_num} 行时发生错误: {e}")
                    continue

            print(f"处理完成! 共处理 {id_num-1} 条记录")
            print(f"输出文件: {output_file}")

    except FileNotFoundError:
        print(f"错误: 找不到输入文件 '{input_file}'")
    except Exception as e:
        print(f"错误: {e}")

# Usage example
if __name__ == "__main__":
    # With no arguments, the default input file documents-181998.json is processed
    process_json_file()
    
    # To work on different files, pass the paths explicitly:
    # process_json_file('my_input.json', 'my_output.json')