输入json格式:
# tail -1 documents-181998.json
{"@timestamp": 894225595, "clientip":"199.185.0.0", "request": "GET /images/home_intro.anim.gif HTTP/1.0", "status": 200, "size": 60349}
输出json格式:
# tail -2 processed_output.json
{"index": {"_index": "documents-181998", "_id": "2708746"}}
{"@timestamp": 894225595, "clientip": "199.185.0.0", "request": "GET /images/home_intro.anim.gif HTTP/1.0", "status": 200, "size": 60349}
python脚本:
python
import json
def process_json_file(input_file='documents-181998.json', output_file='processed_output.json',
                      index_name='documents-181998'):
    """Rewrite a line-delimited JSON file as action/document line pairs.

    Every non-blank input line that holds a JSON object produces two output
    lines: an ``{"index": {"_index": ..., "_id": ...}}`` metadata line and a
    document line restricted to the fields ``@timestamp``, ``clientip``,
    ``request``, ``status`` and ``size`` (fields missing from the input are
    written as ``null``). The layout matches the Elasticsearch bulk-request
    format, which is presumably the intended consumer.

    ``_id`` values are consecutive integers (as strings) starting at "1";
    skipped lines do not consume an id.

    Args:
        input_file: Path of the line-delimited JSON file to read.
        output_file: Path of the file to write (overwritten if it exists).
        index_name: Value written to ``_index`` on every metadata line.

    Returns:
        None. Progress, warnings and errors are reported with ``print``.
        Blank lines are skipped silently; lines that are not valid JSON or
        are not JSON objects are skipped with a warning. A missing input
        file or any other failure (including write errors) is reported and
        processing stops; no exception propagates to the caller.
    """
    # Fields copied into the document line, in output order.
    fields = ('@timestamp', 'clientip', 'request', 'status', 'size')
    try:
        with open(input_file, 'r', encoding='utf-8') as infile, \
                open(output_file, 'w', encoding='utf-8') as outfile:
            id_num = 1  # next document id to assign
            for line_num, line in enumerate(infile, 1):
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    json_data = json.loads(stripped)
                except json.JSONDecodeError as e:
                    print(f"警告: 第 {line_num} 行JSON解析失败: {e}")
                    continue
                # Valid JSON that is not an object (list, number, ...) has no
                # fields to extract; reject it before anything is written.
                if not isinstance(json_data, dict):
                    print(f"警告: 处理第 {line_num} 行时发生错误: "
                          f"期望JSON对象, 实际为 {type(json_data).__name__}")
                    continue
                # Serialize both lines first and emit them with one write so
                # a metadata line can never appear without its document line.
                index_line = json.dumps(
                    {'index': {'_index': index_name, '_id': str(id_num)}},
                    ensure_ascii=False,
                )
                data_line = json.dumps(
                    {field: json_data.get(field) for field in fields},
                    ensure_ascii=False,
                )
                outfile.write(f"{index_line}\n{data_line}\n")
                id_num += 1
        print(f"处理完成! 共处理 {id_num-1} 条记录")
        print(f"输出文件: {output_file}")
    except FileNotFoundError:
        print(f"错误: 找不到输入文件 '{input_file}'")
    except Exception as e:
        # Top-level boundary of the script: report and return instead of
        # raising, as callers rely on the printed message only.
        print(f"错误: {e}")
# Script entry point / usage example
if __name__ == "__main__":
    # Called without arguments, so the function's default input and output
    # file names are used.
    process_json_file()
    # To convert other files, pass the paths explicitly, e.g.:
    # process_json_file('my_input.json', 'my_output.json')