just recod for myself--将.txt格式修改成.json格式

脑子不太好使，经常写过的代码，后面需要找的时候，又找不到了，just记录下
功能描述

将原始的coqe代码，转换成大模型需要的json格式，其中的instruction可以根据自己的实际需求，进行更改。
代码实现：

python 复制代码
from tqdm import tqdm, trange
import os
import re
from typing import List
import json
from pdb import set_trace as stop

pipeline_data_path = "/public/home/hongy/qtxu/Qwen-main/data/version2/Ele-COQE/test.txt"
jsonl_data_path = "/public/home/hongy/qtxu/Qwen-main/data/version2/Ele-COQE/dev_w.json"

dic_en = { -1: 'worse', 0: 'equal', 1: 'better', 2: 'different'}
dic_zh = { -1: '更差', 0: '相等', 1: '更好', 2: '不同'}

def str_to_span(input_str):
   #[3&高 4&端 5&机]--> 高端机
   if len(input_str) == 0:
      span_str = ''
      indexs_str = ''
   else:
   
    if ' , ' in input_str:  # '21&没 22&有 , 25&细 26&致' --> '21&没 22&有 25&细 26&致'
        input_str = input_str.replace(' , ', ' ')
    indexs, span = zip(*[i.split('&') for i in input_str.split()])
    indexs_str = ':'.join(indexs)
    span_str = ''.join(span)

   return indexs_str, span_str

def process_line(text_line, label_line, kind, i):
    # kind: en or zh; line_id: the number of sentences
    text = text_line.split('\t')[0].strip() # text_line:当前行, text：sentence
    have_triples = int(text_line.split('\t')[1]) # obtain the label is comparative (1) or no-comparative (0)
    re_result = re.findall(r'\[\[(.*?)\];\[(.*?)\];\[(.*?)\];\[(.*?)\];\[(.*?)\]\]', label_line)
    # label_line--> re_result:去除原始数据中的[]，以及;
    raw_labels: List = [[x for x in y] for y in re_result] #一个样本label 存放在一个list中 

    sample = {'id': "identity"+"_"+str(i), 'conversations': []}
    dict_conver1 = {"from": "user", "is_compare":'', "value": ''}
    dict_conver2 = {"from": "assistant", "value":''}
    instruction = f"对比观点抽取任务：从输入语句中抽取所有的对比观点五元组(比较对象,被比较对象,对比属性,观点句,观点极性)，抽取的元素允许为空。\n\n对比观点五元组解释：比较对象、被比较对象、对比属性和观点句必须是输入语句中出现的短语。比较对象和被比较对象可以是商品名、商品型号或代词。对比属性是对比的角度或方面，一版是比较对象和被比较对象的属性。观点句包含主观情感的短语。观点极性包括：更差、等同、更好、不同。\n\n请从输入语句（{text}）中抽取所有的对比观点五元组，并给出比较对象、被比较对象、对比属性、观点句在输入语句中的位置（位置从0开始编号）："

    if have_triples == 0:
        dict_conver1["is_compare"] = 0
        dict_conver1["value"]=instruction
        dict_conver2["value"]= "第1个五元组：(,,,,)\n元组位置：(,,,)\n"
        sample["conversations"].append(dict_conver1)
        sample["conversations"].append(dict_conver2)
        return sample
    
    if have_triples == 1:
        dict_conver1["is_compare"] = 1
        dict_conver1["value"]=instruction
        number = 0
        value = ''
        for label in raw_labels: # 比较句
            number += 1
            sub, obj, asp, op, polarity = label[0], label[1], label[2], label[3], label[4]
            sub_index, sub_span =  str_to_span(sub)
            obj_index, obj_span =  str_to_span(obj)
            asp_index, asp_span =  str_to_span(asp)
            op_index, op_span =  str_to_span(op)
            polarity = dic_zh[int(polarity)]
            quintuple_span= "("+sub_span+","+obj_span +","+asp_span+","+op_span+","+polarity+")"
            quintuple_indexs = "("+sub_index+"," +obj_index+","+asp_index+"," +op_index+")"
            value = value + f"第{number}个五元组：{quintuple_span}\n元组位置：{quintuple_indexs}\n"
        dict_conver2["value"] = value
        sample["conversations"].append(dict_conver1)
        sample["conversations"].append(dict_conver2)  

        return sample      


def load_data(path, kind):
    raw_data = []
    # with open(os.path.join(args.data_path, f'{mode}_char.txt'), 'r') as f:
    with open(path, 'r') as f:
        for line in f:
            raw_data.append(line)
            
    all_samples = []
    line_id, i = 0, 0
    text_line, label_line = '', ''
    for line_id in trange(len(raw_data), desc='processing data for mode'):
        cur_line = raw_data[line_id]
        if len(cur_line.split('\t')) != 2:
            label_line += '\n' + cur_line
        else:
            # a new text line, so push the last text and update text_line
            if text_line != '':
                all_samples.append(process_line(text_line, label_line, kind, i))
                i += 1
            text_line = cur_line
            label_line = ''
    
    all_samples.append(process_line(text_line, label_line, kind, i))

    return all_samples

kind = 'en' 
json_data = load_data(pipeline_data_path, kind)

with open(jsonl_data_path, 'w') as fw:
    fw.write(json.dumps(json_data, ensure_ascii=False))
2024年，第一天，加油，打工人～