Just a record for myself -- converting .txt format into .json format

My memory isn't great: I often can't find code I've already written when I need it later, so I'm just jotting this down here.

Function description

Convert the raw COQE data into the JSON format that the large language model needs; the instruction inside can be changed to fit your own requirements.

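For context, here is a rough sketch of the two formats, inferred from the parsing code below; the sentence, indices, and labels are made up for illustration and are not taken from the real Ele-COQE file:

# Illustrative shapes only -- the values below are invented, not real Ele-COQE content.
# Input .txt: one sentence line "text<TAB>is_comparative(0/1)", followed by one bracketed
# label line per quintuple, each element written as space-separated "index&char" tokens.
raw_record = (
    "这款手机屏幕比那款更好\t1\n"
    "[[0&这 1&款 2&手 3&机];[7&那 8&款];[4&屏 5&幕];[9&更 10&好];[1]]\n"
)

# Output .json: a list of conversation samples, one per sentence, roughly shaped like this.
converted_sample = {
    "id": "identity_0",
    "conversations": [
        {"from": "user", "is_compare": 1,
         "value": "对比观点抽取任务:……(the instruction below, with the sentence filled in)"},
        {"from": "assistant",
         "value": "第1个五元组:(这款手机,那款,屏幕,更好,更好)\n元组位置:(0:1:2:3,7:8,4:5,9:10)\n"},
    ],
}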
Code implementation:

python
from tqdm import trange
import re
from typing import List
import json

pipeline_data_path = "/public/home/hongy/qtxu/Qwen-main/data/version2/Ele-COQE/test.txt"
jsonl_data_path = "/public/home/hongy/qtxu/Qwen-main/data/version2/Ele-COQE/dev_w.json"

dic_en = {-1: 'worse', 0: 'equal', 1: 'better', 2: 'different'}  # English polarity labels (currently unused)
dic_zh = {-1: '更差', 0: '相等', 1: '更好', 2: '不同'}  # Chinese polarity labels used in the output

def str_to_span(input_str):
    # e.g. '3&高 4&端 5&机' --> indexs_str '3:4:5', span_str '高端机'
    if len(input_str) == 0:
        span_str = ''
        indexs_str = ''
    else:
        if ' , ' in input_str:  # '21&没 22&有 , 25&细 26&致' --> '21&没 22&有 25&细 26&致'
            input_str = input_str.replace(' , ', ' ')
        indexs, span = zip(*[i.split('&') for i in input_str.split()])
        indexs_str = ':'.join(indexs)
        span_str = ''.join(span)

    return indexs_str, span_str

def process_line(text_line, label_line, kind, i):
    # kind: en or zh; i: the running sample index
    text = text_line.split('\t')[0].strip()  # text_line: the current raw line; text: the sentence
    have_triples = int(text_line.split('\t')[1])  # 1 if the sentence is comparative, 0 otherwise
    re_result = re.findall(r'\[\[(.*?)\];\[(.*?)\];\[(.*?)\];\[(.*?)\];\[(.*?)\]\]', label_line)
    # label_line --> re_result: strip the enclosing [] and ; from the raw label lines
    raw_labels: List = [[x for x in y] for y in re_result]  # all quintuple labels of this sample, one list per quintuple

    sample = {'id': "identity"+"_"+str(i), 'conversations': []}
    dict_conver1 = {"from": "user", "is_compare":'', "value": ''}
    dict_conver2 = {"from": "assistant", "value":''}
    instruction = f"对比观点抽取任务:从输入语句中抽取所有的对比观点五元组(比较对象,被比较对象,对比属性,观点句,观点极性),抽取的元素允许为空。\n\n对比观点五元组解释:比较对象、被比较对象、对比属性和观点句必须是输入语句中出现的短语。比较对象和被比较对象可以是商品名、商品型号或代词。对比属性是对比的角度或方面,一版是比较对象和被比较对象的属性。观点句包含主观情感的短语。观点极性包括:更差、等同、更好、不同。\n\n请从输入语句({text})中抽取所有的对比观点五元组,并给出比较对象、被比较对象、对比属性、观点句在输入语句中的位置(位置从0开始编号):"

    if have_triples == 0:
        dict_conver1["is_compare"] = 0
        dict_conver1["value"]=instruction
        dict_conver2["value"]= "第1个五元组:(,,,,)\n元组位置:(,,,)\n"
        sample["conversations"].append(dict_conver1)
        sample["conversations"].append(dict_conver2)
        return sample
    
    if have_triples == 1:
        dict_conver1["is_compare"] = 1
        dict_conver1["value"]=instruction
        number = 0
        value = ''
        for label in raw_labels:  # comparative sentence: one iteration per quintuple label
            number += 1
            sub, obj, asp, op, polarity = label[0], label[1], label[2], label[3], label[4]
            sub_index, sub_span =  str_to_span(sub)
            obj_index, obj_span =  str_to_span(obj)
            asp_index, asp_span =  str_to_span(asp)
            op_index, op_span =  str_to_span(op)
            polarity = dic_zh[int(polarity)]
            quintuple_span= "("+sub_span+","+obj_span +","+asp_span+","+op_span+","+polarity+")"
            quintuple_indexs = "("+sub_index+"," +obj_index+","+asp_index+"," +op_index+")"
            value = value + f"第{number}个五元组:{quintuple_span}\n元组位置:{quintuple_indexs}\n"
        dict_conver2["value"] = value
        sample["conversations"].append(dict_conver1)
        sample["conversations"].append(dict_conver2)  

        return sample      


def load_data(path, kind):
    raw_data = []
    # with open(os.path.join(args.data_path, f'{mode}_char.txt'), 'r') as f:
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            raw_data.append(line)
            
    all_samples = []
    line_id, i = 0, 0
    text_line, label_line = '', ''
    for line_id in trange(len(raw_data), desc='processing data for mode'):
        cur_line = raw_data[line_id]
        if len(cur_line.split('\t')) != 2:
            label_line += '\n' + cur_line
        else:
            # a new text line, so push the last text and update text_line
            if text_line != '':
                all_samples.append(process_line(text_line, label_line, kind, i))
                i += 1
            text_line = cur_line
            label_line = ''
    
    all_samples.append(process_line(text_line, label_line, kind, i))

    return all_samples

kind = 'en'  # note: kind is passed through but not used in process_line; the prompt and polarity map are Chinese (dic_zh)
json_data = load_data(pipeline_data_path, kind)

with open(jsonl_data_path, 'w', encoding='utf-8') as fw:
    fw.write(json.dumps(json_data, ensure_ascii=False))
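
As an optional sanity check (not part of the original script), the written file can be reloaded to confirm its structure; this assumes the script above has just run in the same session:

# Optional check: reload the converted file and inspect the result.
with open(jsonl_data_path, 'r', encoding='utf-8') as fr:
    samples = json.load(fr)

print(len(samples))  # number of converted sentences
print(json.dumps(samples[0], ensure_ascii=False, indent=2))  # first conversation sample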

Day one of 2024 -- keep going, fellow worker~
