just record for myself--将.txt格式修改成.json格式

脑子不太好使,经常写过的代码,后面需要找的时候,又找不到了,just记录下

功能描述

将原始的COQE数据(.txt),转换成大模型需要的json格式,其中的instruction可以根据自己的实际需求,进行更改。

代码实现:

python 复制代码
from tqdm import tqdm, trange
import os
import re
from typing import List
import json
from pdb import set_trace as stop

# Raw annotation file that load_data() reads.
pipeline_data_path = (
    "/public/home/hongy/qtxu/Qwen-main/data/version2/Ele-COQE/test.txt"
)
# Destination file for the converted samples.
jsonl_data_path = (
    "/public/home/hongy/qtxu/Qwen-main/data/version2/Ele-COQE/dev_w.json"
)

# Polarity id -> English label.
dic_en = {
    -1: 'worse',
    0: 'equal',
    1: 'better',
    2: 'different',
}
# Polarity id -> Chinese label (the one process_line writes into answers).
dic_zh = {
    -1: '更差',
    0: '相等',
    1: '更好',
    2: '不同',
}

def str_to_span(input_str):
    """Split an ``index&char`` token string into positions and surface text.

    Example: ``'3&高 4&端 5&机'`` -> ``('3:4:5', '高端机')``.

    A discontinuous span written with a ``' , '`` separator
    (``'21&没 22&有 , 25&细 26&致'``) is flattened into a single span.

    Args:
        input_str: whitespace-separated ``index&char`` tokens; may be empty.

    Returns:
        ``(indexs_str, span_str)``: the indices joined with ``':'`` and the
        characters concatenated. Both are ``''`` for empty or blank input.

    Raises:
        ValueError: if a token contains no ``'&'`` separator.
    """
    # Blank input (not just '') has no tokens; unpacking zip() of nothing
    # would otherwise fail.
    if not input_str.strip():
        return '', ''

    tokens = input_str.replace(' , ', ' ').split()

    indexs, span = [], []
    for token in tokens:
        # Split on the FIRST '&' only, so a token whose character is itself
        # '&' (e.g. '4&&') keeps that character instead of losing it.
        index, sep, char = token.partition('&')
        if not sep:
            raise ValueError(f"malformed span token (missing '&'): {token!r}")
        indexs.append(index)
        span.append(char)

    return ':'.join(indexs), ''.join(span)

def process_line(text_line, label_line, kind, i):
    """Build one chat-style sample from a sentence line and its label text.

    Args:
        text_line: the sentence, a tab, then a flag: 1 for a comparative
            sentence, 0 for a non-comparative one.
        label_line: raw label text containing zero or more
            ``[[sub];[obj];[asp];[op];[polarity]]`` groups.
        kind: not read by this function; kept so existing callers still work.
        i: sample number, used to build the sample id.

    Returns:
        A dict ``{'id': 'identity_<i>', 'conversations': [user, assistant]}``.
        The user turn carries the instruction prompt and the ``is_compare``
        flag; the assistant turn lists every quintuple with its positions.

    Raises:
        ValueError: if the flag is not an integer, or is neither 0 nor 1.
            A flag outside 0/1 used to fall through and return ``None``.
        IndexError: if ``text_line`` contains no tab.
    """
    fields = text_line.split('\t')
    text = fields[0].strip()
    have_triples = int(fields[1])
    if have_triples not in (0, 1):
        raise ValueError(
            f"unexpected comparative flag {have_triples!r} for sample {i}; expected 0 or 1"
        )

    # NOTE(review): the prompt names the "equal" polarity 等同, while dic_zh
    # renders it as 相等 in the answers -- confirm which wording is intended.
    instruction = f"对比观点抽取任务:从输入语句中抽取所有的对比观点五元组(比较对象,被比较对象,对比属性,观点句,观点极性),抽取的元素允许为空。\n\n对比观点五元组解释:比较对象、被比较对象、对比属性和观点句必须是输入语句中出现的短语。比较对象和被比较对象可以是商品名、商品型号或代词。对比属性是对比的角度或方面,一版是比较对象和被比较对象的属性。观点句包含主观情感的短语。观点极性包括:更差、等同、更好、不同。\n\n请从输入语句({text})中抽取所有的对比观点五元组,并给出比较对象、被比较对象、对比属性、观点句在输入语句中的位置(位置从0开始编号):"

    if have_triples == 0:
        # Non-comparative sentences get one empty quintuple as the answer.
        answer = "第1个五元组:(,,,,)\n元组位置:(,,,)\n"
    else:
        # Each match is a 5-tuple of the bracket contents, brackets and ';' removed.
        quintuples = re.findall(
            r'\[\[(.*?)\];\[(.*?)\];\[(.*?)\];\[(.*?)\];\[(.*?)\]\]', label_line
        )
        chunks = []
        for number, (sub, obj, asp, op, polarity) in enumerate(quintuples, start=1):
            sub_index, sub_span = str_to_span(sub)
            obj_index, obj_span = str_to_span(obj)
            asp_index, asp_span = str_to_span(asp)
            op_index, op_span = str_to_span(op)
            polarity = dic_zh[int(polarity)]
            quintuple_span = "(" + sub_span + "," + obj_span + "," + asp_span + "," + op_span + "," + polarity + ")"
            quintuple_indexs = "(" + sub_index + "," + obj_index + "," + asp_index + "," + op_index + ")"
            chunks.append(f"第{number}个五元组:{quintuple_span}\n元组位置:{quintuple_indexs}\n")
        answer = ''.join(chunks)

    return {
        'id': "identity" + "_" + str(i),
        'conversations': [
            {"from": "user", "is_compare": have_triples, "value": instruction},
            {"from": "assistant", "value": answer},
        ],
    }


def load_data(path, kind):
    """Read an annotation file and convert it into a list of samples.

    A line with exactly two tab-separated fields starts a new sentence.
    Every other line is appended to the label text of the current sentence.
    Lines that come before the first sentence line are discarded.

    Args:
        path: path of the annotation text file, read as UTF-8.
        kind: passed through unchanged to ``process_line``.

    Returns:
        A list with one ``process_line`` result per sentence, in file order.
        Empty when the file contains no sentence line.
    """
    # The data is Chinese text, so do not depend on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        raw_data = f.readlines()

    all_samples = []
    text_line, label_line = '', ''
    for cur_line in tqdm(raw_data, desc='processing data for mode'):
        if len(cur_line.split('\t')) != 2:
            label_line += '\n' + cur_line
            continue
        # A new sentence line: flush the previous sentence first, if any.
        if text_line != '':
            all_samples.append(
                process_line(text_line, label_line, kind, len(all_samples))
            )
        text_line = cur_line
        label_line = ''

    # Flush the last sentence. Skipped when no sentence line was ever seen,
    # because process_line('') would raise IndexError.
    if text_line != '':
        all_samples.append(
            process_line(text_line, label_line, kind, len(all_samples))
        )

    return all_samples

# `kind` is only passed through; the visible functions never read it.
kind = 'en'
json_data = load_data(pipeline_data_path, kind)

# ensure_ascii=False writes raw Chinese characters, so fix the encoding to
# UTF-8 instead of relying on the locale default.
with open(jsonl_data_path, 'w', encoding='utf-8') as fw:
    json.dump(json_data, fw, ensure_ascii=False)

2024年,第一天,加油,打工人~

相关推荐
加德霍克1 小时前
【机器学习】使用scikit-learn中的KNN包实现对鸢尾花数据集或者自定义数据集的的预测
人工智能·python·学习·机器学习·作业
matlabgoodboy1 小时前
代码编写java代做matlab程序代编Python接单c++代写web系统设计
java·python·matlab
l1x1n01 小时前
No.37 笔记 | Python面向对象编程学习笔记:探索代码世界的奇妙之旅
笔记·python·学习
wanfeng_091 小时前
视频m3u8形式播放 -- python and html
python·html·video·hls·m3u8
阿俊仔(摸鱼版)2 小时前
Python 常用运维模块之OS模块篇
运维·开发语言·python·云服务器
lly_csdn1232 小时前
【Image Captioning】DynRefer
python·深度学习·ai·图像分类·多模态·字幕生成·属性识别
西猫雷婶3 小时前
python学opencv|读取图像(四十一 )使用cv2.add()函数实现各个像素点BGR叠加
开发语言·python·opencv
金融OG3 小时前
99.11 金融难点通俗解释:净资产收益率(ROE)VS投资资本回报率(ROIC)VS总资产收益率(ROA)
大数据·python·算法·机器学习·金融
小唐C++4 小时前
C++小病毒-1.0勒索
开发语言·c++·vscode·python·算法·c#·编辑器
北 染 星 辰4 小时前
Python网络自动化运维---用户交互模块
开发语言·python·自动化