import json
import os
import sys
import zipfile
def read_by_lines(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Every line has its leading and trailing whitespace (including the
    line terminator) removed with ``str.strip``. Blank lines are kept and
    show up as empty strings.

    Args:
        path: Path of the file to read.

    Returns:
        list: One stripped string per line, in file order. An empty file
        yields an empty list.
    """
    with open(path, "r", encoding="utf8") as infile:
        return [line.strip() for line in infile]
def write_by_lines(path, data):
    """Write each string in *data* to *path* as one UTF-8 encoded line.

    The file is created or truncated. A real newline character is appended
    to every item, so the result can be read back with a line-oriented
    reader.

    Args:
        path: Destination file path.
        data: Iterable of strings; each item becomes one line.

    Raises:
        TypeError: If an item of *data* cannot be concatenated with a string.
    """
    with open(path, "w", encoding="utf8") as outfile:
        # The generator keeps memory flat for large datasets.
        outfile.writelines(d + "\n" for d in data)
def data_process(path, model="trigger", is_predict=False):
    """Convert a JSON-lines event file into sequence-labeling rows.

    Each non-blank line of *path* is parsed as a JSON object with a
    ``"text"`` field. The text is lower-cased and split into single
    characters; space, newline and tab characters are replaced by ``","``.
    Characters (and labels) are joined with the ``"\\002"`` separator.

    Args:
        path: Path of the UTF-8 JSON-lines input file.
        model: ``"trigger"`` labels each event's trigger span with its
            ``event_type``; ``"role"`` labels each argument span with its
            ``role``. Any other value produces no data rows.
        is_predict: When true, rows contain only the text column and no
            labels are built.

    Returns:
        list: A header row (``"text_a"`` or ``"text_a\\tlabel"``) followed
        by one row per kept record. In training mode, records whose
        ``event_list`` is missing or empty are skipped.

    Raises:
        KeyError: If a record lacks a field required by the chosen mode.
        IndexError: If a span runs past the end of the text.
    """

    def label_data(data, start, length, _type):
        """Tag ``data[start:start + length]`` in place with B-/I- labels."""
        for i in range(start, start + length):
            # The first position of a span is "B-", the rest are "I-".
            prefix = "B-" if i == start else "I-"
            data[i] = "{}{}".format(prefix, _type)
        return data

    output = ["text_a"] if is_predict else ["text_a\tlabel"]
    with open(path, encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            d_json = json.loads(line)
            # One token per character; whitespace becomes a comma so the
            # token never collides with the column/row separators.
            text_a = [
                "," if t in (" ", "\n", "\t") else t
                for t in d_json["text"].lower()
            ]
            if is_predict:
                output.append("\002".join(text_a))
                continue

            events = d_json.get("event_list") or []
            if not events:
                continue

            # Collect (start index, mention text, tag) for every span.
            if model == "trigger":
                spans = [
                    (
                        event["trigger_start_index"],
                        event["trigger"],
                        event["event_type"],
                    )
                    for event in events
                ]
            elif model == "role":
                spans = [
                    (arg["argument_start_index"], arg["argument"], arg["role"])
                    for event in events
                    for arg in event["arguments"]
                ]
            else:
                continue

            # Everything starts as "O" (outside any span).
            labels = ["O"] * len(text_a)
            for start, mention, tag in spans:
                label_data(labels, start, len(mention), tag)

            # NOTE(review): one row is emitted per sentence, with the spans
            # of all its events merged into a single label sequence; confirm
            # that the role model does not expect one row per event.
            output.append(
                "{}\t{}".format("\002".join(text_a), "\002".join(labels))
            )
    return output
def schema_process(path, model="trigger"):
    """Build the indexed BIO tag vocabulary from an event schema file.

    Each non-blank line of *path* is parsed as a JSON object. For every
    distinct type, a ``B-<type>`` and an ``I-<type>`` label are added in
    first-seen order, and a final ``"O"`` label is appended.

    Args:
        path: Path of the JSON-lines schema file.
        model: ``"trigger"`` takes types from each record's
            ``"event_type"``; ``"role"`` takes them from the ``"role"`` of
            every entry in ``"role_list"``. Any other value yields only
            the ``"O"`` label.

    Returns:
        list: Strings of the form ``"<index>\\t<label>"``, with indices
        starting at 0.

    Raises:
        KeyError: If a record lacks a field required by the chosen mode.
    """
    labels = []  # event-type or role labels, in first-seen order
    seen = set()  # types already added, for O(1) duplicate checks

    def label_add(_type):
        """Append the B-/I- pair for *_type* unless it is already present."""
        if _type not in seen:
            seen.add(_type)
            labels.extend(["B-{}".format(_type), "I-{}".format(_type)])

    for line in read_by_lines(path):
        if not line:
            continue
        d_json = json.loads(line)
        if model == "trigger":
            label_add(d_json["event_type"])
        elif model == "role":
            for role in d_json["role_list"]:
                label_add(role["role"])

    labels.append("O")
    return [
        "{}\t{}".format(index, label) for index, label in enumerate(labels)
    ]
# Script section: unpack the DuEE 1.0 archive, derive the tag dictionaries
# from the event schema, and convert the train/dev/test splits into TSV
# files for the trigger and role models.
conf_dir = "./conf/DuEE1.0"
schema_path = './datasets/DuEE_1_0/event_schema.json'
tags_trigger_path = "{}/trigger_tag.dict".format(conf_dir)
tags_role_path = "{}/role_tag.dict".format(conf_dir)

# Unpack the archive before anything reads from ./datasets/. Extraction is
# skipped when the schema file already exists, so reruns do not overwrite
# the data.
if not os.path.exists(schema_path):
    with zipfile.ZipFile("DuEE_1_0.zip") as archive:
        archive.extractall("./datasets/")

# Preview the first schema record.
print(read_by_lines(schema_path)[0])

# Tag dictionaries for the trigger and role models.
os.makedirs(conf_dir, exist_ok=True)
tags_trigger = schema_process(schema_path, "trigger")
write_by_lines(tags_trigger_path, tags_trigger)
tags_role = schema_process(schema_path, "role")
write_by_lines(tags_role_path, tags_role)

# Output directories for the converted datasets.
data_dir = "./datasets/DuEE1.0"
trigger_save_dir = "{}/trigger".format(data_dir)
role_save_dir = "{}/role".format(data_dir)
os.makedirs(trigger_save_dir, exist_ok=True)
os.makedirs(role_save_dir, exist_ok=True)

# Preview the first training record.
print(read_by_lines('./datasets/DuEE_1_0/train.json')[0])

# Trigger-model datasets.
train_tri = data_process("./datasets/DuEE_1_0/train.json", "trigger")
write_by_lines("{}/train.tsv".format(trigger_save_dir), train_tri)
dev_tri = data_process("./datasets/DuEE_1_0/dev.json", "trigger")
write_by_lines("{}/dev.tsv".format(trigger_save_dir), dev_tri)
test_tri = data_process("./datasets/DuEE_1_0/test.json", "trigger")
write_by_lines("{}/test.tsv".format(trigger_save_dir), test_tri)

# Role-model datasets.
train_role = data_process("./datasets/DuEE_1_0/train.json", "role")
write_by_lines("{}/train.tsv".format(role_save_dir), train_role)
dev_role = data_process("./datasets/DuEE_1_0/dev.json", "role")
write_by_lines("{}/dev.tsv".format(role_save_dir), dev_role)
test_role = data_process("./datasets/DuEE_1_0/test.json", "role")
write_by_lines("{}/test.tsv".format(role_save_dir), test_role)