通用信息提取数据预处理

train_data='./datasets/duuie'
output_folder='./datasets/duuie_pre'
ignore_datasets=["DUEE", "DUEE_FIN_LITE"]
schema_folder='./datasets/seen_schema'

# 对CCKS2022 竞赛数据进行预处理
import shutil

# shutil.copytree(train_data,output_folder)

import os

life_folder = os.path.join(output_folder, "DUIE_LIFE_SPO")
org_folder = os.path.join(output_folder, "DUIE_ORG_SPO")

print(life_folder,org_folder)

import json

def load_jsonlines_file(filename):
    return [json.loads(line) for line in open(filename, encoding="utf8")]

life_train_instances = load_jsonlines_file(f"{life_folder}/train.json")
org_train_instances = load_jsonlines_file(f"{org_folder}/train.json")

for i in range(27695,27698):
    print(life_train_instances[i],'|',org_train_instances[i])

class RecordSchema:
    def __init__(self, type_list, role_list, type_role_dict):
        self.type_list = type_list
        self.role_list = role_list
        self.type_role_dict = type_role_dict
    def __repr__(self) -> str:
        repr_list = [f"Type: {self.type_list}\n", f"Role: {self.role_list}\n", f"Map: {self.type_role_dict}"]
        return "\n".join(repr_list)
    @staticmethod
    def get_empty_schema():
        return RecordSchema(type_list=list(), role_list=list(), type_role_dict=dict())
    @staticmethod
    def read_from_file(filename):
        lines = open(filename, encoding="utf8").readlines()
        type_list = json.loads(lines[0])# 类型
        role_list = json.loads(lines[1]) # 角色
        type_role_dict = json.loads(lines[2])#类型-角色
        return RecordSchema(type_list, role_list, type_role_dict)
    def write_to_file(self, filename):
        with open(filename, "w", encoding="utf8") as output:
            # 用于将Python对象编码(序列化)为JSON格式的字符串。设置ensure_ascii=False参数
            # 会告诉json.dumps()函数不要转义非ASCII字符
            output.write(json.dumps(self.type_list, ensure_ascii=False) + "\n")
            output.write(json.dumps(self.role_list, ensure_ascii=False) + "\n")
            output.write(json.dumps(self.type_role_dict, ensure_ascii=False) + "\n")

RecordSchema.read_from_file(f"{life_folder}/record.schema")

life_relation = RecordSchema.read_from_file(f"{life_folder}/record.schema").role_list

org_relation = RecordSchema.read_from_file(f"{org_folder}/record.schema").role_list

from collections import defaultdict

instance_dict = defaultdict(list)

for instance in life_train_instances + org_train_instances:
    instance_dict[instance["text"]] += [instance]

a=[i for i in life_train_instances for j in org_train_instances if i['text']==j['text']]

b=[i for i in org_train_instances for j in a if i['text']==j['text']]

for i in range(3):
    print(a[i]['relation'],'|',b[i]['relation'])

dict_1={1:2,3:4}
for i in dict_1:#相当于字典的keys()
    print(i)

from typing import Tuple, List, Dict

def merge_instance(instance_list):
    def all_equal(_x):#判断是否全相同
        for __x in _x:
            if __x != _x[0]:
                return False
        return True
    def entity_key(_x):
        return (tuple(_x["offset"]), _x["type"])
    def relation_key(_x):
        return (
            tuple(_x["type"]),
            tuple(_x["args"][0]["offset"]),
            _x["args"][0]["type"],
            tuple(_x["args"][1]["offset"]),
            _x["args"][1]["type"],
        )

    def event_key(_x):
        return (tuple(_x["offset"]), _x["type"])
    assert all_equal([x["text"] for x in instance_list])
    element_dict = {
        "entity": dict(),
        "relation": dict(),
        "event": dict(),
    }
    instance_id_list = list()
    for x in instance_list:
        instance_id_list += [x["id"]]
        for entity in x.get("entity", list()):
            element_dict["entity"][entity_key(entity)] = entity
        for relation in x.get("relation", list()):
            element_dict["relation"][relation_key(relation)] = relation
        for event in x.get("event", list()):
            element_dict["event"][event_key(event)] = event

    return {
        "id": "-".join(instance_id_list),
        "text": instance_list[0]["text"],
        "tokens": instance_list[0]["tokens"],
        "entity": list(element_dict["entity"].values()),
        "relation": list(element_dict["relation"].values()),
        "event": list(element_dict["event"].values()),
    }

 for text in instance_dict:
    instance_dict[text] = merge_instance(instance_dict[text])

for i in range(800,802):
    print(list(instance_dict.values())[i]['relation'])

import copy

with open(f"{life_folder}/train.json", "w") as output:
    for instance in instance_dict.values():
        new_instance = copy.deepcopy(instance)
        new_instance["relation"] = list(filter(lambda x: x["type"] in life_relation, instance["relation"]))
        output.write(json.dumps(new_instance) + "\n")

 with open(f"{org_folder}/train.json", "w") as output:
    for instance in instance_dict.values():
        new_instance = copy.deepcopy(instance)
        new_instance["relation"] = list(filter(lambda x: x["type"] in org_relation, instance["relation"]))
        output.write(json.dumps(new_instance) + "\n")

a_instances = load_jsonlines_file(f"{life_folder}/train.json")
b_instances = load_jsonlines_file(f"{org_folder}/train.json")

print(len(a_instances),len(b_instances))

import yaml

def load_definition_schema_file(filename):
    return yaml.load(open(filename, encoding="utf8"), Loader=yaml.FullLoader)

aa = load_definition_schema_file(os.path.join(schema_folder,'体育竞赛.yaml'))

mm=list()
for i in aa['事件'].values():
    mm+=i["参数"]   
mm=list(set(mm))

[x for x in aa['事件']]

aa['事件']['退役']["参数"].keys()

aaa={1:2,3:4}
for k,v in aaa.items():
    print(k,v)

def dump_schema(output_folder, schema_dict):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for schema_name, schema in schema_dict.items():
        schema_file = f"{output_folder}/{schema_name}.schema"
        with open(schema_file, "w", encoding="utf8") as output:
            for element in schema:
                output.write(json.dumps(element, ensure_ascii=False) + "\n")

def dump_event_schema(event_map, output_folder):
    role_list = list()
    for roles in event_map.values():
        role_list += roles["参数"]
    rols_list = list(set(role_list))
    type_list = list(event_map.keys())
    type_role_map = {event_type: list(event_map[event_type]["参数"].keys()) for event_type in event_map}
    dump_schema(
        output_folder=output_folder,
        schema_dict={
            "entity": [[], [], {}],
            "relation": [[], [], {}],
            "event": [type_list, rols_list, type_role_map],
            "record": [type_list, rols_list, type_role_map],
        },
    )

def filter_event_in_instance(instances,required_event_types):
    """Filter events in the instance, keep event mentions with `required_event_types`
    过滤实例中的事件,只保留需要的事件类别的事件标注
    """
    new_instances = list()
    for instance in instances:
        new_instance = copy.deepcopy(instance)
        new_instance["event"] = list(filter(lambda x: x["type"] in required_event_types, new_instance["event"]))
        new_instances += [new_instance]
    return new_instances

def dump_instances(instances, output_filename):
    with open(output_filename, "w", encoding="utf8") as output:
        for instance in instances:
            output.write(json.dumps(instance, ensure_ascii=False) + "\n")

def filter_event(data_folder, event_types, output_folder):
    """Keep event with `event_types` in `data_folder` save to `output_folder`
    过滤 `data_folder` 中的事件,只保留 `event_types` 类型事件保存到 `output_folder`"""
    dump_event_schema(event_types, output_folder)
    for split in ["train", "val"]:
        filename = os.path.join(data_folder, f"{split}.json")
        instances = [json.loads(line.strip()) for line in open(filename, encoding="utf8")]
        new_instances = filter_event_in_instance(instances, required_event_types=event_types)
        dump_instances(new_instances, os.path.join(output_folder, f"{split}.json"))

# 对事件数据进行预处理,过滤除 `灾害意外` 和 `体育竞赛` 外的事件标注
for schema in ["灾害意外", "体育竞赛"]:
    print(f"Building {schema} dataset ...")
    duee_folder = os.path.join(output_folder, "DUEE")
    schema_file = os.path.join(schema_folder, f"{schema}.yaml")
    output_folder2 = os.path.join(output_folder, schema)
    schema = load_definition_schema_file(schema_file)
    filter_event(
        duee_folder,
        schema["事件"],
        output_folder2,
    )

ty_instances = load_jsonlines_file(f"{output_folder}/体育竞赛/train.json")
zh_instances = load_jsonlines_file(f"{output_folder}/灾害意外/train.json")

print(len(ty_instances),len(zh_instances))

for i in range(11508,11608):
    print(ty_instances[i],'|',zh_instances[i])

bb=load_definition_schema_file(os.path.join(schema_folder, "金融信息.yaml"))

for i in bb['事件'].keys():
    print(i)

mm=list()
mm+=bb['事件']['中标']["参数"]   
mm=list(set(mm))

bb["事件"]['中标']["参数"] .keys()

for schema in ["金融信息"]:
    print(f"Building {schema} dataset ...")
    duee_fin_folder = os.path.join(output_folder, "DUEE_FIN_LITE")
    schema_file = os.path.join(schema_folder, f"{schema}.yaml")
    output_folder2 = os.path.join(output_folder, schema)
    schema = load_definition_schema_file(schema_file)
    # 依据不同事件类别将多事件抽取分割成多个单事件类型抽取
    # Separate multi-type extraction to multiple single-type extraction
    for event_type in schema["事件"]:
        filter_event(
           duee_fin_folder,
           {event_type: schema["事件"][event_type]},
            output_folder2 + "_" + event_type,
        )

vv=load_jsonlines_file(f"{output_folder}/DUEE_FIN_LITE/train.json")

zb_instances = load_jsonlines_file(f"{output_folder}/金融信息_中标/train.json")
zy_instances = load_jsonlines_file(f"{output_folder}/金融信息_质押/train.json")

print(len(zb_instances),len(zy_instances))

for i in range(6985,7015):
    print(zb_instances[i],'|',zy_instances[i])

def annonote_graph(
    entities: List[Dict] = [],
    relations: List[Dict] = [],
    events: List[Dict] = []):
    spot_dict = dict()
    asoc_dict = defaultdict(list)
    # 将实体关系事件转换为点关联图
    def add_spot(spot):
        spot_key = (tuple(spot["offset"]), spot["type"])
        spot_dict[spot_key] = spot
    def add_asoc(spot, asoc, tail):
        spot_key = (tuple(spot["offset"]), spot["type"])
        asoc_dict[spot_key] += [(tuple(tail["offset"]), tail["text"], asoc)]
    for entity in entities:
        add_spot(spot=entity)
    for relation in relations:
        add_spot(spot=relation["args"][0])
        add_asoc(spot=relation["args"][0], asoc=relation["type"], tail=relation["args"][1])
    for event in events:
        add_spot(spot=event)
        for argument in event["args"]:
            add_asoc(spot=event, asoc=argument["type"], tail=argument)
    spot_asoc_instance = list()
    for spot_key in sorted(spot_dict.keys()):
        offset, label = spot_key
        if len(spot_dict[spot_key]["offset"]) == 0:
            continue
        spot_instance = {
            "span": spot_dict[spot_key]["text"],
            "label": label,
            "asoc": list(),
        }
        for tail_offset, tail_text, asoc in sorted(asoc_dict.get(spot_key, [])):
            if len(tail_offset) == 0:
                continue
            spot_instance["asoc"] += [(asoc, tail_text)]
        spot_asoc_instance += [spot_instance]
    spot_labels = set([label for _, label in spot_dict.keys()])
    asoc_labels = set()
    for _, asoc_list in asoc_dict.items():
        for _, _, asoc in asoc_list:
            asoc_labels.add(asoc)
    return spot_labels, asoc_labels, spot_asoc_instance

def add_spot_asoc_to_single_file(filename):
    instances = [json.loads(line) for line in open(filename, encoding="utf8")]
    print(f"Add spot asoc to {filename} ...")
    with open(filename, "w", encoding="utf8") as output:
        for instance in instances:
            spots, asocs, spot_asoc_instance = annonote_graph(
                entities=instance["entity"],#实体
                relations=instance["relation"],#关系
                events=instance["event"],#事件
            )
            # 为对象添加spot_asoc
            instance["spot_asoc"] = spot_asoc_instance
            # 为对象添加spot
            instance["spot"] = list(spots)
            # 为对象添加asoc
            instance["asoc"] = list(asocs)
            output.write(json.dumps(instance, ensure_ascii=False) + "\n")

ff = os.path.join(output_folder,'金融信息_企业破产',"train.json")

ff_instances = [json.loads(line) for line in open(ff, encoding="utf8")]

for i in range(1046,1050):
    print(ff_instances[i])

a,b,yyj=annonote_graph( entities=ff_instances[11000]["entity"],
                relations=ff_instances[11000]["relation"],
                events=ff_instances[11000]["event"],)

data_folder=output_folder

def merge_schema(schema_list: List[RecordSchema]):
    type_set = set()
    role_set = set()
    type_role_dict = defaultdict(list)
    for schema in schema_list:
        for type_name in schema.type_list:
            type_set.add(type_name)
        for role_name in schema.role_list:
            role_set.add(role_name)
        for type_name in schema.type_role_dict:
            type_role_dict[type_name] += schema.type_role_dict[type_name]
    for type_name in type_role_dict:
        type_role_dict[type_name] = list(set(type_role_dict[type_name]))
    return RecordSchema(
        type_list=list(type_set),
        role_list=list(role_set),
        type_role_dict=type_role_dict,
    )

def convert_duuie_to_spotasoc(data_folder, ignore_datasets):
    schema_list = list()
    for task_folder in os.listdir(data_folder):#过滤无效
        if task_folder in ignore_datasets:
            continue
        if not os.path.isdir(os.path.join(data_folder, task_folder)):#过滤非文件夹
            continue
        print(f"Add spot asoc to {task_folder} ...")
        # 读取单任务的 Schema
        task_schema_file = os.path.join(data_folder, task_folder, "record.schema")
        # 向单任务数据中添加 Spot Asoc 标注
        add_spot_asoc_to_single_file(os.path.join(data_folder, task_folder, "train.json"))
        add_spot_asoc_to_single_file(os.path.join(data_folder, task_folder, "val.json"))
        record_schema = RecordSchema.read_from_file(task_schema_file)
        schema_list += [record_schema]
    # 融合不同任务的 Schema
    multi_schema = merge_schema(schema_list)
    multi_schema.write_to_file(os.path.join(data_folder, "record.schema"))

convert_duuie_to_spotasoc(output_folder,ignore_datasets)

 

 

 

 

 

 

 

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/pingmian/25943.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

项目:基于httplib/消息队列负载均衡式在线OJ

文章目录 写在前面关于组件开源仓库和项目上线其他文档说明项目亮点 使用技术和环境项目宏观结构模块实现compiler模块runner模块compile_run模块compile_server模块 基于MVC结构的OJ服务什么是MVC?用户请求服务路由功能Model模块view模块Control模块 写在前面 关于…

linux安装jdk + docker+dockercompose+aliyunACR

下载安装包 链接:https://pan.baidu.com/s/1AyFvPA5qwy4IxfZoTQohrQ 提取码:6666 安装jdk jdk-8u411-linux-x64.tar.gz 链接:https://pan.baidu.com/s/1BZ7J4L5PY-9nuQyxBMDGTA 提取码:6666 1、解压jdk tar -xvf jdk-8u411-li…

算法:读取redis中指令查询的键

1,redis查询的代码: 只读取双引号奇数行的数 34575) "7193139"34576) "0"34577) "7078990"34578) "0"34579) "7296242"34580) "0"34581) "1650126"34582) "0"…

如何克隆笔记本电脑上的硬盘?

笔记本电脑的信息存储在硬盘上,一旦硬盘发生故障,数据很容易丢失。克隆技术使我们能够将一个硬盘上的数据精确复制到另一个硬盘上,然后将其用作备份。此外,如果我们决定升级到更大容量或固态硬盘,克隆技术还允许我们将…

linux中xterm窗口怎么调整字体大小

需求:打开的xterm窗口字体比较小,怎么才能调整字体大小,打开的大写: 解决方法: 在home目录下搞一个设置文件 .Xresource,里面内容如下 然后把设置文件添加到 .tcshrc 文件中生效 这样重新打开的xterm字…

真北游记|三江交汇,碧海苍梧,端午去梧州吃龟苓膏

准备 t-14:高铁抢票(A) t-14:订行程(B)酒店(C) T-2:准备水、零食 T-1:物质准备:衣服、纸巾、毛巾、雨伞🌂、拖鞋、口罩😷(D&#xff0…

Helm在线部署Longhorn(1.6.0版本)分布式存储

环境依赖: k8s (版本大于等于v1.21版本)、helm工具 安装前准备 k8s worker 节点都需要执行 yum -y --setopttsflagsnoscripts install iscsi-initiator-utils echo "InitiatorName$(/sbin/iscsi-iname)" > /etc/iscsi/initiatorname.iscsi systemctl …

8.动态内存申请

知识点一&#xff1a;malloc函数和free函数 malloc函数 头文件&#xff1a;#include<stdlib.h>#include<string.h> // 用于memset函数的头文件 void *malloc(unsigned int num_ size); 形参: num_ size需要申请空间大小的字节数。 返回值&#xff1a;成功:返回…

ARMxy赋能温室环境自动化调控

智慧农业正以其独特的魅力描绘着未来的轮廓。作为这一变革的中坚力量&#xff0c;ARMxy工业计算机凭借其高性能、低功耗及高度灵活性&#xff0c;正逐步成为智能温室控制、精准灌溉及作物生长监测领域的核心引擎。 智能温室的智慧大脑 位于某地的现代农业园区&#xff0c;一座…

力扣118. 杨辉三角

给定一个非负整数 numRows&#xff0c;生成「杨辉三角」的前 numRows 行。在「杨辉三角」中&#xff0c;每个数是它左上方和右上方的数的和。 示例 1: 输入: numRows 5 输出: [[1],[1,1],[1,2,1],[1,3,3,1],[1,4,6,4,1]] 示例 2: 输入: numRows 1 输出: [[1…

绝密级的高考考卷大揭秘

今天既是端午节也是高考的最后一日。我相信各位考生在这个举国欢庆的日子都有着属于自己的惬意和喜悦。今日起&#xff0c;我将重启我的情报输出&#xff0c;但是在这个欢乐的小长假中就先来一个有趣的小文案。 一、出题者 一份高考试卷安全无虞地出现在考生面前&#xff0c;…

Python图像处理入门学习——基于霍夫变换的车道线和路沿检测

文章目录 前言一、实验内容与方法二、视频的导入、拆分、合成2.1 视频时长读取2.2 视频的拆分2.3 视频的合成 三、路沿检测3.1 路沿检测算法整体框架3.2 尝试3.3 图像处理->边缘检测(原理)3.4 Canny算子边缘检测(原理)3.5 Canny算子边缘检测(实现)3.5.1 高斯滤波3.5.2 图像转…

软件游戏steam_api.dll丢失的解决方法,总结5种有效的方法

在玩电脑游戏时&#xff0c;我们经常会遇到一些错误提示&#xff0c;其中之一就是“游戏缺少steam_api.dll”。这个问题可能让很多玩家感到困惑和烦恼。那么&#xff0c;究竟是什么原因导致游戏缺少steam_api.dll呢&#xff1f;又该如何解决这个问题呢&#xff1f;本文将为大家…

ISO 19115-2:2019 第6章 获取和处理元数据

6 获取和处理元数据 6.1 获取和处理要求的元数据 ISO 19115-1 确定了描述数字地理资源所需的元数据。本文件扩展了 ISO 19115-1 中确定的元数据,并确定了描述地理资源获取和处理所需的附加元数据。 6.2 获取和处理元数据包及其依赖关系 ISO 地理信息系列标准使用一个或多个…

C# 类或结构体的成员

字段&#xff08;Field&#xff09;属性&#xff08;Property&#xff09;方法&#xff08;Method&#xff09;构造函数&#xff08;Constructor&#xff09;索引器&#xff08;Indexer&#xff09;事件&#xff08;Event&#xff09;嵌套类型&#xff08;Nested Type&#xff…

反射学习记

Java 中的反射是什么意思&#xff1f;有哪些应用场景&#xff1f; 每个类都有⼀个 Class 对象&#xff0c;包含了与类有关的信息。当编译⼀个新类时&#xff0c;会产生一个同名的 .class 文件&#xff0c;该⽂件 内容保存着 Class 对象。类加载相当于 Class 对象的加载&a…

2024.6.10刷题记录

目录 一、881. 救生艇 贪心-排序-双指针 二、8. 字符串转换整数 (atoi) 1.模拟-未考虑溢出 2.考虑溢出问题 三、9. 回文数 1.双指针-字符串 2.数字翻转 3.数字翻转-只翻转一半 一、881. 救生艇 贪心-排序-双指针 class Solution:def numRescueBoats(self, people: Li…

IoC容器加载流程

IoC容器加载流程 流程主要分为两个阶段&#xff1a; Bean的配置解析阶段&#xff1a;将Bean的配置信息转换成BeanDefinitionBean的创建阶段&#xff1a;根据BeanDefinition进行Bean创建 1. 加载流程 1.1 配置解析 加载流程&#xff1a; 资源文件定位&#xff1a;一般是在…

接口测试时, 数据Mock为何如此重要?

一、为什么要mock 工作中遇到以下问题&#xff0c;我们可以使用mock解决&#xff1a; 1、无法控制第三方系统某接口的返回&#xff0c;返回的数据不满足要求 2、某依赖系统还未开发完成&#xff0c;就需要对被测系统进行测试 3、有些系统不支持重复请求&#xff0c;或有访问…

Qt安装时出现无法下载存档,环境配置,main中自定义类编译不过问题

1. Qt安装时出现无法下载存档 进入Qt安装程序exe所在的文件目录&#xff0c;一般在下载文件夹&#xff0c;右键打开cmd。cmd输入&#xff1a;对应的exe镜像提速。 .\qt-online-installer-windows-x64-4.8.0.exe --mirror https://mirrors.cloud.tencent.com/qt/ 2. 环境配置 …