Requirements

Automatically recognize scanned court and housing provident fund center documents (mediation agreements, judgments, rulings, notices) supplied as PDFs or images, extract data in a fixed structure, and compare the results automatically. The extraction schema looks like this:

['标题','诉讼案号','执行案号','公积金',{'原告': ['姓名', '单位', '生日', '身份证号']},{'被告': ['姓名', '单位', '生日', '身份证号']}]

Approach

# 1. Walk the directory tree and collect the source file names;
# 2. Normalize the format: convert PDFs to JPG, detect the image orientation automatically, and rotate pages upright;
# 3. Sort the pages, OCR the whole document into structured text, and write it out as a TXT file;
# 4. Import the text into Label Studio for annotation;
# 5. Export the labeled samples as JSON;
# 6. Convert the JSON exported by Label Studio into the file format exported by doccano;
# 7. Build and train the network;
# 8. Deploy the model for prediction.

Environment

Ubuntu 22.04
Anaconda
PaddlePaddle
Label Studio

Steps

Preprocessing dependencies
import os
from pdf2image import convert_from_path
from paddleclas import PaddleClas
from PIL import Image
import numpy as np
from paddleocr import PaddleOCR
Load the orientation and OCR models
# Load the text-image orientation classification model
clas_engine = PaddleClas(model_name="text_image_orientation", use_gpu=False)
# Load the OCR model
ocr = PaddleOCR(ocr_version='PP-OCRv3')
Walk the directory
def get_file_names(directory):
    file_names = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_names.append(os.path.join(root, file))
    return file_names
Get the file extension
def get_file_extension(file_name):
    _, extension = os.path.splitext(file_name)
    return extension
Convert PDF to JPG
def pdf_to_jpg(pdf_path, output_folder):
    images = convert_from_path(pdf_path)
    file_names = []
    for i, image in enumerate(images):
        jpg_path = f"{output_folder}/page_{i+1}.jpg"
        image.save(jpg_path, "JPEG")
        file_names.append(jpg_path)
    return file_names
Create the output directory if needed
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory '{directory}' created")
    else:
        print(f"Directory '{directory}' already exists")
Detect the image orientation
def model_inference_direction(image) -> int:
    results = clas_engine.predict(image, print_pred=True)
    try:
        # Some images return no prediction; indexing directly would raise, so ignore errors.
        results = list(results)[0][0]
        results = int(results["label_names"][0])
    except Exception as e:
        print("An error occurred:", str(e))
    return results
Recognize the text
def ocr_gettext(image):
    results = ocr.ocr(image)
    text = ""
    for result in results:
        for res in result:
            # print(res[1][0])
            text = text + res[1][0]
    return text

Run

# Directory to process
directory_path = "D:/jerry/code/python/rap/dataset/test/"
# Collect the files in the directory and its subdirectories
file_names = get_file_names(directory_path)
for file_name in file_names:
    suffix_name = get_file_extension(file_name)
    # Case-insensitive extension check
    if suffix_name.lower() == '.pdf'.lower():
        # Convert the PDF into one JPG per page
        jpg_output = file_name.replace(".pdf", "")
        create_directory(jpg_output)
        fnames = pdf_to_jpg(file_name, jpg_output)
        file_names.extend(fnames)
    elif suffix_name.lower() == '.txt'.lower():
        os.remove(file_name)
        file_names.remove(file_name)
    else:
        image = Image.open(file_name)
        # Some images have four channels; force RGB
        image = image.convert("RGB")
        # Detect the rotation angle automatically (clockwise 90° is often misrecognized)
        angle = model_inference_direction(np.asarray(image))
        # The classifier reports a clockwise angle,
        # while Image.rotate() treats positive angles as counter-clockwise
        image = image.rotate(angle)
        # image.show()
        text = ocr_gettext(np.asarray(image))
        text_out = file_name.replace(suffix_name, ".txt")
        with open(text_out, "w", encoding="utf-8") as file:
            file.write(text)

Set up Label Studio, annotate the exported text files, and export the finished annotations as JSON.
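For reference, the converter below relies on only a small part of Label Studio's JSON export for a span-labeling project: each task carries the annotated text under data.value and the labeled spans and relations under annotations[0].result. A hand-written sketch of the shape it reads (IDs, offsets, and text are placeholders, and unrelated export fields are omitted):

# One task from the Label Studio export (illustrative placeholders only).
task = {
    "data": {"value": "民事判决书 ……"},   # the OCR text that was annotated
    "annotations": [{
        "result": [
            {   # a labeled entity span
                "id": "abc123",
                "type": "labels",
                "value": {"start": 0, "end": 5, "labels": ["标题"]},
            },
            {   # a relation drawn between two labeled spans
                "type": "relation",
                "from_id": "abc123",
                "to_id": "def456",
                "labels": ["原告"],
            },
        ]
    }],
}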

Converting Label Studio JSON to doccano JSON

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import json


def append_attrs(data, item, label_id, relation_id):
    mapp = {}
    for anno in data["annotations"][0]["result"]:
        if anno["type"] == "labels":
            label_id += 1
            item["entities"].append(
                {
                    "id": label_id,
                    "label": anno["value"]["labels"][0],
                    "start_offset": anno["value"]["start"],
                    "end_offset": anno["value"]["end"],
                }
            )
            mapp[anno["id"]] = label_id
    for anno in data["annotations"][0]["result"]:
        if anno["type"] == "relation":
            relation_id += 1
            item["relations"].append(
                {
                    "id": relation_id,
                    "from_id": mapp[anno["from_id"]],
                    "to_id": mapp[anno["to_id"]],
                    "type": anno["labels"][0],
                }
            )
    return item, label_id, relation_id


def convert(dataset):
    results = []
    outer_id = 0
    label_id = 0
    relation_id = 0
    for data in dataset:
        labels = data["annotations"][0]["result"]
        outer_id += 1
        item = {"id": outer_id, "text": data["data"]["value"], "entities": [], "relations": []}
        item, label_id, relation_id = append_attrs(data, item, label_id, relation_id)
        results.append(item)
    return results


def do_convert(labelstudio_file, doccano_file):
    with open(labelstudio_file, "r", encoding="utf-8") as infile:
        for content in infile:
            dataset = json.loads(content)
            results = convert(dataset)
            print(results)
    with open(doccano_file, "w", encoding="utf-8") as outfile:
        for item in results:
            outline = json.dumps(item, ensure_ascii=False)
            outfile.write(outline + "\n")


labelstudio_file = "data/project-11-at-2023-06-07-08-12-d7affacb.json"
doccano_file = "data/doccano_ext.json"
do_convert(labelstudio_file, doccano_file)
python3 ./labelstudio2doccano.py
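After the conversion, data/doccano_ext.json holds one record per line in the doccano relation-export style that the next script consumes: the convert() function above maps each task to an id/text pair plus entities (with start_offset/end_offset) and relations (with from_id/to_id/type). A single output line looks roughly like this (values are illustrative placeholders):

{"id": 1, "text": "民事判决书 ……", "entities": [{"id": 1, "label": "标题", "start_offset": 0, "end_offset": 5}, {"id": 2, "label": "姓名", "start_offset": 20, "end_offset": 23}], "relations": [{"id": 1, "from_id": 1, "to_id": 2, "type": "原告"}]}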

Building the training dataset

# coding=utf-8
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import time
from decimal import Decimal

import numpy as np
from utils import convert_cls_examples, convert_ext_examples, set_seed

from paddlenlp.trainer.argparser import strtobool
from paddlenlp.utils.log import logger


def do_convert():
    set_seed(args.seed)

    tic_time = time.time()
    if not os.path.exists(args.doccano_file):
        raise ValueError("Please input the correct path of doccano file.")

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if len(args.splits) != 0 and len(args.splits) != 3:
        raise ValueError("Only []/ len(splits)==3 accepted for splits.")

    def _check_sum(splits):
        return Decimal(str(splits[0])) + Decimal(str(splits[1])) + Decimal(str(splits[2])) == Decimal("1")

    if len(args.splits) == 3 and not _check_sum(args.splits):
        raise ValueError("Please set correct splits, sum of elements in splits should be equal to 1.")

    with open(args.doccano_file, "r", encoding="utf-8") as f:
        raw_examples = f.readlines()

    def _create_ext_examples(
        examples,
        negative_ratio,
        prompt_prefix="情感倾向",
        options=["正向", "负向"],
        separator="##",
        shuffle=False,
        is_train=True,
        schema_lang="ch",
    ):
        entities, relations, aspects = convert_ext_examples(
            examples, negative_ratio, prompt_prefix, options, separator, is_train, schema_lang
        )
        examples = entities + relations + aspects
        if shuffle:
            indexes = np.random.permutation(len(examples))
            examples = [examples[i] for i in indexes]
        return examples

    def _create_cls_examples(examples, prompt_prefix, options, shuffle=False):
        examples = convert_cls_examples(examples, prompt_prefix, options)
        if shuffle:
            indexes = np.random.permutation(len(examples))
            examples = [examples[i] for i in indexes]
        return examples

    def _save_examples(save_dir, file_name, examples):
        count = 0
        save_path = os.path.join(save_dir, file_name)
        with open(save_path, "w", encoding="utf-8") as f:
            for example in examples:
                f.write(json.dumps(example, ensure_ascii=False) + "\n")
                count += 1
        logger.info("Save %d examples to %s." % (count, save_path))

    if len(args.splits) == 0:
        if args.task_type == "ext":
            examples = _create_ext_examples(
                raw_examples,
                args.negative_ratio,
                args.prompt_prefix,
                args.options,
                args.separator,
                args.is_shuffle,
                schema_lang=args.schema_lang,
            )
        else:
            examples = _create_cls_examples(raw_examples, args.prompt_prefix, args.options, args.is_shuffle)
        _save_examples(args.save_dir, "train.txt", examples)
    else:
        if args.is_shuffle:
            indexes = np.random.permutation(len(raw_examples))
            index_list = indexes.tolist()
            raw_examples = [raw_examples[i] for i in indexes]
        else:
            index_list = list(range(len(raw_examples)))

        i1, i2, _ = args.splits
        p1 = int(len(raw_examples) * i1)
        p2 = int(len(raw_examples) * (i1 + i2))

        train_ids = index_list[:p1]
        dev_ids = index_list[p1:p2]
        test_ids = index_list[p2:]

        with open(os.path.join(args.save_dir, "sample_index.json"), "w") as fp:
            maps = {"train_ids": train_ids, "dev_ids": dev_ids, "test_ids": test_ids}
            fp.write(json.dumps(maps))

        if args.task_type == "ext":
            train_examples = _create_ext_examples(
                raw_examples[:p1],
                args.negative_ratio,
                args.prompt_prefix,
                args.options,
                args.separator,
                args.is_shuffle,
                schema_lang=args.schema_lang,
            )
            dev_examples = _create_ext_examples(
                raw_examples[p1:p2],
                -1,
                args.prompt_prefix,
                args.options,
                args.separator,
                is_train=False,
                schema_lang=args.schema_lang,
            )
            test_examples = _create_ext_examples(
                raw_examples[p2:],
                -1,
                args.prompt_prefix,
                args.options,
                args.separator,
                is_train=False,
                schema_lang=args.schema_lang,
            )
        else:
            train_examples = _create_cls_examples(raw_examples[:p1], args.prompt_prefix, args.options)
            dev_examples = _create_cls_examples(raw_examples[p1:p2], args.prompt_prefix, args.options)
            test_examples = _create_cls_examples(raw_examples[p2:], args.prompt_prefix, args.options)

        _save_examples(args.save_dir, "train.txt", train_examples)
        _save_examples(args.save_dir, "dev.txt", dev_examples)
        _save_examples(args.save_dir, "test.txt", test_examples)

    logger.info("Finished! It takes %.2f seconds" % (time.time() - tic_time))


if __name__ == "__main__":
    # yapf: disable
    parser = argparse.ArgumentParser()

    parser.add_argument("--doccano_file", default="./data/doccano_ext.json", type=str, help="The doccano file exported from doccano platform.")
    parser.add_argument("--save_dir", default="./data", type=str, help="The path of data that you wanna save.")
    parser.add_argument("--negative_ratio", default=5, type=int, help="Used only for the extraction task, the ratio of positive and negative samples, number of negtive samples = negative_ratio * number of positive samples")
    parser.add_argument("--splits", default=[0.8, 0.1, 0.1], type=float, nargs="*", help="The ratio of samples in datasets. [0.6, 0.2, 0.2] means 60% samples used for training, 20% for evaluation and 20% for test.")
    parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str, help="Select task type, ext for the extraction task and cls for the classification task, defaults to ext.")
    parser.add_argument("--options", default=["正向", "负向"], type=str, nargs="+", help="Used only for the classification task, the options for classification")
    parser.add_argument("--prompt_prefix", default="情感倾向", type=str, help="Used only for the classification task, the prompt prefix for classification")
    parser.add_argument("--is_shuffle", default="True", type=strtobool, help="Whether to shuffle the labeled dataset, defaults to True.")
    parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization")
    parser.add_argument("--separator", type=str, default='##', help="Used only for entity/aspect-level classification task, separator for entity label and classification label")
    parser.add_argument("--schema_lang", choices=["ch", "en"], default="ch", help="Select the language type for schema.")

    args = parser.parse_args()
    # yapf: enable

    do_convert()
python3 doccano.py --doccano_file ./data/doccano_ext.json --task_type ext --save_dir ./data --splits 0.8 0.2 0 --schema_lang ch
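With --splits 0.8 0.2 0 this writes data/train.txt, data/dev.txt, an empty data/test.txt, and a data/sample_index.json recording the split. Each line is one UIE prompt-style example built by convert_ext_examples in utils.py: entity prompts are the label itself, relation prompts are built as subject text + "的" + relation type, and the negative samples added via --negative_ratio carry an empty result_list. Illustrative lines (text and offsets are placeholders):

{"content": "民事判决书 ……", "result_list": [{"text": "民事判决书", "start": 0, "end": 5}], "prompt": "标题"}
{"content": "民事判决书 ……", "result_list": [], "prompt": "执行案号"}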

Helper functions, saved as utils.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import math
import random
import re
from typing import List, Optional

import numpy as np
import paddle
from tqdm import tqdmfrom paddlenlp.utils.log import loggerdef set_seed(seed):paddle.seed(seed)random.seed(seed)np.random.seed(seed)def create_data_loader(dataset, mode="train", batch_size=1, trans_fn=None):"""Create dataloader.Args:dataset(obj:`paddle.io.Dataset`): Dataset instance.mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly.batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch.trans_fn(obj:`callable`, optional, defaults to `None`): function to convert a data sample to input ids, etc.Returns:dataloader(obj:`paddle.io.DataLoader`): The dataloader which generates batches."""if trans_fn:dataset = dataset.map(trans_fn)shuffle = True if mode == "train" else Falseif mode == "train":sampler = paddle.io.DistributedBatchSampler(dataset=dataset, batch_size=batch_size, shuffle=shuffle)else:sampler = paddle.io.BatchSampler(dataset=dataset, batch_size=batch_size, shuffle=shuffle)dataloader = paddle.io.DataLoader(dataset, batch_sampler=sampler, return_list=True)return dataloaderdef map_offset(ori_offset, offset_mapping):"""map ori offset to token offset"""for index, span in enumerate(offset_mapping):if span[0] <= ori_offset < span[1]:return indexreturn -1def reader(data_path, max_seq_len=512):"""read json"""with open(data_path, "r", encoding="utf-8") as f:for line in f:json_line = json.loads(line)content = json_line["content"].strip()prompt = json_line["prompt"]# Model Input is aslike: [CLS] Prompt [SEP] Content [SEP]# It include three summary tokens.if max_seq_len <= len(prompt) + 3:raise ValueError("The value of max_seq_len is too small, please set a larger value")max_content_len = max_seq_len - len(prompt) - 3if len(content) <= max_content_len:yield json_lineelse:result_list = json_line["result_list"]json_lines = []accumulate = 0while True:cur_result_list = []for result in result_list:if result["end"] - result["start"] > max_content_len:logger.warning("result['end'] - result ['start'] exceeds max_content_len, which will result in no valid instance being returned")if (result["start"] + 1 <= max_content_len < result["end"]and result["end"] - result["start"] <= max_content_len):max_content_len = result["start"]breakcur_content = content[:max_content_len]res_content = content[max_content_len:]while True:if len(result_list) == 0:breakelif result_list[0]["end"] <= max_content_len:if result_list[0]["end"] > 0:cur_result = result_list.pop(0)cur_result_list.append(cur_result)else:cur_result_list = [result for result in result_list]breakelse:breakjson_line = {"content": cur_content, "result_list": cur_result_list, "prompt": prompt}json_lines.append(json_line)for result in result_list:if result["end"] <= 0:breakresult["start"] -= max_content_lenresult["end"] -= max_content_lenaccumulate += max_content_lenmax_content_len = max_seq_len - len(prompt) - 3if len(res_content) == 0:breakelif len(res_content) < max_content_len:json_line = {"content": res_content, "result_list": result_list, "prompt": prompt}json_lines.append(json_line)breakelse:content = res_contentfor json_line in json_lines:yield json_linedef unify_prompt_name(prompt):# The classification labels are shuffled during finetuning, so they need# to be unified during evaluation.if re.search(r"\[.*?\]$", prompt):prompt_prefix = prompt[: prompt.find("[", 1)]cls_options = re.search(r"\[.*?\]$", prompt).group()[1:-1].split(",")cls_options = sorted(list(set(cls_options)))cls_options = ",".join(cls_options)prompt = prompt_prefix + "[" + cls_options + "]"return 
promptreturn promptdef get_relation_type_dict(relation_data, schema_lang="ch"):def compare(a, b, schema_lang="ch"):if schema_lang == "ch":a = a[::-1]b = b[::-1]res = ""for i in range(min(len(a), len(b))):if a[i] == b[i]:res += a[i]else:breakif res == "":return resif schema_lang == "ch" and res[::-1][0] == "的":return res[::-1][1:]elif schema_lang == "en" and res[-3:] == " of":return res[:-3]return ""relation_type_dict = {}added_list = []for i in range(len(relation_data)):added = Falseif relation_data[i][0] not in added_list:for j in range(i + 1, len(relation_data)):match = compare(relation_data[i][0], relation_data[j][0], schema_lang=schema_lang)if match != "":match = unify_prompt_name(match)if relation_data[i][0] not in added_list:added_list.append(relation_data[i][0])relation_type_dict.setdefault(match, []).append(relation_data[i][1])added_list.append(relation_data[j][0])relation_type_dict.setdefault(match, []).append(relation_data[j][1])added = Trueif not added:added_list.append(relation_data[i][0])if schema_lang == "ch":suffix = relation_data[i][0].rsplit("的", 1)[1]suffix = unify_prompt_name(suffix)relation_type = suffixelse:prefix = relation_data[i][0].split(" of ", 1)[0]prefix = unify_prompt_name(prefix)relation_type = prefixrelation_type_dict.setdefault(relation_type, []).append(relation_data[i][1])return relation_type_dictdef add_entity_negative_example(examples, texts, prompts, label_set, negative_ratio):negative_examples = []positive_examples = []with tqdm(total=len(prompts)) as pbar:for i, prompt in enumerate(prompts):redundants = list(set(label_set) ^ set(prompt))redundants.sort()num_positive = len(examples[i])if num_positive != 0:actual_ratio = math.ceil(len(redundants) / num_positive)else:# Set num_positive to 1 for text without positive examplenum_positive, actual_ratio = 1, 0if actual_ratio <= negative_ratio or negative_ratio == -1:idxs = [k for k in range(len(redundants))]else:idxs = random.sample(range(0, len(redundants)), negative_ratio * num_positive)for idx in idxs:negative_result = {"content": texts[i], "result_list": [], "prompt": redundants[idx]}negative_examples.append(negative_result)positive_examples.extend(examples[i])pbar.update(1)return positive_examples, negative_examplesdef add_relation_negative_example(redundants, text, num_positive, ratio):added_example = []rest_example = []if num_positive != 0:actual_ratio = math.ceil(len(redundants) / num_positive)else:# Set num_positive to 1 for text without positive examplenum_positive, actual_ratio = 1, 0all_idxs = [k for k in range(len(redundants))]if actual_ratio <= ratio or ratio == -1:idxs = all_idxsrest_idxs = []else:idxs = random.sample(range(0, len(redundants)), ratio * num_positive)rest_idxs = list(set(all_idxs) ^ set(idxs))for idx in idxs:negative_result = {"content": text, "result_list": [], "prompt": redundants[idx]}added_example.append(negative_result)for rest_idx in rest_idxs:negative_result = {"content": text, "result_list": [], "prompt": redundants[rest_idx]}rest_example.append(negative_result)return added_example, rest_exampledef add_full_negative_example(examples, texts, relation_prompts, predicate_set, subject_goldens, schema_lang="ch"):with tqdm(total=len(relation_prompts)) as pbar:for i, relation_prompt in enumerate(relation_prompts):negative_sample = []for subject in subject_goldens[i]:for predicate in predicate_set:# The relation prompt is constructed as follows:# subject + "的" + predicate -> Chinese# predicate + " of " + subject -> Englishif schema_lang == "ch":prompt = subject + "的" + 
predicateelse:prompt = predicate + " of " + subjectif prompt not in relation_prompt:negative_result = {"content": texts[i], "result_list": [], "prompt": prompt}negative_sample.append(negative_result)examples[i].extend(negative_sample)pbar.update(1)return examplesdef generate_cls_example(text, labels, prompt_prefix, options):random.shuffle(options)cls_options = ",".join(options)prompt = prompt_prefix + "[" + cls_options + "]"result_list = []example = {"content": text, "result_list": result_list, "prompt": prompt}for label in labels:start = prompt.rfind(label) - len(prompt) - 1end = start + len(label)result = {"text": label, "start": start, "end": end}example["result_list"].append(result)return exampledef convert_cls_examples(raw_examples, prompt_prefix="情感倾向", options=["正向", "负向"]):"""Convert labeled data export from doccano for classification task."""examples = []logger.info("Converting doccano data...")with tqdm(total=len(raw_examples)):for line in raw_examples:items = json.loads(line)# Compatible with doccano >= 1.6.2if "data" in items.keys():text, labels = items["data"], items["label"]else:text, labels = items["text"], items["label"]example = generate_cls_example(text, labels, prompt_prefix, options)examples.append(example)return examplesdef convert_ext_examples(raw_examples,negative_ratio,prompt_prefix="情感倾向",options=["正向", "负向"],separator="##",is_train=True,schema_lang="ch",
):"""Convert labeled data export from doccano for extraction and aspect-level classification task."""def _sep_cls_label(label, separator):label_list = label.split(separator)if len(label_list) == 1:return label_list[0], Nonereturn label_list[0], label_list[1:]texts = []entity_examples = []relation_examples = []entity_cls_examples = []entity_prompts = []relation_prompts = []entity_label_set = []entity_name_set = []predicate_set = []subject_goldens = []inverse_relation_list = []predicate_list = []logger.info("Converting doccano data...")with tqdm(total=len(raw_examples)) as pbar:for line in raw_examples:items = json.loads(line)entity_id = 0if "data" in items.keys():relation_mode = Falseif isinstance(items["label"], dict) and "entities" in items["label"].keys():relation_mode = Truetext = items["data"]entities = []relations = []if not relation_mode:# Export file in JSONL format which doccano < 1.7.0# e.g. {"data": "", "label": [ [0, 2, "ORG"], ... ]}for item in items["label"]:entity = {"id": entity_id, "start_offset": item[0], "end_offset": item[1], "label": item[2]}entities.append(entity)entity_id += 1else:# Export file in JSONL format for relation labeling task which doccano < 1.7.0# e.g. {"data": "", "label": {"relations": [ {"id": 0, "start_offset": 0, "end_offset": 6, "label": "ORG"}, ... ], "entities": [ {"id": 0, "from_id": 0, "to_id": 1, "type": "foundedAt"}, ... ]}}entities.extend([entity for entity in items["label"]["entities"]])if "relations" in items["label"].keys():relations.extend([relation for relation in items["label"]["relations"]])else:# Export file in JSONL format which doccano >= 1.7.0# e.g. {"text": "", "label": [ [0, 2, "ORG"], ... ]}if "label" in items.keys():text = items["text"]entities = []for item in items["label"]:entity = {"id": entity_id, "start_offset": item[0], "end_offset": item[1], "label": item[2]}entities.append(entity)entity_id += 1relations = []else:# Export file in JSONL (relation) format# e.g. {"text": "", "relations": [ {"id": 0, "start_offset": 0, "end_offset": 6, "label": "ORG"}, ... ], "entities": [ {"id": 0, "from_id": 0, "to_id": 1, "type": "foundedAt"}, ... 
]}text, relations, entities = items["text"], items["relations"], items["entities"]texts.append(text)entity_example = []entity_prompt = []entity_example_map = {}entity_map = {}  # id to entity namefor entity in entities:entity_name = text[entity["start_offset"] : entity["end_offset"]]entity_map[entity["id"]] = {"name": entity_name,"start": entity["start_offset"],"end": entity["end_offset"],}entity_label, entity_cls_label = _sep_cls_label(entity["label"], separator)# Define the prompt prefix for entity-level classification# xxx + "的" + 情感倾向 -> Chinese# Sentiment classification + " of " + xxx -> Englishif schema_lang == "ch":entity_cls_prompt_prefix = entity_name + "的" + prompt_prefixelse:entity_cls_prompt_prefix = prompt_prefix + " of " + entity_nameif entity_cls_label is not None:entity_cls_example = generate_cls_example(text, entity_cls_label, entity_cls_prompt_prefix, options)entity_cls_examples.append(entity_cls_example)result = {"text": entity_name, "start": entity["start_offset"], "end": entity["end_offset"]}if entity_label not in entity_example_map.keys():entity_example_map[entity_label] = {"content": text,"result_list": [result],"prompt": entity_label,}else:entity_example_map[entity_label]["result_list"].append(result)if entity_label not in entity_label_set:entity_label_set.append(entity_label)if entity_name not in entity_name_set:entity_name_set.append(entity_name)entity_prompt.append(entity_label)for v in entity_example_map.values():entity_example.append(v)entity_examples.append(entity_example)entity_prompts.append(entity_prompt)subject_golden = []  # Golden entity inputsrelation_example = []relation_prompt = []relation_example_map = {}inverse_relation = []predicates = []for relation in relations:predicate = relation["type"]subject_id = relation["from_id"]object_id = relation["to_id"]# The relation prompt is constructed as follows:# subject + "的" + predicate -> Chinese# predicate + " of " + subject -> Englishif schema_lang == "ch":prompt = entity_map[subject_id]["name"] + "的" + predicateinverse_negative = entity_map[object_id]["name"] + "的" + predicateelse:prompt = predicate + " of " + entity_map[subject_id]["name"]inverse_negative = predicate + " of " + entity_map[object_id]["name"]if entity_map[subject_id]["name"] not in subject_golden:subject_golden.append(entity_map[subject_id]["name"])result = {"text": entity_map[object_id]["name"],"start": entity_map[object_id]["start"],"end": entity_map[object_id]["end"],}inverse_relation.append(inverse_negative)predicates.append(predicate)if prompt not in relation_example_map.keys():relation_example_map[prompt] = {"content": text, "result_list": [result], "prompt": prompt}else:relation_example_map[prompt]["result_list"].append(result)if predicate not in predicate_set:predicate_set.append(predicate)relation_prompt.append(prompt)for v in relation_example_map.values():relation_example.append(v)relation_examples.append(relation_example)relation_prompts.append(relation_prompt)subject_goldens.append(subject_golden)inverse_relation_list.append(inverse_relation)predicate_list.append(predicates)pbar.update(1)logger.info("Adding negative samples for first stage prompt...")positive_examples, negative_examples = add_entity_negative_example(entity_examples, texts, entity_prompts, entity_label_set, negative_ratio)if len(positive_examples) == 0:all_entity_examples = []else:all_entity_examples = positive_examples + negative_examplesall_relation_examples = []if len(predicate_set) != 0:logger.info("Adding negative samples for second stage prompt...")if 
is_train:positive_examples = []negative_examples = []per_n_ratio = negative_ratio // 3with tqdm(total=len(texts)) as pbar:for i, text in enumerate(texts):negative_example = []collects = []num_positive = len(relation_examples[i])# 1. inverse_relation_listredundants1 = inverse_relation_list[i]# 2. entity_name_set ^ subject_goldens[i]redundants2 = []if len(predicate_list[i]) != 0:nonentity_list = list(set(entity_name_set) ^ set(subject_goldens[i]))nonentity_list.sort()if schema_lang == "ch":redundants2 = [nonentity + "的" + predicate_list[i][random.randrange(len(predicate_list[i]))]for nonentity in nonentity_list]else:redundants2 = [predicate_list[i][random.randrange(len(predicate_list[i]))] + " of " + nonentityfor nonentity in nonentity_list]# 3. entity_label_set ^ entity_prompts[i]redundants3 = []if len(subject_goldens[i]) != 0:non_ent_label_list = list(set(entity_label_set) ^ set(entity_prompts[i]))non_ent_label_list.sort()if schema_lang == "ch":redundants3 = [subject_goldens[i][random.randrange(len(subject_goldens[i]))] + "的" + non_ent_labelfor non_ent_label in non_ent_label_list]else:redundants3 = [non_ent_label + " of " + subject_goldens[i][random.randrange(len(subject_goldens[i]))]for non_ent_label in non_ent_label_list]redundants_list = [redundants1, redundants2, redundants3]for redundants in redundants_list:added, rest = add_relation_negative_example(redundants,texts[i],num_positive,per_n_ratio,)negative_example.extend(added)collects.extend(rest)num_sup = num_positive * negative_ratio - len(negative_example)if num_sup > 0 and collects:if num_sup > len(collects):idxs = [k for k in range(len(collects))]else:idxs = random.sample(range(0, len(collects)), num_sup)for idx in idxs:negative_example.append(collects[idx])positive_examples.extend(relation_examples[i])negative_examples.extend(negative_example)pbar.update(1)all_relation_examples = positive_examples + negative_exampleselse:relation_examples = add_full_negative_example(relation_examples, texts, relation_prompts, predicate_set, subject_goldens, schema_lang=schema_lang)all_relation_examples = [r for relation_example in relation_examples for r in relation_example]return all_entity_examples, all_relation_examples, entity_cls_examplesdef get_dynamic_max_length(examples, default_max_length: int, dynamic_max_length: List[int]) -> int:"""get max_length by examples which you can change it by examples in batch"""cur_length = len(examples[0]["input_ids"])max_length = default_max_lengthfor max_length_option in sorted(dynamic_max_length):if cur_length <= max_length_option:max_length = max_length_optionbreakreturn max_lengthdef convert_example(example, tokenizer, max_seq_len, multilingual=False, dynamic_max_length: Optional[List[int]] = None
):"""example: {titlepromptcontentresult_list}"""if dynamic_max_length is not None:temp_encoded_inputs = tokenizer(text=[example["prompt"]],text_pair=[example["content"]],truncation=True,max_seq_len=max_seq_len,return_attention_mask=True,return_position_ids=True,return_dict=False,return_offsets_mapping=True,)max_length = get_dynamic_max_length(examples=temp_encoded_inputs, default_max_length=max_seq_len, dynamic_max_length=dynamic_max_length)# always pad to max_lengthencoded_inputs = tokenizer(text=[example["prompt"]],text_pair=[example["content"]],truncation=True,max_seq_len=max_length,pad_to_max_seq_len=True,return_attention_mask=True,return_position_ids=True,return_dict=False,return_offsets_mapping=True,)start_ids = [0.0 for x in range(max_length)]end_ids = [0.0 for x in range(max_length)]else:encoded_inputs = tokenizer(text=[example["prompt"]],text_pair=[example["content"]],truncation=True,max_seq_len=max_seq_len,pad_to_max_seq_len=True,return_attention_mask=True,return_position_ids=True,return_dict=False,return_offsets_mapping=True,)start_ids = [0.0 for x in range(max_seq_len)]end_ids = [0.0 for x in range(max_seq_len)]encoded_inputs = encoded_inputs[0]offset_mapping = [list(x) for x in encoded_inputs["offset_mapping"]]bias = 0for index in range(1, len(offset_mapping)):mapping = offset_mapping[index]if mapping[0] == 0 and mapping[1] == 0 and bias == 0:bias = offset_mapping[index - 1][1] + 1  # Includes [SEP] tokenif mapping[0] == 0 and mapping[1] == 0:continueoffset_mapping[index][0] += biasoffset_mapping[index][1] += biasfor item in example["result_list"]:start = map_offset(item["start"] + bias, offset_mapping)end = map_offset(item["end"] - 1 + bias, offset_mapping)start_ids[start] = 1.0end_ids[end] = 1.0if multilingual:tokenized_output = {"input_ids": encoded_inputs["input_ids"],"position_ids": encoded_inputs["position_ids"],"start_positions": start_ids,"end_positions": end_ids,}else:tokenized_output = {"input_ids": encoded_inputs["input_ids"],"token_type_ids": encoded_inputs["token_type_ids"],"position_ids": encoded_inputs["position_ids"],"attention_mask": encoded_inputs["attention_mask"],"start_positions": start_ids,"end_positions": end_ids,}return tokenized_output

Training

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from dataclasses import dataclass, field
from functools import partial
from typing import List, Optional

import paddle
from utils import convert_example, reader

from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import load_dataset
from paddlenlp.metrics import SpanEvaluator
from paddlenlp.trainer import (
    CompressionArguments,
    PdArgumentParser,
    Trainer,
    get_last_checkpoint,
)
from paddlenlp.transformers import UIE, UIEM, AutoTokenizer, export_model
from paddlenlp.utils.log import logger


@dataclass
class DataArguments:"""Arguments pertaining to what data we are going to input our model for training and eval.Using `PdArgumentParser` we can turn this class into argparse arguments to be able tospecify them on the command line."""train_path: str = field(default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."})dev_path: str = field(default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."})max_seq_length: Optional[int] = field(default=512,metadata={"help": "The maximum total input sequence length after tokenization. Sequences longer ""than this will be truncated, sequences shorter will be padded."},)dynamic_max_length: Optional[List[int]] = field(default=None,metadata={"help": "dynamic max length from batch, it can be array of length, eg: 16 32 64 128"},)@dataclass
class ModelArguments:"""Arguments pertaining to which model/config/tokenizer we are going to fine-tune from."""model_name_or_path: Optional[str] = field(default="uie-base",metadata={"help": "Path to pretrained model, such as 'uie-base', 'uie-tiny', ""'uie-medium', 'uie-mini', 'uie-micro', 'uie-nano', 'uie-base-en', ""'uie-m-base', 'uie-m-large', or finetuned model path."},)export_model_dir: Optional[str] = field(default=None,metadata={"help": "Path to directory to store the exported inference model."},)multilingual: bool = field(default=False, metadata={"help": "Whether the model is a multilingual model."})def main():parser = PdArgumentParser((ModelArguments, DataArguments, CompressionArguments))model_args, data_args, training_args = parser.parse_args_into_dataclasses()if model_args.model_name_or_path in ["uie-m-base", "uie-m-large"]:model_args.multilingual = True# Log model and data configtraining_args.print_config(model_args, "Model")training_args.print_config(data_args, "Data")paddle.set_device(training_args.device)# Log on each process the small summary:logger.warning(f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")# Detecting last checkpoint.last_checkpoint = Noneif os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:last_checkpoint = get_last_checkpoint(training_args.output_dir)if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:raise ValueError(f"Output directory ({training_args.output_dir}) already exists and is not empty. ""Use --overwrite_output_dir to overcome.")elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:logger.info(f"Checkpoint detected, resuming training at {last_checkpoint}. 
To avoid this behavior, change ""the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)if model_args.multilingual:model = UIEM.from_pretrained(model_args.model_name_or_path)else:model = UIE.from_pretrained(model_args.model_name_or_path)train_ds = load_dataset(reader, data_path=data_args.train_path, max_seq_len=data_args.max_seq_length, lazy=False)dev_ds = load_dataset(reader, data_path=data_args.dev_path, max_seq_len=data_args.max_seq_length, lazy=False)trans_fn = partial(convert_example,tokenizer=tokenizer,max_seq_len=data_args.max_seq_length,multilingual=model_args.multilingual,dynamic_max_length=data_args.dynamic_max_length,)train_ds = train_ds.map(trans_fn)dev_ds = dev_ds.map(trans_fn)if training_args.device == "npu":data_collator = DataCollatorWithPadding(tokenizer, padding="longest")else:data_collator = DataCollatorWithPadding(tokenizer)criterion = paddle.nn.BCELoss()def uie_loss_func(outputs, labels):start_ids, end_ids = labelsstart_prob, end_prob = outputsstart_ids = paddle.cast(start_ids, "float32")end_ids = paddle.cast(end_ids, "float32")loss_start = criterion(start_prob, start_ids)loss_end = criterion(end_prob, end_ids)loss = (loss_start + loss_end) / 2.0return lossdef compute_metrics(p):metric = SpanEvaluator()start_prob, end_prob = p.predictionsstart_ids, end_ids = p.label_idsmetric.reset()num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids)metric.update(num_correct, num_infer, num_label)precision, recall, f1 = metric.accumulate()metric.reset()return {"precision": precision, "recall": recall, "f1": f1}trainer = Trainer(model=model,criterion=uie_loss_func,args=training_args,data_collator=data_collator,train_dataset=train_ds if training_args.do_train or training_args.do_compress else None,eval_dataset=dev_ds if training_args.do_eval or training_args.do_compress else None,tokenizer=tokenizer,compute_metrics=compute_metrics,)trainer.optimizer = paddle.optimizer.AdamW(learning_rate=training_args.learning_rate, parameters=model.parameters())checkpoint = Noneif training_args.resume_from_checkpoint is not None:checkpoint = training_args.resume_from_checkpointelif last_checkpoint is not None:checkpoint = last_checkpoint# Trainingif training_args.do_train:train_result = trainer.train(resume_from_checkpoint=checkpoint)metrics = train_result.metricstrainer.save_model()trainer.log_metrics("train", metrics)trainer.save_metrics("train", metrics)trainer.save_state()# Evaluate and tests modelif training_args.do_eval:eval_metrics = trainer.evaluate()trainer.log_metrics("eval", eval_metrics)# export inference modelif training_args.do_export:# You can also load from certain checkpoint# trainer.load_state_dict_from_checkpoint("/path/to/checkpoint/")if training_args.device == "npu":# npu will transform int64 to int32 for internal calculation.# To reduce useless transformation, we feed int32 inputs.input_spec_dtype = "int32"else:input_spec_dtype = "int64"if model_args.multilingual:input_spec = [paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="input_ids"),paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="position_ids"),]else:input_spec = [paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="input_ids"),paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="token_type_ids"),paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, 
name="position_ids"),paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="attention_mask"),]if model_args.export_model_dir is None:model_args.export_model_dir = os.path.join(training_args.output_dir, "export")export_model(model=trainer.model, input_spec=input_spec, path=model_args.export_model_dir)trainer.tokenizer.save_pretrained(model_args.export_model_dir)if training_args.do_compress:@paddle.no_grad()def custom_evaluate(self, model, data_loader):metric = SpanEvaluator()model.eval()metric.reset()for batch in data_loader:if model_args.multilingual:logits = model(input_ids=batch["input_ids"], position_ids=batch["position_ids"])else:logits = model(input_ids=batch["input_ids"],token_type_ids=batch["token_type_ids"],position_ids=batch["position_ids"],attention_mask=batch["attention_mask"],)start_prob, end_prob = logitsstart_ids, end_ids = batch["start_positions"], batch["end_positions"]num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids)metric.update(num_correct, num_infer, num_label)precision, recall, f1 = metric.accumulate()logger.info("f1: %s, precision: %s, recall: %s" % (f1, precision, f1))model.train()return f1trainer.compress(custom_evaluate=custom_evaluate)if __name__ == "__main__":main()
## Train on CPU
python finetune.py --device cpu --logging_steps 10 --save_steps 100 --eval_steps 100 --seed 42 --model_name_or_path uie-base --output_dir model --train_path data/train.txt --dev_path data/dev.txt --max_seq_length 512 --per_device_eval_batch_size 16 --per_device_train_batch_size 16 --num_train_epochs 20 --learning_rate 1e-5 --label_names "start_positions" "end_positions" --do_train --do_eval --do_export --export_model_dir model --overwrite_output_dir --disable_tqdm True --metric_for_best_model eval_f1 --load_best_model_at_end True --save_total_limit 1
## Multi-GPU training
python3 -u -m paddle.distributed.launch --gpus "0,1" finetune.py --device gpu --logging_steps 10 --save_steps 100 --eval_steps 100 --seed 42 --model_name_or_path uie-base --output_dir model --train_path data/train.txt --dev_path data/dev.txt --max_seq_length 512 --per_device_eval_batch_size 7 --per_device_train_batch_size 7 --num_train_epochs 20 --learning_rate 1e-5 --label_names "start_positions" "end_positions" --do_train --do_eval --do_export --export_model_dir model --overwrite_output_dir --disable_tqdm True --metric_for_best_model eval_f1 --load_best_model_at_end True --save_total_limit 1

Deploying the model for prediction

import gradio as gr
from paddlenlp import Taskflow

schema = ['时间', '选手', '赛事名称']
ie = Taskflow('information_extraction', schema=schema, task_path='model')

# UGC: Define the inference fn() for your models
def model_inference(schema, text):
    ie.set_schema(eval(schema))
    res = ie(text)
    json_out = {"text": text, "result": res}
    return json_out

def clear_all():
    return None, None, None

with gr.Blocks() as demo:
    gr.Markdown("关系抽取")
    with gr.Column(scale=1, min_width=100):
        schema = gr.Textbox(placeholder="['时间', '选手', '赛事名称']", label="输入结构:", lines=2)
        text = gr.Textbox(
            placeholder="2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!",
            label="输入内容:",
            lines=2,
        )
        with gr.Row():
            btn1 = gr.Button("Clear")
            btn2 = gr.Button("Submit")
        json_out = gr.JSON(label="Information Extraction Output")
    btn1.click(fn=clear_all, inputs=None, outputs=[schema, text, json_out])
    btn2.click(fn=model_inference, inputs=[schema, text], outputs=[json_out])

demo.launch(server_name='192.168.1.111', share=True, server_port=7006)

## Ruling (裁定书): ['标题',{'申请执行人': ['姓名', '生于', '身份证号']},{'被执行人': ['姓名', '生于', '身份证号']},'公积金']
## Mediation agreement (调解书): ['法院',{'原告': ['姓名', '生于', '身份证号']},{'被告': ['公司']},'公积金']
## Test schema (测试): ['标题','诉讼案号','执行案号','公积金',{'原告': ['姓名', '单位', '生日', '身份证号']},{'被告': ['姓名', '单位', '生日', '身份证号']}]
python3 ./app.py
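To sanity-check the finetuned model from a plain script instead of the Gradio page, the same Taskflow call can be used directly. A minimal sketch, assuming the exported model is in ./model and reusing the test schema from the comments above (the input string is a made-up placeholder, not a real document):

from pprint import pprint
from paddlenlp import Taskflow

schema = ['标题', '诉讼案号', '执行案号', '公积金',
          {'原告': ['姓名', '单位', '生日', '身份证号']},
          {'被告': ['姓名', '单位', '生日', '身份证号']}]
ie = Taskflow('information_extraction', schema=schema, task_path='model')

# In practice the input is the OCR text produced in the preprocessing step.
pprint(ie("民事调解书 (2023)X民初0001号 ……"))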
