
至于摘要生成过程中存在的幻觉问题,如内在的无中生有,外在的无中生有,有一篇很好的综述:Survey of Hallucination in Natural Language Generation:https://arxiv.org/pdf/2202.03629.pdf


1.1 分词还是不分词?


1.2 词表是以字为基础还是词为基础?统一标准


例如,以字为词表的模型(典型的就是中文BART)在transformers中生成的摘要是这种形式的:['我 是 生 成 的 摘 要']

而以词为词表的模型(典型的就是中文T5 Pegasus)生成的摘要是这种形式的:['我 是 生成摘要']


# Remove every space the tokenizer left between tokens so that predictions
# and references share one plain, unsegmented text format.
decoded_preds = [text.replace(" ", "") for text in decoded_preds]
decoded_labels = [text.replace(" ", "") for text in decoded_labels]


比如:1234举起手啊--->分字就是1 2 3 4 举 起 手 啊;按词表就是1234 举 起 手 啊

1.3 计算指标



# Word-level preparation: strip the existing spaces, re-segment the text with
# jieba, then join the resulting words with a single space.
decoded_preds = [
    " ".join(jieba.cut(text.replace(" ", ""))) for text in decoded_preds
]
decoded_labels = [
    " ".join(jieba.cut(text.replace(" ", ""))) for text in decoded_labels
]


['我 是 生成摘要']

['我 是 参考 的 摘要']

二者格式相同,直接计算即可:result = rouge.get_scores(decoded_preds, decoded_labels, avg=True)


# Character-level preparation: strip the existing spaces, then place one
# space between every pair of adjacent characters.
decoded_preds = [" ".join(text.replace(" ", "")) for text in decoded_preds]
decoded_labels = [" ".join(text.replace(" ", "")) for text in decoded_labels]

# avg=True requests a single averaged score set for the whole list.
result = rouge.get_scores(decoded_preds, decoded_labels, avg=True)


['我 是 生 成 摘 要']

['我 是 参 考 的 摘 要']

有空格是因为rouge库计算需要空格隔开,如果你用lawrouge库就不用了,直接调用result = rouge.get_scores(decoded_preds, decoded_labels, avg=True)

1.4 计算差异


from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
from rouge import Rouge


def compute_metrics(eval_pred):
    """Compute averaged ROUGE-1/2/L F-scores and sentence-level BLEU.

    Relies on the module-level ``tokenizer`` for decoding.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            Positions equal to -100 are treated as padding.

    Returns:
        A dict with ``rouge-1``, ``rouge-2``, ``rouge-l`` and ``bleu``
        (all scaled by 100) plus ``gen_len``, the mean number of
        non-padding tokens per generated sequence.
    """
    predictions, labels = eval_pred

    # -100 cannot be decoded, so map it to the pad id on both sides.
    # NOTE(review): assumes `predictions` is a single id array, not a tuple.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Character level: one space between every character.
    decoded_preds = [" ".join(pred.replace(" ", "")) for pred in decoded_preds]
    decoded_labels = [" ".join(label.replace(" ", "")) for label in decoded_labels]
    # Word level alternative (segment with jieba):
    # decoded_preds = [" ".join(jieba.cut(pred.replace(" ", ""))) for pred in decoded_preds]
    # decoded_labels = [" ".join(jieba.cut(label.replace(" ", ""))) for label in decoded_labels]

    rouge = Rouge()
    # Built once instead of once per sample.
    smoothing = SmoothingFunction().method1

    # gen_len describes the *generated* sequences, so count the non-padding
    # tokens of the predictions (the original counted the labels instead).
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]

    total = len(decoded_labels)
    rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
    for decoded_label, decoded_pred in zip(decoded_labels, decoded_preds):
        scores = rouge.get_scores(hyps=decoded_pred, refs=decoded_label)[0]
        rouge_1 += scores['rouge-1']['f']
        rouge_2 += scores['rouge-2']['f']
        rouge_l += scores['rouge-l']['f']
        bleu += sentence_bleu(
            references=[decoded_label.split(' ')],
            hypothesis=decoded_pred.split(' '),
            smoothing_function=smoothing,
        )

    bleu /= total
    rouge_1 /= total
    rouge_2 /= total
    rouge_l /= total

    result = {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l}
    print(result)

    # Check whether the library's own average agrees with the manual one.
    result2 = rouge.get_scores(decoded_preds, decoded_labels, avg=True)
    print(result2)
    print(bleu)

    # result = {'rouge-1': result['rouge-1']['f'], 'rouge-2': result['rouge-2']['f'], 'rouge-l': result['rouge-l']['f']}
    result = {key: value * 100 for key, value in result.items()}
    result["gen_len"] = np.mean(prediction_lens)
    result["bleu"] = bleu * 100
    return result


流程比较复杂,例如基于QA的,需要分别训练Question generation与Answer generation模型,模型的训练好坏直接影响效果。先简单介绍QA与QG的训练,其中QA基于BERT,QG基于BART,这里用的是英文的SQuAD-1.1,中文方法是一样的,使用CMRC2018的SQuAD格式数据,模型换成中文模型就好了。不同论文的实现方式不一样,我只说一个最最简单的方法。


2.2 数据加载



"""SQUAD: The Stanford Question Answering Dataset."""import jsonimport datasets
from datasets.tasks import QuestionAnsweringExtractive

logger = datasets.logging.get_logger(__name__)

_CITATION = """\
@article{2016arXiv160605250R,author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev},Konstantin and {Liang}, Percy},title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}",journal = {arXiv e-prints},year = 2016,eid = {arXiv:1606.05250},pages = {arXiv:1606.05250},
archivePrefix = {arXiv},eprint = {1606.05250},
"""

_DESCRIPTION = """\
Stanford Question Answering Dataset (SQuAD) is a reading comprehension \
dataset, consisting of questions posed by crowdworkers on a set of Wikipedia \
articles, where the answer to every question is a segment of text, or span, \
from the corresponding reading passage, or the question might be unanswerable.
"""

# Local directory holding the SQuAD 1.1 JSON files.
_URL = r"E:\Project\NLP\dataset\SQuAD-1.1 datasets/"
# Split key -> full path of that split's JSON file.
_URLS = {split: f"{_URL}{split}-v1.1.json" for split in ("train", "dev")}


class SquadConfig(datasets.BuilderConfig):
    """BuilderConfig for SQUAD."""

    def __init__(self, **kwargs):
        """Create the config.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


class Squad(datasets.GeneratorBasedBuilder):
    """SQUAD: The Stanford Question Answering Dataset. Version 1.1."""

    BUILDER_CONFIGS = [
        SquadConfig(
            name="plain_text",
            version=datasets.Version("1.0.0", ""),
            description="Plain text",
        ),
    ]

    def _info(self):
        """Describe the features, homepage, citation and task template."""
        answer_features = datasets.features.Sequence(
            {
                "text": datasets.Value("string"),
                "answer_start": datasets.Value("int32"),
            }
        )
        features = datasets.Features(
            {
                "id": datasets.Value("string"),
                "title": datasets.Value("string"),
                "context": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answers": answer_features,
            }
        )
        qa_template = QuestionAnsweringExtractive(
            question_column="question",
            context_column="context",
            answers_column="answers",
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            # No default supervised_keys (as we have to pass both question
            # and context as input).
            supervised_keys=None,
            homepage="https://rajpurkar.github.io/SQuAD-explorer/",
            citation=_CITATION,
            task_templates=[qa_template],
        )

    def _split_generators(self, dl_manager):
        """Resolve the files in ``_URLS`` and map them to train/validation."""
        downloaded_files = dl_manager.download_and_extract(_URLS)
        split_sources = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "dev"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={"filepath": downloaded_files[file_key]},
            )
            for split_name, file_key in split_sources
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        logger.info("generating examples from = %s", filepath)
        key = 0
        with open(filepath, encoding="utf-8") as f:
            squad = json.load(f)
            for article in squad["data"]:
                title = article.get("title", "")
                for paragraph in article["paragraphs"]:
                    # do not strip leading blank spaces GH-2585
                    context = paragraph["context"]
                    for qa in paragraph["qas"]:
                        qa_answers = qa["answers"]
                        answer_starts = [item["answer_start"] for item in qa_answers]
                        answers = [item["text"] for item in qa_answers]
                        # Features currently used are "context", "question", and "answers".
                        # Others are extracted here for the ease of future expansions.
                        example = {
                            "title": title,
                            "context": context,
                            "question": qa["question"],
                            "id": qa["id"],
                            "answers": {
                                "answer_start": answer_starts,
                                "text": answers,
                            },
                        }
                        yield key, example
                        key += 1

2.3 QA训练


# coding=utf-8
import json
import numpy as np
import torch
from datasets import Dataset,load_dataset
from transformers.data.metrics.squad_metrics import compute_exact, compute_f1

# Load the SQuAD splits through the local dataset script.
squad = load_dataset('./squad.py')
squad2 = squad["validation"][0]
# Fetch the validation dataset
# valid_datasets = squad["validation"].flatten().data
# # Contexts of the validation dataset
# valid_contents = valid_datasets[2]
# # Gold answers of the validation dataset
# valid_answers = valid_datasets[4]
# # Gold answer start offsets of the validation dataset
# valid_answers_start = valid_datasets[5]

# Quick manual check of compute_f1 on a partial answer.
xx = compute_f1("left Graz and severed all relations with his family", "left Graz and severed")
print(squad)

from transformers import (
    AutoTokenizer,
    default_data_collator,
    BertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    BertTokenizer,
)

tokenizer = AutoTokenizer.from_pretrained(r"E:\Project\NLP\bert-base-uncased")


def preprocess_function(examples):
    """Tokenize question/context pairs and label each answer's token span.

    Written for the training set, where every question has a single
    reference answer: only the first entry of each ``answers`` record is
    used.

    Args:
        examples: A batch with ``question``, ``context`` and ``answers``
            columns.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions`` token indices. A span
        that is not fully inside the context is labelled ``(0, 0)``.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    all_offsets = inputs.pop("offset_mapping")
    all_answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(all_offsets):
        first_answer = all_answers[i]
        start_char = first_answer["answer_start"][0]
        end_char = start_char + len(first_answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token whose sequence id is 1 (the context).
        cursor = 0
        while sequence_ids[cursor] != 1:
            cursor += 1
        context_start = cursor
        while sequence_ids[cursor] == 1:
            cursor += 1
        context_end = cursor - 1

        # If the answer is not fully inside the context, label it (0, 0).
        starts_after_answer = offsets[context_start][0] > end_char
        ends_before_answer = offsets[context_end][1] < start_char
        if starts_after_answer or ends_before_answer:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token starting at or before start_char.
        cursor = context_start
        while cursor <= context_end and offsets[cursor][0] <= start_char:
            cursor += 1
        start_positions.append(cursor - 1)

        # Walk backward to the first token ending at or after end_char.
        cursor = context_end
        while cursor >= context_start and offsets[cursor][1] >= end_char:
            cursor -= 1
        end_positions.append(cursor + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names,
)
# train_x = squad["train"].map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
# valid_x = squad["validation"].map(preprocess_function, batched=True, remove_columns=squad["validation"].column_names)
data_collator = default_data_collator
model = BertForQuestionAnswering.from_pretrained(r"E:\Project\NLP\long-document\bert-base-uncased")
training_args = TrainingArguments(
    # fp16 = True,
    output_dir="./QA_results",
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    # eval_steps=2,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_dir="logs",
    logging_strategy="steps",
    save_total_limit=3,
    logging_steps=1,
    num_train_epochs=4,
    weight_decay=0.01,
    gradient_accumulation_steps=8,
)


def compute_metrics(eval_pred):
    """Compute token-level F1 and exact-match scores on the validation split.

    Gold and predicted answers are compared as space-joined token ids taken
    from ``tokenized_squad["validation"]``. Start/end positions are token
    indices, not character offsets.

    Args:
        eval_pred: A ``(predictions, label_ids)`` pair. ``predictions[0]``
            and ``predictions[1]`` are the start and end scores; the argmax
            over axis 1 gives the predicted token index per example.
            ``label_ids`` is unused because gold spans are read from the
            tokenized dataset.

    Returns:
        A dict with ``f1_score`` and ``exact_score``, both scaled by 100.
    """
    predictions, label_ids = eval_pred
    answer_start = np.argmax(predictions[0], axis=1)
    answer_end = np.argmax(predictions[1], axis=1)

    data = tokenized_squad["validation"]
    gold_answers = []
    pred_answers = []
    for idx, example in enumerate(data):
        input_ids = example["input_ids"]

        # Gold span as a space-joined string of token ids.
        label_start = example["start_positions"]
        label_end = example["end_positions"]
        gold_answers.append(
            " ".join(str(token) for token in input_ids[label_start:label_end + 1])
        )

        pred_start = int(answer_start[idx])
        pred_end = int(answer_end[idx])
        # An end before the start is not a valid span; clamp it to one token.
        # (The original tested `start < end`, which collapsed every valid
        # span instead.)
        if pred_end < pred_start:
            pred_end = pred_start
        # Slice from the *predicted* start. The original sliced from the gold
        # start, which leaked the reference into the prediction.
        pred_answers.append(
            " ".join(str(token) for token in input_ids[pred_start:pred_end + 1])
        )

    # Average F1 and exact-match over all examples.
    f1_score = 0
    exact_score = 0
    for gold_answer, pred_answer in zip(gold_answers, pred_answers):
        f1_score += compute_f1(gold_answer, pred_answer)
        exact_score += compute_exact(gold_answer, pred_answer)
    f1_score /= len(gold_answers)
    exact_score /= len(gold_answers)
    f1_score *= 100
    exact_score *= 100

    result = {'f1_score': f1_score, 'exact_score': exact_score}
    return result


# NOTE(review): the original snippet ends inside this call and never invokes
# trainer.train(); only the missing closing parenthesis is restored here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

2.4 QG训练


import json
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, BartTokenizer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, \BartForConditionalGeneration, Seq2SeqTrainer
from transformers.data.metrics.squad_metrics import compute_f1# x = ['Which NFL team represented the AFC at Super Bowl 50?', 'Which NFL team represented the NFC at Super Bowl 50?', 'Where did Super Bowl 50 take place?', 'Which NFL team won Super Bowl 50?', 'What color was used to emphasize the 50th anniversary of the Super Bowl?', 'What was the theme of Super Bowl 50?', 'What day was the game played on?', 'What is the AFC short for?', 'What was the theme of Super Bowl 50?', 'What does AFC stand for?', 'What day was the Super Bowl played on?', 'Who won Super Bowl 50?', 'What venue did Super Bowl 50 take place in?', 'What city did Super Bowl 50 take place in?', 'If Roman numerals were used, what would Super Bowl 50 have been called?', 'Super Bowl 50 decided the NFL champion for what season?', 'What year did the Denver Broncos secure a Super Bowl title for the third time?', 'What city did Super Bowl 50 take place in?', 'What stadium did Super Bowl 50 take place in?', 'What was the final score of Super Bowl 50? ']
# xx = ['Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League', 'Super Bowl 50 was an American football game to determine the champion of the National Football 
League', 'Super Bowl 50 was an American football game to determine the champion of the National Football League']max_input_length=512
# Locations of the SQuAD 1.1 JSON files and of the QG training output.
train_path = r'E:\Project\NLP\dataset\SQuAD-1.1 datasets\train-v1.1.json'
dev_path = r'E:\Project\NLP\dataset\SQuAD-1.1 datasets\dev-v1.1.json'
output_dir = r'E:\Project\NLP\dataset\SQuAD-1.1 datasets\QG_results'


def data_preprocess(path):
    """Turn a SQuAD-format JSON file into question-generation pairs.

    For every question, the first answer is marked inside its paragraph by
    wrapping it in ``<hl>`` tags; the question itself becomes the label.

    Args:
        path: Path of a SQuAD-format JSON file.

    Returns:
        A ``(contexts, labels)`` pair of equally long lists: highlighted
        contexts and their questions.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        squad = json.load(handle)

    contexts = []
    labels = []
    for article in squad["data"]:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']
            for qa in paragraph['qas']:
                first_answer = qa['answers'][0]
                answer_text = first_answer['text']
                span_start = first_answer['answer_start']
                span_end = span_start + len(answer_text)
                # Re-insert the answer text between two <hl> markers.
                highlighted = (
                    passage[:span_start]
                    + '<hl>' + answer_text + '<hl>'
                    + passage[span_end:]
                )
                contexts.append(highlighted)
                labels.append(qa['question'])  # + '</s>'
    return contexts, labels


train_contexts, train_labels = data_preprocess(train_path)
# NOTE(review): `tokenizer_path`, `model_path` and `max_target_length` are
# used below but never defined in this snippet. Define them before running;
# their intended values cannot be determined from here.
tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
# Register the answer-highlight marker so it is kept as a single token.
special_tokens_dict = {'additional_special_tokens': ['<hl>']}
tokenizer.add_special_tokens(special_tokens_dict)

# Build the datasets that preprocess_function consumes. The original used
# `train_dataset` / `dev_dataset` without ever creating them. The column
# names match the keys read in preprocess_function.
train_dataset = Dataset.from_dict({"contexts": train_contexts, "labels": train_labels})
dev_contexts, dev_labels = data_preprocess(dev_path)
dev_dataset = Dataset.from_dict({"contexts": dev_contexts, "labels": dev_labels})


def preprocess_function(examples):
    """Tokenize highlighted contexts as inputs and questions as labels.

    Args:
        examples: A batch with ``contexts`` and ``labels`` columns.

    Returns:
        The tokenized inputs, with the question token ids under ``labels``.
    """
    inputs = list(examples["contexts"])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["labels"], max_length=max_target_length, truncation=True)
        # title_len_1 = tokenizer(examples["len_title_1"], max_length=max_target_length, truncation=True)
        # title_len_all = tokenizer(examples["len_title_all"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_train_dataset = train_dataset.map(
    preprocess_function, batched=True, remove_columns=train_dataset.column_names
)
tokenized_dev_dataset = dev_dataset.map(
    preprocess_function, batched=True, remove_columns=dev_dataset.column_names
)

batch_size = 1
args = Seq2SeqTrainingArguments(
    fp16=True,
    output_dir=output_dir,
    num_train_epochs=5,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=1,  # demo
    per_device_eval_batch_size=1,
    learning_rate=1e-04,
    warmup_steps=100,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=1,
    save_total_limit=3,
    evaluation_strategy="epoch",
    generation_max_length=max_target_length,
    generation_num_beams=4,
    # remove_unused_columns=False,
)

model = BartForConditionalGeneration.from_pretrained(model_path)
# '<hl>' enlarged the tokenizer vocabulary, so the embedding matrix must grow
# with it or the new token id would index past the end of the embeddings.
model.resize_token_embeddings(len(tokenizer))
# model, list_en, list_de = create_student_by_copying_alternating_layers(model, 'trian.pth', 12, 3)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


def compute_metrics(eval_pred):
    """Average token-level F1 between generated and reference questions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            Positions equal to -100 are treated as padding.

    Returns:
        A dict with ``f1_score`` scaled by 100.
    """
    predictions, labels = eval_pred
    # -100 cannot be decoded, so map it to the pad id on both sides.
    # NOTE(review): assumes `predictions` is a single id array, not a tuple.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    f1_score = 0
    for label, pred in zip(decoded_labels, decoded_preds):
        f1_score += compute_f1(label, pred)
    f1_score /= len(decoded_preds)
    f1_score *= 100

    result = {'f1_score': f1_score}
    return result


trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    # train_dataset=dataset_train,
    eval_dataset=tokenized_dev_dataset,
    # eval_dataset=dataset_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
train_result = trainer.train()

2.5 使用


3 BERTScore


from bert_score import score

# data: candidate sentences and the references they are scored against,
# paired by position.
cands = [
    '天天干家务烦死了',
    '难受死了啊',
]
refs = [
    '这也完全不相干啊',
    '真的难受死了啊',
]

# score() output is unpacked into P, R and F1; only the mean of F1 is
# reported below.
P, R, F1 = score(cands, refs, lang="zh", verbose=True)

print(f"System level F1 score: {F1.mean():.3f}")


