Deep Learning Enabled Semantic Communication Systems (DeepSC): preprocess_text.py

What the script implements:

1. Normalize each input string into a clean, well-formed text string (handle whitespace and special symbols);
2. Filter the text so that only sentences within a specified length range are kept;
3. Add start and end tokens;
4. Keep or remove selected punctuation marks;
5. Build the vocabulary and count how many times each word appears in the sentences.

The overall effect is sketched right after this list; the full script (preprocess_text.py) follows.
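
To make the goal concrete, here is roughly what one raw Europarl line turns into after these steps (my own illustration, not output quoted from the script; the integer indices depend on the vocabulary the script builds later):

raw line:   'Resumption of the session - 2000!'
normalized: 'resumption of the session !'
tokens:     ['<START>', 'resumption', 'of', 'the', 'session', '!', '<END>']
indices:    one integer per token, looked up in token_to_idx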

import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
from tqdm import tqdm
from parameters import para_config

# parser = argparse.ArgumentParser()
# parser.add_argument('--input-data-dir', default='europarl/en', type=str)
# parser.add_argument('--output-train-dir', default='europarl/train_data.pkl', type=str)
# parser.add_argument('--output-test-dir', default='europarl/test_data.pkl', type=str)
# parser.add_argument('--output-vocab', default='europarl/vocab.json', type=str)

SPECIAL_TOKENS = {
    '<PAD>': 0,
    '<START>': 1,
    '<END>': 2,
    '<UNK>': 3,
}
# Strip accents: drop combining marks after NFD-normalizing the unicode string.
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

# Normalize an input string into a clean, standard text string.
def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add white space before !, . and ? so they are separated from the preceding word
    s = re.sub(r'([!.?])', r' \1', s)
    # replace every character that is not a letter, '.', '!' or '?' with a space
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    # collapse consecutive whitespace into a single space
    s = re.sub(r'\s+', r' ', s)
    # change to lower case
    s = s.lower()
    return s
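
# Illustration (mine, not part of the original script): what normalize_string is
# expected to return for one raw Europarl-style line, given the regexes above:
#   normalize_string('<P>Resumption of the session - 2000!')
#   -> 'resumption of the session !'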
# Cut the cleaned text: keep only sentences whose length falls inside the given range.
# This filters out sentences that are too short or too long for later processing.
def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    cutted_lines = list()
    for line in cleaned:
        # number of words in the current sentence
        length = len(line.split())
        # keep the sentence only if MIN_LENGTH < length < MAX_LENGTH
        if length > MIN_LENGTH and length < MAX_LENGTH:
            line = [word for word in line.split()]
            cutted_lines.append(' '.join(line))
    return cutted_lines

def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)

def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    # strip leading/trailing whitespace and split the file into sentences on '\n'
    sentences = raw_data.strip().split('\n')
    # normalize every sentence: remove XML tags, add spaces around punctuation, etc.
    raw_data_input = [normalize_string(data) for data in sentences]
    # keep only sentences whose length lies in the allowed range
    raw_data_input = cutted_data(raw_data_input)
    fop.close()
    return raw_data_input
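
# Illustration (mine): with the defaults MIN_LENGTH=4 and MAX_LENGTH=30, a sentence
# is kept only if it has 5..29 words, since both comparisons are strict:
#   cutted_data(['a b c d', 'a b c d e'])  ->  ['a b c d e']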
# Split a string s into a list of (string) tokens on the given delimiter, optionally
# keeping or removing specific punctuation marks and adding start/end tokens.
def tokenize(s, delim=' ', add_start_token=True, add_end_token=True,
             punct_to_keep=None, punct_to_remove=None):
    """
    Tokenize a sequence, converting a string s into a list of (string) tokens by
    splitting on the specified delimiter. Optionally keep or remove certain
    punctuation marks and add start and end tokens.
    """
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))

    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')

    tokens = s.split(delim)
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens
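
# Illustration (mine): tokenizing one normalized sentence with the settings used in
# main() below. Removing the final '.' leaves a trailing space, so split() also yields
# an empty-string token, which is why '' shows up in the vocabulary later.
#   tokenize('we shall proceed .', punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
#   -> ['<START>', 'we', 'shall', 'proceed', '', '<END>']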
# Build the vocabulary: count how often each token occurs in all sentences and assign
# an index to every token that appears at least min_token_count times.
def build_vocab(sequences, token_to_idx={}, min_token_count=1, delim=' ',
                punct_to_keep=None, punct_to_remove=None):
    # occurrence count for every token
    token_to_count = {}
    for seq in sequences:
        # tokenize each sentence without adding <START>/<END>
        seq_tokens = tokenize(seq, delim=delim, punct_to_keep=punct_to_keep,
                              punct_to_remove=punct_to_remove,
                              add_start_token=False, add_end_token=False)
        # count how often each token occurs over all sentences
        for token in seq_tokens:
            if token not in token_to_count:
                token_to_count[token] = 0
            token_to_count[token] += 1

    # only tokens occurring at least min_token_count times get an index in token_to_idx
    for token, count in sorted(token_to_count.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    # e.g. {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3, '': 4, 'a': 5,
    #       'abstentions': 6, 'accordance': 7, 'add': 8, 'adopted': 9, ...}
    return token_to_idx

def encode(seq_tokens, token_to_idx, allow_unk=False):
    seq_idx = []
    for token in seq_tokens:
        if token not in token_to_idx:
            if allow_unk:
                token = '<UNK>'
            else:
                raise KeyError('Token "%s" not in vocab' % token)
        seq_idx.append(token_to_idx[token])
    return seq_idx

def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
    tokens = []
    for idx in seq_idx:
        tokens.append(idx_to_token[idx])
        if stop_at_end and tokens[-1] == '<END>':
            break
    if delim is None:
        return tokens
    else:
        return delim.join(tokens)

def main(args):
    data_dir = '/home/hx301/data/'
    # args.input_data_dir = args.input_data_dir
    # args.output_train_dir = args.output_train_dir
    # args.output_test_dir = args.output_test_dir
    # args.output_vocab = args.output_vocab
    print(args.input_data_dir)

    sentences = []
    print('Preprocess Raw Text')
    for fn in tqdm(os.listdir(args.input_data_dir)):
        if not fn.endswith('.txt'): continue  # only process .txt files
        process_sentences = process(os.path.join(args.input_data_dir, fn))
        # append the sentences of this file to the global sentence list
        sentences += process_sentences

    # remove duplicate sentences: count each sentence in dict a, then keep only its keys
    a = {}
    for set in sentences:
        if set not in a:
            a[set] = 0
        a[set] += 1
    sentences = list(a.keys())
    print('Number of sentences: {}'.format(len(sentences)))

    print('Build Vocab')
    token_to_idx = build_vocab(sentences, SPECIAL_TOKENS,
                               punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
    vocab = {'token_to_idx': token_to_idx}
    print('Number of words in Vocab: {}'.format(len(token_to_idx)))

    # save the vocab
    if args.output_vocab != '':
        with open(args.output_vocab, 'w') as f:
            json.dump(vocab, f)

    print('Start encoding txt')
    results = []
    count_len = []
    # re-process every sentence: map each word to its index via token_to_idx
    for seq in tqdm(sentences):
        words = tokenize(seq, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        tokens = [token_to_idx[word] for word in words]
        count_len.append(len(tokens))
        results.append(tokens)

    print('Writing Data')
    # split train : test = 9 : 1
    train_data = results[: round(len(results) * 0.9)]
    test_data = results[round(len(results) * 0.9):]
    with open(args.output_train_dir, 'wb') as f:
        pickle.dump(train_data, f)
    with open(args.output_test_dir, 'wb') as f:
        pickle.dump(test_data, f)

if __name__ == '__main__':
    # Set Parameters
    args = para_config()
    main(args)
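
To see how the pieces fit together, here is a minimal sketch (mine, not part of the script) that runs the same steps on two hand-written sentences instead of the Europarl files. It assumes the functions above are already defined in the current session; the sentences and variable names are only for illustration.

# two toy "raw" sentences instead of the Europarl corpus
raw = ['The sitting is closed!', 'I declare resumed the session.']
cleaned = [normalize_string(s) for s in raw]      # 'the sitting is closed !', ...

# build a vocabulary on top of the special tokens (a copy, to leave SPECIAL_TOKENS untouched)
token_to_idx = build_vocab(cleaned, dict(SPECIAL_TOKENS),
                           punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
idx_to_token = {idx: tok for tok, idx in token_to_idx.items()}

# encode one sentence into indices and decode it back
words = tokenize(cleaned[0], punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
ids = encode(words, token_to_idx)                 # starts with 1 (<START>), ends with 2 (<END>)
print(decode(ids, idx_to_token, delim=' '))       # '<START> the sitting is closed ! <END>'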

The resulting vocab.json file:

{"token_to_idx": {"<PAD>": 0, "<START>": 1, "<END>": 2, "<UNK>": 3, "": 4, "a": 5, "abstentions": 6, "accordance": 7, "add": 8, "adopted": 9, "advertising": 10, "advisers": 11, "against": 12, "agenda": 13, "agriculture": 14, "all": 15, "allowances": 16, "already": 17, "always": 18, "amended": 19, "amendment": 20, "amendments": 21, "among": 22, "and": 23, "any": 24, "applause": 25, "appreciation": 26, "are": 27, "as": 28, "at": 29, "be": 30, "been": 31, "behalf": 32, "being": 33, "business": 34, "but": 35, "by": 36, "can": 37, "capital": 38, "card": 39, "cards": 40, "case": 41, "clearly": 42, "closed": 43, "closely": 44, "cohesion": 45, "coming": 46, "commandment": 47, "commend": 48, "commission": 49, "commissioner": 50, "committee": 51, "compliments": 52, "conclusions": 53, "continue": 54, "coordination": 55, "counted": 56, "creation": 57, "dangerous": 58, "de": 59, "debate": 60, "declared": 61, "depth": 62, "development": 63, "do": 64, "economic": 65, "electronically": 66, "elements": 67, "entitled": 68, "especially": 69, "european": 70, "event": 71, "facts": 72, "familiar": 73, "favour": 74, "few": 75, "finally": 76, "financing": 77, "first": 78, "for": 79, "forget": 80, "forgotten": 81, "from": 82, "fund": 83, "funds": 84, "give": 85, "gladly": 86, "goods": 87, "group": 88, "has": 89, "have": 90, "hear": 91, "her": 92, "his": 93, "house": 94, "i": 95, "if": 96, "in": 97, "include": 98, "included": 99, "indeed": 100, "into": 101, "is": 102, "it": 103, "its": 104, "job": 105, "just": 106, "keeping": 107, "koch": 108, "last": 109, "least": 110, "let": 111, "letter": 112, "like": 113, "link": 114, "logical": 115, "look": 116, "m": 117, "madam": 118, "made": 119, "main": 120, "make": 121, "mandate": 122, "member": 123, "members": 124, "mention": 125, "meticulous": 126, "minute": 127, "more": 128, "mr": 129, "mrs": 130, "much": 131, "my": 132, "necessary": 133, "no": 134, "not": 135, "noted": 136, "now": 137, "objectives": 138, "observed": 139, "of": 140, "on": 141, "oral": 142, "order": 143, "other": 144, "p": 145, "parliament": 146, "party": 147, "perfectly": 148, "period": 149, "piece": 150, "place": 151, "pleased": 152, "poettering": 153, "point": 154, "political": 155, "positions": 156, "ppe": 157, "presented": 158, "presently": 159, "presidency": 160, "president": 161, "principles": 162, "proceed": 163, "proposal": 164, "propose": 165, "pse": 166, "question": 167, "quite": 168, "rapporteur": 169, "reasonable": 170, "received": 171, "regarding": 172, "regions": 173, "reinstated": 174, "reiterate": 175, "rejected": 176, "remain": 177, "repeat": 178, "report": 179, "request": 180, "requests": 181, "result": 182, "road": 183, "room": 184, "rose": 185, "rural": 186, "s": 187, "safety": 188, "schroedter": 189, "segni": 190, "shall": 191, "she": 192, "should": 193, "silence": 194, "since": 195, "sitting": 196, "situation": 197, "so": 198, "social": 199, "socialists": 200, "speak": 201, "speakers": 202, "speaking": 203, "starting": 204, "statement": 205, "strategic": 206, "structural": 207, "substantive": 208, "suggestions": 209, "support": 210, "tabled": 211, "take": 212, "tax": 213, "thank": 214, "that": 215, "the": 216, "their": 217, "themselves": 218, "then": 219, "there": 220, "therefore": 221, "this": 222, "thursday": 223, "thus": 224, "time": 225, "to": 226, "tomorrow": 227, "too": 228, "topical": 229, "transport": 230, "two": 231, "unable": 232, "understood": 233, "union": 234, "upheld": 235, "urgent": 236, "very": 237, "vote": 238, "votes": 239, "voting": 240, "was": 241, "we": 242, 
"when": 243, "which": 244, "who": 245, "wholehearted": 246, "whose": 247, "will": 248, "willing": 249, "wishes": 250, "with": 251, "withdrawn": 252, "work": 253, "would": 254, "wurtz": 255, "yes": 256, "you": 257, "your": 258}}
