Deep Learning Enabled Semantic Communication Systems (DeepSC): preprocess_text.py

What the script implements:

1. Normalize each input string into a clean, well-formed text string (handle whitespace and special symbols);
2. Filter the text so that only sentences within a specified length range are kept;
3. Add start and end tokens;
4. Keep or remove selected punctuation marks;
5. Build the vocabulary and count how many times each word appears in the sentences.

The overall effect is sketched right after this list; the full script (preprocess_text.py) follows.
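
To make the goal concrete, here is roughly what one raw Europarl line turns into after these steps (my own illustration, not output quoted from the script; the integer indices depend on the vocabulary the script builds later):

raw line:   'Resumption of the session - 2000!'
normalized: 'resumption of the session !'
tokens:     ['<START>', 'resumption', 'of', 'the', 'session', '!', '<END>']
indices:    one integer per token, looked up in token_to_idx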

import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
from tqdm import tqdm
from parameters import para_config

# parser = argparse.ArgumentParser()
# parser.add_argument('--input-data-dir', default='europarl/en', type=str)
# parser.add_argument('--output-train-dir', default='europarl/train_data.pkl', type=str)
# parser.add_argument('--output-test-dir', default='europarl/test_data.pkl', type=str)
# parser.add_argument('--output-vocab', default='europarl/vocab.json', type=str)

SPECIAL_TOKENS = {
    '<PAD>': 0,
    '<START>': 1,
    '<END>': 2,
    '<UNK>': 3,
}
# Strip accents: drop combining marks after NFD-normalizing the unicode string.
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

# Normalize an input string into a clean, standard text string.
def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add white space before !, . and ? so they are separated from the preceding word
    s = re.sub(r'([!.?])', r' \1', s)
    # replace every character that is not a letter, '.', '!' or '?' with a space
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    # collapse consecutive whitespace into a single space
    s = re.sub(r'\s+', r' ', s)
    # change to lower case
    s = s.lower()
    return s
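
# Illustration (mine, not part of the original script): what normalize_string is
# expected to return for one raw Europarl-style line, given the regexes above:
#   normalize_string('<P>Resumption of the session - 2000!')
#   -> 'resumption of the session !'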
# Cut the cleaned text: keep only sentences whose length falls inside the given range.
# This filters out sentences that are too short or too long for later processing.
def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    cutted_lines = list()
    for line in cleaned:
        # number of words in the current sentence
        length = len(line.split())
        # keep the sentence only if MIN_LENGTH < length < MAX_LENGTH
        if length > MIN_LENGTH and length < MAX_LENGTH:
            line = [word for word in line.split()]
            cutted_lines.append(' '.join(line))
    return cutted_lines

def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)

def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    # strip leading/trailing whitespace and split the file into sentences on '\n'
    sentences = raw_data.strip().split('\n')
    # normalize every sentence: remove XML tags, add spaces around punctuation, etc.
    raw_data_input = [normalize_string(data) for data in sentences]
    # keep only sentences whose length lies in the allowed range
    raw_data_input = cutted_data(raw_data_input)
    fop.close()
    return raw_data_input
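
# Illustration (mine): with the defaults MIN_LENGTH=4 and MAX_LENGTH=30, a sentence
# is kept only if it has 5..29 words, since both comparisons are strict:
#   cutted_data(['a b c d', 'a b c d e'])  ->  ['a b c d e']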
# Split a string s into a list of (string) tokens on the given delimiter, optionally
# keeping or removing specific punctuation marks and adding start/end tokens.
def tokenize(s, delim=' ', add_start_token=True, add_end_token=True,
             punct_to_keep=None, punct_to_remove=None):
    """
    Tokenize a sequence, converting a string s into a list of (string) tokens by
    splitting on the specified delimiter. Optionally keep or remove certain
    punctuation marks and add start and end tokens.
    """
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))

    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')

    tokens = s.split(delim)
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens
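
# Illustration (mine): tokenizing one normalized sentence with the settings used in
# main() below. Removing the final '.' leaves a trailing space, so split() also yields
# an empty-string token, which is why '' shows up in the vocabulary later.
#   tokenize('we shall proceed .', punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
#   -> ['<START>', 'we', 'shall', 'proceed', '', '<END>']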
# Build the vocabulary: count how often each token occurs in all sentences and assign
# an index to every token that appears at least min_token_count times.
def build_vocab(sequences, token_to_idx={}, min_token_count=1, delim=' ',
                punct_to_keep=None, punct_to_remove=None):
    # occurrence count for every token
    token_to_count = {}
    for seq in sequences:
        # tokenize each sentence without adding <START>/<END>
        seq_tokens = tokenize(seq, delim=delim, punct_to_keep=punct_to_keep,
                              punct_to_remove=punct_to_remove,
                              add_start_token=False, add_end_token=False)
        # count how often each token occurs over all sentences
        for token in seq_tokens:
            if token not in token_to_count:
                token_to_count[token] = 0
            token_to_count[token] += 1

    # only tokens occurring at least min_token_count times get an index in token_to_idx
    for token, count in sorted(token_to_count.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    # e.g. {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3, '': 4, 'a': 5,
    #       'abstentions': 6, 'accordance': 7, 'add': 8, 'adopted': 9, ...}
    return token_to_idx

def encode(seq_tokens, token_to_idx, allow_unk=False):
    seq_idx = []
    for token in seq_tokens:
        if token not in token_to_idx:
            if allow_unk:
                token = '<UNK>'
            else:
                raise KeyError('Token "%s" not in vocab' % token)
        seq_idx.append(token_to_idx[token])
    return seq_idx

def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
    tokens = []
    for idx in seq_idx:
        tokens.append(idx_to_token[idx])
        if stop_at_end and tokens[-1] == '<END>':
            break
    if delim is None:
        return tokens
    else:
        return delim.join(tokens)

def main(args):
    data_dir = '/home/hx301/data/'
    # args.input_data_dir = args.input_data_dir
    # args.output_train_dir = args.output_train_dir
    # args.output_test_dir = args.output_test_dir
    # args.output_vocab = args.output_vocab
    print(args.input_data_dir)

    sentences = []
    print('Preprocess Raw Text')
    for fn in tqdm(os.listdir(args.input_data_dir)):
        if not fn.endswith('.txt'): continue  # only process .txt files
        process_sentences = process(os.path.join(args.input_data_dir, fn))
        # append the sentences of this file to the global sentence list
        sentences += process_sentences

    # remove duplicate sentences: count each sentence in dict a, then keep only its keys
    a = {}
    for set in sentences:
        if set not in a:
            a[set] = 0
        a[set] += 1
    sentences = list(a.keys())
    print('Number of sentences: {}'.format(len(sentences)))

    print('Build Vocab')
    token_to_idx = build_vocab(sentences, SPECIAL_TOKENS,
                               punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
    vocab = {'token_to_idx': token_to_idx}
    print('Number of words in Vocab: {}'.format(len(token_to_idx)))

    # save the vocab
    if args.output_vocab != '':
        with open(args.output_vocab, 'w') as f:
            json.dump(vocab, f)

    print('Start encoding txt')
    results = []
    count_len = []
    # re-process every sentence: map each word to its index via token_to_idx
    for seq in tqdm(sentences):
        words = tokenize(seq, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        tokens = [token_to_idx[word] for word in words]
        count_len.append(len(tokens))
        results.append(tokens)

    print('Writing Data')
    # split train : test = 9 : 1
    train_data = results[: round(len(results) * 0.9)]
    test_data = results[round(len(results) * 0.9):]
    with open(args.output_train_dir, 'wb') as f:
        pickle.dump(train_data, f)
    with open(args.output_test_dir, 'wb') as f:
        pickle.dump(test_data, f)

if __name__ == '__main__':
    # Set Parameters
    args = para_config()
    main(args)
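
To see how the pieces fit together, here is a minimal sketch (mine, not part of the script) that runs the same steps on two hand-written sentences instead of the Europarl files. It assumes the functions above are already defined in the current session; the sentences and variable names are only for illustration.

# two toy "raw" sentences instead of the Europarl corpus
raw = ['The sitting is closed!', 'I declare resumed the session.']
cleaned = [normalize_string(s) for s in raw]      # 'the sitting is closed !', ...

# build a vocabulary on top of the special tokens (a copy, to leave SPECIAL_TOKENS untouched)
token_to_idx = build_vocab(cleaned, dict(SPECIAL_TOKENS),
                           punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
idx_to_token = {idx: tok for tok, idx in token_to_idx.items()}

# encode one sentence into indices and decode it back
words = tokenize(cleaned[0], punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
ids = encode(words, token_to_idx)                 # starts with 1 (<START>), ends with 2 (<END>)
print(decode(ids, idx_to_token, delim=' '))       # '<START> the sitting is closed ! <END>'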

The resulting vocab.json file:

{"token_to_idx": {"<PAD>": 0, "<START>": 1, "<END>": 2, "<UNK>": 3, "": 4, "a": 5, "abstentions": 6, "accordance": 7, "add": 8, "adopted": 9, "advertising": 10, "advisers": 11, "against": 12, "agenda": 13, "agriculture": 14, "all": 15, "allowances": 16, "already": 17, "always": 18, "amended": 19, "amendment": 20, "amendments": 21, "among": 22, "and": 23, "any": 24, "applause": 25, "appreciation": 26, "are": 27, "as": 28, "at": 29, "be": 30, "been": 31, "behalf": 32, "being": 33, "business": 34, "but": 35, "by": 36, "can": 37, "capital": 38, "card": 39, "cards": 40, "case": 41, "clearly": 42, "closed": 43, "closely": 44, "cohesion": 45, "coming": 46, "commandment": 47, "commend": 48, "commission": 49, "commissioner": 50, "committee": 51, "compliments": 52, "conclusions": 53, "continue": 54, "coordination": 55, "counted": 56, "creation": 57, "dangerous": 58, "de": 59, "debate": 60, "declared": 61, "depth": 62, "development": 63, "do": 64, "economic": 65, "electronically": 66, "elements": 67, "entitled": 68, "especially": 69, "european": 70, "event": 71, "facts": 72, "familiar": 73, "favour": 74, "few": 75, "finally": 76, "financing": 77, "first": 78, "for": 79, "forget": 80, "forgotten": 81, "from": 82, "fund": 83, "funds": 84, "give": 85, "gladly": 86, "goods": 87, "group": 88, "has": 89, "have": 90, "hear": 91, "her": 92, "his": 93, "house": 94, "i": 95, "if": 96, "in": 97, "include": 98, "included": 99, "indeed": 100, "into": 101, "is": 102, "it": 103, "its": 104, "job": 105, "just": 106, "keeping": 107, "koch": 108, "last": 109, "least": 110, "let": 111, "letter": 112, "like": 113, "link": 114, "logical": 115, "look": 116, "m": 117, "madam": 118, "made": 119, "main": 120, "make": 121, "mandate": 122, "member": 123, "members": 124, "mention": 125, "meticulous": 126, "minute": 127, "more": 128, "mr": 129, "mrs": 130, "much": 131, "my": 132, "necessary": 133, "no": 134, "not": 135, "noted": 136, "now": 137, "objectives": 138, "observed": 139, "of": 140, "on": 141, "oral": 142, "order": 143, "other": 144, "p": 145, "parliament": 146, "party": 147, "perfectly": 148, "period": 149, "piece": 150, "place": 151, "pleased": 152, "poettering": 153, "point": 154, "political": 155, "positions": 156, "ppe": 157, "presented": 158, "presently": 159, "presidency": 160, "president": 161, "principles": 162, "proceed": 163, "proposal": 164, "propose": 165, "pse": 166, "question": 167, "quite": 168, "rapporteur": 169, "reasonable": 170, "received": 171, "regarding": 172, "regions": 173, "reinstated": 174, "reiterate": 175, "rejected": 176, "remain": 177, "repeat": 178, "report": 179, "request": 180, "requests": 181, "result": 182, "road": 183, "room": 184, "rose": 185, "rural": 186, "s": 187, "safety": 188, "schroedter": 189, "segni": 190, "shall": 191, "she": 192, "should": 193, "silence": 194, "since": 195, "sitting": 196, "situation": 197, "so": 198, "social": 199, "socialists": 200, "speak": 201, "speakers": 202, "speaking": 203, "starting": 204, "statement": 205, "strategic": 206, "structural": 207, "substantive": 208, "suggestions": 209, "support": 210, "tabled": 211, "take": 212, "tax": 213, "thank": 214, "that": 215, "the": 216, "their": 217, "themselves": 218, "then": 219, "there": 220, "therefore": 221, "this": 222, "thursday": 223, "thus": 224, "time": 225, "to": 226, "tomorrow": 227, "too": 228, "topical": 229, "transport": 230, "two": 231, "unable": 232, "understood": 233, "union": 234, "upheld": 235, "urgent": 236, "very": 237, "vote": 238, "votes": 239, "voting": 240, "was": 241, "we": 242, 
"when": 243, "which": 244, "who": 245, "wholehearted": 246, "whose": 247, "will": 248, "willing": 249, "wishes": 250, "with": 251, "withdrawn": 252, "work": 253, "would": 254, "wurtz": 255, "yes": 256, "you": 257, "your": 258}}
