seq2seq (中英对照翻译)Attention
基于PyTorch实现带注意力机制(Attention)的seq2seq模型,完成中文到英文的翻译。
Seq2Seq模型
seq2seq模型主要由Encoder(编码器)和Decoder(解码器)两部分组成。它是一个序列到序列的网络:两个循环神经网络(RNN)协同工作,把一个序列转换成另一个序列。编码器网络将输入序列压缩成一个向量,解码器再将该向量展开为新的序列。
我们的数据集来自 http://www.manythings.org/anki/
在该页面中找到中文—英文(cmn-eng)的数据包,下载并解压后即可得到以制表符分隔的句对文本文件。
1.文本预处理:
# Step 1 of the tutorial: text preprocessing.
#
# Reads a tab-separated parallel corpus (English<TAB>other language<TAB>
# attribution) and turns it into normalised sentence pairs plus two empty
# vocabularies.
import re
import unicodedata

# Special token ids marking the start and the end of a sequence.
SOS_token = 0
EOS_token = 1

# Placeholder from the original post: replace with the location of your
# dataset file, or pass ``path=...`` to ``readLangs``.
DATA_PATH = r"填写自己数据集保存的相对位置的地址"


class Lang:
    """Vocabulary of one language: word <-> index maps and word counts."""

    def __init__(self, name):
        self.name = name
        # Maps a word to its index, e.g. {"hello": 3}.
        self.word2index = {}
        # Number of times each word has been added.
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Vocabulary size; starts at 2 because SOS and EOS already exist.
        self.n_words = 2

    def addSentence(self, sentence):
        """Add every space-separated token of ``sentence`` to the vocabulary."""
        for word in sentence.split(" "):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word``, or bump its count if it is already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        # The current vocabulary size is the index of the new word.
        self.word2index[word] = self.n_words
        self.word2count[word] = 1
        self.index2word[self.n_words] = word
        self.n_words += 1


def unicodeToAscii(s):
    """Return ``s`` NFD-decomposed with combining marks (category Mn) removed.

    Only accents are stripped; other non-ASCII characters (e.g. Chinese) are
    kept. Credit: https://stackoverflow.com/a/518232/2809427
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    """Lower-case and trim ``s``, strip accents, and put a space before . ! ?"""
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    # The usual "replace everything except a-zA-Z.!?" step is deliberately
    # not applied here: it would erase every Chinese character.
    return s


def readLangs(lang1, lang2, reverse=False, path=None):
    """Read the corpus and return ``(input_lang, output_lang, pairs)``.

    Args:
        lang1: Name for the language in the first column of the file.
        lang2: Name for the language in the second column of the file.
        reverse: If true, swap the two sentences of every pair and swap the
            roles of the languages, so translation goes from the second
            column to the first. The file stores English first, so
            ``reverse=True`` yields Chinese -> English pairs. This only swaps
            the columns; it does not reverse the token order of a sentence.
        path: Dataset file to read; defaults to ``DATA_PATH``.

    Returns:
        Two empty ``Lang`` objects and a list of ``[source, target]`` pairs of
        normalised strings.
    """
    print("Reading lines...")
    if path is None:
        path = DATA_PATH

    # ``with`` closes the file handle; the original left it open.
    with open(path, encoding="utf-8") as f:
        lines = f.read().strip().split("\n")

    # Each line is "English<TAB>other language<TAB>attribution". Keep only the
    # first two fields (drops the attribution column) and skip blank or
    # malformed lines instead of failing on them.
    pairs = []
    for line in lines:
        fields = line.split("\t")
        if len(fields) < 2:
            continue
        pairs.append([normalizeString(s) for s in fields[:2]])

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


if __name__ == "__main__":
    # The file stores English first and Chinese second, so without
    # ``reverse`` the first language is English (the original post had the
    # two labels swapped).
    lang1 = "eng"
    lang2 = "cmn"
    input_lang, output_lang, pairs = readLangs(lang1, lang2)
    print("input_lang:", input_lang)
    print("output_lang:", output_lang)
    print("pairs中的前五个:", pairs[:5])
把pairs打印出来看看,可以看出它已经是我们所需要的句对了。
与原始数据集对比可以发现,每一行最后一列的署名信息(Attribution)已经被去掉,只保留了英文和中文两列。
最终代码:
"""Chinese -> English translation with a GRU encoder-decoder and attention.

Pipeline: read tab-separated sentence pairs, build the vocabularies, train an
``EncoderRNN`` together with an ``AttnDecoderRNN`` one sentence pair at a
time, save the weights, and visualise the attention for a sample sentence.
"""
from __future__ import unicode_literals, print_function, division

from io import open
import math
import os
import random
import re
import time
import unicodedata

import numpy as np

# Set before torch is imported, as in the original script (workaround for the
# "duplicate OpenMP runtime" abort seen on some installations).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

try:
    import matplotlib.pyplot as plt
    import matplotlib.ticker as ticker
except ImportError:
    # Plotting is only needed by showPlot / showAttention; the script checks
    # for matplotlib up front so a missing install still fails before training.
    plt = None
    ticker = None

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Special token ids marking the start and the end of a sequence.
SOS_token = 0
EOS_token = 1

# Placeholder from the original post: replace with the absolute path of the
# dataset file, or pass ``path=...`` to ``readLangs``.
DATA_PATH = r"绝对路径"

# Sentences must have fewer than this many space-separated tokens; it is also
# the fixed width of the attention layer.
MAX_LENGTH = 10

# Only pairs whose English side starts with one of these prefixes are kept.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# Probability of feeding the ground-truth token (instead of the model's own
# prediction) to the decoder during one training step.
teacher_forcing_ratio = 0.5


def _require_matplotlib():
    """Raise a clear error when a plot is requested without matplotlib."""
    if plt is None or ticker is None:
        raise ImportError("matplotlib is required for plotting")


class Lang:
    """Vocabulary of one language: word <-> index maps and word counts."""

    def __init__(self, name):
        self.name = name
        # Maps a word to its index, e.g. {"hello": 3}.
        self.word2index = {}
        # Number of times each word has been added.
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Vocabulary size; starts at 2 because SOS and EOS already exist.
        self.n_words = 2

    def addSentence(self, sentence):
        """Add every space-separated token of ``sentence`` to the vocabulary."""
        for word in sentence.split(" "):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word``, or bump its count if it is already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        # The current vocabulary size is the index of the new word.
        self.word2index[word] = self.n_words
        self.word2count[word] = 1
        self.index2word[self.n_words] = word
        self.n_words += 1


def unicodeToAscii(s):
    """Return ``s`` NFD-decomposed with combining marks (category Mn) removed.

    Only accents are stripped; other non-ASCII characters (e.g. Chinese) are
    kept. Credit: https://stackoverflow.com/a/518232/2809427
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    """Lower-case and trim ``s``, strip accents, and put a space before . ! ?"""
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    # The usual "replace everything except a-zA-Z.!?" step is deliberately
    # not applied here: it would erase every Chinese character.
    return s


def readLangs(lang1, lang2, reverse=False, path=None):
    """Read the corpus and return ``(input_lang, output_lang, pairs)``.

    Args:
        lang1: Name for the language in the first column of the file.
        lang2: Name for the language in the second column of the file.
        reverse: If true, swap the two sentences of every pair and swap the
            roles of the two languages.
        path: Dataset file to read; defaults to ``DATA_PATH``.

    Returns:
        Two empty ``Lang`` objects and a list of ``[source, target]`` pairs of
        normalised strings.
    """
    print("Reading lines...")
    if path is None:
        path = DATA_PATH

    # ``with`` closes the file handle; the original left it open.
    with open(path, encoding="utf-8") as f:
        lines = f.read().strip().split("\n")

    # Each line is "English<TAB>other language<TAB>attribution". Keep only the
    # first two fields (drops the attribution column) and skip blank or
    # malformed lines instead of failing on them.
    pairs = []
    for line in lines:
        fields = line.split("\t")
        if len(fields) < 2:
            continue
        pairs.append([normalizeString(s) for s in fields[:2]])

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


def filterPair(p):
    """Return whether both sentences are short and the target has a kept prefix."""
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Keep only the pairs accepted by ``filterPair``."""
    return [pair for pair in pairs if filterPair(pair)]


def prepareData(lang1, lang2, reverse=False):
    """Read, filter and index the corpus.

    Returns:
        ``(input_lang, output_lang, pairs)`` with both vocabularies filled
        from the filtered pairs.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


class EncoderRNN(nn.Module):
    """Embedding followed by a single GRU, fed one token per call."""

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        # Both GRU arguments are sizes of the last dimension (input, hidden).
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token index; return ``(output, hidden)``."""
        # The GRU needs a 3-D input, so reshape to (1, 1, hidden_size).
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)


class DecoderRNN(nn.Module):
    """Plain GRU decoder without attention (not used by the script below)."""

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Projects the GRU output to vocabulary size.
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step; return ``(log_probs, hidden)``."""
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        # output is (1, 1, hidden_size); output[0] drops the leading
        # dimension that was only added to satisfy the GRU.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)


class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed number of encoder outputs."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # The embedded token and the previous hidden state are concatenated
        # on the last dimension, hence hidden_size * 2 inputs. The layer
        # yields one score per encoder position (max_length of them); shorter
        # sentences only have non-zero encoder outputs in the first rows.
        # (The original created this layer twice; once is enough.)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step; return ``(log_probs, hidden, attn_weights)``."""
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # The leading dimension exists only for the GRU, so concatenate the
        # remaining two: (1, hidden) + (1, hidden) -> (1, 2 * hidden).
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Batched matrix product: (1, 1, max_length) x (1, max_length, hidden)
        # -> (1, 1, hidden), i.e. the attention-weighted sum of the encoder
        # outputs.
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        # Concatenate the embedded token with the attention context.
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)


def indexesFromSentence(lang, sentence):
    """Map each space-separated token to its index; unknown tokens raise KeyError."""
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    """Return a (len + 1, 1) long tensor of token indices ending with EOS."""
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    """Convert a pair using the module-level ``input_lang`` / ``output_lang``."""
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    Returns:
        The summed loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Rows beyond the sentence length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        # encoder_output is (1, 1, hidden_size).
        encoder_outputs[ei] = encoder_output[0, 0]

    # The decoder starts from <SOS> and the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: the ground-truth token is the next input.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            # This assignment was missing in the original, so the decoder
            # was fed <SOS> at every step.
            decoder_input = target_tensor[di]
    else:
        # Otherwise the decoder's own prediction is the next input.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # topk(1) returns the largest value and its index.
            topv, topi = decoder_output.topk(1)
            # detach() keeps the prediction out of the autograd graph.
            decoder_input = topi.squeeze().detach()

            loss += criterion(decoder_output, target_tensor[di])
            # Stop once the model predicts <EOS>.
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


def asMinutes(s):
    """Format a number of seconds as ``'<m>m <s>s'``."""
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    """Return elapsed time and estimated remaining time as a string.

    ``percent`` is the fraction of work done so far (must be non-zero).
    """
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))


def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100,
               learning_rate=0.01):
    """Train for ``n_iters`` randomly chosen pairs, then plot the loss curve.

    Uses the module-level ``pairs`` as the training set.
    """
    start = time.time()
    plot_losses = []
    # Running sums, reset after each print / plot interval.
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        training_pair = training_pairs[step - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        # Collect the interval average for the loss plot.
        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)


def showPlot(points):
    """Plot ``points`` with y-axis ticks every 0.2."""
    _require_matplotlib()
    # A single subplots() call; the original also called plt.figure() first,
    # which left an extra empty figure.
    fig, ax = plt.subplots()
    # This locator puts ticks at regular intervals.
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate ``sentence``.

    Uses the module-level ``input_lang`` / ``output_lang``. A token missing
    from the input vocabulary raises ``KeyError``.

    Returns:
        ``(decoded_words, attentions)`` where ``attentions`` has one row per
        decoding step.
    """
    # Evaluate with dropout disabled, then restore the previous modes (the
    # original left dropout active during evaluation).
    encoder_mode, decoder_mode = encoder.training, decoder.training
    encoder.eval()
    decoder.eval()
    try:
        # No gradient tracking is needed for evaluation.
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention
                topv, topi = decoder_output.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_mode)
        decoder.train(decoder_mode)


def evaluateRandomly(encoder, decoder, n=10):
    """Print source, reference and model output for ``n`` random pairs."""
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')


def showAttention(input_sentence, output_words, attentions):
    """Show the attention matrix with input tokens on x and output tokens on y."""
    _require_matplotlib()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # attentions is a tensor; matshow needs a NumPy array.
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show a label at every tick (ticks at multiples of 1).
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Translate with the module-level ``encoder1`` / ``attn_decoder1`` and plot."""
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)


if __name__ == "__main__":
    # Fail before training, not after it, when plotting is unavailable.
    _require_matplotlib()

    # Preview of the raw pairs. The file stores English first and Chinese
    # second (the original labelled the columns "cmn" / "fra", a leftover
    # from the French tutorial).
    lang1 = "eng"
    lang2 = "cmn"
    input_lang, output_lang, pairs = readLangs(lang1, lang2)
    print("input_lang:", input_lang)
    print("output_lang:", output_lang)
    print("pairs中的前五个:", pairs[:5])

    # reverse=True: Chinese becomes the input, English the output.
    input_lang, output_lang, pairs = prepareData('eng', 'cmn', True)
    # Print one random pair.
    print(random.choice(pairs))

    hidden_size = 256
    encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
    attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

    trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

    # Save the parameters of the instantiated networks.
    torch.save(encoder1.state_dict(), "encoder_parameters")
    torch.save(attn_decoder1.state_dict(), "decoder_parameters")

    # Attention visualisation. The sentence must be a token of the input
    # vocabulary, otherwise evaluate() raises KeyError.
    evaluateAndShowAttention("他比我高两寸。")
    # Other sample inputs:
    # evaluateAndShowAttention("我赢了 。")
    # evaluateAndShowAttention("我们来试试 。")
    # evaluateAndShowAttention("为什么是我 ?")

    # NOTE(review): unused leftover from the original post; safe to delete.
    decoder_hidden = [10, 5, 10]
seq2seq (中英对照翻译)Attention相关推荐
- 【论文翻译】中英对照翻译--(Attentive Generative Adversarial Network for Raindrop Removal from A Single Image)
[开始时间]2018.10.08 [完成时间]2018.10.09 [论文翻译]Attentive GAN论文中英对照翻译--(Attentive Generative Adversarial Net ...
- 论文中英对照翻译--(Fusing Multiple Deep Features for Face Anti-spoofing)
[开始时间]2018.10.22 [完成时间]2018.10.22 [论文翻译]论文中英对照翻译--(Fusing Multiple Deep Features for Face Anti-spoof ...
- 【论文翻译】GoogleNet网络论文中英对照翻译--(Going deeper with convolutions)
[开始时间]2018.09.25 [完成时间]2018.09.26 [论文翻译]GoogleNet网络论文中英对照翻译--(Going deeper with convolutions) [中文译名] ...
- 【论文翻译】VGG网络论文中英对照翻译--(very deep convolutional networks for large-scale image recognition)
[开始时间]2018.09.23 [完成时间]2018.09.24 [论文翻译]VGG网络论文中英对照翻译--(very deep convolutional networks for large-s ...
- 星际争霸人族兵种音效中英对照翻译
Terran SCV (出场00)SCV, good to go, sir. SCV可以开工了 (Err00)I can't build it, something's in the way. 我没 ...
- CSS中居中的完全指南(中英对照翻译)
翻译自:https://css-tricks.com/centering-css-complete-guide/ Centering things in CSS is the poster child ...
- POJ1007 DNA Sorting中英对照翻译与参考解答
POJ1007 DNA Sorting 传送门:POJ 1007 关键词: 水题.排序 Description One measure of ``unsortedness'' in a sequenc ...
- (FLANN论文)fast approximate nearest neighbors with automatic algorithm configuration——中英对照翻译
Fast Approximate Nearest Neighbors With Automatic Algorithm Configuration Abstract 在许多计算机视觉问题中,最耗时的部 ...
- 计算机辅助翻译术语中英对照,翻译专业术语汇编(英汉对照
Absolute Translation 绝对翻译 Abstract Translation 摘要翻译 Abusive Translation 滥译 Acceptability 可接受性 Accura ...
最新文章
- HDU2604 Queuing
- 工具--Eclipse/MarkDown/XMind文章分类目录
- linux下in命令
- 如何识别真正的程序员
- TIMING_04 时序约束的一般步骤
- Android—EventBus使用与源码分析
- ip_vs实现分析(2)
- 好玩的java代码 安卓_各种版本的 玩Android 来啦!
- 解决centos6.5不能识别NTFS格式的移动硬盘或U盘问题
- python代码物理_python+appium的物理按键代码
- 运行elasticsearch时报错:could not find java; set JAVA_HOME or ensure java is in PATH
- 【更新于2019】SCI EI 索引最新查询方法(已验证有效)
- 技术篇(二)Demo示例——Bundle打包和加载
- 产生随机数——起名神器
- Application.DoEvent使用
- 网站发布一般步骤以及解决方法
- 如何理解卷积:信号处理、图像处理中的应用
- CanOpen and EtherCAT
- 英语和汉语语法方面的区别(语言类型学)
- 国家一级学会创办的期刊