
  • 加载包和一些预处理
  • 定义模型结构
    • Encoder结构
    • Decoder结构
  • 小测试
  • 训练和评估函数
  • 训练模型
  • 在decoder中引入注意力机制


%load_ext autoreload
%autoreload 2
%matplotlib inline
import random
import math
import time
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from typing import *
from torch.nn import Parameter
from torch.nn import init
from torch import Tensor
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# Select the compute device. GPU index 2 stays the preferred choice; when fewer
# than three GPUs are visible fall back to the first one, and to the CPU when
# CUDA is unavailable, so a hard-coded ordinal cannot break smaller machines.
_PREFERRED_GPU = 2
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > _PREFERRED_GPU:
    device = torch.device("cuda:{}".format(_PREFERRED_GPU))
else:
    device = torch.device("cuda:0")
def setup_seed(seed):
    """Seed every random number generator used here so runs are repeatable.

    Seeds PyTorch, NumPy and Python's ``random`` module with the same value
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed shared by all generators.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and possibly pick different kernels
    # from run to run, which defeats reproducibility, so it has to be off.
    torch.backends.cudnn.benchmark = False




lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=n_layers)

  • input_dim =输入数量(20的维度可代表20个输入)
  • hidden_dim =隐藏状态的大小; 每个LSTM单元在每个时间步产生的输出数。
  • n_layers =隐藏LSTM图层的数量; 通常是1到3之间的值; 值为1表示每个LSTM单元具有一个隐藏状态。 其默认值为1。

out, hidden = lstm(input.view(1, 1, -1), (h0, c0))
对于LSTM输入为:(input, (h0, c0)).

  • input = 输入序列中Tensor; (seq_len,batch,input_size)
  • h0 = Tensor,包含批处理中每个元素的初始隐藏状态
  • c0 = Tensor,包含批处理中每个元素的初始细胞状态(cell state)
  • h0和c0默认为0,如果未指定。 它们的尺寸为:(n_layers,batch,hidden_dim)。
class EncoderLSTM(nn.Module):
    """Single-layer LSTM encoder that consumes one token index per call."""

    def __init__(self, input_size: int, hidden_size: int):
        """Build the embedding table and the LSTM.

        Args:
            input_size: number of rows in the embedding table.
            hidden_size: embedding width, also used as the LSTM hidden width.
        """
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, inputs: Tensor, state: Tuple[Tensor, Tensor]):
        """Encode a single token.

        Args:
            inputs: tensor holding one token index.
            state: ``(hidden, cell)`` pair, each of shape [1, 1, hidden_size].

        Returns:
            ``(output, (hidden, cell))`` as produced by the LSTM for this step.
        """
        hidden, cell = state
        # seq_len and batch are both 1, hence the fixed (1, 1, -1) view.
        embedded = self.embedding(inputs).view(1, 1, -1)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        return output, (hidden, cell)

    def init_hidden(self):
        """Return an all-zero ``(hidden, cell)`` pair, each [1, 1, hidden_size].

        The state is created on the same device and with the same dtype as the
        module's own weights, so it always matches the model regardless of any
        global device setting.
        """
        weight = self.embedding.weight
        hidden = torch.zeros(1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype)
        cell = torch.zeros(1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype)
        return hidden, cell


class DecoderLSTM(nn.Module):
    """Single-layer LSTM decoder that emits log-probabilities for one token per call."""

    def __init__(self, hidden_size: int, output_size: int):
        """Build the embedding table, the LSTM and the output projection.

        Args:
            hidden_size: embedding width, also used as the LSTM hidden width.
            output_size: number of rows in the embedding table and number of
                output classes.
        """
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.activation_function = F.relu

    def forward(self, inputs, state):
        """Decode a single step.

        Args:
            inputs: tensor holding one token index.
            state: ``(hidden, cell)`` pair, each of shape [1, 1, hidden_size].

        Returns:
            ``(output, (hidden, cell))`` where ``output`` has shape
            [1, output_size] and holds log-probabilities.
        """
        hidden, cell = state
        # seq_len and batch are both 1, hence the fixed (1, 1, -1) view.
        embedded = self.embedding(inputs).view(1, 1, -1)
        activated = self.activation_function(embedded)
        lstm_out, (hidden, cell) = self.lstm(activated, (hidden, cell))
        output = self.log_softmax(self.out(lstm_out[0]))
        return output, (hidden, cell)

    def init_hidden(self):
        """Return an all-zero ``(hidden, cell)`` pair, each [1, 1, hidden_size].

        The state is created on the same device and with the same dtype as the
        module's own weights, so it always matches the model regardless of any
        global device setting.
        """
        weight = self.embedding.weight
        hidden = torch.zeros(1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype)
        cell = torch.zeros(1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype)
        return hidden, cell


[['j ai ans .', 'i m .'],['je vais bien .', 'i m ok .'],['ca va .', 'i m ok .'],
# Draw one random sentence pair to try the tensor helpers on.
testpair = random.choice(pairs)
['il ne se presente pas aux prochaines elections .','he is not running in the coming election .']
# Convert the source-side sentence of the pair into a tensor of word indices.
tensor_from_sentence(input_lang, testpair[0])
tensor([[  24],[ 297],[ 882],[2113],[ 246],[ 241],[4280],[3522],[   5],[   1]])
# Convert the target-side sentence of the pair into a tensor of word indices.
tensor_from_sentence(output_lang, testpair[1])
tensor([[  14],[  40],[ 147],[ 335],[ 102],[ 294],[ 142],[2744],[   4],[   1]])
# Same helper on a hand-written two-token sentence.
tensor_from_sentence(output_lang, 'i .')
# Convert both sides of the pair at once; the result is a (source, target) tuple.
tensor_from_pair(testpair, input_lang, output_lang)
(tensor([[  24],[ 297],[ 882],[2113],[ 246],[ 241],[4280],[3522],[   5],[   1]]), tensor([[  14],[  40],[ 147],[ 335],[ 102],[ 294],[ 142],[2744],[   4],[   1]]))


Teacher Forcing是一种用来快速而有效地训练循环神经网络模型的方法:训练时不使用模型上一时刻自己的预测,而是把上一时刻的真实目标输出(ground truth)作为下一时刻的输入,从而缓解收敛缓慢和训练不稳定的问题。但是,在实际使用(推理)时模型只能以自己的预测作为输入,当生成的序列与训练期间模型看到的序列不同时(例如遇到训练集中不存在的数据),这种训练与推理之间的差异可能导致模型效果变差。


def train_by_sentence(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer,
                      loss_fn, use_teacher_forcing=True, reverse_source_sentence=True, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair (no attention).

    The source tokens are fed to the encoder one by one, the final encoder
    state seeds the decoder, and the loss summed over the decoded steps is
    back-propagated through both models before both optimizers step.

    Args:
        input_tensor: source token indices, indexed along dim 0
            (presumably shape [input_sequence_len, 1] -- confirm against
            tensor_from_sentence).
        target_tensor: target token indices, indexed along dim 0
            (presumably shape [target_sequence_len, 1] -- confirm).
        encoder: EncoderLSTM
        decoder: DecoderLSTM
        encoder_optimizer: optimizer for encoder
        decoder_optimizer: optimizer for decoder
        loss_fn: loss function applied to (decoder_output, target token)
        use_teacher_forcing: True feeds the target token as the next decoder
            input, False feeds the decoder's own top prediction and stops
            early once that prediction is EOS.
        reverse_source_sentence: True flips the source sequence along dim 0
            before encoding.
        max_length: not needed by this attention-free variant (no encoder
            output buffer is kept); accepted so existing callers keep working.

    Returns:
        loss: summed step loss divided by the target length (float).

    Raises:
        ValueError: if ``target_tensor`` is empty.
    """
    target_length = target_tensor.size(0)
    if target_length == 0:
        # Without this guard the loss would stay the int 0 and ``backward``
        # would fail with an unrelated-looking AttributeError.
        raise ValueError("target_tensor must contain at least one token")

    # Optionally reverse the source sentence.
    if reverse_source_sentence:
        input_tensor = torch.flip(input_tensor, [0])

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Only the final encoder state is used here, so the per-step outputs are
    # not stored and the source length is not limited by max_length.
    hidden, cell = encoder.init_hidden()
    for source_token in input_tensor:
        _, (hidden, cell) = encoder(source_token, (hidden, cell))

    # Decoding starts from the module-level SOS token, the same one the
    # evaluation functions use.
    decoder_input = torch.tensor([[SOS_token]], device=device)

    loss = 0
    for di in range(target_length):
        decoder_output, (hidden, cell) = decoder(decoder_input, (hidden, cell))
        loss += loss_fn(decoder_output, target_tensor[di])
        if use_teacher_forcing:
            # Feed the ground-truth token as the next input.
            decoder_input = target_tensor[di]
        else:
            # Feed the model's own top prediction as the next input.
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            # Stop once the model has produced EOS.
            if decoder_input.item() == EOS_token:
                break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length
def train(encoder, decoder, n_iters, reverse_source_sentence=True, use_teacher_forcing=True,
          print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the attention-free seq2seq model one sentence at a time.

    Args:
        encoder: EncoderLSTM
        decoder: DecoderLSTM
        n_iters: number of training steps; each step uses one pair drawn with
            ``random.choice`` from ``pairs`` (so pairs may repeat).
        reverse_source_sentence: True reverses the source sentence and keeps
            the target order unchanged; False keeps both unchanged.
        use_teacher_forcing: True feeds the target as the next decoder input,
            False feeds the decoder's own predictions.
        print_every: print the running average loss every ``print_every`` steps.
        plot_every: record a plot point every ``plot_every`` steps.
        learning_rate: learning rate for both SGD optimizers.
    """
    started_at = time.time()

    # Plain SGD for both models.
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # All training samples are drawn up front.
    training_pairs = [
        tensor_from_pair(random.choice(pairs), input_lang, output_lang)
        for _ in range(n_iters)
    ]
    loss_fn = nn.NLLLoss()

    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    for step, sample in enumerate(training_pairs, start=1):
        source = sample[0].to(device)
        target = sample[1].to(device)

        step_loss = train_by_sentence(
            source, target, encoder, decoder,
            encoder_optimizer, decoder_optimizer, loss_fn,
            use_teacher_forcing=use_teacher_forcing,
            reverse_source_sentence=reverse_source_sentence,
        )
        print_loss_total += step_loss
        plot_loss_total += step_loss

        if step % print_every == 0:
            # Report the average over the last print window, then reset it.
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print("%s (%d %d%%) %.4f" % (time_since(started_at, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            # Store the average over the last plot window, then reset it.
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    show_plot(plot_losses)
def evaluate_by_sentence(encoder, decoder, sentence, reverse_source_sentence=True, max_length=MAX_LENGTH):
    """Translate one source sentence with the attention-free model.

    Args:
        encoder: EncoderLSTM
        decoder: DecoderLSTM
        sentence: source sentence, converted with ``tensor_from_sentence``.
        reverse_source_sentence: True flips the source sequence before
            encoding; defaults to True like the attention variant.
        max_length: maximum number of decoding steps.

    Returns:
        decoded_words: list of predicted words; ends with "<EOS>" when the
            decoder emitted EOS within ``max_length`` steps.
    """
    with torch.no_grad():
        input_tensor = tensor_from_sentence(input_lang, sentence).to(device)
        if reverse_source_sentence:
            input_tensor = torch.flip(input_tensor, [0])

        # Only the final encoder state is needed, so no per-step output buffer
        # is kept and long sentences are not limited by max_length.
        hidden, cell = encoder.init_hidden()
        for source_token in input_tensor:
            _, (hidden, cell) = encoder(source_token, (hidden, cell))

        # The last encoder state is the initial decoder state.
        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoded_words = []

        # At evaluation time the decoder always consumes its own prediction.
        for _ in range(max_length):
            decoder_output, (hidden, cell) = decoder(decoder_input, (hidden, cell))
            topv, topi = decoder_output.data.topk(1)
            predicted = topi.item()
            if predicted == EOS_token:
                decoded_words.append("<EOS>")
                break
            decoded_words.append(output_lang.index2word[predicted])
            decoder_input = topi.squeeze().detach()

        return decoded_words
def evaluate_randomly(encoder, decoder, n=10, reverse_source_sentence=True):
    """Translate randomly chosen pairs from ``pairs`` and print the results.

    For each sample three lines are printed: the source (">"), the reference
    translation ("=") and the model output ("<"), followed by an empty line.

    Args:
        encoder: encoder model passed on to ``evaluate_by_sentence``.
        decoder: decoder model passed on to ``evaluate_by_sentence``.
        n: number of sentences to evaluate.
        reverse_source_sentence: passed on to ``evaluate_by_sentence``.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        source_sentence = sample[0]
        reference = sample[1]

        print(">", source_sentence)
        print("=", reference)

        predicted = " ".join(
            evaluate_by_sentence(encoder, decoder, source_sentence, reverse_source_sentence)
        )
        print("<", predicted)
        print("")
def show_plot(points):
    """Draw ``points`` as a line chart and display it.

    Args:
        points: sequence of y-values, plotted against their index.
    """
    # plt.subplots() creates the figure itself; an additional plt.figure()
    # call beforehand would leave an empty, never-used figure behind on
    # every invocation.
    fig, ax = plt.subplots()
    # Major y-axis ticks at multiples of 0.2.
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)
    plt.show()


# Build the two vocabularies and the list of sentence pairs.
# NOTE(review): reverse=True presumably swaps the direction so that French is
# the source and English the target (the sample pair printed below is
# [French, English]) -- confirm against prepare_data.
input_lang, output_lang, pairs = prepare_data('eng', 'fra', reverse=True)
Reading lines...
Read 135842 sentence pairs
Reverse source sentence
Trimmed to 10599 sentence pairs
Counting words ...
Counting words:
fra 4345
eng 2803
['elle est hors de danger .', 'she is out of danger .']


# Width shared by the embeddings and the LSTM hidden state.
hidden_size = 256
# Feed the source sentence to the encoder in reversed order.
reverse_source_sentence = True
# Feed the ground-truth target token as the next decoder input during training.
use_teacher_forcing = True
# Build the attention-free encoder/decoder pair and move both to `device`.
encoder = EncoderLSTM(input_lang.n_words, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, output_lang.n_words).to(device)
# Print whether the first parameter of each model lives on a CUDA device.
print(">> Model is on: {}".format(next(encoder.parameters()).is_cuda))
print(">> Model is on: {}".format(next(decoder.parameters()).is_cuda))
>> Model is on: True
>> Model is on: True
# Number of single-sentence training steps.
iters = 50000
# Train the baseline model; log the average loss and record a plot point every 250 steps.
train(encoder, decoder, iters, reverse_source_sentence=reverse_source_sentence, use_teacher_forcing=use_teacher_forcing,print_every=250, plot_every=250)
# Randomly pick 10 sentences and inspect the translations.
evaluate_randomly(encoder, decoder, 10, reverse_source_sentence)
> je suis tres fier de nos etudiants .
= i m very proud of our students .
< i m very proud of you . <EOS>

> vous etes faibles .
= you re weak .
< you re rude . <EOS>

> tu n es pas si vieux .
= you re not that old .
< you re not that old . <EOS>

> je songe a demissionner immediatement .
= i am thinking of resigning at once .
< i m thinking about the problem . <EOS>

> je suis en retard sur le programme .
= i m behind schedule .
< i m behind schedule . <EOS>

> je suis submerge de travail .
= i m swamped with work .
< i m proud of that . <EOS>

> je ne vais pas prendre le moindre risque .
= i m not taking any chances .
< i m not taking any chances . <EOS>

> je suis au restaurant .
= i m at the restaurant .
< i m in the office . <EOS>

> c est toi la doyenne .
= you re the oldest .
< you re the oldest . <EOS>

> je suis tres reconnaissant pour votre aide .
= i m very grateful for your help .
< i m very worried about you . <EOS>


class AttentionDecoderLSTM(nn.Module):
    """LSTM decoder that attends over a fixed-size buffer of encoder outputs."""

    def __init__(self, hidden_size: int, output_size: int, dropout_p=0.1, max_length=MAX_LENGTH):
        """Build the decoder layers.

        Args:
            hidden_size: embedding width, also used as the LSTM hidden width.
            output_size: number of rows in the embedding table and number of
                output classes.
            dropout_p: dropout probability applied to the embedded input.
            max_length: number of attention slots; ``encoder_outputs`` passed
                to ``forward`` must have exactly this many rows.
        """
        super(AttentionDecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attention = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attention_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.activation_fn = F.relu

    def forward(self, inputs, state, encoder_outputs):
        """Decode a single step with attention.

        Args:
            inputs: tensor holding one token index.
            state: ``(hidden, cell)``, each of shape [1, 1, hidden_size].
            encoder_outputs: [max_length, hidden_size]

        Returns:
            output: [1, output_size] log-probabilities.
            state: ``(hidden, cell)`` after this step.
            attention_weights: [1, max_length] softmax weights over the
                encoder output rows.
        """
        # embedded: [1, 1, hidden_size]
        embedded = self.embedding(inputs).view(1, 1, -1)
        embedded = self.dropout(embedded)

        hidden, cell = state

        # embedded[0] and hidden[0] are both [1, hidden_size]; their
        # concatenation is scored against every attention slot.
        attention_scores = self.attention(torch.cat((embedded[0], hidden[0]), 1))
        attention_weights = F.softmax(attention_scores, dim=1)

        # torch.bmm multiplies matrices batch-wise, e.g. [10, 3, 4] x [10, 4, 5]
        # -> [10, 3, 5]. unsqueeze(0) adds the batch dimension so the result
        # is [1, 1, hidden_size].
        attention_applied = torch.bmm(attention_weights.unsqueeze(0),
                                      encoder_outputs.unsqueeze(0))

        # combined: [1, hidden_size * 2]
        combined = torch.cat((embedded[0], attention_applied[0]), 1)
        # output: [1, 1, hidden_size]
        output = self.attention_combine(combined).unsqueeze(0)
        output = self.activation_fn(output)

        # output: [1, 1, hidden_size]
        output, (hidden, cell) = self.lstm(output, (hidden, cell))

        # output: [1, output_size]
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, (hidden, cell), attention_weights

    def init_hidden(self):
        """Return an all-zero ``(hidden, cell)`` pair, each [1, 1, hidden_size].

        The state is created on the same device and with the same dtype as the
        module's own weights, so it always matches the model regardless of any
        global device setting.
        """
        weight = self.embedding.weight
        hidden = torch.zeros(1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype)
        cell = torch.zeros(1, 1, self.hidden_size, device=weight.device, dtype=weight.dtype)
        return hidden, cell
def train_by_sentence_attn(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer,
                           loss_fn, use_teacher_forcing=True, reverse_source_sentence=True,
                           max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair, with attention.

    Args:
        input_tensor: source token indices, indexed along dim 0
            (presumably shape [input_sequence_len, 1] -- confirm against
            tensor_from_sentence).
        target_tensor: target token indices, indexed along dim 0
            (presumably shape [target_sequence_len, 1] -- confirm).
        encoder: encoder model providing ``init_hidden`` and ``hidden_size``.
        decoder: attention decoder called as
            ``decoder(input, state, encoder_outputs)``.
        encoder_optimizer: optimizer for encoder
        decoder_optimizer: optimizer for decoder
        loss_fn: loss function applied to (decoder_output, target token)
        use_teacher_forcing: True feeds the target token as the next decoder
            input, False feeds the decoder's own top prediction and stops
            early once that prediction is EOS.
        reverse_source_sentence: True flips the source sequence along dim 0
            before encoding.
        max_length: number of rows in the encoder output buffer.

    Returns:
        loss: summed step loss divided by the target length (float).
    """
    if reverse_source_sentence:
        input_tensor = torch.flip(input_tensor, [0])

    target_length = target_tensor.size(0)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Collect one encoder output row per source token; unused rows stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    hidden, cell = encoder.init_hidden()
    for position, source_token in enumerate(input_tensor):
        step_output, (hidden, cell) = encoder(source_token, (hidden, cell))
        encoder_outputs[position] = step_output[0, 0]

    # The decoder continues from the final encoder state.
    decoder_input = torch.tensor([[SOS_token]], device=device)

    loss = 0
    for di in range(target_length):
        decoder_output, (hidden, cell), _ = decoder(decoder_input, (hidden, cell), encoder_outputs)
        loss += loss_fn(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: next input is the ground-truth token.
            decoder_input = target_tensor[di]
            continue

        # Otherwise feed back the top prediction and stop on EOS.
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length
def train_attn(encoder, decoder, n_iters, reverse_source_sentence=True, use_teacher_forcing=True,
               print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the seq2seq model with attention one sentence at a time.

    Args:
        encoder: EncoderLSTM
        decoder: attention decoder
        n_iters: number of training steps; each step uses one pair drawn with
            ``random.choice`` from ``pairs`` (so pairs may repeat).
        reverse_source_sentence: True reverses the source sentence and keeps
            the target order unchanged; False keeps both unchanged.
        use_teacher_forcing: True feeds the target as the next decoder input,
            False feeds the decoder's own predictions.
        print_every: print the running average loss every ``print_every`` steps.
        plot_every: record a plot point every ``plot_every`` steps.
        learning_rate: learning rate for both SGD optimizers.
    """
    started_at = time.time()

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # All training samples are drawn up front.
    training_pairs = [
        tensor_from_pair(random.choice(pairs), input_lang, output_lang)
        for _ in range(n_iters)
    ]
    loss_fn = nn.NLLLoss()

    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    for step, sample in enumerate(training_pairs, start=1):
        source = sample[0].to(device)
        target = sample[1].to(device)

        step_loss = train_by_sentence_attn(
            source, target, encoder, decoder,
            encoder_optimizer, decoder_optimizer, loss_fn,
            use_teacher_forcing=use_teacher_forcing,
            reverse_source_sentence=reverse_source_sentence,
        )
        print_loss_total += step_loss
        plot_loss_total += step_loss

        if step % print_every == 0:
            # Report the average over the last print window, then reset it.
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print("%s (%d %d%%) %.4f" % (time_since(started_at, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            # Store the average over the last plot window, then reset it.
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    show_plot(plot_losses)
def evaluate_by_sentence_attn(encoder, decoder, sentence, reverse_source_sentence=True, max_length=MAX_LENGTH):
    """Translate one source sentence with the attention model.

    Both models are switched to eval mode for the duration of the call so
    that dropout is disabled during inference; their previous train/eval
    mode is restored afterwards.

    Args:
        encoder: encoder model providing ``init_hidden`` and ``hidden_size``.
        decoder: attention decoder called as
            ``decoder(input, state, encoder_outputs)``.
        sentence: source sentence, converted with ``tensor_from_sentence``.
        reverse_source_sentence: True flips the source sequence before encoding.
        max_length: maximum number of decoding steps and number of rows in
            the encoder output buffer.

    Returns:
        decoded_words: list of predicted words; ends with "<EOS>" when the
            decoder emitted EOS within ``max_length`` steps.
        decoder_attentions: tensor with one row of attention weights per
            decoded word, shape [len(decoded_words), max_length].
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    # torch.no_grad() does not disable dropout; only eval mode does.
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensor_from_sentence(input_lang, sentence).to(device)
            if reverse_source_sentence:
                input_tensor = torch.flip(input_tensor, [0])

            # encoder outputs: [max_length, hidden_size]; unused rows stay zero.
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
            hidden, cell = encoder.init_hidden()
            for position, source_token in enumerate(input_tensor):
                step_output, (hidden, cell) = encoder(source_token, (hidden, cell))
                encoder_outputs[position] = step_output[0, 0]

            # The decoder continues from the final encoder state.
            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, (hidden, cell), decoder_attention = decoder(
                    decoder_input, (hidden, cell), encoder_outputs)
                topv, topi = decoder_output.data.topk(1)
                # Keep this step's attention weights.
                decoder_attentions[di] = decoder_attention.data
                predicted = topi.item()
                if predicted == EOS_token:
                    decoded_words.append("<EOS>")
                    break
                decoded_words.append(output_lang.index2word[predicted])
                decoder_input = topi.squeeze().detach()

            # One attention row was written per decoded word, so the word
            # count is the number of valid rows (also when max_length is 0).
            return decoded_words, decoder_attentions[:len(decoded_words)]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)
def show_attention(input_sentence, output_words, attentions):
    """Draw the attention weights between input and output words as a heat map.

    Args:
        input_sentence: source sentence; split on single spaces for the x labels.
        output_words: list of output words used for the y labels.
        attentions: tensor of attention weights; converted with ``.numpy()``.
    """
    # Figure with a colour bar next to the matrix.
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # NOTE(review): the leading '' is presumably a placeholder for a tick
    # that falls before the first column -- confirm with the matplotlib
    # version in use.
    x_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    y_labels = [''] + output_words
    axes.set_xticklabels(x_labels, rotation=90)
    axes.set_yticklabels(y_labels)

    # One major tick per row and per column.
    axes.xaxis.set_major_locator(ticker.MultipleLocator(1))
    axes.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
def evaluate_and_show_attention(input_sentence, encoder, decoder):
    """Translate ``input_sentence``, print it with its translation and plot the attention.

    Args:
        input_sentence: source sentence to translate.
        encoder: encoder model passed to ``evaluate_by_sentence_attn``.
        decoder: decoder model passed to ``evaluate_by_sentence_attn``.
    """
    # NOTE(review): evaluate_by_sentence_attn is called with its default
    # reverse_source_sentence=True, so the attention columns presumably follow
    # the reversed token order while show_attention labels them in forward
    # order -- confirm.
    result = evaluate_by_sentence_attn(encoder, decoder, input_sentence)
    output_words, attentions = result

    translation = ' '.join(output_words)
    print('input =', input_sentence)
    print('output =', translation)

    show_attention(input_sentence, output_words, attentions)
# Width shared by the embeddings and the LSTM hidden state.
hidden_size = 256
# Reverse the order of the source input sentence.
reverse_source_sentence = True
# Feed the ground-truth target token as the next decoder input.
use_teacher_forcing = True
# Build a fresh encoder and the attention decoder, and move both to `device`.
encoder = EncoderLSTM(input_lang.n_words, hidden_size).to(device)
decoder = AttentionDecoderLSTM(hidden_size, output_lang.n_words).to(device)
# Print whether the first parameter of each model lives on a CUDA device.
print(">> Model is on: {}".format(next(encoder.parameters()).is_cuda))
print(">> Model is on: {}".format(next(decoder.parameters()).is_cuda))
>> Model is on: True
>> Model is on: True
# Number of single-sentence training steps.
iters = 50000
# Train the attention model; log the average loss and record a plot point every 250 steps.
train_attn(encoder, decoder, iters, reverse_source_sentence=reverse_source_sentence, use_teacher_forcing=use_teacher_forcing,print_every=250, plot_every=250)
# Translate four sample sentences and plot the attention weights for each.
evaluate_and_show_attention("elle a cinq ans de moins que moi .", encoder, decoder)
evaluate_and_show_attention("elle est trop petit .", encoder, decoder)
evaluate_and_show_attention("je ne crains pas de mourir .", encoder, decoder)
evaluate_and_show_attention("c est un jeune directeur plein de talent .", encoder, decoder)
input = elle a cinq ans de moins que moi .
output = she is two years younger than me . <EOS>

input = elle est trop petit .
output = she is too drunk . <EOS>

input = je ne crains pas de mourir .
output = i m not afraid of making mistakes . <EOS>

input = c est un jeune directeur plein de talent .
output = he s a very talented writer . <EOS>


