• Machine translation (MT): automatically translating a piece of text from one language into another; solving this task with neural networks is usually called neural machine translation (NMT). Key characteristics: the output is a sequence of words rather than a single word, and the length of the output sequence may differ from that of the source sequence. For example, the English sentence "They are watching." and its French translation "Ils regardent." contain different numbers of tokens.

Data preprocessing
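
The snippets below do not repeat their imports; they appear to rely on roughly the following set (d2l refers to the course's accompanying utility package, so the exact import path may differ in your environment):

import math
import time
import torch
from torch import nn, optim
from torch.utils import data
import d2l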

  • Removing special characters
def preprocess_raw(text):
    # Replace non-breaking spaces with ordinary spaces and insert a space
    # before punctuation that directly follows a word.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    out = ''
    for i, char in enumerate(text.lower()):
        if char in (',', '!', '.') and i > 0 and text[i-1] != ' ':
            out += ' '
        out += char
    return out
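
As a quick check, here is what the function does to a short made-up sentence (the input string is purely illustrative):

raw = 'Go away!\u202fI am busy.'
preprocess_raw(raw)
# -> 'go away ! i am busy .'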
  • Tokenization
num_examples = 50000
source, target = [], []
# `text` is the preprocessed corpus from the previous step; each line is
# 'source<TAB>target', and both sides are split into word lists.
for i, line in enumerate(text.split('\n')):
    if i > num_examples:
        break
    parts = line.split('\t')
    if len(parts) >= 2:
        source.append(parts[0].split(' '))
        target.append(parts[1].split(' '))
source[0:3], target[0:3]
  • Building the vocabulary
def build_vocab(tokens):
    # Flatten the token lists, drop words appearing fewer than 3 times,
    # and reserve special tokens such as <pad>, <bos> and <eos>.
    tokens = [token for line in tokens for token in line]
    return d2l.data.base.Vocab(tokens, min_freq=3, use_special_tokens=True)
  • Loading the data set
def pad(line, max_len, padding_token):
    # Truncate or pad every line to exactly max_len tokens.
    if len(line) > max_len:
        return line[:max_len]
    return line + [padding_token] * (max_len - len(line))

def build_array(lines, vocab, max_len, is_source):
    lines = [vocab[line] for line in lines]
    if not is_source:
        lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
    array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
    valid_len = (array != vocab.pad).sum(1)  # number of non-padding tokens per line
    return array, valid_len

def load_data_nmt(batch_size, max_len):  # This function is saved in d2l.
    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
    train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter
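
To sanity-check the pipeline, one can draw a single batch and inspect its shapes; this is only a usage sketch on top of the functions above:

src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=2, max_len=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X.shape, 'valid lengths:', X_valid_len)
    print('Y:', Y.shape, 'valid lengths:', Y_valid_len)
    break
# X and Y have shape (2, 8); the valid lengths count the non-padding tokens
# (the target side additionally includes <bos> and <eos>).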
  • Implementation mechanism (illustrated)

  • Encoder
class Seq2SeqEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)

    def begin_state(self, batch_size, device):
        return [torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device),
                torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device)]

    def forward(self, X, *args):
        X = self.embedding(X)   # X shape: (batch_size, seq_len, embed_size)
        X = X.transpose(0, 1)   # the RNN needs the first axis to be time
        out, state = self.rnn(X)
        # out shape: (seq_len, batch_size, num_hiddens).
        # state contains the hidden state and the memory cell of the last
        # time step, each of shape (num_layers, batch_size, num_hiddens).
        return out, state

encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
output.shape, len(state), state[0].shape, state[1].shape
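
The training code further below also constructs a Seq2SeqDecoder, which this post never defines. A minimal sketch that mirrors the encoder above and the d2l.Decoder interface could look as follows (this is a reconstruction, not the original author's code):

class Seq2SeqDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqDecoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, *args):
        # Reuse the encoder's final hidden state and memory cell.
        return enc_outputs[1]

    def forward(self, X, state):
        X = self.embedding(X).transpose(0, 1)   # (seq_len, batch_size, embed_size)
        out, state = self.rnn(X, state)
        # Map hidden states to vocabulary logits: (batch_size, seq_len, vocab_size)
        out = self.dense(out).transpose(0, 1)
        return out, state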
  • Loss function
def SequenceMask(X, X_len, value=0):
    # Set every position at or beyond the valid length of its sequence to `value`.
    maxlen = X.size(1)
    mask = torch.arange(maxlen)[None, :].to(X_len.device) < X_len[:, None]
    X[~mask] = value
    return X

X = torch.tensor([[1, 2, 3], [4, 5, 6]])
SequenceMask(X, torch.tensor([1, 2]))

class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    # pred shape: (batch_size, seq_len, vocab_size)
    # label shape: (batch_size, seq_len)
    # valid_length shape: (batch_size, )
    def forward(self, pred, label, valid_length):
        # The sample weights have shape (batch_size, seq_len):
        # 1 for valid positions, 0 for padding.
        weights = torch.ones_like(label)
        weights = SequenceMask(weights, valid_length).float()
        self.reduction = 'none'
        output = super(MaskedSoftmaxCELoss, self).forward(pred.transpose(1, 2), label)
        return (output * weights).mean(dim=1)
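
A quick test shows that positions beyond the valid length contribute nothing to the loss; with uniform logits every valid position costs log(10) ≈ 2.3026:

loss = MaskedSoftmaxCELoss()
pred = torch.ones(3, 4, 10)                 # (batch_size, seq_len, vocab_size)
label = torch.ones(3, 4, dtype=torch.long)  # (batch_size, seq_len)
loss(pred, label, torch.tensor([4, 2, 0]))
# ≈ tensor([2.3026, 1.1513, 0.0000]): the third sequence has valid length 0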
  • Training
def train_ch7(model, data_iter, lr, num_epochs, device):  # Saved in d2l
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs + 1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]
            # Teacher forcing: feed Y[:, :-1] to the decoder and predict Y[:, 1:].
            Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
            Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
            l = loss(Y_hat, Y_label, Y_vlen).sum()
            l.backward()
            with torch.no_grad():
                d2l.grad_clipping_nn(model, 5, device)
            num_tokens = Y_vlen.sum().item()
            optimizer.step()
            l_sum += l.sum().item()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch {0:4d}, loss {1:.3f}, time {2:.1f} sec".format(
                epoch, l_sum / num_tokens_sum, time.time() - tic))
            tic = time.time()

embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs, ctx = 0.005, 300, d2l.try_gpu()
src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(batch_size, max_len, num_examples)
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs, ctx)
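
The post stops at training; translating a new sentence additionally needs greedy decoding. A possible sketch is shown below. The function name translate_greedy is mine, and it assumes that the Vocab class exposes pad/bos/eos ids (as used above) plus a to_tokens method for mapping ids back to words, and that d2l.EncoderDecoder stores its parts as model.encoder and model.decoder:

def translate_greedy(model, src_sentence, src_vocab, tgt_vocab, max_len, device):
    # Encode the (already preprocessed) source sentence.
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    src_len = torch.tensor([len(src_tokens)], device=device)
    enc_X = torch.tensor(pad(src_tokens, max_len, src_vocab.pad),
                         device=device).unsqueeze(0)      # (1, max_len)
    enc_outputs = model.encoder(enc_X, src_len)
    dec_state = model.decoder.init_state(enc_outputs, src_len)
    # Decode greedily, starting from <bos> and stopping at <eos>.
    dec_X = torch.tensor([[tgt_vocab.bos]], device=device)
    output_tokens = []
    for _ in range(max_len):
        Y, dec_state = model.decoder(dec_X, dec_state)
        dec_X = Y.argmax(dim=2)       # feed the most likely token back in
        token = int(dec_X.item())
        if token == tgt_vocab.eos:
            break
        output_tokens.append(token)
    return ' '.join(tgt_vocab.to_tokens(output_tokens))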

Adding an attention mechanism

  • Attention mechanism (illustrated)
  • Code implementation
  • Softmax masking
def SequenceMask(X, X_len, value=-1e6):
    # Variant used for attention: fill positions at or beyond the valid length
    # with a large negative value so that softmax gives them ~0 weight.
    maxlen = X.size(1)
    X_len = X_len.to(X.device)
    mask = torch.arange(maxlen, dtype=torch.float, device=X.device)[None, :] >= X_len[:, None]
    X[mask] = value
    return X

def masked_softmax(X, valid_length):
    # X: 3-D tensor, valid_length: 1-D or 2-D tensor
    softmax = nn.Softmax(dim=-1)
    if valid_length is None:
        return softmax(X)
    shape = X.shape
    if valid_length.dim() == 1:
        # Repeat each valid length once per query.
        try:
            valid_length = torch.FloatTensor(
                valid_length.numpy().repeat(shape[1], axis=0))
        except:
            valid_length = torch.FloatTensor(
                valid_length.cpu().numpy().repeat(shape[1], axis=0))
    else:
        valid_length = valid_length.reshape((-1,))
    # Fill masked elements with a large negative value, whose exp is ~0.
    X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
    return softmax(X).reshape(shape)
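
A small test makes the masking visible: each row of attention weights sums to 1 over its valid prefix only (the random input values are arbitrary):

masked_softmax(torch.rand(2, 2, 4), torch.FloatTensor([2, 3]))
# In the first batch element only the first 2 columns are non-zero,
# in the second only the first 3.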
  • Dot-product attention implementation
# Save to the d2l package.
class DotProductAttention(nn.Module):
    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # query: (batch_size, #queries, d)
    # key: (batch_size, #kv_pairs, d)
    # value: (batch_size, #kv_pairs, dim_v)
    # valid_length: either (batch_size, ) or (batch_size, #queries)
    def forward(self, query, key, value, valid_length=None):
        d = query.shape[-1]
        # Swap the last two dimensions of key so that bmm computes the scaled
        # dot product between each query and every key.
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)
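
The layer can be exercised with toy tensors; because all keys are identical, each query simply averages the values within its valid length:

atten = DotProductAttention(dropout=0)
keys = torch.ones(2, 10, 2)
values = torch.arange(40, dtype=torch.float).view(1, 10, 4).repeat(2, 1, 1)
atten(torch.ones(2, 1, 2), keys, values, torch.FloatTensor([2, 6]))
# shape (2, 1, 4): the first output averages values[0, :2],
# the second averages values[1, :6]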
  • Multilayer perceptron (MLP) attention
# Save to the d2l package.
class MLPAttention(nn.Module):
    def __init__(self, units, ipt_dim, dropout, **kwargs):
        super(MLPAttention, self).__init__(**kwargs)
        self.W_k = nn.Linear(ipt_dim, units, bias=False)
        self.W_q = nn.Linear(ipt_dim, units, bias=False)
        self.v = nn.Linear(units, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length):
        query, key = self.W_q(query), self.W_k(key)
        # Expand query to (batch_size, #queries, 1, units) and key to
        # (batch_size, 1, #kv_pairs, units), then add them with broadcasting.
        features = query.unsqueeze(2) + key.unsqueeze(1)
        scores = self.v(features).squeeze(-1)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)
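
Despite the different scoring function, MLPAttention exposes the same interface, so the toy keys and values from the dot-product example can be reused (units=8 is an arbitrary choice):

atten = MLPAttention(units=8, ipt_dim=2, dropout=0)
atten(torch.ones(2, 1, 2), keys, values, torch.FloatTensor([2, 6]))
# same output shape (2, 1, 4) as with DotProductAttention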
  • Code changes after adding attention
class Seq2SeqAttentionDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention_cell = MLPAttention(num_hiddens, num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size + num_hiddens, num_hiddens, num_layers,
                           dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        outputs, hidden_state = enc_outputs
        # Transpose outputs to (batch_size, seq_len, num_hiddens).
        return (outputs.permute(1, 0, 2), hidden_state, enc_valid_len)

    def forward(self, X, state):
        enc_outputs, hidden_state, enc_valid_len = state
        X = self.embedding(X).transpose(0, 1)  # (seq_len, batch_size, embed_size)
        outputs = []
        for x in X:
            # Use the hidden state of the last RNN layer as the query,
            # shape (batch_size, 1, num_hiddens).
            query = hidden_state[0][-1].unsqueeze(1)
            # The context has the same shape as the query.
            context = self.attention_cell(query, enc_outputs, enc_outputs,
                                          enc_valid_len)
            # Concatenate on the feature dimension, giving the RNN input
            # shape (1, batch_size, embed_size + num_hiddens) after transposing.
            x = torch.cat((context, x.unsqueeze(1)), dim=-1)
            out, hidden_state = self.rnn(x.transpose(0, 1), hidden_state)
            outputs.append(out)
        outputs = self.dense(torch.cat(outputs, dim=0))
        return outputs.transpose(0, 1), [enc_outputs, hidden_state, enc_valid_len]
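
A shape check for the attention decoder, mirroring the encoder test above (the all-zero batch is only a placeholder input):

encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7), dtype=torch.long)
state = decoder.init_state(encoder(X), None)
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, state[1][0].shape
# (torch.Size([4, 7, 10]), 3, torch.Size([4, 7, 16]), torch.Size([2, 4, 16]))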
