来源

B站up：deep_thoughts
https://www.bilibili.com/video/BV1cP4y1V7GF/?spm_id_from=333.1007.top_right_bar_window_history.content.click&vd_source=46b0ded1b361f3be84555a12b5121509

word embedding

1.序列建模：source和target，里面的字符是单词索引
2.创建embedding_table，1中的索引表示对应table的位置，0留给padding
3.创建src_embedding_table或tgt_embedding_table

第2步

batch_size = 2  # number of sentences in one batch

生成src_len、tgt_len

# Draw one random sentence length in [2, 5) for every item of the batch.
length_shape = (batch_size,)
src_len = torch.randint(low=2, high=5, size=length_shape)
tgt_len = torch.randint(low=2, high=5, size=length_shape)

使其固定

# Fixed int32 sentence lengths so that every run produces the same shapes.
src_len = torch.tensor([2, 4], dtype=torch.int32)
tgt_len = torch.tensor([4, 3], dtype=torch.int32)

此时设置单词最大数为8

# Every sentence is a 1-D tensor of random word indices drawn from
# [1, max_num_*_words); index 0 is never drawn here.
# Example src_seq: [tensor([7, 4]), tensor([2, 1, 5, 4])]
# Example tgt_seq: [tensor([4, 4, 4, 3]), tensor([5, 7, 6])]
src_seq = [
    torch.randint(low=1, high=max_num_src_words, size=(n,))
    for n in src_len
]
tgt_seq = [
    torch.randint(low=1, high=max_num_tgt_words, size=(n,))
    for n in tgt_len
]

L分别取2、4：(L,)是表示形状的元组，L=2时生成长度为2的一维张量，L=4时生成长度为4的一维张量。
因为各句子长度不一样，需要进行padding，使其对齐。
使用F.pad()补齐：F.pad(补齐对象，（左边补几个，右边补几个）)，默认补0。
此时生成的是一个列表，里面有两个元素（每个句子一个张量）。

# Right-pad every random sentence with zeros up to the longest length in its batch.
longest_src = max(src_len)
longest_tgt = max(tgt_len)
src_seq = [
    F.pad(torch.randint(1, max_num_src_words, (n,)), (0, longest_src - n))
    for n in src_len
]
tgt_seq = [
    F.pad(torch.randint(1, max_num_tgt_words, (n,)), (0, longest_tgt - n))
    for n in tgt_len
]

src_seq:[tensor([5, 3, 0, 0, 0]), tensor([4, 2, 1, 5, 0])]
tgt_seq:[tensor([6, 3, 1, 5, 0]), tensor([1, 2, 5, 0, 0])]
进行拼接
使用torch.cat()拼接

# Give each padded sentence a leading batch axis, then concatenate along it so the
# list of sentences becomes a single 2-D tensor.
longest_src = max(src_len)
longest_tgt = max(tgt_len)
src_seq = torch.cat([
    F.pad(torch.randint(1, max_num_src_words, (n,)), (0, longest_src - n)).unsqueeze(0)
    for n in src_len
])
tgt_seq = torch.cat([
    F.pad(torch.randint(1, max_num_tgt_words, (n,)), (0, longest_tgt - n)).unsqueeze(0)
    for n in tgt_len
])

拼接前 src_seq:[tensor([5, 3, 0, 0, 0]), tensor([4, 2, 1, 5, 0])]
每个句子先由【5】经unsqueeze变为【1，5】，再沿第0维拼接，得到形状为【2，5】的src_seq
由列表变成了一个张量

第3步

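nn.Embedding(num_embeddings, embedding_dim)：第一个参数是embedding表的行数（这里是单词数+1，第0行留给padding），第二个参数是每个embedding向量的维度（model_dim）。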

# Show, in order: the table module, its weight matrix, the index batch and the looked-up vectors.
for shown in (src_embedding_table, src_embedding_table.weight, src_seq, src_embedding):
    print(shown)

src_seq中索引是几，就是src_embedding_table中的第几行。

代码

batch_size = 2

# Vocabulary size: exclusive upper bound of the random word indices drawn below.
max_num_src_words = 8
max_num_tgt_words = 8
# Dimension of every word-embedding vector.
model_dim = 8
# Maximum number of words in one sentence.
max_src_seq_len = 5
max_tgt_seq_len = 5

# Fixed sentence lengths so the example is reproducible.
src_len = torch.tensor([2, 4], dtype=torch.int32)
tgt_len = torch.tensor([4, 3], dtype=torch.int32)
# Plain-int lengths avoid passing 0-dim tensors as size / padding arguments.
max_src_len = int(src_len.max())
max_tgt_len = int(tgt_len.max())

# Build the source and target batches: each sentence is a tensor of random word
# indices in [1, max_num_*_words), right-padded with 0 to the longest sentence and
# concatenated into one [batch_size, max_len] tensor.
# Unpadded example: src_seq [tensor([7, 4]), tensor([2, 1, 5, 4])]
#                   tgt_seq [tensor([4, 4, 4, 3]), tensor([5, 7, 6])]
src_seq = torch.cat([
    torch.unsqueeze(
        F.pad(torch.randint(1, max_num_src_words, (length,)), (0, max_src_len - length)), 0
    )
    for length in src_len.tolist()
])
tgt_seq = torch.cat([
    torch.unsqueeze(
        F.pad(torch.randint(1, max_num_tgt_words, (length,)), (0, max_tgt_len - length)), 0
    )
    for length in tgt_len.tolist()
])

# Word-embedding tables: one row per index. Word indices start at 1 and padding
# uses 0, so row 0 is the row looked up by padded positions (hence the +1 rows).
# A word with index k is mapped to row k of the table.
src_embedding_table = nn.Embedding(max_num_src_words + 1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words + 1, model_dim)
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

position embedding

# Column vector of positions 0 .. max_position_len - 1.
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
# Row vector of denominators 10000^(2i / model_dim), one per even feature index 2i.
even_dims = torch.arange(0, model_dim, 2).reshape(1, -1)
i_mat = torch.pow(10000, even_dims / model_dim)
print(pos_mat)
print(i_mat)

# Even columns receive the sine and odd columns the cosine of position / denominator.
angle_mat = pos_mat / i_mat
pe_embedding_table[:, 0::2] = torch.sin(angle_mat)
pe_embedding_table[:, 1::2] = torch.cos(angle_mat)

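偶数列：$PE_{(pos,\,2i)} = \sin\left(pos / 10000^{2i/d_{\text{model}}}\right)$

奇数列：$PE_{(pos,\,2i+1)} = \cos\left(pos / 10000^{2i/d_{\text{model}}}\right)$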


# Wrap the sinusoidal table in an nn.Embedding so that positions can be looked up by
# index; requires_grad=False keeps the table out of gradient updates.
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)
print(pe_embedding_table)
print(pe_embedding)
print(pe_embedding.weight)

# First attempt: one position-index tensor per sentence, collected in a Python list.
# NOTE: the embedding lookup needs a single Tensor of indices, so passing these lists
# raises the TypeError quoted right after this snippet.
src_pos = [torch.arange(max(src_len)) for _ in src_len]
tgt_pos = [torch.arange(max(tgt_len)) for _ in tgt_len]
src_pos_embedding = pe_embedding(src_pos)
tgt_pos_embedding = pe_embedding(tgt_pos)

TypeError: embedding(): argument ‘indices’ (position 2) must be Tensor, not list
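原因：src_pos、tgt_pos是由多个张量组成的Python列表，而embedding的索引必须是单个Tensor；需要先用unsqueeze加上batch维，再用torch.cat把它们拼接成一个【batch_size，max_len】的张量。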

# Build the position embedding.
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
i_mat = torch.pow(10000, torch.arange(0, model_dim, 2).reshape(1, -1) / model_dim)
pe_embedding_table = torch.zeros(max_position_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)

# Overwrite the weight of an nn.Embedding with the fixed sinusoidal table.
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

# Position indices 0 .. max_len - 1 repeated once per sentence and concatenated into a
# single index tensor. torch.cat already returns a tensor, so no torch.Tensor(...)
# wrapper is needed.
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)), 0) for _ in src_len])
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)), 0) for _ in tgt_len])
src_pos_embedding = pe_embedding(src_pos)
tgt_pos_embedding = pe_embedding(tgt_pos)

longest_src = max(src_len)
# v: one row per sentence, 1 for a real token and 0 for a padded position.
v = torch.cat([
    F.pad(torch.ones(n), (0, longest_src - n)).unsqueeze(0) for n in src_len
])
# The same flags with an extra trailing axis, so each sentence is a column vector.
valid_encoder_pos = torch.cat([
    F.pad(torch.ones(n), (0, longest_src - n)).unsqueeze(0) for n in src_len
]).unsqueeze(2)
# Batched outer product of the flags with themselves: entry (i, j) is 1 only when
# both position i and position j are real tokens.
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
for shown in (v, valid_encoder_pos, valid_encoder_pos.shape, src_len, valid_encoder_pos_matrix):
    print(shown)

对于该句子，前面有俩单词，第一行是第一个单词对其他位置的关联性，由于剩下两个是pad的0，所以相关性为0。

# Flip the validity matrix: 1 now marks a pair that involves a padded position.
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
# True marks a position that has to be masked.
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)
print(invalid_encoder_pos_matrix)
print(mask_encoder_self_attention)

# Per-sentence validity flags (1 = real token, 0 = padding) with a trailing axis.
valid_encoder_pos = torch.unsqueeze(
    torch.cat([
        torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len) - L)), 0) for L in src_len
    ]),
    2,
)
# Multiplying the flags by their transpose gives the pairwise validity of positions.
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
# True marks a position that has to be masked.
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)

# Random scores stand in for Q @ K^T; masked entries are set to -1e9 so that the
# softmax assigns them (numerically) zero probability.
score = torch.randn(batch_size, max(src_len), max(src_len))
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, -1)
print(score)
print(masked_score)
print(prob)

masked_fill（mask，value），mask是元素为布尔值的张量（Tensor），把true位置填充value值。

代码

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

batch_size = 2

# Vocabulary size: exclusive upper bound of the random word indices drawn below.
max_num_src_words = 8
max_num_tgt_words = 8
# Dimension of every embedding vector.
model_dim = 8
# Maximum number of words one sentence may contain.
max_src_seq_len = 5
max_tgt_seq_len = 5
# Number of rows of the position-embedding table.
max_position_len = 5

# Fixed sentence lengths so the example is reproducible.
src_len = torch.tensor([2, 4], dtype=torch.int32)
tgt_len = torch.tensor([4, 3], dtype=torch.int32)
# Plain-int lengths avoid passing 0-dim tensors as size / padding arguments.
max_src_len = int(src_len.max())
max_tgt_len = int(tgt_len.max())

# Source and target batches: every sentence is a tensor of random word indices in
# [1, max_num_*_words), right-padded with 0 and concatenated to [batch_size, max_len].
# Unpadded example: src_seq [tensor([7, 4]), tensor([2, 1, 5, 4])]
#                   tgt_seq [tensor([4, 4, 4, 3]), tensor([5, 7, 6])]
src_seq = torch.cat([
    torch.unsqueeze(
        F.pad(torch.randint(1, max_num_src_words, (length,)), (0, max_src_len - length)), 0
    )
    for length in src_len.tolist()
])
tgt_seq = torch.cat([
    torch.unsqueeze(
        F.pad(torch.randint(1, max_num_tgt_words, (length,)), (0, max_tgt_len - length)), 0
    )
    for length in tgt_len.tolist()
])

# Word embedding: one table row per index. Words start at 1 and padding is 0, so
# row 0 is the row looked up by padded positions; word index k selects row k.
src_embedding_table = nn.Embedding(max_num_src_words + 1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words + 1, model_dim)
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

# Position embedding: sine on even columns, cosine on odd columns.
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
i_mat = torch.pow(10000, torch.arange(0, model_dim, 2).reshape(1, -1) / model_dim)
pe_embedding_table = torch.zeros(max_position_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)

# Overwrite the weight of an nn.Embedding with the fixed sinusoidal table.
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

# Position indices 0 .. max_len - 1 repeated once per sentence: [batch_size, max_len].
src_pos = torch.cat([torch.unsqueeze(torch.arange(max_src_len), 0) for _ in src_len])
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max_tgt_len), 0) for _ in tgt_len])
src_pos_embedding = pe_embedding(src_pos)
tgt_pos_embedding = pe_embedding(tgt_pos)

# Softmax demo (kept disabled): why the scaling factor matters.
# alpha1=0.1
# alpha2=10
# score=torch.randn(5)
# prob1=F.softmax(score*alpha1,-1)
# prob2=F.softmax(score*alpha2,-1)
# def softmax_fun(score):
#     return F.softmax(score,-1)
# jaco_mat1=torch.autograd.functional.jacobian(softmax_fun,score*alpha1)
# jaco_mat2=torch.autograd.functional.jacobian(softmax_fun,score*alpha2)

# Encoder self-attention mask: a pairwise relation matrix with no causality.
# Mask shape: [batch_size, max_src_len, max_src_len]. True marks a pair that involves
# padding; its score is replaced by -1e9 so the softmax gives it ~0 probability.
# valid_encoder_pos: [2, 4] => [2, 4, 1] (unsqueeze on axis 2).
valid_encoder_pos = torch.unsqueeze(
    torch.cat([
        torch.unsqueeze(F.pad(torch.ones(length), (0, max_src_len - length)), 0)
        for length in src_len.tolist()
    ]),
    2,
)
# Multiplying the flags by their transpose gives the pairwise validity of positions.
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
# True marks a position that has to be masked.
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)

# Random scores stand in for Q @ K^T.
score = torch.randn(batch_size, max_src_len, max_src_len)
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, -1)
print(score)
print(masked_score)
print(prob)

decoder

# Step 6: build the decoder self-attention mask.
# One lower-triangular matrix of ones per target sentence, sized by that sentence's length.
tril_matrix = [torch.ones(n, n).tril() for n in tgt_len]
print(tril_matrix)

下三角矩阵中的1表示该位置对当前行可见，0表示不可见。
对于第一行，解码器只能看到输入的第一个位置，即特殊的起始字符；解码器的输入相对输出右移了一位（开头多了一个起始字符），因此输入和输出之间刚好有一位的偏移。
对于第二行，解码器的输入是特殊字符和第一个字符，用来预测下一个字符。

代码

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

batch_size = 2

# Vocabulary size: exclusive upper bound of the random word indices drawn below.
max_num_src_words = 8
max_num_tgt_words = 8
# Dimension of every embedding vector.
model_dim = 8
# Maximum number of words one sentence may contain.
max_src_seq_len = 5
max_tgt_seq_len = 5
# Number of rows of the position-embedding table.
max_position_len = 5

# Fixed sentence lengths so the example is reproducible.
src_len = torch.tensor([2, 4], dtype=torch.int32)
tgt_len = torch.tensor([4, 3], dtype=torch.int32)
# Plain-int lengths avoid passing 0-dim tensors as size / padding arguments.
max_src_len = int(src_len.max())
max_tgt_len = int(tgt_len.max())

# Step 1: source and target batches. Every sentence is a tensor of random word indices
# in [1, max_num_*_words), right-padded with 0 and concatenated to [batch_size, max_len].
# Unpadded example: src_seq [tensor([7, 4]), tensor([2, 1, 5, 4])]
#                   tgt_seq [tensor([4, 4, 4, 3]), tensor([5, 7, 6])]
src_seq = torch.cat([
    torch.unsqueeze(
        F.pad(torch.randint(1, max_num_src_words, (length,)), (0, max_src_len - length)), 0
    )
    for length in src_len.tolist()
])
tgt_seq = torch.cat([
    torch.unsqueeze(
        F.pad(torch.randint(1, max_num_tgt_words, (length,)), (0, max_tgt_len - length)), 0
    )
    for length in tgt_len.tolist()
])

# Step 2: word embedding. One table row per index; words start at 1 and padding is 0,
# so row 0 is the row looked up by padded positions; word index k selects row k.
src_embedding_table = nn.Embedding(max_num_src_words + 1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words + 1, model_dim)
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

# Step 3: position embedding. Sine on even columns, cosine on odd columns.
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
i_mat = torch.pow(10000, torch.arange(0, model_dim, 2).reshape(1, -1) / model_dim)
pe_embedding_table = torch.zeros(max_position_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)

# Overwrite the weight of an nn.Embedding with the fixed sinusoidal table.
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

# Position indices 0 .. max_len - 1 repeated once per sentence: [batch_size, max_len].
src_pos = torch.cat([torch.unsqueeze(torch.arange(max_src_len), 0) for _ in src_len])
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max_tgt_len), 0) for _ in tgt_len])
src_pos_embedding = pe_embedding(src_pos)
tgt_pos_embedding = pe_embedding(tgt_pos)

# Softmax demo (kept disabled): why the scaling factor matters.
# alpha1=0.1
# alpha2=10
# score=torch.randn(5)
# prob1=F.softmax(score*alpha1,-1)
# prob2=F.softmax(score*alpha2,-1)
# def softmax_fun(score):
#     return F.softmax(score,-1)
# jaco_mat1=torch.autograd.functional.jacobian(softmax_fun,score*alpha1)
# jaco_mat2=torch.autograd.functional.jacobian(softmax_fun,score*alpha2)

# Step 4: encoder self-attention mask, a pairwise relation matrix with no causality.
# Mask shape: [batch_size, max_src_len, max_src_len]. True marks a pair that involves
# padding; its score is replaced by -1e9 so the softmax gives it ~0 probability.
# valid_encoder_pos: [2, 4] => [2, 4, 1] (unsqueeze on axis 2).
valid_encoder_pos = torch.unsqueeze(
    torch.cat([
        torch.unsqueeze(F.pad(torch.ones(length), (0, max_src_len - length)), 0)
        for length in src_len.tolist()
    ]),
    2,
)
# Multiplying the flags by their transpose gives the pairwise validity of positions.
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
# True marks a position that has to be masked.
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)

# Random scores stand in for Q @ K^T.
score = torch.randn(batch_size, max_src_len, max_src_len)
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, -1)

# Step 5: encoder-decoder (cross) attention mask.
# Q @ K^T shape: [batch_size, tgt_seq_len, src_seq_len]
valid_decoder_pos = torch.unsqueeze(
    torch.cat([
        torch.unsqueeze(F.pad(torch.ones(length), (0, max_tgt_len - length)), 0)
        for length in tgt_len.tolist()
    ]),
    2,
)
# Entry (i, j) is 1 only when target position i and source position j are both real.
valid_cross_pos_matrix = torch.bmm(valid_decoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_cross_pos_matrix = 1 - valid_cross_pos_matrix
mask_cross_attention = invalid_cross_pos_matrix.to(torch.bool)

score = torch.randn(batch_size, max_tgt_len, max_src_len)
masked_cross_score = score.masked_fill(mask_cross_attention, -1e9)
prob2 = F.softmax(masked_cross_score, -1)

# Step 6: decoder self-attention mask. A lower-triangular block per sentence, padded
# with zeros on the right and at the bottom up to [max_tgt_len, max_tgt_len].
valid_decoder_tril_matrix = torch.cat([
    torch.unsqueeze(
        F.pad(
            torch.tril(torch.ones(length, length)),
            (0, max_tgt_len - length, 0, max_tgt_len - length),
        ),
        0,
    )
    for length in tgt_len.tolist()
], 0)
invalid_decoder_tril_matrix = 1 - valid_decoder_tril_matrix
invalid_decoder_tril_matrix = invalid_decoder_tril_matrix.to(torch.bool)

score = torch.randn(batch_size, max_tgt_len, max_tgt_len)
masked_score = score.masked_fill(invalid_decoder_tril_matrix, -1e9)
prob = F.softmax(masked_score, -1)

# Build the scaled dot-product attention.
# Q, K, V shape: [batch_size * num_head, seq_len, model_dim / num_head]
def scaled_dot_product_attention(Q, K, V, attn_mask):
    """Return the attention context ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    Args:
        Q: query tensor, 3-D as required by ``torch.bmm``.
        K: key tensor, 3-D, with the same batch and feature size as ``Q``.
        V: value tensor, 3-D, with the same batch size and key length as ``K``.
        attn_mask: boolean tensor broadcastable to the score shape; entries that
            are True are filled with -1e9 before the softmax.

    Returns:
        The context tensor ``prob @ V``.
    """
    # Scale by the square root of the feature size of Q's last axis, so the function
    # does not depend on a global and stays correct when the features are split
    # across heads.
    d_k = Q.shape[-1]
    score = torch.bmm(Q, K.transpose(-2, -1)) / (d_k ** 0.5)
    # Masked positions get a very negative score, i.e. ~0 probability after softmax.
    masked_score = score.masked_fill(attn_mask, -1e9)
    prob = F.softmax(masked_score, -1)
    context = torch.bmm(prob, V)
    return context

1.Transformer的word embedding、position embedding、编码器自注意力的掩码相关推荐

Transformer的position embedding
1. position embedding 位置编码我们为什么要引入位置编呢?主要有以下几个原因: 文本是时序型数据,词与词之间的顺序关系往往影响整个句子的含义. transformer模型的sel ...
Roformer：Enhanced Transformer with rotary position embedding
Roformer:Enhanced Transformer with rotary position embedding Intorduction Method Experiment 代码实现 Con ...
文献阅读：RoFormer: Enhanced Transformer with Rotary Position Embedding
文献阅读:RoFormer: Enhanced Transformer with Rotary Position Embedding 1. 工作简介 2. 常见位置编码方式 1. 绝对位置编码 1. ...
position embedding
[转载] 关于Transformer中的position embedding 一文教你彻底理解Transformer中的positional encoding Transformer中position ...
TRS 中的position embedding
Rotary Position Embedding (RoPE, 旋转式位置编码) | 原理讲解+torch代码实现
android item 点击获取position,Android ListView 子控件onClick正确获取position的方法
在实际开发中,我们有时候不仅需要响应ListView的onItemClick,还需要响应其子控件的点击事件,这个时候我们就会发现,由于复用等原因,如果直接在子控件的onClick事件中调用getVi ...
【发展史】自然语言处理中的预训练技术发展史—Word Embedding到Bert模型
目录自然语言处理中的预训练技术发展史-Word Embedding到Bert模型 1 图像领域的预训练 2 Word Embedding考古史 3 从Word Embedding到ELMO 4 从W ...
Transformer落地：使用话语重写器改进多轮人机对话
作者丨袁一鸣学校丨武汉大学硕士生研究方向丨对话系统.目标检测概述本文发表于自然语言处理顶会 ACL 2019,数据集以及 LSTM 版本的模型可在以下链接找到(由于 Transformer 版 ...

1.Transformer的word embedding、position embedding、编码器自注意力的掩码

来源

目录

word embedding

第2步

第3步

代码

position embedding

代码

decoder

代码

1.Transformer的word embedding、position embedding、编码器自注意力的掩码相关推荐

最新文章

热门文章