求助AttributeError: ‘Embeddings’ object has no attribute ‘d_model’

embedding.d_model不是transformer_utils.py文件的源码吗，为什么说没有这个属性呢

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
import matplotlib.pyplot as plt
import numpy as np
import copy
import osos.environ['KMP_DUPLICATE_LIB_OK']='True'# 构建Embedding类来实现文本嵌入层
class Embeddings(nn.Module):def __init__(self,embedding_dim,vocab):# embedding_dim:词嵌入的维度# vocab:词表的大小super(Embeddings, self).__init__()# 定义Embedding层self.lut=nn.Embedding(vocab,embedding_dim)# 将参数传入类中self.embedding_dim=embedding_dimdef forward(self, x):# x:代表输入进模型的文本通过词汇映射后的数字张量return self.lut(x)* math.sqrt(self.embedding_dim)embedding_dim=512
vocab=1000
x=Variable(torch.LongTensor([[100,2,421,508],[491,998,1,221]]))
emb=Embeddings(embedding_dim,vocab)
embr=emb(x)
# print("embr:",embr)
# print(embr.shape)# 构建位置编码器的类
class PositionalEncoding(nn.Module):def __init__(self,embedding_dim,dropout,max_len=5000):# embedding_dim:代表词嵌入的维度#droput: 代表Dropout 层的置零比率#max_len:代表每隔句子的最大长度super(PositionalEncoding, self).__init__()#实例化Dropoutself.dropout=nn.Dropout(p=dropout)#初始化一个位置编码矩阵，大小是max_len*embedding_dimpe=torch.zeros(max_len,embedding_dim)#初始化一个绝对值矩阵，max_len*1position=torch.arange(0.0,max_len).unsqueeze(1)#定义一个变化矩阵div_term，跳跃式的初始化div_term=torch.exp(torch.arange(0.0,embedding_dim,2)*-(math.log(10000.0)/embedding_dim))#将前面定义的变化矩阵进行奇数，偶数的分别赋值pe[:,0::2]=torch.sin(position*div_term)pe[:,1::2]=torch.cos(position*div_term)# 将二维张量扩充成三维张量pe=pe.unsqueeze(0)# 将位置编码矩阵注册成模型的buffer，这个buffer不是模型中的参数，不跟随优化器同时更新# 注册成buffer后我们就可以在模型保存后重新加载的时候，将这个位置编码器和模型参数一同加载进来self.register_buffer('pe',pe)def forward(self, x):# x:代表文本序列的词嵌入表示# 首先明确pe的编码太长了，将第二个维度，也就是max_len对应的那个维度缩小成x的句子长度同等的长度x=x+Variable(self.pe[:,:x.size(1)],requires_grad=False)return self.dropout(x)embedding_dim=512
dropout=0.1
max_len=60x=embr
pe=PositionalEncoding(embedding_dim,dropout,max_len)
pe_result=pe(x)
# print(pe_result)
# print(pe_result.shape)# 设置一个画布
# plt.figure(figsize=(15,5))
#
# # 实例化PositionalEncoding类对象，词嵌入维度给20，置零比率设置为0
# pe=PositionalEncoding(20,0)
# # 向pe中传入一个全零初始化的x，相当于展示pe
# y=pe(Variable(torch.zeros(1,100,20)))
# plt.plot(np.arange(100),y[0,:,4:8].data.numpy())
# plt.legend(["dim %d"%p for p in [4,5,6,7]])# print(np.triu([[1,2,3],[4,5,6],[7,8,9],[10,11,12]],k=-1))
# print(np.triu([[1,2,3],[4,5,6],[7,8,9],[10,11,12]],k=0))
# print(np.triu([[1,2,3],[4,5,6],[7,8,9],[10,11,12]],k=1))# 构建掩码张量的函数
def subsequent_mask(size):# size：代表掩码张量后两个维度，形成一个方阵attn_shape=(1,size,size)# 使用np.ones()先构建一个全1的张量，然后利用np.triu()形成上三角矩阵subsequent_mask=np.triu(np.ones(attn_shape),k=1).astype('uint8')#使得这个三角矩阵反转return torch.from_numpy(1-subsequent_mask)size=5
sm=subsequent_mask(size)
# print("sm:",sm)# plt.figure(figsize=(5,5))
# plt.imshow(subsequent_mask(20)[0])x=Variable(torch.randn(5,5))
# print(x)mask=Variable(torch.zeros(5,5))
# print(mask)y=x.masked_fill(mask==0,-1e9)
# print(y)def attention(query,key,value,mask=None,dropout=None):# query,key,value:代表注意力的三个输入张量# mask:掩码张量# dropout：传入的Dropout实例化对象# 首先将query的最后一个维度提取出来，代表的是词嵌入的维度d_k=query.size(-1)# 按照注意力计算公式，将query和key的转置进行矩阵乘法，然后除以缩放稀疏scores=torch.matmul(query,key.transpose(-2,-1))/math.sqrt(d_k)# 判断是否使用掩码张量if mask is not None:# 利用masked_fill方法，将掩码张量和0进行位置的意义比较，如果等于0，替换成一个非常小的数scores=scores.masked_fill(mask == 0, -1e9)# 对scores的最后一个维度上进行softmax操作p_attn=F.softmax(scores,dim=-1)# 判断是否使用dropoutif dropout is not None:p_attn=dropout(p_attn)# 最后一步完成p_attn和value张量的乘法，并返回query注意力表示return torch.matmul(p_attn,value),p_attnquery=key=value=pe_result
mask=Variable(torch.zeros(2,4,4))
attn,p_attn=attention(query, key, value,mask=mask)
# print('attn:',attn)
# print(attn.shape)
# print('p_attn',p_attn)
# print(p_attn.shape)# x=torch.randn(4,4)
# # print(x.size())
# y=x.view(16)
# # print(y.size())
# z=x.view(-1,8)
# # print(z.size())
#
# a=torch.randn(1,2,3,4)
# print(a.size())
# print(a)
# b=a.transpose(1,2)
# print(b.size())
# print(b)
# c=a.view(1,3,2,4)
# print(c.size())
# print(c)# 首先需要定义克隆函数，因为在多头注意力机制的实现中，用到多个结构相同的线性层
# 我们将使用clone函数将他们一同初始化在一个网络层列表对象中，之后的结构中也会使用到该函数
def clones(module,N):# module：表示要克隆的目标网络层，N代表需要克隆的数量# 然后将其放在nn.ModuleList类型的列表中存放return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])#构建多头注意力机制
class MultiHeadAttention(nn.Module):def __init__(self,head,embedding_dim,dropout=0.1):super(MultiHeadAttention,self).__init__()# 首先使用一个测试中常用的assert语句，判断h是否能被d_model整除#这是因为我们之后要给每个头分配等量的词特征，也就是embedding_dim/head个assert embedding_dim%head==0# 得到每个头获得的分割词向量维度d_kself.d_k=embedding_dim//head# 传入头数hself.head=headself.embedding_dim=embedding_dim# 然后获得线性层对象，通过nn的Linear实例化，它的内部变化矩阵是embedding_dim X embedding_dim# 多头注意力中，Q，K，V各需要一个，最后拼接的矩阵还需要一个self.linears=clones(nn.Linear(embedding_dim,embedding_dim),4)# self.attn为none，它代表最后得到的注意力张量，现在还没有结果所以为Noneself.attn=None# 最后就是一个self.dropout对象，它通过nn中的Dropout实例化而来，置零比率为传进来的参数dropoutself.dropout=nn.Dropout(p=dropout)def forward(self, query,key,value,mask=None):# 最后一个是注意力机制中可能需要的mask掩码张量，默认是Noneif mask is not None:# 使用unsqueeze拓展维度，代表多头中的第n头mask=mask.unsqueeze(1)# 得到batch_size，它是query尺寸的第1个数字，代表有多少条样本batch_size=query.size(0)# 首先使用zip将网络层和输入数据连接在一起，模型的输出利用view和transpose进行维度和形状的改变query,key,value=[model(x).view(batch_size,-1,self.head,self.d_k).transpose(1,2) for model,x in zip(self.linears,(query,key,value))]x,self.attn=attention(query,key,value,mask=mask,dropout=self.dropout)# 得到没个头的计算结果是4维张量，需要进行形状的转换# 前面已经将1,2两个维度进行过转置，在这里要重新转置回来# 注意：经历了transpose（）方法后，必须使用contiguous方法，不然无法使用view（）方法x=x.transpose(1,2).contiguous().view(batch_size,-1,self.head*self.d_k)#最后将x输入线性层列表中的最后一个线性层中进行处理，得到最终的多头注意力结构输出return self.linears[-1](x)#实例化若干参数
head=8
embedding_dim=512
dropout=0.2# 若干参数的初始化
query=key=value=pe_resultmask=Variable(torch.zeros(2,4,4))
mha=MultiHeadAttention(head,embedding_dim,dropout)
mha_result=mha(query,key,value,mask)
# print('mha_result',mha_result)
# print(mha_result.shape)# 通过类PositionwiseFeedForward来实现前馈全连接层
class PositionwiseFeedForward(nn.Module):def __init__(self,d_model,d_ff,dropout):#d_model:代表词嵌入的维度，同时也是两个线性层的输入维度和输出维度# d_ff:代表第一个线性层的输出维度和第二个线性层的输入维度super(PositionwiseFeedForward, self).__init__()#定义两层全连接的线性层# 首先使用nn实例化两个线性层对象，self.w1和self.w2# 参数分别是d_model，d_ff和d_ff，d_modelself.w1=nn.Linear(d_model,d_ff)self.w2=nn.Linear(d_ff,d_model)# 然后使用nn的Dropout实例化了对象的self.dropoutself.dropout=nn.Dropout(dropout)def forward(self, x):# 首先经过第一个线性层，然后使用Funtional中的relu函数进行激活# 再使用dropout进行随机置0，最后通过第二个线性层w2，返回最终结果return self.w2(self.dropout(F.relu(self.w1(x))))d_model=512
d_ff=64
dropout=0.2x=mha_result
ff=PositionwiseFeedForward(d_model,d_ff,dropout)
ff_result=ff(x)
# print("ff_result",ff_result)
# print(ff_result.shape)# 通过LayerNorm实现规范化层的类
class LayerNorm(nn.Module):def __init__(self,features,eps=1e-6):# features：代表词嵌入的维度# eps:一个足够小的正数，用来在规范化计算公式的分母中，防止除0操作super(LayerNorm, self).__init__()self.a2=nn.Parameter(torch.ones(features))self.b2=nn.Parameter(torch.zeros(features))self.eps=epsdef forward(self, x):# 首先对x进行最后一个维度上的求均值操作，同时保持输出的维度和输入维度一致mean=x.mean(-1,keepdim=True)# 对x进行最后一个维度上的求标准差的操作，同时保持输出维度和输入维度一致std=x.std(-1,keepdim=True)# 按照规范化公式进行计算并返回return self.a2*(x-mean)/(std+self.eps)+self.b2features=d_mode=512
eps=1e-6
x=ff_result
ln=LayerNorm(features,eps)
ln_result=ln(x)
# print("ln_result",ln_result)
# print(ln_result.shape)# 使用SublayerConnection来实现子层连接结构的类
class SublayerConnection(nn.Module):def __init__(self,size,dropout=0.1):# size:表示词嵌入的维度super(SublayerConnection, self).__init__()# 实例化sekf.normself.norm=LayerNorm(size)self.dropout=nn.Dropout(p=dropout)self.size=sizedef forward(self, x,sublayer):#因为存在跳跃连接，所以是将输入的x与dropout后的子层输出结果相加作为最终的子层连接输出# 残差连接return x+self.dropout(sublayer(self.norm(x)))size=512
dropout=0.2
head=8
d_model=512
# 令x为位置编码器的输出
x=pe_result
mask=Variable(torch.zeros(2,4,4))#假设子层中装的是多头注意力层，实例化这个类
self_attn=MultiHeadAttention(head,d_model)
#使用lambda获得一个函数的子层
sublayer=lambda x:self_attn(x,x,x,mask)sc=SublayerConnection(size,dropout)
sc_result=sc(x,sublayer)
# print("sc_result",sc_result)
# print(sc_result.shape)# 使用EncoderLayer类实现编码器层
class EncoderLayer(nn.Module):def __init__(self,size,self_attn,feed_forward,dropout=0.1):super(EncoderLayer, self).__init__()# 首先将self_attn和feeed_forward传入其中self.self_attn=self_attnself.feed_forward=feed_forwardself.size = size#编码器层有两个子层连接结构，所以使用clonees函数进行克隆self.sublayer=clones(SublayerConnection(size,dropout),2)def forward(self, x,mask):# 首先通过第一个子层连接结构，其中包含多头自注意力子层# 然后通过第二个子层连接结构，其中包含前馈全连接网络，最返回结果x=self.sublayer[0](x,lambda x:self.self_attn(x,x,x,mask))return self.sublayer[1](x,self.feed_forward)size=d_model=512
head=8
d_ff=64
x=pe_result
dropout=0.2
self_attn=MultiHeadAttention(head,d_model)
ff=PositionwiseFeedForward(d_model,d_ff,dropout)
mask=Variable(torch.zeros(2,4,4))el=EncoderLayer(size,self_attn,ff,dropout)
el_result=el(x,mask)
# print(el_result)
# print(el_result.shape)# 使用Encoder类来实现编码器
class Encoder(nn.Module):def __init__(self,layer,N):# layer:表示编码器层# N：表示编码器层的个数super(Encoder, self).__init__()# 首先使用clones函数克隆N个编码层放在self.layers中self.layers=clones(layer,N)# 再初始化一个规范化层，它将用在编码器的最后面self.norm=LayerNorm(layer.size)def forward(self, x,mask):# 首先对我们克隆的编码器层进行循环，每次都会得到一个新的x# 循环的过程相当于输出的x经过了N个编码器层的处理# 最后通过规范化层的对象self.norm进行处理for layer in self.layers:x=layer(x,mask)return self.norm(x)#第一个实例化参数layer，它是一个编码器层的实例化对象，因此需要传入编码器层的参数
# 因为编码器层中的子层是不共享的，因此需要使用深度拷贝各个对象
size=512
head=8
d_model=512
d_ff=64
dropout=0.2
c=copy.deepcopy
attn=MultiHeadAttention(head,d_model)
ff=PositionwiseFeedForward(d_model,d_ff,dropout)
layer=EncoderLayer(size,c(attn),c(ff),dropout)N=8
mask=Variable(torch.zeros(2,4,4))en=Encoder(layer,N)
en_result=en(x,mask)
# print(en_result)
# print(en_result.shape)# 使用DecoderLayer的类实现解码器层
class DecoderLayer(nn.Module):def __init__(self,size,self_attn,src_attn,feed_forward,dropout):# self_attn:多头自注意力对象，这个自注意力机制需要Q=K=V# src_attn：多头注意力对象，这个自注意力机制需要Q!=K=Vsuper(DecoderLayer, self).__init__()self.size=sizeself.self_attn=self_attnself.src_attn=src_attnself.feed_forward=feed_forward# 按照结构图使用clones函数克隆三个子层连接对象self.sublayer=clones(SublayerConnection(size,dropout),3)def forward(self, x,memory,source_mask,target_mask):# 来自编码器层的语义存储变量mermory，以及源数据掩码张量和目标数据掩码张量m=memory#将x传入第一个子层结构，第一个子层结构的输入分别是x和self_attn函数，因为是自注意力机制，所以Q，K，V都是x# 最后一个参数是目标数据掩码张量，这是要对目标数据进行遮掩，因为此时模型可能还没有生成任何目标数据# 比如在解码器准备生成第一个字符或词汇时，我们其实已经传入了第一个字符以便计算损失，#但是我们不希望在生成第一个字符时模型能利用这些信息，因此我们会将其遮掩，同样生成第二个字符或词汇时# 模型只能使用第一个字符或词汇信息，第二个字符以及之后的信息都不允许被模型使用x=self.sublayer[0](x,lambda x:self.self_attn(x,x,x,target_mask))# 接着进入第二个子层，这个子层中常规的注意力机制，q是输入x；k，v是编码层输出memory，# 同样进行也传入source_mask，但是进行源数据遮掩的原因并非是抑制信息泄露，而是遮蔽掉对结果没有意义的字符而产生的注意力值# 以此提升模型效果和训练速度，这样就完成了第二个子层的处理x=self.sublayer[1](x,lambda x:self.src_attn(x,m,m,source_mask))# 最后一个子层就是前馈全连接子层，经过它的处理就可以返回结果，这就是我们的解码结构return self.sublayer[2](x,self.feed_forward)head=8
size=d_model=512
d_ff=64
dropout=0.2
self_attn=src_attn=MultiHeadAttention(head,d_model,dropout)ff=PositionwiseFeedForward(d_model,d_ff,dropout)#x是来自目标数据的词嵌入表示，但形势和源数据的词嵌入表示相同
x=pe_result
# memory是来自编码器的输出
memory=en_result
#实际中source_mask和target_mask并不相同，这里为了计算方便使它们都是mask
mask=Variable(torch.zeros(2,4,4))
source_mask=target_mask=maskdl=DecoderLayer(size,self_attn,src_attn,ff,dropout)
dl_result=dl(x,memory,source_mask,target_mask)
# print(dl_result)
# print(dl_result.shape)# 使用类Decoder来实现解码器
class Decoder(nn.Module):def __init__(self,layer,N):super(Decoder, self).__init__()# 先使用clones函数克隆N个layer，然后实例化 一个规范化层# 因为数据走过了所有的解码器层后最后要做规范化处理self.layers=clones(layer,N)self.norm=LayerNorm(layer.size)def forward(self, x,memory,source_mask,target_mask):#对每个层进行循环，这个循环就是变量x通过每一个层的处理#得到最后的结果，再进行一次规范化返回即可for layer in self.layers:x=layer(x,memory,source_mask,target_mask)return self.norm(x)size=d_model=512
head=8
d_ff=64
dropout=0.2
c=copy.deepcopy
attn=MultiHeadAttention(head,d_model)
ff=PositionwiseFeedForward(d_model,d_ff,dropout)
layer=DecoderLayer(d_model,c(attn),c(attn),c(ff),dropout)
N=8x=pe_result
memory=en_result
mask=Variable(torch.zeros(2,4,4))
source_mask=target_mask=maskde=Decoder(layer,N)
de_result=de(x,memory,source_mask,target_mask)
# print(de_result)
# print(de_result.shape)# 将线性层和softmax计算层一起实现，因为二者的共同目标是生成最后的结构
# 因此把类的名字叫做Generator，生成器
class Generator(nn.Module):def __init__(self,d_model,vocab_size):super(Generator, self).__init__()# 先使用nn中的预定义线性层进行实例化，得到一个对象self.project等待使用# 这个线性层的参数有两个，就是初始化函数传进来的两个参数：d_model，vocab_sizeself.project=nn.Linear(d_model,vocab_size)def forward(self, x):# 在函数中，使用上一步的得到的self.project对x进行线性变化# 然后使用F中已经实现的log_softmax进行softmax处理# 在这里之所以使用log_softmax是因为和pytorch版本的损失函数有关# log_softmax就是对softmax的结果又取了对数，因为对数函数是单调递增函数# 因此对最终我们取最大的概率值没有影响，最后返回结果即可return F.log_softmax(self.project(x),dim=-1)m=nn.Linear(20,30)
input=torch.randn(128,20)
output=m(input)
# print(output.size())d_model=512
vocab_size=1000
x=de_result
gen=Generator(d_model, vocab_size)
gen_result=gen(x)
# print(gen_result)
# print(gen_result.shape)# 使用EncoderDecoder类来实现编码器-解码器结构
class EncoderDecoder(nn.Module):def __init__(self,encoder,decoder,source_embed,target_embed,generator):super(EncoderDecoder, self).__init__()self.encoder=encoderself.decoder=decoderself.src_embed=source_embedself.tgt_embed=target_embedself.generator=generatordef forward(self, source,target,source_mask,target_mask):# 将source，source_mask传入编码函数，得到结果后# 与source_mask，target和target_mask一起传给解码函数return self.decode(self.encode(source,source_mask),source_mask,target,target_mask)def encode(self,source,source_mask):#使用src_embed对source做处理，然后和source_mask一起传给self.encoderreturn self.encoder(self.src_embed(source),source_mask)def decode(self,memory,source_mask,target,target_mask):#使用tgt_embed对target做处理，然后和source_mask,target_mask,memory一起传给self.decoderreturn self.decoder(self.tgt_embed(target),memory,source_mask,target_mask)vocab_size=1000
d_model=512
encoder=en
decoder=de
source_embed=nn.Embedding(vocab_size,d_model)
target_embed=nn.Embedding(vocab_size,d_model)
generator=gen
#假设源数据与目标数据相同，实际中并不相同
source=target=Variable(torch.LongTensor([[100,2,421,508],[491,998,1,221]]))
#假设src_mask和tgt_mask相同，实际中并不相同
source_mask=target_mask=Variable(torch.zeros(2,4,4))ed=EncoderDecoder(encoder,decoder,source_embed,target_embed, generator)
ed_result=ed(source,target,source_mask,target_mask)
# print(ed_result)
# print(ed_result.shape)# 模型构建
def make_model(source_vocab,target_vocab,N=6,d_model=512,d_ff=2048,head=8,dropout=0.1):#首先使用一个深度拷贝命令，很多结构都需要进行深度拷贝#来保证他们之间相互独立，不受干扰c=copy.deepcopyattn=MultiHeadAttention(head,d_model)ff=PositionwiseFeedForward(d_model,d_ff,dropout)#实例化前馈全连接层，得到对象positionposition=PositionalEncoding(d_model,dropout)#根据结构图，最外层是EncoderDecoder，在EncoderDecoder中，#分别是编码器层，解码器层，源数据Embedding层和位置编码组成的有序结构#目标数据Embedding层和位置编码组成的有序结构，以及类别生成器层#在编码器层中有attention子层以及前馈全连接子层#在解码器层中有两个attention子层以及前馈全连接层model=EncoderDecoder(Encoder(EncoderLayer(d_model,c(attn),c(ff),dropout),N),Decoder(DecoderLayer(d_model,c(attn),c(attn),c(ff),dropout),N),nn.Sequential(Embeddings(d_model,source_vocab),c(position)),nn.Sequential(Embeddings(d_model,target_vocab),c(position)),Generator(d_model,target_vocab))# 模型构建完成后，接下来 就是初始化模型中的参数，比如线性层中的变化矩阵# 这里一但判断参数的维度大于1，则会将其初始化成一个服从均匀分布的矩阵for p in model.parameters():if p.dim()>1:nn.init.xavier_uniform(p)return model#结果服从均匀分布U(-a,a)
w=torch.empty(3,5)
w=nn.init.xavier_uniform_(w,gain=nn.init.calculate_gain('relu'))
# print(w)source_vocab=11
target_vocab=11
N=6
#其他参数使用默认值from pyitcast.transformer_utils import Batch
from pyitcast.transformer_utils import get_std_opt
from pyitcast.transformer_utils import LabelSmoothing
from pyitcast.transformer_utils import SimpleLossCompute
from pyitcast.transformer_utils import run_epoch
from pyitcast.transformer_utils import greedy_decodedef data_generator(V,batch_size,num_batch):for i in range(num_batch):#使用numpy中的random.randint()来随机生成[1,V)#分布的形状(batch,10)data=torch.from_numpy(np.random.randint(1,V,size=(batch_size,10)))# 将数据的第一列全部设置为1，作为起始标志data[:,0]=1#因为是copy任务，所以源数据和目标数据完全一致#设置参数requires_grad=False，样本的参数不需要参与梯度的计算source=Variable(data,requires_grad=False)target=Variable(data,requires_grad=False)yield Batch(source,target)V=11
batch_size=20
num_batch=30# if __name__ == '__main__':
#     res=make_model(source_vocab,target_vocab,N)
#     print(res)# 使用make_model()函数获得模型的实例化对象
model=make_model(V,V,N=2)
#使用工具包get_std_opt获得模型的优化器
model_optimizer=get_std_opt(model)
#使用工具包LabelSmoothing获得标签平滑对象
criterion=LabelSmoothing(size=V,padding_idx=0,smoothing=0.0)
#使用工具包SimpleLossCompute获得利用标签平滑的结果得到的损失计算方法
loss=SimpleLossCompute(model.generator,criterion,model_optimizer)# crit=LabelSmoothing(size=5,padding_idx=0,smoothing=0.5)
#
# predict=Variable(torch.FloatTensor([[0,0.2,0.7,0.1,0],
#                                     [0,0.2,0.7,0.1,0],
#                                     [0,0.2,0.7,0.1,0]]))
# target=Variable(torch.LongTensor([2,1,0]))
# crit(predict,target)
# plt.imshow(crit.true_dist)def run(model,loss,epochs=10):for epoch in range(epochs):# 模型进入训练模式，所有的参数将会被更新model.train()#训练时，传入的batch_size是20run_epoch(data_generator(V,8,20),model,loss)#训练结束后，进入评估模式，所有的参数固定不变model.eval()# 评估时，传入的batch_size是5run_epoch(data_generator(V,8,5),model,loss)if __name__ == '__main__':run(model,loss)

运行之后就报错了

D:/Deep_Project/Transformer_learning/transformer.py:589: UserWarning: nn.init.xavier_uniform is now deprecated in favor of nn.init.xavier_uniform_.nn.init.xavier_uniform(p)
Traceback (most recent call last):File "D:\Anaconda\envs\tf1\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_codeexec(code_obj, self.user_global_ns, self.user_ns)File "<ipython-input-2-3051b94c8a03>", line 1, in <module>runfile('D:/Deep_Project/Transformer_learning/transformer.py', wdir='D:/Deep_Project/Transformer_learning')File "C:\Program Files\JetBrains\PyCharm 2021.3.3\plugins\python\helpers\pydev\_pydev_bundle\pydev_umd.py", line 198, in runfilepydev_imports.execfile(filename, global_vars, local_vars)  # execute the scriptFile "C:\Program Files\JetBrains\PyCharm 2021.3.3\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfileexec(compile(contents+"\n", file, 'exec'), glob, loc)File "D:/Deep_Project/Transformer_learning/transformer.py", line 638, in <module>model_optimizer=get_std_opt(model)File "D:\Anaconda\envs\tf1\lib\site-packages\pyitcast\transformer_utils.py", line 91, in get_std_optreturn NoamOpt(model.src_embed[0].d_model, 2, 4000,File "D:\Anaconda\envs\tf1\lib\site-packages\torch\nn\modules\module.py", line 535, in __getattr__type(self).__name__, name))
AttributeError: 'Embeddings' object has no attribute 'd_model'

这是什么问题，难道是版本问题吗？？？

AttributeError: ‘Embeddings‘ object has no attribute ‘d_model‘相关推荐

Python错误：AttributeError: 'generator' object has no attribute 'next'解决办法
今天在学习生成器对象(generation object)运行以下代码时,遇到了一个错误: #定义生成器函数 def liebiao(): for x in range(10): yield x #函 ...
AttributeError: 'dict' object has no attribute 'status_code'
前端AJAX请求数据,提示错误:"AttributeError: 'dict' object has no attribute 'status_code'". 原因:是提示返回对象 ...
Traceback (most recent call last): File AttributeError: 'NoneType' object has no attribute 'group'
Traceback (most recent call last):File "<stdin>", line 1, in <module> Attribut ...
解决：AttributeError: ‘Graph‘ object has no attribute ‘number_of_selfloops‘
解决:AttributeError: 'Graph' object has no attribute 'number_of_selfloops' 目录解决:AttributeError: 'Grap ...
AttributeError: ‘FPDF‘ object has no attribute ‘unifontsubset‘
AttributeError: 'FPDF' object has no attribute 'unifontsubset' 目录 AttributeError: 'FPDF' object has ...
AttributeError: ‘Series‘ object has no attribute ‘as_matrix‘
AttributeError: 'Series' object has no attribute 'as_matrix' 问题: y_test = test_shifted["y_t+1&q ...
AttributeError: ‘SVC‘ object has no attribute ‘_probA‘
AttributeError: 'SVC' object has no attribute '_probA' 问题: # Save the Modle to file in the current w ...
sklearn使用FeatureHasher处理字符串特征: AttributeError: ‘str‘ object has no attribute ‘items‘
sklearn使用FeatureHasher处理字符串特征: AttributeError: 'str' object has no attribute 'items' 目录 sklearn使用Fea ...
Keras问题“AttributeError: 'NoneType' object has no attribute 'update”解决
BUG 在使用Keras训练模型时,在每个epoch完成后save_model时会报错 "AttributeError: 'NoneType' object has no attribute ...

AttributeError: ‘Embeddings‘ object has no attribute ‘d_model‘

求助AttributeError: ‘Embeddings’ object has no attribute ‘d_model’

AttributeError: ‘Embeddings‘ object has no attribute ‘d_model‘相关推荐

最新文章

热门文章