基于pytorch的文本情感识别(LSTM,CNN)
文章目录
- 前言
- 一、数据处理与Word2vec词向量训练
- 二、创建神经网络的输入batch
- 三、神经网络模型
- 1.LSTM
- 2.CNN
- 四、训练与测试
- 六、实验结果
- 七、完整代码
- 1.LSTM
- 2.CNN
前言
本文使用pytorch,利用两种神经网络(lstm,cnn)实现中文的文本情感识别。代码都有详细的注释说明。使用的是谭松波酒店评价语料库,其中包含3000条负面评价,7000条正面评价。
一、数据处理与Word2vec词向量训练
原始的语料数据如下图
通过txt进行处理,将文本前的1与空格去除,得到结果如下图,将其作为程序的输入
将输入的文本进行预处理,利用jieba函数库进行分词
def del_stop_words(text):
    """Segment *text* into a list of words with jieba.

    Stop-word filtering is intentionally disabled (kept commented out,
    as in the original).
    """
    word_ls = jieba.lcut(text)
    # word_ls = [i for i in word_ls if i not in stopwords]
    return word_ls


# Load the raw corpora (one review per line).
with open("F:/python_data/practice/tansongbo/neg.txt", "r", encoding='UTF-8') as e:  # negative reviews
    neg_data1 = e.readlines()
with open("F:/python_data/practice/tansongbo/pos.txt", "r", encoding='UTF-8') as s:  # positive reviews
    pos_data1 = s.readlines()

# De-duplicate while preserving the original line order.
neg_data = sorted(set(neg_data1), key=neg_data1.index)
pos_data = sorted(set(pos_data1), key=pos_data1.index)

# Strip the trailing newline and segment every review.
neg_data = [del_stop_words(data.replace("\n", "")) for data in neg_data]
pos_data = [del_stop_words(data.replace("\n", "")) for data in pos_data]

all_sentences = neg_data + pos_data  # full corpus, used to train word2vec
训练词向量,创建词向量词典
#### word2vec training (only needed once; afterwards just load the saved model) ####
model = Word2Vec(all_sentences,   # segmented corpus from the preprocessing step
                 size=100,        # embedding dimension; gensim<4 keyword (>=4 renamed it to vector_size) — TODO confirm installed version
                 min_count=1,     # keep every word, even singletons
                 window=5)        # max distance between current and predicted word
model.save('f.model')


def create_dictionaries(model):
    """Build {word: index} (1-based) and {word: vector} dicts from a trained model.

    Index 0 is deliberately left unused so it can represent unknown words later.
    """
    gensim_dict = Dictionary()
    # NOTE(review): model.wv.vocab is the gensim<4 API (>=4 uses wv.key_to_index) — confirm version.
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}    # word -> 1-based index
    w2vec = {word: model[word] for word in w2indx.keys()}  # word -> embedding vector
    return w2indx, w2vec


model = Word2Vec.load('F:/python_data/practice/tansongbo/f.model')
index_dict, word_vectors = create_dictionaries(model)

# Persist both dicts with pickle; the context manager guarantees the file is
# closed (the original left `output` open only until an explicit close()).
with open('F:/python_data/practice/tansongbo/dict.txt' + ".pkl", 'wb') as output:
    pickle.dump(index_dict, output)   # index dict
    pickle.dump(word_vectors, output) # vector dict
二、创建神经网络的输入batch
将文本句子转换为词向量的多维矩阵,并创建输入到神经网络中的batch
#### hyper-parameters ####
vocab_dim = 100   # embedding dimension
maxlen = 28       # max number of tokens kept per review
n_epoch = 10      # training epochs
batch_size = 64   # reviews per mini-batch

# Load the pickled index/vector dicts; `with` fixes the original's leaked
# file handle (f was never closed).
with open("F:/python_data/practice/tansongbo/dict.txt.pkl", 'rb') as f:
    index_dict = pickle.load(f)    # {word: index}
    word_vectors = pickle.load(f)  # {word: 100-dim vector}

n_symbols = len(index_dict) + 1                       # +1 because index 0 is reserved for unknown words
embedding_weights = np.zeros((n_symbols, vocab_dim))  # row 0 stays all-zero (unknown word)
for w, index in index_dict.items():
    embedding_weights[index, :] = word_vectors[w]
def text_to_index_array(p_new_dic, p_sen):
    """Map words to integer indices (0 for out-of-vocabulary words).

    If *p_sen* is a list of token lists, return a numpy object array of
    index lists; otherwise treat it as a space-separated string and return
    a list containing one index list (same shapes as the original).
    """
    if isinstance(p_sen, list):
        # dict.get replaces the original bare `except:`, which silently
        # swallowed every exception, not just missing keys.
        new_sentences = [[p_new_dic.get(word, 0) for word in sen] for sen in p_sen]
        # dtype=object: sentences have different lengths, and a ragged
        # np.array(...) without it raises on modern numpy.
        return np.array(new_sentences, dtype=object)
    return [[p_new_dic.get(word, 0) for word in p_sen.split(" ")]]
def text_cut_to_same_long(sents, max_len=None):
    """Trim or zero-pad every row of *sents* to exactly *max_len* entries.

    *max_len* defaults to the module-level ``maxlen`` (backward compatible).
    Rows longer than max_len are truncated; shorter rows are zero-padded —
    the original crashed with a broadcast error whenever the padded batch
    was shorter than ``maxlen``.
    """
    if max_len is None:
        max_len = maxlen
    new_sents = np.zeros((len(sents), max_len))
    for i in range(len(sents)):
        row = np.asarray(sents[i])[:max_len]
        new_sents[i, :len(row)] = row
    return new_sents
def creat_wordvec_tensor(embedding_weights, X_T):
    """Replace each index in *X_T* (n_sentences x seq_len) with its embedding
    row, producing an (n_sentences, seq_len, dim) float array.

    seq_len and dim are derived from the inputs instead of relying on the
    module-level ``maxlen``/``vocab_dim`` globals — results are identical
    for every existing call site (where they always agree).
    """
    X_T = np.asarray(X_T)
    n_sent, seq_len = X_T.shape
    dim = embedding_weights.shape[1]
    X_tt = np.zeros((n_sent, seq_len, dim))
    for i, row in enumerate(X_T):
        for j, idx in enumerate(row):
            X_tt[i, j, :] = embedding_weights[int(idx), :]
    return X_tt


# Pick the compute device once; everything below should use it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ('正在使用计算的是:%s'%device)
data = all_sentences  # segmented corpus from the preprocessing step

# Labels: 0 = negative, 1 = positive, aligned with the neg_data + pos_data order.
label_list = [0] * len(neg_data) + [1] * len(pos_data)

# 80/20 train/test split (inputs are still plain python lists here).
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(data, label_list, test_size=0.2)

# Words -> integer indices.
X_train = text_to_index_array(index_dict, X_train_l)
X_test = text_to_index_array(index_dict, X_test_l)
y_train = np.array(y_train_l)
y_test = np.array(y_test_l)

from torch.nn.utils.rnn import pad_sequence

# Pad every review to the batch max length, then trim/pad to exactly `maxlen`.
X_train = pad_sequence([torch.from_numpy(np.array(x)) for x in X_train], batch_first=True).float()
X_test = pad_sequence([torch.from_numpy(np.array(x)) for x in X_test], batch_first=True).float()
X_train = text_cut_to_same_long(X_train)
X_test = text_cut_to_same_long(X_test)

# Indices -> embedding vectors: (n, maxlen, vocab_dim).
X_train = creat_wordvec_tensor(embedding_weights, X_train)
X_test = creat_wordvec_tensor(embedding_weights, X_test)

#### DataLoaders and batching ####
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
# shuffle=True randomizes sample order each epoch.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
三、神经网络模型
1.LSTM
class lstm(nn.Module):
    """Single-layer LSTM classifier: last hidden step -> 2-way linear head.

    Returns raw logits. The original applied F.softmax here even though the
    training loop feeds the output to F.cross_entropy, which applies
    log-softmax internally — the doubled softmax flattens gradients and
    slows learning, so it is removed.
    """

    def __init__(self, input_dim=100, hidden_dim=128):
        # input_dim default matches the module-level vocab_dim (= 100);
        # parameterized for reuse, backward compatible with `lstm()`.
        super(lstm, self).__init__()
        # batch_first=True because DataLoader yields (batch, seq, feature).
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)  # input size = hidden_size

    def forward(self, x):
        out, (h_n, c_n) = self.lstm(x)
        out = self.fc(out[:, -1, :])  # classify from the last time step
        return out, h_n


model = lstm()
# Move the parameters onto the chosen device, then hand them to Adam
# (default lr). nn.Module.to() moves tensors in place, so the optimizer
# sees the same Parameter objects either way.
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
2.CNN
class CNN(nn.Module):
    """TextCNN: parallel Conv2d branches over the embedded sentence,
    max-pooled over time, concatenated, dropout, then a 2-way linear head."""

    def __init__(self, embedding_dim, n_filters, filter_sizes, dropout):
        super(CNN, self).__init__()
        # One conv branch per window size; each kernel spans the full
        # embedding width, so it slides only along the token axis.
        branches = [
            nn.Conv2d(in_channels=1, out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ]
        self.convs = nn.ModuleList(branches)
        self.fc = nn.Linear(n_filters * len(filter_sizes), 2)
        self.dropout = nn.Dropout(dropout)  # regularization

    def forward(self, text):
        # text: (batch, sent_len, emb_dim) -> add a channel axis for Conv2d
        embedded = text.unsqueeze(1)
        features = []
        for conv in self.convs:
            conv_out = conv(embedded).squeeze(3)  # (batch, n_filters, sent_len - fs + 1)
            pooled = F.max_pool1d(conv_out, conv_out.shape[2]).squeeze(2)  # (batch, n_filters)
            features.append(pooled)
        cat = self.dropout(torch.cat(features, dim=1))  # (batch, n_filters * len(filter_sizes))
        return self.fc(cat)


n_filters = 100
filter_sizes = [2, 3, 4]  # n-gram window sizes, one conv branch each
dropout = 0.5

model = CNN(vocab_dim, n_filters, filter_sizes, dropout)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
四、训练与测试
下面代码展示的是LSTM模型的代码,CNN基本也相同,主要差别在于输出结果,具体不同可以查看最后的完整代码。
#### Training on the training set ####
from sklearn.metrics import accuracy_score, classification_report

print ('————————进行训练集训练————————')
for epoch in range(n_epoch):
    correct = 0
    total = 0
    epoch_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = torch.as_tensor(data, dtype=torch.float32)
        target = target.long()  # cross_entropy requires long targets
        # .to(device) instead of .cuda(): the original crashed on CPU-only
        # machines even though `device` falls back to 'cpu'.
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()  # (the original called zero_grad twice per step)
        output, h_state = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        correct += int(torch.sum(torch.argmax(output, dim=1) == target))
        total += len(target)
        epoch_loss += loss.item()

    loss = epoch_loss / (batch_idx + 1)
    print ('epoch:%s'%epoch, 'accuracy:%.3f%%'%(correct *100 / total), 'loss = %s'%loss)
#### Evaluation on the test set ####
print ('————————进行测试集验证————————')
# Evaluation only: the original kept model.train() and ran backward() plus
# optimizer.step() on the test batches — i.e. it trained on the test set
# and inflated the reported accuracy. eval() + no_grad() fixes that.
model.eval()
correct = 0
total = 0
epoch_loss = 0
with torch.no_grad():
    for batch_idx, (data, target) in enumerate(test_loader):
        data = torch.as_tensor(data, dtype=torch.float32)
        target = target.long()
        data, target = data.to(device), target.to(device)
        output, h_state = model(data)
        loss = F.cross_entropy(output, target)
        correct += int(torch.sum(torch.argmax(output, dim=1) == target))
        total += len(target)
        epoch_loss += loss.item()
loss = epoch_loss / (batch_idx + 1)
print ('epoch:%s'%0, 'accuracy:%.3f%%'%(correct *100 / total), 'loss = %s'%loss)
六、实验结果
1.LSTM
训练了40个epoch,最终正确率在83%左右
2.CNN
训练了10个epoch,正确率在78%左右
七、完整代码
1.LSTM
# -*- coding: utf-8 -*-
#### Data preprocessing ####
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import jieba
from sklearn.model_selection import train_test_split

# f = open('./stop_words.txt', encoding='utf-8')             # stop-word list (disabled)
# stopwords = [i.replace("\n", "") for i in f.readlines()]


def del_stop_words(text):
    """Segment *text* with jieba (stop-word filtering kept disabled)."""
    word_ls = jieba.lcut(text)
    # word_ls = [i for i in word_ls if i not in stopwords]
    return word_ls


with open("F:/python_data/practice/tansongbo/neg.txt", "r", encoding='UTF-8') as e:  # negative reviews
    neg_data1 = e.readlines()
with open("F:/python_data/practice/tansongbo/pos.txt", "r", encoding='UTF-8') as s:  # positive reviews
    pos_data1 = s.readlines()

# De-duplicate while keeping the original order, then segment.
neg_data = sorted(set(neg_data1), key=neg_data1.index)
pos_data = sorted(set(pos_data1), key=pos_data1.index)
neg_data = [del_stop_words(data.replace("\n", "")) for data in neg_data]
pos_data = [del_stop_words(data.replace("\n", "")) for data in pos_data]
all_sentences = neg_data + pos_data  # full corpus for word2vec

#### Text vectorization ####
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
import pickle
import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train word2vec (only needed once; afterwards just load the saved model).
model = Word2Vec(all_sentences,
                 size=100,      # gensim<4 keyword (>=4: vector_size) — TODO confirm version
                 min_count=1,   # keep every word
                 window=5)      # context window size
model.save('f.model')


def create_dictionaries(model):
    """Build {word: 1-based index} and {word: vector} dicts from the model."""
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)  # gensim<4 API
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}    # 0 is reserved for OOV
    w2vec = {word: model[word] for word in w2indx.keys()}
    return w2indx, w2vec


model = Word2Vec.load('F:/python_data/practice/tansongbo/f.model')
index_dict, word_vectors = create_dictionaries(model)

# Persist both dicts (context manager closes the file, unlike the original).
with open('F:/python_data/practice/tansongbo/dict.txt' + ".pkl", 'wb') as output:
    pickle.dump(index_dict, output)
    pickle.dump(word_vectors, output)

#### LSTM training setup ####
vocab_dim = 100   # embedding dimension
maxlen = 50       # max tokens kept per review
n_epoch = 40      # training epochs
batch_size = 64   # reviews per mini-batch

# `with` fixes the original's leaked file handle.
with open("F:/python_data/practice/tansongbo/dict.txt.pkl", 'rb') as f:
    index_dict = pickle.load(f)    # {word: index}
    word_vectors = pickle.load(f)  # {word: 100-dim vector}

n_symbols = len(index_dict) + 1                       # +1: index 0 = OOV
embedding_weights = np.zeros((n_symbols, vocab_dim))  # row 0 stays all-zero
for w, index in index_dict.items():
    embedding_weights[index, :] = word_vectors[w]


def text_to_index_array(p_new_dic, p_sen):
    """Map words to indices (0 for OOV). List input -> object ndarray of
    index lists; string input -> a list with one index list."""
    if isinstance(p_sen, list):
        # dict.get replaces the original bare `except:`.
        new_sentences = [[p_new_dic.get(word, 0) for word in sen] for sen in p_sen]
        return np.array(new_sentences, dtype=object)  # dtype=object: ragged rows
    return [[p_new_dic.get(word, 0) for word in p_sen.split(" ")]]


def text_cut_to_same_long(sents, max_len=None):
    """Trim or zero-pad each row to exactly *max_len* (defaults to maxlen);
    the original crashed on rows shorter than maxlen."""
    if max_len is None:
        max_len = maxlen
    new_sents = np.zeros((len(sents), max_len))
    for i in range(len(sents)):
        row = np.asarray(sents[i])[:max_len]
        new_sents[i, :len(row)] = row
    return new_sents


# Reload the raw corpora (kept for fidelity; note that `data` is overwritten
# by all_sentences a few lines below, so this block is effectively unused).
with open("F:/python_data/practice/tansongbo/neg.txt", "r", encoding='UTF-8') as f:
    neg_data1 = f.readlines()
with open("F:/python_data/practice/tansongbo/pos.txt", "r", encoding='UTF-8') as g:
    pos_data1 = g.readlines()
neg_data = sorted(set(neg_data1), key=neg_data1.index)
pos_data = sorted(set(pos_data1), key=pos_data1.index)
neg_data = [del_stop_words(data) for data in neg_data]
pos_data = [del_stop_words(data) for data in pos_data]
data = neg_data + pos_data


def creat_wordvec_tensor(embedding_weights, X_T):
    """Indices (n, seq_len) -> embeddings (n, seq_len, dim); dims derived
    from the inputs instead of module globals (identical results here)."""
    X_T = np.asarray(X_T)
    n_sent, seq_len = X_T.shape
    dim = embedding_weights.shape[1]
    X_tt = np.zeros((n_sent, seq_len, dim))
    for i, row in enumerate(X_T):
        for j, idx in enumerate(row):
            X_tt[i, j, :] = embedding_weights[int(idx), :]
    return X_tt


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ('正在使用计算的是:%s'%device)

data = all_sentences  # segmented corpus from preprocessing
label_list = [0] * len(neg_data) + [1] * len(pos_data)  # 0 = negative, 1 = positive
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(data, label_list, test_size=0.2)

X_train = text_to_index_array(index_dict, X_train_l)
X_test = text_to_index_array(index_dict, X_test_l)
y_train = np.array(y_train_l)
y_test = np.array(y_test_l)

from torch.nn.utils.rnn import pad_sequence

# Pad to the batch max length, then trim/pad to exactly `maxlen`.
X_train = pad_sequence([torch.from_numpy(np.array(x)) for x in X_train], batch_first=True).float()
X_test = pad_sequence([torch.from_numpy(np.array(x)) for x in X_test], batch_first=True).float()
X_train = text_cut_to_same_long(X_train)
X_test = text_cut_to_same_long(X_test)

# Indices -> embedding vectors: (n, maxlen, vocab_dim).
X_train = creat_wordvec_tensor(embedding_weights, X_train)
X_test = creat_wordvec_tensor(embedding_weights, X_test)

#### DataLoaders and batching ####
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)


class lstm(nn.Module):
    """LSTM classifier (hidden size 64). Returns raw logits.

    The original applied F.sigmoid and then F.softmax to the output before
    feeding it to F.cross_entropy; cross_entropy already applies log-softmax
    internally, so the stacked activations squashed the logits and hurt
    training. Both activations are removed.
    """

    def __init__(self, input_dim=100, hidden_dim=64):
        # input_dim default matches the module-level vocab_dim (= 100).
        super(lstm, self).__init__()
        # batch_first=True: DataLoader yields (batch, seq, feature).
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)  # input size = hidden_size

    def forward(self, x):
        out, (h_n, c_n) = self.lstm(x)
        out = self.fc(out[:, -1, :])  # classify from the last time step
        return out, h_n


model = lstm()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())

#### Training on the training set ####
from sklearn.metrics import accuracy_score, classification_report

print ('————————进行训练集训练————————')
for epoch in range(n_epoch):
    correct = 0
    total = 0
    epoch_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = torch.as_tensor(data, dtype=torch.float32)
        target = target.long()  # cross_entropy requires long targets
        # .to(device) instead of .cuda(): works on CPU-only machines too.
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output, h_state = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        correct += int(torch.sum(torch.argmax(output, dim=1) == target))
        total += len(target)
        epoch_loss += loss.item()
    loss = epoch_loss / (batch_idx + 1)
    print ('epoch:%s'%epoch, 'accuracy:%.3f%%'%(correct *100 / total), 'loss = %s'%loss)

#### Evaluation on the test set ####
print ('————————进行测试集验证————————')
# The original ran model.train(), backward() and optimizer.step() here —
# i.e. it kept training on the test set. eval() + no_grad() fixes that.
model.eval()
correct = 0
total = 0
epoch_loss = 0
with torch.no_grad():
    for batch_idx, (data, target) in enumerate(test_loader):
        data = torch.as_tensor(data, dtype=torch.float32)
        target = target.long()
        data, target = data.to(device), target.to(device)
        output, h_state = model(data)
        loss = F.cross_entropy(output, target)
        correct += int(torch.sum(torch.argmax(output, dim=1) == target))
        total += len(target)
        epoch_loss += loss.item()
loss = epoch_loss / (batch_idx + 1)
print ('epoch:%s'%0, 'accuracy:%.3f%%'%(correct *100 / total), 'loss = %s'%loss)
2.CNN
# -*- coding: utf-8 -*-
#### Data preprocessing ####
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import jieba
from sklearn.model_selection import train_test_split

# f = open('./stop_words.txt', encoding='utf-8')             # stop-word list (disabled)
# stopwords = [i.replace("\n", "") for i in f.readlines()]


def del_stop_words(text):
    """Segment *text* with jieba (stop-word filtering kept disabled)."""
    word_ls = jieba.lcut(text)
    # word_ls = [i for i in word_ls if i not in stopwords]
    return word_ls


with open("F:/python_data/practice/tansongbo/neg.txt", "r", encoding='UTF-8') as e:  # negative reviews
    neg_data1 = e.readlines()
with open("F:/python_data/practice/tansongbo/pos.txt", "r", encoding='UTF-8') as s:  # positive reviews
    pos_data1 = s.readlines()

# De-duplicate while keeping the original order, then segment.
neg_data = sorted(set(neg_data1), key=neg_data1.index)
pos_data = sorted(set(pos_data1), key=pos_data1.index)
neg_data = [del_stop_words(data.replace("\n", "")) for data in neg_data]
pos_data = [del_stop_words(data.replace("\n", "")) for data in pos_data]
all_sentences = neg_data + pos_data  # full corpus for word2vec

#### Text vectorization ####
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
import pickle
import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train word2vec (only needed once; afterwards just load the saved model).
model = Word2Vec(all_sentences,
                 size=100,      # gensim<4 keyword (>=4: vector_size) — TODO confirm version
                 min_count=1,   # keep every word
                 window=5)      # context window size
model.save('f.model')


def create_dictionaries(model):
    """Build {word: 1-based index} and {word: vector} dicts from the model."""
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)  # gensim<4 API
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}    # 0 is reserved for OOV
    w2vec = {word: model[word] for word in w2indx.keys()}
    return w2indx, w2vec


model = Word2Vec.load('F:/python_data/practice/tansongbo/f.model')
index_dict, word_vectors = create_dictionaries(model)

# Persist both dicts (context manager closes the file, unlike the original).
with open('F:/python_data/practice/tansongbo/dict.txt' + ".pkl", 'wb') as output:
    pickle.dump(index_dict, output)
    pickle.dump(word_vectors, output)

#### Training setup (section header said "LSTM" in the original; this is the CNN script) ####
vocab_dim = 100   # embedding dimension
maxlen = 28       # max tokens kept per review
n_epoch = 10      # training epochs
batch_size = 64   # reviews per mini-batch

# `with` fixes the original's leaked file handle.
with open("F:/python_data/practice/tansongbo/dict.txt.pkl", 'rb') as f:
    index_dict = pickle.load(f)    # {word: index}
    word_vectors = pickle.load(f)  # {word: 100-dim vector}

n_symbols = len(index_dict) + 1                       # +1: index 0 = OOV
embedding_weights = np.zeros((n_symbols, vocab_dim))  # row 0 stays all-zero
for w, index in index_dict.items():
    embedding_weights[index, :] = word_vectors[w]


def text_to_index_array(p_new_dic, p_sen):
    """Map words to indices (0 for OOV). List input -> object ndarray of
    index lists; string input -> a list with one index list."""
    if isinstance(p_sen, list):
        # dict.get replaces the original bare `except:`.
        new_sentences = [[p_new_dic.get(word, 0) for word in sen] for sen in p_sen]
        return np.array(new_sentences, dtype=object)  # dtype=object: ragged rows
    return [[p_new_dic.get(word, 0) for word in p_sen.split(" ")]]


def text_cut_to_same_long(sents, max_len=None):
    """Trim or zero-pad each row to exactly *max_len* (defaults to maxlen);
    the original crashed on rows shorter than maxlen."""
    if max_len is None:
        max_len = maxlen
    new_sents = np.zeros((len(sents), max_len))
    for i in range(len(sents)):
        row = np.asarray(sents[i])[:max_len]
        new_sents[i, :len(row)] = row
    return new_sents


def creat_wordvec_tensor(embedding_weights, X_T):
    """Indices (n, seq_len) -> embeddings (n, seq_len, dim); dims derived
    from the inputs instead of module globals (identical results here)."""
    X_T = np.asarray(X_T)
    n_sent, seq_len = X_T.shape
    dim = embedding_weights.shape[1]
    X_tt = np.zeros((n_sent, seq_len, dim))
    for i, row in enumerate(X_T):
        for j, idx in enumerate(row):
            X_tt[i, j, :] = embedding_weights[int(idx), :]
    return X_tt


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ('正在使用计算的是:%s'%device)

data = all_sentences  # segmented corpus from preprocessing
label_list = [0] * len(neg_data) + [1] * len(pos_data)  # 0 = negative, 1 = positive
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(data, label_list, test_size=0.2)

X_train = text_to_index_array(index_dict, X_train_l)
X_test = text_to_index_array(index_dict, X_test_l)
y_train = np.array(y_train_l)
y_test = np.array(y_test_l)

from torch.nn.utils.rnn import pad_sequence

# Pad to the batch max length, then trim/pad to exactly `maxlen`.
X_train = pad_sequence([torch.from_numpy(np.array(x)) for x in X_train], batch_first=True).float()
X_test = pad_sequence([torch.from_numpy(np.array(x)) for x in X_test], batch_first=True).float()
X_train = text_cut_to_same_long(X_train)
X_test = text_cut_to_same_long(X_test)

# Indices -> embedding vectors: (n, maxlen, vocab_dim).
X_train = creat_wordvec_tensor(embedding_weights, X_train)
X_test = creat_wordvec_tensor(embedding_weights, X_test)

#### DataLoaders and batching ####
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)


class CNN(nn.Module):
    """TextCNN: parallel Conv2d branches over the embedded sentence,
    max-pooled over time, concatenated, dropout, then a 2-way linear head.
    Returns raw logits (correct input for F.cross_entropy)."""

    def __init__(self, embedding_dim, n_filters, filter_sizes, dropout):
        super(CNN, self).__init__()
        # One conv branch per window size; each kernel spans the full
        # embedding width, so it slides only along the token axis.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(n_filters * len(filter_sizes), 2)
        self.dropout = nn.Dropout(dropout)  # regularization

    def forward(self, text):
        # text: (batch, sent_len, emb_dim) -> add a channel axis for Conv2d
        embedded = text.unsqueeze(1)
        convd = [conv(embedded).squeeze(3) for conv in self.convs]         # (batch, n_filters, sent_len - fs + 1)
        pooled = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in convd]   # (batch, n_filters) each
        cat = self.dropout(torch.cat(pooled, dim=1))                       # (batch, n_filters * len(filter_sizes))
        return self.fc(cat)


n_filters = 100
filter_sizes = [2, 3, 4]  # n-gram window sizes
dropout = 0.5

model = CNN(vocab_dim, n_filters, filter_sizes, dropout)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())

#### Training on the training set ####
from sklearn.metrics import accuracy_score, classification_report

print ('————————进行训练集训练————————')
for epoch in range(n_epoch):
    correct = 0
    total = 0
    epoch_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = torch.as_tensor(data, dtype=torch.float32)
        target = target.long()  # cross_entropy requires long targets
        # .to(device) instead of .cuda(): works on CPU-only machines too.
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)  # CNN returns logits only (no hidden state)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        correct += int(torch.sum(torch.argmax(output, dim=1) == target))
        total += len(target)
        epoch_loss += loss.item()
    loss = epoch_loss / (batch_idx + 1)
    print ('epoch:%s'%epoch, 'accuracy:%.3f%%'%(correct *100 / total), 'loss = %s'%loss)

#### Evaluation on the test set ####
print ('————————进行测试集验证————————')
# The original ran model.train(), backward() and optimizer.step() here —
# i.e. it kept training on the test set. eval() + no_grad() fixes that.
model.eval()
correct = 0
total = 0
epoch_loss = 0
with torch.no_grad():
    for batch_idx, (data, target) in enumerate(test_loader):
        data = torch.as_tensor(data, dtype=torch.float32)
        target = target.long()
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = F.cross_entropy(output, target)
        correct += int(torch.sum(torch.argmax(output, dim=1) == target))
        total += len(target)
        epoch_loss += loss.item()
loss = epoch_loss / (batch_idx + 1)
print ('epoch:%s'%0, 'accuracy:%.3f%%'%(correct *100 / total), 'loss = %s'%loss)
基于pytorch的文本情感识别(LSTM,CNN)相关推荐
- 基于Pytorch的语音情感识别系统
前言 大家好,我是阿光. 本专栏整理了<PyTorch深度学习项目实战100例>,内包含了各种不同的深度学习项目,包含项目原理以及源码,每一个项目实例都附带有完整的代码+数据集. 正在更新 ...
- 复盘:基于attention的多任务多模态情绪情感识别,基于BERT实现文本情感分类(pytorch实战)
复盘:基于attention机制的多任务多模态情绪情感识别(pytorch实战),基于BERT实现文本情感分类 提示:系列被面试官问的问题,我自己当时不会,所以下来自己复盘一下,认真学习和总结,以应对 ...
- 基于Transformer的文本情感分析编程实践(Encoder编码器-Decoder解码器框架 + Attention注意力机制 + Positional Encoding位置编码)
日萌社 人工智能AI:Keras PyTorch MXNet TensorFlow PaddlePaddle 深度学习实战(不定时更新) Encoder编码器-Decoder解码器框架 + Atten ...
- NLP之基于TextCNN的文本情感分类
TextCNN 文章目录 TextCNN 1.理论 1.1 基础概念 **最大汇聚(池化)层:** ![请添加图片描述](https://img-blog.csdnimg.cn/10e6e1ed6bf ...
- 基于pytorch的MNIST数据集的四层CNN,测试准确率99.77%
基于pytorch的MNIST数据集的四层CNN,测试准确率99.77% MNIST数据集 环境配置 文件存储结构 代码 引入库 调用GPU 初始化变量 导入数据集并进行数据增强 导入测试集 加载测试 ...
- vader —— 一种基于规则的英文文本情感识别方法
之前舆情项目的合作cp方提出有海外舆情监控的需求,原先的舆情系统的大部分功能都可以天然支持非中文语言,唯独在情感识别上,由于中英文语法和词库的不同,需要重新实现英文的情感识别算法. 考虑 ...
- 基于SVM的语音情感识别系统设计
基于SVM的语音情感识别系统设计 目录 基于SVM的语音情感识别系统设计 1 引言 2 情感语音信号的采集 3. 特征参数 §3.1 基音频率参数 §3.3 能量参数 4 语音情感的识别 §4. ...
- 综述 | 基于脑电图信号的情感识别研究
文章来源于微信公众号(茗创科技),欢迎有兴趣的朋友搜索关注. 情感在我们的日常生活中扮演着重要的角色,不仅体现在人际交往中,在决策过程中,也在我们对周围世界的感知中.由于最近学术界非常关注人机交互尤其 ...
- 基于Matlab神经网络语音情感识别系统
文件大小:5.1M 代码行数:55行(主程序) 开发环境:Matlab2018 点击下载:点击下载 简要概述:基于Matlab神经网络语音情感识别系统,五种基本情感:'生气','高兴','中性','悲 ...
最新文章
- python线性回归算法简介_Python机器学习(二):线性回归算法
- spring配置详解-属性注入(构造函数)
- html移动滚动彩字字幕特效,如何制作滚动字幕 旋转好莱坞字幕特效效果图(超多滚动效果)...
- webservice引用spring的bean
- [html] 怎样使用iframe刷新父级页面
- Qt图形界面编程入门(信号和槽通信机制)
- 【X264系列】之编码YUV的内存流程
- keycloak mysql_keycloak搭配mysql
- 原生JS大揭秘—原型链
- 遗传算法原理及应用一(编码、适应度函数与选择算子)
- 我的 Java/C# web 后端开发技术选择
- 用java编写英寸到厘米的转换_java-将英寸转换为英尺和英寸
- echarts中的x轴y轴颜色,文字颜色改变
- 彻底弄明白Gradle相关配置
- Freebase Data Dump结构初探
- python大数据是什么_python和大数据有什么关系?
- 对条件变量(condition variable)的讨论
- 化妆品店营销策划方案 - Whale帷幄
- PixHawk飞控 配置参数
- Python多线程下载有声小说