一、一些说明

主要工作是用包含120000条微博评论的数据集训练模型，预测评论的情绪倾向，并将结果简单地分为积极情绪和消极情绪两类。
虽然模型在训练集和测试集上表现得不错，但经过手动测试发现实际效果不太好。原因应该是数据集的质量不高，而且没有做数据清洗（后续可以尝试）：其中很多评论掺杂了表情符号的代码和@符号。
尽管如此，本文的代码是可以套用到别的数据集中的，如果有需要可以使用自制数据集获得更好的模型。希望本文的整套流程可以为朋友们带来一些帮助。
PS：全篇实现过程全部为套用别的文章，如有雷同，算我抄袭。

二、数据集

由于时间紧凑，本文数据集是网上随便找的，如果不嫌弃这里提供链接供大家下载使用：

https://pan.baidu.com/s/1obgHCkb8N3ga1UulVZzXxg?pwd=0806
提取码: 0806

三、预热

自己做的记录，可以跳过

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
from itertools import accumulate# 设置matplotlib绘图时的字体，这个一般都没有吧
#my_font = font_manager.FontProperties(fname="C:\Windows\Fonts\Songti.ttc")
# Count how often each comment length occurs in the data set.
df = pd.read_csv('C:\\数据集\\情感分析60000\\all.csv', encoding='gbk')
print(df.groupby('label')['label'].count())  # number of samples per label
df['length'] = df['evaluation'].apply(len)
len_df = df.groupby('length').count()
sent_length = len_df.index.tolist()        # distinct comment lengths
sent_freq = len_df['evaluation'].tolist()  # number of comments of each length

# Bar chart: comment length versus frequency.
axes = plt.gca()
axes.bar(sent_length, sent_freq)
axes.set_title("评论长度及出现频数统计图")
axes.set_xlabel("评论长度")
axes.set_ylabel("评论长度出现的频数")
# Select a font that can render the Chinese labels before the figure is drawn.
plt.rcParams['font.sans-serif'] = ['Simhei']
plt.show()

# Cumulative distribution function (CDF) of comment lengths.
# The total is computed once instead of once per element.
total_count = sum(sent_freq)
sent_pentage_list = [count / total_count for count in accumulate(sent_freq)]

# Find the comment length at the requested quantile: the first length whose
# cumulative frequency reaches `quantile`. Comparing with >= (rather than
# testing a rounded value for equality) guarantees a match, because the
# cumulative frequency ends at 1.0.
quantile = 0.9
index = sent_length[-1] if sent_length else 0  # fallback for empty data
for length, per in zip(sent_length, sent_pentage_list):
    if per >= quantile:
        index = length
        break
print("\n分位点为%s的微博长度:%d." % (quantile, index))

# Draw the CDF once, with dashed guide lines marking the quantile.
plt.plot(sent_length, sent_pentage_list)
plt.hlines(quantile, 0, index, colors="c", linestyles="dashed")
plt.vlines(index, 0, quantile, colors="c", linestyles="dashed")
plt.text(0, quantile, str(quantile))
plt.text(index, 0, str(index))
plt.title("评论长度累积分布函数图")
plt.xlabel("评论长度")
plt.ylabel("评论长度累积频率")
plt.show()

四、模型训练及测试

各层的参数设置（别人的图）：

import pickle
import numpy as np
import pandas as pd
from keras.utils import np_utils
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dense, Embedding,Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score# load dataset
# ['evaluation'] is feature, ['label'] is label
def load_data(filepath, input_shape=20):
    """Load the labelled comment CSV and encode it for character-level training.

    The CSV is read with GBK encoding; column ``evaluation`` holds the comment
    text and column ``label`` holds the class label.

    Side effects: writes the character -> index mapping to ``word_dict.pk``
    and the label -> index mapping to ``label_dict.pk`` in the current
    working directory.

    Args:
        filepath: Path of the CSV file.
        input_shape: Fixed sequence length passed to ``pad_sequences`` as
            ``maxlen``; shorter sequences are padded at the end with 0.

    Returns:
        A tuple ``(x, y, output_dictionary, vocab_size, label_size,
        inverse_word_dictionary)`` where ``x`` is the padded index matrix,
        ``y`` the one-hot label matrix, ``output_dictionary`` maps class
        index -> label and ``inverse_word_dictionary`` maps character
        index -> character.
    """
    df = pd.read_csv(filepath, encoding='gbk')

    # Labels in order of first appearance.
    labels = list(df['label'].unique())
    # Character-level vocabulary. Sorting makes the index assignment
    # reproducible across runs; iterating a bare set does not, which would
    # silently invalidate a previously saved model when the pickles are
    # regenerated.
    vocabulary = sorted(set(''.join(df['evaluation'].unique())))

    # Index 0 is reserved for padding, so character indices start at 1.
    word_dictionary = {word: i + 1 for i, word in enumerate(vocabulary)}
    with open('word_dict.pk', 'wb') as f:
        pickle.dump(word_dictionary, f)
    inverse_word_dictionary = {i + 1: word for i, word in enumerate(vocabulary)}

    label_dictionary = {label: i for i, label in enumerate(labels)}
    with open('label_dict.pk', 'wb') as f:
        pickle.dump(label_dictionary, f)
    output_dictionary = {i: label for i, label in enumerate(labels)}

    vocab_size = len(word_dictionary)    # vocabulary size
    label_size = len(label_dictionary)   # number of classes

    # Encode every comment as character indices and pad to input_shape.
    x = [[word_dictionary[word] for word in sent] for sent in df['evaluation']]
    x = pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)

    # One-hot encode the labels into a (n_samples, label_size) matrix,
    # e.g. with 3 classes the indices [1, 2] become [[0, 1, 0], [0, 0, 1]].
    label_indices = [label_dictionary[label] for label in df['label']]
    y = np_utils.to_categorical(label_indices, num_classes=label_size)

    return x, y, output_dictionary, vocab_size, label_size, inverse_word_dictionary


# Build the deep-learning model: Embedding + LSTM + Softmax
def create_LSTM(n_units, input_shape, output_dim, filepath):
    """Build and compile the Embedding + LSTM + softmax classifier.

    The data set is loaded only to obtain the vocabulary size and the number
    of classes. Note that ``load_data`` also rewrites the dictionary pickles.

    Side effects: writes the model diagram to ``./model_lstm.png`` and prints
    the model summary.

    Args:
        n_units: Number of LSTM units.
        input_shape: Length of the padded input sequences.
        output_dim: Dimension of the character embedding vectors.
        filepath: Path of the CSV data set.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    _, _, _, vocab_size, label_size, _ = load_data(filepath, input_shape)

    model = Sequential()
    # input_dim is vocab_size + 1 because index 0 is the padding value;
    # mask_zero=True makes downstream layers skip the padded positions.
    model.add(Embedding(input_dim=vocab_size + 1,
                        output_dim=output_dim,
                        input_length=input_shape,
                        mask_zero=True))
    # The LSTM infers its input shape from the Embedding layer. The original
    # passed input_shape=(n_samples, seq_len) here, which does not describe
    # (timesteps, features) and is not used for a non-first Sequential layer.
    model.add(LSTM(n_units))
    model.add(Dropout(0.2))
    model.add(Dense(label_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    plot_model(model, to_file='./model_lstm.png', show_shapes=True)
    # Print the model information.
    model.summary()
    return model


# Model training
def model_train(input_shape, filepath, model_save_path):
    """Train the LSTM classifier, save it, and report test-set accuracy.

    The data set is split 9:1 into training and test sets with a fixed
    random seed. After training, every test sample is printed together with
    its true and predicted label, followed by the overall accuracy.

    Args:
        input_shape: Length to which every comment is padded.
        filepath: Path of the CSV data set.
        model_save_path: Where the trained model is saved.
    """
    x, y, output_dictionary, _, _, inverse_word_dictionary = load_data(filepath, input_shape)
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1, random_state=42)

    # Model hyper-parameters; adjust as needed.
    n_units = 100
    batch_size = 32
    epochs = 5
    output_dim = 20

    # Train and save the model.
    lstm_model = create_LSTM(n_units, input_shape, output_dim, filepath)
    lstm_model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, verbose=1)
    lstm_model.save(model_save_path)

    # Predict the whole test set in one call instead of invoking the model
    # once per sample.
    N = test_x.shape[0]  # number of test samples
    probabilities = lstm_model.predict(test_x)

    predict = []
    label = []
    for start in range(N):
        end = start + 1
        print(f'start:{start}, end:{end}')
        # Rebuild the original text, skipping the padding index 0.
        sentence = [inverse_word_dictionary[i] for i in test_x[start] if i != 0]
        y_predict = probabilities[start:end]
        print('y_predict:', y_predict)
        label_predict = output_dictionary[np.argmax(y_predict[0])]
        label_true = output_dictionary[np.argmax(test_y[start:end])]
        print(f'label_predict:{label_predict}, label_true:{label_true}')
        # Print the prediction result.
        print(''.join(sentence), label_true, label_predict)
        predict.append(label_predict)
        label.append(label_true)

    # accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(label, predict)
    print('模型在测试集上的准确率:%s' % acc)


if __name__ == '__main__':
    filepath = 'C:\\数据集\\情感分析60000\\all.csv'
    input_shape = 180
    model_save_path = 'C:\\数据集\\情感分析60000\\corpus_model.h5'
    model_train(input_shape, filepath, model_save_path)

测试集的结果：

五、自测

# Import the necessary modules
import pickle
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences# 导入字典
# Load the dictionaries written by the training script.
with open('word_dict.pk', 'rb') as f:
    word_dictionary = pickle.load(f)
with open('label_dict.pk', 'rb') as f:
    output_dictionary = pickle.load(f)  # maps label -> class index

# Sequence length used for padding; presumably this has to equal the value
# the model was trained with.
input_shape = 180
# Change the text here to try other sentences (results are not great).
sent = "啊啊啊啊啊啊，烦死了"

try:
    # Only the character lookup is guarded, so a KeyError raised elsewhere
    # is not misreported as an out-of-vocabulary character.
    x = [[word_dictionary[word] for word in sent]]
except KeyError as err:
    print("您输入的句子有汉字不在词汇表中，请重新输入！")
    print("不在词汇表中的单词为：%s." % err)
else:
    x = pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)

    # Load the trained model.
    model_save_path = 'C:\\数据集\\情感分析60000\\corpus_model.h5'
    lstm_model = load_model(model_save_path)

    # Predict and map the winning class index back to its label.
    y_predict = lstm_model.predict(x)
    label_dict = {v: k for k, v in output_dictionary.items()}
    print('输入语句: %s' % sent)
    print('情感预测结果: %s' % label_dict[np.argmax(y_predict)])

微博评论情感分析（NLP，LSTM）相关推荐

Python实现某站热门评论情感分析----NLP自然语言处理
文章目录前言准备工作总体思路开始动手 Python 爬取热门视频评论信息 Baidu AI 分析所得评论的情感信息 Excel 自动写入所有评论情感信息主函数(开始执行) 运行结果 Exce ...
基于机器学习算法的微博评论情感分析实战（毕设项目）
项目概述: 通过对微博评论进行预处理.分词以及特征选择等,建立特征词典,构建每条评论的特征向量.之后利用分类算法,如朴素贝叶斯.SVM等,针对训练集的特征向量以及类标签进行训练,得到分类模型,并通过计 ...
利用LSTM+CNN+glove词向量预训练模型进行微博评论情感分析（二分类）
先上代码和数据集 https://pan.baidu.com/s/1tpEKb0nCun2oxlBXGlPvxA 提取码:cryy 里面所需要的,都在文件里, 数据是微博评论(共12万,没记错的话,0 ...
python微博评论情感分析_Python采集微博热评进行情感分析祝你狗年脱单
Ps: 重要的事情说三遍!!! 结尾有彩蛋,结尾有彩蛋,结尾有彩蛋. 如果自己需要爬(cai)虫(ji)的数据量比较大,为了防止被网站封Ip,可以分时段爬取,另外对于爬到的数据一般是用来存储数据库,这 ...
python微博评论情感分析_基于Python的微博情感分析系统设计
2019 年第 6 期信息与电脑 China Computer & Communication 软件开发与应用基于 Python 的微博情感分析系统设计王欣周文龙 (武汉工程大学邮电 ...
基于bert bert-wmm的微博评论情感分析
视频参考:https://www.bilibili.com/video/BV12Y4y127mi/?vd_source=8f3cf4ad6c08a40d40ca6809c9c9e8ca 直接看项目文件 ...
Python_001_旅游评论情感倾向性分析_000_分析(基于深度学习的微博评论情感倾向性分析_胡西祥)论文
Python_001_旅游评论情感倾向性分析_000_分析论文-2020-8-21 知网链接:基于深度学习的微博评论情感倾向性分析 - 中国知网 (cnki.net) ps.只做分析概括目录一.论 ...
【自然语言处理（NLP）】基于FNN网络的电影评论情感分析
[自然语言处理(NLP)]基于FNN网络的电影评论情感分析作者简介:在校大学生一枚,华为云享专家,阿里云专家博主,腾云先锋(TDP)成员,云曦智划项目总负责人,全国高等学校计算机教学与产业实践资源建 ...
NLP实战之–螺蛳粉评论情感分析和建模分类
NLP实战之–螺蛳粉评论情感分析和建模分类写在前面: 本文首发于我的微信公众号.新文章首发都会在微信公众号上. 自然语言处理(Natural Language Processing)是目前人工智能的 ...

微博评论情感分析（NLP，LSTM）