LSTM + word2vec Movie Review Sentiment Analysis

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from plot_result import plot_history
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
data = pd.read_csv('dataset/labeledTrainData.tsv', sep='\t', usecols=['sentiment', 'review'])
data.head()
   sentiment                                             review
0          1  With all this stuff going down at the moment w...
1          1  \The Classic War of the Worlds\" by Timothy Hi...
2          0  The film starts with a manager (Nicholas Bell)...
3          0  It must be assumed that those who praised this...
4          1  Superbly trashy and wondrously unpretentious 8...
# Define a helper for simple text preprocessing
def detail_text(w):
    import re
    # text = re.sub(r"([?,.!])", r" \1", w)
    # text = re.sub(r'[" "]+', " ", text)
    # text = re.sub(r'[^a-zA-Z?,!.]+', " ", text)
    token = re.compile('[A-Za-z]+|[!?,.()]')
    text = token.findall(w)
    # note: the stopword check runs on the original casing, so capitalized stopwords such as "With" slip through
    new_text = [word.lower() for word in text
                if word not in stopwords.words('english') and len(word) > 2]
    return new_text
data['review'] = data.review.apply(detail_text)
data['review'][0]
['with', 'stuff', 'going', 'moment', 'started', ..., 'stupid', 'guy', 'one', 'sickest', 'liars', 'hope', 'latter']
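A quick, hypothetical sanity check of detail_text on a toy string (assuming the NLTK stopword list has been downloaded first):

import nltk
nltk.download('stopwords')  # one-time download of the English stopword list
sample = "This movie was surprisingly good, with great acting!"
print(detail_text(sample))
# roughly: ['this', 'movie', 'surprisingly', 'good', 'great', 'acting']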
# Build x and y
sentences = data.review.values
y = data.sentiment.values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)
max_words = 20000
# Create the Tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(sentences_train)
tokenizer.fit_on_texts(sentences_test)  # note: fitting on the test split as well leaks test vocabulary into the index; fitting only on the training split is safer
# Convert the texts to integer sequences
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)
print(X_train[0])
[271, 286, 13, 17, 3, 834, 6, 552, 14, 565, 437, 7, 4, 107, 17, 3, 2362, 244, 5130, 133, 311, 1100, 2669, 540, 942, 5130, 1283, 3025, 581, 750, 943, 944, 6665, 5131, 6666, 5132, 46, 51, 26, 386, 3026, 6667, 6665, 5131, 3497, 6668, 1654, 6669, 5133, 945, 942, 366, 387, 1655, 1284, 62, 290, 9578, 438, 1041, 6670, 245, 472, 134, 122, 515, 2363, 9579, 863, 12, 105, 116, 138, 344, 3498, 348, 18, 327, 3499, 9580, 41, 1799, 6671, 2142, 607, 22, 2670, 93, 17, 1154, 1042, 3, 3500, 117, 3500, 1800, 1043, 140, 277, 1100, 5134, 4231, 300, 66, 1801, 301, 96, 1101, 1101, 516, 5130, 141, 291, 2671, 6672, 311, 70, 133, 9581, 751, 49, 192, 337, 73, 66, 2143, 6673, 35, 123, 946, 2144, 699, 9582, 24, 903, 161, 439, 9583, 947, 2672, 2, 1363, 1102, 52, 327, 194, 9584, 1364, 226, 234, 566, 150, 499, 388, 2145, 6674, 6675, 2146, 1365, 176, 185, 656, 1366, 1367, 9585, 353, 52, 236, 1560, 9586, 5135, 344, 43, 54, 328, 806, 292, 2, 29, 9587, 1155, 948, 141, 277, 2364, 140, 4232, 1214, 9588, 2365, 176, 135, 71, 608, 18, 197, 2366, 23, 2, 24, 5, 499, 1368, 3, 108, 1368, 3027, 5134, 5136, 105, 2367, 440, 11, 1965, 235, 4, 680]
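To see what these integers stand for, a sequence can be mapped back to words with the tokenizer's index_word dictionary (a small illustrative check):

# Decode the first ten indices of the first training sequence back into words
print([tokenizer.index_word[i] for i in X_train[0][:10]])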
# Pad the sequences
maxlen = 150
X_train = keras.preprocessing.sequence.pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, padding='post', maxlen=maxlen)
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size", vocab_size)
vocab_size 17909
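pad_sequences with padding='post' appends zeros after short sequences and, by default, truncates overly long ones from the front; a tiny illustrative example:

demo = keras.preprocessing.sequence.pad_sequences([[5, 3, 8]], padding='post', maxlen=5)
print(demo)  # [[5 3 8 0 0]]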
# Build the embedding matrix from the pre-trained GloVe vectors (glove.6B.50d.txt), skipping words outside our vocabulary
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Look up each word of our vocabulary in the pre-trained vector file and copy its vector."""
    vocab_size = len(word_index) + 1  # the Keras docs require the +1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix
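Words that never appear in the GloVe file keep an all-zero row; a quick, hypothetical check of how much of the vocabulary is actually covered:

embedding_matrix = create_embedding_matrix('dataset/glove.6B.50d.txt', tokenizer.word_index, 50)
# fraction of vocabulary rows that received a non-zero GloVe vector (row 0 is the padding index)
covered = np.count_nonzero(np.any(embedding_matrix != 0, axis=1))
print("GloVe coverage: {:.2%}".format(covered / len(embedding_matrix)))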
# Build the model
embedding_dim = 50
embedding_matrix = create_embedding_matrix('dataset/glove.6B.50d.txt', tokenizer.word_index, embedding_dim)

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim,
                                 weights=[embedding_matrix],  # weights come from the pre-trained vectors rather than random initialization
                                 # input_length=maxlen,
                                 trainable=True))  # keep fine-tuning the embedding layer during training
model.add(keras.layers.LSTM(64, return_sequences=True))  # 64-dimensional output at every timestep
model.add(keras.layers.LSTM(64, return_sequences=True))
model.add(keras.layers.LSTM(128, return_sequences=False))
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])
model.summary()
history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=1,
                    # callbacks=[keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')],
                    validation_data=(X_test, y_test),
                    batch_size=64)
loss_train5, accuracy = model.evaluate(X_train, y_train, verbose=False)  # verbose=False suppresses the evaluation progress output
print("Training Accuracy: {:.4f}".format(accuracy))
loss_test5, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history, name='base_Embedding Layer')
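plot_history is imported from a local plot_result.py that is not shown here; a minimal sketch of what such a helper might look like (an assumption, not the original file):

import matplotlib.pyplot as plt

def plot_history(history, name=''):
    """Plot training/validation accuracy and loss curves from a Keras History object."""
    acc, val_acc = history.history['acc'], history.history['val_acc']
    loss, val_loss = history.history['loss'], history.history['val_loss']
    epochs = range(1, len(acc) + 1)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs, acc, label='train acc')
    plt.plot(epochs, val_acc, label='val acc')
    plt.title(name + ' accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(epochs, loss, label='train loss')
    plt.plot(epochs, val_loss, label='val loss')
    plt.title(name + ' loss')
    plt.legend()
    plt.show()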
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding (Embedding)        (None, None, 50)          895450
_________________________________________________________________
lstm (LSTM)                  (None, None, 64)          29440
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 64)          33024
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               98816
_________________________________________________________________
dense (Dense)                (None, 10)                1290
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11
=================================================================
Total params: 1,058,031
Trainable params: 1,058,031
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
12/12 [==============================] - 7s 361ms/step - loss: 0.6918 - acc: 0.4593 - val_loss: 0.6883 - val_acc: 0.6000
Epoch 2/50
12/12 [==============================] - 3s 265ms/step - loss: 0.6914 - acc: 0.5462 - val_loss: 0.6868 - val_acc: 0.5760
Epoch 3/50
12/12 [==============================] - 3s 263ms/step - loss: 0.6801 - acc: 0.5928 - val_loss: 0.6753 - val_acc: 0.6280
Epoch 4/50
12/12 [==============================] - 3s 267ms/step - loss: 0.6393 - acc: 0.5929 - val_loss: 0.6576 - val_acc: 0.5960
Epoch 5/50
12/12 [==============================] - 3s 296ms/step - loss: 0.5889 - acc: 0.6644 - val_loss: 0.6133 - val_acc: 0.6840
Epoch 6/50
12/12 [==============================] - 4s 339ms/step - loss: 0.5103 - acc: 0.7634 - val_loss: 0.6980 - val_acc: 0.6440
Epoch 7/50
12/12 [==============================] - 4s 336ms/step - loss: 0.5457 - acc: 0.7641 - val_loss: 0.7750 - val_acc: 0.5800
Epoch 8/50
12/12 [==============================] - 4s 361ms/step - loss: 0.6201 - acc: 0.6423 - val_loss: 0.6544 - val_acc: 0.5800
Epoch 9/50
12/12 [==============================] - 4s 327ms/step - loss: 0.5474 - acc: 0.6986 - val_loss: 0.6293 - val_acc: 0.7120
Epoch 10/50
12/12 [==============================] - 4s 359ms/step - loss: 0.4952 - acc: 0.7804 - val_loss: 0.7398 - val_acc: 0.5320
Epoch 11/50
12/12 [==============================] - 4s 337ms/step - loss: 0.5603 - acc: 0.6619 - val_loss: 0.7509 - val_acc: 0.5840
Epoch 12/50
12/12 [==============================] - 4s 327ms/step - loss: 0.4849 - acc: 0.7315 - val_loss: 0.6508 - val_acc: 0.7200
Epoch 13/50
12/12 [==============================] - 4s 334ms/step - loss: 0.4055 - acc: 0.8081 - val_loss: 0.6827 - val_acc: 0.7120
Epoch 14/50
12/12 [==============================] - 4s 353ms/step - loss: 0.3476 - acc: 0.8556 - val_loss: 0.7940 - val_acc: 0.6400
Epoch 15/50
12/12 [==============================] - 4s 324ms/step - loss: 0.4679 - acc: 0.7623 - val_loss: 0.6620 - val_acc: 0.7280
Epoch 16/50
12/12 [==============================] - 4s 369ms/step - loss: 0.3791 - acc: 0.8254 - val_loss: 0.7203 - val_acc: 0.6760
Epoch 17/50
12/12 [==============================] - 4s 376ms/step - loss: 0.3724 - acc: 0.8902 - val_loss: 0.6440 - val_acc: 0.6720
Epoch 18/50
12/12 [==============================] - 4s 336ms/step - loss: 0.3935 - acc: 0.8351 - val_loss: 0.6178 - val_acc: 0.7480
Epoch 19/50
12/12 [==============================] - 4s 334ms/step - loss: 0.2949 - acc: 0.9042 - val_loss: 0.7082 - val_acc: 0.7400
Epoch 20/50
12/12 [==============================] - 4s 336ms/step - loss: 0.2276 - acc: 0.9234 - val_loss: 0.7059 - val_acc: 0.7440
Epoch 21/50
12/12 [==============================] - 4s 325ms/step - loss: 0.2561 - acc: 0.8967 - val_loss: 0.9920 - val_acc: 0.6080
Epoch 22/50
12/12 [==============================] - 4s 324ms/step - loss: 0.3445 - acc: 0.8129 - val_loss: 0.7695 - val_acc: 0.7280
Epoch 23/50
12/12 [==============================] - 4s 322ms/step - loss: 0.3999 - acc: 0.7718 - val_loss: 0.6474 - val_acc: 0.6000
Epoch 24/50
12/12 [==============================] - 4s 330ms/step - loss: 0.3869 - acc: 0.7256 - val_loss: 1.0462 - val_acc: 0.5400
Epoch 25/50
12/12 [==============================] - 4s 348ms/step - loss: 0.5651 - acc: 0.6890 - val_loss: 0.7888 - val_acc: 0.7440
Epoch 26/50
12/12 [==============================] - 4s 366ms/step - loss: 0.4958 - acc: 0.8671 - val_loss: 0.5955 - val_acc: 0.7360
Epoch 27/50
12/12 [==============================] - 4s 354ms/step - loss: 0.3138 - acc: 0.9130 - val_loss: 0.7231 - val_acc: 0.7160
Epoch 28/50
12/12 [==============================] - 4s 330ms/step - loss: 0.2183 - acc: 0.9386 - val_loss: 0.7489 - val_acc: 0.7360
Epoch 29/50
12/12 [==============================] - 4s 321ms/step - loss: 0.2112 - acc: 0.9353 - val_loss: 0.8334 - val_acc: 0.7360
Epoch 30/50
12/12 [==============================] - 4s 320ms/step - loss: 0.2336 - acc: 0.9256 - val_loss: 0.8929 - val_acc: 0.6800
Epoch 31/50
12/12 [==============================] - 4s 325ms/step - loss: 0.2843 - acc: 0.8972 - val_loss: 0.8821 - val_acc: 0.7280
Epoch 32/50
12/12 [==============================] - 4s 324ms/step - loss: 0.2211 - acc: 0.9382 - val_loss: 0.8275 - val_acc: 0.6640
Training Accuracy: 0.5387
Testing Accuracy:  0.5200

Summary

The LSTM results above are unsatisfactory, mainly because too little data was used.
The optimized code follows below.

# Use a CNN instead
model = keras.Sequential()
# Embedding output shape is (batch, maxlen, embedding_dim): any number of samples, maxlen tokens per sample, embedding_dim dimensions per token
# model.add(keras.layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=maxlen))
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(keras.layers.Conv1D(64, 11, activation='relu', padding='same'))
model.add(keras.layers.MaxPooling1D())
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Conv1D(128, 11, activation='relu', padding='same'))
model.add(keras.layers.MaxPooling1D())
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Conv1D(256, 11, activation='relu', padding='same'))
model.add(keras.layers.MaxPooling1D())
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.GlobalMaxPool1D())  # global pooling collapses the time dimension, leaving a 2D tensor
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()
# keras.utils.plot_model(model, to_file='model1.png')

# Reduce the learning rate during training: if val_loss has not improved for 3 consecutive epochs, multiply the learning rate by 0.5, never dropping below min_lr
# callbacks = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, min_lr=0.00001)
history = model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_test, y_test))
loss_train1, accuracy = model.evaluate(X_train, y_train, verbose=False)  # verbose=False suppresses the evaluation progress output
print("Training Accuracy: {:.4f}".format(accuracy))
loss_test1, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
# Visualization
plot_history(history, name='base_Embedding Layer')

# Continue training from where the previous run stopped
callbacks1 = keras.callbacks.EarlyStopping(patience=2, monitor='val_loss')
# Save the best weights seen so far
checkpoint = keras.callbacks.ModelCheckpoint(filepath='./model1.h5',
                                             monitor="val_loss",
                                             verbose=1,
                                             save_best_only=True,
                                             mode="min")
callbacks = [callbacks1, checkpoint]
history = model.fit(X_train, y_train, epochs=100, batch_size=128, callbacks=callbacks, validation_data=(X_test, y_test))
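After training, the best checkpoint written by ModelCheckpoint can be reloaded and evaluated (a small sketch using the './model1.h5' path from above):

best_model = keras.models.load_model('./model1.h5')
loss_best, acc_best = best_model.evaluate(X_test, y_test, verbose=False)
print("Best-checkpoint Testing Accuracy: {:.4f}".format(acc_best))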

Summary:
Trained on the full dataset, both the training and validation accuracy reach about 95%.
