tensorflow 实践（一）使用神经网络做中文情感分析

本文使用哈工大 LTP（pyltp）做分词、词性标注等文本预处理，再用含两层隐层的前馈神经网络做情感分类。
后注：这不是标准的 ANN 做法——输入前先做了去停用词和词性筛选，并非端到端训练。

# -*- coding: utf-8 -*-
# @brief: Chinese sentiment analysis with TensorFlow
import numpy as np
import tensorflow as tf
import random
from sklearn.feature_extraction.text import CountVectorizer
import os
import traceback

# Directory that contains this script; the data files below are resolved
# relative to it.
real_dir_path = os.path.split(os.path.realpath(__file__))[0]
pos_file = os.path.join(real_dir_path, 'data/pos_bak.txt')
neg_file = os.path.join(real_dir_path, 'data/neg_bak.txt')

# Word segmentation and POS tagging are done with HIT's LTP (pyltp).
from pyltp import Segmentor, Postagger
seg = Segmentor()
seg.load('/root/git/ltp_data/cws.model')
poser = Postagger()
poser.load('/root/git/ltp_data/pos.model')
stop_words_file = os.path.join(real_dir_path, '../util/stopwords.txt')
# POS tags a token must carry to be kept by the filtering step.
allow_pos_ltp = ('a', 'i', 'j', 'n', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz', 'v', 'ws')

# Segmentation, stop-word removal and POS filtering
# Stop-word sets keyed by file path, so each file is read from disk only once
# instead of on every call to cut_stopword_pos().
_stopwords_cache = {}


def _load_stopwords(path):
    """Return the set of stop words stored one per line in *path* (cached)."""
    if path not in _stopwords_cache:
        # `with` guarantees the handle is closed; the original left it open.
        with open(path) as f:
            _stopwords_cache[path] = set(line.rstrip() for line in f)
    return _stopwords_cache[path]


def cut_stopword_pos(s):
    """Segment *s* and keep only the informative tokens.

    All whitespace is stripped from *s* before it is handed to the module-level
    segmenter ``seg``; the tokens are then POS-tagged with ``poser``.

    Args:
        s: raw text of one review.

    Returns:
        A list with the tokens whose POS tag is in ``allow_pos_ltp`` and which
        do not appear in the stop-word file ``stop_words_file``.
    """
    words = seg.segment(''.join(s.split()))
    poses = poser.postag(words)
    stopwords = _load_stopwords(stop_words_file)
    return [
        word
        for word, pos in zip(words, poses)
        if pos in allow_pos_ltp and word not in stopwords
    ]


def create_vocab(pos_file, neg_file):
    """Build the vocabulary from both review files.

    Every line of each file is tokenised with ``cut_stopword_pos`` and re-joined
    with single spaces; a ``CountVectorizer(max_df=0.9, min_df=1)`` is then
    fitted on all resulting sentences.

    Args:
        pos_file: path of the first review file, one review per line.
        neg_file: path of the second review file, one review per line.

    Returns:
        A list with the terms of the fitted vectorizer's ``vocabulary_``.
    """
    def process_file(file_path):
        # One space-joined, filtered sentence per input line.
        with open(file_path, 'r') as f:
            return [' '.join(cut_stopword_pos(line)) for line in f]

    sentences = process_file(pos_file) + process_file(neg_file)
    vectorizer = CountVectorizer(max_df=0.9, min_df=1)
    # Only the learned vocabulary is needed, so the term matrix is not built
    # into a local (the original bound it to `tf`, shadowing tensorflow).
    vectorizer.fit(sentences)
    # Materialise as a list so callers can rely on len()/indexing everywhere.
    return list(vectorizer.vocabulary_.keys())

# Build the vocabulary
vocab = create_vocab(pos_file, neg_file)


# Turn every review into a vector over the vocabulary
def normalize_dataset(vocab):
    """Convert all reviews of ``pos_file`` and ``neg_file`` into samples.

    Args:
        vocab: sequence of vocabulary terms; a term's position is its feature
            index.

    Returns:
        A list of ``[features, clf]`` pairs. ``features`` is a NumPy vector of
        length ``len(vocab)`` holding 1 at the index of every vocabulary term
        found in the review and 0 elsewhere. ``clf`` is ``[1, 0]`` for lines of
        ``pos_file`` (positive) and ``[0, 1]`` for lines of ``neg_file``
        (negative).
    """
    # term -> first index, built once: replaces the per-token linear
    # `in vocab` / `vocab.index()` scans on a list.
    word_to_index = {}
    for idx, term in enumerate(vocab):
        word_to_index.setdefault(term, idx)
    n_terms = len(word_to_index) if not hasattr(vocab, '__len__') else len(vocab)

    def string_to_vector(review, clf):
        # review: raw review text; clf: its one-hot class label.
        features = np.zeros(n_terms)
        for w in cut_stopword_pos(review):
            # Tokens are byte strings on Python 2 and must be decoded to match
            # the vocabulary; already-decoded text is used as is.
            if isinstance(w, bytes):
                w = w.decode('utf-8')
            idx = word_to_index.get(w)
            if idx is not None:
                features[idx] = 1
        return [features, clf]

    dataset = []
    for path, label in ((pos_file, (1, 0)), (neg_file, (0, 1))):
        with open(path, 'r') as f:
            for line in f:
                # A fresh list per sample, as in the original code.
                dataset.append(string_to_vector(line, list(label)))
    return dataset


dataset = normalize_dataset(vocab)
random.shuffle(dataset)  # shuffle the samples

# Hold out 10% of the samples as test data.
test_size = int(len(dataset) * 0.1)
dataset = np.array(dataset)
# Split on an explicit index: with test_size == 0 the slices [:-0] / [-0:]
# would put every sample in the test set and none in the training set.
split_at = len(dataset) - test_size
train_dataset = dataset[:split_at]
test_dataset = dataset[split_at:]
print('test_size = {}'.format(test_size))

# Feed-forward neural network
# Number of neurons in each layer
n_input_layer = len(vocab)  # one input neuron per vocabulary term
n_layer_1 = 1000  # hidden layer
n_layer_2 = 1000  # hidden layer
n_output_layer = 2

# Define the neural network to be trained
def neural_netword(data):
    """Build the graph of a feed-forward network with two hidden layers.

    Layer sizes come from the module-level ``n_input_layer``, ``n_layer_1``,
    ``n_layer_2`` and ``n_output_layer``.

    Args:
        data: input tensor that is matrix-multiplied with the first weight
            matrix of shape ``[n_input_layer, n_layer_1]``.

    Returns:
        The output-layer tensor before any softmax (the logits).
    """
    def make_w_b(n_in, n_out):
        # Weights and biases are drawn with tf.random_normal.
        return {'w_': tf.Variable(tf.random_normal([n_in, n_out])),
                'b_': tf.Variable(tf.random_normal([n_out]))}

    # All variables are created before the layer ops, as in the original.
    layer_1_w_b = make_w_b(n_input_layer, n_layer_1)
    layer_2_w_b = make_w_b(n_layer_1, n_layer_2)
    layer_output_w_b = make_w_b(n_layer_2, n_output_layer)

    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)  # ReLU as activation function
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']),
                          layer_output_w_b['b_'])
    return layer_output


batch_size = 50
# None: any number of samples; each sample has one dimension per vocabulary term
X = tf.placeholder('float', [None, n_input_layer])
Y = tf.placeholder('float')

# Train the neural network on the data
def train_neural_network(X, Y):
    """Train the network on ``train_dataset`` and print its test accuracy.

    Reads the module-level ``train_dataset``, ``test_dataset`` and
    ``batch_size``. ``train_dataset`` is shuffled in place. The summed batch
    loss is printed after every epoch and the accuracy on ``test_dataset`` is
    printed once training has finished.

    Args:
        X: placeholder fed with the feature vectors.
        Y: placeholder fed with the one-hot labels.
    """
    predict = neural_netword(X)
    # The cost is the mean softmax cross entropy of the output layer. Softmax
    # is applied here rather than inside the network for efficiency.
    cost_func = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)

    epochs = 13  # number of passes over the training data

    with tf.Session() as session:
        # Initialise all variables (weights and biases). Prefer the current
        # initializer; fall back to the deprecated name on old TensorFlow.
        init_fn = (getattr(tf, 'global_variables_initializer', None)
                   or tf.initialize_all_variables)
        session.run(init_fn())

        # random.shuffle() swaps rows of a 2-D array through views, which
        # duplicates some rows and drops others; np.random.shuffle permutes
        # the rows correctly (still in place).
        np.random.shuffle(train_dataset)
        train_x = train_dataset[:, 0]  # features of every row
        train_y = train_dataset[:, 1]  # label of every row
        print('size of train_x is {}'.format(len(train_x)))

        for epoch in range(epochs):
            epoch_loss = 0  # loss accumulated over this epoch
            for start in range(0, len(train_x), batch_size):
                end = start + batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]
                # cost_func is fetched together with the optimizer step only
                # so that the loss can be reported.
                _, c = session.run([optimizer, cost_func],
                                   feed_dict={X: list(batch_x), Y: list(batch_y)})
                epoch_loss += c
            print(epoch, ' : ', epoch_loss)

        # Evaluate the model
        test_x = test_dataset[:, 0]
        test_y = test_dataset[:, 1]
        # A prediction is correct when the arg-max of the logits equals the
        # arg-max of the one-hot label.
        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
        # Cast booleans to floats and average them to get the accuracy.
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('准确率: {}'.format(accuracy.eval({X: list(test_x), Y: list(test_y)})))


train_neural_network(X, Y)

最终的执行显示：

size of train_x is 31612
(0, ' : ', 105508.38228607178)
(1, ' : ', 11773.463727131188)
(2, ' : ', 4551.4978754326503)
(3, ' : ', 3576.6907950473492)
(4, ' : ', 3144.6771814899175)
(5, ' : ', 2911.1803286887775)
(6, ' : ', 2691.8284285693276)
(7, ' : ', 2651.9982114042473)
(8, ' : ', 2882.4479921576026)
(9, ' : ', 2665.3818837262743)
(10, ' : ', 2551.3030235993206)
(11, ' : ', 2838.3546982686303)
(12, ' : ', 2770.5539811982608)
准确率: 0.828587830067

tensorflow 实践（一）使用神经网络做中文情感分析相关推荐

python中文情感分析分类和英文情感分析的库和方法汇总
情感分析是自然语言处理中的一个常见任务.以下是用 Python 写的一个简单的情感分析分类函数的代码示例: import jieba import numpy as npdef sentiment_a ...
毕业设计:微博语料中文情感分析
向AI转型的程序员都关注了这个号 人工智能大数据与深度学习公众号:datayx 微博的强大影响力已经深深的吸引了更多的人加入.而对微博的情感分析,不仅可以获取网民的此时的心 ...
利用Python实现酒店评论的中文情感分析，含数据集
利用Python实现酒店评论的情感分析完整代码下载地址:利用Python实现酒店评论的中文情感分析情感极性分析,即情感分类,对带有主观情感色彩的文本进行分析.归纳.情感极性分析主要有两种分类方法: ...
【中文情感分析】SO-PMI算法（HarvestText库的修正以及解析）
简述在最近的RA过程中,需要使用一个库,同时对其的原理需要理解.然后就研究了一些,发现了几个有趣的,其中一个HarvestText中文情感分析. 我研究了他的源码后,再结合理论知识发现了这个库在情感 ...
基于大连理工大学的情感词汇表的中文情感分析
前言为什么要写这篇文章? 前段时间帮人写了一个这样的小项目,在网上查找资料的过程中,有不少关于该项目的资料,由于各个博主写的代码不尽相同,且没有一个详尽的分析方法,所以我在完成该项目后,想到可以把该 ...
python情感分析语料库_python 中文情感分析 Snownlp库的使用
不甘心的时候,就是在进步:痛苦的时候,就是在成长. 文章目录一.Snownlp 简介 SnowNLP是一个python写的类库,可以方便的处理中文文本内容,是受到了TextBlob的启发而写的,由于 ...
scrapy 豆瓣短评数据分析 + 中文情感分析 + 可视化（一）
scrapy 豆瓣短评数据分析 + 中文情感分析 + 可视化 (一) 一.scrapy 爬取豆瓣短评本次爬取的是哪吒之魔童降世短评 .本次爬取的是静态网页还是蛮简单的. 1.开始地址 http ...
python snownlp情感分析_白杨数说 | 不会做文本情感分析？试试这两个Python包
情感分析是自然语言处理(NLP)领域的一类任务,又称倾向性分析,意见抽取,意见挖掘,情感挖掘,主观分析等,它是对带有情感色彩的主观性文本进行分析.处理.归纳和推理的过程.具体到数据新闻领域,文本情感分 ...
基于逻辑回归，支持向量机，朴素贝叶斯以及简单深度学习文本分类方法（BiLSTM、CNN）实现的中文情感分析，含数据集可直接运行
基于逻辑回归,支持向量机,朴素贝叶斯以及简单深度学习文本分类方法(BiLSTM.CNN)实现的中文情感分析,含数据集可直接运行完整代码下载地址:中文情感分析中文情感分析本项目旨在通过一个中文情感 ...

tensorflow 实践（一）使用神经网络做中文情感分析

tensorflow 实践（一）使用神经网络做中文情感分析相关推荐

最新文章

热门文章