自然语言处理入门实战1：基于机器学习的文本分类

基于机器学习的文本分类

配置文件
数据集
数据预处理
model
模型
主函数
预测
结果
参考代码

本文参考复旦大学自然语言处理入门练习，用 TensorFlow 实现基于 logistic/softmax regression 的文本分类。
环境：python3.7
版本：tensorflow1.13
数据集：
数据集采用gaussic的数据集，https://github.com/gaussic/text-classification-cnn-rnn
链接: https://pan.baidu.com/s/1hugrfRu 密码: qfud
下载文件中的cnews.train.txt，放到data目录下即可
数据集有十个类别，分别是：
体育|娱乐|家居|房产|教育|时尚|时政|游戏|科技|财经

配置文件

config文件夹里描述了配置文件的路径信息和变量：
cnews.train.txt 为训练数据集
stopwords.txt 为停用词数据
categories.txt 为分类数据

best_validation 所保存的最好的模型

import os

# Project root: the parent of the directory that contains this config module.
pwd_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))


class LrConfig(object):
    """Paths and hyper-parameters for the logistic-regression text classifier.

    Every path is derived from ``pwd_path`` and assembled component-wise with
    ``os.path.join`` so that the platform's separator is used throughout.
    """

    # Paths used while training the model.
    dataset_path = os.path.join(pwd_path, 'data', 'cnews.train.txt')
    stopwords_path = os.path.join(pwd_path, 'data', 'stopwords.txt')
    tfidf_model_save_path = os.path.join(pwd_path, 'model', 'tfidf_model.m')
    categories_save_path = os.path.join(pwd_path, 'data', 'categories.txt')
    lr_save_dir = os.path.join(pwd_path, 'model', 'checkpoints')
    lr_save_path = os.path.join(lr_save_dir, 'best_validation')

    # Hyper-parameters.
    num_epochs = 100  # total number of training epochs
    num_classes = 10  # number of categories
    print_per_batch = 10  # report metrics once every this many batches

数据集

data文件夹：
categories.txt：

cnews.train.txt：

stopwords.txt：

数据预处理

datahelper文件夹：
data_process.py：

from config.lr_config import LrConfig
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.externals import joblib
import jieba
import numpy as np

# Module-level configuration; pre_data() reads the categories output path from it.
config = LrConfig()


class DataProcess(object):
    """Load the corpus, segment it with jieba and turn it into TF-IDF features.

    Args:
        dataset_path: path of the training corpus; each line is
            ``label<TAB>text``.
        stopwords_path: path of the stop-word file, one word per line.
        model_save_path: where the fitted TF-IDF vectorizer is dumped.
    """

    def __init__(self, dataset_path=None, stopwords_path=None, model_save_path=None):
        self.dataset_path = dataset_path
        self.stopwords_path = stopwords_path
        self.model_save_path = model_save_path

    def read_data(self):
        """Read the raw corpus lines and the stop-word list.

        Returns:
            tuple: ``(data, stopwords)`` where ``data`` is the list of corpus
            lines as returned by ``readlines()`` and ``stopwords`` is a list
            of stop words with their line terminator removed.
        """
        with open(self.dataset_path, encoding='utf-8') as f1:
            data = f1.readlines()
        with open(self.stopwords_path, encoding='utf-8') as f2:
            # Strip only the line terminator. Slicing off the last character
            # would truncate a final entry that has no trailing newline.
            stopwords = [word.rstrip('\r\n') for word in f2]
        return data, stopwords

    def save_categories(self, data, save_path):
        """Write the category names to ``save_path`` as one '|'-joined line."""
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write('|'.join(data))

    def pre_data(self, data, stopwords, test_size=0.2):
        """Segment the texts, one-hot encode the labels and split the data.

        Args:
            data: iterable of ``label<TAB>text`` lines.
            stopwords: collection of words to drop after segmentation.
            test_size: fraction of samples held out for validation.

        Returns:
            The result of ``train_test_split``:
            ``X_train, X_test, y_train, y_test`` where the X parts are lists
            of space-joined tokens and the y parts are one-hot arrays.

        Raises:
            ValueError: if a non-blank line contains no tab separator.
        """
        # A set gives O(1) membership tests inside the per-token filter.
        stopword_set = set(stopwords)
        label_list = []
        text_list = []
        for line in data:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            label, text = line.split('\t', 1)
            seg_text = [word for word in jieba.cut(text) if word not in stopword_set]
            text_list.append(' '.join(seg_text))
            label_list.append(label)

        # Convert the labels to one-hot format.
        encoder_nums = LabelEncoder()
        label_nums = encoder_nums.fit_transform(label_list)
        # Persist the class names in encoder order so that a predicted index
        # can be mapped back to its category name.
        categories = list(encoder_nums.classes_)
        self.save_categories(categories, config.categories_save_path)
        # OneHotEncoder expects a 2-D column of integer labels.
        label_nums = label_nums.reshape(-1, 1)
        encoder_one_hot = OneHotEncoder()
        label_one_hot = encoder_one_hot.fit_transform(label_nums).toarray()
        return model_selection.train_test_split(
            text_list, label_one_hot, test_size=test_size, random_state=1024)

    # TODO: implement later
    def get_bow(self):
        """Extract bag-of-words features (not implemented)."""
        pass

    # TODO: the feature dimension may become too large and exhaust memory.
    # This is currently mitigated by dropping low-frequency terms (min_df);
    # LDA or PCA could be used for dimensionality reduction later.
    def get_tfidf(self, X_train, X_test, min_df=100):
        """Fit a TF-IDF vectorizer on the training texts and transform both sets.

        Args:
            X_train: training texts (space-joined tokens).
            X_test: validation texts (space-joined tokens).
            min_df: passed to ``TfidfVectorizer``; terms appearing in fewer
                documents are ignored.

        Returns:
            tuple: ``(X_train_vec, X_test_vec, vectorizer)``.
        """
        vectorizer = TfidfVectorizer(min_df=min_df)
        # fit_transform already returns the transformed training matrix, so a
        # second transform pass over X_train is unnecessary.
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)
        return X_train_vec, X_test_vec, vectorizer

    # TODO: implement later
    def get_word2vec(self):
        """Extract word2vec features (not implemented)."""
        pass

    def provide_data(self):
        """Run the whole pipeline and dump the fitted vectorizer.

        Returns:
            tuple: ``(X_train_vec, X_test_vec, y_train, y_test)``.
        """
        data, stopwords = self.read_data()
        # 1. bag-of-words features (not implemented)
        # 2. TF-IDF features
        X_train, X_test, y_train, y_test = self.pre_data(data, stopwords, test_size=0.2)
        X_train_vec, X_test_vec, vectorizer = self.get_tfidf(X_train, X_test)
        joblib.dump(vectorizer, self.model_save_path)
        # 3. word2vec features (not implemented)
        return X_train_vec, X_test_vec, y_train, y_test

    def batch_iter(self, x, y, batch_size=64):
        """Yield shuffled ``(x, y)`` mini-batches covering the data once.

        ``x`` and ``y`` must support ``len()`` and indexing with an integer
        array. Nothing is yielded when ``x`` is empty.
        """
        data_len = len(x)
        # Ceiling division: the last batch may be smaller than batch_size.
        num_batch = (data_len + batch_size - 1) // batch_size
        indices = np.random.permutation(data_len)
        x_shuffle = x[indices]
        y_shuffle = y[indices]
        for i in range(num_batch):
            start_id = i * batch_size
            end_id = min(start_id + batch_size, data_len)
            yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

model

checkpoints为保存训练的模型

模型

lr_model.py：
用 softmax 回归构建分类模型，以交叉熵作为损失函数，并计算准确率

import tensorflow as tf


class LrModel(object):
    """Softmax (multinomial logistic) regression built as a TF1 graph.

    Args:
        config: object exposing ``num_classes``; an optional ``learning_rate``
            attribute overrides the default step size of 0.5.
        seq_length: width of the input feature vector, i.e. the number of
            columns of the matrix fed to ``x``.

    Attributes created by :meth:`lr`: ``x``, ``y_`` (placeholders),
    ``y_pred_cls``, ``loss``, ``train_step`` and ``accuracy``.
    """

    def __init__(self, config, seq_length):
        self.config = config
        self.seq_length = seq_length
        self.lr()

    def lr(self):
        """Build the placeholders, linear layer, loss, optimizer and metrics."""
        num_classes = self.config.num_classes

        self.x = tf.placeholder(tf.float32, [None, self.seq_length])
        # w is created before b, so the default variable names
        # stay the same as in previously saved checkpoints.
        w = tf.Variable(tf.zeros([self.seq_length, num_classes]))
        b = tf.Variable(tf.zeros([num_classes]))
        logits = tf.matmul(self.x, w) + b

        # softmax
        y = tf.nn.softmax(logits)
        # Index of the largest probability in each row (axis 1).
        self.y_pred_cls = tf.argmax(y, 1)

        # Cross-entropy loss. It is computed from the logits with the fused op
        # because log(softmax(...)) yields -inf/NaN once a probability
        # underflows to zero.
        self.y_ = tf.placeholder(tf.float32, [None, num_classes])
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y_, logits=logits))
        self.loss = cross_entropy

        learning_rate = getattr(self.config, 'learning_rate', 0.5)
        self.train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)

        # Compare the arg-max of the prediction with the arg-max of the
        # one-hot label row by row; the mean of the matches is the accuracy.
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(self.y_, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

主函数

main.py：
读取数据，训练模型，评估模型，保存模型

import time
from datetime import timedelta
from datahelper.data_process import DataProcess
from config.lr_config import LrConfig
from lr_model import LrModel
import tensorflow as tf


def get_time_dif(start_time):
    """Return the time elapsed since ``start_time`` as a whole-second timedelta."""
    elapsed = time.time() - start_time
    return timedelta(seconds=int(round(elapsed)))


def evaluate(sess, x_, y_):
    """Evaluate mean loss and accuracy on ``x_``/``y_`` in batches of 128.

    Relies on the module-level ``model`` and ``data_get`` objects created in
    the ``__main__`` section.

    Returns:
        tuple: ``(mean_loss, mean_accuracy)``, each weighted by batch size.
    """
    data_len = len(x_)
    total_loss = 0.0
    total_acc = 0.0
    for batch_xs, batch_ys in data_get.batch_iter(x_, y_, 128):
        batch_len = len(batch_xs)
        loss, acc = sess.run([model.loss, model.accuracy],
                             feed_dict={model.x: batch_xs, model.y_: batch_ys})
        # Weight by batch size because the last batch may be smaller.
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len


def get_data():
    """Load the data set and densify the TF-IDF matrices.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, feature_dim)`` where
        ``feature_dim`` is the number of columns of ``X_train``.
    """
    print("Loading training and validation data...")
    X_train, X_test, y_train, y_test = data_get.provide_data()
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    # shape[1] also works when the matrix has no rows, unlike len(X_train[0]).
    return X_train, X_test, y_train, y_test, X_train.shape[1]


def train(X_train, X_test, y_train, y_test):
    """Train the model, checkpointing whenever validation accuracy improves.

    Relies on the module-level ``config``, ``model`` and ``data_get`` objects.
    Training stops early when validation accuracy has not improved for more
    than ``require_improvement`` batches.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    # Configure the Saver and make sure its target directory exists.
    saver = tf.train.Saver()
    save_dir = os.path.dirname(config.lr_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    print("Training and evaluating...")
    start_time = time.time()
    total_batch = 0  # number of batches processed so far
    best_acc_val = 0.0  # best validation accuracy seen so far
    last_improved = 0  # batch index of the last improvement
    require_improvement = 1000  # stop early after this many batches without improvement
    msg = ('Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%}, '
           'Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}')
    flag = False

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(config.num_epochs):
            for batch_xs, batch_ys in data_get.batch_iter(X_train, y_train):
                if total_batch % config.print_per_batch == 0:
                    # NOTE(review): the training metrics are computed on the
                    # whole training set in a single run, which is costly for
                    # large data.
                    loss_train, acc_train = sess.run(
                        [model.loss, model.accuracy],
                        feed_dict={model.x: X_train, model.y_: y_train})
                    loss_val, acc_val = evaluate(sess, X_test, y_test)
                    if acc_val > best_acc_val:
                        # Keep the best result.
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=sess, save_path=config.lr_save_path)
                        improve_str = "*"
                    else:
                        improve_str = ""
                    time_dif = get_time_dif(start_time)
                    print(msg.format(total_batch, loss_train, acc_train,
                                     loss_val, acc_val, time_dif, improve_str))

                sess.run(model.train_step, feed_dict={model.x: batch_xs, model.y_: batch_ys})
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has stalled for too long: stop early.
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            if flag:
                break


# TODO: implement later if needed
def test():
    """Placeholder for a separate test-set evaluation (not implemented).

    Input layouts the author has in mind:
      * one corpus, split into a training and a validation set (what the
        pipeline does at present);
      * two corpora: a training corpus split by sklearn into training and
        validation sets, plus a separate set that is tested here;
      * three corpora: training, validation and test sets, with the test set
        evaluated here.
    """
    return None


if __name__ == "__main__":
    config = LrConfig()
    data_get = DataProcess(
        dataset_path=config.dataset_path,
        stopwords_path=config.stopwords_path,
        model_save_path=config.tfidf_model_save_path,
    )
    # get_data() returns the four data splits followed by the feature width.
    *splits, seq_length = get_data()
    model = LrModel(config, seq_length)
    train(*splits)

预测

predict.py：
TensorFlow通过tf.train.Saver类实现神经网络模型的保存和提取

saver = tf.train.Saver()  # 先要创建一个Saver对象
saver.save(sess=sess, save_path=config.lr_save_path)  # saver.save：保存模型

saver.restore(sess=session, save_path=config.lr_save_path)  # saver.restore：模型提取，重载模型的参数，继续训练或用于测试数据

用模型把一条数据进行预测：

import tensorflow as tf
#from sklearn.externals import joblib
import joblib
import jieba
from config.lr_config import LrConfig
from lr_model import LrModel


def pre_data(data, config):
    """Segment one raw text with jieba and drop stop words.

    Args:
        data: the raw text to classify.
        config: object exposing ``stopwords_path``.

    Returns:
        list: a single-element list holding the space-joined tokens, ready to
        be passed to the TF-IDF vectorizer's ``transform``.
    """
    with open(config.stopwords_path, 'r', encoding='utf-8') as f:
        # Strip only the line terminator (slicing off the last character would
        # truncate a final entry without a newline). A set gives O(1) lookups.
        stopwords = {word.rstrip('\r\n') for word in f}
    tokens = [word for word in jieba.cut(data) if word not in stopwords]
    return [' '.join(tokens)]


def read_categories():
    """Read the '|'-separated category names from the categories file.

    Uses the module-level ``config`` created in the ``__main__`` section.

    Raises:
        ValueError: if the categories file is empty.
    """
    with open(config.categories_save_path, 'r', encoding='utf-8') as f:
        first_line = f.readline().rstrip('\r\n')
    if not first_line:
        raise ValueError('categories file is empty: %s' % config.categories_save_path)
    return first_line.split('|')


def predict_line(data, categories):
    """Restore the saved model and return the predicted category name.

    Uses the module-level ``config`` and ``model``.

    Args:
        data: feature matrix with one row per text; only the first row's
            prediction is returned.
        categories: list of category names indexed by class id.
    """
    # A Saver object has to be created first.
    saver = tf.train.Saver()
    # The context manager releases the session's resources on exit.
    with tf.Session() as session:
        # restore() loads the saved parameters into the graph variables, so
        # no separate variable initialisation is needed beforehand.
        saver.restore(sess=session, save_path=config.lr_save_path)
        y_pred_cls = session.run(model.y_pred_cls, feed_dict={model.x: data})
    return categories[y_pred_cls[0]]


if __name__ == "__main__":
    data = "三星ST550以全新的拍摄方式超越了以往任何一款数码相机"
    config = LrConfig()
    line = pre_data(data, config)
    tfidf_model = joblib.load(config.tfidf_model_save_path)
    X_test = tfidf_model.transform(line).toarray()
    model = LrModel(config, X_test.shape[1])
    categories = read_categories()
    print(predict_line(X_test, categories))

结果

主函数运行结果：

总共训练了13640次，时间56分钟，最好的结果为

Iter: 12640, Train Loss: 0.25, Train Acc: 95.61%, Val Loss: 0.26, Val Acc: 95.63%, Time: 0:52:48 *
训练集损失：0.25，训练集准确度：95.61%
验证集损失：0.26，验证集准确度：95.63%

测试函数运行结果：
测试数据为：

data = "三星ST550以全新的拍摄方式超越了以往任何一款数码相机"

参考代码

https://github.com/Alic-yuan/nlp-beginner-finish/tree/master/task1