Text Data Mining: Implementing Text Classification

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: 廖文龙
# datetime: 2021/10/20 13:06
# ide: PyCharm
# Copyright © 2021 WellonLeo. All rights reserved.
import os
import pandas as pd
import torch
import torch.nn as nn
from partition import partition
from vectorize import vectorize
import numpy as np
# Work around the "duplicate OpenMP runtime" abort that can occur when
# NumPy and PyTorch each ship their own libiomp
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# First, implement a numerically stable softmax
def softmax(x):
    if len(x.shape) > 1:
        # matrix: shift each row by its max before exponentiating
        tmp = np.max(x, axis=1)
        x -= tmp.reshape((x.shape[0], 1))
        x = np.exp(x)
        tmp = np.sum(x, axis=1)
        x /= tmp.reshape((x.shape[0], 1))
    else:
        # vector
        tmp = np.max(x)
        x -= tmp
        x = np.exp(x)
        tmp = np.sum(x)
        x /= tmp
    return x
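A quick sanity check of the stabilized softmax (illustrative values only, not data from the experiment). Note that the function modifies its argument in place:

# Illustrative sanity check: every row of the output should sum to 1
scores = np.array([[2.0, 1.0, 0.1],
                   [1.0, 3.0, 0.2]])
probs = softmax(scores.copy())  # copy(): softmax mutates its argument in place
print(probs.sum(axis=1))        # -> [1. 1.]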
# -------------------------------------------------- divide --------------------------------------------------
def divide(srcFilePath, trainingData, testData):
    f = open(srcFilePath, 'r')
    lines = f.readlines()
    # shuffled 70/30 train/test split
    partitioned_data = partition(instances=lines, proportion=[0.7, 0.3], shuffle=True)
    f2 = open(trainingData, 'w')
    f3 = open(testData, 'w')
    for each in partitioned_data[0]:
        f2.write(each)
    for each in partitioned_data[1]:
        f3.write(each)
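The partition helper comes from a local module that is not shown here. Assuming it splits a list into sub-lists by the given proportions, optionally shuffling first, an equivalent sketch would be:

# Minimal sketch of what the local partition() is assumed to do:
import random

def partition_sketch(instances, proportion, shuffle=False):
    data = list(instances)
    if shuffle:
        random.shuffle(data)
    cut = int(len(data) * proportion[0])
    return [data[:cut], data[cut:]]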
# -------------------------------------------------- vectorize --------------------------------------------------
def vectorize_dataset(trainingData, vectorizedTrainingData, dictPath):
    vectorize(trainingData, dictPath, vectorizedTrainingData)


# -------------------------------------------------- train --------------------------------------------------
def collect_data(vectorizedTrainingData, words_dict_length):
    # Each line of the vectorized file is "label<TAB>idx idx idx ...";
    # every document becomes a multi-hot bag-of-words vector.
    f = open(vectorizedTrainingData, 'r')
    lines = f.readlines()
    x_all = []
    y_all = []
    y_all_hated = []
    for line in lines:
        y_all.append(line.split('\t')[0])
        vec = np.zeros((1, words_dict_length))
        for i in line.split('\t')[1].strip().split(' '):
            vec[0][int(i)] = 1
        x_all.append(vec)
    cats_num = len(set(y_all))
    # NOTE: set() ordering is arbitrary, so the label->index mapping can
    # differ between runs (and between the training and test files).
    cats_list = [i for i in set(y_all)]
    for i in y_all:
        i = cats_list.index(i)
        y_all_hated.append(i)
    # print(cats_num, cats_list)
    return y_all_hated, np.array(x_all).squeeze()


def gen_training_data(y_all_hated, x_all, iteration, batch_size):
    y_iter_data = y_all_hated[iteration * batch_size:(iteration + 1) * batch_size]
    x_iter_data = x_all[iteration * batch_size:(iteration + 1) * batch_size]
    return y_iter_data, x_iter_data


def forward(W, X, Y, ifCalcloss, ifBackprop, LR, epoch, iter):
    # core: linear scores
    Y_pred = X.dot(W.transpose())
    # softmax
    Y_pred = softmax(Y_pred)
    # loss function
    loss = 0
    predict_result = []
    if ifCalcloss:
        t, f = 0, 0
        for i in range(0, len(Y)):
            loss -= np.log(Y_pred[i, int(Y[i])])
            predict_result.append(np.argmax(Y_pred[i, :]))
            if Y[i] == np.argmax(Y_pred[i, :]):
                t += 1
            else:
                f += 1
        acc = t / (t + f)
        print('epoch:', epoch, 'iteration:', iter, 'Loss', loss, 'accuracy', acc)
    # backprop
    if ifBackprop:
        grads = np.zeros_like(W)
        for m in range(len(Y)):
            # for k in range(W.shape[0]):
            # compute gradients (only the true-label row is updated)
            for i in range(W.shape[-1]):
                if np.argmax(Y_pred[m, :]) == Y[m]:
                    zhishifunc = 1  # indicator: 1 if prediction matches the true label
                else:
                    zhishifunc = 0
                grads[Y[m], i] -= (zhishifunc - Y_pred[m, Y[m]]) * X[m, i]
        grads = grads / len(Y)
        # update weights
        W = W - LR * grads
    return W, predict_result
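For reference only (this is not the author's update rule): the textbook gradient of the averaged softmax cross-entropy loss updates every class row and uses the true label rather than the argmax, i.e. grad[k] = mean((p_k - 1[k == y]) * x). A vectorized sketch under the same shapes (W: classes x features, X: batch x features):

# Textbook softmax-regression gradient, vectorized (reference sketch only;
# forward() above instead updates only the true-label row):
def softmax_grad_sketch(W, X, Y):
    P = softmax(X.dot(W.T))                 # (batch, classes) probabilities
    onehot = np.zeros_like(P)
    onehot[np.arange(len(Y)), Y] = 1.0      # 1[k == y]
    return (P - onehot).T.dot(X) / len(Y)   # (classes, features)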
# -------------------------------------------------- test --------------------------------------------------
def test(testData, dictPath, answerPath):
    W = np.load('model_weights3.npy')
    print(W)
    f = open(dictPath, 'r')
    lines = f.readlines()
    words_dict_length = len(lines)
    y_all_hated, x_all = collect_data(testData, words_dict_length)
    # append a constant-1 column so the last weight column acts as a bias
    one_column_test = np.full(x_all.shape[0], 1)
    x_all_test = np.column_stack((x_all, one_column_test))
    print('testing...................................')
    W, pred_res = forward(W, x_all_test, y_all_hated, True, False, 0.01, 0, 0)
    df = pd.DataFrame({'ground_truth': y_all_hated, 'pred': pred_res})
    df.to_csv(answerPath, sep='\t', header=False, index=False)
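test() writes one "ground_truth<TAB>prediction" pair per line, which is exactly the format evaluate() parses back. A toy round trip (hypothetical labels; toy_answer.txt is a scratch file, not part of the experiment). Note in passing that test() loads model_weights3.npy while the commented-out training driver below saves model_weights2.npy:

# Round-trip sketch of the answer-file format (hypothetical labels):
df_demo = pd.DataFrame({'ground_truth': [0, 1, 1], 'pred': [0, 2, 1]})
df_demo.to_csv('toy_answer.txt', sep='\t', header=False, index=False)
with open('toy_answer.txt') as fa:
    for line in fa:
        gt, pr = int(line.split('\t')[0]), int(line.split('\t')[1])
        print(gt, pr)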
# -------------------------------------------------- evaluate --------------------------------------------------
def evaluate(answerPath, resultPath):
    f = open(answerPath, 'r')
    f2 = open(resultPath, 'w+')
    all_cats = ['城市管理', '城乡住房', '交通管理', '环境保护', '农林牧副渔']
    lines = f.readlines()
    ground_truth = []
    label_num = 5
    pred = []
    confMatrix = np.zeros([label_num, label_num], dtype=np.int32)
    label_prec = []
    label_recall = []
    label_F1score = []
    for line in lines:
        label, pred_label = int(line.split("\t")[0]), int(line.split("\t")[1])
        ground_truth.append(label)
        pred.append(pred_label)
    # rows: ground truth, columns: prediction
    for i in range(len(ground_truth)):
        true_labels_idx = ground_truth[i]
        predict_labels_idx = pred[i]
        confMatrix[true_labels_idx][predict_labels_idx] += 1
    total_sum = confMatrix.sum()
    correct_sum = (np.diag(confMatrix)).sum()
    acc = round(100 * float(correct_sum) / float(total_sum), 2)
    f2.write('Accuracy=' + str(acc / 100.0000) + '\n')
    for i in range(label_num):
        # calc precision
        label_total_sum = confMatrix.sum(axis=0)[i]
        label_correct_sum = confMatrix[i][i]
        prec = 0
        if label_total_sum != 0:
            prec = round(100 * float(label_correct_sum) / float(label_total_sum), 2)
        label_prec.append(prec)
        # calc recall
        label_total_sum = confMatrix.sum(axis=1)[i]
        label_correct_sum = confMatrix[i][i]
        recall = 0
        if label_total_sum != 0:
            recall = round(100 * float(label_correct_sum) / float(label_total_sum), 2)
        label_recall.append(recall)
        # calc F1 score
        if (prec + recall) == 0:
            F1 = 0
        else:
            F1 = round(2 * prec * recall / (prec + recall), 2)
        label_F1score.append(F1)
    for i in range(label_num):
        f2.write(all_cats[i] + '\t' + 'Precision=' + str(label_prec[i] / 100.0000) + '\t'
                 + 'Recall=' + str(label_recall[i] / 100.0000) + '\t'
                 + 'F-Measure=' + str(label_F1score[i] / 100.0000) + '\n')


dictPath = 'indexed_text_mining_data.txt'
trainingData='training.txt'
# testData='testData.txt'
srcFilePath='text_mining_data.txt'
destFilePath='dest.txt'
test_destFilePath='test_dest.txt'
answerPath='predAnswer.txt'
resultPath='eval_result.txt'
test(test_destFilePath,dictPath,answerPath)
evaluate(answerPath,resultPath)
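
To make the metric arithmetic in evaluate() concrete, here is the same precision/recall/F1 computation on a toy 2x2 confusion matrix (hypothetical counts):

# Toy check of the metric arithmetic in evaluate() (hypothetical counts):
cm = np.array([[8, 2],   # rows: ground truth, columns: prediction
               [4, 6]])
prec_0 = cm[0][0] / cm.sum(axis=0)[0]         # 8 / 12 ≈ 0.667
rec_0 = cm[0][0] / cm.sum(axis=1)[0]          # 8 / 10 = 0.8
f1_0 = 2 * prec_0 * rec_0 / (prec_0 + rec_0)  # ≈ 0.727
print(prec_0, rec_0, f1_0)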
# f=open(dictPath,'r')
# lines=f.readlines()
# words_dict_length=len(lines)
#
# f2=open(trainingData,'r')
# lines=f2.readlines()
# cats_list=[item.split('\t')[0] for item in lines]
# cats_num= len(set(cats_list))
# print(cats_num, words_dict_length)
# # all_cats=['科教文体广新', '城市管理', '城乡住房', '交通管理', '环境保护', '国土与拆迁安置', '民政', '市场监督', '公安政法', '劳动保障']
# all_cats=['城市管理', '城乡住房', '交通管理', '环境保护', '农林牧副渔']
# # vectorize(srcFilePath,dictPath,destFilePath)
# # vectorize(srcFilePath,dictPath,test_destFilePath)
# y_all_hated,x_all=collect_data(destFilePath,words_dict_length)
# # y_all_hated_test,x_all_test=collect_data(test_destFilePath,words_dict_length)
#
# # y_all_hated=y_all_hated[:2000]
# # x_all=x_all[:2000]
# one_column=np.full(x_all.shape[0],1)
# x_all=np.column_stack((x_all,one_column))
# # one_column_test=np.full(x_all_test.shape[0],1)
# # x_all_test=np.column_stack((x_all_test,one_column_test))
# W=np.load('model_weights2.npy')
# # W=np.random.randn(cats_num,words_dict_length+1)
# # W=np.zeros([cats_num,words_dict_length+1])
# # -----------set epoch iteration batch_size
# epoch=50
# iteration=100
# for j in range(epoch):
#     #shuffle
#     state = np.random.get_state()
#     np.random.shuffle(x_all)
#     np.random.set_state(state)
#     np.random.shuffle(y_all_hated)
#
#     LR=0.01
#     for i in range(iteration):
#         Y,X=gen_training_data(y_all_hated,x_all,i,128)
#         W,pred_res=forward(W,X,Y,True,True,LR,j,i)
#
#
#     np.save('model_weights2.npy',W)
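
The commented-out training loop pairs its two shuffles by saving and restoring the NumPy RNG state, so that features and labels are permuted identically. A self-contained demonstration of that trick:

# Demonstration of the shuffle-in-unison trick used in the training loop:
a = np.arange(5)
b = list(range(5))
state = np.random.get_state()
np.random.shuffle(a)        # permutes a
np.random.set_state(state)  # rewind the RNG...
np.random.shuffle(b)        # ...so b receives the same permutation
print(a, b)                 # identical orderings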

Contents of eval_result.txt after running the script (for context, random guessing over five classes would score about 0.20):

Accuracy=0.1825
城市管理    Precision=0.1616    Recall=0.0985    F-Measure=0.1224
城乡住房    Precision=0.2753    Recall=0.3694    F-Measure=0.3155
交通管理    Precision=0.1558    Recall=0.0347    F-Measure=0.0568
环境保护    Precision=0.112     Recall=0.235     F-Measure=0.1517
农林牧副渔  Precision=0.2432    Recall=0.1926    F-Measure=0.215
