# The ten text categories, in the fixed order used throughout this module.
# Index 9, 'sexual', is the catch-all for any unrecognised folder/name
# (matching the trailing `else` of the original if/elif chains).
CATEGORIES = ['baby', 'car', 'food', 'health', 'legend',
              'life', 'love', 'news', 'science', 'sexual']


def get_traindata():
    """Build the training corpus from F:/train_data.

    Each sub-folder of the training directory is one category; every file in
    it is one whitespace-tokenised document (assumed UTF-8).

    Returns:
        sum_list:   list of token lists, one per training document,
                    ordered folder by folder, file by file.
        sum_dict:   category name -> 0/1 membership vector over all documents.
        vocab_list: vocabulary kept after per-class TF-IDF feature selection.
    """
    folder_path = 'F:/train_data'
    folder_list = os.listdir(folder_path)
    sum_list = []   # all documents as token lists
    corpus = []     # one concatenated pseudo-document per class
    sum_dict = {}   # class name -> 0/1 membership vector

    print('正在生成训练集总文本向量...')
    for folder in folder_list:
        new_folder_path = folder_path + '/' + str(folder)
        for file in os.listdir(new_folder_path):
            tokens = []
            with open(new_folder_path + '/' + file, 'r', encoding='utf-8') as fp:
                for ln in fp:
                    tokens.extend(ln.strip().split(' '))
            sum_list.append(tokens)
    print(len(sum_list))
    print('生成完毕!')

    sum_num = len(sum_list)   # total number of documents
    past_num = 0              # documents seen before the current class
    print('正在生成词典...')
    for folder in folder_list:
        new_folder_path = folder_path + '/' + str(folder)
        files = os.listdir(new_folder_path)
        train_num = len(files)
        class_tokens = []
        for file in files:
            with open(new_folder_path + '/' + file, 'r', encoding='utf-8') as fp:
                for ln in fp:
                    class_tokens.extend(ln.strip().split(' '))
        # CountVectorizer expects strings, so the token list is stringified
        # wholesale (this reproduces the original behaviour exactly).
        corpus.append(str(class_tokens))
        # 0/1 vector marking which documents belong to this class.
        sum_dict[str(folder)] = ([0] * past_num + [1] * train_num
                                 + [0] * (sum_num - past_num - train_num))
        past_num += train_num

    # TF-IDF over the per-class pseudo-documents; for every class keep the
    # terms whose weight reaches the 6th-largest weight of that class row.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # newer versions need get_feature_names_out() — confirm installed version.
    word = vectorizer.get_feature_names()
    weight = tfidf.toarray()
    kept = set()
    for row in weight:
        threshold = sorted(row, reverse=True)[5]
        for j in range(len(word)):
            if row[j] >= threshold:
                kept.add(j)
    vocab_list = [word[j] for j in kept]

    print('生成的词典为:%s' % str(vocab_list))
    # BUG FIX: the original left this handle unclosed on error paths.
    with open('F:/myVocab.txt', 'w') as vocab_file:
        vocab_file.write(str(vocab_list))
    return sum_list, sum_dict, vocab_list


def createVocablist(dataSet):
    """Return the deduplicated vocabulary (as a list) of all documents."""
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)
    return list(vocabSet)


def setOfWords2Vec(vocablist, inputSet):
    """Convert a token list into a 0/1 set-of-words vector over vocablist.

    Tokens absent from the vocabulary are silently ignored.
    """
    # Map each word to its first position once, instead of a linear
    # vocablist.index() scan per token (the original was O(V) per word).
    position = {}
    for i, w in enumerate(vocablist):
        position.setdefault(w, i)
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in position:
            returnVec[position[word]] = 1
    return returnVec


def trainNBO(trainMatrix, sum_dic):
    """Train the naive Bayes model.

    Args:
        trainMatrix: 2-D array of 0/1 set-of-words document vectors.
        sum_dic:     category name -> 0/1 membership vector (see get_traindata).

    Returns:
        Ten per-class log-probability vectors followed by ten class priors,
        both in CATEGORIES order (same 20-tuple as the original).
    """
    numTrainDocs = len(trainMatrix)   # total number of documents
    numWords = len(trainMatrix[0])    # vocabulary size

    # Class priors come from the per-folder document counts on disk.
    folder_path = 'F:/train_data'
    priors = {}
    for folder in os.listdir(folder_path):
        train_num = len(os.listdir(folder_path + '/' + str(folder)))
        name = str(folder)
        if name not in CATEGORIES[:9]:
            name = 'sexual'   # catch-all, as in the original else branch
        priors[name] = train_num / float(numTrainDocs)

    # Laplace smoothing: counts start at 1, denominators at the vocab size.
    counts = {c: ones(numWords) for c in CATEGORIES}
    denoms = {c: numWords for c in CATEGORIES}
    for i in range(numTrainDocs):
        for c in CATEGORIES[:9]:
            if sum_dic[c][i] == 1:
                label = c
                break
        else:
            label = 'sexual'
        counts[label] += trainMatrix[i]
        denoms[label] += sum(trainMatrix[i])

    vects = tuple(log(counts[c] / denoms[c]) for c in CATEGORIES)
    return vects + tuple(priors[c] for c in CATEGORIES)


def classify(vec2Classify, p0Vec, p1Vec, p2Vec, p3Vec, p4Vec, p5Vec, p6Vec,
             p7Vec, p8Vec, p9Vec, pClass0, pClass1, pClass2, pClass3, pClass4,
             pClass5, pClass6, pClass7, pClass8, pClass9):
    """Return the category name with the largest posterior log-probability."""
    vecs = (p0Vec, p1Vec, p2Vec, p3Vec, p4Vec,
            p5Vec, p6Vec, p7Vec, p8Vec, p9Vec)
    priors = (pClass0, pClass1, pClass2, pClass3, pClass4,
              pClass5, pClass6, pClass7, pClass8, pClass9)
    scores = [sum(vec2Classify * v) + log(p) for v, p in zip(vecs, priors)]
    # Ties resolve to the lowest index, matching the original if/elif chain.
    return CATEGORIES[scores.index(max(scores))]


def Result(str1, str2, r):
    """Increment confusion matrix r at [actual][predicted] and return it.

    Unknown category names fall into row/column 9 ('sexual'), as in the
    original trailing else branches.
    """
    a = CATEGORIES.index(str1) if str1 in CATEGORIES else 9
    b = CATEGORIES.index(str2) if str2 in CATEGORIES else 9
    r[a][b] += 1
    return r


def Output_result(r):
    """Print per-class rates and overall accuracy from the confusion matrix.

    r is a 10x10 matrix: r[i][j] counts documents of actual class i that
    were classified as class j.
    """
    row_totals = [sum(row) for row in r]
    sum_all = sum(row_totals)
    sum_right = 0
    for i in range(10):
        sum_right += r[i][i]

    p_rate = [0] * 10
    r_rate = [0] * 10
    f_meas = [0] * 10
    for i in range(10):
        # BUG FIX: the original divided every class by sum(r[0]) (the first
        # class's row total) instead of the class's own row total.
        p_rate[i] = r[i][i] / sum(r[i])
        col_total = 0
        for j in range(10):
            col_total += r[j][i]
        r_rate[i] = r[i][i] / col_total
        f_meas[i] = (2 * p_rate[i] * r_rate[i]) / (p_rate[i] + r_rate[i])

    print('baby类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[0] * 100, r_rate[0] * 100, f_meas[0] * 100))
    print('car类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[1] * 100, r_rate[1] * 100, f_meas[1] * 100))
    print('food类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f ' % (p_rate[2] * 100, r_rate[2] * 100, f_meas[2] * 100))
    print('health类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[3] * 100, r_rate[3] * 100, f_meas[3] * 100))
    print('legend类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[4] * 100, r_rate[4] * 100, f_meas[4] * 100))
    print('life类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[5] * 100, r_rate[5] * 100, f_meas[5] * 100))
    print('love类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[6] * 100, r_rate[6] * 100, f_meas[6] * 100))
    print('news类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[7] * 100, r_rate[7] * 100, f_meas[7] * 100))
    print('science类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[8] * 100, r_rate[8] * 100, f_meas[8] * 100))
    print('sexual类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f' % (p_rate[9] * 100, r_rate[9] * 100, f_meas[9] * 100))
    print('平均准确率为:%.2f%%' % (sum(p_rate) * 10))
    print('平均召回率为:%.2f%%' % (sum(r_rate) * 10))
    print('总的分类正确率为:%d%%' % ((sum_right / sum_all) * 100))


def Training():
    """Train on F:/train_data and evaluate on the same documents.

    NOTE(review): evaluation reuses the training set, so the reported rates
    are training accuracy, not generalisation performance.
    """
    sum_list, sum_dict, myVocabList = get_traindata()
    print('维度为:%d' % len(myVocabList))
    trainMat = []
    print(len(sum_list))
    for postinDoc in sum_list:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    model = trainNBO(array(trainMat), sum_dict)

    r = [[0] * 10 for _ in range(10)]
    folder_path = 'F:/train_data'
    for folder in os.listdir(folder_path):
        print('正在对%s类文本进行分类......' % folder)
        new_folder_path = folder_path + '/' + str(folder)
        for file in os.listdir(new_folder_path):
            tokens = []
            with open(new_folder_path + '/' + file, 'r', encoding='utf-8') as fp:
                for ln in fp:
                    tokens.extend(ln.strip().split(' '))
            test_vect = setOfWords2Vec(myVocabList, tokens)
            r = Result(folder, classify(test_vect, *model), r)
        print('对%s类文本分类完毕!' % folder)
    print(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], sep='\n')
    Output_result(r)


if __name__ == '__main__':
    Training()

手写朴素贝叶斯文本分类相关推荐

  1. 【NLP实战系列】朴素贝叶斯文本分类实战

    实战是学习一门技术最好的方式,也是深入了解一门技术唯一的方式.因此,NLP专栏计划推出一个实战专栏,让有兴趣的同学在看文章之余也可以自己动手试一试. 本篇介绍自然语言处理中一种比较简单,但是有效的文本 ...

  2. 朴素贝叶斯文本分类java_朴素贝叶斯文本分类简单介绍

    本文介绍朴素贝叶斯算法如何对文本进行分类.比如,每个用户的购物评论就是一篇文本,识别出这篇文本属于正向评论还是负面评论 就是分类的过程,而类别就是:{正面评论,负面评论}.正面评论为Positive, ...

  3. 朴素贝叶斯文本分类(python代码实现)

    朴素贝叶斯(naive bayes)法是基于贝叶斯定理与特征条件独立假设的分类方法. 优点:在数据较少的情况下仍然有效,可以处理多分类问题. 缺点:对输入数据的准备方式较为敏感. 使用数据类型:标称 ...

  4. 朴素贝叶斯文本分类java_基于朴素贝叶斯的文本分类算法

    基于朴素贝叶斯的文本分类算法 摘要:常用的文本分类方法有支持向量机.K-近邻算法和朴素贝叶斯.其中朴素贝叶斯具有容易实现,运行速度快的特点,被广泛使用.本文详细介绍了朴素贝叶斯的基本原理,讨论多项式模 ...

  5. python朴素贝叶斯的文本分类_自给自足,完全手写一个朴素贝叶斯分类器,完成文本分类...

    Part 1: 本文解决的问题: 我在有这样的一个数据集,里面存放了人们对近期播放电影的评价,当然评价也就分成两部分,好评和差评.我们想利用这些数据训练一个模型,然后可以自动的对影评做出判断,到底是好 ...

  6. 八、朴素贝叶斯中文分类实战

    1.朴素贝叶斯中文分类实战 文本分类的流程如下图所示: 朴素贝叶斯中文分类的目录结构 中文分类的目录机构包括停用词文件.训练集文件和和测试集文件,具体内容如下图所示: 2 数据准备与处理 2.1 数据 ...

  7. 朴素贝叶斯实现分类_关于朴素贝叶斯分类及其实现的简短教程

    朴素贝叶斯实现分类 Naive Bayes classification is one of the most simple and popular algorithms in data mining ...

  8. 朴素贝叶斯算法-分类算法

    朴素贝叶斯算法-分类算法 1 概率基础 概率定义为一件事情发生的可能性 联合概率:包含多个条件,且所有条件同时成立的概率,记作P(A,B) 条件概率:事件A在另一个事件B已经发生条件下的发生概率,记作 ...

  9. r包调用legend函数_R语言实现基于朴素贝叶斯构造分类模型数据可视化

    本文内容原创,未经作者许可禁止转载! 目录 一.前言 二.摘要 三.关键词 四.算法原理 五.经典应用 六.R建模 1.载入相关包(内含彩蛋): 1.1 library包载入 1.2 pacman包载 ...

  10. 贝叶斯文本分类python_scikit_learn 中朴素贝叶斯算法做文本分类的 实践总结

    朴素贝叶斯算法对于分类非常高效 想了解的可以参考这篇博文:贝叶斯从浅入深详细解析,详细例子解释 - zwan0518的专栏 - 博客频道 - CSDN.NET贝叶斯从浅入深 先来做个小小总结说明 在这 ...

最新文章

  1. java转换ip地址格式转换_Java编程IP地址和数字相互转换代码示例
  2. 大小端判断和网络字节序
  3. hashMap和hashTable的区别(个人总结)
  4. 自定义DrawableTextView——实现TextView左上右下的点击监听
  5. 服务降级,服务熔断,服务限流
  6. C# 事件详解附实例分析
  7. 在centos8 stream启用 Extra Packages
  8. Strange Memory Gym - 102832F
  9. C语言整数与字符串相互转换
  10. 从技术到应用实践,揭秘京东区块链布局全景
  11. idea报错:Lambda expressions are not supported at language level '7'
  12. 前端开发IDE---VSCode前端开发环境配置
  13. 【Unity】制作一个商店场景
  14. 计算机表格列宽怎么设置,excel自动调整列宽在哪?excel中怎么自动调整各行宽度...
  15. 使用XPlanner进行敏捷项目计划和进度跟踪管理
  16. dm8127 A8 yuv420sp 送入到videoM3编码--已经解决
  17. js设计模式--代理模式
  18. 网络扫描工具Nmap使用教程(1)
  19. 【沙龙预告】移动媒体产品新趋势
  20. 洗衣服wash 题解

热门文章

  1. 2020年Google SEO 8大趋势
  2. SGX攻防部分POC
  3. excel去掉单元格的隐藏字符
  4. 2021牛客寒假算法基础集训营1 C 无根树问题的处理策略 前序后序遍历 奇偶匹配 DFS
  5. c语言自定义函数返回值的作用,C语言自定义函数
  6. python死循环_Python for死循环
  7. matlab中将数据存为dat格式,matlab中将数据保存为txt或dat格式四种方案
  8. 关于机器人方面的sci论文_近十年机器人学科中国学者SCI十大发文期刊 - 论文投稿 - 小木虫 - 学术 科研 互动社区...
  9. php trying to get,php 做微信认证登陆 返回错误 Trying to get property of non-object
  10. 怎样设定计算机屏幕锁定时间,电脑屏幕锁屏时间怎么设置