中文文本分析（matplotlib库的应用）

Request:
1.结合wordcloud将《红楼梦》、《水浒传》、《三国演义》分别绘制人物的词云图（按照人物出现的频率）
2.分别统计《红楼梦》、《水浒传》、《三国演义》前20个主要人物的出场次数，并绘制出场次数的统计图
3.结合networkx绘制《红楼梦》、《水浒传》、《三国演义》主要人物的社交关系网络图

首先介绍一下代码来源：生成词云、绘制关系网络图的部分参考了《Python语言程序设计》（上海交通大学出版社）一书，我主要写的是新增的类封装和GUI实现。代码里有许多被注释掉的操作没有删除，但它们都有一定的意义（本来想先用plt保存图片，后来嫌麻烦，改为直接用plt自带的界面显示，所以很多存储操作被我注释掉了）。
上代码：

import re
import jieba
import wordcloud
import matplotlib.pyplot as plt
from imageio import imread
import numpy as np
import networkx as nx
import matplotlib
import tkinter as tk

import os

# Use the SimHei font so that Chinese labels render in matplotlib figures.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
class TextAnalysisAPP:
    """Open the text at the given path and analyse the characters in it.

    The class offers three views of one novel: a word cloud of character
    names, a bar chart of the most frequent names, and a co-occurrence
    network of the characters.

    Args:
        filepath: Path of the UTF-8 text to analyse.
        needname_path: Path of a file listing one character name per line.
    """

    count = 20  # maximum number of top characters kept by wordFreq()
    wordFreq_fpath = ''  # path of the frequency file written by wordFreq()
    wordFreq_pc_path = ''  # word-cloud image path; never assigned in this class
    wordFreq_statistics_path = ''  # bar-chart image path; never assigned in this class

    def __init__(self, filepath, needname_path):
        self.filepath = filepath
        self.needname_path = needname_path
        # Per-instance results of the last wordFreq() call, in rank order.
        self.time = []  # occurrence counts
        self.name = []  # character names

    def needwordslist(self):
        """Return the non-empty, stripped lines of the names file."""
        # NOTE(review): the names file is opened with the platform default
        # encoding, exactly as before; confirm which encoding the data
        # files really use before pinning one here.
        with open(self.needname_path, 'r') as f:
            stripped = [line.strip() for line in f]
        return [word for word in stripped if word]

    def getText(self):
        """Return the whole text file as one string (decoded as UTF-8)."""
        with open(self.filepath, 'r', encoding='utf-8') as f:
            return f.read()

    @staticmethod
    def _count_needed(words, needwords):
        """Count the wanted multi-character words, most frequent first.

        Returns a list of ``(word, count)`` tuples. Words with equal counts
        keep the order in which they first appeared in ``words``.
        """
        wanted = set(needwords)
        counts = {}
        for word in words:
            # Single-character tokens are skipped.
            if len(word) > 1 and word in wanted:
                counts[word] = counts.get(word, 0) + 1
        return sorted(counts.items(), key=lambda item: item[1], reverse=True)

    @staticmethod
    def _count_relations(names, paragraphs):
        """Count, per unordered name pair, the paragraphs mentioning both.

        A pair is stored under a single key, in whichever order it was seen
        first, so ``(a, b)`` and ``(b, a)`` never both appear.
        """
        relations = {}
        for text in paragraphs:
            present = [name for name in names if name in text]
            for name1 in present:
                for name2 in present:
                    if name1 != name2 and (name2, name1) not in relations:
                        key = (name1, name2)
                        relations[key] = relations.get(key, 0) + 1
        return relations

    def wordFreq(self):
        """Segment the text with jieba and record the top character counts.

        Stores the ranked names in ``self.name`` and their counts in
        ``self.time``, writes them as ``name<TAB>count`` lines to
        ``<text name>_词频.txt`` and returns that file name. At most
        ``self.count`` entries are kept; fewer if fewer names occur.
        """
        words = jieba.lcut(self.getText().strip())
        ranked = self._count_needed(words, self.needwordslist())
        # Slicing (rather than indexing 0..count-1) tolerates short lists.
        top = ranked[:self.count]
        self.name = [word for word, _ in top]
        self.time = [count for _, count in top]

        # The second-to-last piece after splitting on '/', '\\' and '.'
        # is the file name without directory and extension.
        name = re.split(r'[/\\.]', self.filepath)[-2]
        wordFreq_fpath = name + '_词频.txt'
        # Explicit UTF-8 so Chinese names can be written on any locale.
        with open(wordFreq_fpath, 'w', encoding='utf-8') as f:
            f.writelines('{}\t{}\n'.format(word, count) for word, count in top)
        self.wordFreq_fpath = wordFreq_fpath
        return wordFreq_fpath

    def wordcloud_pc(self):
        """Show a word cloud whose word sizes follow the character counts."""
        self.wordFreq()
        frequencies = dict(zip(self.name, self.time))
        if not frequencies:
            # WordCloud cannot be built from an empty frequency table.
            return
        # The mask image gives the cloud its shape.
        bg_pic = imread(os.path.join('.', '六边形.png'))
        # generate() would tokenise the "name<TAB>count" text and ignore the
        # numbers, weighting every name equally; pass the counts directly.
        wcloud = wordcloud.WordCloud(
            background_color='white',
            font_path=r"C:\Windows\Fonts\simhei.ttf",
            mask=bg_pic,
            width=1000,
            height=860,
            max_words=1000,
            margin=2,  # gap between words
        ).generate_from_frequencies(frequencies)
        plt.imshow(wcloud)
        plt.axis('off')
        plt.show()

    def WordFreqAnalysis(self):
        """Show a horizontal bar chart of the counts stored by wordFreq()."""
        Xi = np.array(self.time)
        Yi = np.array(self.name)
        # Size the axis from the data actually available, not self.count.
        x = np.arange(len(Xi))
        width = 0.6
        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
        plt.figure(figsize=(8, 6))
        plt.barh(x, Xi, width, color='SkyBlue', alpha=0.8)  # alpha: bar opacity
        plt.xlabel('time')
        plt.ylabel('name')
        for value, row in zip(Xi, x):
            # ha is the horizontal alignment, va the vertical alignment.
            plt.text(value + 10, row - 0.4, '%d' % int(value),
                     ha='center', va='bottom')
        plt.yticks(x, Yi)  # label each bar with its character name
        plt.show()

    def CharacterRelation(self):
        """Show a network of characters that share paragraphs.

        Edge weights are co-occurrence counts divided by the largest count;
        edges with a weight of 0.2 or less are dropped. Nothing is drawn
        when no two names ever share a paragraph.
        """
        paragraphs = self.getText().split('\n')
        relations = self._count_relations(self.needwordslist(), paragraphs)
        if not relations:
            return
        maxRela = max(relations.values())
        relations = {pair: n / maxRela for pair, n in relations.items()}
        relations = {pair: w for pair, w in relations.items() if w > 0.2}

        plt.figure(figsize=(15, 15))
        G = nx.Graph()
        for (left, right), weight in relations.items():
            G.add_edge(left, right, weight=weight)

        # Split the edges into three strength bands for styling.
        elarge = [(u, v) for u, v, d in G.edges(data=True)
                  if d['weight'] > 0.6]
        emidle = [(u, v) for u, v, d in G.edges(data=True)
                  if 0.3 < d['weight'] <= 0.6]
        esmall = [(u, v) for u, v, d in G.edges(data=True)
                  if d['weight'] <= 0.3]

        pos = nx.spring_layout(G)
        nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=800)
        nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5,
                               alpha=0.9, edge_color='g')
        nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5,
                               alpha=0.6, edge_color='y')
        nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1,
                               alpha=0.4, edge_color='b', style='dashed')
        nx.draw_networkx_labels(G, pos, font_size=12)
        plt.show()

class GUIAPP:
    """Tkinter front end: choose a novel, then choose one of three analyses.

    A single root window is kept for the whole session. Navigating between
    the book menu and a book's analysis menu replaces the widgets inside
    that window instead of destroying the root and starting a nested
    ``mainloop()`` from within a button callback.
    """

    _BOOKS = ('红楼梦', '水浒传', '三国演义')

    def __init__(self):
        self.root = None

    @staticmethod
    def _book_paths(name):
        """Return ``(text path, names-file path)`` for the novel ``name``."""
        filepath = os.path.join('.', '文本汇总', '{}.txt'.format(name))
        needname_path = os.path.join('.', '人物汇总', '{}人物.txt'.format(name))
        return filepath, needname_path

    def _analysis_for(self, name):
        """Build the analyser for the novel ``name``."""
        filepath, needname_path = self._book_paths(name)
        return TextAnalysisAPP(filepath, needname_path)

    def _clear_window(self):
        """Remove every widget currently shown in the root window."""
        for child in self.root.winfo_children():
            child.destroy()

    def _show_book_menu(self):
        """Fill the root window with one button per novel."""
        self._clear_window()
        for book in self._BOOKS:
            # Bind ``book`` as a default so each button keeps its own title.
            tk.Button(self.root, text=book,
                      command=lambda book=book: self.first_button(book)).pack()

    def operate(self):
        """Create the window, show the book menu and run the event loop."""
        self.root = tk.Tk()
        self.root.title("GUI的测试窗口")
        self.root.geometry("300x400+500+100")
        self._show_book_menu()
        self.root.mainloop()

    def first_button(self, name):
        """Show the analysis menu for the novel ``name``."""
        self._clear_window()
        frame = tk.Frame(self.root)
        frame.pack()
        tk.Button(frame, text=name + '词云',
                  command=lambda: self.wordcloud_button(name)).pack()
        tk.Button(frame, text=name + '人物统计',
                  command=lambda: self.statistics_button(name)).pack()
        tk.Button(frame, text=name + '人物关系',
                  command=lambda: self.relations_button(name)).pack()
        tk.Button(frame, text='返回',
                  command=lambda: self.return_button(name)).pack()

    def wordcloud_button(self, name):
        """Show the character word cloud for ``name``."""
        self._analysis_for(name).wordcloud_pc()

    def statistics_button(self, name):
        """Show the character frequency bar chart for ``name``."""
        analysis = self._analysis_for(name)
        analysis.wordFreq()  # fills the counts that the chart plots
        analysis.WordFreqAnalysis()

    def relations_button(self, name):
        """Show the character relationship network for ``name``."""
        self._analysis_for(name).CharacterRelation()

    def return_button(self, name):
        """Go back to the book menu (``name`` is accepted but unused)."""
        self._show_book_menu()

def main():
    """Launch the GUI.

    The analyses can also be run without the GUI, for example::

        app = TextAnalysisAPP('./文本汇总/红楼梦.txt', './人物汇总/红楼梦人物.txt')
        app.wordcloud_pc()        # word cloud
        app.wordFreq()            # writes the frequency file, returns its name
        app.WordFreqAnalysis()    # frequency bar chart
        app.CharacterRelation()   # character relationship network
    """
    gui = GUIAPP()
    gui.operate()


if __name__ == '__main__':
    main()

最后附上运行视频

中文文本分析（matplotlib的库的应用）相关推荐

python 中文文本分析
中文文本分析 Mac 安装pip 和 jieba curl https://bootstrap.pypa.io/get-pip.py | python3 你可以接着输入 pip --version 看 ...
Python中文文本分析基础
文章目录一. 中文文本分析相关库 1. 中文分词jieba库 (1). jieba库概述 (2). jieba库安装 (3). jieba分词原理 (4). jieba库的使用说明 1. 精确模式 ...
中文文本分析, Text-Analysis
中文文本分析, Text-Analysis Text-Analysis包括analysis-word 词语分析和analysis-classify 文本分类数据分析等, 支持python3读写word ...
中文文本分析（3）--文本相似度
中文文本分析(3)--文本相似度 1. 需要的包 2.流程 3.代码应用场景: ①信息检索,通过相似度识别相似的词语,找出与检索词语相似的结果. ②自动问答,通过关键词进行搜索问题,相似程度最高的问 ...
Python中文文本分析时遇到的编码问题小结
最近在做python的中文文本分析,万事开头难,最开始就发现了很多中文编码问题,以下对问题进行了汇总. 问题1:中文文本读取时遇到的编码问题在读取文件后,直接print就会产生问题,如下: 解决方案 ...
南京邮电大学C语言中文文本分析处理
南京邮电大学C语言中文文本分析处理程序设计题2:中文文本分析处理 1问题描述编写一个程序,对一篇中文文章进行分析和处理. 2功能要求要能提供以下几个基本功能: (1)从硬盘读入事先录入的中文文档 ...
ik分词和jieba分词哪个好_Python 中文文本分析实战：jieba分词+自定义词典补充+停用词词库补充+词频统计...
最近项目需要,实现文本的词频分析,折腾了几天才完成任务,有点成就感,最后整理总结一下这部分的内容,希望更多同僚受益. 一.使用前准备环境:Python3.6 安装结巴:pip install ji ...
python中文文本分析_python--文本分析
一. 导读文本分析主要用来分词分析,情感分析以及主题分析,参考知乎用户的文章,他从方法代码上讲解了中英文分词(wordcloud,jieba),中英文情感分析(textblob,snownlp), ...
python中文文本分析_python使用snownlp进行中文文本处理以及分词和情感分析 - pytorch中文网...
SnowNLP: 一个简单的中文文本处理库 SnowNLP是一个python写的类库,可以方便的处理中文文本内容,是受到了TextBlob的启发而写的,由于现在大部分的自然语言处理库基本都是针对英文的 ...
python中文文本分析_Python有趣|中文文本情感分析
前言前文给大家说了python机器学习的路径,这光说不练假把式,这次,罗罗攀就带大家完成一个中文文本情感分析的机器学习项目,今天的流程如下: 数据情况和处理数据情况这里的数据为大众点评上的评论数 ...

中文文本分析（matplotlib的库的应用）

中文文本分析（matplotlib的库的应用）

import os

中文文本分析（matplotlib的库的应用）相关推荐

最新文章

热门文章