用python实现小说的平均句长，词性占比，关键词，标点符号，词形统计

需求如下

代码：

词性占比

import jieba
from wordcloud import WordCloud
import re
from PIL import Image
import matplotlib.pyplot as plt


def read_file_gbk(filename):
    """Read a GBK-encoded text file and return its content as one compact string.

    Every literal "/C" marker and every whitespace character (spaces, tabs,
    carriage returns, newlines) is removed from the text.

    Args:
        filename: Path of the GBK-encoded file to read.

    Returns:
        The cleaned file content as a single string.
    """
    with open(filename, 'r', encoding='GBK') as f:
        s = f.read()
    s = re.sub('/C', '', s)
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence. \s already covers \r and \n, so one class is enough.
    return re.sub(r'\s+', '', s)
import jieba
import numpy as np#打开词典文件，返回列表
def open_dict(Dict='hahah', path=r''):
    """Load a word list from the UTF-8 file ``<path><Dict>.txt``.

    Args:
        Dict: File name without the ``.txt`` extension.
        path: Prefix prepended verbatim to the file name, so a directory
            must include its trailing separator.

    Returns:
        A list with one entry per line of the file, each stripped of
        surrounding spaces, commas and newlines. Blank lines are kept as
        empty strings.
    """
    file_path = path + '%s.txt' % Dict
    # The context manager closes the file; the original left the handle open.
    with open(file_path, 'r', encoding='utf-8') as dictionary:
        return [line.strip(' ,\n') for line in dictionary]


def judgeodd(num):
    """Return 'even' when ``num`` is divisible by 2, otherwise 'odd'."""
    return 'even' if num % 2 == 0 else 'odd'


# Note: change the ``path`` arguments below to where your dictionary files are.
# Sentiment lexicons. ``path`` is prepended verbatim to "<name>.txt", so it is
# either empty (current directory) or a directory ending with a separator.
deny_word = open_dict(Dict='否定词', path=r'')  # negation words
posdict = open_dict(Dict='positive', path=r'')  # positive sentiment words
negdict = open_dict(Dict='negative', path=r'')  # negative sentiment words
degree_word = open_dict(Dict='程度级别词语', path=r'')  # degree adverbs, grouped by marker lines


def _degree_words_between(start_marker, end_marker):
    """Return the degree words listed after ``start_marker`` and before ``end_marker``."""
    begin = degree_word.index(start_marker) + 1
    return degree_word[begin:degree_word.index(end_marker)]


# Weight 4: a sentiment word preceded by one of these is multiplied by 4.
mostdict = _degree_words_between('extreme', 'very')
# Weight 3.
verydict = _degree_words_between('very', 'more')
# Weight 2.
moredict = _degree_words_between('more', 'ish')
# Weight 0.5.
ishdict = degree_word[degree_word.index('ish') + 1:degree_word.index('last')]


def _signed_weight(preceding_words):
    """Return the weight of a sentiment word given the words scanned before it.

    The weight starts at 1, is multiplied by 4 / 3 / 2 / 0.5 for each degree
    word found in ``mostdict`` / ``verydict`` / ``moredict`` / ``ishdict``,
    and is negated when the number of negation words is odd.
    """
    weight = 1
    negations = 0
    for w in preceding_words:
        if w in mostdict:
            weight *= 4.0
        elif w in verydict:
            weight *= 3.0
        elif w in moredict:
            weight *= 2.0
        elif w in ishdict:
            weight *= 0.5
        elif w in deny_word:
            negations += 1
    if judgeodd(negations) == 'odd':
        weight *= -1.0
    return weight


def _clamp_non_negative(pos_total, neg_total):
    """Move a negative score to the opposite polarity and return [pos, neg]."""
    if pos_total < 0 and neg_total > 0:
        return [0, neg_total - pos_total]
    if neg_total < 0 and pos_total > 0:
        return [pos_total - neg_total, 0]
    if pos_total < 0 and neg_total < 0:
        return [-neg_total, -pos_total]
    return [pos_total, neg_total]


def sentiment_score_list(dataset):
    """Score every token of ``dataset`` against the sentiment lexicons.

    Args:
        dataset: Text to analyse.

    Returns:
        A list with one entry per segment of ``dataset``; each entry is a
        list of ``[pos, neg]`` rows, one row per token produced by jieba.

    NOTE(review): the published source had lost its indentation; the nesting
    below (one row appended per token) was reconstructed from the statement
    order and should be confirmed against the author's original.
    """
    # NOTE(review): str.split treats '。|！|？' as a literal separator, not a
    # regex, so in practice the whole input is a single segment. Left as is
    # because sentiment_sen only reads the first segment and the '！' branch
    # below needs the punctuation to stay in the text.
    seg_sentence = dataset.split('。|！|？')
    count2 = []
    for sen in seg_sentence:
        segtmp = jieba.lcut(sen, cut_all=False, HMM=False)  # tokenise into a list
        rows = []
        a = 0  # index just after the previous sentiment word
        neg_total = 0
        for i, word in enumerate(segtmp):
            # The positive total restarts for every token, the negative one
            # keeps accumulating across the segment.
            # NOTE(review): the original reset neg_count/neg_count2/neg_count3
            # here, names that are never read afterwards. If negcount* was
            # meant, the negative total should restart too — confirm intent.
            pos_total = 0
            if word in posdict:
                # Only the words since the previous sentiment word modify it.
                pos_total = _signed_weight(segtmp[a:i])
                a = i + 1
            elif word in negdict:
                neg_total += _signed_weight(segtmp[a:i])
                a = i + 1
            elif word == '！' or word == '!':
                # An exclamation mark adds 2 to both totals when the segment
                # contains a sentiment word. The original tested
                # `w2 in posdict or negdict`, which is truthy for any token
                # as soon as negdict is non-empty.
                if any(w2 in posdict or w2 in negdict for w2 in segtmp):
                    pos_total += 2
                    neg_total += 2
            rows.append(_clamp_non_negative(pos_total, neg_total))
        count2.append(rows)
    return count2


def sentiment_score(senti_score_list):
    """Aggregate the per-token rows of each review.

    Args:
        senti_score_list: Output of ``sentiment_score_list``.

    Returns:
        One ``[Pos, Neg, AvgPos, AvgNeg, StdPos, StdNeg]`` list per review:
        column sums, then means and standard deviations formatted to one
        decimal. A review without rows yields all zeros.
    """
    score = []
    for review in senti_score_list:
        score_array = np.array(review)
        if score_array.size == 0:
            # No tokens: column indexing below would raise IndexError.
            score.append([0, 0, 0.0, 0.0, 0.0, 0.0])
            continue
        pos_col = score_array[:, 0]
        neg_col = score_array[:, 1]
        Pos = np.sum(pos_col)
        Neg = np.sum(neg_col)
        AvgPos = float('%.1f' % np.mean(pos_col))
        AvgNeg = float('%.1f' % np.mean(neg_col))
        StdPos = float('%.1f' % np.std(pos_col))
        StdNeg = float('%.1f' % np.std(neg_col))
        score.append([Pos, Neg, AvgPos, AvgNeg, StdPos, StdNeg])
    return score


def sentiment_sen(data):
    """Return the polarity score of ``data`` (first segment only).

    NOTE(review): indices 4 and 5 are StdPos and StdNeg, so the score is a
    difference of standard deviations. Pos - Neg (indices 0 and 1) looks more
    plausible — confirm before changing.
    """
    # Run the pipeline once; the original ran it twice for the two indices.
    first = sentiment_score(sentiment_score_list(data))[0]
    return first[4] - first[5]


# Sentiment analysis
def calculate_motion(text):
    """Count positive, negative and neutral sentences of a novel and plot them.

    The file is read with ``read_file_gbk``, split on full stops, and every
    non-empty sentence is classified by the sign of ``sentiment_sen``. The
    three counts are printed and shown as a bar chart.

    Args:
        text: Path of the GBK-encoded novel file.
    """
    print("emotion analyse start")
    pos = 0
    neg = 0
    neutral = 0
    s = read_file_gbk(text)
    sentences = re.split(r' *[\.\。][\'"\)\]]* *', s)
    sen_list = list(sentences)
    # Number of sentences; __sizeof__() reported the list's memory footprint.
    print(len(sen_list))
    for sentence in sen_list:
        if not sentence:
            continue
        # Score once per sentence; the original recomputed it for each test.
        score = sentiment_sen(sentence)
        if score > 0:
            pos = pos + 1
        elif score == 0:
            neutral = neutral + 1
        elif score < 0:
            neg = neg + 1
    print("positive negative and neutral sentence size is为：{}、{}、{}".format(pos, neg, neutral))

    x_data = ["positive", "negative", "neutral"]
    y_data = [pos, neg, neutral]
    bar_width = 0.3
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign readable with that font
    plt.bar(x=x_data, height=y_data, label='', color='steelblue', alpha=0.8, width=bar_width)
    # Print each value above its bar; ha / va set the horizontal / vertical alignment.
    for x, y in enumerate(y_data):
        plt.text(x, y + 100, '%s' % y, ha='center', va='bottom')
    plt.title("情感计算")
    plt.xlabel("类型")
    plt.ylabel("数量")
    plt.legend()
    plt.show()


calculate_motion('XX.txt')
calculate_motion('YQ.txt')

效果图

平均句长

import jieba
from wordcloud import WordCloud
import re
from PIL import Image
import matplotlib.pyplot as plt


def read_file_gbk(filename):
    """Read a GBK-encoded text file and return its content as one compact string.

    Every literal "/C" marker and every whitespace character (spaces, tabs,
    carriage returns, newlines) is removed from the text.

    Args:
        filename: Path of the GBK-encoded file to read.

    Returns:
        The cleaned file content as a single string.
    """
    with open(filename, 'r', encoding='GBK') as f:
        s = f.read()
    s = re.sub('/C', '', s)
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence. \s already covers \r and \n, so one class is enough.
    return re.sub(r'\s+', '', s)
import jieba
import numpy as np#统计平均句长
def calculate_avg_length(text):
    """Print and plot the average sentence length of a novel.

    The file is read with ``read_file_gbk`` and split on sentence and clause
    punctuation. The total character count, the number of non-empty pieces
    and their ratio are shown as a bar chart.

    Args:
        text: Path of the GBK-encoded novel file.
    """
    s = read_file_gbk(text)
    sentences = re.split(r' *[\.\？！。 ，][\'"\)\]]* *', s)
    # len() counts characters; __sizeof__() returned the object's byte size.
    # Empty pieces (adjacent or trailing delimiters) are not sentences.
    lengths = [len(stuff) for stuff in sentences if stuff]
    size = sum(lengths)
    num = len(lengths)
    average = size / num if num else 0  # empty file: avoid dividing by zero
    print("avg_length_num is " + str(average))

    # First bar is the character total (it was mislabelled "句子总数").
    x_data = ["总字数", "总句数", "平均句长"]
    y_data = [size, num, average]
    bar_width = 0.3
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign readable with that font
    plt.bar(x=x_data, height=y_data, label='', color='steelblue', alpha=0.8, width=bar_width)
    # Print each value above its bar; ha / va set the horizontal / vertical alignment.
    for x, y in enumerate(y_data):
        plt.text(x, y + 100, '%s' % y, ha='center', va='bottom')
    plt.title("平均句长计算")
    plt.xlabel("类型")
    plt.ylabel("数量")
    plt.legend()
    plt.show()


# Average sentence length of the xianxia novel (XX.txt)
calculate_avg_length('XX.txt')
# Average sentence length of the romance novel (YQ.txt)
calculate_avg_length('YQ.txt')

效果图

关键词词云

import jieba
from wordcloud import WordCloud
import re
import  numpy as np
from PIL import Image
import matplotlib.pyplot as plt


def generate_wordcloud(text):
    """Build, save and display a word cloud for a novel.

    The novel is tokenised with jieba and the words in ``stopword.txt`` are
    excluded. The cloud is masked with ``star.jpg``, written to
    ``result.png`` and shown in a matplotlib window.

    Args:
        text: Path of the GBK-encoded novel file.
    """
    # Context managers close both files; the original leaked the novel handle.
    with open(text, 'r', encoding='GBK') as novel:
        content = novel.read()
    with open('stopword.txt', 'r', encoding='utf-8') as f:
        stopwords = {line.strip('\n') for line in f}

    # WordCloud expects space-separated words.
    result = " ".join(jieba.cut(content))

    with Image.open('star.jpg') as mask_file:
        image = np.array(mask_file)

    wc = WordCloud(
        font_path='simhei.ttf',  # font file able to render Chinese
        background_color='white',
        width=1000,
        height=600,
        max_font_size=100,
        min_font_size=20,
        max_words=20,
        font_step=2,
        stopwords=stopwords,
        mask=image,
    )
    wc.generate(result)
    wc.to_file('result.png')

    plt.figure('result')  # window title
    plt.imshow(wc)
    plt.axis('off')
    plt.show()


# Word cloud of the xianxia novel (XX.txt)
generate_wordcloud('XX.txt')
# Word cloud of the romance novel (YQ.txt)
generate_wordcloud('YQ.txt')

效果图

标点符号

import jieba
from wordcloud import WordCloud
import re
from PIL import Image
import matplotlib.pyplot as plt


def read_file_gbk(filename):
    """Read a GBK-encoded text file and return its content as one compact string.

    Every literal "/C" marker and every whitespace character (spaces, tabs,
    carriage returns, newlines) is removed from the text.

    Args:
        filename: Path of the GBK-encoded file to read.

    Returns:
        The cleaned file content as a single string.
    """
    with open(filename, 'r', encoding='GBK') as f:
        s = f.read()
    s = re.sub('/C', '', s)
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence. \s already covers \r and \n, so one class is enough.
    return re.sub(r'\s+', '', s)


def calculate_sign(text):
    """Count exclamation and question marks in a novel and plot both counts.

    Full-width and ASCII forms of each mark are counted.

    Args:
        text: Path of the GBK-encoded novel file.
    """
    s = read_file_gbk(text)
    # Count the marks themselves. The original took __sizeof__() of the list
    # of "。...！" matches, i.e. a byte size of a different quantity.
    exclamations = s.count('！') + s.count('!')
    questions = s.count('？') + s.count('?')
    print('! num is ' + str(exclamations))
    print('? num is ' + str(questions))

    # Second bar holds question marks (it was mislabelled "逗号数量", commas).
    x_data = ["感叹号数量", "问号数量"]
    y_data = [exclamations, questions]
    bar_width = 0.3
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign readable with that font
    plt.bar(x=x_data, height=y_data, label='', color='steelblue', alpha=0.8, width=bar_width)
    # Print each value above its bar; ha / va set the horizontal / vertical alignment.
    for x, y in enumerate(y_data):
        plt.text(x, y + 100, '%s' % y, ha='center', va='bottom')
    plt.title("标点符号计算")
    plt.xlabel("类型")
    plt.ylabel("数量")
    plt.legend()
    plt.show()


calculate_sign('XX.txt')
calculate_sign('YQ.txt')

效果图

词形

import jieba
from wordcloud import WordCloud
import re
from PIL import Image
import matplotlib.pyplot as plt


def read_file_gbk(filename):
    """Read a GBK-encoded text file and return its content as one compact string.

    Every literal "/C" marker and every whitespace character (spaces, tabs,
    carriage returns, newlines) is removed from the text.

    Args:
        filename: Path of the GBK-encoded file to read.

    Returns:
        The cleaned file content as a single string.
    """
    with open(filename, 'r', encoding='GBK') as f:
        s = f.read()
    s = re.sub('/C', '', s)
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence. \s already covers \r and \n, so one class is enough.
    return re.sub(r'\s+', '', s)
import jieba
import numpy as np#统计形状
def calculate_shape(text):
    """Count reduplicated word shapes (AA, AABB, ABB, ABAB) in a novel and plot them.

    The file is read with ``read_file_gbk`` and split on punctuation and a
    few special characters. Each piece is scanned with back-reference
    regexes and the four totals are printed and shown as a bar chart.

    Args:
        text: Path of the GBK-encoded novel file.
    """
    print("start calculate_shape")
    # Whole text as one long string.
    s = read_file_gbk(text)
    # Split on punctuation; the class also drops some special characters.
    sentences = re.split(r' *[\.\？！，。…… —— oo ll 99][\'"\)\]]* *', s)

    # In these patterns "." is any character except a newline, and \1 / \2
    # repeat whatever the first / second group captured.
    shape_patterns = {
        'AA': re.compile(r'(.)\1'),
        'AABB': re.compile(r'(.)\1(.)\2'),
        # One character, then a different character doubled. The original
        # reused the AABB pattern here, so both counts were always equal.
        'ABB': re.compile(r'(.)(?!\1)(.)\2'),
        'ABAB': re.compile(r'(..)\1'),
    }
    counts = dict.fromkeys(shape_patterns, 0)
    for stuff in sentences:
        for shape, pattern in shape_patterns.items():
            counts[shape] += sum(1 for _ in pattern.finditer(stuff))

    SIZE_AA = counts['AA']
    SIZE_AABB = counts['AABB']
    SIZE_ABB = counts['ABB']
    SIZE_ABAB = counts['ABAB']
    print("AA shape num is " + str(SIZE_AA))
    print("AABB shape num is " + str(SIZE_AABB))
    print("ABB shape num is " + str(SIZE_ABB))
    print("ABAB shape num is " + str(SIZE_ABAB))

    x_data = ['AA', 'AABB', 'ABB', 'ABAB']
    y_data = [SIZE_AA, SIZE_AABB, SIZE_ABB, SIZE_ABAB]
    bar_width = 0.3
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign readable with that font
    plt.bar(x=x_data, height=y_data, label='', color='steelblue', alpha=0.8, width=bar_width)
    # Print each value above its bar; ha / va set the horizontal / vertical alignment.
    for x, y in enumerate(y_data):
        plt.text(x, y + 100, '%s' % y, ha='center', va='bottom')
    plt.title("词形分析")
    plt.xlabel("类型")
    plt.ylabel("数量")
    plt.legend()
    plt.show()
    print("end calculate_shape")


# Word shapes of the xianxia novel (XX.txt)
calculate_shape('XX.txt')
# Word shapes of the romance novel (YQ.txt)
calculate_shape('YQ.txt')

效果图

用python实现小说的平均句长，词性占比，关键词，标点符号，词形统计相关推荐

python爬虫小说代码示例-中文编程，用python编写小说网站爬虫
原标题:中文编程,用python编写小说网站爬虫作者:乘风龙王原文:https://zhuanlan.zhihu.com/p/51309019 为保持源码格式, 转载时使用了截图. 原文中的源码块 ...
零基础小白10分钟用Python搭建小说网站！网友：我可以！
都说Python什么都能做,本来我是不信的!直到我在CSDN站内看到了一件真事儿:一位博主贴出了自己10分钟用Python搭建小说网站的全过程!全程只用了2步操作,简直太秀了!!-- 第一步:爬取小说 ...
python爬小说代码_中文编程，用python编写小说网站爬虫
原标题:中文编程,用python编写小说网站爬虫作者:乘风龙王原文:https://zhuanlan.zhihu.com/p/51309019 为保持源码格式, 转载时使用了截图. 原文中的源码块 ...
python使用fpdf将生成的长字符串手动换行写入pdf
python使用fpdf将生成的长字符串手动换行写入pdf 目录 python使用fpdf将生成的长字符串手动换行写入pdf #lassocv生成特征系数dataframe
SpringBoot+MyBatisPlus+Echarts实现查询并显示平均时长占比饼状图
场景数据库中每个数据都有开始时间以及结束时间两个字段. 需要根据创建时间筛选出当天的四种类型的数据的平均执行时间. SpringBoot+Echarts实现请求后台数据显示饼状图: https:// ...
【转】 python socket向百度发送http长连接请求并做搜索
http://hi.baidu.com/leejun_2005/blog/item/30fe9bd23a396c28960a1640.html [转] python socket向百度发送http长连 ...
python自动发微信-python实现微信每日一句自动发送给喜欢的人
本文实例为大家分享了python实现微信每日一句自动发送的具体代码,供大家参考,具体内容如下代码: # -*- coding: utf-8 -*- """ 这是一个用来 ...
python爬虫-小说《大江大河》
python爬虫-小说<大江大河> 最近看了电视剧大江大河电视剧,挺好看的,就在网上找找小说看. 最近看了电视剧大江大河电视剧,挺好看的,就在网上找找小说看. 大江大河小说地址:傲宇中文网 ...
基于python简易小说阅读器（一）
基于python简易小说阅读器(一) 实现功能:界面显示小说内容,界面的最下方有按键实现章节切换后台需要有小说内容的数据,针对这一块,python的爬虫模块requests可以满足项目需求,前端 ...
python+selenium+webdriver 截取全页面长图
有的时候我们要截取整个页面,而不是当前的屏幕. python+selenium+webdriver 截取全页面长图: from selenium import webdriver import tim ...

用python实现小说的平均句长，词性占比，关键词，标点符号，词形统计

用python实现小说的平均句长，词性占比，关键词，标点符号，词形统计

需求如下

代码：

词性占比

效果图

平均句长

效果图

关键词词云

效果图

标点符号

效果图

词形

效果图

用python实现小说的平均句长，词性占比，关键词，标点符号，词形统计相关推荐

最新文章

热门文章