需求

第一步：爱奇艺《青春有你2》评论数据爬取(参考链接：https://www.iqiyi.com/v_19ryfkiv8w.html#curid=15068699100_9f9bab7e0d1e30c494622af777f4ba39)

爬取任意一期正片视频下评论
评论条数不少于1000条

第二步：词频统计并可视化展示

数据预处理：清理清洗评论中特殊字符（如：@#￥%、emoji表情符）,清洗后结果存储为txt文档
中文分词：添加新增词（如：青你、奥利给、冲鸭），去除停用词（如：哦、因此、不然、也好、但是）
统计top10高频词
可视化展示高频词

第三步：绘制词云

根据词频生成词云
可选项-添加背景图片，根据背景图片轮廓生成词云

第四步：结合PaddleHub，对评论进行内容审核

需要的配置和准备

中文分词需要jieba
词云绘制需要wordcloud
可视化展示中需要的中文字体
网上公开资源中找一个中文停用词表
根据分词结果自己制作新增词表
准备一张词云背景图（附加项，不做要求，可用hub抠图实现）
paddlehub配置

!pip install jieba
!pip install wordcloud
# Linux系统默认字体文件路径
!ls /usr/share/fonts/
# 查看系统可用的ttf格式中文字体
!fc-list :lang=zh | grep ".ttf"
!wget https://mydueros.cdn.bcebos.com/font/simhei.ttf
!mkdir .fonts
# 复制字体文件到该路径
!cp fonts/simhei.ttf .fonts/
#!cp fonts/simhei.ttf /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/
#!rm -rf .cache/matplotlib
# 安装模型
!hub install porn_detection_lstm==1.1.0
!pip install --upgrade paddlehub

from __future__ import print_function
import requests
import json
import re #正则匹配
import time #时间处理模块
import jieba #中文分词
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from PIL import Image
from wordcloud import WordCloud  #绘制词云模块
import paddlehub as hub
import os, sys

#请求爱奇艺评论接口，返回response信息
def getMovieinfo(url):
    """Request the iQiyi comment API and return the decoded JSON payload.

    Args:
        url: Full URL of the comment endpoint, including query parameters.

    Returns:
        The object decoded from the JSON response body when the server
        answers with HTTP 200. ``None`` for any other status code, for a
        request error, or when the body is not valid JSON.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(e)
        return None

    if response.status_code != 200:
        return None

    try:
        return json.loads(response.text)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(e)
        return None


# Parse the JSON data and collect the comments (see saveMovieInfoToFile).
def saveMovieInfoToFile(lastId, arr, target=1100, delay=0.5):
    """Page through the iQiyi comment API and append comment texts to ``arr``.

    Each request passes the id of the last comment seen so far as the
    ``last_id`` query parameter, which is how the next page is selected.
    At least one page is always requested. Paging stops when ``arr`` holds
    ``target`` items, when a request fails, or when a page yields no new
    last id (so an exhausted or repeating feed cannot loop forever).

    Args:
        lastId: Id of the last comment already fetched; ``''`` for the
            first page.
        arr: List that receives the ``content`` text of every comment.
            It is mutated in place.
        target: Stop once ``arr`` contains at least this many entries.
        delay: Seconds to sleep between consecutive requests.

    Returns:
        The id of the last comment that was processed (the value to pass
        as ``lastId`` to resume paging).
    """
    while True:
        # SECURITY NOTE(review): this URL embeds a hard-coded ``authcookie``
        # value. It looks like a session credential and should be moved out
        # of source control (config / environment) -- confirm and rotate.
        url = ('https://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=a3fFx3Pr964sHm1rgm3RbWjyKIeM04eCWEOpxPs2pHLam1Oeom2Nqc9R1r1cHrF57ydg9a56&business_type=17&content_id=15068699100&hot_size=10&last_id='
               + str(lastId)
               + '&page=&page_size=40&types=hot,time')
        content = getMovieinfo(url)

        # getMovieinfo returns None on failure; a payload without the
        # expected keys is treated the same way.
        try:
            comment_list = content['data']['comments']
        except (KeyError, TypeError):
            break

        previous_id = lastId
        for comment in comment_list:
            if 'content' in comment:
                arr.append(comment['content'])
            lastId = comment['id']

        if len(arr) >= target:
            break
        if not comment_list or lastId == previous_id:
            # No progress was made; another request would repeat this page.
            break
        time.sleep(delay)

    return lastId

注意爱奇艺评论页面：此次不是直接从 HTML 中获取数据，而是在 F12-Network 中找到评论接口，分析其参数后再进行爬取。获取下一页的逻辑是：每次请求都把上一页最后一条评论的 id 作为 last_id 参数带上。

#去除文本中特殊字符
# Patterns are compiled once because the function runs for every comment.
_MARKUP_RE = re.compile(r"</?(.+?)>|&nbsp;|\t|\r")
# Everything except CJK unified ideographs (U+4E00-U+9FA5) and ASCII digits.
_NOT_KEPT_RE = re.compile(r"[^\u4e00-\u9fa50-9]")
_DIGITS_ONLY_RE = re.compile(r"\d+")


def clear_special_char(content):
    """Strip markup, symbols, emoji and Latin letters from a comment.

    Processing steps:
      1. Remove HTML-like tags (``<...>``), ``&nbsp;``, tabs and carriage
         returns.
      2. Remove every character that is not a Chinese character in the
         range U+4E00-U+9FA5 or an ASCII digit.
      3. If only digits remain, return an empty string.

    The previous character class ``[^\\u4e00-\\u9fa5^a-z^A-Z^0-9]`` treated
    the inner ``^`` as literals, so caret characters were kept in the
    output; they are now removed like any other symbol.

    Args:
        content: Raw comment text.

    Returns:
        The cleaned text, possibly empty.
    """
    s = _MARKUP_RE.sub("", content)
    s = _NOT_KEPT_RE.sub("", s)
    if _DIGITS_ONLY_RE.fullmatch(s):
        return ""
    return s

# User-dictionary paths that have already been loaded into jieba, so the
# file is not parsed again for every sentence.
_LOADED_USERDICTS = set()


def fenci(dictPath, text):
    """Segment Chinese text into words with jieba.

    The custom dictionary at ``dictPath`` is loaded the first time that
    path is seen; later calls with the same path reuse it.

    Args:
        dictPath: Path to the jieba user-dictionary file with extra words.
        text: Sentence or text to segment.

    Returns:
        The list of tokens produced by ``jieba.lcut`` with
        ``cut_all=False``.
    """
    if dictPath not in _LOADED_USERDICTS:
        jieba.load_userdict(dictPath)
        # Record the path only after a successful load so a failed load
        # is retried on the next call.
        _LOADED_USERDICTS.add(dictPath)
    return jieba.lcut(text, cut_all=False)

def stopwordslist(filepath):
    """Load the stop-word list from a UTF-8 text file.

    Args:
        filepath: Path to a file containing one stop word per line.

    Returns:
        A list with one entry per line of the file, each stripped of
        leading and trailing whitespace.
    """
    # The context manager closes the handle, which the original left open.
    with open(filepath, encoding="UTF-8") as f:
        return [line.strip() for line in f]

def movestopwords(phrase, stopwords, counts):
    """Count the words of ``phrase`` that are not stop words.

    Single-character tokens are ignored. ``counts`` is updated in place.

    Args:
        phrase: Iterable of tokens (the segmentation result).
        stopwords: Collection of words to exclude.
        counts: Dict mapping word -> frequency; incremented in place.

    Returns:
        None.
    """
    # A set gives O(1) membership tests; reuse it when the caller already
    # passed one.
    if isinstance(stopwords, (set, frozenset)):
        blocked = stopwords
    else:
        blocked = set(stopwords)

    for word in phrase:
        if len(word) == 1 or word in blocked:
            continue
        counts[word] = counts.get(word, 0) + 1
    return None

def drawcounts(counts, num):
    """Show a bar chart of the ``num`` most frequent words.

    Args:
        counts: Dict mapping word -> frequency.
        num: Number of top entries to plot.

    Returns:
        None.
    """
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    top = ranked[:num]
    labels = [word for word, _ in top]
    heights = [freq for _, freq in top]

    # Font settings so Chinese labels render correctly.
    plt.rcParams['font.family'] = ['sans-serif']
    plt.rcParams['font.sans-serif'] = ['SimHei']  # default font
    # Keep the minus sign from rendering as a box in saved figures.
    plt.rcParams['axes.unicode_minus'] = False

    plt.bar(labels, heights)
    plt.title("词频统计结果")
    plt.show()

def drawcloud(content, mask_path="ai2.png", font_path='fonts/simhei.ttf',
              save_path=None):
    """Draw a word cloud shaped by a mask image and display it.

    When ``content`` is a dict it is treated as a word -> frequency mapping
    and rendered with ``generate_from_frequencies`` so the counts drive the
    word sizes. Any other value is converted with ``str`` and handed to
    ``generate`` as raw text, as before. The previous code always used
    ``generate(str(content))``, which tokenized the dict's repr and ignored
    the frequencies.

    Args:
        content: Word-frequency dict, or text to build the cloud from.
        mask_path: Path of the image whose outline shapes the cloud.
        font_path: Path of a font file able to render Chinese characters.
        save_path: When given, the rendered cloud is also written to this
            file.

    Returns:
        None.
    """
    # Alternative configuration from the original write-up, kept for
    # reference: mask 'cluod.png', font 'simhei.ttf', max_words=150,
    # max_font_size=100, width=400, relative_scaling=0.3,
    # stopwords={"东西", "这是"}, built with fit_words(word_f) and saved
    # with to_file('pic.png').
    with Image.open(mask_path) as mask_image:
        aimask = np.array(mask_image)

    wc = WordCloud(
        font_path=font_path,       # font used to draw the words
        background_color="white",  # background colour
        max_words=1000,            # maximum number of words shown
        max_font_size=50,          # largest font size
        min_font_size=10,          # smallest font size
        random_state=42,           # fixed seed for a reproducible layout
        mask=aimask,
        margin=10,
        collocations=False,        # do not add two-word collocations
    )

    if isinstance(content, dict):
        wc.generate_from_frequencies(content)
    else:
        wc.generate(str(content))

    if save_path is not None:
        wc.to_file(save_path)

    plt.figure(dpi=200)  # raise or lower dpi to scale the figure
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")  # hide the axes
    plt.show()

def text_detection(test_text, path, use_gpu=True):
    """Run the PaddleHub ``porn_detection_lstm`` module over saved comments.

    Lines of ``path`` are appended to ``test_text`` (skipping lines whose
    stripped length is exactly 1), the whole list is sent to the module,
    and every item labelled ``'porn'`` with a probability above 0.9 is
    printed.

    Args:
        test_text: List that receives the lines to analyse. It is mutated
            in place and may already contain texts.
        path: Path of the UTF-8 file with one cleaned comment per line.
        use_gpu: Forwarded to the module's ``detection`` call. Set it to
            ``False`` on machines without a GPU.

    Returns:
        The result list returned by the module's ``detection`` call.
    """
    porn_detection_lstm = hub.Module(name="porn_detection_lstm")

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if len(line.strip()) == 1:  # skip one-character comments
                continue
            test_text.append(line)

    input_dict = {"text": test_text}
    results = porn_detection_lstm.detection(
        data=input_dict, use_gpu=use_gpu, batch_size=1)

    for item in results:
        if item['porn_detection_key'] == 'porn' and item['porn_probs'] > 0.9:
            print(item['text'], ':', item['porn_probs'])
    return results

# Comments are paginated, so the iQiyi comment API has to be requested
# several times to collect enough of them. Some comments contain emoji and
# other special characters, which are removed before analysis.
def main():
    """Crawl comments, clean them, plot word statistics and audit the text."""
    filePath = 'data/iqiyi.txt'
    fenciAddDictPath = 'fenci_add_dict.txt'
    fenciForbidDictPath = 'fenci_forbid_dict.txt'

    # [1] Crawl the comments of one episode (at least 1000 of them).
    arr = []
    print('评论抓取中')
    saveMovieInfoToFile('', arr)
    print('爬取评论数量为%d' % len(arr))

    # [2] Pre-process: strip special characters and store the cleaned
    # comments one per line. Mode 'w' replaces any previous file.
    out_dir = os.path.dirname(filePath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(filePath, 'w', encoding='utf-8') as f:
        for txt in arr:
            text = clear_special_char(txt)
            if text.strip() != '':
                try:
                    f.write(text + '\n')
                except (OSError, UnicodeError) as e:
                    print("find exception", e)

    # Segment every comment (with the custom words), drop stop words and
    # count frequencies. The stop-word list is loaded once, not per line.
    stopwords = stopwordslist(fenciForbidDictPath)
    counts = {}
    with open(filePath, 'r', encoding='utf-8') as f:
        for line in f:
            words = fenci(fenciAddDictPath, line)
            movestopwords(words, stopwords, counts)

    # Visualise the top-10 words.
    drawcounts(counts, 10)

    # [3] Word cloud built from the frequencies, shaped by a mask image.
    drawcloud(counts)

    # [4] Content audit of the comments with PaddleHub.
    test_text = []
    text_detection(test_text, filePath)


if __name__ == "__main__":
    main()

爱奇艺评论爬虫、词频统计、词云、PaddleHub内容审核相关推荐

爱奇艺向抖音开启授权，打开内容价值的新大门
7月19日,爱奇艺和抖音集团(以下简称:抖音)宣布达成合作,爱奇艺将长视频内容授权给抖音进行二创.推广等行为.活跃在短视频平台上的影视内容,开始走向正版化. 随之而来的是资本市场的支持,爱奇艺隔夜美股 ...
爱奇艺容器实践（内附云原生落地沙龙干货下载）
4月10日下午,爱奇艺技术产品团队举办了"i技术会"线下技术沙龙,本次技术会的主题是"云原生落地探索与实践",邀请快手.百度和字节跳动的技术专家,与爱奇艺技术产 ...
影视级XR技术直播演唱会诞生，爱奇艺沉浸式虚拟制作呈现“云演出”
作者:张博佳去年4月,刚刚获得第26届格莱美奖的Travis Scott举行了一场名为"Astronomical"的演唱会.不过,和传统演唱会不同,这场演出选择了一个虚拟地点开展 ...
爱奇艺：商业模式、技术创新、内容沉淀
网络视频行业已经非常成熟了,体现在哪儿?第一是市场集中度的提高,这次大会发布的网络视听报告,已经分成了几个阵营,综合性的网站已经非常少了,市场集中度高度提高,已经实现向用户收费,这是市场成熟的标志. ...
爱奇艺AR应用亮相，联合Nreal探索内容新玩法
爱奇艺又拿下"一块屏". 近日,爱奇艺宣布推出AR应用,并率先与AR眼镜品牌Nreal合作发布第一款定制版AR应用.通过自制全息内容与AR终端的深度融合,爱奇艺将为用户带来全新的沉 ...
爱奇艺抢跑影视工业化继续“押注”好内容
北京时间5月18日,百度.爱奇艺.网易等巨头相继发布新一季度业绩.从财报发布后的股价表现来看,三家公司均有不同幅度的上涨,其中爱奇艺因一季度财报超预期,当天股价大涨10%. 在财报超市场预期背后,我们 ...
php 上传到爱奇艺,php 爬虫爱奇艺视频、内容
function getdata( $i, $url) { $data = array(); // 把整个文件读入到字符串中 $str = file_get_contents( $url); $str ...
从B站、爱奇艺、映客的IPO上市，看国内视频公司的内容审核现状
欢迎访问网易云社区,了解更多网易技术产品运营经验. 中央电视台<经济半小时>栏目 3月30日,中央电视台<经济半小时>栏目讲述了网络上的一个顽症--色情内容.在这期主题为< ...
全球AI技术开放日系列5（上海站）：走进爱奇艺
主题: 全球AI技术开放日系列 5 (上海站): 走进爱奇艺时间: 8月18日 12:30-17:00 报名:点击阅读原文,半价早鸟票限时优惠内容: 全球AI技术开放日(系列)是AICamp 发起 ...

爱奇艺评论爬虫、词频统计、词云、PaddleHub内容审核

需求

需要的配置和准备

爱奇艺评论爬虫、词频统计、词云、PaddleHub内容审核相关推荐

最新文章

热门文章