爬取bilibili国创动漫中按追番人数排行的番剧:先爬取排行页面的番剧信息,再进入各番剧的播放页面爬取简介信息,然后将数据存储到Excel和SQLite数据库,最后通过网页将数据可视化。

1、爬取网页、数据解析、数据保存

from bs4 import BeautifulSoup
import re
import xlwt
import urllib.request,urllib.error
import sqlite3
from io import BytesIO
import gzip
import time
import json
import os
import xlrd
from xlutils.copy import copy
import string#异步爬取
def main():
    """Run the whole pipeline: crawl the ranking pages, then persist the rows.

    The rows returned by ``GetData`` are written to an Excel workbook via
    ``SavaData`` and to a SQLite database via ``SaveDataDb``.
    """
    # Output locations: Excel workbook and SQLite database file.
    excel_file = "bili国漫.xls"
    sqlite_file = "cartoon.db"
    # Ranking API endpoint handed to GetData.
    ranking_api = "https://api.bilibili.com/pgc/season/index/result?st=4&order=3&season_version=-1&is_finish=-1&copyright=-1&season_status=-1&year=-1&style_id=-1&sort=0&page=1&season_type=4&pagesize=20&type=1"
    # Steps 1-2: download and parse.
    rows = GetData(ranking_api)
    # Step 3: save the data.
    SavaData(rows, excel_file)
    SaveDataDb(rows, sqlite_file)


# Pattern capturing the synopsis inside <span class="absolute">...</span>;
# re.S makes "." match newlines as well.
findintr = re.compile(r'<span class="absolute">(.*?)</span>', re.S)

# Parse the data returned by the URL.
def GetData(baseurl, pages=35):
    """Crawl the ranking API page by page and build one row per series.

    Args:
        baseurl: API URL containing the literal token ``page=1``; the token
            is replaced with the number of the page being fetched.
        pages: Number of result pages to fetch (defaults to 35).

    Returns:
        A list of rows. Every row is a list of exactly six strings: title,
        follower count, update status, cover URL, series URL and synopsis
        (an empty string when no synopsis could be extracted).
    """
    datalist = []
    for page in range(1, pages + 1):
        url = baseurl.replace("page=1", "page=" + str(page))
        jsonbili = AskUrl(url)  # raw JSON text; "" when the request failed
        # Cut the "list" array out of the response text.
        datafind = re.findall(r"\"list\":(.+?),\"num\"", str(jsonbili))
        if not datafind:
            # A failed or unexpected response must not abort the whole crawl.
            print("第%s页未获取到数据,已跳过" % page)
            continue
        try:
            jsondata = json.loads(datafind[0])
        except ValueError:
            print("第%s页数据解析失败,已跳过" % page)
            continue
        for item in jsondata:
            # title, follower count, update status, cover URL, series URL
            data = [
                str(item.get(key, "")).strip()
                for key in ("title", "order", "index_show", "cover", "link")
            ]
            # Follow the series link to pick up its synopsis. Exactly one
            # value is appended so every row keeps the same six fields.
            data.append(_extract_introduction(AskLinkUrl(item.get("link", ""))))
            datalist.append(data)
    print(datalist)
    return datalist


def _extract_introduction(html):
    """Return the first synopsis found in *html*, or "" when there is none."""
    soup = BeautifulSoup(html, "html.parser")
    for span in soup.find_all("span", class_="absolute"):
        found = re.findall(findintr, str(span))
        if found:
            return found[0].strip()
    return ""

# Fetch the content of the given URL.
def AskUrl(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or "" when the request fails with
        ``urllib.error.URLError`` (its code and/or reason are printed).
    """
    # Send a desktop Chrome User-Agent so the request looks like a browser.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):  # HTTP status code
            print(e.code)
        if hasattr(e, "reason"):  # failure reason
            print(e.reason)
    return html

# Fetch the content of the given URL.
def AskLinkUrl(url):
    """Fetch *url* and return its body as UTF-8 text, gunzipping if needed.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or "" when the request fails with
        ``urllib.error.URLError`` (its code and/or reason are printed).
    """
    # Send a desktop Chrome User-Agent so the request looks like a browser.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if reading fails.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
        # Only decompress when the payload carries the gzip magic number, so
        # an uncompressed reply no longer crashes the crawl.
        if raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        html = raw.decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):  # HTTP status code
            print(e.code)
        if hasattr(e, "reason"):  # failure reason
            print(e.reason)
    return html

# Adaptive column-width setup.
def Auto_Type(datalist, sheet):
    """Size every worksheet column to fit its widest cell.

    Args:
        datalist: Rows of cell values; the first row decides how many
            columns are sized. Shorter rows are tolerated.
        sheet: Worksheet-like object exposing ``col(i).width``.

    Nothing is done for an empty *datalist*.
    """
    if not datalist:
        return
    for i in range(len(datalist[0])):
        # Widest cell in this column; rows lacking the column are ignored.
        widest = max(
            (_display_width(row[i]) for row in datalist if i < len(row)),
            default=0,
        )
        # One spare character of padding, capped at 65535 as before.
        sheet.col(i).width = min(256 * (widest + 1), 65535)


def _display_width(text):
    """Return the width of *text*: CJK ideographs and spaces count 2, others 1."""
    return sum(
        2 if 0x4e00 <= ord(char) <= 0x9fff or ord(char) == 0x0020 else 1
        for char in str(text)
    )

# Save the data to Excel.
def SavaData(datalist, savepath):
    """Write the crawled rows to the Excel workbook at *savepath*.

    A new workbook (with auto-sized columns) is created when the file does
    not exist; otherwise the existing workbook is opened and its first sheet
    is written to.

    Args:
        datalist: Rows of cell values, up to six per row.
        savepath: Path of the ``.xls`` file.
    """
    if not datalist:
        # Nothing to write; avoids indexing into an empty list below.
        print("没有可保存的数据")
        return
    if not os.path.isfile(savepath):
        book = xlwt.Workbook(encoding="utf-8")  # create the workbook
        sheet = book.add_sheet("bili国漫")  # create the sheet
        Auto_Type(datalist, sheet)  # fit the column widths
        print("表格创建成功\n")
    else:
        rb = xlrd.open_workbook(savepath, formatting_info=True)
        book = copy(rb)  # writable copy of the workbook that was read
        sheet = book.get_sheet(0)
        print("表格打开成功\n")
    # The fifth column holds the series link (it was mislabelled as a
    # second "番剧名称").
    col = ["番剧名称", "追番人数", "更新状态", "封面链接", "番剧链接", "番剧简介"]
    for j, header in enumerate(col):
        sheet.write(0, j, header)  # header row
    for i, data in enumerate(datalist):
        print("正在写入第%s条" % (i + 1))
        # Write only the cells the row has, and never beyond the headers.
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i + 1, j, value)
    book.save(savepath)

# Store the data in the database.
def SaveDataDb(datalist, dbpath):
    """Insert every row of *datalist* into the ``bilicartoon`` table.

    Values are bound as SQL parameters, so quotes inside titles or synopses
    cannot break (or inject into) the statement, and the caller's rows are
    left unmodified.

    Args:
        datalist: Rows ordered as name, number, status, image, link,
            introduction. Short rows are padded with "" and extra trailing
            values are ignored so each insert has exactly six values.
        dbpath: Path of the SQLite database file.
    """
    Init_Db(dbpath)  # make sure the table exists
    sql = ("insert into bilicartoon"
           "(name, number, status, image, link, introduction) "
           "values(?, ?, ?, ?, ?, ?)")
    records = [_as_record(data) for data in datalist]
    con = sqlite3.connect(dbpath)
    try:
        con.executemany(sql, records)
        con.commit()
    finally:
        # Always release the connection, even when an insert fails.
        con.close()


def _as_record(data, width=6):
    """Return *data* as a tuple of exactly *width* values, padded with ""."""
    fields = list(data[:width])
    fields.extend([""] * (width - len(fields)))
    return tuple(fields)


# Initialise the database.
def Init_Db(dbpath):
    """Create the ``bilicartoon`` table in *dbpath* if it does not exist yet."""
    sql = '''create table if not exists bilicartoon(id integer primary key autoincrement,name text,number int,status text,image text,link text,introduction text)'''
    connect = sqlite3.connect(dbpath)
    try:
        connect.execute(sql)
        connect.commit()
    finally:
        connect.close()
    print("创建/打开数据库成功\n")

# Program entry point.
if __name__ == "__main__":
    # Script entry point: run the crawl-and-save pipeline, then report that
    # crawling has finished.
    main()
    print("爬取完毕")

注意:
①追番人数排行页面是动态网页,数据由异步接口以JSON形式返回,需要直接请求该接口来爬取。
②各番剧的播放页面为静态网页,直接请求页面HTML即可爬取(返回内容经过gzip压缩,需先解压)。
③保存到Excel时最好设置自适应的表格排版。

2、数据可视化

路由设置:

from flask import Flask,render_template
import os.path
import sqlite3

app = Flask(__name__)


@app.route('/')
def index():
    """Serve the home page."""
    return render_template("index.html")


@app.route('/index.html')
def index1():
    """Serve the home page under its explicit ``/index.html`` address."""
    return render_template("index.html")


@app.route('/fan.html')
def fan():
    """Render the series table from every row of ``bilicartoon``."""
    # Resolve the database relative to this file so the lookup does not
    # depend on the current working directory.
    dir_path = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(dir_path, "analysis/cartoon.db")
    con = sqlite3.connect(db_path)
    try:
        datalist = con.execute("select * from bilicartoon").fetchall()
    finally:
        # Close the connection even when the query fails (e.g. table missing).
        con.close()
    return render_template("fan.html", fans=datalist)


@app.route('/data.html')
def data():
    """Serve the statistics page."""
    return render_template("data.html")


@app.route('/contact.html')
def contact():
    """Serve the contact page."""
    return render_template("contact.html")


if __name__ == '__main__':
    # NOTE(review): debug=True enables Flask's interactive debugger; use it
    # for local development only.
    app.run(debug=True)

词云图制作:

# Build a word cloud from the synopses stored in the database.
# NOTE(review): this snippet uses sqlite3, jieba, Image, np, WordCloud and plt
# without importing them here - presumably they are imported above this
# excerpt; confirm.
con = sqlite3.connect('analysis/cartoon.db')
try:
    sql = 'select introduction from bilicartoon'
    rows = con.execute(sql).fetchall()
finally:
    # Release the connection even when the query fails.
    con.close()
# Join once instead of repeated "+"; a NULL synopsis counts as empty text.
text = "".join(row[0] or "" for row in rows)

# Word segmentation.
words = list(jieba.cut(text))
string = ' '.join(words)
print(len(words))  # number of tokens (previously printed the character count)

# Mask image (white background).
img = Image.open('cloud3.jpg')
img_array = np.array(img)  # image as an array, used as the cloud's mask
wc = WordCloud(
    background_color='white',
    mask=img_array,
    font_path="simhei.ttf",  # font file; system fonts live in C:\Windows\Fonts
)
wc.generate_from_text(string)

# Plot.
plt.figure(1)
plt.imshow(wc)
plt.axis('off')  # hide the axes
# plt.show()  # uncomment to preview the generated word cloud
plt.savefig('word3.jpg', dpi=1000)  # write the word cloud image to disk

首页:

<!DOCTYPE HTML>
<html lang="zh-CN">
<head>
  <title></title>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
  <link rel="stylesheet" type="text/css" href="../static/css/bootstrap.min.css"/>
  <link rel="stylesheet" type="text/css" href="../static/css/main.css"/>
  <style></style>
</head>
<body>
  <!-- top navigation -->
  <nav class="navbar navbar-default">
    <div class="container">
      <a class="logo pull-left" href="#"><h1>爬虫<span>实例</span></h1></a>
      <div class="navbar-header">
        <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse" aria-expanded="false"><span class="sr-only">Toggle navigation</span><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></button>
      </div>
      <div class="collapse navbar-collapse " id="bs-example-navbar-collapse">
        <ul class="nav navbar-nav navbar-right"><li class="navBg"></li><li class="active"><a href="index.html">网站首页</a></li><li><a href="fan.html">国创番剧</a></li><li><a href="data.html">数据统计</a></li><li><a href="contact.html">联系作者</a></li></ul>
      </div>
    </div>
  </nav>
  <!--space-->
  <div class="space"></div>
  <!--honor: the three feature cards-->
  <div class="honor padT80 padB80 greyBg">
    <div class="container">
      <section class="title"><h2 style="font-size: 40px">可视化内容</h2></section>
      <div class="row padT80">
        <div class="col-sm-4">
          <div class="honty">
            <div><div class="ty"><span>A</span></div><div class="tycon"><h3 style="font-size: 22px">国创番剧名单</h3><p>FAN PLAY NAME</p></div></div>
            <p style="font-size: 18px">为用户展示bilibili追番人数排行榜的所有番剧信息~</p>
          </div>
        </div>
        <div class="col-sm-4">
          <div class="honty">
            <div><div class="ty"><span>B</span></div><div class="tycon"><h3 style="font-size: 22px">数据分析展示</h3><p>DATA ANALYSIS</p></div></div>
            <p style="font-size: 18px">为用户展示番剧的词云图~</p>
          </div>
        </div>
        <div class="col-sm-4">
          <div class="honty">
            <div><div class="ty"><span>C</span></div><div class="tycon"><h3 style="font-size: 22px">联系作者</h3><p>CONTACT THE AUTHOR</p></div></div>
            <p style="text-align: center;font-size: 18px">邮件联系作者~</p>
          </div>
        </div>
      </div>
    </div>
  </div>
  <footer>
    <nav><ul><li><a href="index.html">网站首页</a></li><li><a href="fan.html">国创番剧</a></li><li><a href="data.html">数据统计</a></li><li><a href="contact.html">联系作者</a></li></ul></nav>
    <p><span class="glyphicon glyphicon-phone-alt"></span>123-12345678<span class="glyphicon glyphicon-earphone"></span>12345678900<span class="glyphicon glyphicon-envelope"></span>123456@123.com</p>
  </footer>
  <script src="../static/js/jquery.min.js" type="text/javascript" charset="utf-8"></script>
  <script src="../static/js/bootstrap.min.js" type="text/javascript" charset="utf-8"></script>
  <script src="../static/js/main.js" type="text/javascript" charset="utf-8"></script>
</body>
</html>

番剧页:

<!DOCTYPE HTML>
<html lang="zh-CN">
<head>
  <title></title>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
  <link rel="stylesheet" type="text/css" href="../static/css/bootstrap.min.css"/>
  <link rel="stylesheet" type="text/css" href="../static/css/main.css"/>
  <style type="text/css">
    h2 { font-size: 40px; }
    table {
      width: 100%;
      font-family: verdana,arial,sans-serif;
      color: #333333;
      border-width: 1px;
      border-color: #999999;
      border-collapse: collapse;
    }
    th {
      font-size: 18px;
      background-color: #c3dde0;
      border-width: 1px;
      padding: 8px;
      border-style: solid;
      border-color: #a9c6c9;
    }
    tr { background-color: #d4e3e5; }
    td {
      font-size: 15px;
      border-width: 1px;
      padding: 8px;
      border-style: solid;
      border-color: #a9c6c9;
    }
  </style>
</head>
<body>
  <!-- top navigation; this page's own entry carries the "active" class -->
  <nav class="navbar navbar-default">
    <div class="container">
      <a class="logo pull-left" href="#"><h1>爬虫<span>实例</span></h1></a>
      <div class="navbar-header">
        <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse" aria-expanded="false"><span class="sr-only">Toggle navigation</span><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></button>
      </div>
      <div class="collapse navbar-collapse " id="bs-example-navbar-collapse">
        <ul class="nav navbar-nav navbar-right"><li class="navBg"></li><li><a href="index.html">网站首页</a></li><li class="active"><a href="fan.html">国创番剧</a></li><li><a href="data.html">数据统计</a></li><li><a href="contact.html">联系作者</a></li></ul>
      </div>
    </div>
  </nav>
  <!--space-->
  <div class="space"></div>
  <!--honor-->
  <div class="honor padT80 padB80 greyBg">
    <div class="container">
      <section class="title"><h2>番剧信息</h2></section>
      <!-- series table -->
      <table>
        <!-- header row -->
        <tr>
          <th>人气排名</th>
          <th>番剧名称</th>
          <th>追番人数</th>
          <th>更新状态</th>
        </tr>
        <!-- one row per record passed in as "fans" -->
        {% for fan in fans %}
        <tr onmouseover="this.style.backgroundColor='#C8C8C8 ';" onmouseout="this.style.backgroundColor='#d4e3e5';">
          <td>{{ fan[0] }}</td>
          <!-- the series link opens in a new tab -->
          <td><a href="{{ fan[5] }}" target="_blank" rel="noopener">{{ fan[1] }}</a></td>
          <td>{{ fan[2] }}</td>
          <td>{{ fan[3] }}</td>
        </tr>
        {% endfor %}
      </table>
    </div>
  </div>
  <footer>
    <nav><ul><li><a href="index.html">网站首页</a></li><li><a href="fan.html">国创番剧</a></li><li><a href="data.html">数据统计</a></li><li><a href="contact.html">联系作者</a></li></ul></nav>
    <p><span class="glyphicon glyphicon-phone-alt"></span>123-12345678<span class="glyphicon glyphicon-earphone"></span>12345678900<span class="glyphicon glyphicon-envelope"></span>123456@123.com</p>
  </footer>
  <script src="../static/js/jquery.min.js" type="text/javascript" charset="utf-8"></script>
  <script src="../static/js/bootstrap.min.js" type="text/javascript" charset="utf-8"></script>
  <script src="../static/js/main.js" type="text/javascript" charset="utf-8"></script>
</body>
</html>

词云页:

<!DOCTYPE HTML>
<html lang="zh-CN">
<head>
  <title></title>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
  <!-- same stylesheet paths as the other pages -->
  <link rel="stylesheet" type="text/css" href="../static/css/bootstrap.min.css"/>
  <link rel="stylesheet" type="text/css" href="../static/css/main.css"/>
</head>
<body>
  <!-- top navigation; this page's own entry carries the "active" class -->
  <nav class="navbar navbar-default">
    <div class="container">
      <a class="logo pull-left" href="#"><h1>爬虫<span>实例</span></h1></a>
      <div class="navbar-header">
        <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse" aria-expanded="false"><span class="sr-only">Toggle navigation</span><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></button>
      </div>
      <div class="collapse navbar-collapse " id="bs-example-navbar-collapse">
        <ul class="nav navbar-nav navbar-right"><li class="navBg"></li><li><a href="index.html">网站首页</a></li><li><a href="fan.html">国创番剧</a></li><li class="active"><a href="data.html">数据统计</a></li><li><a href="contact.html">联系作者</a></li></ul>
      </div>
    </div>
  </nav>
  <!--space-->
  <div class="space"></div>
  <!--case: the word-cloud images-->
  <div class="case padT80 padB80">
    <div class="container">
      <section class="title"><h2>葫芦娃词云</h2></section>
      <ul class="row padT80">
        <li class="col-sm-4 col-xs-6"><img src="../static/images/case/word1.jpg" height="350" width="300"/></li>
        <li class="col-sm-4 col-xs-6"><img src="../static/images/case/word2.jpg" height="350" width="300"/></li>
        <li class="col-sm-4 col-xs-6"><img src="../static/images/case/word3.jpg" height="350" width="300"/></li>
      </ul>
    </div>
  </div>
  <footer>
    <nav><ul><li><a href="index.html">网站首页</a></li><li><a href="fan.html">国创番剧</a></li><li><a href="data.html">数据统计</a></li><li><a href="contact.html">联系作者</a></li></ul></nav>
    <p><span class="glyphicon glyphicon-phone-alt"></span>123-12345678<span class="glyphicon glyphicon-earphone"></span>12345678900<span class="glyphicon glyphicon-envelope"></span>123456@123.com</p>
  </footer>
  <script src="../static/js/jquery.min.js" type="text/javascript" charset="utf-8"></script>
  <script src="../static/js/bootstrap.min.js" type="text/javascript" charset="utf-8"></script>
  <script src="../static/js/main.js" type="text/javascript" charset="utf-8"></script>
</body>
</html>

联系页:

<!DOCTYPE HTML>
<html lang="zh-CN">
<head>
  <title></title>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
  <link rel="stylesheet" type="text/css" href="../static/css/bootstrap.min.css"/>
  <link rel="stylesheet" type="text/css" href="../static/css/main.css"/>
</head>
<body>
  <!-- top navigation; this page's own entry carries the "active" class -->
  <nav class="navbar navbar-default">
    <div class="container">
      <a class="logo pull-left" href="#"><h1>爬虫<span>实例</span></h1></a>
      <div class="navbar-header">
        <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse" aria-expanded="false"><span class="sr-only">Toggle navigation</span><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></button>
      </div>
      <div class="collapse navbar-collapse " id="bs-example-navbar-collapse">
        <ul class="nav navbar-nav navbar-right"><li class="navBg"></li><li><a href="index.html">网站首页</a></li><li><a href="fan.html">国创番剧</a></li><li><a href="data.html">数据统计</a></li><li class="active"><a href="contact.html">联系作者</a></li></ul>
      </div>
    </div>
  </nav>
  <!--space-->
  <div class="space"></div>
  <div class="conPg">
    <div class="container padT80">
      <div class="address row padT80 padB80">
        <!-- contact details -->
        <div class="col-sm-6 col-xs-12">
          <section class="title"><h2>联系作者</h2></section>
          <ul class="padT80">
            <li><span class="glyphicon glyphicon-phone-alt"></span>座机号码:123-123455678</li>
            <li><span class="glyphicon glyphicon-map-marker"></span>作者地址:xx省xx市xx大学</li>
            <li><span class="glyphicon glyphicon-envelope"></span>QQ邮箱:123456@qq.com</li>
            <li><span class="glyphicon glyphicon-phone"></span>联系电话:12345678900</li>
          </ul>
        </div>
        <!-- message form; "name" is not a valid input type, so the name
             field uses "text" -->
        <div class="col-sm-6 col-xs-12 padT80">
          <form><input placeholder="姓名" type="text" id="name"/><input placeholder="邮箱" type="email" id="email"/><input placeholder="电话" type="text" id="text"/><textarea placeholder="消息" rows="5"></textarea></form>
          <a class="btn btn-primary">发送</a>
        </div>
      </div>
    </div>
  </div>
  <footer>
    <nav><ul><li><a href="index.html">网站首页</a></li><li><a href="fan.html">国创番剧</a></li><li><a href="data.html">数据统计</a></li><li><a href="contact.html">联系作者</a></li></ul></nav>
    <p><span class="glyphicon glyphicon-phone-alt"></span>123-12345678<span class="glyphicon glyphicon-earphone"></span>12345678900<span class="glyphicon glyphicon-envelope"></span>123456@123.com</p>
  </footer>
  <script src="../static/js/jquery.min.js" type="text/javascript" charset="utf-8"></script>
  <script src="../static/js/bootstrap.min.js" type="text/javascript" charset="utf-8"></script>
  <script src="../static/js/main.js" type="text/javascript" charset="utf-8"></script>
</body>
</html>

3、结果

表格:

数据库:

首页:

番剧页:

词云页:

联系页:

爬虫实战:bilibili番剧排名爬取并数据可视化相关推荐

  1. [爬虫] B站番剧信息爬取

    申明:本文对爬取的数据仅做学习使用,不涉及任何商业活动,侵删 简述 本次爬取目标是: 番剧的基本信息(名字, 类型, 集数, 连载or完结, 链接等) 番剧的参数信息(播放量, 点赞, 投币, 追番人 ...

  2. 爬虫实战2(下):爬取豆瓣影评

       上篇笔记我详细讲述了如何模拟登录豆瓣,这次我们将记录模拟登录+爬取影评(复仇者联盟4)实战.本文行文结构如下: 模拟登录豆瓣展示 分析网址和源码爬取数据 进行面向对象重构 总结   一.模拟登录 ...

  3. 爬虫实战2(上):爬取豆瓣影评

       这次我们将主要尝试利用python+requests模拟登录豆瓣爬取复仇者联盟4影评,首先让我们了解一些模拟登录相关知识补充.本文结构如下: request模块介绍与安装 get与post方式介 ...

  4. 多线程爬虫实战--彼岸图网壁纸爬取

    多线程爬虫实战–彼岸图网壁纸爬取 普通方法爬取 import requests from lxml import etree import os from urllib import requesth ...

  5. python爬虫bilibili_python爬虫下载Bilibili番剧弹幕

    本文介绍如何利用python爬虫下载bilibili番剧弹幕. 准备: python3环境 需要安装BeautifulSoup,selenium包 phantomjs 原理: 代码: # -*- cod ...

  6. python3爬虫实战:requests库+正则表达式爬取头像

    python3爬虫实战:requests库+正则表达式爬取头像 网站url:https://www.woyaogexing.com/touxiang/qinglv/new/ 浏览网页:可以发现每个图片 ...

  7. python爬虫多久能学会-不踩坑的Python爬虫:如何在一个月内学会爬取大规模数据...

    原标题:不踩坑的Python爬虫:如何在一个月内学会爬取大规模数据 Python爬虫为什么受欢迎 如果你仔细观察,就不难发现,懂爬虫.学习爬虫的人越来越多,一方面,互联网可以获取的数据越来越多,另一方 ...

  8. 爬虫系列4:Requests+Xpath 爬取动态数据

    爬虫系列4:Requests+Xpath 爬取动态数据 [抓取]:参考前文 爬虫系列1:https://www.cnblogs.com/yizhiamumu/p/9451093.html [分页]:参 ...

  9. Python爬取豆瓣+数据可视化

    博客原文和源码下载:Python爬取豆瓣+数据可视化 前言 前段时间应我姐邀请,看了一下Python爬虫.不得不说Python的语法确实简洁优美,可读性强,比较接近自然语言,非常适合编程的初学者上手. ...

最新文章

  1. [CSS3]环形进度条
  2. SLAM:SLAM(即时定位与地图构建)的简介、发展、案例应用之详细攻略
  3. Archive引擎初探
  4. 使用GPU在caffe上进行CNN训练
  5. matplotlib设置多个图例横向水平放置
  6. 【ES】ES 拼音 Pinyin 分词器
  7. 为什么越普通的男人越自信?
  8. Tensorflow Estimator之LinearRegressor
  9. Linux下rc.local不执行问题
  10. python定时任务之cron_Python定时任务框架APScheduler 3.0.3 Cron示例
  11. B2C电商系统源码 在线商城源码
  12. plt的默认风格/样式设置 or 将plt.rcParams恢复恢复到默认参数设置
  13. 如何给excel添加开发者选项
  14. (文末福利)如果代码莫名其妙跑起来了,就不要去动它了……吗?
  15. 【电源】之【常用稳压IC大全】
  16. 03-lvs-persistence
  17. WEB系列(四)_uploadfile笔记
  18. drupal 用的什么php框架,7个使用Drupal的理由
  19. 【总结】新手必看!超过60个小时+600页文档的免费AI深度学习理论与实践课程...
  20. 800G以太网强势来袭:你准备好了吗?

热门文章

  1. 推荐一款不错的播放器客户端——乐鱼播放器
  2. 计算机应用基础奥鹏2021,2021奥鹏计算机应用基础《Word 大作业》离线作业.docx
  3. java中config是什么意思,详解Spring中的JavaConfig注解
  4. 求职找工作的最后一步:如何Argue薪资?
  5. 新的RA Group勒索软件针对美国组织进行双重勒索攻击
  6. 英语听力训练1:遗失的乔布斯访谈
  7. 克鲁斯卡尔算法(kruskal)
  8. Nokia 手机软件签名安装全过程.
  9. Web前端仿小米官网实战总结
  10. 后退到错误html页面,wap2app本地html跳转的远程页面后点击后退报mui not defined