爬虫虎牙斗鱼网站主播人气和姓名

虎牙

from urllib import request
import re


class Pachong():
    """Scrape anchor names and popularity figures from a Huya listing page.

    ``go()`` downloads the page at ``url``, extracts one record per anchor
    card, sorts the records by popularity (highest first) and prints the
    ranking to stdout.
    """

    # Huya listing page that is crawled.
    url = 'https://www.huya.com/g/2168'
    # One match per anchor card: from the "txt" span up to the closing </li>
    # (non-greedy so that each card is matched separately).
    root = r' <span class="txt">[\s\S]*?</li>'
    # Captures the anchor nickname from the title attribute.
    name_patter = '<i class="nick" title="(.*)">.*</i>'
    # Captures the popularity text (may carry a '万' suffix, see __sort_seed).
    number_patter = '<i class="js-num">(.*)</i>'

    def __request_html(self):
        """Download ``Pachong.url`` and return the body decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Pachong.url) as response:
            return response.read().decode('utf-8')

    def __request_name(self, htmls):
        """Extract the raw name/number matches for every anchor card.

        Returns a list of ``{'name': [...], 'number': [...]}`` dicts whose
        values are the (possibly empty) ``re.findall`` results for one card.
        """
        return [
            {
                'name': re.findall(Pachong.name_patter, card),
                'number': re.findall(Pachong.number_patter, card),
            }
            for card in re.findall(Pachong.root, htmls)
        ]

    def __refine(self, anchors):
        """Flatten each record to its first name and first number.

        Cards for which either pattern found nothing are skipped instead of
        raising ``IndexError``.
        """
        return [
            {'name': anchor['name'][0], 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return the anchors sorted by popularity, most popular first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert the popularity text of ``anchor`` into a float sort key.

        The first decimal number in the text is used (so "1.5万" keeps its
        fractional part) and is multiplied by 10000 when the text contains
        '万'. Text without any digits yields ``0.0``.
        """
        text = anchor['number']
        # Thousands separators would otherwise cut the number short.
        found = re.search(r'\d+(?:\.\d+)?', text.replace(',', ''))
        if found is None:
            return 0.0
        value = float(found.group())
        if '万' in text:
            value *= 10000
        return value

    def __show(self, anchors):
        """Print one ranking line per anchor, numbered from 1."""
        for position, anchor in enumerate(anchors, start=1):
            print('主播排名第' + str(position) + ": "
                  + anchor['name'] + '  ' + anchor['number'])

    def go(self):
        """Run the whole pipeline: fetch, parse, refine, sort and print."""
        htmls = self.__request_html()
        anchors = self.__request_name(htmls)
        anchors = self.__refine(anchors)
        anchors = self.__sort(anchors)
        self.__show(anchors)


pachong = Pachong()
# Run the Huya crawl: fetch the page, rank the anchors and print the result.
pachong.go()

斗鱼

from urllib import request
from io import BytesIO
import gzip
import re
# 断点调试
class Spider():
    """Scrape anchor names and popularity figures from a Douyu listing page.

    ``go()`` downloads the page at ``url``, extracts one record per anchor
    card, sorts the records by popularity (highest first) and prints the
    ranking to stdout.
    """

    # Douyu listing page that is crawled.
    url = 'https://www.douyu.com/g_yz'
    # Content of each "DyListCover-info" div; the trailing ? keeps the
    # [\s\S]* match non-greedy so that every div is captured separately.
    root_pattern = r'<div class="DyListCover-info">([\s\S]*?)</div>'
    # Rest of the line following the opening "DyListCover-userName" div.
    name_pattern = '<div class="DyListCover-userName is-template">(.*)'
    # Popularity text that follows the hot-icon <svg> inside its span.
    number_pattern = r'<span class="DyListCover-hot is-template"><svg><use xlink:href="#icon-hot_8a57f0b"></use></svg>([\s\S]*?)</span>'

    @staticmethod
    def __decode_body(raw):
        """Return the response bytes ``raw`` as UTF-8 text.

        A body that starts with the gzip magic bytes ``1f 8b`` is
        decompressed first; any other body is decoded as it is.
        """
        if raw[:2] == b'\x1f\x8b':
            raw = gzip.decompress(raw)
        return raw.decode('utf-8')

    def __fetch_content(self):
        """Download ``Spider.url`` and return the page as text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()
        return Spider.__decode_body(raw)

    def __analysis(self, htmls):
        """Extract the raw name/number matches from the page.

        Only every second ``root_pattern`` match (indices 1, 3, 5, ...) is
        parsed. NOTE(review): presumably each card produces two
        "DyListCover-info" matches and the second one carries the data --
        confirm against the live markup.

        Returns a list of ``{'name': [...], 'number': [...]}`` dicts whose
        values are the (possibly empty) ``re.findall`` results.
        """
        blocks = re.findall(Spider.root_pattern, htmls)
        return [
            {
                'name': re.findall(Spider.name_pattern, block),
                'number': re.findall(Spider.number_pattern, block),
            }
            for block in blocks[1::2]
        ]

    def __refine(self, anchors):
        """Flatten each record to its stripped first name and first number.

        Records for which either pattern found nothing are skipped instead
        of raising ``IndexError``.
        """
        return [
            {'name': anchor['name'][0].strip(), 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return the anchors sorted by popularity, most popular first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert the popularity text of ``anchor`` into a float sort key.

        The first decimal number in the text is used (so "1.5万" keeps its
        fractional part) and is multiplied by 10000 when the text contains
        '万'. Text without any digits yields ``0.0``.
        """
        text = anchor['number']
        # Thousands separators would otherwise cut the number short.
        found = re.search(r'\d+(?:\.\d+)?', text.replace(',', ''))
        if found is None:
            return 0.0
        value = float(found.group())
        if '万' in text:
            value *= 10000
        return value

    def __show(self, anchors):
        """Print one ranking line per anchor, numbered from 1."""
        for position, anchor in enumerate(anchors, start=1):
            print('主播排名第' + str(position) + ': '
                  + anchor['name'] + '  ' + anchor['number'])

    def go(self):
        """Run the whole pipeline: fetch, parse, refine, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


spider = Spider()
# Run the Douyu crawl: fetch the page, rank the anchors and print the result.
spider.go()

爬虫虎牙斗鱼网站主播人气和姓名相关推荐

爬虫虎牙网站lol主播人气和姓名（附人气值排名新方法）
爬虫虎牙网站lol主播人气和姓名(附人气值排名新方法) 大家好,我是bd,新人小白,这是我第一次尝试将自己对于爬虫知识的一些心得分享给大家.由于是最近刚接触python,所以在学习完爬虫这个知识点后自 ...
爬取斗鱼LOL主播人气数据，并显示排行榜 [网络爬虫] [应用案例][请求头][模块]
您的"关注"和"点赞",是信任,是认可,是支持,是动力- 如意见相左,可留言. 本人必将竭尽全力试图做到准确和全面,终其一生进行修改补充更新. 文章目录 1 爬 ...
python刷直播人气_python3爬取斗鱼某些版块的主播人气
python 3爬取斗鱼某些版块的主播人气 1.爬虫版块 import Test3 import urllib.request from bs4 import BeautifulSoup import ...
python斗鱼抽奖_python3爬取斗鱼某些版块的主播人气
python 3爬取斗鱼某些版块的主播人气 1.爬虫版块 import Test3 import urllib.request from bs4 import BeautifulSoup import ...
爬取虎牙游戏主播人气分析实战
爬取虎牙游戏主播人气分析实战: 在写的时候遇到了几个坑,然后又填上了: 第一个是: 开头我想爬取主播人气:但是找不到 div 只找到 span 遇到下一个</span>就结束了无奈只能找 ...
python原生爬虫爬取熊猫TV LOL主播人气排行
本文采取Python原生爬虫,没有采用常用的爬虫框架,比较适合新手练手. 首先进入熊猫TV英雄联盟主页----https://www.panda.tv/cate/lol?pdt=1.24.s1.2.4 ...
斗鱼美女主播封面爬取 python request urlretrieve jsonpath 爬虫
斗鱼美女主播封面爬取 python request urlretrieve jsonpath 爬虫输出结果
《原生爬虫》爬取某直播平台某分类下的主播人气，生成排行榜
此原生爬虫项目旨在爬取直播平台(本项目以pandaTV为例)上某分类(本项目为Dota2)的主播人气,进行排行.得出排行榜. 首先对爬虫的框架进行分析,得到以下思维导图: 从思维导图中可以得到该爬虫应 ...
Python3原生爬虫获取熊猫直播某一分类下的主播人气并保存到Excel
import re import openpyxl from urllib import request# 断点调试class Spider:url = 'https://www.panda.tv/c ...

爬虫虎牙斗鱼网站主播人气和姓名

爬虫虎牙斗鱼网站主播人气和姓名相关推荐

最新文章

热门文章