导入包

import requests
import re
import json
from bs4 import BeautifulSoup
import datetime
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger
from apscheduler.schedulers.background import BackgroundScheduler
import pandas as pd
from snownlp import SnowNLP
import numpy as np

获取自己要爬取的网站地址、cookie和User-Agent

# Crawl configuration: fill these in with your own values before running.
base_url = ''  # URL of the page to crawl
cookie = ''  # Cookie string copied from your own browser session
headers = {
    'User-Agent': '',  # User-Agent string of your own browser
    'Cookie': cookie,
    # NOTE(review): requests can only decode 'br' responses when a Brotli
    # package is installed -- confirm, or drop 'br' if pages come back garbled.
    'Accept-Encoding': 'gzip, deflate, br',
}

函数封装

所有函数封装在一个spider类里面，模块化开发。
Timed_crawling（）函数：实现定时爬取功能，使用apscheduler模块。
apscheduler 模块中的 BlockingScheduler 类和 BackgroundScheduler 类都可以用来定时执行爬取任务，区别在于 BlockingScheduler 是阻塞的：调用 start() 之后，在调度结束之前不会继续执行后面的代码，在 Jupyter Notebook 中可以看到单元格一直显示 *。如果想继续运行后面的代码、同时让爬虫在后台运行，可以把 BlockingScheduler 换成 BackgroundScheduler，即 scheduler = BackgroundScheduler(timezone='Asia/Shanghai')。
data1=re.compile('target="_blank">(.+)</a>') 和 data2=data1.findall(soup)[1:-2] 这两句代码的意思是：在得到网页的 HTML 内容 soup 之后，提取所有以 target="_blank"> 开头、以 </a> 结尾的中间内容，再用 [1:-2] 去掉第一个和最后两个匹配项，这里截取到的就是热搜内容。当然，你也可以根据想获取的内容改用其他正则表达式。
情感分析使用的是snownlp模块，可以输出情感分数，越高越积极，越低越消极。

class spider():def __init__(self, base_url, cookie,headers):self.base_url = base_urlself.cookie = cookieself.headers=headersdef web_crawler(self):#爬虫response = requests.get(self.base_url, headers=self.headers)response.raise_for_status()response.encoding = response.apparent_encodingdata = response.textsoup = str(BeautifulSoup(data,'lxml'))#解析data1=re.compile('target="_blank">(.+)</a>')data2=data1.findall(soup)[1:-2]print(datetime.datetime.now(),len(data2))print(data2)print("******************************************************************")file = open('wb_result.txt','a',encoding='utf-8')for i in data2:file.write( str(datetime.datetime.now())[:19]+"," )file.write( i+"\n" )# 关闭打开的文件file.close()def Timed_crawling(self):#定时调度，改时间就可scheduler = BlockingScheduler(timezone='Asia/Shanghai')scheduler.add_job(self.web_crawler, 'interval', seconds=900,start_date='2023-04-16 12:17:00',end_date='2023-04-16 12:18:00')#scheduler.remove_job(0)scheduler.start()#scheduler.shutdown(wait=False)def data(self):#数据读取与处理df = pd.read_csv("wb_result.txt", sep=",", names=["time", "hot_word"])return dfdef Sentiment_analysis(self,df):#情感分析E_word=list(set(df["hot_word"]))E_result={}for i in E_word:E_result[i]=SnowNLP(i).sentimentsE_result=pd.Series(E_result)Most_negative=E_result.sort_values(ascending=False)[-3:].reset_index()most_positive=E_result.sort_values(ascending=False)[:3].reset_index() Most_negative.columns=["Most_negative_hotword","scores"]Most_negative=Most_negative.sort_values(by=['scores'],ascending=True)most_positive.columns=["most_positive_hotword","scores"]Most_negative.index=["第一名","第二名","第三名"]most_positive.index=["第一名","第二名","第三名"]print("最正面的3条和最负面的3条热搜如下")display(pd.concat([Most_negative,most_positive],axis=1,join='inner'))def Hot_search_queries(self,df):#热搜查询hot_search_statistics=pd.DataFrame()for i in 
list(set(df.time)):hot=df[df["time"]==i].hot_wordhot=pd.DataFrame(hot.values,columns=[i])hot_search_statistics=pd.concat([hot_search_statistics,hot],axis=1)hot_search_statistics=hot_search_statistics.sort_index(axis=1)print("历史某节点热搜榜单:\n -----------------")hot_search_statistics.index=hot_search_statistics.index.values+1hot_search_statistics.index.name="rank"display(hot_search_statistics)def length_on_list(self,df):#在榜时长length_on_list_total={}for t in list(set(df.hot_word)):#print(t)L=df[df["hot_word"]==t].time.to_list()i=1length_on_list=0while i<len(L)-1:end_time=datetime.datetime.strptime(L[i+1], "%Y-%m-%d %H:%M:%S")#print(end_time)start_time=datetime.datetime.strptime(L[i], "%Y-%m-%d %H:%M:%S")#print(start_time)#print((end_time-start_time).seconds)if (end_time-start_time).seconds==900:length_on_list=length_on_list+900i=i+1if length_on_list==0:length_on_list_total[t]="小于15分钟"else:length_on_list_total[t]=length_on_list/60print("在榜时长：\n-----------------")    display(pd.DataFrame({"hot_word":length_on_list_total.keys(),"on_list(min)":length_on_list_total.values()}) )

函数调用

# Build the crawler, collect data on the schedule, then load what was saved.
weibo_spider = spider(base_url, cookie, headers)
weibo_spider.Timed_crawling()
df = weibo_spider.data()

# Run each statistics report, in order, on the loaded data.
for report in (
    weibo_spider.Sentiment_analysis,
    weibo_spider.Hot_search_queries,
    weibo_spider.length_on_list,
):
    report(df)

此代码可以在 Jupyter Notebook 中运行。若只想跑通爬虫部分，可以删除 weibo_spider.Sentiment_analysis(df)、weibo_spider.Hot_search_queries(df)、weibo_spider.length_on_list(df) 这三个函数调用，因为它们只用于统计分析。

简单又详细的网页爬虫案例相关推荐

python网页爬虫漫画案例_python实现网络段子页爬虫案例
网上的Python教程大都是2.X版本的,python2.X和python3.X相比较改动比较大,好多库的用法不太一样,我安装的是python3.X,我们来看看详细的例子 0x01 春节闲着没事(是有 ...
最简单的爬虫案例开发，Python原生爬虫
大家好,我是小帅今天给大家来点最简单的爬虫案例,后续我会再给大家更新一些爬虫相关技能点,请持续关注,另外你的三连是对小帅最大的支持不过声明一下小帅发的所有案例都是供大家学习的,不要随便乱用,或者 ...
python爬虫简单实例-最简单的Python爬虫案例，看得懂说明你已入门，附赠教程
原标题:最简单的Python爬虫案例,看得懂说明你已入门,附赠教程这是最简单的Python爬虫案例,如果你能看懂,那么请你保持信心,因为你已经入门Python爬虫,只要带着信心和努力,你的技术能力在 ...
实战|Python轻松实现动态网页爬虫(附详细源码)
用浅显易懂的语言分享爬虫.数据分析及可视化等干货,希望人人都能学到新知识. 项目背景事情是这样的,前几天我公众号写了篇爬虫入门的实战文章,叫做<实战|手把手教你用Python爬虫(附详细源码) ...
python实现监控电脑打开网页_Python轻松实现动态网页爬虫(附详细源码)
AJAX动态加载网页一什么是动态网页 J哥一向注重理论与实践相结合,知其然也要知其所以然,才能以不变应万变. 所谓的动态网页,是指跟静态网页相对的一种网页编程技术.静态网页,随着html代码的生成 ...
beautifulsoup解析动态页面div未展开_实战|Python轻松实现动态网页爬虫(附详细源码)...
用浅显易懂的语言分享爬虫.数据分析及可视化等干货,希望人人都能学到新知识.项目背景事情是这样的,前几天我公众号写了篇爬虫入门的实战文章,叫做<实战|手把手教你用Python爬虫(附详细源码)&g ...
Python轻松实现动态网页爬虫(附详细源码)！
AJAX动态加载网页一什么是动态网页 J哥一向注重理论与实践相结合,知其然也要知其所以然,才能以不变应万变. 所谓的动态网页,是指跟静态网页相对的一种网页编程技术.静态网页,随着html代码的生成 ...
Python之网络爬虫（selenium爬取动态网页、爬虫案例分析、哈希算法与RSA加密）
文章目录一.selenium爬取动态网页二.爬虫案例分析三.哈希hash算法与RSA加密一.selenium爬取动态网页 1.动态网页认知爬虫其实就是在模仿浏览器的行为应对要多次数据的交互 ...
python制作简单网页_Python制作简单的网页爬虫
1.准备工作: 工欲善其事必先利其器,因此我们有必要在进行Coding前先配置一个适合我们自己的开发环境,我搭建的开发环境是: 操作系统:Ubuntu 14.04 LTS Python版本:2.7.6 ...

简单又详细的网页爬虫案例

导入包

获取自己要爬取的网站地址、cookie和User-Agent

函数封装

函数调用

简单又详细的网页爬虫案例相关推荐

最新文章

热门文章

简单又详细的网页爬虫案例

导入包

获取自己要爬取的网站地址 、cookie和User-Agent

函数封装

函数调用

简单又详细的网页爬虫案例相关推荐

最新文章

热门文章

获取自己要爬取的网站地址、cookie和User-Agent