Python 知乎首页文章数据爬取

一、知识点总结和操作步骤以及现存问题

二、源码展示

import urllib.request
import gzip
import io
import random
import threading
import time
import pandas
import json
import sqlite3

# Pool of browser User-Agent strings; one entry is picked with random.choice()
# when the request headers are built.
user_agent_set = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36',
]

# An IP proxy pool could be plugged in here (the author assembled proxies by
# hand; a paid tunnel service is an alternative).

# Request headers
# HTTP headers sent with every request to the feed API.
header = {
    'Host': 'www.zhihu.com',
    'Connection': 'keep-alive',
    'x-ab-param': 'qap_question_author=0;ls_video_commercial=0;li_sp_mqbk=0;li_vip_verti_search=0;li_panswer_topic=0;qap_question_visitor= 0;tp_contents=1;zr_expslotpaid=1;zr_intervene=0;li_edu_page=old;pf_profile2_tab=0;li_paid_answer_exp=0;tp_zrec=1;pf_adjust=1;se_ffzx_jushen1=0;top_test_4_liguangyi=1;zr_slotpaidexp=2;tp_dingyue_video=0;tp_topic_style=0;pf_noti_entry_num=2;li_video_section=1',
    'x-ab-pb': 'Ck49DPMLJgwPC+QKWAvXC1IM4AsnDEsLrAsgDEwLuQvPC0sMtAo+DJYL7Ao3DAAMmwvhC5oLhgsHDAELUgu1CyIMIQxgCzQM9AtWDA8M3AsSJwAAAAAAAAAAAAEBAQAAAQsAAAAAAQEAAgEAAQEBAQMAAQABAAAAAA==',
    'x-api-version': '3.0.53',
    # Evaluated once, when this dict is built: every request made through
    # this dict carries the same randomly chosen User-Agent.
    'User-Agent': random.choice(user_agent_set),
    'x-zse-86': '1.0_a_x0Hh9y6TxpNg28G0YBeAr8r_YpS8YyzBYq67U8cLSp',
    'x-requested-with': 'fetch',
    'x-zse-83': '3_2.0',
    'Accept': '*/*',
    'Referer': 'https://www.zhihu.com/',
    # Only gzip is advertised: request_data() only knows how to gunzip a
    # response body, so offering "deflate" or "br" would invite replies the
    # script cannot decode.
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    # Redacted in the published source; supply a real session cookie.
    'Cookie': '**************************************************',
}

# Global variables
# result      -- storage for the results
# thread_set  -- storage for the thread objects
# url_set     -- collection of request URLs
result, thread_set, url_set = [], [], []

# Lock shared by the worker threads.
threadLock = threading.Lock()
# Parsed records accumulated by data_parsing(); each item is a dict with the
# keys '标题', '作者', '点赞数' and '内容简介'.
result_information = []

# SQLite file used by both creat_sqlite() and insert_sqlite(), so rows are
# inserted into the same database in which the table was created.
DB_PATH = 'Zhihudata.db'

# Feed endpoint; {page} and {after_id} are filled in by get_url().
_FEED_URL = ('https://www.zhihu.com/api/v3/feed/topstory/recommend'
             '?session_token=c1b01d7c7522284a68ecc9cbbbf3748e&desktop=true'
             '&page_number={page}&limit=6&action=down&after_id={after_id}'
             '&ad_interval=-1')


class Tread_zhihu(threading.Thread):
    """Worker thread that downloads one contiguous slice of ``url_set``.

    Args:
        threadID: Identifier, used only in the start-up log line.
        name: Thread name.
        start_index: First index of ``url_set`` to fetch (inclusive).
        end_index: Index at which to stop (exclusive).
    """

    def __init__(self, threadID, name, start_index, end_index):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        """Print a start-up line, then fetch the assigned slice."""
        print(str(self.threadID) + str(self.name) + time.ctime())
        # The lock is taken inside get_data() only around the append to the
        # shared list. Holding it here for the whole download would make the
        # threads run one after another.
        get_data(self.start_index, self.end_index)


def get_data(start_index, end_index):
    """Fetch ``url_set[start_index:end_index]`` and collect the decoded JSON.

    Each response is decoded with ``json.loads``, printed, and appended to
    the module-level ``result`` list while holding ``threadLock``.

    Returns:
        The shared ``result`` list.
    """
    for i in range(start_index, end_index):
        page = json.loads(request_data(url_set[i]))
        print(page)
        with threadLock:  # released even if append raises
            result.append(page)
    return result


def get_url(page_count=16):
    """Append ``page_count`` feed URLs to ``url_set`` and return it.

    The i-th URL (i starting at 0) uses ``page_number = i + 2`` and
    ``after_id = 5 + 6 * i``.

    Args:
        page_count: Number of URLs to generate (default 16).

    Returns:
        The module-level ``url_set`` list.
    """
    url_set.extend(
        _FEED_URL.format(page=i + 2, after_id=5 + i * 6)
        for i in range(page_count)
    )
    return url_set


def request_data(url, timeout=30):
    """Download ``url`` with the module-level ``header`` and return the text.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the socket before giving up.

    Returns:
        The response body decoded as UTF-8.
    """
    req = urllib.request.Request(url, headers=header)
    # The context manager closes the connection even when read() fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        data = response.read()
    # Gunzip only when the body starts with the gzip magic number, so an
    # uncompressed reply is decoded as-is instead of raising.
    if data[:2] == b'\x1f\x8b':
        data = gzip.decompress(data)
    return data.decode('utf-8')


def _dig(container, *keys, default):
    """Follow ``keys`` through nested containers.

    Returns ``default`` when a key or index is missing, a level cannot be
    subscripted, or the final value is None.
    """
    current = container
    for key in keys:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            return default
    return default if current is None else current


def data_parsing(result):
    """Extract title, author, vote count and excerpt from the raw pages.

    Every element of each page's ``'data'`` list yields one record, which is
    appended to the module-level ``result_information`` list. Missing fields
    fall back to '无题' (title) or 'default' (all other fields).

    Args:
        result: Iterable of decoded JSON pages (dicts with a ``'data'`` list).

    Returns:
        The module-level ``result_information`` list.
    """
    for datax in result:
        # Iterate the items themselves, not the page's top-level keys.
        for entry in _dig(datax, 'data', default=[]):
            information = {
                '标题': _dig(entry, 'target', 'title', default='无题'),
                '作者': _dig(entry, 'target', 'author', 'name',
                           default='default'),
                '点赞数': _dig(entry, 'target', 'voteup_count',
                            default='default'),
                '内容简介': _dig(entry, 'target', 'excerpt_new',
                             default='default'),
            }
            result_information.append(information)
    return result_information


def data_save(result_information):
    """Write the parsed records to ``./out.xls``, one column per field.

    Args:
        result_information: List of record dicts as built by data_parsing().
    """
    columns = ['标题', '作者', '点赞数', '内容简介']
    df = pandas.DataFrame(result_information, columns=columns)
    # NOTE(review): recent pandas releases may no longer ship an .xls writer;
    # if this raises, switch the target to .xlsx -- confirm against the
    # installed pandas version.
    df.to_excel('./out.xls', index=False)


def creat_sqlite(db_path=DB_PATH):
    """Create ``table_one`` in the SQLite database if it does not exist.

    On a database error the exception and '创表失败' are printed and the
    function returns normally.

    Args:
        db_path: SQLite file to open (defaults to ``DB_PATH``).
    """
    sql = ('CREATE TABLE IF NOT EXISTS table_one('
           'Serial_number INTEGER PRIMARY KEY AUTOINCREMENT,'
           'Title varchar(30) NOT NULL,Author varchar(30) NOT NULL,'
           'Voteup_count varchar(30) NOT NULL,'
           'Content_introduce varchar(30) NOT NULL)')
    con = sqlite3.connect(db_path)
    try:
        con.execute(sql)
        con.commit()
    except sqlite3.Error as e:
        print(e)
        print('创表失败')
    finally:
        con.close()


def insert_sqlite(result_information, db_path=DB_PATH):
    """Insert the parsed records into ``table_one``.

    All rows are written in one transaction: either every row is committed
    or, if an insert fails, none are and the exception propagates.

    Args:
        result_information: List of record dicts; the keys '标题', '作者',
            '点赞数' and '内容简介' are bound to the named SQL parameters.
        db_path: SQLite file to open (defaults to ``DB_PATH``).
    """
    sql = ('insert into table_one(Title,Author,Voteup_count,Content_introduce)'
           'values(:标题,:作者,:点赞数,:内容简介)')
    con = sqlite3.connect(db_path)
    try:
        with con:  # commit on success, roll back on error
            con.executemany(sql, result_information)
    finally:
        con.close()


def _split_ranges(total, parts):
    """Split ``range(total)`` into at most ``parts`` contiguous (start, end)
    pairs that cover every index exactly once."""
    if total <= 0:
        return []
    step = -(-total // max(1, parts))  # ceiling division
    return [(start, min(start + step, total))
            for start in range(0, total, step)]


def main():
    """Build the URLs, download them with worker threads, then parse and
    store the results."""
    get_url()
    concurrent_num = 4  # number of worker threads
    slices = _split_ranges(len(url_set), concurrent_num)
    for thread_id, (start, end) in enumerate(slices, start=1):
        thread_set.append(
            Tread_zhihu(thread_id, "Thread-" + str(thread_id), start, end))

    # Start every worker, then wait for all of them to finish.
    for thread in thread_set:
        thread.start()
    for thread in thread_set:
        thread.join()
    print('data request is over')

    data_parsing(result)              # parse the raw pages
    data_save(result_information)     # write the spreadsheet
    creat_sqlite()                    # make sure the table exists
    insert_sqlite(result_information)  # store the rows in SQLite


if __name__ == '__main__':
    main()

三、成果展示

四、现存问题

1. 内容简介中的特殊字符尚未处理。

2. SQLite 单张表插入 1000 条数据后无法继续插入，导致爬取下来的数据不能全部存入数据库。

3.欢迎大佬批评指正。

python知乎首页文章数据爬取相关推荐

python爬虫实例——某二手车数据爬取
某二手车网站数据爬取要求: 找到所要爬取的网站网址(url): 今天案例的网址(url):https://www.guazi.com/gy/dazhong/o1/#bread. 观察网站,点开检查, ...
Python爬虫|高德地图地铁数据爬取与制图
目录一.高德地图数据爬取 1.爬取思路 2.python核心代码二.Arcmap制图一.高德地图数据爬取 1.爬取思路首先,谷歌浏览器打开高德地图官网,点击上方菜单栏地铁进入地铁线路网站如下, ...
[Python]百度慧眼人口热力图数据爬取--以深圳市为例
百度慧眼人口热力图数据爬取--以深圳市为例数据爬取坐标转换 1.读取坐标映射表 2.利用sklearn进行回归分析 3.坐标转换输出完整代码利用python爬取深圳市百度慧眼人口热力图数据,线 ...
python爬虫案例-陶瓷公司数据爬取
用requests爬取要注意HTTPConnectionPool(host=xxx, port=xxx): Max retries exceeded with url...异常,出现这个异常的解决方法 ...
Python爬虫 —— 以北京天气数据爬取为例
本文以北京天气为例讲解数据爬取的整个流程,不涉及网络爬虫的原理,直接讲爬取代码怎么写! 1.首先找到你要爬取的网站url:'http://www.tianqihoubao.com/lishi/beij ...
Python selenium Boss直聘数据爬取（仅供学习使用）
写在前面,因为最近刚好需要分析行业数据,又在查询时,发现了许多博主写了一些东西,但很多都已经失效了,所以写了那么一篇文章,希望能够帮到大家注:BOSS直聘数据为js加载数据,故使用selenium ...
Python爬虫应用实战-网站数据爬取及数据分析
实战一:中国大学排名前言由于上一篇文章中教会了大家如何存储数据,但是由于篇幅过大,就没有加入实战篇.想必大家也等着急了吧,所以今天就为大家带来两篇实战内容,希望可以帮助到各位更好的认识到爬虫与My ...
【Python学习】各国人口数据爬取
爬取网站:https://www.phb123.com/city/renkou/rk.html # 导入相关函数包 import requests from bs4 import BeautifulS ...
基于Python的bilibili会员购数据爬取
一.确定好需要爬取的网站二.右键检查网页源码,找到所需要爬取的数据所在的位置通过分析链接可得所需要爬取的数据都在这个页面,并且通过链接可以看到不通的页面page和不通的类型type之间都有差别,可 ...

python知乎首页文章数据爬取

python知乎首页文章数据爬取相关推荐

最新文章

热门文章