python 爬取知乎话题精华内容（爬取、知乎、精华）
import json
import csv
import requests
import re
import time
def getchina(str1):
    """Return only the Chinese characters of *str1*, concatenated in order.

    A character is kept when it falls in the CJK Unified Ideographs range
    U+4E00..U+9FA5 (same set as the original regex character class).
    """
    return ''.join(ch for ch in str1 if '\u4e00' <= ch <= '\u9fa5')
def gettime(timeStamp): # 将时间戳转为时间字符串
timeArray = time.localtime(timeStamp)
#otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
return otherStyleTime
def writecsv(data):
    """Append one CSV row per feed item in *data* to 'ifo.csv'.

    Each item is a Zhihu essence-feed entry. Answers carry no 'title' on the
    target: their title lives at target['question']['title'] and the timestamp
    at 'updated_time'; articles have 'title' and 'updated' directly on the
    target. Row layout matches the header written at module level:
    title, time, name, voteup, comment, content.
    """
    # Open the file ONCE with encoding='utf-8' — the original re-opened it for
    # every item and omitted the encoding, which corrupts output (or raises
    # UnicodeEncodeError) on platforms whose default encoding is not UTF-8,
    # and mismatches the utf-8 header row written at module level.
    with open('ifo.csv', 'a', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        for item in data:
            target = item['target']
            if 'title' not in target:
                # Answer entry: title comes from the parent question.
                title = target['question']['title']
                date = gettime(target['updated_time'])
            else:
                # Article entry: title/updated are on the target itself.
                title = target['title']
                date = gettime(target['updated'])
            csv_writer.writerow([title,
                                 date,
                                 target['author']['name'],
                                 target['voteup_count'],
                                 target['comment_count'],
                                 getchina(target['content'])])
# ---- script entry: crawl the Zhihu topic essence feed page by page into CSV ----

# Write the CSV header row once, before any data rows are appended.
with open('ifo.csv', 'a', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["title", "time", "name", "voteup", "comment", "content"])

# First page of the topic-21238418 essence feed (include= selects the fields used above).
url = "http://www.zhihu.com/api/v4/topics/21238418/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Canswer_type%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.paid_info%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset=0"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    # BUG FIX: the HTTP header is spelled "Referer" (the standard's own typo),
    # not "Refer" — the original key was silently ignored by the server.
    "Referer": "https://www.zhihu.com/",
}

resp = requests.get(url, headers=headers)
# resp.json() is equivalent to json.loads(resp.content.decode('utf-8')).
res = resp.json()
data = res['data']

count = 0
# The API signals the last page by returning a 'next' URL equal to the page
# just fetched; stop when that happens. (Page counter is progress output only.)
while res['paging']['next'] != url:
    count += 1
    print(count)
    writecsv(data)
    url = res['paging']['next']
    resp = requests.get(url, headers=headers)
    res = resp.json()
    data = res['data']
python爬知乎_python爬行求知。,爬取,知乎,精华相关推荐
- python爬虫电影信息_Python爬虫入门 | 爬取豆瓣电影信息
这是一个适用于小白的Python爬虫免费教学课程,只有7节,让零基础的你初步了解爬虫,跟着课程内容能自己爬取资源.看着文章,打开电脑动手实践,平均45分钟就能学完一节,如果你愿意,今天内你就可以迈入爬 ...
- python网络爬虫代理服务器_python爬虫如何抓取代理服务器
一年前突然有个灵感,想搞个强大的网盘搜索引擎,但由于大学本科学习软件工程偏嵌入式方向,web方面的能力有点弱,不会jsp,不懂html,好久没有玩过sql,但就是趁着年轻人的这股不妥协的劲儿,硬是把以 ...
- python爬虫贴吧_Python爬虫如何爬取贴吧内容
爬取贴吧内容 先了解贴吧url组成: 每个贴吧url都是以'https://tieba.baidu.com/f?'开头,然后是关键字 kw=''贴吧名字'',再后面是 &pn=页数 (pn=0 ...
- python爬虫实例手机_Python爬虫实现爬取京东手机页面的图片(实例代码)
实例如下所示: __author__ = 'Fred Zhao' import requests from bs4 import BeautifulSoup import os from urllib ...
- python爬虫金融数据_python爬虫项目-爬取雪球网金融数据(关注、持续更新)
(一)python金融数据爬虫项目 爬取目标:雪球网(起始url:https://xueqiu.com/hq#exchange=cn&firstname=1&secondname=1_ ...
- python爬虫外贸客户_python实战成功爬取海外批发商价格信息并写入记事本
运行平台:windows Python版本:Python 3.7.0 用到的第三方库:requests ,Beautiful Soup,re IDE:jupyter notebook 浏览器:Chro ...
- python 百度云盘 数据迁移_python爬虫,爬取百度云盘,找你兄弟的机器活塞运动原理文件?...
寻找并分析百度云的转存api 首先你得有一个百度云盘的账号,然后登录,用浏览器(这里用火狐浏览器做示范)打开一个分享链接.F12打开控制台进行抓包.手动进行转存操作:全选文件->保存到网盘-&g ...
- python外国网站爬虫_python 网络爬虫-爬取网页外部网站
前言 上一篇中我们在维基百科的内部网站上随机跳转进入文章类网页,而忽视外部网站链接.本篇文章将处理网站的外部链接并试图收集一些网站数据.和单个域名网站爬取不同,不同域名的网站结构千差万别,这就意味我们 ...
- python爬去新浪微博_Python 超简单爬取新浪微博数据 (高级版)
新浪微博的数据可是非常有价值的,你可以拿来数据分析.拿来做网站.甚至是*.不过很多人由于技术限制,想要使用的时候只能使用复制粘贴这样的笨方法.没关系,现在就教大家如何批量爬取微博的数据,大大加快数据迁 ...
最新文章
- 生产环境WEB服务管理脚本之日志检测脚本
- OSPF详解-2 区域结构
- 【博客话题】我的2011项目总结
- js代码收集(1)_隐藏div、table间隔样式设置
- json loads No JSON object could be decoded 问题解决
- python asyncio_Python 的异步 IO:Asyncio 简介
- koa --- [MVC实现之四]Router、Controller、Service的实现
- ref是什么意思_终于有人说出A股不敢说的话:为什么股价不断下跌,大单却持续流入,你知道是什么缘由吗?...
- 利用win7系统自身修复还原功能
- Python 解释器中使用help()命令如何退出
- 我有几个粽子,和一个故事
- ArcGIS行政区位图制作流程(附行政区划练习数据)
- C#语言与Java语言程序的比较[转自chinaitlab]
- AI视频增强,提高视频画面的清晰度
- 谈一谈凑单页的那些优雅设计
- c语言消消乐字母游戏代码,基于pygame的小游戏———数字消消乐
- QQ不能远程控制的解决办法
- 解放双手,Windows Admin Center简化服务器管理
- 凤舞丹心东方美,中华才女竞风采 ——黛兰娜杯《中华才女大赛》七月汇演
- 软件项目需求管理培训
热门文章
- 国外设计公司H5网站模板
- C Primer Plus 第六版编程练习第五章答案
- c语言程序电机,直流电机控制C语言程序
- 【trajectory optimization】1 intro
- java版铁傀儡刷新机制,我的世界:新版村庄的铁傀儡数量都快赶上村民了?刷新效率很高!...
- JS生成条形码/二维码 barcode.js、JsBarcode
- Online Learning and Pricing with Reusable Resources: Linear Bandits with Sub-Exponential Rewards: Li
- 爬取听书网有声小说音频数据
- 计算机 图像处理 ei 期刊,【EA-ISET协会】中科院3区视觉图像处理类SCIEI源刊征稿...
- 工商总局抽检电商 天猫1号店等仍存售假