Open the JSON returned by the answer API endpoint that we analyzed in the previous post.
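For reference, the parts of that JSON the spider actually reads look roughly like this (a minimal sketch with placeholder values; the real payload carries many more keys):

    # skeleton of the answer API response, limited to the fields parse_answer uses
    answer_response = {
        "paging": {
            "is_end": False,  # True once the last page has been reached
            "next": "https://www.zhihu.com/api/v4/questions/.../answers?...&offset=20",
        },
        "data": [
            {
                "id": 1,
                "url": "https://www.zhihu.com/api/v4/answers/1",
                "question": {"id": 100},
                "author": {"id": "abc"},  # "id" is absent for anonymous users
                "content": "<p>...</p>",  # may be absent in some cases
                "voteup_count": 0,
                "comment_count": 0,
                "created_time": 1500000000,
                "updated_time": 1500000000,
            },
        ],
    }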

With this structure in hand, we can easily pull out the concrete fields of each answer, as follows:

    def parse_answer(self, response):
        # process the answers of a question
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]

        # extract the concrete fields of each answer
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            # "id" is missing from "author" when the user is anonymous
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            # "content" can also be absent in some cases
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()
            yield answer_item

        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

A breakpoint test confirms that the fields are extracted successfully.
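To reproduce the breakpoint test, the spider can be launched in-process under the IDE's debugger through a small entry script. A minimal sketch, assuming it sits in the project root (the file name main.py is my choice):

    # main.py - run the spider in-process so breakpoints in parse_answer fire
    import os
    import sys

    from scrapy.cmdline import execute

    # make the project package importable when launched from the IDE
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))

    execute(["scrapy", "crawl", "zhihu"])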

At this point, the entire Zhihu crawling logic is implemented. The complete code:

    # -*- coding: utf-8 -*-
    import re
    import json
    import datetime

    try:
        import urlparse as parse
    except ImportError:
        from urllib import parse

    import scrapy
    from scrapy.loader import ItemLoader
    from items import ZhihuQuestionItem, ZhihuAnswerItem


    class ZhihuSpider(scrapy.Spider):
        name = "zhihu"
        allowed_domains = ['www.zhihu.com']
        start_urls = ['https://www.zhihu.com/']

        # request url for the first page of a question's answers
        start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

        headers = {
            "HOST": "www.zhihu.com",
            "Referer": "https://www.zhihu.com",
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
        }

        custom_settings = {
            "COOKIES_ENABLED": True
        }

        def parse(self, response):
            """
            Extract all urls from the html page and follow them for further crawling.
            If a url matches the /question/xxx format, download it and hand it
            straight to the question parser.
            """
            all_urls = response.css("a::attr(href)").extract()
            all_urls = [parse.urljoin(response.url, url) for url in all_urls]
            all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls)
            for url in all_urls:
                match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
                if match_obj:
                    # a question page: download it and let the extraction function handle it
                    request_url = match_obj.group(1)
                    yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
                else:
                    # not a question page: keep following links
                    yield scrapy.Request(url, headers=self.headers, callback=self.parse)

        def parse_question(self, response):
            # process a question page and extract the concrete question item
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            if "QuestionHeader-title" in response.text:
                # new-style page
                item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
                item_loader.add_css("title", "h1.QuestionHeader-title::text")
                item_loader.add_css("content", ".QuestionHeader-detail")
                item_loader.add_value("url", response.url)
                item_loader.add_value("zhihu_id", question_id)
                item_loader.add_css("answer_num", ".List-headerText span::text")
                item_loader.add_css("comments_num", ".QuestionHeaderActions button::text")
                item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
                item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
                question_item = item_loader.load_item()
            else:
                # old-style page: extract the item with the legacy selectors
                item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
                item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
                item_loader.add_css("content", "#zh-question-detail")
                item_loader.add_value("url", response.url)
                item_loader.add_value("zhihu_id", question_id)
                item_loader.add_css("answer_num", "#zh-question-answer-num::text")
                item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
                item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
                item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
                question_item = item_loader.load_item()

            yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                                 headers=self.headers, callback=self.parse_answer)
            yield question_item

        def parse_answer(self, response):
            # process the answers of a question
            ans_json = json.loads(response.text)
            is_end = ans_json["paging"]["is_end"]
            next_url = ans_json["paging"]["next"]

            # extract the concrete fields of each answer
            for answer in ans_json["data"]:
                answer_item = ZhihuAnswerItem()
                answer_item["zhihu_id"] = answer["id"]
                answer_item["url"] = answer["url"]
                answer_item["question_id"] = answer["question"]["id"]
                # "id" is missing from "author" when the user is anonymous
                answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
                # "content" can also be absent in some cases
                answer_item["content"] = answer["content"] if "content" in answer else None
                answer_item["praise_num"] = answer["voteup_count"]
                answer_item["comments_num"] = answer["comment_count"]
                answer_item["create_time"] = answer["created_time"]
                answer_item["update_time"] = answer["updated_time"]
                answer_item["crawl_time"] = datetime.datetime.now()
                yield answer_item

            # asynchronous I/O: the next page is scheduled through the callback
            if not is_end:
                yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

        def start_requests(self):
            # log in with selenium first, then hand the session cookies to scrapy
            from selenium import webdriver
            browser = webdriver.Chrome(executable_path="C:/Users/Fitz/Desktop/software/chromedriver.exe")
            browser.get("https://www.zhihu.com/signin")
            browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper Input").send_keys("your account")
            browser.find_element_by_css_selector(".SignFlow-password Input").send_keys("your password")
            browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()

            import time
            time.sleep(10)

            Cookies = browser.get_cookies()
            print(Cookies)
            cookie_dict = {}
            import pickle
            for cookie in Cookies:
                # write each cookie to a file
                f = open('C:/Users/Fitz/Desktop/scrapy/ArticleSpider/cookies/zhihu/' + cookie['name'] + '.zhihu', 'wb')
                pickle.dump(cookie, f)
                f.close()
                cookie_dict[cookie['name']] = cookie['value']
            browser.close()
            return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict, headers=self.headers)]
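The spider imports ZhihuQuestionItem and ZhihuAnswerItem from items.py, which is not shown above. A minimal sketch of what those definitions need to contain, with the field names inferred from the loaders and assignments in the spider (input/output processors omitted):

    # items.py - minimal field declarations matching the spider above
    import scrapy


    class ZhihuQuestionItem(scrapy.Item):
        zhihu_id = scrapy.Field()
        topics = scrapy.Field()
        url = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()
        answer_num = scrapy.Field()
        comments_num = scrapy.Field()
        watch_user_num = scrapy.Field()


    class ZhihuAnswerItem(scrapy.Item):
        zhihu_id = scrapy.Field()
        url = scrapy.Field()
        question_id = scrapy.Field()
        author_id = scrapy.Field()
        content = scrapy.Field()
        praise_num = scrapy.Field()
        comments_num = scrapy.Field()
        create_time = scrapy.Field()
        update_time = scrapy.Field()
        crawl_time = scrapy.Field()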
