Open the JSON returned by the answer API endpoint that we analyzed in the previous post.
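For reference, the parts of that JSON the spider actually reads look roughly like this (a minimal sketch with placeholder values; the real payload carries many more keys):

    # skeleton of the answer API response, limited to the fields parse_answer uses
    answer_response = {
        "paging": {
            "is_end": False,  # True once the last page has been reached
            "next": "https://www.zhihu.com/api/v4/questions/.../answers?...&offset=20",
        },
        "data": [
            {
                "id": 1,
                "url": "https://www.zhihu.com/api/v4/answers/1",
                "question": {"id": 100},
                "author": {"id": "abc"},  # "id" is absent for anonymous users
                "content": "<p>...</p>",  # may be absent in some cases
                "voteup_count": 0,
                "comment_count": 0,
                "created_time": 1500000000,
                "updated_time": 1500000000,
            },
        ],
    }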

With this structure in hand, we can easily pull out the concrete fields of each answer, as follows:

    def parse_answer(self, response):
        # process the answers of a question
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]

        # extract the concrete fields of each answer
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            # "id" is missing from "author" when the user is anonymous
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            # "content" can also be absent in some cases
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()
            yield answer_item

        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

A breakpoint test confirms that the fields are extracted successfully.
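To reproduce the breakpoint test, the spider can be launched in-process under the IDE's debugger through a small entry script. A minimal sketch, assuming it sits in the project root (the file name main.py is my choice):

    # main.py - run the spider in-process so breakpoints in parse_answer fire
    import os
    import sys

    from scrapy.cmdline import execute

    # make the project package importable when launched from the IDE
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))

    execute(["scrapy", "crawl", "zhihu"])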

At this point, the entire Zhihu crawling logic is implemented. The complete code:

    # -*- coding: utf-8 -*-
    import re
    import json
    import datetime

    try:
        import urlparse as parse
    except ImportError:
        from urllib import parse

    import scrapy
    from scrapy.loader import ItemLoader
    from items import ZhihuQuestionItem, ZhihuAnswerItem


    class ZhihuSpider(scrapy.Spider):
        name = "zhihu"
        allowed_domains = ['www.zhihu.com']
        start_urls = ['https://www.zhihu.com/']

        # request url for the first page of a question's answers
        start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

        headers = {
            "HOST": "www.zhihu.com",
            "Referer": "https://www.zhihu.com",
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
        }

        custom_settings = {
            "COOKIES_ENABLED": True
        }

        def parse(self, response):
            """
            Extract all urls from the html page and follow them for further crawling.
            If a url matches the /question/xxx format, download it and hand it
            straight to the question parser.
            """
            all_urls = response.css("a::attr(href)").extract()
            all_urls = [parse.urljoin(response.url, url) for url in all_urls]
            all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls)
            for url in all_urls:
                match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
                if match_obj:
                    # a question page: download it and let the extraction function handle it
                    request_url = match_obj.group(1)
                    yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
                else:
                    # not a question page: keep following links
                    yield scrapy.Request(url, headers=self.headers, callback=self.parse)

        def parse_question(self, response):
            # process a question page and extract the concrete question item
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            if "QuestionHeader-title" in response.text:
                # new-style page
                item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
                item_loader.add_css("title", "h1.QuestionHeader-title::text")
                item_loader.add_css("content", ".QuestionHeader-detail")
                item_loader.add_value("url", response.url)
                item_loader.add_value("zhihu_id", question_id)
                item_loader.add_css("answer_num", ".List-headerText span::text")
                item_loader.add_css("comments_num", ".QuestionHeaderActions button::text")
                item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
                item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
                question_item = item_loader.load_item()
            else:
                # old-style page: extract the item with the legacy selectors
                item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
                item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
                item_loader.add_css("content", "#zh-question-detail")
                item_loader.add_value("url", response.url)
                item_loader.add_value("zhihu_id", question_id)
                item_loader.add_css("answer_num", "#zh-question-answer-num::text")
                item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
                item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
                item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
                question_item = item_loader.load_item()

            yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                                 headers=self.headers, callback=self.parse_answer)
            yield question_item

        def parse_answer(self, response):
            # process the answers of a question
            ans_json = json.loads(response.text)
            is_end = ans_json["paging"]["is_end"]
            next_url = ans_json["paging"]["next"]

            # extract the concrete fields of each answer
            for answer in ans_json["data"]:
                answer_item = ZhihuAnswerItem()
                answer_item["zhihu_id"] = answer["id"]
                answer_item["url"] = answer["url"]
                answer_item["question_id"] = answer["question"]["id"]
                # "id" is missing from "author" when the user is anonymous
                answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
                # "content" can also be absent in some cases
                answer_item["content"] = answer["content"] if "content" in answer else None
                answer_item["praise_num"] = answer["voteup_count"]
                answer_item["comments_num"] = answer["comment_count"]
                answer_item["create_time"] = answer["created_time"]
                answer_item["update_time"] = answer["updated_time"]
                answer_item["crawl_time"] = datetime.datetime.now()
                yield answer_item

            # asynchronous I/O: the next page is scheduled through the callback
            if not is_end:
                yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

        def start_requests(self):
            # log in with selenium first, then hand the session cookies to scrapy
            from selenium import webdriver
            browser = webdriver.Chrome(executable_path="C:/Users/Fitz/Desktop/software/chromedriver.exe")
            browser.get("https://www.zhihu.com/signin")
            browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper Input").send_keys("your account")
            browser.find_element_by_css_selector(".SignFlow-password Input").send_keys("your password")
            browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()

            import time
            time.sleep(10)

            Cookies = browser.get_cookies()
            print(Cookies)
            cookie_dict = {}
            import pickle
            for cookie in Cookies:
                # write each cookie to a file
                f = open('C:/Users/Fitz/Desktop/scrapy/ArticleSpider/cookies/zhihu/' + cookie['name'] + '.zhihu', 'wb')
                pickle.dump(cookie, f)
                f.close()
                cookie_dict[cookie['name']] = cookie['value']
            browser.close()
            return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict, headers=self.headers)]
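The spider imports ZhihuQuestionItem and ZhihuAnswerItem from items.py, which is not shown above. A minimal sketch of what those definitions need to contain, with the field names inferred from the loaders and assignments in the spider (input/output processors omitted):

    # items.py - minimal field declarations matching the spider above
    import scrapy


    class ZhihuQuestionItem(scrapy.Item):
        zhihu_id = scrapy.Field()
        topics = scrapy.Field()
        url = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()
        answer_num = scrapy.Field()
        comments_num = scrapy.Field()
        watch_user_num = scrapy.Field()


    class ZhihuAnswerItem(scrapy.Item):
        zhihu_id = scrapy.Field()
        url = scrapy.Field()
        question_id = scrapy.Field()
        author_id = scrapy.Field()
        content = scrapy.Field()
        praise_num = scrapy.Field()
        comments_num = scrapy.Field()
        create_time = scrapy.Field()
        update_time = scrapy.Field()
        crawl_time = scrapy.Field()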
