I. The deduplication (dupefilter) component

  

  Scrapy deduplicates requests with a set() by default; the retained fingerprints can instead be stored in Redis.

  The default implementation is this class: from scrapy.dupefilter import RFPDupeFilter
     

  a. In the spider: yield Request(..., dont_filter=False) (the default, so the request goes through the dupefilter)

  b. The class:

    from scrapy.dupefilter import BaseDupeFilter
    from scrapy.utils.request import request_fingerprint
    import redis

    class XzxDupefilter(BaseDupeFilter):

        def __init__(self, key):
            self.conn = None
            self.key = key

        @classmethod
        def from_settings(cls, settings):
            key = settings.get('DUP_REDIS_KEY')
            return cls(key)

        def open(self):
            self.conn = redis.Redis(host='127.0.0.1', port=6379)

        def request_seen(self, request):
            fp = request_fingerprint(request)
            # sadd returns 0 if the fingerprint was already in the set
            added = self.conn.sadd(self.key, fp)
            return added == 0

  c. Configure it in settings:

    # default dupefilter
    # DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
    DUPEFILTER_CLASS = 'xzx.dupfilter.XzxDupefilter'  # a custom class is allowed

  This helper gives each request URL a unique fingerprint:

     from scrapy.utils.request import request_fingerprint
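
  A quick way to see what the fingerprint buys you: two URLs that differ only in query-string order produce the same fingerprint, so they count as one request. A minimal sketch using only standard Scrapy imports:

    from scrapy.http import Request
    from scrapy.utils.request import request_fingerprint

    # The URL is canonicalized first, so query-parameter order is irrelevant.
    r1 = Request(url='http://example.com/?a=1&b=2')
    r2 = Request(url='http://example.com/?b=2&a=1')
    print(request_fingerprint(r1) == request_fingerprint(r2))  # True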

  Supplement: the scheduler decides whether to enqueue a request with this logic (excerpt from the Scheduler class):

    def enqueue_request(self, request):
        # dont_filter=True  -> "not request.dont_filter" is False -> the dupefilter is never consulted
        # dont_filter=False -> "not request.dont_filter" is True  -> request_seen() decides
        if not request.dont_filter and self.df.request_seen(request):
            return False
        # otherwise push the request onto the scheduler's queue
        dqok = self._dqpush(request)
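
  From the spider side, dont_filter=True is therefore the switch that bypasses the dedup check entirely. A minimal sketch (spider name and URLs are placeholders):

    import scrapy

    class DemoSpider(scrapy.Spider):
        name = 'demo'  # hypothetical spider
        start_urls = ['http://example.com']

        def parse(self, response):
            # dont_filter=True skips request_seen(), so this URL is
            # scheduled even if an identical request was seen before.
            yield scrapy.Request(url='http://example.com/page',
                                 callback=self.parse,
                                 dont_filter=True)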

 II. The scheduler

  1. Breadth-first (essentially a FIFO queue)

  2. Depth-first (essentially a LIFO stack)

  3. Priority queue (e.g. a Redis sorted set)
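
  To make the third option concrete, a priority queue can be built on a Redis sorted set, in the spirit of what scrapy-redis does. A minimal sketch (the key name and score convention are assumptions for illustration):

    import redis

    conn = redis.Redis(host='127.0.0.1', port=6379)
    KEY = 'demo:requests'  # hypothetical key name

    def push(url, priority):
        # Lower scores pop first, so negate the priority:
        # higher-priority requests get smaller scores.
        conn.zadd(KEY, {url: -priority})

    def pop():
        # Atomically remove and return the member with the lowest score.
        result = conn.zpopmin(KEY)
        return result[0][0] if result else None

    push('http://example.com/a', priority=0)
    push('http://example.com/b', priority=10)
    print(pop())  # b'http://example.com/b' -- the higher-priority URL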

III. Downloader middleware

  These middlewares sit between the engine and the downloader.

   

a. What are downloader middlewares for in Scrapy? They give you a single place to pre-process every request object before it is downloaded (and to post-process every response).

b. User-Agent: the built-in UserAgentMiddleware runs by default and picks up the USER_AGENT you configure in settings:

    from scrapy import signals

    class UserAgentMiddleware(object):
        """This middleware allows spiders to override the user_agent"""

        def __init__(self, user_agent='Scrapy'):
            # e.g. USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
            self.user_agent = user_agent

        @classmethod
        def from_crawler(cls, crawler):
            o = cls(crawler.settings['USER_AGENT'])
            crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
            return o

        def spider_opened(self, spider):
            self.user_agent = getattr(spider, 'user_agent', self.user_agent)

        def process_request(self, request, spider):
            if self.user_agent:
                request.headers.setdefault(b'User-Agent', self.user_agent)

c. Redirects: handled by the built-in, default-enabled RedirectMiddleware:

    import logging
    from six.moves.urllib.parse import urljoin

    from w3lib.url import safe_url_string

    from scrapy.exceptions import IgnoreRequest, NotConfigured

    logger = logging.getLogger(__name__)

    class BaseRedirectMiddleware(object):

        enabled_setting = 'REDIRECT_ENABLED'

        def __init__(self, settings):
            if not settings.getbool(self.enabled_setting):
                raise NotConfigured
            self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
            self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler.settings)

        def _redirect(self, redirected, request, spider, reason):
            ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
            redirects = request.meta.get('redirect_times', 0) + 1
            if ttl and redirects <= self.max_redirect_times:
                redirected.meta['redirect_times'] = redirects
                redirected.meta['redirect_ttl'] = ttl - 1
                redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                    [request.url]
                redirected.dont_filter = request.dont_filter
                redirected.priority = request.priority + self.priority_adjust
                logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                             {'reason': reason, 'redirected': redirected, 'request': request},
                             extra={'spider': spider})
                return redirected
            else:
                logger.debug("Discarding %(request)s: max redirections reached",
                             {'request': request}, extra={'spider': spider})
                raise IgnoreRequest("max redirections reached")

        def _redirect_request_using_get(self, request, redirect_url):
            redirected = request.replace(url=redirect_url, method='GET', body='')
            redirected.headers.pop('Content-Type', None)
            redirected.headers.pop('Content-Length', None)
            return redirected

    class RedirectMiddleware(BaseRedirectMiddleware):
        """Handle redirection of requests based on response status
        and meta-refresh html tag."""

        def process_response(self, request, response, spider):
            if (request.meta.get('dont_redirect', False) or
                    response.status in getattr(spider, 'handle_httpstatus_list', []) or
                    response.status in request.meta.get('handle_httpstatus_list', []) or
                    request.meta.get('handle_httpstatus_all', False)):
                return response

            allowed_status = (301, 302, 303, 307, 308)
            if 'Location' not in response.headers or response.status not in allowed_status:
                return response

            location = safe_url_string(response.headers['location'])
            redirected_url = urljoin(request.url, location)

            if response.status in (301, 307, 308) or request.method == 'HEAD':
                redirected = request.replace(url=redirected_url)
                return self._redirect(redirected, request, spider, response.status)

            redirected = self._redirect_request_using_get(request, redirected_url)
            return self._redirect(redirected, request, spider, response.status)

d. Cookies: the built-in CookiesMiddleware also runs by default. To use it from your own spider logic, yield requests with meta={"cookiejar": 1} (the middleware reads the lowercase key "cookiejar"):
    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse, meta={"cookiejar": 1})
    import logging
    from collections import defaultdict

    import six

    from scrapy.exceptions import NotConfigured
    from scrapy.http import Response
    from scrapy.http.cookies import CookieJar
    from scrapy.utils.python import to_native_str

    logger = logging.getLogger(__name__)

    class CookiesMiddleware(object):
        """This middleware enables working with sites that need cookies"""

        def __init__(self, debug=False):
            self.jars = defaultdict(CookieJar)
            self.debug = debug

        @classmethod
        def from_crawler(cls, crawler):
            if not crawler.settings.getbool('COOKIES_ENABLED'):
                raise NotConfigured
            return cls(crawler.settings.getbool('COOKIES_DEBUG'))

        def process_request(self, request, spider):
            if request.meta.get('dont_merge_cookies', False):
                return

            # e.g. cookiejarkey = 1
            cookiejarkey = request.meta.get("cookiejar")
            jar = self.jars[cookiejarkey]  # CookieJar object -> an empty container at first
            cookies = self._get_request_cookies(jar, request)
            for cookie in cookies:
                jar.set_cookie_if_ok(cookie, request)

            # set Cookie header
            request.headers.pop('Cookie', None)
            jar.add_cookie_header(request)
            self._debug_cookie(request, spider)

        def process_response(self, request, response, spider):
            if request.meta.get('dont_merge_cookies', False):
                return response

            # extract cookies from Set-Cookie and drop invalid/expired cookies
            cookiejarkey = request.meta.get("cookiejar")
            jar = self.jars[cookiejarkey]
            jar.extract_cookies(response, request)
            self._debug_set_cookie(response, spider)

            return response

        def _debug_cookie(self, request, spider):
            if self.debug:
                cl = [to_native_str(c, errors='replace')
                      for c in request.headers.getlist('Cookie')]
                if cl:
                    cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
                    msg = "Sending cookies to: {}\n{}".format(request, cookies)
                    logger.debug(msg, extra={'spider': spider})

        def _debug_set_cookie(self, response, spider):
            if self.debug:
                cl = [to_native_str(c, errors='replace')
                      for c in response.headers.getlist('Set-Cookie')]
                if cl:
                    cookies = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
                    msg = "Received cookies from: {}\n{}".format(response, cookies)
                    logger.debug(msg, extra={'spider': spider})

        def _format_cookie(self, cookie):
            # build cookie string
            cookie_str = '%s=%s' % (cookie['name'], cookie['value'])

            if cookie.get('path', None):
                cookie_str += '; Path=%s' % cookie['path']
            if cookie.get('domain', None):
                cookie_str += '; Domain=%s' % cookie['domain']

            return cookie_str

        def _get_request_cookies(self, jar, request):
            if isinstance(request.cookies, dict):
                cookie_list = [{'name': k, 'value': v} for k, v in
                               six.iteritems(request.cookies)]
            else:
                cookie_list = request.cookies

            cookies = [self._format_cookie(x) for x in cookie_list]
            headers = {'Set-Cookie': cookies}
            response = Response(request.url, headers=headers)

            return jar.make_cookies(response, request)

The default downloader middlewares:

    DOWNLOADER_MIDDLEWARES_BASE = {
        # Engine side
        'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
        'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
        'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
        'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
        'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
        'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
        'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
        'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
        'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
        'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
        # Downloader side
    }
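
Because the middleware keeps one CookieJar per distinct "cookiejar" meta value, a single spider can maintain several independent login sessions. A minimal sketch (spider name and URLs are placeholders):

    import scrapy

    class SessionsSpider(scrapy.Spider):
        name = 'sessions'  # hypothetical spider

        def start_requests(self):
            # Each distinct "cookiejar" value gets its own CookieJar,
            # so the two sessions never share cookies.
            for session_id in (1, 2):
                yield scrapy.Request(url='http://example.com/login',
                                     callback=self.parse,
                                     meta={'cookiejar': session_id})

        def parse(self, response):
            # Pass the same key along so follow-up requests reuse the jar.
            yield scrapy.Request(url='http://example.com/profile',
                                 callback=self.parse_profile,
                                 meta={'cookiejar': response.meta['cookiejar']})

        def parse_profile(self, response):
            pass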

    Notes:

    process_request normally returns None (no return value), which lets the request continue through the chain:

        1. If it returns a Response, the remaining process_request methods are skipped and Scrapy jumps to the process_response chain (starting from the middleware closest to the downloader).

        2. If it returns a Request, the download is cancelled and the returned request goes straight back to the scheduler.

    process_response must return a value: a Response, a Request, or raise IgnoreRequest.
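
    These rules are easiest to see in a tiny custom downloader middleware. A minimal sketch (the class name and the /blocked rule are made up for illustration; enable it via DOWNLOADER_MIDDLEWARES in settings):

    from scrapy.http import HtmlResponse

    class DemoDownloaderMiddleware(object):  # hypothetical middleware

        def process_request(self, request, spider):
            # Returning None lets the request continue to the downloader.
            if not request.url.endswith('/blocked'):
                return None
            # Returning a Response short-circuits the download: Scrapy skips
            # the remaining process_request methods and runs the
            # process_response chain on this fabricated response instead.
            return HtmlResponse(url=request.url, body=b'blocked',
                                request=request)

        def process_response(self, request, response, spider):
            # Must return a Response or a Request (or raise IgnoreRequest).
            return response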

 

IV. Spider middleware

  These middlewares sit between the engine and the spider components, processing responses on the way in and spider output on the way out.

  Enabled by default are, among others, the middlewares handling request priority and crawl depth (DepthMiddleware).

Writing your own spider middleware:

    class XzxSpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.

        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            return s

        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
            # Should return None or raise an exception.
            return None

        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i

        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
            # Should return either None or an iterable of Response, dict
            # or Item objects.
            pass

        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn't have a response associated.
            # Must return only requests (not items).
            for r in start_requests:
                yield r

Enable it in the settings file:

    SPIDER_MIDDLEWARES = {
        'xzx.middlewares.XzxSpiderMiddleware': 543,
    }

Built-in spider-middleware knobs in settings:

    Depth: DEPTH_LIMIT = 8
    Priority: DEPTH_PRIORITY = 1  -> request priorities go 0, -1, -2, -3, ...
              DEPTH_PRIORITY = -1 -> request priorities go 0, 1, 2, 3, ...

The default spider middlewares:

    SPIDER_MIDDLEWARES_BASE = {
        # Engine side
        'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
        'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
        'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
        'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
        'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
        # Spider side
    }
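
Putting the depth settings together, here is a settings.py fragment that nudges the crawl toward breadth-first order; the values are illustrative, but all four options are standard Scrapy settings:

    # settings.py

    # Stop following links more than 8 hops from the start URLs.
    DEPTH_LIMIT = 8

    # A positive DEPTH_PRIORITY lowers the priority of deeper requests
    # (0, -1, -2, ...), approximating breadth-first order; a negative
    # value raises it, approximating depth-first order.
    DEPTH_PRIORITY = 1

    # Breadth-first crawling also needs FIFO queues instead of the
    # default LIFO ones:
    SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
    SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'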

  

Summary:

1. DupeFilter

- By default fingerprints live in an in-memory set
- Each URL is reduced to a unique fingerprint
- Why move the dedup store into Redis? (so multiple crawler processes can share one set of seen requests)
- Deduplication works together with dont_filter

2. Scheduler

- What do depth-first and breadth-first mean for a crawler?
- What data structures implement them?
- A stack (depth-first)
- A queue (breadth-first)
- A priority queue / sorted set

3. The open/closed principle:

  Closed to modification of the source code, open to configuration: you get the behavior you want by editing the settings file rather than the framework itself.

Reposted from: https://www.cnblogs.com/xuerh/p/9348849.html
