xlb.py

import scrapy
import re
from test_spider.items import TestSpiderItem


class XlbSpider(scrapy.Spider):
    """Crawl the first pages of the ZOL "lengxiaohua" joke listing.

    Each listing entry is turned into a ``TestSpiderItem`` and yielded to the
    engine, which hands it to the item pipelines enabled through
    ``ITEM_PIPELINES`` in settings.py.

    Notes on the Scrapy API used here:
        * ``response.xpath(...)`` returns a ``SelectorList``.
        * ``get()`` returns the first match as ``str`` (``None`` if nothing
          matched); ``getall()`` returns every match as a list of ``str``.
        * Prefer an item class declared in items.py over a plain dict so the
          set of fields is fixed in one place.
    """

    name = 'xlb'  # Must be unique: one name per spider.
    # Scrapy compares these entries with the request host, so they must be
    # bare domains. The previous value ended with "/" and never matched.
    allowed_domains = ['xiaohua.zol.com.cn']
    start_urls = ['http://xiaohua.zol.com.cn/lengxiaohua/1.html']
    # No longer needed by parse(); kept so existing references keep working.
    base_domain = 'http://xiaohua.zol.com.cn'
    # Last listing page to crawl (pages 1-9).
    max_page = 9

    # Extracts the page number from URLs such as ".../lengxiaohua/3.html".
    _page_number_re = re.compile(r'/(\d+)\.html')
    # Deletes tabs, carriage returns and newlines in a single pass.
    _whitespace_table = str.maketrans('', '', '\t\r\n')

    def parse(self, response):
        """Yield one item per joke, then a request for the next listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``TestSpiderItem`` objects for every entry on the page, followed
            by at most one ``scrapy.Request`` for the next page.
        """
        entries = response.xpath(
            '//ul[@class="article-list"]/li[@class="article-summary"]'
        )
        for entry in entries:
            author = entry.xpath(
                './/span[@class="article-title"]/a[@target="_blank"]/text()'
            ).get()
            fragments = entry.xpath(
                './/div[@class="summary-text"]//text()'
            ).getall()
            # strip() only trims the ends; the control characters also occur
            # between the text fragments, so remove them everywhere.
            text = ''.join(fragments).translate(self._whitespace_table)
            yield TestSpiderItem(author=author, content=text)

        next_href = response.xpath(
            '//div[@class="page"]/a[@class="page-next"]/@href'
        ).get()
        if not next_href:
            # Last page: there is no "next" link. Concatenating None with the
            # base URL used to raise TypeError here.
            return

        # urljoin copes with both relative and absolute hrefs.
        next_url = response.urljoin(next_href)

        # Compare the real page number instead of searching for the substring
        # "10" anywhere in the URL.
        match = self._page_number_re.search(next_url)
        if match is None:
            self.logger.debug('No page number in %s; stopping.', next_url)
            return
        if int(match.group(1)) > self.max_page:
            return

        # dont_filter=True is carried over unchanged from the original request.
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)

settings.py

...
# Download delay in seconds. Scrapy's RANDOMIZE_DOWNLOAD_DELAY setting
# (enabled by default) waits between 0.5x and 1.5x this value before each
# request, so a base of 2 gives a per-request delay of 1-3 seconds.
# random.randint(1, 3) was evaluated only once, when the settings loaded.
DOWNLOAD_DELAY = 2
...
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Adjacent literals are concatenated; the old backslash continuation had
    # left a stray "\" inside the header value.
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    ),
}
...
# ITEM_PIPELINES must be uncommented for pipelines.py to run.
ITEM_PIPELINES = {
    # 300 is the priority: the lower the value, the higher the priority.
    'test_spider.pipelines.TestSpiderPipeline': 300,
}
...

pipelines.py

import json
# Variant using the json module:
# class TestSpiderPipeline(object):
#     """
#     三个方法：
#         open_spider：打开爬虫
#         process_item：item 参数即引擎转交的内容，如 xlb.py 中的 duanzi
#         close_spider：完成爬虫后关闭
#     """
#     # 为了 pipelines.py 能够运行，需要将 settings.py 中的 ITEM_PIPELINES 取消注释
#     def __init__(self):
#         # 也可以在 open_spider(self, spider) 中打开
#         self.fp = open('duanzi.json','w',encoding='utf8')
#
#     def open_spider(self,spider):
#         # with open('duanzi.json','w',encoding='utf8')
#         print('spider is running!')
#
#     def process_item(self, item, spider):
#         # 解析数据那里返回的 item 不是字典格式，需要先转换成字典格式，再转换成 json
#         item_json = json.dumps(dict(item),ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self,spider):
#         self.fp.close()
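#         print('spider was closed!')
# Variant using scrapy.exporters.JsonItemExporter:
# JsonItemExporter collects every item into one list held in memory and writes it all out at finish, which uses more memory.
# JsonLinesItemExporter writes each item's dict on its own line, which saves memory and needs no start/finish calls.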
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter


class TestSpiderPipeline(object):
    """Write every scraped item to ``duanzi.json``, one JSON object per line.

    The output file is opened when the spider opens and closed when it
    closes, so the file handle lives exactly as long as the crawl.
    """

    def __init__(self):
        # Both are created in open_spider().
        self.fp = None
        self.exporter = None

    def open_spider(self, spider):
        """Open the output file and set up the exporter.

        Args:
            spider: The spider that was opened (unused).
        """
        # The exporter writes bytes, hence binary mode.
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf8'
        )
        # Not needed for line-by-line export, but keeps the pipeline correct
        # if the exporter is swapped for JsonItemExporter, which writes its
        # enclosing list in start_exporting()/finish_exporting().
        self.exporter.start_exporting()
        print('spider is running!')

    def process_item(self, item, spider):
        """Export one item and pass it on to any later pipeline.

        Args:
            item: The item yielded by the spider.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, unchanged.
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file.

        Args:
            spider: The spider that was closed (unused).
        """
        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even if finishing the export fails.
            self.fp.close()
        print('spider was closed!')

items.py

import scrapy


class TestSpiderItem(scrapy.Item):
    """Item model for one scraped joke.

    Each field declared here becomes a key in the exported JSON; the spider
    imports this class and fills the fields while parsing.
    """

    # Text of the entry's title link.
    author = scrapy.Field()
    # Joke text with tabs, carriage returns and newlines removed.
    content = scrapy.Field()

scrapy 爬 zol 笑话大全相关推荐

【python实现网络爬虫（7）】scrapy爬取笑话大全网站全过程（505问题的解决）
确定要爬取的网站及内容笑话大全网站中的冷笑话,如下要采集的字段,有标题,来源.正文内容创建scrapy项目步骤一.启动爬虫项目在某处(比如桌面)创建一个名称为"scrapy爬取笑话 ...
Scrapy爬取姓名大全，看看那个名字最受父母青睐
点击上方"AI搞事情"关注我们最近在做的项目需要用到名字的数据,可哪儿有这么多名字给我用呢?经一通搜索,不仅找到一个神奇的网站姓名大全,还有人开源了爬虫的代码.让我一番修改,得到 ...
scrapy mysql 词云_利用Scrapy爬取姓名大全作词云分析
scrapy介绍 Scrapy 是一套基于Twisted.纯python实现的异步爬虫框架,用户只需要定制开发几个模块就可以轻松的实现一个爬虫,用来抓取网页内容以及各种图片,相当的方便- 整体架构和组 ...
笑话大全爬虫实战笔记[xiaohua.zol.com.cn]
免责声明:本文所记录的技术手段及实现过程,仅作为爬虫技术学习使用,不对任何人完全或部分地依据本文的全部或部分内容从事的任何事情和因其任何作为或不作为造成的后果承担任何责任. 爬取需求:通过百度搜索关键 ...
【python实现网络爬虫（5）】第一个Scrapy爬虫实例项目（Scrapy原理及Scrapy爬取名言名句网站信息）
Scrapy介绍总共有五部分组成的:具体的流程可看图示引擎.调度器.下载器.蜘蛛和项目管道爬取流程针对于每个URL, Scheduler -> Downloader -> Spid ...
【爬虫】Scrapy爬取腾讯社招信息
目标任务:爬取腾讯社招信息,需要爬取的内容为:职位名称,职位的详情链接,职位类别,招聘人数,工作地点,发布时间. 一.预备基础 1.Scrapy简介 Scrapy是用纯Python实现一个为了爬取网站 ...
Python scrapy爬取京东，百度百科出现乱码，解决方案
Python scrapy爬取京东百度百科出现乱码解决方案十分想念顺店杂可... 抓取百度百科,出现乱码把页面源码下载下来之后,发现全是乱码,浏览器打开但是浏览器链接打开就没有乱码以下是浏 ...
四十三、Scrapy 爬取前程无忧51jobs
@Author:Runsen 之前爬了拉钩,爬了boss ,你认为我会放过51jobs 吗这是不可能的,今日用下scrapy 来爬 51jobs,前程无忧关于新建项目和spider 不说了,今日用 ...
四十一、完成scrapy爬取官方网站新房的数据
@Author:Runsen 文章目录前言分析网页新建项目加请求头搞定item 首页调试详情页调试保存json 前言在前几天,接到一个大学生的作业的爬虫单子,要求采用scrapy爬取链 ...

scrapy 爬 zol 笑话大全

文章目录

xlb.py

settings.py

pipelines.py

items.py

scrapy 爬 zol 笑话大全相关推荐

最新文章

热门文章