Python web scraping techniques
scrapy
Code for persistent storage
pipelines.py
# Store images
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class imgsPipeLine(ImagesPipeline):
    # Issue the image request based on the image URL
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_src'])

    # Specify the file path used to store the image
    def file_path(self, request, response=None, info=None, *, item=None):
        imgName = request.url.split('/')[-1]
        return imgName

    def item_completed(self, results, item, info):
        return item  # hand the item on to the next pipeline class to be executed
# Save to a txt file
class XiaohuaproPipeline:
    fp = None

    def open_spider(self, spider):
        print('Starting the spider...')
        self.fp = open('./xiaohua.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        img_name = item['img_name']
        img_href = item['img_href']
        self.fp.write(img_name)
        self.fp.write(' ')
        self.fp.write(img_href)
        self.fp.write('\n')
        return item

    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()
# Save to a MySQL database
import pymysql
class mysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='root', db='xiaohua')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            sql = 'insert into xiaohua values("%s","%s");' % (item['img_name'], item['img_href'])
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
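The string-formatted SQL above breaks on quotes and is open to injection; a minimal alternative sketch of the same process_item using pymysql's parameterized execute (assuming the xiaohua table's two columns are named img_name and img_href):

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # Let pymysql escape the values instead of formatting them into the SQL string
            sql = 'insert into xiaohua (img_name, img_href) values (%s, %s);'
            self.cursor.execute(sql, (item['img_name'], item['img_href']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item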
items.py
class XiaohuaproItem(scrapy.Item):
    # define the fields for your item here like:
    img_name = scrapy.Field()
    img_href = scrapy.Field()
settings.py (distributed crawler settings)
# -*- coding: utf-8 -*-
# Scrapy settings for fbsPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'fbsPro'

SPIDER_MODULES = ['fbsPro.spiders']
NEWSPIDER_MODULE = 'fbsPro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fbsPro (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'fbsPro.middlewares.FbsproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'fbsPro.middlewares.FbsproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'fbsPro.pipelines.FbsproPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Add a dedupe container class that uses a Redis set to store request fingerprints, making request deduplication persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler state persists: when the crawl ends, should the request queue and fingerprint set in Redis be kept?
# True keeps (persists) the data; False clears it
SCHEDULER_PERSIST = True

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}

# Change the host and port here to match your Redis server
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# REDIS_ENCODING = 'utf-8'
# REDIS_PARAMS = {'password': '123456'}
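For reference, a minimal sketch of the spider side that pairs with these scrapy-redis settings; the class name, redis_key, and link pattern are illustrative assumptions rather than the original project's code:

# fbsPro/spiders/fbs.py (hypothetical)
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    # Instead of start_urls, the first URL is pushed into this Redis list,
    # e.g. from redis-cli:  lpush fbs_queue https://www.example.com/
    redis_key = 'fbs_queue'
    rules = (
        Rule(LinkExtractor(allow=r'page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Parse the page and yield items; the RedisPipeline configured above
        # pushes them into Redis automatically
        pass

Any number of machines running this spider against the same Redis host then share one request queue and one dedupe set.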
spiders directory
project_name.py
Preparation
# 1. Define the fields to be returned in items.py:
import scrapy

class XiaohuaproItem(scrapy.Item):
    # define the fields for your item here like:
    img_name = scrapy.Field()
    img_href = scrapy.Field()
# 2. Configure settings.py:
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
USER_AGENT = 'xxx'  # fill in your own user-agent
# Directory where downloaded images are stored
IMAGES_STORE = './imgs'
# 3. In pipelines.py (for persistent storage):
Issue the image request based on the image URL:
def get_media_requests(self, item, info):
    yield scrapy.Request(item['img_src'])

# Specify the file path used to store the image
def file_path(self, request, response=None, info=None, *, item=None):
    imgName = request.url.split('/')[-1]
    return imgName

def item_completed(self, results, item, info):
    return item  # hand the item on to the next pipeline class to be executed
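The pipeline classes above only run after they are registered in settings.py; a minimal sketch, assuming the project module is named imgPro (the priority number is arbitrary, lower runs first):

ITEM_PIPELINES = {
    'imgPro.pipelines.imgsPipeLine': 300,
}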
img.py under the spiders directory
# Fetch images
import scrapy
from imgPro.items import ImgproItem
class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.woyaogexing.com/tupian/']
    page_num = 2
    url = 'https://www.woyaogexing.com/tupian/index_%d.html'
    img_list = []

    def parse(self, response, *args, **kwargs):
        div_list = response.xpath('//*[@id="main"]/div[2]/div[1]/div[2]/div')
        for div in div_list:
            img_name = div.xpath('./a[2]/text()').extract_first()
            img_src = 'https:' + div.xpath('./a[1]/img/@src').extract_first()
            self.img_list.append(img_name)
            self.img_list.append(img_src)
            item = ImgproItem()
            item['img_name'] = img_name
            item['img_src'] = img_src
            yield item
        if self.page_num <= 11:
            new_url = format(self.url % self.page_num)
            self.page_num += 1
            # Whole-site crawling: similar to recursion, parse() is invoked again via the callback
            yield scrapy.Request(url=new_url, callback=self.parse)
Setting proxy IPs
middlewares.py
# Free proxy IPs: http://www.kxdaili.com/dailiip.html
import random
class SunproDownloaderMiddleware:
    PROXY_HTTP = [
        '45.79.90.143:8002',
        '117.74.65.215:9080',
        '222.242.106.7:80',
        '222.74.73.202:42055',
    ]
    PROXY_HTTPS = [
        '123.182.58.46:8089',
        '112.87.140.163:9443',
        '111.225.153.141:8089',
    ]

    def process_exception(self, request, exception, spider):
        # Switch to a random proxy that matches the request's scheme
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_HTTP)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_HTTPS)
        return request  # re-schedule the request so it is retried with the new proxy
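The middleware above only runs once it is enabled in settings.py; a minimal sketch, assuming the project module is named sunPro:

DOWNLOADER_MIDDLEWARES = {
    'sunPro.middlewares.SunproDownloaderMiddleware': 543,
}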
CrawlSpider
Basic tools
LinkExtractor(allow=r'regex')  # Link extractor: extracts links according to the rule given by allow="regex":
e.g. link = LinkExtractor(allow=r'page=\d+')
rules = (
    # Rule parser: applies the parsing rule given by callback to the links the link extractor found
    Rule(link, callback='parse_item', follow=False),
    # follow=True: keep applying the link extractor to the pages reached from the extracted links,
    # i.e. whole-site crawling
    Rule(link_detail, callback='parse_detail'),
)
def parse_item(self, response):
pass
response here is the response for a link captured by the link extractor (link).
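Putting the pieces together, a minimal CrawlSpider sketch; the spider name, start URL, and the detail-page pattern are illustrative assumptions:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class SunSpider(CrawlSpider):
    name = 'sun'
    start_urls = ['https://www.example.com/list?page=1']

    link = LinkExtractor(allow=r'page=\d+')               # list pages
    link_detail = LinkExtractor(allow=r'detail\?id=\d+')  # detail pages (assumed pattern)

    rules = (
        Rule(link, callback='parse_item', follow=False),
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        pass  # parse a list page extracted by link

    def parse_detail(self, response):
        pass  # parse a detail page extracted by link_detail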
Implementing persistent storage
Note: request meta passing (handing data to another callback via the request) cannot be used inside the parse_item callback!
So two items are needed, i.e. two classes are defined in items.py, for example:
class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    new_num = scrapy.Field()

class DetailItem(scrapy.Item):
    new_id = scrapy.Field()
    content = scrapy.Field()
In the pipeline (pipelines.py)
# How do we tell which type an item is?
if item.__class__.__name__ == 'DetailItem':
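A minimal sketch of a process_item that routes the two item types by class name (storage details left as placeholders):

class SunproPipeline:
    def process_item(self, item, spider):
        # Decide which fields the item carries before storing it
        if item.__class__.__name__ == 'DetailItem':
            new_id = item['new_id']
            content = item['content']
            # ...persist the detail data here...
        else:
            title = item['title']
            new_num = item['new_num']
            # ...persist the list data here...
        return item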
Selenium automation tool
Basic usage
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Instantiate a browser object (pass in the path to the browser driver)
s = Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get(url)  # open the page in the visible browser
Other automation operations
# Locate an element (by can be 'id', 'xpath', 'css selector', ...)
label = bro.find_element(by='xpath', value='XPath expression')
# Type text into the located input box
label.send_keys('value')
# Execute a JavaScript snippet
bro.execute_script('JS statement')
# Click the located element
label.click()
# Go back
bro.back()
# Go forward
bro.forward()
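A short usage sketch stringing these calls together; the Baidu URL and the kw/su element ids are assumptions used only for illustration:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

s = Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get('https://www.baidu.com/')

# Type a query into the search box and click the search button
bro.find_element(By.ID, 'kw').send_keys('python')
bro.find_element(By.ID, 'su').click()

# Scroll to the bottom of the page with a JS snippet
bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')

bro.back()     # go back to the previous page
bro.forward()  # then forward again
bro.quit()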
Action chains
from selenium.webdriver.common.action_chains import ActionChains

# When an iframe tag is in the way:
bro.switch_to.frame('iframeResult')  # switch the scope in which elements are located
# Instantiate an action chain
action = ActionChains(bro)
# Click and hold the target element
action.click_and_hold(div)
# .perform(): execute the action chain immediately; x: horizontal offset, y: vertical offset
action.move_by_offset(17, 0).perform()  # drag horizontally
# Release the action chain
action.release()
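A minimal end-to-end sketch of the drag operation, assuming the common jQuery UI droppable demo page and its 'draggable' element id (swap in your own page and element):

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

s = Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')  # assumed demo URL

# The draggable box sits inside an iframe, so switch scope first
bro.switch_to.frame('iframeResult')
div = bro.find_element(By.ID, 'draggable')

# Hold the box, drag it to the right, then release; perform() runs the whole chain
action = ActionChains(bro)
action.click_and_hold(div).move_by_offset(170, 0).release().perform()
bro.quit()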
Headless browser + evading detection (just copy and paste)
# You can copy and paste this as-is; it runs the browser without a visible window
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service

# Use a single ChromeOptions object for both the headless flags and the
# anti-detection switch (newer Selenium only honors one options argument)
option = ChromeOptions()
option.add_argument('--headless')
option.add_argument('--disable-gpu')
# Reduce the chance of selenium being detected as automation
option.add_experimental_option('excludeSwitches', ['enable-automation'])

s = Service(executable_path='./chromedriver.exe')
bro = webdriver.Chrome(service=s, options=option)
Taking screenshots
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from PIL import Image

# First navigate to the target page (the driver is created the same way as above)
driver = webdriver.Chrome(service=Service('./chromedriver.exe'))
driver.get('https://www.12306.cn/index/')

# save_screenshot saves a screenshot of the current page
driver.save_screenshot('./aa.png')
# Locate the captcha image
code_img_ele = driver.find_element(by='xpath', value='//*[@id="toolbar_Div"]/div[3]/div[2]')
# Coordinates of the top-left and bottom-right corners of the captcha image
location = code_img_ele.location  # top-left coordinates of the captcha image
size = code_img_ele.size          # width and height of the captcha element
# Top-left and bottom-right corner coordinates
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))
# The captcha image region is now determined
i = Image.open('./aa.png')
code_img_name = './code.png'
# crop() cuts the screenshot down to the specified region
frame = i.crop(rangle)
frame.save(code_img_name)
Uploading a file
That is, locate the input tag with XPath, then upload a local image with the .send_keys() method (note that an absolute path is required).
Here, Path(relative_image_path).absolute() returns the absolute path of that image.
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# File upload
s = Service('../chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get("https://layuion.com/demo/upload.html")
img = bro.find_element(By.XPATH,'//*[@id="LAY_preview"]/div[1]/input')
img.send_keys(str(Path("aa.png").absolute()))
bro.quit()