Python web scraping techniques
scrapy
Code for persistent storage
pipelines.py
# Store images
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class imgsPipeLine(ImagesPipeline):
    # Issue the image request based on the image URL
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_src'])

    # Specify the file path used to store the image
    def file_path(self, request, response=None, info=None, *, item=None):
        imgName = request.url.split('/')[-1]
        return imgName

    def item_completed(self, results, item, info):
        return item  # hand the item on to the next pipeline class to be executed
# Save to a txt file
class XiaohuaproPipeline:
    fp = None

    def open_spider(self, spider):
        print('Starting the spider...')
        self.fp = open('./xiaohua.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        img_name = item['img_name']
        img_href = item['img_href']
        self.fp.write(img_name)
        self.fp.write(' ')
        self.fp.write(img_href)
        self.fp.write('\n')
        return item

    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()
# Save to a MySQL database
import pymysql
class mysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='root', db='xiaohua')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            sql = 'insert into xiaohua values("%s","%s");' % (item['img_name'], item['img_href'])
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
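The string-formatted SQL above breaks on quotes and is open to injection; a minimal alternative sketch of the same process_item using pymysql's parameterized execute (assuming the xiaohua table's two columns are named img_name and img_href):

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # Let pymysql escape the values instead of formatting them into the SQL string
            sql = 'insert into xiaohua (img_name, img_href) values (%s, %s);'
            self.cursor.execute(sql, (item['img_name'], item['img_href']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item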
items.py
class XiaohuaproItem(scrapy.Item):
    # define the fields for your item here like:
    img_name = scrapy.Field()
    img_href = scrapy.Field()
settings.py (distributed crawler settings)
# -*- coding: utf-8 -*-
# Scrapy settings for fbsPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'fbsPro'

SPIDER_MODULES = ['fbsPro.spiders']
NEWSPIDER_MODULE = 'fbsPro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fbsPro (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'fbsPro.middlewares.FbsproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'fbsPro.middlewares.FbsproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'fbsPro.pipelines.FbsproPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Add a dedupe container class that uses a Redis set to store request fingerprints, making request deduplication persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler state persists: when the crawl ends, should the request queue and fingerprint set in Redis be kept?
# True keeps (persists) the data; False clears it
SCHEDULER_PERSIST = True

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}

# Change the host and port here to match your Redis server
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# REDIS_ENCODING = 'utf-8'
# REDIS_PARAMS = {'password': '123456'}
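For reference, a minimal sketch of the spider side that pairs with these scrapy-redis settings; the class name, redis_key, and link pattern are illustrative assumptions rather than the original project's code:

# fbsPro/spiders/fbs.py (hypothetical)
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    # Instead of start_urls, the first URL is pushed into this Redis list,
    # e.g. from redis-cli:  lpush fbs_queue https://www.example.com/
    redis_key = 'fbs_queue'
    rules = (
        Rule(LinkExtractor(allow=r'page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Parse the page and yield items; the RedisPipeline configured above
        # pushes them into Redis automatically
        pass

Any number of machines running this spider against the same Redis host then share one request queue and one dedupe set.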
spiders directory
project_name.py
Preparation
# 1. Define the fields to be returned in items.py:
import scrapy

class XiaohuaproItem(scrapy.Item):
    # define the fields for your item here like:
    img_name = scrapy.Field()
    img_href = scrapy.Field()
# 2. Configure settings.py:
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
USER_AGENT = 'xxx'  # fill in your own user-agent
# Directory where downloaded images are stored
IMAGES_STORE = './imgs'
# 3. In pipelines.py (for persistent storage):
Issue the image request based on the image URL:
def get_media_requests(self, item, info):
    yield scrapy.Request(item['img_src'])

# Specify the file path used to store the image
def file_path(self, request, response=None, info=None, *, item=None):
    imgName = request.url.split('/')[-1]
    return imgName

def item_completed(self, results, item, info):
    return item  # hand the item on to the next pipeline class to be executed
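The pipeline classes above only run after they are registered in settings.py; a minimal sketch, assuming the project module is named imgPro (the priority number is arbitrary, lower runs first):

ITEM_PIPELINES = {
    'imgPro.pipelines.imgsPipeLine': 300,
}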
img.py under the spiders directory
# Fetch images
import scrapy
from imgPro.items import ImgproItem
class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.woyaogexing.com/tupian/']
    page_num = 2
    url = 'https://www.woyaogexing.com/tupian/index_%d.html'
    img_list = []

    def parse(self, response, *args, **kwargs):
        div_list = response.xpath('//*[@id="main"]/div[2]/div[1]/div[2]/div')
        for div in div_list:
            img_name = div.xpath('./a[2]/text()').extract_first()
            img_src = 'https:' + div.xpath('./a[1]/img/@src').extract_first()
            self.img_list.append(img_name)
            self.img_list.append(img_src)
            item = ImgproItem()
            item['img_name'] = img_name
            item['img_src'] = img_src
            yield item
        if self.page_num <= 11:
            new_url = format(self.url % self.page_num)
            self.page_num += 1
            # Whole-site crawling: similar to recursion, parse() is invoked again via the callback
            yield scrapy.Request(url=new_url, callback=self.parse)
Setting proxy IPs
middlewares.py
# Free proxy IPs: http://www.kxdaili.com/dailiip.html
import random
class SunproDownloaderMiddleware:
    PROXY_HTTP = [
        '45.79.90.143:8002',
        '117.74.65.215:9080',
        '222.242.106.7:80',
        '222.74.73.202:42055',
    ]
    PROXY_HTTPS = [
        '123.182.58.46:8089',
        '112.87.140.163:9443',
        '111.225.153.141:8089',
    ]

    def process_exception(self, request, exception, spider):
        # Switch to a random proxy that matches the request's scheme
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_HTTP)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_HTTPS)
        return request  # re-schedule the request so it is retried with the new proxy
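The middleware above only runs once it is enabled in settings.py; a minimal sketch, assuming the project module is named sunPro:

DOWNLOADER_MIDDLEWARES = {
    'sunPro.middlewares.SunproDownloaderMiddleware': 543,
}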
CrawlSpider
Basic tools
LinkExtractor(allow=r'regex')  # Link extractor: extracts links according to the rule given by allow="regex":
e.g. link = LinkExtractor(allow=r'page=\d+')
rules = (
    # Rule parser: applies the parsing rule given by callback to the links the link extractor found
    Rule(link, callback='parse_item', follow=False),
    # follow=True: keep applying the link extractor to the pages reached from the extracted links,
    # i.e. whole-site crawling
    Rule(link_detail, callback='parse_detail'),
)
def parse_item(self, response):
pass
response here is the response for a link captured by the link extractor (link).
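Putting the pieces together, a minimal CrawlSpider sketch; the spider name, start URL, and the detail-page pattern are illustrative assumptions:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class SunSpider(CrawlSpider):
    name = 'sun'
    start_urls = ['https://www.example.com/list?page=1']

    link = LinkExtractor(allow=r'page=\d+')               # list pages
    link_detail = LinkExtractor(allow=r'detail\?id=\d+')  # detail pages (assumed pattern)

    rules = (
        Rule(link, callback='parse_item', follow=False),
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        pass  # parse a list page extracted by link

    def parse_detail(self, response):
        pass  # parse a detail page extracted by link_detail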
Implementing persistent storage
Note: request meta passing (handing data to another callback via the request) cannot be used inside the parse_item callback!
So two items are needed, i.e. two classes are defined in items.py, for example:
class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    new_num = scrapy.Field()

class DetailItem(scrapy.Item):
    new_id = scrapy.Field()
    content = scrapy.Field()
In the pipeline (pipelines.py)
# How do we tell which type an item is?
if item.__class__.__name__ == 'DetailItem':
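A minimal sketch of a process_item that routes the two item types by class name (storage details left as placeholders):

class SunproPipeline:
    def process_item(self, item, spider):
        # Decide which fields the item carries before storing it
        if item.__class__.__name__ == 'DetailItem':
            new_id = item['new_id']
            content = item['content']
            # ...persist the detail data here...
        else:
            title = item['title']
            new_num = item['new_num']
            # ...persist the list data here...
        return item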
Selenium automation tool
Basic usage
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Instantiate a browser object (pass in the path to the browser driver)
s = Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get(url)  # open the page in the visible browser
Other automation operations
# Locate an element (by can be 'id', 'xpath', 'css selector', ...)
label = bro.find_element(by='xpath', value='XPath expression')
# Type text into the located input box
label.send_keys('value')
# Execute a JavaScript snippet
bro.execute_script('JS statement')
# Click the located element
label.click()
# Go back
bro.back()
# Go forward
bro.forward()
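A short usage sketch stringing these calls together; the Baidu URL and the kw/su element ids are assumptions used only for illustration:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

s = Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get('https://www.baidu.com/')

# Type a query into the search box and click the search button
bro.find_element(By.ID, 'kw').send_keys('python')
bro.find_element(By.ID, 'su').click()

# Scroll to the bottom of the page with a JS snippet
bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')

bro.back()     # go back to the previous page
bro.forward()  # then forward again
bro.quit()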
Action chains
from selenium.webdriver.common.action_chains import ActionChains

# When an iframe tag is in the way:
bro.switch_to.frame('iframeResult')  # switch the scope in which elements are located
# Instantiate an action chain
action = ActionChains(bro)
# Click and hold the target element
action.click_and_hold(div)
# .perform(): execute the action chain immediately; x: horizontal offset, y: vertical offset
action.move_by_offset(17, 0).perform()  # drag horizontally
# Release the action chain
action.release()
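A minimal end-to-end sketch of the drag operation, assuming the common jQuery UI droppable demo page and its 'draggable' element id (swap in your own page and element):

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

s = Service('./chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')  # assumed demo URL

# The draggable box sits inside an iframe, so switch scope first
bro.switch_to.frame('iframeResult')
div = bro.find_element(By.ID, 'draggable')

# Hold the box, drag it to the right, then release; perform() runs the whole chain
action = ActionChains(bro)
action.click_and_hold(div).move_by_offset(170, 0).release().perform()
bro.quit()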
Headless browser + evading detection (just copy and paste)
# You can copy and paste this as-is; it runs the browser without a visible window
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service

# Use a single ChromeOptions object for both the headless flags and the
# anti-detection switch (newer Selenium only honors one options argument)
option = ChromeOptions()
option.add_argument('--headless')
option.add_argument('--disable-gpu')
# Reduce the chance of selenium being detected as automation
option.add_experimental_option('excludeSwitches', ['enable-automation'])

s = Service(executable_path='./chromedriver.exe')
bro = webdriver.Chrome(service=s, options=option)
Taking screenshots
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from PIL import Image

# First navigate to the target page (the driver is created the same way as above)
driver = webdriver.Chrome(service=Service('./chromedriver.exe'))
driver.get('https://www.12306.cn/index/')

# save_screenshot saves a screenshot of the current page
driver.save_screenshot('./aa.png')
# Locate the captcha image
code_img_ele = driver.find_element(by='xpath', value='//*[@id="toolbar_Div"]/div[3]/div[2]')
# Coordinates of the top-left and bottom-right corners of the captcha image
location = code_img_ele.location  # top-left coordinates of the captcha image
size = code_img_ele.size          # width and height of the captcha element
# Top-left and bottom-right corner coordinates
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))
# The captcha image region is now determined
i = Image.open('./aa.png')
code_img_name = './code.png'
# crop() cuts the screenshot down to the specified region
frame = i.crop(rangle)
frame.save(code_img_name)
Uploading a file
That is, locate the input tag with XPath, then upload a local image with the .send_keys() method (note that an absolute path is required).
Here, Path(relative_image_path).absolute() returns the absolute path of that image.
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# File upload
s = Service('../chromedriver.exe')
bro = webdriver.Chrome(service=s)
bro.get("https://layuion.com/demo/upload.html")
img = bro.find_element(By.XPATH,'//*[@id="LAY_preview"]/div[1]/input')
img.send_keys(str(Path("aa.png").absolute()))
bro.quit()