实在不行来瓶敌敌畏吧!Scrapy框架爬取某农药肥料网站+异步Mysql数据库存储
#先来通过查看网页请求看看爬虫基本的逻辑思路
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import scrapy
import json
from nongcha.items import NongchaItem  # item container for scraped products


class NcSpider(scrapy.Spider):
    """Spider for cha.191.cn's product search API.

    The site exposes a POST form-search endpoint that returns JSON pages of
    up to 20 products. Pages are requested one at a time and pagination
    stops when an empty result list comes back.
    """

    name = 'nc'
    allowed_domains = ['cha.191.cn']
    # Random UA picked once at class-definition time; all requests reuse it.
    ua = UserAgent()
    user_agent = f'{ua.random}'

    # Search endpoint shared by the initial and the follow-up page requests.
    search_url = 'http://cha.191.cn/home/search/dosearch'

    def __init__(self):
        super(NcSpider, self).__init__()
        self.page = 1  # current result page (1-based)

    def _search_request(self, page):
        """Build the POST search request for the given result page.

        NOTE(review): the PHPSESSID cookie is hard-coded and will expire;
        confirm whether the endpoint answers without it.
        """
        return scrapy.FormRequest(
            url=self.search_url,
            formdata={
                'p': f'{page}',
                'keyword': '敌敌畏',
            },
            headers={
                'Cookie': 'PHPSESSID=komdv7aiqci9eq1hkf0u4npc63',
                'Referer': 'http://cha.191.cn/home/search',
            },
            callback=self.parse,
        )

    def start_requests(self):
        """Override the default start to fire the first search POST."""
        yield self._search_request(self.page)

    def parse(self, response):
        """Parse one JSON result page, yield items, then request the next page."""
        data = json.loads(response.text)
        if data['status'] != '0':  # '0' (a string) signals success
            return
        infos = data['data']['list']  # list of up to 20 product dicts
        if not infos:
            # An empty page marks the end of the result set.
            self.logger.info('全部数据爬取完成!')
            return
        for info in infos:
            item = NongchaItem()
            item['nc_id'] = info.get('id')          # product id
            item['nc_name'] = info.get('name')      # product name
            item['nc_company'] = info.get('company_name')  # manufacturer
            item['nc_registry'] = info.get('registry')     # registration cert no.
            # Detail-page image URL is derived from the product id; an
            # f-string also tolerates non-str ids (the original '+' concat
            # would raise TypeError on an int id).
            item['nc_img'] = [
                f"http://cha.191.cn/home/product/detailToImage/productId/{item['nc_id']}"
            ]
            yield item
        # Non-empty page: advance and schedule the next one.
        self.page += 1
        yield self._search_request(self.page)


if __name__ == '__main__':
    from scrapy.cmdline import execute
    execute(['scrapy', 'crawl', 'nc'])
- ↓↓↓pipeline的代码↓↓↓最复杂的部分:分为下载图片、Excel表格存储和异步数据库存储
# -*- coding: utf-8 -*-# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline # 引入scrapy自带的图片下载pipeline
from scrapy import Request
import xlwt
from twisted.enterprise import adbapi
import pymysql
from nongcha.items import NongchaItem  # items persist themselves via .save()


class NongchaPipeline(object):
    """Default template pipeline; passes items through unchanged."""

    def process_item(self, item, spider):
        return item


class CustomImagePipeline(ImagesPipeline):
    """Downloads each item's product image and records the storage path."""

    def get_media_requests(self, item, info):
        """Create one image request per URL in the configured URL field.

        The item travels along in request.meta so file_path() can read
        its fields and write the final path back into it.
        """
        return [
            Request(url, meta={'item': item})
            for url in item.get(self.images_urls_field, [])
        ]

    def file_path(self, request, response=None, info=None):
        """Return the relative path to store the image under.

        Side effect: records the full 'images/...' location in
        item['img_path'] so the downstream pipelines can persist it.
        """
        item = request.meta.get('item')
        if item:
            nc_company = item['nc_company']  # used as the folder name
            nc_id = item['nc_id']            # used as the file name
            path = f'{nc_company}/{nc_id}.jpg'
            item['img_path'] = f'images/{path}'
            return path


class ExcelPipeline(object):
    """Accumulates items into an .xls workbook, saved once at shutdown."""

    # Column order for both the header row and data rows.
    HEADERS = ('nc_id', 'nc_name', 'nc_company', 'nc_registry', 'nc_img', 'img_path')

    def open_spider(self, spider):
        """Create the workbook and write the header row."""
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('ncc_data')
        for col, title in enumerate(self.HEADERS):
            self.sheet.write(0, col, title)
        self.count = 0  # index of the last written row

    def process_item(self, item, spider):
        """Append one item as a row."""
        self.count += 1
        self.sheet.write(self.count, 0, item['nc_id'])
        self.sheet.write(self.count, 1, item['nc_name'])
        self.sheet.write(self.count, 2, item['nc_company'])
        self.sheet.write(self.count, 3, item['nc_registry'])
        self.sheet.write(self.count, 4, item['nc_img'][0])
        # img_path only exists once CustomImagePipeline actually stored the
        # picture; fall back to '' instead of raising KeyError.
        self.sheet.write(self.count, 5, item.get('img_path', ''))
        return item

    def close_spider(self, spider):
        # Save once on shutdown instead of rewriting the whole file per item.
        self.workbook.save('ncc.data.xls')


class AsyncWriteMysql(object):
    """Writes items to MySQL through a Twisted adbapi pool (non-blocking)."""

    def __init__(self):
        """Build the connection pool from static connection parameters."""
        params = dict(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            db='ncc',
            charset='utf8',
            use_unicode=True,
            cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
        )
        # 'pymysql' names the DB-API module adbapi drives; params pass through.
        self.db_pool = adbapi.ConnectionPool('pymysql', **params)

    def process_item(self, item, spider):
        """Queue an asynchronous insert; attach an error callback."""
        deferred = self.db_pool.runInteraction(self.insert_item, item)
        deferred.addErrback(self.insert_error, item)
        return item

    def insert_item(self, cursor, item):
        """Run inside a pool transaction; the item knows how to save itself."""
        item.save(cursor)

    def insert_error(self, failure, item):
        """Error callback: report which record failed and why."""
        print(item['nc_id'])
        print(failure)

    def close_spider(self, spider):
        # Release the pooled connections on shutdown.
        self.db_pool.close()
- **↓↓↓item的代码↓↓↓主要是定义数据字段以及将数据保存进数据库**
# -*- coding: utf-8 -*-# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class NongchaItem(scrapy.Item):
    """Container for one pesticide product record."""

    nc_id = scrapy.Field()        # product id
    nc_name = scrapy.Field()      # product name
    nc_company = scrapy.Field()   # manufacturer name
    nc_registry = scrapy.Field()  # registration certificate number
    nc_img = scrapy.Field()       # list holding the detail-image URL
    img_path = scrapy.Field()     # local path the image was saved under

    def save(self, cursor):
        """Insert this record into the ncc_data table via the given cursor."""
        sql = (
            "INSERT INTO ncc_data(nc_id, nc_name, nc_company, nc_registry, "
            "nc_img, img_path)VALUES (%s, %s, %s, %s, %s, %s)"
        )
        values = (
            self['nc_id'],
            self['nc_name'],
            self['nc_company'],
            self['nc_registry'],
            self['nc_img'][0],
            self['img_path'],
        )
        cursor.execute(sql, values)
- **↓↓↓settings的代码↓↓↓大部分保持注释状态**
# -*- coding: utf-8 -*-
# Scrapy settings for the nongcha project.
#
# Only commonly-used settings are listed here; for everything else see:
# https://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'nongcha'

SPIDER_MODULES = ['nongcha.spiders']
NEWSPIDER_MODULE = 'nongcha.spiders'

# robots.txt of the target site is deliberately ignored.
ROBOTSTXT_OBEY = False

# Maximum concurrent requests performed by Scrapy (this is also the default).
CONCURRENT_REQUESTS = 16

# Headers merged into every outgoing request.
DEFAULT_REQUEST_HEADERS = {
    'Host': 'cha.191.cn',
}

# Pipelines run in ascending priority order:
# image download (200) -> async MySQL write (250) -> Excel export (300).
ITEM_PIPELINES = {
    'nongcha.pipelines.CustomImagePipeline': 200,
    'nongcha.pipelines.ExcelPipeline': 300,
    'nongcha.pipelines.AsyncWriteMysql': 250,
}

# Item field that holds the list of image URLs to download.
IMAGES_URLS_FIELD = 'nc_img'
# Root directory the downloaded images are stored under.
IMAGES_STORE = 'images'
↓↓↓数据存储界面↓↓↓
#敌敌畏图片数据 基本包含所有的信息了
#EXCEL表单数据
#Mysql数据库
end()
实在不行来瓶敌敌畏吧!Scrapy框架爬取某农药肥料网站+异步Mysql数据库存储相关推荐
- scrapy实现爬取全书网小说到Mysql数据库(附代码)
前言 本篇文章实现python的scrapy框架爬取全书网小说,scrapy框架的安装我在这里就不在赘述了,建议window用户使用anaconda安装,这里比较省心一些.运行环境python3(实际 ...
- scrapy框架爬取糗妹妹网站妹子图分类的所有图片
爬取所有图片,一个页面的图片建一个文件夹.难点,图片中有不少.gif图片,需要重写下载规则, 创建scrapy项目 scrapy startproject qiumeimei 创建爬虫应用 cd qi ...
- 利用Python Scrapy框架爬取“房天下”网站房源数据
文章目录 分析网页 获取新房.二手房.租房数据 新房数据 租房数据: 二手房数据 反反爬虫 将数据保存至MongoDB数据库 JSON格式 CSV格式 MongoDB数据库 分析网页 "房天 ...
- scrapy框架爬取校花网站的升级版
**spider目录下的文件:定义DemoSpider类** # -*- coding: utf-8 -*- from scrapy.spiders import CrawlSpider,Rule f ...
- 在ubuntu 16.04里使用python—scrapy将爬取到的数据存到mysql数据库中的一些随笔
一.将爬取的数据保存到mysql数据库的代码(已经能将爬取的数据保存到json文件) (1)编辑Pipeline.py文件 (2)编辑settings.py文件 二.将数据保存至mysql数据库出现的 ...
- 03_使用scrapy框架爬取豆瓣电影TOP250
前言: 本次项目是使用scrapy框架,爬取豆瓣电影TOP250的相关信息.其中涉及到代理IP,随机UA代理,最后将得到的数据保存到mongoDB中.本次爬取的内容实则不难.主要是熟悉scrapy相关 ...
- scrapy获取a标签的连接_python爬虫——基于scrapy框架爬取网易新闻内容
python爬虫--基于scrapy框架爬取网易新闻内容 1.需求[前期准备] 2.分析及代码实现(1)获取五大板块详情页url(2)解析每个板块(3)解析每个模块里的标题中详情页信息 点击此处,获取 ...
- python中scrapy可以爬取多少数据_python中scrapy框架爬取携程景点数据
------------------------------- [版权申明:本文系作者原创,转载请注明出处] 文章出处:https://blog.csdn.net/sdksdk0/article/de ...
- 利用python的scrapy框架爬取google搜索结果页面内容
scrapy google search 实验目的 爬虫实习的项目1,利用python的scrapy框架爬取google搜索结果页面内容. https://github.com/1012598167/ ...
- 基于Scrapy框架爬取豆瓣《复联4》影评,并生成词云
基于Scrapy框架爬取豆瓣<复联4>影评,并生成词云 1. 介绍及开发环境 2. 爬虫实现 2.1 新建项目 2.2 构造请求 2.3 提取信息 2.4 数据存储 2.4 运行结果 3. ...
最新文章
- altium designer2020中文版
- java三层架构是不是策略模式,把「策略模式」应用到实际项目中
- jzoj3084-超级变变变【数学】
- 对Group_concaT函数利用剖析 (转)
- Doris之资源管理
- c# timer 销毁_C# task和timer实现定时操作
- Android开发之软键盘遮盖EditText
- 每个Linux 用户都应该知道的Linux技巧
- 数学建模 CUMCM2018年A题真题(本文由西南民族大学白白不加糖、薄荷CC糖原创)
- HTC T329手机如何删除系统自带的软件?HTC一键解锁、获取ROOT权限、豌豆荚删除系统软件...
- MEME:motif分析的综合性工具
- IDEA自带的逆向工程功能
- gopher攻击mysql_CTFweb类型(二十七)gopher对mysql的利用及例题讲解
- 读 Paxos 到 ZooKeeper ¥ 50大洋
- 苹果电脑系统如何设置成中文
- 使用GUID分区表(GPT)的笔记本硬盘做移动硬盘,windowsXP系统不识别的问题
- 控制科学与工程(自动化)保研经验【2】——南开、同济篇
- Marlin固件之—:基础入门与测试
- 根据大脑频率调整状态
- java double转float_如何将double转换成float类型