实在不行来瓶敌敌畏吧!Scrapy框架爬取某农药肥料网站+异步Mysql数据库存储
#先来通过查看网页请求看看爬虫基本的逻辑思路
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import scrapy
import json
from nongcha.items import NongchaItem  # item container for scraped products


class NcSpider(scrapy.Spider):
    """Spider for cha.191.cn's product search API.

    The site exposes a POST form-search endpoint that returns JSON pages of
    up to 20 products. Pages are requested one at a time and pagination
    stops when an empty result list comes back.
    """

    name = 'nc'
    allowed_domains = ['cha.191.cn']
    # Random UA picked once at class-definition time; all requests reuse it.
    ua = UserAgent()
    user_agent = f'{ua.random}'

    # Search endpoint shared by the initial and the follow-up page requests.
    search_url = 'http://cha.191.cn/home/search/dosearch'

    def __init__(self):
        super(NcSpider, self).__init__()
        self.page = 1  # current result page (1-based)

    def _search_request(self, page):
        """Build the POST search request for the given result page.

        NOTE(review): the PHPSESSID cookie is hard-coded and will expire;
        confirm whether the endpoint answers without it.
        """
        return scrapy.FormRequest(
            url=self.search_url,
            formdata={
                'p': f'{page}',
                'keyword': '敌敌畏',
            },
            headers={
                'Cookie': 'PHPSESSID=komdv7aiqci9eq1hkf0u4npc63',
                'Referer': 'http://cha.191.cn/home/search',
            },
            callback=self.parse,
        )

    def start_requests(self):
        """Override the default start to fire the first search POST."""
        yield self._search_request(self.page)

    def parse(self, response):
        """Parse one JSON result page, yield items, then request the next page."""
        data = json.loads(response.text)
        if data['status'] != '0':  # '0' (a string) signals success
            return
        infos = data['data']['list']  # list of up to 20 product dicts
        if not infos:
            # An empty page marks the end of the result set.
            self.logger.info('全部数据爬取完成!')
            return
        for info in infos:
            item = NongchaItem()
            item['nc_id'] = info.get('id')          # product id
            item['nc_name'] = info.get('name')      # product name
            item['nc_company'] = info.get('company_name')  # manufacturer
            item['nc_registry'] = info.get('registry')     # registration cert no.
            # Detail-page image URL is derived from the product id; an
            # f-string also tolerates non-str ids (the original '+' concat
            # would raise TypeError on an int id).
            item['nc_img'] = [
                f"http://cha.191.cn/home/product/detailToImage/productId/{item['nc_id']}"
            ]
            yield item
        # Non-empty page: advance and schedule the next one.
        self.page += 1
        yield self._search_request(self.page)


if __name__ == '__main__':
    from scrapy.cmdline import execute
    execute(['scrapy', 'crawl', 'nc'])
- ↓↓↓pipeline的代码↓↓↓最复杂的部分:分为下载图片、Excel表格存储和异步数据库存储
# -*- coding: utf-8 -*-# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline # 引入scrapy自带的图片下载pipeline
from scrapy import Request
import xlwt
from twisted.enterprise import adbapi
import pymysql
from nongcha.items import NongchaItem  # items persist themselves via .save()


class NongchaPipeline(object):
    """Default template pipeline; passes items through unchanged."""

    def process_item(self, item, spider):
        return item


class CustomImagePipeline(ImagesPipeline):
    """Downloads each item's product image and records the storage path."""

    def get_media_requests(self, item, info):
        """Create one image request per URL in the configured URL field.

        The item travels along in request.meta so file_path() can read
        its fields and write the final path back into it.
        """
        return [
            Request(url, meta={'item': item})
            for url in item.get(self.images_urls_field, [])
        ]

    def file_path(self, request, response=None, info=None):
        """Return the relative path to store the image under.

        Side effect: records the full 'images/...' location in
        item['img_path'] so the downstream pipelines can persist it.
        """
        item = request.meta.get('item')
        if item:
            nc_company = item['nc_company']  # used as the folder name
            nc_id = item['nc_id']            # used as the file name
            path = f'{nc_company}/{nc_id}.jpg'
            item['img_path'] = f'images/{path}'
            return path


class ExcelPipeline(object):
    """Accumulates items into an .xls workbook, saved once at shutdown."""

    # Column order for both the header row and data rows.
    HEADERS = ('nc_id', 'nc_name', 'nc_company', 'nc_registry', 'nc_img', 'img_path')

    def open_spider(self, spider):
        """Create the workbook and write the header row."""
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('ncc_data')
        for col, title in enumerate(self.HEADERS):
            self.sheet.write(0, col, title)
        self.count = 0  # index of the last written row

    def process_item(self, item, spider):
        """Append one item as a row."""
        self.count += 1
        self.sheet.write(self.count, 0, item['nc_id'])
        self.sheet.write(self.count, 1, item['nc_name'])
        self.sheet.write(self.count, 2, item['nc_company'])
        self.sheet.write(self.count, 3, item['nc_registry'])
        self.sheet.write(self.count, 4, item['nc_img'][0])
        # img_path only exists once CustomImagePipeline actually stored the
        # picture; fall back to '' instead of raising KeyError.
        self.sheet.write(self.count, 5, item.get('img_path', ''))
        return item

    def close_spider(self, spider):
        # Save once on shutdown instead of rewriting the whole file per item.
        self.workbook.save('ncc.data.xls')


class AsyncWriteMysql(object):
    """Writes items to MySQL through a Twisted adbapi pool (non-blocking)."""

    def __init__(self):
        """Build the connection pool from static connection parameters."""
        params = dict(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            db='ncc',
            charset='utf8',
            use_unicode=True,
            cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
        )
        # 'pymysql' names the DB-API module adbapi drives; params pass through.
        self.db_pool = adbapi.ConnectionPool('pymysql', **params)

    def process_item(self, item, spider):
        """Queue an asynchronous insert; attach an error callback."""
        deferred = self.db_pool.runInteraction(self.insert_item, item)
        deferred.addErrback(self.insert_error, item)
        return item

    def insert_item(self, cursor, item):
        """Run inside a pool transaction; the item knows how to save itself."""
        item.save(cursor)

    def insert_error(self, failure, item):
        """Error callback: report which record failed and why."""
        print(item['nc_id'])
        print(failure)

    def close_spider(self, spider):
        # Release the pooled connections on shutdown.
        self.db_pool.close()
- **↓↓↓item的代码↓↓↓主要是定义数据字段以及将数据保存进数据库**
# -*- coding: utf-8 -*-# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class NongchaItem(scrapy.Item):
    """Container for one pesticide product record."""

    nc_id = scrapy.Field()        # product id
    nc_name = scrapy.Field()      # product name
    nc_company = scrapy.Field()   # manufacturer name
    nc_registry = scrapy.Field()  # registration certificate number
    nc_img = scrapy.Field()       # list holding the detail-image URL
    img_path = scrapy.Field()     # local path the image was saved under

    def save(self, cursor):
        """Insert this record into the ncc_data table via the given cursor."""
        sql = (
            "INSERT INTO ncc_data(nc_id, nc_name, nc_company, nc_registry, "
            "nc_img, img_path)VALUES (%s, %s, %s, %s, %s, %s)"
        )
        values = (
            self['nc_id'],
            self['nc_name'],
            self['nc_company'],
            self['nc_registry'],
            self['nc_img'][0],
            self['img_path'],
        )
        cursor.execute(sql, values)
- **↓↓↓settings的代码↓↓↓大部分保持注释状态**
# -*- coding: utf-8 -*-
# Scrapy settings for the nongcha project.
#
# Only commonly-used settings are listed here; for everything else see:
# https://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'nongcha'

SPIDER_MODULES = ['nongcha.spiders']
NEWSPIDER_MODULE = 'nongcha.spiders'

# robots.txt of the target site is deliberately ignored.
ROBOTSTXT_OBEY = False

# Maximum concurrent requests performed by Scrapy (this is also the default).
CONCURRENT_REQUESTS = 16

# Headers merged into every outgoing request.
DEFAULT_REQUEST_HEADERS = {
    'Host': 'cha.191.cn',
}

# Pipelines run in ascending priority order:
# image download (200) -> async MySQL write (250) -> Excel export (300).
ITEM_PIPELINES = {
    'nongcha.pipelines.CustomImagePipeline': 200,
    'nongcha.pipelines.ExcelPipeline': 300,
    'nongcha.pipelines.AsyncWriteMysql': 250,
}

# Item field that holds the list of image URLs to download.
IMAGES_URLS_FIELD = 'nc_img'
# Root directory the downloaded images are stored under.
IMAGES_STORE = 'images'
↓↓↓数据存储界面↓↓↓
#敌敌畏图片数据 基本包含所有的信息了
#EXCEL表单数据
#Mysql数据库
end()
实在不行来瓶敌敌畏吧!Scrapy框架爬取某农药肥料网站+异步Mysql数据库存储相关推荐
- scrapy实现爬取全书网小说到Mysql数据库(附代码)
前言 本篇文章实现python的scrapy框架爬取全书网小说,scrapy框架的安装我在这里就不在赘述了,建议window用户使用anaconda安装,这里比较省心一些.运行环境python3(实际 ...
- scrapy框架爬取糗妹妹网站妹子图分类的所有图片
爬取所有图片,一个页面的图片建一个文件夹.难点,图片中有不少.gif图片,需要重写下载规则, 创建scrapy项目 scrapy startproject qiumeimei 创建爬虫应用 cd qi ...
- 利用Python Scrapy框架爬取“房天下”网站房源数据
文章目录 分析网页 获取新房.二手房.租房数据 新房数据 租房数据: 二手房数据 反反爬虫 将数据保存至MongoDB数据库 JSON格式 CSV格式 MongoDB数据库 分析网页 "房天 ...
- scrapy框架爬取校花网站的升级版
**spider目录下的文件:定义DemoSpider类** # -*- coding: utf-8 -*- from scrapy.spiders import CrawlSpider,Rule f ...
- 在ubuntu 16.04里使用python—scrapy将爬取到的数据存到mysql数据库中的一些随笔
一.将爬取的数据保存到mysql数据库的代码(已经能将爬取的数据保存到json文件) (1)编辑Pipeline.py文件 (2)编辑settings.py文件 二.将数据保存至mysql数据库出现的 ...
- 03_使用scrapy框架爬取豆瓣电影TOP250
前言: 本次项目是使用scrapy框架,爬取豆瓣电影TOP250的相关信息.其中涉及到代理IP,随机UA代理,最后将得到的数据保存到mongoDB中.本次爬取的内容实则不难.主要是熟悉scrapy相关 ...
- scrapy获取a标签的连接_python爬虫——基于scrapy框架爬取网易新闻内容
python爬虫--基于scrapy框架爬取网易新闻内容 1.需求[前期准备] 2.分析及代码实现(1)获取五大板块详情页url(2)解析每个板块(3)解析每个模块里的标题中详情页信息 点击此处,获取 ...
- python中scrapy可以爬取多少数据_python中scrapy框架爬取携程景点数据
------------------------------- [版权申明:本文系作者原创,转载请注明出处] 文章出处:https://blog.csdn.net/sdksdk0/article/de ...
- 利用python的scrapy框架爬取google搜索结果页面内容
scrapy google search 实验目的 爬虫实习的项目1,利用python的scrapy框架爬取google搜索结果页面内容. https://github.com/1012598167/ ...
- 基于Scrapy框架爬取豆瓣《复联4》影评,并生成词云
基于Scrapy框架爬取豆瓣<复联4>影评,并生成词云 1. 介绍及开发环境 2. 爬虫实现 2.1 新建项目 2.2 构造请求 2.3 提取信息 2.4 数据存储 2.4 运行结果 3. ...
最新文章
- altium designer2020中文版
- java三层架构是不是策略模式,把「策略模式」应用到实际项目中
- jzoj3084-超级变变变【数学】
- 对Group_concaT函数利用剖析 (转)
- Doris之资源管理
- c# timer 销毁_C# task和timer实现定时操作
- Android开发之软键盘遮盖EditText
- 每个Linux 用户都应该知道的Linux技巧
- 数学建模 CUMCM2018年A题真题(本文由西南民族大学白白不加糖、薄荷CC糖原创)
- HTC T329手机如何删除系统自带的软件?HTC一键解锁、获取ROOT权限、豌豆荚删除系统软件...
- MEME:motif分析的综合性工具
- IDEA自带的逆向工程功能
- gopher攻击mysql_CTFweb类型(二十七)gopher对mysql的利用及例题讲解
- 读 Paxos 到 ZooKeeper ¥ 50大洋
- 苹果电脑系统如何设置成中文
- 使用GUID分区表(GPT)的笔记本硬盘做移动硬盘,windowsXP系统不识别的问题
- 控制科学与工程(自动化)保研经验【2】——南开、同济篇
- Marlin固件之—:基础入门与测试
- 根据大脑频率调整状态
- java double转float_如何将double转换成float类型