汽车之家车型的简单爬取 spider

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from mininova.items import carItem
import sys
# Python 2-only workaround: `reload` is a builtin only on Python 2, and
# reloading `sys` brings back `setdefaultencoding`, which is normally removed
# at interpreter start-up. The call below makes UTF-8 the process-wide default
# for implicit str/unicode conversions.
# NOTE(review): presumably needed because the spider mixes byte-string
# literals such as '万' with unicode text from the response -- confirm before
# removing. These two lines must be dropped when porting to Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
class SplashSpider(scrapy.Spider):
    """Crawl Autohome's per-letter brand index pages and collect car models.

    One request is issued per initial letter (A-Z). Each response is parsed
    into ``carItem`` objects, one per car model, carrying the brand, the
    sub-brand ("brand item"), the quoted price range and the index letter.
    """

    # Spider name
    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    start_urls = []
    # Custom settings: route items through the car pipeline only
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial letter, tagged with that letter.

        The letter is attached to the request it belongs to, so
        ``response.meta['word']`` in ``parse`` is the letter of the page being
        parsed. The class-level ``start_urls`` list is deliberately left
        untouched: it is shared by all instances and would otherwise grow on
        every call.
        """
        for word in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            url = 'https://www.autohome.com.cn/grade/carhtml/' + word + '.html'
            yield Request(url, meta={'word': word})

    def _split_price(self, price, price_base):
        """Split a 'min-max<unit>' quote into ``(min_price, max_price)``.

        Any text that does not split into exactly two parts on '-' yields
        ('暂无', '暂无').
        """
        prices = price.split('-')
        if len(prices) != 2:
            return '暂无', '暂无'
        return str(prices[0]), str(prices[1].replace(price_base, ''))

    def parse(self, response):
        """Default callback: extract every car model listed on a letter page.

        Args:
            response: the page response; ``response.meta['word']`` must hold
                the initial letter the page belongs to.

        Returns:
            list: one ``carItem`` per car model found on the page.
        """
        print('url')
        print(response.url)
        word = response.meta['word']
        price_base = '万'
        total_cars = []

        brand_count = len(response.xpath('//dl').extract())
        # XPath positions are 1-based, so count from 1.
        for brand_num in range(1, brand_count + 1):
            brand_path = '//dl[' + str(brand_num) + ']'
            # Brand name
            brand = response.xpath(brand_path + '/dt/div[1]/a/text()').extract()[0]
            print('brand:' + brand)
            # Brand logo
            brand_logo_url = response.xpath(brand_path + '/dt//img[1]/@src').extract()[0]
            # Sub-brand names and the pages they link to
            title_path = brand_path + '/dd//div[@class="h3-tit"]/a'
            brand_items = response.xpath(title_path + '/text()').extract()
            brand_item_urls = response.xpath(title_path + '/@href').extract()

            pairs = zip(brand_items, brand_item_urls)
            for brand_item_num, (brand_item, brand_item_url) in enumerate(pairs, 1):
                print('brand_item:' + brand_item)
                print('brand_item_url:' + brand_item_url)
                # All cars listed under this sub-brand
                list_path = (brand_path + '/dd//ul[@class="rank-list-ul"]['
                             + str(brand_item_num) + ']')
                car_count = len(response.xpath(list_path + '/li[@id]').extract())
                print('cars_count:' + str(car_count))

                for car_num in range(1, car_count + 1):
                    car_path = list_path + '/li[@id][' + str(car_num) + ']'
                    # Car model name
                    name = response.xpath(car_path + '/h4/a/text()').extract()[0]
                    # Page of the car model
                    url = response.xpath(car_path + '/h4/a/@href').extract()[0]
                    # Quote (lowest-highest); a missing price link is treated
                    # like an unparsable quote instead of aborting the page.
                    quotes = response.xpath(car_path + '/div[1]/a/text()').extract()
                    price = quotes[0] if quotes else ''
                    min_price, max_price = self._split_price(price, price_base)
                    print('car:'+name+' max_price:'+str(max_price)+' min_price:'+str(min_price)+' price_base:'+price_base)

                    car_item = carItem()
                    car_item['name'] = name
                    car_item['url'] = url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    total_cars.append(car_item)
        return total_cars
复制代码

item

# -*- coding: utf-8 -*-
import scrapy
class carItem(scrapy.Item):
    """Container for one car model together with its brand information."""

    # Car model name
    name = scrapy.Field()
    # URL of the model's introduction page
    url = scrapy.Field()
    # Highest quoted price, in units of 万 (10,000)
    max_price = scrapy.Field()
    # Lowest quoted price, in units of 万 (10,000)
    min_price = scrapy.Field()
    # Brand name
    brand = scrapy.Field()
    # Brand logo URL
    brand_logo_url = scrapy.Field()
    # Sub-brand ("brand item") name
    brand_item = scrapy.Field()
    # Initial letter of the brand
    first_word = scrapy.Field()
复制代码

mongo_car

from mininova.mongodb import Mongo
from mininova.settings import mongo_setting
class MongoCar(object):
    """Store scraped cars in MongoDB as brand / sub-brand / car documents.

    All three collections live in the ``car`` database. ``self.db`` must
    provide ``set(db_name, set_name)`` returning a collection object with
    ``find_one``, ``insert_one`` and ``update_one``.
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password'],
        )

    def insert(self, item):
        """Upsert the brand, then insert the sub-brand and car if missing.

        Args:
            item: mapping with the keys ``brand``, ``brand_logo_url``,
                ``brand_item``, ``first_word``, ``name``, ``url``,
                ``max_price`` and ``min_price``.

        Returns:
            bool: True once the car is known to the store (either already
            present or just inserted).
        """
        brand_where = {'name': item['brand']}
        # The logo is written on first insert as well as on update, so a
        # brand seen only once still gets its logo_url.
        brand_doc = {
            'name': item['brand'],
            'first_word': item['first_word'],
            'logo_url': item['brand_logo_url'],
        }
        if self.brand_exist(self.db, brand_where) is False:
            brand = self.insert_brand(self.db, brand_doc)
            print('brand insert ok!')
        else:
            brand = self.update_brand(self.db, brand_where, brand_doc)
            print('brand_exist!')

        brand_item_where = {'name': item['brand_item']}
        brand_item = self.brand_item_exist(self.db, brand_item_where)
        if brand_item is False:
            brand_item = self.insert_brand_item(self.db, {
                'name': item['brand_item'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
            })
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        # A car is identified by its name within its sub-brand. The previous
        # filter repeated the 'name' key, so the sub-brand was silently
        # dropped and same-named cars of different sub-brands collided.
        car_where = {
            'name': item['name'],
            'brand_item_id': brand_item['_id'],
        }
        car = self.car_exist(self.db, car_where)
        if car is False:
            car = self.insert_car(self.db, {
                'name': item['name'],
                'url': item['url'],
                'max_price': item['max_price'],
                'min_price': item['min_price'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
                'brand_item_id': brand_item['_id'],
            })
            print('car insert ok!')
        else:
            print('car_exist!')
        return car is not False

    def _find(self, db, set_name, where):
        """Return the first document matching ``where``, or False if none."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert(self, db, set_name, doc):
        """Insert ``doc`` and return it as read back from the collection."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """Apply ``brand`` as a ``$set`` to the matching brand.

        Returns the brand document matching ``brand_where`` afterwards, or
        False if there is none.
        """
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {'$set': brand})
        return self._find(db, self.brand_set_name, brand_where)

    def brand_exist(self, db, brand):
        """Return the brand document matching ``brand``, or False."""
        return self._find(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return the stored document."""
        return self._insert(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the sub-brand document matching ``brand_item``, or False."""
        return self._find(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a sub-brand document and return the stored document."""
        return self._insert(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching ``car``, or False."""
        return self._find(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return the stored document."""
        return self._insert(db, self.car_set_name, car)
复制代码

pipeline

from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
from mininova.mongo_car import MongoCar
import copy
class CarPipeline(object):
    """Item pipeline that stores every scraped car through ``MongoCar``."""

    # MongoCar helper, created on the first item and reused afterwards so a
    # new connection is not opened for every item.
    _mongo_car = None

    def process_item(self, item, spider):
        """Store ``item`` and hand it on to the next pipeline stage.

        Args:
            item: the scraped car item; its ``name`` is printed after storing.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``. Scrapy passes the return value to the next
            pipeline, so returning nothing would forward ``None``.
        """
        if self._mongo_car is None:
            self._mongo_car = MongoCar()
        self._mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        return item
复制代码

settings

# MongoDB connection parameters (placeholder values; replace before running).
mongo_setting = dict(
    mongo_host='xxx.xxx.xxx.xxx',
    mongo_port=27017,
    mongo_user='username',
    mongo_password='password',
)
复制代码

scrapy汽车之家车型的简单爬取相关推荐

  1. python爬取汽车之家图片_Python 汽车之家 车型全数据 爬取

    所有车型数据 分析发现所有车型数据在一个js文件中: ps:当然也可通过解析网页 xpath提取,或通过接口,获取方式有很多种,此文主要需要seriesId 车型ID 这一项数据 为获取车型价格做准备 ...

  2. python爬虫之汽车之家论坛帖子内容爬取

    Datawhale爬虫 第五期 Day7 实战项目:汽车之家车型论坛帖子信息 作为国内目前第一大汽车论坛,反爬虫很恶心,中间很多坑. 新手,第一次搞这么复杂的爬虫,前期没有排查,都是遇到坑的时候再返回 ...

  3. python爬取汽车之家图片,Python requests 爬取汽车之家全部品牌logo,urllib下载到本地...

    首先是汽车之家品牌页面的HTML 我们定位到图片那个位置,这个img标签的src加上https就是图片完整的地址 那么品牌名称就是下面那个p标签的text 接下来我们的目的就是从中取出src和text ...

  4. table 汽车之家 车型对比 简单实现 列变行

    在线demo https://ldtec.github.io/ 1.转换前: 2.转换后: 3.代码: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML ...

  5. 如何去除网页噪声提取数据(02) —— 汽车之家(字体反爬)

    如何去除网页噪声提取数据(02) -- 汽车之家(字体反爬) 1. 需求介绍 继去哪儿网之后,我又盯上了汽车之家这个网站,这个网站的反爬策略挺有意思的,采用了时下最流行的字体反爬技术,让我心神荡漾,对 ...

  6. Scrapy学习之第一个简单爬取小程序

    1.首先,先安装scrapy模块,使用命令:pip install scrapy,安装如果出现error: Microsoft Visual C++ 14.0 is required错误的话可参考文章 ...

  7. scrapy简单爬取内容

    scrapy的简单爬取不用新建项目.安装好scrapy后编写爬虫文件 import scrapyclass ZaobaoScrapy(scrapy.Spider):name = "zaoba ...

  8. Scrapy笔记十二:简单爬取苏宁书籍网站

    文章目录 简单爬取苏宁书籍网站 参考网址: 整个爬取过程思路: 未解决的问题: 代码如下: 简单爬取苏宁书籍网站 参考网址: 网址1:https://book.suning.com/ 网址2:http ...

  9. Scrapy框架的学习(2.scrapy入门,简单爬取页面,并使用管道(pipelines)保存数据)

    上个博客写了:  Scrapy的概念以及Scrapy的详细工作流程 https://blog.csdn.net/wei18791957243/article/details/86154068 1.sc ...

最新文章

  1. 牛顿迭代法的可视化详解
  2. 使用Source Safe for SQL Server解决数据库版本管理问题
  3. Mysql大型数量下的数据库构建的30条建议
  4. windows10环境下安装Anaconda环境
  5. 为不同的屏幕尺寸提供不同的图片(为那些没有必要下载全尺寸大图的设备节省带宽)...
  6. ArcGIS客户端开发学习笔记(二)——XML
  7. wolfssl 何如 https post_干货:手把手教你优化关键词|亚马逊|流量|搜索量|长尾词|https...
  8. 成都程序员俱乐部通知
  9. z-buffer的概念和算法
  10. java设计模式案例及使用
  11. win10卸载内置应用_如何卸载Windows 10的内置应用程序(以及如何重新安装它们)...
  12. newifi mini php,WBB - Newifi mini刷小米路由mini固件 + 屏蔽广告Adbyby插件小记
  13. 影视双端360版2.0带三级分销
  14. C语言中.和->的区别
  15. 12月组队学习——JoyfulPandas第二章思维导图
  16. 老毛桃PE系统,迁移系统机械到固态硬盘
  17. java 假设检验_Spark机器学习-Java版(二)-相关系数和假设检验
  18. 百度阿里网易大疆等大小厂前端校招面筋 1
  19. apktool解包和打包apk
  20. 文献总结——气象中的滞后相关应用举例

热门文章

  1. Android商城App购物车规格联动选择
  2. Verilog 1.0常用CD4000系列标准数字电路及其框图
  3. Linux_Geany使用指南
  4. Python之12306余票查询
  5. 西门子SMART LINE触摸屏以太网连接松下PLC
  6. Unity Shader unity文档学习笔记(十一):战争迷雾核心算法
  7. Java项目:在线博客问答系统(java+Springboot+jsp+maven+mysql)
  8. 阿里yum源与华为yum源的配置
  9. 光云电力发电领域碳排放太高,光云电力指出调整能源结构是关键一步
  10. Python全栈开发-Python爬虫-03 正则表达式详解