scrapy汽车之家车型的简单爬取

汽车之家车型的简单爬取 spider

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from mininova.items import carItem
import sys
# Python 2 only: make UTF-8 the implicit str<->unicode codec so that byte-string
# literals in this module can be mixed with the unicode text the selectors
# return. reload() is needed because site.py removes sys.setdefaultencoding at
# startup. Python 3 has neither name as a builtin/attribute and already
# defaults to UTF-8, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 (builtin on Python 2 only)
    sys.setdefaultencoding('utf8')
class SplashSpider(scrapy.Spider):
    """Crawl the per-letter brand index pages of autohome.com.cn.

    One request is issued per initial letter A-Z. Every ``<dl>`` on a page is
    read as one brand, every ``div.h3-tit`` inside it as one brand sub-category
    and every ``li[@id]`` of the matching ``ul.rank-list-ul`` as one car.
    """

    # Name used by ``scrapy crawl``.
    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    start_urls = []
    # Per-spider settings: send items through the car pipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial letter, tagged with that letter.

        The letter travels in ``meta['word']`` so ``parse`` can store it as
        the item's ``first_word``. Each request carries its *own* letter; the
        URL list is built together with the letters so the two cannot drift
        apart.
        """
        words = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
        pages = [
            (word, 'https://www.autohome.com.cn/grade/carhtml/' + word + '.html')
            for word in words
        ]
        # Still expose the URLs on the spider, but as an instance attribute
        # rather than by appending to the list shared by the whole class.
        self.start_urls = [url for _, url in pages]
        for word, url in pages:
            yield Request(url, meta={'word': word})

    def parse(self, response):
        """Extract every car listed on one letter page.

        Args:
            response: page response; ``response.meta['word']`` must hold the
                initial letter the page was requested for.

        Returns:
            list of ``carItem``. Entries without a brand name, car name or car
            URL are skipped instead of aborting the whole page.
        """
        self.logger.debug('url %s', response.url)
        word = response.meta['word']
        total_cars = []

        # Work with selectors relative to each <dl> instead of rebuilding an
        # absolute '//dl[N]/...' query from the document root for every field.
        for brand_node in response.xpath('//dl'):
            brand = brand_node.xpath('./dt/div[1]/a/text()').extract_first()
            if brand is None:
                # A <dl> without the brand header cannot be attributed.
                continue
            self.logger.debug('brand:%s', brand)
            # May be None when the brand has no logo image.
            brand_logo_url = brand_node.xpath('./dt//img[1]/@src').extract_first()

            # Sub-category names and their page URLs, in document order.
            brand_items = brand_node.xpath(
                './dd//div[@class="h3-tit"]/a/text()').extract()
            brand_item_urls = brand_node.xpath(
                './dd//div[@class="h3-tit"]/a/@href').extract()

            # zip() stops at the shorter list, so a title without a link can
            # no longer raise IndexError.
            pairs = zip(brand_items, brand_item_urls)
            for position, (brand_item, brand_item_url) in enumerate(pairs, 1):
                self.logger.debug('brand_item:%s', brand_item)
                self.logger.debug('brand_item_url:%s', brand_item_url)

                # The N-th title is paired with the N-th car list (XPath
                # positions are 1-based).
                cars = brand_node.xpath(
                    './dd//ul[@class="rank-list-ul"][%d]/li[@id]' % position)
                self.logger.debug('cars_count:%d', len(cars))

                for car_node in cars:
                    name = car_node.xpath('./h4/a/text()').extract_first()
                    url = car_node.xpath('./h4/a/@href').extract_first()
                    if name is None or url is None:
                        continue
                    # Quoted price text, expected as "<min>-<max>万".
                    price = car_node.xpath(
                        './div[1]/a/text()').extract_first(default='')
                    min_price, max_price = self._split_price(price)
                    self.logger.debug(
                        'car:%s max_price:%s min_price:%s',
                        name, max_price, min_price)

                    car_item = carItem()
                    car_item['name'] = name
                    car_item['url'] = url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    total_cars.append(car_item)
        return total_cars

    @staticmethod
    def _split_price(price):
        """Split "<min>-<max>万" into ``(min_price, max_price)`` strings.

        Any text that does not split into exactly two parts on '-' yields
        ('暂无', '暂无'), i.e. "not available".
        """
        price_base = '万'
        prices = price.split('-')
        if len(prices) != 2:
            return '暂无', '暂无'
        return str(prices[0]), str(prices[1].replace(price_base, ''))
复制代码

item（mininova/items.py）

# -*- coding: utf-8 -*-
import scrapy
class carItem(scrapy.Item):
    """Fields collected for a single car model."""

    # Name of the specific car model.
    name = scrapy.Field()
    # URL of the model's introduction page.
    url = scrapy.Field()
    # Highest quoted price, in units of 万 (ten thousand).
    max_price = scrapy.Field()
    # Lowest quoted price, in units of 万 (ten thousand).
    min_price = scrapy.Field()
    # Brand name.
    brand = scrapy.Field()
    # URL of the brand logo image.
    brand_logo_url = scrapy.Field()
    # Name of the brand sub-category the model is listed under.
    brand_item = scrapy.Field()
    # Initial letter the brand is indexed under.
    first_word = scrapy.Field()
复制代码

mongo_car（mininova/mongo_car.py）

from mininova.mongodb import Mongo
from mininova.settings import mongo_setting
class MongoCar(object):
    """Store scraped car items in the ``car`` database.

    One item is spread over three collections -- ``brand``, ``brand_item`` and
    ``car`` -- linked through ``brand_id`` and ``brand_item_id``.

    NOTE(review): ``db.set(db_name, set_name)`` is assumed to return a
    pymongo-style collection (``find_one`` / ``insert_one`` / ``update_one``);
    confirm against ``mininova.mongodb.Mongo``.
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password'],
        )

    def insert(self, item):
        """Store one scraped item, creating brand / brand item / car as needed.

        Args:
            item: mapping with the keys ``brand``, ``first_word``,
                ``brand_logo_url``, ``brand_item``, ``name``, ``url``,
                ``max_price`` and ``min_price``.

        Returns:
            True when a car document exists after the call, otherwise False.
        """
        brand_where = {'name': item['brand']}
        # The logo is written on first insert too, not only when the brand is
        # seen a second time.
        brand_fields = {
            'name': item['brand'],
            'first_word': item['first_word'],
            'logo_url': item['brand_logo_url'],
        }
        if self.brand_exist(self.db, brand_where):
            brand = self.update_brand(self.db, brand_where, brand_fields)
            print('brand_exist!')
        else:
            brand = self.insert_brand(self.db, brand_fields)
            print('brand insert ok!')

        brand_item = self.brand_item_exist(self.db, {'name': item['brand_item']})
        if brand_item:
            print('brand_item_exist!')
        else:
            brand_item = self.insert_brand_item(self.db, {
                'name': item['brand_item'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
            })
            print('brand_item insert ok!')

        # Scope the duplicate check to the brand item. The previous filter
        # listed the key 'name' twice, so the brand-item part was silently
        # overwritten and same-named cars of other series were dropped.
        car_where = {'name': item['name'], 'brand_item_id': brand_item['_id']}
        car = self.car_exist(self.db, car_where)
        if car:
            print('car_exist!')
        else:
            car = self.insert_car(self.db, {
                'name': item['name'],
                'url': item['url'],
                'max_price': item['max_price'],
                'min_price': item['min_price'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
                'brand_item_id': brand_item['_id'],
            })
            print('car insert ok!')
        return bool(car)

    def _find(self, db, set_name, where):
        """Return the first document matching ``where``, or False if none."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert(self, db, set_name, doc):
        """Insert ``doc`` and return it as read back from the collection."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """Apply ``brand`` as a ``$set`` update; return the brand or False."""
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {'$set': brand})
        exist = my_set.find_one(brand_where)
        return False if exist is None else exist

    def brand_exist(self, db, brand):
        """Return the brand document matching ``brand``, or False."""
        return self._find(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return the stored document."""
        return self._insert(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the brand-item document matching ``brand_item``, or False."""
        return self._find(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a brand-item document and return the stored document."""
        return self._insert(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching ``car``, or False."""
        return self._find(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return the stored document."""
        return self._insert(db, self.car_set_name, car)
复制代码

pipeline（mininova/pipelines.py）

from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
from mininova.mongo_car import MongoCar
import copy
class CarPipeline(object):
    """Item pipeline that writes every scraped car item through ``MongoCar``."""

    # Created on first use and then reused, so a new MongoCar (and whatever
    # connection its constructor opens) is not built for every single item.
    _mongo_car = None

    def process_item(self, item, spider):
        """Store ``item`` and hand it on to the next pipeline stage.

        Args:
            item: the scraped car item.
            spider: the spider that produced the item (unused).

        Returns:
            The same item. Scrapy passes the return value of ``process_item``
            to the following pipeline, so returning nothing would feed it None.
        """
        if self._mongo_car is None:
            self._mongo_car = MongoCar()
        self._mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        return item
复制代码

setting（mininova/settings.py）

# MongoDB connection parameters. The host, user and password shown here are
# placeholders and must be replaced with real values.
mongo_setting = dict(
    mongo_host='xxx.xxx.xxx.xxx',
    mongo_port=27017,
    mongo_user='username',
    mongo_password='password',
)
复制代码

scrapy汽车之家车型的简单爬取相关推荐

python爬取汽车之家图片_Python 汽车之家车型全数据爬取
所有车型数据分析发现所有车型数据在一个js文件中: ps:当然也可通过解析网页 xpath提取,或通过接口,获取方式有很多种,此文主要需要seriesId 车型ID 这一项数据为获取车型价格做准备 ...
python爬虫之汽车之家论坛帖子内容爬取
Datawhale爬虫第五期 Day7 实战项目:汽车之家车型论坛帖子信息作为国内目前第一大汽车论坛,反爬虫很恶心,中间很多坑. 新手,第一次搞这么复杂的爬虫,前期没有排查,都是遇到坑的时候再返回 ...
python爬取汽车之家图片,Python requests 爬取汽车之家全部品牌logo，urllib下载到本地...
首先是汽车之家品牌页面的HTML 我们定位到图片那个位置,这个img标签的src加上https就是图片完整的地址那么品牌名称就是下面那个p标签的text 接下来我们的目的就是从中取出src和text ...
table 汽车之家车型对比简单实现列变行
在线demo https://ldtec.github.io/ 1.转换前: 2.转换后: 3.代码: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML ...
如何去除网页噪声提取数据（02） —— 汽车之家（字体反爬）
如何去除网页噪声提取数据(02) -- 汽车之家(字体反爬) 1. 需求介绍继去哪儿网之后,我又盯上了汽车之家这个网站,这个网站的反爬策略挺有意思的,采用了时下最流行的字体反爬技术,让我心神荡漾,对 ...
Scrapy学习之第一个简单爬取小程序
1.首先,先安装scrapy模块,使用命令:pip install scrapy,安装如果出现error: Microsoft Visual C++ 14.0 is required错误的话可参考文章 ...
scrapy简单爬取内容
scrapy的简单爬取不用新建项目.安装好scrapy后编写爬虫文件 import scrapyclass ZaobaoScrapy(scrapy.Spider):name = "zaoba ...
Scrapy笔记十二：简单爬取苏宁书籍网站
文章目录简单爬取苏宁书籍网站参考网址: 整个爬取过程思路: 未解决的问题: 代码如下: 简单爬取苏宁书籍网站参考网址: 网址1:https://book.suning.com/ 网址2:http ...
Scrapy框架的学习(2.scrapy入门，简单爬取页面，并使用管道(pipelines)保存数据)
上个博客写了: Scrapy的概念以及Scrapy的详细工作流程 https://blog.csdn.net/wei18791957243/article/details/86154068 1.sc ...

scrapy汽车之家车型的简单爬取

scrapy汽车之家车型的简单爬取相关推荐

最新文章

热门文章