Scrapy爬取重庆安居客二手房并存入mysql数据库（下）

上篇中我们获取了重庆的一二级区（Scrapy爬取重庆安居客二手房并存入mysql数据库（上）），这一篇我们根据二级区获取相应的二手房信息。

初始化数据库

创建二手房信息数据库表，house表存放二手房信息，house_price存放价格（定期获取分析价格趋势）：

CREATE TABLE `house` (`id` int UNSIGNED AUTO_INCREMENT,`area_id` int NOT NULL DEFAULT 0,`area_code` varchar(255) DEFAULT NULL,`title` varchar(2000) DEFAULT NULL,`unit_price` decimal(19,4) NOT NULL DEFAULT 0,`total_price` decimal(19,4) NOT NULL DEFAULT 0,`code` varchar(255) DEFAULT NULL,`community` varchar(255) DEFAULT NULL,`location` varchar(255) DEFAULT NULL,`build_years` varchar(255) DEFAULT NULL,`floor` varchar(255) DEFAULT NULL,`layout` varchar(255) DEFAULT NULL,`size` varchar(255) DEFAULT NULL,`picture_url` varchar(255) DEFAULT NULL,`url` varchar(1024) DEFAULT NULL,`created_on` timestamp DEFAULT current_timestamp,`updated_on` timestamp DEFAULT current_timestamp on update current_timestamp,PRIMARY KEY (id)
) ENGINE=INNODB DEFAULT CHARSET=utf8;CREATE TABLE `house_price` (`id` int UNSIGNED AUTO_INCREMENT,`house_id` int NOT NULL DEFAULT 0,`house_code` varchar(255) DEFAULT NULL,`unit_price` decimal(19,4) NOT NULL DEFAULT 0,`total_price` decimal(19,4) NOT NULL DEFAULT 0,`created_on` timestamp DEFAULT current_timestamp,`updated_on` timestamp DEFAULT current_timestamp on update current_timestamp,PRIMARY KEY (id)
) ENGINE=INNODB DEFAULT CHARSET=utf8;

创建项目

创建二手房scrapy项目：

scrapy startproject hourse

项目创建成功后，目录结构如下：

定义item

打开items.py文件，定义二手房信息item：

import scrapyclass HourseItem(scrapy.Item):area_code = scrapy.Field()title = scrapy.Field()unit_price = scrapy.Field()total_price = scrapy.Field()code = scrapy.Field()community = scrapy.Field()location = scrapy.Field()build_years = scrapy.Field()floor = scrapy.Field()layout = scrapy.Field()size = scrapy.Field()picture_url = scrapy.Field()url = scrapy.Field()

开发spider

在spiders目录中新建二手房爬虫的spider文件housespider.py，根据页面结构使用css选择器解析处理数据：

import scrapy
import re
import pymysql
from hourse.items import HourseItemclass HouseSpider(scrapy.Spider):name = 'house'allow_domains = ["anjuke.com"]# start_urls = [#     'https://chongqing.anjuke.com/sale/p1/',# ]def __init__(self):self.db = pymysql.connect("localhost", "root", "123456", "house", charset="utf8")self.cursor = self.db.cursor()# 从数据库读取区域，按照区域查询数据results = self.get_areas()urls = []for area_item in results:urls.append('https://chongqing.anjuke.com/sale/' + area_item[1] + '/')# breakself.start_urls = urlsself.enable_next_page = Truedef close(self):self.db.close()def get_areas(self):# 区域数量太多，会触发人机校验，请求连接会跳转到人工操作页面，这里添加条件只获取部分区域select_sql = "select * from house_area where parent_id > 0"# select_sql = "select * from house_area where id = 1"self.cursor.execute(select_sql)results = self.cursor.fetchall()return resultsdef parse(self, response):house_list = response.css('li.list-item')house_item = HourseItem()house_item['area_code'] = ''current_url = re.search('sale/([^/]+)/', response.url)if current_url:house_item['area_code'] = current_url.group(1)pat_code = '/([a-zA-Z0-9]+)\?'for item in house_list:house_item['title'] = item.css('.house-title a::text').extract_first().strip()house_item['url'] = item.css('.house-title a::attr(href)').extract_first().strip()# 总价万为单位house_item['total_price'] = re.search('\d+', item.css('.pro-price .price-det').extract_first().strip()).group()house_item['code'] = ''if house_item['url']:search_code = re.search(pat_code, house_item['url'])if search_code:house_item['code'] = search_code.group(1)house_item['community'] = ''house_item['location'] = item.css('.details-item:nth-child(3) span.comm-address::attr(title)').extract_first().replace(u'\xa0', u' ').strip()if house_item['location']:address_items = re.split('\s+', house_item['location'])if len(address_items) > 1:house_item['community'] = address_items[0]house_item['build_years'] = item.css('.details-item:nth-child(2) span:nth-child(7)::text').extract_first().strip()house_item['floor'] = item.css('.details-item:nth-child(2) span:nth-child(5)::text').extract_first().strip()house_item['layout'] = item.css('.details-item:nth-child(2) span:nth-child(1)::text').extract_first().strip()house_item['size'] = item.css('.details-item:nth-child(2) span:nth-child(3)::text').extract_first().strip()house_item['picture_url'] = item.css('.item-img img::attr(src)').extract_first().strip()house_item['unit_price'] = re.search('\d+', item.css('.pro-price .unit-price::text').extract_first().strip()).group()yield house_item#分页操作next_page = response.css('.multi-page a.aNxt::attr(href)').extract_first()if self.enable_next_page and next_page:#构建新的Request对象next_url = next_page.strip()yield scrapy.Request(next_url, callback=self.parse)

开发pipeline

打开pipelines.py文件，将spider中爬取处理的数据存入数据库：

import pymysqlclass HoursePipeline(object):def __init__(self):self.db = pymysql.connect("localhost", "root", "123456", "house", charset="utf8")self.cursor = self.db.cursor()def process_item(self, item, spider):select_area_sql = "select id from house_area where code='%s'" % item['area_code']is_area_exist = self.cursor.execute(select_area_sql)house_area = self.cursor.fetchone()area_id = 0if house_area:area_id = house_area[0]select_sql = "select id from house where code='%s'" % item['code']already_save = self.cursor.execute(select_sql)house_item = self.cursor.fetchone()self.db.commit()if already_save == 1:# 更新信息house_id = house_item[0]update_sql = "update house set title='%s', unit_price='%d', total_price='%d', url='%s' where id='%d'" % (item['title'],int(item['unit_price']),int(item['total_price']),item['url'],int(house_id))self.cursor.execute(update_sql)self.db.commit()# 插入价格self.add_price(house_id, item['code'], item['unit_price'], item['total_price'])  else:# 插入信息sql = "insert into house(area_id,area_code,title,unit_price,total_price,code,community,location,build_years,floor,layout,size,picture_url,url)\values('%d','%s','%s','%d','%d','%s','%s','%s','%s','%s','%s','%s','%s','%s')"\%(area_id,item['area_code'],item['title'],int(item['unit_price']),int(item['total_price']),item['code'],item['community'],item['location'],\item['build_years'],item['floor'], item['layout'],item['size'],item['picture_url'],item['url'])self.cursor.execute(sql)house_id = int(self.db.insert_id())self.db.commit()# 插入价格self.add_price(house_id, item['code'], item['unit_price'], item['total_price'])  return itemdef add_price(self, house_id, house_code, unit_price, total_price):sql = "insert into house_price(house_id, house_code, unit_price, total_price)\values('%d','%s','%d','%d')" % (int(house_id), house_code, int(unit_price), int(total_price))self.cursor.execute(sql)self.db.commit() def __del__(self):self.db.close()

设置settings

打开settings.py文件，这只项目的USER_AGENT和ITEM_PIPELINES，其余设置保持不变：

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'ITEM_PIPELINES = {'hourse.pipelines.HoursePipeline': 300,
}

启动爬虫

在项目根目录下，在命令行中运行如下命令开始爬虫程序（博主在vscode中项目下直接运行的命令）：

scrapy crawl house

至此使用scrapy获取重庆的二手房数据已经完成，博主才入坑python，通过该项目学习了scrapy框架和一些基础的python语法。如果有不妥的笛梵，欢迎大家提意见。

项目的Github地址：https://github.com/liuhuifly/scrapy_cqanjuke，数据库初始化脚本在hourse项目的sqlscripts目录下。

声明：本项目仅仅供学习使用，使用该项目从事的一切商业行为与博主无关，自行承担责任。