This crawl targets property listings on Anjuke (安居客). Without further ado, straight to the code!

Part 1: The crawler

1. Launch script: run.py

from scrapy import cmdline

cmdline.execute('scrapy crawl anjuke_shanghai'.split())
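run.py just hands the usual CLI command to Scrapy, so it is equivalent to running scrapy crawl anjuke_shanghai from the project root. If you would rather launch the spider in-process, a minimal alternative (a sketch, not part of the original project) could look like this:

# Alternative launcher sketch: runs the spider in-process instead of
# shelling out to the scrapy CLI. Assumes it is executed from the project
# root so get_project_settings() can locate scrapy.cfg.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('anjuke_shanghai')
process.start()  # blocks until the crawl finishes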

2. Page parsing: anjuke_shanghai.py

import scrapy
import time
from anjuke.items import AnjukeItem

class AnjukeShanghaiSpider(scrapy.Spider):
    name = 'anjuke_shanghai'
    allowed_domains = ['anjuke.com']
    # the crawl starts from listing page 11
    start_urls = ['https://shanghai.anjuke.com/sale/p11/#filtersort']

    # number of the next listing page to request
    next_page_id = 12

    def parse(self, response):
        for ajk in response.xpath("//ul[@id='houselist-mod-new']/li"):
            time.sleep(5)
            item = AnjukeItem()

            # listing title
            title = ajk.xpath(".//div[@class='house-title']/a/text()")[0].extract()
            time.sleep(1)
            item['title'] = title.strip()

            # total price
            price = ajk.xpath(".//span[@class='price-det']/strong/text()")[0].extract()
            time.sleep(1)
            item['price'] = price

            # unit price, stripped of its trailing 元/m²
            unit_price = ajk.xpath(".//span[@class='unit-price']/text()")[0].extract()
            time.sleep(1)
            if len(unit_price) > 0:
                item['unit_price'] = unit_price.replace("元/m²", "")
            else:
                item['unit_price'] = ""

            # address, whitespace-normalized
            site = ajk.xpath(".//span[@class='comm-address']/text()").extract()
            time.sleep(1)
            if len(site) > 0:
                st = site[0].split()
                item['site'] = " ".join(st)
            else:
                item['site'] = ""

            # layout type, e.g. 2室1厅
            house_type = ajk.xpath(".//div[@class='details-item']/span[1]/text()").extract()
            time.sleep(1)
            if len(house_type) > 0:
                item['house_type'] = house_type[0]
            else:
                item['house_type'] = ""

            # floor area, stripped of its trailing m²
            area = ajk.xpath(".//div[@class='details-item']/span[2]/text()")[0].extract()
            time.sleep(1)
            if len(area) > 0:
                item['area'] = area.replace("m²", "")
            else:
                item['area'] = ""

            # link to the listing detail page
            item['house_url'] = ajk.xpath(".//div[@class='house-title']/a/@href")[0].extract()
            time.sleep(1)
            yield item

        # queue the next listing page until page 50
        url = "https://shanghai.anjuke.com/sale/p{}/#filtersort".format(self.next_page_id)
        if self.next_page_id < 50:
            time.sleep(5)
            yield scrapy.Request(url=url, dont_filter=True, callback=self.parse)
            self.next_page_id += 1
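Two caveats about parse() are worth noting. First, time.sleep() blocks Scrapy's single-threaded reactor, so every sleep stalls all concurrent requests; the DOWNLOAD_DELAY and AutoThrottle settings configured in settings.py below achieve polite pacing without blocking. Second, indexing a selector with [0] raises IndexError whenever a listing is missing a field. A more defensive variant of the extraction, sketched under the assumption that the page structure is otherwise unchanged (only a few fields shown; it would drop in as the method body inside AnjukeShanghaiSpider):

# Defensive extraction sketch: .get() returns a default instead of raising
# IndexError when a node is missing, so a layout change does not crash the
# callback.
def parse(self, response):
    for ajk in response.xpath("//ul[@id='houselist-mod-new']/li"):
        item = AnjukeItem()
        item['title'] = (ajk.xpath(".//div[@class='house-title']/a/text()").get() or "").strip()
        item['price'] = ajk.xpath(".//span[@class='price-det']/strong/text()").get(default="")
        unit_price = ajk.xpath(".//span[@class='unit-price']/text()").get(default="")
        item['unit_price'] = unit_price.replace("元/m²", "")
        item['house_url'] = ajk.xpath(".//div[@class='house-title']/a/@href").get(default="")
        yield item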

3. items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class AnjukeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # listing title
    title = scrapy.Field()
    # total price
    price = scrapy.Field()
    # price per square metre
    unit_price = scrapy.Field()
    # location
    site = scrapy.Field()
    # layout type
    house_type = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # listing URL
    house_url = scrapy.Field()
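A scrapy.Item behaves like a dict with a fixed key set, so a typo in a field name fails loudly instead of silently creating a new key. A quick hypothetical illustration:

item = AnjukeItem()
item['title'] = 'Example listing'
# item['titel'] = '...'  # would raise KeyError: AnjukeItem does not support field: titel
print(dict(item))        # {'title': 'Example listing'}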

4. middlewares.py

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface

from itemadapter import is_item, ItemAdapter

class AnjukeSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class AnjukeDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
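Both middleware classes above are the untouched project template. As an illustration of what process_request() is typically used for, here is a hypothetical downloader middleware that rotates the User-Agent header per request; it is not part of the original project and would also need its own entry in DOWNLOADER_MIDDLEWARES:

# Hypothetical example, not from the original post: rotate User-Agent
# headers so successive requests look less uniform.
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
]

class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # overwrite the User-Agent header before the request is downloaded
        request.headers['User-Agent'] = random.choice(USER_AGENTS)
        return None  # continue normal processing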

5. pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#
# useful for handling different item types with a single interface

from itemadapter import ItemAdapter
import pymysql

class AnjukePipeline:
    def __init__(self):
        self.connect = pymysql.connect(host="localhost", user="root", passwd="1234", db="anjuke")
        self.cursor = self.connect.cursor()
        print("Database connection established")

    def process_item(self, item, spider):
        print("Saving item")
        insql = "insert into anjuke_shanghai(title,price,unit_price,site,house_type,area,house_url) values (%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(insql, (item['title'], item['price'], item['unit_price'],
                                    item['site'], item['house_type'], item['area'],
                                    item['house_url']))
        self.connect.commit()
        print("Item saved")
        return item

    def close_spider(self, spider):
        # called by Scrapy when the spider finishes; close the cursor first
        self.cursor.close()
        self.connect.close()
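process_item assumes the anjuke database and the anjuke_shanghai table already exist. The post does not show the schema, so the DDL below is an assumption inferred from the seven columns the INSERT writes; run it once before starting the crawl:

# Hypothetical schema inferred from the INSERT statement above; the
# original post does not include the actual DDL.
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS anjuke_shanghai (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    price VARCHAR(32),
    unit_price VARCHAR(32),
    site VARCHAR(255),
    house_type VARCHAR(64),
    area VARCHAR(32),
    house_url VARCHAR(512)
) DEFAULT CHARSET=utf8mb4;
"""

conn = pymysql.connect(host="localhost", user="root", passwd="1234", db="anjuke")
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()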

6. settings.py

# Scrapy settings for anjuke project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'anjuke'

SPIDER_MODULES = ['anjuke.spiders']
NEWSPIDER_MODULE = 'anjuke.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# (template placeholder; the real User-Agent is set in DEFAULT_REQUEST_HEADERS below)
#USER_AGENT = 'anjuke (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # NOTE: this session cookie is tied to one logged-in account and will
    # expire; replace it with a fresh value before running.
    'Cookie': 'aQQ_ajkguid=8E3DD02F-E811-A2DA-DA53-C1B88CD60608; id58=e87rkF/lNzIYHcjBD+SdAg==; _ga=GA1.2.93190540.1608857396; _gid=GA1.2.334371282.1608857396; 58tj_uuid=6fc5ade0-bfd0-4187-bd4e-9686d7082817; new_uv=1; als=0; sessid=B70FA124-E42F-8DAD-3813-6C91C72B7A20; ctid=11; twe=2; obtain_by=2; ajk_member_verify=QUbPDLTnm9FWHSOd33buoCZE2z1wm%2FVudTO6LdSsWYs%3D; ajk_member_verify2=MTYwMDA4MTUwfFUxNTU3Mjk4NzEwNDM3NXwx; xxzl_cid=7380c6b8f44840bea607d5323fb011f4; xzuid=a8fd56b1-e885-46cd-b255-5dcd8fa79dc4; ajkAuthTicket=TT=f841c95d589fd9118d083c3ba68b97a3&TS=1608895520230&PBODY=VcG9Y6AtpZbA4ERSDzm8x-gaGSpJliB6sqdOLZ5r43ZgbMtoUuIQ3_UEzjH93WSEcM1W26Q_96d7T9tcmKpasHOQN42asUK9WLXeGZ4ssbi9u2MLY5aKXbsVALuXFkdG1gu6vlvjxUMNOn_EEGoo7fk8RHanQCv-vKtjgHmzDBk&VER=2',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'anjuke.middlewares.AnjukeSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'anjuke.middlewares.AnjukeDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    'scrapy.extensions.telnet.TelnetConsole': None,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anjuke.pipelines.AnjukePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Part 2: The data

1. Database contents: (screenshot in the original post)

2. pyecharts analysis: (screenshots in the original post)
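The charts themselves are screenshots, but as an illustration of the pyecharts side, a minimal bar chart of average unit price per district might look like the sketch below. The district names and values are placeholders, not the crawled results, and a v1-style pyecharts API is assumed:

# Minimal pyecharts sketch; the district names and prices are placeholder
# values, not the actual crawled data.
from pyecharts import options as opts
from pyecharts.charts import Bar

districts = ["Pudong", "Minhang", "Baoshan", "Xuhui", "Jingan"]
avg_unit_price = [52000, 48000, 39000, 78000, 85000]  # yuan per m²

chart = (
    Bar()
    .add_xaxis(districts)
    .add_yaxis("avg unit price (yuan/m²)", avg_unit_price)
    .set_global_opts(title_opts=opts.TitleOpts(title="Anjuke Shanghai: average price by district"))
)
chart.render("anjuke_analysis.html")  # writes a standalone HTML file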
