Scrapy-爬取链家所有成交数据

spider的代码

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from fake_useragent import UserAgent
from scrapy.linkextractors import LinkExtractor
import ast

from lianjia.items import LianjiaItem


class RsfjySpider(CrawlSpider):
    """Crawl closed-deal ("chengjiao") records from bj.lianjia.com.

    Callback chain, as wired below:

    1. ``rules`` follows every link inside the ``data-role="ershoufang"``
       block and hands each response to :meth:`all_links`.
    2. :meth:`all_links` requests every link of that block again with
       :meth:`next_page` as callback.
    3. :meth:`next_page` reads the pager attributes and requests every
       list page with :meth:`parse_item` as callback.
    4. :meth:`parse_item` requests each listing's detail page with
       :meth:`parse_info` as callback.
    5. :meth:`parse_info` builds and yields a ``LianjiaItem``.
    """

    name = 'rsfjy'
    allowed_domains = ['bj.lianjia.com']
    start_urls = ['https://bj.lianjia.com/chengjiao']

    # Source of random User-Agent strings, one picked per outgoing request.
    ua = UserAgent()

    # Follow the area links found on the start page.
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths=['//div[@data-role="ershoufang"]//a']),
            follow=True,
            callback='all_links',
        ),
    )

    # Header text of a detail page: "<community> <layout> <area>".
    _TITLE_XPATH = '/html/body/div[4]/div/text()'
    # Statistic labels sit in consecutive <span> elements (1-based position).
    _STAT_XPATH = '/html/body/section[1]/div[2]/div[2]/div[3]/span[{}]/label/text()'
    # Item field stored for span[1], span[2], ... in that order.
    _STAT_FIELDS = (
        'g_price',      # listing price
        'c_cycle',      # days on market until the deal closed
        't_frequency',  # number of price adjustments
        'watch_num',    # number of viewings
        'focus_num',    # number of followers
        'l_browse',     # number of page views
    )

    def _request(self, response, url, callback):
        """Build a request for ``url`` (resolved against ``response``) with a random User-Agent."""
        return scrapy.Request(
            url=response.urljoin(url),
            callback=callback,
            headers={'User-Agent': self.ua.random},
        )

    def parse_info(self, response):
        """Parse one detail page and yield a populated ``LianjiaItem``.

        Pages whose header does not contain the three expected
        whitespace-separated parts are logged and skipped instead of
        raising ``AttributeError``/``IndexError``.
        """
        # Evaluate the header XPath once instead of three times.
        header = response.xpath(self._TITLE_XPATH).get()
        parts = header.split() if header else []
        if len(parts) < 3:
            self.logger.warning('Unexpected detail page layout, skipped: %s', response.url)
            return

        item = LianjiaItem()
        # community name, layout, floor area
        item['c_title'], item['h_type'], item['p_square'] = parts[:3]
        # deal date
        item['c_time'] = response.xpath('/html/body/div[4]/div/span/text()').get()
        # deal price
        item['c_price'] = response.xpath(
            "/html/body/section[1]/div[2]/div[2]/div[1]/span/i/text()"
        ).get()
        for position, field in enumerate(self._STAT_FIELDS, start=1):
            item[field] = response.xpath(self._STAT_XPATH.format(position)).get()

        print(item)
        yield item

    def parse_item(self, response):
        """Parse one list page and request the detail page of every listing."""
        for entry in response.xpath('//ul[@class="listContent"]/li'):
            title = entry.xpath("div[@class='info']/div[@class='title']/a/text()").get()
            link = entry.xpath("div[@class='info']/div[@class='title']/a/@href ").get()
            if not link:
                # Entries without a link have no detail page to visit.
                continue
            print(title, link)
            yield self._request(response, link, self.parse_info)

    def next_page(self, response):
        """Read the pager attributes and request every list page of this area.

        ``page-url`` is used as a template with a ``{page}`` placeholder and
        ``page-data`` is a dict literal holding ``totalPage``.
        """
        page_url = response.xpath('//@page-url').get()
        page_data = response.xpath('//@page-data').get()
        if not page_url or not page_data:
            # Pages without a pager would otherwise fail on the missing attributes.
            self.logger.warning('No pager found, skipped: %s', response.url)
            return

        try:
            # literal_eval parses the same dict literal eval() did, but cannot
            # execute arbitrary code coming from the remote page.
            total_page = int(ast.literal_eval(page_data)['totalPage'])
        except (ValueError, SyntaxError, KeyError, TypeError):
            self.logger.warning('Unreadable page-data %r on %s', page_data, response.url)
            return

        for page in range(1, total_page + 1):
            yield self._request(response, page_url.format(page=page), self.parse_item)

    def all_links(self, response):
        """Request every area link of the page with :meth:`next_page` as callback."""
        for url in response.xpath('//div[@data-role="ershoufang"]//a/@href').getall():
            if url:
                yield self._request(response, url, self.next_page)

settings 设置

from fake_useragent import UserAgent
import random
import time

# Scrapy settings for the lianjia project.

BOT_NAME = 'lianjia'

SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'

# Generator of random User-Agent strings used for the default headers below.
ua = UserAgent()

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'lianjia (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enable logging
LOG_ENABLED = True
# Log file name / path
LOG_FILE = 'lianjia.log'
# Log file encoding
LOG_ENCODING = 'utf-8'
# Only messages of level WARNING and above are written
LOG_LEVEL = 'WARNING'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#
# DOWNLOAD_DELAY must be a number of seconds. The previous value,
# time.sleep(random.random() * 10), blocked once while this module was
# imported and then stored None, so no delay was configured at all.
# With RANDOMIZE_DOWNLOAD_DELAY enabled Scrapy waits a random time between
# 0.5x and 1.5x DOWNLOAD_DELAY before each request to the same site.
DOWNLOAD_DELAY = 5
RANDOMIZE_DOWNLOAD_DELAY = True

# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # The HTTP header is spelled "User-Agent"; the former key 'UserAgent'
    # was sent as an unrelated header.
    'User-Agent': ua.random,
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
#    #  'lianjia.middlewares.ProxyMiddleware':543
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'lianjia.middlewares.LianjiaDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
}

items设置

import scrapy


class LianjiaItem(scrapy.Item):
    """Container for one closed-deal record scraped from a detail page."""

    # Header information
    c_title = scrapy.Field()      # community name (title)
    h_type = scrapy.Field()       # layout
    p_square = scrapy.Field()     # floor area in square metres
    c_time = scrapy.Field()       # deal date

    # Prices
    c_price = scrapy.Field()      # deal price
    g_price = scrapy.Field()      # listing price

    # Deal statistics
    c_cycle = scrapy.Field()      # days on market until the deal closed
    t_frequency = scrapy.Field()  # number of price adjustments
    focus_num = scrapy.Field()    # number of followers
    watch_num = scrapy.Field()    # number of viewings
    l_browse = scrapy.Field()     # number of page views

    # Fields not currently declared: price, average_price, link.

管道设置

from scrapy.exporters import JsonLinesItemExporter


class LianjiaPipeline:
    """Write every scraped item as one JSON line to ``lianjia.json``."""

    def __init__(self):
        """Open the output file and create the exporter writing to it."""
        # Binary mode: the exporter encodes the text itself (utf-8 below).
        self.file = open('lianjia.json', 'wb')
        # Attribute name keeps its historical spelling so existing code that
        # accesses ``exproter`` continues to work.
        self.exproter = JsonLinesItemExporter(
            self.file, ensure_ascii=False, encoding='utf-8'
        )

    def process_item(self, item, spider):
        """Export ``item`` and pass it on to the next pipeline stage."""
        self.exproter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the file when the spider closes.

        Scrapy calls ``close_spider`` on pipelines; the former method name
        ``close_item`` is not a pipeline hook, so the file was never closed.
        """
        if not self.file.closed:
            self.exproter.finish_exporting()
            self.file.close()

    # Backward-compatible alias for the original method name.
    close_item = close_spider

数据预览

Scrapy-爬取链家所有成交数据相关推荐

利用xpath爬取链家租房房源数据并利用pandas保存到Excel文件中
我们的需求是利用xpath爬取链家租房房源数据,并将数据通过pandas保存到Excel文件当中下面我们看一下链家官网的房源信息(以北京为例) 如图所示,我们通过筛选得到北京租房信息那么我们需要将 ...
租房不入坑不进坑，Python爬取链家二手房的数据，提前了解租房信息
目录前言一.查找数据所在位置: 二.确定数据存放位置: 三.获取html数据: 四.解析html,提取有用数据: 前言贫穷限制了我的想象,从大学进入到社会这么久,从刚开始的兴致勃勃,觉得钱有什么 ...
爬取链家网二手房数据并保存到mongodb中
提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档爬取链家网二手房数据并保存到mongodb中文章目录前言一.爬虫的介绍二.协程的介绍三.css选择器四.基于asyncio ...
Scrapy实战篇（一）之爬取链家网成交房源数据（上）
今天,我们就以链家网南京地区为例,来学习爬取链家网的成交房源数据. 这里推荐使用火狐浏览器,并且安装firebug和firepath两款插件,你会发现,这两款插件会给我们后续的数据提取带来很大的方便. ...
python爬取链家新房_Python爬虫实战：爬取链家网二手房数据
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理. 买房装修,是每个人都要经历的重要事情之一.相对于新房交易市场来说,如今的二手房交易市场一点也 ...
爬取链家北京租房数据并做简单分析
在一个来北京不久的学生眼中,北京是一个神秘又充满魅力的大城市.它无比美好,但又无时无刻不在觊觎着你薄弱的钱包. 租房是很多人都离不开的硬性需求,这里就对从链家爬取的北京地区房屋出租数据进行一个简单分析 ...
掌财社:python怎么爬取链家二手房的数据？爬虫实战！
我们知道爬虫的比较常见的应用都是应用在数据分析上,爬虫作为数据分析的前驱,它负责数据的收集.今天我们以python爬取链家二手房数据为例来进行一个python爬虫实战.(内附python爬虫源代码) ...
python爬取链家新房数据_Python爬虫实战：爬取链家网二手房数据
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理. 买房装修,是每个人都要经历的重要事情之一.相对于新房交易市场来说,如今的二手房交易市场一点也 ...
Scrapy实战篇（二）之爬取链家网成交房源数据（下）
在上一小节中,我们已经提取到了房源的具体信息,这一节中,我们主要是对提取到的数据进行后续的处理,以及进行相关的设置. 数据处理我们这里以把数据存储到mongo数据库为例. 编写pipelines.p ...
爬取链家二手房交易数据
请求:浏览器的地址栏的url向服务器发送请求关注的内容:请求的url 请求的方式method get/post 请求参数响应: 作出响应响应状态码:200 418 404 500 浏览器的工作原 ...

Scrapy-爬取链家所有成交数据

spider的代码

settings 设置

items设置

管道设置

Scrapy-爬取链家所有成交数据相关推荐

最新文章

热门文章