房天下全国658个城市新房，二手房爬取

房天下北京二手房分布式抓取：

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider


class LianjiaSpider(RedisCrawlSpider):
    """Distributed crawl spider for second-hand listings on esf.fang.com.

    The spider is driven by scrapy-redis: instead of ``start_urls`` it is
    configured with ``redis_key``. Listing pages matching the first rule are
    only used for link discovery; detail pages matching the second rule are
    handed to :meth:`parse_detail`.
    """

    name = 'ftx'
    allowed_domains = ['esf.fang.com']
    # start_urls is intentionally unset; the entry point is pushed to Redis
    # under ``redis_key`` (e.g. https://esf.fang.com).
    redis_key = 'ftx'

    # Raw strings so ``\d`` is a regex digit class rather than an invalid
    # string escape, and dots are escaped so they match literally.
    rules = (
        # Paginated listing pages: no callback, used for link extraction only.
        Rule(LinkExtractor(allow=r'https://esf\.fang\.com/house/i\d+/')),
        # Individual listing (detail) pages.
        Rule(
            LinkExtractor(allow=r'https://esf\.fang\.com/chushou/.*'),
            callback='parse_detail',
            follow=True,
        ),
    )

    def parse_detail(self, response):
        """Extract title, price and area from a listing detail page.

        Args:
            response: The Scrapy response for a ``/chushou/`` detail page.

        Returns:
            dict: ``title``, ``price`` and ``area``. ``title`` and ``area``
            are empty strings when the node is missing, instead of raising
            ``AttributeError`` on ``None``.
        """
        title = response.xpath(
            '//*[@class="title floatl"]/text()'
        ).extract_first(default='')
        price = response.xpath(
            'string(//div[@class="trl-item_top"]/div[1])'
        ).extract_first()
        area = response.xpath(
            '//div[@class="tt"]/text()'
        ).extract_first(default='')

        return {
            'title': title.strip().replace('\r\n', ''),
            'price': price,
            'area': area.strip().replace('\r\n', ''),
        }

ftx_spider：从全国城市主页开始遍历各个城市并提取 URL，解析各城市新房、二手房的基本信息

import scrapy,re
from fangtx.items import NewHouseItem,ESFHouseItem

# https://www.cnblogs.com/derek1184405959/p/9446544.html


class FtxSpider(scrapy.Spider):
    """Crawl new-house and second-hand listings for every city on fang.com.

    Starting from the nationwide city index, the spider builds the new-house
    and second-hand (ESF) listing URLs for each city and parses the basic
    information of every listing, following pagination links.
    """

    name = 'ftx'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        """Walk the city index table and schedule the per-city listing pages.

        Args:
            response: Response for the city index page in ``start_urls``.

        Yields:
            scrapy.Request: One request for the new-house listing and one for
            the second-hand listing of each city, carrying
            ``meta['info'] = (province, city_name)``.
        """
        rows = response.xpath('//div[@class="outCont"]//tr')
        province = None
        for row in rows:
            cells = row.xpath('.//td[not(@class)]')
            if len(cells) < 2:
                # Not a province/city row; nothing to extract.
                continue

            # The province cell is blank on continuation rows, in which case
            # the province of the previous row still applies.
            province_text = re.sub(r'\s', '', cells[0].xpath(".//text()").get() or '')
            if province_text:
                province = province_text
            # Skip overseas cities.
            if province == '其它':
                continue

            for city in cells[1].xpath(".//a"):
                city_name = city.xpath('.//text()').extract_first()
                city_url = city.xpath('.//@href').extract_first()
                if not city_url:
                    continue
                url_module = city_url.split("//")
                if len(url_module) < 2:
                    continue
                scheme = url_module[0]  # e.g. "http:"
                domain = url_module[1]  # e.g. "cq.fang.com/"

                # Beijing uses un-prefixed hosts. Match the host label "bj"
                # exactly so other cities whose host merely contains "bj"
                # are not misrouted.
                host_labels = domain.split('/')[0].split('.')
                if 'bj' in host_labels:
                    newhouse_url = 'http://newhouse.fang.com/house/s/'
                    esf_url = 'http://esf.fang.com/'
                else:
                    # New-house listing URL.
                    newhouse_url = scheme + '//' + "newhouse." + domain + "house/s/"
                    # Second-hand listing URL.
                    esf_url = scheme + '//' + "esf." + domain + "house/s/"

                yield scrapy.Request(
                    url=newhouse_url,
                    callback=self.parse_newhouse,
                    meta={'info': (province, city_name)},
                )
                yield scrapy.Request(
                    url=esf_url,
                    callback=self.parse_esf,
                    meta={'info': (province, city_name)},
                )

    def parse_newhouse(self, response):
        """Parse one page of new-house listings and follow the next page.

        Args:
            response: Listing page response; ``meta['info']`` holds
                ``(province, city)``.

        Yields:
            NewHouseItem: One item per listing that has a name.
            scrapy.Request: The next listing page, when a "next" link exists.
        """
        province, city = response.meta.get('info')
        for li in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            name = li.xpath(
                ".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()"
            ).get()
            if not name:
                # Entries without a name are skipped.
                continue
            name = re.sub(r"\s", "", name)

            # Room types, e.g. entries ending in "居".
            house_types = [
                re.sub(r"\s", "", text)
                for text in li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            ]
            rooms = [text for text in house_types if text.endswith("居")]

            # Floor area.
            area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|－|/", "", area)

            # Address: drop the literal "请选择" placeholder (with or without
            # brackets) rather than stripping those characters one by one.
            address = li.xpath(".//div[@class='address']/a/@title").get() or ""
            address = re.sub(r"\[?请选择\]?", "", address)

            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)

            # Detail page URL.
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()

            yield NewHouseItem(
                name=name,
                rooms=rooms,
                area=area,
                address=address,
                sale=sale,
                price=price,
                origin_url=origin_url,
                provice=province,
                city=city,
            )

        # Next page.
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse_newhouse,
                meta={'info': (province, city)},
            )

    def parse_esf(self, response):
        """Parse one page of second-hand listings and follow pagination.

        Args:
            response: Listing page response; ``meta['info']`` holds
                ``(province, city)``.

        Yields:
            ESFHouseItem: One item per listing that has a name.
            scrapy.Request: A follow-up listing page, when a pager link exists.
        """
        province, city = response.meta.get('info')
        for dl in response.xpath("//div[@class='shop_list shop_list_4']/dl"):
            name = dl.xpath(".//span[@class='tit_shop']/text()").get()
            if not name:
                continue

            item = ESFHouseItem(provice=province, city=city)
            infos = [
                re.sub(r"\s", "", text)
                for text in dl.xpath(".//p[@class='tel_shop']/text()").getall()
            ]
            # Each fragment is classified by the marker character it contains.
            for info in infos:
                if "厅" in info:
                    item["rooms"] = info
                elif '层' in info:
                    item["floor"] = info
                elif '向' in info:
                    item['toward'] = info
                elif '㎡' in info:
                    item['area'] = info
                elif '年建' in info:
                    item['year'] = re.sub("年建", "", info)

            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            # Total price.
            item['price'] = "".join(dl.xpath(".//span[@class='red']//text()").getall())
            # Unit price.
            item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
            item['name'] = name
            detail = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            item['origin_url'] = response.urljoin(detail)
            yield item

        # Next page.
        # NOTE(review): this takes the first link in the pager; confirm that it
        # is the "next page" link on every page and not "first page".
        next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
        if next_url:
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse_esf,
                meta={'info': (province, city)},
            )

二手房抓取：房天下、链家、安居客……并进行数据分析与挖掘

数据的抓取：

# NOTE(review): this snippet uses `requests` and `lxml.etree` but does not
# import them itself; they must be imported by the surrounding file.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}


def _first_or_none(values):
    """Return the first element of *values*, or ``None`` when it is empty."""
    return values[0] if values else None


def detail_html(url):
    """Fetch one second-hand listing page and print every listing on it.

    Args:
        url: URL of a listing page.

    The page bytes are decoded as GBK, parsed with lxml, and one dict per
    listing (``name``, ``style``, ``price``, ``place``, ``house_name``) is
    printed.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.content.decode('gbk')
    tree = etree.HTML(html)

    for listing in tree.xpath('//*[@class="shop_list shop_list_4"]/dl'):
        item = {
            'name': _first_or_none(listing.xpath('.//*[@class="clearfix"]/a/@title')),
            'style': listing.xpath('string(.//p[@class="tel_shop"])').strip().replace('\r\n', '').replace(' ', ''),
            'price': listing.xpath('string(.//span[@class="red"])'),
            # Each field is checked against its own result list; the original
            # tested the length of the `name` result for all of them.
            'place': _first_or_none(listing.xpath('.//p[@class="add_shop"]//span/text()')),
            'house_name': _first_or_none(listing.xpath('.//p[@class="add_shop"]/a/@title')),
        }
        print(item)


def main():
    """Scrape listing pages 1 through 100 of the hz.esf.fang.com listing."""
    for i in range(1, 101):
        url = 'https://hz.esf.fang.com/house/i3{}/'.format(i)
        detail_html(url)


if __name__ == '__main__':
    main()

数据分析：

待续……

房天下全国658个城市新房，二手房爬取相关推荐

Python爬取马蜂窝-推荐小众城市旅游及爬取某一城市的景点信息和游记信息
目的: 推荐小众城市旅游及爬取某一城市的景点信息和游记信息. 第一部分首先从目的地页面获得各省专属5位数字编号,之后进入各省城市列表获得热门城市专属5位数字编号. 1.获得直辖市编号和热门省编号,h ...
python分布式(scrapy-redis)实现对房天下全国二手房与新房的信息爬取(偏小白,有源码有分析)
文章目录 Scrapy实现, 确定需求进入分析分析url 分析页面结构代码 spiders(爬虫) items pipelines middlewares settings start Scra ...
爬虫实战—爬取房天下全国所有的楼盘并入库（附源码）
1.创建项目使用命令创建scrapy项目:scrapy startproject fang进入到spiders文件中: cd fang/fang/spiders创建爬虫文件:scrapy gensp ...
python对城市规划_Python对城市距离自动化爬取【必学小型项目】
本地创建数据库,将 excel 数据存储到 city 表中,再取 | 湖北省 | 的所有地级市和县.县级市.区数据作为样表数据记录在样表中.利用 python 的 xlrd 包,定义 process_ ...
【Python】模拟登陆房天下的总结
[Python]requests模拟登陆房天下的总结最近想爬取些与房价有关的数据,看了几个网站,感觉房天下包含的内容比较多,于是,先对房天下入手.为了保证后面数据爬取,想先模拟登陆获取cookies ...
爬虫Scrapy框架运用----房天下二手房数据采集
在许多电商和互联网金融的公司为了更好地服务用户,他们需要爬虫工程师对用户的行为数据进行搜集.分析和整合,为人们的行为选择提供更多的参考依据,去服务于人们的行为方式,甚至影响人们的生活方式.我们的scr ...
房天下二手交易平台房源数据采集
在许多电商和互联网金融的公司为了更好地服务用户,他们需要爬虫工程师对用户的行为数据进行搜集.分析和整合,为人们的行为选择提供更多的参考依据,去服务于人们的行为方式,甚至影响人们的生活方式.我们的scr ...
python写爬虫4-多线程爬虫(采集58出租房信息)_python爬虫爬取58同城上所有城市的租房信息详解...
代码如下 from fake_useragent import UserAgent from lxml import etree import requests, os import time, re ...
全国历史天气查询/历史天气预报查询——全国各月份数据爬取
全国历史天气查询/历史天气预报查询--全国各月份数据爬取效果图1 目标爬取数据图2 最终实验效果实验效果:最终可将官网已有的数据进行爬取整理,共363个城市,从2011年1月--至今数据已上 ...

房天下全国658个城市新房，二手房爬取

房天下全国658个城市新房，二手房爬取相关推荐

最新文章

热门文章