No malicious intent here; this is purely for learning.

Strategy:

1. Incrementally crawl the second-hand deal records: the listing only exposes at most 3,000 records, so a straightforward incremental crawl every day is enough to pick up the new ones.

2. For the historical data there are several options, and the one I used is not optimal: first crawl every community (xiaoqu) from Anjuke into the database (Anjuke's anti-scraping is fairly strict, so that crawl is incremental as well), then query Lianjia for the deal records of each community.

3. Better strategies exist, but since this is only practice I did not polish it further. Of the 50,000-odd records in total, about 40,000 were captured.

Code:

Incremental crawler:

import requests
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.common import WebRequests,Mc
from scrapy import Selector
import re
import datetime
from urllib import parse
import time
# Incrementally crawl all Lianjia deal listings
class Lianjia:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def run(self):
        url_one = "https://xa.lianjia.com/chengjiao/pg1/"
        response = self.web_requests.get(url_one)
        selector = Selector(text=response.text)
        url = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-url").extract_first()
        page = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-data").extract_first()
        page_dic = eval(page)  # page-data is a small dict literal, e.g. {"totalPage": 100, "curPage": 1}
        total_page = page_dic.get('totalPage')
        curPage = page_dic.get('curPage')
        while curPage <= total_page:
            time.sleep(1)
            next_url = parse.urljoin(response.url, url.format(page=str(curPage)))
            print('===url:{}'.format(next_url))
            r = self.web_requests.get(next_url)
            selector = Selector(text=r.text)
            ul = selector.xpath("//ul[@class='listContent']/li")
            for li in ul:
                # community name / layout / floor area
                title = li.xpath('.//div[@class="info"]/div[@class="title"]/a/text()').extract_first()
                a = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()
                house_id = int(re.match(r'.*?(\d+).*', a).group(1))
                # orientation / decoration
                position = li.xpath('.//div[@class="info"]/div[@class="address"]/div[@class="houseInfo"]/text()').extract_first()
                money_all = int(li.xpath('.//div[@class="totalPrice"]/span/text()').extract_first())  # total price
                money_every = int(li.xpath('.//div[@class="unitPrice"]/span/text()').extract_first())  # unit price
                success_data = li.xpath('.//div[@class="dealDate"]/text()').extract_first()  # deal date
                success_data = datetime.datetime.strptime(success_data, '%Y.%m.%d')
                link = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()  # listing link
                try:
                    name, house_type, size = title.split(' ')
                except Exception as e:
                    print('====error,title:{}'.format(title))
                    continue
                img = li.xpath('.//a/img/@src').extract_first()
                house_size = float(size.replace('平米', ''))
                sql = 'select * from lianjia_ershoufang_xian where house_id={}'.format(house_id)
                r = self.mc.query(sql)
                if not r:
                    sql = ("insert into lianjia_ershoufang_xian "
                           "(house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) "
                           "values ({},'{}','{}',{},{},{},'{}','{}','{}')").format(
                        house_id, name, house_type, house_size, money_all, money_every, success_data, img, link)
                    print(sql)
                    self.mc.insert(sql)
            curPage += 1


if __name__ == '__main__':
    Lianjia().run()
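One caveat about the code above: the INSERT is assembled with str.format, so a scraped title containing a quote character will break the statement. A minimal sketch of the same insert using pymysql's parameterized execute (my addition, not part of the original script; it assumes the same lianjia_ershoufang_xian table and a raw pymysql cursor/connection rather than the Mc wrapper):

# Hedged sketch: parameterized insert, assuming `cursor` and `db` are a pymysql cursor/connection.
insert_sql = (
    "insert into lianjia_ershoufang_xian "
    "(house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)
cursor.execute(insert_sql, (house_id, name, house_type, house_size,
                            money_all, money_every, success_data, img, link))
db.commit()

pymysql does the escaping itself here, so titles with apostrophes no longer corrupt the SQL.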

Crawling the community data from Anjuke

import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import re
import time
from scrapy import Selector
from utils.common import WebRequests, Mc
# Crawl all communities (xiaoqu) from Anjuke
class Home:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def run(self):
        url_one = "https://xa.anjuke.com/community/"
        response = self.web_requests.get(url_one)
        selector = Selector(text=response.text)
        urls = selector.xpath("//div[@class='div-border items-list']//div[1]/span[2]/a/@href").extract()
        positions = []
        for url in urls[15:]:
            position = re.match(r'https://xa.anjuke.com/community/(.*)/', url).group(1)
            positions.append(position)
        print(positions)
        anjuke_url = 'https://xa.anjuke.com/community/'
        for position in positions[1:]:
            url = anjuke_url + position + '/p{}'
            response = self.web_requests.get(url.format(1))
            selector = Selector(text=response.text)
            counts = selector.xpath("//div[@class='sortby']/span/em[2]/text()").extract()
            if counts and int(counts[0]) == 0:
                continue
            try:
                page_count = (int(counts[0]) + 29) // 30  # round up so the last partial page is kept
            except Exception as e:
                print(e)
                print(counts)
            for page in range(1, page_count + 1):
                print('====position:{},page:{}'.format(position, page))
                time.sleep(1)
                response = self.web_requests.get(url.format(page))
                selector = Selector(text=response.text)
                homes = selector.xpath("//div[@class='list-content']/div")
                for item in homes[1:]:
                    home = item.xpath('.//div[@class="li-info"]/h3/a/text()').extract_first()
                    home = home.replace(' ', '').replace('\n', '')
                    quyu = item.xpath('.//div[@class="li-info"]/address/text()').extract_first().replace(' ', '').replace('\n', '')
                    price = item.xpath('.//div[@class="li-side"]/p/strong/text()').extract_first().replace('\n', '')
                    sql = "select * from xian_home where home='{}'".format(home)
                    r = self.mc.query(sql)
                    if not r:
                        sql = "insert into xian_home (home,position,money_every) values ('{}','{}',{})".format(home, quyu, price)
                        self.mc.insert(sql)
                    else:
                        sql = "update xian_home set money_every={} where home='{}'".format(price, home)
                        self.mc.update(sql)


if __name__ == '__main__':
    Home().run()
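The select-then-insert-or-update logic above can also be collapsed into a single round trip. A hedged sketch using MySQL's INSERT ... ON DUPLICATE KEY UPDATE (my addition; it assumes xian_home.home carries a UNIQUE index, which the original schema may not have):

# Sketch only: one-statement upsert, assuming a UNIQUE index on xian_home.home
# and a pymysql `cursor` / `db` pair as in the Mc wrapper below.
upsert_sql = (
    "insert into xian_home (home, position, money_every) values (%s, %s, %s) "
    "on duplicate key update money_every = values(money_every)"
)
cursor.execute(upsert_sql, (home, quyu, price))
db.commit()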

Using the Anjuke community list to crawl each community's deal records on Lianjia

import urllib
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import requests
from utils.common import WebRequests,Mc
from scrapy import Selector
import re
import datetime
from urllib import parse
import time
# For each community name stored in the DB, query all of its deal records on Lianjia
class Lianjia:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def get_home(self):
        # fetch every community name already stored by the Anjuke crawler
        sql = 'select home from xian_home'
        homes = self.mc.query(sql)
        return homes

    def run(self):
        homes = self.get_home()
        for idx, home in enumerate(homes):
            url_first = 'http://xa.lianjia.com/chengjiao/pg1rs{}'.format(home[0])
            # url_first = url_first.decode('gbk', 'replace')
            # url_first = urllib.quote(url_first.encode('utf-8', 'replace'))
            response = self.web_requests.get(url_first)
            selector = Selector(text=response.text)
            count = selector.xpath('//div[@class="total fl"]/span/text()').extract_first()
            if count:
                count = int(count.replace(' ', ''))
                pages = int(count / 30) + 1
            else:
                continue
            if pages > 50:
                continue
            for page in range(1, pages + 1):
                time.sleep(1)
                url = 'http://xa.lianjia.com/chengjiao/pg{}rs{}/'.format(page, home[0])
                response = self.web_requests.get(url)
                selector = Selector(text=response.text)
                items = selector.xpath("//ul[@class='listContent']/li")
                for li in items:
                    try:
                        title = li.xpath('.//div[@class="info"]/div[@class="title"]/a/text()').extract_first()
                        if '车位' in title:  # skip parking-space listings
                            continue
                        a = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()
                        house_id = int(re.match(r'.*?(\d+).*', a).group(1))
                        if house_id == 101109708199:
                            print('here')  # debug leftover
                        # orientation / decoration
                        position = li.xpath('.//div[@class="info"]/div[@class="address"]/div[@class="houseInfo"]/text()').extract_first()
                        money_all = int(li.xpath('.//div[@class="totalPrice"]/span/text()').extract_first())  # total price
                        money_every = int(li.xpath('.//div[@class="unitPrice"]/span/text()').extract_first())  # unit price
                        success_data = li.xpath('.//div[@class="dealDate"]/text()').extract_first()  # deal date
                        success_data = datetime.datetime.strptime(success_data, '%Y.%m.%d')
                        link = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()  # listing link
                        try:
                            name, house_type, size = title.split(' ')
                        except Exception as e:
                            print('====error,title:{}'.format(title))
                            continue
                        img = li.xpath('.//a/img/@src').extract_first()
                        try:
                            house_size = float(size.replace('平米', ''))
                        except Exception as e:
                            print('====error,title:{}'.format(title))
                        sql = 'select * from lianjia_ershoufang_xian where house_id={}'.format(house_id)
                        r = self.mc.query(sql)
                        if not r:
                            sql = ("insert into lianjia_ershoufang_xian "
                                   "(house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) "
                                   "values ({},'{}','{}',{},{},{},'{}','{}','{}')").format(
                                house_id, name, house_type, house_size, money_all, money_every, success_data, img, link)
                            print(sql)
                            self.mc.insert(sql)
                    except Exception as e:
                        continue


if __name__ == '__main__':
    Lianjia().run()
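The commented-out urllib.quote lines hint at an encoding concern: the community name is Chinese and is interpolated straight into the search URL. requests will usually percent-encode it, but doing so explicitly keeps the request unambiguous; a small sketch (my addition, with a placeholder community name):

from urllib import parse

community = '某小区'  # placeholder value; in the script this comes from the xian_home table
url_first = 'http://xa.lianjia.com/chengjiao/pg1rs{}'.format(parse.quote(community))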

Utility model (database credentials redacted)

import pymysql
import sys


class Mc:
    '''
    Mc wraps a handful of common MySQL operations.
    query(sql): run a SELECT and return the rows as a list (empty list if nothing matches)
    insert(sql) / update(sql): INSERT / UPDATE / DELETE; errors are printed instead of raised

    Usage:
        mc = Mc()
        sql = "SELECT * FROM `biaotiku`"
        data = mc.query(sql)
        for i in data:
            print(i)
        sql = "INSERT INTO `biaotiku` (`id`, `text`, `beizhu`) VALUES (NULL, 'test', '123')"
        mc.insert(sql)
    '''

    def __init__(self, db_host="xxx.xxx.xxx.xxx", username="xxx", pw="xxx", dbname="spider"):
        self.db_host = db_host
        self.username = username
        self.pw = pw
        self.dbname = dbname
        # keyword arguments: pymysql 1.x no longer accepts positional connect() arguments
        self.db = pymysql.connect(host=self.db_host, user=self.username,
                                  password=self.pw, database=self.dbname)
        self.cursor = self.db.cursor()

    def query(self, sql):
        self.cursor.execute(sql)
        r = self.cursor.fetchall()
        if r:
            return list(r)
        else:
            return []

    def update(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            print(sys.exc_info())

    def insert(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            print(sys.exc_info())

    def __del__(self):
        self.db.close()
if __name__ == '__main__':
    r = Mc().query('select * from proxy_ip where id=3;')
    if r:
        print(r)
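Because the crawls run for hours over a single long-lived connection, MySQL may eventually close it server-side ("MySQL server has gone away"). One way to harden the wrapper, sketched here as my own addition rather than the author's code, is pymysql's ping(reconnect=True) before each statement:

def query(self, sql):
    # Sketch: transparently reconnect if the server closed an idle connection.
    self.db.ping(reconnect=True)
    self.cursor.execute(sql)
    r = self.cursor.fetchall()
    return list(r) if r else []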

Utility common

import random
import requests
import time
from utils.model import Mc


class WebRequests:
    def __init__(self):
        self.ips = []
        sql = 'select ip from proxy_ip where is_delete=0;'
        all = Mc().query(sql)
        for ip in all:
            self.ips.append(ip[0])

    @property
    def user_agent(self):
        """return a User-Agent at random
        :return:
        """
        from fake_useragent import UserAgent
        ua = UserAgent()
        return ua.random

    @property
    def header(self):
        """basic header
        :return:
        """
        return {
            'User-Agent': self.user_agent,
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }

    @property
    def proxy(self):
        return random.choice(self.ips)
        # return self.ips[random.randint(1, len(self.ips) + 1)]

    def get(self, url, header=None, retry_time=1, retry_interval=5, timeout=10, *args, **kwargs):
        """get method
        :param url: target url
        :param header: extra headers
        :param retry_time: seconds to sleep between retries
        :param retry_interval: maximum number of retries
        :param timeout: network timeout
        :return: the Response, or None if every retry failed
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        proxies = {"http": "http://" + str(self.proxy)}
        i = 0
        while True:
            try:
                # r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
                r = requests.get(url, headers=headers, timeout=timeout)
                return r
            except Exception as e:
                i += 1
                print('====request failed, retrying in {}s (attempt {})'.format(retry_time, i))
                time.sleep(retry_time)
                if i == retry_interval:
                    print('====request failed, please check: {}'.format(url))
                    return None
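A minimal usage sketch of the two utilities together (my addition; it assumes the proxy_ip table already has rows so WebRequests() can build its IP pool):

if __name__ == '__main__':
    wr = WebRequests()
    resp = wr.get('https://xa.lianjia.com/chengjiao/pg1/')
    if resp is not None:
        print(resp.status_code, len(resp.text))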
