No malicious intent here; this is purely for learning.

Strategy:

1. Incrementally crawl the second-hand deal records: the listing only exposes at most 3,000 records, so a straightforward incremental crawl every day is enough to pick up the new ones.

2. For the historical data there are several options, and the one I used is not optimal: first crawl every community (xiaoqu) from Anjuke into the database (Anjuke's anti-scraping is fairly strict, so that crawl is incremental as well), then query Lianjia for the deal records of each community.

3. Better strategies exist, but since this is only practice I did not polish it further. Of the 50,000-odd records in total, about 40,000 were captured.

Code:

Incremental crawler:

import requests
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.common import WebRequests,Mc
from scrapy import Selector
import re
import datetime
from urllib import parse
import time
# Incrementally crawl all Lianjia deal listings
class Lianjia:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def run(self):
        url_one = "https://xa.lianjia.com/chengjiao/pg1/"
        response = self.web_requests.get(url_one)
        selector = Selector(text=response.text)
        url = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-url").extract_first()
        page = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-data").extract_first()
        page_dic = eval(page)  # page-data is a small dict literal, e.g. {"totalPage": 100, "curPage": 1}
        total_page = page_dic.get('totalPage')
        curPage = page_dic.get('curPage')
        while curPage <= total_page:
            time.sleep(1)
            next_url = parse.urljoin(response.url, url.format(page=str(curPage)))
            print('===url:{}'.format(next_url))
            r = self.web_requests.get(next_url)
            selector = Selector(text=r.text)
            ul = selector.xpath("//ul[@class='listContent']/li")
            for li in ul:
                # community name / layout / floor area
                title = li.xpath('.//div[@class="info"]/div[@class="title"]/a/text()').extract_first()
                a = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()
                house_id = int(re.match(r'.*?(\d+).*', a).group(1))
                # orientation / decoration
                position = li.xpath('.//div[@class="info"]/div[@class="address"]/div[@class="houseInfo"]/text()').extract_first()
                money_all = int(li.xpath('.//div[@class="totalPrice"]/span/text()').extract_first())  # total price
                money_every = int(li.xpath('.//div[@class="unitPrice"]/span/text()').extract_first())  # unit price
                success_data = li.xpath('.//div[@class="dealDate"]/text()').extract_first()  # deal date
                success_data = datetime.datetime.strptime(success_data, '%Y.%m.%d')
                link = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()  # listing link
                try:
                    name, house_type, size = title.split(' ')
                except Exception as e:
                    print('====error,title:{}'.format(title))
                    continue
                img = li.xpath('.//a/img/@src').extract_first()
                house_size = float(size.replace('平米', ''))
                sql = 'select * from lianjia_ershoufang_xian where house_id={}'.format(house_id)
                r = self.mc.query(sql)
                if not r:
                    sql = ("insert into lianjia_ershoufang_xian "
                           "(house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) "
                           "values ({},'{}','{}',{},{},{},'{}','{}','{}')").format(
                        house_id, name, house_type, house_size, money_all, money_every, success_data, img, link)
                    print(sql)
                    self.mc.insert(sql)
            curPage += 1


if __name__ == '__main__':
    Lianjia().run()
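One caveat about the code above: the INSERT is assembled with str.format, so a scraped title containing a quote character will break the statement. A minimal sketch of the same insert using pymysql's parameterized execute (my addition, not part of the original script; it assumes the same lianjia_ershoufang_xian table and a raw pymysql cursor/connection rather than the Mc wrapper):

# Hedged sketch: parameterized insert, assuming `cursor` and `db` are a pymysql cursor/connection.
insert_sql = (
    "insert into lianjia_ershoufang_xian "
    "(house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)
cursor.execute(insert_sql, (house_id, name, house_type, house_size,
                            money_all, money_every, success_data, img, link))
db.commit()

pymysql does the escaping itself here, so titles with apostrophes no longer corrupt the SQL.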

Crawling the community data from Anjuke

import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import re
import time
from scrapy import Selector
from utils.common import WebRequests, Mc
# Crawl all communities (xiaoqu) from Anjuke
class Home:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def run(self):
        url_one = "https://xa.anjuke.com/community/"
        response = self.web_requests.get(url_one)
        selector = Selector(text=response.text)
        urls = selector.xpath("//div[@class='div-border items-list']//div[1]/span[2]/a/@href").extract()
        positions = []
        for url in urls[15:]:
            position = re.match(r'https://xa.anjuke.com/community/(.*)/', url).group(1)
            positions.append(position)
        print(positions)
        anjuke_url = 'https://xa.anjuke.com/community/'
        for position in positions[1:]:
            url = anjuke_url + position + '/p{}'
            response = self.web_requests.get(url.format(1))
            selector = Selector(text=response.text)
            counts = selector.xpath("//div[@class='sortby']/span/em[2]/text()").extract()
            if counts and int(counts[0]) == 0:
                continue
            try:
                page_count = (int(counts[0]) + 29) // 30  # round up so the last partial page is kept
            except Exception as e:
                print(e)
                print(counts)
            for page in range(1, page_count + 1):
                print('====position:{},page:{}'.format(position, page))
                time.sleep(1)
                response = self.web_requests.get(url.format(page))
                selector = Selector(text=response.text)
                homes = selector.xpath("//div[@class='list-content']/div")
                for item in homes[1:]:
                    home = item.xpath('.//div[@class="li-info"]/h3/a/text()').extract_first()
                    home = home.replace(' ', '').replace('\n', '')
                    quyu = item.xpath('.//div[@class="li-info"]/address/text()').extract_first().replace(' ', '').replace('\n', '')
                    price = item.xpath('.//div[@class="li-side"]/p/strong/text()').extract_first().replace('\n', '')
                    sql = "select * from xian_home where home='{}'".format(home)
                    r = self.mc.query(sql)
                    if not r:
                        sql = "insert into xian_home (home,position,money_every) values ('{}','{}',{})".format(home, quyu, price)
                        self.mc.insert(sql)
                    else:
                        sql = "update xian_home set money_every={} where home='{}'".format(price, home)
                        self.mc.update(sql)


if __name__ == '__main__':
    Home().run()
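The select-then-insert-or-update logic above can also be collapsed into a single round trip. A hedged sketch using MySQL's INSERT ... ON DUPLICATE KEY UPDATE (my addition; it assumes xian_home.home carries a UNIQUE index, which the original schema may not have):

# Sketch only: one-statement upsert, assuming a UNIQUE index on xian_home.home
# and a pymysql `cursor` / `db` pair as in the Mc wrapper below.
upsert_sql = (
    "insert into xian_home (home, position, money_every) values (%s, %s, %s) "
    "on duplicate key update money_every = values(money_every)"
)
cursor.execute(upsert_sql, (home, quyu, price))
db.commit()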

Using the Anjuke community list to crawl each community's deal records on Lianjia

import urllib
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import requests
from utils.common import WebRequests,Mc
from scrapy import Selector
import re
import datetime
from urllib import parse
import time
# For each community name stored in the DB, query all of its deal records on Lianjia
class Lianjia:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def get_home(self):
        # fetch every community name already stored by the Anjuke crawler
        sql = 'select home from xian_home'
        homes = self.mc.query(sql)
        return homes

    def run(self):
        homes = self.get_home()
        for idx, home in enumerate(homes):
            url_first = 'http://xa.lianjia.com/chengjiao/pg1rs{}'.format(home[0])
            # url_first = url_first.decode('gbk', 'replace')
            # url_first = urllib.quote(url_first.encode('utf-8', 'replace'))
            response = self.web_requests.get(url_first)
            selector = Selector(text=response.text)
            count = selector.xpath('//div[@class="total fl"]/span/text()').extract_first()
            if count:
                count = int(count.replace(' ', ''))
                pages = int(count / 30) + 1
            else:
                continue
            if pages > 50:
                continue
            for page in range(1, pages + 1):
                time.sleep(1)
                url = 'http://xa.lianjia.com/chengjiao/pg{}rs{}/'.format(page, home[0])
                response = self.web_requests.get(url)
                selector = Selector(text=response.text)
                items = selector.xpath("//ul[@class='listContent']/li")
                for li in items:
                    try:
                        title = li.xpath('.//div[@class="info"]/div[@class="title"]/a/text()').extract_first()
                        if '车位' in title:  # skip parking-space listings
                            continue
                        a = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()
                        house_id = int(re.match(r'.*?(\d+).*', a).group(1))
                        if house_id == 101109708199:
                            print('here')  # debug leftover
                        # orientation / decoration
                        position = li.xpath('.//div[@class="info"]/div[@class="address"]/div[@class="houseInfo"]/text()').extract_first()
                        money_all = int(li.xpath('.//div[@class="totalPrice"]/span/text()').extract_first())  # total price
                        money_every = int(li.xpath('.//div[@class="unitPrice"]/span/text()').extract_first())  # unit price
                        success_data = li.xpath('.//div[@class="dealDate"]/text()').extract_first()  # deal date
                        success_data = datetime.datetime.strptime(success_data, '%Y.%m.%d')
                        link = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()  # listing link
                        try:
                            name, house_type, size = title.split(' ')
                        except Exception as e:
                            print('====error,title:{}'.format(title))
                            continue
                        img = li.xpath('.//a/img/@src').extract_first()
                        try:
                            house_size = float(size.replace('平米', ''))
                        except Exception as e:
                            print('====error,title:{}'.format(title))
                        sql = 'select * from lianjia_ershoufang_xian where house_id={}'.format(house_id)
                        r = self.mc.query(sql)
                        if not r:
                            sql = ("insert into lianjia_ershoufang_xian "
                                   "(house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) "
                                   "values ({},'{}','{}',{},{},{},'{}','{}','{}')").format(
                                house_id, name, house_type, house_size, money_all, money_every, success_data, img, link)
                            print(sql)
                            self.mc.insert(sql)
                    except Exception as e:
                        continue


if __name__ == '__main__':
    Lianjia().run()
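The commented-out urllib.quote lines hint at an encoding concern: the community name is Chinese and is interpolated straight into the search URL. requests will usually percent-encode it, but doing so explicitly keeps the request unambiguous; a small sketch (my addition, with a placeholder community name):

from urllib import parse

community = '某小区'  # placeholder value; in the script this comes from the xian_home table
url_first = 'http://xa.lianjia.com/chengjiao/pg1rs{}'.format(parse.quote(community))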

Utility model (database credentials redacted)

import pymysql
import sys


class Mc:
    '''
    Mc wraps a handful of common MySQL operations.
    query(sql): run a SELECT and return the rows as a list (empty list if nothing matches)
    insert(sql) / update(sql): INSERT / UPDATE / DELETE; errors are printed instead of raised

    Usage:
        mc = Mc()
        sql = "SELECT * FROM `biaotiku`"
        data = mc.query(sql)
        for i in data:
            print(i)
        sql = "INSERT INTO `biaotiku` (`id`, `text`, `beizhu`) VALUES (NULL, 'test', '123')"
        mc.insert(sql)
    '''

    def __init__(self, db_host="xxx.xxx.xxx.xxx", username="xxx", pw="xxx", dbname="spider"):
        self.db_host = db_host
        self.username = username
        self.pw = pw
        self.dbname = dbname
        # keyword arguments: pymysql 1.x no longer accepts positional connect() arguments
        self.db = pymysql.connect(host=self.db_host, user=self.username,
                                  password=self.pw, database=self.dbname)
        self.cursor = self.db.cursor()

    def query(self, sql):
        self.cursor.execute(sql)
        r = self.cursor.fetchall()
        if r:
            return list(r)
        else:
            return []

    def update(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            print(sys.exc_info())

    def insert(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            print(sys.exc_info())

    def __del__(self):
        self.db.close()
if __name__ == '__main__':
    r = Mc().query('select * from proxy_ip where id=3;')
    if r:
        print(r)
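Because the crawls run for hours over a single long-lived connection, MySQL may eventually close it server-side ("MySQL server has gone away"). One way to harden the wrapper, sketched here as my own addition rather than the author's code, is pymysql's ping(reconnect=True) before each statement:

def query(self, sql):
    # Sketch: transparently reconnect if the server closed an idle connection.
    self.db.ping(reconnect=True)
    self.cursor.execute(sql)
    r = self.cursor.fetchall()
    return list(r) if r else []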

Utility common

import random
import requests
import time
from utils.model import Mc


class WebRequests:
    def __init__(self):
        self.ips = []
        sql = 'select ip from proxy_ip where is_delete=0;'
        all = Mc().query(sql)
        for ip in all:
            self.ips.append(ip[0])

    @property
    def user_agent(self):
        """return a User-Agent at random
        :return:
        """
        from fake_useragent import UserAgent
        ua = UserAgent()
        return ua.random

    @property
    def header(self):
        """basic header
        :return:
        """
        return {
            'User-Agent': self.user_agent,
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }

    @property
    def proxy(self):
        return random.choice(self.ips)
        # return self.ips[random.randint(1, len(self.ips) + 1)]

    def get(self, url, header=None, retry_time=1, retry_interval=5, timeout=10, *args, **kwargs):
        """get method
        :param url: target url
        :param header: extra headers
        :param retry_time: seconds to sleep between retries
        :param retry_interval: maximum number of retries
        :param timeout: network timeout
        :return: the Response, or None if every retry failed
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        proxies = {"http": "http://" + str(self.proxy)}
        i = 0
        while True:
            try:
                # r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
                r = requests.get(url, headers=headers, timeout=timeout)
                return r
            except Exception as e:
                i += 1
                print('====request failed, retrying in {}s (attempt {})'.format(retry_time, i))
                time.sleep(retry_time)
                if i == retry_interval:
                    print('====request failed, please check: {}'.format(url))
                    return None
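A minimal usage sketch of the two utilities together (my addition; it assumes the proxy_ip table already has rows so WebRequests() can build its IP pool):

if __name__ == '__main__':
    wr = WebRequests()
    resp = wr.get('https://xa.lianjia.com/chengjiao/pg1/')
    if resp is not None:
        print(resp.status_code, len(resp.text))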
