爬取链家网流程为requests.get得到网页信息,将得到的信息放入BeautifulSoup,再调用select爬取所需信息,用re正则表达式进行更细的筛选,用strip过滤无用字符串。具体代码如下:

# Standard library
import json
import re
import time

# Third-party
import pandas as pd
import requests  # added: called throughout but was never imported (NameError at runtime)
from bs4 import BeautifulSoup  # added: called throughout but was never imported

# Forge a browser request header (User-Agent)
# Edit starturl_list to crawl a different city.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}
# Entry URL: Changsha sold-house ("chengjiao") listings; district URLs are scraped from it.
starturl_list = ['https://cs.lianjia.com/chengjiao/']
def get_cityurls(url):
    """Return the sold-listing URL of every district under the city page *url*.

    Fetches the city page, reads the breadcrumb/position anchors, and keeps the
    district slug of each '/chengjiao/<slug>/' href.
    """
    response = requests.get(url, headers=head)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.select('div.position>dl>dd>div>div>a')
    html = ''.join(str(a) for a in anchors)
    # Capture the slug with a regex group instead of the original
    # lstrip('/chengjiao/') / rstrip('" t') calls: str.lstrip/rstrip remove
    # *character sets*, so a slug starting with any of c/h/e/n/g/j/i/a/o
    # (e.g. 'chengnan') was silently mangled.
    slugs = re.findall(r'/chengjiao/([a-zA-Z0-9]+)/. t', html)
    # Original produced url + '<slug>/' for each match.
    return [url + slug + '/' for slug in slugs]
def get_pageurls(url):
    """List the URL of every result page under one district URL.

    Reads ``totalPage`` from the pager div's JSON ``page-data`` attribute.
    Page 1 is *url* itself; pages 2..totalPage are ``url + 'pgN/'``.
    """
    response = requests.get(url, headers=head)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    pager = soup.find('div', {'class': "page-box house-lst-page-box"})
    total_pages = json.loads(pager.get('page-data'))['totalPage']
    return [url] + [url + 'pg{}/'.format(n) for n in range(2, total_pages + 1)]
def get_eachurls(url):
    """Collect the detail-page URL of every house listed on one result page."""
    response = requests.get(url, headers=head)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return [a['href'] for a in soup.select('li > div.info > div.title>a')]


def _first_match(pattern, text, default='None'):
    """Return the first capture group of *pattern* in *text*, or *default*.

    Replaces the original findall + lstrip/rstrip pairs: str.lstrip/rstrip
    strip *character sets*, not prefixes/suffixes, so field values whose
    leading/trailing characters overlapped the label text were truncated.
    Also fixes the 小区名字 branch, which tested one list but indexed another
    (IndexError when the regex missed).
    """
    m = re.search(pattern, text)
    return m.group(1) if m else default


def news_ershoufang(url):
    """Scrape one sold-house detail page into a flat list of 44 fields.

    Field order matches the ``df.columns`` assignment at the bottom of the
    file. Missing fields are recorded as the string 'None'.
    """
    data_all = []
    res = requests.get(url, headers=head)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    pre_data = soup.select('div.content > ul > li')
    pre_datanews = ''.join(str(i) for i in pre_data)
    # City (the crawl is hard-wired to Changsha start URLs).
    data_all.append('长沙')
    # 小区名字 (estate name, from the title wrapper)
    names_pre = ''.join(str(i) for i in soup.select('div.house-title>div.wrapper'))
    data_all.append(_first_match(u'^<div class=.wrapper.>([\u4e00-\u9fa5]+) ', names_pre))
    # 房屋户型 (rooms/halls layout)
    data_all.append(_first_match(u'房屋户型</span>([\d\u4e00-\u9fa5]+)', pre_datanews))
    # 所在楼层 (floor / building height)
    data_all.append(_first_match(u'所在楼层</span>(.+?)</li><li><span class=.label.>建筑面积', pre_datanews))
    # 建筑面积 (built area)
    data_all.append(_first_match(u'建筑面积</span>(.+?)</li><li><span class=.label.>户型结构', pre_datanews))
    # 户型结构 (layout structure)
    data_all.append(_first_match(u'户型结构</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 套内面积 (inner area) -- value may be numeric or Chinese '暂无数据'
    home_area = _first_match(u'套内面积</span>(.+?)<li><span class=.label.>建筑类型', pre_datanews)
    data_all.append(home_area.strip().rstrip('</') if home_area != 'None' else 'None')
    # 建筑类型 (building type)
    data_all.append(_first_match(u'建筑类型</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 房屋朝向 (orientation) -- positional: 7th <li> of the basic-info list
    if len(pre_data) < 7:
        data_all.append('None')
    else:
        direction = pre_data[6].text
        # Drop the literal label prefix instead of lstrip's character set.
        data_all.append(direction[len('房屋朝向'):] if direction.startswith('房屋朝向') else direction)
    # 建成年代 (year built)
    data_all.append(_first_match(u'建成年代</span>(\d+)', pre_datanews))
    # 装修情况 (decoration state)
    data_all.append(_first_match(u'装修情况</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 建筑结构 (building structure)
    data_all.append(_first_match(u'建筑结构</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 供暖方式 (heating method)
    data_all.append(_first_match(u'供暖方式</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 梯户比例 (elevator/household ratio)
    data_all.append(_first_match(u'梯户比例</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 产权年限 (property-right term)
    data_all.append(_first_match(u'产权年限</span>(\d+[\u4e00-\u9fa5])', pre_datanews))
    # 配备电梯 (elevator present)
    data_all.append(_first_match(u'配备电梯</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 链家编号 (Lianjia listing id)
    data_all.append(_first_match(u'链家编号</span>(\d+)', pre_datanews))
    # 交易权属 (transaction ownership)
    data_all.append(_first_match(u'交易权属</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 挂牌时间 (listing date) -- prefer full y-m-d, fall back to y-m
    data_all.append(_first_match(u'挂牌时间</span>(\d+-\d+-\d+|\d+-\d+)', pre_datanews))
    # 房屋用途 (usage)
    data_all.append(_first_match(u'房屋用途</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 房屋年限 (house-age bracket)
    data_all.append(_first_match(u'房屋年限</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # 产权所属 (the page labels this 房权所属)
    data_all.append(_first_match(u'房权所属</span>([\u4e00-\u9fa5]+)', pre_datanews))
    # Deal prices live in div.price.
    jiaoyi_news = soup.select('div.price')[0].text
    # 成交额 (total price, 万)
    data_all.append(_first_match(u'(\d+万)', jiaoyi_news))
    # 单价 (unit price, 元/平)
    data_all.append(_first_match(u'(\d+元/平|\d+元平)', jiaoyi_news))
    # 成交日期 (deal date, shown in the title area)
    data_all.append(_first_match(u'(\d+.\d+.\d+) +成交', names_pre))
    # Listing stats live in div.msg as <label>N</label>text pairs.
    dataformsg_news = ''.join(str(i) for i in soup.select('div.msg'))
    # 挂牌价格 (listing price)
    data_all.append(_first_match(u'(\d+)</label>挂牌价格', dataformsg_news))
    # 成交周期 (days on market)
    data_all.append(_first_match(u'(\d+)</label>成交周期', dataformsg_news))
    # 调价次数 (price adjustments)
    data_all.append(_first_match(u'(\d+)</label>调价', dataformsg_news))
    # 带看次数 (viewings, last 30 days)
    data_all.append(_first_match(u'(\d+)</label>带看', dataformsg_news))
    # 关注 (followers)
    data_all.append(_first_match(u'(\d+)</label>关注', dataformsg_news))
    # 浏览次数 (page views)
    data_all.append(_first_match(u'(\d+)</label>浏览', dataformsg_news))
    # Up to six feature tag/description pairs (标签1..6 / 详情1..6);
    # replaces six copy-pasted if/else blocks.
    biaoqian_all = soup.select('div.baseattribute.clear>div.name')
    xiangqing_all = soup.select('div.baseattribute.clear>div.content')
    for idx in range(6):
        data_all.append(biaoqian_all[idx].text if idx < len(biaoqian_all) else 'None')
        data_all.append(xiangqing_all[idx].text.strip() if idx < len(xiangqing_all) else 'None')
    # 地铁 (subway tag present in the extended intro section)
    dtdata_news = ''.join(str(i) for i in soup.select('.introContent.showbasemore'))
    data_all.append('地铁' if re.search(u'>地铁</a>', dtdata_news) else 'None')
    return data_all


data_pageurls = []
# ---- driver: district pages -> result pages -> house pages -> Excel ----
data_eachurls = []
alldata = []
city_list = get_cityurls(starturl_list[0])

# Collect every result-page URL of every district (best-effort: a failed
# district is logged and skipped, not fatal).
for m, city_url in enumerate(city_list, start=1):
    try:
        data_pageurls.extend(get_pageurls(city_url))
        print('得到第{}页网址成功'.format(m))
    except Exception:  # was bare except: don't swallow SystemExit/KeyboardInterrupt
        print('得到第{}页网址不成功'.format(m))

# Collect every house detail URL on every result page.
for n, page_url in enumerate(data_pageurls, start=1):
    try:
        data_eachurls.extend(get_eachurls(page_url))
        print('得到第{}个房子网址成功'.format(n))
    except Exception:
        print('得到第{}个房子网址不成功'.format(n))

# Scrape every house page; sleep(5) between requests to avoid being throttled.
for r, house_url in enumerate(data_eachurls, start=1):
    try:
        row = news_ershoufang(house_url)
        alldata.append(row)
        print('得到第{}户房子信息成功'.format(r), row[0])
    except Exception:
        print('得到第{}户房子信息不成功'.format(r))
    time.sleep(5)

# One row per house; 44 columns matching news_ershoufang's append order.
df = pd.DataFrame(alldata)
df.columns = ['城市', '小区名字', '房屋户型', '所在楼层', '建筑面积', '户型结构',
              '套内面积', '建筑类型', '房屋朝向', '建成年代', '装修情况',
              '建筑结构', '供暖方式', '梯户比例', '产权年限', '配备电梯',
              '链家编号', '交易权属', '挂牌时间', '房屋用途', '房屋年限',
              '产权所属', '成交额(万元)', '单价(元/平)', '上次交易',
              '挂牌价格', '成交周期', '调价次数', '近30天带看次数', '关注人次',
              '浏览次数', '标签1', '详情1', '标签2', '详情2', '标签3', '详情3',
              '标签4', '详情4', '标签5', '详情5', '标签6', '详情6', '地铁']
df.to_excel('长沙.xlsx')

git链接

python 爬取链家成交房数据案例相关推荐

  1. 租房不入坑不进坑,Python爬取链家二手房的数据,提前了解租房信息

    目录 前言 一.查找数据所在位置: 二.确定数据存放位置: 三.获取html数据: 四.解析html,提取有用数据: 前言 贫穷限制了我的想象,从大学进入到社会这么久,从刚开始的兴致勃勃,觉得钱有什么 ...

  2. python爬取链家网的房屋数据

    python爬取链家网的房屋数据 爬取内容 爬取源网站 爬取内容 爬取思路 爬取的数据 代码 获取房屋url 获取房屋具体信息 爬取内容 爬取源网站 北京二手房 https://bj.lianjia. ...

  3. 掌财社:python怎么爬取链家二手房的数据?爬虫实战!

    我们知道爬虫的比较常见的应用都是应用在数据分析上,爬虫作为数据分析的前驱,它负责数据的收集.今天我们以python爬取链家二手房数据为例来进行一个python爬虫实战.(内附python爬虫源代码) ...

  4. python 爬取链家数据_用python爬取链家网的二手房信息

    题外话:这几天用python做题,算是有头有尾地完成了.这两天会抽空把我的思路和方法,还有代码贴出来,供python的初学者参考.我python的实战经历不多,所以代码也是简单易懂的那种.当然过程中还 ...

  5. python爬房源信息_用python爬取链家网的二手房信息

    题外话:这几天用python做题,算是有头有尾地完成了.这两天会抽空把我的思路和方法,还有代码贴出来,供python的初学者参考.我python的实战经历不多,所以代码也是简单易懂的那种.当然过程中还 ...

  6. python关于二手房的课程论文_基于python爬取链家二手房信息代码示例

    基本环境配置 python 3.6 pycharm requests parsel time 相关模块pip安装即可 确定目标网页数据 哦豁,这个价格..................看到都觉得脑阔 ...

  7. 利用xpath爬取链家租房房源数据并利用pandas保存到Excel文件中

    我们的需求是利用xpath爬取链家租房房源数据,并将数据通过pandas保存到Excel文件当中 下面我们看一下链家官网的房源信息(以北京为例) 如图所示,我们通过筛选得到北京租房信息 那么我们需要将 ...

  8. 爬取链家网二手房数据并保存到mongodb中

    提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档 爬取链家网二手房数据并保存到mongodb中 文章目录 前言 一.爬虫的介绍 二.协程的介绍 三.css选择器 四.基于asyncio ...

  9. python爬取链家新房_Python爬虫实战:爬取链家网二手房数据

    前言 本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理. 买房装修,是每个人都要经历的重要事情之一.相对于新房交易市场来说,如今的二手房交易市场一点也 ...

最新文章

  1. 乌托邦畅想:众筹开源城市
  2. 如何确定最初克隆本地Git存储库的URL?
  3. GPU Gems1 - 19 基于图像的光照(Image-Based Lighting)
  4. JDK源码解析之 Java.lang.String
  5. SSH安全登陆原理:密码登陆与公钥登陆
  6. python核心理念_《三天搞定Python基础概念之第一天》中文版
  7. silverlight html 传参,Silverlight与html、JavaScript三者交互
  8. (CFD)投影法求解二维不可压缩N-S方程
  9. XueTr(PC Hunter) pro 注册分析
  10. 图像知识 太经典 膜拜
  11. Python 矩形法求sin(x)的定积分(完美实现)
  12. 2017 Multi-University Training Contest 5 solutions BY 吉如一
  13. 字道-最美中国字硬笔书法教学|培养孩子正确的执笔写字姿势有多重要?看完你就明白了!
  14. 2023河南大学计算机考研信息汇总
  15. ESP8266开启热点和TCP Server
  16. Java Attach机制
  17. 马斯克被指性骚扰空姐,已支付170万封口费,马一龙:这是他们卑鄙的剧本!...
  18. restTemplate访问接口
  19. C语言(谭浩强版本,主讲人:小甲鱼)P1-P9
  20. Element UI 组件库分析和二次开发(一)

热门文章

  1. Java 程序员必备的辅助开发神器(2022 年版),建议收仓
  2. 软件质量测试雨课堂习题
  3. html自学网页制作,HTML入门学习教程:简单网页制作
  4. 数据的聚合与分组运算
  5. 用JavaBean封装数据库操作
  6. Web变灰-grayscale
  7. Netty4.x 的逆袭之路 —— 再识 Netty
  8. Mac 程序员的十种武器
  9. 联邦学习隐私保护相关知识总结
  10. matlab自带的音乐,MATLAB乐器(如何用matlab演奏音乐)