Fang.com (房天下) Second-Hand Housing: Crawler, Data Cleaning, and Visualization (Python)

Crawler Code

###The scraped data is stored in MongoDB; you need to install MongoDB yourself before running.
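Before running the crawler, you can check that a local MongoDB instance is reachable. A minimal sketch, assuming the default localhost:27017 setup that the scripts below hard-code (this check is not part of the original post):

from pymongo import MongoClient

# Quick connectivity check; assumes a local MongoDB on the default port,
# matching the host/port hard-coded in the crawler below.
client = MongoClient(host='localhost', port=27017, serverSelectionTimeoutMS=2000)
print(client.server_info()['version'])  # raises ServerSelectionTimeoutError if unreachable
client.close()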

import requests, json, threading
from bs4 import BeautifulSoup
import numpy as np
import re
from tqdm import trange
from pymongo import MongoClient as Client

city = ['北京', '上海', '广东', '深圳', '沈阳', '大连']
e_city = ['beijing', 'shanghai', 'guangdong', 'shenzheng', 'shenyang', 'dalian']
eshouse = ['https://esf.fang.com/house/i3{}/', 'https://sh.esf.fang.com/house/i3{}/',
           'https://gz.esf.fang.com/house/i3{}/', 'https://sz.esf.fang.com/house/i3{}/',
           'https://sy.esf.fang.com/house/i3{}/', 'https://dl.esf.fang.com/house/i3{}/']
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/64.0.3282.186 Safari/537.36'}
is_craw = {}
proxies = []


def write_to_mongo(ips, city):
    """Write one page of listings into MongoDB."""
    client = Client(host='localhost', port=27017)
    db = client['fs_db']
    coll = db[city + '_good']
    for ip in ips:
        coll.insert_one({'name': ip[0],
                         'price': ip[1],
                         'addresses': ip[2],
                         'areas': ip[3],
                         'eq': ip[4]})
    client.close()


def read_from_mongo(city):
    client = Client(host='localhost', port=27017)
    db = client['fs_db']
    coll = db[city + '_good']
    li = coll.find()
    client.close()
    return li


class Consumer(threading.Thread):
    """One thread scrapes one listing page and stores its rows."""

    def __init__(self, args):
        threading.Thread.__init__(self, args=args)

    def run(self):
        global is_craw
        url_demo, i, city_id, lock = self._args
        print("{}, page {}".format(city[city_id], i))
        url = url_demo.format(i)
        soup = get_real(url)
        names = []
        for name in soup.select('.tit_shop'):
            names.append(name.text.strip())
        addresses = []
        for item in soup.find_all('p', attrs={'class': 'add_shop'}):
            address = item.a.text + " " + item.span.text
            addresses.append(address.replace('\t', '').replace('\n', ''))
        es = []
        for item in soup.find_all('p', attrs={'class': 'tel_shop'}):
            es.append(item.text.replace('\t', '').replace('\n', ''))
        moneys = []
        for money in soup.find_all("span", attrs={"class": 'red'}):
            moneys.append(money.text.strip())
        areas = []
        for area in soup.find_all('dd', attrs={'class': 'price_right'}):
            areas.append(area.find_all('span')[-1].text)
        houses = []
        for idx in range(len(names)):
            try:
                item = [names[idx], moneys[idx], addresses[idx], areas[idx], es[idx]]
                print(item)
                houses.append(item)
            except Exception as e:
                print(e)
        lock.acquire()
        write_to_mongo(houses, e_city[city_id])
        lock.release()
        print("Thread finished, page {}".format(i))


def dict2proxy(dic):
    s = dic['type'] + '://' + dic['ip'] + ':' + str(dic['port'])
    return {'http': s, 'https': s}


def get_real(url):
    """Follow Fang.com's JS redirect page (title '跳转...') to the real listing page."""
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.content, 'html.parser', from_encoding='gb18030')
    if soup.find('title').text.strip() == '跳转...':
        pattern1 = re.compile(r"var t4='(.*?)';")
        script = soup.find("script", text=pattern1)
        t4 = pattern1.search(str(script)).group(1)
        pattern1 = re.compile(r"var t3='(.*?)';")
        script = soup.find("script", text=pattern1)
        t3 = re.findall(pattern1, str(script))[-2]
        url = t4 + '?' + t3
        HTML = requests.get(url, headers=header)
        soup = BeautifulSoup(HTML.content, 'html.parser', from_encoding='gb18030')
    elif soup.find('title').text.strip() == '访问验证-房天下':
        pass  # hit the verification page; not handled here
    return soup


def read_proxies():
    client = Client(host='localhost', port=27017)
    db = client['proxies_db']
    coll = db['proxies']
    # Check before writing, to avoid duplicates
    dic = list(coll.find())
    client.close()
    return dic


def craw():
    lock = threading.Lock()
    for idx in trange(len(e_city)):
        url = eshouse[idx]
        soup = get_real(url.format(2))
        try:
            page_number = int(soup.find('div', attrs={'class': 'page_al'})
                              .find_all('span')[-1].text[1:-1])
            pages = list(range(1, page_number + 1))
        except:
            pages = list(range(1, 101))
        url_demo = url
        ts = []
        # pages = [1, 2, 3]
        while len(pages) != 0:
            for i in range(10):
                t = Consumer((url_demo, pages.pop(), idx, lock))
                t.start()
                ts.append(t)
                if len(pages) == 0:
                    break
            # Join the whole batch, then clear the list. (Removing items while
            # iterating, as the original did, skips every other thread.)
            for t in ts:
                t.join()
            ts.clear()


if __name__ == '__main__':
    craw()
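The cleaning script in the next section reads CSV files such as shenyang_good.csv, but the crawler stores its results in MongoDB; the export step is not shown in the original code. A minimal sketch of that bridge, assuming the fs_db database and *_good collections created above (the output filenames are my assumption):

import csv
from pymongo import MongoClient

# Sketch (not from the original post): dump each MongoDB collection written
# by the crawler into a CSV file such as shenyang_good.csv.
client = MongoClient(host='localhost', port=27017)
db = client['fs_db']
for name in ['beijing', 'shanghai', 'guangdong', 'shenzheng', 'shenyang', 'dalian']:
    with open(name + '_good.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'price', 'addresses', 'areas', 'eq'])
        for doc in db[name + '_good'].find():
            writer.writerow([doc.get('name'), doc.get('price'),
                             doc.get('addresses'), doc.get('areas'), doc.get('eq')])
client.close()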

Data Cleaning Code

import csv
import pandas as pd
try:
    df = pd.read_csv(r'shenyang_good.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'shenyang_good.csv', encoding='gbk')
df.drop_duplicates(subset=None, keep='first', inplace=True)

out = open('清洗后城市二手房数据.csv', 'a', newline='')
csv_write = csv.writer(out, dialect='excel')
# Header row (Chinese column names, referenced by the visualization scripts):
# 城市=city, 名称=name, 地址=address, 单价=unit price, 总价=total price,
# 规格=layout, 面积=area, 层段=floor level, 总层数=total floors,
# 朝向=orientation, 建房时期=build year, 介绍=description
# csv_write.writerow(['城市', '名称', '地址', '单价', '总价', '规格',
#                     '面积', '层段', '总层数', '朝向', '建房时期', '介绍'])
old_addresses = []
old_areas = []
old_eq = []
old_name = []
old_price = []
for index in df['addresses']:
    old_addresses.append(index)
for index in df['areas']:
    old_areas.append(index)
for index in df['eq']:
    old_eq.append(index)
for index in df['name']:
    old_name.append(index)
for index in df['price']:
    old_price.append(index)

for index in range(len(old_addresses)):
    try:
        city = '沈阳'  # this run cleans the Shenyang file; change per city
        name, address = old_addresses[index].split(' ')
        yuan = old_areas[index][:-3]       # strip the '元/㎡' unit -> unit price
        price = old_price[index][:-1]      # strip the '万' unit -> total price
        intro = old_name[index]
        specs, rate, totals, toward, buildTime, people = old_eq[index].split('|')
        rate = rate[:-1]                   # strip the '㎡' unit -> floor area
        layer, total = totals.split('(')
        total = total[1:-2]                # keep only the digits of the floor count
        buildTime = buildTime[:4]          # keep only the 4-digit year
        csv_write.writerow([city, name, address, yuan, price, specs, rate,
                            layer, total, toward, buildTime, intro])
    except:
        continue
out.close()
print('Cleaning finished!')
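For reference, the `eq` field holds the listing card's detail string: layout|area|floor|orientation|build year|agent, joined by `|`. A quick check of the slicing above against a made-up sample:

# Made-up sample of the raw 'eq' field, only to illustrate the split/slice
# logic in the loop above; real strings come from the listing pages.
sample_eq = '3室2厅|120㎡|低层(共18层)|南北向|2010年建|张三'
specs, rate, totals, toward, buildTime, people = sample_eq.split('|')
layer, total = totals.split('(')
print(specs, rate[:-1], layer, total[1:-2], toward, buildTime[:4])
# -> 3室2厅 120 低层 18 南北向 2010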

Visualization Code

##1. Stacked Bar Chart

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType
import pandas as pd

try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')
city_lst = ['北京', '上海', '广州', '深圳', '沈阳', '大连']  # cities
city = []       # city of each listing
buildTime = []  # build year of each listing
new_buildTime = [[[], []] for i in range(6)]  # per city: [before 2008, 2008 or later]
data1 = []
data2 = []
for index in df['城市']:
    city.append(index)
for index in df['建房时期']:
    buildTime.append(index)
for index in range(len(city)):
    for num in range(len(city_lst)):
        if city[index] == city_lst[num]:
            if int(buildTime[index]) >= 2008:
                new_buildTime[num][1].append(buildTime[index])
            else:
                new_buildTime[num][0].append(buildTime[index])
for g in range(len(new_buildTime)):
    value1 = len(new_buildTime[g][0])
    value2 = len(new_buildTime[g][1])
    result1 = {'value': value1, 'percent': '%.2f' % (value1 / (value1 + value2))}
    result2 = {'value': value2, 'percent': '%.2f' % (value2 / (value1 + value2))}
    data1.append(result1)
    data2.append(result2)
print(data1)
print(data2)

c = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(city_lst)
    .add_yaxis("2008年以前建房", data1, stack="stack1", category_gap="50%")
    .add_yaxis("2008年以后建房", data2, stack="stack1", category_gap="50%")
    .set_series_opts(
        label_opts=opts.LabelOpts(
            position="right",
            # JS callback: read the 'percent' field off each data item
            formatter=JsCode(
                "function(x){return Number(x.data.percent * 100).toFixed() + '%';}"
            ),
        )
    )
    .render("不同城市2008年前后建房数量-堆叠柱状图.html")
)

##Result screenshot
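Each script below renders its own HTML file. If you would rather have everything on one page, pyecharts' Page container can bundle charts; a runnable toy sketch (the demo data here is made up; in practice you would add the chart objects built in this post, with their .render() calls removed):

from pyecharts.charts import Bar, Page

# Toy charts with made-up data, just to show the Page container.
bar1 = Bar().add_xaxis(['a', 'b']).add_yaxis('demo1', [1, 2])
bar2 = Bar().add_xaxis(['a', 'b']).add_yaxis('demo2', [3, 4])
page = Page(layout=Page.SimplePageLayout)
page.add(bar1, bar2)
page.render("all_charts.html")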

##2. 3D Bar Chart

from pyecharts.charts import Pie, Bar, Line, Scatter3D, Bar3D
from tqdm import trange
from pymongo import MongoClient as Client
import csv
from pyecharts import options as opts


def draw():
    with open("清洗后城市二手房数据.csv", 'r') as f:
        reader = csv.reader(f)
        data = list(reader)
    (
        Bar3D()
        .add(
            series_name="",
            # Columns follow the cleaned CSV layout:
            # item[0]=城市 (city), item[4]=总价 (total price), item[6]=面积 (area)
            data=[[item[0], item[4], item[6]] for item in data],
            xaxis3d_opts=opts.Axis3DOpts(name='城市', type_='category'),
            yaxis3d_opts=opts.Axis3DOpts(name='每平方米价格'),
            zaxis3d_opts=opts.Axis3DOpts(name='面积'),
        )
        .render("不同城市价格对应房屋规格3D.html")
    )


if __name__ == '__main__':
    draw()

##Result screenshot

##3. Line Chart

import pyecharts.options as opts
from pyecharts.charts import Line
import pandas as pd

try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')
city_lst = ['北京', '上海', '广州', '深圳', '沈阳', '大连']  # cities
city = []   # city of each listing
price = []  # total price of each listing
new_buildTime = [[[], [], [], [], [], []] for i in range(6)]
data1 = []
data2 = []
data3 = []
data4 = []
data5 = []
data6 = []
for index in df['城市']:
    city.append(index)
for index in df['总价']:
    price.append(index)
for index in range(len(city)):
    for num in range(len(city_lst)):
        if city[index] == city_lst[num]:
            if str(city[index]) == "北京":
                new_buildTime[num][0].append(price[index])
            elif str(city[index]) == "上海":
                new_buildTime[num][1].append(price[index])
            elif str(city[index]) == "广州":
                new_buildTime[num][2].append(price[index])
            elif str(city[index]) == "深圳":
                new_buildTime[num][3].append(price[index])
            elif str(city[index]) == "沈阳":
                new_buildTime[num][4].append(price[index])
            elif str(city[index]) == "大连":
                new_buildTime[num][5].append(price[index])
for g in range(len(new_buildTime)):
    data1.append(new_buildTime[g][0])
    data2.append(new_buildTime[g][1])
    data3.append(new_buildTime[g][2])
    data4.append(new_buildTime[g][3])
    data5.append(new_buildTime[g][4])
    data6.append(new_buildTime[g][5])
x_data = ["0-100万", "100-200万", "200-300万", "300-400万", "400-500万",
          "500-600万", "600-700万", "700-800万", "800-900万",
          "900-1000万", "1000-2000万"]

(
    Line()
    .add_xaxis(xaxis_data=x_data)
    .add_yaxis(series_name="北京", stack="总量", y_axis=new_buildTime[0][0],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="上海", stack="总量", y_axis=new_buildTime[1][1],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="广州", stack="总量", y_axis=new_buildTime[2][2],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="深圳", stack="总量", y_axis=new_buildTime[3][3],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="沈阳", stack="总量", y_axis=new_buildTime[4][4],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="大连", stack="总量", y_axis=new_buildTime[5][5],
               label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="折线图堆叠"),
        tooltip_opts=opts.TooltipOpts(trigger="axis"),
        yaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
        xaxis_opts=opts.AxisOpts(type_="category", boundary_gap=False),
    )
    .render("不同城市房价对比-折线图.html")
)

##Result screenshot
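Note that the script above hands each city's raw list of total prices to y_axis, while the x-axis expects one count per price band, so the chart plots prices in listing order rather than a distribution. A bucketing sketch of what the x_data bands seem to intend (not in the original post; it assumes 总价 is in units of 万元, as on the listing pages):

import pandas as pd

# Count listings per total-price band for one city, matching the x_data
# bands above. The bucketing itself is my assumption.
bins = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000]
labels = ["0-100万", "100-200万", "200-300万", "300-400万", "400-500万",
          "500-600万", "600-700万", "700-800万", "800-900万",
          "900-1000万", "1000-2000万"]

def price_counts(df, city_name):
    prices = pd.to_numeric(df.loc[df['城市'] == city_name, '总价'], errors='coerce')
    counts = pd.cut(prices, bins=bins, labels=labels).value_counts()
    return counts.reindex(labels).fillna(0).astype(int).tolist()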

##4. Multi-Dimensional Scatter Chart

from pyecharts import options as opts
from pyecharts.charts import Scatter
from pyecharts.commons.utils import JsCode
from pyecharts.faker import Faker
import pandas as pd
import pyecharts as pyec
try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')
city_lst = ['北京', '上海', '广州', '深圳', '沈阳', '大连']  # cities
city = []   # city of each listing
guige = []  # layout (rooms/halls)
area = []   # floor area
new_guige = [[[], [], [], [], [], []] for i in range(6)]
new_area = [[[], [], [], [], [], []] for i in range(6)]
for index in df['城市']:
    city.append(index)
for index in df['规格']:
    guige.append(index)
for index in df['面积']:
    area.append(index)
for index in range(len(city)):
    for num in range(len(city_lst)):
        if city[index] == city_lst[num]:
            # city_lst[num] equals city[index], so bucket [num][num] gets the
            # listing (the original spelled this out as a per-city elif chain)
            new_guige[num][num].append(guige[index])
            new_area[num][num].append(area[index])

# Layout categories, for reference (the chart reads layouts from the data pairs)
y_data = ["1室1厅", "1室0厅", "1室2厅", "1室3厅", "1室4厅",
          "2室0厅", "2室1厅", "2室2厅", "2室3厅", "2室4厅",
          "3室0厅", "3室1厅", "3室2厅", "3室3厅", "3室4厅",
          "4室0厅", "4室1厅", "4室2厅", "4室3厅", "4室4厅",
          "5室0厅", "5室1厅", "5室2厅", "5室3厅", "5室4厅",
          "6室0厅", "6室1厅", "6室2厅", "6室3厅", "6室4厅"]
x_data = ['0-50万', '50-100万', '100-150万', '150-200万', '200-250万', '250-300万',
          '300-350万', '350-400万', '400-450万', '450-500万', '500-550万', '550-600万']

c = Scatter().add_xaxis(x_data)
# The six per-city series differed only in name and data, so add them in a loop
for i, name in enumerate(city_lst):
    c.add_yaxis(
        name,
        [list(z) for z in zip(new_area[i][i], new_guige[i][i])],
        symbol_size=20,  # marker size
        label_opts=opts.LabelOpts(
            # JS callback: params.value[1] is the area, params.value[2] the layout
            formatter=JsCode(
                "function(params){return params.value[1] +' : '+ params.value[2];}"
            )
        ),
    )
c = (
    c.set_global_opts(
        title_opts=opts.TitleOpts(title="Scatter-多维度数据"),
        # Tooltip: params.name is the x category, params.value[2] the layout
        tooltip_opts=opts.TooltipOpts(
            formatter=JsCode(
                "function (params) {return params.name + ' : ' + params.value[2];}"
            )
        ),
        # Visual map: color by dimension 1 (the area value)
        visualmap_opts=opts.VisualMapOpts(
            type_="color",
            max_=150,
            min_=20,
            dimension=1,
        ),
    )
    .render("不同城市价格对应房屋规格-多维散点图.html")
)

##Result screenshot

##5. Funnel Chart

import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Funnel

try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')
df.drop_duplicates(subset=None, keep='first', inplace=True)
city = []
unit_price = []
for index in df['城市']:city.append(index)
for index in df['单价']:unit_price.append(index)
last_city = list(set(city))
new_unit_price = [[] for i in range(len(last_city))]
last_unit_price = []
for i in range(len(unit_price)):
    for j in range(len(last_city)):
        if city[i] == last_city[j]:
            new_unit_price[j].append(int(unit_price[i]))
for index in new_unit_price:
    result = sum(index) / len(index)
    last_unit_price.append('%.2f' % result)

c = (
    Funnel()
    .add(
        "平均单价",
        [list(z) for z in zip(last_city, last_unit_price)],
        sort_="ascending",
        label_opts=opts.LabelOpts(position="inside"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="不同城市平均单位房价(元/m^2)"))
    .render("不同城市的平均单位房价-漏斗图.html")
)

##Result screenshot
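The per-city averaging loop in the funnel script can also be written as a pandas groupby, for comparison. A sketch under the same column-name assumptions as the cleaned CSV:

import pandas as pd

# Same per-city average unit price as the loop above, via groupby.
df = pd.read_csv('清洗后城市二手房数据.csv', encoding='utf-8')
avg = (pd.to_numeric(df['单价'], errors='coerce')
         .groupby(df['城市']).mean().round(2))
print(avg)  # one average unit price (元/m²) per city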

##6. Pie Chart

import pandas as pd
try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')
# Drop stray header rows that were appended into the data file
df.drop(df[df['层段'].str.contains('层段')].index, inplace=True)
data_heigh=df['层段']
lst_height_value=data_heigh.value_counts().keys().tolist()
lst_counts=data_heigh.value_counts().tolist()
import pyecharts.options as opts
from pyecharts.charts import Pie


def get_pie():
    x_data = lst_height_value
    y_data = lst_counts
    data_pair = [list(z) for z in zip(x_data, y_data)]
    data_pair.sort(key=lambda x: x[1])
    c = (
        Pie(init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c"))
        .add(
            series_name="层段信息",
            data_pair=data_pair,
            rosetype="radius",
            radius="55%",
            center=["50%", "50%"],
            label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="Customized Pie",
                pos_left="center",
                pos_top="20",
                title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
            ),
            legend_opts=opts.LegendOpts(is_show=False),
        )
        .set_series_opts(
            tooltip_opts=opts.TooltipOpts(
                trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
            ),
            label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"),
        )
    )
    return c


if __name__ == '__main__':
    get_pie().render('所有城市楼盘的层段分析-饼状图.html')

##Result screenshot

Dataset

Link: https://pan.baidu.com/s/1qN00fxxo9wi1tF5wiqj4ZQ
Extraction code: uvy3
