2020年链家网成都二手房销售情况可视化总结

功课

前端框架：

python web微框架——flask，https://dormousehole.readthedocs.io/en/latest/#
python web应用框架——Django

可视化引擎：

蚂蚁金服 AntV 数据可视化团队推出的AntV 地理可视化引擎 L7 ,https://ant.design/index-cn
mapbox 是一个开源的地图引擎，提供强大的前端地图框架 mapboxgl。前端使用 ant.design 进行布局,https://blog.csdn.net/supermapsupport/article/details/78343391
百度API 3D ,http://lbsyun.baidu.com/solutions/visualization
inMap 丰富的图层、更好的用户体验、大数据地理可视化库。

房价预测系：
7. K-Means聚类地理信息可视化.https://zhuanlan.zhihu.com/p/30138130
8. 《DataFocus 数据可视化》第四章地理信息可视化.https://www.douban.com/note/725396232/
9. 数据挖掘——房价项目预测（四）matplotlib与Seaborn数据可视化学习 .https://blog.csdn.net/weixin_41975471/article/details/106235600. Matplotlib 是一个 Python 的 2D绘图库
10. FineBI数据可视化软件.http://www.fanruansem.com/、http://www.fanruansem.com/finebi
11. OurwayBI.https://tv.sohu.com/v/dXMvMTgxMjczMjU0Lzk0MjcwNjk5LnNodG1s.html

1、分析数据需求

数据来自某网https://cd.lianjia.com/
需要用

地址
总价
售价
小区名称
所在区域
经度
纬度

用地址是因为需要在百度API里得到经纬度做定位，也用来做label;
用总价、售价、小区名称做散点图和label
等等

2、分析网页结构

略

3、爬虫的编写


# coding=UTF-8
"""Crawl second-hand housing listings from cd.lianjia.com.

For every district in ``title_link`` the script walks the paginated listing
index, collects the detail-page URLs found on each page, scrapes a fixed set
of fields from every detail page, geocodes the community name through the
AMap REST API and appends one record per listing to
``/opt/dataFile/data/<district>.csv``.  URLs that could not be processed are
appended to ``/opt/dataFile/error/<district>_error.csv``.
"""
import importlib
import json
import os
import re
import sys
from multiprocessing import Process

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# NOTE(review): looks like a Python 2 leftover (reload(sys) before
# setdefaultencoding); kept so import-time behaviour is unchanged.
importlib.reload(sys)

ua = UserAgent()

LISTING_ROOT = 'https://cd.lianjia.com/ershoufang/'
GEOCODE_URL = 'https://restapi.amap.com/v3/geocode/geo'
# SECURITY: the API key was hard-coded in the source.  It is kept as the
# fallback for backward compatibility, but prefer supplying AMAP_KEY through
# the environment and rotating the published key.
AMAP_KEY = os.environ.get('AMAP_KEY', '32420527fb3f52a21761956860a27921')
DATA_DIR = '/opt/dataFile/data/'
ERROR_DIR = '/opt/dataFile/error/'
# Number of index pages requested per district (the original loop ran 100
# iterations).
MAX_PAGES = 100
# Seconds before a request is abandoned; without it a stalled connection
# blocks the crawl forever.
REQUEST_TIMEOUT = 10
# Detail pages look like https://cd.lianjia.com/ershoufang/<digits>.html.
# The dots are escaped so they no longer match arbitrary characters.
DETAIL_URL_RE = re.compile(r'https://cd\.lianjia\.com/ershoufang/[0-9]+\.html')

title_link = ['wuhou', 'jinniu', 'shuangliu']


def split_location(location):
    """Split an AMap ``"<longitude>,<latitude>"`` string into its two parts.

    Raises:
        ValueError: if *location* does not contain exactly one comma.
    """
    longitude, latitude = location.split(',')
    return longitude, latitude


def geocode(address):
    """Return the ``location`` string of the first AMap geocode for *address*.

    The address is passed through ``params`` so that it is URL-encoded.

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError, IndexError: if the response carries no geocode result.
    """
    response = requests.get(
        GEOCODE_URL,
        params={'key': AMAP_KEY, 'address': address},
        timeout=REQUEST_TIMEOUT,
    )
    result = json.loads(response.text)
    return result['geocodes'][0]['location']


def scrape_listing(url, headers):
    """Download one detail page and return its fields as a dict.

    The keys and their insertion order match what the data files have always
    contained, because each record is written with ``str(info)``.

    Raises:
        IndexError: if an expected element is missing from the page.
        Any exception raised by :func:`geocode` or :func:`split_location`.
    """
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.text, 'html.parser')

    community = soup.select('.info')[0].text
    area_links = soup.select('.info a')
    detail_items = soup.select('.content ul li')

    info = {}
    info['地址'] = soup.select('.main')[0].text
    info['总价【万】'] = soup.select('.total')[0].text
    info['每平方售价【万/平方米】'] = soup.select('.unitPriceValue')[0].text
    info['小区名称'] = community
    info['区'] = area_links[0].text + '区'
    info['街道'] = area_links[1].text
    info['房屋朝向'] = soup.select('.type .mainInfo')[0].text
    info['所在楼层'] = detail_items[1].text
    info['梯户比例'] = detail_items[9].text
    info['配备电梯'] = detail_items[10].text

    # Look up coordinates for the community name via the AMap geocoding API.
    location = geocode('成都' + community)
    info['经纬度'] = location
    # Split on the comma instead of slicing fixed offsets: the old
    # [-9:-1] slice dropped the last digit of the latitude.
    info['经度'], info['纬度'] = split_location(location)
    return info


def log_failure(district, url):
    """Append *url* to the error file of *district* so it can be retried."""
    with open(ERROR_DIR + district + '_error.csv', 'a', encoding='utf-8') as error:
        error.write(url + '\n')


for district in title_link:
    # Pages are numbered from 1.  The original built '/pg<n>{}/' and formatted
    # it with 1, which requested pg01, pg11, pg21, ... instead of consecutive
    # pages.
    for page in range(1, MAX_PAGES + 1):
        url = LISTING_ROOT + district + '/pg' + str(page) + '/'
        print(district + '的第' + str(page) + '页')
        # Request headers; a fresh random user agent is drawn for every page.
        headers = {'Referer': LISTING_ROOT, 'user-agent': ua.random}
        try:
            res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('failed to fetch ' + url + ': ' + repr(exc))
            log_failure(district, url)
            continue

        # dict.fromkeys de-duplicates while keeping first-seen order.
        detail_urls = dict.fromkeys(DETAIL_URL_RE.findall(res.text))

        for detail_url in detail_urls:
            try:
                info = scrape_listing(detail_url, headers)
            except Exception as exc:
                # Best-effort crawl: record the listing and move on.  Unlike
                # the previous bare ``except`` this no longer swallows
                # KeyboardInterrupt, and it logs the listing URL, which is
                # always defined (the old code logged the geocode URL, which
                # was unbound or stale when scraping failed earlier).
                print('failed to scrape ' + detail_url + ': ' + repr(exc))
                log_failure(district, detail_url)
                continue
            print(info)
            with open(DATA_DIR + district + '.csv', 'a', encoding='utf-8') as data:
                data.write(str(info) + ',\n')

2020年链家网成都二手房销售部分数据.tar

4、可视化

使用python

import plotly_express as px     # 可视化模块
import plotly.offline           # 生成html文件模块
import numpy                    # 数据格式转换模块
import pandas as pd             # 数据格式转换模块# plotly_express 二维散点图
def plotly_Express_scatter():
    """Build the plotly_express bubble-chart demo from the Gapminder sample data.

    Plots the rows for the year 2007 with ``gdpPercap`` on a log-scaled x axis
    against ``lifeExp``, sized by ``pop`` and coloured by ``continent``.
    The figure is created but neither shown nor returned.
    """
    rows_2007 = px.data.gapminder().query("year==2007")
    px.scatter(
        rows_2007,
        x="gdpPercap",
        y="lifeExp",
        size="pop",
        color="continent",
        hover_name="country",
        log_x=True,
        size_max=60,
    )


# plotly_express 2D scatter plot test
def plotly_Express_scatter_test():
    """Build a 2D bubble chart from the scraped listing data.

    Reads ``list_data.json`` into a DataFrame and plots ``Price`` (log-scaled
    x axis) against ``Acreage``, sized by ``Tolprice`` and coloured by
    ``Area``.  The HTML export is currently disabled, so the figure is built
    and then dropped.
    """
    json_path = 'E:/Sourese/lianjia/data/list_data.json'
    # Load the JSON file into a DataFrame.
    df = pd.read_json(json_path, encoding='UTF-8')
    scatter_options = {
        'x': "Price",
        'y': "Acreage",
        'size': "Tolprice",
        'color': "Area",
        'hover_name': "Area",
        'log_x': True,
        'size_max': 60,
    }
    fig = px.scatter(df, **scatter_options)
    # Uncomment to write the figure to an HTML file:
    # plotly.offline.plot(fig,filename='D:/_workspace/HBuilderX/可视化大作业/file/scatter_list_data.html')
    # print(df["Area"])


# plotly_express 3D scatter plot test
def plotly_Express_scatter_3d():
    """Append the text form of the plotly_express 3D election demo to a file.

    Builds a 3D scatter plot from the ``px.data.election()`` sample dataset
    and appends ``str(fig)`` to ``test.txt``.
    """
    df = px.data.election()
    fig = px.scatter_3d(
        df,
        x="Joly",
        y="Coderre",
        z="Bergeron",
        color="winner",
        size="total",
        hover_name="district",
        symbol="result",
        color_discrete_map={"Joly": "blue", "Bergeron": "green", "Coderre": "red"},
    )
    # The path used to be 'E:Sourese/...': without the slash after the drive
    # letter it is resolved relative to the current directory on E:, unlike
    # every other path in this script.
    with open('E:/Sourese/lianjia/Textdata/test.txt', 'a', encoding='UTF-8') as test:
        # write(), not writelines(): writelines() on a str iterates it
        # character by character.
        test.write(str(fig))


def plotly_Express_scatter_3d_test():
    """Open a 3D scatter plot of the scraped listing data in the browser.

    The previous version defined the JSON path but then plotted
    ``px.data.election()`` with ``color="Area"``, a column that dataset was
    never asked for elsewhere in this script.
    """
    data = 'E:/Sourese/lianjia/data/list_data.json'
    df = pd.read_json(data, encoding='UTF-8')
    # NOTE(review): the column names are the ones the 2D test reads from this
    # same file (Price / Acreage / Tolprice / Area) - confirm against the data.
    # size="Area" and the candidate-name colour map were dropped: Area is used
    # as a category, and the map keys were election candidate names.
    fig = px.scatter_3d(
        df,
        x="Price",
        y="Acreage",
        z="Tolprice",
        color="Area",
        hover_name="Area",
        symbol="Area",
    )
    plotly.offline.plot(fig)


# plotly_express 2D bar chart
def bar_11():
    """Render a horizontal bar chart of ``Count`` per ``Area`` to an HTML file.

    Reads ``Sheet2.json`` into a DataFrame, prints it, and writes the chart to
    ``bar_11.html`` through ``plotly.offline.plot``.
    """
    # The stray positional 'r' was a file-open mode passed where read_json
    # expects ``orient``; recent pandas accepts only the path positionally and
    # rejects the call.  Dropping it selects the default orient, which is what
    # the other read_json call in this script already relies on.
    data = pd.read_json("E:/Sourese/lianjia/Textdata/Sheet2.json", encoding='UTF-8')
    print(data)
    # Conversion path: str -> IO (file object) -> pandas.DataFrame.
    fig = px.bar(
        data,
        x='Count',
        y='Area',
        text='Count',
        orientation='h',
        title='dj',
        template='plotly_white',
    )
    plotly.offline.plot(fig, filename='D:/_workspace/HBuilderX/可视化大作业/file/bar_11.html')


if __name__ == '__main__':
    # bar_11()
    # plotly_Express_scatter()
    plotly_Express_scatter_test()
    # area_number()
    # plotly_Express_scatter_3d()
    # plotly_Express_scatter_3d_test()

Python词云.py

from PIL import Image as image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
import jieba


def GetWordCloud():
    """Show a word cloud of ``data.txt`` after segmenting it with jieba.

    The text is cut into words with ``jieba.cut`` and joined with spaces,
    because WordCloud needs whitespace-separated tokens to build a correct
    Chinese word cloud.  The result is displayed with matplotlib.
    """
    path_txt = 'E://Sourese//lianjia//data//data.txt'
    # path_img = "E://Sourese//lianjia//data//house.png"
    # Read inside a ``with`` block so the file handle is closed; the previous
    # open(...).read() left it open.
    with open(path_txt, 'r', encoding='UTF-8') as source:
        raw_text = source.read()
    # background_image = np.array(image.open(path_img))

    # jieba offers several segmentation modes; the default is used here.
    cut_text = " ".join(jieba.cut(raw_text))

    wordcloud = WordCloud(
        max_words=120,
        # A font with CJK glyphs is required, otherwise the words render as
        # empty boxes.  This is the usual Windows font directory; any other
        # CJK font works.
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color='white',
        # Optional background-shape mask; when a mask is given, width and
        # height settings have no effect.
        # mask=background_image,
    ).generate(cut_text)

    # Colours taken from the mask image:
    # image_colors = ImageColorGenerator(background_image)

    # Display the rendered image.
    plt.imshow(wordcloud.recolor(), interpolation="bilinear")
    plt.axis("off")
    plt.show()


def _green_color_func(*args, **kwargs):
    """Colour callback for WordCloud that paints every word green."""
    return 'green'


def GetWordCoud2():
    """Show a green-on-white word cloud of the pre-segmented ``wordcloud.txt``.

    The image is opened with the default viewer through ``Image.show``.
    """
    # path_image = 'E://Sourese//lianjia//data//house.png'
    # mask = np.array(image.open(path_image))
    with open('E://Sourese//lianjia//data//wordcloud.txt', 'r', encoding='UTF-8') as data:
        text = data.read()
    wordCloud = WordCloud(
        font_path='C:/Windows/Fonts/simfang.ttf',
        background_color='white',
        # mask=mask,
        # ``mode`` is the image mode ("RGB" or "RGBA"), not a colour; the old
        # mode='green' is not a valid image mode.  The word colour is set
        # through ``color_func`` instead.
        mode='RGB',
        color_func=_green_color_func,
    ).generate(text)
    image_produce = wordCloud.to_image()
    image_produce.show()


if __name__ == '__main__':
    GetWordCloud()
    # GetWordCoud2()

"""
WordCloud parameters:
font_path         path of the font to use
width, height     width and height of the canvas
prefer_horizontal balance between horizontal and vertical words
mask              mask image; the area in which words are drawn
scale             scaling between computation and drawing
min_font_size     smallest font size
max_words         maximum number of words
stopwords         words to exclude
background_color  background colour of the word cloud
max_font_size     largest font size
mode              image mode; with "RGBA" the background can be transparent
relative_scaling  importance of relative word frequency for font size
regexp            regular expression used to split the input text
collocations      whether to include two-word collocations
"""