一.分析网页结构并编写程序

import requests
import csv
import time
import math
import random
from lxml import etree
from multiprocessing.dummy import Pool

def getPage(url):time.sleep(random.choice([2, 2.5, 3, 3.5]))page = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.110"})return etree.HTML(page.text)

def csvWrite(item):with open("lianjia_sh_ershoufang_data.csv", "a", encoding="utf-8", newline="") as f:csv.writer(f).writerow(item)

def get_areas_regions_urls():areas = ["pudong","minhang","baoshan","xuhui","putuo","yangpu","changning","songjiang","jiading","huangpu","jingan","zhabei","hongkou","qingpu","fengxian","jinshan","chongming","shanghaizhoubian"]areas_regions_urls = []#这是我们要返回的元组列表，其内每一个元组将包含地区、地点、和urlfor area in areas:page = getPage("https://sh.lianjia.com/ershoufang/" + area)region_names = page.xpath("/html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a/text()")#获取地点名region_urls = page.xpath("/html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a/@href")#获取地点对应的urlfor url in region_urls:#创建元组并将其写入目标列表areas_regions_urls.append((area,region_names[region_urls.index(url)], "https://gz.lianjia.com"+url))# print(area,region_names[region_urls.index(url)],"https://gz.lianjia.com"+url)# print("Region urls in Area {} have been added!".format(area))print("All regions urls have been added")return areas_regions_urls

def region_spider(x):#获取信息条数info_num = int(getPage(x[2]).xpath("/html/body/div[4]/div[1]/div[2]/h2/span/text()")[0])#计算信息页数（已知每页最多30条数据）page_num = math.ceil(info_num/30)# print("{}有{}条数据，共{}页".format(x[1],info_num,page_num))for url in [x[2]+"pg" + str(num+1) for num in range(page_num)]:page = getPage(url)for house in page.xpath("/html/body/div[4]/div[1]/ul/li"):try:# print(house.xpath("div[1]/div[1]/a/text()")[0])#x代表get_areas_regions_urls()返回的列表中的每一个元组，则x[0]代表地区，x[1]代表地点，x[2]代表urlArea = x[0]Region = x[1]info = house.xpath("div[1]/div[2]/div/text()")[0].split("|")#由于别墅房源和普通房源的网页结构稍有不同，所以这里我们需要做一个判断if info[1].strip()[-2:]=="别墅":Garden = house.xpath("div[1]/div[2]/div/a/text()")[0]Layout = info[2]Acreage = info[3].strip()Direction = info[4].strip()Renovation = info[5].strip()Elevator = info[6].strip()Price = int(house.xpath("div[1]/div[6]/div[1]/span/text()")[0])BuiltYear = re.search("\d{4}",house.xpath("div[1]/div[3]/div/text()")[0]).group()Height = re.search("\d层",house.xpath("div[1]/div[3]/div/text()")[0]).group()Building = info[1].strip()else:Garden = house.xpath("div[1]/div[2]/div/a/text()")[0]Layout = info[1]Acreage = info[2].strip()Direction = info[3].strip()Renovation = info[4].strip()try:Elevator = info[5].strip()#并不是所有房源都有电梯信息，若无则设为“无数据”except:Elevator = "无数据"Price = house.xpath("div[1]/div[6]/div[1]/span/text()")[0]try:    BuiltYear = re.search("\d{4}",house.xpath("div[1]/div[3]/div/text()")[0]).group()#并不是所有房源都有年代信息，若无则设为0except:BuiltYear = 0Height = house.xpath("div[1]/div[3]/div/text()")[0][0:3]try:#并不是所有房源都有建筑类型信息，若无则设为“无数据”Building = re.search("..楼",house.xpath("div[1]/div[3]/div/text()")[0]).group()[-2:]except:Building = "无数据"except:print("Error")else:#写入并打印爬到的数据csvWrite([Area,Region,Garden,Acreage,Direction,Layout,Renovation,Height,Elevator,BuiltYear,Building,Price])# print([Area,Region,Garden,Acreage,Direction,Layout,Renovation,Height,Elevator,BuiltYear,Building,Price])print("All data of District{} in Area {} have sbeen downloaded!".format(x[1],x[0]))

if __name__ == "__main__":url_list = get_areas_regions_urls()pool = Pool()#创建线程池pool.map(region_spider,url_list)#使用多线程运行爬虫pool.close()#关闭线程池pool.join()#等待所有线程结束

二.数据分析

import pandas as pd
import pandas_profiling as pp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns%matplotlib inline

/opt/conda/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88return f(*args, **kwds)
/opt/conda/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88return f(*args, **kwds)

df = pd.read_csv("/home/kesci/work/lianjia_sh_ershoufang_data.csv", header = None)
df.columns = ["Area", "Region", "Garden", "Acreage", "Direction", "Layout", "Renovation", \"Height", "Elevator", "BuiltYear", "Building", "Price"]
df.head()

	Area	Region	Garden	Acreage	Direction	Layout	Renovation	Height	Elevator	Building	Price
0	pudong	潍坊	富丽家园中行宿舍	95.3平米	南	3室2厅	简装	中楼层	无数据	无数据	172.0
1	pudong	潍坊	金沙湾花园	108.8平米	北	3室2厅	精装	高楼层	无数据	无数据	215.0
2	pudong	潍坊	中海誉城南苑	71.68平米	东北	2室1厅	精装	低楼层	无数据	无数据	190.0
3	pudong	潍坊	万科东荟城	70平米	北	2室2厅	精装	高楼层	无数据	无数据	222.0
4	pudong	潍坊	恒宝华庭	66.75平米	东西	2室1厅	简装	低楼层	无数据	无数据	318.0

数据清洗

爬取的数据存在瑕疵，在进行数据分析前需检查数据。观察数据具有重复值，故需对数据去重。

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314674 entries, 0 to 314673
Data columns (total 12 columns):
Area          314674 non-null object
Region        314674 non-null object
Garden        314674 non-null object
Acreage       314674 non-null object
Direction     314674 non-null object
Layout        314674 non-null object
Renovation    314674 non-null object
Height        314674 non-null object
Elevator      314674 non-null object
BuiltYear     314674 non-null int64
Building      314674 non-null object
Price         314674 non-null float64
dtypes: float64(1), int64(1), object(10)
memory usage: 28.8+ MB

df.isnull().sum()

Area          0
Region        0
Garden        0
Acreage       0
Direction     0
Layout        0
Renovation    0
Height        0
Elevator      0
BuiltYear     0
Building      0
Price         0
dtype: int64

df_clean = df.drop_duplicates()

观察数据，房龄,电梯和建筑皆为无用特征，在此删除

df_clean["BuiltYear"].value_counts()
df_clean["Building"].value_counts()
df_clean["Elevator"].value_counts()
df_clean = df_clean.drop(["BuiltYear", "Building", "Elevator"], axis=1)

df_clean.head()

	Area	Region	Garden	Acreage	Direction	Layout	Renovation	Height	Price
0	pudong	潍坊	富丽家园中行宿舍	95.3平米	南	3室2厅	简装	中楼层	172.0
1	pudong	潍坊	金沙湾花园	108.8平米	北	3室2厅	精装	高楼层	215.0
2	pudong	潍坊	中海誉城南苑	71.68平米	东北	2室1厅	精装	低楼层	190.0
3	pudong	潍坊	万科东荟城	70平米	北	2室2厅	精装	高楼层	222.0
4	pudong	潍坊	恒宝华庭	66.75平米	东西	2室1厅	简装	低楼层	318.0

房屋面积转化成数字型，便于后面的分析建模。

df_clean["Acreage"] = df_clean["Acreage"].str[:-2].astype(float)

定义函数，分类特征和标签

def feature_label(data):feature = data.drop("Price", axis = 1)label = data["Price"]return feature, label

去除异常点

首先通过拟合线性回归模型，求出拟合值与实际值的差。此时问题 [去除异常点] 转化成 [去除残差中的异常点]

未处理前

from sklearn.linear_model import LinearRegression

def LR(data):LR = LinearRegression()LR.fit(np.array(data["Acreage"]).reshape(-1, 1), data["Price"])res = LR.predict(np.array(data["Acreage"]).reshape(-1, 1))return res

pred = LR(df_clean)

def pl(data, res):fig = plt.figure(figsize=(16,6))plt.plot(data["Acreage"], res, color = "b")plt.scatter(data["Acreage"], data["Price"], color = "y", alpha = 0.5);pl(df_clean, pred)

数据中存在极端值，不利于建模和泛化，在建模前需去除异常点

def rem_error(res, data):bias = res - data["Price"]bias_max = 2.5*bias.describe()[6] - 1.5*bias.describe()[4]bias_min = 2.5*bias.describe()[4] - 1.5*bias.describe()[6]index = bias[(bias >= bias_min) & (bias <= bias_max)].indexreturn indexindex = rem_error(pred, df_clean)

除去异常点后的模型

df_clean = df_clean.ix[index]
pred = LR(df_clean)
pl(df_clean, pred);

/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexingSee the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated"""Entry point for launching an IPython kernel.

对于X室X厅，考虑转换成数值型变量

df_clean["厅"] = df_clean["Layout"].str[1].astype(float)
df_clean["室"] = df_clean["Layout"].str[3].astype(float)
df_clean.drop(columns = "Layout", inplace=True)

对装修状况进行编码

from sklearn.preprocessing import LabelEncoder

df_clean["Renovation"].unique()

array(['简装', '精装', '毛坯', '其他'], dtype=object)

le = LabelEncoder()
le.fit(df_clean["Renovation"])
df_clean["Renovation"] = le.transform(df_clean["Renovation"])

将房屋朝向用one-hot编码

观察数据存在多种朝向混合，直接对列one-hot编码，特征数过大，所以这里采用对朝向列的字符串进行one-hot编码

df_clean["Direction"].unique()

array(['南', '北', '东北', '东 西', '东南', '西北', '西', '南 北', '东南 西南', '西南', '东','东 东南', '西北 北', '南 西南', '东北 东南', '西南 西', '西南 北', '东南 北', '东 西北','北 东北', '东南 南', '西北 东北', '南 西', '西南 东北', '东南 南 西南', '东 西 北', '东 南','南 西 北', '东 南 北', '东 北', '北 南', '南 东北', '东 南 北 西', '南 东南', '东 东北','西 北', '东南 西 北', '西 东北', '西南 西北', '东南 西北', '南 东', '东南 南 北', '西 西南','东南 东', '西北 西南', '东 南 西 北'], dtype=object)

Direction = df_clean["Direction"].str.get_dummies(sep=" ")
df_clean = df_clean.join(Direction)
df_clean.drop(columns = "Direction", inplace=True)
df_clean.head()

	Area	Region	Garden	Acreage	Renovation	Height	Price	厅	室	东	东北	北	南	西
0	pudong	潍坊	富丽家园中行宿舍	95.30	2	中楼层	172.0	3.0	2.0	0	0	0	1	0
1	pudong	潍坊	金沙湾花园	108.80	3	高楼层	215.0	3.0	2.0	0	0	1	0	0
2	pudong	潍坊	中海誉城南苑	71.68	3	低楼层	190.0	2.0	1.0	0	1	0	0	0
3	pudong	潍坊	万科东荟城	70.00	3	高楼层	222.0	2.0	2.0	0	0	1	0	0
4	pudong	潍坊	恒宝华庭	66.75	2	低楼层	318.0	2.0	1.0	1	0	0	0	1

观察楼层主要集中在中楼层，高楼层，低楼层，在这考虑将楼层分类为中，高，低，其他，以降低维度

Height = df_clean["Height"].str[0]
index = Height[(Height != "高") & (Height.str[0] != "中") & (Height.str[0] != "低")].index
Height[index] = "其他楼层"Height = pd.get_dummies(Height)

看下清洗完的数据

df_clean.drop(columns = "Height", inplace=True)
df_clean = df_clean.join(Height)df_clean.head()

	Area	Region	Garden	Acreage	Renovation	Price	厅	室	东	东北	北	南	西	中	低	高
0	pudong	潍坊	富丽家园中行宿舍	95.30	2	172.0	3.0	2.0	0	0	0	1	0	1	0	0
1	pudong	潍坊	金沙湾花园	108.80	3	215.0	3.0	2.0	0	0	1	0	0	0	0	1
2	pudong	潍坊	中海誉城南苑	71.68	3	190.0	2.0	1.0	0	1	0	0	0	0	1	0
3	pudong	潍坊	万科东荟城	70.00	3	222.0	2.0	2.0	0	0	1	0	0	0	0	1
4	pudong	潍坊	恒宝华庭	66.75	2	318.0	2.0	1.0	1	0	0	0	1	0	1	0

房屋所在地合并，降低维度

location = df_clean["Area"] + df_clean["Region"] + df_clean["Garden"]
df_clean = df_clean.join(pd.get_dummies(location))

df_clean.drop(columns=["Area", "Region", "Garden"], inplace=True)
df_clean.head()

	Acreage	Renovation	Price	厅	室	东	东北	北	南	...
0	95.30	2	172.0	3.0	2.0	0	0	0	1	...
1	108.80	3	215.0	3.0	2.0	0	0	1	0	...
2	71.68	3	190.0	2.0	1.0	0	1	0	0	...
3	70.00	3	222.0	2.0	2.0	0	0	1	0	...
4	66.75	2	318.0	2.0	1.0	1	0	0	0	...

5 rows × 3790 columns

交叉验证

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=12, random_state=42, shuffle=True)def rmsle(y, y_pred):return np.sqrt(mean_squared_error(y, y_pred))def cv_rmse(model, X, y):rmse = np.sqrt(-cross_val_score(model, X, y,scoring="neg_mean_squared_error", cv=kf))return (rmse)

from sklearn.preprocessing import MinMaxScaler

features_raw, label = feature_label(df_clean)

scaler = MinMaxScaler()
features = scaler.fit_transform(features_raw)

/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/data.py:323: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by MinMaxScaler.return self.partial_fit(X, y)

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=40)

lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)

train_pred = lr.predict(X_train)
test_pred = lr.predict(X_test)print('MSE train data: %.3f, MSE test data: %.3f' % (
mean_squared_error(y_train, train_pred),
mean_squared_error(y_test, test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
r2_score(y_train, train_pred),
r2_score(y_test, test_pred)))

MSE train data: 539.724, MSE test data: 16170516411079238651010351104.000
R2 train data: 0.977, R2 test data: -694086926740721479188480.000

训练结果明显过拟合，选用网格，调整超参数来选取最佳模型

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

def performance_metric(y_true, y_predict):""" Calculates and returns the performance score between true and predicted values based on the metric chosen. """score = r2_score(y_true, y_predict)return score

def fit_model(X, y):""" Performs grid search over the 'max_depth' parameter for a decision tree regressor trained on the input data [X, y]. """cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=42)regressor = RandomForestRegressor()params = {'max_depth': [i for i in range(1,11)]}# TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' scoring_fnc = make_scorer(performance_metric)grid = GridSearchCV(regressor, param_grid=params, scoring=scoring_fnc, cv=4)# Fit the grid search object to the data to compute the optimal modelgrid = grid.fit(X, y)# Return the optimal model after fitting the datareturn grid.best_estimator_

# Fit the training data to the model using grid search
reg = fit_model(features, label)# Produce the value for 'max_depth'
print("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']));

Parameter 'max_depth' is 10 for the optimal model.

train_pred = reg.predict(X_train)
test_pred = reg.predict(X_test)print('MSE train data: %.3f, MSE test data: %.3f' % (
mean_squared_error(y_train, train_pred),
mean_squared_error(y_test, test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
r2_score(y_train, train_pred),
r2_score(y_test, test_pred)))

MSE train data: 9356.258, MSE test data: 8988.234
R2 train data: 0.847, R2 test data: 0.864

reg.score(X_test, test_pred)

1.0

总结

爬虫爬取的数据是存在瑕疵的，数据格式也不利于建模，在此需要花费大量时间清洗。
一个最优的模型不一定是一个健壮模型。有的时候模型会过于复杂或者过于简单，以致于难以泛化新增添的数据；有的时候模型采用的学习算法并不适用于特定的数据结构；有的时候样本本身可能有太多噪点或样本过少，使得模型无法准确地预测目标变量。这些情况下我们会说模型是欠拟合的。
尝试多个模型，调整超参数，混合模型等方式让模型更加健壮

爬取链家网上海二手房数据，并进行分析建模相关推荐

Python爬取链家网获取二手房数据并调用高德api获得经纬度
链家网获得数据地址,高德api获得经纬度(同理链家网也可以换成其他58同城,赶集网的数据,因为反爬虫比较厉害,没时间整,就用链家网的数据先试试水) 首先爬链家网,Info包含一条信息 import j ...
python 爬取链家数据_用python爬取链家网的二手房信息
题外话:这几天用python做题,算是有头有尾地完成了.这两天会抽空把我的思路和方法,还有代码贴出来,供python的初学者参考.我python的实战经历不多,所以代码也是简单易懂的那种.当然过程中还 ...
python爬房源信息_用python爬取链家网的二手房信息
题外话:这几天用python做题,算是有头有尾地完成了.这两天会抽空把我的思路和方法,还有代码贴出来,供python的初学者参考.我python的实战经历不多,所以代码也是简单易懂的那种.当然过程中还 ...
python爬取链家网的房屋数据
python爬取链家网的房屋数据爬取内容爬取源网站爬取内容爬取思路爬取的数据代码获取房屋url 获取房屋具体信息爬取内容爬取源网站北京二手房 https://bj.lianjia. ...
五十七、爬取链家网北京二手房信息，并进行线性回归建模
@Author : By Runsen @Date:2020/5/31 作者介绍:Runsen目前大三下学期,专业化学工程与工艺,大学沉迷日语,Python, Java和一系列数据分析软件.导致翘课严 ...
Scrapy实战篇（一）之爬取链家网成交房源数据（上）
今天,我们就以链家网南京地区为例,来学习爬取链家网的成交房源数据. 这里推荐使用火狐浏览器,并且安装firebug和firepath两款插件,你会发现,这两款插件会给我们后续的数据提取带来很大的方便. ...
爬虫，爬取链家网北京二手房信息
# 链家网二手房信息爬取 import re import time import requests import pandas as pd from bs4 import BeautifulSoup ...
Scrapy实战篇（二）之爬取链家网成交房源数据（下）
在上一小节中,我们已经提取到了房源的具体信息,这一节中,我们主要是对提取到的数据进行后续的处理,以及进行相关的设置. 数据处理我们这里以把数据存储到mongo数据库为例. 编写pipelines.p ...
爬取链家网二手房数据并保存到mongodb中
提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档爬取链家网二手房数据并保存到mongodb中文章目录前言一.爬虫的介绍二.协程的介绍三.css选择器四.基于asyncio ...
python爬取链家新房_Python爬虫实战：爬取链家网二手房数据
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理. 买房装修,是每个人都要经历的重要事情之一.相对于新房交易市场来说,如今的二手房交易市场一点也 ...

爬取链家网上海二手房数据，并进行分析建模

一.分析网页结构并编写程序

二.数据分析

数据清洗

爬取的数据存在瑕疵，在进行数据分析前需检查数据。观察数据具有重复值，故需对数据去重。

观察数据，房龄,电梯和建筑皆为无用特征，在此删除

房屋面积转化成数字型，便于后面的分析建模。

定义函数，分类特征和标签

去除异常点

首先通过拟合线性回归模型，求出拟合值与实际值的差。此时问题 [去除异常点] 转化成 [去除残差中的异常点]

未处理前

数据中存在极端值，不利于建模和泛化，在建模前需去除异常点

除去异常点后的模型

对于X室X厅，考虑转换成数值型变量

对装修状况进行编码

将房屋朝向用one-hot编码

观察数据存在多种朝向混合，直接对列one-hot编码，特征数过大，所以这里采用对朝向列的字符串进行one-hot编码

观察楼层主要集中在中楼层，高楼层，低楼层，在这考虑将楼层分类为中，高，低，其他，以降低维度

看下清洗完的数据

房屋所在地合并，降低维度

交叉验证

总结

爬取链家网上海二手房数据，并进行分析建模相关推荐

最新文章

热门文章