# 导入需要的库
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import mutual_info_regression as MIC
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
# Load the space-separated training and test sets and report their shapes.
train = pd.read_csv('used_car_train_20200313.csv', sep=' ')
test = pd.read_csv('used_car_testB_20200421.csv', sep=' ')
print('train:{}'.format(train.shape))
print('test:{}'.format(test.shape))
train:(150000, 31)
test:(50000, 30)
# 查看数据信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   SaleID             150000 non-null  int64
 1   name               150000 non-null  int64
 2   regDate            150000 non-null  int64
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object
 11  regionCode         150000 non-null  int64
 12  seller             150000 non-null  int64
 13  offerType          150000 non-null  int64
 14  creatDate          150000 non-null  int64
 15  price              150000 non-null  int64
 16  v_0                150000 non-null  float64
 17  v_1                150000 non-null  float64
 18  v_2                150000 non-null  float64
 19  v_3                150000 non-null  float64
 20  v_4                150000 non-null  float64
 21  v_5                150000 non-null  float64
 22  v_6                150000 non-null  float64
 23  v_7                150000 non-null  float64
 24  v_8                150000 non-null  float64
 25  v_9                150000 non-null  float64
 26  v_10               150000 non-null  float64
 27  v_11               150000 non-null  float64
 28  v_12               150000 non-null  float64
 29  v_13               150000 non-null  float64
 30  v_14               150000 non-null  float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
# 查看数据前五行
SaleID name regDate model brand bodyType fuelType gearbox power kilometer ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
0 0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482

5 rows × 31 columns

# 查看测试集信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   SaleID             50000 non-null  int64
 1   name               50000 non-null  int64
 2   regDate            50000 non-null  int64
 3   model              50000 non-null  float64
 4   brand              50000 non-null  int64
 5   bodyType           48496 non-null  float64
 6   fuelType           47076 non-null  float64
 7   gearbox            48032 non-null  float64
 8   power              50000 non-null  int64
 9   kilometer          50000 non-null  float64
 10  notRepairedDamage  50000 non-null  object
 11  regionCode         50000 non-null  int64
 12  seller             50000 non-null  int64
 13  offerType          50000 non-null  int64
 14  creatDate          50000 non-null  int64
 15  v_0                50000 non-null  float64
 16  v_1                50000 non-null  float64
 17  v_2                50000 non-null  float64
 18  v_3                50000 non-null  float64
 19  v_4                50000 non-null  float64
 20  v_5                50000 non-null  float64
 21  v_6                50000 non-null  float64
 22  v_7                50000 non-null  float64
 23  v_8                50000 non-null  float64
 24  v_9                50000 non-null  float64
 25  v_10               50000 non-null  float64
 26  v_11               50000 non-null  float64
 27  v_12               50000 non-null  float64
 28  v_13               50000 non-null  float64
 29  v_14               50000 non-null  float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64
0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64
# Treat the '-' placeholder as a missing value, then cast notRepairedDamage
# to a numeric (float) column.
train['notRepairedDamage'] = (
    train['notRepairedDamage']
    .mask(train['notRepairedDamage'].eq('-'))
    .astype('float')
)
0.0    111361
1.0     14315
Name: notRepairedDamage, dtype: int64
# Same cleaning for the test set: '-' becomes NaN, then cast to float.
test['notRepairedDamage'] = (
    test['notRepairedDamage']
    .mask(test['notRepairedDamage'].eq('-'))
    .astype('float')
)
0.0    37224
1.0     4707
Name: notRepairedDamage, dtype: int64
# 查看缺失值特征, 看到全是分类特征
0.0      11762
19.0      9573
4.0       8445
1.0       6038
29.0      5186
          ...
240.0        2
209.0        2
245.0        2
242.0        2
247.0        1
Name: model, Length: 248, dtype: int64
0.0    41420
1.0    35272
2.0    30324
3.0    13491
4.0     9609
5.0     7607
6.0     6482
7.0     1289
Name: bodyType, dtype: int64
0.0    91656
1.0    46991
2.0     2212
3.0      262
4.0      118
5.0       45
6.0       36
Name: fuelType, dtype: int64
0.0    111623
1.0     32396
Name: gearbox, dtype: int64
# Collect the names of the columns that contain at least one missing value.
col_train_null = train.columns[train.isnull().any()].to_list()
col_test_null = test.columns[test.isnull().any()].to_list()
print(col_train_null)
print(col_test_null)
['model', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
['bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
# Fill missing values with the most frequent value of each column.
imp = SimpleImputer(strategy='most_frequent')
train[col_train_null] = imp.fit_transform(train[col_train_null])
# Apply the statistics learned on the training set to the test set instead of
# re-fitting on test data, so both sets are imputed with the same fill values.
test[col_train_null] = imp.transform(test[col_train_null])
# 检查特征
# 检查日期列的异常值
03    14949
06    13809
04    12798
05    12614
07    11937
10    11490
00    11347
11    10687
12    10637
09    10522
01     9943
08     9936
02     9331
Name: regDate, dtype: int64
def tran_date(x: str) -> str:
    """Turn a ``YYYYMMDD`` string into ``YYYY-M-DD``, replacing month 00 by 1.

    Args:
        x: Date string whose characters 0-3 are the year, 4-5 the month and
            the remainder the day.

    Returns:
        The date joined with dashes. The month is written without zero
        padding and the day part is copied unchanged.

    Raises:
        ValueError: If characters 4-5 of ``x`` cannot be parsed as an integer.
    """
    month = int(x[4:6])
    # A month of 0 is not a valid calendar month; map it to January.
    if month == 0:
        month = 1
    return x[0:4] + '-' + str(month) + '-' + x[6:]
# Parse the integer-coded date columns of both data sets into datetimes,
# normalising each value with tran_date first.
for _frame in (train, test):
    for _date_col in ('regDate', 'creatDate'):
        _as_text = _frame[_date_col].astype('str')
        _frame[_date_col] = pd.to_datetime(_as_text.apply(tran_date))
# 密度图查看price列

# 取对数调整偏态

# 查看一下分布
count    150000.000000
mean       5923.327333
std        7501.998477
min          11.000000
1%          150.000000
25%        1300.000000
50%        3250.000000
75%        7700.000000
99%       34950.000000
max       99999.000000
Name: price, dtype: float64
# Bar chart of the mean price per registration year.
train.resample('Y', on='regDate')['price'].mean().to_period('Y').plot.bar()

# Derive date-based features on both data sets: the age of the listing in
# days and in years (one decimal), plus the year/month/day components of the
# registration and creation dates.
for _frame in (train, test):
    _registered = _frame['regDate']
    _created = _frame['creatDate']
    _frame['diff_day'] = (_created - _registered).dt.days
    _frame['diff_year'] = (_frame['diff_day'] / 365).round(1)
    _frame['regDate_year'] = _registered.dt.year
    _frame['regDate_month'] = _registered.dt.month
    _frame['regDate_day'] = _registered.dt.day
    _frame['creatDate_year'] = _created.dt.year
    _frame['creatDate_month'] = _created.dt.month
    _frame['creatDate_day'] = _created.dt.day
# 查看name特征有多少种类
# 对power的描述性统计
count    150000.000000
mean        119.316547
std         177.168419
min           0.000000
25%          75.000000
50%         110.000000
75%         150.000000
max       19312.000000
Name: power, dtype: float64
# 对model的描述性统计
count    150000.000000
mean         47.128707
std          49.536024
min           0.000000
25%          10.000000
50%          30.000000
75%          66.000000
max         247.000000
Name: model, dtype: float64
# Bucket power into 30 right-closed bins of width 20 with edges 0, 20, ..., 600.
# NOTE(review): `bin` shadows the builtin of the same name; the name is kept
# in case other cells refer to it.
bin = [i * 20 for i in range(0, 31)]
# pd.cut returns NaN for values outside the bin edges (power == 0 or
# power > 600); those rows are put into the catch-all bucket 31.
train['power_bin'] = pd.cut(train['power'], bin, labels=False).fillna(31)
test['power_bin'] = pd.cut(test['power'], bin, labels=False).fillna(31)
# Bucket the model code into 25 bins of width 10 with edges 0, 10, ..., 250.
bin_model = [i * 10 for i in range(0, 26)]
# include_lowest=True makes the first interval left-inclusive, so model == 0
# lands in bin 0 instead of silently becoming NaN.
train['model_bin'] = pd.cut(
    train['model'], bin_model, labels=False, include_lowest=True
)
test['model_bin'] = pd.cut(
    test['model'], bin_model, labels=False, include_lowest=True
)
0.0     25963
1.0     21123
2.0     18095
4.0     14872
3.0     11069
6.0      8748
7.0      5193
5.0      4629
8.0      3879
10.0     3818
11.0     3376
9.0      2550
12.0     2417
16.0     2096
15.0     1993
17.0     1699
13.0     1623
14.0     1162
18.0      988
19.0      860
21.0      771
20.0      640
22.0      473
23.0      171
24.0       29
Name: model_bin, dtype: int64
# Columns treated as categorical features.
col_clf = [
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'kilometer',
    'notRepairedDamage',
    'seller',
    'offerType',
]
brand bodyType fuelType gearbox kilometer notRepairedDamage seller offerType
0 6 1.0 0.0 0.0 12.5 0.0 0 0
1 1 2.0 0.0 0.0 15.0 0.0 0 0
2 15 1.0 0.0 0.0 12.5 0.0 0 0
3 10 0.0 0.0 1.0 15.0 0.0 0 0
4 5 1.0 0.0 0.0 5.0 0.0 0 0
... ... ... ... ... ... ... ... ...
149995 10 4.0 0.0 1.0 15.0 0.0 0 0
149996 11 0.0 0.0 0.0 10.0 0.0 0 0
149997 11 1.0 1.0 0.0 6.0 0.0 0 0
149998 10 3.0 1.0 0.0 15.0 0.0 0 0
149999 28 6.0 0.0 1.0 12.5 0.0 0 0

150000 rows × 8 columns

# Compare the value distribution of every categorical feature between the
# training set (yellow) and the test set (blue) on a 2x4 grid of subplots.
for i, col_name in enumerate(col_clf):
    plt.subplot(2, 4, i + 1)
    train[col_name].value_counts().plot(kind='bar', color='yellow')
    test[col_name].value_counts().plot(kind='bar', color='blue')
    plt.title(col_name)

# seller and offerType were observed to hold a single category only, so they
# are removed from both data sets and from the categorical column list.
_single_valued = ['seller', 'offerType']
train = train.drop(columns=_single_valued)
test = test.drop(columns=_single_valued)
col_clf = [
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'kilometer',
    'notRepairedDamage',
]
# Plot the mean price per category for every categorical feature on a 2x3
# grid of subplots.
for i, col_name in enumerate(col_clf):
    plt.subplot(2, 3, i + 1)
    train.groupby(col_name)['price'].mean().plot(kind='bar')
    plt.title(col_name)

# Remove the SaleID identifier column from both data sets.
train = train.drop(columns='SaleID')
test = test.drop(columns='SaleID')
# Correlation matrix of the features (input for a heat map).
# Restrict to numeric columns explicitly: the frame also holds the datetime
# columns regDate/creatDate, and newer pandas versions no longer drop
# non-numeric columns implicitly in DataFrame.corr().
corr = train.select_dtypes(include='number').corr()

# Replace each name by the number of rows sharing that name.
# transform('count') returns a result aligned with the original rows; assigning
# the aggregated frame instead would align its name-valued index against the
# row index and attach the counts to the wrong rows.
train['name_count'] = train.groupby('name')['brand'].transform('count')
test['name_count'] = test.groupby('name')['brand'].transform('count')
# The raw name column is no longer needed once name_count exists.
train = train.drop(columns='name')
test = test.drop(columns='name')
# For every listed categorical column, compute per-category price statistics
# on the training set and attach them to both data sets as new features.
col_clf = ['brand', 'model', 'kilometer', 'fuelType', 'bodyType']
for col in col_clf:
    all_info = {}
    for kind, prices in train.groupby(col)['price']:
        amount = len(prices)
        price_sum = prices.sum()
        all_info[kind] = {
            col + '_amount': amount,
            col + '_price_max': prices.max(),
            col + '_price_median': prices.median(),
            col + '_price_min': prices.min(),
            col + '_price_sum': price_sum,
            col + '_price_std': prices.std(),
            # The denominator is the group size plus one, as in the original.
            col + '_price_average': round(price_sum / (amount + 1), 2),
        }
    # One row per category, with the category value in a column named `col`.
    fe = pd.DataFrame(all_info).T.reset_index().rename(columns={'index': col})
    # merge() returns a new frame, so no defensive copy is needed.
    train = train.merge(fe, how='left', on=col)
    test = test.merge(fe, how='left', on=col)
print(train.shape)
print(test.shape)
(150000, 73)
(50000, 72)
# Per-kilometer statistics of power, computed on the training set and joined
# onto both data sets as new features.
col_kp = ['kilometer', 'power']
group_key, stat_col = col_kp
stat_prefix = group_key + '_' + stat_col + '_'
stat_funcs = ['count', 'max', 'median', 'min', 'sum', 'std', 'mean']
# Named aggregation (output_name=function) replaces the dict-renaming form of
# agg, which modern pandas rejects with "nested renamer is not supported".
t1 = (
    train.groupby(group_key)[stat_col]
    .agg(**{stat_prefix + func: func for func in stat_funcs})
    .reset_index()
)
train = train.merge(t1, how='left', on=group_key)
test = test.merge(t1, how='left', on=group_key)
print(train.shape)
print(test.shape)
(150000, 80)
(50000, 79)
# Build interaction features from the selected anonymous features v_0, v_3,
# v_8 and v_12: every ordered pair (including a feature with itself) is
# multiplied, added and subtracted, and each feature is multiplied by
# diff_year.
col_v = [0, 3, 8, 12]
pairwise_ops = (
    ('*', lambda left, right: left * right),
    ('+', lambda left, right: left + right),
    ('-', lambda left, right: left - right),
)
for symbol, combine in pairwise_ops:
    for i in col_v:
        for j in col_v:
            feature = str(i) + symbol + str(j)
            for frame in (train, test):
                frame[feature] = combine(frame['v_' + str(i)], frame['v_' + str(j)])
for i in col_v:
    feature = str(i) + '*diff_year'
    for frame in (train, test):
        frame[feature] = frame['v_' + str(i)] * frame['diff_year']
print(train.shape)
print(test.shape)
(150000, 132)
(50000, 131)
# Work on independent (deep) copies of the engineered frames.
train_new = train.copy()
test_new = test.copy()
# Drop the target and the columns that are not used as model inputs.
X_train = train_new.drop(columns=['price', 'regDate', 'creatDate', 'regionCode'])
X_test = test_new.drop(columns=['regDate', 'creatDate', 'regionCode'])
y_train = train_new['price']
# NOTE(review): the three objects built above are immediately replaced by the
# CSV files loaded below; presumably these files were exported by a step that
# is not shown here. Confirm before relying on either version.
X_train = pd.read_csv('X_train.csv', index_col=0)
X_test = pd.read_csv('X_test.csv', index_col=0)
# Flatten the single-column target frame into a 1-D array.
y_train = np.ravel(pd.read_csv('y_train.csv', index_col=0))
from sklearn.model_selection import GridSearchCV
from lightgbm.sklearn import LGBMRegressor
from time import time
import datetime
# LightGBM tuning: sweep n_estimators from 200 to 2000 in steps of 100 and
# record the mean 3-fold cross-validated negative MAE for each value.
lgbm_scores = []
time0 = time()
n_estimators_grid = np.arange(200, 2001, 100)
for i in n_estimators_grid:
    reg_lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=i, objective='regression_l1', random_state=42)
    lgbm_score = cross_val_score(reg_lgbm, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    lgbm_scores.append(lgbm_score)
    # Elapsed seconds since the start of the sweep, and the current score.
    print(time() - time0, lgbm_score)
# Best score (negative MAE, so larger is better) and the value that produced it.
print(max(lgbm_scores))
print(n_estimators_grid[np.argmax(lgbm_scores)])
plt.figure(figsize=(8,6))
plt.plot(n_estimators_grid, lgbm_scores)
8.246474504470825 -594.8424586944835
19.095320463180542 -568.7034240842332
32.98492646217346 -552.3891983733455
48.85633111000061 -542.3964373457885
67.14593052864075 -535.7482170534481
87.76364278793335 -529.8703107609151
111.7964539527893 -525.7724715224499
137.1214382648468 -522.2536456032711
164.56334352493286 -519.7242720183268
194.57858514785767 -517.0013123928143
226.66670727729797 -515.3243156345435
261.38421607017517 -513.8374464322388
298.36878204345703 -512.2929007437376
337.6414248943329 -511.0992392114774
379.26119804382324 -510.01377926737086
423.065954208374 -508.88194129037237
468.8004615306854 -507.9469756382571
516.7109439373016 -506.9312361319216
567.1637334823608 -505.91707028368864

# LightGBM tuning: exhaustive 3-fold grid search over max_depth and
# num_leaves, scored by negative MAE and run on all available cores.
parameters = {
    'max_depth': [4, 5, 6, 7],
    'num_leaves': np.arange(5, 100, 5),
}
reg_lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=2000, objective='regression_l1', random_state=42)
gs = GridSearchCV(reg_lgbm, param_grid=parameters, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
gs_model = gs.fit(X_train, y_train)
最优参数:{'max_depth': 7, 'num_leaves': 45}
最优模型:LGBMRegressor(max_depth=7, n_estimators=2000, num_leaves=45,objective='regression_l1', random_state=42)
# XGBoost tuning: sweep learning_rate over np.arange(0.05, 0.31, 0.05) and
# record the mean 3-fold cross-validated negative MAE for each value.
xgb_scores = []
time0 = time()
learning_rate_grid = np.arange(0.05, 0.31, 0.05)
for i in learning_rate_grid:
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    # Elapsed seconds since the start of the sweep.
    print(time() - time0)
print(max(xgb_scores))
plt.plot(learning_rate_grid, xgb_scores)

# XGBoost tuning: sweep max_depth from 5 to 11 and record the mean 3-fold
# cross-validated negative MAE for each value.
xgb_scores = []
time0 = time()
max_depth_grid = np.arange(5, 12, 1)
for i in max_depth_grid:
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    # Elapsed seconds since the start of the sweep, and the current score.
    print(time() - time0, xgb_score)
print(max(xgb_scores))
plt.plot(max_depth_grid, xgb_scores)
106.13332343101501 -581.7825425249935
233.66106414794922 -559.3930964745617
386.93016719818115 -545.8423732084185
577.3439819812775 -540.0337358052888
789.385425567627 -535.3663493749481
1027.0641367435455 -537.2171228026253
1293.055543422699 -540.2481429747703

# XGBoost tuning: sweep colsample_bytree over np.arange(0.4, 0.8, 0.1) and
# record the mean 3-fold cross-validated negative MAE for each value.
xgb_scores = []
time0 = time()
colsample_bytree_grid = np.arange(0.4, 0.8, 0.1)
for i in colsample_bytree_grid:
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    # Elapsed seconds since the start of the sweep, and the current score.
    print(time() - time0, xgb_score)
print(max(xgb_scores))
plt.plot(colsample_bytree_grid, xgb_scores)
101.24388527870178 -540.0945292119669
219.8485279083252 -536.1881194847441
358.25814485549927 -534.7100199133007
509.0920376777649 -534.7369636623599

# XGBoost tuning: sweep colsample_bylevel over 0.5, 0.6, ..., 1.0 and record
# the mean 3-fold cross-validated negative MAE for each value.
xgb_scores = []
time0 = time()
# linspace pins both end points. np.arange(0.5, 1.1, 0.1) produces a seventh
# value of about 1.1 because of floating-point rounding, which is not a valid
# sampling fraction and turned the last score into nan.
colsample_bylevel_grid = np.linspace(0.5, 1.0, 6)
for i in colsample_bylevel_grid:
    reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=0.6, colsample_bylevel=i)
    xgb_score = cross_val_score(reg_xgb, X_train, y_train, cv=3, scoring='neg_mean_absolute_error').mean()
    xgb_scores.append(xgb_score)
    # Elapsed seconds since the start of the sweep, and the current score.
    print(time() - time0, xgb_score)
print(max(xgb_scores))
plt.plot(colsample_bylevel_grid, xgb_scores)
82.85519623756409 -534.1242236725466
176.23594546318054 -535.4707890065283
279.6718213558197 -534.5832091972042
391.7265202999115 -533.988677477093
518.4175012111664 -533.3711266578522
656.7450432777405 -534.7100199133007
657.538763999939 nan

# 导入sklearn自带的模型融合库
from sklearn.ensemble import StackingRegressor
# Instantiate the two tuned base models and stack them; the stacked model is
# evaluated with 3-fold cross-validation, scored by negative MAE.
reg_lgbm = LGBMRegressor(max_depth=7, n_estimators=2000, num_leaves=45, objective='regression_l1', random_state=42)
reg_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.15, max_depth=9, colsample_bytree=0.6, colsample_bylevel=0.9)
estimators = [('lgbm', reg_lgbm), ('xgb', reg_xgb)]
sr = StackingRegressor(estimators, verbose=True)
sr_scores = cross_val_score(sr, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.8min finished
# 查看模型融合分数
array([-491.53751775, -494.7186037 , -486.69418657])
# Fit the stacked model on the full training data and predict the test set.
sr_predict = sr.fit(X_train, y_train).predict(X_test)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.1min finished


