导包

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython magic — only valid inside a notebook cell, not plain Python
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# Support-vector-machine regressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
import warnings

warnings.filterwarnings('ignore')

数据聚合

# Load the train / test splits, tag each row with its origin so the
# combined frame can be split apart again later, then stack them.
train_data = pd.read_csv('./zhengqi_train.txt', sep='\t')
test_data = pd.read_csv('./zhengqi_test.txt', sep='\t')
for frame, tag in ((train_data, "train"), (test_data, "test")):
    frame["origin"] = tag
data_all = pd.concat([train_data, test_data], axis=0, ignore_index=True)
# Display the combined data
data_all
V0 V1 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V2 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V4 V5 V6 V7 V8 V9 origin target
0 0.566 0.016 -0.940 -0.307 -0.073 0.550 -0.484 0.000 -1.707 -1.162 -0.573 -0.991 -0.143 0.610 -0.400 -0.063 0.356 0.800 -0.223 0.796 0.168 -0.450 0.136 0.407 0.109 -0.615 0.327 -4.627 -4.789 -5.101 -2.608 -3.508 0.452 -0.901 -1.812 -2.360 -0.436 -2.114 train 0.175
1 0.968 0.437 0.188 -0.455 -0.134 1.109 -0.488 0.000 -0.977 -1.162 -0.571 -0.836 0.066 0.588 -0.802 -0.063 0.357 0.801 -0.144 1.057 0.338 0.671 -0.128 0.566 0.124 0.032 0.600 -0.843 0.160 0.364 -0.335 -0.730 0.194 -0.893 -1.566 -2.360 0.332 -2.114 train 0.676
2 1.013 0.568 0.874 -0.051 -0.072 0.767 -0.493 -0.212 -0.618 -0.897 -0.564 -0.558 0.235 0.576 -0.477 -0.063 0.355 0.961 -0.067 0.915 0.326 1.287 -0.009 0.370 0.361 0.277 -0.116 -0.843 0.160 0.364 0.765 -0.589 0.112 -0.797 -1.367 -2.360 0.396 -2.114 train 0.633
3 0.733 0.368 0.011 0.102 -0.014 0.769 -0.371 -0.162 -0.429 -0.897 -0.574 -0.564 0.283 0.272 -0.491 -0.063 0.352 1.435 0.113 0.898 0.277 1.298 0.015 0.165 0.417 0.279 0.603 -0.843 -0.065 0.364 0.333 -0.112 0.599 -0.679 -1.200 -2.086 0.403 -2.114 train 0.206
4 0.684 0.638 -0.251 0.570 0.199 -0.349 -0.342 -0.138 -0.391 -0.897 -0.572 -0.394 0.260 0.106 0.309 -0.259 0.352 0.881 0.221 0.386 0.332 1.289 0.183 0.209 1.078 0.328 0.418 -0.843 -0.215 0.364 -0.280 -0.028 0.337 -0.454 -1.073 -2.086 0.314 -2.114 train 0.384
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4808 -1.362 -1.553 -2.551 0.518 0.396 0.928 1.452 0.867 -5.143 1.227 -3.573 0.107 -3.096 -0.088 0.227 2.953 -1.538 -0.630 -3.072 -1.120 -1.674 0.525 0.171 -0.444 -4.488 -5.793 -4.050 -1.187 -0.852 -2.131 -2.564 0.597 0.381 1.375 -4.854 -5.331 -4.074 -3.838 test NaN
4809 -2.698 -3.452 -2.525 0.311 -1.786 1.871 1.885 1.135 -5.774 1.227 -0.965 0.193 -3.620 -0.506 -0.574 3.149 -1.479 -0.204 -3.432 -2.101 -1.773 -0.446 1.297 -1.066 -0.613 -7.698 -0.674 -1.187 -0.852 -2.131 -2.564 1.215 -1.385 1.378 -4.927 -5.103 -4.393 -1.683 test NaN
4810 -2.615 -3.564 -2.529 -0.029 -1.151 1.976 2.337 0.504 -4.752 1.492 -1.568 0.301 -3.402 0.109 -0.541 3.511 -1.085 1.057 -2.409 0.477 -1.585 -0.447 0.552 -0.422 0.125 -6.111 0.275 -1.851 -1.548 -1.537 -2.544 1.612 -1.272 1.121 -4.223 -4.315 -5.196 -3.407 test NaN
4811 -2.661 -3.646 -2.560 -0.028 -1.512 1.520 2.243 0.206 -4.200 1.492 -1.282 -0.036 -3.271 -1.015 -0.203 3.511 -1.084 0.800 -2.339 0.050 -1.410 -0.447 0.318 -0.699 1.086 -5.268 0.683 -1.645 -1.471 -1.537 -2.549 1.431 -1.270 1.116 -3.716 -3.809 -4.735 -2.976 test NaN
4812 -2.321 -3.037 0.056 0.306 -1.154 0.847 2.221 0.206 -3.960 1.492 -1.213 0.592 -3.214 -1.502 0.153 3.609 -1.088 0.799 -2.339 -0.077 -1.242 -0.442 0.323 -1.594 -0.774 -5.211 1.618 -1.703 -1.471 -1.537 -1.123 1.988 -0.910 1.259 -3.616 -3.747 -4.368 -2.976 test NaN

4813 rows × 40 columns

特征探索

# Numeric feature columns: everything except the trailing
# 'origin' and 'target' columns (38 features in total).
data_all.columns[:-2]
Index(['V0', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17','V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26','V27', 'V28', 'V29', 'V3', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35','V36', 'V37', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9'],dtype='object')
# Of the 38 features, drop the unimportant ones.
# Plot each feature's train vs. test distribution; features whose
# distributions diverge between the two splits will be removed.
for column in data_all.columns[0:-2]:
    g = sns.kdeplot(data_all[column][(data_all["origin"] == "train")], color="Red", shade=True)
    g = sns.kdeplot(data_all[column][(data_all["origin"] == "test")], ax=g, color="Blue", shade=True)
    g.set_xlabel(column)
    g.set_ylabel("Frequency")
    g = g.legend(["train", "test"])
    plt.show()

# One distribution plot per feature, faceted by origin (train/test).
fig = plt.figure(figsize=(10, 10))
for i in range(len(data_all.columns) - 2):
    g = sns.FacetGrid(data_all, col='origin')
    g = g.map(sns.distplot, data_all.columns[i])
<Figure size 720x720 with 0 Axes>

# The KDE plots show 'V11', 'V17', 'V22' and 'V5' fluctuate too much
# between train and test, so remove those columns.
cols_unstable = ['V11', 'V17', 'V22', 'V5']
data_all.drop(cols_unstable, axis=1, inplace=True)

相关性系数corr

# Visualize pairwise correlations on the raw training data.
plt.figure(figsize=(20, 16))  # figure width / height
mcorr = train_data.corr()  # correlation matrix of every pair of variables
# Boolean mask of the same shape as mcorr; `np.bool` was removed from
# modern NumPy, so use the builtin `bool` dtype instead.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True  # hide the upper triangle (above the diagonal)
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # matplotlib colormap
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # heatmap of pairwise similarity
plt.show()  # bug fix: original referenced plt.show without calling it
<function matplotlib.pyplot.show(*args, **kw)>

(图:训练特征两两相关性热力图 — pairwise correlation heatmap of the training features)

# Features whose |correlation| with the target is below 0.1:
# ['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34']
weak = mcorr.loc['target'].abs() < 0.1
drop_labels = mcorr.loc['target'][weak].index
# After inspecting the distributions, only the worst two are dropped.
drop_labels = ['V14', 'V21']
data_all.drop(drop_labels, axis=1, inplace=True)
# Six feature columns removed in total so far (4 earlier + 2 here).
data_all.shape
(4813, 34)

对数据进行归一化

# MinMax-scale every numeric feature column (all columns except the
# trailing 'origin' and 'target').
features = data_all.iloc[:, :-2]
minmaxscale = MinMaxScaler()
data = minmaxscale.fit_transform(features)
# Inspect the scaled array
data
array([[0.77577505, 0.723449  , 0.22174265, ..., 0.43285165, 0.66410771,0.73528007],[0.83374189, 0.77878549, 0.37388724, ..., 0.43285165, 0.7548128 ,0.73528007],[0.84023071, 0.79600421, 0.46641489, ..., 0.43285165, 0.76237156,0.73528007],...,[0.31708724, 0.25289169, 0.0074184 , ..., 0.17367095, 0.10192512,0.64706284],[0.31045422, 0.24211356, 0.00323712, ..., 0.24075302, 0.1563718 ,0.67646858],[0.35948089, 0.32216088, 0.35608309, ..., 0.24897256, 0.19971655,0.67646858]])
# Wrap the scaled ndarray back into a DataFrame, restoring the
# original feature column names.
data_all_norm = pd.DataFrame(data, columns=data_all.columns[:-2])
data_all_norm
V0 V1 V10 V12 V13 V15 V16 V18 V19 V2 V20 V23 V24 V25 V26 V27 V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V4 V6 V7 V8 V9
0 0.775775 0.723449 0.221743 0.570828 0.694786 0.402245 0.487950 0.375125 0.380238 0.582197 0.537946 0.792169 0.569153 0.375250 0.730736 0.902936 0.279341 0.406834 0.665193 0.603714 0.729379 0.679479 0.000000 0.000000 0.242424 0.000000 0.018343 0.571839 0.508616 0.432852 0.664108 0.735280
1 0.833742 0.778785 0.373887 0.564418 0.778544 0.402245 0.569779 0.375374 0.401962 0.611588 0.534996 0.792304 0.569419 0.381824 0.762915 0.924734 0.437095 0.371596 0.689434 0.605676 0.796005 0.721792 0.374950 0.499949 0.800020 0.289702 0.436025 0.544381 0.541225 0.432852 0.754813 0.735280
2 0.840231 0.796004 0.466415 0.570933 0.727300 0.372870 0.610021 0.376246 0.440925 0.635354 0.533387 0.792035 0.611893 0.388232 0.745407 0.923195 0.523783 0.387480 0.659552 0.636673 0.821234 0.610818 0.374950 0.499949 0.800020 0.429901 0.457224 0.535653 0.567603 0.432852 0.762372 0.735280
3 0.799856 0.769716 0.350013 0.577028 0.727600 0.379798 0.631207 0.375000 0.440084 0.642104 0.492625 0.791633 0.737722 0.403212 0.743312 0.916912 0.525331 0.390683 0.628297 0.643997 0.821440 0.722257 0.374950 0.477220 0.800020 0.374841 0.528943 0.587484 0.589740 0.469177 0.763198 0.735280
4 0.792790 0.805205 0.314675 0.599412 0.560084 0.383123 0.635467 0.375249 0.463910 0.638869 0.470367 0.791633 0.590656 0.412200 0.680187 0.923965 0.524064 0.413107 0.635005 0.730447 0.826485 0.693583 0.374950 0.462067 0.800020 0.296712 0.541573 0.559600 0.606575 0.469177 0.752687 0.735280
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4808 0.497765 0.517219 0.004451 0.620113 0.751423 0.522378 0.102791 0.001246 0.534128 0.166924 0.444355 0.538214 0.189541 0.138149 0.494514 0.666752 0.416549 0.411506 0.535447 0.002485 0.196169 0.001085 0.340864 0.397717 0.545455 0.005608 0.635544 0.564283 0.105382 0.038977 0.234440 0.617657
4809 0.305119 0.267613 0.007958 0.390815 0.892718 0.559512 0.032059 0.326271 0.546181 0.093236 0.388308 0.546125 0.302628 0.108189 0.373567 0.654058 0.279904 0.561799 0.440616 0.509286 0.000000 0.524334 0.340864 0.397717 0.545455 0.005608 0.728462 0.376330 0.095705 0.069203 0.196764 0.764686
4810 0.317087 0.252892 0.007418 0.457545 0.908451 0.472080 0.146620 0.251122 0.561317 0.123893 0.470770 0.598954 0.637377 0.193326 0.691407 0.678164 0.279764 0.462360 0.538802 0.605807 0.163423 0.671420 0.275069 0.327407 0.606061 0.008157 0.788152 0.388357 0.189024 0.173671 0.101925 0.647063
4811 0.310454 0.242114 0.003237 0.419609 0.840126 0.430788 0.208497 0.286765 0.514085 0.142315 0.320059 0.599088 0.569153 0.199151 0.638762 0.700603 0.279764 0.431127 0.496570 0.731494 0.250232 0.734656 0.295482 0.335185 0.606061 0.007520 0.760938 0.388570 0.256230 0.240753 0.156372 0.676469
4812 0.359481 0.322161 0.356083 0.457230 0.739287 0.430788 0.235400 0.295364 0.602102 0.150330 0.254760 0.598552 0.568888 0.199151 0.623104 0.722144 0.280467 0.431794 0.360116 0.488229 0.256101 0.879572 0.289734 0.335185 0.606061 0.189268 0.844685 0.426884 0.269486 0.248973 0.199717 0.676469

4813 rows × 32 columns

# Re-attach the 'origin' and 'target' columns by joining on the index.
data_all_norm = pd.merge(data_all_norm, data_all.iloc[:, -2:], left_index=True, right_index=True)
# Summary statistics of the normalized data
data_all_norm.describe()
V0 V1 V10 V12 V13 V15 V16 V18 V19 V2 V20 V23 V24 V25 V26 V27 V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V4 V6 V7 V8 V9 target
count 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 2888.000000
mean 0.694172 0.721357 0.348518 0.578507 0.612372 0.402251 0.679294 0.446542 0.519158 0.602300 0.456147 0.744438 0.356712 0.393796 0.632582 0.881401 0.342653 0.388683 0.603139 0.589459 0.792709 0.628824 0.458493 0.483790 0.762873 0.332385 0.545795 0.523743 0.748823 0.745740 0.715607 0.879536 0.126353
std 0.144198 0.131443 0.134882 0.105088 0.149835 0.138561 0.112095 0.124627 0.140166 0.140628 0.134083 0.134085 0.265512 0.083226 0.123294 0.128221 0.140731 0.133475 0.152462 0.130786 0.102976 0.155003 0.099095 0.101020 0.102037 0.127456 0.150356 0.106430 0.132560 0.132577 0.118105 0.068244 0.983966
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -3.044000
25% 0.626676 0.679416 0.284327 0.532892 0.519928 0.299016 0.629414 0.399302 0.414436 0.514414 0.370475 0.719362 0.040616 0.347870 0.566515 0.888575 0.278778 0.292445 0.503888 0.550092 0.761816 0.562461 0.409037 0.454490 0.727273 0.270584 0.445647 0.478182 0.683324 0.696938 0.664934 0.852903 -0.350250
50% 0.729488 0.752497 0.366469 0.591635 0.627809 0.391437 0.700258 0.456256 0.540294 0.617072 0.447305 0.788817 0.381736 0.388815 0.641228 0.916015 0.279904 0.375734 0.614270 0.594428 0.815055 0.643056 0.454518 0.499949 0.800020 0.347056 0.539317 0.535866 0.774125 0.771974 0.742884 0.882377 0.313000
75% 0.790195 0.799553 0.432965 0.641971 0.719958 0.489954 0.753279 0.501745 0.623125 0.700464 0.522660 0.792706 0.574728 0.427597 0.713599 0.932555 0.413031 0.471837 0.710474 0.650798 0.852229 0.719777 0.500000 0.511365 0.800020 0.414861 0.643061 0.585036 0.842259 0.836405 0.790835 0.941189 0.793250
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 2.538000
def scale_minmax(data):return (data - data.min())/(data.max() - data.min())
# Use Box-Cox to smooth continuous features toward a normal distribution;
# for the first 10 features, plot (per row): original distribution, Q-Q /
# skewness, scatter vs. target — then the same three after the transform.
from scipy import stats

fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for col in data_all_norm.columns[:10]:
    dat = data_all_norm[[col, 'target']].dropna()
    # Original distribution with a fitted normal curve.
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[col], fit=stats.norm)
    plt.title(col + ' Original')
    plt.xlabel('')
    # Q-Q plot; skewness measures departure from normality.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[col], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')
    # Scatter of the raw feature against the target.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.scatter(dat[col], dat['target'], alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))
    # --- same three plots after the Box-Cox transform ---
    # Transformed distribution (shifted by +1 so all values are positive).
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(col + ' Tramsformed')
    plt.xlabel('')
    # Q-Q / skewness after the transform.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')
    # Scatter of the transformed feature against the target.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))

# Apply the Box-Cox transform to every feature column.
# A common transformation in statistical modeling:
# it makes the data more normal / standardized.
for col in data_all_norm.columns[:-2]:
    # Shift by +1 because Box-Cox requires strictly positive input.
    boxcox, maxlog = stats.boxcox(data_all_norm[col] + 1)
    data_all_norm[col] = scale_minmax(boxcox)
data_all_norm
V0 V1 V10 V12 V13 V15 V16 V18 V19 V2 V20 V23 V24 V25 V26 V27 V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V4 V6 V7 V8 V9 origin target
0 0.507483 0.357070 0.134959 0.303471 0.561751 0.539735 0.136013 0.239798 0.272914 0.442658 0.694629 0.425929 0.592470 0.626176 0.552721 0.394651 0.377657 0.559002 0.581357 0.323667 0.267157 0.440715 0.000000 0.000000 0.026476 0.000000 0.020896 0.353680 0.165759 0.094056 0.304061 0.253539 train 0.175
1 0.610419 0.445015 0.253597 0.297055 0.668704 0.539735 0.197424 0.240004 0.292263 0.474668 0.692110 0.426178 0.592730 0.632878 0.597390 0.488267 0.547131 0.522334 0.608782 0.325784 0.376810 0.496931 0.355631 0.432280 0.467466 0.175457 0.466089 0.325746 0.190998 0.094056 0.430326 0.253539 train 0.676
2 0.622895 0.475812 0.336900 0.303577 0.602125 0.509062 0.234823 0.240729 0.328031 0.501306 0.690732 0.425680 0.634207 0.639320 0.572815 0.481023 0.630446 0.539067 0.575040 0.360554 0.427728 0.359039 0.355631 0.432280 0.467466 0.290367 0.487365 0.317154 0.213520 0.094056 0.442528 0.253539 train 0.633
3 0.548433 0.429476 0.233498 0.309766 0.602504 0.516390 0.256694 0.239694 0.327245 0.508997 0.654865 0.424935 0.755301 0.654039 0.569916 0.452488 0.631879 0.542400 0.540475 0.369141 0.428167 0.497575 0.355631 0.409624 0.467466 0.242724 0.558547 0.370220 0.233980 0.113283 0.443879 0.253539 train 0.206
4 0.536158 0.492987 0.204775 0.333233 0.409379 0.519887 0.261284 0.239901 0.349778 0.505305 0.634468 0.424935 0.613508 0.662648 0.486902 0.484632 0.630707 0.565360 0.547831 0.482005 0.439062 0.458936 0.355631 0.394675 0.467466 0.180709 0.570959 0.341058 0.250548 0.113283 0.426944 0.253539 train 0.384
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4808 0.184452 0.144638 0.002294 0.356001 0.633015 0.655194 0.009837 0.000579 0.419213 0.092622 0.609861 0.127612 0.205965 0.302314 0.286901 0.033158 0.526439 0.563741 0.442151 0.000452 0.007696 0.000245 0.322227 0.332586 0.152596 0.002675 0.662217 0.345855 0.012300 0.002622 0.035277 0.128271 test NaN
4809 0.074357 0.036169 0.004113 0.155465 0.830440 0.687976 0.002518 0.200542 0.431591 0.048667 0.553814 0.132960 0.324364 0.246144 0.187745 0.028743 0.378309 0.702706 0.348424 0.232703 0.000000 0.271468 0.322227 0.332586 0.152596 0.002675 0.750654 0.182800 0.010893 0.005091 0.026747 0.298339 test NaN
4810 0.079352 0.032715 0.003833 0.203011 0.854218 0.608691 0.015871 0.145112 0.447329 0.066348 0.634842 0.173746 0.658944 0.395001 0.501062 0.037668 0.378146 0.613591 0.445590 0.325925 0.005566 0.430530 0.258323 0.267339 0.203097 0.003899 0.806584 0.191546 0.027408 0.017464 0.010723 0.152833 test NaN
4811 0.076558 0.030332 0.001667 0.175014 0.753601 0.568571 0.026862 0.170668 0.398930 0.077385 0.479359 0.173862 0.592470 0.404054 0.436769 0.048282 0.378146 0.583346 0.402911 0.483508 0.012412 0.514964 0.278067 0.274424 0.203097 0.003592 0.781167 0.191703 0.044201 0.029609 0.019058 0.181492 test NaN
4812 0.098781 0.051178 0.238555 0.202767 0.617374 0.568571 0.032709 0.177030 0.490804 0.082286 0.400802 0.173400 0.592210 0.404054 0.418681 0.061087 0.378960 0.584005 0.274131 0.215143 0.013031 0.751303 0.272500 0.274424 0.203097 0.105645 0.858952 0.221029 0.048101 0.031384 0.027365 0.181492 test NaN

4813 rows × 34 columns

过滤异常值

# Fit a cross-validated Ridge model on the training rows; rows whose
# prediction error exceeds one standard deviation of the target are
# flagged as outliers.
ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
# Ground-truth target values.
y_train = data_all_norm[cond]['target']
# The model can never fit the data 100% perfectly.
ridge.fit(X_train, y_train)
# Predictions always deviate somewhat from the truth; a very large
# deviation marks the row as an outlier.
y_ = ridge.predict(X_train)
cond = abs(y_ - y_train) > y_train.std()
print(cond.sum())
# Visualize: predictions vs. truth, residuals, residual histogram.
plt.figure(figsize=(12, 6))
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')
axes = plt.subplot(1, 3, 3)
(y_train - y_).plot.hist(bins=50, ax=axes)
(y_train - y_).loc[cond].plot.hist(bins=50, ax=axes, color='r')
40<matplotlib.axes._subplots.AxesSubplot at 0x2403c0836a0>

# Remove the flagged outlier rows, then rebuild the train / test splits.
index = cond[cond].index
data_all_norm.drop(index, axis=0, inplace=True)
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
y_train = data_all_norm[cond]['target']
cond = data_all_norm['origin'] == 'test'
X_test = data_all_norm[cond].iloc[:, :-2]

使用不同算法进行计算,最后求取平均值!!

# Train several different regressors and average their predictions
# on the test set as the final answer.
estimators = {}
estimators['forest'] = RandomForestRegressor(n_estimators=300)
estimators['gbdt'] = GradientBoostingRegressor(n_estimators=300)
estimators['ada'] = AdaBoostRegressor(n_estimators=300)
estimators['extreme'] = ExtraTreesRegressor(n_estimators=300)
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor(n_estimators=300)
estimators['xgb'] = XGBRegressor(n_estimators=300)
# Collect every model's test predictions; the mean is the submission.
result = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    result.append(y_)
y_ = np.mean(result, axis=0)
pd.Series(y_).to_csv('./norm.txt', index=False)
[19:51:26] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

天池大赛之工业蒸汽处理(改进版 ---- 0.1235)相关推荐

  1. 工业视觉中的目标检测——兼谈天池大赛优胜方案

    点击我爱计算机视觉标星,更快获取CVML新技术 目标检测是计算机视觉领域研究最火热的方向,这从计算机视觉顶会CVPR相关论文数量就能看得出来. 目标检测应用范围广泛,上至卫星遥感,下至自动驾驶,大到鲸 ...

  2. 第六届全国工业互联网数据创新应用大赛:工业生产反应装置的建模预测【天池】

    赛题背景 在流程工业中,生产装置将不同原料经过物理或化学反应加工成高附加值产物.在化工领域,这个转变一般是由各类反应器负责完成的.反应装置通过复杂的一系列化学反应,把进料转化为一定浓度的目标产物.生产 ...

  3. 天池大赛-数智重庆比赛日志

    文章目录 基本环境与mmdetection配置与应用 1. 安装cuda和cudnn 2.anaconda安装+创建子环境 3.安装mmdetection mmd论文翻译 coco数据集以及标注方式 ...

  4. 【赠书】阿里云天池大赛赛题解析,深度学习篇!

    ‍‍ 阿里云天池作为国内知名的竞赛平台和AI社区,自诞生以来就一直秉持着让更多人公平获得大数据的理念.也正因此,天池每场经典赛事沉淀的课题和数据集都会永久保留和开放.截至目前,天池平台已举办了超过20 ...

  5. 阿里云天池大赛赛题解析――深度学习篇

    作者:天池平台 出版社:电子工业出版社 品牌:电子工业出版社 出版时间:2021-09-01 阿里云天池大赛赛题解析――深度学习篇

  6. 【读书向】阿里云天池大赛赛题解析——可视化

    [读书向]阿里云天池大赛赛题解析--可视化 目录 [读书向]阿里云天池大赛赛题解析--可视化 箱型图 获取异常数据的函数 直方图和Q-Q图 KDE分布图 线性回归图 特征变量的相关性 Box-Cox变 ...

  7. 【啃书】【阿里云天池大赛赛题解析】目录

    算法与业务结合的开发步骤:业务理解->数据探索->特征工程->模型训练->模型验证->特征优化->模型融和.其中蕴含着模型的重构与参数的优化. 实际业务场景应用机器 ...

  8. 阿里云天池大赛赛题(机器学习)——天猫用户重复购买预测(完整代码)

    目录 赛题背景 全代码 导入包 读取数据(训练数据前10000行,测试数据前100条) 读取全部数据 获取训练和测试数据 切分40%数据用于线下验证 交叉验证:评估估算器性能 F1验证 Shuffle ...

  9. 【读书向】阿里云天池大赛赛题解析——模型部分

    [读书向]阿里云天池大赛赛题解析--模型部分 目录 [读书向]阿里云天池大赛赛题解析--模型部分 回归 线性回归调用方法 K近邻回归调用方法 决策树回归调用方法 随机森林回归 封装 Pipeline ...

最新文章

  1. Python为图像添加文本内容(Writing Text on Image)
  2. Use Asynchronous Apex
  3. webBrowser强制在本窗口打开,禁止在新窗口打开
  4. android 嵌套分组拖动_GitHub - Mosect/DragLayout: Android拖拽控件,支持上下左右滑动、折叠或者嵌套ListView、RecyclerView等...
  5. TCP/IP的基本工作原理
  6. 玩转oracle 11g(41):rman备份-00554或者04005
  7. JS组件系列——Bootstrap Table 表格行拖拽
  8. 【Unity游戏开发】Unity基础(一)面板介绍
  9. IOS如何使用第三方字体
  10. libjpeg、libpng使用方法
  11. java学习技术栈总结
  12. 分布式服务器中的数据安全问题及其解决办法
  13. 数据科学家的一天,是怎样度过的?
  14. 物联网设备分为哪几种,NB-IoT主要有什么优势?
  15. echarts 折现图和柱状图 样式修改 设置折线样式 背景(文末附带完整代码)
  16. 推荐一款简洁的浏览器标签页
  17. 如何在服务器右下角显示时间,win7 64位右下角怎么显示时间日期与星期?
  18. php- osc,PHP_OS的常见值 - osc_2ltf3y0p的个人空间 - OSCHINA - 中文开源技术交流社区
  19. 微信小程序阿里云服务器https搭建
  20. 【图像修复】----论文阅读笔记《EdgeConnect: Generative Image Inpainting with Adversarial Edge Learning》

热门文章

  1. css中table标签详解(一)
  2. 教你批量查询韵达快递物流并分析出包含提前签收的单号
  3. quicksql安装与调试
  4. 第十章 软件项目收尾和验收
  5. //对中文键盘输入英文 (去除中文系统自带的UTF8编码)
  6. AI数字人提供线上线下场景应用,世优BOTA构建车企虚拟人新体验
  7. HDOJ 3783 ZOJ
  8. python爬取网易云歌曲高清封面
  9. 编译原理-实验二-LL(1)语法分析程序的设计
  10. 测试百公里加速软件,YYP说透百公里加速:专业测试方法最详细剖析