集成学习-xgboost学习
XGboost相关学习
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
data = load_boston()
#波士顿数据集非常简单,但它所涉及到的问题却很多
data
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,4.9800e+00],[2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,9.1400e+00],[2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,4.0300e+00],...,[6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,5.6400e+00],[1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,6.4800e+00],[4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,7.8800e+00]]),'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. 
, 20.1,22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3, 8.8,7.2, 10.5, 7.4, 10.2, 11.5, 15.1, 23.2, 9.7, 13.8, 12.7, 13.1,12.5, 8.5, 5. , 6.3, 5.6, 7.2, 12.1, 8.3, 8.5, 5. , 11.9,27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3, 7. , 7.2, 7.5, 10.4,8.8, 8.4, 16.7, 14.2, 20.8, 13.4, 11.7, 8.3, 10.2, 10.9, 11. ,9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4, 9.6, 8.7, 8.4, 12.8,10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,20.6, 21.2, 19.1, 20.6, 15.2, 7. , 8.1, 13.6, 20.1, 21.8, 24.5,23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9]),'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD','TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:** \n\n :Number of Instances: 506 \n\n :Number of Attributes: 13 numeric/categorical predictive. 
Median Value (attribute 14) is usually the target.\n\n :Attribute Information (in order):\n - CRIM per capita crime rate by town\n - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n - INDUS proportion of non-retail business acres per town\n - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n - NOX nitric oxides concentration (parts per 10 million)\n - RM average number of rooms per dwelling\n - AGE proportion of owner-occupied units built prior to 1940\n - DIS weighted distances to five Boston employment centres\n - RAD index of accessibility to radial highways\n - TAX full-value property-tax rate per $10,000\n - PTRATIO pupil-teacher ratio by town\n - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n - LSTAT % lower status of the population\n - MEDV Median value of owner-occupied homes in $1000's\n\n :Missing Attribute Values: None\n\n :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980. N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems. \n \n.. topic:: References\n\n - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. 
Morgan Kaufmann.\n",'filename': 'f:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
X = data.data
y = data.target
X.shape
(506, 13)
y.shape
(506,)
y
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. 
, 22.2, 19.3, 22.6,19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3, 8.8,7.2, 10.5, 7.4, 10.2, 11.5, 15.1, 23.2, 9.7, 13.8, 12.7, 13.1,12.5, 8.5, 5. , 6.3, 5.6, 7.2, 12.1, 8.3, 8.5, 5. , 11.9,27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3, 7. , 7.2, 7.5, 10.4,8.8, 8.4, 16.7, 14.2, 20.8, 13.4, 11.7, 8.3, 10.2, 10.9, 11. ,9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4, 9.6, 8.7, 8.4, 12.8,10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,20.6, 21.2, 19.1, 20.6, 15.2, 7. , 8.1, 13.6, 20.1, 21.8, 24.5,23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain,Ytrain) # train
reg.predict(Xtest) # conventional sklearn-style predict interface
reg.score(Xtest,Ytest) # which evaluation metric does this return? Shift+Tab in the notebook shows it: R^2
y.mean()
MSE(Ytest,reg.predict(Xtest))# author's note: the MSE is roughly 1/3 of the mean y.mean(), neither a good nor a bad result
reg.feature_importances_ # one advantage of tree models: feature importance scores are available, so embedded selection (SelectFromModel) can be used
# xgboost can be used with embedded feature selection
reg = XGBR(n_estimators=100) # cross-validation takes a fresh, unfitted model
CVS(reg,Xtrain,Ytrain,cv=5).mean()
# Which metric is returned here? The same one as reg.score: R^2 for regression, accuracy for classification
0.8017863029875325
#严谨的交叉验证与不严谨的交叉验证之间的讨论:训练集 or 全数据?
array([0.83340801, 0.77096033, 0.83473392, 0.80424149, 0.76558778])
#严谨 vs 不严谨
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
-16.041115480238048
#来查看一下sklearn中所有的模型评估指标
import sklearn
sorted(sklearn.metrics.SCORERS.keys())
['accuracy','adjusted_mutual_info_score','adjusted_rand_score','average_precision','balanced_accuracy','brier_score_loss','completeness_score','explained_variance','f1','f1_macro','f1_micro','f1_samples','f1_weighted','fowlkes_mallows_score','homogeneity_score','mutual_info_score','neg_log_loss','neg_mean_absolute_error','neg_mean_squared_error','neg_mean_squared_log_error','neg_median_absolute_error','normalized_mutual_info_score','precision','precision_macro','precision_micro','precision_samples','precision_weighted','r2','recall','recall_macro','recall_micro','recall_samples','recall_weighted','roc_auc','v_measure_score']
#使用随机森林和线性回归进行一个对比
rfr = RFR(n_estimators=100)
CVS(rfr,Xtrain,Ytrain,cv=5).mean()#0.7975497480638329
0.7975497480638329
CVS(rfr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-16.998723616338033
-16.998723616338033
lr = LinearR()
CVS(lr,Xtrain,Ytrain,cv=5).mean()#0.6835070597278085
0.6835070597278085
CVS(lr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-25.34950749364844
-25.34950749364844
# About the `silent` parameter: when the data is huge and training is expected to be very slow, the training log it controls can be used to monitor progress
reg = XGBR(n_estimators=10,silent=True)# author's note: in the native xgboost library silent=True suppresses the training log and only returns the result; the default False prints the log
# author's note: the sklearn wrapper of xgboost defaults to silent=True (no training log); set it to False manually to print the log
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-92.67865836936579
-92.67865836936579
def plot_learning_curve(estimator, title, X, y,
                        ax=None,      # axes (subplot) to draw on
                        ylim=None,    # (low, high) limits for the y axis
                        cv=None,      # cross-validation strategy
                        n_jobs=None   # number of parallel jobs to use
                        ):
    """Plot the training and cross-validation score against training-set size.

    Parameters
    ----------
    estimator : unfitted estimator passed straight to ``learning_curve``.
    title : str, title of the plot.
    X, y : feature matrix and target passed to ``learning_curve``.
    ax : axes to draw on; when ``None`` the current axes (``plt.gca()``)
        is used.
    ylim : optional pair unpacked into ``ax.set_ylim``.
    cv : cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : parallelism forwarded to ``learning_curve``.

    Returns
    -------
    The axes the curves were drawn on.
    """
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y,
        shuffle=True,
        cv=cv,
        random_state=420,
        n_jobs=n_jobs,
    )

    # Only fall back to the current axes when the caller supplied none.
    # The previous version replaced a caller-supplied axes with
    # ``plt.figure()``, and a Figure has no set_title/set_xlabel/plot API.
    if ax is None:
        ax = plt.gca()

    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid()  # grid lines are optional
    # Average the per-fold scores for every training-set size.
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-',
            color="r", label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-',
            color="g", label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=5, shuffle = True, random_state=42) #交叉验证模式
plot_learning_curve(XGBR(n_estimators=100,random_state=420),"XGB",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-xpQh3MDR-1619417048279)(output_28_0.png)]
#=====【TIME WARNING:25 seconds】=====#
# Sweep n_estimators and record the mean cross-validation score for each value.
# The axisx assignment used to be fused into the comment above and never ran.
axisx = range(10,1010,50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-6da4aVib-1619417048281)(output_29_1.png)]
#选出来的n_estimators非常不寻常,我们是否要选择准确率最高的n_estimators值呢?
#======【TIME WARNING: 20s】=======#
axisx = range(50,1050,50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    # Compute each statistic once and reuse it below.
    mean_r2 = cvresult.mean()
    r2_var = cvresult.var()
    # Record the mean score (1 - bias)
    rs.append(mean_r2)
    # Record the variance across folds
    var.append(r2_var)
    # Controllable part of the generalization error: bias^2 + variance
    ge.append((1 - mean_r2)**2+r2_var)
#打印R2最高所对应的参数取值,并打印这个参数下的方差
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#打印方差最低时对应的参数取值,并打印这个参数下的R2
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#打印泛化误差可控部分的参数取值,并打印这个参数下的R2,方差以及泛化误差的可控部分
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-eJ1DzHzL-1619417048282)(output_31_1.png)]
axisx = range(100,300,10)
rs = []
var = []
ge = []
# For every candidate n_estimators keep the mean CV score, its variance
# across folds, and (1 - mean)^2 + variance.
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    mean_r2 = cvresult.mean()
    r2_var = cvresult.var()
    rs.append(mean_r2)
    var.append(r2_var)
    ge.append((1 - mean_r2)**2+r2_var)
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#添加方差线
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qD6ST091-1619417048283)(output_32_1.png)]
#看看泛化误差的可控部分如何?
plt.figure(figsize=(20,5))
plt.plot(axisx,ge,c="gray",linestyle='-.')
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pxH53h8Q-1619417048284)(output_33_0.png)]
#验证模型效果是否提高了?
time0 = time()
print(XGBR(n_estimators=100,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9197580267581366
0.0787498950958252
time0 = time()
print(XGBR(n_estimators=660,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9208745746309475
0.36807847023010254
time0 = time()
print(XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9231068620728082
0.12366437911987305
axisx = np.linspace(0,1,20)
rs = []
# Record the mean cross-validation score for every subsample ratio in axisx.
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="green",label="XGB")
plt.legend()
plt.show()
0.7368421052631579 0.837609040251761
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-7svP0syk-1619417048285)(output_37_1.png)]
#继续细化学习曲线
axisx = np.linspace(0.05,1,20)
rs = []
var = []
ge = []
# For every subsample ratio keep the mean CV score, its variance across
# folds, and (1 - mean)^2 + variance.
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    mean_r2 = cvresult.mean()
    r2_var = cvresult.var()
    rs.append(mean_r2)
    var.append(r2_var)
    ge.append((1 - mean_r2)**2+r2_var)
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.65 0.8302530801197368 0.008708816667924316
0.7999999999999999 0.8277414964661117 0.007159903723250457
0.7999999999999999 0.8277414964661117 0.007159903723250457 0.036832895762985055
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qvvhiCYL-1619417048286)(output_38_1.png)]
#细化学习曲线
axisx = np.linspace(0.75,1,25)
rs = []
var = []
ge = []
# Same bookkeeping as the coarser sweeps, on the narrowed subsample range.
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    mean_r2 = cvresult.mean()
    r2_var = cvresult.var()
    rs.append(mean_r2)
    var.append(r2_var)
    ge.append((1 - mean_r2)**2+r2_var)
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093 0.033300928468131166
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jULrwpAL-1619417048286)(output_39_1.png)]
# The commented-out argument used to swallow the rest of the call on this line.
# NOTE(review): the score recorded below (0.9159) differs from the score printed
# earlier for n_estimators=180, random_state=420 without subsample (0.9231), so
# subsample may have been active when the output was captured - confirm.
reg = XGBR(n_estimators=180
           # ,subsample=0.7708333333333334
           ,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)
0.9159462982185405
MSE(Ytest,reg.predict(Xtest))
7.821523502888769
#首先我们先来定义一个评分函数,这个评分函数能够帮助我们直接打印Xtrain上的交叉验证结果
def regassess(reg,Xtrain,Ytrain,cv,scoring = ("r2",),show=True):
    """Cross-validate ``reg`` on the training data once per scoring metric.

    Parameters
    ----------
    reg : unfitted estimator handed to the cross-validation helper ``CVS``.
    Xtrain, Ytrain : training features and targets.
    cv : cross-validation strategy forwarded to ``CVS``.
    scoring : iterable of scorer names; one cross-validation run per name.
    show : when true, print ``"<metric>:<mean score>"`` with two decimals.

    Returns
    -------
    list of the mean cross-validation score for each entry of ``scoring``,
    in the same order.
    """
    score = []
    for metric in scoring:
        # Run the cross-validation once and reuse the value for both the
        # printout and the returned list (it used to run twice when show=True).
        mean_score = CVS(reg,Xtrain,Ytrain,cv=cv,scoring=metric).mean()
        if show:
            print("{}:{:.2f}".format(metric, mean_score))
        score.append(mean_score)
    return score
reg = XGBR(n_estimators=180,random_state=420)
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
r2:0.80
neg_mean_squared_error:-13.48
[0.8038787848970184, -13.482301822063182]
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
[0.8038787848970184, -13.482301822063182]
from time import time
import datetime

# Cross-validate a few learning rates and print the scores and elapsed time.
for i in [0,0.2,0.5,1]:
    time0=time()
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    print("learning_rate = {}".format(i))
    regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
    # Format the elapsed seconds as minutes:seconds:microseconds.
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:561781

learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.32
00:01:848888

learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.24
00:01:541875

learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:499027
axisx = np.arange(0.05,1,0.05)
rs = []
te = []
# For every learning rate keep the mean CV R^2 on the training data (rs)
# and the R^2 on the held-out test set (te).
for i in axisx:
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    # Only the R^2 entry is used below, so the MSE cross-validation is skipped.
    score = regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2"],show=False)
    test = reg.fit(Xtrain,Ytrain).score(Xtest,Ytest)
    rs.append(score[0])
    te.append(test)
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,te,c="gray",label="test")
plt.plot(axisx,rs,c="green",label="train")
plt.legend()
plt.show()
0.55 0.8125604372670463
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Kt53b60m-1619417048287)(output_47_1.png)]
# Fit one model per booster type and print its score on the test set.
for booster in ["gbtree","gblinear","dart"]:
    reg = XGBR(n_estimators=180,learning_rate=0.1,random_state=420,booster=booster).fit(Xtrain,Ytrain)
    print(booster)
    print(reg.score(Xtest,Ytest))
gbtree
0.9231068620728082
gblinear
0.6286510307485139
dart
0.923106843149575
#默认reg:linear
reg = XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest, Ytest)
0.9231068620728082
MSE(Ytest,reg.predict(Xtest))
7.155205217161047
#xgb实现法
import xgboost as xgb
#使用类DMatrix读取数据
dtrain = xgb.DMatrix(Xtrain,Ytrain) #特征矩阵和标签都进行一个传入
dtest = xgb.DMatrix(Xtest,Ytest)
#非常遗憾无法打开来查看,所以通常都是先读到pandas里面查看之后再放到DMatrix中
dtrain
<xgboost.core.DMatrix at 0x2770de3bdd8>
import pandas as pd
pd.DataFrame(Xtrain)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.03041 | 0.0 | 5.19 | 0.0 | 0.515 | 5.895 | 59.6 | 5.6150 | 5.0 | 224.0 | 20.2 | 394.81 | 10.56 |
1 | 0.04113 | 25.0 | 4.86 | 0.0 | 0.426 | 6.727 | 33.5 | 5.4007 | 4.0 | 281.0 | 19.0 | 396.90 | 5.29 |
2 | 10.23300 | 0.0 | 18.10 | 0.0 | 0.614 | 6.185 | 96.7 | 2.1705 | 24.0 | 666.0 | 20.2 | 379.70 | 18.03 |
3 | 0.17142 | 0.0 | 6.91 | 0.0 | 0.448 | 5.682 | 33.8 | 5.1004 | 3.0 | 233.0 | 17.9 | 396.90 | 10.21 |
4 | 0.05059 | 0.0 | 4.49 | 0.0 | 0.449 | 6.389 | 48.0 | 4.7794 | 3.0 | 247.0 | 18.5 | 396.90 | 9.62 |
5 | 0.13587 | 0.0 | 10.59 | 1.0 | 0.489 | 6.064 | 59.1 | 4.2392 | 4.0 | 277.0 | 18.6 | 381.32 | 14.66 |
6 | 0.04981 | 21.0 | 5.64 | 0.0 | 0.439 | 5.998 | 21.4 | 6.8147 | 4.0 | 243.0 | 16.8 | 396.90 | 8.43 |
7 | 0.02543 | 55.0 | 3.78 | 0.0 | 0.484 | 6.696 | 56.4 | 5.7321 | 5.0 | 370.0 | 17.6 | 396.90 | 7.18 |
8 | 0.10793 | 0.0 | 8.56 | 0.0 | 0.520 | 6.195 | 54.4 | 2.7778 | 5.0 | 384.0 | 20.9 | 393.49 | 13.00 |
9 | 0.02498 | 0.0 | 1.89 | 0.0 | 0.518 | 6.540 | 59.7 | 6.2669 | 1.0 | 422.0 | 15.9 | 389.96 | 8.65 |
10 | 0.09299 | 0.0 | 25.65 | 0.0 | 0.581 | 5.961 | 92.9 | 2.0869 | 2.0 | 188.0 | 19.1 | 378.09 | 17.93 |
11 | 0.15876 | 0.0 | 10.81 | 0.0 | 0.413 | 5.961 | 17.5 | 5.2873 | 4.0 | 305.0 | 19.2 | 376.94 | 9.88 |
12 | 6.71772 | 0.0 | 18.10 | 0.0 | 0.713 | 6.749 | 92.6 | 2.3236 | 24.0 | 666.0 | 20.2 | 0.32 | 17.44 |
13 | 0.03768 | 80.0 | 1.52 | 0.0 | 0.404 | 7.274 | 38.3 | 7.3090 | 2.0 | 329.0 | 12.6 | 392.20 | 6.62 |
14 | 5.20177 | 0.0 | 18.10 | 1.0 | 0.770 | 6.127 | 83.4 | 2.7227 | 24.0 | 666.0 | 20.2 | 395.43 | 11.48 |
15 | 11.08740 | 0.0 | 18.10 | 0.0 | 0.718 | 6.411 | 100.0 | 1.8589 | 24.0 | 666.0 | 20.2 | 318.75 | 15.02 |
16 | 0.11432 | 0.0 | 8.56 | 0.0 | 0.520 | 6.781 | 71.3 | 2.8561 | 5.0 | 384.0 | 20.9 | 395.58 | 7.67 |
17 | 0.05602 | 0.0 | 2.46 | 0.0 | 0.488 | 7.831 | 53.6 | 3.1992 | 3.0 | 193.0 | 17.8 | 392.63 | 4.45 |
18 | 0.24103 | 0.0 | 7.38 | 0.0 | 0.493 | 6.083 | 43.7 | 5.4159 | 5.0 | 287.0 | 19.6 | 396.90 | 12.79 |
19 | 0.09378 | 12.5 | 7.87 | 0.0 | 0.524 | 5.889 | 39.0 | 5.4509 | 5.0 | 311.0 | 15.2 | 390.50 | 15.71 |
20 | 8.71675 | 0.0 | 18.10 | 0.0 | 0.693 | 6.471 | 98.8 | 1.7257 | 24.0 | 666.0 | 20.2 | 391.98 | 17.12 |
21 | 7.36711 | 0.0 | 18.10 | 0.0 | 0.679 | 6.193 | 78.1 | 1.9356 | 24.0 | 666.0 | 20.2 | 96.73 | 21.52 |
22 | 1.38799 | 0.0 | 8.14 | 0.0 | 0.538 | 5.950 | 82.0 | 3.9900 | 4.0 | 307.0 | 21.0 | 232.60 | 27.71 |
23 | 14.33370 | 0.0 | 18.10 | 0.0 | 0.614 | 6.229 | 88.0 | 1.9512 | 24.0 | 666.0 | 20.2 | 383.32 | 13.11 |
24 | 28.65580 | 0.0 | 18.10 | 0.0 | 0.597 | 5.155 | 100.0 | 1.5894 | 24.0 | 666.0 | 20.2 | 210.97 | 20.08 |
25 | 0.80271 | 0.0 | 8.14 | 0.0 | 0.538 | 5.456 | 36.6 | 3.7965 | 4.0 | 307.0 | 21.0 | 288.99 | 11.69 |
26 | 1.00245 | 0.0 | 8.14 | 0.0 | 0.538 | 6.674 | 87.3 | 4.2390 | 4.0 | 307.0 | 21.0 | 380.23 | 11.98 |
27 | 9.91655 | 0.0 | 18.10 | 0.0 | 0.693 | 5.852 | 77.8 | 1.5004 | 24.0 | 666.0 | 20.2 | 338.16 | 29.97 |
28 | 0.13158 | 0.0 | 10.01 | 0.0 | 0.547 | 6.176 | 72.5 | 2.7301 | 6.0 | 432.0 | 17.8 | 393.30 | 12.04 |
29 | 0.14231 | 0.0 | 10.01 | 0.0 | 0.547 | 6.254 | 84.2 | 2.2565 | 6.0 | 432.0 | 17.8 | 388.74 | 10.45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
324 | 0.13117 | 0.0 | 8.56 | 0.0 | 0.520 | 6.127 | 85.2 | 2.1224 | 5.0 | 384.0 | 20.9 | 387.69 | 14.09 |
325 | 1.35472 | 0.0 | 8.14 | 0.0 | 0.538 | 6.072 | 100.0 | 4.1750 | 4.0 | 307.0 | 21.0 | 376.73 | 13.04 |
326 | 0.10153 | 0.0 | 12.83 | 0.0 | 0.437 | 6.279 | 74.5 | 4.0522 | 5.0 | 398.0 | 18.7 | 373.66 | 11.97 |
327 | 0.22927 | 0.0 | 6.91 | 0.0 | 0.448 | 6.030 | 85.5 | 5.6894 | 3.0 | 233.0 | 17.9 | 392.74 | 18.80 |
328 | 0.04666 | 80.0 | 1.52 | 0.0 | 0.404 | 7.107 | 36.6 | 7.3090 | 2.0 | 329.0 | 12.6 | 354.31 | 8.61 |
329 | 0.08014 | 0.0 | 5.96 | 0.0 | 0.499 | 5.850 | 41.5 | 3.9342 | 5.0 | 279.0 | 19.2 | 396.90 | 8.77 |
330 | 0.40771 | 0.0 | 6.20 | 1.0 | 0.507 | 6.164 | 91.3 | 3.0480 | 8.0 | 307.0 | 17.4 | 395.24 | 21.46 |
331 | 0.13642 | 0.0 | 10.59 | 0.0 | 0.489 | 5.891 | 22.3 | 3.9454 | 4.0 | 277.0 | 18.6 | 396.90 | 10.87 |
332 | 9.32909 | 0.0 | 18.10 | 0.0 | 0.713 | 6.185 | 98.7 | 2.2616 | 24.0 | 666.0 | 20.2 | 396.90 | 18.13 |
333 | 0.09103 | 0.0 | 2.46 | 0.0 | 0.488 | 7.155 | 92.2 | 2.7006 | 3.0 | 193.0 | 17.8 | 394.12 | 4.82 |
334 | 0.01301 | 35.0 | 1.52 | 0.0 | 0.442 | 7.241 | 49.3 | 7.0379 | 1.0 | 284.0 | 15.5 | 394.74 | 5.49 |
335 | 0.59005 | 0.0 | 21.89 | 0.0 | 0.624 | 6.372 | 97.9 | 2.3274 | 4.0 | 437.0 | 21.2 | 385.76 | 11.12 |
336 | 1.12658 | 0.0 | 19.58 | 1.0 | 0.871 | 5.012 | 88.0 | 1.6102 | 5.0 | 403.0 | 14.7 | 343.28 | 12.12 |
337 | 0.07886 | 80.0 | 4.95 | 0.0 | 0.411 | 7.148 | 27.7 | 5.1167 | 4.0 | 245.0 | 19.2 | 396.90 | 3.56 |
338 | 0.21719 | 0.0 | 10.59 | 1.0 | 0.489 | 5.807 | 53.8 | 3.6526 | 4.0 | 277.0 | 18.6 | 390.94 | 16.03 |
339 | 0.53700 | 0.0 | 6.20 | 0.0 | 0.504 | 5.981 | 68.1 | 3.6715 | 8.0 | 307.0 | 17.4 | 378.35 | 11.65 |
340 | 3.32105 | 0.0 | 19.58 | 1.0 | 0.871 | 5.403 | 100.0 | 1.3216 | 5.0 | 403.0 | 14.7 | 396.90 | 26.82 |
341 | 1.49632 | 0.0 | 19.58 | 0.0 | 0.871 | 5.404 | 100.0 | 1.5916 | 5.0 | 403.0 | 14.7 | 341.60 | 13.28 |
342 | 0.38735 | 0.0 | 25.65 | 0.0 | 0.581 | 5.613 | 95.6 | 1.7572 | 2.0 | 188.0 | 19.1 | 359.29 | 27.26 |
343 | 0.06617 | 0.0 | 3.24 | 0.0 | 0.460 | 5.868 | 25.8 | 5.2146 | 4.0 | 430.0 | 16.9 | 382.44 | 9.97 |
344 | 0.78570 | 20.0 | 3.97 | 0.0 | 0.647 | 7.014 | 84.6 | 2.1329 | 5.0 | 264.0 | 13.0 | 384.07 | 14.79 |
345 | 1.41385 | 0.0 | 19.58 | 1.0 | 0.871 | 6.129 | 96.0 | 1.7494 | 5.0 | 403.0 | 14.7 | 321.02 | 15.12 |
346 | 0.06047 | 0.0 | 2.46 | 0.0 | 0.488 | 6.153 | 68.8 | 3.2797 | 3.0 | 193.0 | 17.8 | 387.11 | 13.15 |
347 | 8.49213 | 0.0 | 18.10 | 0.0 | 0.584 | 6.348 | 86.1 | 2.0527 | 24.0 | 666.0 | 20.2 | 83.45 | 17.64 |
348 | 0.17134 | 0.0 | 10.01 | 0.0 | 0.547 | 5.928 | 88.2 | 2.4631 | 6.0 | 432.0 | 17.8 | 344.91 | 15.76 |
349 | 0.03871 | 52.5 | 5.32 | 0.0 | 0.405 | 6.209 | 31.3 | 7.3172 | 6.0 | 293.0 | 16.6 | 396.90 | 7.14 |
350 | 0.12650 | 25.0 | 5.13 | 0.0 | 0.453 | 6.762 | 43.4 | 7.9809 | 8.0 | 284.0 | 19.7 | 395.58 | 9.50 |
351 | 6.96215 | 0.0 | 18.10 | 0.0 | 0.700 | 5.713 | 97.0 | 1.9265 | 24.0 | 666.0 | 20.2 | 394.43 | 17.11 |
352 | 0.09164 | 0.0 | 10.81 | 0.0 | 0.413 | 6.065 | 7.8 | 5.2873 | 4.0 | 305.0 | 19.2 | 390.91 | 5.52 |
353 | 5.58107 | 0.0 | 18.10 | 0.0 | 0.713 | 6.436 | 87.9 | 2.3158 | 24.0 | 666.0 | 20.2 | 100.19 | 16.22 |
354 rows × 13 columns
#写明参数
# The inline comment used to swallow the remaining keys and the closing brace.
param = {'silent':True # author's note: defaults to False, usually switched off manually
         ,'objective':'reg:linear'
         ,"eta":0.1}
num_round = 180 #n_estimators
#类train,可以直接导入的参数是训练数据,树的数量,其他参数都需要通过params来导入
bst = xgb.train(param, dtrain, num_round)
#接口predict
preds = bst.predict(dtest)
preds
array([ 6.4613175, 22.123888 , 30.755163 , 13.424351 , 8.378565 ,23.608477 , 14.2151165, 16.026499 , 15.498961 , 14.10649 ,24.030867 , 34.36362 , 21.461111 , 28.839497 , 19.568035 ,10.188658 , 19.42369 , 23.539951 , 22.850523 , 23.198708 ,17.82486 , 16.07219 , 27.602034 , 20.773046 , 20.868807 ,15.865789 , 22.076588 , 29.292158 , 22.841051 , 15.770392 ,36.680496 , 21.057947 , 20.137005 , 23.777853 , 22.70615 ,23.863268 , 15.595315 , 24.565872 , 17.720552 , 33.95111 ,18.784286 , 20.483374 , 37.10668 , 18.068268 , 12.73839 ,31.186407 , 45.895035 , 12.696718 , 10.773068 , 36.064293 ,26.262571 , 19.908836 , 20.715096 , 48.814903 , 27.550056 ,25.225826 , 17.15366 , 21.215551 , 17.426773 , 18.478971 ,14.6453705, 22.841473 , 18.869593 , 29.990978 , 29.933191 ,18.756853 , 18.784918 , 16.33361 , 23.155968 , 19.144344 ,29.724382 , 42.121906 , 31.544363 , 23.017508 , 19.536028 ,23.851992 , 41.790577 , 28.676506 , 20.036425 , 21.723856 ,19.537868 , 46.349495 , 23.119637 , 8.071444 , 26.358177 ,24.85706 , 17.057547 , 20.084204 , 18.54005 , 7.157663 ,20.593962 , 15.451031 , 45.09552 , 34.435097 , 22.969654 ,10.10335 , 10.803318 , 18.42058 , 7.800361 , 11.79309 ,30.755335 , 10.80648 , 26.122625 , 22.589502 , 31.219454 ,42.283318 , 19.274109 , 7.3861685, 23.055706 , 14.315018 ,45.136368 , 21.243176 , 19.715647 , 24.533583 , 18.24247 ,28.382742 , 23.41182 , 19.962458 , 45.916683 , 17.521889 ,24.13039 , 26.147182 , 18.418781 , 17.606575 , 14.540631 ,20.595512 , 32.59128 , 10.155618 , 20.53032 , 21.477484 ,17.450048 , 20.154486 , 8.010227 , 30.482618 , 29.677181 ,20.357098 , 18.222181 , 14.14504 , 10.100547 , 18.85027 ,41.85804 , 17.44544 , 22.907183 , 21.02398 , 29.799366 ,20.219465 , 12.404763 , 45.750965 , 25.56757 , 22.000706 ,14.194921 , 27.102774 ], dtype=float32)
from sklearn.metrics import r2_score
r2_score(Ytest,preds)
0.9260984298390122
MSE(Ytest,preds)
6.87682821415069
import xgboost as xgb

# For convenience, use the full data set
dfull = xgb.DMatrix(X,y)
# Set the parameters. The objective must be passed under the key 'objective';
# 'obj' is not a booster parameter name, so the setting was not applied.
param1 = {'silent':True,'objective':'reg:linear',"gamma":0}
num_round = 100
n_fold=5 # counterpart of sklearn's KFold
# Run xgb.cv and time it
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:610364
#看看类xgb.cv生成了什么结果?
cvresult1 #随着树不断增加,我们的模型的效果如何变化
train-rmse-mean | train-rmse-std | test-rmse-mean | test-rmse-std | |
---|---|---|---|---|
0 | 17.105578 | 0.129116 | 17.163215 | 0.584296 |
1 | 12.337973 | 0.097557 | 12.519736 | 0.473458 |
2 | 8.994071 | 0.065756 | 9.404534 | 0.472310 |
3 | 6.629481 | 0.050323 | 7.250335 | 0.500342 |
4 | 4.954406 | 0.033209 | 5.920812 | 0.591874 |
5 | 3.781454 | 0.029604 | 5.045190 | 0.687971 |
6 | 2.947767 | 0.038786 | 4.472030 | 0.686492 |
7 | 2.357748 | 0.042040 | 4.179314 | 0.737935 |
8 | 1.951907 | 0.044972 | 3.979878 | 0.798198 |
9 | 1.660895 | 0.044894 | 3.870751 | 0.812331 |
10 | 1.464296 | 0.049422 | 3.816196 | 0.835251 |
11 | 1.323362 | 0.056240 | 3.788125 | 0.841643 |
12 | 1.214468 | 0.046524 | 3.766973 | 0.848989 |
13 | 1.137311 | 0.044522 | 3.741199 | 0.872370 |
14 | 1.064629 | 0.042245 | 3.729194 | 0.879429 |
15 | 1.010286 | 0.038892 | 3.717997 | 0.879572 |
16 | 0.941258 | 0.038360 | 3.706736 | 0.878032 |
17 | 0.883599 | 0.056640 | 3.693886 | 0.873913 |
18 | 0.829674 | 0.057284 | 3.693296 | 0.883429 |
19 | 0.772332 | 0.042899 | 3.687510 | 0.880928 |
20 | 0.731557 | 0.049150 | 3.687037 | 0.879180 |
21 | 0.690698 | 0.041190 | 3.677507 | 0.882060 |
22 | 0.657743 | 0.042137 | 3.675343 | 0.883635 |
23 | 0.619988 | 0.054097 | 3.671006 | 0.879224 |
24 | 0.585414 | 0.052585 | 3.670951 | 0.867470 |
25 | 0.548723 | 0.054440 | 3.673598 | 0.863241 |
26 | 0.527266 | 0.049630 | 3.673988 | 0.867116 |
27 | 0.504405 | 0.040376 | 3.671702 | 0.864566 |
28 | 0.468534 | 0.033020 | 3.671324 | 0.862536 |
29 | 0.448633 | 0.032191 | 3.675074 | 0.864713 |
... | ... | ... | ... | ... |
70 | 0.071057 | 0.015411 | 3.668067 | 0.859435 |
71 | 0.067946 | 0.013960 | 3.667708 | 0.859370 |
72 | 0.065197 | 0.012475 | 3.668174 | 0.859307 |
73 | 0.062789 | 0.012538 | 3.668738 | 0.859471 |
74 | 0.060294 | 0.012669 | 3.668950 | 0.860112 |
75 | 0.058278 | 0.012055 | 3.669084 | 0.859966 |
76 | 0.055402 | 0.011065 | 3.669627 | 0.859505 |
77 | 0.053819 | 0.011072 | 3.669904 | 0.859294 |
78 | 0.051280 | 0.011215 | 3.670185 | 0.859204 |
79 | 0.048748 | 0.009988 | 3.670092 | 0.859250 |
80 | 0.046972 | 0.009233 | 3.669869 | 0.858892 |
81 | 0.044753 | 0.008664 | 3.669702 | 0.858676 |
82 | 0.043148 | 0.008636 | 3.669704 | 0.858921 |
83 | 0.041823 | 0.008355 | 3.669596 | 0.858843 |
84 | 0.040257 | 0.008378 | 3.669730 | 0.858459 |
85 | 0.038518 | 0.007731 | 3.669835 | 0.858698 |
86 | 0.036694 | 0.006928 | 3.669705 | 0.858958 |
87 | 0.034932 | 0.006174 | 3.669722 | 0.858715 |
88 | 0.033947 | 0.006206 | 3.669964 | 0.858547 |
89 | 0.032706 | 0.006176 | 3.669988 | 0.858516 |
90 | 0.031317 | 0.006171 | 3.670116 | 0.858512 |
91 | 0.029697 | 0.005473 | 3.669930 | 0.858759 |
92 | 0.028561 | 0.005599 | 3.669906 | 0.858549 |
93 | 0.027585 | 0.005694 | 3.669822 | 0.858554 |
94 | 0.026436 | 0.005414 | 3.669985 | 0.858390 |
95 | 0.025204 | 0.005145 | 3.669921 | 0.858313 |
96 | 0.024422 | 0.005242 | 3.669983 | 0.858255 |
97 | 0.023661 | 0.005117 | 3.669947 | 0.858331 |
98 | 0.022562 | 0.004704 | 3.669868 | 0.858578 |
99 | 0.021496 | 0.004738 | 3.669824 | 0.858305 |
100 rows × 4 columns
# Draw the cross-validation curves for gamma=0 over 100 boosting rounds:
# column 0 of cvresult1 is plotted as the train curve, column 2 as the test curve.
rounds = range(1,101)
curves = [
    (0, "red", "train,gamma=0"),
    (2, "orange", "test,gamma=0"),
]
plt.figure(figsize=(20,5))
plt.grid()
for col, color, label in curves:
    plt.plot(rounds, cvresult1.iloc[:,col], c=color, label=label)
plt.legend()
plt.show()
# What can we read off this plot?
# How can the model's generalization ability be observed from the plot?
# From the point of view of this plot, what is the goal of parameter tuning?
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-OVs9GHP4-1619417048288)(output_66_0.png)]
# What is the default evaluation metric for regression models in xgboost?
# Here the metric is set explicitly to mean absolute error ("mae").
# NOTE(review): the documented parameter name is 'objective', not 'obj'; the key is
# left unchanged here so results stay identical -- confirm whether it has any effect.
param1 = {'silent':True,'obj':'reg:linear',"gamma":0,"eval_metric":"mae"}
# Cross-validate with the num_round / n_fold values already in scope.
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)

# Plot column 0 (train curve) and column 2 (test curve) of the CV result.
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,181),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,181),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-fSCBcwL6-1619417048288)(output_68_0.png)]
# Two parameter sets that differ only in gamma (0 vs 20).
param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
param2 = {'silent':True,'obj':'reg:linear',"gamma":20}
num_round = 180
n_fold=5

# Time the cross-validation run for gamma=0.
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
# The elapsed seconds are pushed through fromtimestamp only to format them as
# minutes:seconds:microseconds.
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:083104
# Time the cross-validation run for the gamma=20 parameter set.
time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round,n_fold)
elapsed = time()-time0
# Format the elapsed seconds as minutes:seconds:microseconds.
print(datetime.datetime.fromtimestamp(elapsed).strftime("%M:%S:%f"))
00:01:359378
# Overlay the train (column 0) and test (column 2) CV curves of both gamma
# settings over 180 boosting rounds.
rounds = range(1,181)
curves = [
    (cvresult1, 0, "red", "train,gamma=0"),
    (cvresult1, 2, "orange", "test,gamma=0"),
    (cvresult2, 0, "green", "train,gamma=20"),
    (cvresult2, 2, "blue", "test,gamma=20"),
]
plt.figure(figsize=(20,5))
plt.grid()
for frame, col, color, label in curves:
    plt.plot(rounds, frame.iloc[:,col], c=color, label=label)
plt.legend()
plt.show()
# From this plot, can you see how gamma controls overfitting?
# It restrains learning on the training set, i.e. lowers training-set performance.
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-nW9wsd4l-1619417048289)(output_71_0.png)]
import xgboost as xgb
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer classification data and wrap it in a DMatrix.
data2 = load_breast_cancer()
x2 = data2.data
y2 = data2.target
dfull2 = xgb.DMatrix(x2,y2)

# Two parameter sets that differ in gamma (0 vs 1).
# NOTE(review): "nfold" and "eval_metrics" are given as keys of the params dict;
# presumably xgb.cv does not read them from there (nfold is a keyword argument of
# xgb.cv and the metric is passed below via metrics=) -- verify against the docs.
param1 = {'silent':True,'obj':'binary:logistic',"gamma":0,"nfold":5,"eval_metrics":"error"}
param2 = {'silent':True,'obj':'binary:logistic',"gamma":1,"nfold":5}
num_round = 100

# Time the cross-validation run for gamma=0, evaluated with the "error" metric.
time0 = time()
cvresult1 = xgb.cv(param1, dfull2, num_round,metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:271581
# Time the cross-validation run for the gamma=1 parameter set.
time0 = time()
cvresult2 = xgb.cv(param2, dfull2, num_round,metrics=("error"))
elapsed = time()-time0
# Format the elapsed seconds as minutes:seconds:microseconds.
print(datetime.datetime.fromtimestamp(elapsed).strftime("%M:%S:%f"))
00:00:443810
# Overlay the train (column 0) and test (column 2) CV curves for gamma=0 and
# gamma=1 over 100 boosting rounds.
rounds = range(1,101)
curves = [
    (cvresult1, 0, "red", "train,gamma=0"),
    (cvresult1, 2, "orange", "test,gamma=0"),
    (cvresult2, 0, "green", "train,gamma=1"),
    (cvresult2, 2, "blue", "test,gamma=1"),
]
plt.figure(figsize=(20,5))
plt.grid()
for frame, col, color, label in curves:
    plt.plot(rounds, frame.iloc[:,col], c=color, label=label)
plt.legend()
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-40pJXnNG-1619417048289)(output_76_0.png)]
# Wrap the full feature matrix and target in a DMatrix for xgb.cv.
dfull = xgb.DMatrix(X,y)

# Baseline ("original") parameter set used as the reference curve.
# NOTE(review): "nfold" is given inside the params dict; presumably xgb.cv only
# honours nfold as its own keyword argument -- verify against the docs.
param1 = {'silent':True,'obj':'reg:linear',"subsample":1,"max_depth":6,"eta":0.3,"gamma":0,"lambda":1,"alpha":0,"colsample_bytree":1,"colsample_bylevel":1,"colsample_bynode":1,"nfold":5}
num_round = 200

# Time the cross-validation run.
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

# Plot the train (column 0) and test (column 2) curves; the y-axis is capped at 5.
fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")
ax.legend(fontsize="xx-large")
plt.show()
00:00:513584
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-23NZRQ9V-1619417048290)(output_78_1.png)]
# Baseline ("original") parameter set.
param1 = {'silent':True,'obj':'reg:linear',"subsample":1,"max_depth":6,"eta":0.3,"gamma":0,"lambda":1,"alpha":0,"colsample_bytree":1,"colsample_bylevel":1,"colsample_bynode":1,"nfold":5}
num_round = 200

# Time the baseline cross-validation run.
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

# Start the comparison figure with the baseline train/test curves.
fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")

# param2 is the previous tuning attempt ("last"), param3 the current one ("this").
param2 = {'silent':True,'obj':'reg:linear',"max_depth":2,"eta":0.05,"gamma":0,"lambda":1,"alpha":0,"colsample_bytree":1,"colsample_bylevel":0.4,"colsample_bynode":1,"nfold":5}
param3 = {'silent':True,'obj':'reg:linear',"subsample":1,"eta":0.05,"gamma":20,"lambda":3.5,"alpha":0.2,"max_depth":4,"colsample_bytree":0.4,"colsample_bylevel":0.6,"colsample_bynode":1,"nfold":5}

# Time the cross-validation run for each of the two tuned parameter sets.
time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

time0 = time()
cvresult3 = xgb.cv(param3, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

# Add the tuned curves to the same axes and show the figure.
ax.plot(range(1,201),cvresult2.iloc[:,0],c="green",label="train,last")
ax.plot(range(1,201),cvresult2.iloc[:,2],c="blue",label="test,last")
ax.plot(range(1,201),cvresult3.iloc[:,0],c="gray",label="train,this")
ax.plot(range(1,201),cvresult3.iloc[:,2],c="pink",label="test,this")
ax.legend(fontsize="xx-large")
plt.show()
00:00:532621
00:00:223373
00:00:259346
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-xdWPZhuA-1619417048290)(output_79_1.png)]
import pickle
dtrain = xgb.DMatrix(Xtrain,Ytrain)
# Set the parameters and train the model
param = {'silent':True,'obj':'reg:linear',"subsample":1,"eta":0.05,"gamma":20,"lambda":3.5,"alpha":0.2,"max_depth":4,"colsample_bytree":0.4,"colsample_bylevel":0.6,"colsample_bynode":1}
num_round = 180
bst = xgb.train(param, dtrain, num_round)
# Save the model; the with-block guarantees the file is flushed and closed.
with open("xgboostonboston.dat","wb") as model_file:
    pickle.dump(bst, model_file)
# Note: with open we usually use "w" or "r" as the mode, but those only work for text files (txt).
# When what we store is not text but the model object itself, we use "wb" and "rb" instead:
# "wb" writes binary and "rb" reads binary, so the saved file holds a model that can be loaded and called.
# Where was the model saved? (The path above is relative, so it is resolved against
# the current working directory.)
import sys
sys.path
['C:\\Pythonwork\\micro-class\\11 xgboost','C:\\Python\\python37.zip','C:\\Python\\DLLs','C:\\Python\\lib','C:\\Python','','C:\\Python\\lib\\site-packages','C:\\Python\\lib\\site-packages\\win32','C:\\Python\\lib\\site-packages\\win32\\lib','C:\\Python\\lib\\site-packages\\Pythonwin','C:\\Python\\lib\\site-packages\\IPython\\extensions','C:\\Users\\Shuyu\\.ipython']
# Restart Jupyter Lab, then re-import everything needed
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb

# Rebuild the same train/test split (same test_size and random_state as before).
data = load_boston()
X = data.data
y = data.target
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
# Note: if the saved model was built with the xgboost library, the data fed to it
# must also use the xgboost library's data type
dtest = xgb.DMatrix(Xtest,Ytest)
# Load the model; the with-block closes the file handle afterwards.
# pickle.load can execute arbitrary code, so only load files you created yourself.
with open("xgboostonboston.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)
print("Loaded model from: xgboostonboston.dat")
Loaded model from: xgboostonboston.dat
# Make predictions by calling the predict interface directly on the DMatrix
ypreds = loaded_model.predict(dtest)
# Echo the predictions (notebook display).
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131, 9.944413, 21.356094,15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,10.37829 , 37.5367 , 27.097404, 20.73775 , 20.198935, 46.20087 ,26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,21.068003, 20.5974 , 18.412853, 45.326836, 22.941956, 9.055015,27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619 , 12.178641,21.293903, 19.826134, 41.0362 , 31.300192, 24.400661, 11.267941,15.763796, 20.984198, 9.232577, 11.090055, 32.739227, 16.265066,24.975492, 24.905188, 34.348663, 41.02216 , 20.181097, 8.897793,22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,16.924698, 22.633028], dtype=float32)
# Score the reloaded model's predictions against the test targets: MSE, then R^2.
# The bare numbers are the recorded notebook outputs of the preceding expression.
from sklearn.metrics import mean_squared_error as MSE, r2_score
MSE(Ytest,ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
# Train a booster from the param / dtrain / num_round already in scope,
# then persist it with joblib instead of pickle.
bst = xgb.train(param, dtrain, num_round)
import joblib  # likewise, you can check where the model was saved
joblib.dump(bst,"xgboost-boston.dat")
['xgboost-boston.dat']
# Reload the joblib-saved booster and predict on the test split.
loaded_model = joblib.load("xgboost-boston.dat")
# The booster's predict is given a DMatrix built from the test data.
dtest = xgb.DMatrix(Xtest,Ytest)
ypreds = loaded_model.predict(dtest)
# Echo the predictions (notebook display).
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131, 9.944413, 21.356094,15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,10.37829 , 37.5367 , 27.097404, 20.73775 , 20.198935, 46.20087 ,26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,21.068003, 20.5974 , 18.412853, 45.326836, 22.941956, 9.055015,27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619 , 12.178641,21.293903, 19.826134, 41.0362 , 31.300192, 24.400661, 11.267941,15.763796, 20.984198, 9.232577, 11.090055, 32.739227, 16.265066,24.975492, 24.905188, 34.348663, 41.02216 , 20.181097, 8.897793,22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,16.924698, 22.633028], dtype=float32)
# Score the joblib-reloaded model: MSE, then R^2.
# The bare numbers are the recorded notebook outputs of the preceding expression.
MSE(Ytest, ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
# Use the sklearn-API model
from xgboost import XGBRegressor as XGBR
# Fit the regressor on the training split.
bst = XGBR(n_estimators=200,eta=0.05,gamma=20,reg_lambda=3.5,reg_alpha=0.2,max_depth=4,colsample_bytree=0.4,colsample_bylevel=0.6).fit(Xtrain,Ytrain) # training finished
# Persist the fitted estimator with joblib.
joblib.dump(bst,"xgboost-boston-sklearn.dat")
['xgboost-boston-sklearn.dat']
# Reload the sklearn-API model saved with joblib.
loaded_model = joblib.load("xgboost-boston-sklearn.dat")
# Here Xtest can be passed in directly -- it is just our numpy array
ypreds = loaded_model.predict(Xtest)
# Echo the raw test features (notebook display).
Xtest
array([[4.15292e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,3.29460e+02, 2.73800e+01],[2.73100e-02, 0.00000e+00, 7.07000e+00, ..., 1.78000e+01,3.96900e+02, 9.14000e+00],[3.15000e-02, 9.50000e+01, 1.47000e+00, ..., 1.70000e+01,3.96900e+02, 4.56000e+00],...,[5.08300e-02, 0.00000e+00, 5.19000e+00, ..., 2.02000e+01,3.89710e+02, 5.68000e+00],[3.77498e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,2.20100e+01, 1.71500e+01],[1.96091e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,3.96900e+02, 1.34400e+01]])
dtest
<xgboost.core.DMatrix at 0x29e30670668>
ypreds
array([ 9.350334 , 21.501623 , 30.219057 , 13.021226 , 9.883689 ,20.977922 , 16.023008 , 15.8910475, 15.512305 , 15.706607 ,22.096102 , 35.381573 , 20.3307 , 27.129421 , 19.997156 ,10.935587 , 20.25071 , 26.188572 , 26.711943 , 22.600443 ,18.23832 , 15.876045 , 26.263977 , 22.706024 , 20.18491 ,15.891692 , 21.4781 , 29.047956 , 23.371012 , 17.167185 ,35.699898 , 20.490337 , 20.195292 , 23.81444 , 23.106022 ,25.709312 , 15.0182905, 22.621248 , 18.576109 , 34.25664 ,17.46115 , 19.159126 , 34.79234 , 17.766731 , 17.141891 ,27.755646 , 39.786766 , 22.49913 , 10.246634 , 36.76105 ,26.294876 , 20.75917 , 19.893272 , 46.62629 , 26.549704 ,24.040398 , 17.769514 , 20.76889 , 16.139618 , 17.494894 ,16.005596 , 24.28487 , 19.15237 , 31.407684 , 27.862312 ,18.877817 , 20.50497 , 16.094156 , 22.622025 , 17.762297 ,28.518019 , 41.146317 , 32.52681 , 23.117966 , 19.125128 ,24.141544 , 39.041847 , 25.901724 , 20.974117 , 19.626917 ,18.567612 , 46.46465 , 23.03303 , 9.912106 , 26.407642 ,23.466772 , 16.985506 , 20.73746 , 15.679997 , 11.697191 ,21.320868 , 20.333689 , 41.616425 , 31.659132 , 25.605923 ,12.362759 , 14.593165 , 20.577328 , 9.253377 , 11.1253805,32.878246 , 15.840851 , 24.695955 , 24.882996 , 34.643425 ,41.556873 , 19.726238 , 8.808649 , 23.04128 , 14.709186 ,46.10303 , 21.435535 , 21.97892 , 24.299171 , 19.591938 ,27.527737 , 23.80468 , 18.782711 , 44.266346 , 17.328068 ,23.030151 , 23.801643 , 16.483137 , 18.219353 , 15.713125 ,23.655058 , 32.294373 , 10.60579 , 22.099716 , 19.26955 ,14.293162 , 19.386055 , 8.824598 , 26.909697 , 29.539446 ,20.38691 , 20.832077 , 22.507433 , 11.142808 , 17.685743 ,40.230915 , 17.526121 , 23.09964 , 19.899158 , 31.775164 ,19.718151 , 12.164877 , 40.867558 , 24.465397 , 22.134802 ,15.041253 , 28.63522 ], dtype=float32)
# Score the sklearn-API model's predictions: MSE, then R^2.
# The bare numbers are the recorded notebook outputs of the preceding expression.
MSE(Ytest, ypreds)
10.198269690947479
r2_score(Ytest,ypreds)
0.8904046866351292
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs #自创数据集
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
# Build a synthetic, imbalanced two-class data set with make_blobs.
class_1 = 500 # class 1 has 500 samples
class_2 = 50 # class 2 has only 50
centers = [[0.0, 0.0], [2.0, 2.0]] # centers of the two classes
clusters_std = [1.5, 0.5] # spread of the two classes (passed as cluster_std); the larger class is usually looser
X, y = make_blobs(n_samples=[class_1, class_2],centers=centers,cluster_std=clusters_std,random_state=0, shuffle=False)
# Echo the shape of the generated feature matrix (notebook display).
X.shape
(550, 2)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
# Fraction of samples labelled 1; the bare number below is the recorded output.
(y == 1).sum() / y.shape[0] # about 9%
0.09090909090909091
# Hold out 30% of the samples for testing, with a fixed random_state.
Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420)
# Modeling with the sklearn API
# Fit a classifier with default settings on the training split.
clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)
# Echo the predicted labels (notebook display).
ypred
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
# Evaluate the default classifier on the test split; each bare value/array line
# is the recorded notebook output of the expression above it.
clf.score(Xtest,Ytest) # default evaluation metric - accuracy
0.9272727272727272
cm(Ytest,ypred,labels=[1,0]) # list the minority class first
array([[ 9, 4],[ 8, 144]], dtype=int64)
recall(Ytest,ypred)
0.6923076923076923
# AUC is computed from the predicted probability of class 1.
auc(Ytest,clf.predict_proba(Xtest)[:,1])
0.9671052631578947
# scale_pos_weight: ratio of negative to positive samples
clf_ = XGBC(scale_pos_weight=10).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
# Accuracy, confusion matrix (minority class first), recall and AUC on the test split.
# In a notebook only the value of the last expression is displayed.
clf_.score(Xtest,Ytest)
cm(Ytest,ypred_,labels=[1,0])
recall(Ytest,ypred_)
auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9696356275303644
# As the positive-class weight increases, how do recall, AUC and accuracy change?
for i in [1,5,10,20,30]:
    # Refit the classifier with the current scale_pos_weight.
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain,Ytrain)
    ypred_ = clf_.predict(Xtest)
    # Report the weight followed by the three test-set metrics.
    print(i)
    print("\tAccuracy:{}".format(clf_.score(Xtest,Ytest)))
    print("\tRecall:{}".format(recall(Ytest,ypred_)))
    print("\tAUC:{}".format(auc(Ytest,clf_.predict_proba(Xtest)[:,1])))
1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9671052631578947
5
	Accuracy:0.9454545454545454
	Recall:0.9230769230769231
	AUC:0.9665991902834008
10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9696356275303644
20
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9706477732793523
30
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9701417004048584
# scale_pos_weight: ratio of negative to positive samples
# Refit with scale_pos_weight=20 and evaluate on the test split; each bare
# value/array line is the recorded notebook output of the expression above it.
clf_ = XGBC(scale_pos_weight=20).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
0.9515151515151515
cm(Ytest,ypred_,labels=[1,0])
array([[ 13, 0],[ 8, 144]], dtype=int64)
recall(Ytest,ypred_)
1.0
# AUC is computed from the predicted probability of class 1.
auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9706477732793523
# Wrap the train and test splits in DMatrix objects for the native xgboost API.
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
# Look at the predict interface that ships with the xgboost library
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
# See what preds contains
preds
array([0.00110357, 0.00761518, 0.00110357, 0.00110357, 0.93531454,0.00466839, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00410493, 0.00454478, 0.00571528, 0.00751026,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00110357, 0.00793251, 0.00466839,0.00110357, 0.00339395, 0.00657186, 0.00110357, 0.00457053,0.00571528, 0.0026763 , 0.00110357, 0.00110357, 0.00110357,0.00884932, 0.00712637, 0.00110357, 0.00712637, 0.00466839,0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.63748044, 0.00110357, 0.00793251,0.00110357, 0.00451971, 0.00644181, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00751026, 0.00712637, 0.00110357,0.00866458, 0.00110357, 0.00110357, 0.00110357, 0.91610426,0.00110357, 0.00110357, 0.89246494, 0.0026763 , 0.00501714,0.00761518, 0.00884932, 0.00339395, 0.00110357, 0.93531454,0.00110357, 0.00110357, 0.00110357, 0.82530665, 0.00751026,0.00110357, 0.35174078, 0.00110357, 0.00110357, 0.70393246,0.00110357, 0.76804197, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.96656513, 0.00110357, 0.00571528, 0.25400913,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00457053,0.00110357, 0.00110357, 0.00110357, 0.89246494, 0.00110357,0.9518535 , 0.0026763 , 0.00712637, 0.00110357, 0.00501714,0.00110357, 0.00110357, 0.00571528, 0.00110357, 0.00110357,0.00712637, 0.00110357, 0.00110357, 0.00712637, 0.00110357,0.25136763, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.8904051 , 0.3876418 , 0.00110357, 0.00457053,0.00657186, 0.9366597 , 0.00866458, 0.00110357, 0.00501714,0.00501714, 0.00110357, 0.00110357, 0.00368543, 0.00501714,0.9830577 , 0.00110357, 0.00644181, 0.00110357, 0.00571528,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00466839,0.00110357, 0.00110357, 0.92388713, 0.90231985, 0.80084217],dtype=float32)
# Apply our own decision threshold
# Work on a copy so the raw predictions in preds stay untouched.
ypred = preds.copy()
# Entries whose prediction exceeds 0.5 are set to 1.
ypred[preds > 0.5] = 1
ypred
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.], dtype=float32)
# Every entry that was not set to 1 becomes 0, leaving only 0/1 values.
ypred[ypred != 1] = 0
# Spell out the parameters
# Candidate scale_pos_weight values and a matching display name for each.
scale_pos_weight = [1,5,10]
names = ["negative vs positive: 1","negative vs positive: 5","negative vs positive: 10"]
# Show the (name, weight) pairs (notebook display).
[*zip(names,scale_pos_weight)]
[('negative vs positive: 1', 1),('negative vs positive: 5', 5),('negative vs positive: 10', 10)]
# Import the evaluation metrics
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

# Train one native-API booster per scale_pos_weight and report its test metrics.
for name,i in zip(names,scale_pos_weight):
    param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":i}
    num_round = 100
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    # Turn the raw predictions into 0/1 labels with a 0.5 threshold.
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
    print("\tRecall:{}".format(recall(Ytest,ypred)))
    # AUC is computed from the raw predictions, not the thresholded labels.
    print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
# Of course, we can also try different thresholds
for name,i in zip(names,scale_pos_weight):
    # The booster and its raw predictions depend only on the weight, not on the
    # threshold, so train and predict once per weight instead of once per threshold.
    param= {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":i}
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    for thres in [0.3,0.5,0.7,0.9]:
        # Turn the raw predictions into 0/1 labels with the current threshold.
        ypred = preds.copy()
        ypred[preds > thres] = 1
        ypred[ypred != 1] = 0
        print("{},thresholds:{}".format(name,thres))
        print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
        print("\tRecall:{}".format(recall(Ytest,ypred)))
        # AUC uses the raw predictions, so it is the same for every threshold.
        print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1,thresholds:0.3
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.5
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.7
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.9
	Accuracy:0.9515151515151515
	Recall:0.5384615384615384
	AUC:0.9741902834008097
negative vs positive: 5,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.7
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9635627530364372
negative vs positive: 10,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.5
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.7
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9665991902834008
集成学习-xgboost学习相关推荐
- 集成模型Xgboost!机器学习最热研究方向入门,附学习路线图
导读:本文介绍了集成学习中比较具有代表性的方法,如Boosting.Bagging等.而XGBoost是集成学习中的佼佼者,目前,一些主流的互联网公司如腾讯.阿里巴巴等都已将XGBoost应用到其业务 ...
- python 持续集成工具_持续集成工具: Jenkins学习
持续集成工具: Jenkins学习 -- 部分内容收集自网络,如有侵权,请联系作者删除 一. 概念 在过去的开发整体流程中,是所有人写好代码之后统一进行合并(svn,git),然后进行测试,确保准发布 ...
- XGBoost学习(六):输出特征重要性以及筛选特征
XGBoost学习(一):原理 XGBoost学习(二):安装及介绍 XGBoost学习(三):模型详解 XGBoost学习(四):实战 XGBoost学习(五):参数调优 XGBoost学习(六): ...
- 持续集成工具Jenkins学习4 Idea集成Jenkins插件
持续集成工具Jenkins学习4 Idea集成Jenkins插件 一.功能简介 二.安装Idea插件 1. 搜索安装 2. 设置 三.Jenkins开启CSRF 四.使用 一.功能简介 Idea可以方 ...
- XGBoost学习(五):参数调优
XGBoost学习(一):原理 XGBoost学习(二):安装及介绍 XGBoost学习(三):模型详解 XGBoost学习(四):实战 XGBoost学习(五):参数调优 XGBoost学习(六): ...
- XGBoost学习(四):实战-python3
XGBoost学习(一):原理 XGBoost学习(二):安装及介绍 XGBoost学习(三):模型详解 XGBoost学习(四):实战 XGBoost学习(五):参数调优 XGBoost学习(六): ...
- 区别:强化学习集成学习增强学习规则学习
1.强化学习 强化学习是智能体(Agent)以"试错"的方式进行学习,通过与环境进行交互获得的奖赏指导行为,目标是使智能体获得最大的奖赏,强化学习不同于连接主义学习中的监督学习,主 ...
- 虚拟专题:联邦学习 | 联邦学习研究综述
来源:网络与信息安全学报 联邦学习研究综述 周传鑫,孙奕,汪德刚,葛桦玮 信息工程大学,河南 郑州 450001 摘要:联邦学习由于能够在多方数据源聚合的场景下协同训练全局最优模型,近年来迅速成为安全 ...
- 【原创】分享一些机器学习和深度学习的学习资料
如果你还在苦苦寻找机器学习和深度学习入门资料的话,或许可以看看本文我的一些推荐,这些材料我自己都学过一遍,分享一下点评,希望对你有帮助.注意,本文只是点评这些资源,不提供任何资源的盗版下载,所有资源我 ...
最新文章
- 算法笔记_114:等额本金(Java)
- Nginx HTTPS功能部署实践
- get post乱码解决
- JavaScript巧用对象的引用解决三级联动
- Py之albumentations:albumentations库函数的简介、安装、使用方法之详细攻略
- yii2 RESTful api的详细使用
- moosefs分布式文件系统
- web实现数据交互的几种常见方式
- 剑指offer(刷题41-50)--c++,Python版本
- 面试官:连Spring三级缓存都答不好,自己走还是我送你?
- java并发包作者lee_Java的一些并发包
- 51Nod 1058 N的阶乘的长度
- Win10 安装 Linux 子系统
- android 随身无线网卡,让小锐WiFi支持USB无线网卡/随身WiFi(附各种“随身wifi”芯片型号)...
- rtklib-RINEX文件读取-rinex.c解析(一)
- 打印机驱动无法安装到计算机是,电脑打印机无法安装驱动的解决方法
- coldfusion_在Windows上安装和配置ColdFusion MX 6.1
- 获取ftp服务器文件,ftp获取服务器文件
- 一种网格去噪算法(基于平均面法向的均值滤波)
- 用python求解一元二次方程组
热门文章
- 美团实习| 周记(三)
- 《涨知识啦32》-SBD器件中的肖特基二极管漏电流机制 (上)
- 2022-10-18
- 数伏食用黄瓜和鸡蛋治慢性支气管炎
- webservice25--基于契约优先开发用户管理小功能--异常处理
- 更快更强大,Oracle Primavera P6 R18.8 发布
- 2023最新SSM计算机毕业设计选题大全(附源码+LW)之java医院住院管理系统7lio5
- 关于服务器开放SQL Server1433端口
- php投票post,请教一个简单的投票页面,post怎么写
- 2019年计算机考研408操作系统真题(客观题)