XGBoost Learning Notes

from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
data = load_boston()
#The Boston housing dataset is very simple, but the questions it raises are numerous
data
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,4.9800e+00],[2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,9.1400e+00],[2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,4.0300e+00],...,[6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,5.6400e+00],[1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,6.4800e+00],[4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,7.8800e+00]]),'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. 
, 11.9]),'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD','TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",'filename': 'f:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
X = data.data
y = data.target
X.shape
(506, 13)
y.shape
(506,)
y
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain,Ytrain) #train
reg.predict(Xtest) #the classic predict interface
reg.score(Xtest,Ytest) #which evaluation metric does this return? Shift+Tab shows the docstring: it is R^2
y.mean()
MSE(Ytest,reg.predict(Xtest)) #the MSE is about 1/3 of y.mean() — neither clearly good nor clearly bad
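For reference, the R^2 that reg.score returns is 1 - SS_res/SS_tot. A minimal sketch verifying this by hand, reusing reg, Xtest and Ytest from above:

yhat = reg.predict(Xtest)
ss_res = ((Ytest - yhat) ** 2).sum()          # residual sum of squares
ss_tot = ((Ytest - Ytest.mean()) ** 2).sum()  # total sum of squares
1 - ss_res / ss_tot                           # matches reg.score(Xtest,Ytest)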
reg.feature_importances_ #one strength of tree models: feature importance scores, usable for embedded feature selection (SelectFromModel)
#xgboost can therefore do embedded feature selection; see the sketch below
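A minimal sketch of embedded selection with SelectFromModel; the "median" threshold is an illustrative choice, not part of the original run:

from sklearn.feature_selection import SelectFromModel
# keep only the features whose importance reaches the median importance
selector = SelectFromModel(XGBR(n_estimators=100), threshold="median").fit(Xtrain,Ytrain)
Xtrain_selected = selector.transform(Xtrain)
Xtrain_selected.shape  # expect roughly half of the 13 columns to survive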
reg = XGBR(n_estimators=100) #an untrained model to feed into cross-validation
CVS(reg,Xtrain,Ytrain,cv=5).mean()
#which evaluation metric does this return — remember? The same one as reg.score: R^2 for regression, accuracy for classification
0.8017863029875325
#rigorous vs. loose cross-validation: cross-validate on the training set only, or on the full data? Using only the training set keeps the test set as a genuinely unseen holdout.
array([0.83340801, 0.77096033, 0.83473392, 0.80424149, 0.76558778])
#rigorous vs. loose
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
-16.041115480238048
#list all the model-evaluation scorers available in sklearn
import sklearn
sorted(sklearn.metrics.SCORERS.keys())
['accuracy','adjusted_mutual_info_score','adjusted_rand_score','average_precision','balanced_accuracy','brier_score_loss','completeness_score','explained_variance','f1','f1_macro','f1_micro','f1_samples','f1_weighted','fowlkes_mallows_score','homogeneity_score','mutual_info_score','neg_log_loss','neg_mean_absolute_error','neg_mean_squared_error','neg_mean_squared_log_error','neg_median_absolute_error','normalized_mutual_info_score','precision','precision_macro','precision_micro','precision_samples','precision_weighted','r2','recall','recall_macro','recall_micro','recall_samples','recall_weighted','roc_auc','v_measure_score']
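Every loss-like metric in this list carries a neg_ prefix because sklearn scorers are always maximized; flip the sign to recover the familiar positive loss:

# neg_mean_squared_error is simply -MSE
-CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()  # about 16.04, cf. the cell above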
#compare against random forest and linear regression
rfr = RFR(n_estimators=100)
CVS(rfr,Xtrain,Ytrain,cv=5).mean()#0.7975497480638329
0.7975497480638329
CVS(rfr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-16.998723616338033
-16.998723616338033
lr = LinearR()
CVS(lr,Xtrain,Ytrain,cv=5).mean()#0.6835070597278085
0.6835070597278085
CVS(lr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-25.34950749364844
-25.34950749364844
#the silent parameter: on huge datasets, when training is expected to be very slow, progress printing lets you monitor the run
reg = XGBR(n_estimators=10,silent=True) #in the native xgboost library, silent=True suppresses the training log and only returns results; the default False prints progress
#in the sklearn wrapper, silent defaults to True (no log); set it to False manually if you want the progress printed
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-92.67865836936579
-92.67865836936579
def plot_learning_curve(estimator, title, X, y,
                        ax=None,    # axes to draw on
                        ylim=None,  # y-axis limits
                        cv=None,    # cross-validation strategy
                        n_jobs=None # number of worker threads
                        ):
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            ,shuffle=True
                                                            ,cv=cv
                                                            ,random_state=420
                                                            ,n_jobs=n_jobs)
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid() # the grid is optional
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color="r", label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color="g", label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=5, shuffle = True, random_state=42) #the cross-validation splitter
plot_learning_curve(XGBR(n_estimators=100,random_state=420),"XGB",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()

[Figure: learning curve for XGB (output_28_0.png)]

#=====[TIME WARNING: 25 seconds]=====#
axisx = range(10,1010,50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915

[Figure: mean CV R^2 vs n_estimators, 10-1000 (output_29_1.png)]

#the selected n_estimators (660) is unusually large — should we simply pick the value with the highest score?
#======[TIME WARNING: 20s]=======#
axisx = range(50,1050,50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    #record 1 - bias (the mean R^2 across folds)
    rs.append(cvresult.mean())
    #record the variance across folds
    var.append(cvresult.var())
    #compute the controllable part of the generalization error
    ge.append((1 - cvresult.mean())**2+cvresult.var())
#print the n_estimators with the highest R^2, and the variance at that point
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#print the n_estimators with the lowest variance, and the R^2 at that point
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#print the n_estimators minimizing the controllable generalization error, with its R^2, variance, and that error
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411

[Figure: mean CV R^2 vs n_estimators, 50-1000 (output_31_1.png)]
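The ge quantity tracked above is the controllable part of the generalization error. With the mean cross-validated R^2 standing in for accuracy, the decomposition being approximated is

E(f;D) ≈ (1 - R²)² + Var(R²_cv) + noise

where (1 - R²)² plays the role of bias², the fold-to-fold variance plays the role of variance, and the noise term is beyond our control — which is why only the first two terms are minimized here.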

axisx = range(100,300,10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#add the variance band lines
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314

[Figure: mean CV R^2 with variance bands vs n_estimators, 100-300 (output_32_1.png)]

#how does the controllable part of the generalization error behave?
plt.figure(figsize=(20,5))
plt.plot(axisx,ge,c="gray",linestyle='-.')
plt.show()

[Figure: controllable generalization error vs n_estimators (output_33_0.png)]

#verify: did the chosen n_estimators actually improve test performance?
time0 = time()
print(XGBR(n_estimators=100,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9197580267581366
0.0787498950958252
time0 = time()
print(XGBR(n_estimators=660,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9208745746309475
0.36807847023010254
time0 = time()
print(XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9231068620728082
0.12366437911987305
axisx = np.linspace(0,1,20)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="green",label="XGB")
plt.legend()
plt.show()
0.7368421052631579 0.837609040251761

[Figure: mean CV R^2 vs subsample, 0-1 (output_37_1.png)]

#refine the learning curve further
axisx = np.linspace(0.05,1,20)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.65 0.8302530801197368 0.008708816667924316
0.7999999999999999 0.8277414964661117 0.007159903723250457
0.7999999999999999 0.8277414964661117 0.007159903723250457 0.036832895762985055

[Figure: mean CV R^2 with variance bands vs subsample, 0.05-1 (output_38_1.png)]

#zoom in further on the learning curve
axisx = np.linspace(0.75,1,25)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093 0.033300928468131166

[Figure: mean CV R^2 with variance bands vs subsample, 0.75-1 (output_39_1.png)]

#try the subsample value picked above; as the scores below show, it slightly hurts on the test set — the dataset is too small to benefit from row sampling
reg = XGBR(n_estimators=180
           ,subsample=0.7708333333333334
           ,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)
0.9159462982185405
MSE(Ytest,reg.predict(Xtest))
7.821523502888769
#first define a scoring helper that prints cross-validation results on Xtrain directly
def regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2"],show=True):
    score = []
    for i in range(len(scoring)):
        if show:
            print("{}:{:.2f}".format(scoring[i] #the name of the evaluation metric
                                     ,CVS(reg,Xtrain,Ytrain,cv=cv,scoring=scoring[i]).mean()))
        score.append(CVS(reg,Xtrain,Ytrain,cv=cv,scoring=scoring[i]).mean())
    return score
reg = XGBR(n_estimators=180,random_state=420)
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
r2:0.80
neg_mean_squared_error:-13.48
[0.8038787848970184, -13.482301822063182]
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
[0.8038787848970184, -13.482301822063182]
from time import time
import datetime

for i in [0,0.2,0.5,1]:
    time0=time()
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    print("learning_rate = {}".format(i))
    regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:561781
learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.32
00:01:848888
learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.24
00:01:541875
learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:499027
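What learning_rate (eta) is doing: each new tree's contribution is shrunk by η before it is added, so after round t the prediction is

ŷᵢ⁽ᵗ⁾ = ŷᵢ⁽ᵗ⁻¹⁾ + η·fₜ(xᵢ),  η ∈ [0,1]

With η = 0 nothing is learned beyond the base score (hence the r2 of -6.76 above), while a small positive η needs more rounds to reach the same training loss — exactly the accuracy/time trade-off visible in the timings.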
axisx = np.arange(0.05,1,0.05)
rs = []
te = []
for i in axisx:
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    score = regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
    test = reg.fit(Xtrain,Ytrain).score(Xtest,Ytest)
    rs.append(score[0])
    te.append(test)
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,te,c="gray",label="test")
plt.plot(axisx,rs,c="green",label="train")
plt.legend()
plt.show()
0.55 0.8125604372670463

[Figure: train (CV) vs test R^2 across learning_rate (output_47_1.png)]

for booster in ["gbtree","gblinear","dart"]:
    reg = XGBR(n_estimators=180,learning_rate=0.1,random_state=420,booster=booster).fit(Xtrain,Ytrain)
    print(booster)
    print(reg.score(Xtest,Ytest))
gbtree
0.9231068620728082
gblinear
0.6286510307485139
dart
0.923106843149575
#the default objective is reg:linear (renamed reg:squarederror in later xgboost versions)
reg = XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest, Ytest)
0.9231068620728082
MSE(Ytest,reg.predict(Xtest))
7.155205217161047
#the native xgboost implementation
import xgboost as xgb
#read the data with the DMatrix class
dtrain = xgb.DMatrix(Xtrain,Ytrain) #pass in the feature matrix and the labels together
dtest = xgb.DMatrix(Xtest,Ytest)
#unfortunately a DMatrix cannot be opened for inspection, so the usual workflow is to examine the data in pandas first and only then wrap it in a DMatrix
dtrain
<xgboost.core.DMatrix at 0x2770de3bdd8>
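Although the DMatrix cannot be printed like an array, it does expose basic shape queries:

dtrain.num_row(), dtrain.num_col()  # (354, 13), the same shape as Xtrain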
import pandas as pd
pd.DataFrame(Xtrain)
0 1 2 3 4 5 6 7 8 9 10 11 12
0 0.03041 0.0 5.19 0.0 0.515 5.895 59.6 5.6150 5.0 224.0 20.2 394.81 10.56
1 0.04113 25.0 4.86 0.0 0.426 6.727 33.5 5.4007 4.0 281.0 19.0 396.90 5.29
2 10.23300 0.0 18.10 0.0 0.614 6.185 96.7 2.1705 24.0 666.0 20.2 379.70 18.03
3 0.17142 0.0 6.91 0.0 0.448 5.682 33.8 5.1004 3.0 233.0 17.9 396.90 10.21
4 0.05059 0.0 4.49 0.0 0.449 6.389 48.0 4.7794 3.0 247.0 18.5 396.90 9.62
5 0.13587 0.0 10.59 1.0 0.489 6.064 59.1 4.2392 4.0 277.0 18.6 381.32 14.66
6 0.04981 21.0 5.64 0.0 0.439 5.998 21.4 6.8147 4.0 243.0 16.8 396.90 8.43
7 0.02543 55.0 3.78 0.0 0.484 6.696 56.4 5.7321 5.0 370.0 17.6 396.90 7.18
8 0.10793 0.0 8.56 0.0 0.520 6.195 54.4 2.7778 5.0 384.0 20.9 393.49 13.00
9 0.02498 0.0 1.89 0.0 0.518 6.540 59.7 6.2669 1.0 422.0 15.9 389.96 8.65
10 0.09299 0.0 25.65 0.0 0.581 5.961 92.9 2.0869 2.0 188.0 19.1 378.09 17.93
11 0.15876 0.0 10.81 0.0 0.413 5.961 17.5 5.2873 4.0 305.0 19.2 376.94 9.88
12 6.71772 0.0 18.10 0.0 0.713 6.749 92.6 2.3236 24.0 666.0 20.2 0.32 17.44
13 0.03768 80.0 1.52 0.0 0.404 7.274 38.3 7.3090 2.0 329.0 12.6 392.20 6.62
14 5.20177 0.0 18.10 1.0 0.770 6.127 83.4 2.7227 24.0 666.0 20.2 395.43 11.48
15 11.08740 0.0 18.10 0.0 0.718 6.411 100.0 1.8589 24.0 666.0 20.2 318.75 15.02
16 0.11432 0.0 8.56 0.0 0.520 6.781 71.3 2.8561 5.0 384.0 20.9 395.58 7.67
17 0.05602 0.0 2.46 0.0 0.488 7.831 53.6 3.1992 3.0 193.0 17.8 392.63 4.45
18 0.24103 0.0 7.38 0.0 0.493 6.083 43.7 5.4159 5.0 287.0 19.6 396.90 12.79
19 0.09378 12.5 7.87 0.0 0.524 5.889 39.0 5.4509 5.0 311.0 15.2 390.50 15.71
20 8.71675 0.0 18.10 0.0 0.693 6.471 98.8 1.7257 24.0 666.0 20.2 391.98 17.12
21 7.36711 0.0 18.10 0.0 0.679 6.193 78.1 1.9356 24.0 666.0 20.2 96.73 21.52
22 1.38799 0.0 8.14 0.0 0.538 5.950 82.0 3.9900 4.0 307.0 21.0 232.60 27.71
23 14.33370 0.0 18.10 0.0 0.614 6.229 88.0 1.9512 24.0 666.0 20.2 383.32 13.11
24 28.65580 0.0 18.10 0.0 0.597 5.155 100.0 1.5894 24.0 666.0 20.2 210.97 20.08
25 0.80271 0.0 8.14 0.0 0.538 5.456 36.6 3.7965 4.0 307.0 21.0 288.99 11.69
26 1.00245 0.0 8.14 0.0 0.538 6.674 87.3 4.2390 4.0 307.0 21.0 380.23 11.98
27 9.91655 0.0 18.10 0.0 0.693 5.852 77.8 1.5004 24.0 666.0 20.2 338.16 29.97
28 0.13158 0.0 10.01 0.0 0.547 6.176 72.5 2.7301 6.0 432.0 17.8 393.30 12.04
29 0.14231 0.0 10.01 0.0 0.547 6.254 84.2 2.2565 6.0 432.0 17.8 388.74 10.45
... ... ... ... ... ... ... ... ... ... ... ... ... ...
324 0.13117 0.0 8.56 0.0 0.520 6.127 85.2 2.1224 5.0 384.0 20.9 387.69 14.09
325 1.35472 0.0 8.14 0.0 0.538 6.072 100.0 4.1750 4.0 307.0 21.0 376.73 13.04
326 0.10153 0.0 12.83 0.0 0.437 6.279 74.5 4.0522 5.0 398.0 18.7 373.66 11.97
327 0.22927 0.0 6.91 0.0 0.448 6.030 85.5 5.6894 3.0 233.0 17.9 392.74 18.80
328 0.04666 80.0 1.52 0.0 0.404 7.107 36.6 7.3090 2.0 329.0 12.6 354.31 8.61
329 0.08014 0.0 5.96 0.0 0.499 5.850 41.5 3.9342 5.0 279.0 19.2 396.90 8.77
330 0.40771 0.0 6.20 1.0 0.507 6.164 91.3 3.0480 8.0 307.0 17.4 395.24 21.46
331 0.13642 0.0 10.59 0.0 0.489 5.891 22.3 3.9454 4.0 277.0 18.6 396.90 10.87
332 9.32909 0.0 18.10 0.0 0.713 6.185 98.7 2.2616 24.0 666.0 20.2 396.90 18.13
333 0.09103 0.0 2.46 0.0 0.488 7.155 92.2 2.7006 3.0 193.0 17.8 394.12 4.82
334 0.01301 35.0 1.52 0.0 0.442 7.241 49.3 7.0379 1.0 284.0 15.5 394.74 5.49
335 0.59005 0.0 21.89 0.0 0.624 6.372 97.9 2.3274 4.0 437.0 21.2 385.76 11.12
336 1.12658 0.0 19.58 1.0 0.871 5.012 88.0 1.6102 5.0 403.0 14.7 343.28 12.12
337 0.07886 80.0 4.95 0.0 0.411 7.148 27.7 5.1167 4.0 245.0 19.2 396.90 3.56
338 0.21719 0.0 10.59 1.0 0.489 5.807 53.8 3.6526 4.0 277.0 18.6 390.94 16.03
339 0.53700 0.0 6.20 0.0 0.504 5.981 68.1 3.6715 8.0 307.0 17.4 378.35 11.65
340 3.32105 0.0 19.58 1.0 0.871 5.403 100.0 1.3216 5.0 403.0 14.7 396.90 26.82
341 1.49632 0.0 19.58 0.0 0.871 5.404 100.0 1.5916 5.0 403.0 14.7 341.60 13.28
342 0.38735 0.0 25.65 0.0 0.581 5.613 95.6 1.7572 2.0 188.0 19.1 359.29 27.26
343 0.06617 0.0 3.24 0.0 0.460 5.868 25.8 5.2146 4.0 430.0 16.9 382.44 9.97
344 0.78570 20.0 3.97 0.0 0.647 7.014 84.6 2.1329 5.0 264.0 13.0 384.07 14.79
345 1.41385 0.0 19.58 1.0 0.871 6.129 96.0 1.7494 5.0 403.0 14.7 321.02 15.12
346 0.06047 0.0 2.46 0.0 0.488 6.153 68.8 3.2797 3.0 193.0 17.8 387.11 13.15
347 8.49213 0.0 18.10 0.0 0.584 6.348 86.1 2.0527 24.0 666.0 20.2 83.45 17.64
348 0.17134 0.0 10.01 0.0 0.547 5.928 88.2 2.4631 6.0 432.0 17.8 344.91 15.76
349 0.03871 52.5 5.32 0.0 0.405 6.209 31.3 7.3172 6.0 293.0 16.6 396.90 7.14
350 0.12650 25.0 5.13 0.0 0.453 6.762 43.4 7.9809 8.0 284.0 19.7 395.58 9.50
351 6.96215 0.0 18.10 0.0 0.700 5.713 97.0 1.9265 24.0 666.0 20.2 394.43 17.11
352 0.09164 0.0 10.81 0.0 0.413 6.065 7.8 5.2873 4.0 305.0 19.2 390.91 5.52
353 5.58107 0.0 18.10 0.0 0.713 6.436 87.9 2.3158 24.0 666.0 20.2 100.19 16.22

354 rows × 13 columns

#spell out the parameters
param = {'silent':True  #defaults to False; usually turned off manually
         ,'objective':'reg:linear'
         ,"eta":0.1}
num_round = 180 #the number of boosting rounds, i.e. n_estimators
#xgb.train takes the training data and the number of trees directly; all other parameters go in through params
bst = xgb.train(param, dtrain, num_round)
#the predict interface
preds = bst.predict(dtest)
preds
array([ 6.4613175, 22.123888 , 30.755163 , 13.424351 ,  8.378565 ,23.608477 , 14.2151165, 16.026499 , 15.498961 , 14.10649  ,24.030867 , 34.36362  , 21.461111 , 28.839497 , 19.568035 ,10.188658 , 19.42369  , 23.539951 , 22.850523 , 23.198708 ,17.82486  , 16.07219  , 27.602034 , 20.773046 , 20.868807 ,15.865789 , 22.076588 , 29.292158 , 22.841051 , 15.770392 ,36.680496 , 21.057947 , 20.137005 , 23.777853 , 22.70615  ,23.863268 , 15.595315 , 24.565872 , 17.720552 , 33.95111  ,18.784286 , 20.483374 , 37.10668  , 18.068268 , 12.73839  ,31.186407 , 45.895035 , 12.696718 , 10.773068 , 36.064293 ,26.262571 , 19.908836 , 20.715096 , 48.814903 , 27.550056 ,25.225826 , 17.15366  , 21.215551 , 17.426773 , 18.478971 ,14.6453705, 22.841473 , 18.869593 , 29.990978 , 29.933191 ,18.756853 , 18.784918 , 16.33361  , 23.155968 , 19.144344 ,29.724382 , 42.121906 , 31.544363 , 23.017508 , 19.536028 ,23.851992 , 41.790577 , 28.676506 , 20.036425 , 21.723856 ,19.537868 , 46.349495 , 23.119637 ,  8.071444 , 26.358177 ,24.85706  , 17.057547 , 20.084204 , 18.54005  ,  7.157663 ,20.593962 , 15.451031 , 45.09552  , 34.435097 , 22.969654 ,10.10335  , 10.803318 , 18.42058  ,  7.800361 , 11.79309  ,30.755335 , 10.80648  , 26.122625 , 22.589502 , 31.219454 ,42.283318 , 19.274109 ,  7.3861685, 23.055706 , 14.315018 ,45.136368 , 21.243176 , 19.715647 , 24.533583 , 18.24247  ,28.382742 , 23.41182  , 19.962458 , 45.916683 , 17.521889 ,24.13039  , 26.147182 , 18.418781 , 17.606575 , 14.540631 ,20.595512 , 32.59128  , 10.155618 , 20.53032  , 21.477484 ,17.450048 , 20.154486 ,  8.010227 , 30.482618 , 29.677181 ,20.357098 , 18.222181 , 14.14504  , 10.100547 , 18.85027  ,41.85804  , 17.44544  , 22.907183 , 21.02398  , 29.799366 ,20.219465 , 12.404763 , 45.750965 , 25.56757  , 22.000706 ,14.194921 , 27.102774 ], dtype=float32)
from sklearn.metrics import r2_score
r2_score(Ytest,preds)
0.9260984298390122
MSE(Ytest,preds)
6.87682821415069
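For orientation, the two interfaces put the same settings in different places: the native API takes hyperparameters through the params dict plus num_boost_round, while the sklearn wrapper takes constructor arguments (eta maps to learning_rate, num_round to n_estimators). A rough sketch of corresponding calls — note the two APIs have different defaults for unspecified parameters, so results will not match exactly:

# native API: settings in a dict, tree count as num_boost_round
bst = xgb.train({'eta':0.1, 'objective':'reg:linear'}, dtrain, num_boost_round=180)
# sklearn API: the same settings as constructor arguments
reg = XGBR(n_estimators=180, learning_rate=0.1).fit(Xtrain, Ytrain)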
import xgboost as xgb
#for convenience, use the full dataset
dfull = xgb.DMatrix(X,y)
#set the parameters
param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
num_round = 100
n_fold=5 #analogous to sklearn's KFold
#use the xgb.cv class
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:610364
#what did xgb.cv produce?
cvresult1 #how the model's performance changes as trees are added
train-rmse-mean train-rmse-std test-rmse-mean test-rmse-std
0 17.105578 0.129116 17.163215 0.584296
1 12.337973 0.097557 12.519736 0.473458
2 8.994071 0.065756 9.404534 0.472310
3 6.629481 0.050323 7.250335 0.500342
4 4.954406 0.033209 5.920812 0.591874
5 3.781454 0.029604 5.045190 0.687971
6 2.947767 0.038786 4.472030 0.686492
7 2.357748 0.042040 4.179314 0.737935
8 1.951907 0.044972 3.979878 0.798198
9 1.660895 0.044894 3.870751 0.812331
10 1.464296 0.049422 3.816196 0.835251
11 1.323362 0.056240 3.788125 0.841643
12 1.214468 0.046524 3.766973 0.848989
13 1.137311 0.044522 3.741199 0.872370
14 1.064629 0.042245 3.729194 0.879429
15 1.010286 0.038892 3.717997 0.879572
16 0.941258 0.038360 3.706736 0.878032
17 0.883599 0.056640 3.693886 0.873913
18 0.829674 0.057284 3.693296 0.883429
19 0.772332 0.042899 3.687510 0.880928
20 0.731557 0.049150 3.687037 0.879180
21 0.690698 0.041190 3.677507 0.882060
22 0.657743 0.042137 3.675343 0.883635
23 0.619988 0.054097 3.671006 0.879224
24 0.585414 0.052585 3.670951 0.867470
25 0.548723 0.054440 3.673598 0.863241
26 0.527266 0.049630 3.673988 0.867116
27 0.504405 0.040376 3.671702 0.864566
28 0.468534 0.033020 3.671324 0.862536
29 0.448633 0.032191 3.675074 0.864713
... ... ... ... ...
70 0.071057 0.015411 3.668067 0.859435
71 0.067946 0.013960 3.667708 0.859370
72 0.065197 0.012475 3.668174 0.859307
73 0.062789 0.012538 3.668738 0.859471
74 0.060294 0.012669 3.668950 0.860112
75 0.058278 0.012055 3.669084 0.859966
76 0.055402 0.011065 3.669627 0.859505
77 0.053819 0.011072 3.669904 0.859294
78 0.051280 0.011215 3.670185 0.859204
79 0.048748 0.009988 3.670092 0.859250
80 0.046972 0.009233 3.669869 0.858892
81 0.044753 0.008664 3.669702 0.858676
82 0.043148 0.008636 3.669704 0.858921
83 0.041823 0.008355 3.669596 0.858843
84 0.040257 0.008378 3.669730 0.858459
85 0.038518 0.007731 3.669835 0.858698
86 0.036694 0.006928 3.669705 0.858958
87 0.034932 0.006174 3.669722 0.858715
88 0.033947 0.006206 3.669964 0.858547
89 0.032706 0.006176 3.669988 0.858516
90 0.031317 0.006171 3.670116 0.858512
91 0.029697 0.005473 3.669930 0.858759
92 0.028561 0.005599 3.669906 0.858549
93 0.027585 0.005694 3.669822 0.858554
94 0.026436 0.005414 3.669985 0.858390
95 0.025204 0.005145 3.669921 0.858313
96 0.024422 0.005242 3.669983 0.858255
97 0.023661 0.005117 3.669947 0.858331
98 0.022562 0.004704 3.669868 0.858578
99 0.021496 0.004738 3.669824 0.858305

100 rows × 4 columns

plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()
#what can we read from this plot?
#how do we assess the model's generalization ability from it?
#from this plot's perspective, what should the tuning objective be?

[Figure: train/test rmse vs boosting rounds, gamma=0 (output_66_0.png)]

#what is xgboost's default evaluation metric for regression? (rmse, as the table above shows)
param1 = {'silent':True,'obj':'reg:linear',"gamma":0,"eval_metric":"mae"}
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)

plt.figure(figsize=(20,5))
plt.grid()
#the x-axis length must match the number of rounds actually run (num_round is still 100 here)
plt.plot(range(1,num_round+1),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,num_round+1),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()

[Figure: train/test mae vs boosting rounds, gamma=0 (output_68_0.png)]

param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
param2 = {'silent':True,'obj':'reg:linear',"gamma":20}
num_round = 180
n_fold=5

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:083104
time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:359378
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,181),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,181),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,181),cvresult2.iloc[:,0],c="green",label="train,gamma=20")
plt.plot(range(1,181),cvresult2.iloc[:,2],c="blue",label="test,gamma=20")
plt.legend()
plt.show()
#can you see how gamma controls overfitting? it restrains learning on the training set — i.e. it lowers training-set performance

[Figure: train/test rmse, gamma=0 vs gamma=20 (output_71_0.png)]
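The mechanism behind this: gamma is the minimum loss reduction required to keep a split. With G_L, H_L (and G_R, H_R) the sums of the loss's first and second derivatives over the left and right children, a candidate split survives only when

Gain = ½ [ G_L²/(H_L+λ) + G_R²/(H_R+λ) − (G_L+G_R)²/(H_L+H_R+λ) ] − γ > 0

so raising γ prunes more branches and deliberately lowers training-set performance, which is exactly what the gamma=20 curves show.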

import xgboost as xgb
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.datasets import load_breast_cancer
data2 = load_breast_cancer()
x2 = data2.data
y2 = data2.target

dfull2 = xgb.DMatrix(x2,y2)

param1 = {'silent':True,'obj':'binary:logistic',"gamma":0,"nfold":5,"eval_metric":"error"}
param2 = {'silent':True,'obj':'binary:logistic',"gamma":1,"nfold":5}
num_round = 100
time0 = time()
cvresult1 = xgb.cv(param1, dfull2, num_round,metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:271581
time0 = time()
cvresult2 = xgb.cv(param2, dfull2, num_round,metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:443810
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,101),cvresult2.iloc[:,0],c="green",label="train,gamma=1")
plt.plot(range(1,101),cvresult2.iloc[:,2],c="blue",label="test,gamma=1")
plt.legend()
plt.show()

[Figure: train/test error, gamma=0 vs gamma=1 (output_76_0.png)]

dfull = xgb.DMatrix(X,y)

param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")
ax.legend(fontsize="xx-large")
plt.show()
00:00:513584

[Figure: train/test rmse with default parameters (output_78_1.png)]

param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")

param2 = {'silent':True
          ,'obj':'reg:linear'
          ,"max_depth":2
          ,"eta":0.05
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":0.4
          ,"colsample_bynode":1
          ,"nfold":5}

param3 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"eta":0.05
          ,"gamma":20
          ,"lambda":3.5
          ,"alpha":0.2
          ,"max_depth":4
          ,"colsample_bytree":0.4
          ,"colsample_bylevel":0.6
          ,"colsample_bynode":1
          ,"nfold":5}

time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

time0 = time()
cvresult3 = xgb.cv(param3, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

ax.plot(range(1,201),cvresult2.iloc[:,0],c="green",label="train,last")
ax.plot(range(1,201),cvresult2.iloc[:,2],c="blue",label="test,last")
ax.plot(range(1,201),cvresult3.iloc[:,0],c="gray",label="train,this")
ax.plot(range(1,201),cvresult3.iloc[:,2],c="pink",label="test,this")
ax.legend(fontsize="xx-large")
plt.show()
00:00:532621
00:00:223373
00:00:259346

[Figure: original vs previous vs current tuned parameter curves (output_79_1.png)]

import pickle
dtrain = xgb.DMatrix(Xtrain,Ytrain)

#set the parameters and train the model
param = {'silent':True
         ,'obj':'reg:linear'
         ,"subsample":1
         ,"eta":0.05
         ,"gamma":20
         ,"lambda":3.5
         ,"alpha":0.2
         ,"max_depth":4
         ,"colsample_bytree":0.4
         ,"colsample_bylevel":0.6
         ,"colsample_bynode":1}
num_round = 180

bst = xgb.train(param, dtrain, num_round)
#save the model
pickle.dump(bst, open("xgboostonboston.dat","wb"))
#note: with open we usually use "w" or "r" as the mode, but those only work for text files (txt)
#to save something that is not text — the model object itself — use "wb" and "rb" instead
#"wb" writes in binary and "rb" reads in binary; the file saved this way holds a model that can be loaded and called directly
#where did the model get saved?
import sys
sys.path
['C:\\Pythonwork\\micro-class\\11 xgboost','C:\\Python\\python37.zip','C:\\Python\\DLLs','C:\\Python\\lib','C:\\Python','','C:\\Python\\lib\\site-packages','C:\\Python\\lib\\site-packages\\win32','C:\\Python\\lib\\site-packages\\win32\\lib','C:\\Python\\lib\\site-packages\\Pythonwin','C:\\Python\\lib\\site-packages\\IPython\\extensions','C:\\Users\\Shuyu\\.ipython']
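Strictly speaking, pickle.dump with a relative filename writes into the current working directory; sys.path just happens to list the notebook directory first. The working directory can be queried directly:

import os
os.getcwd()  # the folder "xgboostonboston.dat" was written to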
#after restarting jupyter lab:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb

data = load_boston()
X = data.data
y = data.target

Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
#note: if the saved model was built with the native xgboost library, the data we feed it must also be xgboost's own data type
dtest = xgb.DMatrix(Xtest,Ytest)
#load the model
loaded_model = pickle.load(open("xgboostonboston.dat", "rb"))
print("Loaded model from: xgboostonboston.dat")
Loaded model from: xgboostonboston.dat
#predict by calling the predict interface directly
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,16.924698, 22.633028], dtype=float32)
from sklearn.metrics import mean_squared_error as MSE, r2_score
MSE(Ytest,ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
bst = xgb.train(param, dtrain, num_round)
import joblib
#again, you can check where the model file was written
joblib.dump(bst,"xgboost-boston.dat")
['xgboost-boston.dat']
loaded_model = joblib.load("xgboost-boston.dat")
dtest = xgb.DMatrix(Xtest,Ytest)
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,16.924698, 22.633028], dtype=float32)
MSE(Ytest, ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
#using the model from sklearn's API
from xgboost import XGBRegressor as XGBR

bst = XGBR(n_estimators=200
           ,eta=0.05
           ,gamma=20
           ,reg_lambda=3.5
           ,reg_alpha=0.2
           ,max_depth=4
           ,colsample_bytree=0.4
           ,colsample_bylevel=0.6).fit(Xtrain,Ytrain) #trained
joblib.dump(bst,"xgboost-boston-sklearn.dat")
['xgboost-boston-sklearn.dat']
loaded_model = joblib.load("xgboost-boston-sklearn.dat")
#here Xtest can be passed in directly — it is a plain numpy array
ypreds = loaded_model.predict(Xtest)
Xtest
array([[4.15292e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,3.29460e+02, 2.73800e+01],[2.73100e-02, 0.00000e+00, 7.07000e+00, ..., 1.78000e+01,3.96900e+02, 9.14000e+00],[3.15000e-02, 9.50000e+01, 1.47000e+00, ..., 1.70000e+01,3.96900e+02, 4.56000e+00],...,[5.08300e-02, 0.00000e+00, 5.19000e+00, ..., 2.02000e+01,3.89710e+02, 5.68000e+00],[3.77498e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,2.20100e+01, 1.71500e+01],[1.96091e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,3.96900e+02, 1.34400e+01]])
dtest
<xgboost.core.DMatrix at 0x29e30670668>
ypreds
array([ 9.350334 , 21.501623 , 30.219057 , 13.021226 ,  9.883689 ,20.977922 , 16.023008 , 15.8910475, 15.512305 , 15.706607 ,22.096102 , 35.381573 , 20.3307   , 27.129421 , 19.997156 ,10.935587 , 20.25071  , 26.188572 , 26.711943 , 22.600443 ,18.23832  , 15.876045 , 26.263977 , 22.706024 , 20.18491  ,15.891692 , 21.4781   , 29.047956 , 23.371012 , 17.167185 ,35.699898 , 20.490337 , 20.195292 , 23.81444  , 23.106022 ,25.709312 , 15.0182905, 22.621248 , 18.576109 , 34.25664  ,17.46115  , 19.159126 , 34.79234  , 17.766731 , 17.141891 ,27.755646 , 39.786766 , 22.49913  , 10.246634 , 36.76105  ,26.294876 , 20.75917  , 19.893272 , 46.62629  , 26.549704 ,24.040398 , 17.769514 , 20.76889  , 16.139618 , 17.494894 ,16.005596 , 24.28487  , 19.15237  , 31.407684 , 27.862312 ,18.877817 , 20.50497  , 16.094156 , 22.622025 , 17.762297 ,28.518019 , 41.146317 , 32.52681  , 23.117966 , 19.125128 ,24.141544 , 39.041847 , 25.901724 , 20.974117 , 19.626917 ,18.567612 , 46.46465  , 23.03303  ,  9.912106 , 26.407642 ,23.466772 , 16.985506 , 20.73746  , 15.679997 , 11.697191 ,21.320868 , 20.333689 , 41.616425 , 31.659132 , 25.605923 ,12.362759 , 14.593165 , 20.577328 ,  9.253377 , 11.1253805,32.878246 , 15.840851 , 24.695955 , 24.882996 , 34.643425 ,41.556873 , 19.726238 ,  8.808649 , 23.04128  , 14.709186 ,46.10303  , 21.435535 , 21.97892  , 24.299171 , 19.591938 ,27.527737 , 23.80468  , 18.782711 , 44.266346 , 17.328068 ,23.030151 , 23.801643 , 16.483137 , 18.219353 , 15.713125 ,23.655058 , 32.294373 , 10.60579  , 22.099716 , 19.26955  ,14.293162 , 19.386055 ,  8.824598 , 26.909697 , 29.539446 ,20.38691  , 20.832077 , 22.507433 , 11.142808 , 17.685743 ,40.230915 , 17.526121 , 23.09964  , 19.899158 , 31.775164 ,19.718151 , 12.164877 , 40.867558 , 24.465397 , 22.134802 ,15.041253 , 28.63522  ], dtype=float32)
MSE(Ytest, ypreds)
10.198269690947479
r2_score(Ytest,ypreds)
0.8904046866351292
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs #a synthetic dataset
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
class_1 = 500 #class 1 has 500 samples
class_2 = 50  #class 2 has only 50
centers = [[0.0, 0.0], [2.0, 2.0]] #the centers of the two classes
clusters_std = [1.5, 0.5] #class standard deviations; the larger class is usually the more spread out one
X, y = make_blobs(n_samples=[class_1, class_2],centers=centers,cluster_std=clusters_std,random_state=0, shuffle=False)
X.shape
(550, 2)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
(y == 1).sum() / y.shape[0] #the positive class is about 9% of the data
0.09090909090909091
Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420)
#build the model with sklearn's API
clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)
ypred
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
clf.score(Xtest,Ytest) #the default evaluation metric: accuracy
0.9272727272727272
cm(Ytest,ypred,labels=[1,0]) #minority class first: with labels=[1,0] the layout is [[TP, FN], [FP, TN]]
array([[  9,   4],
       [  8, 144]], dtype=int64)
recall(Ytest,ypred)
0.6923076923076923
auc(Ytest,clf.predict_proba(Xtest)[:,1])
0.9671052631578947
#the negative-to-positive sample ratio
clf_ = XGBC(scale_pos_weight=10).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
cm(Ytest,ypred_,labels=[1,0])
recall(Ytest,ypred_)
auc(Ytest,clf_.predict_proba(Xtest)[:,1]) #only this last expression is displayed below
0.9696356275303644
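A common heuristic for scale_pos_weight is the ratio of negative to positive samples in the training data; a quick sketch computed from Ytrain:

# sum(negative) / sum(positive): roughly 10, given the 500:50 design of this dataset
(Ytrain == 0).sum() / (Ytrain == 1).sum()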
#how do accuracy, recall and AUC change as the positive-class weight grows?
for i in [1,5,10,20,30]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain,Ytrain)
    ypred_ = clf_.predict(Xtest)
    print(i)
    print("\tAccuracy:{}".format(clf_.score(Xtest,Ytest)))
    print("\tRecall:{}".format(recall(Ytest,ypred_)))
    print("\tAUC:{}".format(auc(Ytest,clf_.predict_proba(Xtest)[:,1])))
1
    Accuracy:0.9272727272727272
    Recall:0.6923076923076923
    AUC:0.9671052631578947
5
    Accuracy:0.9454545454545454
    Recall:0.9230769230769231
    AUC:0.9665991902834008
10
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9696356275303644
20
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9706477732793523
30
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9701417004048584
#the negative-to-positive sample ratio
clf_ = XGBC(scale_pos_weight=20).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
0.9515151515151515
cm(Ytest,ypred_,labels=[1,0])
array([[ 13,   0],
       [  8, 144]], dtype=int64)
recall(Ytest,ypred_)
1.0
auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9706477732793523
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
#look at the native xgboost library's predict interface
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
#what did preds return?
preds
array([0.00110357, 0.00761518, 0.00110357, 0.00110357, 0.93531454,0.00466839, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00410493, 0.00454478, 0.00571528, 0.00751026,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00110357, 0.00793251, 0.00466839,0.00110357, 0.00339395, 0.00657186, 0.00110357, 0.00457053,0.00571528, 0.0026763 , 0.00110357, 0.00110357, 0.00110357,0.00884932, 0.00712637, 0.00110357, 0.00712637, 0.00466839,0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.63748044, 0.00110357, 0.00793251,0.00110357, 0.00451971, 0.00644181, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00751026, 0.00712637, 0.00110357,0.00866458, 0.00110357, 0.00110357, 0.00110357, 0.91610426,0.00110357, 0.00110357, 0.89246494, 0.0026763 , 0.00501714,0.00761518, 0.00884932, 0.00339395, 0.00110357, 0.93531454,0.00110357, 0.00110357, 0.00110357, 0.82530665, 0.00751026,0.00110357, 0.35174078, 0.00110357, 0.00110357, 0.70393246,0.00110357, 0.76804197, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.96656513, 0.00110357, 0.00571528, 0.25400913,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00457053,0.00110357, 0.00110357, 0.00110357, 0.89246494, 0.00110357,0.9518535 , 0.0026763 , 0.00712637, 0.00110357, 0.00501714,0.00110357, 0.00110357, 0.00571528, 0.00110357, 0.00110357,0.00712637, 0.00110357, 0.00110357, 0.00712637, 0.00110357,0.25136763, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.8904051 , 0.3876418 , 0.00110357, 0.00457053,0.00657186, 0.9366597 , 0.00866458, 0.00110357, 0.00501714,0.00501714, 0.00110357, 0.00110357, 0.00368543, 0.00501714,0.9830577 , 0.00110357, 0.00644181, 0.00110357, 0.00571528,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00466839,0.00110357, 0.00110357, 0.92388713, 0.90231985, 0.80084217],dtype=float32)
#set the classification threshold ourselves
ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.], dtype=float32)
ypred[ypred != 1] = 0
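The two masking steps above can be collapsed into one vectorized comparison; an equivalent sketch:

ypred = (preds > 0.5).astype(np.float32)  # same result as copy-and-mask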
#spell out the parameters
scale_pos_weight = [1,5,10]
names = ["negative vs positive: 1","negative vs positive: 5","negative vs positive: 10"]
[*zip(names,scale_pos_weight)]
[('negative vs positive: 1', 1),('negative vs positive: 5', 5),('negative vs positive: 10', 10)]
#import the evaluation metrics
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

for name,i in zip(names,scale_pos_weight):
    param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":i}
    num_round = 100
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
    print("\tRecall:{}".format(recall(Ytest,ypred)))
    print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1
    Accuracy:0.9272727272727272
    Recall:0.6923076923076923
    AUC:0.9741902834008097
negative vs positive: 5
    Accuracy:0.9393939393939394
    Recall:0.8461538461538461
    AUC:0.9635627530364372
negative vs positive: 10
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9665991902834008
#we can also try different thresholds
for name,i in zip(names,scale_pos_weight):
    for thres in [0.3,0.5,0.7,0.9]:
        param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":i}
        clf = xgb.train(param, dtrain, num_round)
        preds = clf.predict(dtest)
        ypred = preds.copy()
        ypred[preds > thres] = 1
        ypred[ypred != 1] = 0
        print("{},thresholds:{}".format(name,thres))
        print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
        print("\tRecall:{}".format(recall(Ytest,ypred)))
        print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1,thresholds:0.3
    Accuracy:0.9393939393939394
    Recall:0.8461538461538461
    AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.5
    Accuracy:0.9272727272727272
    Recall:0.6923076923076923
    AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.7
    Accuracy:0.9212121212121213
    Recall:0.6153846153846154
    AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.9
    Accuracy:0.9515151515151515
    Recall:0.5384615384615384
    AUC:0.9741902834008097
negative vs positive: 5,thresholds:0.3
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.5
    Accuracy:0.9393939393939394
    Recall:0.8461538461538461
    AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.7
    Accuracy:0.9272727272727272
    Recall:0.6923076923076923
    AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.9
    Accuracy:0.9212121212121213
    Recall:0.6153846153846154
    AUC:0.9635627530364372
negative vs positive: 10,thresholds:0.3
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.5
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.7
    Accuracy:0.9393939393939394
    Recall:0.8461538461538461
    AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.9
    Accuracy:0.9212121212121213
    Recall:0.6153846153846154
    AUC:0.9665991902834008
