XGBoost Learning Notes

from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
data = load_boston()
#The Boston housing dataset is very simple, but the questions it raises are numerous
data
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,4.9800e+00],[2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,9.1400e+00],[2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,4.0300e+00],...,[6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,5.6400e+00],[1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,6.4800e+00],[4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,7.8800e+00]]),'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. 
, 11.9]),'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD','TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",'filename': 'f:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
X = data.data
y = data.target
X.shape
(506, 13)
y.shape
(506,)
y
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain,Ytrain) #train
reg.predict(Xtest) #the classic predict interface
reg.score(Xtest,Ytest) #which evaluation metric does this return? Shift+Tab shows the docstring: it is R^2
y.mean()
MSE(Ytest,reg.predict(Xtest)) #the MSE is about 1/3 of y.mean() — neither clearly good nor clearly bad
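For reference, the R^2 that reg.score returns is 1 - SS_res/SS_tot. A minimal sketch verifying this by hand, reusing reg, Xtest and Ytest from above:

yhat = reg.predict(Xtest)
ss_res = ((Ytest - yhat) ** 2).sum()          # residual sum of squares
ss_tot = ((Ytest - Ytest.mean()) ** 2).sum()  # total sum of squares
1 - ss_res / ss_tot                           # matches reg.score(Xtest,Ytest)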
reg.feature_importances_ #one strength of tree models: feature importance scores, usable for embedded feature selection (SelectFromModel)
#xgboost can therefore do embedded feature selection; see the sketch below
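A minimal sketch of embedded selection with SelectFromModel; the "median" threshold is an illustrative choice, not part of the original run:

from sklearn.feature_selection import SelectFromModel
# keep only the features whose importance reaches the median importance
selector = SelectFromModel(XGBR(n_estimators=100), threshold="median").fit(Xtrain,Ytrain)
Xtrain_selected = selector.transform(Xtrain)
Xtrain_selected.shape  # expect roughly half of the 13 columns to survive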
reg = XGBR(n_estimators=100) #an untrained model to feed into cross-validation
CVS(reg,Xtrain,Ytrain,cv=5).mean()
#which evaluation metric does this return — remember? The same one as reg.score: R^2 for regression, accuracy for classification
0.8017863029875325
#rigorous vs. loose cross-validation: cross-validate on the training set only, or on the full data? Using only the training set keeps the test set as a genuinely unseen holdout.
array([0.83340801, 0.77096033, 0.83473392, 0.80424149, 0.76558778])
#rigorous vs. loose
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
-16.041115480238048
#list all the model-evaluation scorers available in sklearn
import sklearn
sorted(sklearn.metrics.SCORERS.keys())
['accuracy','adjusted_mutual_info_score','adjusted_rand_score','average_precision','balanced_accuracy','brier_score_loss','completeness_score','explained_variance','f1','f1_macro','f1_micro','f1_samples','f1_weighted','fowlkes_mallows_score','homogeneity_score','mutual_info_score','neg_log_loss','neg_mean_absolute_error','neg_mean_squared_error','neg_mean_squared_log_error','neg_median_absolute_error','normalized_mutual_info_score','precision','precision_macro','precision_micro','precision_samples','precision_weighted','r2','recall','recall_macro','recall_micro','recall_samples','recall_weighted','roc_auc','v_measure_score']
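Every loss-like metric in this list carries a neg_ prefix because sklearn scorers are always maximized; flip the sign to recover the familiar positive loss:

# neg_mean_squared_error is simply -MSE
-CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()  # about 16.04, cf. the cell above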
#compare against random forest and linear regression
rfr = RFR(n_estimators=100)
CVS(rfr,Xtrain,Ytrain,cv=5).mean()#0.7975497480638329
0.7975497480638329
CVS(rfr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-16.998723616338033
-16.998723616338033
lr = LinearR()
CVS(lr,Xtrain,Ytrain,cv=5).mean()#0.6835070597278085
0.6835070597278085
CVS(lr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-25.34950749364844
-25.34950749364844
#the silent parameter: on huge datasets, when training is expected to be very slow, progress printing lets you monitor the run
reg = XGBR(n_estimators=10,silent=True) #in the native xgboost library, silent=True suppresses the training log and only returns results; the default False prints progress
#in the sklearn wrapper, silent defaults to True (no log); set it to False manually if you want the progress printed
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-92.67865836936579
-92.67865836936579
def plot_learning_curve(estimator, title, X, y,
                        ax=None,    # axes to draw on
                        ylim=None,  # y-axis limits
                        cv=None,    # cross-validation strategy
                        n_jobs=None # number of worker threads
                        ):
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            ,shuffle=True
                                                            ,cv=cv
                                                            ,random_state=420
                                                            ,n_jobs=n_jobs)
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid() # the grid is optional
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color="r", label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color="g", label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=5, shuffle = True, random_state=42) #the cross-validation splitter
plot_learning_curve(XGBR(n_estimators=100,random_state=420),"XGB",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()

[Figure: learning curve for XGB (output_28_0.png)]

#=====[TIME WARNING: 25 seconds]=====#
axisx = range(10,1010,50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915

[Figure: mean CV R^2 vs n_estimators, 10-1000 (output_29_1.png)]

#the selected n_estimators (660) is unusually large — should we simply pick the value with the highest score?
#======[TIME WARNING: 20s]=======#
axisx = range(50,1050,50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    #record 1 - bias (the mean R^2 across folds)
    rs.append(cvresult.mean())
    #record the variance across folds
    var.append(cvresult.var())
    #compute the controllable part of the generalization error
    ge.append((1 - cvresult.mean())**2+cvresult.var())
#print the n_estimators with the highest R^2, and the variance at that point
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#print the n_estimators with the lowest variance, and the R^2 at that point
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#print the n_estimators minimizing the controllable generalization error, with its R^2, variance, and that error
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411

[Figure: mean CV R^2 vs n_estimators, 50-1000 (output_31_1.png)]
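The ge quantity tracked above is the controllable part of the generalization error. With the mean cross-validated R^2 standing in for accuracy, the decomposition being approximated is

E(f;D) ≈ (1 - R²)² + Var(R²_cv) + noise

where (1 - R²)² plays the role of bias², the fold-to-fold variance plays the role of variance, and the noise term is beyond our control — which is why only the first two terms are minimized here.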

axisx = range(100,300,10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#add the variance band lines
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314

[Figure: mean CV R^2 with variance bands vs n_estimators, 100-300 (output_32_1.png)]

#how does the controllable part of the generalization error behave?
plt.figure(figsize=(20,5))
plt.plot(axisx,ge,c="gray",linestyle='-.')
plt.show()

[Figure: controllable generalization error vs n_estimators (output_33_0.png)]

#verify: did the chosen n_estimators actually improve test performance?
time0 = time()
print(XGBR(n_estimators=100,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9197580267581366
0.0787498950958252
time0 = time()
print(XGBR(n_estimators=660,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9208745746309475
0.36807847023010254
time0 = time()
print(XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9231068620728082
0.12366437911987305
axisx = np.linspace(0,1,20)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="green",label="XGB")
plt.legend()
plt.show()
0.7368421052631579 0.837609040251761

[Figure: mean CV R^2 vs subsample, 0-1 (output_37_1.png)]

#refine the learning curve further
axisx = np.linspace(0.05,1,20)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.65 0.8302530801197368 0.008708816667924316
0.7999999999999999 0.8277414964661117 0.007159903723250457
0.7999999999999999 0.8277414964661117 0.007159903723250457 0.036832895762985055

[Figure: mean CV R^2 with variance bands vs subsample, 0.05-1 (output_38_1.png)]

#zoom in further on the learning curve
axisx = np.linspace(0.75,1,25)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093 0.033300928468131166

[Figure: mean CV R^2 with variance bands vs subsample, 0.75-1 (output_39_1.png)]

#try the subsample value picked above; as the scores below show, it slightly hurts on the test set — the dataset is too small to benefit from row sampling
reg = XGBR(n_estimators=180
           ,subsample=0.7708333333333334
           ,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)
0.9159462982185405
MSE(Ytest,reg.predict(Xtest))
7.821523502888769
#first define a scoring helper that prints cross-validation results on Xtrain directly
def regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2"],show=True):
    score = []
    for i in range(len(scoring)):
        if show:
            print("{}:{:.2f}".format(scoring[i] #the name of the evaluation metric
                                     ,CVS(reg,Xtrain,Ytrain,cv=cv,scoring=scoring[i]).mean()))
        score.append(CVS(reg,Xtrain,Ytrain,cv=cv,scoring=scoring[i]).mean())
    return score
reg = XGBR(n_estimators=180,random_state=420)
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
r2:0.80
neg_mean_squared_error:-13.48
[0.8038787848970184, -13.482301822063182]
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
[0.8038787848970184, -13.482301822063182]
from time import time
import datetime

for i in [0,0.2,0.5,1]:
    time0=time()
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    print("learning_rate = {}".format(i))
    regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:561781
learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.32
00:01:848888
learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.24
00:01:541875
learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:499027
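What learning_rate (eta) is doing: each new tree's contribution is shrunk by η before it is added, so after round t the prediction is

ŷᵢ⁽ᵗ⁾ = ŷᵢ⁽ᵗ⁻¹⁾ + η·fₜ(xᵢ),  η ∈ [0,1]

With η = 0 nothing is learned beyond the base score (hence the r2 of -6.76 above), while a small positive η needs more rounds to reach the same training loss — exactly the accuracy/time trade-off visible in the timings.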
axisx = np.arange(0.05,1,0.05)
rs = []
te = []
for i in axisx:
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    score = regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
    test = reg.fit(Xtrain,Ytrain).score(Xtest,Ytest)
    rs.append(score[0])
    te.append(test)
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,te,c="gray",label="test")
plt.plot(axisx,rs,c="green",label="train")
plt.legend()
plt.show()
0.55 0.8125604372670463

[Figure: train (CV) vs test R^2 across learning_rate (output_47_1.png)]

for booster in ["gbtree","gblinear","dart"]:
    reg = XGBR(n_estimators=180,learning_rate=0.1,random_state=420,booster=booster).fit(Xtrain,Ytrain)
    print(booster)
    print(reg.score(Xtest,Ytest))
gbtree
0.9231068620728082
gblinear
0.6286510307485139
dart
0.923106843149575
#the default objective is reg:linear (renamed reg:squarederror in later xgboost versions)
reg = XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest, Ytest)
0.9231068620728082
MSE(Ytest,reg.predict(Xtest))
7.155205217161047
#the native xgboost implementation
import xgboost as xgb
#read the data with the DMatrix class
dtrain = xgb.DMatrix(Xtrain,Ytrain) #pass in the feature matrix and the labels together
dtest = xgb.DMatrix(Xtest,Ytest)
#unfortunately a DMatrix cannot be opened for inspection, so the usual workflow is to examine the data in pandas first and only then wrap it in a DMatrix
dtrain
<xgboost.core.DMatrix at 0x2770de3bdd8>
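Although the DMatrix cannot be printed like an array, it does expose basic shape queries:

dtrain.num_row(), dtrain.num_col()  # (354, 13), the same shape as Xtrain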
import pandas as pd
pd.DataFrame(Xtrain)
0 1 2 3 4 5 6 7 8 9 10 11 12
0 0.03041 0.0 5.19 0.0 0.515 5.895 59.6 5.6150 5.0 224.0 20.2 394.81 10.56
1 0.04113 25.0 4.86 0.0 0.426 6.727 33.5 5.4007 4.0 281.0 19.0 396.90 5.29
2 10.23300 0.0 18.10 0.0 0.614 6.185 96.7 2.1705 24.0 666.0 20.2 379.70 18.03
3 0.17142 0.0 6.91 0.0 0.448 5.682 33.8 5.1004 3.0 233.0 17.9 396.90 10.21
4 0.05059 0.0 4.49 0.0 0.449 6.389 48.0 4.7794 3.0 247.0 18.5 396.90 9.62
5 0.13587 0.0 10.59 1.0 0.489 6.064 59.1 4.2392 4.0 277.0 18.6 381.32 14.66
6 0.04981 21.0 5.64 0.0 0.439 5.998 21.4 6.8147 4.0 243.0 16.8 396.90 8.43
7 0.02543 55.0 3.78 0.0 0.484 6.696 56.4 5.7321 5.0 370.0 17.6 396.90 7.18
8 0.10793 0.0 8.56 0.0 0.520 6.195 54.4 2.7778 5.0 384.0 20.9 393.49 13.00
9 0.02498 0.0 1.89 0.0 0.518 6.540 59.7 6.2669 1.0 422.0 15.9 389.96 8.65
10 0.09299 0.0 25.65 0.0 0.581 5.961 92.9 2.0869 2.0 188.0 19.1 378.09 17.93
11 0.15876 0.0 10.81 0.0 0.413 5.961 17.5 5.2873 4.0 305.0 19.2 376.94 9.88
12 6.71772 0.0 18.10 0.0 0.713 6.749 92.6 2.3236 24.0 666.0 20.2 0.32 17.44
13 0.03768 80.0 1.52 0.0 0.404 7.274 38.3 7.3090 2.0 329.0 12.6 392.20 6.62
14 5.20177 0.0 18.10 1.0 0.770 6.127 83.4 2.7227 24.0 666.0 20.2 395.43 11.48
15 11.08740 0.0 18.10 0.0 0.718 6.411 100.0 1.8589 24.0 666.0 20.2 318.75 15.02
16 0.11432 0.0 8.56 0.0 0.520 6.781 71.3 2.8561 5.0 384.0 20.9 395.58 7.67
17 0.05602 0.0 2.46 0.0 0.488 7.831 53.6 3.1992 3.0 193.0 17.8 392.63 4.45
18 0.24103 0.0 7.38 0.0 0.493 6.083 43.7 5.4159 5.0 287.0 19.6 396.90 12.79
19 0.09378 12.5 7.87 0.0 0.524 5.889 39.0 5.4509 5.0 311.0 15.2 390.50 15.71
20 8.71675 0.0 18.10 0.0 0.693 6.471 98.8 1.7257 24.0 666.0 20.2 391.98 17.12
21 7.36711 0.0 18.10 0.0 0.679 6.193 78.1 1.9356 24.0 666.0 20.2 96.73 21.52
22 1.38799 0.0 8.14 0.0 0.538 5.950 82.0 3.9900 4.0 307.0 21.0 232.60 27.71
23 14.33370 0.0 18.10 0.0 0.614 6.229 88.0 1.9512 24.0 666.0 20.2 383.32 13.11
24 28.65580 0.0 18.10 0.0 0.597 5.155 100.0 1.5894 24.0 666.0 20.2 210.97 20.08
25 0.80271 0.0 8.14 0.0 0.538 5.456 36.6 3.7965 4.0 307.0 21.0 288.99 11.69
26 1.00245 0.0 8.14 0.0 0.538 6.674 87.3 4.2390 4.0 307.0 21.0 380.23 11.98
27 9.91655 0.0 18.10 0.0 0.693 5.852 77.8 1.5004 24.0 666.0 20.2 338.16 29.97
28 0.13158 0.0 10.01 0.0 0.547 6.176 72.5 2.7301 6.0 432.0 17.8 393.30 12.04
29 0.14231 0.0 10.01 0.0 0.547 6.254 84.2 2.2565 6.0 432.0 17.8 388.74 10.45
... ... ... ... ... ... ... ... ... ... ... ... ... ...
324 0.13117 0.0 8.56 0.0 0.520 6.127 85.2 2.1224 5.0 384.0 20.9 387.69 14.09
325 1.35472 0.0 8.14 0.0 0.538 6.072 100.0 4.1750 4.0 307.0 21.0 376.73 13.04
326 0.10153 0.0 12.83 0.0 0.437 6.279 74.5 4.0522 5.0 398.0 18.7 373.66 11.97
327 0.22927 0.0 6.91 0.0 0.448 6.030 85.5 5.6894 3.0 233.0 17.9 392.74 18.80
328 0.04666 80.0 1.52 0.0 0.404 7.107 36.6 7.3090 2.0 329.0 12.6 354.31 8.61
329 0.08014 0.0 5.96 0.0 0.499 5.850 41.5 3.9342 5.0 279.0 19.2 396.90 8.77
330 0.40771 0.0 6.20 1.0 0.507 6.164 91.3 3.0480 8.0 307.0 17.4 395.24 21.46
331 0.13642 0.0 10.59 0.0 0.489 5.891 22.3 3.9454 4.0 277.0 18.6 396.90 10.87
332 9.32909 0.0 18.10 0.0 0.713 6.185 98.7 2.2616 24.0 666.0 20.2 396.90 18.13
333 0.09103 0.0 2.46 0.0 0.488 7.155 92.2 2.7006 3.0 193.0 17.8 394.12 4.82
334 0.01301 35.0 1.52 0.0 0.442 7.241 49.3 7.0379 1.0 284.0 15.5 394.74 5.49
335 0.59005 0.0 21.89 0.0 0.624 6.372 97.9 2.3274 4.0 437.0 21.2 385.76 11.12
336 1.12658 0.0 19.58 1.0 0.871 5.012 88.0 1.6102 5.0 403.0 14.7 343.28 12.12
337 0.07886 80.0 4.95 0.0 0.411 7.148 27.7 5.1167 4.0 245.0 19.2 396.90 3.56
338 0.21719 0.0 10.59 1.0 0.489 5.807 53.8 3.6526 4.0 277.0 18.6 390.94 16.03
339 0.53700 0.0 6.20 0.0 0.504 5.981 68.1 3.6715 8.0 307.0 17.4 378.35 11.65
340 3.32105 0.0 19.58 1.0 0.871 5.403 100.0 1.3216 5.0 403.0 14.7 396.90 26.82
341 1.49632 0.0 19.58 0.0 0.871 5.404 100.0 1.5916 5.0 403.0 14.7 341.60 13.28
342 0.38735 0.0 25.65 0.0 0.581 5.613 95.6 1.7572 2.0 188.0 19.1 359.29 27.26
343 0.06617 0.0 3.24 0.0 0.460 5.868 25.8 5.2146 4.0 430.0 16.9 382.44 9.97
344 0.78570 20.0 3.97 0.0 0.647 7.014 84.6 2.1329 5.0 264.0 13.0 384.07 14.79
345 1.41385 0.0 19.58 1.0 0.871 6.129 96.0 1.7494 5.0 403.0 14.7 321.02 15.12
346 0.06047 0.0 2.46 0.0 0.488 6.153 68.8 3.2797 3.0 193.0 17.8 387.11 13.15
347 8.49213 0.0 18.10 0.0 0.584 6.348 86.1 2.0527 24.0 666.0 20.2 83.45 17.64
348 0.17134 0.0 10.01 0.0 0.547 5.928 88.2 2.4631 6.0 432.0 17.8 344.91 15.76
349 0.03871 52.5 5.32 0.0 0.405 6.209 31.3 7.3172 6.0 293.0 16.6 396.90 7.14
350 0.12650 25.0 5.13 0.0 0.453 6.762 43.4 7.9809 8.0 284.0 19.7 395.58 9.50
351 6.96215 0.0 18.10 0.0 0.700 5.713 97.0 1.9265 24.0 666.0 20.2 394.43 17.11
352 0.09164 0.0 10.81 0.0 0.413 6.065 7.8 5.2873 4.0 305.0 19.2 390.91 5.52
353 5.58107 0.0 18.10 0.0 0.713 6.436 87.9 2.3158 24.0 666.0 20.2 100.19 16.22

354 rows × 13 columns

#spell out the parameters
param = {'silent':True  #defaults to False; usually turned off manually
         ,'objective':'reg:linear'
         ,"eta":0.1}
num_round = 180 #the number of boosting rounds, i.e. n_estimators
#xgb.train takes the training data and the number of trees directly; all other parameters go in through params
bst = xgb.train(param, dtrain, num_round)
#the predict interface
preds = bst.predict(dtest)
preds
array([ 6.4613175, 22.123888 , 30.755163 , 13.424351 ,  8.378565 ,23.608477 , 14.2151165, 16.026499 , 15.498961 , 14.10649  ,24.030867 , 34.36362  , 21.461111 , 28.839497 , 19.568035 ,10.188658 , 19.42369  , 23.539951 , 22.850523 , 23.198708 ,17.82486  , 16.07219  , 27.602034 , 20.773046 , 20.868807 ,15.865789 , 22.076588 , 29.292158 , 22.841051 , 15.770392 ,36.680496 , 21.057947 , 20.137005 , 23.777853 , 22.70615  ,23.863268 , 15.595315 , 24.565872 , 17.720552 , 33.95111  ,18.784286 , 20.483374 , 37.10668  , 18.068268 , 12.73839  ,31.186407 , 45.895035 , 12.696718 , 10.773068 , 36.064293 ,26.262571 , 19.908836 , 20.715096 , 48.814903 , 27.550056 ,25.225826 , 17.15366  , 21.215551 , 17.426773 , 18.478971 ,14.6453705, 22.841473 , 18.869593 , 29.990978 , 29.933191 ,18.756853 , 18.784918 , 16.33361  , 23.155968 , 19.144344 ,29.724382 , 42.121906 , 31.544363 , 23.017508 , 19.536028 ,23.851992 , 41.790577 , 28.676506 , 20.036425 , 21.723856 ,19.537868 , 46.349495 , 23.119637 ,  8.071444 , 26.358177 ,24.85706  , 17.057547 , 20.084204 , 18.54005  ,  7.157663 ,20.593962 , 15.451031 , 45.09552  , 34.435097 , 22.969654 ,10.10335  , 10.803318 , 18.42058  ,  7.800361 , 11.79309  ,30.755335 , 10.80648  , 26.122625 , 22.589502 , 31.219454 ,42.283318 , 19.274109 ,  7.3861685, 23.055706 , 14.315018 ,45.136368 , 21.243176 , 19.715647 , 24.533583 , 18.24247  ,28.382742 , 23.41182  , 19.962458 , 45.916683 , 17.521889 ,24.13039  , 26.147182 , 18.418781 , 17.606575 , 14.540631 ,20.595512 , 32.59128  , 10.155618 , 20.53032  , 21.477484 ,17.450048 , 20.154486 ,  8.010227 , 30.482618 , 29.677181 ,20.357098 , 18.222181 , 14.14504  , 10.100547 , 18.85027  ,41.85804  , 17.44544  , 22.907183 , 21.02398  , 29.799366 ,20.219465 , 12.404763 , 45.750965 , 25.56757  , 22.000706 ,14.194921 , 27.102774 ], dtype=float32)
from sklearn.metrics import r2_score
r2_score(Ytest,preds)
0.9260984298390122
MSE(Ytest,preds)
6.87682821415069
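For orientation, the two interfaces put the same settings in different places: the native API takes hyperparameters through the params dict plus num_boost_round, while the sklearn wrapper takes constructor arguments (eta maps to learning_rate, num_round to n_estimators). A rough sketch of corresponding calls — note the two APIs have different defaults for unspecified parameters, so results will not match exactly:

# native API: settings in a dict, tree count as num_boost_round
bst = xgb.train({'eta':0.1, 'objective':'reg:linear'}, dtrain, num_boost_round=180)
# sklearn API: the same settings as constructor arguments
reg = XGBR(n_estimators=180, learning_rate=0.1).fit(Xtrain, Ytrain)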
import xgboost as xgb
#for convenience, use the full dataset
dfull = xgb.DMatrix(X,y)
#set the parameters
param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
num_round = 100
n_fold=5 #analogous to sklearn's KFold
#use the xgb.cv class
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:610364
#what did xgb.cv produce?
cvresult1 #how the model's performance changes as trees are added
train-rmse-mean train-rmse-std test-rmse-mean test-rmse-std
0 17.105578 0.129116 17.163215 0.584296
1 12.337973 0.097557 12.519736 0.473458
2 8.994071 0.065756 9.404534 0.472310
3 6.629481 0.050323 7.250335 0.500342
4 4.954406 0.033209 5.920812 0.591874
5 3.781454 0.029604 5.045190 0.687971
6 2.947767 0.038786 4.472030 0.686492
7 2.357748 0.042040 4.179314 0.737935
8 1.951907 0.044972 3.979878 0.798198
9 1.660895 0.044894 3.870751 0.812331
10 1.464296 0.049422 3.816196 0.835251
11 1.323362 0.056240 3.788125 0.841643
12 1.214468 0.046524 3.766973 0.848989
13 1.137311 0.044522 3.741199 0.872370
14 1.064629 0.042245 3.729194 0.879429
15 1.010286 0.038892 3.717997 0.879572
16 0.941258 0.038360 3.706736 0.878032
17 0.883599 0.056640 3.693886 0.873913
18 0.829674 0.057284 3.693296 0.883429
19 0.772332 0.042899 3.687510 0.880928
20 0.731557 0.049150 3.687037 0.879180
21 0.690698 0.041190 3.677507 0.882060
22 0.657743 0.042137 3.675343 0.883635
23 0.619988 0.054097 3.671006 0.879224
24 0.585414 0.052585 3.670951 0.867470
25 0.548723 0.054440 3.673598 0.863241
26 0.527266 0.049630 3.673988 0.867116
27 0.504405 0.040376 3.671702 0.864566
28 0.468534 0.033020 3.671324 0.862536
29 0.448633 0.032191 3.675074 0.864713
... ... ... ... ...
70 0.071057 0.015411 3.668067 0.859435
71 0.067946 0.013960 3.667708 0.859370
72 0.065197 0.012475 3.668174 0.859307
73 0.062789 0.012538 3.668738 0.859471
74 0.060294 0.012669 3.668950 0.860112
75 0.058278 0.012055 3.669084 0.859966
76 0.055402 0.011065 3.669627 0.859505
77 0.053819 0.011072 3.669904 0.859294
78 0.051280 0.011215 3.670185 0.859204
79 0.048748 0.009988 3.670092 0.859250
80 0.046972 0.009233 3.669869 0.858892
81 0.044753 0.008664 3.669702 0.858676
82 0.043148 0.008636 3.669704 0.858921
83 0.041823 0.008355 3.669596 0.858843
84 0.040257 0.008378 3.669730 0.858459
85 0.038518 0.007731 3.669835 0.858698
86 0.036694 0.006928 3.669705 0.858958
87 0.034932 0.006174 3.669722 0.858715
88 0.033947 0.006206 3.669964 0.858547
89 0.032706 0.006176 3.669988 0.858516
90 0.031317 0.006171 3.670116 0.858512
91 0.029697 0.005473 3.669930 0.858759
92 0.028561 0.005599 3.669906 0.858549
93 0.027585 0.005694 3.669822 0.858554
94 0.026436 0.005414 3.669985 0.858390
95 0.025204 0.005145 3.669921 0.858313
96 0.024422 0.005242 3.669983 0.858255
97 0.023661 0.005117 3.669947 0.858331
98 0.022562 0.004704 3.669868 0.858578
99 0.021496 0.004738 3.669824 0.858305

100 rows × 4 columns

plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()
#what can we read from this plot?
#how do we assess the model's generalization ability from it?
#from this plot's perspective, what should the tuning objective be?

[Figure: train/test rmse vs boosting rounds, gamma=0 (output_66_0.png)]

#what is xgboost's default evaluation metric for regression? (rmse, as the table above shows)
param1 = {'silent':True,'obj':'reg:linear',"gamma":0,"eval_metric":"mae"}
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)

plt.figure(figsize=(20,5))
plt.grid()
#the x-axis length must match the number of rounds actually run (num_round is still 100 here)
plt.plot(range(1,num_round+1),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,num_round+1),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()

[Figure: train/test mae vs boosting rounds, gamma=0 (output_68_0.png)]

param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
param2 = {'silent':True,'obj':'reg:linear',"gamma":20}
num_round = 180
n_fold=5

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:083104
time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:359378
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,181),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,181),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,181),cvresult2.iloc[:,0],c="green",label="train,gamma=20")
plt.plot(range(1,181),cvresult2.iloc[:,2],c="blue",label="test,gamma=20")
plt.legend()
plt.show()
#can you see how gamma controls overfitting? it restrains learning on the training set — i.e. it lowers training-set performance

[Figure: train/test rmse, gamma=0 vs gamma=20 (output_71_0.png)]
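The mechanism behind this: gamma is the minimum loss reduction required to keep a split. With G_L, H_L (and G_R, H_R) the sums of the loss's first and second derivatives over the left and right children, a candidate split survives only when

Gain = ½ [ G_L²/(H_L+λ) + G_R²/(H_R+λ) − (G_L+G_R)²/(H_L+H_R+λ) ] − γ > 0

so raising γ prunes more branches and deliberately lowers training-set performance, which is exactly what the gamma=20 curves show.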

import xgboost as xgb
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.datasets import load_breast_cancer
data2 = load_breast_cancer()
x2 = data2.data
y2 = data2.target

dfull2 = xgb.DMatrix(x2,y2)

param1 = {'silent':True,'obj':'binary:logistic',"gamma":0,"nfold":5,"eval_metric":"error"}
param2 = {'silent':True,'obj':'binary:logistic',"gamma":1,"nfold":5}
num_round = 100
time0 = time()
cvresult1 = xgb.cv(param1, dfull2, num_round,metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:271581
time0 = time()
cvresult2 = xgb.cv(param2, dfull2, num_round,metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:443810
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,101),cvresult2.iloc[:,0],c="green",label="train,gamma=1")
plt.plot(range(1,101),cvresult2.iloc[:,2],c="blue",label="test,gamma=1")
plt.legend()
plt.show()

[Figure: train/test error, gamma=0 vs gamma=1 (output_76_0.png)]

dfull = xgb.DMatrix(X,y)

param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")
ax.legend(fontsize="xx-large")
plt.show()
00:00:513584

[Figure: train/test rmse with default parameters (output_78_1.png)]

param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")

param2 = {'silent':True
          ,'obj':'reg:linear'
          ,"max_depth":2
          ,"eta":0.05
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":0.4
          ,"colsample_bynode":1
          ,"nfold":5}

param3 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"eta":0.05
          ,"gamma":20
          ,"lambda":3.5
          ,"alpha":0.2
          ,"max_depth":4
          ,"colsample_bytree":0.4
          ,"colsample_bylevel":0.6
          ,"colsample_bynode":1
          ,"nfold":5}

time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

time0 = time()
cvresult3 = xgb.cv(param3, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

ax.plot(range(1,201),cvresult2.iloc[:,0],c="green",label="train,last")
ax.plot(range(1,201),cvresult2.iloc[:,2],c="blue",label="test,last")
ax.plot(range(1,201),cvresult3.iloc[:,0],c="gray",label="train,this")
ax.plot(range(1,201),cvresult3.iloc[:,2],c="pink",label="test,this")
ax.legend(fontsize="xx-large")
plt.show()
00:00:532621
00:00:223373
00:00:259346

[Figure: original vs previous vs current tuned parameter curves (output_79_1.png)]

import pickle
dtrain = xgb.DMatrix(Xtrain,Ytrain)

#set the parameters and train the model
param = {'silent':True
         ,'obj':'reg:linear'
         ,"subsample":1
         ,"eta":0.05
         ,"gamma":20
         ,"lambda":3.5
         ,"alpha":0.2
         ,"max_depth":4
         ,"colsample_bytree":0.4
         ,"colsample_bylevel":0.6
         ,"colsample_bynode":1}
num_round = 180

bst = xgb.train(param, dtrain, num_round)
#save the model
pickle.dump(bst, open("xgboostonboston.dat","wb"))
#note: with open we usually use "w" or "r" as the mode, but those only work for text files (txt)
#to save something that is not text — the model object itself — use "wb" and "rb" instead
#"wb" writes in binary and "rb" reads in binary; the file saved this way holds a model that can be loaded and called directly
#where did the model get saved?
import sys
sys.path
['C:\\Pythonwork\\micro-class\\11 xgboost','C:\\Python\\python37.zip','C:\\Python\\DLLs','C:\\Python\\lib','C:\\Python','','C:\\Python\\lib\\site-packages','C:\\Python\\lib\\site-packages\\win32','C:\\Python\\lib\\site-packages\\win32\\lib','C:\\Python\\lib\\site-packages\\Pythonwin','C:\\Python\\lib\\site-packages\\IPython\\extensions','C:\\Users\\Shuyu\\.ipython']
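Strictly speaking, pickle.dump with a relative filename writes into the current working directory; sys.path just happens to list the notebook directory first. The working directory can be queried directly:

import os
os.getcwd()  # the folder "xgboostonboston.dat" was written to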
#after restarting jupyter lab:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb

data = load_boston()
X = data.data
y = data.target

Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
#note: if the saved model was built with the native xgboost library, the data we feed it must also be xgboost's own data type
dtest = xgb.DMatrix(Xtest,Ytest)
#load the model
loaded_model = pickle.load(open("xgboostonboston.dat", "rb"))
print("Loaded model from: xgboostonboston.dat")
Loaded model from: xgboostonboston.dat
#predict by calling the predict interface directly
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,16.924698, 22.633028], dtype=float32)
from sklearn.metrics import mean_squared_error as MSE, r2_score
MSE(Ytest,ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
bst = xgb.train(param, dtrain, num_round)
import joblib
#again, you can check where the model file was written
joblib.dump(bst,"xgboost-boston.dat")
['xgboost-boston.dat']
loaded_model = joblib.load("xgboost-boston.dat")
dtest = xgb.DMatrix(Xtest,Ytest)
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,16.924698, 22.633028], dtype=float32)
MSE(Ytest, ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
#using the model from sklearn's API
from xgboost import XGBRegressor as XGBR

bst = XGBR(n_estimators=200
           ,eta=0.05
           ,gamma=20
           ,reg_lambda=3.5
           ,reg_alpha=0.2
           ,max_depth=4
           ,colsample_bytree=0.4
           ,colsample_bylevel=0.6).fit(Xtrain,Ytrain) #trained
joblib.dump(bst,"xgboost-boston-sklearn.dat")
['xgboost-boston-sklearn.dat']
loaded_model = joblib.load("xgboost-boston-sklearn.dat")
#here Xtest can be passed in directly — it is a plain numpy array
ypreds = loaded_model.predict(Xtest)
Xtest
array([[4.15292e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,3.29460e+02, 2.73800e+01],[2.73100e-02, 0.00000e+00, 7.07000e+00, ..., 1.78000e+01,3.96900e+02, 9.14000e+00],[3.15000e-02, 9.50000e+01, 1.47000e+00, ..., 1.70000e+01,3.96900e+02, 4.56000e+00],...,[5.08300e-02, 0.00000e+00, 5.19000e+00, ..., 2.02000e+01,3.89710e+02, 5.68000e+00],[3.77498e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,2.20100e+01, 1.71500e+01],[1.96091e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,3.96900e+02, 1.34400e+01]])
dtest
<xgboost.core.DMatrix at 0x29e30670668>
ypreds
array([ 9.350334 , 21.501623 , 30.219057 , 13.021226 ,  9.883689 ,20.977922 , 16.023008 , 15.8910475, 15.512305 , 15.706607 ,22.096102 , 35.381573 , 20.3307   , 27.129421 , 19.997156 ,10.935587 , 20.25071  , 26.188572 , 26.711943 , 22.600443 ,18.23832  , 15.876045 , 26.263977 , 22.706024 , 20.18491  ,15.891692 , 21.4781   , 29.047956 , 23.371012 , 17.167185 ,35.699898 , 20.490337 , 20.195292 , 23.81444  , 23.106022 ,25.709312 , 15.0182905, 22.621248 , 18.576109 , 34.25664  ,17.46115  , 19.159126 , 34.79234  , 17.766731 , 17.141891 ,27.755646 , 39.786766 , 22.49913  , 10.246634 , 36.76105  ,26.294876 , 20.75917  , 19.893272 , 46.62629  , 26.549704 ,24.040398 , 17.769514 , 20.76889  , 16.139618 , 17.494894 ,16.005596 , 24.28487  , 19.15237  , 31.407684 , 27.862312 ,18.877817 , 20.50497  , 16.094156 , 22.622025 , 17.762297 ,28.518019 , 41.146317 , 32.52681  , 23.117966 , 19.125128 ,24.141544 , 39.041847 , 25.901724 , 20.974117 , 19.626917 ,18.567612 , 46.46465  , 23.03303  ,  9.912106 , 26.407642 ,23.466772 , 16.985506 , 20.73746  , 15.679997 , 11.697191 ,21.320868 , 20.333689 , 41.616425 , 31.659132 , 25.605923 ,12.362759 , 14.593165 , 20.577328 ,  9.253377 , 11.1253805,32.878246 , 15.840851 , 24.695955 , 24.882996 , 34.643425 ,41.556873 , 19.726238 ,  8.808649 , 23.04128  , 14.709186 ,46.10303  , 21.435535 , 21.97892  , 24.299171 , 19.591938 ,27.527737 , 23.80468  , 18.782711 , 44.266346 , 17.328068 ,23.030151 , 23.801643 , 16.483137 , 18.219353 , 15.713125 ,23.655058 , 32.294373 , 10.60579  , 22.099716 , 19.26955  ,14.293162 , 19.386055 ,  8.824598 , 26.909697 , 29.539446 ,20.38691  , 20.832077 , 22.507433 , 11.142808 , 17.685743 ,40.230915 , 17.526121 , 23.09964  , 19.899158 , 31.775164 ,19.718151 , 12.164877 , 40.867558 , 24.465397 , 22.134802 ,15.041253 , 28.63522  ], dtype=float32)
MSE(Ytest, ypreds)
10.198269690947479
r2_score(Ytest,ypreds)
0.8904046866351292
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs #a synthetic dataset
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
class_1 = 500 #class 1 has 500 samples
class_2 = 50  #class 2 has only 50
centers = [[0.0, 0.0], [2.0, 2.0]] #the centers of the two classes
clusters_std = [1.5, 0.5] #class standard deviations; the larger class is usually the more spread out one
X, y = make_blobs(n_samples=[class_1, class_2],centers=centers,cluster_std=clusters_std,random_state=0, shuffle=False)
X.shape
(550, 2)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
(y == 1).sum() / y.shape[0] #the positive class is about 9% of the data
0.09090909090909091
Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420)
#build the model with sklearn's API
clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)
ypred
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
clf.score(Xtest,Ytest) #the default evaluation metric: accuracy
0.9272727272727272
cm(Ytest,ypred,labels=[1,0]) #minority class first: with labels=[1,0] the layout is [[TP, FN], [FP, TN]]
array([[  9,   4],
       [  8, 144]], dtype=int64)
recall(Ytest,ypred)
0.6923076923076923
auc(Ytest,clf.predict_proba(Xtest)[:,1])
0.9671052631578947
#the negative-to-positive sample ratio
clf_ = XGBC(scale_pos_weight=10).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
cm(Ytest,ypred_,labels=[1,0])
recall(Ytest,ypred_)
auc(Ytest,clf_.predict_proba(Xtest)[:,1]) #only this last expression is displayed below
0.9696356275303644
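A common heuristic for scale_pos_weight is the ratio of negative to positive samples in the training data; a quick sketch computed from Ytrain:

# sum(negative) / sum(positive): roughly 10, given the 500:50 design of this dataset
(Ytrain == 0).sum() / (Ytrain == 1).sum()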
#how do accuracy, recall and AUC change as the positive-class weight grows?
for i in [1,5,10,20,30]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain,Ytrain)
    ypred_ = clf_.predict(Xtest)
    print(i)
    print("\tAccuracy:{}".format(clf_.score(Xtest,Ytest)))
    print("\tRecall:{}".format(recall(Ytest,ypred_)))
    print("\tAUC:{}".format(auc(Ytest,clf_.predict_proba(Xtest)[:,1])))
1
    Accuracy:0.9272727272727272
    Recall:0.6923076923076923
    AUC:0.9671052631578947
5
    Accuracy:0.9454545454545454
    Recall:0.9230769230769231
    AUC:0.9665991902834008
10
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9696356275303644
20
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9706477732793523
30
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9701417004048584
#the negative-to-positive sample ratio
clf_ = XGBC(scale_pos_weight=20).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
0.9515151515151515
cm(Ytest,ypred_,labels=[1,0])
array([[ 13,   0],
       [  8, 144]], dtype=int64)
recall(Ytest,ypred_)
1.0
auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9706477732793523
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
#look at the native xgboost library's predict interface
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
#what did preds return?
preds
array([0.00110357, 0.00761518, 0.00110357, 0.00110357, 0.93531454,0.00466839, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00410493, 0.00454478, 0.00571528, 0.00751026,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00110357, 0.00793251, 0.00466839,0.00110357, 0.00339395, 0.00657186, 0.00110357, 0.00457053,0.00571528, 0.0026763 , 0.00110357, 0.00110357, 0.00110357,0.00884932, 0.00712637, 0.00110357, 0.00712637, 0.00466839,0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.63748044, 0.00110357, 0.00793251,0.00110357, 0.00451971, 0.00644181, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00751026, 0.00712637, 0.00110357,0.00866458, 0.00110357, 0.00110357, 0.00110357, 0.91610426,0.00110357, 0.00110357, 0.89246494, 0.0026763 , 0.00501714,0.00761518, 0.00884932, 0.00339395, 0.00110357, 0.93531454,0.00110357, 0.00110357, 0.00110357, 0.82530665, 0.00751026,0.00110357, 0.35174078, 0.00110357, 0.00110357, 0.70393246,0.00110357, 0.76804197, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.96656513, 0.00110357, 0.00571528, 0.25400913,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00457053,0.00110357, 0.00110357, 0.00110357, 0.89246494, 0.00110357,0.9518535 , 0.0026763 , 0.00712637, 0.00110357, 0.00501714,0.00110357, 0.00110357, 0.00571528, 0.00110357, 0.00110357,0.00712637, 0.00110357, 0.00110357, 0.00712637, 0.00110357,0.25136763, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.8904051 , 0.3876418 , 0.00110357, 0.00457053,0.00657186, 0.9366597 , 0.00866458, 0.00110357, 0.00501714,0.00501714, 0.00110357, 0.00110357, 0.00368543, 0.00501714,0.9830577 , 0.00110357, 0.00644181, 0.00110357, 0.00571528,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00466839,0.00110357, 0.00110357, 0.92388713, 0.90231985, 0.80084217],dtype=float32)
#set the classification threshold ourselves
ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.], dtype=float32)
ypred[ypred != 1] = 0
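The two masking steps above can be collapsed into one vectorized comparison; an equivalent sketch:

ypred = (preds > 0.5).astype(np.float32)  # same result as copy-and-mask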
#spell out the parameters
scale_pos_weight = [1,5,10]
names = ["negative vs positive: 1","negative vs positive: 5","negative vs positive: 10"]
[*zip(names,scale_pos_weight)]
[('negative vs positive: 1', 1),('negative vs positive: 5', 5),('negative vs positive: 10', 10)]
#import the evaluation metrics
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

for name,i in zip(names,scale_pos_weight):
    param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":i}
    num_round = 100
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
    print("\tRecall:{}".format(recall(Ytest,ypred)))
    print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1
    Accuracy:0.9272727272727272
    Recall:0.6923076923076923
    AUC:0.9741902834008097
negative vs positive: 5
    Accuracy:0.9393939393939394
    Recall:0.8461538461538461
    AUC:0.9635627530364372
negative vs positive: 10
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9665991902834008
#we can also try different thresholds
for name,i in zip(names,scale_pos_weight):
    for thres in [0.3,0.5,0.7,0.9]:
        param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":i}
        clf = xgb.train(param, dtrain, num_round)
        preds = clf.predict(dtest)
        ypred = preds.copy()
        ypred[preds > thres] = 1
        ypred[ypred != 1] = 0
        print("{},thresholds:{}".format(name,thres))
        print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
        print("\tRecall:{}".format(recall(Ytest,ypred)))
        print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1,thresholds:0.3
    Accuracy:0.9393939393939394
    Recall:0.8461538461538461
    AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.5
    Accuracy:0.9272727272727272
    Recall:0.6923076923076923
    AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.7
    Accuracy:0.9212121212121213
    Recall:0.6153846153846154
    AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.9
    Accuracy:0.9515151515151515
    Recall:0.5384615384615384
    AUC:0.9741902834008097
negative vs positive: 5,thresholds:0.3
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.5
    Accuracy:0.9393939393939394
    Recall:0.8461538461538461
    AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.7
    Accuracy:0.9272727272727272
    Recall:0.6923076923076923
    AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.9
    Accuracy:0.9212121212121213
    Recall:0.6153846153846154
    AUC:0.9635627530364372
negative vs positive: 10,thresholds:0.3
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.5
    Accuracy:0.9515151515151515
    Recall:1.0
    AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.7
    Accuracy:0.9393939393939394
    Recall:0.8461538461538461
    AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.9
    Accuracy:0.9212121212121213
    Recall:0.6153846153846154
    AUC:0.9665991902834008
