
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
# Load the Boston housing dataset object via scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the pinned scikit-learn version before running this script.
data = load_boston()
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,4.9800e+00],[2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,9.1400e+00],[2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,4.0300e+00],...,[6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,5.6400e+00],[1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,6.4800e+00],[4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,7.8800e+00]]),'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. 
, 20.1,22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9]),'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD','TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. 
Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). 
Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",'filename': 'f:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
# Feature matrix and regression target taken from the loaded dataset object.
X = data.data
y = data.target
(506, 13)
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. 
, 22.2, 19.3, 22.6,19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
# Hold out 30% of the samples as the test set; the seed fixes the split.
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# Training: build the estimator first, then fit it in place.
reg = XGBR(n_estimators=100)
reg.fit(Xtrain, Ytrain)

# The conventional predict interface.
reg.predict(Xtest)
# Can you guess which evaluation metric is returned here?
# Shift+Tab in the notebook shows it: R^2.
reg.score(Xtest, Ytest)
# One advantage of tree models: the importance scores can be inspected, and an
# embedded method (SelectFromModel) can be used for feature selection.
reg.feature_importances_

# Untrained model that is handed to cross-validation.
reg = XGBR(n_estimators=100)
# Which evaluation metric is returned here, remember? The same one as reg.score:
# R^2 (regression) or accuracy (classification).
# Discussion of rigorous vs. non-rigorous cross-validation: training set or full data?
array([0.83340801, 0.77096033, 0.83473392, 0.80424149, 0.76558778])
#严谨 vs 不严谨
import sklearn
# A random forest, a linear regression and a small XGBoost regressor.
rfr = RFR(n_estimators=100)
lr = LinearR()
# In the xgboost library silent=True does not print the training progress and
# only returns the result; the default (False) prints the training progress.
reg = XGBR(
    n_estimators=10,
    silent=True,
)
def plot_learning_curve(estimator, title, X, y,
                        ax=None,      # subplot (Axes) to draw on
                        ylim=None,    # value range of the y-axis
                        cv=None,      # cross-validation strategy
                        n_jobs=None   # number of jobs to use
                        ):
    """Plot the mean training and test score of ``estimator`` against training-set size.

    The scores come from ``sklearn.model_selection.learning_curve`` called with
    ``shuffle=True`` and ``random_state=420``.

    Args:
        estimator: Estimator forwarded to ``learning_curve``.
        title: Title set on the axes.
        X: Feature matrix forwarded to ``learning_curve``.
        y: Target forwarded to ``learning_curve``.
        ax: Axes to draw on. When ``None`` the current axes (``plt.gca()``) are used.
        ylim: Optional pair unpacked into ``ax.set_ylim``.
        cv: Forwarded to ``learning_curve`` as ``cv``.
        n_jobs: Forwarded to ``learning_curve`` as ``n_jobs``.

    Returns:
        The axes that were drawn on.
    """
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y,
        shuffle=True,
        cv=cv,
        random_state=420,
        n_jobs=n_jobs,
    )

    # Only fall back to the current axes when the caller supplied none.  The
    # previous version replaced a supplied ``ax`` with ``plt.figure()``; a
    # Figure has no set_title/set_ylim/set_xlabel, so passing ``ax`` failed.
    if ax is None:
        ax = plt.gca()

    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid()  # draw a grid; not required
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-',
            color="r", label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-',
            color="g", label="Test score")
    ax.legend(loc="best")
    return ax
# Shared 5-fold splitter (shuffled, fixed seed) used by the cross-validation calls below.
cv = KFold(n_splits=5, shuffle = True, random_state=42) # cross-validation scheme


# ===== TIME WARNING: 25 seconds =====
# Sweep n_estimators and record the mean cross-validated score for each value.
# (The export had glued the ``axisx`` assignment onto the comment line above and
# fused the loop body into a single line.)
axisx = range(10, 1010, 50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    rs.append(CVS(reg, Xtrain, Ytrain, cv=cv).mean())
660 0.8046775284172915


# ====== TIME WARNING: 20s ======
# Sweep n_estimators and, for each value, keep the mean CV score, the variance
# of the CV scores and their combination (1 - mean)^2 + variance.
axisx = range(50, 1050, 50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=cv)
    # Record 1 - bias.
    rs.append(cvresult.mean())
    # Record the variance.
    var.append(cvresult.var())
    # Compute the controllable part of the generalization error.
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411


# Finer sweep of n_estimators over 100..290 in steps of 10, collecting the mean
# CV score, the variance of the CV scores and (1 - mean)^2 + variance.
axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())

# Convert to arrays for element-wise arithmetic; the variance is scaled by 0.01.
rs = np.array(rs)
var = np.array(var) * 0.01
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314




# Start of the timing window (the export repeated this assignment three times).
time0 = time()

# Sweep subsample over 20 evenly spaced values and record the mean CV score.
# NOTE(review): the grid starts at 0, i.e. the first model is built with
# subsample=0 — presumably unintended, since the later sweeps start at 0.05;
# confirm against the XGBoost parameter documentation.
axisx = np.linspace(0, 1, 20)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=180, subsample=i, random_state=420)
    rs.append(CVS(reg, Xtrain, Ytrain, cv=cv).mean())
0.7368421052631579 0.837609040251761


# Sweep subsample over [0.05, 1] and collect the mean CV score, the variance of
# the CV scores and (1 - mean)^2 + variance for each value.
axisx = np.linspace(0.05, 1, 20)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180, subsample=i, random_state=420)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())

# Convert to arrays for element-wise arithmetic.
rs = np.array(rs)
var = np.array(var)
0.65 0.8302530801197368 0.008708816667924316
0.7999999999999999 0.8277414964661117 0.007159903723250457
0.7999999999999999 0.8277414964661117 0.007159903723250457 0.036832895762985055


# Finer sweep of subsample over [0.75, 1] (25 values), collecting the mean CV
# score, the variance of the CV scores and (1 - mean)^2 + variance.
axisx = np.linspace(0.75, 1, 25)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180, subsample=i, random_state=420)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())

# Convert to arrays for element-wise arithmetic.
rs = np.array(rs)
var = np.array(var)
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093 0.033300928468131166


# Refit on the training set with 180 trees.
# The collapsed export left everything after the "#" as a comment, so the call
# was never closed.  Reconstructed with the subsample argument kept disabled
# (it was the part that directly followed the "#"):
#   ,subsample=0.7708333333333334
# NOTE(review): confirm that random_state/fit were meant to stay active.
reg = XGBR(n_estimators=180, random_state=420).fit(Xtrain, Ytrain)
def regassess(reg, Xtrain, Ytrain, cv, scoring=["r2"], show=True):
    """Return the mean cross-validated score of ``reg`` for each metric in ``scoring``.

    Args:
        reg: Estimator passed to ``CVS``.
        Xtrain: Feature matrix passed to ``CVS``.
        Ytrain: Target passed to ``CVS``.
        cv: Forwarded to ``CVS`` as ``cv``.
        scoring: Metric names; each is forwarded to ``CVS`` as ``scoring``.
            The default list is only read, never mutated.
        show: When true, print ``"<metric>:<mean score>"`` (two decimals) per metric.

    Returns:
        A list with one mean score per entry of ``scoring``, in the same order.
    """
    score = []
    for metric in scoring:
        # Run the cross-validation once per metric and reuse the result for both
        # the printout and the return value (previously it ran twice when
        # ``show`` was true).
        mean_score = CVS(reg, Xtrain, Ytrain, cv=cv, scoring=metric).mean()
        if show:
            print("{}:{:.2f}".format(metric  # name of the evaluation metric
                                     , mean_score))
        score.append(mean_score)
    return score
# Evaluate an unfitted 180-tree model on R^2 and negative MSE (printing each).
reg = XGBR(
    n_estimators=180,
    random_state=420,
)
regassess(
    reg, Xtrain, Ytrain, cv,
    scoring=["r2", "neg_mean_squared_error"],
)
neg_mean_squared_error:-13.48[0.8038787848970184, -13.482301822063182]
# Same evaluation with show=False: regassess prints nothing and only returns the list of mean scores.
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
[0.8038787848970184, -13.482301822063182]
from time import time
import datetime

# For a few learning rates: print the rate, print the CV scores via regassess,
# then print the elapsed time.  (The export had fused this whole loop onto the
# ``import datetime`` line.)
for i in [0, 0.2, 0.5, 1]:
    time0 = time()
    reg = XGBR(n_estimators=180, random_state=420, learning_rate=i)
    print("learning_rate = {}".format(i))
    regassess(reg, Xtrain, Ytrain, cv, scoring=["r2", "neg_mean_squared_error"])
    # The elapsed seconds are formatted by treating them as a timestamp.
    # NOTE(review): fromtimestamp uses local time, so the minutes field may be
    # shifted in time zones with a non-whole-hour offset — confirm if it matters.
    print(datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f"))
    print("\t")
learning_rate = 0
00:01:561781learning_rate = 0.2
00:01:848888learning_rate = 0.5
00:01:541875learning_rate = 1
# Sweep learning_rate from 0.05 up to (but excluding) 1 in steps of 0.05.
# rs collects the mean cross-validated R^2 (first metric returned by regassess),
# te collects the score of the fitted model on the held-out test set.
axisx = np.arange(0.05, 1, 0.05)
rs = []
te = []
for i in axisx:
    reg = XGBR(n_estimators=180, random_state=420, learning_rate=i)
    score = regassess(reg, Xtrain, Ytrain, cv,
                      scoring=["r2", "neg_mean_squared_error"], show=False)
    test = reg.fit(Xtrain, Ytrain).score(Xtest, Ytest)
    rs.append(score[0])
    te.append(test)
0.55 0.8125604372670463


# Fit one model per booster type and print its name followed by its test score.
# (The export had fused the loop body onto the ``for`` line.)
for booster in ["gbtree", "gblinear", "dart"]:
    reg = XGBR(n_estimators=180,
               learning_rate=0.1,
               random_state=420,
               booster=booster).fit(Xtrain, Ytrain)
    print(booster)
    print(reg.score(Xtest, Ytest))

# Refit with the default booster and learning rate, then score on the test set.
reg = XGBR(n_estimators=180, random_state=420).fit(Xtrain, Ytrain)
reg.score(Xtest, Ytest)
import xgboost as xgb
# Wrap the train and test splits as DMatrix objects; both the feature matrix
# and the labels are passed in.
dtrain = xgb.DMatrix(Xtrain, label=Ytrain)
dtest = xgb.DMatrix(Xtest, label=Ytest)
<xgboost.core.DMatrix at 0x2770de3bdd8>
import pandas as pd
0 1 2 3 4 5 6 7 8 9 10 11 12
0 0.03041 0.0 5.19 0.0 0.515 5.895 59.6 5.6150 5.0 224.0 20.2 394.81 10.56
1 0.04113 25.0 4.86 0.0 0.426 6.727 33.5 5.4007 4.0 281.0 19.0 396.90 5.29
2 10.23300 0.0 18.10 0.0 0.614 6.185 96.7 2.1705 24.0 666.0 20.2 379.70 18.03
3 0.17142 0.0 6.91 0.0 0.448 5.682 33.8 5.1004 3.0 233.0 17.9 396.90 10.21
4 0.05059 0.0 4.49 0.0 0.449 6.389 48.0 4.7794 3.0 247.0 18.5 396.90 9.62
5 0.13587 0.0 10.59 1.0 0.489 6.064 59.1 4.2392 4.0 277.0 18.6 381.32 14.66
6 0.04981 21.0 5.64 0.0 0.439 5.998 21.4 6.8147 4.0 243.0 16.8 396.90 8.43
7 0.02543 55.0 3.78 0.0 0.484 6.696 56.4 5.7321 5.0 370.0 17.6 396.90 7.18
8 0.10793 0.0 8.56 0.0 0.520 6.195 54.4 2.7778 5.0 384.0 20.9 393.49 13.00
9 0.02498 0.0 1.89 0.0 0.518 6.540 59.7 6.2669 1.0 422.0 15.9 389.96 8.65
10 0.09299 0.0 25.65 0.0 0.581 5.961 92.9 2.0869 2.0 188.0 19.1 378.09 17.93
11 0.15876 0.0 10.81 0.0 0.413 5.961 17.5 5.2873 4.0 305.0 19.2 376.94 9.88
12 6.71772 0.0 18.10 0.0 0.713 6.749 92.6 2.3236 24.0 666.0 20.2 0.32 17.44
13 0.03768 80.0 1.52 0.0 0.404 7.274 38.3 7.3090 2.0 329.0 12.6 392.20 6.62
14 5.20177 0.0 18.10 1.0 0.770 6.127 83.4 2.7227 24.0 666.0 20.2 395.43 11.48
15 11.08740 0.0 18.10 0.0 0.718 6.411 100.0 1.8589 24.0 666.0 20.2 318.75 15.02
16 0.11432 0.0 8.56 0.0 0.520 6.781 71.3 2.8561 5.0 384.0 20.9 395.58 7.67
17 0.05602 0.0 2.46 0.0 0.488 7.831 53.6 3.1992 3.0 193.0 17.8 392.63 4.45
18 0.24103 0.0 7.38 0.0 0.493 6.083 43.7 5.4159 5.0 287.0 19.6 396.90 12.79
19 0.09378 12.5 7.87 0.0 0.524 5.889 39.0 5.4509 5.0 311.0 15.2 390.50 15.71
20 8.71675 0.0 18.10 0.0 0.693 6.471 98.8 1.7257 24.0 666.0 20.2 391.98 17.12
21 7.36711 0.0 18.10 0.0 0.679 6.193 78.1 1.9356 24.0 666.0 20.2 96.73 21.52
22 1.38799 0.0 8.14 0.0 0.538 5.950 82.0 3.9900 4.0 307.0 21.0 232.60 27.71
23 14.33370 0.0 18.10 0.0 0.614 6.229 88.0 1.9512 24.0 666.0 20.2 383.32 13.11
24 28.65580 0.0 18.10 0.0 0.597 5.155 100.0 1.5894 24.0 666.0 20.2 210.97 20.08
25 0.80271 0.0 8.14 0.0 0.538 5.456 36.6 3.7965 4.0 307.0 21.0 288.99 11.69
26 1.00245 0.0 8.14 0.0 0.538 6.674 87.3 4.2390 4.0 307.0 21.0 380.23 11.98
27 9.91655 0.0 18.10 0.0 0.693 5.852 77.8 1.5004 24.0 666.0 20.2 338.16 29.97
28 0.13158 0.0 10.01 0.0 0.547 6.176 72.5 2.7301 6.0 432.0 17.8 393.30 12.04
29 0.14231 0.0 10.01 0.0 0.547 6.254 84.2 2.2565 6.0 432.0 17.8 388.74 10.45
... ... ... ... ... ... ... ... ... ... ... ... ... ...
324 0.13117 0.0 8.56 0.0 0.520 6.127 85.2 2.1224 5.0 384.0 20.9 387.69 14.09
325 1.35472 0.0 8.14 0.0 0.538 6.072 100.0 4.1750 4.0 307.0 21.0 376.73 13.04
326 0.10153 0.0 12.83 0.0 0.437 6.279 74.5 4.0522 5.0 398.0 18.7 373.66 11.97
327 0.22927 0.0 6.91 0.0 0.448 6.030 85.5 5.6894 3.0 233.0 17.9 392.74 18.80
328 0.04666 80.0 1.52 0.0 0.404 7.107 36.6 7.3090 2.0 329.0 12.6 354.31 8.61
329 0.08014 0.0 5.96 0.0 0.499 5.850 41.5 3.9342 5.0 279.0 19.2 396.90 8.77
330 0.40771 0.0 6.20 1.0 0.507 6.164 91.3 3.0480 8.0 307.0 17.4 395.24 21.46
331 0.13642 0.0 10.59 0.0 0.489 5.891 22.3 3.9454 4.0 277.0 18.6 396.90 10.87
332 9.32909 0.0 18.10 0.0 0.713 6.185 98.7 2.2616 24.0 666.0 20.2 396.90 18.13
333 0.09103 0.0 2.46 0.0 0.488 7.155 92.2 2.7006 3.0 193.0 17.8 394.12 4.82
334 0.01301 35.0 1.52 0.0 0.442 7.241 49.3 7.0379 1.0 284.0 15.5 394.74 5.49
335 0.59005 0.0 21.89 0.0 0.624 6.372 97.9 2.3274 4.0 437.0 21.2 385.76 11.12
336 1.12658 0.0 19.58 1.0 0.871 5.012 88.0 1.6102 5.0 403.0 14.7 343.28 12.12
337 0.07886 80.0 4.95 0.0 0.411 7.148 27.7 5.1167 4.0 245.0 19.2 396.90 3.56
338 0.21719 0.0 10.59 1.0 0.489 5.807 53.8 3.6526 4.0 277.0 18.6 390.94 16.03
339 0.53700 0.0 6.20 0.0 0.504 5.981 68.1 3.6715 8.0 307.0 17.4 378.35 11.65
340 3.32105 0.0 19.58 1.0 0.871 5.403 100.0 1.3216 5.0 403.0 14.7 396.90 26.82
341 1.49632 0.0 19.58 0.0 0.871 5.404 100.0 1.5916 5.0 403.0 14.7 341.60 13.28
342 0.38735 0.0 25.65 0.0 0.581 5.613 95.6 1.7572 2.0 188.0 19.1 359.29 27.26
343 0.06617 0.0 3.24 0.0 0.460 5.868 25.8 5.2146 4.0 430.0 16.9 382.44 9.97
344 0.78570 20.0 3.97 0.0 0.647 7.014 84.6 2.1329 5.0 264.0 13.0 384.07 14.79
345 1.41385 0.0 19.58 1.0 0.871 6.129 96.0 1.7494 5.0 403.0 14.7 321.02 15.12
346 0.06047 0.0 2.46 0.0 0.488 6.153 68.8 3.2797 3.0 193.0 17.8 387.11 13.15
347 8.49213 0.0 18.10 0.0 0.584 6.348 86.1 2.0527 24.0 666.0 20.2 83.45 17.64
348 0.17134 0.0 10.01 0.0 0.547 5.928 88.2 2.4631 6.0 432.0 17.8 344.91 15.76
349 0.03871 52.5 5.32 0.0 0.405 6.209 31.3 7.3172 6.0 293.0 16.6 396.90 7.14
350 0.12650 25.0 5.13 0.0 0.453 6.762 43.4 7.9809 8.0 284.0 19.7 395.58 9.50
351 6.96215 0.0 18.10 0.0 0.700 5.713 97.0 1.9265 24.0 666.0 20.2 394.43 17.11
352 0.09164 0.0 10.81 0.0 0.413 6.065 7.8 5.2873 4.0 305.0 19.2 390.91 5.52
353 5.58107 0.0 18.10 0.0 0.713 6.436 87.9 2.3158 24.0 666.0 20.2 100.19 16.22

354 rows × 13 columns

# Parameters for the native xgboost training API.  (The export had turned
# everything after the first entry into a comment, leaving the dict unclosed.)
param = {
    'silent': True,  # defaults to False; usually has to be switched off manually
    'objective': 'reg:linear',
    "eta": 0.1,
}
num_round = 180  # n_estimators
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
array([ 6.4613175, 22.123888 , 30.755163 , 13.424351 ,  8.378565 ,23.608477 , 14.2151165, 16.026499 , 15.498961 , 14.10649  ,24.030867 , 34.36362  , 21.461111 , 28.839497 , 19.568035 ,10.188658 , 19.42369  , 23.539951 , 22.850523 , 23.198708 ,17.82486  , 16.07219  , 27.602034 , 20.773046 , 20.868807 ,15.865789 , 22.076588 , 29.292158 , 22.841051 , 15.770392 ,36.680496 , 21.057947 , 20.137005 , 23.777853 , 22.70615  ,23.863268 , 15.595315 , 24.565872 , 17.720552 , 33.95111  ,18.784286 , 20.483374 , 37.10668  , 18.068268 , 12.73839  ,31.186407 , 45.895035 , 12.696718 , 10.773068 , 36.064293 ,26.262571 , 19.908836 , 20.715096 , 48.814903 , 27.550056 ,25.225826 , 17.15366  , 21.215551 , 17.426773 , 18.478971 ,14.6453705, 22.841473 , 18.869593 , 29.990978 , 29.933191 ,18.756853 , 18.784918 , 16.33361  , 23.155968 , 19.144344 ,29.724382 , 42.121906 , 31.544363 , 23.017508 , 19.536028 ,23.851992 , 41.790577 , 28.676506 , 20.036425 , 21.723856 ,19.537868 , 46.349495 , 23.119637 ,  8.071444 , 26.358177 ,24.85706  , 17.057547 , 20.084204 , 18.54005  ,  7.157663 ,20.593962 , 15.451031 , 45.09552  , 34.435097 , 22.969654 ,10.10335  , 10.803318 , 18.42058  ,  7.800361 , 11.79309  ,30.755335 , 10.80648  , 26.122625 , 22.589502 , 31.219454 ,42.283318 , 19.274109 ,  7.3861685, 23.055706 , 14.315018 ,45.136368 , 21.243176 , 19.715647 , 24.533583 , 18.24247  ,28.382742 , 23.41182  , 19.962458 , 45.916683 , 17.521889 ,24.13039  , 26.147182 , 18.418781 , 17.606575 , 14.540631 ,20.595512 , 32.59128  , 10.155618 , 20.53032  , 21.477484 ,17.450048 , 20.154486 ,  8.010227 , 30.482618 , 29.677181 ,20.357098 , 18.222181 , 14.14504  , 10.100547 , 18.85027  ,41.85804  , 17.44544  , 22.907183 , 21.02398  , 29.799366 ,20.219465 , 12.404763 , 45.750965 , 25.56757  , 22.000706 ,14.194921 , 27.102774 ], dtype=float32)
from sklearn.metrics import r2_score
import xgboost as xgb#为了便捷,使用全数据
# Cross-validate on the full dataset with the native xgboost API.
dfull = xgb.DMatrix(X, y)
# The booster parameter is named 'objective' (as in the xgb.train parameters
# earlier in this script); the previous key 'obj' is not a booster parameter.
param1 = {'silent': True, 'objective': 'reg:linear', "gamma": 0}
num_round = 100
n_fold = 5  # sklearn - KFold
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)
cvresult1  # how the model's performance changes as trees keep being added
train-rmse-mean train-rmse-std test-rmse-mean test-rmse-std
0 17.105578 0.129116 17.163215 0.584296
1 12.337973 0.097557 12.519736 0.473458
2 8.994071 0.065756 9.404534 0.472310
3 6.629481 0.050323 7.250335 0.500342
4 4.954406 0.033209 5.920812 0.591874
5 3.781454 0.029604 5.045190 0.687971
6 2.947767 0.038786 4.472030 0.686492
7 2.357748 0.042040 4.179314 0.737935
8 1.951907 0.044972 3.979878 0.798198
9 1.660895 0.044894 3.870751 0.812331
10 1.464296 0.049422 3.816196 0.835251
11 1.323362 0.056240 3.788125 0.841643
12 1.214468 0.046524 3.766973 0.848989
13 1.137311 0.044522 3.741199 0.872370
14 1.064629 0.042245 3.729194 0.879429
15 1.010286 0.038892 3.717997 0.879572
16 0.941258 0.038360 3.706736 0.878032
17 0.883599 0.056640 3.693886 0.873913
18 0.829674 0.057284 3.693296 0.883429
19 0.772332 0.042899 3.687510 0.880928
20 0.731557 0.049150 3.687037 0.879180
21 0.690698 0.041190 3.677507 0.882060
22 0.657743 0.042137 3.675343 0.883635
23 0.619988 0.054097 3.671006 0.879224
24 0.585414 0.052585 3.670951 0.867470
25 0.548723 0.054440 3.673598 0.863241
26 0.527266 0.049630 3.673988 0.867116
27 0.504405 0.040376 3.671702 0.864566
28 0.468534 0.033020 3.671324 0.862536
29 0.448633 0.032191 3.675074 0.864713
... ... ... ... ...
70 0.071057 0.015411 3.668067 0.859435
71 0.067946 0.013960 3.667708 0.859370
72 0.065197 0.012475 3.668174 0.859307
73 0.062789 0.012538 3.668738 0.859471
74 0.060294 0.012669 3.668950 0.860112
75 0.058278 0.012055 3.669084 0.859966
76 0.055402 0.011065 3.669627 0.859505
77 0.053819 0.011072 3.669904 0.859294
78 0.051280 0.011215 3.670185 0.859204
79 0.048748 0.009988 3.670092 0.859250
80 0.046972 0.009233 3.669869 0.858892
81 0.044753 0.008664 3.669702 0.858676
82 0.043148 0.008636 3.669704 0.858921
83 0.041823 0.008355 3.669596 0.858843
84 0.040257 0.008378 3.669730 0.858459
85 0.038518 0.007731 3.669835 0.858698
86 0.036694 0.006928 3.669705 0.858958
87 0.034932 0.006174 3.669722 0.858715
88 0.033947 0.006206 3.669964 0.858547
89 0.032706 0.006176 3.669988 0.858516
90 0.031317 0.006171 3.670116 0.858512
91 0.029697 0.005473 3.669930 0.858759
92 0.028561 0.005599 3.669906 0.858549
93 0.027585 0.005694 3.669822 0.858554
94 0.026436 0.005414 3.669985 0.858390
95 0.025204 0.005145 3.669921 0.858313
96 0.024422 0.005242 3.669983 0.858255
97 0.023661 0.005117 3.669947 0.858331
98 0.022562 0.004704 3.669868 0.858578
99 0.021496 0.004738 3.669824 0.858305

100 rows × 4 columns



# Same cross-validation, evaluated with mean absolute error.
# 'objective' is the booster parameter name; the previous key 'obj' is not one.
param1 = {'silent': True, 'objective': 'reg:linear', "gamma": 0, "eval_metric": "mae"}
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)
# (The export had fused the figure creation onto the line above.)
plt.figure(figsize=(20, 5))


# Compare gamma=0 against gamma=20 with 5-fold cross-validation.
# 'objective' is the booster parameter name; the previous key 'obj' is not one.
param1 = {'silent': True, 'objective': 'reg:linear', "gamma": 0}
param2 = {'silent': True, 'objective': 'reg:linear', "gamma": 20}
num_round = 180
n_fold = 5

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)

time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round, n_fold)

plt.show()
# From this, can you see how gamma controls overfitting?  It restrains the
# training on the training set, i.e. lowers the performance on the training set.


import xgboost as xgb
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.datasets import load_breast_cancer
# Cross-validate a binary classifier on the breast-cancer data, gamma=0 vs gamma=1.
data2 = load_breast_cancer()
x2 = data2.data
y2 = data2.target
dfull2 = xgb.DMatrix(x2, y2)

# Fixes relative to the exported cell:
#  * the booster parameter is 'objective', not 'obj' — with 'obj' the
#    binary:logistic objective was never applied;
#  * the booster parameter is 'eval_metric', not 'eval_metrics';
#  * 'nfold' is an argument of xgb.cv, not a booster parameter, so it is now
#    passed to xgb.cv directly instead of sitting in the parameter dict.
param1 = {'silent': True, 'objective': 'binary:logistic', "gamma": 0, "eval_metric": "error"}
param2 = {'silent': True, 'objective': 'binary:logistic', "gamma": 1}
num_round = 100

time0 = time()
cvresult1 = xgb.cv(param1, dfull2, num_round, nfold=5, metrics=("error"))

time0 = time()
cvresult2 = xgb.cv(param2, dfull2, num_round, nfold=5, metrics=("error"))


# Cross-validate on the full data with explicitly spelled-out parameter values.
dfull = xgb.DMatrix(X, y)
# 'objective' is the booster parameter name (the previous key 'obj' is not one),
# and the fold count is an argument of xgb.cv rather than a booster parameter,
# so it is passed to xgb.cv below instead of living in this dict.
param1 = {'silent': True, 'objective': 'reg:linear',
          "subsample": 1, "max_depth": 6, "eta": 0.3,
          "gamma": 0, "lambda": 1, "alpha": 0,
          "colsample_bytree": 1, "colsample_bylevel": 1, "colsample_bynode": 1}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, nfold=5)
# Print the elapsed time, formatted by treating the seconds as a timestamp.
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig, ax = plt.subplots(1, figsize=(15, 8))


# Cross-validate three parameter sets on the full data and plot the test curve
# of the first one.  In every dict 'objective' replaces the key 'obj' (not a
# booster parameter), and the fold count is passed to xgb.cv directly because
# 'nfold' is an xgb.cv argument, not a booster parameter.
param1 = {'silent': True, 'objective': 'reg:linear',
          "subsample": 1, "max_depth": 6, "eta": 0.3,
          "gamma": 0, "lambda": 1, "alpha": 0,
          "colsample_bytree": 1, "colsample_bylevel": 1, "colsample_bynode": 1}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, nfold=5)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig, ax = plt.subplots(1, figsize=(15, 8))
# Column 2 of the CV result is plotted against the boosting round (1-based).
ax.plot(range(1, num_round + 1), cvresult1.iloc[:, 2], c="orange", label="test,original")

param2 = {'silent': True, 'objective': 'reg:linear',
          "max_depth": 2, "eta": 0.05,
          "gamma": 0, "lambda": 1, "alpha": 0,
          "colsample_bytree": 1, "colsample_bylevel": 0.4, "colsample_bynode": 1}

param3 = {'silent': True, 'objective': 'reg:linear',
          "subsample": 1, "eta": 0.05,
          "gamma": 20, "lambda": 3.5, "alpha": 0.2, "max_depth": 4,
          "colsample_bytree": 0.4, "colsample_bylevel": 0.6, "colsample_bynode": 1}

time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round, nfold=5)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

time0 = time()
cvresult3 = xgb.cv(param3, dfull, num_round, nfold=5)


import pickle
dtrain = xgb.DMatrix(Xtrain, Ytrain)

# Set the parameters and train the model.
# 'objective' is the booster parameter name; the previous key 'obj' is not one.
param = {'silent': True, 'objective': 'reg:linear',
         "subsample": 1, "eta": 0.05,
         "gamma": 20, "lambda": 3.5, "alpha": 0.2, "max_depth": 4,
         "colsample_bytree": 0.4, "colsample_bylevel": 0.6, "colsample_bynode": 1}
num_round = 180
bst = xgb.train(param, dtrain, num_round)

# Save the trained booster.  The context manager closes the file handle, which
# the previous inline open(...) never did.
# Note: with open we often use "w" or "r" as the mode, but "w" and "r" can only
# be used for text files (txt); the pickle is written in binary mode ("wb").
with open("xgboostonboston.dat", "wb") as model_file:
    pickle.dump(bst, model_file)
import sys
['C:\\Pythonwork\\micro-class\\11 xgboost','C:\\Python\\python37.zip','C:\\Python\\DLLs','C:\\Python\\lib','C:\\Python','','C:\\Python\\lib\\site-packages','C:\\Python\\lib\\site-packages\\win32','C:\\Python\\lib\\site-packages\\win32\\lib','C:\\Python\\lib\\site-packages\\Pythonwin','C:\\Python\\lib\\site-packages\\IPython\\extensions','C:\\Users\\Shuyu\\.ipython']
# Re-open jupyter lab (fresh session): re-import, rebuild the data and load the model.
# (The export had glued the first import onto the comment and fused several
# statements together.)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb

data = load_boston()
X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
dtest = xgb.DMatrix(Xtest, Ytest)

# Load the booster back.  The context manager closes the file handle, which the
# previous inline open(...) never did.
# NOTE: pickle.load executes arbitrary code from the file — only load model
# files that this script (or another trusted source) produced.
with open("xgboostonboston.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)
print("Loaded model from: xgboostonboston.dat")
Loaded model from: xgboostonboston.dat
# Predict on the test DMatrix with the model that was just loaded.
ypreds = loaded_model.predict(dtest)
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,16.924698, 22.633028], dtype=float32)
from sklearn.metrics import mean_squared_error as MSE, r2_score
# Retrain the booster, then load a model via joblib and predict on the test set.
bst = xgb.train(param, dtrain, num_round)
import joblib# Likewise, you can check where the model was saved
# NOTE(review): no joblib.dump(bst, "xgboost-boston.dat") is visible before this
# load — presumably it was lost in the export; confirm the file exists.
loaded_model = joblib.load("xgboost-boston.dat")
dtest = xgb.DMatrix(Xtest,Ytest)
ypreds = loaded_model.predict(dtest)
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,16.924698, 22.633028], dtype=float32)
MSE(Ytest, ypreds)
from xgboost import XGBRegressor as XGBR

# Fit the sklearn-API regressor on the training split.
# (The import and this assignment were fused on one line, so `bst` was never bound.)
bst = XGBR(
    n_estimators=200,
    eta=0.05,
    gamma=20,
    reg_lambda=3.5,
    reg_alpha=0.2,
    max_depth=4,
    colsample_bytree=0.4,
    colsample_bylevel=0.6,
).fit(Xtrain, Ytrain)  # training finished

# Reload the persisted sklearn-API model and predict directly on the raw
# feature matrix (no DMatrix wrapper is used with this API).
# NOTE: joblib.load unpickles the file, so only load model files you trust.
# NOTE(review): no joblib.dump of `bst` to this path is visible here; confirm
# the file is written elsewhere before this load runs.
loaded_model = joblib.load("xgboost-boston-sklearn.dat")
ypreds = loaded_model.predict(Xtest)
array([[4.15292e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,3.29460e+02, 2.73800e+01],[2.73100e-02, 0.00000e+00, 7.07000e+00, ..., 1.78000e+01,3.96900e+02, 9.14000e+00],[3.15000e-02, 9.50000e+01, 1.47000e+00, ..., 1.70000e+01,3.96900e+02, 4.56000e+00],...,[5.08300e-02, 0.00000e+00, 5.19000e+00, ..., 2.02000e+01,3.89710e+02, 5.68000e+00],[3.77498e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,2.20100e+01, 1.71500e+01],[1.96091e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,3.96900e+02, 1.34400e+01]])
<xgboost.core.DMatrix at 0x29e30670668>
array([ 9.350334 , 21.501623 , 30.219057 , 13.021226 ,  9.883689 ,20.977922 , 16.023008 , 15.8910475, 15.512305 , 15.706607 ,22.096102 , 35.381573 , 20.3307   , 27.129421 , 19.997156 ,10.935587 , 20.25071  , 26.188572 , 26.711943 , 22.600443 ,18.23832  , 15.876045 , 26.263977 , 22.706024 , 20.18491  ,15.891692 , 21.4781   , 29.047956 , 23.371012 , 17.167185 ,35.699898 , 20.490337 , 20.195292 , 23.81444  , 23.106022 ,25.709312 , 15.0182905, 22.621248 , 18.576109 , 34.25664  ,17.46115  , 19.159126 , 34.79234  , 17.766731 , 17.141891 ,27.755646 , 39.786766 , 22.49913  , 10.246634 , 36.76105  ,26.294876 , 20.75917  , 19.893272 , 46.62629  , 26.549704 ,24.040398 , 17.769514 , 20.76889  , 16.139618 , 17.494894 ,16.005596 , 24.28487  , 19.15237  , 31.407684 , 27.862312 ,18.877817 , 20.50497  , 16.094156 , 22.622025 , 17.762297 ,28.518019 , 41.146317 , 32.52681  , 23.117966 , 19.125128 ,24.141544 , 39.041847 , 25.901724 , 20.974117 , 19.626917 ,18.567612 , 46.46465  , 23.03303  ,  9.912106 , 26.407642 ,23.466772 , 16.985506 , 20.73746  , 15.679997 , 11.697191 ,21.320868 , 20.333689 , 41.616425 , 31.659132 , 25.605923 ,12.362759 , 14.593165 , 20.577328 ,  9.253377 , 11.1253805,32.878246 , 15.840851 , 24.695955 , 24.882996 , 34.643425 ,41.556873 , 19.726238 ,  8.808649 , 23.04128  , 14.709186 ,46.10303  , 21.435535 , 21.97892  , 24.299171 , 19.591938 ,27.527737 , 23.80468  , 18.782711 , 44.266346 , 17.328068 ,23.030151 , 23.801643 , 16.483137 , 18.219353 , 15.713125 ,23.655058 , 32.294373 , 10.60579  , 22.099716 , 19.26955  ,14.293162 , 19.386055 ,  8.824598 , 26.909697 , 29.539446 ,20.38691  , 20.832077 , 22.507433 , 11.142808 , 17.685743 ,40.230915 , 17.526121 , 23.09964  , 19.899158 , 31.775164 ,19.718151 , 12.164877 , 40.867558 , 24.465397 , 22.134802 ,15.041253 , 28.63522  ], dtype=float32)
# Mean squared error of the reloaded sklearn-API model's predictions on the test labels.
MSE(Ytest, ypreds)
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs  # generator for a synthetic dataset
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
# Build a deliberately imbalanced two-class dataset.
class_1 = 500  # class 1 has 500 samples
class_2 = 50  # class 2 has only 50 samples
centers = [[0.0, 0.0], [2.0, 2.0]]  # centers of the two classes
# Per-class spread, passed as cluster_std (a standard deviation); the class
# with more samples is usually the more spread-out one.
clusters_std = [1.5, 0.5]
# shuffle=False keeps the samples grouped by class in the returned arrays.
X, y = make_blobs(n_samples=[class_1, class_2],centers=centers,cluster_std=clusters_std,random_state=0, shuffle=False)
(550, 2)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
(y == 1).sum() / y.shape[0]  # share of the minority class: about 9%
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
# Model with the sklearn API.
# (The fit call had been swallowed into the comment above, so `clf` was never
# bound before the predict call below.)
clf = XGBC().fit(Xtrain, Ytrain)
ypred = clf.predict(Xtest)
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
clf.score(Xtest,Ytest)  # default model evaluation metric - accuracy
cm(Ytest,ypred,labels=[1,0])  # minority class (label 1) listed first
array([[  9,   4],[  8, 144]], dtype=int64)
# Refit with the positive (minority) class up-weighted by 10.
clf_ = XGBC(scale_pos_weight=10).fit(Xtrain, Ytrain)
ypred_ = clf_.predict(Xtest)

# Sweep several scale_pos_weight values and report accuracy, recall and AUC
# on the test split for each one. (The loop body was collapsed onto a single
# line, which is not valid Python; the line structure is restored here.)
for i in [1, 5, 10, 20, 30]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain, Ytrain)
    ypred_ = clf_.predict(Xtest)
    print(i)
    print("\tAccuracy:{}".format(clf_.score(Xtest, Ytest)))
    print("\tRecall:{}".format(recall(Ytest, ypred_)))
    # AUC is computed from the predicted probability of class 1, not the hard labels.
    print("\tAUC:{}".format(auc(Ytest, clf_.predict_proba(Xtest)[:, 1])))

# Refit once more with weight 20 so clf_ / ypred_ hold that model's results.
clf_ = XGBC(scale_pos_weight=20).fit(Xtrain, Ytrain)
ypred_ = clf_.predict(Xtest)
array([[ 13,   0],[  8, 144]], dtype=int64)
# Wrap the train/test splits in DMatrix objects for the native xgboost API.
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
# Binary logistic objective with scale_pos_weight fixed at 1 for this run.
# NOTE(review): 'silent' appears to have been superseded by 'verbosity' in
# newer xgboost releases; confirm against the installed version.
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
# The native API returns floating-point scores here; they are thresholded
# into class labels further down.
preds = bst.predict(dtest)
array([0.00110357, 0.00761518, 0.00110357, 0.00110357, 0.93531454,0.00466839, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00410493, 0.00454478, 0.00571528, 0.00751026,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00110357, 0.00793251, 0.00466839,0.00110357, 0.00339395, 0.00657186, 0.00110357, 0.00457053,0.00571528, 0.0026763 , 0.00110357, 0.00110357, 0.00110357,0.00884932, 0.00712637, 0.00110357, 0.00712637, 0.00466839,0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.63748044, 0.00110357, 0.00793251,0.00110357, 0.00451971, 0.00644181, 0.00110357, 0.00110357,0.00110357, 0.00110357, 0.00751026, 0.00712637, 0.00110357,0.00866458, 0.00110357, 0.00110357, 0.00110357, 0.91610426,0.00110357, 0.00110357, 0.89246494, 0.0026763 , 0.00501714,0.00761518, 0.00884932, 0.00339395, 0.00110357, 0.93531454,0.00110357, 0.00110357, 0.00110357, 0.82530665, 0.00751026,0.00110357, 0.35174078, 0.00110357, 0.00110357, 0.70393246,0.00110357, 0.76804197, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.96656513, 0.00110357, 0.00571528, 0.25400913,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00457053,0.00110357, 0.00110357, 0.00110357, 0.89246494, 0.00110357,0.9518535 , 0.0026763 , 0.00712637, 0.00110357, 0.00501714,0.00110357, 0.00110357, 0.00571528, 0.00110357, 0.00110357,0.00712637, 0.00110357, 0.00110357, 0.00712637, 0.00110357,0.25136763, 0.00110357, 0.00110357, 0.00110357, 0.00110357,0.00110357, 0.8904051 , 0.3876418 , 0.00110357, 0.00457053,0.00657186, 0.9366597 , 0.00866458, 0.00110357, 0.00501714,0.00501714, 0.00110357, 0.00110357, 0.00368543, 0.00501714,0.9830577 , 0.00110357, 0.00644181, 0.00110357, 0.00571528,0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00466839,0.00110357, 0.00110357, 0.92388713, 0.90231985, 0.80084217],dtype=float32)
# Work on a copy so the raw scores in preds stay available.
ypred = preds.copy()
# Scores above the 0.5 threshold are labelled as class 1.
ypred[preds > 0.5] = 1
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.], dtype=float32)
# Everything that was not set to 1 becomes class 0.
ypred[ypred != 1] = 0
# scale_pos_weight values to compare, with a display label for each.
scale_pos_weight = [1,5,10]
names = ["negative vs positive: 1","negative vs positive: 5","negative vs positive: 10"]
[('negative vs positive: 1', 1),('negative vs positive: 5', 5),('negative vs positive: 10', 10)]
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

# Train one native-API booster per scale_pos_weight value and report accuracy,
# recall and AUC on the test split at a fixed 0.5 decision threshold.
# (The import and the whole loop were fused onto a single line, which is not
# valid Python; the line structure is restored here.)
for name, i in zip(names, scale_pos_weight):
    # NOTE(review): 'silent' appears to have been superseded by 'verbosity' in
    # newer xgboost releases; confirm against the installed version.
    param = {'silent': True, 'objective': 'binary:logistic', "eta": 0.1, "scale_pos_weight": i}
    num_round = 100
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    # Convert scores to hard 0/1 labels; preds itself is kept for the AUC.
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest, ypred)))
    print("\tRecall:{}".format(recall(Ytest, ypred)))
    print("\tAUC:{}".format(auc(Ytest, preds)))
negative vs positive: 1Accuracy:0.9272727272727272Recall:0.6923076923076923AUC:0.9741902834008097
negative vs positive: 5Accuracy:0.9393939393939394Recall:0.8461538461538461AUC:0.9635627530364372
negative vs positive: 10Accuracy:0.9515151515151515Recall:1.0AUC:0.9665991902834008
# For every scale_pos_weight value, evaluate several decision thresholds and
# report accuracy, recall and AUC on the test split.
# (The nested loops were collapsed onto a single line, which is not valid
# Python; the line structure is restored here.)
for name, i in zip(names, scale_pos_weight):
    # The model depends only on the weight, not on the threshold, so it is
    # trained and scored once per weight rather than once per threshold.
    # NOTE(review): 'silent' appears to have been superseded by 'verbosity' in
    # newer xgboost releases; confirm against the installed version.
    param = {'silent': True, 'objective': 'binary:logistic', "eta": 0.1, "scale_pos_weight": i}
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    for thres in [0.3, 0.5, 0.7, 0.9]:
        # Convert scores to hard 0/1 labels at the current threshold.
        ypred = preds.copy()
        ypred[preds > thres] = 1
        ypred[ypred != 1] = 0
        print("{},thresholds:{}".format(name, thres))
        print("\tAccuracy:{}".format(accuracy(Ytest, ypred)))
        print("\tRecall:{}".format(recall(Ytest, ypred)))
        # AUC uses the raw scores, so it is the same for every threshold.
        print("\tAUC:{}".format(auc(Ytest, preds)))
negative vs positive: 1,thresholds:0.3Accuracy:0.9393939393939394Recall:0.8461538461538461AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.5Accuracy:0.9272727272727272Recall:0.6923076923076923AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.7Accuracy:0.9212121212121213Recall:0.6153846153846154AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.9Accuracy:0.9515151515151515Recall:0.5384615384615384AUC:0.9741902834008097
negative vs positive: 5,thresholds:0.3Accuracy:0.9515151515151515Recall:1.0AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.5Accuracy:0.9393939393939394Recall:0.8461538461538461AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.7Accuracy:0.9272727272727272Recall:0.6923076923076923AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.9Accuracy:0.9212121212121213Recall:0.6153846153846154AUC:0.9635627530364372
negative vs positive: 10,thresholds:0.3Accuracy:0.9515151515151515Recall:1.0AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.5Accuracy:0.9515151515151515Recall:1.0AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.7Accuracy:0.9393939393939394Recall:0.8461538461538461AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.9Accuracy:0.9212121212121213Recall:0.6153846153846154AUC:0.9665991902834008


  1. 集成模型Xgboost!机器学习最热研究方向入门,附学习路线图

    导读:本文介绍了集成学习中比较具有代表性的方法,如Boosting、Bagging等。而XGBoost是集成学习中的佼佼者,目前,一些主流的互联网公司如腾讯、阿里巴巴等都已将XGBoost应用到其业务 ...

  2. python 持续集成工具_持续集成工具: Jenkins学习

    持续集成工具: Jenkins学习 -- 部分内容收集自网络,如有侵权,请联系作者删除 一. 概念 在过去的开发整体流程中,是所有人写好代码之后统一进行合并(svn,git),然后进行测试,确保准发布 ...

  3. XGBoost学习(六):输出特征重要性以及筛选特征

    XGBoost学习(一):原理 XGBoost学习(二):安装及介绍 XGBoost学习(三):模型详解 XGBoost学习(四):实战 XGBoost学习(五):参数调优 XGBoost学习(六): ...

  4. 持续集成工具Jenkins学习4 Idea集成Jenkins插件

    持续集成工具Jenkins学习4 Idea集成Jenkins插件 一.功能简介 二.安装Idea插件 1. 搜索安装 2. 设置 三.Jenkins开启CSRF 四.使用 一.功能简介 Idea可以方 ...

  5. XGBoost学习(五):参数调优

    XGBoost学习(一):原理 XGBoost学习(二):安装及介绍 XGBoost学习(三):模型详解 XGBoost学习(四):实战 XGBoost学习(五):参数调优 XGBoost学习(六): ...

  6. XGBoost学习(四):实战-python3

    XGBoost学习(一):原理 XGBoost学习(二):安装及介绍 XGBoost学习(三):模型详解 XGBoost学习(四):实战 XGBoost学习(五):参数调优 XGBoost学习(六): ...

  7. 区别:强化学习集成学习增强学习规则学习

    1.强化学习 强化学习是智能体(Agent)以"试错"的方式进行学习,通过与环境进行交互获得的奖赏指导行为,目标是使智能体获得最大的奖赏,强化学习不同于连接主义学习中的监督学习,主 ...

  8. 虚拟专题:联邦学习 | 联邦学习研究综述

    来源:网络与信息安全学报 联邦学习研究综述 周传鑫,孙奕,汪德刚,葛桦玮 信息工程大学,河南 郑州 450001 摘要:联邦学习由于能够在多方数据源聚合的场景下协同训练全局最优模型,近年来迅速成为安全 ...

  9. 【原创】分享一些机器学习和深度学习的学习资料

    如果你还在苦苦寻找机器学习和深度学习入门资料的话,或许可以看看本文我的一些推荐,这些材料我自己都学过一遍,分享一下点评,希望对你有帮助。注意,本文只是点评这些资源,不提供任何资源的盗版下载,所有资源我 ...


  1. 算法笔记_114:等额本金(Java)
  2. Nginx HTTPS功能部署实践
  3. get post乱码解决
  4. JavaScript巧用对象的引用解决三级联动
  5. Py之albumentations:albumentations库函数的简介、安装、使用方法之详细攻略
  6. yii2 RESTful api的详细使用
  7. moosefs分布式文件系统
  8. web实现数据交互的几种常见方式
  9. 剑指offer(刷题41-50)--c++,Python版本
  10. 面试官:连Spring三级缓存都答不好,自己走还是我送你?
  11. java并发包作者lee_Java的一些并发包
  12. 51Nod 1058 N的阶乘的长度
  13. Win10 安装 Linux 子系统
  14. android 随身无线网卡,让小锐WiFi支持USB无线网卡/随身WiFi(附各种“随身wifi”芯片型号)...
  15. rtklib-RINEX文件读取-rinex.c解析(一)
  16. 打印机驱动无法安装到计算机是,电脑打印机无法安装驱动的解决方法
  17. coldfusion_在Windows上安装和配置ColdFusion MX 6.1
  18. 获取ftp服务器文件,ftp获取服务器文件
  19. 一种网格去噪算法(基于平均面法向的均值滤波)
  20. 用python求解一元二次方程组


  1. 美团实习| 周记(三)
  2. 《涨知识啦32》-SBD器件中的肖特基二极管漏电流机制 (上)
  3. 2022-10-18
  4. 数伏食用黄瓜和鸡蛋治慢性支气管炎
  5. webservice25--基于契约优先开发用户管理小功能--异常处理
  6. 更快更强大,Oracle Primavera P6 R18.8 发布
  7. 2023最新SSM计算机毕业设计选题大全(附源码+LW)之java医院住院管理系统7lio5
  8. 关于服务器开放SQL Server1433端口
  9. php投票post,请教一个简单的投票页面,post怎么写
  10. 2019年计算机考研408操作系统真题(客观题)