机器学习集中训练营——基于英雄联盟数据集的LightGBM分类实战

# 导入包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 导入数据
df = pd.read_csv(r"D:\备份\OneDrive\桌面\资料\high_diamond_ranked_10min.csv", delimiter=',')
y = df.blueWins

# 简单查看数据
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9879 entries, 0 to 9878
Data columns (total 40 columns):#   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----  0   gameId                        9879 non-null   int64  1   blueWins                      9879 non-null   int64  2   blueWardsPlaced               9879 non-null   int64  3   blueWardsDestroyed            9879 non-null   int64  4   blueFirstBlood                9879 non-null   int64  5   blueKills                     9879 non-null   int64  6   blueDeaths                    9879 non-null   int64  7   blueAssists                   9879 non-null   int64  8   blueEliteMonsters             9879 non-null   int64  9   blueDragons                   9879 non-null   int64  10  blueHeralds                   9879 non-null   int64  11  blueTowersDestroyed           9879 non-null   int64  12  blueTotalGold                 9879 non-null   int64  13  blueAvgLevel                  9879 non-null   float6414  blueTotalExperience           9879 non-null   int64  15  blueTotalMinionsKilled        9879 non-null   int64  16  blueTotalJungleMinionsKilled  9879 non-null   int64  17  blueGoldDiff                  9879 non-null   int64  18  blueExperienceDiff            9879 non-null   int64  19  blueCSPerMin                  9879 non-null   float6420  blueGoldPerMin                9879 non-null   float6421  redWardsPlaced                9879 non-null   int64  22  redWardsDestroyed             9879 non-null   int64  23  redFirstBlood                 9879 non-null   int64  24  redKills                      9879 non-null   int64  25  redDeaths                     9879 non-null   int64  26  redAssists                    9879 non-null   int64  27  redEliteMonsters              9879 non-null   int64  28  redDragons                    9879 non-null   int64  29  redHeralds                    9879 non-null   int64  30  redTowersDestroyed            9879 non-null   int64  31  redTotalGold                  9879 non-null   int64  32  redAvgLevel                   9879 non-null   float6433  redTotalExperience            9879 non-null   int64  34  redTotalMinionsKilled         9879 non-null   int64  35  redTotalJungleMinionsKilled   9879 non-null   int64  36  redGoldDiff                   9879 non-null   int64  37  redExperienceDiff             9879 non-null   int64  38  redCSPerMin                   9879 non-null   float6439  redGoldPerMin                 9879 non-null   float64
dtypes: float64(6), int64(34)
memory usage:

# 查看前5行数据
df.head()

## 标注标签并利用value_counts函数查看训练集标签的数量
y.value_counts()
# 各分类标签数量相对均衡

0    4949
1    4930
Name: blueWins, dtype: int64

# 除去不需要的特征和y标签后得到x
dropd = ['gameId','blueWins']
x = df.drop(dropd, axis = 1)

## 对于特征进行一些统计描述
x.describe()

## 根据上面的描述，我们可以去除一些重复变量，比如只要知道蓝队是否拿到一血，我们就知道红队有没有拿到，可以去除红队的相关冗余数据。
drop_cols = ['redFirstBlood','redKills','redDeaths','redGoldDiff','redExperienceDiff', 'blueCSPerMin','blueGoldPerMin','redCSPerMin','redGoldPerMin']
x.drop(drop_cols, axis = 1, inplace =True)

data = x
data_std = (data - data.mean()) / data.std()
data = pd.concat([y, data_std.iloc[:, 0:9]], axis=1)
data = pd.melt(data, id_vars='blueWins', var_name='Features', value_name='Values')fig, ax = plt.subplots(1,2,figsize=(15,5))# 绘制小提琴图
sns.violinplot(x='Features', y='Values', hue='blueWins', data=data, split=True,inner='quart', ax=ax[0], palette='Blues')
fig.autofmt_xdate(rotation=45)data = x
data_std = (data - data.mean()) / data.std()
data = pd.concat([y, data_std.iloc[:, 9:18]], axis=1)
data = pd.melt(data, id_vars='blueWins', var_name='Features', value_name='Values')# 绘制小提琴图
sns.violinplot(x='Features', y='Values', hue='blueWins', data=data, split=True, inner='quart', ax=ax[1], palette='Blues')
fig.autofmt_xdate(rotation=45)plt.show()

plt.figure(figsize=(18,14))
sns.heatmap(round(x.corr(),2), cmap='Blues', annot=True)
plt.show()

# 去除冗余特征
drop_cols = ['redAvgLevel','blueAvgLevel']
x.drop(drop_cols, axis=1, inplace=True)
# 同时我们画出各个特征之间的相关性热力图，颜色越深代表特征之间相关性越强，我们剔除那些相关性较强的冗余特征。

sns.set(style='whitegrid', palette='muted')# 构造两个新特征
x['wardsPlacedDiff'] = x['blueWardsPlaced'] - x['redWardsPlaced']
x['wardsDestroyedDiff'] = x['blueWardsDestroyed'] - x['redWardsDestroyed']data = x[['blueWardsPlaced','blueWardsDestroyed','wardsPlacedDiff','wardsDestroyedDiff']].sample(1000)
data_std = (data - data.mean()) / data.std()
data = pd.concat([y, data_std], axis=1)
data = pd.melt(data, id_vars='blueWins', var_name='Features', value_name='Values')plt.figure(figsize=(10,6))
sns.swarmplot(x='Features', y='Values', hue='blueWins', data=data)
plt.xticks(rotation=45)
plt.show()

'''我们画出了插眼数量的散点图，发现不存在插眼数量与游戏胜负间的显著规律。
猜测由于钻石分段以上在哪插眼在哪好排眼都是套路，所以数据中前十分钟插眼数拔眼数对游戏的影响不大。所以我们暂时先把这些特征去掉。
'''
## 去除和眼位相关的特征
drop_cols = ['blueWardsPlaced','blueWardsDestroyed','wardsPlacedDiff','wardsDestroyedDiff','redWardsPlaced','redWardsDestroyed']
x.drop(drop_cols, axis=1, inplace=True)

x['killsDiff'] = x['blueKills'] - x['blueDeaths']
x['assistsDiff'] = x['blueAssists'] - x['redAssists']x[['blueKills','blueDeaths','blueAssists','killsDiff','assistsDiff','redAssists']].hist(figsize=(12,10), bins=20)
plt.show()

data = x[['blueKills','blueDeaths','blueAssists','killsDiff','assistsDiff','redAssists']].sample(1000)
data_std = (data - data.mean()) / data.std()
data = pd.concat([y, data_std], axis=1)
data = pd.melt(data, id_vars='blueWins', var_name='Features', value_name='Values')plt.figure(figsize=(10,6))
sns.swarmplot(x='Features', y='Values', hue='blueWins', data=data)
plt.xticks(rotation=45)
plt.show()

# 从上图我们可以发现击杀数与死亡数与助攻数，以及我们构造的特征对数据都有较好的分类能力。
data = pd.concat([y, x], axis=1).sample(500)sns.pairplot(data, vars=['blueKills','blueDeaths','blueAssists','killsDiff','assistsDiff','redAssists'], hue='blueWins')plt.show()

x['dragonsDiff'] = x['blueDragons'] - x['redDragons']
x['heraldsDiff'] = x['blueHeralds'] - x['redHeralds']
x['eliteDiff'] = x['blueEliteMonsters'] - x['redEliteMonsters']data = pd.concat([y, x], axis=1)eliteGroup = data.groupby(['eliteDiff'])['blueWins'].mean()
dragonGroup = data.groupby(['dragonsDiff'])['blueWins'].mean()
heraldGroup = data.groupby(['heraldsDiff'])['blueWins'].mean()fig, ax = plt.subplots(1,3, figsize=(15,4))eliteGroup.plot(kind='bar', ax=ax[0])
dragonGroup.plot(kind='bar', ax=ax[1])
heraldGroup.plot(kind='bar', ax=ax[2])print(eliteGroup)
print(dragonGroup)
print(heraldGroup)plt.show()

eliteDiff
-2    0.286301
-1    0.3687720    0.5006831    0.6320932    0.735211
Name: blueWins, dtype: float64
dragonsDiff
-1    0.3741730    0.5000001    0.640940
Name: blueWins, dtype: float64
heraldsDiff
-1    0.3877290    0.4986801    0.595046
Name: blueWins, dtype: float64

x['towerDiff'] = x['blueTowersDestroyed'] - x['redTowersDestroyed']data = pd.concat([y, x], axis=1)towerGroup = data.groupby(['towerDiff'])['blueWins']
print(towerGroup.count())
print(towerGroup.mean())fig, ax = plt.subplots(1,2,figsize=(15,5))towerGroup.mean().plot(kind='line', ax=ax[0])
ax[0].set_title('Proportion of Blue Wins')
ax[0].set_ylabel('Proportion')towerGroup.count().plot(kind='line', ax=ax[1])
ax[1].set_title('Count of Towers Destroyed')
ax[1].set_ylabel('Count')

towerDiff
-2      27
-1     3470    90641     4062      283       64       1
Name: blueWins, dtype: int64
towerDiff
-2    0.185185
-1    0.2161380    0.4981241    0.7413792    0.9642863    1.0000004    1.000000
Name: blueWins, dtype: float64

Out[30]:

Text(0, 0.5, 'Count')

利用 LightGBM 进行训练与预测

## 为了正确评估模型性能，将数据划分为训练集和测试集，并在训练集上训练模型，在测试集上验证模型性能。
from sklearn.model_selection import train_test_split
## 选择其类别为0和1的样本 （不包括类别为2的样本）
data_target_part = y
data_features_part = x
## 测试集大小为20%， 80%/20%分
x_train, x_test, y_train, y_test = train_test_split(data_features_part, data_target_part, test_size=0.2, random_state=2022)

pip install lightgbm

#  导入LightGBM模型
from lightgbm.sklearn import LGBMClassifier
clf = LGBMClassifier()
clf.fit(x_train,y_train)

## 在训练集和测试集上分布利用训练好的模型进行预测
train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)
from sklearn import metrics
## 利用accuracy（准确度）【预测正确的样本数目占总预测样本数目的比例】评估模型效果
print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_train, train_predict))
print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_test,test_predict))
## 查看混淆矩阵 (预测值和真实值的各类情况统计矩阵)
confusion_matrix_result = metrics.confusion_matrix(test_predict, y_test)
print('The confusion matrix result:\n',confusion_matrix_result)
# 利用热力图对于结果进行可视化
plt.figure(figsize=(8,6))
sns.heatmap(confusion_matrix_result,annot=True, cmap='Blues')
plt.show()

The accuracy of the Logistic Regression is: 0.8366443122864735
The accuracy of the Logistic Regression is: 0.7312753036437247
The confusion matrix result:[[748 251][280 697]]

sns.barplot(y=data_features_part.columns, x=clf.feature_importances_)  # 查看特征的重要性

'''
除此之外，我们还可以使用LightGBM中的下列重要属性来评估特征的重要性。
gain:当利用特征做划分的时候的评价基尼指数
split:是以特征用到的次数来评价
'''
from sklearn.metrics import accuracy_score
from lightgbm import plot_importancedef estimate(model,data):#sns.barplot(data.columns,model.feature_importances_)ax1=plot_importance(model,importance_type="gain")ax1.set_title('gain')ax2=plot_importance(model, importance_type="split")ax2.set_title('split')plt.show()
def classes(data,label,test):model=LGBMClassifier()model.fit(data,label)ans=model.predict(test)estimate(model, data)return ansans=classes(x_train,y_train,x_test)
pre=accuracy_score(y_test, ans)
print('acc=',accuracy_score(y_test,ans))

'''
ightGBM中包括但不限于下列对模型影响较大的参数：
learning_rate: 有时也叫作eta，系统默认值为0.3。每一步迭代的步长，很重要。太大了运行准确率不高，太小了运行速度慢。
num_leaves：系统默认为32。这个参数控制每棵树中最大叶子节点数量。
feature_fraction：系统默认值为1。我们一般设置成0.8左右。用来控制每棵随机采样的列数的占比(每一列是一个特征)。
max_depth： 系统默认值为6，我们常用3-10之间的数字。这个值为树的最大深度。这个值是用来控制过拟合的。max_depth越大，模型学习的更加具体。
调节模型参数的方法有贪心算法、网格调参、贝叶斯调参等。这里我们采用网格调参，它的基本思想是穷举搜索：在所有候选的参数选择中，通过循环遍历，尝试每一种可能性，表现最好的参数就是最终的结果
'''
## 从sklearn库中导入网格调参函数
from sklearn.model_selection import GridSearchCV## 定义参数取值范围
learning_rate = [0.1, 0.3, 0.6]
feature_fraction = [0.5, 0.8, 1]
num_leaves = [16, 32, 64]
max_depth = [-1,3,5,8]parameters = { 'learning_rate': learning_rate,'feature_fraction':feature_fraction,'num_leaves': num_leaves,'max_depth': max_depth}
model = LGBMClassifier(n_estimators = 50)## 进行网格搜索
clf = GridSearchCV(model, parameters, cv=3, scoring='accuracy',verbose=3, n_jobs=-1)
clf = clf.fit(x_train, y_train)

clf.best_params_

{'feature_fraction': 1, 'learning_rate': 0.1, 'max_depth': 3, 'num_leaves': 16}

## 在训练集和测试集上分布利用最好的模型参数进行预测## 定义带参数的 LightGBM模型
clf = LGBMClassifier(feature_fraction = 1,learning_rate = 0.1,max_depth= 3,num_leaves = 16)
# 在训练集上训练LightGBM模型
clf.fit(x_train, y_train)train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)## 利用accuracy（准确度）【预测正确的样本数目占总预测样本数目的比例】评估模型效果
print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_train,train_predict))
print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_test,test_predict))## 查看混淆矩阵 (预测值和真实值的各类情况统计矩阵)
confusion_matrix_result = metrics.confusion_matrix(test_predict,y_test)
print('The confusion matrix result:\n',confusion_matrix_result)# 利用热力图对于结果进行可视化
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()

[LightGBM] [Warning] feature_fraction is set=1, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=1
The accuracy of the Logistic Regression is: 0.7445273946602556
The accuracy of the Logistic Regression is: 0.7444331983805668
The confusion matrix result:[[763 240][265 708]]

LightGBM的重要参数

基本参数调整 num_leaves参数这是控制树模型复杂度的主要参数，一般的我们会使num_leaves小于（2的max_depth次方），以防止过拟合。由于LightGBM是leaf-wise建树与XGBoost的depth-wise建树方法不同，num_leaves比depth有更大的作用。、

min_data_in_leaf 这是处理过拟合问题中一个非常重要的参数. 它的值取决于训练数据的样本个树和 num_leaves参数. 将其设置的较大可以避免生成一个过深的树, 但有可能导致欠拟合. 实际应用中, 对于大数据集, 设置其为几百或几千就足够了.

max_depth 树的深度，depth 的概念在 leaf-wise 树中并没有多大作用, 因为并不存在一个从 leaves 到 depth 的合理映射。

针对训练速度的参数调整通过设置 bagging_fraction 和 bagging_freq 参数来使用 bagging 方法。通过设置 feature_fraction 参数来使用特征的子抽样。选择较小的 max_bin 参数。使用 save_binary 在未来的学习过程对数据加载进行加速。 2.4.1.3 针对准确率的参数调整使用较大的 max_bin （学习速度可能变慢）使用较小的 learning_rate 和较大的 num_iterations 使用较大的 num_leaves （可能导致过拟合）使用更大的训练数据尝试 dart 模式 2.4.1.4 针对过拟合的参数调整使用较小的 max_bin 使用较小的 num_leaves 使用 min_data_in_leaf 和 min_sum_hessian_in_leaf 通过设置 bagging_fraction 和 bagging_freq 来使用 bagging 通过设置 feature_fraction 来使用特征子抽样使用更大的训练数据使用 lambda_l1, lambda_l2 和 min_gain_to_split 来使用正则尝试 max_depth 来避免生成过深的树