Python 决策树的建树、剪枝与优化

关注微信公共号：小程在线

关注CSDN博客：程志伟的博客

八个参数：Criterion，两个随机性相关的参数（random_state，splitter），五个剪枝参数（max_depth,
min_samples_split，min_samples_leaf，max_feature，min_impurity_decrease）
一个属性：feature_importances_
四个接口：fit，score，apply，predict

Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)]
Type "copyright", "credits" or "license" for more information.

IPython 7.6.1 -- An enhanced Interactive Python.

########################## 分类树 ################

from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

wine = load_wine()

wine.data.shape
Out[3]: (178, 13)

wine.target
Out[4]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2])

import pandas as pd
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)
Out[5]:
0 1 2 3 4 5 ... 8 9 10 11 12 0
0 14.23 1.71 2.43 15.6 127.0 2.80 ... 2.29 5.64 1.04 3.92 1065.0 0
1 13.20 1.78 2.14 11.2 100.0 2.65 ... 1.28 4.38 1.05 3.40 1050.0 0
2 13.16 2.36 2.67 18.6 101.0 2.80 ... 2.81 5.68 1.03 3.17 1185.0 0
3 14.37 1.95 2.50 16.8 113.0 3.85 ... 2.18 7.80 0.86 3.45 1480.0 0
4 13.24 2.59 2.87 21.0 118.0 2.80 ... 1.82 4.32 1.04 2.93 735.0 0
.. ... ... ... ... ... ... ... ... ... ... ... ... ..
173 13.71 5.65 2.45 20.5 95.0 1.68 ... 1.06 7.70 0.64 1.74 740.0 2
174 13.40 3.91 2.48 23.0 102.0 1.80 ... 1.41 7.30 0.70 1.56 750.0 2
175 13.27 4.28 2.26 20.0 120.0 1.59 ... 1.35 10.20 0.59 1.56 835.0 2
176 13.17 2.59 2.37 20.0 120.0 1.65 ... 1.46 9.30 0.60 1.62 840.0 2
177 14.13 4.10 2.74 24.5 96.0 2.05 ... 1.35 9.20 0.61 1.60 560.0 2

[178 rows x 14 columns]

#变量的名称

wine.feature_names
Out[6]:
['alcohol',
'malic_acid',
'ash',
'alcalinity_of_ash',
'magnesium',
'total_phenols',
'flavanoids',
'nonflavanoid_phenols',
'proanthocyanins',
'color_intensity',
'hue',
'od280/od315_of_diluted_wines',
'proline']

#标签的分类

wine.target_names
Out[7]: array(['class_0', 'class_1', 'class_2'], dtype='<U7')

#分为训练集与测试集

Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)

Xtrain.shape
Out[9]: (124, 13)

Xtest.shape
Out[10]: (54, 13)

#进行预测

clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest) #返回预测的准确度
score
Out[11]: 0.9629629629629629

#对决策树进行可视化

feature_name = ['酒精','苹果酸','灰','灰的碱性','镁','总酚','类黄酮','非黄烷类酚类','花青素','颜色强度','色调','od280/od315稀释葡萄酒','脯氨酸']

import graphviz

dot_data = tree.export_graphviz(clf
,out_file = None
,feature_names= feature_name
,class_names=["琴酒","雪莉","贝尔摩德"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
graph
Out[14]:

#查看变量的重要程度

clf.feature_importances_
Out[15]:
array([0.01424694, 0. , 0. , 0. , 0. ,
0. , 0.41643217, 0. , 0. , 0.23051758,
0. , 0. , 0.33880331])

#将重要性与名字相对应
[*zip(feature_name,clf.feature_importances_)]
Out[17]:
[('酒精', 0.01424693586672224),
('苹果酸', 0.0),
('灰', 0.0),
('灰的碱性', 0.0),
('镁', 0.0),
('总酚', 0.0),
('类黄酮', 0.41643217174608416),
('非黄烷类酚类', 0.0),
('花青素', 0.0),
('颜色强度', 0.23051757969779874),
('色调', 0.0),
('od280/od315稀释葡萄酒', 0.0),
('脯氨酸', 0.33880331268939484)]

在每次分枝时，不从使用全部特征，而是随
机选取一部分特征，从中选取不纯度相关指标最优的作为分枝用的节点。这样，每次生成的树也就不同了
clf = tree.DecisionTreeClassifier(criterion="entropy",random_state=30)
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest) #返回预测的准确度
score
Out[18]: 0.9629629629629629

splitter也是用来控制决策树中的随机选项的，有两种输入值，输入”best"，决策树在分枝时虽然随机，但是还是会
优先选择更重要的特征进行分枝，输入“random"，决策树在分枝时会更加随机

clf = tree.DecisionTreeClassifier(criterion="entropy"
,random_state=30
,splitter="random"
)
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
score
Out[19]: 0.9629629629629629

import graphviz
dot_data = tree.export_graphviz(clf
,feature_names= feature_name
,class_names=["琴酒","雪莉","贝尔摩德"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
graph
Out[20]:

测试集结果

score_train = clf.score(Xtrain, Ytrain)
score_train
Out[21]: 1.0

#######################剪枝参数
max_depth
限制树的最大深度，超过设定深度的树枝全部剪掉

clf = tree.DecisionTreeClassifier(criterion="entropy"
,random_state=30
,splitter="random"
,max_depth=3

)

dot_data = tree.export_graphviz(clf
,feature_names= feature_name
,class_names=["琴酒","雪莉","贝尔摩德"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
graph
Traceback (most recent call last):

File "<ipython-input-22-078a2a9dc9b9>", line 12, in <module>
,rounded=True

File "H:\Anaconda3\lib\site-packages\sklearn\tree\export.py", line 756, in export_graphviz
check_is_fitted(decision_tree, 'tree_')

File "H:\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 914, in check_is_fitted
raise NotFittedError(msg % {'name': type(estimator).__name__})

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

解决方法：

先调用fit方法再进行预测

clf = clf.fit(Xtrain, Ytrain)

在执行上面的脚本
dot_data = tree.export_graphviz(clf
,feature_names= feature_name
,class_names=["琴酒","雪莉","贝尔摩德"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
graph
Out[26]:

min_samples_leaf限定，一个节点在分枝后的每个子节点都必须包含至少min_samples_leaf个训练样本，否则分
枝就不会发生，或者，分枝会朝着满足每个子节点都包含min_samples_leaf个样本的方向去发生

clf = tree.DecisionTreeClassifier(criterion="entropy"
,random_state=30
,splitter="random"
,max_depth=3
,min_samples_leaf=10

)
clf = clf.fit(Xtrain, Ytrain)
dot_data = tree.export_graphviz(clf
,feature_names= feature_name
,class_names=["琴酒","雪莉","贝尔摩德"]
,filled=True
,rounded=True
)
graph = graphviz.Source(dot_data)
graph
Out[27]:

samples的数字都大于10，上一副图中不大于10的都变为10以上了。

min_samples_split限定，一个节点必须要包含至少min_samples_split个训练样本，这个节点才允许被分枝，否则
分枝就不会发生。

clf = tree.DecisionTreeClassifier(criterion="entropy"
,random_state=30
,splitter="random"

,min_samples_split=60

确认最优的剪枝参数

import matplotlib.pyplot as plt

test = []

for i in range(10):
clf = tree.DecisionTreeClassifier(max_depth=i+1
,criterion="entropy"
,random_state=30
,splitter="random"
)
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
test.append(score)

plt.plot(range(1,11),test,color="red",label="max_depth")
plt.legend()
plt.show()

每个数据归属的节点

clf.apply(Xtest)
Out[36]:
array([22, 14, 30, 22, 30, 14, 14, 11, 22, 14, 14, 26, 22, 22, 30, 18, 30,
9, 14, 4, 4, 14, 30, 26, 30, 14, 11, 30, 4, 30, 4, 4, 26, 8,
4, 4, 14, 6, 30, 22, 30, 30, 26, 30, 24, 8, 9, 4, 30, 4, 30,
4, 4, 30], dtype=int64)

每个数据分分类

clf.predict(Xtest)
Out[37]:
array([1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 2, 0, 1, 1, 2, 2, 1,
0, 0, 0, 1, 1, 0, 2, 0, 2, 2, 0, 2, 2, 2, 1, 2, 0, 1, 0, 0, 0, 0,
1, 2, 1, 2, 0, 2, 0, 2, 2, 0])

#############################泰坦尼克号数据 #########################

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

data = pd.read_csv(r"H:\程志伟\python\菜菜的机器学习skleaen课堂\01 决策树课件数据源码\Taitanic data\data.csv",index_col= 0)

data.head()
Out[40]:
Survived Pclass ... Cabin Embarked
PassengerId ...
1 0 3 ... NaN S
2 1 1 ... C85 C
3 1 3 ... NaN S
4 1 1 ... C123 S
5 0 3 ... NaN S

[5 rows x 11 columns]

data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null int64
1 Pclass 891 non-null int64
2 Name 891 non-null object
3 Sex 891 non-null object
4 Age 714 non-null float64
5 SibSp 891 non-null int64
6 Parch 891 non-null int64
7 Ticket 891 non-null object
8 Fare 891 non-null float64
9 Cabin 204 non-null object
10 Embarked 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB

#删除缺失值过多的列，和观察判断来说和预测的y没有关系的列
data.drop(["Cabin","Name","Ticket"],inplace=True,axis=1)

#处理缺失值，对缺失值较多的列进行填补，有一些特征只确实一两个值，可以采取直接删除记录的方法
data["Age"] = data["Age"].fillna(data["Age"].mean())
data = data.dropna()

#将分类变量转换为数值型变量
#将二分类变量转换为数值型变量
#astype能够将一个pandas对象转换为某种类型，和apply(int(x))不同，astype可以将文本类转换为数字，用这
个方式可以很便捷地将二分类特征转换为0~1
data["Sex"] = (data["Sex"]== "male").astype("int")

#将三分类变量转换为数值型变量
labels = data["Embarked"].unique().tolist()
data["Embarked"] = data["Embarked"].apply(lambda x: labels.index(x))

#查看处理后的数据集
data.head()
Out[47]:
Survived Pclass Sex Age SibSp Parch Fare Embarked
PassengerId
1 0 3 1 22.0 1 0 7.2500 0
2 1 1 0 38.0 1 0 71.2833 1
3 1 3 0 26.0 0 0 7.9250 0
4 1 1 0 35.0 1 0 53.1000 0
5 0 3 1 35.0 0 0 8.0500 0

提取标签和特征矩阵，分测试集和训练集

X = data.iloc[:,data.columns != "Survived"]
y = data.iloc[:,data.columns == "Survived"]

from sklearn.model_selection import train_test_split

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,y,test_size=0.3)

for i in [Xtrain, Xtest, Ytrain, Ytest]:
i.index = range(i.shape[0])

Xtrain.head()
Out[52]:
Pclass Sex Age SibSp Parch Fare Embarked
0 2 1 2.000000 1 1 26.0000 0
1 3 1 26.000000 0 0 7.8958 0
2 1 0 35.000000 1 0 52.0000 0
3 1 0 52.000000 1 1 93.5000 0
4 3 1 29.699118 0 0 8.0500 0

#对测试集进行预测

clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(Xtrain, Ytrain)
score_ = clf.score(Xtest, Ytest)
score_
Out[53]: 0.7865168539325843

from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

#交叉验证

score = cross_val_score(clf,X,y,cv=10).mean()
score
Out[55]: 0.7739274770173645

在不同max_depth下观察模型的拟合状况

tr = []
te = []
for i in range(10):
clf = DecisionTreeClassifier(random_state=25
,max_depth=i+1
,criterion="entropy"
)
clf = clf.fit(Xtrain, Ytrain)
score_tr = clf.score(Xtrain,Ytrain)
score_te = cross_val_score(clf,X,y,cv=10).mean()
tr.append(score_tr)
te.append(score_te)

print(max(te))
0.8177860061287026

plt.plot(range(1,11),tr,color="red",label="train")
plt.plot(range(1,11),te,color="blue",label="test")
plt.xticks(range(1,11))
plt.legend()
plt.show()

用网格搜索调整参数

import numpy as np
gini_thresholds = np.linspace(0,0.5,20)
parameters = {'splitter':('best','random')
,'criterion':("gini","entropy")
,"max_depth":[*range(1,10)]
,'min_samples_leaf':[*range(1,50,5)]
,'min_impurity_decrease':[*np.linspace(0,0.5,20)]
}

clf = DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=10)
GS.fit(Xtrain,Ytrain)
Out[62]:
GridSearchCV(cv=10, error_score='raise-deprecating',
estimator=DecisionTreeClassifier(class_weight=None,
criterion='gini', max_depth=None,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort=False, random_state=25,
splitter='best'),
iid='warn', n_...
0.23684210526315788,
0.2631578947368421,
0.2894736842105263,
0.3157894736842105,
0.3421052631578947,
0.3684210526315789,
0.39473684210526316,
0.42105263157894735,
0.4473684210526315,
0.47368421052631576, 0.5],
'min_samples_leaf': [1, 6, 11, 16, 21, 26, 31, 36, 41,
46],
'splitter': ('best', 'random')},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)

#最优的参数

GS.best_params_
Out[63]:
{'criterion': 'gini',
'max_depth': 3,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 6,
'splitter': 'best'}

#最优的预测结果

GS.best_score_
Out[64]: 0.8183279742765274