
1  1  1  1  1  3
2  1  1  1  2  2
3  1  1  2  1  3
4  1  1  2  2  1
5  1  2  1  1  3
6  1  2  1  2  2
7  1  2  2  1  3
8  1  2  2  2  1
9  2  1  1  1  3
10  2  1  1  2  2
11  2  1  2  1  3
12  2  1  2  2  1
13  2  2  1  1  3
14  2  2  1  2  2
15  2  2  2  1  3
16  2  2  2  2  3
17  3  1  1  1  3
18  3  1  1  2  3
19  3  1  2  1  3
20  3  1  2  2  1
21  3  2  1  1  3
22  3  2  1  2  2
23  3  2  2  1  3
24  3  2  2  2  3


第一列是样本编号。第二列的1到3对应病人的年龄(age of the patient):青年(young)、中年(pre-presbyopic)、老年(presbyopic)

第三列的1和2对应眼镜处方类型(spectacle prescription):近视(myope)、远视(hypermetrope)

第四列的1和2对应是否散光(astigmatic):否(no)、是(yes)


第五列的1和2对应泪液分泌量(tear production rate):偏少(reduced)、正常(normal)

第六列的1到3是根据以上数据得到的最终分类:硬性隐形眼镜(hard)、软性隐形眼镜(soft)、不适合佩戴隐形眼镜(no lenses)


from numpy import *
import operator
from math import log


def createLensesDataSet(filename='lenses.data'):
    """Load the lenses data file and split it into features and class names.

    Each non-blank line must hold six whitespace-separated integer codes:
    field 0 is skipped, fields 1-4 are the four feature codes and field 5 is
    the class code, which is translated to a name through
    ``['hard', 'soft', 'no lenses']`` (code 1 -> 'hard', and so on).

    Args:
        filename: path of the data file. Defaults to ``'lenses.data'`` in the
            current working directory, the location the original code used.

    Returns:
        tuple: ``(features, statusLabels, classLabelVector)`` where
        ``features`` is a list with one ``[f1, f2, f3, f4]`` list of floats
        per record, ``statusLabels`` is the list of the four feature names,
        and ``classLabelVector`` is the list of class names, one per record.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError: if a non-blank line has fewer than six fields.
        ValueError: if a field is not numeric.
    """
    statusLabels = ['age of the patient', 'spectacle prescription', 'astigmatic', 'tear production rate']
    classLabels = ['hard', 'soft', 'no lenses']
    features = []
    classLabelVector = []
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            # split() without arguments tolerates any run of spaces or tabs,
            # so the parser no longer depends on exactly two spaces.
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            features.append([float(field) for field in fields[1:5]])
            # Class codes in the file are 1-based.
            classLabelVector.append(classLabels[int(fields[5]) - 1])
    return features, statusLabels, classLabelVector


def createLensesAttributeInfo():
    """Return the value names of the four lens attributes.

    Returns:
        tuple: ``(parentAgeList, spectacleList, astigmaticList,
        tearRateList)``; each list holds the names for one attribute, in the
        same attribute order as the feature columns produced by
        ``createLensesDataSet``. The caller elsewhere in this file looks a
        name up with ``numeric_code - 1`` as the index.
    """
    parentAgeList = ['young', 'pre', 'presbyopic']
    spectacleList = ['myope', 'hyper']
    astigmaticList = ['no', 'yes']
    tearRateList = ['reduced', 'normal']
    return parentAgeList, spectacleList, astigmaticList, tearRateList

那么接下来我们应该设定决策树的分支,如何确定以上哪一个特征是第一个分支呢,我们要提到一个概念,香农熵(Shannon entropy)。熵这个概念代表信息的不确定性的大小,在划分数据集中经常会运用到。



def calcShannonEnt(dataSet):
    """Compute the Shannon entropy, in bits, of the class labels in dataSet.

    Args:
        dataSet: sequence of records; the last element of every record is
            used as that record's class label.

    Returns:
        float: ``-sum(p * log(p, 2))`` over the distinct labels, where ``p``
        is the label's relative frequency. An empty ``dataSet`` gives 0.0
        because there is no label to sum over.
    """
    from collections import Counter

    numEntries = len(dataSet)
    # Tally how often each class label occurs.
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt



def splitDataSet(dataSet, axis, value):
    """Select the records whose feature ``axis`` equals ``value``.

    Args:
        dataSet: sequence of records (lists or tuples).
        axis: index of the feature column to test.
        value: feature value a record must have to be kept.

    Returns:
        list: one new list per matching record, holding the record's
        elements with column ``axis`` removed. The input records are not
        modified.
    """
    return [
        # Rebuild each kept record without the column that was split on.
        list(featVec[:axis]) + list(featVec[axis + 1:])
        for featVec in dataSet
        if featVec[axis] == value
    ]

说到选取最佳特征,我们就要提到一个概念:信息增益(information gain)。信息增益等于划分前数据集的熵,减去按某个特征划分后各子集熵的加权平均;信息增益最大的特征就是当前的最佳划分特征。






def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column whose split gives the largest information gain.

    Every column except the last (the class label) is tried. For each one
    the data set is partitioned by the column's distinct values and the
    size-weighted entropy of the partitions is subtracted from the entropy
    of the whole set. The gain of every column is printed as
    ``<index>:<gain>``.

    Args:
        dataSet: non-empty sequence of records whose last element is the
            class label.

    Returns:
        int: index of the column with the highest strictly positive gain
        (the first such column on ties), or -1 when no column has a gain
        greater than zero.
    """
    featureCount = len(dataSet[0]) - 1
    parentEntropy = calcShannonEnt(dataSet)
    topGain, topFeature = 0.0, -1
    for col in range(featureCount):
        distinctValues = set([record[col] for record in dataSet])
        # Entropy left after the split, weighted by partition size.
        weightedEntropy = 0.0
        for val in distinctValues:
            partition = splitDataSet(dataSet, col, val)
            weight = len(partition) / float(len(dataSet))
            weightedEntropy += weight * calcShannonEnt(partition)
        gain = parentEntropy - weightedEntropy
        print(str(col)+':'+str(gain))
        if gain > topGain:
            topGain, topFeature = gain, col
    return topFeature

通过计算各个特征的信息增益,我们可以得出特征的优先级(信息增益从大到小):tear production rate > astigmatic > spectacle prescription > age of patient


def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: non-empty list of records; the last element of each record
            is its class label, the preceding elements are feature values.
        labels: list of feature names, one per feature column. It is read
            only; the caller's list is left untouched.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Every record has the same class: this branch is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        print(classList[0])
        return classList[0]
    # Only the class column is left: no feature to split on, so vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No feature yields a positive gain. Indexing with -1 would split on
        # the class column itself, so settle for the majority class instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Build the child label list without deleting from the caller's list.
    subLabels = list(labels[:bestFeat]) + list(labels[bestFeat + 1:])
    myTree = {bestFeatLabel: {}}
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def majorityCnt(classList):
    """Return the class label that occurs most often in ``classList``.

    Args:
        classList: non-empty sequence of hashable class labels.

    Returns:
        The most frequent label; on a tie, the tied label that comes first
        in the count dictionary's iteration order.

    Raises:
        ValueError: if ``classList`` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # items() exists on both Python 2 and 3, unlike iteritems().
    return max(classCount.items(), key=operator.itemgetter(1))[0]


import trees
import treePlotter
from numpy import *

# Load the numeric lenses records together with feature names and class names.
lensesData, labels, vector = trees.createLensesDataSet()
parentAgeList, spectacleList, astigmaticList, tearRateList = trees.createLensesAttributeInfo()
lensesAttributeList = [parentAgeList, spectacleList, astigmaticList, tearRateList]

# Replace every numeric code with its readable name (codes are 1-based) and
# append the record's class name, in place, so each row ends with its label.
for i, record in enumerate(lensesData):
    for j, code in enumerate(record):
        index = int(code) - 1
        record[j] = lensesAttributeList[j][index]
    record.append(str(vector[i]))

myTree = trees.createTree(lensesData, labels)


{'tear production rate': {'reduced': 'no lenses', 'normal': {'astigmatic': {'yes': {'spectacle prescription': {'hyper': {'age of the patient': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}, 'myope': 'hard'}}, 'no': {'age of the patient': {'pre': 'soft', 'presbyopic': {'spectacle prescription': {'hyper': 'soft', 'myope': 'no lenses'}}, 'young': 'soft'}}}}}}



import matplotlib.pyplot as plt
import matplotlib
from pylab import *

# NOTE(review): `mpl` is expected to be provided by the pylab star import.
# SimHei is set as the sans-serif font, presumably so that Chinese labels
# render in the figure -- confirm the font is installed on the target machine.
mpl.rcParams['font.sans-serif'] = ['SimHei']

# Text-box formats for decision nodes and leaf nodes.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
# Arrow format used for the edge drawn between a node and its parent.
arrow_args = dict(arrowstyle="<-")


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one boxed node at ``centerPt`` with an arrow linking ``parentPt``.

    Args:
        nodeTxt: text shown inside the box.
        centerPt: (x, y) of the box, in axes-fraction coordinates.
        parentPt: (x, y) of the parent end of the arrow, axes fraction.
        nodeType: box style dict passed as ``bbox`` (``decisionNode`` or
            ``leafNode``).
    """
    createPlotPlus.ax1.annotate(
        nodeTxt, xy=parentPt, xycoords='axes fraction',
        xytext=centerPt, textcoords='axes fraction',
        va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)


def getNumLeafs(myTree):
    """Count the leaf nodes of a nested-dict decision tree.

    Args:
        myTree: dict whose first key is the root feature name and whose
            value maps each feature value to a subtree dict or a leaf.

    Returns:
        int: total number of non-dict values reachable from the root.
    """
    numLeafs = 0
    # next(iter(...)) works on Python 2 and 3; keys()[0] fails on Python 3.
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for k in secondDict.keys():
        if isinstance(secondDict[k], dict):
            numLeafs += getNumLeafs(secondDict[k])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    """Return the depth of a nested-dict decision tree.

    A root whose branches are all leaves has depth 1; every additional
    level of decision nodes adds 1.

    Args:
        myTree: tree in the same format as accepted by ``getNumLeafs``.

    Returns:
        int: the largest number of decision nodes on any root-to-leaf path.
    """
    maxDepth = 0
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for k in secondDict.keys():
        if isinstance(secondDict[k], dict):
            thisDepth = 1 + getTreeDepth(secondDict[k])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` near the midpoint between two points.

    The text is placed 0.05 to the left of the midpoint of ``cntrPt`` and
    ``parentPt`` and rotated by 30 degrees.
    """
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlotPlus.ax1.text(xMid - 0.05, yMid, txtString, rotation=30)


def plotTree(myTree, parentPt, nodeTxt):
    """Draw the subtree ``myTree`` below ``parentPt``.

    Layout state is kept in function attributes set by ``createPlotPlus``:
    ``plotTree.totalW`` / ``plotTree.totalD`` (leaf count and depth of the
    whole tree) and ``plotTree.xOff`` / ``plotTree.yOff`` (current cursor).

    Args:
        myTree: subtree in nested-dict format.
        parentPt: (x, y) of the parent node, axes fraction.
        nodeTxt: label written on the edge that leads to this subtree.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = next(iter(myTree))
    # Centre this decision node horizontally above the leaves it spans.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    # Step one level down before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for k in secondDict.keys():
        if isinstance(secondDict[k], dict):
            plotTree(secondDict[k], cntrPt, str(k))
        else:
            # Each leaf takes the next horizontal slot.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[k], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(k))
    # Restore the level for the caller once this subtree is finished.
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlotPlus(inTree):
    """Create a figure, draw ``inTree`` on it and show it.

    Args:
        inTree: decision tree in nested-dict format.
    """
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlotPlus.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a slot to the left so the first leaf lands mid-slot.
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()







