python中id3决策树算法_ID3决策树算法实现（Python版）

# -*- coding: utf-8 -*-

from numpy import *  # NOTE: must stay ahead of the math import so that `log` resolves to math.log, not numpy.log

import operator
from math import log

import numpy as np
import pandas as pd

# Compute the Shannon entropy of the data set.

def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in ``dataSet``.

    The last element of every row is taken as that row's class label.

    Args:
        dataSet: Sequence of rows; only ``row[-1]`` of each row is read.

    Returns:
        The entropy in bits as a float; ``0.0`` when ``dataSet`` is empty.
    """
    # Imported locally on purpose: the file also executes
    # ``from numpy import *``, which exports its own ``log``. Binding the
    # name here keeps the result independent of module-level import order.
    from math import log

    numEntries = len(dataSet)

    # Count how many rows carry each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1

    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

# Split the data set on a discrete feature: keep every sample whose feature equals `value`.

def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature ``axis`` equals ``value`` and drop that column.

    Args:
        dataSet: Sequence of rows (lists).
        axis: Index of the feature column to test and remove.
        value: Feature value a row must have to be kept.

    Returns:
        A new list of new rows; each kept row is the original row without
        the element at ``axis``. The input rows are not modified.
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]

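# Split the data set on a continuous feature. `direction` selects the side of the split:
# 0 keeps the samples greater than `value`, anything else keeps the samples less than or equal to `value`.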

def splitContinuousDataSet(dataSet, axis, value, direction):
    """Select one side of a threshold split on a numeric feature and drop that column.

    Args:
        dataSet: Sequence of rows (lists).
        axis: Index of the numeric feature column to test and remove.
        value: Threshold to compare the feature against.
        direction: ``0`` keeps rows with ``row[axis] > value``; any other
            value keeps rows with ``row[axis] <= value``.

    Returns:
        A new list of new rows without the element at ``axis``. The input
        rows are not modified.
    """
    keepGreater = direction == 0
    retDataSet = []
    for featVec in dataSet:
        if keepGreater:
            selected = featVec[axis] > value
        else:
            selected = featVec[axis] <= value
        if selected:
            retDataSet.append(featVec[:axis] + featVec[axis + 1:])
    return retDataSet

# Choose the best way to split the data set.

def _isContinuous(value):
    """Tell whether a feature value is handled as a continuous (numeric) one."""
    # Exact-type test on purpose: only plain ``int``/``float`` count, so
    # ``bool`` values keep being handled as discrete.
    return type(value) in (int, float)


def _bestContinuousSplit(dataSet, axis, featList):
    """Return ``(threshold, entropy)`` of the lowest-entropy binary split of a numeric feature."""
    ordered = sorted(featList)
    # n samples give n-1 candidate thresholds: midpoints of neighbouring values.
    candidates = [(low + high) / 2.0 for low, high in zip(ordered, ordered[1:])]
    if not candidates:
        # A single row has no midpoint; fall back to its own value so that a
        # threshold is always defined.
        candidates = [ordered[0]]

    numSamples = float(len(dataSet))
    bestValue = None
    bestEntropy = None
    for value in candidates:
        greater = splitContinuousDataSet(dataSet, axis, value, 0)
        notGreater = splitContinuousDataSet(dataSet, axis, value, 1)
        newEntropy = 0.0
        newEntropy += len(greater) / numSamples * calcShannonEnt(greater)
        newEntropy += len(notGreater) / numSamples * calcShannonEnt(notGreater)
        # Strict comparison: the first of several equally good thresholds wins.
        if bestEntropy is None or newEntropy < bestEntropy:
            bestEntropy = newEntropy
            bestValue = value
    return bestValue, bestEntropy


def chooseBestFeatureToSplit(dataSet, labels):
    """Pick the feature whose split gives the largest information gain.

    Features whose first value is a plain ``int`` or ``float`` are treated
    as continuous and evaluated with the best binary threshold split; all
    other features are treated as discrete and split on every distinct value.

    Side effects (relied upon by ``createTree``): when the chosen feature is
    continuous, that column of ``dataSet`` is overwritten in place with ``1``
    (value <= threshold) or ``0`` (value > threshold), and
    ``labels[bestFeature]`` gets ``'<=' + str(threshold)`` appended.

    Args:
        dataSet: Non-empty list of rows (lists); the last element of each
            row is the class label.
        labels: List of feature names, one per feature column.

    Returns:
        Index of the chosen feature. A feature is chosen even when no split
        has strictly positive gain (the first feature with the highest gain
        wins); ``-1`` is returned only when the rows contain no feature
        column at all.
    """
    numFeatures = len(dataSet[0]) - 1
    numSamples = float(len(dataSet))
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = None
    bestFeature = -1
    bestSplitDict = {}

    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        if _isContinuous(featList[0]):
            splitValue, splitEntropy = _bestContinuousSplit(dataSet, i, featList)
            bestSplitDict[labels[i]] = splitValue
            infoGain = baseEntropy - splitEntropy
        else:
            # Weighted entropy over the partition induced by each distinct value.
            newEntropy = 0.0
            for value in set(featList):
                subDataSet = splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / numSamples
                newEntropy += prob * calcShannonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy

        # ``None`` start instead of 0.0: a zero-gain feature is still a valid
        # choice, whereas returning -1 would be used as a real index by the caller.
        if bestInfoGain is None or infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i

    if bestFeature == -1:
        return bestFeature

    # Binarise a chosen continuous feature around its recorded threshold,
    # i.e. replace each value by whether it is <= bestSplitValue.
    if _isContinuous(dataSet[0][bestFeature]):
        bestSplitValue = bestSplitDict[labels[bestFeature]]
        labels[bestFeature] = labels[bestFeature] + '<=' + str(bestSplitValue)
        for example in dataSet:
            example[bestFeature] = 1 if example[bestFeature] <= bestSplitValue else 0
    return bestFeature

# When every feature has been used but the samples at a node still disagree on the class, decide by majority vote.

def majorityCnt(classList):
    """Return the class label that occurs most often in ``classList``.

    Args:
        classList: Non-empty iterable of hashable class labels.

    Returns:
        The most frequent label; on a tie, the label that was seen first.

    Raises:
        ValueError: If ``classList`` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Rank by count. A bare ``max(classCount)`` would compare the labels
    # themselves and return the greatest key, not the majority class.
    return max(classCount, key=classCount.get)

# Main routine: build the decision tree recursively.

def createTree(dataSet, labels, data_full, labels_full):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: Non-empty list of rows (lists); the last element of each
            row is the class label.
        labels: Feature names for the columns of ``dataSet``.
        data_full: The complete, unsplit data set. Used to find values of a
            string-valued feature that do not occur in ``dataSet``.
        labels_full: Feature names for the columns of ``data_full``.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_label: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # All samples agree on the class: this node is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No feature column left: fall back to the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    # chooseBestFeatureToSplit rewrites rows and labels in place when it
    # picks a continuous feature, and a label is deleted below; work on
    # private copies so the caller's lists stay untouched.
    dataSet = [example[:] for example in dataSet]
    labels = labels[:]

    bestFeat = chooseBestFeatureToSplit(dataSet, labels)
    if bestFeat < 0:
        # No usable feature was reported; a negative index must not be used
        # for slicing below, so make this node a majority leaf.
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    uniqueVals = set(example[bestFeat] for example in dataSet)

    # For a string-valued feature, collect the values that appear in the
    # full data set but not in this subset.
    isStringFeature = isinstance(dataSet[0][bestFeat], str)
    unseenVals = set()
    if isStringFeature:
        fullIndex = labels_full.index(labels[bestFeat])
        unseenVals = set(example[fullIndex] for example in data_full) - uniqueVals

    del labels[bestFeat]

    # Grow one subtree per value of the chosen feature present in this subset.
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, data_full, labels_full)

    # Values never seen at this node get a leaf carrying the node's majority class.
    if unseenVals:
        majorityLabel = majorityCnt(classList)
        for value in unseenVals:
            myTree[bestFeatLabel][value] = majorityLabel
    return myTree

import matplotlib.pyplot as plt

# Annotation box style for internal (decision) nodes.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
# Annotation box style for leaf nodes.
leafNode = dict(boxstyle="round4", fc="0.8")
# Arrow style passed as ``arrowprops`` when a node is drawn.
# NOTE(review): the original literal was cut off after ``arrowstyle="``
# (the rest appears to have been stripped as markup); "<-" is assumed -- confirm.
arrow_args = dict(arrowstyle="<-")

# Count the leaf nodes of the tree.

def getNumLeafs(myTree):
    """Count the leaf nodes of a tree built as nested dictionaries.

    Args:
        myTree: Dict whose first key is the root feature label and whose
            value maps each feature value to a subtree (dict) or a leaf.

    Returns:
        The number of non-dict entries reachable from the root.
    """
    firstStr = list(myTree)[0]
    numLeafs = 0
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            numLeafs += getNumLeafs(child)
        else:
            numLeafs += 1
    return numLeafs

# Compute the maximum depth of the tree.

def getTreeDepth(myTree):
    """Return the maximum depth of a tree built as nested dictionaries.

    A root whose branches are all leaves has depth 1.

    Args:
        myTree: Dict whose first key is the root feature label and whose
            value maps each feature value to a subtree (dict) or a leaf.

    Returns:
        The number of decision levels on the longest root-to-leaf path;
        ``0`` when the root has no branches.
    """
    firstStr = list(myTree)[0]
    maxDepth = 0
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

# Draw a node.

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one boxed node with an arrow linking it to its parent position.

    Draws on ``createPlot.ax1``, so ``createPlot`` must have set that
    attribute before this is called.

    Args:
        nodeTxt: Text shown inside the node box.
        centerPt: ``(x, y)`` position of the node box, in axes-fraction coordinates.
        parentPt: ``(x, y)`` position of the parent, in axes-fraction coordinates.
        nodeType: Box style dict passed as ``bbox`` (e.g. ``decisionNode``
            or ``leafNode``).
    """
    createPlot.ax1.annotate(
        nodeTxt,
        xy=parentPt, xycoords='axes fraction',
        xytext=centerPt, textcoords='axes fraction',
        va="center", ha="center",
        bbox=nodeType, arrowprops=arrow_args)

# Draw the text on an arrow.

def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` halfway between a node and its parent.

    Draws on ``createPlot.ax1``, so ``createPlot`` must have set that
    attribute before this is called.

    Args:
        cntrPt: ``(x, y)`` position of the child node.
        parentPt: ``(x, y)`` position of the parent node.
        txtString: Text to place on the connecting arrow.
    """
    # Shift left in proportion to the text length so longer strings stay
    # roughly centred on the arrow.
    xMid = (parentPt[0] + cntrPt[0]) / 2.0 - len(txtString) * 0.002
    yMid = (parentPt[1] + cntrPt[1]) / 2.0
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw ``myTree`` below the position ``parentPt``.

    Layout state lives in function attributes that ``createPlot``
    initialises: ``plotTree.totalW`` / ``plotTree.totalD`` (leaf count and
    depth of the whole tree) and ``plotTree.x0ff`` / ``plotTree.y0ff`` (the
    current drawing cursor).

    Args:
        myTree: Tree as nested dicts (see ``createTree``).
        parentPt: ``(x, y)`` position of the parent node.
        nodeTxt: Text for the arrow leading from the parent to this subtree.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = list(myTree)[0]
    # Centre this node horizontally above the leaves of its own subtree.
    cntrPt = (plotTree.x0ff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.y0ff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)

    # Step one level down for the children.
    plotTree.y0ff = plotTree.y0ff - 1.0 / plotTree.totalD
    for key, child in myTree[firstStr].items():
        if isinstance(child, dict):
            plotTree(child, cntrPt, str(key))
        else:
            # Each leaf takes the next horizontal slot.
            plotTree.x0ff = plotTree.x0ff + 1.0 / plotTree.totalW
            leafPt = (plotTree.x0ff, plotTree.y0ff)
            plotNode(child, leafPt, cntrPt, leafNode)
            plotMidText(leafPt, cntrPt, str(key))
    # Step back up so the caller continues drawing at its own level.
    plotTree.y0ff = plotTree.y0ff + 1.0 / plotTree.totalD

def createPlot(inTree):
    """Draw ``inTree`` in a matplotlib figure and show it.

    Sets ``createPlot.ax1`` (the axes the drawing helpers use) and the
    ``plotTree`` layout attributes before drawing.

    Args:
        inTree: Tree as nested dicts (see ``createTree``).
    """
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)

    # Overall tree size, used to scale every node position.
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf slot left of the axes so the first leaf is centred
    # in its own slot.
    plotTree.x0ff = -0.5 / plotTree.totalW
    plotTree.y0ff = 1.0

    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

# Driver: load the data set, build the tree, print it and plot it.
df = pd.read_csv('watermelon_4_3.csv')
# The first column is skipped; the remaining columns are the features
# followed by the class label in the last position.
data = df.values[:, 1:].tolist()
# Copy every row, not just the outer list: rows of ``data`` may be rewritten
# in place while the tree is built, and ``data_full`` must keep the original values.
data_full = [row[:] for row in data]
labels = df.columns.values[1:-1].tolist()
labels_full = labels[:]
myTree = createTree(data, labels, data_full, labels_full)
print(myTree)
createPlot(myTree)

python中id3决策树算法_ID3决策树算法实现（Python版）相关推荐

Python中的图像处理（第六章）Python图像量化及采样处理（2）
Python中的图像处理(第六章)Python图像量化及采样处理(2) 前言一. Python准备二. Python仿真三. 小结前言随着人工智能研究的不断兴起,Python的应用也在不断上 ...
Python中通过索引名称提取数据loc()函数Python中通过行和列下标提取数据iloc()函数
[小白从小学Python.C.Java] [Python全国计算机等级考试] [Python数据分析考试必会题] ● 标题与摘要 Python中通过索引名称提取数据 loc()函数 Python中通过 ...
Python中的图像处理（第十一章）Python图像锐化及边缘检测（1）
Python中的图像处理(第十一章)Python图像锐化及边缘检测(1) 前言一. Python准备二. Python仿真三. 小结前言随着人工智能研究的不断兴起,Python的应用也在不断 ...
python中filepath路径怎么写_详解Python中的路径问题
1. 绝对路径引入 Python 在搜索模块时,依次搜索sys.path里的位置,直到找到模块为止.下面命令可以查看当前的搜索路径: import sys print(sys.path) sys.pa ...
python下列合法的变量名是什么,python中的合法变量名有什么规则-Python教程
python中合法变量名的规则有:一.可以由字母.数字.下划线组成,同时,不能以数字开头;二.不能是python关键字,但可以包含关键字;三.不能包含空格.例如:[a1c_x2z]. Py ...
python中导入模块使用哪个关键字_关于python导入模块的关键字介绍
关于python导入模块的关键字介绍发布时间:2020-04-17 10:13:26 来源:亿速云阅读:101 作者:小新今天小编给大家分享的是关于python导入模块的关键字介绍,很多人都不太 ...
python中列表和集合_15个例子掌握Python列表，集合和元组
Python中的一切都是对象.每个对象都有自己的数据属性和与之关联的方法.为了有效和恰当地使用一个对象,我们应该知道如何与它们交互. 列表.元组和集合是三种重要的对象类型.它们的共同点是它们都被用作数 ...
在Python中查找和替换文本，玩转Python正则
最简单的查找替换在Python中查找和替换非常简单,如果当前对象是一个字符串str时,你可以使用该类型提供的find() 或者index() 方法查找指定的字符,如果能找到则会返回字符第一次出现的索 ...
在python中定义类时、运算符重载_自定义 Python 类中的运算符和函数重载（上）...
如果你对 Python 中的str对象使用过 + 或 * 运算符,你一定注意到了它的操作与 int 或 float 类型的区别: 你可能想知道同一内置运算符或函数如何对不同类对象进行不同操作的.这分别 ...
python中继承是什么意思_如何理解Python中的继承？python入门
如何理解Python中的继承?如今,python编程语言深受企业和个人的喜爱.python开发工程师是近年来互联网行业非常热门的职业岗位之一.学习python的人除了零基础的,还有一部分是在职运维.在 ...

python中id3决策树算法_ID3决策树算法实现（Python版）

python中id3决策树算法_ID3决策树算法实现（Python版）相关推荐

最新文章

热门文章