ID3/C4.5 Classification Decision Tree
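
For reference, these are the standard splitting criteria the code below implements (Chapter 5 of 统计学习方法); in the code, HD corresponds to H(D), sum_ accumulates the conditional entropy H(D|A), and HAD is the feature entropy H_A(D):

    g(D, A)   = H(D) - H(D|A)                                   (information gain, ID3)
    g_R(D, A) = g(D, A) / H_A(D)                                 (information gain ratio, C4.5)
    H(D)      = -\sum_k \frac{|C_k|}{|D|} \log_2 \frac{|C_k|}{|D|}
    H_A(D)    = -\sum_i \frac{|D_i|}{|D|} \log_2 \frac{|D_i|}{|D|}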

import numpy as np
import math
class Node:
    def __init__(self, feature_index=None, value=None, label=None):
        self.feature_index = feature_index   # feature the parent split on
        self.value = value                   # feature value on the incoming edge
        self.child = []
        self.label = label                   # class label (set at leaves)

class C4_5:
    def __init__(self, X, Y, c=0.1, way='ID3'):
        self.c = c                           # information-gain threshold for stopping
        self.root = Node()
        self.X = X
        self.Y = Y
        self.feature_num = len(X[0])
        self.label_num = len(Y)
        self.feature_set = list(range(self.feature_num))
        self.getac()
        self.way = way                       # 'ID3' (information gain) or 'C45' (gain ratio)

    def getac(self):
        # collect the value set of every feature and the set of labels
        self.dict_x = {}
        self.dict_y = set(self.Y)
        for i in range(self.feature_num):
            self.dict_x[i] = set([x[i] for x in self.X])

    @staticmethod
    def get_label(list_):
        # majority label of a list
        return max(list_, key=list_.count)

    @staticmethod
    def count_Y(Y):
        dict_y = {}
        for i in Y:
            if i in dict_y.keys():
                dict_y[i] += 1
            else:
                dict_y[i] = 1
        return dict_y

    def experience_entropy(self, Y):
        # empirical entropy H(D) of a label list
        dict_y = self.count_Y(Y)
        D = len(Y)
        set_y = set(Y)
        return -sum([dict_y[x] / D * math.log(dict_y[x] / D, 2) for x in set_y])

    def get_feature(self, X, Y, rest_x):
        # pick the best splitting feature among rest_x
        HD = self.experience_entropy(Y)          # H(D)
        Y = np.array(Y)
        X = np.array(X)
        entropy_ = []
        if self.way == 'ID3':
            for i in rest_x:
                sum_ = 0                          # conditional entropy H(D|A_i)
                list_x = np.array([x[i] for x in X])
                for j in self.dict_x[i]:
                    sum__ = 0
                    Di = sum(list_x == j)
                    if Di != 0:
                        for m in self.dict_y:
                            Dik = sum(Y[list_x == j] == m)
                            if Dik != 0:
                                sum__ += Dik / Di * math.log(Dik / Di, 2)
                        sum_ -= Di / len(list_x) * sum__
                add_entropy = HD - sum_           # information gain g(D, A_i)
                entropy_.append(add_entropy)
        if self.way == 'C45':
            for i in rest_x:
                sum_ = 0                          # conditional entropy H(D|A_i)
                HAD = 0                           # feature entropy H_A(D); fixed: was reset inside the loop
                list_x = np.array([x[i] for x in X])
                for j in self.dict_x[i]:
                    sum__ = 0
                    Di = sum(list_x == j)
                    if Di != 0:
                        for m in self.dict_y:
                            Dik = sum(Y[list_x == j] == m)
                            if Dik != 0:
                                sum__ += Dik / Di * math.log(Dik / Di, 2)
                        sum_ -= Di / len(list_x) * sum__
                        HAD -= Di / len(list_x) * math.log(Di / len(list_x), 2)
                add_entropy = (HD - sum_) / HAD   # gain ratio g_R(D, A_i)
                entropy_.append(add_entropy)
        max_add = max(entropy_)
        index_ = entropy_.index(max_add)
        spilt_feature = rest_x[index_]            # entropy_ is ordered by rest_x
        return spilt_feature, max_add

    def build_tree(self):
        def build_tree_(node, X, Y, rest_x):
            X = np.array(X)
            Y = np.array(Y)
            if len(Y) == 0:
                # no samples reach this branch: fall back to the overall majority label
                node.label = self.get_label(self.Y)
                return
            if len(set(Y)) == 1:
                # all samples share one label: pure leaf
                node.label = list(set(Y))[0]
                return
            elif len(rest_x) == 0:
                # no features left to split on (fixed: was len(X[0]) == 0, which never triggers)
                node.label = self.get_label(list(Y))
                return
            spilt_feature, max_add = self.get_feature(X, Y, rest_x)
            if max_add < self.c:
                # gain below threshold: stop with the majority label of this node's samples
                node.label = self.get_label(list(Y))
                return
            # remove the chosen feature for the subtrees only
            # (fixed: the original mutated the list shared across sibling branches)
            rest_child = [f for f in rest_x if f != spilt_feature]
            for i in self.dict_x[spilt_feature]:
                Node_child = Node(feature_index=spilt_feature, value=i)
                mask = np.array([x[spilt_feature] for x in X]) == i
                build_tree_(Node_child, X[mask], Y[mask], rest_child)
                node.child.append(Node_child)
        build_tree_(self.root, self.X, self.Y, self.feature_set)

    def print_tree(self):
        def pre_order(root):
            if root:
                # columns: feature_index, edge value, label, number of children
                print(root.feature_index, root.value, root.label, len(root.child))
                for i in root.child:
                    pre_order(i)
        pre_order(self.root)

    def predict_single(self, X):
        if len(X) != self.feature_num:
            raise IndexError
        node = self.root
        while node.child:
            for child in node.child:
                if X[child.feature_index] == child.value:
                    node = child
                    break
            else:
                # unseen feature value: fall back to the overall majority label
                return self.get_label(self.Y)
        return node.label                         # fixed: the original returned node.value

    def predict(self, X):
        X = np.array(X)
        if len(X.shape) == 1:
            return self.predict_single(X)
        result = []
        for i in X:
            result.append(self.predict_single(i))
        return result

def main():
    datasets = [['青年', '否', '否', '一般', '否'],
                ['青年', '否', '否', '好', '否'],
                ['青年', '是', '否', '好', '是'],
                ['青年', '是', '是', '一般', '是'],
                ['青年', '否', '否', '一般', '否'],
                ['中年', '否', '否', '一般', '否'],
                ['中年', '否', '否', '好', '否'],
                ['中年', '是', '是', '好', '是'],
                ['中年', '否', '是', '非常好', '是'],
                ['中年', '否', '是', '非常好', '是'],
                ['老年', '否', '是', '非常好', '是'],
                ['老年', '否', '是', '好', '是'],
                ['老年', '是', '否', '好', '是'],
                ['老年', '是', '否', '非常好', '是'],
                ['老年', '否', '否', '一般', '否']]
    X = [x[0:-1] for x in datasets]
    Y = [x[-1] for x in datasets]
    for i, j in zip(X, Y):
        print(i, j)
    C4_5_trainer = C4_5(X, Y)
    C4_5_trainer.build_tree()
    C4_5_trainer.print_tree()
    predict_single_x = [['中年', '是', '否', '一般'], ['老年', '否', '否', '一般']]
    print(C4_5_trainer.predict(predict_single_x))

if __name__ == '__main__':
    main()

#############result###################
/usr/bin/python3 /Users/zhengyanzhao/PycharmProjects/tongjixuexi/C4.5_ID3.py
['青年', '否', '否', '一般'] 否
['青年', '否', '否', '好'] 否
['青年', '是', '否', '好'] 是
['青年', '是', '是', '一般'] 是
['青年', '否', '否', '一般'] 否
['中年', '否', '否', '一般'] 否
['中年', '否', '否', '好'] 否
['中年', '是', '是', '好'] 是
['中年', '否', '是', '非常好'] 是
['中年', '否', '是', '非常好'] 是
['老年', '否', '是', '非常好'] 是
['老年', '否', '是', '好'] 是
['老年', '是', '否', '好'] 是
['老年', '是', '否', '非常好'] 是
['老年', '否', '否', '一般'] 否
None None None 2
2 是 是 0
2 否 None 2
1 是 是 0
1 否 否 0
['是', '否']
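
The run above uses the default way='ID3'. The same class also implements the gain-ratio criterion; a minimal sketch (assuming it replaces the trainer lines inside main(), with X, Y and predict_single_x defined as above; the resulting tree may or may not differ from the ID3 one):

    # Sketch: train on the same data but split by gain ratio (C4.5) instead of information gain.
    c45_trainer = C4_5(X, Y, c=0.1, way='C45')
    c45_trainer.build_tree()
    c45_trainer.print_tree()
    print(c45_trainer.predict(predict_single_x))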

Binary Regression Tree with Squared-Error Splitting
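
For reference, find_spilt below enumerates every feature j and every observed value s as a candidate cut point and keeps the (j, s) pair that minimizes the standard squared-error criterion of a least-squares regression tree, with each region predicted by its mean:

    \min_{j,s} \Big[ \sum_{x_i \in R_1(j,s)} (y_i - \hat{c}_1)^2 + \sum_{x_i \in R_2(j,s)} (y_i - \hat{c}_2)^2 \Big], \qquad \hat{c}_m = \mathrm{mean}\{\, y_i : x_i \in R_m(j,s) \,\}

Splitting stops once a node holds at most min_leave_data samples, and that leaf predicts the mean of its y values.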

import numpy as np

class Node:
    def __init__(self, feature_index=None, cut_value=None, y_value=None, left_l=None, right_s=None):
        self.feature_index = feature_index
        self.y_value = y_value          # leaf prediction (mean of y in the region); None for internal nodes
        self.cut_value = cut_value
        self.left_l = left_l            # subtree with feature value >  cut_value
        self.right_s = right_s          # subtree with feature value <= cut_value

class Cart_reg:
    def __init__(self, X, Y, min_leave_data=3):
        self.min_leave_data = min_leave_data
        self.root = Node()
        self.X = X
        self.Y = Y
        self.feature_num = len(X[0])

    def cauclate_loss(self, Y1, Y2):
        # squared error of the two candidate regions, each predicted by its own mean
        if len(Y1) != 0 and len(Y2) != 0:
            sum_1 = sum([(y - np.mean(Y1)) ** 2 for y in Y1])
            sum_2 = sum([(y - np.mean(Y2)) ** 2 for y in Y2])
            return sum_1 + sum_2
        elif len(Y1) != 0:
            return sum([(y - np.mean(Y1)) ** 2 for y in Y1])
        elif len(Y2) != 0:
            return sum([(y - np.mean(Y2)) ** 2 for y in Y2])   # fixed: the original used np.mean(Y1) here

    def find_spilt(self, X, Y):
        # try every feature and every observed value as a cut point; keep the pair with minimal loss
        X = np.array(X)
        Y = np.array(Y)
        save_loss = []
        save_feature = []
        save_value = []
        for i in range(self.feature_num):
            list_ix = np.array([x[i] for x in X])
            for j in list_ix:
                loss_ = self.cauclate_loss(Y[list_ix <= j], Y[list_ix > j])
                save_loss.append(loss_)
                save_feature.append(i)
                save_value.append(j)
        min_loss = min(save_loss)
        min_index = save_loss.index(min_loss)
        min_feature = save_feature[min_index]
        min_value = save_value[min_index]
        return min_feature, min_value

    def build_tree(self):
        def build_tree_(node, X, Y):
            X = np.array(X)
            Y = np.array(Y)
            if len(X) <= self.min_leave_data:
                # few enough samples: make a leaf that predicts the mean
                node.y_value = np.mean(Y)
                return
            min_feature, min_value = self.find_spilt(X, Y)
            node.feature_index = min_feature
            node.cut_value = min_value
            large_index = np.array([x[min_feature] for x in X]) > min_value
            small_index = np.array([x[min_feature] for x in X]) <= min_value
            node.left_l = Node()
            node.right_s = Node()
            build_tree_(node.left_l, X[large_index], Y[large_index])
            build_tree_(node.right_s, X[small_index], Y[small_index])
        build_tree_(self.root, self.X, self.Y)

    def print_tree(self):
        def pre_order(root):
            if root:
                # columns: feature_index, cut_value, y_value (only leaves have y_value set)
                print(root.feature_index, root.cut_value, root.y_value)
                pre_order(root.left_l)
                pre_order(root.right_s)
        pre_order(self.root)

    def predict_single(self, X):
        if len(X) != self.feature_num:
            raise IndexError
        node = self.root
        while node.y_value is None:
            if X[node.feature_index] <= node.cut_value:
                node = node.right_s
            else:
                node = node.left_l
        return node.y_value

    def predict(self, X):
        X = np.array(X)
        if len(X.shape) == 1:
            return self.predict_single(X)
        result = []
        for i in X:
            result.append(self.predict_single(i))
        return result

def main():
    X = [[1, 5, 7, 4, 8, 1, 2],
         [2, 3, 5, 5, 2, 7, 8],
         [1, 2, 3, 4, 5, 6, 7],
         [1, 2, 1, 2, 2, 3, 9],
         [2, 8, 9, 7, 0, 1, 4],
         [4, 8, 3, 4, 5, 6, 7],
         [4, 1, 3, 1, 5, 8, 0]]
    Y = [2, 6, 2, 5, 8, 3, 2]
    reg_t = Cart_reg(X, Y, 2)
    reg_t.build_tree()
    reg_t.print_tree()
    print(reg_t.predict_single([4, 1, 3, 1, 5, 8, 0]))
    print(reg_t.predict([[1, 5, 7, 4, 8, 1, 2],
                         [2, 3, 5, 5, 2, 7, 8],
                         [1, 2, 3, 4, 5, 6, 7]]))

if __name__ == '__main__':
    main()

#############result###################
/usr/bin/python3 /Users/zhengyanzhao/PycharmProjects/tongjixuexi/cart_reg_tree
4 2 None
1 5 None
None None 3.0
0 1 None
None None 2.0
None None 2.0
1 3 None
None None 8.0
None None 5.5
2.0
[2.0, 5.5, 2.0]
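
As an optional cross-check (an assumption, not part of the original post): scikit-learn's regression tree minimizes the same squared-error criterion, though its min_samples_leaf stopping rule differs from min_leave_data, so the fitted tree and predictions need not match exactly.

    # Hypothetical cross-check with scikit-learn (assumes scikit-learn is installed).
    from sklearn.tree import DecisionTreeRegressor

    X = [[1, 5, 7, 4, 8, 1, 2], [2, 3, 5, 5, 2, 7, 8], [1, 2, 3, 4, 5, 6, 7],
         [1, 2, 1, 2, 2, 3, 9], [2, 8, 9, 7, 0, 1, 4], [4, 8, 3, 4, 5, 6, 7],
         [4, 1, 3, 1, 5, 8, 0]]
    Y = [2, 6, 2, 5, 8, 3, 2]
    sk_tree = DecisionTreeRegressor(min_samples_leaf=2)  # different stopping criterion
    sk_tree.fit(X, Y)
    print(sk_tree.predict([[4, 1, 3, 1, 5, 8, 0]]))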
