


1.Chest Pain:如是否有胸痛
2.Good Blood Circulation
37 127 100 33

3.Blocked Arteries
92 31 45 129

我们把并非100%正确分类的节点叫做"impure"(不纯)的分类(着重关注点是在leaf node)

# 1.Chest Pain:如是否有胸痛
# ------|--------|----
# ----HD-----HD-----
# -----|--------|------
# Yes|No---Yes|No------是否患有心脏病
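# 105|39-----34|125--对应的人数
# 概述:有胸痛/有心脏病的有105个  有胸痛/无心脏病的有39个
#  无胸痛/有心脏病的有34个   无胸痛/无心脏病的有125个
# 左侧leaf node(有胸痛)的Gini impurity,结果约为0.395:
1-(105/(105+39))**2-(39/(105+39))**2
# 右侧leaf node(无胸痛)的Gini impurity,结果约为0.336:
1-(34/(34+125))**2-(125/(34+125))**2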


# 加权处理:
(144/(144+159))*0.395 +(159/(144+159))*0.336

以此类推, 1.Chest Pain = 0.364
2.Good Blood Circulation = 0.360
3.Blocked Arteries = 0.381

由上述得出 Good Blood Circulation = 0.360 最小(最纯),因此选它作为root node

Good Blood Circulation
37 127 100 33

1.Chest Pain
13 98 24 29
这个时候可以算出在Chest Pain下面的Gini =0.3

2.Blocked Arteries
24 25 13 102
这个时候可以算出在Blocked Arteries下面的Gini =0.290

-----Good Blood Circulation-----

思考:
1.如果你得到的是数值型的数据,如何计算Gini?(先rank一遍,计算相邻数值的平均值,再通过"小于等于该平均值"来分类;*没有必要将最大的一个数值包括在内,因为它无法分类)
2.如果你得到的是程度数值(比如:按照喜欢程度1234)的数据,如何计算Gini?(先rank一遍,再通过"小于等于某个等级"来分类;没有必要将最大的一个数值包括在内,因为它无法分类)
3.如果你得到的是调查问卷(多选项)的数据,如何计算Gini?(通过选项的排列组合来分类;*没有必要将包含所有选项的组合计算在内,因为它无法分类)


def calculate_the_gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Sequence of groups (for example the left/right halves of a
            split). Each group is a list of rows whose last element is the
            class label.
        classes: The class labels to account for.

    Returns:
        The sum over all non-empty groups of
        ``(1 - sum(p_c ** 2)) * (len(group) / total_rows)`` as a float;
        ``0.0`` when every group is empty.
    """
    # Count the rows in each group (not the number of groups) so that
    # unequal-sized groups are weighted correctly.
    n_instances = float(sum(len(group) for group in groups))

    gini = 0.0
    for group in groups:
        size = float(len(group))
        # An empty group contributes nothing and would divide by zero below.
        if size == 0:
            continue
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its share of all rows.
        gini += (1.0 - score) * (size / n_instances)
    return gini
# Worst case for two classes: every group is an even mix of both labels,
# so the printed Gini index is 0.5.
worst_case_for_two_classes = [[[1, 1], [1, 0]], [[1, 1], [1, 0]]]
print(calculate_the_gini_index(worst_case_for_two_classes, [0, 1]))
# Best case for two classes: every group holds a single label,
# so the printed Gini index is 0.0.
best_case_for_two_classes = [[[1, 0], [1, 0]], [[1, 1], [1, 1]]]
print(calculate_the_gini_index(best_case_for_two_classes, [0, 1]))

def test_split(index, value, dataset):
    """Split ``dataset`` into a left and a right group on one column.

    Args:
        index: Column position to compare.
        value: Threshold; rows with ``row[index] < value`` go left.
        dataset: Iterable of rows (indexable sequences).

    Returns:
        A ``(left, right)`` tuple of lists; rows keep their original order.
    """
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right


# The name starts with "test_", so mark it as a non-test to keep pytest from
# collecting this helper when a test module imports it.
test_split.__test__ = False
# The best split carries several pieces of data (index, value, groups),
# so it is returned as a dict.
def get_split(dataset):
    """Find the split of ``dataset`` with the lowest weighted Gini index.

    Every value of every column except the last (the class label) is tried
    as a ``<`` threshold via ``test_split`` and scored with
    ``calculate_the_gini_index``. On ties the first candidate found wins.

    Args:
        dataset: Non-empty list of rows; the last element of each row is the
            class label.

    Returns:
        A dict with the keys ``'The Best Index is'``, ``'The Best Value is'``
        and ``'The Best Groups is'`` (the ``(left, right)`` groups).
    """
    class_values = list({row[-1] for row in dataset})
    # 888 / None are placeholders that are only returned when the rows have
    # no feature column, i.e. when no candidate split is evaluated.
    best_index, best_value, best_groups = 888, 888, None
    best_score = float("inf")
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = calculate_the_gini_index(groups, class_values)
            # Strict comparison keeps the earliest of equally good splits.
            if gini < best_score:
                best_index, best_value = index, row[index]
                best_score, best_groups = gini, groups
    return {
        'The Best Index is': best_index,
        'The Best Value is': best_value,
        'The Best Groups is': best_groups,
    }
# Try it out: two feature columns followed by the class label.
dataset = [
    [2.1, 1.1, 0],
    [3.4, 2.5, 0],
    [1.3, 5.8, 0],
    [1.9, 8.6, 0],
    [3.7, 6.2, 0],
    [8.8, 1.1, 1],
    [9.6, 3.4, 1],
    [10.2, 7.4, 1],
    [7.7, 8.8, 1],
    [9.7, 6.9, 1],
]
split = get_split(dataset)


def test_split(index, value, dataset):
    """Split ``dataset`` into ``(left, right)`` lists on one column.

    Rows with ``row[index] < value`` go left, all other rows go right.
    """
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right


# The name starts with "test_", so mark it as a non-test to keep pytest from
# collecting this helper when a test module imports it.
test_split.__test__ = False


def calculate_the_gini_index(groups, classes):
    """Return the size-weighted Gini impurity of ``groups``.

    Args:
        groups: Sequence of groups; each group is a list of rows whose last
            element is the class label.
        classes: The class labels to account for.

    Returns:
        The weighted Gini index as a float (``0.0`` if all groups are empty).
    """
    # Total number of rows across all groups.
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # An empty group contributes nothing and would divide by zero below.
        if size == 0:
            continue
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its share of all rows.
        gini += (1.0 - score) * (size / n_instances)
    return gini


def get_split(dataset):
    """Find the split of ``dataset`` with the lowest weighted Gini index.

    Every value of every column except the last (the class label) is tried
    as a ``<`` threshold; each candidate and its Gini index is printed. On
    ties the first candidate found wins.

    Args:
        dataset: Non-empty list of rows; the last element of each row is the
            class label.

    Returns:
        A dict with the keys ``'index'``, ``'value'`` and ``'groups'`` (the
        ``(left, right)`` groups of the chosen split).
    """
    class_values = list({row[-1] for row in dataset})
    # 888 / None are placeholders that are only returned when the rows have
    # no feature column, i.e. when no candidate split is evaluated.
    best_index, best_value, best_groups = 888, 888, None
    best_score = float("inf")
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = calculate_the_gini_index(groups, class_values)
            print("X%d < %.3f Gini=%.3f" % ((index + 1), row[index], gini))
            # Strict comparison keeps the earliest of equally good splits.
            if gini < best_score:
                best_index, best_value = index, row[index]
                best_score, best_groups = gini, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}


# Two feature columns followed by the class label.
dataset = [
    [2.1, 1.1, 0],
    [3.4, 2.5, 0],
    [1.3, 5.8, 0],
    [1.9, 8.6, 0],
    [3.7, 6.2, 0],
    [8.8, 1.1, 1],
    [9.6, 3.4, 1],
    [10.2, 7.4, 1],
    [7.7, 8.8, 1],
    [9.7, 6.9, 1],
]
split = get_split(dataset)
print('Split:[X%d < %.3f]' % ((split['index'] + 1), split['value']))


# 1. root node
# 2. recursive split
# 3. terminal node (to curb over-fitting, limit the depth/height of the tree
#    and enforce a minimum split size)
# 4. finish building the tree
def test_split(index, value, dataset):
    """Split ``dataset`` into ``(left, right)`` lists on one column.

    Rows with ``row[index] < value`` go left, all other rows go right.
    """
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right


# The name starts with "test_", so mark it as a non-test to keep pytest from
# collecting this helper when a test module imports it.
test_split.__test__ = False


def calculate_the_gini_index(groups, classes):
    """Return the size-weighted Gini impurity of ``groups``.

    Args:
        groups: Sequence of groups; each group is a list of rows whose last
            element is the class label.
        classes: The class labels to account for.

    Returns:
        The weighted Gini index as a float (``0.0`` if all groups are empty).
    """
    # Total number of rows across all groups.
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # An empty group contributes nothing and would divide by zero below.
        if size == 0:
            continue
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its share of all rows.
        gini += (1.0 - score) * (size / n_instances)
    return gini


def get_split(dataset):
    """Find the split of ``dataset`` with the lowest weighted Gini index.

    Every value of every column except the last (the class label) is tried
    as a ``<`` threshold; each candidate and its Gini index is printed. On
    ties the first candidate found wins.

    Args:
        dataset: Non-empty list of rows; the last element of each row is the
            class label.

    Returns:
        A dict with the keys ``'index'``, ``'value'`` and ``'groups'`` (the
        ``(left, right)`` groups of the chosen split).
    """
    class_values = list({row[-1] for row in dataset})
    # 888 / None are placeholders that are only returned when the rows have
    # no feature column, i.e. when no candidate split is evaluated.
    best_index, best_value, best_groups = 888, 888, None
    best_score = float("inf")
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = calculate_the_gini_index(groups, class_values)
            print("X%d < %.3f Gini=%.3f" % ((index + 1), row[index], gini))
            # Strict comparison keeps the earliest of equally good splits.
            if gini < best_score:
                best_index, best_value = index, row[index]
                best_score, best_groups = gini, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}


def determine_the_terminal(group):
    """Return the most frequent class label (last column) among ``group``."""
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)


# 1. Split the data into a left and a right part and drop the stored groups.
# 2. Check for an empty side and for the configured limits
#    (maximum depth / minimum split size).
# 3. Otherwise repeat the root-node search on each side until the leaves
#    are reached.
def split(node, max_depth, min_size, depth):
    """Recursively grow the children of ``node`` in place.

    Args:
        node: Dict produced by ``get_split``; its ``'groups'`` entry is
            removed and ``'left'`` / ``'right'`` entries are added, each
            either a class label (leaf) or another node dict.
        max_depth: Depth at which both children become leaves.
        min_size: A side with at most this many rows becomes a leaf.
        depth: Depth of ``node``.
    """
    left, right = node['groups']
    del node['groups']

    # One side is empty: both children predict the label of all the rows.
    if not left or not right:
        node['left'] = node['right'] = determine_the_terminal(left + right)
        return

    # Maximum depth reached: turn both sides into leaves.
    if depth >= max_depth:
        node['left'] = determine_the_terminal(left)
        node['right'] = determine_the_terminal(right)
        return

    # Left child: leaf if too small, otherwise split further.
    if len(left) <= min_size:
        node['left'] = determine_the_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)

    # Right child: leaf if too small, otherwise split further.
    if len(right) <= min_size:
        node['right'] = determine_the_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)


# Finally, build the decision tree.
def build_the_regression_tree(train, max_depth, min_size):
    """Build a decision tree from ``train`` and return its root node.

    The root is chosen with ``get_split`` and then grown with ``split``,
    starting at depth 1.

    Args:
        train: Non-empty list of rows; the last element of each row is the
            class label.
        max_depth: Maximum depth of the tree.
        min_size: A side with at most this many rows is not split further.

    Returns:
        The root node as a nested dict with the keys ``'index'``,
        ``'value'``, ``'left'`` and ``'right'``.
    """
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root


# Render the tree as indented text on the CLI so its structure is easy to grasp.
def print_our_tree(node, depth=0):
    """Print ``node`` and its subtree, one line per node.

    Each line is prefixed with ``depth`` dashes. Dict nodes are shown as
    ``[X<column> < <value>]`` (column numbers start at 1); anything else is
    treated as a leaf and shown as ``[<label>]``.
    """
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % (depth * '-', node['index'] + 1, node['value']))
        print_our_tree(node['left'], depth + 1)
        print_our_tree(node['right'], depth + 1)
    else:
        print('%s[%s]' % (depth * '-', node))


def make_prediction(node, row):
    """Return the label the tree rooted at ``node`` predicts for ``row``.

    Args:
        node: Dict with the keys ``'index'``, ``'value'``, ``'left'`` and
            ``'right'``; each child is another such dict or a leaf label.
        row: Sequence of feature values.

    Returns:
        The label of the leaf reached by following ``row[index] < value``
        to the left child and everything else to the right child.
    """
    if row[node['index']] < node['value']:
        branch = node['left']
    else:
        branch = node['right']
    # Keep descending until a leaf (non-dict) is reached.
    if isinstance(branch, dict):
        return make_prediction(branch, row)
    return branch


# Two feature columns followed by the class label.
dataset = [
    [2.1, 1.1, 0],
    [3.4, 2.5, 0],
    [1.3, 5.8, 0],
    [1.9, 8.6, 0],
    [3.7, 6.2, 0],
    [8.8, 1.1, 1],
    [9.6, 3.4, 1],
    [10.2, 7.4, 1],
    [7.7, 8.8, 1],
    [9.7, 6.9, 1],
]
tree = build_the_regression_tree(dataset, 3, 1)
print_our_tree(tree)

# A hand-written one-level tree used to exercise make_prediction.
decision_tree_stump = {'index': 0, 'right': 1, 'value': 9.3, 'left': 0}
for row in dataset:
    prediction = make_prediction(decision_tree_stump, row)
    print("What is expected data : %d , Your prediction is %d " % (row[-1], prediction))


