Simple Linear Regression

Concept








Simple linear regression code


# mean of a list
def calculate_mean(a_list_of_values):
    mean = sum(a_list_of_values) / float(len(a_list_of_values))
    return mean

# sample variance
def calculate_variance(a_list_of_values, mean):
    variance_sum = sum((x - mean) ** 2 for x in a_list_of_values)
    variance = variance_sum / (len(a_list_of_values) - 1)
    return variance

# covariance of two lists
def calculate_covariance(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys):
    cov_sum = 0
    for i in range(len(a_list_of_Xs)):
        cov_sum += (a_list_of_Xs[i] - the_mean_of_Xs) * (a_list_of_Ys[i] - the_mean_of_Ys)
    the_covariance = cov_sum / (len(a_list_of_Xs) - 1)
    return the_covariance

# sample standard deviation
def calculate_the_standard_deviation(a_list_values):
    the_mean_of_the_list_values = sum(a_list_values) / float(len(a_list_values))
    variance = sum([(a_list_values[i] - the_mean_of_the_list_values) ** 2
                    for i in range(len(a_list_values))]) / float(len(a_list_values) - 1)
    return variance ** 0.5

# Pearson correlation: corr = cov(X, Y) / (std(X) * std(Y))
def calculate_the_correlation(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys):
    X_std = calculate_the_standard_deviation(a_list_of_Xs)
    Y_std = calculate_the_standard_deviation(a_list_of_Ys)
    X_Y_Cov = calculate_covariance(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys)
    Corr = X_Y_Cov / (X_std * Y_std)
    return Corr

# coefficients: b1 = cov(x, y) / var(x), b0 = mean(y) - b1 * mean(x)
def calculate_the_coefficients(dataset):
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]
    x_mean, y_mean = calculate_mean(x), calculate_mean(y)
    b1 = calculate_covariance(x, x_mean, y, y_mean) / calculate_variance(x, x_mean)
    b0 = y_mean - b1 * x_mean
    return [b0, b1]

experience = [1, 2, 3, 4, 5]
salary = [100, 200, 300, 400, 500]
list_of_tuples = list(zip(experience, salary))
print("list_of_tuples:", list_of_tuples)
list_of_lists = [list(elem) for elem in list_of_tuples]
print("list_of_lists:", list_of_lists)
b0, b1 = calculate_the_coefficients(list_of_lists)
print("b0,b1:", b0, b1)

# prediction function
def simple_linear_regression(training_data, testing_data):
    predictions = []
    b0, b1 = calculate_the_coefficients(training_data)
    for row in testing_data:
        y = b0 + b1 * row[0]
        predictions.append(y)
    return predictions

# root mean squared error
from math import sqrt

def calculate_the_RMSE(predicted_data, actual_data):
    the_sum_of_error = 0
    for i in range(len(actual_data)):
        prediction_error = predicted_data[i] - actual_data[i]
        the_sum_of_error += prediction_error ** 2
    RMSE = sqrt(the_sum_of_error / float(len(actual_data)))
    return RMSE

# strip out the y column so only x goes into the model
data_to_be_put_into_the_model = []
for row in list_of_lists:
    row_copy = list(row)
    row_copy[-1] = None
    data_to_be_put_into_the_model.append(row_copy)
print(data_to_be_put_into_the_model)

# predict y with the prediction function
predictions = simple_linear_regression(list_of_lists, data_to_be_put_into_the_model)
print(predictions)

# predict on new data
Y = [[6], [7], [8], [9], [10]]  # rows of new x values to predict on
predictions = simple_linear_regression(list_of_lists, Y)
print("Y:", predictions)

# evaluate the model with RMSE
def how_good_is_our_model(dataset, some_model_to_be_evaluated):
    test_data = []
    for row in dataset:
        row_copy = list(row)
        row_copy[-1] = None
        test_data.append(row_copy)
    predict_data = some_model_to_be_evaluated(dataset, test_data)
    print("predictions:", predict_data)
    actual_data = [row[-1] for row in dataset]
    print("actual values:", actual_data)
    RMSE = calculate_the_RMSE(predict_data, actual_data)
    return RMSE

result = how_good_is_our_model(list_of_lists, simple_linear_regression)
print(result)
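
Note that calculate_the_correlation above is defined but never called. As a quick sanity check (a sketch reusing the same toy data), the x-y correlation comes out as exactly 1.0, which is why the fitted line passes through every point and the RMSE above is 0:

x = [row[0] for row in list_of_lists]
y = [row[1] for row in list_of_lists]
corr = calculate_the_correlation(x, calculate_mean(x), y, calculate_mean(y))
print("correlation:", corr)  # 1.0 for this perfectly linear toy data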

Gradient Descent

Gradient descent concept



Gradient descent code

# make_prediction: plug the coefficients in to predict
def make_prediction(input_row, coefficients):
    out_put_y_hat = coefficients[0]
    for i in range(len(input_row) - 1):
        out_put_y_hat += coefficients[i + 1] * input_row[i]
    return out_put_y_hat

test_dataset = [[1, 1.5], [2, 2.5], [3, 3.5], [4, 4.5], [5, 5.5]]
test_coefficients = [0.4, 0.8]
for row in test_dataset:
    y_hat = make_prediction(row, test_coefficients)
    print("actual y = %.3f, predicted = %.3f" % (row[-1], y_hat))



def make_prediction(input_row, coefficients):
    out_put_y_hat = coefficients[0]
    for i in range(len(input_row) - 1):
        out_put_y_hat += coefficients[i + 1] * input_row[i]
    return out_put_y_hat

# training dataset, learning rate, number of epochs
def using_sgd_method_to_calculate_coefficients(training_dataset, learning_rate, n_times_epoch):
    coefficients = [1 for i in range(len(training_dataset[0]))]  # initial coefficients
    print("coefficients:", coefficients)
    for epoch in range(n_times_epoch):
        print("epoch", epoch)
        the_sum_of_error = 0  # reset the running error
        for row in training_dataset:
            y_hat = make_prediction(row, coefficients)
            print("row:", row, "coefficients:", coefficients)
            print("prediction:", y_hat)
            error = y_hat - row[-1]
            print("error:", error, "= prediction:", y_hat, "- actual:", row[-1])
            the_sum_of_error += error ** 2
            print("sum of squared errors:", the_sum_of_error)
            coefficients[0] = coefficients[0] - learning_rate * error
            print("new b0:", coefficients[0], "= old b0 -", learning_rate, "* error:", error)
            for i in range(len(row) - 1):
                coefficients[i + 1] = coefficients[i + 1] - learning_rate * error * row[i]
                print("new b%d:" % (i + 1), coefficients[i + 1],
                      "= old value -", learning_rate, "* error:", error, "*", row[i])
        print("epoch [%d], learning rate [%.3f], error [%.3f]" % (epoch, learning_rate, the_sum_of_error))
    return coefficients

your_training_dataset = [[1, 1.5], [2, 2.5], [3, 3.5], [4, 4.5], [5, 5.5]]
your_model_learning_rate = 0.1
your_n_epoch = 43
your_coefficients = using_sgd_method_to_calculate_coefficients(your_training_dataset,
                                                               your_model_learning_rate,
                                                               your_n_epoch)
print("-" * 50)
print("resulting coefficients b0, b1:", your_coefficients)


Testing the gradient-descent coefficients in simple linear regression
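
A minimal sketch: plug your_coefficients learned above straight into the simple linear prediction and compare against the training rows:

b0, b1 = your_coefficients
for row in your_training_dataset:
    y_hat = b0 + b1 * row[0]
    print("x = %.1f, actual = %.2f, predicted = %.2f" % (row[0], row[-1], y_hat))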

Gradient descent, worked by hand


Logistic Regression

Logistic regression concept




Logistic regression, worked by hand

print("数据: [x1 = 2, x2 = 2, 类型 = 0]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15+1.48*2
y2 = -1.15+1.48*2+ -2.30*2
y = 1/(1+exp(-y2))
yy = round(y)
print(y1,y2,y,"类型为:",yy)

print("数据: [x1 = 2, x2 = 4, 类型 = 0]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15+1.48*2
y2 = -1.15+1.48*2+ -2.30*4
y = 1/(1+exp(-y2))
yy = round(y)
print(y1,y2,y,"类型为:",yy)

print("数据: [x1 = 10, x2 = 4, 类型 = 1]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15+1.48*10
y2 = -1.15+1.48*10+ -2.30*4
y = 1/(1+exp(-y2))
yy = round(y)
print(y1,y2,y,"类型为:",yy)

print("数据: [x1 = 8.5, x2 = 3.5, 类型 = 1]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15+1.48*8.5
y2 = -1.15+1.48*8.5+ -2.30*3.5
y = 1/(1+exp(-y2))
yy = round(y)
print(y1,y2,y,"类型为:",yy)

Logistic regression prediction code

# prediction function
from math import exp

def prediction(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
    return 1 / (1 + exp(-yhat))

# test
dataset = [[2, 2, 0], [2, 4, 0], [3, 3, 0], [4, 5, 0],
           [8, 1, 1], [8.5, 3.5, 1], [9, 1, 1], [10, 4, 1]]
coef = [-1.15, 1.48, -2.30]  # an arbitrary set of coefficients

# prediction function, verbose version with step-by-step prints
def prediction(row, coefficients):
    yhat = coefficients[0]
    print("yhat:", yhat)
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
        print("yhat:", yhat, "+=", "coefficient:", coefficients[i + 1], "*", "row[i]:", row[i])
    print("1/(1+", exp(-yhat), ") =", 1 / (1 + exp(-yhat)))
    return 1 / (1 + exp(-yhat))

for row in dataset:
    print("data:", row)
    yhat = prediction(row, coef)
    print("actual class %.3f, predicted class %.3f ≈ [%d]" % (row[-1], yhat, round(yhat)))


Computing the coefficients with gradient descent

from math import exp

def prediction(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
    return 1 / (1 + exp(-yhat))

def using_sgd_method_to_calculate_coefficients(training_dataset, learning_rate, n_times_epoch):
    coefficients = [0.5 for i in range(len(training_dataset[0]))]
    print("coefficients:", coefficients)
    for epoch in range(n_times_epoch):
        print("epoch", epoch)
        the_sum_of_error = 0
        for row in training_dataset:
            y_hat = prediction(row, coefficients)
            print("prediction:", y_hat, "row:", row, "coefficients:", coefficients)
            error = row[-1] - y_hat
            print("error:", error, "= actual:", row[-1], "- prediction:", y_hat)
            the_sum_of_error += error ** 2
            # gradient step for the sigmoid: b = b + lr * error * y_hat * (1 - y_hat) * x
            coefficients[0] = coefficients[0] + learning_rate * error * y_hat * (1.0 - y_hat)
            print("new b0:", coefficients[0])
            print("-" * 50)
            for i in range(len(row) - 1):
                coefficients[i + 1] = coefficients[i + 1] + learning_rate * error * y_hat * (1.0 - y_hat) * row[i]
                print("new b%d:" % (i + 1), coefficients[i + 1])
            print("*" * 50)
        print("epoch [%d], learning rate [%.3f], error [%.3f]" % (epoch, learning_rate, the_sum_of_error))
    return coefficients

dataset = [[2, 2, 0], [2, 4, 0], [3, 3, 0], [4, 5, 0],
           [8, 1, 1], [8.5, 3.5, 1], [9, 1, 1], [10, 4, 1]]
learning_rate = 0.1
n_times_epoch = 1000
coef = using_sgd_method_to_calculate_coefficients(dataset, learning_rate, n_times_epoch)
print(coef)





Logistic regression gradient descent, worked by hand


Note: one full pass from the first row to the last row is one epoch; repeating epoch after epoch drives the error down.

Verifying the gradient-descent coefficients in the logistic regression prediction function
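
A minimal verification sketch, plugging the coef learned above back into the prediction function on the same training rows:

for row in dataset:
    y_hat = prediction(row, coef)
    print("actual class %d, predicted %.3f => class %d" % (row[-1], y_hat, round(y_hat)))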

Perceptron

Perceptron concept




Perceptron prediction code

def predict(row, weights):  # pass in a data row and the weights
    activation = weights[0]
    for i in range(len(row) - 1):
        activation += weights[i + 1] * row[i]
    return 1.0 if activation >= 0.0 else 0.0

dataset = [[2.78, 2.55, 0], [1.47, 2.36, 0], [1.39, 1.85, 0], [3.06, 3.01, 0], [7.63, 2.76, 0],
           [5.33, 2.09, 1], [6.93, 1.76, 1], [8.76, -0.77, 1], [7.66, 2.46, 1]]
# weights obtained via gradient descent
weights = [2.0000000000000004, 0.5930000000000017, -2.460999999999983]
# predict
for row in dataset:
    prediction = predict(row, weights)
    print("actual: %d, predicted: %d" % (row[-1], prediction))

Perceptron, worked by hand

Perceptron gradient descent weight-training code


def predict(row, weights):
    activation = weights[0]
    for i in range(len(row) - 1):
        activation += weights[i + 1] * row[i]
    return 1.0 if activation >= 0.0 else 0.0

def opt_weights(train, learning_rate, how_many_epoch):
    weights = [0.5 for i in range(len(train[0]))]
    for epoch in range(how_many_epoch):
        sum_error = 0.0
        for row in train:
            prediction = predict(row, weights)
            error = row[-1] - prediction
            sum_error += error ** 2
            weights[0] = weights[0] + learning_rate * error
            for i in range(len(row) - 1):
                weights[i + 1] = weights[i + 1] + learning_rate * error * row[i]
        print('This is epoch: %d, our learning_rate is : %.4f, the error is : %.4f'
              % (epoch, learning_rate, sum_error))
    return weights

dataset = [[2.78, 2.55, 0], [1.47, 2.36, 0], [1.39, 1.85, 0], [3.06, 3.01, 0], [7.63, 2.76, 0],
           [5.33, 2.09, 1], [6.93, 1.76, 1], [8.76, -0.77, 1], [7.66, 2.46, 1]]
learning_rate = 0.1
how_many_epoch = 100
weights = opt_weights(dataset, learning_rate, how_many_epoch)
print(weights)


Test
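
A short sketch, reusing the weights returned by opt_weights above on the same dataset:

for row in dataset:
    p = predict(row, weights)
    print("actual %d, predicted %d" % (row[-1], p))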

Perceptron gradient descent weights, worked by hand


Decision Tree (CART)

Decision tree concept




1. If your data is numeric, how do you compute the Gini? (Sort the values once, take the averages between neighboring values as thresholds, and split by less-than; there is no need to include the largest value, because it cannot split anything. See the sketch below.)
2. If your data is ordinal (e.g. preference ratings 1-2-3-4), how do you compute the Gini? (Sort once and split by less-than-or-equal; again there is no need to include the largest value, because it cannot split anything.)
3. If your data comes from a survey (categorical answers), how do you compute the Gini? (Split by combinations of categories; there is no need to include the combination containing all categories, because it cannot split anything.)
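
To make point 1 concrete, here is a minimal sketch (hypothetical one-feature data; the weighted-Gini formula matches the calculate_the_gini_index function later in this section) that sorts the values, uses the midpoints between neighbors as candidate thresholds, and scores each split:

data = [(1.0, 0), (2.0, 0), (3.0, 1), (4.0, 1)]  # (value, class) -- hypothetical

def gini_of_split(groups, classes=(0, 1)):
    n = sum(len(g) for g in groups)
    gini = 0.0
    for g in groups:
        if not g:
            continue
        score = sum(([c for _, c in g].count(cv) / len(g)) ** 2 for cv in classes)
        gini += (1 - score) * (len(g) / n)
    return gini

values = sorted(v for v, _ in data)
thresholds = [(a + b) / 2 for a, b in zip(values, values[1:])]  # the largest value is never a threshold
for t in thresholds:
    left = [d for d in data if d[0] <= t]
    right = [d for d in data if d[0] > t]
    print("threshold %.2f -> Gini %.3f" % (t, gini_of_split((left, right))))

The 2.5 threshold separates the two classes perfectly and scores a Gini of 0.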

Gini index calculation code

def calculate_the_gini_index(groups, classes):  # pass in the split groups and the class labels
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini of every group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so guard against an empty group
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini

# worst case for two classes
worst_case_for_two_classes = [[[1, 1], [1, 0]], [[1, 1], [1, 0]]]
print(calculate_the_gini_index(worst_case_for_two_classes, [0, 1]))
# best case for two classes
best_case_for_two_classes = [[[1, 0], [1, 0]], [[1, 1], [1, 1]]]
print(calculate_the_gini_index(best_case_for_two_classes, [0, 1]))

Finding the best split

# illustrated with two-class data
# split left/right on an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Gini index function
def calculate_the_gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini of every group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so guard against an empty group
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini

# greedily check every candidate split
# we need to store index, value and groups, so a dict is the convenient container
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))  # deduplicate to get the class labels
    print("classes:", class_values)
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None  # placeholder sentinels
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            print("groups:", groups)
            gini = calculate_the_gini_index(groups, class_values)
            print("X%d < %.3f Gini=%.3f" % ((index + 1), row[index], gini))
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}

dataset = [[2.1, 1.1, 0], [3.4, 2.5, 0], [1.3, 5.8, 0], [1.9, 8.6, 0], [3.7, 6.2, 0],
           [8.8, 1.1, 1], [9.6, 3.4, 1], [10.2, 7.4, 1], [7.7, 8.8, 1], [9.7, 6.9, 1]]
split = get_split(dataset)
print('Split: [X%d < %.3f]' % ((split['index'] + 1), split['value']))


Complete decision tree code

# 1. root node
# 2. recursive split
# 3. terminal node (to avoid over-fitting, limit the depth of the tree and set a minimum split size)
# 4. finish building the tree

def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def calculate_the_gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini of every group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so guard against an empty group
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini

def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = calculate_the_gini_index(groups, class_values)
            print("X%d < %.3f Gini=%.3f" % ((index + 1), row[index], gini))
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}

def determine_the_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# 1. split the data into left and right, then delete the original groups
# 2. check for empty groups and for the limits we set (depth / minimum size)
# 3. keep repeating the root-node search all the way down to the leaves
def split(node, max_depth, min_size, depth):
    # split, then drop the stored groups
    left, right = node['groups']
    del (node['groups'])
    # check for an empty side
    if not left or not right:
        node['left'] = node['right'] = determine_the_terminal(left + right)
        return
    # check whether the maximum depth is exceeded
    if depth >= max_depth:
        node['left'], node['right'] = determine_the_terminal(left), determine_the_terminal(right)
        return
    # minimum-size check, then keep splitting the left side
    if len(left) <= min_size:
        node['left'] = determine_the_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    # minimum-size check, then keep splitting the right side
    if len(right) <= min_size:
        node['right'] = determine_the_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)

# finally, build the decision tree
def build_the_regression_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

# render the tree in the CLI to get an intuitive feel for its structure
def print_our_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % ((depth * '-', (node['index'] + 1), node['value'])))
        print_our_tree(node['left'], depth + 1)
        print_our_tree(node['right'], depth + 1)
    else:
        print('%s[%s]' % ((depth * '-', node)))

def make_prediction(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return make_prediction(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return make_prediction(node['right'], row)
        else:
            return node['right']

dataset = [[2.1, 1.1, 0], [3.4, 2.5, 0], [1.3, 5.8, 0], [1.9, 8.6, 0], [3.7, 6.2, 0],
           [8.8, 1.1, 1], [9.6, 3.4, 1], [10.2, 7.4, 1], [7.7, 8.8, 1], [9.7, 6.9, 1]]
tree = build_the_regression_tree(dataset, 1, 1)
print_our_tree(tree)

decision_tree_stump = {'index': 0, 'right': 1, 'value': 7.7, 'left': 0}
for row in dataset:
    prediction = make_prediction(decision_tree_stump, row)
    print("What is expected data : %d , Your prediction is %d " % (row[-1], prediction))

Naive Bayes

Naive Bayes concept



Prerequisites

Splitting the data by class

def split_our_data_by_class(dataset):  # split the data by class
    splited_data = dict()  # store it in a dict
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in splited_data:
            splited_data[class_value] = list()
        splited_data[class_value].append(vector)
    return splited_data

# create dummy data
dataset = [[0.8, 2.3, 0], [2.1, 1.6, 0], [2.0, 3.6, 0], [9.1, 2.5, 1], [3.1, 2.5, 0],
           [3.8, 4.7, 0], [6.8, 2.7, 1], [6.1, 4.4, 1], [8.6, 0.3, 1], [7.9, 5.3, 1]]
splited = split_our_data_by_class(dataset)  # call the split function
print(splited)

# print row by row
for label in splited:
    print(label)
    for row in splited[label]:
        print(row)

Computing the mean and the standard deviation (n-1)

from math import sqrt

def calculate_the_mean(a_list_of_num):  # mean function
    mean = sum(a_list_of_num) / float(len(a_list_of_num))  # the total divided by the count
    return mean

def calculate_the_standard_deviation(a_list_of_num):  # sample standard deviation
    the_mean = calculate_the_mean(a_list_of_num)  # call the mean function
    # sum of squared deviations from the mean, divided by (count - 1)
    the_variance = sum([(x - the_mean) ** 2 for x in a_list_of_num]) / float(len(a_list_of_num) - 1)
    std = sqrt(the_variance)  # take the square root
    return std
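
Before the pandas check below, the standard library offers a quick cross-check; statistics.stdev uses the same n-1 denominator, while pstdev is the population (n) version:

import statistics

nums = [0.8, 2.1, 2.0, 9.1, 3.1]  # a few hypothetical values
print(calculate_the_standard_deviation(nums))  # our n-1 implementation
print(statistics.stdev(nums))                  # sample std dev, also n-1
print(statistics.pstdev(nums))                 # population version, divides by n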
# verify with pandas
import pandas as pd

df = pd.DataFrame(dataset)
df
df.info()
df.describe()

Next, let's write a pandas-like describe feature in pure Python

# a pandas-like describe in pure Python
def describe_our_data(dataset):
    # (mean, std, count) for every column; zip(*dataset) transposes rows into columns
    description = [(calculate_the_mean(column),
                    calculate_the_standard_deviation(column),
                    len(column)) for column in zip(*dataset)]
    del (description[-1])  # drop the class-label column
    return description

describe_our_data(dataset)

Per-class mean and standard deviation

def describe_our_data_by_class(dataset):  # combine the two steps
    splited_data = split_our_data_by_class(dataset)
    data_description = dict()  # an empty dict to fill
    for class_value, rows in splited_data.items():  # iterate and fill the dict
        data_description[class_value] = describe_our_data(rows)
    return data_description

description = describe_our_data_by_class(dataset)
for label in description:
    print(label)
    for row in description[label]:
        print(row)

description = describe_our_data_by_class(dataset)
type(description)

## walking through the dict-of-tuples-of-lists structure of the split data
print("---- raw description ----")
print(description)
print("-- entries for class 0 --")
print(description[0])
print("-- entries for class 1 --")
print(description[1])
print("---------")
print(description[0][0])
print("--------")
print(description[0][0][2])

Gaussian probability density function


# building the Gaussian probability density function:
# f(x) = (1 / (sqrt(2*pi) * stdev)) * exp(-((x - mean)^2 / (2 * stdev^2)))
from math import exp, sqrt, pi

def calculate_the_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    result = (1 / (sqrt(2 * pi) * stdev)) * exponent
    return result

calculate_the_probability(1.0, 1.0, 1.0)

Complete code

from math import sqrt
from math import pi
from math import exp

def split_our_data_by_class(dataset):
    splited_data = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in splited_data:
            splited_data[class_value] = list()
        splited_data[class_value].append(vector)
    return splited_data

def calculate_the_mean(a_list_of_num):
    mean = sum(a_list_of_num) / float(len(a_list_of_num))
    return mean

def calculate_the_standard_deviation(a_list_of_num):
    the_mean = calculate_the_mean(a_list_of_num)
    the_variance = sum([(x - the_mean) ** 2 for x in a_list_of_num]) / float(len(a_list_of_num) - 1)
    std = sqrt(the_variance)
    return std

def describe_our_data(dataset):
    description = [(calculate_the_mean(column),
                    calculate_the_standard_deviation(column),
                    len(column)) for column in zip(*dataset)]
    del (description[-1])
    return description

def describe_our_data_by_class(dataset):
    splited_data = split_our_data_by_class(dataset)
    data_description = dict()
    for class_value, rows in splited_data.items():
        data_description[class_value] = describe_our_data(rows)
    return data_description

def calculate_the_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    result = (1 / (sqrt(2 * pi) * stdev)) * exponent
    return result

# P(class) times the product of P(x_i | class) over every feature
def calculate_class_probability(description, row):
    total_rows = sum([description[label][0][2] for label in description])
    probabilities = dict()
    for class_value, class_description in description.items():
        probabilities[class_value] = description[class_value][0][2] / float(total_rows)
        for i in range(len(class_description)):
            mean, stdev, count = class_description[i]
            probabilities[class_value] *= calculate_the_probability(row[i], mean, stdev)
    return probabilities

dataset = [[0.8, 2.3, 0], [2.1, 1.6, 0], [2.0, 3.6, 0], [3.1, 2.5, 0], [3.8, 4.7, 0],
           [6.1, 4.4, 1], [8.6, 0.3, 1], [7.9, 5.3, 1], [9.1, 2.5, 1], [6.8, 2.7, 1]]
description = describe_our_data_by_class(dataset)
probability = calculate_class_probability(description, dataset[0])
print(probability)
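
The dict printed above holds the unnormalised scores P(class) * product of P(x_i | class); a small follow-up sketch normalises them into posteriors and picks the most likely class:

total = sum(probability.values())
for class_value, score in probability.items():
    print("P(class=%d | row) = %.4f" % (class_value, score / total))
print("predicted class:", max(probability, key=probability.get))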

Naive Bayes, worked by hand



KNN (K-Nearest Neighbor)

KNN overview


1. k is a hyper-parameter
2. k is best an odd number (no even number, better be odd, so votes cannot tie)
3. the size of k matters:
   if k is too small, outliers gain influence over the decision
   if k is too large, it "dilutes" the influence of the nearby, high-quality, high-weight neighbors on the final decision (see the sketch below)
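
A tiny illustration of point 3 with hypothetical 1-D points (the distance-sort-vote logic mirrors what is built later in this section): with k = 1 a single mislabeled outlier decides the vote, while a larger k averages it away:

from collections import Counter

# class-0 cluster near 1.0, class-1 cluster near 5.0, plus one class-1 outlier at 1.1
train = [(1.0, 0), (1.2, 0), (1.3, 0), (5.0, 1), (5.2, 1), (1.1, 1)]
query = 1.08

by_distance = sorted(train, key=lambda p: abs(p[0] - query))
for k in (1, 3, 5):
    votes = Counter(label for _, label in by_distance[:k])
    print("k=%d -> predicted class %d" % (k, votes.most_common(1)[0][0]))
# k=1 follows the outlier (class 1); k=3 and k=5 recover class 0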

Euclidean distance calculation code

# Euclidean distance
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    # running total
    distance = 0.0
    for i in range(len(row1) - 1):
        # += is a quick way to accumulate a sum
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

# create dummy data
dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
row0 = [1.80, 1.91, 0]
print("take one row as the 'unknown' data:", row0)
# compute its distance to every point in the dataset
for row in dataset:
    distance = calculate_euclidean_distance(row0, row)
    print(distance)

Euclidean distance, worked by hand

from math import sqrt

a = (1.8 - 1.8) ** 2 + (1.91 - 1.91) ** 2
b = (1.8 - 1.85) ** 2 + (1.91 - 2.11) ** 2
c = (1.8 - 2.31) ** 2 + (1.91 - 2.88) ** 2
sqrt(a), sqrt(b), sqrt(c)

KNN approach

The plan:
1. we need an input parameter k
2. we need to sort (and take the first k)
3. store each (row, distance) pair as a tuple
4. pick the k nearest through the sort

Complete KNN code

# Euclidean distance
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    # running total
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

### find the nearest neighbors
def get_our_neighbors(train, test_row, num_of_neighbors):  # training data, test row, k
    distances = list()  # an empty list to store (row, distance) tuples
    for train_row in train:  # walk through every training row
        dist = calculate_euclidean_distance(test_row, train_row)  # distance to the test row
        distances.append((train_row, dist))
    distances.sort(key=lambda every_tuple: every_tuple[1])  # sort by distance
    neighbors = list()
    for i in range(num_of_neighbors):  # loop k times
        neighbors.append(distances[i][0])
    return neighbors

dataset = [[1.80, 1.91, 0], [3.66, 3.12, 0], [1.85, 2.11, 0], [3.54, -3.21, 0], [2.31, 2.88, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
# pass in the dataset, pick one point to compute distances from, k=3
neighbors = get_our_neighbors(dataset, dataset[0], 3)
for neighbor in neighbors:
    print(neighbor)

### predict the class
def predict_the_class(train, test_row, num_of_neighbors):  # training data, test row, k
    neighbors = get_our_neighbors(train, test_row, num_of_neighbors)  # find the nearest neighbors
    the_class_values = [row[-1] for row in neighbors]  # the neighbors' class labels
    prediction = max(set(the_class_values), key=the_class_values.count)  # majority vote via max
    return prediction

prediction = predict_the_class(dataset, dataset[0], 3)
print('actual class [%d]' % (dataset[0][-1]))
print('predicted class [%d]' % (prediction))

### predict the class, version 2 (average of the neighbors' labels)
def predict_the_class_V2(train, test_row, num_of_neighbors):
    neighbors = get_our_neighbors(train, test_row, num_of_neighbors)
    the_class_values = [row[-1] for row in neighbors]
    prediction = sum(the_class_values) / float(len(the_class_values))
    return prediction

prediction = predict_the_class_V2(dataset, dataset[0], 3)
print('Our expectation(the real class) is class [%d]' % (dataset[0][-1]))
print('Our prediction(the predicted class) is class [%d]' % (prediction))

Learning Vector Quantization (LVQ)

LVQ overview

LVQ is typically used for classification problems.

A codebook vector is a list of numbers with the same input and output features as a row of your training data.

example (a training row):
1. class 0,1
2. width
3. height
4. length

codebook vector (neuron):
1. class 0,1
2. width
3. height
4. length

LVQ and KNN

We search through the codebook vectors, judging by Euclidean distance, to find the BMU (Best Matching Unit).

1. Select an initial set of codebook vectors.
2. Competition: when the codebook vector's class agrees with the training instance (training pattern), the codebook vector moves toward the instance; otherwise it moves away.
3. The learning_rate controls the size of the move (see the sketch below):

x = x + learning_rate * (t - x)

4. Learn from every instance, decaying the rate each epoch:

learning_rate = alpha (the initial learning rate) * (1 - (epoch / max_epoch))
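
A tiny numeric sketch of these two formulas (hypothetical values; this is the matching-class case, where x moves toward the target t — the full implementation below flips the sign when the classes differ):

alpha = 0.3        # initial learning rate
max_epoch = 10

x = 2.0            # one feature of the BMU codebook vector
t = 3.0            # the matching feature of the training instance

for epoch in range(3):
    learning_rate = alpha * (1 - epoch / max_epoch)  # decaying rate
    x = x + learning_rate * (t - x)                  # move toward t
    print("epoch %d: lr=%.3f, x=%.4f" % (epoch, learning_rate, x))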

Computing the Euclidean distance between two vectors:

# compute the Euclidean distance between two vectors:
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
# test
row0 = dataset[0]  # use the point [1.80, 1.91, 0] and compute its distance to every row
for row in dataset:
    distance = calculate_euclidean_distance(row0, row)
    print(distance)

Best Matching Unit

1. compute the distances (codebook vectors vs. the new input)
2. call calculate_euclidean_distance
3. sort (mind the data types)
4. pick the BMU

# Euclidean distance function
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

# define the training set
dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]

def calculate_BMU(codebooks, test_row):  # codebook vectors vs. the new input
    distances = list()
    for codebook in codebooks:
        dist = calculate_euclidean_distance(codebook, test_row)  # distance between codebook and input
        distances.append((codebook, dist))  # append (codebook, distance) tuples
    distances.sort(key=lambda every_tuple: every_tuple[1])  # sort with a lambda key
    print(distances)
    return distances[0][0]  # slice out the first entry: the codebook closest to the new input

# test
test_row = [1.60, 1.81, 0]  # an arbitrary row to test with
bmu = calculate_BMU(dataset, test_row)  # pass in the dataset and the test row
print("nearest:", bmu)  # the nearest row turns out to be [1.8, 1.91, 0]


Codebook vector training approach

Training our codebook vectors:

1. initialize (random features)
2. in every epoch, update (learn) the codebook vectors from the training patterns
3. within each training pattern, update every pattern feature: move the codebook vector closer when its class matches, farther when it does not

from random import randrange  # import the random helper

# randomly generate a codebook
def make_random_codebook(train):  # pass in the training data
    n_index = len(train)  # the number of rows
    n_features = len(train[0])  # the length of one row, e.g. [1.80, 1.91, 0] = 3
    # draw each feature from a randomly chosen row, using randrange
    codebook = [train[randrange(n_index)][i] for i in range(n_features)]
    return codebook

dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
m_r_c = make_random_codebook(dataset)
m_r_c

Complete LVQ code

# compute the Euclidean distance between two vectors:
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

def calculate_BMU(codebooks, test_row):  # codebook vectors vs. the new input
    distances = list()
    for codebook in codebooks:
        dist = calculate_euclidean_distance(codebook, test_row)
        distances.append((codebook, dist))
    distances.sort(key=lambda every_tuple: every_tuple[1])
    return distances[0][0]

from random import randrange

# randomly generate a codebook
def make_random_codebook(train):
    n_index = len(train)
    n_features = len(train[0])
    codebook = [train[randrange(n_index)][i] for i in range(n_features)]
    return codebook

# codebook vector competition
def train_codebooks(train, n_codebooks, learn_rate, epochs):
    codebooks = [make_random_codebook(train) for i in range(n_codebooks)]  # random initial codebooks
    for epoch in range(epochs):
        rate = learn_rate * (1 - (epoch / float(epochs)))  # decay the learning rate
        sum_error = 0.0  # track the error
        for row in train:
            bmu = calculate_BMU(codebooks, row)
            for i in range(len(row) - 1):
                error = row[i] - bmu[i]
                sum_error += error ** 2
                if bmu[-1] == row[-1]:  # classes match: move the BMU toward the row
                    bmu[i] += rate * error
                else:  # classes differ: move it away
                    bmu[i] -= rate * error
        print('epoch [%d], learning rate: [%.3f], total error: [%.3f]' % (epoch, rate, sum_error))
    return codebooks

dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
learning_rate = 0.3  # learning rate
n_epoch = 10  # number of epochs
n_codebooks = 2  # how many codebooks to train
codebooks = train_codebooks(dataset, n_codebooks, learning_rate, n_epoch)
print('Our codebook is : %s' % codebooks)
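
The trained codebooks can then act as a tiny nearest-prototype classifier; a sketch reusing calculate_BMU, where the last slot of the winning codebook is the predicted class:

for row in dataset:
    bmu = calculate_BMU(codebooks, row)
    print("actual %d, predicted %d" % (row[-1], int(bmu[-1])))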

Artificial Neural Networks and Backpropagation

Overview





Activation functions


A rectifier is simply an activation function that mimics the activation mechanism of biological neurons.

Common activation functions:
https://en.wikipedia.org/wiki/Rectifier_(neural_networks)#Gaussian_Error_Linear_Unit_(GELU)

import math

# softplus
softplus = math.log(1 + math.exp(2.14))
print(softplus)

# sigmoid
sigmoid = math.exp(2.14) / (1 + math.exp(2.14))
print(sigmoid)

# ReLU
ReLU = max(2.14, 0)
print(ReLU)

Forward propagation

Forward propagation:
1. neuron activation (weights and bias)
2. neuron transfer (apply the activation function)
3. forward-propagate (calculate the output)

from random import seed
from random import random

# initialize our neural network: random weights plus a bias
def initialize_our_neural_network(n_inputs, n_hidden, n_outputs):  # number of inputs, hidden neurons, outputs
    neural_network = list()  # store the layers in a list
    # each hidden neuron gets n_inputs + 1 weights (the extra one is the bias)
    hidden_layer = [{'weights': [random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    neural_network.append(hidden_layer)
    # each output neuron gets n_hidden + 1 weights
    output_layer = [{'weights': [random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    neural_network.append(output_layer)
    return neural_network

seed(1)
network = initialize_our_neural_network(2, 1, 2)  # 2 inputs, 1 hidden neuron, 2 outputs
for layer in network:
    print(layer)

# build the neuron activation function
def neuron_activation(weights, inputs):  # pass in the weights and the inputs
    activation = weights[-1]  # the last weight acts as the bias
    for i in range(len(weights) - 1):  # loop over the remaining weights
        activation += weights[i] * inputs[i]  # weighted sum of the inputs
    return activation

# create a sigmoid transfer function
from math import exp

def neuron_transfer(activation):
    result = 1.0 / (1.0 + exp(-activation))
    return result

# forward propagation
def forward_propagate(network, row):  # pass in the network and one data row
    inputs = row  # the inputs start as the raw row
    for layer in network:
        new_inputs = []
        for neuron in layer:
            activation = neuron_activation(neuron['weights'], inputs)  # weighted sum plus bias
            neuron['output'] = neuron_transfer(activation)  # squash through the sigmoid
            new_inputs.append(neuron['output'])
        inputs = new_inputs  # this layer's outputs feed the next layer
    return inputs

# test
network = [[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}],
           [{'weights': [0.2550690257394217, 0.49543508709194095]},
            {'weights': [0.4494910647887381, 0.651592972722763]}]]
row = [1, 0]
output = forward_propagate(network, row)
print("output:", output)



Calculation process

Error backpropagation code

def neuron_transfer_derivative(output):
    # derivative of the sigmoid, expressed in terms of its output
    derivative = output * (1 - output)
    return derivative

def backward_propagate_error(network, expected):
    for i in reversed(range(len(network))):  # walk the layers back to front
        layer = network[i]
        errors = list()
        if i != len(network) - 1:  # hidden layer: error comes from the layer after it
            for j in range(len(layer)):
                error = 0.0
                for neuron in network[i + 1]:
                    error += (neuron['weights'][j] * neuron['delta'])
                errors.append(error)
        else:  # output layer: error is expected minus actual output
            for j in range(len(layer)):
                neuron = layer[j]
                errors.append(expected[j] - neuron['output'])
        for j in range(len(layer)):
            neuron = layer[j]
            neuron['delta'] = errors[j] * neuron_transfer_derivative(neuron['output'])

## test
network = [[{'output': 0.71, 'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}],
           [{'output': 0.62, 'weights': [0.2550690257394217, 0.49543508709194095]},
            {'output': 0.65, 'weights': [0.4494910647887381, 0.651592972722763]}]]
expected = [0, 1]
backward_propagate_error(network, expected)
for layer in network:
    print(layer)


Calculation process, worked by hand

Weight update rule

weight = weight + learning_rate * error * input

def update_weights(network, row, learning_rate):
    for i in range(len(network)):
        inputs = row[:-1]
        # after the first layer, the inputs are the previous (hidden) layer's outputs
        if i != 0:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in network[i]:
            for j in range(len(inputs)):
                neuron['weights'][j] += learning_rate * neuron['delta'] * inputs[j]
            neuron['weights'][-1] += learning_rate * neuron['delta']  # the bias weight

Encoding schemes for preprocessing

Background:
When we preprocess categorical data, there are generally two encoding schemes (see the sketch below):

1. Integer Encoding

Blue   Red   Green   Yellow
 1      2      3       4

2. One-Hot Encoding (dummy variables)

Blue   Red   Green   Yellow
 1      0      0       0
 0      1      0       0
 0      0      1       0
 0      0      0       1
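
This is exactly what the training loop below does when it builds expected: the integer class label becomes a one-hot vector. A minimal pure-Python sketch with hypothetical labels:

labels = ["Blue", "Red", "Green", "Yellow"]

# integer encoding: each label gets an index
to_int = {label: i for i, label in enumerate(labels)}
print(to_int)

# one-hot encoding: a 0/1 vector with a single 1 at that index
def one_hot(label):
    vec = [0] * len(labels)
    vec[to_int[label]] = 1
    return vec

for label in labels:
    print(label, one_hot(label))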

Training

def train_our_network(network, train, learning_rate, n_epoch, n_output):
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            expected = [0 for i in range(n_output)]
            expected[row[-1]] = 1  # one-hot encode the class label
            sum_error += sum([(expected[i] - outputs[i]) ** 2 for i in range(len(expected))])
            backward_propagate_error(network, expected)
            update_weights(network, row, learning_rate)
        print("epoch [%d], learning rate: [%.3f], error: [%.3f]" % (epoch, learning_rate, sum_error))

seed(1)
dataset = [[2.1, 2.8, 0], [1.3, 2.7, 0], [1.2, 5.2, 0], [3.3, 2.8, 0], [1.2, 1.1, 0],
           [6.2, 5.8, 1], [8.3, 3.7, 1], [6.2, 2.7, 1], [7.3, 3.4, 1], [9.2, 2.1, 1]]
n_inputs = len(dataset[0]) - 1
n_outputs = len(set(row[-1] for row in dataset))
network = initialize_our_neural_network(n_inputs, 2, n_outputs)
train_our_network(network, dataset, 0.1, 1000, n_outputs)
for layer in network:
    print(layer)

Predicting the class

def predict(network, row):
    outputs = forward_propagate(network, row)
    return outputs.index(max(outputs))  # the class with the highest output

network = [[{'weights': [-1.1866404384956928, 0.3006679439138231, 3.9117172696404685]},
            {'weights': [1.2235819285217509, -0.3158686384608762, -4.03117701360861]}],
           [{'weights': [3.933774188003338, -3.959801574023486, 0.4499036554334044]},
            {'weights': [-3.678895659767694, 4.256650475104409, -0.7298649815974946]}]]
for row in dataset:
    prediction = predict(network, row)
    print("Our expected class value is [%d], Our prediction of class value is [%d]" % (row[-1], prediction))

Ensemble methods: bagging

Combine several models so that predictions become more accurate and more stable (by averaging).
The hyper-parameters in a random forest:

1. For each tree, how many features to sample: with n features in total, you must choose a number m of them to draw as candidates
2. The minimum size of each node (the smallest leaf allowed in each tree)
3. The maximum depth of one tree
4. How many trees the forest contains

from random import randrange

def subsample(dataset, ratio=1.0):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))  # sample with replacement
        sample.append(dataset[index])
    return sample

from random import seed
from random import randrange
from random import random

def subsample(dataset, ratio=1.0):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

def mean(numbers):
    result = sum(numbers) / float(len(numbers))
    return result

seed(1)
dataset = [[randrange(10)] for i in range(20)]
# print(dataset)
ratio = 0.10
for size in [1, 10, 100, 1000, 10000, 100000, 1000000]:
    sample_means = list()
    for i in range(size):
        sample = subsample(dataset, ratio)
        sample_mean = mean([row[0] for row in sample])
        sample_means.append(sample_mean)
    print("When sample is [%d],the estimated mean is [%.3f]" % (size, mean(sample_means)))
print("The real mean of our dataset is [%.3f]" % mean([row[0] for row in dataset]))

from random import seed
from random import randrange
from random import random
from csv import reader

# 1. load our data
def load_csv(filename):  # pass in the file name
    dataset = list()  # an empty list to fill row by row
    with open(filename, 'r') as file:  # read the file with a context manager
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:  # skip empty rows
                continue
            dataset.append(row)  # append each row to the dataset
    return dataset

# read test
dataset = load_csv('sonar.all-data.csv')
print(dataset)

# 2. datatype conversion
def str_to_float(dataset, column):  # convert string columns to floats
    for row in dataset:  # convert row by row
        row[column] = float(row[column].strip())  # parse the number, stripping whitespace

def str_to_int(dataset, column):  # convert the class-label strings to integers
    class_value = [row[column] for row in dataset]
    unique = set(class_value)  # deduplicate
    look_up = dict()  # an empty dict to fill
    for i, value in enumerate(unique):  # map each label to an index
        look_up[value] = i
    for row in dataset:
        row[column] = look_up[row[column]]
    return look_up
# 3. k-fold cross-validation data split
def cross_validation_split(dataset, n_folds):  # pass in the data and the number of folds
    dataset_split = list()  # receives the finished folds
    dataset_copy = list(dataset)  # work on a copy of the data
    fold_size = int(len(dataset) / n_folds)  # each fold holds total length / n_folds rows
    for i in range(n_folds):
        fold = list()  # an empty list for this fold
        while len(fold) < fold_size:  # keep drawing until the fold is full
            index = randrange(len(dataset_copy))  # pick a random remaining row
            fold.append(dataset_copy.pop(index))  # move it from the copy into the fold
        dataset_split.append(fold)
    return dataset_split
# 4. calculate model accuracy
def calculate_accuracy(actual, predicted):  # pass in the actual and predicted labels
    correct = 0  # counts the matches
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0  # matches / total, as a percentage
# e.g. 90 correct rows out of 100 -> 90/100 = 0.9 -> 90% accuracy
# 5. how good is our algo: evaluate the algorithm on held-out folds
def evaluate_our_algo(dataset, algo, n_folds, *args):  # the data, an algorithm, the fold count, extra args
    folds = cross_validation_split(dataset, n_folds)  # call the data-split function
    scores = list()
    for fold in folds:  # hold out one fold at a time
        train_set = list(folds)
        train_set.remove(fold)  # remove the held-out fold from the training folds
        train_set = sum(train_set, [])  # flatten the remaining folds into one list
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None  # hide the label (the answer)
        predicted = algo(train_set, test_set, *args)
        actual = [row[-1] for row in fold]  # the true labels
        accuracy = calculate_accuracy(actual, predicted)  # score this fold
        scores.append(accuracy)
    return scores
# 6. left and right split
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:  # route every row to one side
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
# 7. calculate gini index
def gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini of every group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so guard against an empty group
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini
# 8. calculate the best split, updating whenever a smaller Gini is found
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}
# 9. to terminal: the majority class of a group
def determine_the_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
# 10.
# 1. split our data into left and right
# 2. delete the original data
# 3. check whether the data is empty / max depth is reached / min size is hit
# 4. make terminals
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del (node['groups'])
    if not left or not right:
        node['left'] = node['right'] = determine_the_terminal(left + right)
        return
    if depth >= max_depth:
        node['left'], node['right'] = determine_the_terminal(left), determine_the_terminal(right)
        return
    if len(left) <= min_size:
        node['left'] = determine_the_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    if len(right) <= min_size:
        node['right'] = determine_the_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)
# 11. make our decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root
# 12. make prediction
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
# 13. subsample
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

# 14. make a prediction using bagging: majority vote over the trees
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)
# 15. bagging
def bagging(train, test, max_depth, min_size, sample_size, n_trees):
    trees = list()
    for i in range(n_trees):
        sample = subsample(train, sample_size)
        tree = build_tree(sample, max_depth, min_size)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions

seed(1)
dataset = load_csv('sonar.all-data.csv')
for i in range(len(dataset[0]) - 1):
    str_to_float(dataset, i)
str_to_int(dataset, len(dataset[0]) - 1)
n_folds = 5
max_depth = 6
min_size = 2
sample_size = 0.5
for n_trees in [1, 5, 10, 50]:
    scores = evaluate_our_algo(dataset, bagging, n_folds, max_depth, min_size, sample_size, n_trees)
    print('We are using [%d]' % n_trees)
    print('The scores are : [%s]' % scores)
    print('The mean accuracy is [%.3f]' % (sum(scores) / float(len(scores))))

To be continued >>>
