Simple Linear Regression

Concept








Simple linear regression code


# mean of a list
def calculate_mean(a_list_of_values):
    mean = sum(a_list_of_values) / float(len(a_list_of_values))
    return mean

# sample variance
def calculate_variance(a_list_of_values, mean):
    variance_sum = sum((x - mean) ** 2 for x in a_list_of_values)
    variance = variance_sum / (len(a_list_of_values) - 1)
    return variance

# covariance of two lists
def calculate_covariance(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys):
    cov_sum = 0
    for i in range(len(a_list_of_Xs)):
        cov_sum += (a_list_of_Xs[i] - the_mean_of_Xs) * (a_list_of_Ys[i] - the_mean_of_Ys)
    the_covariance = cov_sum / (len(a_list_of_Xs) - 1)
    return the_covariance

# sample standard deviation
def calculate_the_standard_deviation(a_list_values):
    the_mean_of_the_list_values = sum(a_list_values) / float(len(a_list_values))
    variance = sum([(a_list_values[i] - the_mean_of_the_list_values) ** 2
                    for i in range(len(a_list_values))]) / float(len(a_list_values) - 1)
    return variance ** 0.5

# Pearson correlation: corr = cov(X, Y) / (std(X) * std(Y))
def calculate_the_correlation(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys):
    X_std = calculate_the_standard_deviation(a_list_of_Xs)
    Y_std = calculate_the_standard_deviation(a_list_of_Ys)
    X_Y_Cov = calculate_covariance(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys)
    Corr = X_Y_Cov / (X_std * Y_std)
    return Corr

# coefficients: b1 = cov(x, y) / var(x), b0 = mean(y) - b1 * mean(x)
def calculate_the_coefficients(dataset):
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]
    x_mean, y_mean = calculate_mean(x), calculate_mean(y)
    b1 = calculate_covariance(x, x_mean, y, y_mean) / calculate_variance(x, x_mean)
    b0 = y_mean - b1 * x_mean
    return [b0, b1]

experience = [1, 2, 3, 4, 5]
salary = [100, 200, 300, 400, 500]
list_of_tuples = list(zip(experience, salary))
print("list_of_tuples:", list_of_tuples)
list_of_lists = [list(elem) for elem in list_of_tuples]
print("list_of_lists:", list_of_lists)
b0, b1 = calculate_the_coefficients(list_of_lists)
print("b0,b1:", b0, b1)

# prediction function
def simple_linear_regression(training_data, testing_data):
    predictions = []
    b0, b1 = calculate_the_coefficients(training_data)
    for row in testing_data:
        y = b0 + b1 * row[0]
        predictions.append(y)
    return predictions

# root mean squared error
from math import sqrt

def calculate_the_RMSE(predicted_data, actual_data):
    the_sum_of_error = 0
    for i in range(len(actual_data)):
        prediction_error = predicted_data[i] - actual_data[i]
        the_sum_of_error += prediction_error ** 2
    RMSE = sqrt(the_sum_of_error / float(len(actual_data)))
    return RMSE

# strip out the y column so only x goes into the model
data_to_be_put_into_the_model = []
for row in list_of_lists:
    row_copy = list(row)
    row_copy[-1] = None
    data_to_be_put_into_the_model.append(row_copy)
print(data_to_be_put_into_the_model)

# predict y with the prediction function
predictions = simple_linear_regression(list_of_lists, data_to_be_put_into_the_model)
print(predictions)

# predict on new data
Y = [[6], [7], [8], [9], [10]]  # rows of new x values to predict on
predictions = simple_linear_regression(list_of_lists, Y)
print("Y:", predictions)

# evaluate the model with RMSE
def how_good_is_our_model(dataset, some_model_to_be_evaluated):
    test_data = []
    for row in dataset:
        row_copy = list(row)
        row_copy[-1] = None
        test_data.append(row_copy)
    predict_data = some_model_to_be_evaluated(dataset, test_data)
    print("predictions:", predict_data)
    actual_data = [row[-1] for row in dataset]
    print("actual values:", actual_data)
    RMSE = calculate_the_RMSE(predict_data, actual_data)
    return RMSE

result = how_good_is_our_model(list_of_lists, simple_linear_regression)
print(result)
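
Note that calculate_the_correlation above is defined but never called. As a quick sanity check (a sketch reusing the same toy data), the x-y correlation comes out as exactly 1.0, which is why the fitted line passes through every point and the RMSE above is 0:

x = [row[0] for row in list_of_lists]
y = [row[1] for row in list_of_lists]
corr = calculate_the_correlation(x, calculate_mean(x), y, calculate_mean(y))
print("correlation:", corr)  # 1.0 for this perfectly linear toy data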

Gradient Descent

Gradient descent concept



Gradient descent code

# make_prediction: plug the coefficients in to predict
def make_prediction(input_row, coefficients):
    out_put_y_hat = coefficients[0]
    for i in range(len(input_row) - 1):
        out_put_y_hat += coefficients[i + 1] * input_row[i]
    return out_put_y_hat

test_dataset = [[1, 1.5], [2, 2.5], [3, 3.5], [4, 4.5], [5, 5.5]]
test_coefficients = [0.4, 0.8]
for row in test_dataset:
    y_hat = make_prediction(row, test_coefficients)
    print("actual y = %.3f, predicted = %.3f" % (row[-1], y_hat))



def make_prediction(input_row, coefficients):
    out_put_y_hat = coefficients[0]
    for i in range(len(input_row) - 1):
        out_put_y_hat += coefficients[i + 1] * input_row[i]
    return out_put_y_hat

# training dataset, learning rate, number of epochs
def using_sgd_method_to_calculate_coefficients(training_dataset, learning_rate, n_times_epoch):
    coefficients = [1 for i in range(len(training_dataset[0]))]  # initial coefficients
    print("coefficients:", coefficients)
    for epoch in range(n_times_epoch):
        print("epoch", epoch)
        the_sum_of_error = 0  # reset the running error
        for row in training_dataset:
            y_hat = make_prediction(row, coefficients)
            print("row:", row, "coefficients:", coefficients)
            print("prediction:", y_hat)
            error = y_hat - row[-1]
            print("error:", error, "= prediction:", y_hat, "- actual:", row[-1])
            the_sum_of_error += error ** 2
            print("sum of squared errors:", the_sum_of_error)
            coefficients[0] = coefficients[0] - learning_rate * error
            print("new b0:", coefficients[0], "= old b0 -", learning_rate, "* error:", error)
            for i in range(len(row) - 1):
                coefficients[i + 1] = coefficients[i + 1] - learning_rate * error * row[i]
                print("new b%d:" % (i + 1), coefficients[i + 1],
                      "= old value -", learning_rate, "* error:", error, "*", row[i])
        print("epoch [%d], learning rate [%.3f], error [%.3f]" % (epoch, learning_rate, the_sum_of_error))
    return coefficients

your_training_dataset = [[1, 1.5], [2, 2.5], [3, 3.5], [4, 4.5], [5, 5.5]]
your_model_learning_rate = 0.1
your_n_epoch = 43
your_coefficients = using_sgd_method_to_calculate_coefficients(your_training_dataset,
                                                               your_model_learning_rate,
                                                               your_n_epoch)
print("-" * 50)
print("resulting coefficients b0, b1:", your_coefficients)


Testing the gradient-descent coefficients in simple linear regression
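
A minimal sketch: plug your_coefficients learned above straight into the simple linear prediction and compare against the training rows:

b0, b1 = your_coefficients
for row in your_training_dataset:
    y_hat = b0 + b1 * row[0]
    print("x = %.1f, actual = %.2f, predicted = %.2f" % (row[0], row[-1], y_hat))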

Gradient descent, worked by hand


Logistic Regression

Logistic regression concept




Logistic regression, worked by hand

print("数据: [x1 = 2, x2 = 2, 类型 = 0]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15+1.48*2
y2 = -1.15+1.48*2+ -2.30*2
y = 1/(1+exp(-y2))
yy = round(y)
print(y1,y2,y,"类型为:",yy)

print("数据: [x1 = 2, x2 = 4, 类型 = 0]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15+1.48*2
y2 = -1.15+1.48*2+ -2.30*4
y = 1/(1+exp(-y2))
yy = round(y)
print(y1,y2,y,"类型为:",yy)

print("数据: [x1 = 10, x2 = 4, 类型 = 1]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15+1.48*10
y2 = -1.15+1.48*10+ -2.30*4
y = 1/(1+exp(-y2))
yy = round(y)
print(y1,y2,y,"类型为:",yy)

print("数据: [x1 = 8.5, x2 = 3.5, 类型 = 1]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15+1.48*8.5
y2 = -1.15+1.48*8.5+ -2.30*3.5
y = 1/(1+exp(-y2))
yy = round(y)
print(y1,y2,y,"类型为:",yy)

Logistic regression prediction code

# prediction function
from math import exp

def prediction(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
    return 1 / (1 + exp(-yhat))

# test
dataset = [[2, 2, 0], [2, 4, 0], [3, 3, 0], [4, 5, 0],
           [8, 1, 1], [8.5, 3.5, 1], [9, 1, 1], [10, 4, 1]]
coef = [-1.15, 1.48, -2.30]  # an arbitrary set of coefficients

# prediction function, verbose version with step-by-step prints
def prediction(row, coefficients):
    yhat = coefficients[0]
    print("yhat:", yhat)
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
        print("yhat:", yhat, "+=", "coefficient:", coefficients[i + 1], "*", "row[i]:", row[i])
    print("1/(1+", exp(-yhat), ") =", 1 / (1 + exp(-yhat)))
    return 1 / (1 + exp(-yhat))

for row in dataset:
    print("data:", row)
    yhat = prediction(row, coef)
    print("actual class %.3f, predicted class %.3f ≈ [%d]" % (row[-1], yhat, round(yhat)))


Computing the coefficients with gradient descent

from math import exp

def prediction(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
    return 1 / (1 + exp(-yhat))

def using_sgd_method_to_calculate_coefficients(training_dataset, learning_rate, n_times_epoch):
    coefficients = [0.5 for i in range(len(training_dataset[0]))]
    print("coefficients:", coefficients)
    for epoch in range(n_times_epoch):
        print("epoch", epoch)
        the_sum_of_error = 0
        for row in training_dataset:
            y_hat = prediction(row, coefficients)
            print("prediction:", y_hat, "row:", row, "coefficients:", coefficients)
            error = row[-1] - y_hat
            print("error:", error, "= actual:", row[-1], "- prediction:", y_hat)
            the_sum_of_error += error ** 2
            # gradient step for the sigmoid: b = b + lr * error * y_hat * (1 - y_hat) * x
            coefficients[0] = coefficients[0] + learning_rate * error * y_hat * (1.0 - y_hat)
            print("new b0:", coefficients[0])
            print("-" * 50)
            for i in range(len(row) - 1):
                coefficients[i + 1] = coefficients[i + 1] + learning_rate * error * y_hat * (1.0 - y_hat) * row[i]
                print("new b%d:" % (i + 1), coefficients[i + 1])
            print("*" * 50)
        print("epoch [%d], learning rate [%.3f], error [%.3f]" % (epoch, learning_rate, the_sum_of_error))
    return coefficients

dataset = [[2, 2, 0], [2, 4, 0], [3, 3, 0], [4, 5, 0],
           [8, 1, 1], [8.5, 3.5, 1], [9, 1, 1], [10, 4, 1]]
learning_rate = 0.1
n_times_epoch = 1000
coef = using_sgd_method_to_calculate_coefficients(dataset, learning_rate, n_times_epoch)
print(coef)





Logistic regression gradient descent, worked by hand


Note: one full pass from the first row to the last row is one epoch; repeating epoch after epoch drives the error down.

Verifying the gradient-descent coefficients in the logistic regression prediction function
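
A minimal verification sketch, plugging the coef learned above back into the prediction function on the same training rows:

for row in dataset:
    y_hat = prediction(row, coef)
    print("actual class %d, predicted %.3f => class %d" % (row[-1], y_hat, round(y_hat)))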

Perceptron

Perceptron concept




Perceptron prediction code

def predict(row, weights):  # pass in a data row and the weights
    activation = weights[0]
    for i in range(len(row) - 1):
        activation += weights[i + 1] * row[i]
    return 1.0 if activation >= 0.0 else 0.0

dataset = [[2.78, 2.55, 0], [1.47, 2.36, 0], [1.39, 1.85, 0], [3.06, 3.01, 0], [7.63, 2.76, 0],
           [5.33, 2.09, 1], [6.93, 1.76, 1], [8.76, -0.77, 1], [7.66, 2.46, 1]]
# weights obtained via gradient descent
weights = [2.0000000000000004, 0.5930000000000017, -2.460999999999983]
# predict
for row in dataset:
    prediction = predict(row, weights)
    print("actual: %d, predicted: %d" % (row[-1], prediction))

Perceptron, worked by hand

Perceptron gradient descent weight-training code


def predict(row, weights):
    activation = weights[0]
    for i in range(len(row) - 1):
        activation += weights[i + 1] * row[i]
    return 1.0 if activation >= 0.0 else 0.0

def opt_weights(train, learning_rate, how_many_epoch):
    weights = [0.5 for i in range(len(train[0]))]
    for epoch in range(how_many_epoch):
        sum_error = 0.0
        for row in train:
            prediction = predict(row, weights)
            error = row[-1] - prediction
            sum_error += error ** 2
            weights[0] = weights[0] + learning_rate * error
            for i in range(len(row) - 1):
                weights[i + 1] = weights[i + 1] + learning_rate * error * row[i]
        print('This is epoch: %d, our learning_rate is : %.4f, the error is : %.4f'
              % (epoch, learning_rate, sum_error))
    return weights

dataset = [[2.78, 2.55, 0], [1.47, 2.36, 0], [1.39, 1.85, 0], [3.06, 3.01, 0], [7.63, 2.76, 0],
           [5.33, 2.09, 1], [6.93, 1.76, 1], [8.76, -0.77, 1], [7.66, 2.46, 1]]
learning_rate = 0.1
how_many_epoch = 100
weights = opt_weights(dataset, learning_rate, how_many_epoch)
print(weights)


Test
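
A short sketch, reusing the weights returned by opt_weights above on the same dataset:

for row in dataset:
    p = predict(row, weights)
    print("actual %d, predicted %d" % (row[-1], p))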

Perceptron gradient descent weights, worked by hand


Decision Tree (CART)

Decision tree concept




1. If your data is numeric, how do you compute the Gini? (Sort the values once, take the averages between neighboring values as thresholds, and split by less-than; there is no need to include the largest value, because it cannot split anything. See the sketch below.)
2. If your data is ordinal (e.g. preference ratings 1-2-3-4), how do you compute the Gini? (Sort once and split by less-than-or-equal; again there is no need to include the largest value, because it cannot split anything.)
3. If your data comes from a survey (categorical answers), how do you compute the Gini? (Split by combinations of categories; there is no need to include the combination containing all categories, because it cannot split anything.)
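
To make point 1 concrete, here is a minimal sketch (hypothetical one-feature data; the weighted-Gini formula matches the calculate_the_gini_index function later in this section) that sorts the values, uses the midpoints between neighbors as candidate thresholds, and scores each split:

data = [(1.0, 0), (2.0, 0), (3.0, 1), (4.0, 1)]  # (value, class) -- hypothetical

def gini_of_split(groups, classes=(0, 1)):
    n = sum(len(g) for g in groups)
    gini = 0.0
    for g in groups:
        if not g:
            continue
        score = sum(([c for _, c in g].count(cv) / len(g)) ** 2 for cv in classes)
        gini += (1 - score) * (len(g) / n)
    return gini

values = sorted(v for v, _ in data)
thresholds = [(a + b) / 2 for a, b in zip(values, values[1:])]  # the largest value is never a threshold
for t in thresholds:
    left = [d for d in data if d[0] <= t]
    right = [d for d in data if d[0] > t]
    print("threshold %.2f -> Gini %.3f" % (t, gini_of_split((left, right))))

The 2.5 threshold separates the two classes perfectly and scores a Gini of 0.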

Gini index calculation code

def calculate_the_gini_index(groups, classes):  # pass in the split groups and the class labels
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini of every group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so guard against an empty group
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini

# worst case for two classes
worst_case_for_two_classes = [[[1, 1], [1, 0]], [[1, 1], [1, 0]]]
print(calculate_the_gini_index(worst_case_for_two_classes, [0, 1]))
# best case for two classes
best_case_for_two_classes = [[[1, 0], [1, 0]], [[1, 1], [1, 1]]]
print(calculate_the_gini_index(best_case_for_two_classes, [0, 1]))

Finding the best split

# illustrated with two-class data
# split left/right on an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Gini index function
def calculate_the_gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini of every group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so guard against an empty group
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini

# greedily check every candidate split
# we need to store index, value and groups, so a dict is the convenient container
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))  # deduplicate to get the class labels
    print("classes:", class_values)
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None  # placeholder sentinels
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            print("groups:", groups)
            gini = calculate_the_gini_index(groups, class_values)
            print("X%d < %.3f Gini=%.3f" % ((index + 1), row[index], gini))
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}

dataset = [[2.1, 1.1, 0], [3.4, 2.5, 0], [1.3, 5.8, 0], [1.9, 8.6, 0], [3.7, 6.2, 0],
           [8.8, 1.1, 1], [9.6, 3.4, 1], [10.2, 7.4, 1], [7.7, 8.8, 1], [9.7, 6.9, 1]]
split = get_split(dataset)
print('Split: [X%d < %.3f]' % ((split['index'] + 1), split['value']))


Complete decision tree code

# 1. root node
# 2. recursive split
# 3. terminal node (to avoid over-fitting, limit the depth of the tree and set a minimum split size)
# 4. finish building the tree

def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def calculate_the_gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini of every group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so guard against an empty group
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini

def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = calculate_the_gini_index(groups, class_values)
            print("X%d < %.3f Gini=%.3f" % ((index + 1), row[index], gini))
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}

def determine_the_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# 1. split the data into left and right, then delete the original groups
# 2. check for empty groups and for the limits we set (depth / minimum size)
# 3. keep repeating the root-node search all the way down to the leaves
def split(node, max_depth, min_size, depth):
    # split, then drop the stored groups
    left, right = node['groups']
    del (node['groups'])
    # check for an empty side
    if not left or not right:
        node['left'] = node['right'] = determine_the_terminal(left + right)
        return
    # check whether the maximum depth is exceeded
    if depth >= max_depth:
        node['left'], node['right'] = determine_the_terminal(left), determine_the_terminal(right)
        return
    # minimum-size check, then keep splitting the left side
    if len(left) <= min_size:
        node['left'] = determine_the_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    # minimum-size check, then keep splitting the right side
    if len(right) <= min_size:
        node['right'] = determine_the_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)

# finally, build the decision tree
def build_the_regression_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

# render the tree in the CLI to get an intuitive feel for its structure
def print_our_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % ((depth * '-', (node['index'] + 1), node['value'])))
        print_our_tree(node['left'], depth + 1)
        print_our_tree(node['right'], depth + 1)
    else:
        print('%s[%s]' % ((depth * '-', node)))

def make_prediction(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return make_prediction(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return make_prediction(node['right'], row)
        else:
            return node['right']

dataset = [[2.1, 1.1, 0], [3.4, 2.5, 0], [1.3, 5.8, 0], [1.9, 8.6, 0], [3.7, 6.2, 0],
           [8.8, 1.1, 1], [9.6, 3.4, 1], [10.2, 7.4, 1], [7.7, 8.8, 1], [9.7, 6.9, 1]]
tree = build_the_regression_tree(dataset, 1, 1)
print_our_tree(tree)

decision_tree_stump = {'index': 0, 'right': 1, 'value': 7.7, 'left': 0}
for row in dataset:
    prediction = make_prediction(decision_tree_stump, row)
    print("What is expected data : %d , Your prediction is %d " % (row[-1], prediction))

Naive Bayes

Naive Bayes concept



Prerequisites

Splitting the data by class

def split_our_data_by_class(dataset):  # split the data by class
    splited_data = dict()  # store it in a dict
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in splited_data:
            splited_data[class_value] = list()
        splited_data[class_value].append(vector)
    return splited_data

# create dummy data
dataset = [[0.8, 2.3, 0], [2.1, 1.6, 0], [2.0, 3.6, 0], [9.1, 2.5, 1], [3.1, 2.5, 0],
           [3.8, 4.7, 0], [6.8, 2.7, 1], [6.1, 4.4, 1], [8.6, 0.3, 1], [7.9, 5.3, 1]]
splited = split_our_data_by_class(dataset)  # call the split function
print(splited)

# print row by row
for label in splited:
    print(label)
    for row in splited[label]:
        print(row)

Computing the mean and the standard deviation (n-1)

from math import sqrt

def calculate_the_mean(a_list_of_num):  # mean function
    mean = sum(a_list_of_num) / float(len(a_list_of_num))  # the total divided by the count
    return mean

def calculate_the_standard_deviation(a_list_of_num):  # sample standard deviation
    the_mean = calculate_the_mean(a_list_of_num)  # call the mean function
    # sum of squared deviations from the mean, divided by (count - 1)
    the_variance = sum([(x - the_mean) ** 2 for x in a_list_of_num]) / float(len(a_list_of_num) - 1)
    std = sqrt(the_variance)  # take the square root
    return std
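
Before the pandas check below, the standard library offers a quick cross-check; statistics.stdev uses the same n-1 denominator, while pstdev is the population (n) version:

import statistics

nums = [0.8, 2.1, 2.0, 9.1, 3.1]  # a few hypothetical values
print(calculate_the_standard_deviation(nums))  # our n-1 implementation
print(statistics.stdev(nums))                  # sample std dev, also n-1
print(statistics.pstdev(nums))                 # population version, divides by n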
# verify with pandas
import pandas as pd

df = pd.DataFrame(dataset)
df
df.info()
df.describe()

Next, let's write a pandas-like describe feature in pure Python

# a pandas-like describe in pure Python
def describe_our_data(dataset):
    # (mean, std, count) for every column; zip(*dataset) transposes rows into columns
    description = [(calculate_the_mean(column),
                    calculate_the_standard_deviation(column),
                    len(column)) for column in zip(*dataset)]
    del (description[-1])  # drop the class-label column
    return description

describe_our_data(dataset)

Per-class mean and standard deviation

def describe_our_data_by_class(dataset):  # combine the two steps
    splited_data = split_our_data_by_class(dataset)
    data_description = dict()  # an empty dict to fill
    for class_value, rows in splited_data.items():  # iterate and fill the dict
        data_description[class_value] = describe_our_data(rows)
    return data_description

description = describe_our_data_by_class(dataset)
for label in description:
    print(label)
    for row in description[label]:
        print(row)

description = describe_our_data_by_class(dataset)
type(description)

## walking through the dict-of-tuples-of-lists structure of the split data
print("---- raw description ----")
print(description)
print("-- entries for class 0 --")
print(description[0])
print("-- entries for class 1 --")
print(description[1])
print("---------")
print(description[0][0])
print("--------")
print(description[0][0][2])

Gaussian probability density function


# building the Gaussian probability density function:
# f(x) = (1 / (sqrt(2*pi) * stdev)) * exp(-((x - mean)^2 / (2 * stdev^2)))
from math import exp, sqrt, pi

def calculate_the_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    result = (1 / (sqrt(2 * pi) * stdev)) * exponent
    return result

calculate_the_probability(1.0, 1.0, 1.0)

Complete code

from math import sqrt
from math import pi
from math import exp

def split_our_data_by_class(dataset):
    splited_data = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in splited_data:
            splited_data[class_value] = list()
        splited_data[class_value].append(vector)
    return splited_data

def calculate_the_mean(a_list_of_num):
    mean = sum(a_list_of_num) / float(len(a_list_of_num))
    return mean

def calculate_the_standard_deviation(a_list_of_num):
    the_mean = calculate_the_mean(a_list_of_num)
    the_variance = sum([(x - the_mean) ** 2 for x in a_list_of_num]) / float(len(a_list_of_num) - 1)
    std = sqrt(the_variance)
    return std

def describe_our_data(dataset):
    description = [(calculate_the_mean(column),
                    calculate_the_standard_deviation(column),
                    len(column)) for column in zip(*dataset)]
    del (description[-1])
    return description

def describe_our_data_by_class(dataset):
    splited_data = split_our_data_by_class(dataset)
    data_description = dict()
    for class_value, rows in splited_data.items():
        data_description[class_value] = describe_our_data(rows)
    return data_description

def calculate_the_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    result = (1 / (sqrt(2 * pi) * stdev)) * exponent
    return result

# P(class) times the product of P(x_i | class) over every feature
def calculate_class_probability(description, row):
    total_rows = sum([description[label][0][2] for label in description])
    probabilities = dict()
    for class_value, class_description in description.items():
        probabilities[class_value] = description[class_value][0][2] / float(total_rows)
        for i in range(len(class_description)):
            mean, stdev, count = class_description[i]
            probabilities[class_value] *= calculate_the_probability(row[i], mean, stdev)
    return probabilities

dataset = [[0.8, 2.3, 0], [2.1, 1.6, 0], [2.0, 3.6, 0], [3.1, 2.5, 0], [3.8, 4.7, 0],
           [6.1, 4.4, 1], [8.6, 0.3, 1], [7.9, 5.3, 1], [9.1, 2.5, 1], [6.8, 2.7, 1]]
description = describe_our_data_by_class(dataset)
probability = calculate_class_probability(description, dataset[0])
print(probability)
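
The dict printed above holds the unnormalised scores P(class) * product of P(x_i | class); a small follow-up sketch normalises them into posteriors and picks the most likely class:

total = sum(probability.values())
for class_value, score in probability.items():
    print("P(class=%d | row) = %.4f" % (class_value, score / total))
print("predicted class:", max(probability, key=probability.get))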

Naive Bayes, worked by hand



KNN (K-Nearest Neighbor)

KNN overview


1. k is a hyper-parameter
2. k is best an odd number (no even number, better be odd, so votes cannot tie)
3. the size of k matters:
   if k is too small, outliers gain influence over the decision
   if k is too large, it "dilutes" the influence of the nearby, high-quality, high-weight neighbors on the final decision (see the sketch below)
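
A tiny illustration of point 3 with hypothetical 1-D points (the distance-sort-vote logic mirrors what is built later in this section): with k = 1 a single mislabeled outlier decides the vote, while a larger k averages it away:

from collections import Counter

# class-0 cluster near 1.0, class-1 cluster near 5.0, plus one class-1 outlier at 1.1
train = [(1.0, 0), (1.2, 0), (1.3, 0), (5.0, 1), (5.2, 1), (1.1, 1)]
query = 1.08

by_distance = sorted(train, key=lambda p: abs(p[0] - query))
for k in (1, 3, 5):
    votes = Counter(label for _, label in by_distance[:k])
    print("k=%d -> predicted class %d" % (k, votes.most_common(1)[0][0]))
# k=1 follows the outlier (class 1); k=3 and k=5 recover class 0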

Euclidean distance calculation code

# Euclidean distance
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    # running total
    distance = 0.0
    for i in range(len(row1) - 1):
        # += is a quick way to accumulate a sum
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

# create dummy data
dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
row0 = [1.80, 1.91, 0]
print("take one row as the 'unknown' data:", row0)
# compute its distance to every point in the dataset
for row in dataset:
    distance = calculate_euclidean_distance(row0, row)
    print(distance)

Euclidean distance, worked by hand

from math import sqrt

a = (1.8 - 1.8) ** 2 + (1.91 - 1.91) ** 2
b = (1.8 - 1.85) ** 2 + (1.91 - 2.11) ** 2
c = (1.8 - 2.31) ** 2 + (1.91 - 2.88) ** 2
sqrt(a), sqrt(b), sqrt(c)

KNN approach

The plan:
1. we need an input parameter k
2. we need to sort (and take the first k)
3. store each (row, distance) pair as a tuple
4. pick the k nearest through the sort

Complete KNN code

# Euclidean distance
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    # running total
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

### find the nearest neighbors
def get_our_neighbors(train, test_row, num_of_neighbors):  # training data, test row, k
    distances = list()  # an empty list to store (row, distance) tuples
    for train_row in train:  # walk through every training row
        dist = calculate_euclidean_distance(test_row, train_row)  # distance to the test row
        distances.append((train_row, dist))
    distances.sort(key=lambda every_tuple: every_tuple[1])  # sort by distance
    neighbors = list()
    for i in range(num_of_neighbors):  # loop k times
        neighbors.append(distances[i][0])
    return neighbors

dataset = [[1.80, 1.91, 0], [3.66, 3.12, 0], [1.85, 2.11, 0], [3.54, -3.21, 0], [2.31, 2.88, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
# pass in the dataset, pick one point to compute distances from, k=3
neighbors = get_our_neighbors(dataset, dataset[0], 3)
for neighbor in neighbors:
    print(neighbor)

### predict the class
def predict_the_class(train, test_row, num_of_neighbors):  # training data, test row, k
    neighbors = get_our_neighbors(train, test_row, num_of_neighbors)  # find the nearest neighbors
    the_class_values = [row[-1] for row in neighbors]  # the neighbors' class labels
    prediction = max(set(the_class_values), key=the_class_values.count)  # majority vote via max
    return prediction

prediction = predict_the_class(dataset, dataset[0], 3)
print('actual class [%d]' % (dataset[0][-1]))
print('predicted class [%d]' % (prediction))

### predict the class, version 2 (average of the neighbors' labels)
def predict_the_class_V2(train, test_row, num_of_neighbors):
    neighbors = get_our_neighbors(train, test_row, num_of_neighbors)
    the_class_values = [row[-1] for row in neighbors]
    prediction = sum(the_class_values) / float(len(the_class_values))
    return prediction

prediction = predict_the_class_V2(dataset, dataset[0], 3)
print('Our expectation(the real class) is class [%d]' % (dataset[0][-1]))
print('Our prediction(the predicted class) is class [%d]' % (prediction))

Learning Vector Quantization (LVQ)

LVQ overview

LVQ is typically used for classification problems.

A codebook vector is a list of numbers with the same input and output features as a row of your training data.

example (a training row):
1. class 0,1
2. width
3. height
4. length

codebook vector (neuron):
1. class 0,1
2. width
3. height
4. length

LVQ and KNN

We search through the codebook vectors, judging by Euclidean distance, to find the BMU (Best Matching Unit).

1. Select an initial set of codebook vectors.
2. Competition: when the codebook vector's class agrees with the training instance (training pattern), the codebook vector moves toward the instance; otherwise it moves away.
3. The learning_rate controls the size of the move (see the sketch below):

x = x + learning_rate * (t - x)

4. Learn from every instance, decaying the rate each epoch:

learning_rate = alpha (the initial learning rate) * (1 - (epoch / max_epoch))
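
A tiny numeric sketch of these two formulas (hypothetical values; this is the matching-class case, where x moves toward the target t — the full implementation below flips the sign when the classes differ):

alpha = 0.3        # initial learning rate
max_epoch = 10

x = 2.0            # one feature of the BMU codebook vector
t = 3.0            # the matching feature of the training instance

for epoch in range(3):
    learning_rate = alpha * (1 - epoch / max_epoch)  # decaying rate
    x = x + learning_rate * (t - x)                  # move toward t
    print("epoch %d: lr=%.3f, x=%.4f" % (epoch, learning_rate, x))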

Computing the Euclidean distance between two vectors:

# compute the Euclidean distance between two vectors:
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
# test
row0 = dataset[0]  # use the point [1.80, 1.91, 0] and compute its distance to every row
for row in dataset:
    distance = calculate_euclidean_distance(row0, row)
    print(distance)

Best Matching Unit

1. compute the distances (codebook vectors vs. the new input)
2. call calculate_euclidean_distance
3. sort (mind the data types)
4. pick the BMU

# Euclidean distance function
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

# define the training set
dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]

def calculate_BMU(codebooks, test_row):  # codebook vectors vs. the new input
    distances = list()
    for codebook in codebooks:
        dist = calculate_euclidean_distance(codebook, test_row)  # distance between codebook and input
        distances.append((codebook, dist))  # append (codebook, distance) tuples
    distances.sort(key=lambda every_tuple: every_tuple[1])  # sort with a lambda key
    print(distances)
    return distances[0][0]  # slice out the first entry: the codebook closest to the new input

# test
test_row = [1.60, 1.81, 0]  # an arbitrary row to test with
bmu = calculate_BMU(dataset, test_row)  # pass in the dataset and the test row
print("nearest:", bmu)  # the nearest row turns out to be [1.8, 1.91, 0]


Codebook vector training approach

Training our codebook vectors:

1. initialize (random features)
2. in every epoch, update (learn) the codebook vectors from the training patterns
3. within each training pattern, update every pattern feature: move the codebook vector closer when its class matches, farther when it does not

from random import randrange  # import the random helper

# randomly generate a codebook
def make_random_codebook(train):  # pass in the training data
    n_index = len(train)  # the number of rows
    n_features = len(train[0])  # the length of one row, e.g. [1.80, 1.91, 0] = 3
    # draw each feature from a randomly chosen row, using randrange
    codebook = [train[randrange(n_index)][i] for i in range(n_features)]
    return codebook

dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
m_r_c = make_random_codebook(dataset)
m_r_c

Complete LVQ code

# compute the Euclidean distance between two vectors:
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

def calculate_BMU(codebooks, test_row):  # codebook vectors vs. the new input
    distances = list()
    for codebook in codebooks:
        dist = calculate_euclidean_distance(codebook, test_row)
        distances.append((codebook, dist))
    distances.sort(key=lambda every_tuple: every_tuple[1])
    return distances[0][0]

from random import randrange

# randomly generate a codebook
def make_random_codebook(train):
    n_index = len(train)
    n_features = len(train[0])
    codebook = [train[randrange(n_index)][i] for i in range(n_features)]
    return codebook

# codebook vector competition
def train_codebooks(train, n_codebooks, learn_rate, epochs):
    codebooks = [make_random_codebook(train) for i in range(n_codebooks)]  # random initial codebooks
    for epoch in range(epochs):
        rate = learn_rate * (1 - (epoch / float(epochs)))  # decay the learning rate
        sum_error = 0.0  # track the error
        for row in train:
            bmu = calculate_BMU(codebooks, row)
            for i in range(len(row) - 1):
                error = row[i] - bmu[i]
                sum_error += error ** 2
                if bmu[-1] == row[-1]:  # classes match: move the BMU toward the row
                    bmu[i] += rate * error
                else:  # classes differ: move it away
                    bmu[i] -= rate * error
        print('epoch [%d], learning rate: [%.3f], total error: [%.3f]' % (epoch, rate, sum_error))
    return codebooks

dataset = [[1.80, 1.91, 0], [1.85, 2.11, 0], [2.31, 2.88, 0], [3.54, -3.21, 0], [3.66, 3.12, 0],
           [5.52, 2.13, 1], [6.32, 1.46, 1], [7.35, 2.34, 1], [7.78, 3.26, 1], [8.43, -0.34, 1]]
learning_rate = 0.3  # learning rate
n_epoch = 10  # number of epochs
n_codebooks = 2  # how many codebooks to train
codebooks = train_codebooks(dataset, n_codebooks, learning_rate, n_epoch)
print('Our codebook is : %s' % codebooks)
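
The trained codebooks can then act as a tiny nearest-prototype classifier; a sketch reusing calculate_BMU, where the last slot of the winning codebook is the predicted class:

for row in dataset:
    bmu = calculate_BMU(codebooks, row)
    print("actual %d, predicted %d" % (row[-1], int(bmu[-1])))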

Artificial Neural Networks and Backpropagation

Overview





Activation functions


A rectifier is simply an activation function that mimics the activation mechanism of biological neurons.

Common activation functions:
https://en.wikipedia.org/wiki/Rectifier_(neural_networks)#Gaussian_Error_Linear_Unit_(GELU)

import math

# softplus
softplus = math.log(1 + math.exp(2.14))
print(softplus)

# sigmoid
sigmoid = math.exp(2.14) / (1 + math.exp(2.14))
print(sigmoid)

# ReLU
ReLU = max(2.14, 0)
print(ReLU)

Forward propagation

Forward propagation:
1. neuron activation (weights and bias)
2. neuron transfer (apply the activation function)
3. forward-propagate (calculate the output)

from random import seed
from random import random

# initialize our neural network: random weights plus a bias
def initialize_our_neural_network(n_inputs, n_hidden, n_outputs):  # number of inputs, hidden neurons, outputs
    neural_network = list()  # store the layers in a list
    # each hidden neuron gets n_inputs + 1 weights (the extra one is the bias)
    hidden_layer = [{'weights': [random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    neural_network.append(hidden_layer)
    # each output neuron gets n_hidden + 1 weights
    output_layer = [{'weights': [random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    neural_network.append(output_layer)
    return neural_network

seed(1)
network = initialize_our_neural_network(2, 1, 2)  # 2 inputs, 1 hidden neuron, 2 outputs
for layer in network:
    print(layer)

# build the neuron activation function
def neuron_activation(weights, inputs):  # pass in the weights and the inputs
    activation = weights[-1]  # the last weight acts as the bias
    for i in range(len(weights) - 1):  # loop over the remaining weights
        activation += weights[i] * inputs[i]  # weighted sum of the inputs
    return activation

# create a sigmoid transfer function
from math import exp

def neuron_transfer(activation):
    result = 1.0 / (1.0 + exp(-activation))
    return result

# forward propagation
def forward_propagate(network, row):  # pass in the network and one data row
    inputs = row  # the inputs start as the raw row
    for layer in network:
        new_inputs = []
        for neuron in layer:
            activation = neuron_activation(neuron['weights'], inputs)  # weighted sum plus bias
            neuron['output'] = neuron_transfer(activation)  # squash through the sigmoid
            new_inputs.append(neuron['output'])
        inputs = new_inputs  # this layer's outputs feed the next layer
    return inputs

# test
network = [[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}],
           [{'weights': [0.2550690257394217, 0.49543508709194095]},
            {'weights': [0.4494910647887381, 0.651592972722763]}]]
row = [1, 0]
output = forward_propagate(network, row)
print("output:", output)



Calculation process

Error backpropagation code

def neuron_transfer_derivative(output):
    # derivative of the sigmoid, expressed in terms of its output
    derivative = output * (1 - output)
    return derivative

def backward_propagate_error(network, expected):
    for i in reversed(range(len(network))):  # walk the layers back to front
        layer = network[i]
        errors = list()
        if i != len(network) - 1:  # hidden layer: error comes from the layer after it
            for j in range(len(layer)):
                error = 0.0
                for neuron in network[i + 1]:
                    error += (neuron['weights'][j] * neuron['delta'])
                errors.append(error)
        else:  # output layer: error is expected minus actual output
            for j in range(len(layer)):
                neuron = layer[j]
                errors.append(expected[j] - neuron['output'])
        for j in range(len(layer)):
            neuron = layer[j]
            neuron['delta'] = errors[j] * neuron_transfer_derivative(neuron['output'])

## test
network = [[{'output': 0.71, 'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}],
           [{'output': 0.62, 'weights': [0.2550690257394217, 0.49543508709194095]},
            {'output': 0.65, 'weights': [0.4494910647887381, 0.651592972722763]}]]
expected = [0, 1]
backward_propagate_error(network, expected)
for layer in network:
    print(layer)


Calculation process, worked by hand

Weight update rule

weight = weight + learning_rate * error * input

def update_weights(network, row, learning_rate):
    for i in range(len(network)):
        inputs = row[:-1]
        # after the first layer, the inputs are the previous (hidden) layer's outputs
        if i != 0:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in network[i]:
            for j in range(len(inputs)):
                neuron['weights'][j] += learning_rate * neuron['delta'] * inputs[j]
            neuron['weights'][-1] += learning_rate * neuron['delta']  # the bias weight

Encoding schemes for preprocessing

Background:
When we preprocess categorical data, there are generally two encoding schemes (see the sketch below):

1. Integer Encoding

Blue   Red   Green   Yellow
 1      2      3       4

2. One-Hot Encoding (dummy variables)

Blue   Red   Green   Yellow
 1      0      0       0
 0      1      0       0
 0      0      1       0
 0      0      0       1
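
This is exactly what the training loop below does when it builds expected: the integer class label becomes a one-hot vector. A minimal pure-Python sketch with hypothetical labels:

labels = ["Blue", "Red", "Green", "Yellow"]

# integer encoding: each label gets an index
to_int = {label: i for i, label in enumerate(labels)}
print(to_int)

# one-hot encoding: a 0/1 vector with a single 1 at that index
def one_hot(label):
    vec = [0] * len(labels)
    vec[to_int[label]] = 1
    return vec

for label in labels:
    print(label, one_hot(label))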

Training

def train_our_network(network, train, learning_rate, n_epoch, n_output):
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            expected = [0 for i in range(n_output)]
            expected[row[-1]] = 1  # one-hot encode the class label
            sum_error += sum([(expected[i] - outputs[i]) ** 2 for i in range(len(expected))])
            backward_propagate_error(network, expected)
            update_weights(network, row, learning_rate)
        print("epoch [%d], learning rate: [%.3f], error: [%.3f]" % (epoch, learning_rate, sum_error))

seed(1)
dataset = [[2.1, 2.8, 0], [1.3, 2.7, 0], [1.2, 5.2, 0], [3.3, 2.8, 0], [1.2, 1.1, 0],
           [6.2, 5.8, 1], [8.3, 3.7, 1], [6.2, 2.7, 1], [7.3, 3.4, 1], [9.2, 2.1, 1]]
n_inputs = len(dataset[0]) - 1
n_outputs = len(set(row[-1] for row in dataset))
network = initialize_our_neural_network(n_inputs, 2, n_outputs)
train_our_network(network, dataset, 0.1, 1000, n_outputs)
for layer in network:
    print(layer)

Predicting the class

def predict(network, row):
    outputs = forward_propagate(network, row)
    return outputs.index(max(outputs))  # the class with the highest output

network = [[{'weights': [-1.1866404384956928, 0.3006679439138231, 3.9117172696404685]},
            {'weights': [1.2235819285217509, -0.3158686384608762, -4.03117701360861]}],
           [{'weights': [3.933774188003338, -3.959801574023486, 0.4499036554334044]},
            {'weights': [-3.678895659767694, 4.256650475104409, -0.7298649815974946]}]]
for row in dataset:
    prediction = predict(network, row)
    print("Our expected class value is [%d], Our prediction of class value is [%d]" % (row[-1], prediction))

Ensemble methods: bagging

Combine several models so that predictions become more accurate and more stable (by averaging).
The hyper-parameters in a random forest:

1. For each tree, how many features to sample: with n features in total, you must choose a number m of them to draw as candidates
2. The minimum size of each node (the smallest leaf allowed in each tree)
3. The maximum depth of one tree
4. How many trees the forest contains

from random import randrange

def subsample(dataset, ratio=1.0):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))  # sample with replacement
        sample.append(dataset[index])
    return sample

from random import seed
from random import randrange
from random import random

def subsample(dataset, ratio=1.0):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

def mean(numbers):
    result = sum(numbers) / float(len(numbers))
    return result

seed(1)
dataset = [[randrange(10)] for i in range(20)]
# print(dataset)
ratio = 0.10
for size in [1, 10, 100, 1000, 10000, 100000, 1000000]:
    sample_means = list()
    for i in range(size):
        sample = subsample(dataset, ratio)
        sample_mean = mean([row[0] for row in sample])
        sample_means.append(sample_mean)
    print("When sample is [%d],the estimated mean is [%.3f]" % (size, mean(sample_means)))
print("The real mean of our dataset is [%.3f]" % mean([row[0] for row in dataset]))

from random import seed
from random import randrange
from random import random
from csv import reader

# 1. load our data
def load_csv(filename):  # pass in the file name
    dataset = list()  # an empty list to fill row by row
    with open(filename, 'r') as file:  # read the file with a context manager
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:  # skip empty rows
                continue
            dataset.append(row)  # append each row to the dataset
    return dataset

# read test
dataset = load_csv('sonar.all-data.csv')
print(dataset)

# 2. datatype conversion
def str_to_float(dataset, column):  # convert string columns to floats
    for row in dataset:  # convert row by row
        row[column] = float(row[column].strip())  # parse the number, stripping whitespace

def str_to_int(dataset, column):  # convert the class-label strings to integers
    class_value = [row[column] for row in dataset]
    unique = set(class_value)  # deduplicate
    look_up = dict()  # an empty dict to fill
    for i, value in enumerate(unique):  # map each label to an index
        look_up[value] = i
    for row in dataset:
        row[column] = look_up[row[column]]
    return look_up
# 3. k-fold cross-validation data split
def cross_validation_split(dataset, n_folds):  # pass in the data and the number of folds
    dataset_split = list()  # receives the finished folds
    dataset_copy = list(dataset)  # work on a copy of the data
    fold_size = int(len(dataset) / n_folds)  # each fold holds total length / n_folds rows
    for i in range(n_folds):
        fold = list()  # an empty list for this fold
        while len(fold) < fold_size:  # keep drawing until the fold is full
            index = randrange(len(dataset_copy))  # pick a random remaining row
            fold.append(dataset_copy.pop(index))  # move it from the copy into the fold
        dataset_split.append(fold)
    return dataset_split
# 4. calculate model accuracy
def calculate_accuracy(actual, predicted):  # pass in the actual and predicted labels
    correct = 0  # counts the matches
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0  # matches / total, as a percentage
# e.g. 90 correct rows out of 100 -> 90/100 = 0.9 -> 90% accuracy
# 5. how good is our algo: evaluate the algorithm on held-out folds
def evaluate_our_algo(dataset, algo, n_folds, *args):  # the data, an algorithm, the fold count, extra args
    folds = cross_validation_split(dataset, n_folds)  # call the data-split function
    scores = list()
    for fold in folds:  # hold out one fold at a time
        train_set = list(folds)
        train_set.remove(fold)  # remove the held-out fold from the training folds
        train_set = sum(train_set, [])  # flatten the remaining folds into one list
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None  # hide the label (the answer)
        predicted = algo(train_set, test_set, *args)
        actual = [row[-1] for row in fold]  # the true labels
        accuracy = calculate_accuracy(actual, predicted)  # score this fold
        scores.append(accuracy)
    return scores
# 6. left and right split
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:  # route every row to one side
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
# 7. calculate gini index
def gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini of every group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so guard against an empty group
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini
# 8. calculate the best split, updating whenever a smaller Gini is found
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}
# 9. to terminal: the majority class of a group
def determine_the_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
# 10.
# 1. split our data into left and right
# 2. delete the original data
# 3. check whether the data is empty / max depth is reached / min size is hit
# 4. make terminals
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del (node['groups'])
    if not left or not right:
        node['left'] = node['right'] = determine_the_terminal(left + right)
        return
    if depth >= max_depth:
        node['left'], node['right'] = determine_the_terminal(left), determine_the_terminal(right)
        return
    if len(left) <= min_size:
        node['left'] = determine_the_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    if len(right) <= min_size:
        node['right'] = determine_the_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)
# 11. make our decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root
# 12. make prediction
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
# 13. subsample
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

# 14. make a prediction using bagging: majority vote over the trees
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)
# 15. bagging
def bagging(train, test, max_depth, min_size, sample_size, n_trees):
    trees = list()
    for i in range(n_trees):
        sample = subsample(train, sample_size)
        tree = build_tree(sample, max_depth, min_size)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions

seed(1)
dataset = load_csv('sonar.all-data.csv')
for i in range(len(dataset[0]) - 1):
    str_to_float(dataset, i)
str_to_int(dataset, len(dataset[0]) - 1)
n_folds = 5
max_depth = 6
min_size = 2
sample_size = 0.5
for n_trees in [1, 5, 10, 50]:
    scores = evaluate_our_algo(dataset, bagging, n_folds, max_depth, min_size, sample_size, n_trees)
    print('We are using [%d]' % n_trees)
    print('The scores are : [%s]' % scores)
    print('The mean accuracy is [%.3f]' % (sum(scores) / float(len(scores))))

To be continued >>>
