```python
# Mean
def calculate_mean(a_list_of_values):
    mean = sum(a_list_of_values) / float(len(a_list_of_values))
    return mean

# Variance
def calculate_variance(a_list_of_values, mean):
    variance_sum = sum((x - mean) ** 2 for x in a_list_of_values)
    variance = variance_sum / (len(a_list_of_values) - 1)
    return variance

# Covariance
def calculate_covariance(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys):
    cov_sum = 0
    for i in range(len(a_list_of_Xs)):
        cov_sum += (a_list_of_Xs[i] - the_mean_of_Xs) * (a_list_of_Ys[i] - the_mean_of_Ys)
    the_covariance = cov_sum / (len(a_list_of_Xs) - 1)
    return the_covariance

# Standard deviation
def calculate_the_standard_deviation(a_list_values):
    the_mean_of_the_list_values = sum(a_list_values) / float(len(a_list_values))
    variance = sum([(a_list_values[i] - the_mean_of_the_list_values) ** 2
                    for i in range(len(a_list_values))]) / float(len(a_list_values) - 1)
    return variance ** 0.5

# Correlation
def calculate_the_correlation(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys):
    X_std = calculate_the_standard_deviation(a_list_of_Xs)
    Y_std = calculate_the_standard_deviation(a_list_of_Ys)
    X_Y_Cov = calculate_covariance(a_list_of_Xs, the_mean_of_Xs, a_list_of_Ys, the_mean_of_Ys)
    Corr = X_Y_Cov / (X_std * Y_std)
    return Corr

# Coefficients
def calculate_the_coefficients(dataset):
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]
    x_mean, y_mean = calculate_mean(x), calculate_mean(y)
    b1 = calculate_covariance(x, x_mean, y, y_mean) / calculate_variance(x, x_mean)
    b0 = y_mean - b1 * x_mean
    return [b0, b1]

experience = [1, 2, 3, 4, 5]
salary = [100, 200, 300, 400, 500]
list_of_tuples = list(zip(experience, salary))
print("list_of_tuples:", list_of_tuples)
list_of_lists = [list(elem) for elem in list_of_tuples]
print("list_of_lists:", list_of_lists)
b0, b1 = calculate_the_coefficients(list_of_lists)
print("b0,b1:", b0, b1)

# Prediction function
def simple_linear_regression(training_data, testing_data):
    predictions = []
    b0, b1 = calculate_the_coefficients(training_data)
    for row in testing_data:
        y = b0 + b1 * row[0]
        predictions.append(y)
    return predictions

# Root mean squared error
from math import sqrt

def calculate_the_RMSE(predicted_data, actual_data):
    the_sum_of_error = 0
    for i in range(len(actual_data)):
        prediction_error = predicted_data[i] - actual_data[i]
        the_sum_of_error += prediction_error ** 2
    RMSE = sqrt(the_sum_of_error / float(len(actual_data)))
    return RMSE

# Strip out the y column so only x is fed into the model
data_to_be_put_into_the_model = []
for row in list_of_lists:
    row_copy = list(row)
    row_copy[-1] = None
    data_to_be_put_into_the_model.append(row_copy)
print(data_to_be_put_into_the_model)

# Predict y with the prediction function
predictions = simple_linear_regression(list_of_lists, data_to_be_put_into_the_model)
print(predictions)

# Predict on new data
Y = [[6], [7], [8], [9], [10]]
predictions = simple_linear_regression(list_of_lists, Y)
print("Y:", predictions)

# Evaluate the model with RMSE
def how_good_is_our_model(dataset, some_model_to_be_evaluated):
    test_data = []
    for row in dataset:
        row_copy = list(row)
        row_copy[-1] = None
        test_data.append(row_copy)
    predict_data = some_model_to_be_evaluated(dataset, test_data)
    print("Predictions:", predict_data)
    actual_data = [row[-1] for row in dataset]
    print("Actual:", actual_data)
    RMSE = calculate_the_RMSE(predict_data, actual_data)
    return RMSE

result = how_good_is_our_model(list_of_lists, simple_linear_regression)
print(result)
```
```python
# make_prediction: plug the coefficients into the prediction equation
def make_prediction(input_row, coefficients):
    out_put_y_hat = coefficients[0]
    for i in range(len(input_row) - 1):
        out_put_y_hat += coefficients[i + 1] * input_row[i]
    return out_put_y_hat

test_dataset = [[1, 1.5], [2, 2.5], [3, 3.5], [4, 4.5], [5, 5.5]]
test_coefficients = [0.4, 0.8]
for row in test_dataset:
    y_hat = make_prediction(row, test_coefficients)
    print("actual y = %.3f, predicted = %.3f" % (row[-1], y_hat))
```
```python
def make_prediction(input_row, coefficients):
    out_put_y_hat = coefficients[0]
    for i in range(len(input_row) - 1):
        out_put_y_hat += coefficients[i + 1] * input_row[i]
    return out_put_y_hat

# Train coefficients with stochastic gradient descent
def using_sgd_method_to_calculate_coefficients(training_dataset, learning_rate, n_times_epoch):
    # training data, learning rate, number of epochs
    coefficients = [1 for i in range(len(training_dataset[0]))]  # initial coefficients
    print("coefficients:", coefficients)
    for epoch in range(n_times_epoch):
        print("epoch", epoch)
        the_sum_of_error = 0  # start counting from zero
        for row in training_dataset:
            y_hat = make_prediction(row, coefficients)
            print("row:", row, "coefficients:", coefficients)
            print("prediction:", y_hat)
            error = y_hat - row[-1]
            print("error:", error, "= prediction", y_hat, "- actual", row[-1])
            the_sum_of_error += error ** 2
            print("sum of squared errors:", the_sum_of_error)
            coefficients[0] = coefficients[0] - learning_rate * error
            print("new b0:", coefficients[0], "= old b0 -", learning_rate, "* error", error)
            for i in range(len(row) - 1):
                coefficients[i + 1] = coefficients[i + 1] - learning_rate * error * row[i]
                print("new b%d:" % (i + 1), coefficients[i + 1],
                      "= old value -", learning_rate, "* error", error, "*", row[i])
        print("epoch [%d], learning rate [%.3f], error [%.3f]" % (epoch, learning_rate, the_sum_of_error))
    return coefficients

your_training_dataset = [[1, 1.5], [2, 2.5], [3, 3.5], [4, 4.5], [5, 5.5]]
your_model_learning_rate = 0.1
your_n_epoch = 43
your_coefficients = using_sgd_method_to_calculate_coefficients(your_training_dataset,
                                                               your_model_learning_rate,
                                                               your_n_epoch)
print("-" * 50)
print("b0, b1:", your_coefficients)
```
```python
from math import exp

print("Data: [x1 = 2, x2 = 2, class = 0]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15 + 1.48 * 2
y2 = -1.15 + 1.48 * 2 + -2.30 * 2
y = 1 / (1 + exp(-y2))
yy = round(y)
print(y1, y2, y, "predicted class:", yy)

print("Data: [x1 = 2, x2 = 4, class = 0]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15 + 1.48 * 2
y2 = -1.15 + 1.48 * 2 + -2.30 * 4
y = 1 / (1 + exp(-y2))
yy = round(y)
print(y1, y2, y, "predicted class:", yy)

print("Data: [x1 = 10, x2 = 4, class = 1]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15 + 1.48 * 10
y2 = -1.15 + 1.48 * 10 + -2.30 * 4
y = 1 / (1 + exp(-y2))
yy = round(y)
print(y1, y2, y, "predicted class:", yy)

print("Data: [x1 = 8.5, x2 = 3.5, class = 1]")
print("b0 = -1.15, b1 = 1.48, b2 = -2.30")
y1 = -1.15 + 1.48 * 8.5
y2 = -1.15 + 1.48 * 8.5 + -2.30 * 3.5
y = 1 / (1 + exp(-y2))
yy = round(y)
print(y1, y2, y, "predicted class:", yy)
```
```python
# Prediction function
from math import exp

def prediction(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
    return 1 / (1 + exp(-yhat))

# Test
dataset = [[2, 2, 0], [2, 4, 0], [3, 3, 0], [4, 5, 0],
           [8, 1, 1], [8.5, 3.5, 1], [9, 1, 1], [10, 4, 1]]
coef = [-1.15, 1.48, -2.30]  # an arbitrary set of coefficients

# Verbose version of the prediction function that prints each step
def prediction(row, coefficients):
    yhat = coefficients[0]
    print("yhat:", yhat)
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
        print("yhat:", yhat, "+=", "coefficient:", coefficients[i + 1], "*", "row[i]:", row[i])
    print("1/(1+", exp(-yhat), ") =", 1 / (1 + exp(-yhat)))
    return 1 / (1 + exp(-yhat))

for row in dataset:
    print("row:", row)
    yhat = prediction(row, coef)
    print("actual class %.3f, predicted class %.3f ≈ [%d]" % (row[-1], yhat, round(yhat)))
```
```python
from math import exp

def prediction(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
    return 1 / (1 + exp(-yhat))

# Train logistic-regression coefficients with stochastic gradient descent
def using_sgd_method_to_calculate_coefficients(training_dataset, learning_rate, n_times_epoch):
    coefficients = [0.5 for i in range(len(training_dataset[0]))]
    print("coefficients:", coefficients)
    for epoch in range(n_times_epoch):
        print("epoch", epoch)
        the_sum_of_error = 0
        for row in training_dataset:
            y_hat = prediction(row, coefficients)
            print("prediction:", y_hat, "row:", row, "coefficients:", coefficients)
            error = row[-1] - y_hat
            print("error:", error, "= actual", row[-1], "- prediction", y_hat)
            the_sum_of_error += error ** 2
            coefficients[0] = coefficients[0] + learning_rate * error * y_hat * (1.0 - y_hat)
            print("new b0:", coefficients[0],
                  "= old b0 +", learning_rate, "* error", error, "* y_hat", y_hat, "* (1 - y_hat)", (1.0 - y_hat))
            print("-" * 50)
            for i in range(len(row) - 1):
                coefficients[i + 1] = coefficients[i + 1] + learning_rate * error * y_hat * (1.0 - y_hat) * row[i]
                print("new b%d:" % (i + 1), coefficients[i + 1],
                      "= old value +", learning_rate, "* error", error,
                      "* y_hat", y_hat, "* (1 - y_hat)", (1.0 - y_hat), "*", row[i])
            print("*" * 50)
        print("epoch [%d], learning rate [%.3f], error [%.3f]" % (epoch, learning_rate, the_sum_of_error))
    return coefficients

dataset = [[2, 2, 0], [2, 4, 0], [3, 3, 0], [4, 5, 0],
           [8, 1, 1], [8.5, 3.5, 1], [9, 1, 1], [10, 4, 1]]
learning_rate = 0.1
n_times_epoch = 1000
coef = using_sgd_method_to_calculate_coefficients(dataset, learning_rate, n_times_epoch)
print(coef)
```
Note: one pass from the first row to the last row is one epoch; repeating epoch after epoch drives the error lower and lower.
```python
def predict(row, weights):  # pass in one data row and the weights
    activation = weights[0]
    for i in range(len(row) - 1):
        activation += weights[i + 1] * row[i]
    return 1.0 if activation >= 0.0 else 0.0

dataset = [[2.78, 2.55, 0],
           [1.47, 2.36, 0],
           [1.39, 1.85, 0],
           [3.06, 3.01, 0],
           [7.63, 2.76, 0],
           [5.33, 2.09, 1],
           [6.93, 1.76, 1],
           [8.76, -0.77, 1],
           [7.66, 2.46, 1]]

# Weights obtained with gradient descent
weights = [2.0000000000000004, 0.5930000000000017, -2.460999999999983]

# Predict
for row in dataset:
    prediction = predict(row, weights)
    print("actual: %d, predicted: %d" % (row[-1], prediction))
```
```python
def predict(row, weights):
    activation = weights[0]
    for i in range(len(row) - 1):
        activation += weights[i + 1] * row[i]
    return 1.0 if activation >= 0.0 else 0.0

def opt_weights(train, learning_rate, how_many_epoch):
    weights = [0.5 for i in range(len(train[0]))]
    for epoch in range(how_many_epoch):
        sum_error = 0.0
        for row in train:
            prediction = predict(row, weights)
            error = row[-1] - prediction
            sum_error += error ** 2
            weights[0] = weights[0] + learning_rate * error
            for i in range(len(row) - 1):
                weights[i + 1] = weights[i + 1] + learning_rate * error * row[i]
        print('This is epoch: %d, our learning_rate is : %.4f, the error is : %.4f'
              % (epoch, learning_rate, sum_error))
    return weights

dataset = [[2.78, 2.55, 0],
           [1.47, 2.36, 0],
           [1.39, 1.85, 0],
           [3.06, 3.01, 0],
           [7.63, 2.76, 0],
           [5.33, 2.09, 1],
           [6.93, 1.76, 1],
           [8.76, -0.77, 1],
           [7.66, 2.46, 1]]
learning_rate = 0.1
how_many_epoch = 100
weights = opt_weights(dataset, learning_rate, how_many_epoch)
print(weights)
```
Quiz
1. If your data is numeric, how do you compute the Gini index? (Rank the values, compute the averages of adjacent values, and split with "<="; there is no need to include the largest value, because it cannot split the data. See the sketch below.)
2. If your data is ordinal (e.g. a preference score of 1-4), how do you compute the Gini index? (Rank the values and split with "<="; again the largest value does not need to be included, because it cannot split the data.)
3. If your data comes from a survey (categorical answers), how do you compute the Gini index? (Split by combinations of categories; the combination that contains every category does not need to be included, because it cannot split the data.)
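To make question 1 concrete, here is a minimal, self-contained sketch (the data and helper names are made up for illustration) that ranks the unique values of a numeric column, builds a "<= threshold" split for every value except the largest, and scores each split with a weighted Gini index:

```python
def gini_of_groups(groups, classes):
    # weighted Gini index over a list of groups of (value, label) rows
    n = float(sum(len(g) for g in groups))
    gini = 0.0
    for g in groups:
        if not g:
            continue
        score = sum(([label for _, label in g].count(c) / float(len(g))) ** 2 for c in classes)
        gini += (1.0 - score) * (len(g) / n)
    return gini

# toy numeric column with class labels (hypothetical data)
rows = [(1.2, 0), (2.5, 0), (3.1, 0), (6.8, 1), (7.9, 1)]
classes = [0, 1]

# rank the values; the largest is skipped because "x <= max" puts everything into one group
for threshold in sorted({v for v, _ in rows})[:-1]:
    left = [r for r in rows if r[0] <= threshold]
    right = [r for r in rows if r[0] > threshold]
    print("x <= %.1f  Gini = %.3f" % (threshold, gini_of_groups([left, right], classes)))
```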
```python
def calculate_the_gini_index(groups, classes):  # the groups of rows and the list of class values
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini contribution of each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so skip empty groups
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini
```
```python
# Worst case for two classes (each group is a 50/50 mix)
worst_case_for_two_classes = [[[1, 1], [1, 0]],
                              [[1, 1], [1, 0]]]
print(calculate_the_gini_index(worst_case_for_two_classes, [0, 1]))

# Best case for two classes (each group is pure)
best_case_for_two_classes = [[[1, 0], [1, 0]],
                             [[1, 1], [1, 1]]]
print(calculate_the_gini_index(best_case_for_two_classes, [0, 1]))
```
```python
# Example with binary-class data
# Split the rows into left and right by an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
```
```python
# Gini index
def calculate_the_gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini contribution of each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so skip empty groups
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini
```
```python
# Greedy search over every (attribute, value) combination
# We need to store index, value and groups, so a dict is the convenient container
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))  # unique class values
    print("classes:", class_values)
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None  # large placeholders
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            print("groups:", groups)
            gini = calculate_the_gini_index(groups, class_values)
            print("X%d < %.3f Gini=%.3f" % ((index + 1), row[index], gini))
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}

dataset = [[2.1, 1.1, 0],
           [3.4, 2.5, 0],
           [1.3, 5.8, 0],
           [1.9, 8.6, 0],
           [3.7, 6.2, 0],
           [8.8, 1.1, 1],
           [9.6, 3.4, 1],
           [10.2, 7.4, 1],
           [7.7, 8.8, 1],
           [9.7, 6.9, 1]]
split = get_split(dataset)
print('Split: [X%d < %.3f]' % ((split['index'] + 1), split['value']))
```
```python
# 1. root node
# 2. recursive split
# 3. terminal node (to avoid over-fitting, limit the depth of the tree and require a minimum split size)
# 4. finish building the tree

def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def calculate_the_gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini contribution of each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so skip empty groups
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        gini += (1 - score) * (size / n_instances)
    return gini

def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = calculate_the_gini_index(groups, class_values)
            print("X%d < %.3f Gini=%.3f" % ((index + 1), row[index], gini))
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}

def determine_the_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# 1. split the data into left and right, and delete the original groups
# 2. check for empty groups and for the stopping conditions (max depth / min size)
# 3. keep repeating the root-node search all the way down to the leaves
def split(node, max_depth, min_size, depth):
    # split, then delete the stored groups
    left, right = node['groups']
    del (node['groups'])
    # check for an empty side
    if not left or not right:
        node['left'] = node['right'] = determine_the_terminal(left + right)
        return
    # check whether the maximum depth has been reached
    if depth >= max_depth:
        node['left'], node['right'] = determine_the_terminal(left), determine_the_terminal(right)
        return
    # minimum-size check, then keep splitting the left side
    if len(left) <= min_size:
        node['left'] = determine_the_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    # minimum-size check, then keep splitting the right side
    if len(right) <= min_size:
        node['right'] = determine_the_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)

# Build the whole decision tree
def build_the_regression_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

# Print the tree as indented text so the structure is easy to grasp
def print_our_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % ((depth * '-', (node['index'] + 1), node['value'])))
        print_our_tree(node['left'], depth + 1)
        print_our_tree(node['right'], depth + 1)
    else:
        print('%s[%s]' % ((depth * '-', node)))

def make_prediction(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return make_prediction(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return make_prediction(node['right'], row)
        else:
            return node['right']

dataset = [[2.1, 1.1, 0],
           [3.4, 2.5, 0],
           [1.3, 5.8, 0],
           [1.9, 8.6, 0],
           [3.7, 6.2, 0],
           [8.8, 1.1, 1],
           [9.6, 3.4, 1],
           [10.2, 7.4, 1],
           [7.7, 8.8, 1],
           [9.7, 6.9, 1]]
tree = build_the_regression_tree(dataset, 1, 1)
print_our_tree(tree)

decision_tree_stump = {'index': 0, 'right': 1, 'value': 7.7, 'left': 0}
for row in dataset:
    prediction = make_prediction(decision_tree_stump, row)
    print("What is expected data : %d , Your prediction is %d " % (row[-1], prediction))
```
```python
# Split the data by class; takes the dataset as input
def split_our_data_by_class(dataset):
    splited_data = dict()  # store the rows in a dict keyed by class
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in splited_data:
            splited_data[class_value] = list()
        splited_data[class_value].append(vector)
    return splited_data
```
```python
# Create dummy data
dataset = [[0.8, 2.3, 0],
           [2.1, 1.6, 0],
           [2.0, 3.6, 0],
           [9.1, 2.5, 1],
           [3.1, 2.5, 0],
           [3.8, 4.7, 0],
           [6.8, 2.7, 1],
           [6.1, 4.4, 1],
           [8.6, 0.3, 1],
           [7.9, 5.3, 1]]
splited = split_our_data_by_class(dataset)  # split the data by class
print(splited)

# Print row by row
for label in splited:
    print(label)
    for row in splited[label]:
        print(row)
```
```python
from math import sqrt

def calculate_the_mean(a_list_of_num):  # mean
    mean = sum(a_list_of_num) / float(len(a_list_of_num))  # sum divided by the number of values
    return mean

def calculate_the_standard_deviation(a_list_of_num):  # standard deviation
    the_mean = calculate_the_mean(a_list_of_num)  # mean of the values
    # sample variance: sum of squared deviations from the mean, divided by (n - 1)
    the_variance = sum([(x - the_mean) ** 2 for x in a_list_of_num]) / float(len(a_list_of_num) - 1)
    std = sqrt(the_variance)  # square root of the variance
    return std
```
```python
# Check against pandas
import pandas as pd

df = pd.DataFrame(dataset)
df
df.info()
df.describe()
```
```python
# Now let's build a pandas-like describe() in pure Python
def describe_our_data(dataset):
    description = [(calculate_the_mean(column),
                    calculate_the_standard_deviation(column),
                    len(column)) for column in zip(*dataset)]
    del (description[-1])  # drop the statistics of the class-label column
    return description

describe_our_data(dataset)
```
```python
# Put it all together: describe the data per class
def describe_our_data_by_class(dataset):
    splited_data = split_our_data_by_class(dataset)
    data_description = dict()  # an empty dict to fill in
    for class_value, rows in splited_data.items():
        data_description[class_value] = describe_our_data(rows)
    return data_description

description = describe_our_data_by_class(dataset)
for label in description:
    print(label)
    for row in description[label]:
        print(row)

description = describe_our_data_by_class(dataset)
type(description)
```
## Walking through the dict / tuple / list structure of the split data
```python
print("---- raw data ----")
print(description)
print("-- entries under class 0 --")
print(description[0])
print("-- entries under class 1 --")
print(description[1])
print("---------")
print(description[0][0])
print("--------")
print(description[0][0][2])
```
```python
# Model the Gaussian probability density function
from math import exp, sqrt, pi

def calculate_the_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    result = (1 / (sqrt(2 * pi) * stdev)) * exponent
    return result

calculate_the_probability(1.0, 1.0, 1.0)
```
```python
from math import sqrt
from math import pi
from math import exp

def split_our_data_by_class(dataset):
    splited_data = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in splited_data:
            splited_data[class_value] = list()
        splited_data[class_value].append(vector)
    return splited_data

def calculate_the_mean(a_list_of_num):
    mean = sum(a_list_of_num) / float(len(a_list_of_num))
    return mean

def calculate_the_standard_deviation(a_list_of_num):
    the_mean = calculate_the_mean(a_list_of_num)
    the_variance = sum([(x - the_mean) ** 2 for x in a_list_of_num]) / float(len(a_list_of_num) - 1)
    std = sqrt(the_variance)
    return std

def describe_our_data(dataset):
    description = [(calculate_the_mean(column),
                    calculate_the_standard_deviation(column),
                    len(column)) for column in zip(*dataset)]
    del (description[-1])
    return description

def describe_our_data_by_class(dataset):
    splited_data = split_our_data_by_class(dataset)
    data_description = dict()
    for class_value, rows in splited_data.items():
        data_description[class_value] = describe_our_data(rows)
    return data_description

def calculate_the_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    result = (1 / (sqrt(2 * pi) * stdev)) * exponent
    return result

def calculate_class_probability(description, row):
    total_rows = sum([description[label][0][2] for label in description])
    probabilities = dict()
    for class_value, class_description in description.items():
        probabilities[class_value] = description[class_value][0][2] / float(total_rows)
        for i in range(len(class_description)):
            mean, stdev, count = class_description[i]
            probabilities[class_value] *= calculate_the_probability(row[i], mean, stdev)
    return probabilities

dataset = [[0.8, 2.3, 0],
           [2.1, 1.6, 0],
           [2.0, 3.6, 0],
           [3.1, 2.5, 0],
           [3.8, 4.7, 0],
           [6.1, 4.4, 1],
           [8.6, 0.3, 1],
           [7.9, 5.3, 1],
           [9.1, 2.5, 1],
           [6.8, 2.7, 1]]
description = describe_our_data_by_class(dataset)
probability = calculate_class_probability(description, dataset[0])
print(probability)
```
1. k is a hyper-parameter.
2. k is best chosen odd (no even number, so the vote cannot tie).
3. The size of k matters (see the sketch after this list):
k too small: outliers have a larger influence on the decision.
k too large: it "dilutes" the influence of the nearby, high-quality, high-weight neighbours on the final decision.
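As a quick illustration of point 3, the minimal sketch below classifies one query point with several values of k using a majority vote over the nearest neighbours; the data, the query point, and the helper names are all made up for this example:

```python
from math import sqrt
from collections import Counter

train = [(1.0, 1.0, 0), (1.2, 0.9, 0), (1.1, 1.3, 0),  # a tight class-0 cluster
         (1.35, 1.0, 1),                               # one class-1 outlier right next to it
         (5.0, 5.0, 1), (5.2, 4.8, 1), (4.9, 5.1, 1)]  # the real class-1 cluster, far away
query = (1.3, 1.0)

def dist(a, b):
    return sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2)

neighbours = sorted(train, key=lambda row: dist(row, query))
for k in (1, 3, 7):
    votes = Counter(row[-1] for row in neighbours[:k])
    print("k=%d -> predicted class %d, votes %s" % (k, votes.most_common(1)[0][0], dict(votes)))
```

With k=1 the single outlier decides the answer; with k=3 the local class-0 cluster wins; with k=7 the distant cluster swamps the local neighbours.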
```python
# Euclidean Distance
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    # running total
    distance = 0.0
    for i in range(len(row1) - 1):
        # += is a compact way of accumulating the sum
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)
```
```python
# Create dummy data
dataset = [[1.80, 1.91, 0],
           [1.85, 2.11, 0],
           [2.31, 2.88, 0],
           [3.54, -3.21, 0],
           [3.66, 3.12, 0],
           [5.52, 2.13, 1],
           [6.32, 1.46, 1],
           [7.35, 2.34, 1],
           [7.78, 3.26, 1],
           [8.43, -0.34, 1]]
row0 = [1.80, 1.91, 0]
print("Treat this row as the unknown point:", row0)
# compute its distance to every point in the dataset, one by one
for row in dataset:
    distance = calculate_euclidean_distance(row0, row)
    print(distance)

# manual check of the first three distances
a = (1.8 - 1.8) ** 2 + (1.91 - 1.91) ** 2
b = (1.8 - 1.85) ** 2 + (1.91 - 2.11) ** 2
c = (1.8 - 2.31) ** 2 + (1.91 - 2.88) ** 2
sqrt(a), sqrt(b), sqrt(c)
```
Working out the approach:
1. We need an input parameter k.
2. We need to sort the distances (and keep the first k).
3. Store the (row, distance) pairs as tuples.
4. Pick the k nearest points from the sorted list.
```python
# Euclidean Distance
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    # running total
    distance = 0.0
    for i in range(len(row1) - 1):
        # += is a compact way of accumulating the sum
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)
```
### Finding the nearest neighbours
```python
def get_our_neighbors(train, test_row, num_of_neighbors):  # training data, test row, k
    distances = list()  # an empty list to store (row, distance) pairs
    for train_row in train:
        dist = calculate_euclidean_distance(test_row, train_row)  # Euclidean distance
        distances.append((train_row, dist))
    distances.sort(key=lambda every_tuple: every_tuple[1])  # sort by distance
    neighbors = list()
    for i in range(num_of_neighbors):  # keep the k nearest rows
        neighbors.append(distances[i][0])
        # print(neighbors)
    return neighbors
```
```python
dataset = [[1.80, 1.91, 0],
           [3.66, 3.12, 0],
           [1.85, 2.11, 0],
           [3.54, -3.21, 0],
           [2.31, 2.88, 0],
           [5.52, 2.13, 1],
           [6.32, 1.46, 1],
           [7.35, 2.34, 1],
           [7.78, 3.26, 1],
           [8.43, -0.34, 1]]
# pass in the dataset, pick one point to compute distances from, k = 3
neighbors = get_our_neighbors(dataset, dataset[0], 3)
for neighbor in neighbors:
    print(neighbor)
```
### Predicting the class
```python
def predict_the_class(train, test_row, num_of_neighbors):  # training set, test row, k
    neighbors = get_our_neighbors(train, test_row, num_of_neighbors)  # find the nearest neighbours
    the_class_values = [row[-1] for row in neighbors]  # their class labels
    prediction = max(set(the_class_values), key=the_class_values.count)  # majority vote
    return prediction

prediction = predict_the_class(dataset, dataset[0], 3)
print('actual class 【%d】' % (dataset[0][-1]))
print('predicted class 【%d】' % (prediction))
```
### Predicting the class, version 2
```python
def predict_the_class_V2(train, test_row, num_of_neighbors):
    neighbors = get_our_neighbors(train, test_row, num_of_neighbors)
    the_class_values = [row[-1] for row in neighbors]
    prediction = sum(the_class_values) / float(len(the_class_values))  # average of the neighbour labels
    return prediction

prediction = predict_the_class_V2(dataset, dataset[0], 3)
print('Our expectation(the real class) is class 【%d】' % (dataset[0][-1]))
print('Our prediction(the predicted class) is class 【%d】' % (prediction))
```
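predict_the_class_V2 averages the neighbour labels instead of taking a majority vote; with 0/1 labels that average behaves like an estimated probability, and with a continuous target the same idea gives k-nearest-neighbour regression. A small self-contained sketch of that regression variant, on made-up data:

```python
from math import sqrt

def knn_regression(train, test_row, k):
    # sort the rows by distance to the test row, then average the targets of the k nearest
    distances = sorted(train, key=lambda row: sqrt(sum((row[i] - test_row[i]) ** 2
                                                       for i in range(len(test_row)))))
    nearest = distances[:k]
    return sum(row[-1] for row in nearest) / float(k)

# rows are [x, y] where y is a continuous target (hypothetical data)
train = [[1.0, 1.1], [2.0, 1.9], [3.0, 3.2], [4.0, 3.9], [5.0, 5.1]]
print(knn_regression(train, [2.5], 2))  # averages the targets of x=2.0 and x=3.0
```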
LVQ is normally used for classification problems.
A codebook vector is a list of numbers with the same input and output attributes as the rows in your training data.
example (a training row):
1. class 0/1
2. width
3. height
4. length
codebook vector (neuron):
1. class 0/1
2. width
3. height
4. length
LVQ vs. KNN:
Search through the codebook vectors, judging by Euclidean distance, to find the BMU (Best Matching Unit).
1. Choose a set of codebook vectors.
2. Competition: when a codebook vector matches the training instance (training pattern), it is moved towards that instance; otherwise it is moved away.
3. The learning rate controls the size of the move (see the sketch below):
x = x + learning_rate * (t - x)
4. Learn from every training instance.
learning_rate = alpha (the initial learning rate) * (1 - epoch / max_epoch)
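As a concrete illustration of the update rule and the decayed learning rate above, here is a minimal sketch; the numbers and the helper name are made up for illustration, showing one codebook attribute being pulled towards a matching training value and pushed away from a non-matching one:

```python
def lvq_update(x, t, learning_rate, same_class):
    # x: codebook attribute, t: training attribute
    if same_class:
        return x + learning_rate * (t - x)  # move closer
    else:
        return x - learning_rate * (t - x)  # move away

alpha, max_epoch = 0.3, 10
for epoch in range(3):
    learning_rate = alpha * (1 - epoch / float(max_epoch))  # decayed learning rate
    print("epoch %d, rate %.3f, closer: %.3f, away: %.3f"
          % (epoch, learning_rate,
             lvq_update(2.0, 3.0, learning_rate, True),
             lvq_update(2.0, 3.0, learning_rate, False)))
```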
# 计算两个向量之间的欧氏距离: from math import sqrt def calculate_euclidean_distance(row1,row2): distance = 0.0 for i in range(len(row1)-1): distance += (row1[i] - row2[i])**2 return sqrt(distance) dataset = [[1.80,1.91,0], [1.85,2.11,0], [2.31,2.88,0], [3.54,-3.21,0], [3.66,3.12,0], [5.52,2.13,1], [6.32,1.46,1], [7.35,2.34,1], [7.78,3.26,1], [8.43,-0.34,1] ] # 测试 row0 =dataset[0] # 使用1.80,1.91,0 这个点的数据来计算和dataset每一项的距离 for row in dataset: distance = calculate_euclidean_distance(row0,row) print(distance)
1. Compute the distance between each codebook vector and the new input.
2. Call calculate_euclidean_distance.
3. Sort the results (keeping the data type used to store them in mind).
4. Pick the BMU.
```python
# Euclidean distance function
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

# Define the training set
dataset = [[1.80, 1.91, 0],
           [1.85, 2.11, 0],
           [2.31, 2.88, 0],
           [3.54, -3.21, 0],
           [3.66, 3.12, 0],
           [5.52, 2.13, 1],
           [6.32, 1.46, 1],
           [7.35, 2.34, 1],
           [7.78, 3.26, 1],
           [8.43, -0.34, 1]]
```
```python
def calculate_BMU(codebooks, test_row):  # the codebook vectors and the new input
    distances = list()
    for codebook in codebooks:
        dist = calculate_euclidean_distance(codebook, test_row)  # distance between this codebook vector and the new input
        distances.append((codebook, dist))  # store (codebook vector, distance) tuples
    distances.sort(key=lambda every_tuple: every_tuple[1])  # sort by distance
    print(distances)
    return distances[0][0]  # the first entry is the codebook vector closest to the new input

# Test
test_row = [1.60, 1.81, 0]  # an arbitrary row to test with
bmu = calculate_BMU(dataset, test_row)  # pass in the dataset and the test row
print("closest:", bmu)  # the closest row turns out to be [1.8, 1.91, 0]
```
Training our codebook vectors:
1. Initialise them (with random features).
2. In every epoch, update (train) the codebook vectors against the training patterns.
3. For every training pattern, update each feature of the best-matching codebook vector: move it closer if the classes agree, or further away if they do not.
```python
from random import randrange  # random integers

# Generate a random codebook vector
def make_random_codebook(train):
    n_index = len(train)        # number of rows in the data
    n_features = len(train[0])  # number of features per row, e.g. [1.80, 1.91, 0] -> 3
    # for every feature position, copy that feature from a randomly chosen row
    codebook = [train[randrange(n_index)][i] for i in range(n_features)]
    return codebook

dataset = [[1.80, 1.91, 0],
           [1.85, 2.11, 0],
           [2.31, 2.88, 0],
           [3.54, -3.21, 0],
           [3.66, 3.12, 0],
           [5.52, 2.13, 1],
           [6.32, 1.46, 1],
           [7.35, 2.34, 1],
           [7.78, 3.26, 1],
           [8.43, -0.34, 1]]
m_r_c = make_random_codebook(dataset)
m_r_c
```
```python
# Euclidean distance between two vectors
from math import sqrt

def calculate_euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)

def calculate_BMU(codebooks, test_row):  # the codebook vectors and the new input
    distances = list()
    for codebook in codebooks:
        dist = calculate_euclidean_distance(codebook, test_row)
        distances.append((codebook, dist))
    distances.sort(key=lambda every_tuple: every_tuple[1])
    return distances[0][0]  # the codebook vector closest to the new input

from random import randrange

# Generate a random codebook vector
def make_random_codebook(train):
    n_index = len(train)        # number of rows in the data
    n_features = len(train[0])  # number of features per row, e.g. [1.80, 1.91, 0] -> 3
    codebook = [train[randrange(n_index)][i] for i in range(n_features)]
    return codebook

# Competitive training of the codebook vectors
def train_codebooks(train, n_codebooks, learn_rate, epochs):
    codebooks = [make_random_codebook(train) for i in range(n_codebooks)]  # random initial codebooks
    for epoch in range(epochs):
        rate = learn_rate * (1 - (epoch / float(epochs)))  # decayed learning rate
        sum_error = 0.0
        for row in train:
            bmu = calculate_BMU(codebooks, row)
            for i in range(len(row) - 1):
                error = row[i] - bmu[i]
                sum_error += error ** 2
                if bmu[-1] == row[-1]:   # same class: move the BMU towards the row
                    bmu[i] += rate * error
                else:                    # different class: move it away
                    bmu[i] -= rate * error
        print('epoch [%d], learning rate [%.3f], total error [%.3f]' % (epoch, rate, sum_error))
    return codebooks

dataset = [[1.80, 1.91, 0],
           [1.85, 2.11, 0],
           [2.31, 2.88, 0],
           [3.54, -3.21, 0],
           [3.66, 3.12, 0],
           [5.52, 2.13, 1],
           [6.32, 1.46, 1],
           [7.35, 2.34, 1],
           [7.78, 3.26, 1],
           [8.43, -0.34, 1]]
learning_rate = 0.3  # initial learning rate
n_epoch = 10         # number of epochs
n_codebooks = 2      # number of codebook vectors to learn
codebooks = train_codebooks(dataset, n_codebooks, learning_rate, n_epoch)
print('Our codebook is : %s' % codebooks)
```
A rectifier is simply a function that imitates the activation mechanism of biological neurons
(an activation function).
Common activation functions:
https://en.wikipedia.org/wiki/Rectifier_(neural_networks)#Gaussian_Error_Linear_Unit_(GELU)
```python
import math

softplus = math.log(1 + math.exp(2.14))
print(softplus)

sigmoid = math.exp(2.14) / (1 + math.exp(2.14))
print(sigmoid)

ReLU = max(2.14, 0)
print(ReLU)
```
Forward propagation (forward-propagate):
1. neuron activation (weights and bias)
2. neuron transfer (activation function)
3. forward propagation (calculate the output)
```python
from random import seed
from random import random

# Initialize our neural network: randomly generate the weights and biases
def initialize_our_neural_network(n_inputs, n_hidden, n_outputs):
    # number of inputs, number of hidden neurons, number of outputs
    neural_network = list()
    # each hidden neuron gets n_inputs weights plus one bias term
    hidden_layer = [{'weights': [random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    neural_network.append(hidden_layer)
    # each output neuron gets n_hidden weights plus one bias term
    output_layer = [{'weights': [random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    neural_network.append(output_layer)
    return neural_network

seed(1)
network = initialize_our_neural_network(2, 1, 2)  # two inputs, one hidden neuron, two outputs
for layer in network:
    print(layer)
```
```python
# Neuron activation
def neuron_activation(weights, inputs):  # the weights and the inputs
    activation = weights[-1]  # the last weight is used as the bias
    for i in range(len(weights) - 1):  # loop over the remaining weights
        activation += weights[i] * inputs[i]  # add the weighted inputs
    return activation
```
```python
# Sigmoid transfer function
from math import exp

def neuron_transfer(activation):
    result = 1.0 / (1.0 + exp(-activation))
    return result
```
```python
# Forward propagation
def forward_propagate(network, row):  # the network and one data row
    inputs = row
    for layer in network:
        # print("layer:", layer)
        new_inputs = []
        for neuron in layer:
            # print("neuron:", neuron)
            activation = neuron_activation(neuron['weights'], inputs)  # weighted sum plus bias
            # print("activation:", activation)
            neuron['output'] = neuron_transfer(activation)  # sigmoid transfer
            # print("sigmoid:", neuron['output'])
            new_inputs.append(neuron['output'])
        inputs = new_inputs  # the outputs of this layer become the inputs of the next
    return inputs

# Test
network = [[{'weights': [0.13436424411240122,
                         0.8474337369372327,
                         0.763774618976614]}],
           [{'weights': [0.2550690257394217,
                         0.49543508709194095]},
            {'weights': [0.4494910647887381,
                         0.651592972722763]}]]
row = [1, 0]
output = forward_propagate(network, row)
print("output:", output)
```
The calculation process:
```python
def neuron_transfer_derivative(output):
    # derivative of the sigmoid, expressed in terms of its output
    derivative = output * (1 - output)
    return derivative
```
```python
# Back-propagate the error and store a delta in every neuron
def backward_propagate_error(network, expected):
    for i in reversed(range(len(network))):
        layer = network[i]
        errors = list()
        if i != len(network) - 1:
            # hidden layer: error is the weighted sum of the deltas in the next layer
            for j in range(len(layer)):
                error = 0.0
                for neuron in network[i + 1]:
                    error += (neuron['weights'][j] * neuron['delta'])
                errors.append(error)
        else:
            # output layer: error is (expected - output)
            for j in range(len(layer)):
                neuron = layer[j]
                errors.append(expected[j] - neuron['output'])
        for j in range(len(layer)):
            neuron = layer[j]
            neuron['delta'] = errors[j] * neuron_transfer_derivative(neuron['output'])
```
## Test
```python
network = [[{'output': 0.71, 'weights': [0.13436424411240122,
                                         0.8474337369372327,
                                         0.763774618976614]}],
           [{'output': 0.62, 'weights': [0.2550690257394217,
                                         0.49543508709194095]},
            {'output': 0.65, 'weights': [0.4494910647887381,
                                         0.651592972722763]}]]
expected = [0, 1]
backward_propagate_error(network, expected)
for layer in network:
    print(layer)
```
Hand-derived update rule:
weight = weight + learning_rate * error * input
```python
def update_weights(network, row, learning_rate):
    for i in range(len(network)):
        inputs = row[:-1]
        # the output layer's inputs are the outputs of the hidden layer
        if i != 0:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in network[i]:
            for j in range(len(inputs)):
                neuron['weights'][j] += learning_rate * neuron['delta'] * inputs[j]
            neuron['weights'][-1] += learning_rate * neuron['delta']
```
Background:
When pre-processing categorical data there are generally two encoding schemes (a short sketch of both follows the table):
1. Integer Encoding
   Blue=1, Red=2, Green=3, Yellow=4
2. One-Hot Encoding (dummy variables)

   Blue  Red  Green  Yellow
    1     0     0      0
    0     1     0      0
    0     0     1      0
    0     0     0      1
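A minimal sketch of both encodings in plain Python; the colour list is just an example, and the same one-hot idea is what `expected[row[-1]] = 1` does for the class labels in the training loop below:

```python
colours = ["Blue", "Red", "Green", "Yellow"]

# 1. Integer encoding: each category gets an integer
integer_encoding = {colour: i + 1 for i, colour in enumerate(colours)}
print(integer_encoding)  # {'Blue': 1, 'Red': 2, 'Green': 3, 'Yellow': 4}

# 2. One-hot encoding: each category gets its own 0/1 dummy variable
one_hot = {colour: [1 if i == j else 0 for j in range(len(colours))]
           for i, colour in enumerate(colours)}
print(one_hot["Red"])    # [0, 1, 0, 0]
```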
```python
def train_our_network(network, train, learning_rate, n_epoch, n_output):
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            # one-hot encode the class label
            expected = [0 for i in range(n_output)]
            expected[row[-1]] = 1
            sum_error += sum([(expected[i] - outputs[i]) ** 2 for i in range(len(expected))])
            backward_propagate_error(network, expected)
            update_weights(network, row, learning_rate)
        print("epoch [%d], learning rate: [%.3f], error: [%.3f]" % (epoch, learning_rate, sum_error))
```
```python
seed(1)
dataset = [[2.1, 2.8, 0],
           [1.3, 2.7, 0],
           [1.2, 5.2, 0],
           [3.3, 2.8, 0],
           [1.2, 1.1, 0],
           [6.2, 5.8, 1],
           [8.3, 3.7, 1],
           [6.2, 2.7, 1],
           [7.3, 3.4, 1],
           [9.2, 2.1, 1]]
n_inputs = len(dataset[0]) - 1
n_outputs = len(set(row[-1] for row in dataset))
network = initialize_our_neural_network(n_inputs, 2, n_outputs)
train_our_network(network, dataset, 0.1, 1000, n_outputs)
for layer in network:
    print(layer)
```
```python
def predict(network, row):
    outputs = forward_propagate(network, row)
    return outputs.index(max(outputs))

network = [[{'weights': [-1.1866404384956928, 0.3006679439138231, 3.9117172696404685]},
            {'weights': [1.2235819285217509, -0.3158686384608762, -4.03117701360861]}],
           [{'weights': [3.933774188003338, -3.959801574023486, 0.4499036554334044]},
            {'weights': [-3.678895659767694, 4.256650475104409, -0.7298649815974946]}]]
for row in dataset:
    prediction = predict(network, row)
    print("Our expected class value is [%d], Our prediction of class value is [%d]" % (row[-1], prediction))
```
Combining several models makes the prediction more accurate and more stable (by averaging).
Hyper-parameters of a random forest:
1. For every tree you sample features: with n features in total you have to decide how many, m, to consider at each split (see the sketch after this list).
2. The minimum size of each node (the smallest number of samples allowed in a leaf of each tree).
3. The maximum depth of one tree.
4. How many trees the forest contains.
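The bagging code later in this section always searches every feature inside get_split; point 1 above is what distinguishes a random forest from plain bagging. Here is a hedged sketch of how the split search could be restricted to m randomly chosen features; the function name is illustrative, not one used later:

```python
from random import sample
from math import sqrt

def random_feature_indices(n_features, m=None):
    # pick m feature columns at random; a common default is sqrt(n_features)
    if m is None:
        m = max(1, int(sqrt(n_features)))
    return sample(range(n_features), m)

# inside a random-forest version of get_split, only these columns would be tried:
print(random_feature_indices(60))  # e.g. a random subset of the 60 sonar features
```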
```python
from random import randrange

# Draw a bootstrap subsample (with replacement) of a given ratio of the dataset
def subsample(dataset, ratio=1.0):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample
```
```python
from random import seed
from random import randrange
from random import random

def subsample(dataset, ratio=1.0):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

def mean(numbers):
    result = sum(numbers) / float(len(numbers))
    return result

seed(1)
dataset = [[randrange(10)] for i in range(20)]
# print(dataset)
ratio = 0.10
for size in [1, 10, 100, 1000, 10000, 100000, 1000000]:
    sample_means = list()
    for i in range(size):
        sample = subsample(dataset, ratio)
        sample_mean = mean([row[0] for row in sample])
        sample_means.append(sample_mean)
    print("When sample is [%d],the estimated mean is [%.3f]" % (size, mean(sample_means)))
print("The real mean of our dataset is [%.3f]" % mean([row[0] for row in dataset]))
```
```python
from random import seed
from random import randrange
from random import random
from csv import reader

# 1. load our data
def load_csv(filename):
    dataset = list()  # fill this list row by row
    with open(filename, 'r') as file:  # read the file via a context manager
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:  # skip empty rows
                continue
            dataset.append(row)
    return dataset

# quick check
dataset = load_csv('sonar.all-data.csv')
print(dataset)
```
```python
# 2. datatype conversion
def str_to_float(dataset, column):
    # convert one column from string to float, row by row
    for row in dataset:
        row[column] = float(row[column].strip())  # strip surrounding whitespace, then cast

def str_to_int(dataset, column):
    # map the string class labels to integers
    class_value = [row[column] for row in dataset]
    unique = set(class_value)  # de-duplicate
    look_up = dict()           # the label -> integer mapping to fill in
    for i, value in enumerate(unique):
        look_up[value] = i
    for row in dataset:
        row[column] = look_up[row[column]]
    return look_up
```
```python
# 3. k-fold cross-validation split
def cross_validation_split(dataset, n_folds):  # the data and the number of folds
    dataset_split = list()       # the finished folds go here
    dataset_copy = list(dataset) # work on a copy of the data
    fold_size = int(len(dataset) / n_folds)  # size of each fold
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))  # pick a random remaining row
            fold.append(dataset_copy.pop(index))  # move it into the current fold
        dataset_split.append(fold)
    return dataset_split
```
```python
# 4. calculate model accuracy
def calculate_accuracy(actual, predicted):  # actual labels and predicted labels
    correct = 0  # counter of correct predictions
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0  # correct / total, as a percentage
# e.g. 90 correct predictions out of 100 rows: 90/100 = 0.9, i.e. 90% accuracy
```
```python
# 5. how good is our algo: evaluate an algorithm with cross-validation
def evaluate_our_algo(dataset, algo, n_folds, *args):  # the data, one algorithm, the folds, extra args
    folds = cross_validation_split(dataset, n_folds)   # split the data into folds
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)          # hold the current fold out as the test set
        train_set = sum(train_set, [])  # flatten the remaining folds into one training set
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None         # hide the label (the answer) from the model
        predicted = algo(train_set, test_set, *args)
        actual = [row[-1] for row in fold]              # the true labels
        accuracy = calculate_accuracy(actual, predicted)  # score this fold
        scores.append(accuracy)
    return scores
```
```python
# 6. left and right split
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:  # row by row
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
```
```python
# 7. calculate gini index
def gini_index(groups, classes):
    # count the total number of instances
    n_instances = float(sum([len(group) for group in groups]))
    # compute the weighted Gini contribution of each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # note: we must not divide by zero, so skip empty groups
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1 - score) * (size / n_instances)
    return gini
```
```python
# 8. calculate the best split, keeping the candidate with the lowest Gini index
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    posi_index, posi_value, posi_score, posi_groups = 888, 888, 888, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < posi_score:
                posi_index, posi_value, posi_score, posi_groups = index, row[index], gini, groups
    return {'index': posi_index, 'value': posi_value, 'groups': posi_groups}
```
```python
# 9. to terminal: the most common class in the group becomes the leaf value
def determine_the_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
```
```python
# 10.
# 1. split our data into left and right
# 2. delete the original data
# 3. check whether a side is empty and whether max depth / min size has been reached
# 4. turn groups into terminal nodes where needed
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del (node['groups'])
    if not left or not right:
        node['left'] = node['right'] = determine_the_terminal(left + right)
        return
    if depth >= max_depth:
        node['left'], node['right'] = determine_the_terminal(left), determine_the_terminal(right)
        return  # stop here once the maximum depth is reached
    if len(left) <= min_size:
        node['left'] = determine_the_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    if len(right) <= min_size:
        node['right'] = determine_the_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)
```
```python
# 11. make our decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root
```
```python
# 12. make prediction
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
```
```python
# 13. subsample
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample
```
```python
# 14. make prediction using bagging
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)
```
```python
# 15. bagging
def bagging(train, test, max_depth, min_size, sample_size, n_trees):
    trees = list()
    for i in range(n_trees):
        sample = subsample(train, sample_size)
        tree = build_tree(sample, max_depth, min_size)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions

seed(1)
dataset = load_csv('sonar.all-data.csv')
for i in range(len(dataset[0]) - 1):
    str_to_float(dataset, i)
str_to_int(dataset, len(dataset[0]) - 1)

n_folds = 5
max_depth = 6
min_size = 2
sample_size = 0.5
for n_trees in [1, 5, 10, 50]:
    scores = evaluate_our_algo(dataset, bagging, n_folds, max_depth, min_size, sample_size, n_trees)
    print('We are using [%d]' % n_trees)
    print('The scores are : [%s]' % scores)
    print('The mean accuracy is [%.3f]' % (sum(scores) / float(len(scores))))
```