赞
踩
1.数据来源:kaggle
2.数据样式
通过对数据“萼片、花瓣的长度、宽度(sepal_length、sepal_width、petal_length、petal_width)”搭建模型进行计算,判断鸢尾花的种类(species)。
朴素贝叶斯(Naive Bayes)
方法说明:
贝叶斯公式:
实际运用时,需要判断在已知数据的情况下,属于该组的概率,公式变形为:
在建立模型时,依次对比是每一组数据的概率大小,而每一组数据的全概率是相同的,所以,公式再次变形为:
从数据读取开始,不调取三方库,纯手工推。
1.导入基础库
- #1.导入基础库
- from csv import reader
- from math import exp,pi,sqrt
- from random import randrange,seed
- import copy
2.读取csv文件和转换数据类型
#2. Load the CSV file and convert data types
# Load a CSV file
def csv_loader(file):
    """Load a CSV file and return its rows as a list of lists of strings.

    Completely empty lines are skipped; the header row (if any) is kept
    and must be handled by the caller.
    """
    dataset = list()
    # newline='' is required by the csv module so it can handle
    # embedded/platform line endings itself (see csv docs).
    with open(file, 'r', newline='') as f:
        csv_reader = reader(f)
        for row in csv_reader:
            if not row:  # blank line -> csv.reader yields [] -> skip it
                continue
            dataset.append(row)
    return dataset
-
# Convert feature columns (all but the last) to float, in place
def str_to_float_converter(dataset):
    """Convert every feature column of `dataset` to float, in place.

    The first row is assumed to be a header and is left untouched.
    Rebinding the local name only drops the header from *this* view;
    the row lists themselves are shared, so mutating them updates the
    caller's dataset.
    """
    rows = dataset[1:]  # skip the header row; row objects remain shared
    if not rows:        # empty / header-only input: nothing to convert
        return
    for i in range(len(rows[0]) - 1):  # last column is the class label
        for row in rows:
            row[i] = float(row[i].strip())
-
# Convert class labels (last column) to integers, in place
def str_to_int_converter(dataset):
    """Replace each row's class label with a small integer, in place.

    Labels are sorted before numbering so the label->int mapping is
    deterministic across runs (iterating a raw set of strings is not,
    because of hash randomization).

    Returns the label->int mapping dict (callers may ignore it).
    """
    unique_values = sorted(set(row[-1] for row in dataset))
    converter_dict = {value: i for i, value in enumerate(unique_values)}
    for row in dataset:
        row[-1] = converter_dict[row[-1]]
    return converter_dict
3.K折交叉验证拆分数据
#3. Split the dataset for k-fold cross validation
def k_fold_cross_validation(dataset, n_folds):
    """Randomly partition `dataset` into `n_folds` folds of equal size.

    Rows are drawn without replacement; if len(dataset) is not divisible
    by n_folds, the remainder rows are simply left out.
    """
    fold_size = len(dataset) // n_folds
    pool = list(dataset)  # working copy so the caller's list is untouched
    folds = list()
    for _ in range(n_folds):
        current = [pool.pop(randrange(len(pool))) for _ in range(fold_size)]
        folds.append(current)
    return folds
4.计算准确性
#4. Compute classification accuracy
def calculate_accuracy(actual, predicted):
    """Return the percentage of positions where actual == predicted."""
    correct_num = sum(1 for i in range(len(actual)) if actual[i] == predicted[i])
    return correct_num / float(len(actual)) * 100.0
5.模型测试
#5. Evaluate the model with k-fold cross validation
def mode_test(dataset, algo, n_folds, *args):
    """Run `algo(train, test, *args)` once per fold and return the
    per-fold accuracy percentages.

    Fixes over the original: the dead `test = list()` assignment is
    removed, and the training set is built by *index* exclusion instead
    of `list.remove(fold)` — `remove` compares by value, so it could
    drop the wrong fold if two folds happened to compare equal (and it
    forced a deep copy of the entire split on every iteration).
    """
    folds = k_fold_cross_validation(dataset, n_folds)
    scores = list()
    for i, fold in enumerate(folds):
        # Deep-copy so the algorithm may mutate rows without corrupting
        # the folds used by later iterations.
        train = sum((copy.deepcopy(f) for j, f in enumerate(folds) if j != i), [])
        test = copy.deepcopy(fold)
        predicted = algo(train, test, *args)
        actual = [row[-1] for row in fold]
        scores.append(calculate_accuracy(actual, predicted))
    return scores
6.数据按字典分类和描述
首先,将数据按分类数据作为key,每一行作为value,进行字典转换;
然后,计算每一列的均值、标准差、长度,并通过字典进行描述:{class:[(mean,std,len)])}。
#6. Group and describe the data per class
# Group rows into a dict keyed by class label
def split_class(dataset):
    """Return {class_label: [rows with that label]}; the label is the
    last element of each row."""
    grouped = dict()
    for vector in dataset:
        grouped.setdefault(vector[-1], list()).append(vector)
    return grouped
-
# Mean of one column (one feature x_i)
def calculate_mean(column):
    """Return the arithmetic mean of `column`."""
    return sum(column) / len(column)
-
# Sample standard deviation of one column (one feature x_i)
def calculate_std(column):
    """Return the sample standard deviation (n-1 denominator) of `column`.

    Raises ZeroDivisionError for a single-element column, like the
    n-1 formula itself.
    """
    mu = sum(column) / len(column)
    squared_diffs = [(value - mu) ** 2 for value in column]
    return sqrt(sum(squared_diffs) / float(len(column) - 1))
-
# Describe the data as [(mean, std, len)] per feature column
def describe_data(dataset):
    """Return (mean, std, count) for each column of `dataset`,
    excluding the final class-label column."""
    columns = list(zip(*dataset))
    stats = [(calculate_mean(col), calculate_std(col), len(col)) for col in columns]
    return stats[:-1]  # drop the stats of the label column
-
# Per-class description {class: [(mean, std, len)]}
def describe_class(dataset):
    """Group rows by class label and compute per-column statistics
    for each group."""
    return {label: describe_data(rows)
            for label, rows in split_class(dataset).items()}
7.设置计算概率的基础模型
正态分布概率计算:
如果随机变量X服从
#7. Base probability model (Gaussian density)
def calculate_probability(x, mean, std):
    """Return the normal (Gaussian) probability density of `x` given
    `mean` and `std`. `std` must be non-zero."""
    variance = std ** 2
    coefficient = 1.0 / (sqrt(2.0 * pi) * std)
    return coefficient * exp(-((x - mean) ** 2) / (2.0 * variance))
8.计算每一行数据的概率
#8. Per-class (unnormalized) probability of one data row
def calculate_class_probabilities(dataset, row):
    """Return {class_label: prior * prod_i N(row[i]; mean_i, std_i)}.

    The shared denominator P(row) is omitted, so values are comparable
    between classes but do not sum to 1.
    """
    descriptions = describe_class(dataset)
    # Row count of any column equals the class's row count; sum them for
    # the total number of training rows.
    total = sum(stats[0][-1] for stats in descriptions.values())
    probabilities = dict()
    for label, stats in descriptions.items():
        prob = stats[0][-1] / float(total)  # prior P(class)
        for i, (mean, std, _count) in enumerate(stats):
            prob *= calculate_probability(row[i], mean, std)
        probabilities[label] = prob
    return probabilities
9.每一行数据中找出最好的标签
#9. Pick the best label for one data row
def predict(dataset, row):
    """Return the class label with the highest (unnormalized)
    probability for `row`; ties keep the first label encountered."""
    probabilities = calculate_class_probabilities(dataset, row)
    return max(probabilities, key=probabilities.get)
10.预测测试数据的分类
#10. Predict the class of every test row
def naive_bayes(train, test):
    """Return the predicted class label for each row of `test`,
    using `train` as the reference data."""
    return [predict(train, row) for row in test]
11.运行和参数调整
#11. Run and tune parameters
seed(5)  # fixed seed so fold assignment is reproducible
file = './download_datas/IRIS.csv'
dataset = csv_loader(file)
str_to_float_converter(dataset)
dataset = dataset[1:]  # drop the header row from the working copy
str_to_int_converter(dataset)
n_folds = 3
algo = naive_bayes
scores = mode_test(dataset, algo, n_folds)

print('The scores of our model are : %s' % scores)
print('The average score of our model is : %.3f%%' % (sum(scores) / float(len(scores))))
- #结果输出
- The scores of our model are : [94.0, 98.0, 96.0]
- The average score of our model is : 96.000%
"""Naive Bayes classifier for the Iris dataset, built from scratch
(no third-party libraries): CSV loading, type conversion, k-fold cross
validation, Gaussian class-conditional densities, and prediction."""

#1. Imports
from csv import reader
from math import exp, pi, sqrt
from random import randrange, seed
import copy


#2. Load the CSV file and convert data types
def csv_loader(file):
    """Load a CSV file into a list of string rows, skipping blank lines.

    newline='' is required by the csv module so it can handle line
    endings itself (see csv docs).
    """
    dataset = list()
    with open(file, 'r', newline='') as f:
        for row in reader(f):
            if not row:  # blank line -> csv.reader yields []
                continue
            dataset.append(row)
    return dataset


def str_to_float_converter(dataset):
    """Convert every feature column (all but the last) to float, in place.

    The first row is assumed to be a header and is skipped; the row
    lists are shared objects, so mutating them updates the caller's data.
    """
    rows = dataset[1:]
    if not rows:  # empty / header-only input: nothing to convert
        return
    for i in range(len(rows[0]) - 1):
        for row in rows:
            row[i] = float(row[i].strip())


def str_to_int_converter(dataset):
    """Replace each row's class label (last column) with an integer.

    Labels are sorted before numbering so the mapping is deterministic
    across runs (iterating a raw set of strings is not, because of hash
    randomization). Returns the label->int mapping.
    """
    unique_values = sorted(set(row[-1] for row in dataset))
    converter_dict = {value: i for i, value in enumerate(unique_values)}
    for row in dataset:
        row[-1] = converter_dict[row[-1]]
    return converter_dict


#3. Split the dataset for k-fold cross validation
def k_fold_cross_validation(dataset, n_folds):
    """Randomly partition `dataset` into `n_folds` equal-size folds;
    remainder rows (len % n_folds) are left out."""
    fold_size = len(dataset) // n_folds
    pool = list(dataset)
    folds = list()
    for _ in range(n_folds):
        current = [pool.pop(randrange(len(pool))) for _ in range(fold_size)]
        folds.append(current)
    return folds


#4. Compute classification accuracy
def calculate_accuracy(actual, predicted):
    """Return the percentage of positions where actual == predicted."""
    correct_num = sum(1 for a, p in zip(actual, predicted) if a == p)
    return correct_num / float(len(actual)) * 100.0


#5. Evaluate the model with k-fold cross validation
def mode_test(dataset, algo, n_folds, *args):
    """Run `algo(train, test, *args)` once per fold; return per-fold
    accuracy percentages.

    The training set is built by index exclusion (not list.remove, which
    compares folds by value and could drop the wrong one) and rows are
    deep-copied so the algorithm may mutate them safely.
    """
    folds = k_fold_cross_validation(dataset, n_folds)
    scores = list()
    for i, fold in enumerate(folds):
        train = sum((copy.deepcopy(f) for j, f in enumerate(folds) if j != i), [])
        test = copy.deepcopy(fold)
        predicted = algo(train, test, *args)
        actual = [row[-1] for row in fold]
        scores.append(calculate_accuracy(actual, predicted))
    return scores


#6. Group and describe the data per class
def split_class(dataset):
    """Return {class_label: [rows with that label]}."""
    grouped = dict()
    for vector in dataset:
        grouped.setdefault(vector[-1], list()).append(vector)
    return grouped


def calculate_mean(column):
    """Arithmetic mean of one column."""
    return sum(column) / len(column)


def calculate_std(column):
    """Sample standard deviation (n-1 denominator) of one column."""
    mu = calculate_mean(column)
    squared_diffs = [(value - mu) ** 2 for value in column]
    return sqrt(sum(squared_diffs) / float(len(column) - 1))


def describe_data(dataset):
    """Return (mean, std, count) per column, excluding the label column."""
    stats = [(calculate_mean(col), calculate_std(col), len(col))
             for col in zip(*dataset)]
    return stats[:-1]


def describe_class(dataset):
    """Return {class_label: [(mean, std, count), ...]} per feature."""
    return {label: describe_data(rows)
            for label, rows in split_class(dataset).items()}


#7. Base probability model (Gaussian density)
def calculate_probability(x, mean, std):
    """Normal probability density of `x`; `std` must be non-zero."""
    coefficient = 1.0 / (sqrt(2.0 * pi) * std)
    return coefficient * exp(-((x - mean) ** 2) / (2.0 * std ** 2))


#8. Per-class (unnormalized) probability of one data row
def calculate_class_probabilities(dataset, row):
    """Return {class: prior * prod_i N(row[i]; mean_i, std_i)}; the
    shared denominator P(row) is omitted, so values are only comparable,
    not normalized."""
    descriptions = describe_class(dataset)
    total = sum(stats[0][-1] for stats in descriptions.values())
    probabilities = dict()
    for label, stats in descriptions.items():
        prob = stats[0][-1] / float(total)  # prior P(class)
        for i, (mean, std, _count) in enumerate(stats):
            prob *= calculate_probability(row[i], mean, std)
        probabilities[label] = prob
    return probabilities


#9. Pick the best label for one data row
def predict(dataset, row):
    """Return the most probable class label for `row` (first wins ties)."""
    probabilities = calculate_class_probabilities(dataset, row)
    return max(probabilities, key=probabilities.get)


#10. Predict the class of every test row
def naive_bayes(train, test):
    """Predict a class label for each row of `test` using `train`."""
    return [predict(train, row) for row in test]


#11. Run and tune parameters
seed(5)  # fixed seed so fold assignment is reproducible
file = './download_datas/IRIS.csv'
dataset = csv_loader(file)
str_to_float_converter(dataset)
dataset = dataset[1:]  # drop the header row
str_to_int_converter(dataset)
n_folds = 3
algo = naive_bayes
scores = mode_test(dataset, algo, n_folds)

print('The scores of our model are : %s' % scores)
print('The average score of our model is : %.3f%%' % (sum(scores) / float(len(scores))))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。