Today I ran an experiment with KNN and decision trees.
A quick introduction to the datasets:
The Wisconsin Breast Cancer Dataset is a classic machine learning dataset, originally collected by Dr. William H. Wolberg at a Wisconsin hospital. It is widely used for classification, feature selection, model evaluation, and other machine learning tasks and experiments. Data type: a multivariate dataset containing physiological measurements related to breast cancer.
Dataset characteristics:
Features: the dataset consists of 30 features obtained from image analysis, statistical measurements such as texture, area, smoothness, concavity, and granularity.
Target variable: binary, i.e. whether the sample is cancerous, encoded as 0 and 1.
Number of samples: the dataset contains 569 samples.
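If you want to double-check these numbers yourself, here is a minimal sketch using scikit-learn's bundled copy of the dataset (the same loader the code below uses):

from sklearn.datasets import load_breast_cancer
import numpy as np

bc = load_breast_cancer()
print(bc.data.shape)           # (569, 30): 569 samples, 30 features
print(bc.feature_names[:5])    # a few of the feature names
print(bc.target_names)         # ['malignant' 'benign']
print(np.bincount(bc.target))  # number of samples per class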
The iris dataset was originally measured by Edgar Anderson and was later introduced into statistics and machine learning by the famous statistician and biologist R. A. Fisher in a paper published in 1936. Dataset characteristics:
The iris dataset contains 150 samples; each sample has 4 features derived from measurements of the flower, specifically:
Sepal length: the length of the flower's sepal, in centimetres.
Sepal width: the width of the sepal.
Petal length: the length of the petal.
Petal width: the width of the petal.
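The same kind of sanity check works here; a quick sketch with scikit-learn's iris loader:

from sklearn.datasets import load_iris

iris = load_iris()
print(iris.data.shape)     # (150, 4): 150 samples, 4 features
print(iris.feature_names)  # sepal/petal length and width, in cm
print(iris.target_names)   # ['setosa' 'versicolor' 'virginica']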
Now let's look at the code:
Decision tree code:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import pandas as pd

# 1. Load the dataset.
breast_cancer = load_breast_cancer()
#df = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)
#df.to_csv(r'D:\coursework\maching_learning\breast_cancer.csv', index=None)
#df = pd.DataFrame(breast_cancer.target, columns=["label"])
#df.to_csv(r'D:\coursework\maching_learning\breast_cancer_label.csv', index=None)
print('breast_cancer dataset features')
print(breast_cancer.data[:5])
print('breast_cancer dataset labels')
print(breast_cancer.target[:5])

# 2. Split the dataset.
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(
    breast_cancer.data, breast_cancer.target, test_size=0.2)

# 3. Configure the decision tree model.
from sklearn import tree  # decision tree package
clf = tree.DecisionTreeClassifier()  # instantiate the model

# 4. Train the decision tree.
clf.fit(data_train, target_train)

# 5. Predict.
predictions = clf.predict(data_test)  # run the model on the test set
print(predictions[:10])

# 6. Evaluate.
from sklearn.metrics import accuracy_score  # accuracy metric
print('Accuracy:%s' % accuracy_score(target_test, predictions))

# 7. Parameter tuning: adjust the parameters according to the evaluation
# results to improve the model.
# Splitting criterion: Gini impurity vs. information gain (entropy)
criterions = ['gini', 'entropy']
for ct in criterions:
    clf2 = tree.DecisionTreeClassifier(criterion=ct)
    clf2.fit(data_train, target_train)
    plt.figure()
    plot_tree(clf2, filled=True, class_names=breast_cancer.target_names)
    plt.title("criterion = " + ct)
    predictions2 = clf2.predict(data_test)
    # print('Accuracy with criterion %s: %s' % (ct, accuracy_score(target_test, predictions2)))

# Maximum depth -- max_depth
import numpy as np
max_depths = np.arange(1, 33)  # max_depth must be an integer
scores = []
for i in max_depths:
    clf3 = tree.DecisionTreeClassifier(max_depth=int(i))
    clf3.fit(data_train, target_train)
    predictions3 = clf3.predict(data_test)
    scores.append(accuracy_score(target_test, predictions3))

plt.figure()
plt.plot(max_depths, scores)
plt.title('max_depth vs accuracy_score (criterion = gini)')
plt.xlabel('max_depth')
plt.ylabel('accuracy_score')
plt.show()

max_score_index = np.argmax(scores) + 1
print('Best max_depth =', max_score_index, 'with accuracy:', scores[max_score_index - 1])
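Note that the loop above selects max_depth by its accuracy on the test set, which leaks a little information into the model choice. A possible alternative, sketched here with scikit-learn's GridSearchCV (reusing data_train and data_test from the block above), tunes on the training split with cross-validation and touches the test set only once:

from sklearn.model_selection import GridSearchCV
from sklearn import tree

# 5-fold cross-validation over depth and splitting criterion,
# using the training split only
param_grid = {'max_depth': list(range(1, 33)), 'criterion': ['gini', 'entropy']}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv=5)
search.fit(data_train, target_train)
print(search.best_params_)
print('CV accuracy:', search.best_score_)
print('Test accuracy:', search.score(data_test, target_test))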
Pre-pruning and post-pruning code:
import math
import copy
import pandas as pd
import matplotlib.pyplot as plt
from pylab import mpl

# Set a font that can render CJK characters in the plots
mpl.rcParams["font.sans-serif"] = ["SimHei"]

# Node styles: the tree is drawn with matplotlib text annotations
decision_node = dict(boxstyle='sawtooth', fc='0.8')
leaf_node = dict(boxstyle='round4', fc='0.8')
arrow_args = dict(arrowstyle='<-')


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    # Draw one node with an arrow pointing from its parent
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)


def getNumLeafs(my_tree):
    # Count the leaf nodes of the tree
    num_leafs = 0
    first_str = list(my_tree.keys())[0]
    second_dict = my_tree[first_str]
    for key in second_dict.keys():
        if type(second_dict[key]).__name__ == 'dict':
            num_leafs += getNumLeafs(second_dict[key])
        else:
            num_leafs += 1
    return num_leafs


def getTreeDepth(my_tree):
    # Depth of the tree
    max_depth = 0
    first_str = list(my_tree.keys())[0]
    second_dict = my_tree[first_str]
    for key in second_dict.keys():
        if type(second_dict[key]).__name__ == 'dict':
            this_depth = 1 + getTreeDepth(second_dict[key])
        else:
            this_depth = 1
        if this_depth > max_depth:
            max_depth = this_depth
    return max_depth


def plotMidText(cntr_pt, parent_pt, txt_string):
    # Write the edge label halfway between parent and child
    x_mid = (parent_pt[0] - cntr_pt[0]) / 2.0 + cntr_pt[0]
    y_mid = (parent_pt[1] - cntr_pt[1]) / 2.0 + cntr_pt[1]
    createPlot.ax1.text(x_mid, y_mid, txt_string)


def plotTree(my_tree, parent_pt, node_txt):
    # Recursively draw the tree
    num_leafs = getNumLeafs(my_tree)
    depth = getTreeDepth(my_tree)
    first_str = list(my_tree.keys())[0]
    cntr_pt = (plotTree.x_off + (1.0 + float(num_leafs)) / 2.0 / plotTree.total_w,
               plotTree.y_off)
    plotMidText(cntr_pt, parent_pt, node_txt)
    plotNode(first_str, cntr_pt, parent_pt, decision_node)
    second_dict = my_tree[first_str]
    plotTree.y_off = plotTree.y_off - 1.0 / plotTree.total_d
    for key in second_dict.keys():
        if type(second_dict[key]).__name__ == 'dict':
            plotTree(second_dict[key], cntr_pt, str(key))
        else:
            plotTree.x_off = plotTree.x_off + 1.0 / plotTree.total_w
            plotNode(second_dict[key], (plotTree.x_off, plotTree.y_off),
                     cntr_pt, leaf_node)
            plotMidText((plotTree.x_off, plotTree.y_off), cntr_pt, str(key))
    plotTree.y_off = plotTree.y_off + 1.0 / plotTree.total_d


def createPlot(in_tree, method):
    fig = plt.figure(1, facecolor='white')  # new window
    fig.clf()                               # clear the figure
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)  # subplot
    plotTree.total_w = float(getNumLeafs(in_tree))   # number of leaves
    plotTree.total_d = float(getTreeDepth(in_tree))  # depth of the tree
    plotTree.x_off = -0.5 / plotTree.total_w
    plotTree.y_off = 1.0
    plotTree(in_tree, (0.5, 1.0), '')
    plt.title("method=" + str(method), pad=15)
    plt.show()


def readDataset():
    '''
    Read the csv dataset and return the train/test datasets and
    the labels as lists.
    '''
    # read with utf-8 (switch to gbk if the file is GBK-encoded)
    df = pd.read_csv('data.csv', encoding="utf8")
    # first 350 samples as the training set, the rest as the validation set
    trainDf = df.iloc[:350]
    testDf = df.iloc[350:]
    # labels are the column names of the dataframe
    labels = df.columns.tolist()
    # train/test datasets are the values of the corresponding dataframes
    trainDataset = trainDf.values.tolist()
    testDataset = testDf.values.tolist()
    return trainDataset, testDataset, labels


def Entropy(dataset):
    '''
    Compute and return the information entropy.
    '''
    numExamples = len(dataset)  # number of samples
    classCount = {}             # class counter
    # The last column of each sample is its class. Count the number of
    # samples per class, with the class as key and the count as value.
    for example in dataset:
        # example[-1] is the sample's class; create the key if missing
        if example[-1] not in classCount.keys():
            classCount[example[-1]] = 0
        classCount[example[-1]] += 1
    # entropy = -sum(p_i * log2(p_i))
    entropy = 0.0
    for num in classCount.values():
        # class probability = class count / total number of samples
        p = num / numExamples
        entropy -= p * math.log(p, 2)
    return entropy


def majorityCnt(classList):
    '''
    Count each class and return the most frequent one.
    '''
    classCount = {}
    for c in classList:
        if c not in classCount.keys():
            classCount[c] = 0
        classCount[c] += 1
    # reverse=True sorts from large to small; key x[1] compares the counts
    sortedClassCount = sorted(classCount.items(), key=lambda x: x[1], reverse=True)
    return sortedClassCount[0][0]


def splitDataset(dataset, index, splitValue):
    '''
    Split the dataset.
    index : index of the feature to split on
    splitValue : the average of feature `index` of samples i and i+1,
                 used as the split point; returns subset 1 and subset 2
    '''
    subDataset1 = []
    subDataset2 = []
    # Samples whose value in column `index` is <= splitValue go to subset 1,
    # the others to subset 2; column `index` itself is removed.
    for example in dataset:
        if example[index] <= splitValue:
            splitFeature = example[:index]          # columns before the split feature
            splitFeature.extend(example[index + 1:])  # columns after it
            subDataset1.append(splitFeature)
        else:
            splitFeature = example[:index]
            splitFeature.extend(example[index + 1:])
            subDataset2.append(splitFeature)
    return subDataset1, subDataset2


def chooseBestFeatureToSplit(dataset):
    '''
    Return the index of the best feature and the best split value.
    '''
    # number of features; the last column is the class, so drop it
    numFeatures = len(dataset[0]) - 1
    baseEntropy = Entropy(dataset)  # entropy before splitting
    bestInfoGain = 0                # best information gain so far
    bestIndex = -1                  # index of the best feature
    bestSplitValue = 0              # best split point
    for column in range(0, numFeatures):
        # values of the column-th feature, sorted
        featureList = [example[column] for example in dataset]
        featureList = sorted(featureList)
        # Use the average of rows `row` and `row+1` as the split point,
        # producing a left and a right subset.
        for row in range(0, len(featureList) - 1):
            newEntropy = 0
            splitValue = (featureList[row] + featureList[row + 1]) / 2.0
            subDataset1, subDataset2 = splitDataset(dataset, column, splitValue)
            # weight = subset size / full set size
            weight1 = len(subDataset1) / float(len(dataset))
            weight2 = len(subDataset2) / float(len(dataset))
            # entropy after the split = sum of (subset entropy * subset weight)
            newEntropy += weight1 * Entropy(subDataset1)
            newEntropy += weight2 * Entropy(subDataset2)
            # information gain = original entropy - entropy after the split
            infoGain = baseEntropy - newEntropy
            # keep the best gain together with its feature and split point
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestIndex = column
                bestSplitValue = splitValue
    return bestIndex, bestSplitValue


def createTree(trainDataset, testDataset, labels, method=None):
    '''
    method is one of [None, 'pre', 'post']:
    None  : no pruning;
    'pre' : pre-pruning;
    'post': post-pruning.
    Build the tree recursively:
    1. get the best feature index bestIndex and split point bestSplitValue;
    2. split the train and test sets into subDataset1 and subDataset2;
    3. with pre-pruning, compare the accuracy before and after the split
       and only grow the subtree if the accuracy improves;
    4. with post-pruning, grow the subtree first, then check whether
       removing it improves the accuracy, and remove it if so.
    Returns:
    1. method None or 'pre': myTree;
    2. method 'post': (myTree, correct), where correct is the number of
       test samples the subtree built from the training split predicts
       correctly.
    '''
    # all classes in the training and test sets
    trainClassList = [example[-1] for example in trainDataset]
    testClassList = [example[-1] for example in testDataset]
    # If the training set holds a single class, return predict_class
    # (plus correct when post-pruning).
    if trainClassList.count(trainClassList[0]) == len(trainClassList):
        predict_class = trainClassList[0]             # class predicted by this subtree
        correct = testClassList.count(predict_class)  # test hits for this class
        if method == 'post':
            return predict_class, correct
        else:
            return predict_class
    # If only the class column is left, likewise return predict_class
    # (plus correct when post-pruning).
    if len(trainDataset[0]) == 1:
        predict_class = majorityCnt(trainClassList)
        correct = testClassList.count(predict_class)
        if method == 'post':
            return predict_class, correct
        else:
            return predict_class
    # Feature with the largest information gain and its best split point
    bestIndex, bestSplitValue = chooseBestFeatureToSplit(trainDataset)
    print(labels[bestIndex])
    bestFeature = labels[bestIndex]  # name of the best feature
    # splitDataset removes column bestIndex from the data, so drop the
    # matching label as well to keep names and columns aligned
    subLabels = labels[:bestIndex] + labels[bestIndex + 1:]
    myTree = {bestFeature: {}}  # the (sub)tree rooted here
    # split the train and test sets with the best feature and split point
    trainSubDataset1, trainSubDataset2 = splitDataset(trainDataset, bestIndex, bestSplitValue)
    testSubDataset1, testSubDataset2 = splitDataset(testDataset, bestIndex, bestSplitValue)
    # classes of subsets 1 and 2 of the training and test sets
    trainSubClassList1 = [example[-1] for example in trainSubDataset1]
    trainSubClassList2 = [example[-1] for example in trainSubDataset2]
    testSubClassList1 = [example[-1] for example in testSubDataset1]
    testSubClassList2 = [example[-1] for example in testSubDataset2]

    if method == 'pre':
        # Before the split: predict the majority class of the current
        # training set and measure its accuracy on the unsplit test set.
        predict_class_pre = majorityCnt(trainClassList)
        precision_pre = testClassList.count(predict_class_pre) / len(testClassList)
        # After the split: each subset predicts its own majority class.
        predict_class_post1 = majorityCnt(trainSubClassList1)
        predict_class_post2 = majorityCnt(trainSubClassList2)
        # total number of correct predictions on the two test subsets
        correct1 = testSubClassList1.count(predict_class_post1)
        correct2 = testSubClassList2.count(predict_class_post2)
        totalCorrect = correct1 + correct2
        precision_post = totalCorrect / len(testClassList)
        print("precision_post", precision_post)
        print("precision_pre", precision_pre)
        # Split only if the accuracy improves; otherwise return the
        # majority class of the current samples.
        if precision_post > precision_pre:
            myTree[bestFeature]["<=" + str(bestSplitValue)] = createTree(
                trainSubDataset1, testSubDataset1, subLabels, method='pre')
            myTree[bestFeature][">" + str(bestSplitValue)] = createTree(
                trainSubDataset2, testSubDataset2, subLabels, method='pre')
        else:
            return predict_class_pre
    elif method == 'post':
        # Before pruning: grow the left and right subtrees and collect the
        # number of test samples each predicts correctly.
        leftTree, correct1 = createTree(trainSubDataset1, testSubDataset1, subLabels, method='post')
        rightTree, correct2 = createTree(trainSubDataset2, testSubDataset2, subLabels, method='post')
        totalCorrect = correct1 + correct2
        # accuracy before pruning
        if len(testClassList) == 0:
            precision_pre = 0
        else:
            precision_pre = totalCorrect / len(testClassList)
        # After pruning: predict the majority class of the current training
        # set and measure its accuracy on the test set.
        predict_class_post = majorityCnt(trainClassList)
        if len(testClassList) == 0:
            precision_post = 0
        else:
            precision_post = testClassList.count(predict_class_post) / len(testClassList)
        print("precision_post", precision_post)
        print("precision_pre", precision_pre)
        # If pruning improves the accuracy, prune: return the majority class
        # and its number of correct test predictions; otherwise keep the
        # subtrees and return (myTree, totalCorrect).
        if precision_post > precision_pre:
            correct_post = testClassList.count(predict_class_post)
            return predict_class_post, correct_post
        else:
            myTree[bestFeature]["<=" + str(round(bestSplitValue, 2))] = leftTree
            myTree[bestFeature][">" + str(round(bestSplitValue, 2))] = rightTree
            return myTree, totalCorrect
    else:
        # No pruning: always grow both subtrees.
        myTree[bestFeature]["<=" + str(round(bestSplitValue, 2))] = createTree(
            trainSubDataset1, testSubDataset1, subLabels, method=None)
        myTree[bestFeature][">" + str(round(bestSplitValue, 2))] = createTree(
            trainSubDataset2, testSubDataset2, subLabels, method=None)
        predict_class_pre = majorityCnt(trainClassList)
        # accuracy of the majority-class prediction on the unsplit test set
        precision_pre = testClassList.count(predict_class_pre) / len(testClassList)
        print("precision", precision_pre)
    return myTree


if __name__ == '__main__':
    trainDataset, testDataset, labels = readDataset()
    print(labels)
    labelsForPost = copy.deepcopy(labels)  # untouched copy of the labels
    values = createTree(trainDataset, testDataset, labels, method="post")
    print(values)
    # with method='post' createTree returns (tree, correct);
    # otherwise it returns the tree alone
    if len(values) == 2:
        myTree = values[0]
    else:
        myTree = values
    createPlot(myTree, method="post")
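For comparison with the hand-written pruning above: scikit-learn ships post-pruning as minimal cost-complexity pruning via the ccp_alpha parameter. A minimal sketch (assuming sklearn 0.22 or newer, and run on the breast cancer data rather than the data.csv above):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

# Each effective alpha on the pruning path yields a smaller tree;
# pick the one that scores best on the held-out split.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(X_tr, y_tr)
best_alpha, best_acc = 0.0, 0.0
for alpha in path.ccp_alphas:
    # guard against tiny negative alphas from floating-point error
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=max(alpha, 0.0))
    clf.fit(X_tr, y_tr)
    acc = clf.score(X_te, y_te)
    if acc > best_acc:
        best_alpha, best_acc = alpha, acc
print(best_alpha, best_acc)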
3. KNN + PCA visualization
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA  # principal component analysis
import matplotlib.pyplot as plt       # plotting
import pandas as pd

# Load the iris dataset
iris = load_iris()
X = iris.data    # feature vectors
y = iris.target  # class labels
#df = pd.DataFrame(iris.data, columns=iris.feature_names)
#df.to_csv(r'D:\coursework\maching_learning\iris.csv', index=None)

# Split into training and test sets (the test set takes 40% of the samples)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Build the KNN classifier
knn = KNeighborsClassifier(n_neighbors=10)  # 10 neighbours
knn.fit(X_train, y_train)  # train on the training set

# Predict on the test set
y_pred = knn.predict(X_test)

# Classification accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test-set accuracy: {:.2%}".format(accuracy))

# Apply PCA
pca = PCA(n_components=2)  # reduce to 2 dimensions
pca = pca.fit(X)           # fit the model
X_dr = pca.transform(X)    # the projected (2-D) data
# fit_transform does both steps at once:
# X_dr = PCA(2).fit_transform(X)

plt.figure()
plt.scatter(X_dr[y == 0, 0], X_dr[y == 0, 1], c="red", label=iris.target_names[0])
plt.scatter(X_dr[y == 1, 0], X_dr[y == 1, 1], c="black", label=iris.target_names[1])
plt.scatter(X_dr[y == 2, 0], X_dr[y == 2, 1], c="orange", label=iris.target_names[2])
plt.legend()
plt.title('PCA of IRIS dataset')

# Project the test set and colour it by the predicted labels
X_test_pca = pca.transform(X_test)
plt.figure()
plt.scatter(X_test_pca[y_pred == 0, 0], X_test_pca[y_pred == 0, 1], c="red", label=iris.target_names[0])
plt.scatter(X_test_pca[y_pred == 1, 0], X_test_pca[y_pred == 1, 1], c="black", label=iris.target_names[1])
plt.scatter(X_test_pca[y_pred == 2, 0], X_test_pca[y_pred == 2, 1], c="orange", label=iris.target_names[2])
plt.legend()
plt.title('predict of IRIS test_dataset')
plt.show()
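One caveat about the visualization above: PCA is fitted on the complete dataset, so the test points influence the projection axes. For a stricter train/test protocol you could fit PCA on the training split only and reuse it for both plots; a short sketch using the variables already defined above:

# Fit the projection on the training split only, then apply the same
# learned axes to the test split; the test points never shape the axes.
pca_train = PCA(n_components=2).fit(X_train)
X_train_2d = pca_train.transform(X_train)
X_test_2d = pca_train.transform(X_test)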
KD-tree algorithm:
import math
import time
import numpy as np
from sklearn.datasets import load_iris

# Load the iris dataset; each sample is a 4-dimensional point
iris = load_iris()
X = iris.data  # feature vectors
pts = X        # the point set (any dimensionality)


class Node():
    def __init__(self, pt, leftBranch, rightBranch, dimension):
        self.pt = pt
        self.leftBranch = leftBranch
        self.rightBranch = rightBranch
        self.dimension = dimension


class KDTree():
    def __init__(self, data):
        self.dims = len(data[0])  # number of dimensions of the points
        self.nearestPt = None
        self.nearestDis = math.inf

    def createKDTree(self, currPts, dimension):
        # Recursively build the tree: sort on the current dimension
        # and split at the median point.
        if len(currPts) == 0:
            return None
        mid = self.calMedium(currPts)
        sortedData = sorted(currPts, key=lambda x: x[dimension])
        leftBranch = self.createKDTree(sortedData[:mid], self.calDimension(dimension))
        rightBranch = self.createKDTree(sortedData[mid + 1:], self.calDimension(dimension))
        return Node(sortedData[mid], leftBranch, rightBranch, dimension)

    def calMedium(self, currPts):
        return len(currPts) // 2

    def calDimension(self, dimension):
        # The key point: cycle through the dimensions, i.e. take the
        # split dimension modulo the number of dimensions.
        return (dimension + 1) % self.dims

    def calDistance(self, p0, p1):
        # Euclidean distance over all dimensions
        return math.sqrt(sum((a - b) ** 2 for a, b in zip(p0, p1)))

    def getNearestPt(self, root, targetPt):
        # reset the best-so-far state before every query
        self.nearestPt = None
        self.nearestDis = math.inf
        self.search(root, targetPt)
        return self.nearestPt, self.nearestDis

    def search(self, node, targetPt):
        if node is None:
            return
        dist = node.pt[node.dimension] - targetPt[node.dimension]
        if dist > 0:  # the target lies on the left/lower side of the split
            self.search(node.leftBranch, targetPt)
        else:
            self.search(node.rightBranch, targetPt)
        tempDis = self.calDistance(node.pt, targetPt)
        if tempDis < self.nearestDis:
            self.nearestDis = tempDis
            self.nearestPt = node.pt
        # Backtrack: the other branch may still contain a closer point
        # if the splitting plane is nearer than the best distance so far.
        if self.nearestDis > abs(dist):
            if dist > 0:
                self.search(node.rightBranch, targetPt)
            else:
                self.search(node.leftBranch, targetPt)


def get_min_distance(X, targetPt):
    # Brute-force scan for the nearest point
    small = math.sqrt(sum((X[0] - targetPt) ** 2))
    re_i = 0
    for index, point in enumerate(X[1:], start=1):
        d = math.sqrt(sum((point - targetPt) ** 2))
        if d < small:
            small = d
            re_i = index
    return re_i


if __name__ == "__main__":
    kdtree = KDTree(pts)
    root = kdtree.createKDTree(pts, 0)

    # time the brute-force scan
    start_time = time.time()
    for point in X:
        re_i = get_min_distance(X, point)
    end_time = time.time()
    print(f"Brute-force scan took: {end_time - start_time} s")

    # time the kd-tree search
    start_time = time.time()
    for point in X:
        pt, minDis = kdtree.getNearestPt(root, point)
    end_time = time.time()
    print(f"kd-tree search took: {end_time - start_time} s")
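If you want a well-tested reference to compare the timings against, SciPy ships a C implementation of the same structure; a small sketch (assuming scipy is installed):

from scipy.spatial import cKDTree

# Build the tree once, then query every point's nearest neighbour.
# k=1 would return each point itself (distance 0), so ask for k=2
# and read the second column to get the nearest *other* point.
tree = cKDTree(X)
dists, idxs = tree.query(X, k=2)
print(dists[:5, 1], idxs[:5, 1])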
Results:
By the way, this experiment is still missing a couple of KNN studies: a hyperparameter sweep over the value of k, and a comparison of KNN under different distance formulas.
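For completeness, here is roughly what those two missing experiments could look like, sketched with the X_train/X_test split from the KNN section above:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Experiment 1: sweep the hyperparameter k
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print('k =', k, accuracy_score(y_test, knn.predict(X_test)))

# Experiment 2: compare distance formulas via the Minkowski power p
# (p=1 is Manhattan distance, p=2 is Euclidean distance)
for p in (1, 2):
    knn = KNeighborsClassifier(n_neighbors=10, p=p).fit(X_train, y_train)
    print('p =', p, accuracy_score(y_test, knn.predict(X_test)))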