当前位置:   article > 正文

机器学习:k近邻算法(Python)

一、k近邻算法的定义

二、KD树结点信息封装 

kdtree_node.py

  1. class KDTreeNode:
  2. """
  3. KD树结点信息封装
  4. """
  5. def __init__(self, instance_node=None, instance_label=None, instance_idx=None,
  6. split_feature=None, left_child=None, right_child=None, kdt_depth=None):
  7. """
  8. 用于封装kd树的结点信息结构
  9. :param instance_node: 实例点,一个样本
  10. :param instance_label: 实例点对应的类别标记
  11. :param instance_idx: 该实例点对应的样本索引,用于kd树的可视化
  12. :param split_feature: 划分的特征属性,x^(i)
  13. :param left_child: 左子树,小于划分点的
  14. :param right_child: 右子树,大于切分点的
  15. :param kdt_depth: kd树的深度
  16. """
  17. self.instance_node = instance_node
  18. self.instance_label = instance_label
  19. self.instance_idx = instance_idx
  20. self.split_feature = split_feature
  21. self.left_child = left_child
  22. self.right_child = right_child
  23. self.kdt_depth = kdt_depth

三、距离度量的工具类

distUtils.py

  1. import numpy as np
  2. class DistanceUtils:
  3. """
  4. 距离度量的工具类,此处仅实现闵可夫斯基距离
  5. """
  6. def __init__(self, p=2):
  7. self.p = p # 默认欧式距离,p=1曼哈顿距离,p=np。inf是切比雪夫距离
  8. def distance_func(self, xi, xj):
  9. """
  10. 特征空间中两个样本示例的距离计算
  11. :param xi: k维空间某个样本示例
  12. :param xj: k维空间某个样本示例
  13. :return:
  14. """
  15. xi, xj = np.asarray(xi), np.asarray(xj)
  16. if self.p == 1 or self.p == 2:
  17. return (((np.abs(xi - xj)) ** self.p).sum()) ** (1 / self.p)
  18. elif self.p == np.inf:
  19. return np.max(np.abs(xi - xj))
  20. elif self.p == "cos": # 余弦距离或余弦相似度
  21. return xi.dot(xj) / np.sqrt((xi ** 2).sum()) / np.sqrt((xj ** 2).sum())
  22. else:
  23. raise ValueError("目前仅支持p=1、p=2、p=np.inf或余弦距离四种距离...")

四、K近邻算法的实现

knn_kdtree.py

  1. import numpy as np
  2. from kdtree_node import KDTreeNode
  3. from distUtils import DistanceUtils
  4. import heapq # 堆结构,实现堆排序
  5. from collections import Counter # 集合中的计数功能
  6. import networkx as nx # 网络图,可视化
  7. import matplotlib.pyplot as plt
  8. class KNearestNeighborKDTree:
  9. """
  10. K近邻算法的实现,基于KD树结构
  11. 1. fit: 特征向量空间的划分,即构建KD树(建立KNN算法模型)
  12. 2. predict: 预测,近邻搜索
  13. 3. 可视化kd树
  14. """
  15. def __init__(self, k: int=5, p=2, view_kdt=False):
  16. """
  17. KNN算法的初始化必要参数
  18. :param k: 近邻数
  19. :param p: 距离度量标准
  20. :param view_kdt: 是否可视化KD树
  21. """
  22. self.k = k # 预测,近邻搜索时,使用的参数,表示近邻树
  23. self.p = p # 预测,近邻搜索时,使用的参数,表示样本的近邻度
  24. self.view_kdt = view_kdt
  25. self.dis_utils = DistanceUtils(self.p) # 距离度量的类对象
  26. self.kdt_root: KDTreeNode() = None # KD树的根节点
  27. self.k_dimension = 0 # 特征空间维度,即样本的特征属性数
  28. self.k_neighbors = [] # 用于记录某个测试样本的近邻实例点
  29. def fit(self, x_train, y_train):
  30. """
  31. 递归创建KD树,即对特征向量空间进行划分,递归调用进行创建
  32. :param x_train: 训练样本集
  33. :param y_train: 训练样本目标集合
  34. :return:
  35. """
  36. if self.k < 1:
  37. raise ValueError("k must be greater than 0 and be int.")
  38. x_train, y_train = np.asarray(x_train), np.asarray(y_train)
  39. self.k_dimension = x_train.shape[1] # 特征维度
  40. idx_array = np.arange(x_train.shape[0]) # 训练样本索引编号
  41. self.kdt_root = self._build_kd_tree(x_train, y_train, idx_array, 0)
  42. if self.view_kdt:
  43. self.draw_kd_tree() # 可视化kd树
  44. def _build_kd_tree(self, x_train, y_train, idx_array, kdt_depth):
  45. """
  46. 递归创建KD树,KD树是二叉树,严格区分左子树右子树,表示对k维空间的一个划分
  47. :param x_train: 递归划分的训练样本子集
  48. :param y_train: 递归划分的训练样本目标子集
  49. :param idx_array: 递归划分的样本索引
  50. :param depth: kd树的深度
  51. :return:
  52. """
  53. if x_train.shape[0] == 0: # 递归出口
  54. return
  55. split_dimension = kdt_depth % self.k_dimension # 数据的划分维度x^(i)
  56. sorted(x_train, key=lambda x: x[split_dimension]) # 按某个划分维度排序
  57. median_idx = x_train.shape[0] // 2 # 中位数所对应的数据的索引
  58. median_node = x_train[median_idx] # 切分点作为当前子树的根节点
  59. # 划分左右子树区域
  60. left_instances, right_instances = x_train[:median_idx], x_train[median_idx + 1:]
  61. left_labels, right_labels = y_train[:median_idx], y_train[median_idx + 1:]
  62. left_idx, right_idx = idx_array[:median_idx], idx_array[median_idx + 1:]
  63. # 递归调用
  64. left_child = self._build_kd_tree(left_instances, left_labels, left_idx, kdt_depth + 1)
  65. right_child = self._build_kd_tree(right_instances, right_labels, right_idx, kdt_depth + 1)
  66. kdt_new_node = KDTreeNode(median_node, y_train[median_idx], idx_array[median_idx],
  67. split_dimension, left_child, right_child, kdt_depth)
  68. return kdt_new_node
  69. def _search_kd_tree(self, kd_tree: KDTreeNode, x_test):
  70. """
  71. kd树的递归搜索算法,后序遍历,搜索k个最近邻实例点
  72. 数据结构:堆排序,搜索过程中,维护一个小根堆
  73. :param kd_tree: 已构建的kd树
  74. :param x_test: 单个测试样本
  75. :return:
  76. """
  77. if kd_tree is None: # 递归出口
  78. return
  79. # 计算测试样本与当前kd子树的根结点的距离(相似度)
  80. distance = self.dis_utils.distance_func(kd_tree.instance_node, x_test)
  81. # 1. 如果不够k个样本,继续递归
  82. # 2. 如果搜索了k个样本,但是k个样本未必是最近邻的。
  83. # 当计算的当前实例点的距离小于k个样本的最大距离,则递归,大于最大距离,没必要递归
  84. if (len(self.k_neighbors) < self.k) or (distance < self.k_neighbors[-1]["distance"]):
  85. self._search_kd_tree(kd_tree.left_child, x_test) # 递归左子树
  86. self._search_kd_tree(kd_tree.right_child, x_test) # 递归右子树
  87. # 在整个搜索路径上的kd树的结点,存储在self.k_neighbors中,包含三个值
  88. # 当前实例点,类别,距离
  89. self.k_neighbors.append({
  90. "node": kd_tree.instance_node, # 结点
  91. "label": kd_tree.instance_label, # 当前实例的类别
  92. "distance": distance # 当前实例点与测试样本的距离
  93. })
  94. # 按照距离进行排序,选择最小的k个最近邻样本实例,更新最近邻距离
  95. # 小根堆,k_neighbors中第一个结点是距离测试样本最近的
  96. self.k_neighbors = heapq.nsmallest(self.k, self.k_neighbors,
  97. key=lambda d: d["distance"])
  98. def predict(self, x_test):
  99. """
  100. KD树的近邻搜索,即测试样本的预测
  101. :param x_test: 测试样本,ndarray: (n * k)
  102. :return:
  103. """
  104. x_test = np.asarray(x_test)
  105. if self.kdt_root is None:
  106. raise ValueError("KDTree is None, Please fit KDTree...")
  107. elif x_test.shape[1] != self.k_dimension:
  108. raise ValueError("Test Sample dimension unmatched KDTree's dimension.")
  109. else:
  110. y_test_hat = [] # 用于存储测试样本的预测类别
  111. for i in range(x_test.shape[0]):
  112. self.k_neighbors = [] # 调用递归搜索,则包含了k个最近邻的实例点
  113. self._search_kd_tree(self.kdt_root, x_test[i])
  114. # print(self.k_neighbors)
  115. y_test_labels = []
  116. # 取每个近邻样本的类别标签
  117. for k in range(self.k):
  118. y_test_labels.append(self.k_neighbors[k]["label"])
  119. # 按分类规则(多数表决法)
  120. # print(y_test_labels)
  121. counter = Counter(y_test_labels)
  122. idx = int(np.argmax(list(counter.values())))
  123. y_test_hat.append(list(counter.keys())[idx])
  124. return np.asarray(y_test_hat)
  125. def _create_kd_tree(self, graph, kdt_node: KDTreeNode, pos=None, x=0, y=0, layer=1):
  126. """
  127. 递归可视化KD树,递归构造树的结点、边。
  128. :param graph: 有向图对象,递归中逐步增加结点和左子树右子树
  129. :param kdt_node: 递归创建KD树的结点
  130. :param pos: 可视化中树结点位置,初始化(0, 0)绘制根结点
  131. :param x: 对应pos中的横坐标,随着递归,更新
  132. :param y: 对应pos中的纵坐标,随着递归,更新
  133. :param layer: kd树的层次
  134. :return:
  135. """
  136. if pos is None:
  137. pos = {}
  138. pos[str(kdt_node.instance_idx)] = (x, y)
  139. if kdt_node.left_child:
  140. # 父结点指向左子树
  141. graph.add_edge(str(kdt_node.instance_idx), str(kdt_node.left_child.instance_idx))
  142. l_x, l_y = x - 1 / 2 ** layer, y - 1 # 下一个树结点位置的计算
  143. l_layer = layer + 1 # 树的层次 + 1
  144. self._create_kd_tree(graph, kdt_node.left_child, x=l_x, y=l_y, pos=pos, layer=l_layer) # 递归
  145. if kdt_node.right_child:
  146. # 父结点指向右子树
  147. graph.add_edge(str(kdt_node.instance_idx), str(kdt_node.right_child.instance_idx))
  148. r_x, r_y = x + 1 / 2 ** layer, y - 1
  149. r_layer = layer + 1
  150. self._create_kd_tree(graph, kdt_node.right_child, x=r_x, y=r_y, pos=pos, layer=r_layer) # 递归
  151. return graph, pos
  152. def draw_kd_tree(self):
  153. """
  154. 可视化kd树
  155. :return:
  156. """
  157. directed_graph = nx.DiGraph() # 初始化一个有向图,树
  158. graph, pos = self._create_kd_tree(directed_graph, self.kdt_root)
  159. fig, ax = plt.subplots(figsize=(20, 10)) # 比例可以根据树的深度适当调节
  160. nx.draw_networkx(graph, pos, ax=ax, node_size=500, font_color="w", font_size=15,
  161. arrowsize=20)
  162. plt.tight_layout()
  163. plt.show()

 五、K近邻算法的测试

test_knn_1.py

  1. import numpy as np
  2. from sklearn.datasets import load_iris, load_breast_cancer
  3. from knn_kdtree import KNearestNeighborKDTree
  4. from sklearn.model_selection import train_test_split
  5. from sklearn.metrics import classification_report, accuracy_score
  6. import matplotlib.pyplot as plt
  7. from sklearn.model_selection import StratifiedKFold
  8. from sklearn.preprocessing import StandardScaler
  9. iris = load_iris()
  10. X, y = iris.data, iris.target
  11. # bc_data = load_breast_cancer()
  12. # X, y = bc_data.data, bc_data.target
  13. X = StandardScaler().fit_transform(X)
  14. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
  15. k_neighbors = np.arange(3, 21)
  16. # acc = []
  17. # for k in k_neighbors:
  18. # knn = KNearestNeighborKDTree(k=k)
  19. # knn.fit(X_train, y_train)
  20. # y_test_hat = knn.predict(X_test)
  21. # # print(classification_report(y_test, y_test_hat))
  22. # acc.append(accuracy_score(y_test, y_test_hat))
  23. accuracy_scores = [] # 存储每个alpha阈值下的交叉验证均分
  24. for k in k_neighbors:
  25. scores = []
  26. k_fold = StratifiedKFold(n_splits=10).split(X, y)
  27. for train_idx, test_idx in k_fold:
  28. # knn = KNearestNeighborKDTree(k=k, p="cos")
  29. knn = KNearestNeighborKDTree(k=k)
  30. knn.fit(X[train_idx], y[train_idx])
  31. y_test_pred = knn.predict(X[test_idx])
  32. scores.append(accuracy_score(y[test_idx], y_test_pred))
  33. del knn
  34. print("k = %d:" % k, np.mean(scores))
  35. accuracy_scores.append(np.mean(scores))
  36. plt.figure(figsize=(7, 5))
  37. plt.plot(k_neighbors, accuracy_scores, "ko-", lw=1)
  38. plt.grid(ls=":")
  39. plt.xlabel("K Neighbors", fontdict={"fontsize": 12})
  40. plt.ylabel("Accuracy Scores", fontdict={"fontsize": 12})
  41. plt.title("KNN(KDTree) Testing Scores under different K Neighbors", fontdict={"fontsize": 14})
  42. plt.show()
  43. # knn = KNearestNeighborKDTree(k=3)
  44. # knn.fit(X_train, y_train)
  45. # knn.draw_kd_tree()

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/124255
推荐阅读
相关标签
  

闽ICP备14008679号