
头歌 Machine Learning: Decision Trees (Answer Key)


Contents

Level 1: What Is a Decision Tree

Level 2: Information Entropy and Information Gain

Level 3: Building a Decision Tree with the ID3 Algorithm

Level 4: Information Gain Ratio

Level 5: Gini Index

Level 6: Pre-pruning and Post-pruning

Level 7: Iris Recognition


Level 1: What Is a Decision Tree

  • 1. Which of the following statements are correct? (AB)

    A. Training a decision tree is the same thing as constructing the tree.

    B. The ID3 algorithm builds the tree according to information gain.

    C. The C4.5 algorithm builds the tree according to the Gini index.

    D. Decision tree models are not very interpretable.

    (C is false because C4.5 splits on the information gain ratio; the Gini index is used by CART. D is false because decision trees are among the most interpretable models.)

  • 2. Which of the following statements is incorrect? (B)

    A. Starting from the root node and walking step by step to a leaf node according to the feature values is how a decision tree makes a decision.
    B. A decision tree can only be a binary tree.
    C. The feature at the root node is the optimal feature.
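
The splitting criteria tested above all build on the same entropy formula used in Levels 2 through 6. As a quick sanity check, here is a minimal sketch; the `entropy` helper and the toy label lists are made up for illustration and are not part of any 头歌 test case.

import numpy as np

def entropy(labels):
    # H(D) = -sum over distinct labels of p * log2(p)
    values, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return float(-np.sum(p * np.log2(p)))

print(entropy([0, 0, 1, 1]))   # 1.0   -> maximally uncertain
print(entropy([0, 0, 0, 1]))   # ~0.81 -> less uncertain
print(entropy([0, 0, 0, 0]))   # -0.0 (i.e. zero) -> a pure node, nothing left to split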

Level 2: Information Entropy and Information Gain

import numpy as np

def calcInfoGain(feature, label, index):
    '''
    Compute the information gain of one feature column.
    :param feature: the feature array from the test-case dict, type ndarray
    :param label: the label array from the test-case dict, type ndarray
    :param index: the column index into feature; index=0 means the first feature is used to compute the information gain
    :return: information gain, type float
    '''
    #*********** Begin ***********#
    # Entropy H(D) of a label array
    def total_cal(label):
        label_set = set(label)
        result = 0
        for i in label_set:
            p = list(label).count(i) / len(label)
            result -= p * np.log2(p)
        return result

    sub_entropies = []   # entropy of each subset split off by a feature value
    weights = []         # fraction of samples falling into each subset
    for value in set(feature[:, index]):
        sub_label = []
        for i in range(len(feature)):
            if feature[i][index] == value:
                sub_label.append(label[i])
        sub_entropies.append(total_cal(sub_label))
        weights.append(len(sub_label) / len(label))
    # information gain = H(D) - H(D|A): entropy minus the weighted subset entropies
    res = total_cal(label) - sum(w * e for w, e in zip(weights, sub_entropies))
    return res
    #*********** End *************#
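
The grader calls calcInfoGain with its own data, but a small self-made example can confirm the behaviour. The 4-sample array below is an assumption for illustration only: column 0 separates the two classes perfectly, while column 1 carries no information.

feature = np.array([[0, 1],
                    [0, 0],
                    [1, 1],
                    [1, 0]])
label = np.array([0, 0, 1, 1])
print(calcInfoGain(feature, label, 0))   # 1.0 -> splitting on column 0 removes all uncertainty
print(calcInfoGain(feature, label, 1))   # 0.0 -> column 1 tells us nothing about the label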

Level 3: Building a Decision Tree with the ID3 Algorithm

import numpy as np

class DecisionTree(object):
    def __init__(self):
        # the decision tree model, stored as nested dicts
        self.tree = {}

    def calcInfoGain(self, feature, label, index):
        '''
        Compute the information gain of one feature column.
        :param feature: the feature array from the test-case dict, type ndarray
        :param label: the label array from the test-case dict, type ndarray
        :param index: the column index into feature; index=0 means the first feature is used to compute the information gain
        :return: information gain, type float
        '''
        # Compute the information entropy of a label array
        def calcInfoEntropy(label):
            '''
            :param label: labels of the dataset, type ndarray
            :return: information entropy, type float
            '''
            label_set = set(label)
            result = 0
            for l in label_set:
                count = 0
                for j in range(len(label)):
                    if label[j] == l:
                        count += 1
                # probability of this label in the dataset
                p = count / len(label)
                # accumulate the entropy
                result -= p * np.log2(p)
            return result

        # Compute one term of the conditional entropy H(D|A)
        def calcHDA(feature, label, index, value):
            '''
            :param feature: features of the dataset, type ndarray
            :param label: labels of the dataset, type ndarray
            :param index: index of the feature column to use, type int
            :param value: the value of column index to condition on, type int
            :return: p(A=value) * H(D | A=value), type float
            '''
            count = 0
            # sub_feature and sub_label form the subset obtained by fixing column index to value
            sub_feature = []
            sub_label = []
            for i in range(len(feature)):
                if feature[i][index] == value:
                    count += 1
                    sub_feature.append(feature[i])
                    sub_label.append(label[i])
            pHA = count / len(feature)
            e = calcInfoEntropy(sub_label)
            return pHA * e

        base_e = calcInfoEntropy(label)
        f = np.array(feature)
        # all distinct values of the chosen feature column
        f_set = set(f[:, index])
        sum_HDA = 0
        # conditional entropy
        for value in f_set:
            sum_HDA += calcHDA(feature, label, index, value)
        # information gain
        return base_e - sum_HDA

    # Return the index of the feature with the highest information gain
    def getBestFeature(self, feature, label):
        max_infogain = 0
        best_feature = 0
        for i in range(len(feature[0])):
            infogain = self.calcInfoGain(feature, label, i)
            if infogain > max_infogain:
                max_infogain = infogain
                best_feature = i
        return best_feature

    def createTree(self, feature, label):
        # all samples share the same label, so there is no need to split further
        if len(set(label)) == 1:
            return label[0]
        # only one feature left, or all samples identical: return the majority label
        if len(feature[0]) == 1 or len(np.unique(feature, axis=0)) == 1:
            vote = {}
            for l in label:
                if l in vote.keys():
                    vote[l] += 1
                else:
                    vote[l] = 1
            max_count = 0
            vote_label = None
            for k, v in vote.items():
                if v > max_count:
                    max_count = v
                    vote_label = k
            return vote_label
        # pick the feature index with the highest information gain
        best_feature = self.getBestFeature(feature, label)
        tree = {best_feature: {}}
        f = np.array(feature)
        # all values taken by best_feature
        f_set = set(f[:, best_feature])
        # build the sub-dataset sub_feature, sub_label for each feature value
        for v in f_set:
            sub_feature = []
            sub_label = []
            for i in range(len(feature)):
                if feature[i][best_feature] == v:
                    sub_feature.append(feature[i])
                    sub_label.append(label[i])
            # build the subtree recursively
            tree[best_feature][v] = self.createTree(sub_feature, sub_label)
        return tree

    def fit(self, feature, label):
        '''
        :param feature: training data, type ndarray
        :param label: training labels, type ndarray
        :return: None
        '''
        #************* Begin ************#
        self.tree = self.createTree(feature, label)
        #************* End **************#

    def predict(self, feature):
        '''
        :param feature: test data, type ndarray
        :return: predictions, e.g. np.array([0, 1, 2, 2, 1, 0])
        '''
        #************* Begin ************#
        result = []

        # classify a single sample by walking down the tree
        def classify(tree, feature):
            if not isinstance(tree, dict):
                return tree
            t_index, t_value = list(tree.items())[0]
            f_value = feature[t_index]
            if isinstance(t_value, dict):
                classLabel = classify(tree[t_index][f_value], feature)
                return classLabel
            else:
                return t_value

        for f in feature:
            result.append(classify(self.tree, f))
        return np.array(result)
        #************* End **************#
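
A minimal smoke test of the class, again on hypothetical data rather than the platform's test cases, might look like this; the learned tree maps feature 0 straight to the label.

feature = np.array([[0, 1],
                    [0, 0],
                    [1, 1],
                    [1, 0]])
label = np.array([0, 0, 1, 1])
dt = DecisionTree()
dt.fit(feature, label)
print(dt.tree)               # {0: {0: 0, 1: 1}} (keys may display as numpy scalars)
print(dt.predict(feature))   # [0 0 1 1]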

Level 4: Information Gain Ratio

import numpy as np

def calcInfoGain(feature, label, index):
    '''
    Compute the information gain of one feature column.
    :param feature: the feature array from the test-case dict, type ndarray
    :param label: the label array from the test-case dict, type ndarray
    :param index: the column index into feature; index=0 means the first feature is used to compute the information gain
    :return: information gain, type float
    '''
    # Compute the information entropy of a label array
    def calcInfoEntropy(label):
        '''
        :param label: labels of the dataset, type ndarray
        :return: information entropy, type float
        '''
        label_set = set(label)
        result = 0
        for l in label_set:
            count = 0
            for j in range(len(label)):
                if label[j] == l:
                    count += 1
            # probability of this label in the dataset
            p = count / len(label)
            # accumulate the entropy
            result -= p * np.log2(p)
        return result

    # Compute one term of the conditional entropy H(D|A)
    def calcHDA(feature, label, index, value):
        '''
        :param feature: features of the dataset, type ndarray
        :param label: labels of the dataset, type ndarray
        :param index: index of the feature column to use, type int
        :param value: the value of column index to condition on, type int
        :return: p(A=value) * H(D | A=value), type float
        '''
        count = 0
        # sub_label is the label subset obtained by fixing column index to value
        sub_label = []
        for i in range(len(feature)):
            if feature[i][index] == value:
                count += 1
                sub_label.append(label[i])
        pHA = count / len(feature)
        e = calcInfoEntropy(sub_label)
        return pHA * e

    base_e = calcInfoEntropy(label)
    f = np.array(feature)
    # all distinct values of the chosen feature column
    f_set = set(f[:, index])
    sum_HDA = 0
    # conditional entropy
    for value in f_set:
        sum_HDA += calcHDA(feature, label, index, value)
    # information gain
    return base_e - sum_HDA

def calcInfoGainRatio(feature, label, index):
    '''
    Compute the information gain ratio of one feature column.
    :param feature: the feature array from the test-case dict, type ndarray
    :param label: the label array from the test-case dict, type ndarray
    :param index: the column index into feature; index=0 means the first feature is used
    :return: information gain ratio, type float
    '''
    #********* Begin *********#
    info_gain = calcInfoGain(feature, label, index)
    unique_value = list(set(feature[:, index]))
    # IV (intrinsic value) is the entropy of the feature value distribution itself
    IV = 0
    for value in unique_value:
        len_v = np.sum(feature[:, index] == value)
        IV -= (len_v / len(feature)) * np.log2(len_v / len(feature))
    # gain ratio = information gain / intrinsic value
    return info_gain / IV
    #********* End *********#
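
To see why the gain ratio exists at all, compare an ID-like column with many distinct values against a clean binary split. The toy arrays below are illustrative assumptions: both columns have an information gain of 1.0, but the intrinsic value IV penalises the many-valued column.

feature = np.array([[0, 0],
                    [1, 0],
                    [2, 1],
                    [3, 1]])
label = np.array([0, 0, 1, 1])
print(calcInfoGainRatio(feature, label, 0))   # 0.5 -> gain 1.0 divided by IV 2.0
print(calcInfoGainRatio(feature, label, 1))   # 1.0 -> gain 1.0 divided by IV 1.0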

Level 5: Gini Index

import numpy as np

def calcGini(feature, label, index):
    '''
    Compute the Gini index of splitting on one feature column.
    :param feature: the feature array from the test-case dict, type ndarray
    :param label: the label array from the test-case dict, type ndarray
    :param index: the column index into feature; index=0 means the first feature is used
    :return: Gini index, type float
    '''
    #********* Begin *********#
    # Gini impurity of a label array: 1 - sum of squared class probabilities
    def _gini(label):
        unique_label = list(set(label))
        gini = 1
        for l in unique_label:
            p = np.sum(label == l) / len(label)
            gini -= p ** 2
        return gini

    unique_value = list(set(feature[:, index]))
    gini = 0
    # weighted sum of the Gini impurity of each subset produced by the split
    for value in unique_value:
        len_v = np.sum(feature[:, index] == value)
        gini += (len_v / len(feature)) * _gini(label[feature[:, index] == value])
    return gini
    #********* End *********#
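
As with the earlier levels, a hypothetical toy call (not part of the graded tests) shows how to read the result: lower is better, since calcGini returns the weighted Gini impurity left after the split.

feature = np.array([[0, 1],
                    [0, 0],
                    [1, 1],
                    [1, 0]])
label = np.array([0, 0, 1, 1])
print(calcGini(feature, label, 0))   # 0.0 -> a perfect split, both subsets are pure
print(calcGini(feature, label, 1))   # 0.5 -> the split leaves the classes fully mixed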


Level 6: Pre-pruning and Post-pruning

import numpy as np
from copy import deepcopy

class DecisionTree(object):
    def __init__(self):
        # the decision tree model, stored as nested dicts
        self.tree = {}

    def calcInfoGain(self, feature, label, index):
        '''
        Compute the information gain of one feature column.
        :param feature: the feature array from the test-case dict, type ndarray
        :param label: the label array from the test-case dict, type ndarray
        :param index: the column index into feature; index=0 means the first feature is used to compute the information gain
        :return: information gain, type float
        '''
        # Compute the information entropy
        def calcInfoEntropy(feature, label):
            '''
            :param feature: features of the dataset, type ndarray
            :param label: labels of the dataset, type ndarray
            :return: information entropy, type float
            '''
            label_set = set(label)
            result = 0
            for l in label_set:
                count = 0
                for j in range(len(label)):
                    if label[j] == l:
                        count += 1
                # probability of this label in the dataset
                p = count / len(label)
                # accumulate the entropy
                result -= p * np.log2(p)
            return result

        # Compute one term of the conditional entropy H(D|A)
        def calcHDA(feature, label, index, value):
            '''
            :param feature: features of the dataset, type ndarray
            :param label: labels of the dataset, type ndarray
            :param index: index of the feature column to use, type int
            :param value: the value of column index to condition on, type int
            :return: p(A=value) * H(D | A=value), type float
            '''
            count = 0
            # sub_feature and sub_label form the subset obtained by fixing column index to value
            sub_feature = []
            sub_label = []
            for i in range(len(feature)):
                if feature[i][index] == value:
                    count += 1
                    sub_feature.append(feature[i])
                    sub_label.append(label[i])
            pHA = count / len(feature)
            e = calcInfoEntropy(sub_feature, sub_label)
            return pHA * e

        base_e = calcInfoEntropy(feature, label)
        f = np.array(feature)
        # all distinct values of the chosen feature column
        f_set = set(f[:, index])
        sum_HDA = 0
        # conditional entropy
        for value in f_set:
            sum_HDA += calcHDA(feature, label, index, value)
        # information gain
        return base_e - sum_HDA

    # Return the index of the feature with the highest information gain
    def getBestFeature(self, feature, label):
        max_infogain = 0
        best_feature = 0
        for i in range(len(feature[0])):
            infogain = self.calcInfoGain(feature, label, i)
            if infogain > max_infogain:
                max_infogain = infogain
                best_feature = i
        return best_feature

    # Accuracy of a given tree on the validation set
    def calc_acc_val(self, the_tree, val_feature, val_label):
        result = []

        def classify(tree, feature):
            if not isinstance(tree, dict):
                return tree
            t_index, t_value = list(tree.items())[0]
            f_value = feature[t_index]
            if isinstance(t_value, dict):
                classLabel = classify(tree[t_index][f_value], feature)
                return classLabel
            else:
                return t_value

        for f in val_feature:
            result.append(classify(the_tree, f))
        result = np.array(result)
        return np.mean(result == val_label)

    def createTree(self, train_feature, train_label):
        # all samples share the same label, so there is no need to split further
        if len(set(train_label)) == 1:
            return train_label[0]
        # only one feature left, or all samples identical: return the majority label
        if len(train_feature[0]) == 1 or len(np.unique(train_feature, axis=0)) == 1:
            vote = {}
            for l in train_label:
                if l in vote.keys():
                    vote[l] += 1
                else:
                    vote[l] = 1
            max_count = 0
            vote_label = None
            for k, v in vote.items():
                if v > max_count:
                    max_count = v
                    vote_label = k
            return vote_label
        # pick the feature index with the highest information gain
        best_feature = self.getBestFeature(train_feature, train_label)
        tree = {best_feature: {}}
        f = np.array(train_feature)
        # all values taken by best_feature
        f_set = set(f[:, best_feature])
        # build the sub-dataset sub_feature, sub_label for each feature value
        for v in f_set:
            sub_feature = []
            sub_label = []
            for i in range(len(train_feature)):
                if train_feature[i][best_feature] == v:
                    sub_feature.append(train_feature[i])
                    sub_label.append(train_label[i])
            # build the subtree recursively
            tree[best_feature][v] = self.createTree(sub_feature, sub_label)
        return tree

    # Post-pruning
    def post_cut(self, val_feature, val_label):
        # count the non-leaf (internal) nodes
        def get_non_leaf_node_count(tree):
            non_leaf_node_path = []

            def dfs(tree, path, all_path):
                for k in tree.keys():
                    if isinstance(tree[k], dict):
                        path.append(k)
                        dfs(tree[k], path, all_path)
                        if len(path) > 0:
                            path.pop()
                    else:
                        all_path.append(path[:])

            dfs(tree, [], non_leaf_node_path)
            unique_non_leaf_node = []
            for path in non_leaf_node_path:
                isFind = False
                for p in unique_non_leaf_node:
                    if path == p:
                        isFind = True
                        break
                if not isFind:
                    unique_non_leaf_node.append(path)
            return len(unique_non_leaf_node)

        # find the deepest path from the root to a non-leaf node
        def get_the_most_deep_path(tree):
            non_leaf_node_path = []

            def dfs(tree, path, all_path):
                for k in tree.keys():
                    if isinstance(tree[k], dict):
                        path.append(k)
                        dfs(tree[k], path, all_path)
                        if len(path) > 0:
                            path.pop()
                    else:
                        all_path.append(path[:])

            dfs(tree, [], non_leaf_node_path)
            max_depth = 0
            result = None
            for path in non_leaf_node_path:
                if len(path) > max_depth:
                    max_depth = len(path)
                    result = path
            return result

        # prune: replace the subtree at the end of path with a single label
        def set_vote_label(tree, path, label):
            for i in range(len(path) - 1):
                tree = tree[path[i]]
            tree[path[len(path) - 1]] = label

        acc_before_cut = self.calc_acc_val(self.tree, val_feature, val_label)
        # consider every non-leaf node, deepest first
        for _ in range(get_non_leaf_node_count(self.tree)):
            path = get_the_most_deep_path(self.tree)
            # work on a copy of the tree
            tree = deepcopy(self.tree)
            step = deepcopy(tree)
            # walk along the path
            for k in path:
                step = step[k]
            # the label with the most votes among this node's leaves
            vote_label = sorted(step.items(), key=lambda item: item[1], reverse=True)[0][0]
            # prune on the copy first
            set_vote_label(tree, path, vote_label)
            acc_after_cut = self.calc_acc_val(tree, val_feature, val_label)
            # keep the cut only if accuracy on the validation set improves
            if acc_after_cut > acc_before_cut:
                set_vote_label(self.tree, path, vote_label)
                acc_before_cut = acc_after_cut

    def fit(self, train_feature, train_label, val_feature, val_label):
        '''
        :param train_feature: training data, type ndarray
        :param train_label: training labels, type ndarray
        :param val_feature: validation data, type ndarray
        :param val_label: validation labels, type ndarray
        :return: None
        '''
        #************* Begin ************#
        self.tree = self.createTree(train_feature, train_label)
        # post-pruning against the validation set
        self.post_cut(val_feature, val_label)
        #************* End **************#

    def predict(self, feature):
        '''
        :param feature: test data, type ndarray
        :return: predictions, e.g. np.array([0, 1, 2, 2, 1, 0])
        '''
        #************* Begin ************#
        result = []

        # classify a single sample by walking down the tree
        def classify(tree, feature):
            if not isinstance(tree, dict):
                return tree
            t_index, t_value = list(tree.items())[0]
            f_value = feature[t_index]
            if isinstance(t_value, dict):
                classLabel = classify(tree[t_index][f_value], feature)
                return classLabel
            else:
                return t_value

        for f in feature:
            result.append(classify(self.tree, f))
        return np.array(result)
        #************* End **************#
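
A hedged smoke test with a made-up train/validation split (the platform supplies the real one): the unpruned tree already classifies both validation samples correctly, so post_cut leaves it unchanged here.

train_feature = np.array([[0, 1], [0, 0], [1, 1], [1, 0]])
train_label = np.array([0, 0, 1, 1])
val_feature = np.array([[0, 0], [1, 1]])
val_label = np.array([0, 1])
dt = DecisionTree()
dt.fit(train_feature, train_label, val_feature, val_label)
print(dt.predict(val_feature))   # [0 1]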


Level 7: Iris Recognition

#********* Begin *********#
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# .values replaces the long-deprecated .as_matrix(), which was removed in pandas 1.0
train_df = pd.read_csv('./step7/train_data.csv').values
train_label = pd.read_csv('./step7/train_label.csv').values
test_df = pd.read_csv('./step7/test_data.csv').values

dt = DecisionTreeClassifier()
dt.fit(train_df, train_label)
result = dt.predict(test_df)

result = pd.DataFrame({'target': result})
result.to_csv('./step7/predict.csv', index=False)
#********* End *********#
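
The ./step7/*.csv files only exist on the 头歌 platform, so the snippet above cannot be run locally as-is. A rough local stand-in, assuming sklearn's bundled iris data rather than the platform's split, might look like this:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# load the 150-sample iris dataset and hold out 20% for testing
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))   # accuracy on the held-out 20%, typically well above 0.9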
