You can study the theory on your own; the code below implements the decision tree's logic from scratch rather than using sklearn.
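The split criterion relies on the standard definitions of Shannon entropy and information gain, which the _entropy and _information_gain methods below implement:

H(y) = -\sum_{i} p_i \log_2 p_i

\mathrm{IG} = H(y_{\mathrm{parent}}) - \frac{n_L}{n} H(y_{\mathrm{left}}) - \frac{n_R}{n} H(y_{\mathrm{right}})

where p_i is the proportion of class i and n_L, n_R are the sizes of the left and right children. For example, for y = [0, 0, 1, 1] both classes have probability 0.5, so H(y) = -(0.5 log2 0.5 + 0.5 log2 0.5) = 1 bit.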
import numpy as np
import pandas as pd
from collections import Counter


class DecisionTree:
    class Node:
        def __init__(self, feature_index=None, threshold=None, value=None, left=None, right=None):
            self.feature_index = feature_index  # index of the feature used for the split
            self.threshold = threshold          # split threshold
            self.value = value                  # predicted label (leaf nodes only)
            self.left = left                    # left subtree
            self.right = right                  # right subtree

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth                  # maximum tree depth
        self.min_samples_split = min_samples_split  # minimum samples required to split
        self.root = None                            # root node

    def fit(self, X, y):
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # stopping conditions: max depth reached, too few samples, or a pure node
        if depth == self.max_depth or n_samples < self.min_samples_split or n_classes == 1:
            return self.Node(value=self._most_common_label(y))

        best_feature_index, best_threshold = self._find_best_split(X, y)

        # no valid split found: also stop and make a leaf
        if best_feature_index is None or best_threshold is None:
            return self.Node(value=self._most_common_label(y))

        left_indices = X[:, best_feature_index] < best_threshold
        right_indices = ~left_indices

        # recursively build the subtrees
        left_branch = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_branch = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return self.Node(feature_index=best_feature_index, threshold=best_threshold,
                         left=left_branch, right=right_branch)

    def _find_best_split(self, X, y):
        n_samples, n_features = X.shape
        best_info_gain = -1
        best_feature_index = None
        best_threshold = None

        entropy_parent = self._entropy(y)  # entropy of the parent node

        for feature_index in range(n_features):
            unique_values = np.unique(X[:, feature_index])

            for threshold in unique_values:
                left_indices = X[:, feature_index] < threshold
                right_indices = ~left_indices

                # skip splits that leave one side empty; otherwise the recursion
                # would later try to label an empty node and crash
                if not left_indices.any() or not right_indices.any():
                    continue

                # information gain of this candidate split
                info_gain = self._information_gain(entropy_parent, y[left_indices], y[right_indices])

                # keep the split with the largest information gain
                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _entropy(self, y):  # Shannon entropy of a label array
        counter = Counter(y)
        probabilities = [count / len(y) for count in counter.values()]
        return -sum(p * np.log2(p) for p in probabilities)

    def _information_gain(self, entropy_parent, y_left, y_right):  # information gain of a split
        n_total = len(y_left) + len(y_right)
        p_left, p_right = len(y_left) / n_total, len(y_right) / n_total
        return entropy_parent - (p_left * self._entropy(y_left) + p_right * self._entropy(y_right))

    def _most_common_label(self, y):  # majority label, used for leaf predictions
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        return [self._traverse_tree(x, self.root) for x in X]

    def _traverse_tree(self, x, node):  # walk the tree to classify one sample
        if node.value is not None:
            return node.value
        if x[node.feature_index] < node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

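As an aside, a small hypothetical helper (not part of the original code) can print the learned splits for inspection; it only assumes the Node attributes defined above:

def print_tree(node, depth=0):
    # Hypothetical helper, not in the original post: pretty-print the tree.
    indent = "  " * depth
    if node.value is not None:
        print(f"{indent}leaf: predict {node.value}")
        return
    print(f"{indent}feature[{node.feature_index}] < {node.threshold}?")
    print_tree(node.left, depth + 1)   # samples where the condition holds
    print_tree(node.right, depth + 1)  # samples where it does not

After fitting, print_tree(tree.root) walks the tree recursively, mirroring _traverse_tree.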
data = pd.read_excel("C:/Users/wxc/Desktop/xuexi/python/pythonProject/机器学习/决策树/train.xlsx")
x_train = np.array(data.iloc[:, 1:5])
y_train = np.array(data.iloc[:, 6])

tree = DecisionTree(max_depth=4, min_samples_split=1)
tree.fit(x_train, y_train)

data1 = pd.read_excel("C:/Users/wxc/Desktop/xuexi/python/pythonProject/机器学习/决策树/test.xlsx", header=None)  # new sample features
x_test = np.array(data1.iloc[:, 1:5])
y_test = np.array(data1.iloc[:, 6])

predictions = tree.predict(x_test)
print("Predictions:", predictions)

correct = sum(1 for i in range(len(y_test)) if y_test[i] == predictions[i])
print("Accuracy:", correct / len(y_test))
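The Excel paths above are specific to the author's machine. As a minimal sketch for trying the class without those files, a small hand-made dataset (hypothetical values) exercises the same code path:

# Smoke test on a hand-made dataset (hypothetical values):
# class 0 clusters around small feature values, class 1 around large ones.
X_demo = np.array([[1.0, 2.0], [1.5, 1.8], [1.2, 2.2],
                   [5.0, 6.0], [5.5, 5.8], [5.2, 6.2]])
y_demo = np.array([0, 0, 0, 1, 1, 1])

demo_tree = DecisionTree(max_depth=3)
demo_tree.fit(X_demo, y_demo)
print(demo_tree.predict(np.array([[1.1, 2.1], [5.4, 6.1]])))  # expected: [0, 1]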