赞
踩
任务描述
本关任务:补充python代码,完成DecisionTree类中的fit和predict函数。
相关知识
为了完成本关任务,你需要掌握:
ID3算法构造决策树的流程
如何使用构造好的决策树进行预测
import numpy as np
# 计算熵
def calcInfoEntropy(label):
    '''Return the Shannon entropy (base 2) of a label sequence.

    input:
        label (ndarray or sequence): sample labels
    output:
        InfoEntropy (float): entropy in bits
    '''
    # One pass over the data instead of re-scanning the whole label array
    # for every distinct value (O(n) vs the original O(k*n)).
    _, counts = np.unique(label, return_counts=True)
    p = counts / len(label)
    # H(D) = -sum(p * log2(p)); every p > 0 because counts only contains
    # labels that were actually observed, so log2 is always defined.
    return float(-np.sum(p * np.log2(p)))
#计算条件熵
def calcHDA(feature, label, index, value):
    '''Weighted conditional-entropy term H(D|A=value) for one feature value.

    input:
        feature (ndarray): sample features
        label (ndarray): sample labels
        index (int): column index of the feature under consideration
        value (int): the feature value that defines the subset
    output:
        HDA (float): P(A=value) * entropy(labels of the matching subset)
    '''
    # Labels of every sample whose chosen feature column equals value.
    matched = [label[row] for row in range(len(feature))
               if feature[row][index] == value]
    # Probability that a sample falls into this subset.
    weight = len(matched) / len(feature)
    # Weighted entropy of the subset's labels.
    return weight * calcInfoEntropy(matched)
#计算信息增益
def calcInfoGain(feature, label, index):
    '''Information gain obtained by splitting on one feature column.

    input:
        feature (ndarray): sample features
        label (ndarray): sample labels
        index (int): column index of the candidate split feature
    output:
        InfoGain (float): H(D) - H(D|A) for the chosen column
    '''
    arr = np.array(feature)
    # Distinct values taken by the candidate feature column.
    values = set(arr[:, index])
    # Conditional entropy: sum of the weighted entropy of each subset.
    cond_entropy = sum(calcHDA(feature, label, index, v) for v in values)
    # Gain = entropy of the whole set minus the conditional entropy.
    return calcInfoEntropy(label) - cond_entropy
# 获得信息增益最高的特征
def getBestFeature(feature, label):
    '''Return the column index with the highest information gain.

    input:
        feature (ndarray): sample features
        label (ndarray): sample labels
    output:
        best_feature (int): index of the best split column (0 when no
                            column achieves a strictly positive gain)
    '''
    #*********Begin*********#
    best_feature = 0
    best_gain = 0
    # A plain loop (rather than max(..., key=...)) keeps the original
    # tie-breaking: only a strictly greater gain replaces the incumbent.
    for col in range(len(feature[0])):
        gain = calcInfoGain(feature, label, col)
        if gain > best_gain:
            best_gain, best_feature = gain, col
    #*********End*********#
    return best_feature
#创建决策树
def createTree(feature, label):
    '''Recursively build an ID3 decision tree.

    input:
        feature (ndarray): training sample features
        label (ndarray): training sample labels
    output:
        tree (dict): nested dict {feature_index: {feature_value: subtree}};
                     a leaf is a plain label value
    '''
    #*********Begin*********#
    # Pure node: every sample carries the same label -> leaf.
    if len(set(label)) == 1:
        return label[0]
    # No useful split left (one column remaining, or all rows identical):
    # fall back to a majority vote over the labels.
    if len(feature[0]) == 1 or len(np.unique(feature, axis=0)) == 1:
        tally = {}
        for l in label:
            tally[l] = tally.get(l, 0) + 1
        # max() returns the first-seen label on ties (dict insertion order),
        # matching the original strictly-greater vote loop.
        return max(tally, key=tally.get)
    # Split on the column with the highest information gain.
    split_col = getBestFeature(feature, label)
    branches = {}
    for v in set(np.array(feature)[:, split_col]):
        # Rows whose split column equals this value form one branch.
        rows = [i for i in range(len(feature)) if feature[i][split_col] == v]
        sub_feature = [feature[i] for i in rows]
        sub_label = [label[i] for i in rows]
        # Recurse on the subset to build the branch's subtree.
        branches[v] = createTree(sub_feature, sub_label)
    #*********End*********#
    return {split_col: branches}
#决策树分类
def dt_clf(train_feature, train_label, test_feature):
    '''Train an ID3 decision tree and classify the test samples.

    input:
        train_feature (ndarray): training sample features
        train_label (ndarray): training sample labels
        test_feature (ndarray): test sample features
    output:
        predict (ndarray): predicted label for each test sample
    raises:
        KeyError: if a test sample carries a feature value that was never
                  seen at the corresponding split during training
                  (unchanged from the original behavior).
    '''
    #*********Begin*********#
    tree = createTree(train_feature, train_label)

    def _classify(node, sample):
        # Internal nodes are {feature_index: {feature_value: subtree}};
        # anything that is not a dict is a leaf label.  The original's
        # `else: return t_value` branch was unreachable (an internal
        # node's value is always the branch dict), so the walk is now a
        # simple iterative descent.
        while isinstance(node, dict):
            # Each internal node keys on exactly one feature index.
            index = next(iter(node))
            node = node[index][sample[index]]
        return node

    predict = np.array([_classify(tree, sample) for sample in test_feature])
    #*********End*********#
    return predict
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。