赞
踩
一种简单但广泛使用的分类器,它通过训练数据构建决策树,对未知的数据进行分类。决策树的每个内部节点表示在一个属性上的测试,每个分支代表该测试的一个输出,而每个树叶节点存放着一个类标号。
该模型数据为整条数据,目标值在特征值后一列。
信息熵:
def calEnt(dataSet):
    """Shannon entropy (base 2) of the class labels in the last column.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Samples in rows; the class label is assumed to be the last column.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the class distribution (0.0 for a pure set).
    """
    # value_counts(normalize=True) yields each class's relative frequency
    # directly, i.e. counts / n.
    probs = dataSet.iloc[:, -1].value_counts(normalize=True)
    return float((-probs * np.log2(probs)).sum())
信息增益:
def bestSplit(dataSet):
    """Choose the best column to split on by information gain.

    For every feature column (all but the last, which holds the label),
    compute the conditional entropy after splitting on that column and
    keep the column with the largest gain over the base entropy.

    Returns the column index, or -1 if no split gives a positive gain.
    """
    baseEnt = calEnt(dataSet)
    total = dataSet.shape[0]
    bestGain = 0
    bestAxis = -1
    for col in range(dataSet.shape[1] - 1):
        feature = dataSet.iloc[:, col]
        # Weighted entropy of the child nodes produced by this split.
        condEnt = 0.0
        for value in feature.value_counts().index:
            subset = dataSet[feature == value]
            condEnt += (subset.shape[0] / total) * calEnt(subset)
        gain = baseEnt - condEnt
        if gain > bestGain:
            bestGain = gain
            bestAxis = col
    return bestAxis
用到的函数如下:
calEnt(dataSet)#计算信息熵
bestSplit(dataSet)#根据信息增益最大的选择出最佳数据集切分的列
mySplit(dataSet,axis,value)#按照给定的列划分数据集
createTree(dataSet)#基于最大信息增益切分数据集,递归构建决策树
classify(inputTree, labels, testVec)#对一个测试实例进行分类
acc_classify(train,test)#对测试集进行预测,并返回预测后的结果
def calEnt(dataSet):
    """Shannon entropy (base 2) of the labels in the last column."""
    n = dataSet.shape[0]
    iset = dataSet.iloc[:, -1].value_counts()
    p = iset / n
    ent = (-p * np.log2(p)).sum()
    return ent


def bestSplit(dataSet):
    """Return the index of the feature column with the largest information
    gain, or -1 if no split beats zero gain.

    The last column is treated as the label and is never considered.
    """
    baseEnt = calEnt(dataSet)
    bestGain = 0
    axis = -1
    for i in range(dataSet.shape[1] - 1):
        levels = dataSet.iloc[:, i].value_counts().index
        ents = 0
        for j in levels:
            childSet = dataSet[dataSet.iloc[:, i] == j]
            # Weight each child's entropy by its share of the samples.
            ents += (childSet.shape[0] / dataSet.shape[0]) * calEnt(childSet)
        infoGain = baseEnt - ents
        if infoGain > bestGain:
            bestGain = infoGain
            axis = i
    return axis


def mySplit(dataSet, axis, value):
    """Rows where column `axis` equals `value`, with that column dropped."""
    col = dataSet.columns[axis]
    redataSet = dataSet.loc[dataSet[col] == value, :].drop(col, axis=1)
    return redataSet


def createTree(dataSet):
    """Recursively build an ID3 decision tree.

    Returns either a class label (leaf) or a nested dict
    ``{feature_name: {feature_value: subtree_or_label}}``.
    """
    featlist = list(dataSet.columns)
    classlist = dataSet.iloc[:, -1].value_counts()
    # Stop when the node is pure (majority count == node size) or only the
    # label column remains; return the majority class.
    # FIX: use .iloc[0] — `classlist[0]` is label-based and picks the count
    # of the class *named* 0 when labels are integers.
    if classlist.iloc[0] == dataSet.shape[0] or dataSet.shape[1] == 1:
        return classlist.index[0]
    axis = bestSplit(dataSet)
    bestfeat = featlist[axis]
    myTree = {bestfeat: {}}
    valuelist = set(dataSet.iloc[:, axis])
    for value in valuelist:
        myTree[bestfeat][value] = createTree(mySplit(dataSet, axis, value))
    return myTree


def classify(inputTree, labels, testVec):
    """Classify one instance against a tree from createTree().

    Parameters
    ----------
    inputTree : dict tree as produced by ``createTree``.
    labels : list of feature/column names, used to locate feature positions.
    testVec : sequence of feature values, positionally aligned with `labels`.

    Returns the predicted class label, or None when the instance's value for
    the split feature was never seen in training (previously this raised
    UnboundLocalError).
    """
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = labels.index(firstStr)
    classLabel = None
    for key in secondDict:
        if testVec[featIndex] == key:
            subtree = secondDict[key]
            if isinstance(subtree, dict):
                classLabel = classify(subtree, labels, testVec)
            else:
                classLabel = subtree
            break
    return classLabel


def acc_classify(train, test):
    """Train a tree on `train`, predict `test`, print the accuracy and
    return a copy of `test` with a trailing 'predict' column.

    The label is assumed to be the last column of both frames.  The caller's
    `test` frame is not modified.
    """
    inputTree = createTree(train)
    labels = list(train.columns)
    result = []
    for i in range(test.shape[0]):
        # Convert the row to a plain list so classify() can index it
        # positionally (Series positional [] access is deprecated).
        testVec = test.iloc[i, :-1].tolist()
        result.append(classify(inputTree, labels, testVec))
    test = test.copy()
    test['predict'] = result  # FIX: was misspelled 'perdict'
    acc = (test.iloc[:, -1] == test.iloc[:, -2]).mean()
    print(f'模型预测准确率为{acc}')
    return test
sklearn不能处理字符串内容,我们对于字符串数据转换成数值型数据:
处理函数如下:
# Feature columns: replace every string value by the integer position of its
# first occurrence (sklearn cannot consume raw strings).
Xtrain1 = train1.iloc[:, :-1]
for i in Xtrain1.columns:
    # Map each distinct value to its index in first-seen order.
    code_of = {v: k for k, v in enumerate(Xtrain1[i].unique().tolist())}
    Xtrain1[i] = Xtrain1[i].map(code_of)
# Label column: same encoding for the target values.
Ytrain1 = train1.iloc[:, -1]
labels = Ytrain1.unique().tolist()
Ytrain1 = Ytrain1.map({v: k for k, v in enumerate(labels)})
# Fit an sklearn decision tree on the dataset and render it via graphviz.
# NOTE(review): `dataSet` must already exist in scope (label in the last
# column) — presumably defined in an earlier part of this tutorial.
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz

# Features: already numeric, no string encoding needed.
Xtrain = dataSet.iloc[:,:-1]
# Label: encode each class value as its integer position.
Ytrain = dataSet.iloc[:,-1]
labels = Ytrain.unique().tolist()
Ytrain = Ytrain.apply(lambda x:labels.index(x))

# Fit the tree model.
clf = DecisionTreeClassifier()
clf = clf.fit(Xtrain, Ytrain)
tree.export_graphviz(clf)

# Re-export with feature/class names, fill colors and rounded boxes,
# then render the DOT source.
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=['no surfacing', 'flippers'], class_names=['fish', 'not fish'], filled=True, rounded=True, special_characters=True)
graphviz.Source(dot_data)
信息增益率:
基尼值:
def Gini(dataSet):
    """Gini impurity of the class labels in the last column.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Samples in rows; the class label is assumed to be the last column.

    Returns
    -------
    float
        ``1 - sum(p_k ** 2)`` over the class frequencies (0.0 for a pure set).
    """
    probs = dataSet.iloc[:, -1].value_counts() / dataSet.shape[0]
    return 1 - float(np.sum(np.power(probs, 2)))
基尼指数:
本文为借助B站UP主菊安酱的资料整理自用。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。