# Page residue from the source website: "赞" (like) / "踩" (dislike) buttons.
# Decision tree
import operator
import os
import sys
import time
from collections import Counter
from math import log

import cx_Oracle
import numpy as np
import pandas as pd
import tensorflow as tf
# Module-level Oracle connection and cursor; the data-loading functions in
# this file run their queries through ``cursor``.
# NOTE(review): the user, password and DSN are hard-coded in the connect
# string — consider moving them to configuration; confirm with the owner.
conn=cx_Oracle.connect('doctor/admin@localhost:1521/tszr')
cursor=conn.cursor()
# Fetch the dataset
def _fetch_feature_rows(table, surgery, surgeryChest):
    """Return all (feature1..feature5, trainLable) rows of ``table`` for a department.

    ``table`` is interpolated into the statement because identifiers cannot
    be bound; it must therefore only ever be a trusted constant. The two
    filter values are passed as positional bind variables so that quotes in
    them cannot change the statement.
    """
    # NOTE(review): bind values are compared as VARCHAR2; if the filter
    # columns are fixed-width CHAR this differs from a literal comparison —
    # confirm the column types.
    sql = (
        "select feature1,feature2,feature3,feature4,feature5,trainLable "
        "from %s where surgery=:1 and surgeryChest=:2" % table
    )
    cursor.execute(sql, [surgery, surgeryChest])
    return cursor.fetchall()


def getdata(surgery, surgeryChest):
    """Load the training rows from table ``menzhenZ``.

    Args:
        surgery: value matched against the ``surgery`` column.
        surgeryChest: value matched against the ``surgeryChest`` column.

    Returns:
        A tuple ``(dataset, lables)``. ``dataset`` is a list with one
        6-item list per row (five feature columns followed by the
        ``trainLable`` column). ``lables`` is the fixed list of the five
        feature names, in column order.
    """
    rows = _fetch_feature_rows("menzhenZ", surgery, surgeryChest)
    dataset = [list(row) for row in rows]
    lables = ["呼吸急促", "持续性脉搏加快", "畏寒", "血压降低", "咳血"]
    return dataset, lables


def gettestdata(surgery, surgeryChest):
    """Load the evaluation rows from table ``testZ``.

    Args:
        surgery: value matched against the ``surgery`` column.
        surgeryChest: value matched against the ``surgeryChest`` column.

    Returns:
        A tuple ``(testdataset, testlables)``. ``testdataset`` holds one
        5-item feature list per row; ``testlables`` holds the matching
        ``trainLable`` value of each row, in the same order.
    """
    rows = _fetch_feature_rows("testZ", surgery, surgeryChest)
    testdataset = [list(row[:5]) for row in rows]
    testlables = [row[5] for row in rows]
    return testdataset, testlables


# Compute the entropy
def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in ``dataSet``.

    Args:
        dataSet: sequence of rows; the last item of every row is taken as
            its class label.

    Returns:
        The entropy as a float; ``0.0`` for an empty ``dataSet``.
    """
    numEntries = len(dataSet)
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


# Split the dataset on a given feature
def splitDataSet(dataSet, axis, value):
    """Return the rows whose column ``axis`` equals ``value``, minus that column.

    Args:
        dataSet: sequence of rows (lists, or any sequence supporting
            slicing and ``+``).
        axis: index of the column to test and remove.
        value: value the column must equal for a row to be kept.

    Returns:
        A new list of new rows; the input rows are not modified.
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]


# Choose the best feature
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature column with the largest information gain.

    Every column except the last (the class label) is a candidate. For each
    candidate the data set is partitioned by the column's distinct values
    and the weighted entropy of the partitions is subtracted from the
    entropy of the whole set.

    Args:
        dataSet: non-empty sequence of rows whose last item is the label.

    Returns:
        The winning column index, or ``-1`` when no column yields a
        strictly positive gain (including when there are no feature
        columns at all).
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    totalRows = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / totalRows
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        # Strict comparison: on ties the earliest column wins.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


# Majority vote
def majorityCnt(classList):
    """Return the most frequent item of ``classList``.

    When several items share the highest count, the one that appears
    first in ``classList`` is returned.

    Args:
        classList: non-empty sequence of hashable class labels.

    Raises:
        IndexError: if ``classList`` is empty.
    """
    return Counter(classList).most_common(1)[0][0]


# Build the decision tree
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        dataSet: non-empty list of rows; the last item of each row is the
            class label, the preceding items are feature values.
        labels: feature names, one per feature column, in column order.
            The caller's list is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # All rows share one class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: fall back to the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no feature separates the classes any further. Indexing with
    # it would select the class-label column as if it were a feature, so
    # stop here with the majority class instead.
    if bestFeat < 0:
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # New list, so the caller's ``labels`` stays intact.
    remainingLabels = [name for name in labels if name != bestFeatLabel]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), remainingLabels[:]
        )
    return myTree


# Classify with the decision tree model
def classify(inputTree, featLabels, testVec):
    """Walk ``inputTree`` with the feature values in ``testVec``; return the label.

    Args:
        inputTree: nested-dict tree of the form
            ``{feature_name: {feature_value: subtree_or_label}}``, or a
            bare class label when the whole tree is a single leaf.
        featLabels: feature names; position ``i`` names ``testVec[i]``.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that is reached.

    Raises:
        ValueError: if a feature name in the tree is not in ``featLabels``.
        KeyError: if the sample has a feature value the tree has no branch for.
    """
    # A tree that is not a dict is already a leaf label.
    if not isinstance(inputTree, dict):
        return inputTree
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        return classify(valueOfFeat, featLabels, testVec)
    return valueOfFeat


# Train and evaluate the model
def datingClassTest():
    """Train a tree on the training table, score it on the test table, print results.

    Loads both data sets for the hard-coded department pair, builds the
    tree, classifies every test row, and prints each prediction followed by
    the error rate, the accuracy and the elapsed time. Returns ``None``.
    """
    dataSet, labels = getdata("外科", "胸外科")
    testdataset, testlables = gettestdata("外科", "胸外科")
    totalTests = len(testdataset)
    if totalTests == 0:
        # Without this guard the rate computation divides by zero.
        print("no test samples returned; nothing to evaluate")
        return

    # The final message reports training *and* prediction time, so the
    # timer has to start before the tree is built.
    start = time.time()
    myTree = createTree(dataSet, labels)
    errorCount = 0.0
    for testVec, expected in zip(testdataset, testlables):
        classifierResult = classify(myTree, labels, testVec)
        print("the classifier came back with: %s, the real answer is: %s" % (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0
    end = time.time()

    errorRate = errorCount / float(totalTests)
    print("错误率: %.2f%%" % (errorRate * 100))
    print("准确率: %.2f%%" % ((1.0 - errorRate) * 100))
    print("训练和预测一共耗时: %.2f 秒" % (end - start))
# Run the train-and-evaluate routine; as a module-level call it executes
# whenever this file is run or imported.
datingClassTest()
# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.