
Machine Learning with Python: Naive Bayes

Contents

1. Basic Concepts

2. Algorithm Principle

        2.1 Algorithm Example

3. Code Implementation


1. Basic Concepts

        Naive Bayes is a classification method based on Bayes' theorem together with the assumption of conditional independence among features. The usual strategy in statistical machine learning is to minimize the expected risk; in practice, learning approximates the expected risk with the empirical risk (possibly plus a regularization term). In the naive Bayes method, minimizing the expected risk is equivalent to maximizing the posterior probability.
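        Concretely, under 0-1 loss this equivalence yields the maximum a posteriori (MAP) decision rule; a standard formulation, written here in LaTeX:

% Under 0-1 loss, expected-risk minimization reduces to choosing
% the class with the largest posterior probability (MAP rule):
y = f(x) = \arg\max_{c_k} P\left(C = c_k \mid X = x\right)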

2. Algorithm Principle

        The naive Bayes classifier adopts the "attribute conditional independence assumption": each attribute is assumed to influence the classification result independently. For notational convenience, write P(C=c|X=x) as P(c|x). Under the attribute conditional independence assumption, Bayes' rule can be rewritten as

P(c \mid x) = \frac{P(c)\,P(x \mid c)}{P(x)} = \frac{P(c)}{P(x)} \prod_{i=1}^{d} P(x_i \mid c)

where d is the number of attributes and x_i is the value of x on the i-th attribute.

        Training a naive Bayes classifier amounts to estimating the class prior probability P(c) from the training set D and estimating the conditional probability P(xi|c) for each attribute.
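        A common way to estimate these quantities is maximum likelihood with Laplace smoothing; the smoothed estimates, in LaTeX, are:

% Laplace-smoothed estimates of the prior and the conditionals, where
% D_c: training examples of class c, N: number of classes,
% D_{c,x_i}: class-c examples whose i-th attribute equals x_i,
% N_i: number of possible values of the i-th attribute.
\hat{P}(c) = \frac{|D_c| + 1}{|D| + N},
\qquad
\hat{P}(x_i \mid c) = \frac{|D_{c,x_i}| + 1}{|D_c| + N_i}

        The trainNB0 function in Section 3 applies the same idea by initializing the word-count vectors with ones() and the denominators with 2.0.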

        2.1 Algorithm Example

        The following example illustrates the computation a Bayes classifier performs (a small numeric sketch follows the steps):

        Step 1: count the number of occurrences of each event.

        Step 2: compute the prior probabilities and the conditional probabilities.

        Step 3: classify the query sample.
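        As a minimal sketch of those three steps, here is a self-contained Python example on a tiny made-up two-attribute dataset (the data values are hypothetical, chosen only to make the arithmetic easy to follow):

from collections import Counter

# Hypothetical training data: (weather, temperature) -> label (1 = play, 0 = don't)
samples = [(('sunny', 'hot'), 0), (('sunny', 'cool'), 1),
           (('rainy', 'cool'), 0), (('sunny', 'hot'), 0),
           (('rainy', 'hot'), 0), (('sunny', 'cool'), 1)]

# Step 1: count class occurrences and (attribute, value, class) co-occurrences
class_counts = Counter(label for _, label in samples)
cond_counts = Counter((i, value, label)
                      for features, label in samples
                      for i, value in enumerate(features))

# Step 2: turn the counts into a prior P(c) and conditionals P(x_i | c)
def prior(c):
    return class_counts[c] / len(samples)

def cond(i, value, c):
    return cond_counts[(i, value, c)] / class_counts[c]

# Step 3: score each class as P(c) * prod_i P(x_i | c) and pick the largest
x = ('sunny', 'cool')
scores = {c: prior(c) * cond(0, x[0], c) * cond(1, x[1], c) for c in class_counts}
print(scores)                       # {0: 0.0833..., 1: 0.3333...}
print(max(scores, key=scores.get))  # predicted class: 1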

3. Code Implementation

        The code below uses a Bayes classifier to implement spam email classification.

'''
Created on Oct 19, 2010
@author: Peter
'''
from numpy import *

def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec

def createVocabList(dataSet):
    vocabSet = set([])  # create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)  # Laplace smoothing: start counts at 1
    p0Denom = 2.0; p1Denom = 2.0                    # and denominators at 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)  # log probabilities avoid numerical underflow
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

def textParse(bigString):  # input is big string, output is word list
    import re
    listOfTokens = re.split(r'\W+', bigString)  # \W+ (not \W*): empty-match split fails in Python 3
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    trainingSet = list(range(50)); testSet = []  # list() so items can be deleted in Python 3
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
    #return vocabList,fullText

def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)  # items(), not iteritems(), in Python 3
    return sortedFreq[:30]

def localWords(feed1, feed0):
    import feedparser
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen)); testSet = []  # create test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
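        A minimal driver for the two self-contained tests, assuming the book's email/spam/1.txt through email/ham/25.txt data files sit next to the script (the file layout is an assumption):

if __name__ == '__main__':
    testingNB()  # toy posting data, no external files needed
    spamTest()   # assumes email/spam/*.txt and email/ham/*.txt exist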

        The final run prints the classification error rate; because the test set is drawn at random, the value varies from run to run.

 
