当前位置:   article > 正文

朴素贝叶斯算法及其Python实现_python实现朴素贝叶斯代码

python实现朴素贝叶斯代码
  1. '''
  2. 朴素贝叶斯模型
  3. '''
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

try:
    # scikit-learn >= 0.18 provides train_test_split here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split
  9. def load_data():
  10. '''
  11. 加载鸢尾花数据
  12. '''
  13. data = load_iris()
  14. return data['data'],data['target']
  15. class NBClassifier(object):
  16. def __init__(self):
  17. self.y = []#标签集合
  18. self.x = []#每个属性的数值集合
  19. self.py = defaultdict(float)#标签的概率分布
  20. self.pxy = defaultdict(dict)#每个标签下的每个属性的概率分布
  21. self.n = 5#分级的级数
  22. def prob(self,element,arr):
  23. '''
  24. 计算元素在列表中出现的频率
  25. '''
  26. prob = 0.0
  27. for a in arr:
  28. if element == a:
  29. prob += 1/len(arr)
  30. if prob == 0.0:
  31. prob = 0.001
  32. return prob
  33. def get_set(self,x,y):
  34. self.y = list(set(y))
  35. for i in range(x.shape[1]):
  36. self.x.append(list(set(x[:,i])))#记录下每一列的数值集
  37. def fit(self,x,y):
  38. '''
  39. 训练模型
  40. '''
  41. x = self.preprocess(x)
  42. self.get_set(x,y)
  43. #1. 获取p(y)
  44. for yi in self.y:
  45. self.py[yi] = self.prob(yi,y)
  46. #2. 获取p(x|y)
  47. for yi in self.y:
  48. for i in range(x.shape[1]):
  49. sample = x[y==yi,i]#标签yi下的样本
  50. #获取该列的概率分布
  51. pxy = [self.prob(xi,sample) for xi in self.x[i]]
  52. self.pxy[yi][i] = pxy
  53. print("train score",self.score(x,y))
  54. def predict_one(self,x):
  55. '''
  56. 预测单个样本
  57. '''
  58. max_prob = 0.0
  59. max_yi = self.y[0]
  60. for yi in self.y:
  61. prob_y = self.py[yi]
  62. for i in range(len(x)):
  63. prob_x_y = self.pxy[yi][i][self.x[i].index(x[i])]#p(xi|y)
  64. prob_y *= prob_x_y#计算p(x1|y)p(x2|y)...p(xn|y)p(y)
  65. if prob_y > max_prob:
  66. max_prob = prob_y
  67. max_yi = yi
  68. return max_yi
  69. def predict(self,samples):
  70. '''
  71. 预测函数
  72. '''
  73. samples = self.preprocess(samples)
  74. y_list = []
  75. for m in range(samples.shape[0]):
  76. yi = self.predict_one(samples[m,:])
  77. y_list.append(yi)
  78. return np.array(y_list)
  79. def preprocess(self,x):
  80. '''
  81. 因为不同特征的数值集大小相差巨大,造成部分概率矩阵变得稀疏,需要进行数据分割
  82. '''
  83. for i in range(x.shape[1]):
  84. x[:,i] = self.step(x[:,i],self.n)
  85. return x
  86. def step(self,arr,n):
  87. '''
  88. 分为n阶
  89. '''
  90. ma = max(arr)
  91. mi = min(arr)
  92. for i in range(len(arr)):
  93. for j in range(n):
  94. a = mi + (ma-mi)*(j/n)
  95. b = mi + (ma-mi)*((j+1)/n)
  96. if arr[i] >= a and arr[i] <= b:
  97. arr[i] = j+1
  98. break
  99. return arr
  100. def score(self,x,y):
  101. y_test = self.predict(x)
  102. score = 0.0
  103. for i in range(len(y)):
  104. if y_test[i] == y[i]:
  105. score += 1/len(y)
  106. return score
  107. if __name__ == "__main__":
  108. x,y = load_data()
  109. x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.5,random_state = 100)
  110. clf = NBClassifier()
  111. clf.fit(x_train,y_train)
  112. score = clf.score(x_test,y_test)
  113. print('test score',score)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/679445
推荐阅读
相关标签
  

闽ICP备14008679号