
Naive Bayes Classifier (MNIST Dataset)


By Bayes' theorem,

    P(y|X) = P(y) * P(X|y) / P(X)

Naive assumption: the attributes of a sample are mutually independent given the class. Under this assumption P(X|y) factorizes into P(x1|y) * P(x2|y) * ... * P(xn|y), and P(X) is the same for every class, so it can be dropped when comparing classes.

 

The original problem is therefore equivalent to:

    y* = argmax_y  P(y) * P(x1|y) * P(x2|y) * ... * P(xn|y)
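As a quick illustration (the numbers are purely invented), take two classes and two binary attributes: P(y=0)=0.6, P(y=1)=0.4, P(x1=1|y=0)=0.2, P(x2=1|y=0)=0.5, P(x1=1|y=1)=0.7, P(x2=1|y=1)=0.3. For a sample X=(1,1) the score of class 0 is 0.6*0.2*0.5 = 0.06 and the score of class 1 is 0.4*0.7*0.3 = 0.084, so class 1 is predicted.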

Data processing

To keep the value of P(y) * P(X|y) from underflowing, take the logarithm of the objective, i.e.:

    y* = argmax_y  [ log P(y) + log P(x1|y) + log P(x2|y) + ... + log P(xn|y) ]
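A minimal sketch (not part of the original code) of why the logarithm matters for 784-pixel images: a product of hundreds of small probabilities underflows float64 to 0.0, while the sum of their logarithms stays a finite, comparable score.

import numpy as np

# 784 hypothetical per-pixel likelihoods, each fairly small
probs = np.full(784, 0.05)

direct = np.prod(probs)            # underflows: 0.05**784 is far below the smallest float64
log_score = np.sum(np.log(probs))  # stays finite and comparable across classes

print(direct)     # 0.0
print(log_score)  # about -2349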

Note: if some attribute value never co-occurs with a class in the training set, the estimated P(y) or P(X|y) may be 0, so the product P(y) * P(X|y) collapses to 0. That makes the classes incomparable and also prevents taking the logarithm, so the probability estimates need to be "smoothed"; Laplace correction is the usual choice.

Corrected prior: let Dy denote the set of samples of class y in the training set D, and N the number of possible classes in D. Then

    P(y) = ( |Dy| + 1 ) / ( |D| + N )

That is, the sample count of every class is increased by 1.

Corrected class-conditional probability: let Dy,xi denote the set of samples in Dy whose i-th attribute takes the value xi, and Ni the number of possible values of the i-th attribute. Then

    P(xi|y) = ( |Dy,xi| + 1 ) / ( |Dy| + Ni )

That is, within each class, one sample is added for every possible value of the i-th attribute.
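For reference, a small sketch of how the Laplace-corrected estimates could be computed for binarized pixels (N = 10 classes, Ni = 2 possible values per pixel). The function name and array shapes here are my own assumptions, chosen to match the 60000 x 784 binarized matrices used in the listing below; it is not part of the original code.

import numpy as np

def laplace_estimates(train_x, train_y, class_num=10):
    # train_x: (m, n) array of 0/1 pixels; train_y: (m,) array of integer class labels
    m, n = train_x.shape
    class_counts = np.bincount(train_y, minlength=class_num)  # |Dy| for every class y
    prior = (class_counts + 1) / (m + class_num)               # (|Dy| + 1) / (|D| + N)

    cond = np.zeros((class_num, n, 2))
    for y in range(class_num):
        pixels_y = train_x[train_y == y]                       # all class-y samples
        ones = pixels_y.sum(axis=0)                            # |Dy,xi| for xi = 1, per pixel
        cond[y, :, 1] = (ones + 1) / (class_counts[y] + 2)     # Ni = 2 for binary pixels
        cond[y, :, 0] = (class_counts[y] - ones + 1) / (class_counts[y] + 2)
    return prior, cond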

--------------------------------------------------------------

Data preprocessing

Training the model

Testing the samples

Function calls

The code for each of these steps is given in full in the listing below.

Reference

Naive Bayes classification of the MNIST dataset in Python

import struct
import time

import numpy as np


def read_image(file_name):
    # Read the whole IDX image file in binary mode.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()
    offset = 0
    head = struct.unpack_from('>IIII', file_content, offset)  # first 4 big-endian integers: magic, count, rows, cols
    offset += struct.calcsize('>IIII')
    imgNum = head[1]          # number of images
    rows = head[2]            # number of rows per image
    cols = head[3]            # number of columns per image
    image_size = rows * cols  # pixels per image (784 for MNIST)
    # np.empty allocates without initializing, which is the fastest way to create the array.
    images = np.empty((imgNum, image_size))
    fmt = '>' + str(image_size) + 'B'  # format string for a single image
    for i in range(imgNum):
        images[i] = np.array(struct.unpack_from(fmt, file_content, offset))
        # images[i] = np.array(struct.unpack_from(fmt, file_content, offset)).reshape((rows, cols))
        offset += struct.calcsize(fmt)
    return images


def read_label(file_name):
    # Read the IDX label file in binary mode.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()
    head = struct.unpack_from('>II', file_content, 0)  # first 2 big-endian integers: magic, count
    offset = struct.calcsize('>II')
    labelNum = head[1]  # number of labels
    bitsString = '>' + str(labelNum) + 'B'  # e.g. '>60000B' for the training labels
    label = struct.unpack_from(bitsString, file_content, offset)
    return np.array(label)


def loadDataSet():
    # MNIST
    train_x_filename = "train-images-idx3-ubyte"
    train_y_filename = "train-labels-idx1-ubyte"
    test_x_filename = "t10k-images-idx3-ubyte"
    test_y_filename = "t10k-labels-idx1-ubyte"
    # # Fashion-MNIST
    # train_x_filename = "fashion-train-images-idx3-ubyte"
    # train_y_filename = "fashion-train-labels-idx1-ubyte"
    # test_x_filename = "fashion-t10k-images-idx3-ubyte"
    # test_y_filename = "fashion-t10k-labels-idx1-ubyte"
    train_x = read_image(train_x_filename)  # 60000 x 784 matrix
    train_y = read_label(train_y_filename)  # 60000 labels
    test_x = read_image(test_x_filename)    # 10000 x 784 matrix
    test_y = read_label(test_y_filename)    # 10000 labels
    train_x = normalize(train_x)
    test_x = normalize(test_x)
    # # Use a smaller subset to speed things up while debugging.
    # train_x = train_x[0:1000, :]
    # train_y = train_y[0:1000]
    # test_x = test_x[0:500, :]
    # test_y = test_y[0:500]
    return train_x, test_x, train_y, test_y


def normalize(data):
    # Binarize the pixels: every non-zero pixel becomes 1, so each feature is 0/1.
    m = data.shape[0]
    n = np.array(data).shape[1]
    for i in range(m):
        for j in range(n):
            if data[i, j] != 0:
                data[i, j] = 1
            else:
                data[i, j] = 0
    return data


# (1) Compute the prior and class-conditional counts.
def train_model(train_x, train_y, classNum):
    # classNum is the number of classes (10); train_x has already been binarized.
    m = train_x.shape[0]
    n = train_x.shape[1]
    prior_probability = np.zeros(classNum)                 # per-class sample counts (prior up to division by m)
    conditional_probability = np.zeros((classNum, n, 2))   # counts of pixel value 0/1 per class and feature
    # Accumulate the counts.
    for i in range(m):  # m is the number of images, 60000 in total
        img = train_x[i]    # the i-th image, a 1 x n row vector
        label = train_y[i]  # the label of the i-th image
        prior_probability[label] += 1  # count of class `label`; dividing by m would give P(Y = label)
        for j in range(n):  # n is the number of features, 784 in total
            temp = int(img[j])  # img[j] is a float (0.0 or 1.0); indexing needs an int
            conditional_probability[label][j][temp] += 1
    # Rescale the conditional frequencies into [1, 10001] so every entry is positive.
    for i in range(classNum):
        for j in range(n):
            # After binarization a pixel can only take the values 0 and 1.
            pix_0 = conditional_probability[i][j][0]
            pix_1 = conditional_probability[i][j][1]
            # Conditional frequency of pixel value 0 and 1, scaled and shifted away from zero.
            probability_0 = (float(pix_0) / float(pix_0 + pix_1)) * 10000 + 1
            probability_1 = (float(pix_1) / float(pix_0 + pix_1)) * 10000 + 1
            conditional_probability[i][j][0] = probability_0
            conditional_probability[i][j][1] = probability_1
    return prior_probability, conditional_probability


# (2) For a given x, compute the product of the prior and the conditional terms.
def cal_probability(img, label, prior_probability, conditional_probability):
    probability = int(prior_probability[label])  # prior term (class count)
    n = img.shape[0]
    for i in range(n):  # loop over the features
        # Python integers have arbitrary precision, so this long product does not overflow.
        probability *= int(conditional_probability[label][i][int(img[i])])
    return probability


# Decide the class of instance x, i.e. take the argmax over the classes.
def predict(test_x, test_y, prior_probability, conditional_probability):
    # test_x (like train_x) has already been binarized.
    predict_y = []
    m = test_x.shape[0]
    n = test_x.shape[1]
    for i in range(m):
        img = np.array(test_x[i])  # binarized feature vector of the i-th test sample
        label = test_y[i]          # true label (not used for prediction)
        max_label = 0
        max_probability = cal_probability(img, 0, prior_probability, conditional_probability)
        for j in range(1, 10):  # start from 1 because class 0 initialized the maximum
            probability = cal_probability(img, j, prior_probability, conditional_probability)
            if max_probability < probability:
                max_probability = probability
                max_label = j
        predict_y.append(max_label)  # record the most probable label for this sample
    return np.array(predict_y)


def cal_accuracy(test_y, predict_y):
    m = test_y.shape[0]
    errorCount = 0.0
    for i in range(m):
        if test_y[i] != predict_y[i]:
            errorCount += 1
    accuracy = 1.0 - float(errorCount) / m
    return accuracy


if __name__ == '__main__':
    classNum = 10
    print("Start reading data...")
    time1 = time.time()
    train_x, test_x, train_y, test_y = loadDataSet()
    train_x = normalize(train_x)  # already binarized inside loadDataSet; repeating is harmless
    test_x = normalize(test_x)
    time2 = time.time()
    print("read data cost", time2 - time1, "second")
    print("start training data...")
    prior_probability, conditional_probability = train_model(train_x, train_y, classNum)
    for i in range(classNum):
        print(prior_probability[i])  # print the sample count of each class
    time3 = time.time()
    print("train data cost", time3 - time2, "second")
    print("start predicting data...")
    predict_y = predict(test_x, test_y, prior_probability, conditional_probability)
    time4 = time.time()
    print("predict data cost", time4 - time3, "second")
    print("start calculating accuracy...")
    acc = cal_accuracy(test_y, predict_y)
    time5 = time.time()
    print("accuracy", acc)
    print("calculate accuracy cost", time5 - time4, "second")
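The listing above sidesteps underflow differently from the log transform described earlier: it rescales the conditional frequencies into [1, 10001] and multiplies arbitrary-precision Python integers. A log-space variant of cal_probability in the spirit of that transform is sketched below; it is my own sketch rather than part of the original code, and it assumes the prior_probability and conditional_probability arrays produced by train_model above, whose entries are all positive.

import numpy as np

def cal_log_probability(img, label, prior_probability, conditional_probability):
    # Sum of logarithms instead of a product: log is monotone, so the argmax over
    # classes is unchanged, but the score stays a small float instead of a huge integer.
    score = np.log(prior_probability[label])
    for i in range(img.shape[0]):
        score += np.log(conditional_probability[label][i][int(img[i])])
    return score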

 

Reposted from: https://www.cnblogs.com/wanglinjie/p/11600994.html
