当前位置:   article > 正文

【机器学习】Logistic回归代码实现_在右侧编辑器补充代码,计算并得出logistic回归回归线的x,y坐标点数组。

在右侧编辑器补充代码,计算并得出logistic回归回归线的x,y坐标点数组。

1.各部分代码


1.1 数据加载
数据集包含两个特征、一个类别标签,共一百个样本。
  def load_dataset(filename):
      """Load a two-feature, labelled data set from a delimited text file.

      Every non-blank line is expected to hold at least three columns: two
      feature values followed by an integer class label.  A constant 1.0 is
      prepended to each feature row so that the first weight of the model
      acts as the intercept term.

      Args:
          filename: Path of the text file to read.

      Returns:
          A ``(dataset, labels)`` tuple, where ``dataset`` is a list of
          ``[1.0, x1, x2]`` float rows and ``labels`` is a list of ints.

      Raises:
          ValueError: If a column cannot be converted to a number.
          IndexError: If a non-blank line has fewer than three columns.
      """
      dataset = []
      labels = []
      with open(filename) as file:
          # Iterate the file lazily instead of loading every line at once.
          for line in file:
              # split() without arguments accepts tabs as well as spaces and
              # also discards the trailing newline.
              fields = line.split()
              if not fields:
                  # Blank line (for example at the end of the file): skip it.
                  continue
              # Explicit numeric conversion: the file yields strings.
              dataset.append([1.0, float(fields[0]), float(fields[1])])
              labels.append(int(fields[2]))
      return dataset, labels
1.2 梯度上升优化算法
  def sigmoid(x):
      """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

      The value is computed as ``exp(-logaddexp(0, -x))``, which is the same
      function but does not overflow for large negative ``x`` (the naive
      form evaluates ``exp(-x)`` directly and emits an overflow warning).

      Args:
          x: A scalar or a NumPy array/matrix; evaluated element-wise.

      Returns:
          The sigmoid of ``x``, with the same shape as ``x``.
      """
      # 1 / (1 + e^-x) == exp(-log(e^0 + e^-x)) == exp(-logaddexp(0, -x))
      return np.exp(-np.logaddexp(0.0, -x))
  4. import numpy as np
  def grad_ascent(dataset, labels, alpha=0.001, times=500):
      """Fit logistic-regression weights with batch gradient ascent.

      Every iteration evaluates the model on the whole data set and moves the
      weights along ``X^T (y - h)``, scaled by ``alpha``.

      Args:
          dataset: Sequence of m feature rows, each of length n.
          labels: Sequence of m class labels.
          alpha: Step size applied to every update.
          times: Number of full-batch iterations.

      Returns:
          An ``(n, 1)`` ``numpy.matrix`` holding the fitted weights.
      """
      # np.asmatrix replaces np.mat, which newer NumPy releases no longer
      # provide; the matrix type keeps ``*`` meaning matrix multiplication.
      data_matrix_mn = np.asmatrix(dataset)
      label_matrix_m1 = np.asmatrix(labels).transpose()
      # n: number of features (columns of the data matrix).
      n = np.shape(data_matrix_mn)[1]
      # Start from all-ones weights; created as a matrix so the return type
      # does not depend on how many iterations run.
      weight_matrix_n1 = np.asmatrix(np.ones((n, 1)))
      for _ in range(times):
          h_matrix_m1 = sigmoid(data_matrix_mn * weight_matrix_n1)
          error_matrix_m1 = label_matrix_m1 - h_matrix_m1
          # Adjust the weights from the error vector and the step size [1]
          weight_matrix_n1 = (
              weight_matrix_n1
              + alpha * data_matrix_mn.transpose() * error_matrix_m1
          )
      return weight_matrix_n1
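关于[1]的原理可见 点击打开链接(原文的外部链接在转载时已丢失)。简要说明:对数似然函数 $\ell(w)=\sum_{i=1}^{m}\left[y_i\log h_i+(1-y_i)\log(1-h_i)\right]$(其中 $h_i=\sigma(x_i^{T}w)$)对 $w$ 的梯度为 $X^{T}(y-h)$,因此梯度上升的更新式为 $w\leftarrow w+\alpha X^{T}(y-h)$,即代码中 [1] 处的语句。原链接用一个例子定性解释了一下,没有严格证明。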

1.3 数据集及决策边界可视化
  def draw_divide_line(dataset, labels, weights):
      """Scatter-plot both classes and draw the fitted decision boundary.

      Args:
          dataset: Feature rows; column 1 is plotted on the horizontal axis
              and column 2 on the vertical axis.
          labels: Class label of every row; rows labelled 1 form one group,
              all other rows form the second group.
          weights: The three model weights, either as a 1-D array or as an
              ``(n, 1)`` matrix such as the one returned by ``grad_ascent``.
      """
      import matplotlib.pyplot as plt

      # Flatten the weights so that a column matrix and a 1-D array behave
      # the same; with a matrix the boundary would otherwise come out as a
      # 1 x N matrix that cannot be plotted against the 1-D x grid.
      w = np.asarray(weights, dtype=float).ravel()

      # Coordinates of class 1 and class 0 samples.
      x1, y1 = [], []
      x0, y0 = [], []
      for row, label in zip(dataset, labels):
          if label == 1:
              x1.append(row[1])
              y1.append(row[2])
          else:
              x0.append(row[1])
              y0.append(row[2])

      # Decision boundary: w0 + w1 * x + w2 * y = 0, solved for y.
      x = np.arange(-3, 3, 0.1)
      y = (-w[0] - w[1] * x) / w[2]

      # Scatter the samples.
      fig = plt.figure()
      ax = fig.add_subplot(111)
      type1 = ax.scatter(x1, y1, c='r', marker='s')
      type0 = ax.scatter(x0, y0, c='g')
      # Decision boundary line.
      ax.plot(x, y)
      # Axis labels.
      plt.xlabel('x1')
      plt.ylabel('x2')
      # Legend.
      ax.legend((type0, type1), ('0', '1'))
      plt.show()
输出效果:

1.4 随机梯度上升算法 
1.2中的梯度上升算法每次更新回归系数时都需要遍历整个数据集,计算复杂度太高。因此使用随机梯度上升算法来降低计算的复杂度。
1.4.1 简单的随机梯度上升算法
  def stoc_grad_ascent(dataset, labels, times=300, alpha=0.001):
      """Fit logistic-regression weights with plain stochastic gradient ascent.

      The samples are visited in file order; each one triggers its own weight
      update, and the weights after every update are recorded.

      Args:
          dataset: Sequence of m feature rows, each of length n.
          labels: Sequence of m class labels.
          times: Number of passes over the data set.
          alpha: Step size applied to every single-sample update.

      Returns:
          A ``(weight, weight_change)`` tuple: the final 1-D weight array of
          length n and the list of weight arrays recorded after each update.
      """
      dataset = np.array(dataset)
      n = np.shape(dataset)[1]
      weight = np.ones(n)
      # History of the weights, one entry per update.
      weight_change = []
      for _ in range(times):
          for sample, label in zip(dataset, labels):
              # Adjust the weights using this single sample.  np.dot replaces
              # the builtin sum() over an element-wise product.
              error = label - sigmoid(np.dot(sample, weight))
              # A new array is created on every update, so the entries stored
              # in the history never alias each other.
              weight = weight + alpha * error * sample
              weight_change.append(weight)
      return weight, weight_change

利用下述代码 先将权重的变化情况通过图形表示出来。

  def draw_weight_change(weight_change):
      """Plot how the first three weights evolve over the recorded updates.

      Args:
          weight_change: Sequence of weight vectors, one per update; the
              elements at index 0, 1 and 2 of each vector are plotted.
      """
      import matplotlib.pyplot as plt

      steps = np.arange(0, len(weight_change), 1)
      # One series per weight component.
      series = [[w[k] for w in weight_change] for k in range(3)]

      fig = plt.figure('权重变化')
      # Three stacked panels, top to bottom.
      axes = [fig.add_subplot(code) for code in (311, 312, 313)]
      for axis, values in zip(axes, series):
          axis.plot(steps, values)
      for axis, label in zip(axes, ('W0', 'W1', 'W2')):
          axis.set_ylabel(label)
      plt.show()


1.4.2 改进的随机梯度上升算法
  def stoc_grad_ascent2(dataset, labels, times=300):
      """Fit logistic-regression weights with improved stochastic gradient ascent.

      Compared with the plain variant, every pass visits the samples in a
      fresh random order (each sample exactly once per pass) and the step
      size shrinks as training progresses.

      Args:
          dataset: Sequence of m feature rows, each of length n.
          labels: Sequence of m class labels.
          times: Number of passes over the data set.

      Returns:
          A ``(weight, weight_change)`` tuple: the final 1-D weight array of
          length n and the list of weight arrays recorded after each update.
      """
      dataset = np.array(dataset)
      m, n = np.shape(dataset)
      weight = np.ones(n)
      weight_change = []
      for i in range(times):
          # A random permutation of the row indices gives sampling without
          # replacement.  The random position must select the *sample*; using
          # a position drawn from a shrinking index list directly as the row
          # number repeats low-numbered rows and starves high-numbered ones.
          for j, sample in enumerate(np.random.permutation(m)):
              # The step size decays as i and j grow, damping the
              # high-frequency oscillation of the weights; the constant term
              # keeps it strictly positive.
              alpha = 4.0 / (1.0 + i + j) + 0.001
              error = labels[sample] - sigmoid(np.dot(dataset[sample], weight))
              weight = weight + alpha * error * dataset[sample]
              weight_change.append(weight)
      return weight, weight_change
权重变化图:

对比两种方法的权重变化图,可以发现改进后的随机梯度上升算法的权重能更快地趋于稳定。

2. 所有代码

  def load_dataset(filename):
      """Load a two-feature, labelled data set from a delimited text file.

      Every non-blank line is expected to hold at least three columns: two
      feature values followed by an integer class label.  A constant 1.0 is
      prepended to each feature row so that the first weight of the model
      acts as the intercept term.

      Args:
          filename: Path of the text file to read.

      Returns:
          A ``(dataset, labels)`` tuple, where ``dataset`` is a list of
          ``[1.0, x1, x2]`` float rows and ``labels`` is a list of ints.

      Raises:
          ValueError: If a column cannot be converted to a number.
          IndexError: If a non-blank line has fewer than three columns.
      """
      dataset = []
      labels = []
      with open(filename) as file:
          # Iterate the file lazily instead of loading every line at once.
          for line in file:
              # split() without arguments accepts tabs as well as spaces and
              # also discards the trailing newline.
              fields = line.split()
              if not fields:
                  # Blank line (for example at the end of the file): skip it.
                  continue
              # Explicit numeric conversion: the file yields strings.
              dataset.append([1.0, float(fields[0]), float(fields[1])])
              labels.append(int(fields[2]))
      return dataset, labels
  def sigmoid(x):
      """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

      The value is computed as ``exp(-logaddexp(0, -x))``, which is the same
      function but does not overflow for large negative ``x`` (the naive
      form evaluates ``exp(-x)`` directly and emits an overflow warning).

      Args:
          x: A scalar or a NumPy array/matrix; evaluated element-wise.

      Returns:
          The sigmoid of ``x``, with the same shape as ``x``.
      """
      # 1 / (1 + e^-x) == exp(-log(e^0 + e^-x)) == exp(-logaddexp(0, -x))
      return np.exp(-np.logaddexp(0.0, -x))
  18. import numpy as np
  def grad_ascent(dataset, labels, alpha=0.001, times=500):
      """Fit logistic-regression weights with batch gradient ascent.

      Every iteration evaluates the model on the whole data set and moves the
      weights along ``X^T (y - h)``, scaled by ``alpha``.

      Args:
          dataset: Sequence of m feature rows, each of length n.
          labels: Sequence of m class labels.
          alpha: Step size applied to every update.
          times: Number of full-batch iterations.

      Returns:
          An ``(n, 1)`` ``numpy.matrix`` holding the fitted weights.
      """
      # np.asmatrix replaces np.mat, which newer NumPy releases no longer
      # provide; the matrix type keeps ``*`` meaning matrix multiplication.
      data_matrix_mn = np.asmatrix(dataset)
      label_matrix_m1 = np.asmatrix(labels).transpose()
      # n: number of features (columns of the data matrix).
      n = np.shape(data_matrix_mn)[1]
      # Start from all-ones weights; created as a matrix so the return type
      # does not depend on how many iterations run.
      weight_matrix_n1 = np.asmatrix(np.ones((n, 1)))
      for _ in range(times):
          h_matrix_m1 = sigmoid(data_matrix_mn * weight_matrix_n1)
          error_matrix_m1 = label_matrix_m1 - h_matrix_m1
          # Adjust the weights from the error vector and the step size.
          weight_matrix_n1 = (
              weight_matrix_n1
              + alpha * data_matrix_mn.transpose() * error_matrix_m1
          )
      return weight_matrix_n1
  def stoc_grad_ascent(dataset, labels, times=300, alpha=0.001):
      """Fit logistic-regression weights with plain stochastic gradient ascent.

      The samples are visited in file order; each one triggers its own weight
      update, and the weights after every update are recorded.

      Args:
          dataset: Sequence of m feature rows, each of length n.
          labels: Sequence of m class labels.
          times: Number of passes over the data set.
          alpha: Step size applied to every single-sample update.

      Returns:
          A ``(weight, weight_change)`` tuple: the final 1-D weight array of
          length n and the list of weight arrays recorded after each update.
      """
      dataset = np.array(dataset)
      n = np.shape(dataset)[1]
      weight = np.ones(n)
      # History of the weights, one entry per update.
      weight_change = []
      for _ in range(times):
          for sample, label in zip(dataset, labels):
              # Adjust the weights using this single sample.  np.dot replaces
              # the builtin sum() over an element-wise product.
              error = label - sigmoid(np.dot(sample, weight))
              # A new array is created on every update, so the entries stored
              # in the history never alias each other.
              weight = weight + alpha * error * sample
              weight_change.append(weight)
      return weight, weight_change
  def stoc_grad_ascent2(dataset, labels, times=300):
      """Fit logistic-regression weights with improved stochastic gradient ascent.

      Compared with the plain variant, every pass visits the samples in a
      fresh random order (each sample exactly once per pass) and the step
      size shrinks as training progresses.

      Args:
          dataset: Sequence of m feature rows, each of length n.
          labels: Sequence of m class labels.
          times: Number of passes over the data set.

      Returns:
          A ``(weight, weight_change)`` tuple: the final 1-D weight array of
          length n and the list of weight arrays recorded after each update.
      """
      dataset = np.array(dataset)
      m, n = np.shape(dataset)
      weight = np.ones(n)
      weight_change = []
      for i in range(times):
          # A random permutation of the row indices gives sampling without
          # replacement.  The random position must select the *sample*; using
          # a position drawn from a shrinking index list directly as the row
          # number repeats low-numbered rows and starves high-numbered ones.
          for j, sample in enumerate(np.random.permutation(m)):
              # The step size decays as i and j grow, damping the
              # high-frequency oscillation of the weights; the constant term
              # keeps it strictly positive.
              alpha = 4.0 / (1.0 + i + j) + 0.001
              error = labels[sample] - sigmoid(np.dot(dataset[sample], weight))
              weight = weight + alpha * error * dataset[sample]
              weight_change.append(weight)
      return weight, weight_change
  def draw_weight_change(weight_change):
      """Plot how the first three weights evolve over the recorded updates.

      Args:
          weight_change: Sequence of weight vectors, one per update; the
              elements at index 0, 1 and 2 of each vector are plotted.
      """
      import matplotlib.pyplot as plt

      steps = np.arange(0, len(weight_change), 1)
      # One series per weight component.
      series = [[w[k] for w in weight_change] for k in range(3)]

      fig = plt.figure('权重变化')
      # Three stacked panels, top to bottom.
      axes = [fig.add_subplot(code) for code in (311, 312, 313)]
      for axis, values in zip(axes, series):
          axis.plot(steps, values)
      for axis, label in zip(axes, ('W0', 'W1', 'W2')):
          axis.set_ylabel(label)
      plt.show()
  def draw_divide_line(dataset, labels, weights):
      """Scatter-plot both classes, draw the decision boundary and save it.

      The figure is written to ``divide_line.png`` in the working directory.

      Args:
          dataset: Feature rows; column 1 is plotted on the horizontal axis
              and column 2 on the vertical axis.
          labels: Class label of every row; rows labelled 1 form one group,
              all other rows form the second group.
          weights: The three model weights, either as a 1-D array or as an
              ``(n, 1)`` matrix such as the one returned by ``grad_ascent``.
      """
      import matplotlib.pyplot as plt

      # Flatten the weights so that a column matrix and a 1-D array behave
      # the same; the boundary is then always a 1-D array matching the x
      # grid, with no reshape tied to the number of grid points.
      w = np.asarray(weights, dtype=float).ravel()

      # Coordinates of class 1 and class 0 samples.
      x1, y1 = [], []
      x0, y0 = [], []
      for row, label in zip(dataset, labels):
          if label == 1:
              x1.append(row[1])
              y1.append(row[2])
          else:
              x0.append(row[1])
              y0.append(row[2])

      # Decision boundary: w0 + w1 * x + w2 * y = 0, solved for y.
      x = np.arange(-3, 3, 0.1)
      y = (-w[0] - w[1] * x) / w[2]

      # Scatter the samples.
      fig = plt.figure('数据集及决策边界')
      ax = fig.add_subplot(111)
      type1 = ax.scatter(x1, y1, c='r', marker='s')
      type0 = ax.scatter(x0, y0, c='g')
      # Decision boundary line.
      ax.plot(x, y)
      # Axis labels.
      plt.xlabel('x1')
      plt.ylabel('x2')
      # Legend.
      ax.legend((type0, type1), ('0', '1'))
      plt.savefig('divide_line.png')
  if __name__ == '__main__':
      import os

      # Join the path components with the platform separator; a literal
      # backslash path only resolves on Windows.
      data_path = os.path.join('machine learning', 'Ch05', 'testSet.txt')
      dataset, labels = load_dataset(data_path)
      # Train with the improved stochastic variant, then plot the decision
      # boundary and the recorded weight history.
      weights, weight_change = stoc_grad_ascent2(dataset, labels)
      draw_divide_line(dataset, labels, weights)
      draw_weight_change(weight_change)











声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/菜鸟追梦旅行/article/detail/629928
推荐阅读
相关标签
  

闽ICP备14008679号