机器学习常用算法之分类问题_机器学习算法用于分类问题

作者：黑客灵魂 | 2024-06-23 19:05:04

踩

机器学习算法用于分类问题

一、简单分类器（线性分类）

原理示意：

输入(x1 x2) 输出(y)
3 1 0
2 5 1
1 8 1
6 4 0
5 2 0
3 5 1
4 7 1
4 -1 0
7 5 ?->0


import numpy as np
import matplotlib.pyplot as mp
 
# Training samples: each row is one 2-D point, y holds the matching class label.
x = np.array([[3, 1], [2, 5], [1, 8], [6, 4],
              [5, 2], [3, 5], [4, 7], [4, -1]])
y = np.array([0, 1, 1, 0, 0, 1, 1, 0])

# Plot window: bounding box of the samples padded by 1 on every side,
# sampled with a spacing of 0.05 along both axes.
h = v = 0.05
l, r = x[:, 0].min() - 1, x[:, 0].max() + 1
b, t = x[:, 1].min() - 1, x[:, 1].max() + 1
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))

# One row per grid node: (first coordinate, second coordinate).
flat_x = np.column_stack((grid_x[0].ravel(), grid_x[1].ravel()))
# Hand-written linear rule: a node is class 1 when its first coordinate is
# smaller than its second one, class 0 otherwise.
flat_y = (flat_x[:, 0] < flat_x[:, 1]).astype(int)
# Back to the 2-D grid layout so it can be drawn as a colour mesh.
grid_y = flat_y.reshape(grid_x[0].shape)

fig_title = 'Simple Classification'
mp.figure(fig_title, facecolor='lightgray')
mp.title(fig_title, fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)

# Background: the class assigned to every grid node.
mp.pcolormesh(*grid_x, grid_y, cmap='gray')

# Foreground: the training samples, coloured by their label.
mp.scatter(*x.T, c=y, cmap='brg', s=60)
mp.show()

二、逻辑分类（线性分类+非线性函数）

2.1 预测函数
x1 x2 -> y
$$y = \frac{1}{1 + e^{-z}}$$
$$z = k_1 x_1 + k_2 x_2 + b$$

2.2 成本函数（损失函数）
交叉熵误差
$$J(k_1,k_2,b) = \frac{1}{n}\sum_{i=1}^{n}\left(-y_i\log y'_i-(1-y_i)\log(1-y'_i)\right) + m$$
# n 为样本总数；y 为真实输出，y' 为预测输出；m 为正则项，即正则函数(||k1,k2,b||)x正则强度(惩罚系数)
# -ylog(y')-(1-y)log(1-y') 为交叉熵函数
当y=0时，第一项为0，交叉熵函数值为 -log(1-y')：y' 越接近0，函数值越趋向于0；y' 越接近1，函数值越趋向于无穷大；
当y=1时，第二项为0，交叉熵函数值为 -log(y')：y' 越接近1，函数值越趋向于0；y' 越接近0，函数值越趋向于无穷大。
x x -> 0.9 1
x x -> 0.2 0


sklearn.linear_model.LogisticRegression(
    solver='liblinear', C=正则强度的倒数（C越小，正则化越强）)

2.3 多元分类
A B C
... -> A 1 0.9 0.1 0.3 A
... -> B 0 0.3 0.6 0.4 B
... -> C 0 0.1 0.2 0.6 C

2.4 示例


import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as mp
 
# Nine 2-D training samples in three groups of three; y holds the class
# label (0, 1 or 2) of each row.
x = np.array([[4, 7], [3.5, 8], [3.1, 6.2],
              [0.5, 1], [1, 2], [1.2, 1.9],
              [6, 2], [5.7, 1.5], [5.4, 2.2]])
y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

# NOTE(review): y has three classes; recent scikit-learn releases appear to
# deprecate the liblinear solver for multiclass targets - confirm against
# the installed version.
model = lm.LogisticRegression(solver='liblinear', C=1000)
model.fit(x, y)

# Plot window: bounding box of the samples padded by 1 on every side,
# sampled with a spacing of 0.005 along both axes.
h = v = 0.005
l, r = x[:, 0].min() - 1, x[:, 0].max() + 1
b, t = x[:, 1].min() - 1, x[:, 1].max() + 1
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))

# One row per grid node, classified by the fitted model, then folded back
# into the 2-D grid layout for drawing.
flat_x = np.column_stack((grid_x[0].ravel(), grid_x[1].ravel()))
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

fig_title = 'Logistic Classification'
mp.figure(fig_title, facecolor='lightgray')
mp.title(fig_title, fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background: predicted class of every grid node; foreground: the samples.
mp.pcolormesh(*grid_x, grid_y, cmap='gray')
mp.scatter(*x.T, c=y, cmap='brg', s=60)
mp.show()

三、朴素贝叶斯分类

3.1 贝叶斯定理（条件概率）：
$$P(A|B) = \frac{P(A)\,P(B|A)}{P(B)}$$


import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
 
# Load the samples from a comma-separated text file: on every line the last
# value is the class label and the values before it are the input features.
x, y = [], []
with open('../ML/data/multiple1.txt', 'r') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        if not line.strip():
            # A blank line (e.g. a trailing newline at the end of the file)
            # cannot be parsed by float(); skip it instead of crashing.
            continue
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])

x = np.array(x)
y = np.array(y)

# Build and train the model; GaussianNB estimates the probabilities with a
# Gaussian (normal) distribution.
model = nb.GaussianNB()
model.fit(x, y)

# Plot window: bounding box of the first two feature columns padded by 1 on
# every side, sampled with a spacing of 0.005 along both axes.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h),
                     np.arange(b, t, v))
# One row per grid node, classified by the model, then folded back into the
# 2-D grid layout for drawing.
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

# Plot: predicted class of every grid node as the background, the samples
# coloured by their label on top.
mp.figure('Naive Bayes Classification',
          facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='brg', s=60)
mp.show()

3.3 划分训练集和测试集


sklearn.model_selection.train_test_split(
     输入集合，输出集合，test_size=测试集占比，
     random_state=随机种子源)
  
--> 得到：训练输入，测试输入，训练输出，测试输出


import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
import sklearn.model_selection as ms
 
# Load the samples from a comma-separated text file: on every line the last
# value is the class label and the values before it are the input features.
x, y = [], []
with open('../ML/data/multiple1.txt', 'r') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        if not line.strip():
            # A blank line (e.g. a trailing newline at the end of the file)
            # cannot be parsed by float(); skip it instead of crashing.
            continue
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])

x = np.array(x)
y = np.array(y)
# Hold out 25% of the samples for testing; the fixed random_state makes the
# split reproducible.
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25,
                        random_state=7)
# Build the model (Gaussian / normal distribution for the probabilities)
# and train it on the training part only.
model = nb.GaussianNB()
model.fit(train_x, train_y)

# Decision regions: classify every node of a grid that covers the bounding
# box of the first two feature columns (padded by 1, spacing 0.005).
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h),
                     np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

# Evaluate on the held-out samples.
pred_test_y = model.predict(test_x)
# Accuracy: share of test samples whose predicted label equals the true one.
print((pred_test_y == test_y).sum() /
      pred_test_y.size)
# Plot: predicted class of every grid node as the background, all samples
# (training and test) coloured by their true label on top.
mp.figure('Naive Bayes Classification',
          facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='brg', s=60)
mp.show()

四、随机森林分类

森林深入学习请移步：https://www.cnblogs.com/fionacai/p/5894142.html

4.1 超参数取值调优 -- 验证曲线

f1_score = f(模型对象超参数

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/黑客灵魂/article/detail/750400