当前位置:   article > 正文

数据集:mnist手写数据集_一个手写数字数据集怎么做

一个手写数字数据集怎么做

数据集:mnist手写数据集

参考文献:【Python 3 利用机器学习模型 进行手写体数字检测】

1. 整体设计思路

在这里插入图片描述
整体的设计流程:
在这里插入图片描述

手写数据集的生成

2. 特征向量提取

这一步提取图像特征:按行遍历 28×28 图像的 784 个像素,像素值非 0 记为 1,为 0 记为 0,得到一个 784 维的特征向量并保存。

from PIL import Image
import csv
import os

# Extract the features of a single image
def get_features_single(img):
    """Binarize a 28 x 28 image into a flat list of 784 zero/one features.

    Pixels are visited row by row (``y`` outer, ``x`` inner) through
    ``img.getpixel((x, y))``. A pixel that compares unequal to ``0`` is
    recorded as ``1``; any other pixel is recorded as ``0``.

    The result is also published as the module-level ``pixel_cnt_list``,
    which is rebound to a fresh list on every call.

    Args:
        img: Object exposing ``getpixel((x, y))`` for ``0 <= x, y < 28``.

    Returns:
        The 784-element list that ``pixel_cnt_list`` now refers to.
    """
    global pixel_cnt_list

    side = 28  # the image is sampled as a 28 x 28 grid
    pixel_cnt_list = []
    # extend() pulls one pixel at a time from the generator, in row-major order.
    pixel_cnt_list.extend(
        1 if img.getpixel((col, row)) != 0 else 0
        for row in range(side)
        for col in range(side)
    )
    return pixel_cnt_list

# Walk the digit folders, extract the features and store them in a CSV file
def save_features_to_CSV():
    """Write one CSV row (784 pixel features + digit label) per sample image.

    Images are read from ``3-手写数字集/<digit>/`` for the digits 0-9; a digit
    whose folder does not exist is skipped. Rows are first written to
    ``data_csvs/tmp.csv``. After every folder has been processed, that file is
    renamed to ``data_csvs/data_<total sample count>.csv``, replacing any file
    of that name left over from an earlier run.
    """
    path_images = "3-手写数字集/"
    path_csv = "data_csvs/"

    sum_images = 0

    # Make sure the output directory exists before opening the temp file.
    os.makedirs(path_csv, exist_ok=True)

    with open(path_csv + "tmp.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        # Visit the folders 0-9
        for i in range(10):
            digit_dir = path_images + str(i)
            print(digit_dir)
            # Check that the folder exists *before* listing it, so a missing
            # digit folder is skipped instead of raising from os.listdir().
            if os.path.isdir(digit_dir):
                # Sorted so the row order does not depend on the file system.
                num_list = sorted(os.listdir(digit_dir))
                print("num_list:", num_list)
                print("样本个数:", len(num_list))
                sum_images = sum_images + len(num_list)

                # Traverse every single image to generate its features
                for file_name in num_list:
                    # The context manager closes the image even when feature
                    # extraction raises.
                    with Image.open(digit_dir + "/" + file_name) as img:
                        print("分析:", file_name)
                        # Use the returned list rather than the global that
                        # get_features_single() also sets.
                        features = get_features_single(img)

                    # One CSV row: the pixel features followed by the label
                    writer.writerow(features + [i])
            print('\n')
            print("样本总数:", sum_images)

    # Name the CSV after the number of samples. os.replace() overwrites a
    # data_<count>.csv produced by a previous run, so no separate
    # exists-check/remove step is needed.
    os.replace(path_csv + "tmp.csv", path_csv + "data_" + str(sum_images) + ".csv")

# Only regenerate the CSV when this file is run as a script, so that importing
# it for its helpers (e.g. get_features_single) does not rescan the image
# folders and rewrite the data file as an import side effect.
if __name__ == "__main__":
    save_features_to_CSV()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80

3. 特征数据加工

def pre_data(csv_file="data_500.csv"):
    """Load the feature CSV and split it into training and test sets.

    The file is read with explicit column names (``feature_0`` ...
    ``feature_783`` followed by ``true_number``), so its first row is treated
    as data, not as a header.

    Args:
        csv_file: Name of the CSV file inside ``data_csvs/``. Defaults to
            ``"data_500.csv"``; pass another name when the data set was
            generated with a different number of samples.

    Side effects:
        Sets the globals ``X_train``, ``X_test``, ``y_train`` and ``y_test``
        to an 80 % / 20 % split made with ``random_state=0``.
    """
    n_features = 784

    # Header names for the 785 CSV columns: 784 features plus the label
    column_names = ["feature_" + str(i) for i in range(n_features)]
    column_names.append("true_number")

    # Read the CSV
    path_csv = "data_csvs/"
    data = pd.read_csv(path_csv + csv_file, names=column_names)

    # Split into feature matrix / label column, then train / test
    global X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(
        data[column_names[:n_features]],
        data[column_names[n_features]],
        test_size=0.2,  # 80% for training, 20% for testing
        random_state=0,
    )

# Directory prefix that the way_* training functions prepend to the model
# file names they dump with joblib.
path_saved_models = "data_models/"
# Load the CSV and populate the global train/test splits used by way_*.
pre_data()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23

4. k近邻法模型训练

def way_KNN():
    """Train a 3-nearest-neighbours classifier, report its accuracy, save it.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    splits. Features are standardized with ``StandardScaler`` before they
    reach the classifier. Scaler and classifier are fitted as one pipeline so
    that the saved artifact re-applies the training-time scaling whenever it
    predicts, instead of leaving the fitted scaler behind in this function.

    Side effects:
        * sets the global ``y_predict_KNN`` to the test-set predictions,
        * prints the test-set accuracy,
        * dumps the fitted pipeline to ``path_saved_models + 'KNN.model'``.
    """
    # Imported locally so the function is self-contained.
    import numpy as np
    from sklearn.pipeline import make_pipeline

    # Plain arrays: the pipeline is fitted without column names, so it later
    # accepts plain feature lists just like a bare estimator would.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    KNN = make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(algorithm='auto', leaf_size=50, metric='minkowski',
                             metric_params=None, n_jobs=1, n_neighbors=3),
    )
    KNN.fit(train_features, y_train)

    global y_predict_KNN
    y_predict_KNN = KNN.predict(test_features)
    score_KNN = KNN.score(test_features, y_test)
    print("The accurary of KNN:", '\t', score_KNN)
    # Save the model (scaler + classifier)
    joblib.dump(filename=path_saved_models +'KNN.model',value=KNN)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22

5. Logistic 回归模型训练

# LR: logistic regression classifier (linear model)
def way_LR():
    """Train a logistic-regression classifier, report its accuracy, save it.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    splits. Features are standardized with ``StandardScaler`` before they
    reach the classifier. Scaler and classifier are fitted as one pipeline so
    that the saved artifact re-applies the training-time scaling whenever it
    predicts, instead of leaving the fitted scaler behind in this function.

    Side effects:
        * sets the global ``y_predict_LR`` to the test-set predictions,
        * prints the test-set accuracy,
        * dumps the fitted pipeline to ``path_saved_models + 'LR.model'``.
    """
    # Imported locally so the function is self-contained.
    import numpy as np
    from sklearn.pipeline import make_pipeline

    # Plain arrays: the pipeline is fitted without column names, so it later
    # accepts plain feature lists just like a bare estimator would.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    # Standardization followed by a default LogisticRegression
    LR = make_pipeline(StandardScaler(), LogisticRegression())

    # Fit the scaler and the model parameters on the training split
    LR.fit(train_features, y_train)

    # Predict the test split with the trained model
    global y_predict_LR
    y_predict_LR = LR.predict(test_features)

    # Mean accuracy on the test split
    score_LR = LR.score(test_features, y_test)
    print("The accurary of LR:", '\t', score_LR)
    # Save the model (scaler + classifier)
    joblib.dump(filename=path_saved_models +'LR.model',value=LR)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28

6. 随机梯度下降模型

# SGDC: linear classifier fitted with stochastic gradient descent
def way_SGDC():
    """Train an SGD linear classifier, report its accuracy, save it.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    splits. Features are standardized with ``StandardScaler`` before they
    reach the classifier. Scaler and classifier are fitted as one pipeline so
    that the saved artifact re-applies the training-time scaling whenever it
    predicts, instead of leaving the fitted scaler behind in this function.

    Side effects:
        * sets the global ``y_predict_SGDC`` to the test-set predictions,
        * prints the test-set accuracy,
        * dumps the fitted pipeline to ``path_saved_models + 'SGDC.model'``.
    """
    # Imported locally so the function is self-contained.
    import numpy as np
    from sklearn.pipeline import make_pipeline

    # Plain arrays: the pipeline is fitted without column names, so it later
    # accepts plain feature lists just like a bare estimator would.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    SGDC = make_pipeline(StandardScaler(), SGDClassifier(penalty='l2',alpha=0.1))
    SGDC.fit(train_features, y_train)

    global y_predict_SGDC
    y_predict_SGDC = SGDC.predict(test_features)

    score_SGDC = SGDC.score(test_features, y_test)
    print("The accurary of SGDC:", '\t', score_SGDC)
    # Save the model (scaler + classifier)
    joblib.dump(filename=path_saved_models +'SGDC.model',value=SGDC)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22

7. 支持向量机模型训练

# SVC: support vector classifier (SVM), configured below with an RBF kernel
def way_SVC():
    """Train an RBF-kernel support vector classifier, report accuracy, save it.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    splits. Features are standardized with ``StandardScaler`` before they
    reach the classifier. Scaler and classifier are fitted as one pipeline so
    that the saved artifact re-applies the training-time scaling whenever it
    predicts, instead of leaving the fitted scaler behind in this function.

    Side effects:
        * sets the global ``y_predict_SVC`` to the test-set predictions,
        * prints the test-set accuracy,
        * dumps the fitted pipeline to ``path_saved_models + 'SVC.model'``.
    """
    # Imported locally so the function is self-contained.
    import numpy as np
    from sklearn.pipeline import make_pipeline

    # Plain arrays: the pipeline is fitted without column names, so it later
    # accepts plain feature lists just like a bare estimator would.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    LSVC = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1E6))
    LSVC.fit(train_features, y_train)

    global y_predict_SVC
    y_predict_SVC = LSVC.predict(test_features)

    score_SVC = LSVC.score(test_features, y_test)
    print("The accurary of SVC:", '\t', score_SVC)
    # Save the model (scaler + classifier)
    joblib.dump(filename=path_saved_models +'SVC.model',value=LSVC)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21

8. 多层网络模型训练

# Multi-layer perceptron classifier (neural network)
def way_MLPC():
    """Train a three-hidden-layer MLP classifier, report accuracy, save it.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    splits. Features are standardized with ``StandardScaler`` before they
    reach the classifier. Scaler and classifier are fitted as one pipeline so
    that the saved artifact re-applies the training-time scaling whenever it
    predicts, instead of leaving the fitted scaler behind in this function.

    Side effects:
        * sets the global ``y_predict_MLPC`` to the test-set predictions,
        * prints the test-set accuracy,
        * dumps the fitted pipeline to ``path_saved_models + 'MLPC.model'``.
    """
    # Imported locally so the function is self-contained.
    import numpy as np
    from sklearn.pipeline import make_pipeline

    # Plain arrays: the pipeline is fitted without column names, so it later
    # accepts plain feature lists just like a bare estimator would.
    train_features = np.asarray(X_train)
    test_features = np.asarray(X_test)

    MLPC = make_pipeline(
        StandardScaler(),
        MLPClassifier(hidden_layer_sizes=(45, 45, 45), max_iter=10000),
    )
    MLPC.fit(train_features, y_train)

    global y_predict_MLPC
    y_predict_MLPC = MLPC.predict(test_features)

    score_MLPC = MLPC.score(test_features, y_test)
    print("The accurary of MLPC:", '\t', score_MLPC)
    # Save the model (scaler + classifier)
    joblib.dump(filename=path_saved_models +'MLPC.model',value=MLPC)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21

9. 调用及输出

# Train, score and save each model, one after another.
for train_model in (way_LR, way_SVC, way_MLPC, way_SGDC, way_KNN):
    train_model()
  • 1
  • 2
  • 3
  • 4
  • 5
The accurary of LR: 	 0.89
The accurary of SVC: 	 0.9
The accurary of MLPC: 	 0.85
The accurary of SGDC: 	 0.9
The accurary of KNN: 	 0.84
  • 1
  • 2
  • 3
  • 4
  • 5

10. 对单个图片进行预测

在这里插入图片描述

# Use the trained models saved on disk to predict the label of a single image
import joblib
from PIL import Image

import get_features

path_saved_models = "data_models/"

# Extract the features. The context manager closes the image as soon as the
# features are read, even if extraction raises.
with Image.open("3-手写数字集/0/mnist_train_359.png") as img:
    features_test_png = get_features.get_features_single(img)

# NOTE(review): the features are passed to the models exactly as extracted
# (raw 0/1 values). If the models were fitted on standardized features, the
# saved artifact must apply that scaling itself (e.g. a scaler+model
# pipeline); otherwise these inputs are on a different scale than the
# training data -- confirm against the training code.
# NOTE(review): joblib.load unpickles the file; only load model files that
# come from a trusted source.
for model_name in ("LR", "SVC", "MLPC", "SGDC", "KNN"):
    model = joblib.load(path_saved_models + model_name + ".model")
    prediction = model.predict([features_test_png])
    print(model_name + ":", prediction[0])
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38

结果

LR: 0
SVC: 0
MLPC: 0
SGDC: 0
KNN: 0
  • 1
  • 2
  • 3
  • 4
  • 5
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小丑西瓜9/article/detail/343031
推荐阅读
相关标签
  

闽ICP备14008679号