当前位置:   article > 正文

【Python】实验五使用Sklearn构建模型_sklearn训练模型 python代码

sklearn训练模型 python代码

一、实验目的

1、掌握sklearn转换器、估计器的使用;
2、掌握sklearn数据标准化和数据划分的方法;
3、掌握sklearn中聚类、分类、回归模型的构建和评价方法。
4、尝试用Python语言实现K-means算法和SVM算法。

二、实验要求

1、完成实验步骤中的各项任务;
2、写出实验报告,内容要求有Python 代码和实验结果
3、鼓励大家给出不同的,更优的代码实现。

三、实验内容

1、使用sklearn处理wine和wine_quality数据集。(具体要求见第六章实训1的内容)
(1)掌握sklearn转换器的使用方法;
(2)掌握sklearn进行数据划分、降维的使用方法。

import os

os.environ["OMP_NUM_THREADS"] = '1'

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler  # 标准差标准化
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
# Load both datasets; winequality.csv is ';'-separated.
wine = pd.read_csv('./wine.csv')
winequality = pd.read_csv('./winequality.csv', sep=';')

# Separate features from labels.
# wine: the label is the 'Class' column, features are every column after the first.
wine_target = wine['Class']
wine_data = wine.iloc[:, 1:]

# winequality: the label is the 'quality' column, features are every column but the last.
winequality_target = winequality['quality']
winequality_data = winequality.iloc[:, :-1]

# Hold out 10% of each dataset as a test split (fixed seed for repeatability).
from sklearn.model_selection import train_test_split

(wine_data_train, wine_data_test,
 wine_target_train, wine_target_test) = train_test_split(
    wine_data, wine_target, test_size=0.1, random_state=6)

(winequality_data_train, winequality_data_test,
 winequality_target_train, winequality_target_test) = train_test_split(
    winequality_data, winequality_target, test_size=0.1, random_state=6)

# Standardize: learn mean/std from the training split only, then apply the
# same transformation to the test split.
stdScale = StandardScaler()
wine_trainScaler = stdScale.fit_transform(wine_data_train)
wine_testScaler = stdScale.transform(wine_data_test)

stdScale = StandardScaler()
winequality_trainScaler = stdScale.fit_transform(winequality_data_train)
winequality_testScaler = stdScale.transform(winequality_data_test)

# PCA: keep 5 components, fitted on the standardized training split and
# applied unchanged to the test split.
pca = PCA(n_components=5)
pca.fit(wine_trainScaler)
wine_trainPca = pca.transform(wine_trainScaler)
wine_testPca = pca.transform(wine_testScaler)

pca = PCA(n_components=5)
pca.fit(winequality_trainScaler)
winequality_trainPca = pca.transform(winequality_trainScaler)
winequality_testPca = pca.transform(winequality_testScaler)

# Build a 3-cluster K-Means model on the standardized training features.
# n_init is set explicitly (as in the sweep below) so the result does not
# depend on the default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=3, random_state=1, n_init=10).fit(wine_trainScaler)

print('构建的KMeans模型为:\n', kmeans)

from sklearn.metrics import fowlkes_mallows_score  # FMI (Fowlkes-Mallows index)

# Compare the cluster assignments with the true class labels of the training split.
score = fowlkes_mallows_score(wine_target_train, kmeans.labels_)
print("wine数据集的FMI:%f" % (score))

# Sweep the number of clusters from 2 to 10 and report the FMI for each.
for i in range(2, 11):
    # Build and fit the model for this cluster count.
    kmeans = KMeans(n_clusters=i, random_state=123, n_init=10).fit(wine_trainScaler)
    score = fowlkes_mallows_score(wine_target_train, kmeans.labels_)
    # The data being clustered is the wine set (the message previously said "iris").
    print('wine数据聚%d类FMI评价分值为:%f' % (i, score))

# Silhouette coefficient for k = 2..10.
# The models are fitted and scored on the standardized training features
# (wine_trainScaler). The raw `wine` frame must not be used here: it still
# contains the 'Class' label column and its features are on very different scales.
silhouettteScore = []
for i in range(2, 11):
    # Build and fit the model for this cluster count.
    kmeans = KMeans(n_clusters=i, random_state=1, n_init=10).fit(wine_trainScaler)
    score = silhouette_score(wine_trainScaler, kmeans.labels_)
    silhouettteScore.append(score)
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), silhouettteScore, linewidth=1.5, linestyle="-")
plt.show()

# Calinski-Harabasz index for k = 2..10, on the same standardized features.
from sklearn.metrics import calinski_harabasz_score

for i in range(2, 11):
    # Build and fit the model for this cluster count.
    kmeans = KMeans(n_clusters=i, random_state=1, n_init=10).fit(wine_trainScaler)
    score = calinski_harabasz_score(wine_trainScaler, kmeans.labels_)
    # The data being clustered is the wine set (the message previously said "seeds").
    print('wine数据聚%d类calinski_harabaz指数为:%f' % (i, score))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93

运行图如下:
在这里插入图片描述
在这里插入图片描述

2、构建基于wine数据集的K-Means聚类模型、SVM模型和回归模型(具体要求见第六章实训2至实训4的内容)。
(1)掌握聚类模型、SVM模型和回归模型的构建方法;
(2)掌握聚类模型、SVM模型和回归模型的评价方法。

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib setup: select the SimHei font so the Chinese titles/labels below
# can be rendered, and disable the Unicode minus sign for axis ticks.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# 1. Load the datasets (winequality.csv is ';'-separated).
wine = pd.read_csv(r'C:\Users\xf224\Desktop\第六章数据\wine.csv')
winequality = pd.read_csv(r'C:\Users\xf224\Desktop\第六章数据\winequality.csv', sep=';')

# Features vs. labels: wine uses the 'Class' column as label and every column
# after the first as features; winequality uses 'quality' as label and every
# column but the last as features.
wine_target = wine['Class']
wine_data = wine.iloc[:, 1:]
winequality_target = winequality['quality']
winequality_data = winequality.iloc[:, :-1]

# 2. Hold out 10% of each dataset for testing (fixed seed).
(wine_data_train, wine_data_test,
 wine_target_train, wine_target_test) = train_test_split(
    wine_data, wine_target, test_size=0.1, random_state=6)
(winequality_data_train, winequality_data_test,
 winequality_target_train, winequality_target_test) = train_test_split(
    winequality_data, winequality_target, test_size=0.1, random_state=6)

# 3. Standardize: statistics come from the training split only and are then
# applied to the matching test split.
stdScale = StandardScaler()
wine_trainScaler = stdScale.fit_transform(wine_data_train)
wine_testScaler = stdScale.transform(wine_data_test)
stdScale = StandardScaler()
winequality_trainScaler = stdScale.fit_transform(winequality_data_train)
winequality_testScaler = stdScale.transform(winequality_data_test)

# Histogram of all standardized training values, one panel per dataset.
plt.figure(figsize=(14, 6))
hist_panels = (
    (wine_trainScaler, 'lightgreen', '标准化后的 Wine 训练数据直方图'),
    (winequality_trainScaler, 'lightblue', '标准化后的 Wine Quality 训练数据直方图'),
)
for panel_no, (scaled_values, fill_color, heading) in enumerate(hist_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.hist(scaled_values.flatten(), bins=30, color=fill_color, edgecolor='black')
    plt.title(heading)
    plt.xlabel('标准化值')
    plt.ylabel('频率')
plt.show()

# 4. PCA dimensionality reduction
# Each PCA keeps 5 components, is fitted on the standardized training split
# only, and is then applied to the matching test split. (The sequence was
# previously written out twice; one pass yields the same results.)
pca = PCA(n_components=5).fit(wine_trainScaler)
wine_trainPca = pca.transform(wine_trainScaler)
wine_testPca = pca.transform(wine_testScaler)
pca = PCA(n_components=5).fit(winequality_trainScaler)
winequality_trainPca = pca.transform(winequality_trainScaler)
winequality_testPca = pca.transform(winequality_testScaler)

# Visualize the PCA result: scatter the first two principal components of each
# training split, colored by its label.
plt.figure(figsize=(14, 6))
pca_panels = (
    (wine_trainPca, wine_target_train,
     'Wine 数据的 PCA 转换结果 (前两个主成分)', 'Class'),
    (winequality_trainPca, winequality_target_train,
     'Wine Quality 数据的 PCA 转换结果 (前两个主成分)', 'Quality'),
)
for panel_no, (projected, point_labels, heading, legend_title) in enumerate(pca_panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.scatterplot(x=projected[:, 0], y=projected[:, 1], hue=point_labels, palette='muted', s=50)
    plt.title(heading)
    plt.xlabel('主成分 1')
    plt.ylabel('主成分 2')
    plt.legend(title=legend_title, loc='best')
plt.show()

# Estimators and metrics used below; this script did not import them before.
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.svm import SVC

# `data` and `target` were referenced below without ever being defined.
# They are bound to the wine features and the 'Class' labels, as NumPy arrays,
# because the plotting code indexes them positionally (data[:, 0]).
# The features are standardized over the whole dataset since K-Means clusters
# every sample; note this scaler therefore also sees the rows that the
# supervised models later hold out for testing.
data = StandardScaler().fit_transform(wine_data)
target = wine_target.to_numpy()

# 1. K-Means clustering model
num_clusters = 3
# n_init is set explicitly so the result does not depend on the default of the
# installed scikit-learn version.
kmeans = KMeans(n_clusters=num_clusters, random_state=1, n_init=10)
kmeans.fit(data)
cluster_labels = kmeans.labels_

# One 80/20 split shared by the SVM and the regression model (the original
# performed the identical split twice with the same seed).
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1)

# 2. SVM classification model
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print('SVM分类准确率:%f' % svm_accuracy)

# 3. Regression model (linear regression)
# NOTE(review): the regression target here is the wine class label, as in the
# original code; confirm whether the exercise intends the wine-quality score.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
regression_predictions = regression_model.predict(X_test)
regression_mse = mean_squared_error(y_test, regression_predictions)
print('线性回归均方误差:%f' % regression_mse)

# Visualize the three results side by side.
plt.figure(figsize=(12, 4))
# 1. K-Means: first two standardized features, colored by cluster.
plt.subplot(1, 3, 1)
plt.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', edgecolor='k')
plt.title('K-Means聚类')
plt.xlabel('特征1')
plt.ylabel('特征2')
# 2. SVM: test samples, colored by predicted class.
plt.subplot(1, 3, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c=svm_predictions, cmap='viridis', edgecolor='k')
plt.title('SVM分类')
plt.xlabel('特征1')
plt.ylabel('特征2')
# 3. Linear regression: predicted vs. true values, with the y = x reference line.
plt.subplot(1, 3, 3)
plt.scatter(y_test, regression_predictions)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle='--', color='red', linewidth=2)
plt.title('线性回归: 真实 vs 预测')
plt.xlabel('真实值')
plt.ylabel('预测值')
plt.tight_layout()
plt.show()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117

运行图如下:
在这里插入图片描述

3、今有沪深300指数2014年的交易数据,其数据结构如表1所示。
表1 沪深300指数2014年交易数据
在这里插入图片描述

字段依次表示指数代码、交易日期、开盘价、最高价、最低价、收盘价、成交量。
新建一个Python脚本,命名为"沪深300指数.ipynb",完成以下任务:
1)请计算如下指标:
A1(收盘价 / 均价):即收盘价 / 过去 10 个交易日的移动平均收盘价
A2(现量 / 均量):即成交量 / 过去 10 个交易日的移动平均成交量
A3(收益率):( 当日收盘价 - 前日收盘价 )/ 前日收盘价
A4(最高价 / 均价):最高价 / 过去 10 个交易日的移动平均收盘价
A5(最低价 / 均价):最低价 / 过去 10 个交易日的移动平均收盘价
A6(极差):最高价 - 最低价(衡量波动性)
A7(瞬时收益):收盘价 - 开盘价
Y(决策变量):后交易日收盘价 - 当前交易日收盘价,如果大于 0,记为 1;如果小于等于 0,记为 -1。
同时对指标A1~A7作标准化处理:( 当前值 - 均值 )/ 标准差,最终得到以下标准的数据结构形式:
ID A1 A2 A3 A4 A5 A6 A7 Y
1
2
3
4
5
6
……
2)取后30条记录作为测试样本,剩下的数据记录为训练样本,利用支持向量机模型进行训练及测试,获得模型的准确率和预测准确率,分别记为score和Rv,并在命令窗口中输出score和Rv。

import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load the 2014 CSI 300 trading data.
data = pd.read_excel('./沪深300指数2014年交易数据.xlsx')  # replace with your actual data file name
imputer = SimpleImputer(strategy='mean')

# Resolve the price/volume columns by position. The task statement gives the
# field order as: index code, trade date, open, high, low, close, volume.
# NOTE(review): confirm this order against the spreadsheet header before
# relying on the results.
open_price = data.iloc[:, 2]
high_price = data.iloc[:, 3]
low_price = data.iloc[:, 4]
close_price = data.iloc[:, 5]
volume = data.iloc[:, 6]

# Indicators A1..A7 as defined in the task statement.
close_ma10 = close_price.rolling(window=10).mean()  # 10-day moving average close
prev_close = close_price.shift(1)
data['A1'] = close_price / close_ma10
# A2 is volume over its own 10-day moving average (it was a copy of A1 before).
data['A2'] = volume / volume.rolling(window=10).mean()
# A3 is (today - previous day) / previous day (the sign was inverted before).
data['A3'] = (close_price - prev_close) / prev_close
data['A4'] = high_price / close_ma10
data['A5'] = low_price / close_ma10
data['A6'] = high_price - low_price
data['A7'] = close_price - open_price

# Decision variable: +1 if the next trading day's close is higher, else -1.
data['Y'] = np.where(close_price.shift(-1) - close_price > 0, 1, -1)
# The last row has no next trading day, so its label is undefined; drop it
# rather than silently labelling it -1.
data = data.iloc[:-1].copy()

# Standardize every indicator: (value - mean) / standard deviation.
features = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7']
data[features] = (data[features] - data[features].mean()) / data[features].std()

# The last 30 records form the test sample; everything before is training data.
train_data = data.iloc[:-30]
test_data = data.iloc[-30:]

# Training set. The first rows have NaN indicators (incomplete rolling window,
# no previous close); they are filled with the column mean of the training data.
X_train = train_data[features]
y_train = train_data['Y']
X_train_imputed = imputer.fit_transform(X_train)
print("标准化和插补后的训练数据:")
print(pd.DataFrame(X_train_imputed, columns=features))
model = SVC()
model.fit(X_train_imputed, y_train)

# Test set: apply the imputer fitted on the training data, then predict on the
# imputed array so the model sees the same kind of input it was trained on.
X_test = test_data[features]
y_test = test_data['Y']
X_test_imputed = imputer.transform(X_test)
predictions = model.predict(X_test_imputed)
print("\n标准化和插补后的测试数据:")
print(pd.DataFrame(X_test_imputed, columns=features))

# score: model accuracy on the training sample; Rv: prediction accuracy on the
# 30 test records (previously both were computed on the test set).
score = model.score(X_train_imputed, y_train)
rv = accuracy_score(y_test, predictions)

# Report the results.
print(f"Score: {score}")
print(f"Rv: {rv}")
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49

运行图如下:
在这里插入图片描述
在这里插入图片描述

创作不易,感谢未来首富们的支持与关注!

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/爱喝兽奶帝天荒/article/detail/849371
推荐阅读
相关标签
  

闽ICP备14008679号