赞
踩
1、掌握sklearn转换器、估计器的使用;
2、掌握sklearn数据标准化和数据划分的方法;
3、掌握sklearn中聚类、分类、回归模型的构建和评价方法。
4、尝试用Python语言实现K-means算法和SVM算法。
1、完成实验步骤中的各项任务;
2、写出实验报告,内容要求有Python 代码和实验结果;
3、鼓励大家给出不同的,更优的代码实现。
1、使用sklearn处理wine和wine_quality数据集。(具体要求见第六章实训1的内容)
(1)掌握sklearn转换器的使用方法;
(2)掌握sklearn进行数据划分、降维的使用方法。
import os

os.environ["OMP_NUM_THREADS"] = '1'  # work around a KMeans/MKL warning on Windows

import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler  # z-score standardization
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import fowlkes_mallows_score  # FMI evaluation
from sklearn.metrics import calinski_harabasz_score

warnings.filterwarnings('ignore', category=FutureWarning)

# Load the two datasets (winequality.csv is semicolon-separated).
wine = pd.read_csv('./wine.csv')
winequality = pd.read_csv('./winequality.csv', sep=';')

# Separate features from labels.
wine_data = wine.iloc[:, 1:]
wine_target = wine['Class']
winequality_data = winequality.iloc[:, :-1]
winequality_target = winequality['quality']

# Hold out 10% of each dataset as a test split.
wine_data_train, wine_data_test, wine_target_train, wine_target_test = \
    train_test_split(wine_data, wine_target, test_size=0.1, random_state=6)
winequality_data_train, winequality_data_test, \
    winequality_target_train, winequality_target_test = \
    train_test_split(winequality_data, winequality_target,
                     test_size=0.1, random_state=6)

# Standardize: fit the scaler on the training split only, then apply the
# fitted rule to both splits so the test data never influences the scaling.
stdScale = StandardScaler().fit(wine_data_train)
wine_trainScaler = stdScale.transform(wine_data_train)
wine_testScaler = stdScale.transform(wine_data_test)

stdScale = StandardScaler().fit(winequality_data_train)
winequality_trainScaler = stdScale.transform(winequality_data_train)
winequality_testScaler = stdScale.transform(winequality_data_test)

# PCA to 5 components, again fitted on the training split only.
pca = PCA(n_components=5).fit(wine_trainScaler)
wine_trainPca = pca.transform(wine_trainScaler)
wine_testPca = pca.transform(wine_testScaler)

pca = PCA(n_components=5).fit(winequality_trainScaler)
winequality_trainPca = pca.transform(winequality_trainScaler)
winequality_testPca = pca.transform(winequality_testScaler)

# K-Means on the standardized wine training data (3 known classes).
# n_init is set explicitly on every KMeans call for consistency and to
# avoid the sklearn FutureWarning about its changing default.
kmeans = KMeans(n_clusters=3, random_state=1, n_init=10).fit(wine_trainScaler)
print('构建的KMeans模型为:\n', kmeans)

# FMI score of the 3-cluster model against the true labels.
score = fowlkes_mallows_score(wine_target_train, kmeans.labels_)
print("wine数据集的FMI:%f" % score)

# FMI for cluster counts 2..10 (the original message said "iris" —
# the dataset here is wine).
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, random_state=123, n_init=10).fit(wine_trainScaler)
    score = fowlkes_mallows_score(wine_target_train, kmeans.labels_)
    print('wine数据聚%d类FMI评价分值为:%f' % (i, score))

# Silhouette analysis.  Cluster the standardized feature matrix: the
# original clustered the raw `wine` DataFrame, which leaks the Class
# label column into the features and mixes unscaled magnitudes.
silhouette_scores = []
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, random_state=1, n_init=10).fit(wine_trainScaler)
    silhouette_scores.append(silhouette_score(wine_trainScaler, kmeans.labels_))
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), silhouette_scores, linewidth=1.5, linestyle="-")
plt.show()

# Calinski-Harabasz index for cluster counts 2..10 (same feature matrix,
# and the message said "seeds" — this is the wine dataset).
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, random_state=1, n_init=10).fit(wine_trainScaler)
    score = calinski_harabasz_score(wine_trainScaler, kmeans.labels_)
    print('wine数据聚%d类calinski_harabaz指数为:%f' % (i, score))
运行图如下:
2、构建基于wine数据集的K-Means的聚类模型、SVM模型和回归模型(具体要求见第六章实训2至实验4的内容)。
(1)掌握聚类模型、SVM模型和回归模型的构建方法;
(2)掌握聚类模型、SVM模型和回归模型的评价方法。
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans                    # missing in the original (NameError)
from sklearn.svm import SVC                           # missing in the original (NameError)
from sklearn.linear_model import LinearRegression     # missing in the original (NameError)
from sklearn.metrics import accuracy_score, mean_squared_error  # missing in the original
import matplotlib.pyplot as plt
import seaborn as sns

# Render Chinese labels correctly in matplotlib figures.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 1. Read datasets.  Relative paths instead of the original user-specific
# absolute Windows path, so the script runs from the data directory.
wine = pd.read_csv('./wine.csv')
winequality = pd.read_csv('./winequality.csv', sep=';')

# Separate features from labels.
wine_data = wine.iloc[:, 1:]
wine_target = wine['Class']
winequality_data = winequality.iloc[:, :-1]
winequality_target = winequality['quality']

# 2. Split into training and test sets (10% held out).
wine_data_train, wine_data_test, wine_target_train, wine_target_test = train_test_split(
    wine_data, wine_target, test_size=0.1, random_state=6)
winequality_data_train, winequality_data_test, winequality_target_train, winequality_target_test = train_test_split(
    winequality_data, winequality_target, test_size=0.1, random_state=6)

# 3. Standardize: fit on the training split only, apply to both splits.
stdScale = StandardScaler().fit(wine_data_train)
wine_trainScaler = stdScale.transform(wine_data_train)
wine_testScaler = stdScale.transform(wine_data_test)
stdScale = StandardScaler().fit(winequality_data_train)
winequality_trainScaler = stdScale.transform(winequality_data_train)
winequality_testScaler = stdScale.transform(winequality_data_test)

# Visualize the standardized feature distributions.
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.hist(wine_trainScaler.flatten(), bins=30, color='lightgreen', edgecolor='black')
plt.title('标准化后的 Wine 训练数据直方图')
plt.xlabel('标准化值')
plt.ylabel('频率')
plt.subplot(1, 2, 2)
plt.hist(winequality_trainScaler.flatten(), bins=30, color='lightblue', edgecolor='black')
plt.title('标准化后的 Wine Quality 训练数据直方图')
plt.xlabel('标准化值')
plt.ylabel('频率')
plt.show()

# 4. PCA to 5 components.  The original fitted and transformed each PCA
# twice with identical code; once is sufficient.
pca = PCA(n_components=5).fit(wine_trainScaler)
wine_trainPca = pca.transform(wine_trainScaler)
wine_testPca = pca.transform(wine_testScaler)
pca = PCA(n_components=5).fit(winequality_trainScaler)
winequality_trainPca = pca.transform(winequality_trainScaler)
winequality_testPca = pca.transform(winequality_testScaler)

# Visualize the first two principal components of each dataset.
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(x=wine_trainPca[:, 0], y=wine_trainPca[:, 1],
                hue=wine_target_train, palette='muted', s=50)
plt.title('Wine 数据的 PCA 转换结果 (前两个主成分)')
plt.xlabel('主成分 1')
plt.ylabel('主成分 2')
plt.legend(title='Class', loc='best')
plt.subplot(1, 2, 2)
sns.scatterplot(x=winequality_trainPca[:, 0], y=winequality_trainPca[:, 1],
                hue=winequality_target_train, palette='muted', s=50)
plt.title('Wine Quality 数据的 PCA 转换结果 (前两个主成分)')
plt.xlabel('主成分 1')
plt.ylabel('主成分 2')
plt.legend(title='Quality', loc='best')
plt.show()

# Modelling matrices for the wine dataset.  The original referenced
# `data` and `target` below without ever defining them (NameError).
data = StandardScaler().fit_transform(wine_data)
target = wine_target.values

# 1. K-Means clustering model.
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=1, n_init=10)
kmeans.fit(data)
cluster_labels = kmeans.labels_

# 2. SVM classification model.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1)
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)

# 3. Regression model (linear regression on the numeric class label,
# as the exercise requires).
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
regression_predictions = regression_model.predict(X_test)
regression_mse = mean_squared_error(y_test, regression_predictions)

# Visualize all three models side by side.
plt.figure(figsize=(12, 4))
# K-Means clusters over the first two (standardized) features.
plt.subplot(1, 3, 1)
plt.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', edgecolor='k')
plt.title('K-Means聚类')
plt.xlabel('特征1')
plt.ylabel('特征2')
# SVM predictions over the first two features of the test split.
plt.subplot(1, 3, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c=svm_predictions, cmap='viridis', edgecolor='k')
plt.title('SVM分类')
plt.xlabel('特征1')
plt.ylabel('特征2')
# Regression: true vs. predicted values, with the identity line for reference.
plt.subplot(1, 3, 3)
plt.scatter(y_test, regression_predictions)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)],
         linestyle='--', color='red', linewidth=2)
plt.title('线性回归: 真实 vs 预测')
plt.xlabel('真实值')
plt.ylabel('预测值')
plt.tight_layout()
plt.show()
运行图如下:
3、今有沪深300指数2014年的交易数据,其数据结构表1所示。
表1 沪深300指数2014年交易数据
字段依次表示指数代码、交易日期、开盘价、最高价、最低价、收盘价、成交量。
创建一个Python脚本,命名为沪深300指数.ipynb,完成以下任务:
1)请计算如下指标:
A1(收盘价 / 均价):即收盘价 / 过去 10 个交易日的移动平均收盘价
A2(现量 / 均量):即成交量 / 过去 10 个交易日的移动平均成交量
A3(收益率):( 当日收盘价 - 前日收盘价 )/ 前日收盘价
A4(最高价 / 均价):最高价 / 过去 10 个交易日的移动平均收盘价
A5(最低价 / 均价):最低价 / 过去 10 个交易日的移动平均收盘价
A6(极差):最高价 - 最低价(衡量波动性)
A7(瞬时收益):收盘价 - 开盘价
Y(决策变量):后交易日收盘价 - 当前交易日收盘价,如果大于 0,记为 1;如果小于等于 0,记为 -1。
同时对指标A1~A7作标准化处理:( 当前值 - 均值 )/ 标准差,最终得到以下标准的数据结构形式:
ID A1 A2 A3 A4 A5 A6 A7 Y
1
2
3
4
5
6
……
2)取后30条记录作为测试样本,剩下的数据记录为训练样本,利用支持向量机模型进行训练及测试,获得模型的准确率和预测准确率,分别记为score和Rv,并在命令窗口中输出score和Rv。
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load the CSI 300 (沪深300) 2014 daily trading data.
# Column convention (CSMAR style): Idxtrd03 = open, Idxtrd04 = high,
# Idxtrd05 = low, Idxtrd06 = close, Idxtrd07 = volume
# -- NOTE(review): confirm Idxtrd07 is the volume column in the actual file.
data = pd.read_excel('./沪深300指数2014年交易数据.xlsx')  # 替换为你的实际数据文件名

# 10-day moving averages, computed once and reused (the original
# recomputed the close-price MA for each indicator).
close_ma10 = data['Idxtrd06'].rolling(window=10).mean()
volume_ma10 = data['Idxtrd07'].rolling(window=10).mean()

# Indicators A1-A7 as specified in the assignment.
data['A1'] = data['Idxtrd06'] / close_ma10    # close / 10-day MA of close
# A2 is volume / 10-day MA of volume; the original reused the close
# price here, making A2 a duplicate of A1.
data['A2'] = data['Idxtrd07'] / volume_ma10
# A3 is (today's close - previous close) / previous close; the original
# had the subtraction reversed, inverting the sign of every return.
data['A3'] = (data['Idxtrd06'] - data['Idxtrd06'].shift(1)) / data['Idxtrd06'].shift(1)
data['A4'] = data['Idxtrd04'] / close_ma10    # high / 10-day MA of close
data['A5'] = data['Idxtrd05'] / close_ma10    # low / 10-day MA of close
data['A6'] = data['Idxtrd04'] - data['Idxtrd05']  # daily range (volatility)
data['A7'] = data['Idxtrd06'] - data['Idxtrd03']  # close - open
# Decision variable: +1 if the next day's close is higher, else -1.
data['Y'] = np.where(data['Idxtrd06'].shift(-1) - data['Idxtrd06'] > 0, 1, -1)

# Z-score standardization of each indicator: (value - mean) / std.
features = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7']
for feature in features:
    data[feature] = (data[feature] - data[feature].mean()) / data[feature].std()

# Last 30 records form the test sample; everything before is training.
train_data = data.head(len(data) - 30)
test_data = data.tail(30)

X_train = train_data[features]
y_train = train_data['Y']
X_test = test_data[features]
y_test = test_data['Y']

# Impute NaNs (produced by the 10-day rolling window and the shift) with
# the training-set means; the same fitted imputer transforms the test set.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

print("标准化和插补后的训练数据:")
print(pd.DataFrame(X_train_imputed, columns=features))
print("\n标准化和插补后的测试数据:")
print(pd.DataFrame(X_test_imputed, columns=features))

# Train and evaluate the SVM.  The original predicted and scored on the
# raw (un-imputed) test matrix; both must use the imputed features.
model = SVC()
model.fit(X_train_imputed, y_train)
predictions = model.predict(X_test_imputed)

score = model.score(X_test_imputed, y_test)   # model accuracy (score)
rv = accuracy_score(y_test, predictions)      # prediction accuracy (Rv)
print(f"Score: {score}")
print(f"Rv: {rv}")
运行图如下:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。