赞
踩
# Standard library
import warnings

# Third-party: data handling
import numpy as np
import pandas as pd
from scipy import stats

# Third-party: modeling
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Progress bar for the classifier-building loops
from tqdm import tqdm

# Suppress warnings (e.g. sklearn convergence warnings) for cleaner output
warnings.filterwarnings('ignore')
# Load the letter-recognition dataset (first column = letter, rest = features).
data = pd.read_csv('letter_recognition.csv')
# Split into features and label.
X = data.iloc[:, 1:]
y = data.iloc[:, :1]
# Encode the letter labels as integers 0..K-1.
y = LabelEncoder().fit_transform(y.values.ravel())
# Deterministic 60/40 train/test split (no shuffling, so runs are reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
# Distinct class labels present in the training set.
# NOTE(review): assumes every class appears in the training split — TODO confirm.
labels = np.unique(y_train)
# --- One-vs-One (OvO) ---
# Train one binary classifier for every unordered pair of classes; each
# classifier votes a class label on the test set, and the mode of all
# K*(K-1)/2 votes is the final prediction.
count = 0
pairwise_preds = []  # one (n_test, 1) column of votes per (i, j) classifier
pbar = tqdm(range(len(labels)))
for i in pbar:
    for j in range(i + 1, len(labels)):
        # Show progress for the current pairwise classifier.
        pbar.set_description("正在构建第{}_{}分类器".format(labels[i] + 1, labels[j] + 1))
        count += 1
        # Keep only the training samples belonging to class i or class j
        # (boolean mask instead of the original NaN-placeholder detour).
        mask = np.logical_or(y_train == labels[i], y_train == labels[j])
        # Binary logistic regression on the two selected classes.
        clf = LR().fit(X_train.iloc[mask], y_train[mask])
        # Every classifier votes on the full test set with a class label.
        pairwise_preds.append(clf.predict(X_test).reshape(-1, 1))
print('一共构建了{}个分类器'.format(count))
# Stack all votes once at the end (avoids O(K^2) repeated np.append copies).
y_pred = np.hstack(pairwise_preds)
# Final prediction: the most frequent label among all pairwise votes.
y_pred = np.squeeze(stats.mode(y_pred, axis=1)[0])
error_rate_OvO = 1 - accuracy_score(y_test, y_pred)
# Inspect predictions next to the ground truth.
pd.DataFrame(np.append(y_pred.reshape(-1, 1), y_test.reshape(-1, 1), axis=1), columns=['预测值', '真实值'], dtype=int)
# --- One-vs-Rest (OvR) ---
# Train one binary classifier per class (class i vs. everything else), then
# pick the class whose classifier reports the highest positive-class probability.
count = 0
class_probs = []  # one (n_test, 1) column of P(class i) per classifier
pbar = tqdm(range(len(labels)))
for i in pbar:
    # Show progress for the current one-vs-rest classifier.
    pbar.set_description("正在构建第{}分类器".format(labels[i] + 1))
    count += 1
    # Relabel: samples of class i keep their label, all other samples become -1.
    y_train_ = np.where(y_train == labels[i], y_train, -1)
    # Binary logistic regression: class i vs. the rest.
    clf = LR().fit(X_train, y_train_)
    # Column 1 of predict_proba is the positive class, since classes are sorted
    # and labels[i] >= 0 > -1.
    class_probs.append(clf.predict_proba(X_test)[:, 1].reshape(-1, 1))
print('一共构建了{}个分类器'.format(count))
# Stack all probability columns once (avoids O(K) repeated np.append copies).
y_pred = np.hstack(class_probs)
# Final prediction: the class with the highest probability.
# NOTE(review): argmax returns a column index; this equals the class label only
# when `labels` is exactly 0..K-1 (true after LabelEncoder, provided every
# class appears in the training split) — TODO confirm.
y_pred = np.argmax(y_pred, axis=1)
error_rate_OvR = 1 - accuracy_score(y_test, y_pred)
# Inspect predictions next to the ground truth.
pd.DataFrame(np.append(y_pred.reshape(-1, 1), y_test.reshape(-1, 1), axis=1), columns=['预测值', '真实值'], dtype=int)
# --- Direct multinomial (softmax) logistic regression ---
# A single model over all K classes, for comparison with OvO/OvR.
# NOTE(review): `multi_class='multinomial'` is deprecated in recent sklearn
# (multinomial is the default for multiclass targets) — verify installed version.
LR_multiclass = LR(multi_class='multinomial').fit(X_train, y_train)
# Predict on the test set.
y_pred_multiclass = LR_multiclass.predict(X_test)
error_rate_multiclass = 1 - accuracy_score(y_test, y_pred_multiclass)
# Inspect predictions next to the ground truth.
pd.DataFrame(np.append(y_pred_multiclass.reshape(-1, 1), y_test.reshape(-1, 1), axis=1), columns=['预测值', '真实值'], dtype=int)
# Report the classification error rate of each method.
# Explicit dict instead of the original globals()['error_rate_'+method] lookup;
# also drops the no-op sep='' on a single-argument print.
error_rates = {'OvO': error_rate_OvO, 'OvR': error_rate_OvR, 'multiclass': error_rate_multiclass}
for method, rate in error_rates.items():
    print('{}的分类误差率{:.2%}'.format(method, rate))
OvO 需要构造 K×(K−1)/2 个二元分类器,因此耗时较长(本例耗时 16 秒),但结果也更准确。
OvR 需要构造K个二元分类器,因此耗时较短(本例耗时 3 秒),但结果不如 OvO 准确。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。