Building on Zhou Zhihua's 《机器学习》 (Machine Learning), my previous study notes, and other online resources, this post summarizes the linear-model material. It continues from 机器学习:线性模型学习总结(2) (Machine Learning: Linear Models Study Summary, Part 2).
Study period: 2022.04.19 – 2022.04.20
As with the scikit-learn version, I wrote a dedicated helper for turning tabular data into tensors, still mostly reusing the earlier pipeline:

def data_to_tensor(df_x, df_y):
    # Data preprocessing
    df_x = mango_processing(df_x).astype(float)
    df_y = df_y.astype(float)
    # Split into training and test sets
    tr_x, te_x, tr_y, te_y = train_test_split(df_x, df_y, test_size=0.2, random_state=42)
    # Convert everything to tensors
    train_tensor_x, test_tensor_x, train_tensor_y, test_tensor_y = map(
        torch.tensor, (np.array(tr_x), np.array(te_x), np.array(tr_y), np.array(te_y)))
    # Cast the labels to long or float (depending on the loss function):
    # train_tensor_y = train_tensor_y.squeeze(-1).long()
    # test_tensor_y = test_tensor_y.squeeze(-1).long()
    train_tensor_y = train_tensor_y.squeeze(-1).float()
    test_tensor_y = test_tensor_y.squeeze(-1).float()
    # Return the training and test sets
    return train_tensor_x, test_tensor_x, train_tensor_y, test_tensor_y
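The mango_processing call above is a preprocessing helper carried over from the earlier notes and not repeated here. As a rough idea only, a minimal sketch of that kind of tabular preprocessing (assumed steps: fill missing values, one-hot encode categoricals, standardize; the original helper may differ) could look like:

import pandas as pd
from sklearn.preprocessing import StandardScaler

def mango_processing(df: pd.DataFrame) -> pd.DataFrame:
    """Hypothetical sketch of the tabular preprocessing helper (assumed steps)."""
    df = df.copy()
    # Fill missing values: median for numeric columns, most frequent value otherwise
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])
    # One-hot encode the categorical columns
    df = pd.get_dummies(df)
    # Standardize every column to zero mean and unit variance
    return pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns, index=df.index)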
Since a fully connected layer without an activation function just computes a weighted sum, it can be used directly as a linear regression model:
class LinearModel(nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        self.liner = nn.Linear(14, 1)

    def forward(self, x):
        x = x.to(torch.float32)
        x = self.liner(x)
        x = x.squeeze(-1)  # MSELoss expects prediction and target shapes to match, so squeeze out the last dimension
        return x
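As a quick sanity check on that claim, the output of nn.Linear is exactly the affine map x @ W^T + b, so a hand-computed weighted sum reproduces it (a small illustrative snippet, not part of the original notes):

import torch
import torch.nn as nn

layer = nn.Linear(14, 1)
x = torch.randn(3, 14)
# nn.Linear without an activation is just a weighted sum plus bias
manual = x @ layer.weight.T + layer.bias
print(torch.allclose(layer(x), manual))  # True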
Adding a sigmoid at the end turns it into a logistic-regression classifier:
class LinearModel(nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        self.liner = torch.nn.Linear(24, 2)  # 24 input features, 2 output classes

    def forward(self, x):
        x = x.to(torch.float32)
        x = self.liner(x)
        x = torch.sigmoid(x)  # squash each class score into (0, 1)
        return x
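With two output units, each squashed by the sigmoid, the predicted class for a sample is simply the index of the larger value, which is how the validation code further below recovers labels with torch.max (illustrative snippet; the values are made up):

import torch

scores = torch.tensor([[0.2, 1.3],
                       [2.0, -0.5]])
probs = torch.sigmoid(scores)         # each unit mapped into (0, 1)
_, labels = torch.max(probs, dim=1)   # index of the larger unit = predicted class
print(labels)                         # tensor([1, 0])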
Here I also picked up the TorchMetrics package, which can be called directly to evaluate regression results. As usual, I first wrote a small evaluation routine:

# Mean squared error (MSE)
mean_squared_error = torchmetrics.MeanSquaredError()
mean_squared_error(y_pred, y_true)
mse = mean_squared_error.compute()
print('MSE:', mse, end='; ')
# Mean absolute error (MAE)
mean_absolute_error = torchmetrics.MeanAbsoluteError()
mean_absolute_error(y_pred, y_true)
mae = mean_absolute_error.compute()
print('MAE:', mae, end='; ')
# Mean absolute percentage error (MAPE)
mean_absolute_percentage_error = torchmetrics.MeanAbsolutePercentageError()
mean_absolute_percentage_error(y_pred, y_true)
mape = mean_absolute_percentage_error.compute()
print('MAPE:', mape, end='; ')
# Explained variance (EV)
explained_variance = torchmetrics.ExplainedVariance()
explained_variance(y_pred, y_true)
ev = explained_variance.compute()
print('EV:', ev, end='; ')
# Coefficient of determination (R2)
r2_score = torchmetrics.R2Score()
r2_score(y_pred, y_true)
r2 = r2_score.compute()
print('R2-Score:', r2, end='.')
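The training script below calls this through a wrapper named all_regress_evaluation, which reports the metrics for the training and test sets in turn. That wrapper is not shown in these notes; a minimal sketch along the same lines (an assumption, not the original code) might be:

import torchmetrics

def regress_evaluation(y_pred, y_true):
    # Print the regression metrics above for one (prediction, target) pair
    print('MSE:', torchmetrics.MeanSquaredError()(y_pred, y_true), end='; ')
    print('MAE:', torchmetrics.MeanAbsoluteError()(y_pred, y_true), end='; ')
    print('MAPE:', torchmetrics.MeanAbsolutePercentageError()(y_pred, y_true), end='; ')
    print('EV:', torchmetrics.ExplainedVariance()(y_pred, y_true), end='; ')
    print('R2-Score:', torchmetrics.R2Score()(y_pred, y_true), end='.\n')

def all_regress_evaluation(tr_pred, tr_true, te_pred, te_true):
    # Report metrics for the training set, then the test set
    print('Train:', end=' ')
    regress_evaluation(tr_pred, tr_true)
    print('Test: ', end=' ')
    regress_evaluation(te_pred, te_true)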
And likewise for classification:

# Accuracy
accuracy = torchmetrics.Accuracy()
accuracy(y_pred, y_true)
acc = accuracy.compute()
print('Accuracy:', acc, end='; ')
# Precision
precision = torchmetrics.Precision(average='macro', num_classes=class_num)  # set according to the number of classes
precision(y_pred, y_true)
pre = precision.compute()
print('Precision:', pre, end='; ')
# Recall
recall = torchmetrics.Recall(average='macro', num_classes=class_num)  # set according to the number of classes
recall(y_pred, y_true)
rec = recall.compute()
print('Recall:', rec, end='; ')
# F1-score
f1_score = torchmetrics.F1Score(num_classes=class_num)
f1_score(y_pred, y_true)
f1 = f1_score.compute()
print('F1-Score:', f1, end='; ')
# AUROC
auroc = torchmetrics.AUROC(average='macro', num_classes=class_num)
auroc(y_pred, y_true)
auc = auroc.compute()
print('AUROC:', auc, end='.')
auroc.reset()
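The classification script below calls this through a wrapper named all_classify_evaluation, again not shown in these notes. A minimal sketch matching the older TorchMetrics API used above (an assumption, not the original code) could be:

import torchmetrics

def classify_evaluation(y_pred, y_true, class_num):
    # y_pred: per-class scores; y_true: integer class labels
    y_true = y_true.long()
    print('Accuracy:', torchmetrics.Accuracy()(y_pred, y_true), end='; ')
    print('Precision:', torchmetrics.Precision(average='macro', num_classes=class_num)(y_pred, y_true), end='; ')
    print('Recall:', torchmetrics.Recall(average='macro', num_classes=class_num)(y_pred, y_true), end='; ')
    print('F1-Score:', torchmetrics.F1Score(num_classes=class_num)(y_pred, y_true), end='; ')
    print('AUROC:', torchmetrics.AUROC(average='macro', num_classes=class_num)(y_pred, y_true), end='.\n')

def all_classify_evaluation(tr_pred, tr_true, te_pred, te_true, class_num):
    # Report metrics for the training set, then the test set
    print('Train:', end=' ')
    classify_evaluation(tr_pred, tr_true, class_num)
    print('Test: ', end=' ')
    classify_evaluation(te_pred, te_true, class_num)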
Regression example. Data source: New York City Taxi Fare Prediction | Kaggle.
# Read the dataset and preprocess it
df = pd.read_csv('train.csv')
df.pickup_datetime = pd.to_datetime(df.pickup_datetime).dt.tz_localize(None)
df['hour'] = df['pickup_datetime'].apply(lambda x: x.strftime('%H')).astype(int)
df['minute'] = df['pickup_datetime'].apply(lambda x: x.strftime('%M')).astype(int)
df['second'] = df['pickup_datetime'].apply(lambda x: x.strftime('%S')).astype(int)
df['date'] = df['pickup_datetime'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
print(df.info())
target = df.fare_amount
data = df.drop(['fare_amount', 'key', 'pickup_datetime'], axis=1)
# Split off the training set and convert to tensors
tr_tx, te_tx, tr_ty, te_ty = data_to_tensor(data, target)

# --------------------------------- Define the network ---------------------------------
class LinearModel(nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        self.liner = nn.Linear(14, 1)

    def forward(self, x):
        x = x.to(torch.float32)
        x = self.liner(x)
        x = x.squeeze(-1)  # MSELoss expects prediction and target shapes to match, so squeeze out the last dimension
        return x

# -------------------- Training setup (reusable apart from the hyperparameters) --------------------
# Fix the random seed so results are reproducible
seed = 42
torch.manual_seed(seed)  # CPU seed
# Instantiate the model
model = LinearModel()
# Pick the device (CPU or GPU); if a GPU is used, the input tensors must be moved to it as well
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Wrap the training tensors in a DataLoader
ds = TensorDataset(tr_tx, tr_ty)
dl = DataLoader(ds, batch_size=128, shuffle=True)
# Learning rate
lr = 1e-5
# Number of epochs
epoch = 100
# How often to print the evaluation metrics
show_step = 10
# Optimizer
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
# Loss function
criterion = nn.MSELoss()

# --------------------------------- Model training (reusable) ---------------------------------
for epoch in range(epoch+1):
    for x, y in dl:
        pred = model(x)             # forward pass
        loss = criterion(pred, y)   # compute the loss
        optimizer.zero_grad()       # clear the optimizer's gradients
        loss.backward()             # backward pass
        optimizer.step()            # update the parameters
    if epoch % show_step == 0:      # control the printing interval
        with torch.no_grad():
            print('epoch: ', epoch)
            tr_pred = model(tr_tx)  # predictions on the training set
            te_pred = model(te_tx)  # predictions on the test set
            all_regress_evaluation(tr_pred, tr_ty, te_pred, te_ty)

# ------------------- Produce the validation-set submission (mostly reusable) -------------------
df_v = pd.read_csv('test.csv')  # read the validation data
df_v.pickup_datetime = pd.to_datetime(df_v.pickup_datetime).dt.tz_localize(None)
df_v['hour'] = df_v['pickup_datetime'].apply(lambda x: x.strftime('%H')).astype(int)
df_v['minute'] = df_v['pickup_datetime'].apply(lambda x: x.strftime('%M')).astype(int)
df_v['second'] = df_v['pickup_datetime'].apply(lambda x: x.strftime('%S')).astype(int)
df_v['date'] = df_v['pickup_datetime'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
va_x = df_v.drop(['key', 'pickup_datetime'], axis=1)  # drop columns (one fewer than training: no target column)
va_x = mango_processing(va_x).astype(float)  # preprocessing
va_tx = torch.tensor(np.array(va_x))  # convert to a tensor
va_pred = model(va_tx)  # predict
va_id = df_v['key']  # the id column
va_out = pd.DataFrame({'key': va_id, 'fare_amount': va_pred.detach().numpy()})  # build the output DataFrame
va_out['fare_amount'] = va_out['fare_amount'].apply(lambda x: round(x, 2))  # keep two decimal places
va_out.to_csv('Valid Prediction.csv', index=False)  # write to CSV without the index column
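Since the submission step above reuses the trained model in the same script, it can be convenient to persist the weights so predictions can be regenerated without retraining. A standard PyTorch pattern (my addition, not part of the original notes) is:

import torch

# Save only the learned parameters after training
torch.save(model.state_dict(), 'linear_model.pt')

# Later, or in another script: rebuild the architecture and load the weights
model = LinearModel()
model.load_state_dict(torch.load('linear_model.pt'))
model.eval()  # switch to inference mode before predicting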
Classification example. Data source: Spaceship Titanic | Kaggle.
# Read the dataset and preprocess it
df = pd.read_csv('train.csv')
target = df.Transported
data = df.drop(['PassengerId', 'Transported', 'Name', 'Cabin'], axis=1)
# Split off the training set and convert to tensors
tr_tx, te_tx, tr_ty, te_ty = data_to_tensor(data, target)

# --------------------------------- Define the network ---------------------------------
class LinearModel(nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        self.liner = torch.nn.Linear(24, 2)

    def forward(self, x):
        x = x.to(torch.float32)
        x = self.liner(x)
        x = torch.sigmoid(x)
        return x

# -------------------- Training setup (fully reusable apart from the hyperparameters) --------------------
# Fix the random seed so results are reproducible
seed = 42
torch.manual_seed(seed)  # CPU seed
# Instantiate the model
model = LinearModel()
# Pick the device (CPU or GPU)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Wrap the training tensors in a DataLoader
ds = TensorDataset(tr_tx, tr_ty)
dl = DataLoader(ds, batch_size=256, shuffle=True)
# Learning rate
lr = 1e-3
# Number of epochs
epoch = 70
# How often to print the evaluation metrics
show_step = 10
# Optimizer
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
# Loss function (CrossEntropyLoss expects long-typed class labels)
criterion = nn.CrossEntropyLoss()

# --------------------------------- Model training (fully reusable) ---------------------------------
for epoch in range(epoch+1):
    for x, y in dl:
        pred = model(x)             # forward pass
        loss = criterion(pred, y)   # compute the loss
        optimizer.zero_grad()       # clear the optimizer's gradients
        loss.backward()             # backward pass
        optimizer.step()            # update the parameters
    if epoch % show_step == 0:      # control the printing interval
        with torch.no_grad():
            print('epoch: ', epoch)
            tr_pred = model(tr_tx)  # predictions on the training set
            te_pred = model(te_tx)  # predictions on the test set
            all_classify_evaluation(tr_pred, tr_ty, te_pred, te_ty, 2)

# Plot the curve for the training predictions
tr_pred = model(tr_tx)
torch_plot_curve(tr_pred, tr_ty)

# ------------------- Produce the validation-set submission (mostly reusable) -------------------
df_v = pd.read_csv('test.csv')  # read the validation data
va_x = df_v.drop(['PassengerId', 'Name', 'Cabin'], axis=1)  # drop columns (one fewer than training: no target column)
va_x = mango_processing(va_x).astype(float)  # preprocessing
va_tx = torch.tensor(np.array(va_x))  # convert to a tensor
va_pred = model(va_tx)  # predict
_, va_y = torch.max(va_pred.data, 1)  # for classification, take the index of the most probable class
va_id = df_v['PassengerId']  # the id column
va_out = pd.DataFrame({'PassengerId': va_id, 'Transported': va_y})  # build the output DataFrame
va_out['Transported'] = va_out['Transported'].astype(bool)  # convert the class index back to a boolean label
va_out.to_csv('Valid Prediction.csv', index=False)  # write to CSV without the index column
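torch_plot_curve is another helper carried over from the earlier notes and not defined here; judging by how it is called, it draws a curve from the training predictions. One plausible sketch, assuming it plots a ROC curve from the positive-class scores (the actual helper may differ), is:

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

def torch_plot_curve(y_pred, y_true):
    # y_pred: (n, 2) class scores; column 1 is treated as the positive-class score
    fpr, tpr, _ = roc_curve(y_true.numpy(), y_pred[:, 1].detach().numpy())
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()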