赞
踩
最近新入手了一台二手的 MacBook Air M1,
准备测试一下 M1 应对机器学习的能力。
找到一位老哥的文章以及代码进行测试:
https://blog.csdn.net/KaelCui/article/details/106184158
运行范例代码之后出现如下错误提示:
raise RuntimeError(f'DataLoader worker (pid(s) {pids_str}) exited unexpectedly') from e
RuntimeError: DataLoader worker (pid(s) 57383) exited unexpectedly
范例代码如下
import torch
from torch import nn
import torchvision
from torchvision import transforms
import torch.nn.functional as F
import os
import sys
import time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
from copy import deepcopy
from torchmetrics import Accuracy  # NOTE: this name is re-bound by the local Accuracy class below


def printlog(info):
    """Print a separator line followed by the current time, then ``info``."""
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % stamp)
    print(str(info) + "\n")


# ================================================================================
# 1. Prepare the data
# ================================================================================
transform = transforms.Compose([transforms.ToTensor()])

ds_train = torchvision.datasets.MNIST(
    root="mnist/", train=True, download=True, transform=transform
)
ds_val = torchvision.datasets.MNIST(
    root="mnist/", train=False, download=True, transform=transform
)

# Both loaders request two worker processes (num_workers=2).
dl_train = torch.utils.data.DataLoader(
    ds_train, batch_size=128, shuffle=True, num_workers=2
)
dl_val = torch.utils.data.DataLoader(
    ds_val, batch_size=128, shuffle=False, num_workers=2
)


# ================================================================================
# 2. Define the model
# ================================================================================
def create_net():
    """Build the small CNN classifier as a named ``nn.Sequential``."""
    layers = [
        ("conv1", nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3)),
        ("pool1", nn.MaxPool2d(kernel_size=2, stride=2)),
        ("conv2", nn.Conv2d(in_channels=64, out_channels=512, kernel_size=3)),
        ("pool2", nn.MaxPool2d(kernel_size=2, stride=2)),
        ("dropout", nn.Dropout2d(p=0.1)),
        ("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1))),
        ("flatten", nn.Flatten()),
        ("linear1", nn.Linear(512, 1024)),
        ("relu", nn.ReLU()),
        ("linear2", nn.Linear(1024, 10)),
    ]
    net = nn.Sequential()
    for layer_name, layer in layers:
        net.add_module(layer_name, layer)
    return net


net = create_net()
print(net)


# Evaluation metric
class Accuracy(nn.Module):
    """Running classification accuracy kept in two non-trainable parameters."""

    def __init__(self):
        super().__init__()
        # Stored as parameters (requires_grad=False) so that ``.to(device)``
        # moves the counters together with the module.
        self.correct = nn.Parameter(torch.tensor(0.0), requires_grad=False)
        self.total = nn.Parameter(torch.tensor(0.0), requires_grad=False)

    def forward(self, preds: torch.Tensor, targets: torch.Tensor):
        """Accumulate the running counts and return this batch's accuracy."""
        predicted = preds.argmax(dim=-1)
        hits = (predicted == targets).sum()
        count = targets.shape[0]
        self.correct += hits
        self.total += count
        return hits / count

    def compute(self):
        """Return the accuracy accumulated since the last reset."""
        return self.correct.float() / self.total

    def reset(self):
        """Set both running counters back to zero in place."""
        self.correct -= self.correct
        self.total -= self.total


# ================================================================================
# 3. Train the model
# ================================================================================
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
metrics_dict = nn.ModuleDict({"acc": Accuracy()})

# ========================= move the model to MPS ==============================
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
net.to(device)
loss_fn.to(device)
metrics_dict.to(device)
# ==============================================================================

epochs = 20
ckpt_path = 'checkpoint.pt'

# early-stopping settings
monitor = "val_acc"
patience = 5
mode = "max"

history = {}

for epoch in range(1, epochs + 1):
    printlog("Epoch {0} / {1}".format(epoch, epochs))

    # 1, train -------------------------------------------------
    net.train()

    total_loss, step = 0, 0
    progress = tqdm(enumerate(dl_train), total=len(dl_train), ncols=100)
    train_metrics_dict = deepcopy(metrics_dict)

    for batch_idx, batch in progress:
        features, labels = batch

        # ===================== move the data to MPS ===========================
        features = features.to(device)
        labels = labels.to(device)
        # ======================================================================

        # forward
        preds = net(features)
        loss = loss_fn(preds, labels)

        # backward
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # metrics
        step_metrics = {
            "train_" + name: metric_fn(preds, labels).item()
            for name, metric_fn in train_metrics_dict.items()
        }
        step_log = {"train_loss": loss.item(), **step_metrics}

        total_loss += loss.item()
        step += 1

        if batch_idx == len(dl_train) - 1:
            # Last batch: show the epoch-level averages instead of the step values.
            epoch_loss = total_loss / step
            epoch_metrics = {
                "train_" + name: metric_fn.compute().item()
                for name, metric_fn in train_metrics_dict.items()
            }
            epoch_log = {"train_loss": epoch_loss, **epoch_metrics}
            progress.set_postfix(**epoch_log)

            for metric_fn in train_metrics_dict.values():
                metric_fn.reset()
        else:
            progress.set_postfix(**step_log)

    for name, value in epoch_log.items():
        history.setdefault(name, []).append(value)

    # 2, validate -------------------------------------------------
    net.eval()

    total_loss, step = 0, 0
    progress = tqdm(enumerate(dl_val), total=len(dl_val), ncols=100)
    val_metrics_dict = deepcopy(metrics_dict)

    with torch.no_grad():
        for batch_idx, batch in progress:
            features, labels = batch

            # =================== move the data to MPS =========================
            features = features.to(device)
            labels = labels.to(device)
            # ==================================================================

            # forward
            preds = net(features)
            loss = loss_fn(preds, labels)

            # metrics
            step_metrics = {
                "val_" + name: metric_fn(preds, labels).item()
                for name, metric_fn in val_metrics_dict.items()
            }
            step_log = {"val_loss": loss.item(), **step_metrics}

            total_loss += loss.item()
            step += 1

            if batch_idx == len(dl_val) - 1:
                epoch_loss = total_loss / step
                epoch_metrics = {
                    "val_" + name: metric_fn.compute().item()
                    for name, metric_fn in val_metrics_dict.items()
                }
                epoch_log = {"val_loss": epoch_loss, **epoch_metrics}
                progress.set_postfix(**epoch_log)

                for metric_fn in val_metrics_dict.values():
                    metric_fn.reset()
            else:
                progress.set_postfix(**step_log)

    epoch_log["epoch"] = epoch
    for name, value in epoch_log.items():
        history.setdefault(name, []).append(value)

    # 3, early-stopping -------------------------------------------------
    arr_scores = history[monitor]
    if mode == "max":
        best_score_idx = np.argmax(arr_scores)
    else:
        best_score_idx = np.argmin(arr_scores)

    # The newest epoch is the best so far: save its weights.
    if best_score_idx == len(arr_scores) - 1:
        torch.save(net.state_dict(), ckpt_path)
        print("<<<<<< reach best {0} : {1} >>>>>>".format(
            monitor, arr_scores[best_score_idx]), file=sys.stderr)

    # Stop once more than `patience` epochs have passed since the best one.
    if len(arr_scores) - best_score_idx > patience:
        print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
            monitor, patience), file=sys.stderr)
        break

# NOTE(review): the pasted source lost its indentation, so it is not visible
# whether this reload ran inside the epoch loop or after it. It is placed after
# the loop here (restore the best checkpoint once training ends) — confirm
# against the upstream script.
net.load_state_dict(torch.load(ckpt_path))

dfhistory = pd.DataFrame(history)
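解决方法:按 Command+F 搜索 num_workers,
会发现两处 num_workers=2(分别在 dl_train 和 dl_val 的 DataLoader 里),
把这两处参数删除即可。不指定时 num_workers 默认为 0,数据直接在主进程中加载,不再启动子进程,也就不会出现 worker 意外退出的报错。如果仍想使用多进程加载数据,通常可以把脚本的执行部分放进 if __name__ == "__main__": 里再运行(macOS 默认用 spawn 方式启动子进程,子进程会重新导入主脚本)。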
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。