赞
踩
本文参考的教程来自B站大学:wandb我最爱的炼丹伴侣操作指南
在此感谢UP主分享的教程~
wandb官方文档链接
在project中点击侧边栏的Runs会以Table形式显示多次run的结果
在project中点击侧边栏的Artifacts记录数据集,模型的版本
# Authenticate this session with Weights & Biases; replace the placeholder
# string with your own API key before running.
import wandb
wandb.login(key='你的API keys')
# NOTE(review): the two lines below repeat the login above verbatim
# (looks like a duplicated notebook cell).
import wandb
wandb.login(key='你的API keys')
import os,PIL
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch
from torch import nn
import torchvision
from torchvision import transforms
import datetime
import wandb
from argparse import Namespace
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Default settings for one training run, exposed as attributes.
config = Namespace(**{
    'project_name': 'wandb_demo',   # W&B project the run is logged under
    'batch_size': 512,              # samples per DataLoader batch
    'hidden_layer_width': 64,       # channel / hidden-unit count of the network
    'dropout_p': 0.1,               # drop probability of the Dropout2d layer
    'lr': 1e-4,                     # optimizer learning rate
    'optim_type': 'Adam',           # name of the optimizer class in torch.optim
    'epochs': 15,                   # number of training epochs
    'ckpt_path': 'checkpoint.pt',   # file the best weights are saved to
})
def create_dataloaders(config):
    """Build the MNIST training and validation data loaders.

    The datasets are downloaded to ``./mnist/`` on first use. Training uses
    every 5th sample of the training split; validation uses the full test
    split.

    Args:
        config: Object exposing a ``batch_size`` attribute.

    Returns:
        Tuple ``(dl_train, dl_val)`` of ``torch.utils.data.DataLoader``.
    """
    # DataLoader requires an integer batch size; cast in case the value
    # arrives as a float (e.g. when it was sampled by a hyperparameter sweep).
    batch_size = int(config.batch_size)

    to_tensor = transforms.Compose([transforms.ToTensor()])
    ds_train = torchvision.datasets.MNIST(
        root="./mnist/", train=True, download=True, transform=to_tensor)
    ds_val = torchvision.datasets.MNIST(
        root="./mnist/", train=False, download=True, transform=to_tensor)

    # Keep every 5th training sample to shorten each epoch.
    ds_train_sub = torch.utils.data.Subset(
        ds_train, indices=range(0, len(ds_train), 5))

    dl_train = torch.utils.data.DataLoader(
        ds_train_sub, batch_size=batch_size, shuffle=True,
        num_workers=2, drop_last=True)
    # Do not drop the last partial batch here: validation accuracy must be
    # measured on every validation sample, not only on full batches.
    dl_val = torch.utils.data.DataLoader(
        ds_val, batch_size=batch_size, shuffle=False,
        num_workers=2, drop_last=False)
    return dl_train, dl_val
def create_net(config):
    """Build the small CNN classifier and move it to the global ``device``.

    Args:
        config: Object exposing ``hidden_layer_width`` (channel and hidden
            unit count) and ``dropout_p`` (Dropout2d probability).

    Returns:
        ``nn.Sequential`` with named layers, producing 10 outputs per sample.
    """
    width = config.hidden_layer_width
    # Layers are created in the same order as they are registered.
    named_layers = [
        ("conv1", nn.Conv2d(in_channels=1, out_channels=width, kernel_size=3)),
        ("pool1", nn.MaxPool2d(kernel_size=2, stride=2)),
        ("conv2", nn.Conv2d(in_channels=width, out_channels=width, kernel_size=5)),
        ("pool2", nn.MaxPool2d(kernel_size=2, stride=2)),
        ("dropout", nn.Dropout2d(p=config.dropout_p)),
        # Global max pooling collapses each feature map to a single value.
        ("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1))),
        ("flatten", nn.Flatten()),
        ("linear1", nn.Linear(width, width)),
        ("relu", nn.ReLU()),
        ("linear2", nn.Linear(width, 10)),
    ]
    net = nn.Sequential()
    for layer_name, layer in named_layers:
        net.add_module(layer_name, layer)
    net.to(device)
    return net
def train_epoch(model, dl_train, optimizer):
    """Train ``model`` for one pass over ``dl_train``.

    Args:
        model: Network to optimise; switched to training mode.
        dl_train: Iterable yielding ``(features, labels)`` batches.
        optimizer: Optimizer bound to ``model``'s parameters.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.train()
    # The loss module is loop-invariant, so build it once.
    criterion = nn.CrossEntropyLoss()
    for features, labels in dl_train:
        features, labels = features.to(device), labels.to(device)
        # Clear gradients before the backward pass so that anything left
        # over from an earlier backward() call cannot leak into this update.
        optimizer.zero_grad()
        loss = criterion(model(features), labels)
        loss.backward()
        optimizer.step()
    return model
def eval_epoch(model, dl_val):
    """Compute classification accuracy of ``model`` over ``dl_val``.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        dl_val: Iterable yielding ``(features, labels)`` batches.

    Returns:
        Fraction of correctly classified samples as a float, or ``0.0``
        when ``dl_val`` yields no batches.
    """
    model.eval()
    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        for features, labels in dl_val:
            features, labels = features.to(device), labels.to(device)
            predictions = model(features).argmax(dim=-1)
            hits = predictions == labels
            num_seen += hits.shape[0]
            # Accumulate as a plain int so the total is well defined even
            # when the loader turns out to be empty.
            num_correct += hits.long().sum().item()
    if num_seen == 0:
        return 0.0
    return num_correct / num_seen
def train(config=config):
    """Train the network and log per-epoch metrics to Weights & Biases.

    The weights with the best validation accuracy so far are saved to
    ``config.ckpt_path`` after each improving epoch.

    Args:
        config: Namespace with ``project_name``, ``batch_size``,
            ``hidden_layer_width``, ``dropout_p``, ``lr``, ``optim_type``,
            ``epochs`` and ``ckpt_path``.

    Returns:
        The trained model, carrying ``run_id`` (the W&B run id) and
        ``best_metric`` (best validation accuracy) as attributes.
    """
    dl_train, dl_val = create_dataloaders(config)
    model = create_net(config)
    # Look the optimizer class up by name, e.g. 'Adam' -> torch.optim.Adam.
    optimizer_cls = getattr(torch.optim, config.optim_type)
    optimizer = optimizer_cls(params=model.parameters(), lr=config.lr)

    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Using the run as a context manager guarantees it is finished even
    # when training raises, instead of leaving a dangling open run.
    with wandb.init(project=config.project_name, config=config.__dict__,
                    name=nowtime, save_code=True) as run:
        model.run_id = run.id
        model.best_metric = -1.0
        for epoch in range(1, config.epochs + 1):
            model = train_epoch(model, dl_train, optimizer)
            val_acc = eval_epoch(model, dl_val)
            if val_acc > model.best_metric:
                model.best_metric = val_acc
                torch.save(model.state_dict(), config.ckpt_path)
            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"epoch【{epoch}】@{nowtime} --> val_acc= {100 * val_acc:.2f}%")
            # One log call per epoch, so the W&B step equals the epoch count.
            run.log({'epoch': epoch, 'val_acc': val_acc,
                     'best_val_acc': model.best_metric})
    return model
# Run one full training session with the default config and keep the
# returned model (its run_id attribute is set inside train()).
model = train(config)
如遇到CommError: Run initialization has timed out after 60.0 sec,wandb.init处报错,可尝试一下关闭代理后重新执行
点击链接进入runs,可查看训练过程,默认的横坐标为step,每调用一次wandb.log,step就会+1
import wandb
# Re-open the training run so the artifacts are attached to it.
run = wandb.init(id=model.run_id, project='wandb_demo', resume='must')

# Dataset: version the whole MNIST download directory.
arti_dataset = wandb.Artifact(name='mnist', type='dataset')
arti_dataset.add_dir('mnist/')
run.log_artifact(arti_dataset)

# Code: version the notebook itself.
arti_code = wandb.Artifact(name='ipynb', type='code')
arti_code.add_file('./30分钟吃掉wandb可视化模型分析.ipynb')
run.log_artifact(arti_code)

# Model: version the best checkpoint written during training.
arti_model = wandb.Artifact(name='cnn', type='model')
arti_model.add_file(config.ckpt_path)
run.log_artifact(arti_model)

# Artifacts are committed and uploaded when the run is finished.
run.finish()
这里我就不进行演示啦,没上传成功,不然是可以在下图中找到代码和数据集的
类似下面这样:
# Resume the training run so that anything logged below is attached to it.
import wandb
run = wandb.init(project=config.project_name, id=model.run_id, resume='must')

import matplotlib.pyplot as plt

transform = transforms.Compose([transforms.ToTensor()])
ds_train = torchvision.datasets.MNIST(root="./mnist/", train=True, download=True, transform=transform)
ds_val = torchvision.datasets.MNIST(root="./mnist/", train=False, download=True, transform=transform)

# Visualise the predictions for the first nine validation images.
# Run inference on whatever device the model's parameters live on.
device = next(model.parameters()).device
model.eval()  # make sure dropout is disabled for inference
plt.figure(figsize=(8, 8))
with torch.no_grad():  # no gradients are needed for plotting predictions
    for i in range(9):
        img, label = ds_val[i]
        tensor = img.to(device)
        # tensor[None, ...] adds the batch dimension the network expects.
        y_pred = torch.argmax(model(tensor[None, ...])).item()
        # Move channels last (C, H, W) -> (H, W, C) for imshow.
        img = img.permute(1, 2, 0)
        ax = plt.subplot(3, 3, i + 1)
        ax.imshow(img.numpy())
        ax.set_title("y_pred = %d" % y_pred)
        ax.set_xticks([])
        ax.set_yticks([])
plt.show()
def data2fig(data):
    """Render an image array as a matplotlib figure without axis ticks.

    Args:
        data: Image data accepted by ``Axes.imshow``.

    Returns:
        The newly created figure. It is created through ``pyplot`` and is
        not closed here, so the caller should close it when done.
    """
    import matplotlib.pyplot as plt

    fig = plt.figure()
    ax = fig.add_subplot()
    ax.imshow(data)
    ax.set_xticks([])
    ax.set_yticks([])
    return fig
def fig2img(fig):
    """Convert a matplotlib figure into a PIL image.

    Args:
        fig: Object with a ``savefig(buffer, format=...)`` method.

    Returns:
        ``PIL.Image.Image`` opened from the in-memory PNG rendering.
    """
    import io
    # Import the submodule explicitly: a bare ``import PIL`` does not
    # guarantee that ``PIL.Image`` has been loaded.
    from PIL import Image

    buf = io.BytesIO()
    # Pin the format so the buffer is always something PIL can decode,
    # regardless of the configured default savefig format.
    fig.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    return img
from tqdm import tqdm

good_cases = wandb.Table(columns=['Image', 'GroundTruth', 'Prediction'])
bad_cases = wandb.Table(columns=['Image', 'GroundTruth', 'Prediction'])

# Collect up to 50 correctly classified ("good") and 50 misclassified
# ("bad") samples from the first 1000 validation images.
plt.close()
model.eval()
with torch.no_grad():  # inference only, no gradients required
    for i in tqdm(range(1000)):
        if len(good_cases.data) >= 50 and len(bad_cases.data) >= 50:
            break  # both tables are full, nothing left to collect
        features, label = ds_val[i]
        tensor = features.to(device)
        # Store a plain int rather than a tensor in the table.
        y_pred = torch.argmax(model(tensor[None, ...])).item()
        table = bad_cases if y_pred != label else good_cases
        if len(table.data) >= 50:
            continue
        data = features.permute(1, 2, 0).numpy()
        fig = data2fig(data)
        input_img = wandb.Image(fig2img(fig))
        # Close the figure once it has been rendered; otherwise up to 100
        # figures stay alive in pyplot.
        plt.close(fig)
        table.add_data(input_img, label, y_pred)
wandb.log({'good_cases': good_cases, 'bad_cases': bad_cases})
wandb.finish()
不知道为什么图片没显示,按道理应该是这个样子的
不知道是不是因为上传的文件太大了
在表格中可对各列进行升序降序显示
在表格中可进行聚合Group by
可对列进行编辑,类似excel的公式,十分灵活
下面的图来自原作者的《30分钟吃掉wandb可视化自动调参》
from argparse import Namespace
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Initialise the default parameter configuration.
config = Namespace(**{
    'project_name': 'wandb_demo',   # W&B project the run is logged under
    'batch_size': 512,              # samples per DataLoader batch
    'hidden_layer_width': 64,       # channel / hidden-unit count of the network
    'dropout_p': 0.1,               # drop probability of the Dropout2d layer
    'lr': 1e-4,                     # optimizer learning rate
    'optim_type': 'Adam',           # name of the optimizer class in torch.optim
    'epochs': 15,                   # number of training epochs
    'ckpt_path': 'checkpoint.pt',   # file the best weights are saved to
})
在使用 wandb.sweep 进行超参数调优时,可以通过定义剪枝策略来提前终止那些表现不佳的任务,从而节省资源并加速寻找最佳超参数的过程。
early_terminate 配置允许你指定一个早期终止策略,下面的代码使用的是 hyperband 算法。
以下是 hyperband 早期终止策略的参数解释:
‘type’: 指定使用的早期终止算法的类型。在这个例子中,我们使用 ‘hyperband’,这是一种基于成功率的早期终止策略,它在多个性能水平上并行运行实验,并根据性能结果动态调整资源分配。
‘min_iter’: 每个实验的最小迭代次数。即使实验表现不佳,也会运行足够多次以收集有意义的数据。
‘eta’: 相邻两个检查点(bracket)之间的倍率,官方默认值为 3。例如 min_iter=3、eta=2 时,检查点依次位于第 3、6、12 … 次迭代;在每个检查点上,大约只有表现靠前的 1/eta 的实验会继续运行,其余会被提前终止。
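‘s’: 检查点(bracket)的总数,是一个整数。按照官方文档,该参数需要与 ‘max_iter’ 搭配使用:此时检查点位于 max_iter/eta、max_iter/eta² …,共 s 个,s 越大,检查点越多,第一个检查点也越早;如果像下面的代码那样指定的是 ‘min_iter’,检查点的位置则由 min_iter 和 eta 决定。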
通过设置这些参数,你可以定制 hyperband 算法的行为,以平衡资源使用效率和找到最佳超参数配置的可能性。例如,如果你希望在早期阶段快速淘汰表现不佳的实验,可以减小 min_iter 的值(更早开始剪枝)或增大 eta 的值(每个检查点保留更少的实验)。
from argparse import Namespace
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Initialise the default parameter configuration.
config = Namespace(**{
    'project_name': 'wandb_demo',   # W&B project the run is logged under
    'batch_size': 512,              # samples per DataLoader batch
    'hidden_layer_width': 64,       # channel / hidden-unit count of the network
    'dropout_p': 0.1,               # drop probability of the Dropout2d layer
    'lr': 1e-4,                     # optimizer learning rate
    'optim_type': 'Adam',           # name of the optimizer class in torch.optim
    'epochs': 15,                   # number of training epochs
    'ckpt_path': 'checkpoint.pt',   # file the best weights are saved to
})
# Optimisation target: the logged metric name and the direction to optimise.
metric = {'name': 'val_acc', 'goal': 'maximize'}

sweep_config = {
    # Search algorithm used to pick hyperparameter combinations.
    'method': 'random',
    'metric': metric,
    # Hyperparameter search space.
    'parameters': {
        # Fixed values that are the same for every run.
        'project_name': {'value': 'wandb_demo'},
        'epochs': {'value': 10},
        'ckpt_path': {'value': 'checkpoint.pt'},
        # Discrete choices.
        'optim_type': {'values': ['Adam', 'SGD', 'AdamW']},
        'hidden_layer_width': {'values': [16, 32, 48, 64, 80, 96, 112, 128]},
        # Continuous distributions.
        'lr': {
            'distribution': 'log_uniform_values',
            'min': 1e-6,
            'max': 0.1,
        },
        'batch_size': {
            'distribution': 'q_uniform',
            'q': 8,
            'min': 32,
            'max': 256,
        },
        'dropout_p': {
            'distribution': 'uniform',
            'min': 0,
            'max': 0.6,
        },
    },
    # Pruning strategy: stop unpromising runs early.
    # Pruning is considered at step = 3, 6, 12.
    'early_terminate': {
        'type': 'hyperband',
        'min_iter': 3,
        'eta': 2,
        's': 3,
    },
}

from pprint import pprint
pprint(sweep_config)

# Register the sweep with W&B; the returned id is what agents attach to.
sweep_id = wandb.sweep(sweep_config, project=config.project_name)
#把模型训练相关的全部代码整理成一个 train函数
def create_dataloaders(config):
    """Build the MNIST training and validation data loaders.

    The datasets are downloaded to ``./mnist/`` on first use. Training uses
    every 5th sample of the training split; validation uses the full test
    split.

    Args:
        config: Object exposing a ``batch_size`` attribute.

    Returns:
        Tuple ``(dl_train, dl_val)`` of ``torch.utils.data.DataLoader``.
    """
    # DataLoader requires an integer batch size, but a value sampled by a
    # sweep (e.g. from a 'q_uniform' distribution) can arrive as a float.
    batch_size = int(config.batch_size)

    to_tensor = transforms.Compose([transforms.ToTensor()])
    ds_train = torchvision.datasets.MNIST(
        root="./mnist/", train=True, download=True, transform=to_tensor)
    ds_val = torchvision.datasets.MNIST(
        root="./mnist/", train=False, download=True, transform=to_tensor)

    # Keep every 5th training sample to shorten each epoch.
    ds_train_sub = torch.utils.data.Subset(
        ds_train, indices=range(0, len(ds_train), 5))

    dl_train = torch.utils.data.DataLoader(
        ds_train_sub, batch_size=batch_size, shuffle=True,
        num_workers=2, drop_last=True)
    # Do not drop the last partial batch here: validation accuracy must be
    # measured on every validation sample, and on the same set of samples
    # for every batch size, so that runs stay comparable.
    dl_val = torch.utils.data.DataLoader(
        ds_val, batch_size=batch_size, shuffle=False,
        num_workers=2, drop_last=False)
    return dl_train, dl_val
def create_net(config):
    """Build the small CNN classifier (left on the default device).

    Args:
        config: Object exposing ``hidden_layer_width`` (channel and hidden
            unit count) and ``dropout_p`` (Dropout2d probability).

    Returns:
        ``nn.Sequential`` with named layers, producing 10 outputs per sample.
    """
    width = config.hidden_layer_width
    # Layers are created in the same order as they are registered.
    named_layers = [
        ("conv1", nn.Conv2d(in_channels=1, out_channels=width, kernel_size=3)),
        ("pool1", nn.MaxPool2d(kernel_size=2, stride=2)),
        ("conv2", nn.Conv2d(in_channels=width, out_channels=width, kernel_size=5)),
        ("pool2", nn.MaxPool2d(kernel_size=2, stride=2)),
        ("dropout", nn.Dropout2d(p=config.dropout_p)),
        # Global max pooling collapses each feature map to a single value.
        ("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1))),
        ("flatten", nn.Flatten()),
        ("linear1", nn.Linear(width, width)),
        ("relu", nn.ReLU()),
        ("linear2", nn.Linear(width, 10)),
    ]
    net = nn.Sequential()
    for layer_name, layer in named_layers:
        net.add_module(layer_name, layer)
    return net
def train_epoch(model, dl_train, optimizer):
    """Train ``model`` for one pass over ``dl_train``.

    Args:
        model: Network to optimise; switched to training mode.
        dl_train: Iterable yielding ``(features, labels)`` batches.
        optimizer: Optimizer bound to ``model``'s parameters.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.train()
    # The loss module is loop-invariant, so build it once.
    criterion = nn.CrossEntropyLoss()
    for features, labels in dl_train:
        features, labels = features.to(device), labels.to(device)
        # Clear gradients before the backward pass so that anything left
        # over from an earlier backward() call cannot leak into this update.
        optimizer.zero_grad()
        loss = criterion(model(features), labels)
        loss.backward()
        optimizer.step()
    return model
def eval_epoch(model, dl_val):
    """Compute classification accuracy of ``model`` over ``dl_val``.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        dl_val: Iterable yielding ``(features, labels)`` batches.

    Returns:
        Fraction of correctly classified samples as a float, or ``0.0``
        when ``dl_val`` yields no batches.
    """
    model.eval()
    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        for features, labels in dl_val:
            features, labels = features.to(device), labels.to(device)
            predictions = model(features).argmax(dim=-1)
            hits = predictions == labels
            num_seen += hits.shape[0]
            # Accumulate as a plain int so the total is well defined even
            # when the loader turns out to be empty.
            num_correct += hits.long().sum().item()
    if num_seen == 0:
        return 0.0
    return num_correct / num_seen
def train(config=config):
    """Train the network for one W&B run, honouring sweep-chosen values.

    The run is started first and the effective hyperparameters are read
    back from it, so that values selected by a sweep agent (which take
    precedence over the defaults passed to ``wandb.init``) are the ones
    actually used to build the data loaders, the network and the optimizer.

    Args:
        config: Namespace of default settings with ``project_name``,
            ``batch_size``, ``hidden_layer_width``, ``dropout_p``, ``lr``,
            ``optim_type``, ``epochs`` and ``ckpt_path``.

    Returns:
        The trained model, carrying ``run_id`` (the W&B run id) and
        ``best_metric`` (best validation accuracy) as attributes.
    """
    from argparse import Namespace

    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Using the run as a context manager guarantees it is finished even
    # when training raises, instead of leaving a dangling open run.
    with wandb.init(project=config.project_name, config=config.__dict__,
                    name=nowtime, save_code=True) as run:
        # Effective settings: the defaults, overridden by the sweep if any.
        run_config = Namespace(**dict(run.config))
        # 'q_uniform' yields floats, but DataLoader needs an int batch size.
        run_config.batch_size = int(run_config.batch_size)

        dl_train, dl_val = create_dataloaders(run_config)
        model = create_net(run_config)
        model = model.to(device)
        # Look the optimizer class up by name, e.g. 'SGD' -> torch.optim.SGD.
        optimizer_cls = getattr(torch.optim, run_config.optim_type)
        optimizer = optimizer_cls(params=model.parameters(), lr=run_config.lr)

        model.run_id = run.id
        model.best_metric = -1.0
        for epoch in range(1, run_config.epochs + 1):
            model = train_epoch(model, dl_train, optimizer)
            val_acc = eval_epoch(model, dl_val)
            if val_acc > model.best_metric:
                model.best_metric = val_acc
                torch.save(model.state_dict(), run_config.ckpt_path)
            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"epoch【{epoch}】@{nowtime} --> val_acc= {100 * val_acc:.2f}%")
            # One log call per epoch, so the W&B step equals the epoch count.
            run.log({'epoch': epoch, 'val_acc': val_acc,
                     'best_val_acc': model.best_metric})
    return model
# Start an agent for this sweep that calls train() 5 times (count=5);
# the sweep was configured above to use random search.
wandb.agent(sweep_id, train, count=5)
上面的代码会将完整的训练过程执行5遍
注意每一遍的超参数是不同的
wandb.agent(sweep_id, train, count=5) 是 Weights & Biases (Wandb) 提供的一个函数,用于启动一个或多个超参数调优任务(称为 “sweep agents”),这些任务会根据指定的超参数范围(由 sweep_id 指定)来执行训练函数 train 多次。count 参数指定了要运行的实验次数
可以在多个 Jupyter notebook 中使用相同的 sweep_id 执行 wandb.agent 来并行化超参数调优任务。这样做可以利用多台机器的计算资源来加速超参数搜索过程,每个 Jupyter notebook都会作为一个独立的 “agent” 运行,它们会向 Wandb 的服务器请求任务并执行相应的超参数调优实验
在侧边栏的controls中可对sweep进行控制,比如暂停,继续,关闭或者杀死controller,若关闭则无法重新启动
在官网侧边栏的sweep可视化查看调参:
超参数重要性图:
但我这里又没显示…
就介绍到这里啦,完结撒花~
赞
踩
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。