The version used here is fairly simple: it does not involve torch.multiprocessing at all and only uses part of DistributedDataParallel. A code snippet is given below for reference:
### test_distributed_gpu.py
import argparse

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import torch.distributed as dist  ################
from torch.utils.data import Dataset
from tqdm import tqdm


class dummyDataset(Dataset):
    def __init__(self):
        super(dummyDataset, self).__init__()
        N = 50000
        self.data = torch.randn(N, 2048, 3)
        self.label = (torch.rand(N, 1) > 0.5).float().long()

    def __getitem__(self, item):
        jitter = torch.rand(2048, 3)
        return self.data[item] + jitter, self.label[item]

    def __len__(self):
        return len(self.data)


class dummyModel(nn.Module):
    def __init__(self):
        super(dummyModel, self).__init__()
        self.layer1 = nn.Sequential(nn.Conv1d(3, 64, 1), nn.BatchNorm1d(64), nn.ReLU())
        self.layer2 = nn.Sequential(nn.Conv1d(64, 256, 1), nn.BatchNorm1d(256), nn.ReLU())
        self.fc = nn.Conv1d(256, 2, 1)

    def forward(self, data):
        """
        :param data: [B, 3, N]
        :return: [B, 2]
        """
        x = self.layer2(self.layer1(data))
        x = torch.max(x, dim=-1, keepdim=True)[0]   # [B, 256, 1]
        x = self.fc(x).squeeze(-1)                  # [B, 2]
        return x


if __name__ == '__main__':
    torch.manual_seed(1234)

    parser = argparse.ArgumentParser(description='DDP')
    parser.add_argument('--local_rank', default=-1, type=int,
                        help='node rank for distributed training')  ################
    parser.add_argument('--lr', type=float, help='lr')
    args = parser.parse_args()

    dist.init_process_group(backend='nccl')  ################
    torch.cuda.set_device(args.local_rank)  ################
    print(f'current rank -> {args.local_rank}')
    device = torch.device("cuda", args.local_rank)  ################

    ### configuring the dataset
    train_dataset = dummyDataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)  ################
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, num_workers=2,
                                               shuffle=False,  ################
                                               sampler=train_sampler)  ################

    ### configuring the model
    model = dummyModel().to(device)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])  ################

    ### configuring the optimizer
    optimizer = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=0.98,
        weight_decay=0.000001,
    )
    scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer,
        gamma=0.95,
    )

    ### configuring the loss
    loss_func = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(100):
        train_sampler.set_epoch(epoch)  ################
        print(f"lr at epoch {epoch} -> {optimizer.param_groups[0]['lr']}")
        for input_pcd, labels in tqdm(train_loader):  # input_pcd: [B, 2048, 3]
            input_pcd = input_pcd.permute(0, 2, 1).contiguous().to(device)  # [B, 3, 2048]
            output = model(input_pcd)         # [B, 2]
            labels = labels[:, 0].to(device)  # [B, 1] -> [B]

            optimizer.zero_grad()
            loss = loss_func(output, labels)  # ([B, 2], [B])
            loss.backward()
            optimizer.step()

        if epoch > 0 and epoch % 5 == 0:
            scheduler.step()

        if epoch > 0 and epoch % 10 == 0:
            state = {'epoch': epoch,
                     'state_dict': model.module.state_dict(),  ################
                     'optimizer': optimizer.state_dict(),
                     'scheduler': scheduler.state_dict(),
                     }
            torch.save(state, 'test_model.pth')
            print(f'saving epoch -> {epoch}')
Compared with single-GPU training, the lines that need to be changed have been marked with ################.
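Because the checkpoint stores model.module.state_dict() (the weights of the wrapped model, not the DDP wrapper itself), it can later be loaded into a plain single-GPU model without any key renaming. Below is a minimal sketch, assuming the dummyModel class and the test_model.pth file produced by the script above:

### load_checkpoint.py -- a minimal sketch, reusing dummyModel and test_model.pth from above
import torch

model = dummyModel()                                      # plain model, no DDP wrapper
state = torch.load('test_model.pth', map_location='cpu')  # avoid loading onto the GPU it was saved from
model.load_state_dict(state['state_dict'])                # keys carry no 'module.' prefix
model.eval()

Note also that, as written, every process writes test_model.pth; a common refinement is to guard the torch.save call with if dist.get_rank() == 0: so that only one copy is written.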
How to launch the script:
#!/bin/bash
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -m torch.distributed.launch \
--nproc_per_node=4 \
--master_port 29501 \
test_distributed_gpu.py \
--lr 0.005
--nproc_per_node == number of visible devices, i.e. set it to the number of GPUs you want to train with.
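In more recent PyTorch releases, torch.distributed.launch is deprecated in favor of torchrun, which supplies the local rank through the LOCAL_RANK environment variable instead of a --local_rank argument. A roughly equivalent launch would look like the sketch below, assuming the training script is adapted to read int(os.environ['LOCAL_RANK']) instead of args.local_rank:

#!/bin/bash
CUDA_VISIBLE_DEVICES=0,1,2,3 \
torchrun \
--nproc_per_node=4 \
--master_port 29501 \
test_distributed_gpu.py \
--lr 0.005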
A few points to note: