
Distributed Training (Single Machine & Multi-GPU)


The setup used here is fairly simple: it does not involve torch.multiprocessing, only part of DistributedDataParallel. A code snippet is given below for reference:

### test_distributed_gpu.py
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import torch.distributed as dist################                      
from tqdm import tqdm

class dummyDataset(torch.utils.data.Dataset):
    def __init__(self):
        super(dummyDataset, self).__init__()
        N = 50000
        self.data = torch.randn(N, 2048, 3)
        self.label = (torch.rand(N, 1) > 0.5).long()

    def __getitem__(self, item):
        jitter = torch.rand(2048,3)
        return self.data[item] + jitter, self.label[item]


    def __len__(self):
        return len(self.data)

class dummyModel(nn.Module):
    def __init__(self):
        super(dummyModel, self).__init__()
        self.layer1 = nn.Sequential(nn.Conv1d(3, 64, 1),
                                    nn.BatchNorm1d(64),
                                    nn.ReLU())
        self.layer2 = nn.Sequential(nn.Conv1d(64, 256, 1),
                                    nn.BatchNorm1d(256),
                                    nn.ReLU())
        self.fc = nn.Conv1d(256, 2, 1)

    def forward(self, data):
        """
        :param data:[B, 3, N]
        :return: [B, 2, N]
        """
        x = self.layer2(self.layer1(data))
        x = torch.max(x, dim=-1, keepdims=True)[0]
        x = self.fc(x).squeeze(-1)
        return x


if __name__ == '__main__':
    torch.manual_seed(1234)
    parser = argparse.ArgumentParser(description='DDP')
    parser.add_argument('--local_rank', default=-1, type=int, help='node rank for distributed training')################
    parser.add_argument('--lr', type=float, help='lr')
    args = parser.parse_args()
    dist.init_process_group(backend='nccl')################
    torch.cuda.set_device(args.local_rank)################
    print(f'current rank -> {args.local_rank}')
    device = torch.device("cuda", args.local_rank)################

    ### configuring dataset
    train_dataset = dummyDataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)################
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=16,
                                               num_workers=2,
                                               shuffle=False,################
                                               sampler=train_sampler)################
    ### configuring model
    model = dummyModel().to(device)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])################

    ### configuring optimizer
    optimizer = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=0.98,
        weight_decay=0.000001,
    )
    scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer,
        gamma=0.95,
    )

    ### configuring loss
    loss_func = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(100):
        train_sampler.set_epoch(epoch)################
        print(f"lr at epoch {epoch} -> {optimizer.param_groups[0]['lr']}")
        for input_pcd, labels in tqdm(train_loader):
            input_pcd = input_pcd.permute(0, 2, 1).contiguous().to(device)  # [B, 2048, 3] -> [B, 3, 2048]
            output = model(input_pcd)  # [B, 2]
            labels = labels[:, 0].to(device)  # [B, 1] -> [B]

            optimizer.zero_grad()
            loss = loss_func(output, labels) # ([B, 2], [B])
            loss.backward()
            optimizer.step()

        if epoch > 0 and epoch % 5 == 0:
            scheduler.step()

        if epoch > 0 and epoch % 10 == 0 and args.local_rank == 0:################
            state = {'epoch': epoch,
                     'state_dict': model.module.state_dict(),################
                     'optimizer': optimizer.state_dict(),
                     'scheduler': scheduler.state_dict(),
                     }

            torch.save(state, 'test_model.pth')
            print(f'saving epoch -> {epoch}')

Compared with single-GPU training, the parts that need to change have been marked with ################ in the script above.
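One consequence of saving model.module.state_dict() (the unwrapped model) rather than model.state_dict() is that the checkpoint can later be loaded into a plain, non-DDP model without stripping any 'module.' key prefix. Below is a minimal loading sketch; it assumes test_model.pth was produced by the script above, and the file name load_checkpoint.py is only illustrative:

### load_checkpoint.py
import torch

from test_distributed_gpu import dummyModel

checkpoint = torch.load('test_model.pth', map_location='cpu')
model = dummyModel()
# keys match a plain dummyModel directly because the training script saved
# model.module.state_dict(), i.e. the weights without the DDP wrapper
model.load_state_dict(checkpoint['state_dict'])
print(f"restored weights from epoch {checkpoint['epoch']}")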

Launch script:

#!/bin/bash
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -m torch.distributed.launch \
--nproc_per_node=4 \
--master_port 29501 \
test_distributed_gpu.py \
--lr 0.005

nproc_per_node == number of visible devices, i.e. set this argument to the number of GPUs you are training on.
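For example, to train on only two of the cards, the same launch script would become (a sketch; the GPU indices are arbitrary):

#!/bin/bash
CUDA_VISIBLE_DEVICES=0,1 \
python -m torch.distributed.launch \
--nproc_per_node=2 \
--master_port 29501 \
test_distributed_gpu.py \
--lr 0.005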

A few points to note:

  1. With DDP, the DataLoader's shuffle must stay at its default of False (setting it to True raises an error); shuffling is instead done by calling the sampler's set_epoch method every epoch, otherwise the data order is identical across epochs.
  2. If you hit RuntimeError: Address already in use, add --master_port 29501 to the launch script to pin a port; any free port number works.
  3. If the program terminates but the GPU memory is not released (this typically happens after ending the run with Ctrl+C or suspending it with Ctrl+Z), run netstat -ntlp to find the stale port and its PID and kill it with kill -9; alternatively, run fuser -v /dev/nvidia* to list the zombie processes on each GPU and kill those PIDs manually to free the memory (see the sketch after this list).
  4. pdb debugging should be done on a single card, i.e. with one visible device and nproc_per_node=1; pdb under multi-GPU training still has some issues (see the sketch after this list).
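A minimal sketch of the cleanup commands from point 3 and the single-card debug launch from point 4 (the PID is a placeholder, not a real value):

# point 3: find and kill the processes still holding GPU memory
netstat -ntlp           # look up the PID bound to the stale port
fuser -v /dev/nvidia*   # or list the zombie processes attached to each GPU
kill -9 <pid>           # replace <pid> with the PID found above

# point 4: run on a single card so pdb works
CUDA_VISIBLE_DEVICES=0 \
python -m torch.distributed.launch \
--nproc_per_node=1 \
--master_port 29501 \
test_distributed_gpu.py \
--lr 0.005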