
Problems running SlowFast on multiple GPUs

SlowFast cannot use multiple GPUs

Problem description:

Running on the 57 server:

python tools/run_net.py --cfg configs/Kinetics/X3D_XS.yaml NUM_GPUS 4 TRAIN.BATCH_SIZE 8 SOLVER.BASE_LR 0.0125 DATA.PATH_TO_DATA_DIR /mnt/data/geguojing/Heart_data/annotations20201112
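For context, the trailing KEY VALUE pairs on that command line override entries in the YAML config. A minimal sketch of that override mechanism using yacs (SlowFast builds its config on its own fvcore/yacs-style CfgNode; the default values below are placeholders of mine, not SlowFast's real defaults):

    from yacs.config import CfgNode as CN

    # Placeholder defaults, standing in for configs/Kinetics/X3D_XS.yaml.
    cfg = CN()
    cfg.NUM_GPUS = 1
    cfg.TRAIN = CN()
    cfg.TRAIN.BATCH_SIZE = 64
    cfg.SOLVER = CN()
    cfg.SOLVER.BASE_LR = 0.1
    cfg.DATA = CN()
    cfg.DATA.PATH_TO_DATA_DIR = ""

    # The KEY VALUE pairs from the command line are applied on top.
    cfg.merge_from_list([
        "NUM_GPUS", 4,
        "TRAIN.BATCH_SIZE", 8,
        "SOLVER.BASE_LR", 0.0125,
        "DATA.PATH_TO_DATA_DIR",
        "/mnt/data/geguojing/Heart_data/annotations20201112",
    ])
    print(cfg.NUM_GPUS, cfg.TRAIN.BATCH_SIZE, cfg.SOLVER.BASE_LR)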

 

This problem is still unresolved; the 57 server is constantly occupied, so it cannot be debugged there.

The error that appeared on the 185 server:

Comparing my own code against the following test code:

import torch
import utils.distributed as dist  # SlowFast-style helpers: synchronize, all_gather_unaligned, get_rank
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from torch.utils.data._utils.collate import default_collate
import pdb


def manu_collate(batch):
    # Collate the image tensors normally, but keep the bbox entries as a plain Python list.
    inputs, bbox = zip(*batch)
    inputs = default_collate(inputs)
    collate_bbox = [i[0] for i in bbox]
    return inputs, collate_bbox


class AVA_DATA(torch.utils.data.Dataset):  # was torch.utils.data.DataLoader; Dataset is what DataLoader expects
    def __init__(self):
        print('heihei')

    def __getitem__(self, index):
        return torch.ones(3, 224, 224) * index, [index]
        # return torch.ones(3,224,224)*indx

    def __len__(self):
        return 16


class AVA_MODEL(torch.nn.Module):
    def __init__(self):
        super(AVA_MODEL, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 64, 3, 1)
        # self.tmp is replaced from the training loop (model.module.tmp = tmp)
        # before forward() is ever called.
        self.tmp = None
        # for i in range(20):
        #     self.tmp[i] = 0

    def forward(self, x, bbox, tmp):
        # print(x.device.index)
        self.tmp.append(1)
        if x.device.index == 0:
            print(self.tmp)
        # tmp.append(1)
        # print(tmp)
        # if 1:
        #     print('***start***')
        #     print(self.tmp)
        #     for j in bbox:
        #         self.tmp[j] = j
        #     print('***end***')
        # print(bbox, self.tmp)
        # self.tmp.append(1)
        # print(self.tmp)
        # tmp.append(1)
        # print(tmp)
        # print(len(bbox))
        res = self.conv1(x)
        return res


def run(
    local_rank, num_proc, func, init_method, shard_id, num_shards, backend,
):
    # Initialize the process group.
    world_size = num_proc * num_shards
    rank = shard_id * num_proc + local_rank
    try:
        torch.distributed.init_process_group(
            backend=backend,
            init_method=init_method,
            world_size=world_size,
            rank=rank,
        )
    except Exception as e:
        raise e
    torch.cuda.set_device(local_rank)
    func()


def launch_job(func, init_method='tcp://localhost:9999', daemon=False, dis=True):
    """
    Run 'func' on one or more GPUs, specified in cfg
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        init_method (str): initialization method to launch the job with multiple
            devices.
        func (function): job to run on GPU(s)
        daemon (bool): The spawned processes' daemon flag. If set to True,
            daemonic processes will be created
    """
    if dis:
        # Spawn 4 worker processes on a single machine (shard 0 of 1), NCCL backend.
        torch.multiprocessing.spawn(
            run,
            nprocs=4,
            args=(
                4,            # num_proc
                func,
                init_method,
                0,            # shard_id
                1,            # num_shards
                'nccl',
            ),
            daemon=daemon,
        )
    else:
        func()


def train():
    model = AVA_MODEL()
    cur_device = torch.cuda.current_device()
    model = model.cuda(device=cur_device)
    # model = torch.nn.DataParallel(model).cuda()
    model = torch.nn.parallel.DistributedDataParallel(
        module=model, device_ids=[cur_device], output_device=cur_device)
    dataset = AVA_DATA()
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        sampler=DistributedSampler(dataset),
        # sampler=None,
        num_workers=8,
        pin_memory=True,
        drop_last=True,
        collate_fn=manu_collate,
        # worker_init_fn=None,
    )
    tmp = []
    model.train()
    # First pass: only collect the bbox indices seen by this rank, then gather
    # them from all ranks so every process ends up with the full list.
    for i in range(200000000):
        shuffle_dataset(dataloader, i)
        for iter, (inputs, bbox) in enumerate(dataloader):
            # print(i, iter)
            # inputs = inputs.cuda(non_blocking=True)
            # res = model(inputs, bbox, tmp)
            for j in bbox:
                tmp.append(j)
        dist.synchronize()
        tmp = dist.all_gather_unaligned(tmp)
        if dist.get_rank() == 0:
            print(iter, tmp)
        break
    # Hand the gathered list to the model, then run forward passes with it.
    model.module.tmp = tmp
    for i in range(2):
        for iter, (inputs, bbox) in enumerate(dataloader):
            # print(i, iter)
            inputs = inputs.cuda(non_blocking=True)
            res = model(inputs, bbox, tmp)
            # tmp.append(1)


def shuffle_dataset(loader, cur_epoch):
    """
    Shuffles the data.
    Args:
        loader (loader): data loader to perform shuffle.
        cur_epoch (int): number of the current epoch.
    """
    # sampler = (
    #     loader.batch_sampler.sampler
    #     if isinstance(loader.batch_sampler, ShortCycleBatchSampler)
    #     else loader.sampler
    # )
    # assert isinstance(
    #     sampler, (RandomSampler, DistributedSampler)
    # ), "Sampler type '{}' not supported".format(type(sampler))
    # RandomSampler handles shuffling automatically
    sampler = loader.sampler
    if isinstance(sampler, DistributedSampler):
        # DistributedSampler shuffles data based on epoch
        sampler.set_epoch(cur_epoch)


if __name__ == "__main__":
    tmp = dict()
    pdb.set_trace()
    for i in range(20):
        tmp[i] = 0
    pdb.set_trace()
    launch_job(train)
    # train(tmp)

Found that init_method was not defined anywhere in the program.

Gave init_method a value: tcp://localhost:9999
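To sanity-check that value independently of the training code, here is a minimal sketch (check_init_method is a hypothetical helper of mine, not part of SlowFast or the test code) that initializes a one-process group with the same init_method string; if the address or port is unusable, init_process_group should fail fast:

    import torch.distributed as torch_dist

    def check_init_method(init_method="tcp://localhost:9999"):
        # Gloo backend so the check does not need a GPU; world_size=1, rank=0
        # makes this a single-process rendezvous on the given address/port.
        torch_dist.init_process_group(
            backend="gloo",
            init_method=init_method,
            world_size=1,
            rank=0,
        )
        print("init_method", init_method, "is usable")
        torch_dist.destroy_process_group()

    if __name__ == "__main__":
        check_init_method()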

 

Checked each function definition against the test code for differences:

cfg.SHARD_ID is 0

cfg.NUM_SHARDS is 1

cfg.DIST_BACKEND is nccl
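These three values feed the same rank arithmetic as the run function in the test code above. A minimal sketch of the mapping (the variable names here are mine; the formulas mirror that code):

    num_gpus = 4     # cfg.NUM_GPUS, processes spawned per machine
    num_shards = 1   # cfg.NUM_SHARDS, number of machines
    shard_id = 0     # cfg.SHARD_ID, index of this machine

    world_size = num_gpus * num_shards            # 4 processes in total
    for local_rank in range(num_gpus):
        rank = shard_id * num_gpus + local_rank   # global ranks 0..3
        print(local_rank, "->", rank, "of", world_size)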

There is also the NCCL problem on the 57 server; it is not clear whether changing this alone would fix it.
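If NCCL itself is the suspect on the 57 server, one low-effort first step (my suggestion, not something tried in the notes above) is to turn on NCCL's own logging before the workers are spawned:

    import os

    # NCCL prints initialization and transport details to stderr when this is set;
    # set it before torch.multiprocessing.spawn so the workers inherit it.
    os.environ["NCCL_DEBUG"] = "INFO"
    # Optionally pin the network interface if the machine has several:
    # os.environ["NCCL_SOCKET_IFNAME"] = "eth0"  # interface name is machine-specific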

Stepped into run and added print statements to trace it.

To sum up: it is not clear exactly which change made it work.

Removing the pdb.set_trace() calls was what fixed it.

It is better to debug with print output instead.
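pdb breakpoints interact badly with multi-process launches (workers created by torch.multiprocessing.spawn have no usable stdin for the debugger), so print-based logging is the safer tool here. A minimal sketch of a rank-gated print helper (the helper name is mine):

    import torch.distributed as torch_dist

    def rank0_print(*args, **kwargs):
        # Only the process with global rank 0 prints, so the four workers
        # do not interleave their output.
        if not torch_dist.is_initialized() or torch_dist.get_rank() == 0:
            print(*args, **kwargs)

    # Usage inside train() or forward(), e.g.:
    # rank0_print(iter, tmp)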
