class dataset_h5(torch.utils.data.Dataset):
    """Map-style dataset that serves slices of the ``'images'`` HDF5 dataset.

    ``'images'`` must be three-dimensional; its shape is unpacked into
    ``n_images``, ``nx`` and ``ny``. Item ``index`` is
    ``images[index, :, :]`` converted to ``float32``.

    The HDF5 file is opened only briefly in ``__init__`` (to read the shape)
    and is re-opened lazily on first access. This keeps an open h5py handle
    from being inherited by, or pickled into, ``DataLoader`` worker
    processes, so each worker opens its own handle.

    Args:
        in_file: Path of the HDF5 file to read (opened in mode ``'r'``).
    """

    def __init__(self, in_file):
        super().__init__()
        self.in_file = in_file
        self._file = None  # opened on demand by the ``file`` property
        # Read the shape through a short-lived handle so that no HDF5 handle
        # is left open on the instance after construction.
        with h5py.File(in_file, 'r') as handle:
            self.n_images, self.nx, self.ny = handle['images'].shape

    @property
    def file(self):
        """Read-only HDF5 handle, opened on first use."""
        if self._file is None:
            self._file = h5py.File(self.in_file, 'r')
        return self._file

    def close(self):
        """Close the HDF5 handle if it is open; it re-opens on next access."""
        if self._file is not None:
            self._file.close()
            self._file = None

    def __getstate__(self):
        # Never pickle an open handle: a copy sent to a worker process must
        # open the file itself.
        state = self.__dict__.copy()
        state['_file'] = None
        return state

    def __getitem__(self, index):
        """Return ``images[index, :, :]`` as a ``float32`` array."""
        image = self.file['images'][index, :, :]
        return image.astype('float32')

    def __len__(self):
        """Return the number of images (first dimension of ``'images'``)."""
        return self.n_images
It has simple-to-use PyTorch integration.
I was running into the same problems with the PyTorch DataLoader. On ImageNet, I couldn’t seem to get above about 250 images/sec. On a Google Cloud instance with 12 cores & a V100, I could get just over 2000 images/sec with DALI. However, in cases where the dataloader isn’t the bottleneck, I found that using DALI would reduce performance by 5–10%. This makes sense, I think, as you’re using the GPU to do some of the decoding & preprocessing.
Edit: DALI also has a CPU-only mode, meaning no GPU performance hit.
class data_prefetcher():
    """Overlap host-to-device copy and normalisation with compute.

    Wraps an iterable ``loader`` that yields ``(input, target)`` pairs. While
    the caller works on one batch, the next one is copied to the GPU and
    normalised on a separate CUDA stream.

    Args:
        loader: Iterable yielding ``(input, target)`` tensor pairs.
    """

    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        # Per-channel statistics scaled by 255 and shaped (1, 3, 1, 1) so
        # they broadcast over a batch.
        # NOTE(review): presumably meant for 3-channel NCHW input with
        # 0-255 pixel values -- confirm against the collate function.
        self.mean = torch.tensor(
            [0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1, 3, 1, 1)
        self.std = torch.tensor(
            [0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1, 3, 1, 1)
        # With Amp, it isn't necessary to manually convert data to half.
        # if args.fp16:
        #     self.mean = self.mean.half()
        #     self.std = self.std.half()
        self.preload()

    def preload(self):
        """Fetch the next batch and stage it on the GPU via the side stream.

        Sets ``next_input`` and ``next_target`` to ``None`` once the
        underlying iterator is exhausted.
        """
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)
            # With Amp, it isn't necessary to manually convert data to half.
            # if args.fp16:
            #     self.next_input = self.next_input.half()
            # else:
            self.next_input = self.next_input.float()
            self.next_input = self.next_input.sub_(self.mean).div_(self.std)

    def next(self):
        """Return the staged ``(input, target)`` and start staging the next.

        Returns:
            The staged pair, or ``(None, None)`` when the loader is
            exhausted.
        """
        current = torch.cuda.current_stream()
        # Make the consumer stream wait until the side-stream work is done.
        current.wait_stream(self.stream)
        batch_input = self.next_input
        batch_target = self.next_target
        # These tensors were allocated on the side stream but are consumed on
        # the current one. Tell the caching allocator so their memory is not
        # reused before the consumer stream has finished with them.
        if batch_input is not None:
            batch_input.record_stream(current)
        if batch_target is not None:
            batch_target.record_stream(current)
        self.preload()
        return batch_input, batch_target


train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=(train_sampler is None),
    num_workers=args.workers,
    pin_memory=True,
    sampler=train_sampler,
    collate_fn=fast_collate)


def train(train_loader, model, criterion, optimizer, epoch):
    """Skeleton of one training pass driven by ``data_prefetcher``.

    The forward/backward/optimizer step is elided in this snippet; only the
    prefetcher-driven loop structure is shown.
    """
    # switch to train mode
    model.train()
    end = time.time()

    prefetcher = data_prefetcher(train_loader)
    input, target = prefetcher.next()
    i = 0
    # The prefetcher signals exhaustion by returning None for the input.
    while input is not None:
        ''' backward '''
        input, target = prefetcher.next()
CUDA流:CUDA流表示一个GPU操作队列,该队列中的操作将以添加到流中的先后顺序而依次执行。可以将一个流看做是GPU上的一个任务,不同任务可以并行执行。使用CUDA流,首先要选择一个支持设备重叠(Device Overlap)功能的设备,支持设备重叠功能的GPU能够在执行一个CUDA核函数的同时,还能在主机和设备之间执行复制数据操作。
