当前位置:   article > 正文

torch.cuda.OutOfMemoryError_torch.cuda.outofmemoryerror: cuda out of memory.

torch.cuda.outofmemoryerror: cuda out of memory.

torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB (GPU 0; 7.92 GiB total capacity; 1.48 MiB already allocated; 6.91 GiB free; 22.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

遇到这个错误后,按照网上的建议把 batch_size 改得很小,依然报错。

后改小了网络结构,搞定。

错误原因是网络中相邻层的输入、输出 shape 不一致:全连接层 Linear 的 in_features 与前面 Flatten 之后的特征数不匹配。

错误的shape

128 * 128 * 8=131072

修改后的写法:

torch.nn.Flatten(),

torch.nn.Linear(in_features=8 * 8 * 8, out_features=512),

  1. import torch
  2. from matplotlib import pyplot as plt
  3. from torch import nn, optim
  4. # from torch.autograd import Variable
  5. from torch.utils.data import DataLoader
  6. from torchvision import datasets, transforms
  7. from tqdm import tqdm
  8. from matplotlib.ticker import MaxNLocator
  # Hyperparameters
  batch_size = 128        # samples per mini-batch
  learning_rate = 0.0001  # optimizer learning rate
  epochs = 20             # number of passes over the training set
  channels = 3            # number of image channels
  class_names = [
      'airplane', 'automobile', 'bird', 'cat', 'deer',
      'dog', 'frog', 'horse', 'ship', 'truck',
  ]

  # Dataset download and preprocessing
  transform = transforms.Compose([
      transforms.ToTensor(),                  # image -> tensor with values scaled to [0, 1]
      transforms.Normalize([0.5], [0.5]),     # (x - 0.5) / 0.5, applied to every channel
  ])
  path = './data/'  # directory where the downloaded dataset is stored

  # Training split (downloaded on first use) and test split
  trainData = datasets.CIFAR10(path, train=True, transform=transform, download=True)
  testData = datasets.CIFAR10(path, train=False, transform=transform)
  # Preview the first 20 training images in a 4 x 5 grid.
  # The figure is 20 inches wide and 10 inches tall.
  plt.figure(figsize=(20, 10))
  for i in range(20):
      # Select the (i + 1)-th cell of the 4-row, 5-column grid.
      plt.subplot(4, 5, i + 1)
      # Hide axis ticks and grid lines; they carry no information for images.
      plt.xticks([])
      plt.yticks([])
      plt.grid(False)
      # No colormap is passed: imshow ignores `cmap` for RGB(A) image data,
      # and CIFAR-10 samples are colour images.
      plt.imshow(trainData.data[i])
      # Label each image with its class name.
      plt.xlabel(class_names[trainData.targets[i]])
  # Display the whole grid once all 20 cells are drawn.
  plt.show()
  # Wrap both datasets in DataLoaders.
  # Training batches are reshuffled; test batches keep the dataset order.
  trainDataLoader = DataLoader(trainData, batch_size=batch_size, shuffle=True)
  testDataLoader = DataLoader(testData, batch_size=batch_size)
  # CNN model definition
  class cnn(torch.nn.Module):
      """Small convolutional classifier for 32x32 images such as CIFAR-10.

      Four 3x3 convolutions (the first two each followed by 2x2 max pooling)
      reduce a 32x32 input to 8 feature maps of size 8x8, which are flattened
      and passed through two fully connected layers.

      The first Linear layer is sized for 32x32 inputs (8 * 8 * 8 features);
      other input sizes will fail with a shape mismatch at that layer.

      Args:
          in_channels: number of channels of the input images (default 3).
          num_classes: number of output logits (default 10).
      """

      def __init__(self, in_channels=3, num_classes=10):
          super().__init__()
          self.model = torch.nn.Sequential(
              # Input feature map: 32 x 32
              torch.nn.Conv2d(in_channels=in_channels, out_channels=32,
                              kernel_size=(3, 3), stride=1, padding=1),
              torch.nn.ReLU(),
              torch.nn.MaxPool2d(kernel_size=2, stride=2),
              # After pooling: 16 x 16
              torch.nn.Conv2d(in_channels=32, out_channels=64,
                              kernel_size=(3, 3), stride=1, padding=1),
              torch.nn.ReLU(),
              torch.nn.MaxPool2d(kernel_size=2, stride=2),
              # After pooling: 8 x 8 (no further pooling below)
              torch.nn.Conv2d(in_channels=64, out_channels=128,
                              kernel_size=(3, 3), stride=1, padding=1),
              torch.nn.ReLU(),
              # Still 8 x 8; reduce to 8 channels to keep the classifier small
              torch.nn.Conv2d(in_channels=128, out_channels=8,
                              kernel_size=(3, 3), stride=1, padding=1),
              torch.nn.ReLU(),
              torch.nn.Flatten(),
              # 8 channels * 8 * 8 spatial positions = 512 features
              torch.nn.Linear(in_features=8 * 8 * 8, out_features=512),
              torch.nn.ReLU(),
              torch.nn.Dropout(0.2),  # randomly drop units to reduce overfitting
              # Raw logits: nn.CrossEntropyLoss applies log-softmax itself,
              # so no Softmax layer is added here.
              torch.nn.Linear(in_features=512, out_features=num_classes),
          )

      def forward(self, input):
          """Return the class logits for a batch of images."""
          output = self.model(input)
          return output
  # Instantiate the network
  model = cnn()

  # Run on the GPU when CUDA is available
  if torch.cuda.is_available():
      model = model.cuda()

  # Loss function and optimizer:
  # cross-entropy loss, Adam with the configured learning rate
  # (alternative: Adam with its default learning rate, i.e. without `lr`)
  criterion = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  # Train the model and record per-epoch metrics.
  def _evaluate(net, loader, loss_fn):
      """Return ``(mean batch loss, accuracy)`` of ``net`` over ``loader``.

      Switches ``net`` to eval mode and runs under ``torch.no_grad()``.
      Both returned values are plain Python floats.
      """
      on_gpu = torch.cuda.is_available()
      loss_sum, hits = 0.0, 0
      net.eval()
      with torch.no_grad():
          for imgs, targets in loader:
              if on_gpu:
                  imgs, targets = imgs.cuda(), targets.cuda()
              logits = net(imgs)
              loss_sum += loss_fn(logits, targets).item()
              hits += torch.sum(torch.argmax(logits, dim=1) == targets).item()
      # Mean of per-batch losses; accuracy over all samples.
      return loss_sum / len(loader), hits / len(loader.dataset)


  epoch = 1  # keeps `epoch` defined even if the loop below never runs
  history = {'Train Loss': [],
             'Test Loss': [],
             'Train Acc': [],
             'Test Acc': []}
  for epoch in range(1, epochs + 1):
      processBar = tqdm(trainDataLoader, unit='step')
      model.train(True)
      # Running totals are plain Python numbers. Summing the `loss` tensor
      # itself would keep autograd history referenced for the whole epoch.
      train_loss, train_correct = 0.0, 0
      for step, (train_imgs, labels) in enumerate(processBar):
          if torch.cuda.is_available():
              train_imgs = train_imgs.cuda()
              labels = labels.cuda()
          model.zero_grad()                       # clear old gradients
          outputs = model(train_imgs)             # forward pass
          loss = criterion(outputs, labels)
          predictions = torch.argmax(outputs, dim=1)
          correct = torch.sum(predictions == labels).item()
          accuracy = correct / labels.shape[0]    # accuracy of this batch
          loss.backward()                         # back-propagation
          optimizer.step()                        # update the parameters
          batch_loss = loss.item()
          processBar.set_description("[%d/%d] Loss: %.4f, Acc: %.4f" %
                                     (epoch, epochs, batch_loss, accuracy))
          # Accumulate the epoch metrics.
          train_loss += batch_loss
          train_correct += correct
          # After the last training step of the epoch, evaluate on the test
          # set. This stays inside the loop so the summary is written to the
          # progress bar before it finishes iterating.
          if step == len(processBar) - 1:
              test_loss, test_accuracy = _evaluate(model, testDataLoader, criterion)
              train_accuracy = train_correct / len(trainDataLoader.dataset)
              train_loss = train_loss / len(trainDataLoader)  # mean loss per step
              history['Train Loss'].append(train_loss)
              history['Train Acc'].append(train_accuracy)
              history['Test Loss'].append(test_loss)
              history['Test Acc'].append(test_accuracy)
              processBar.set_description(
                  "[%d/%d] Loss: %.4f, Acc: %.4f, Test Loss: %.4f, Test Acc: %.4f" %
                  (epoch, epochs, train_loss, train_accuracy, test_loss, test_accuracy))
      processBar.close()
  # Plot the recorded loss and accuracy curves, then save the trained model.
  def _plot_history(train_key, test_key, ylabel, title, legend_loc, filename):
      """Plot one train/test metric pair from ``history``, save it and show it.

      ``train_key`` and ``test_key`` are keys of ``history`` and are also
      used as the legend labels. The chart is written to ``filename`` before
      it is displayed.
      """
      # Start a fresh figure so a chart never draws on top of a previous one
      # when the backend's show() does not close the current figure.
      plt.figure()
      plt.plot(history[test_key], color='red', label=test_key)
      plt.plot(history[train_key], label=train_key)
      plt.grid(True)
      plt.xlabel('Epoch')
      plt.xlim([0, epoch])
      # Epoch ticks are integers only.
      plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
      plt.ylabel(ylabel)
      plt.title(title)
      plt.legend(loc=legend_loc)
      plt.savefig(filename)
      plt.show()


  # Loss curves
  _plot_history('Train Loss', 'Test Loss', 'Loss',
                'Train and Test LOSS', 'upper right', 'LOSS')
  # Accuracy curves
  _plot_history('Train Acc', 'Test Acc', 'Accuracy',
                'Train and Test ACC', 'lower right', 'ACC')

  # Saves the whole module object (not only its state_dict), so loading the
  # file later requires the `cnn` class definition to be importable.
  torch.save(model, './model.pth')

 

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小丑西瓜9/article/detail/88991
推荐阅读
相关标签
  

闽ICP备14008679号