First, a quick review of some PyTorch and NumPy functions.
import torch
import numpy as np

# reshape: returns a new array/view and does not modify the original array
# resize: returns nothing and modifies the original array in place
a = np.arange(0, 12, 1).reshape(2, 3, 2).copy()  # .copy() so that a owns its data;
                                                 # ndarray.resize() refuses to work on a view
print(a)
a.resize(3, 2, 2)
print("resize\n", a)

# transpose permutes the axes of an array (it works for any number of dimensions);
# it returns a view and does not change the original array
b = np.transpose(a, [1, 2, 0])
print("transpose\n", b)

# flatten and ravel both flatten a multi-dimensional array; the difference is that
# flatten allocates new memory for the result, while ravel returns a view whenever it can
c = b.flatten()
d = b.ravel()
print("flatten:\n", c)
print("ravel\n", d)

# torch view is similar to reshape: it returns a reshaped view without changing t itself;
# it works for any number of dimensions but requires the tensor to be contiguous
t = torch.from_numpy(a)  # numpy -> torch
n = t.numpy()            # torch -> numpy
print("view\n", t.view(1, 12))

# permute swaps tensor dimensions, analogous to np.transpose
s = t.permute(1, 2, 0)
print("permute\n", s)
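The contiguity caveat on view() above is easy to trip over, so here is a minimal sketch (my own example, not from the original post) showing when view() fails and reshape() still works:

import torch

t = torch.arange(12).reshape(3, 2, 2)
p = t.permute(1, 2, 0)      # a non-contiguous view of t
print(p.is_contiguous())    # False

# p.view(12) would raise a RuntimeError here; either copy to contiguous
# memory first, or use reshape(), which copies only when it has to.
print(p.contiguous().view(12))
print(p.reshape(12))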
ResNet and VGG are the backbone networks used by Faster R-CNN.
VGG is comparatively shallow, at most 19 layers, but heavy in parameters, most of which sit in the three FC layers at the end. Faster R-CNN uses VGG16, whose convolutions are all 3x3.
torchvision already defines VGG. VGG16 is defined as follows:
def vgg16(pretrained=False, **kwargs):
    """VGG 16-layer model (configuration "D")

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    if pretrained:
        kwargs['init_weights'] = False
    model = VGG(make_layers(cfg['D']), **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['vgg16']))
    return model
This calls a make_layers function, which also lives in torchvision's vgg.py:
def make_layers(cfg, batch_norm=False):
    layers = []
    in_channels = 3
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)


cfg = {
    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}
cfg is a dictionary: passing a different key selects a different network structure. Each list in cfg gives the output channels of the successive conv layers ('M' stands for a pooling layer). make_layers turns such a list into the corresponding network and returns it as an nn.Sequential.
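A small sanity check (my own sketch, assuming the make_layers/cfg definitions above are in scope): build the VGG16 feature extractor from configuration 'D' and run a dummy ImageNet-sized input through it.

import torch
import torch.nn as nn  # needed by make_layers above

features = make_layers(cfg['D'])   # VGG16's conv/pool stack
x = torch.randn(1, 3, 224, 224)    # dummy ImageNet-sized input
print(features(x).shape)           # torch.Size([1, 512, 7, 7]): the five 'M'
                                   # poolings halve 224 down to 7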
Now look at the VGG class itself. features receives the structure defined above, i.e. the conv and pooling layers that extract features. Their output first passes through an AdaptiveAvgPool2d layer (for AdaptiveAvgPool2d see https://discuss.pytorch.org/t/what-is-adaptiveavgpool2d/26897/2),
and then through three fully connected layers, the first two of which are followed by Dropout to curb overfitting.
class VGG(nn.Module):
    def __init__(self, features, num_classes=1000, init_weights=True):
        super(VGG, self).__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
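An end-to-end check (my own sketch, assuming torchvision is installed): the assembled VGG16 maps a 224x224 image to 1000 ImageNet logits.

import torch
from torchvision import models

vgg = models.vgg16()   # pass pretrained=True to load ImageNet weights (older torchvision API)
out = vgg(torch.randn(1, 3, 224, 224))
print(out.shape)       # torch.Size([1, 1000])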
The Faster R-CNN network structure on top of this backbone is defined in vgg16.py:
def _init_modules(self):
    vgg = models.vgg16()
    if self.pretrained:
        print("Loading pretrained weights from %s" % (self.model_path))
        state_dict = torch.load(self.model_path)
        vgg.load_state_dict({k: v for k, v in state_dict.items() if k in vgg.state_dict()})

    # drop the classifier's last FC layer (the 1000-way ImageNet head)
    vgg.classifier = nn.Sequential(*list(vgg.classifier._modules.values())[:-1])

    # not using the last maxpool layer
    self.RCNN_base = nn.Sequential(*list(vgg.features._modules.values())[:-1])

    # Fix the layers before conv3:
    for layer in range(10):
        for p in self.RCNN_base[layer].parameters():
            p.requires_grad = False

    # self.RCNN_base = _RCNN_base(vgg.features, self.classes, self.dout_base_model)

    self.RCNN_top = vgg.classifier

    self.RCNN_cls_score = nn.Linear(4096, self.n_classes)

    if self.class_agnostic:
        self.RCNN_bbox_pred = nn.Linear(4096, 4)
    else:
        self.RCNN_bbox_pred = nn.Linear(4096, 4 * self.n_classes)
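What does range(10) freeze? A quick look (my own check) at the first ten modules of vgg16().features shows it covers conv1_1 through conv2_2 with their ReLUs and pools, i.e. everything before conv3:

from torchvision import models

base = list(models.vgg16().features.children())[:-1]  # last maxpool dropped
for i, m in enumerate(base[:10]):
    print(i, m)
# 0 Conv2d(3, 64, ...)    1 ReLU   2 Conv2d(64, 64, ...)    3 ReLU
# 4 MaxPool2d             5 Conv2d(64, 128, ...)            6 ReLU
# 7 Conv2d(128, 128, ...) 8 ReLU   9 MaxPool2d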
The figure above (from the original post) shows the visualized structure of faster_rcnn_train.pt in the caffe version of Faster R-CNN.
The code is easy to follow alongside that diagram.
# not using the last maxpool layer
self.RCNN_base = nn.Sequential(*list(vgg.features._modules.values())[:-1])
RCNN_base takes the layers from features, dropping the final pooling layer.
vgg.classifier = nn.Sequential(*list(vgg.classifier._modules.values())[:-1])
self.RCNN_top = vgg.classifier
RCNN_top takes the first two of the classifier's three FC layers (together with their ReLU and Dropout), dropping the final classification layer; the cls_score and bbox_pred heads are attached after it.
# classification and bounding-box regression heads on top of RCNN_top
self.RCNN_cls_score = nn.Linear(4096, self.n_classes)
if self.class_agnostic:
    self.RCNN_bbox_pred = nn.Linear(4096, 4)
else:
    self.RCNN_bbox_pred = nn.Linear(4096, 4 * self.n_classes)
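How the head pieces fit together (my own sketch, written outside the class with hypothetical names like rcnn_top and pooled): RoI-pooled features are flattened, pushed through the two retained FC layers, then through the two prediction heads.

import torch
import torch.nn as nn
from torchvision import models

n_classes = 21                                     # e.g. Pascal VOC: 20 classes + background
vgg = models.vgg16()
rcnn_top = nn.Sequential(*list(vgg.classifier.children())[:-1])
cls_score = nn.Linear(4096, n_classes)
bbox_pred = nn.Linear(4096, 4 * n_classes)         # class-specific boxes

pooled = torch.randn(8, 512 * 7 * 7)               # 8 RoIs after RoI pooling
fc7 = rcnn_top(pooled)                             # [8, 4096]
print(cls_score(fc7).shape, bbox_pred(fc7).shape)  # [8, 21] [8, 84]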
So far these structures are only definitions; when the model runs they are invoked one by one from the forward function.
The entry point is in the trainval.py file.
# initialize the network here.
if args.net == 'vgg16':
    fasterRCNN = vgg16(imdb.classes, pretrained=True, class_agnostic=args.class_agnostic)
elif args.net == 'res101':
    fasterRCNN = resnet(imdb.classes, 101, pretrained=True, class_agnostic=args.class_agnostic)
elif args.net == 'res50':
    fasterRCNN = resnet(imdb.classes, 50, pretrained=True, class_agnostic=args.class_agnostic)
elif args.net == 'res152':
    fasterRCNN = resnet(imdb.classes, 152, pretrained=True, class_agnostic=args.class_agnostic)
else:
    print("network is not defined")
    pdb.set_trace()
Taking ResNet-50 as an example, this enters the resnet50() function in resnet.py:
def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model
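For comparison, the sibling constructors differ only in the block type and the per-stage block counts; here is my own summary drawn from torchvision's resnet.py:

resnet_configs = {
    'resnet18':  ('BasicBlock', [2, 2, 2, 2]),
    'resnet34':  ('BasicBlock', [3, 4, 6, 3]),
    'resnet50':  ('Bottleneck', [3, 4, 6, 3]),
    'resnet101': ('Bottleneck', [3, 4, 23, 3]),
    'resnet152': ('Bottleneck', [3, 8, 36, 3]),
}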
ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) then instantiates the ResNet class (running its __init__) with Bottleneck as the block type, i.e. the bottleneck block shown on the right of the figure in the original post.
init and forward in resnet.py define the ResNet network structure:
def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):
    super(ResNet, self).__init__()
    self.inplanes = 64
    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    self.fc = nn.Linear(512 * block.expansion, num_classes)

def forward(self, x):
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)
    x = self.maxpool(x)

    x = self.layer1(x)
    x = self.layer2(x)
    x = self.layer3(x)
    x = self.layer4(x)

    x = self.avgpool(x)
    x = x.view(x.size(0), -1)
    x = self.fc(x)

    return x
Building each stage requires a _make_layer helper, which assembles the blocks into a list and returns them as an nn.Sequential:
def _make_layer(self, block, planes, blocks, stride=1):
    downsample = None
    if stride != 1 or self.inplanes != planes * block.expansion:
        downsample = nn.Sequential(
            conv1x1(self.inplanes, planes * block.expansion, stride),
            nn.BatchNorm2d(planes * block.expansion),
        )

    layers = []
    layers.append(block(self.inplanes, planes, stride, downsample))
    self.inplanes = planes * block.expansion
    for _ in range(1, blocks):
        layers.append(block(self.inplanes, planes))

    return nn.Sequential(*layers)
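What _make_layer produces (my own sketch): for Bottleneck, expansion is 4, so layer1 of ResNet-50 turns 64 input channels into 64 * 4 = 256 output channels, and a 1x1-conv downsample branch is added whenever the stride or channel count changes so that the shortcut shapes match.

import torch
from torchvision.models import resnet50

model = resnet50()
x = torch.randn(1, 64, 56, 56)              # what layer1 sees after the stem
print(model.layer1(x).shape)                # torch.Size([1, 256, 56, 56])
print(model.layer2(model.layer1(x)).shape)  # torch.Size([1, 512, 28, 28])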
VGG tops out at 19 layers, while ResNet reaches up to 152 layers, yet ResNet has fewer parameters than VGG.
ResNet (Residual Neural Network) was proposed by Kaiming He and three colleagues at Microsoft Research. Using residual units, they successfully trained a 152-layer network and won ILSVRC 2015 with a top-5 error rate of 3.57%, while using fewer parameters than VGGNet. The residual structure dramatically speeds up the training of deep networks and also improves accuracy noticeably. It generalizes well, too: it can even be plugged directly into Inception networks.
The main idea of ResNet is to add shortcut connections to the network, in the spirit of Highway Networks. Earlier architectures applied a nonlinear transform to each layer's input and nothing more, whereas a Highway Network lets a certain proportion of the previous layer's output pass through unchanged. ResNet follows a similar idea, allowing the original input to be carried directly to later layers (the original post illustrates this with a figure; a code sketch of the same block follows).
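A bare-bones residual block (my own sketch of the idea, not the exact torchvision implementation):

import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """y = F(x) + x: the block learns the residual F, and the identity
    shortcut carries the input straight through."""
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        identity = x                        # the skip connection
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + identity)    # add before the final ReLU

x = torch.randn(1, 64, 56, 56)
print(ResidualBlock(64)(x).shape)           # torch.Size([1, 64, 56, 56])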
It is precisely this skip connection that lets gradients flow unimpeded through the residual blocks. As Kaiming He has put it, the only thing limiting depth is running out of memory: given enough memory, residual networks with over a thousand layers are feasible.
DenseNet takes this further: its skip connections do not only link a layer to the next one but connect directly across layers, so every layer receives the feature maps, and hence gradient contributions, of all the layers before it.
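A bare-bones dense block (my own sketch with a hypothetical name, TinyDenseBlock; the real DenseNet adds bottleneck and transition layers): each layer's input is the channel-wise concatenation of every earlier layer's output, so early features reach every later layer directly.

import torch
import torch.nn as nn

class TinyDenseBlock(nn.Module):
    def __init__(self, in_channels, growth_rate, num_layers):
        super().__init__()
        self.layers = nn.ModuleList()
        channels = in_channels
        for _ in range(num_layers):
            self.layers.append(nn.Sequential(
                nn.BatchNorm2d(channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(channels, growth_rate, 3, padding=1, bias=False),
            ))
            channels += growth_rate         # inputs grow by concatenation

    def forward(self, x):
        features = [x]
        for layer in self.layers:
            features.append(layer(torch.cat(features, dim=1)))
        return torch.cat(features, dim=1)

x = torch.randn(1, 16, 32, 32)
print(TinyDenseBlock(16, growth_rate=12, num_layers=4)(x).shape)
# torch.Size([1, 64, 32, 32]): 16 + 4 * 12 channels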