Deploying BEV models has long been a difficult problem: running them on automotive chips consumes a large amount of compute. To address this, the authors of FastBEV proposed a more lightweight approach that needs no transformer to extract BEV features and uses only convolutional networks, simple and effective. This post records some of the things I learned along the way, including how to run and test it locally, and then annotates the code to better understand the structure of FastBEV.
BEV discussion group (with plenty of autonomous-driving referral opportunities), WeChat group: Rex1586662742, QQ group: 468713665.
The original author trained on the full nuScenes dataset, and the pkl files provided in the project correspond to the entire nuScenes dataset. For personal study, however, one can generally only use the mini dataset, so pkl files for the mini dataset have to be generated. This part is not covered in the original project, so it took quite a while to solve; thanks to the original author for patiently answering my questions. To generate the pkl files for the mini dataset, you can refer to the following steps:
This is the only method I have found so far; if anyone knows a simpler way, please point it out. Following the steps above you can obtain the training and testing mini dataset. This post also provides a Baidu Cloud link: https://pan.baidu.com/s/1FGvYLqShz8VzWs6syq8uUw?pwd=qgpw extraction code: qgpw
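Whether you generate the pkl files yourself or download them from the link above, a quick sanity check is to load them and confirm they really come from the v1.0-mini split. The snippet below is my own check, not part of the FastBEV project, and the file path/name is only an example, the info files produced for FastBEV may be named differently:
import pickle

# path and file name are assumptions; substitute the pkl you actually generated
with open('data/nuscenes/nuscenes_infos_train.pkl', 'rb') as f:
    data = pickle.load(f)

# mmdet3d-style info files are usually a dict with a 'metadata' dict and an 'infos' list
print(data['metadata'])    # expect something like {'version': 'v1.0-mini'}
print(len(data['infos']))  # number of training samples in the mini split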
If you train directly with the commands given in the project, you will most likely hit all kinds of errors, mainly because the local environment differs considerably from the author's, so some parameters need to be changed to make training easier. The config used for training in this post is configs/fastbev/exp/paper/fastbev_m0_r18_s256x704_v200x200x4_c192_d2_f4.py, and the following changes are needed:
# uncomment the following line
file_client_args = dict(backend="disk")
and comment out the block below:
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         data_root: 'public-1424:s3://openmmlab/datasets/detection3d/nuscenes/'}))
Then change every "SyncBN" in the config to "BN", and change the optimizer type from "Adam2" to "Adam", otherwise you will get errors.
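For reference, after these two edits the affected entries should look roughly like the following (a sketch only; "SyncBN" may appear in several norm_cfg fields of this config, and the learning rate and other optimizer values should be kept exactly as in the original file):
norm_cfg = dict(type='BN', requires_grad=True)           # every occurrence of 'SyncBN' replaced by 'BN'
optimizer = dict(type='Adam', lr=..., weight_decay=...)  # type changed from 'Adam2' to 'Adam'; keep the original values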
Since the original author trains and tests via bash tools/trian/fastbev_run.sh, you will usually find that many of its preconditions are not satisfied locally, so I train directly with python tools/train.py instead. The arguments that need to be changed are:
parser.add_argument(
"--config",
default="configs/fastbev/exp/paper/fastbev_m0_r18_s256x704_v200x200x4_c192_d2_f4.py",
help="train config file path",
)
parser.add_argument(
"--work-dir", default="work_dir", help="the dir to save logs and models"
)
group_gpus.add_argument(
"--gpu-ids",
type=int,
default=[0],
help="ids of gpus to use " "(only applicable to non-distributed training)",
)
After these modifications, run python tools/train.py and training will start.
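If you would rather not edit the defaults, the same values can presumably be passed on the command line instead, e.g. python tools/train.py --config configs/fastbev/exp/paper/fastbev_m0_r18_s256x704_v200x200x4_c192_d2_f4.py --work-dir work_dir (argument names taken from the argparse snippet above; the GPU ids already default to [0]).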
The network structure is shown in the figure below. FastBEV has the following characteristics:
multi-scale feature maps obtained through feature extraction
2D features of each camera view projected into a unified voxel space
multi-frame (temporal) fusion
data augmentation on both the images and the BEV features
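For the config used here, the file name already encodes the main hyper-parameters: r18 is a ResNet-18 backbone, s256x704 the input resolution, v200x200x4 the voxel grid, c192 the number of BEV feature channels and f4 the number of fused frames. The shapes quoted in the code walkthrough below (24 = 4 frames x 6 cameras, a 200x200x4 volume, a 192-channel 100x100 BEV map) follow directly from these settings.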
1、mmdet3d/models/detectors/fastbev.py
class FastBEV(BaseDetector):
    def __init__():
        ...

    def forward_train(...):
        feature_bev, valids, features_2d = self.extract_feat(img, img_metas, "train")  # extract BEV features
        if self.bbox_head is not None:
            x = self.bbox_head(feature_bev)  # detection head -> mmdet3d/models/dense_heads/anchor3d_head.py
            loss_det = self.bbox_head.loss(*x, gt_bboxes_3d, gt_labels_3d, img_metas)  # compute loss -> mmdet3d/models/dense_heads/free_anchor3d_head.py

    def extract_feat(self, img, img_metas, mode):
        """
        args:
            img: [1, 24, 3, 256, 704] -> [1, 4*6, 3, 256, 704], 4 frames of surround-view images (6 views each), used for temporal fusion
        """
        x = self.backbone(x)  # multi-scale image features: [24,64,64,176] [24,128,32,88] [24,256,16,44] [24,512,8,22]
        # multi-scale feature fusion
        def _inner_forward(x):
            out = self.neck(x)
            return out  # [24, 64, 64, 176]; [24, 64, 32, 88]; [24, 64, 16, 44]; [24, 64, 8, 22]
        if self.with_cp and x.requires_grad:
            ...
        else:
            mlvl_feats = _inner_forward(x)
        if self.multi_scale_id is not None:
            for msid in self.multi_scale_id:
                if getattr(self, f'neck_fuse_{msid}', None) is not None:
                    fuse_feats = [mlvl_feats[msid]]  # [24, 64, 64, 176], start from the largest feature map
                    for i in range(msid + 1, len(mlvl_feats)):  # iterate over the remaining feature levels
                        resized_feat = resize(...)  # resize them to the size of the largest feature map
                        fuse_feats.append(resized_feat)
                    if len(fuse_feats) > 1:
                        fuse_feats = torch.cat(fuse_feats, dim=1)  # [24, 256, 64, 176], concatenate the feature maps
                    else:
                        ...
                    fuse_feats = getattr(self, f'neck_fuse_{msid}')(fuse_feats)  # feature fusion -> [24, 64, 64, 176]
                    mlvl_feats_.append(fuse_feats)
        mlvl_volumes = []
        for lvl, mlvl_feat in enumerate(mlvl_feats):
            stride_i = math.ceil(img.shape[-1] / mlvl_feat.shape[-1])  # stride of this feature level w.r.t. the input image
            # [bs*seq*nv, c, h, w] -> [bs, seq*nv, c, h, w]
            mlvl_feat = mlvl_feat.reshape([batch_size, -1] + list(mlvl_feat.shape[1:]))
            mlvl_feat_split = torch.split(mlvl_feat, 6, dim=1)  # [1, 6, 64, 64, 176] * 4, features of the 4 frames of surround-view images
            volume_list = []
            # iterate over the 4 frames
            for seq_id in range(len(mlvl_feat_split)):
                volumes = []
                for batch_id, seq_img_meta in enumerate(img_metas):
                    # features of the seq_id-th frame for this sample
                    feat_i = mlvl_feat_split[seq_id][batch_id]
                    img_meta["lidar2img"]["extrinsic"] = img_meta["lidar2img"]["extrinsic"][seq_id*6:(seq_id+1)*6]  # camera extrinsics of the 6 views of this frame
                    # intrinsics @ extrinsics
                    projection = self._compute_projection(...)
                    if self.style in ['v1', 'v2']:
                        n_voxels, voxel_size = self.n_voxels[0], self.voxel_size[0]  # [200,200,4], [0.5,0.5,1.5]: number of voxels and voxel size
                    else:
                        ...
                    # voxel-center point cloud in ego coordinates
                    points = get_points(...)
                    # combine the point cloud with the 2D features to obtain a 3D feature volume
                    if self.backproject == 'inplace':
                        volume = backproject_inplace(feat_i[:, :, :height, :width], points, projection)
        if self.style in ['v1', 'v2']:
            mlvl_volumes = torch.cat(mlvl_volumes, dim=1)  # [bs, lvl*seq*c, vx, vy, vz]
        else:
            ...
        def _inner_forward(x):
            out = self.neck_3d(x)
            return out
        if self.with_cp and x.requires_grad:
            x = cp.checkpoint(_inner_forward, x)
        else:
            x = _inner_forward(x)  # [1, 256, 200, 200, 4] -> [1, 192, 100, 100], BEV fusion + encoding
        return x, None, features_2d
    @staticmethod
    def _compute_projection(img_meta, stride, noise=0):
        # camera intrinsics
        intrinsic = torch.tensor(img_meta["lidar2img"]["intrinsic"][:3, :3])
        intrinsic[:2] /= stride  # scale the intrinsics by the feature-map stride
        for extrinsic in extrinsics:
            if noise > 0:
                ...
            else:
                projection.append(intrinsic @ extrinsic[:3])  # intrinsics @ extrinsics
        return torch.stack(projection)

def get_points(n_voxels, voxel_size, origin):
    points = torch.stack(...)
    points = points * ...  # [3, 200, 200, 4], metric (ego-frame) coordinates of each voxel center
    return points
def backproject_inplace(...):
    '''
    function: 2d feature + predefined point cloud -> 3d volume
    input:
        features: [6, 64, 64, 176]
        points: [3, 200, 200, 4]
        projection: [6, 3, 4]
    output:
        volume: [64, 200, 200, 4]  # 2D features scattered onto the 3D point cloud
    '''
    n_images, n_channels, height, width = features.shape  # [6, 64, 64, 176]
    n_x_voxels, n_y_voxels, n_z_voxels = points.shape[-3:]  # [200, 200, 4], the 200 * 200 * 4 point cloud shared by all cameras
    points = points.view(1, 3, -1).expand(n_images, 3, -1)  # expand the points for the 6 cameras
    points = torch.cat((points, torch.ones_like(points[:, :1])), dim=1)  # homogeneous coordinates (x, y, z, 1)
    # 3D -> 2D: [6, 3, 160000], image coordinates of every point in every camera
    points_2d_3 = torch.bmm(projection, points)
    valid = ...  # after projecting to pixel coordinates, keep only the points that fall inside the image
    volume = torch.zeros(...)  # [64, 160000]: 160000 points, each with a 64-dim feature
    for i in range(n_images):  # iterate over the surround-view images of the current frame
        # scatter the valid 2D features into the volume
        volume[:, valid[i]] = features[i, :, y[i, valid[i]], x[i, valid[i]]]
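To make the back-projection step easier to play with, here is a small self-contained sketch of the same idea (my own simplification, not the repo's exact code; the shapes follow the comments above: 6 cameras, 64-channel features, a 200x200x4 voxel grid):
import torch

def get_points_sketch(n_voxels, voxel_size, origin):
    # voxel-center coordinates in the ego frame, shape [3, X, Y, Z]
    grid = torch.stack(torch.meshgrid(
        torch.arange(n_voxels[0]), torch.arange(n_voxels[1]), torch.arange(n_voxels[2]),
        indexing='ij')).float()
    n_vox = torch.tensor(n_voxels).view(3, 1, 1, 1).float()
    voxel_size = torch.tensor(voxel_size).view(3, 1, 1, 1)
    origin = torch.tensor(origin).view(3, 1, 1, 1)
    return (grid + 0.5 - n_vox / 2.0) * voxel_size + origin

def backproject_sketch(features, points, projection):
    # features: [n_img, C, H, W], points: [3, X, Y, Z], projection: [n_img, 3, 4]
    n_img, n_ch, h, w = features.shape
    n_x, n_y, n_z = points.shape[-3:]
    pts = points.view(1, 3, -1).expand(n_img, 3, -1)            # [n_img, 3, N]
    pts = torch.cat((pts, torch.ones_like(pts[:, :1])), dim=1)  # homogeneous: [n_img, 4, N]
    pts_2d = torch.bmm(projection, pts)                         # [n_img, 3, N]
    x = (pts_2d[:, 0] / pts_2d[:, 2]).round().long()            # pixel column
    y = (pts_2d[:, 1] / pts_2d[:, 2]).round().long()            # pixel row
    z = pts_2d[:, 2]                                            # depth
    valid = (x >= 0) & (x < w) & (y >= 0) & (y < h) & (z > 0)   # inside the image and in front of the camera
    volume = torch.zeros((n_ch, pts.shape[-1]), dtype=features.dtype)
    for i in range(n_img):                                      # later cameras overwrite earlier ones
        volume[:, valid[i]] = features[i, :, y[i, valid[i]], x[i, valid[i]]]
    return volume.view(n_ch, n_x, n_y, n_z)                     # [C, X, Y, Z]

feats = torch.rand(6, 64, 64, 176)
pts = get_points_sketch([200, 200, 4], [0.5, 0.5, 1.5], [0.0, 0.0, 1.0])  # origin value is an arbitrary example
# toy pinhole projection shared by all 6 cameras; the real code builds one per camera from intrinsic @ extrinsic
proj = torch.tensor([[400., 0., 352., 0.],
                     [0., 400., 128., 0.],
                     [0., 0., 1., 0.]]).unsqueeze(0).repeat(6, 1, 1)
print(backproject_sketch(feats, pts, proj).shape)  # torch.Size([64, 200, 200, 4])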
2、mmdet3d/models/dense_heads/anchor3d_head.py
class Anchor3DHead(...):
    def __init__(...):
        ...

    def forward_single(...):
        cls_score = self.conv_cls(x)  # [1, 80, 100, 100], classification branch
        bbox_pred = self.conv_reg(x)  # [1, 72, 100, 100], box regression branch
        if self.use_direction_classifier:
            dir_cls_preds = self.conv_dir_cls(x)  # direction classification branch
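The channel counts of the two branches follow directly from the anchor setup described in the next section (10 classes, 9-dim boxes, 8 anchors per BEV location; the 9-dim encoding is assumed to be the usual nuScenes x, y, z, w, l, h, yaw, vx, vy). A quick check:
num_anchors, num_classes, box_code_size = 8, 10, 9
print(num_anchors * num_classes)    # 80 -> matches cls_score [1, 80, 100, 100]
print(num_anchors * box_code_size)  # 72 -> matches bbox_pred [1, 72, 100, 100]
print(100 * 100 * num_anchors)      # 80000 anchors in total, as used in the loss below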
3、mmdet3d/models/dense_heads/free_anchor3d_head.py
"""Calculate loss of FreeAnchor head.
Args:
cls_scores (list[torch.Tensor]): Classification scores of
different samples.
bbox_preds (list[torch.Tensor]): Box predictions of
different samples
dir_cls_preds (list[torch.Tensor]): Direction predictions of
different samples
gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes.
gt_labels (list[torch.Tensor]): Ground truth labels.
input_metas (list[dict]): List of input meta information.
gt_bboxes_ignore (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth boxes that should be ignored. Defaults to None.
Returns:
dict[str, torch.Tensor]: Loss items.
- positive_bag_loss (torch.Tensor): Loss of positive samples.
- negative_bag_loss (torch.Tensor): Loss of negative samples.
"""
# 10个类别
anchors = ... # [80000, 9]为每个特征点生成 8个锚框,共4个尺度,每个尺度两种锚框
cls_scores = ... #[1, 80000, 10] 为每个特征点的8个anchor预测类别
bbox_preds = torch.cat(bbox_preds, dim=1) # [1, 80000, 9] # 为每个特征点的8个anchor预测边界框
dir_cls_preds = torch.cat(dir_cls_preds, dim=1) # [1, 80000, 2] 为每个特征点的8个anchor预测方向
for ... in ...:
gt_bboxes_ = gt_bboxes_.tensor.to(anchors_.device) # 当前帧的GT
with torch.no_grad():
pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_) # bbox_preds_是预测的偏移量
object_box_iou = bbox_overlaps_nearest_3d(gt_bboxes_, pred_boxes) # 每个锚框与gt计算一个IOU
.....
loss_bbox = self.loss_bbox() # 预测框损失
positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos) # 正样本损失
negative_loss = ... # 负样本损失
return losses
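For reference, the two loss terms named in the returned dict can be summarized with the following simplified sketch (based on the FreeAnchor formulation; the actual implementation adds eps terms and per-anchor weighting, so treat this only as an illustration):
import torch
import torch.nn.functional as F

def positive_bag_loss(matched_prob):
    # matched_prob: [num_gt, bag_size], joint cls * loc probability of each candidate anchor in a GT's bag
    weight = 1 / (1 - matched_prob).clamp(min=1e-12)
    weight = weight / weight.sum(dim=1, keepdim=True)  # "mean-max": the best anchor in the bag dominates
    bag_prob = (weight * matched_prob).sum(dim=1)      # one matching probability per GT
    return F.binary_cross_entropy(bag_prob, torch.ones_like(bag_prob), reduction='none')

def negative_bag_loss(cls_prob, gamma=2.0):
    # cls_prob: probability that an anchor fires although it should be background (focal-style down-weighting)
    return cls_prob ** gamma * F.binary_cross_entropy(
        cls_prob, torch.zeros_like(cls_prob), reduction='none')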
FastBEV offers a more lightweight way of extracting BEV features together with a BEV feature encoding network, and it also introduces temporal feature fusion, which many current works adopt. I hope to look into how to deploy it on embedded hardware next.