Contents
1. Data processing — 1.1 Dataset split, 1.2 Converting the data to HDF5, 1.3 Encoding
2. Network model — 2.1 DarkNet19, 2.2 yolo_body + decoder
3. Loss function — 3.1 Positive-sample loss, 3.2 Negative-sample loss, 3.3 Class loss, 3.4 Box loss
4. Training — 4.1 Loading the data, 4.2 Loading the model, 4.3 Loss function, 4.4 Updating the parameters
5. Prediction — 5.1 Data processing, 5.2 Prediction, 5.3 Filtering, 5.4 Drawing boxes
data_process/datasets_split_1.py
aim : split the samples into training, validation and test sets; each split file stores image names only.
input : xml_path, base_path, trainval_radio, train_radio
output : base_path+trainval.txt, base_path+train.txt, base_path+val.txt, base_path+test.txt
process:
1. Collect all sample names from the files in xml_path.
2. Compute the size of each split from trainval_radio and train_radio, then randomly sample that many indices from the full sample list.
3. Write each sample name into the file of the split its index belongs to. A quick sanity check follows the code.
import random, os

xml_path = '../VOCdevkit/VOC2007/Annotations'      # all samples
base_path = '../VOCdevkit/VOC2007/ImageSets/Main'
trainval_radio = 0.9   # fraction of all samples used for train+val
train_radio = 0.9      # fraction of train+val used for training

names_list = []
img_names = os.listdir(xml_path)
for name in img_names:
    if name.endswith('.xml'):
        names_list.append(name[:-4])

N = len(names_list)                                   # total number of samples
trainval_num = int(N * trainval_radio)                # size of train+val
train_num = int(trainval_num * train_radio)           # size of train
trainval_idx = random.sample(range(N), trainval_num)  # indices of train+val samples
train_idx = random.sample(trainval_idx, train_num)    # indices of train samples

# split files
ftrain_val = open(os.path.join(base_path, 'trainval.txt'), 'w')
ftrain = open(os.path.join(base_path, 'train.txt'), 'w')
fval = open(os.path.join(base_path, 'val.txt'), 'w')
ftest = open(os.path.join(base_path, 'test.txt'), 'w')

# write each name into the file of the split its index belongs to
for i in range(N):
    name = names_list[i] + '\n'
    if i in trainval_idx:
        ftrain_val.write(name)
        if i in train_idx:
            ftrain.write(name)
        else:
            fval.write(name)
    else:
        ftest.write(name)

ftrain_val.close()
ftrain.close()
fval.close()
ftest.close()
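As a sanity check, the snippet below (a sketch assuming the paths above) counts the names written to each split; with trainval_radio = 0.9 and train_radio = 0.9 roughly 81% / 9% / 10% of the samples should land in train / val / test.

# Sketch: count the sample names in each generated split file.
import os

base_path = '../VOCdevkit/VOC2007/ImageSets/Main'
for split in ('trainval', 'train', 'val', 'test'):
    with open(os.path.join(base_path, split + '.txt')) as f:
        print(split, len(f.read().split()))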
data_process/data2hdf5_2.py
input : the dataset split files
output : pascal_voc_07_12_LS.hdf5
process:
1. Collect the sample ids of each split. train_set --> get_ids(voc_path, train_set) --> train_ids
2. Create voc_h5file and define the dtypes used to store images and boxes. Create one group per split and store 'classes' as a file attribute. In each group create the images and boxes datasets.
3. train_ids, train_images, train_boxes --> add_to_dataset();
img_id --> get_img(voc_path, year, img_id); get_boxes(voc_path, year, img_id) --> img_data; img_box
Code
import numpy as np
import os, h5py, argparse
import xml.etree.ElementTree as ElementTree

sets_from_2007 = [('2007', 'train'), ('2007', 'val')]
train_set = [('2007', 'train')]
val_set = [('2007', 'val')]
test_set = [('2007', 'test')]

classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

parser = argparse.ArgumentParser(
    description='Convert Pascal VOC 2007 detection dataset to HDF5')
parser.add_argument('-p', '--path_to_voc',
                    help='path to VOCdevkit directory',
                    default='../VOCdevkit')

def get_ids(voc_path, datasets):
    '''Collect the sample ids of the given splits.'''
    ids = []
    for year, set in datasets:
        id_path = os.path.join(voc_path, 'VOC%s/ImageSets/Main/%s.txt' % (year, set))
        print(id_path)
        with open(id_path, 'r') as f:
            ids.extend(f.read().strip().split())
    return ids

def get_img(voc_path, year, img_id):
    '''Read the raw bytes of one image.'''
    img_path = os.path.join(voc_path, 'VOC%s/JPEGImages/%s.jpg' % (year, img_id))
    with open(img_path, 'rb') as f:
        data = f.read()
    return np.frombuffer(data, dtype='uint8')  # [n,]

def get_boxes(voc_path, year, img_id):
    '''Read the bounding boxes of one image.'''
    boxes_path = os.path.join(voc_path, 'VOC%s/Annotations/%s.xml' % (year, img_id))
    with open(boxes_path, 'r') as f:
        xml_tree = ElementTree.parse(f)
    root = xml_tree.getroot()
    boxes = []
    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        xml_box = obj.find('bndbox')
        bbox = (int(xml_box.find('xmin').text),
                int(xml_box.find('ymin').text),
                int(xml_box.find('xmax').text),
                int(xml_box.find('ymax').text),
                classes.index(cls))
        boxes.extend(bbox)
    return np.array(boxes)  # flattened [n*5,]

def add_to_dataset(voc_path, year, ids, images, boxes, start=0):
    '''Iterate over the samples and store each image and its boxes.'''
    for i, img_id in enumerate(ids):
        img_data = get_img(voc_path, year, img_id)
        img_box = get_boxes(voc_path, year, img_id)
        images[start + i] = img_data
        boxes[start + i] = img_box
    return i

def _main(args):
    voc_path = os.path.expanduser(args.path_to_voc)
    # 1 collect the sample ids of each split
    train_ids = get_ids(voc_path, train_set)
    val_ids = get_ids(voc_path, val_set)
    test_ids = get_ids(voc_path, test_set)
    train_ids_2007 = get_ids(voc_path, sets_from_2007)
    total_train_ids = len(train_ids) + len(train_ids_2007)

    # 2 create the HDF5 file, dtypes and groups
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path, 'pascal_voc_07_12_LS.hdf5')
    voc_h5file = h5py.File(fname, 'w')
    uint8_dt = h5py.special_dtype(vlen=np.dtype('uint8'))  # variable-length uint8
    int_dt = h5py.special_dtype(vlen=np.dtype(int))
    train_group = voc_h5file.create_group('train')
    val_group = voc_h5file.create_group('val')
    test_group = voc_h5file.create_group('test')
    # store the class list; not actually used later
    voc_h5file.attrs['classes'] = np.string_(str.join(',', classes))

    # 3 create the images / boxes containers
    train_images = train_group.create_dataset('images', shape=(total_train_ids,), dtype=uint8_dt)
    val_images = val_group.create_dataset('images', shape=(len(val_ids),), dtype=uint8_dt)
    test_images = test_group.create_dataset('images', shape=(len(test_ids),), dtype=uint8_dt)
    train_boxes = train_group.create_dataset('boxes', shape=(total_train_ids,), dtype=int_dt)
    val_boxes = val_group.create_dataset('boxes', shape=(len(val_ids),), dtype=int_dt)
    test_boxes = test_group.create_dataset('boxes', shape=(len(test_ids),), dtype=int_dt)

    # 4 load the data (only VOC 2007 is used here, despite the file name)
    print('Processing Pascal VOC 2007 train+val for the training set.')
    last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images, train_boxes)
    print('Processing Pascal VOC 2007 training set.')
    add_to_dataset(voc_path, '2007', train_ids, train_images, train_boxes, start=last_2007 + 1)
    print('Processing Pascal VOC 2007 val set.')
    add_to_dataset(voc_path, '2007', val_ids, val_images, val_boxes)
    print('Processing Pascal VOC 2007 test set.')
    add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)
    print('Closing HDF5 file.')
    voc_h5file.close()
    print('Done.')

if __name__ == '__main__':
    _main(parser.parse_args())
    # voc_path = parser.parse_args().path_to_voc
    # datasets = [('2007','train')]
    # ids = get_ids(voc_path,datasets)
    # # print(ids)
    # img = get_img(voc_path,year='2007',img_id='000025')
    # box = get_boxes(voc_path,year='2007',img_id='000025')
    # print(box.reshape(-1,5))
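A minimal read-back sketch (assuming the HDF5 file above was created) that decodes one stored sample: the image is kept as a variable-length uint8 JPEG byte stream, the boxes as a flattened [x1, y1, x2, y2, cls] array.

import io
import h5py
from PIL import Image

with h5py.File('../VOCdevkit/pascal_voc_07_12_LS.hdf5', 'r') as f:
    raw = f['train/images'][0]                  # raw JPEG bytes (uint8)
    boxes = f['train/boxes'][0].reshape(-1, 5)  # one row per object
    img = Image.open(io.BytesIO(raw.tobytes()))
    print(img.size, boxes)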
data_process/data_encoder_3.py
input : data_path,anchors_path,idx
output : processed_images[n,3,416,416],out[n,13,13,5,4+1+5]
process:
1. Read the image, box and class data. processed_images, processed_boxes = self.process_data(idx)
2. Encode the boxes into ground-truth offsets and cls (the encoding formulas are given below). out = self.encoder(processed_boxes)
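For reference, the encoding is the standard YOLOv2 target transform. A ground-truth box $(b_x, b_y, b_w, b_h)$ in grid units, assigned to cell $(i, j)$ and best-matching anchor $(p_w, p_h)$, is stored in matching_true_boxes as

$$t_x = b_x - i, \qquad t_y = b_y - j, \qquad t_w = \log\frac{b_w}{p_w}, \qquad t_h = \log\frac{b_h}{p_h}$$

which is exactly what adjusted_box computes in the code.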
Code
import numpy as np
import io, os, PIL, h5py, argparse
from PIL import Image
import torch
import torch.utils.data as data

YOLO_ANCHORS = np.array(
    ((0.57273, 0.677385), (1.87446, 2.06253), (3.33843, 5.47434),
     (7.88282, 3.52778), (9.77052, 9.16828)))

def get_classes(classes_path):
    with open(classes_path) as f:
        class_name = f.read().strip().split()
    return class_name

def get_anchors(anchors_path):
    if os.path.isfile(anchors_path):
        with open(anchors_path) as f:
            anchors = f.read().strip().split()
        return np.array(list(map(float, anchors))).reshape(-1, 2)
    else:
        Warning('Could not open anchors file, using default.')
        return YOLO_ANCHORS

class yoloDataset(data.Dataset):
    image_size = [416, 416]

    def __init__(self, data_path, anchors_path):
        self.anchors = self.get_anchors(anchors_path)
        data = h5py.File(data_path, 'r')
        self.images = data['train/images'][:]
        self.boxes = data['train/boxes'][:]
        # 1 the maximum number of boxes in any single image
        self.max_num = 0
        self.num_samples = len(self.boxes)
        self.flag = self.boxes is not None
        if self.flag:
            for i in range(self.num_samples):
                self.boxes[i] = self.boxes[i].reshape(-1, 5)
                if self.max_num < self.boxes[i].shape[0]:
                    self.max_num = self.boxes[i].shape[0]

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        processed_images, processed_boxes = self.process_data(idx)
        out = self.encoder(processed_boxes)
        return torch.tensor(processed_images), torch.tensor(out)

    def get_anchors(self, anchors_path):
        if os.path.isfile(anchors_path):
            with open(anchors_path) as f:
                anchors = f.read().strip().split()
            return np.array(list(map(float, anchors))).reshape(-1, 2)
        else:
            Warning('Could not open anchors file, using default.')
            return YOLO_ANCHORS

    def process_data(self, idx):
        '''
        aim :
            1. Normalize the image to 0~1 and move channels first.
            2. box [x1,y1,x2,y2] --> [cx,cy,w,h], relative to the original image;
               the boxes of each image are padded with zeros to shape [max_num, 5].
        inputs: idx
        outputs: np.array(img), np.array(new_box)
        '''
        images = self.images[idx]
        boxes = self.boxes[idx]
        img = Image.open(io.BytesIO(images))
        img_shape = np.array(img.size)  # (w, h)
        img = img.resize(self.image_size, PIL.Image.BICUBIC)  # (416, 416)
        img = np.array(img, np.float32) / 255.
        img = np.transpose(img, (2, 0, 1))
        if self.flag:
            box = np.concatenate([(boxes[:, 2:4] + boxes[:, :2]) * 0.5 / img_shape,
                                  (boxes[:, 2:4] - boxes[:, :2]) / img_shape,
                                  boxes[:, 4:5]], 1)
            new_box = np.zeros((self.max_num, 5), dtype=np.float32)
            new_box[:len(box), :] = box  # box (cx, cy, w, h, cls)
            return np.array(img), np.array(new_box)
        else:
            return np.array(img), None

    def encoder(self, boxes):
        '''
        one picture
        aim : map the ground-truth boxes onto the feature map.
            1. the values of the true boxes on the feature map;
            2. the grid indices of the true boxes on the feature map;
            3. the regression offsets to predict.
        inputs: boxes [max_num_box, 5 (cx, cy, w, h, cls)], anchors [5, 2]
                max_num_box = 10 ; image_size = [416, 416]
        outputs (concatenated along the last axis):
            true_boxes:          [h, w, num_anchors, 4]
            detectors_mask:      [h, w, num_anchors, 1]  e.g. (13, 13, 5, 1)
            matching_true_boxes: [h, w, num_anchors, 5]  e.g. (13, 13, 5, 5)
        '''
        # 1 create the templates
        h, w = self.image_size
        num_anchors = len(self.anchors)
        num_box_params = boxes.shape[1]
        assert h % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
        assert w % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
        grid_h = h // 32  # 13
        grid_w = w // 32
        true_boxes = np.zeros([grid_h, grid_w, num_anchors, 4], dtype=np.float32)
        detectors_mask = np.zeros([grid_h, grid_w, num_anchors, 1], dtype=np.float32)  # (13, 13, 5, 1)
        matching_true_boxes = np.zeros([grid_h, grid_w, num_anchors, num_box_params], dtype=np.float32)  # (13, 13, 5, 5)

        # 2 encode
        boxes = boxes[boxes[:, 2] > 0]  # drop the zero-padded rows (they would produce log(0))
        box_class = boxes[:, 4]  # [n,]
        box = boxes[:, :4] * np.array([grid_w, grid_h, grid_w, grid_h])
        i, j = list(map(int, box[:, 0])), list(map(int, box[:, 1]))
        best_idx = self.iou_wh(box[:, 2:4], self.anchors)  # match anchors by (w, h); was box[:, :2] (the centers), a bug
        true_boxes[i, j, best_idx] = boxes[:, :4]  # relative (cx, cy, w, h), same scale as the decoded predictions
        detectors_mask[i, j, best_idx] = 1
        adjusted_box = np.array(
            [box[:, 0] - i,
             box[:, 1] - j,
             np.log(box[:, 2] / self.anchors[best_idx][:, 0]),
             np.log(box[:, 3] / self.anchors[best_idx][:, 1]),
             box_class],
            dtype=np.float32).T
        matching_true_boxes[i, j, best_idx] = adjusted_box
        out = np.concatenate([np.array(true_boxes), np.array(detectors_mask), np.array(matching_true_boxes)], -1)
        return out  # true_boxes, detectors_mask, matching_true_boxes

    def iou_wh(self, boxes_wh, anchors_wh):
        '''boxes_wh [n,2], anchors_wh [m,2] --> best anchor index per box'''
        boxes_wh = np.expand_dims(boxes_wh, 1)      # [10, 1, 2]
        anchors_wh = np.expand_dims(anchors_wh, 0)  # [1, 5, 2]
        box_max = boxes_wh / 2.
        box_min = -box_max
        anchor_max = anchors_wh / 2.
        anchor_min = -anchor_max
        inter_mins = np.maximum(box_min, anchor_min)  # [10, 5, 2]
        inter_maxs = np.minimum(box_max, anchor_max)
        inter_wh = np.maximum(inter_maxs - inter_mins, 0.)
        inter_area = inter_wh[..., 0] * inter_wh[..., 1]  # [10, 5]
        boxes_area = boxes_wh[..., 0] * boxes_wh[..., 1]
        anchors_area = anchors_wh[..., 0] * anchors_wh[..., 1]  # [1, 5]
        iou = inter_area / (boxes_area + anchors_area - inter_area)  # [10, 5]
        best_iou = np.max(iou, 1)
        best_idx = np.argmax(iou, 1)
        return list(best_idx * (best_iou > 0))

if __name__ == '__main__':
    from torch.utils.data import DataLoader
    data_path = '../VOCdevkit/pascal_voc_07_12_LS.hdf5'
    anchors_path = '../model_data/anchors.txt'  # was '../model_data.pascal_classes.txt', a wrong path
    train_dataset = yoloDataset(data_path, anchors_path)  # [3, 416, 416], [13, 13, 5, 10]
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)
    for i, (img, boxes) in enumerate(train_loader):
        print(img.shape)    # torch.Size([1, 3, 416, 416])
        print(boxes.shape)  # torch.Size([1, 13, 13, 5, 10])  4+1+5
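A worked example (standalone numpy sketch, same math as iou_wh) of how a box is matched to an anchor: because every box is centered at the origin, the intersection reduces to min(w)·min(h).

import numpy as np

wh = np.array([[3.5, 5.2]])  # one ground-truth box, 3.5 x 5.2 grid cells
anchors = np.array(((0.57273, 0.677385), (1.87446, 2.06253), (3.33843, 5.47434),
                    (7.88282, 3.52778), (9.77052, 9.16828)))
inter = np.minimum(wh[:, None], anchors[None]).prod(-1)  # [1, 5]
iou = inter / (wh.prod(-1)[:, None] + anchors.prod(-1)[None] - inter)
print(iou.argmax(1))  # [2] --> the (3.338, 5.474) anchor fits best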
nets/darketnet19.py
input : img[b,3,416,416]
output : feas[b,1024,13,13]
process:
1.features_26 = (cov_bn_leaky3 --> maxpool)*2 -->
(bottleneck_block --> maxpool)*2 -->
bottleneck_x2_block
2.features_13 = features_26 --> maxpool --> bbx22 (a second bottleneck_x2_block)
A shape check follows the code below.
Code
import torch
import torch.nn as nn
import math

def cov_bn_leaky3(inplanes, outplanes):
    return nn.Sequential(
        nn.Conv2d(inplanes, outplanes, kernel_size=3, padding=1),
        nn.BatchNorm2d(outplanes),
        nn.LeakyReLU(0.1)
    )

def cov_bn_leaky1(inplanes, outplanes):
    return nn.Sequential(
        nn.Conv2d(inplanes, outplanes, kernel_size=1),
        nn.BatchNorm2d(outplanes),
        nn.LeakyReLU(0.1)
    )

def bottleneck_block(inplanes, outplanes, bottleneck_filters):
    return nn.Sequential(
        cov_bn_leaky3(inplanes, outplanes),
        cov_bn_leaky1(outplanes, bottleneck_filters),
        cov_bn_leaky3(bottleneck_filters, outplanes)
    )

def bottleneck_x2_block(inplanes, outplanes, bottleneck_filters):
    return nn.Sequential(
        bottleneck_block(inplanes, outplanes, bottleneck_filters),
        cov_bn_leaky1(outplanes, bottleneck_filters),
        cov_bn_leaky3(bottleneck_filters, outplanes)
    )

class darknet_body(nn.Module):
    def __init__(self):
        super(darknet_body, self).__init__()
        self.cbl1 = cov_bn_leaky3(3, 32)
        self.cbl2 = cov_bn_leaky3(32, 64)
        self.bb1 = bottleneck_block(64, 128, 64)
        self.bb2 = bottleneck_block(128, 256, 128)
        self.bbx21 = bottleneck_x2_block(256, 512, 256)
        self.bbx22 = bottleneck_x2_block(512, 1024, 512)
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.features_26 = nn.Sequential(self.cbl1, self.maxpool, self.cbl2, self.maxpool,
                                         self.bb1, self.maxpool, self.bb2, self.maxpool, self.bbx21)
        self.features_13 = nn.Sequential(self.features_26, self.maxpool, self.bbx22)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, x):
        # out = self.cbl1(x)       # [1, 32, 416, 416]
        # out = self.maxpool(out)  # [1, 32, 208, 208]
        # out = self.cbl2(out)     # [1, 64, 208, 208]
        # out = self.maxpool(out)  # [1, 64, 104, 104]
        # out = self.bb1(out)      # [1, 128, 104, 104]
        # out = self.maxpool(out)  # [1, 128, 52, 52]
        # out = self.bb2(out)      # [1, 256, 52, 52]
        # out = self.maxpool(out)  # [1, 256, 26, 26]
        # out = self.bbx21(out)    # [1, 512, 26, 26]
        # out = self.maxpool(out)  # [1, 512, 13, 13]
        # out = self.bbx22(out)    # [1, 1024, 13, 13]
        x = self.features_13(x)
        return x

def darknet19(inputs):
    """Generate Darknet-19 model for ImageNet classification."""
    body = darknet_body()(inputs)
    logits = nn.Conv2d(1024, 1000, (1, 1))(body)
    logits = nn.Softmax(1)(logits)
    return logits

if __name__ == '__main__':
    x = torch.randn([1, 3, 416, 416])
    # y = cov_bn_leaky1(3,10)(x)
    # y = bottleneck_block(3,30,20)
    # y = bottleneck_x2_block(3,30,20)(x)
    # net = darknet_body()
    # y = net(x)
    y = darknet_body()
    print('y.features_26 :', y.features_26)
    print('\n')
    print('y.bbx22 :', y.bbx22)
    # for i in y.children():
    #     print(i)
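A quick shape check (sketch, run from the repository root): five stride-2 max-pools divide 416 by 32, so the two taps come out at 26x26 and 13x13.

import torch
from nets.darketnet19 import darknet_body

net = darknet_body()
x = torch.randn(1, 3, 416, 416)
print(net.features_26(x).shape)  # torch.Size([1, 512, 26, 26])
print(net.features_13(x).shape)  # torch.Size([1, 1024, 13, 13])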
nets/yolo_model.py
(1) yolo_body
input : [1, 3, 416, 416]
output : [1, 13, 13, 125]
process:
1. fea_26, fea_13
2. torch.cat([fea_26, fea_13], 1) --> cov_bn_leaky3, cov_bn_leaky1 --> transpose
(2) yolo_decoder
inputs:
feats: tensor, [None, 13, 13, 125]
anchors: array-like, anchor box widths and heights. (5, 2)
num_classes: int, number of target classes. 20
outputs:
box_xy [1, 13, 13, 5, 2]
box_wh [1, 13, 13, 5, 2]
box_conf [1, 13, 13, 5, 1]
box_class_pred [1, 13, 13, 5, 20]
process: apply the decoding formulas below; this is the inverse of the encoding step.
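The decoding formulas (standard YOLOv2; $W = H = 13$ is the grid size, $(c_x, c_y)$ the cell offset and $(p_w, p_h)$ the anchor size, so all outputs are relative to the whole image):

$$b_x = \frac{\sigma(t_x) + c_x}{W}, \qquad b_y = \frac{\sigma(t_y) + c_y}{H}, \qquad b_w = \frac{p_w\,e^{t_w}}{W}, \qquad b_h = \frac{p_h\,e^{t_h}}{H}$$

$$\text{conf} = \sigma(t_o), \qquad p(c) = \operatorname{softmax}(t_c)$$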
Code
import sys
sys.path.append('..')  # add the parent directory so that the nets package is importable
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
from nets.darketnet19 import cov_bn_leaky1, cov_bn_leaky3, darknet_body

voc_anchors = np.array(
    [[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])

voc_classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

def grid(h, w):
    '''Cell offsets for every position of an h x w grid, shape [h*w, 2].'''
    cx = torch.repeat_interleave(torch.arange(h), w).view(-1, 1)
    cy = torch.Tensor.repeat(torch.arange(w), h).view(-1, 1)
    return torch.cat([cx, cy], 1)

class yolo_body(nn.Module):
    def __init__(self, num_anchors=5, num_classes=20):
        super(yolo_body, self).__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        self.darknet = darknet_body()
        self.fea_13 = nn.Sequential(self.darknet.features_13,
                                    cov_bn_leaky3(1024, 1024),
                                    cov_bn_leaky3(1024, 1024))
        self.fea_26 = nn.Sequential(self.darknet.features_26, cov_bn_leaky1(512, 64))
        # the head must be built here rather than in forward(); layers created
        # inside forward() are re-initialized on every call and never trained
        self.head = nn.Sequential(
            cov_bn_leaky3(1280, 1024),
            cov_bn_leaky1(1024, self.num_anchors * (self.num_classes + 5)))

    def pass_through(self, x):
        '''space-to-depth: [b, c, 2h, 2w] --> [b, 4c, h, w]'''
        return torch.cat([x[:, :, ::2, ::2], x[:, :, ::2, 1::2],
                          x[:, :, 1::2, ::2], x[:, :, 1::2, 1::2]], 1)

    def forward(self, x):
        fea_13 = self.fea_13(x)               # [b, 1024, 13, 13]
        fea_26 = self.fea_26(x)               # [b, 64, 26, 26]
        fea_26 = self.pass_through(fea_26)    # [b, 256, 13, 13]
        out = torch.cat([fea_26, fea_13], 1)  # [b, 1280, 13, 13]
        out = self.head(out)                  # [b, 125, 13, 13]
        out = torch.transpose(out, 1, 3)
        return out  # inputs [1, 3, 416, 416] --> outputs [1, 13, 13, 125]

'''
def yolo_body(inputs, num_anchors=5, num_classes=20):
    darknet = darknet_body()
    features_26 = darknet.features_26
    features_13 = darknet.features_13
    fea_13 = nn.Sequential(features_13, cov_bn_leaky3(1024, 1024), cov_bn_leaky3(1024, 1024))(inputs)
    fea_26 = nn.Sequential(features_26, cov_bn_leaky1(512, 64))(inputs)
    fea_26 = pass_through(fea_26)
    out = torch.cat([fea_26, fea_13], 1)
    out = cov_bn_leaky3(1280, 1024)(out)
    out = cov_bn_leaky1(1024, num_anchors * (num_classes + 5))(out)
    out = torch.transpose(out, 1, 3)
    print('out.shape:', out.shape)
    return out  # inputs [1, 3, 416, 416] --> outputs [1, 13, 13, 125]
'''

def yolo_decoder(feats, anchors, num_classes):
    '''
    Convert final layer features to bounding box parameters.
    inputs:
        feats: tensor, [None, 13, 13, 125]
        anchors: array-like, anchor box widths and heights, (5, 2)
        num_classes: int, number of target classes, 20
    outputs: box_xy, box_wh, box_confidence, box_class_probs
    '''
    grids = feats.shape[1:3]    # torch.Size([13, 13])
    num_anchors = len(anchors)  # 5
    anchors_wh = Variable(torch.from_numpy(anchors)).view(1, 1, 1, num_anchors, 2)  # [1, 1, 1, 5, 2]
    anchors_cxy = grid(grids[0], grids[1]).view(-1, grids[0], grids[1], 1, 2)       # [1, 13, 13, 1, 2]
    feats = feats.view(-1, grids[0], grids[1], num_anchors, num_classes + 5)  # [1, 13, 13, 125] --> [1, 13, 13, 5, 25]
    box_xy = torch.sigmoid(feats[..., :2])               # [1, 13, 13, 5, 2]
    box_wh = torch.exp(feats[..., 2:4])                  # [1, 13, 13, 5, 2]
    box_confidence = torch.sigmoid(feats[..., 4:5])      # [1, 13, 13, 5, 1]
    box_class_probs = torch.softmax(feats[..., 5:], -1)  # [1, 13, 13, 5, 20]
    box_xy = (box_xy + anchors_cxy) / torch.tensor(list(grids))  # [1, 13, 13, 5, 2]
    box_wh = box_wh * anchors_wh / torch.tensor(list(grids))     # [1, 13, 13, 5, 2]
    return box_xy, box_wh, box_confidence, box_class_probs

if __name__ == '__main__':
    x = torch.randn([1, 3, 416, 416])
    net = yolo_body()
    params = []
    params_dict = dict(net.named_parameters())
    print(net(x).shape)  # torch.Size([1, 13, 13, 125])
    # box_xy, box_wh, box_confidence, box_class_probs = yolo_decoder(feats=net(x), anchors=voc_anchors, num_classes=20)
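A minimal sketch of what pass_through (the YOLOv2 "reorg"/space-to-depth layer) does: it moves 2x2 spatial blocks into the channel dimension, which is how [b, 64, 26, 26] becomes [b, 256, 13, 13] before being concatenated with the 13x13 feature map.

import torch

x = torch.arange(16.).view(1, 1, 4, 4)
y = torch.cat([x[:, :, ::2, ::2], x[:, :, ::2, 1::2],
               x[:, :, 1::2, ::2], x[:, :, 1::2, 1::2]], 1)
print(y.shape)  # torch.Size([1, 4, 2, 2])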
loss.py
input : pred(b, 13, 13, 125), target(b, 13, 13, 5, 10)
output : total_loss
process:
1. Data preparation
target --> true_boxes, detectors_mask, matching_true_boxes
pred --> sigmoid --> pred_d_boxes
pred --> yolo_decoder() --> pred_xy, pred_wh, pred_confidence, pred_class_prob
2. Positive-sample loss: best_iou / 1 - pred_confidence, detectors_mask --> objects_loss
3. Negative-sample loss: (pred_xy, pred_wh), true_boxes --> iou --> object_detections;
object_detections, detectors_mask, pred_confidence --> no_objects_loss
4. Class loss: matching_true_boxes[...,-1], pred_class_prob, detectors_mask --> classification_loss
5. Box loss: matching_true_boxes[...,:4], pred_d_boxes, detectors_mask --> coordinates_loss
The combined formula is given below.
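Putting the four parts together, the total loss as implemented below is (with $\mathbb{1}^{obj}$ = detectors_mask, $\hat{C}$ the predicted confidence, $d$ = object_detections, and the four $\lambda$ scales passed to the constructor):

$$L = \frac{1}{2}\sum_{i,j,a}\Big[\lambda_{obj}\,\mathbb{1}^{obj}(1-\hat{C})^2 + \lambda_{noobj}\,(1-\mathbb{1}^{obj})(1-d)\,\hat{C}^2 + \lambda_{cls}\,\mathbb{1}^{obj}\sum_{c}(p_c-\hat{p}_c)^2 + \lambda_{coord}\,\mathbb{1}^{obj}\sum_{t\in\{x,y,w,h\}}(t-\hat{t})^2\Big]$$

$d$ flags predictions whose IoU with the ground truth exceeds 0.6, so they are not penalized as background; with rescore_confidence=True the target $1$ in the first term is replaced by the actual IoU.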
Code
import torch
import numpy as np
import torch.nn as nn
from nets.yolo_model import yolo_decoder

voc_anchors = np.array(
    [[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])

'''
model_body.output     (b, 13, 13, 125)
detectors_mask_input  (b, 13, 13, 5, 1)
matching_boxes_input  (b, 13, 13, 5, 5)
'''

class yoloLoss(nn.Module):
    def __init__(self, object_scale, no_object_scale, class_scale,
                 coordinates_scale, anchors, num_classes,
                 rescore_confidence=False, print_loss=False):
        super(yoloLoss, self).__init__()
        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.class_scale = class_scale
        self.coordinates_scale = coordinates_scale
        self.rescore_confidence = rescore_confidence
        self.print_loss = print_loss
        self.anchors = anchors
        self.num_classes = num_classes

    def compute_iou(self, box_t, box_p):
        '''box_t [b,13,13,5,4], box_p [b,13,13,5,4], both (x1, y1, x2, y2)'''
        # lt, rd --> wh --> inter + areas --> iou
        lt = torch.maximum(box_t[..., :2], box_p[..., :2])
        rd = torch.minimum(box_t[..., 2:], box_p[..., 2:])
        wh = rd - lt
        wh[wh < 0] = 0
        inter = wh[..., 0] * wh[..., 1]  # [b, 13, 13, 5]
        area_t = (box_t[..., 3] - box_t[..., 1]) * (box_t[..., 2] - box_t[..., 0])
        area_p = (box_p[..., 3] - box_p[..., 1]) * (box_p[..., 2] - box_p[..., 0])
        iou = inter / (area_t + area_p - inter + 1e-10)  # epsilon: padded cells have zero area
        return iou  # [b, 13, 13, 5]

    def yolo_loss(self, pred, target):
        # 1 prepare the data
        num_anchors = len(self.anchors)
        yolo_output = pred                     # [b, 13, 13, 125]
        true_boxes = target[..., :4]           # [b, 13, 13, 5, 4]
        detectors_mask = target[..., 4:5]      # [b, 13, 13, 5, 1]
        matching_true_boxes = target[..., 5:]  # [b, 13, 13, 5, 5]
        pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_decoder(
            yolo_output, anchors=voc_anchors, num_classes=self.num_classes)
        # predicted offsets
        yolo_output_shape = yolo_output.shape[1:3]  # torch.Size([13, 13])
        feats = yolo_output.view(-1, yolo_output_shape[0], yolo_output_shape[1],
                                 num_anchors, self.num_classes + 5)  # [b, 13, 13, 5, 25]
        pred_d_boxes = torch.cat((torch.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)  # [b, 13, 13, 5, 4]

        # 2 IoU between true_boxes and (pred_xy, pred_wh)
        # true_boxes [b, 13, 13, 5, 4], pred_xy [b, 13, 13, 5, 2]
        true_box = torch.cat([(true_boxes[..., :2] - true_boxes[..., 2:4] / 2.),
                              (true_boxes[..., :2] + true_boxes[..., 2:4] / 2.)], -1)
        pred_box = torch.cat([(pred_xy - pred_wh / 2.), (pred_xy + pred_wh / 2.)], -1)
        iou = self.compute_iou(true_box, pred_box)  # [b, 13, 13, 5], element-wise per cell/anchor
        best_iou = iou.unsqueeze(-1)                # [b, 13, 13, 5, 1]; the original max(-1) collapsed the anchor dim
        object_detections = (best_iou > 0.6).float()  # [b, 13, 13, 5, 1]; cast: bool tensors cannot be subtracted

        # 3 losses
        # 3.1 no-object loss
        no_objects_loss = self.no_object_scale * (1 - object_detections) * \
                          (1 - detectors_mask) * torch.square(-pred_confidence)
        # 3.2 object loss
        if self.rescore_confidence:
            objects_loss = self.object_scale * detectors_mask * torch.square(best_iou - pred_confidence)
        else:
            objects_loss = self.object_scale * detectors_mask * torch.square(1 - pred_confidence)
        # 3.3 confidence loss = object loss + no-object loss; detectors_mask [b, 13, 13, 5, 1]
        confidence_loss = (objects_loss + no_objects_loss).sum()
        # 3.4 class loss; pred_class_prob [b, 13, 13, 5, 20]
        matching_classes = matching_true_boxes[..., 4]  # [b, 13, 13, 5]
        s1, s2, s3, s4 = matching_classes.shape
        one_hot = torch.eye(self.num_classes)
        matching_classes = one_hot[matching_classes.flatten().long()].view(s1, s2, s3, s4, self.num_classes)
        classification_loss = (self.class_scale * detectors_mask *
                               torch.square(matching_classes - pred_class_prob)).sum()
        # 3.5 box loss
        matching_boxes = matching_true_boxes[..., 0:4]
        coordinates_loss = (self.coordinates_scale * detectors_mask *
                            torch.square(matching_boxes - pred_d_boxes)).sum()
        total_loss = 0.5 * (confidence_loss + classification_loss + coordinates_loss)
        return total_loss

if __name__ == '__main__':
    print('PyCharm')
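A minimal smoke test (sketch, run from the repository root): a random head output against an all-background target, with the shapes from the comments above.

import torch
from loss import yoloLoss, voc_anchors

criterion = yoloLoss(object_scale=5, no_object_scale=1, class_scale=1,
                     coordinates_scale=1, anchors=voc_anchors, num_classes=20)
pred = torch.randn(1, 13, 13, 125)      # raw yolo_body output
target = torch.zeros(1, 13, 13, 5, 10)  # [true_boxes | detectors_mask | matching_true_boxes]
print(criterion.yolo_loss(pred, target).item())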
train.py
process:
1. Load the data
2. Load the model
3. Loss function
4. Update the parameters
Code
import torch
import numpy as np
from loss import yoloLoss
from torch.autograd import Variable
from nets.yolo_model import yolo_body
from torch.utils.data import DataLoader
from data_process.data_encoder_3 import get_classes, get_anchors, yoloDataset

# 1 parameters
use_gpu = False
learning_rate = 0.001
num_epochs = 1
batch_size = 1

# 2 model
net = yolo_body()
params = []
params_dict = dict(net.named_parameters())
for k, v in params_dict.items():
    # both branches currently use the same lr; kept so different parts
    # of the network can be fine-tuned at different rates later
    if k.startswith('features'):
        params += [{'params': [v], 'lr': learning_rate * 1}]
    else:
        params += [{'params': [v], 'lr': learning_rate * 1}]

# 3 loss + optimizer
anchors_path = 'model_data/anchors.txt'
classes_path = 'model_data/pascal_classes.txt'
anchors = get_anchors(anchors_path)
classes = get_classes(classes_path)
num_classes = len(classes)
cost = yoloLoss(5, 1, 1, 1, anchors, num_classes)
optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=0.9, weight_decay=5e-4)

# 4 data
data_path = 'VOCdevkit/pascal_voc_07_12_LS.hdf5'
train_dataset = yoloDataset(data_path, anchors_path)  # (b, 3, 416, 416), (b, 13, 13, 5, 10)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# 5 train
num_iter = 0
best_test_loss = np.inf
for epoch in range(num_epochs):
    net.train()
    if epoch == 30:
        learning_rate = 0.0001
    if epoch == 40:
        learning_rate = 0.00001
    for params_group in optimizer.param_groups:
        params_group['lr'] = learning_rate
    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))
    total_loss = 0.
    for i, (img, targets) in enumerate(train_loader):
        imgs = Variable(img).to(torch.float32)  # [b, 3, 416, 416]
        targets = Variable(targets)             # [b, 13, 13, 5, 10]
        pred = net(imgs)
        loss = cost.yolo_loss(pred, targets)
        total_loss += loss.data.item()  # was never accumulated, so average_loss always printed 0
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 5 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_loader),
                     loss.data.item(), total_loss / (i + 1)))
            num_iter += 1
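One thing the loop above never does is persist the weights, so predict.py would otherwise run with a randomly initialized network. A one-line sketch (the filename yolo.pth is hypothetical):

# Hypothetical: save the trained weights after the loop so that predict.py
# can restore them with net.load_state_dict(torch.load('yolo.pth')).
torch.save(net.state_dict(), 'yolo.pth')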
predict.py
process:
1. Image processing
2. Prediction
3. Filtering
4. Drawing boxes
Code
'''
# 1 img process
# 2 predict --> decoder
# 3 filter_boxes
# 4 draw
'''
import colorsys, os, torch, cv2
import numpy as np
import torchvision.transforms as transforms
from PIL import Image, ImageDraw, ImageFont
from nets.yolo_model import yolo_body, yolo_decoder
from data_process.data_encoder_3 import get_classes, get_anchors

def yolo_boxes_to_corners(box_xy, box_wh):
    '''(cx, cy, w, h) --> (y_min, x_min, y_max, x_max)'''
    box_mins = box_xy - (box_wh / 2.)
    box_maxes = box_xy + (box_wh / 2.)
    return torch.cat([box_mins[..., 1:2], box_mins[..., 0:1],
                      box_maxes[..., 1:2], box_maxes[..., 0:1]], -1)

def yolo_filter_boxes(boxes, box_confidence, box_class_probs, threshold=.6):
    '''
    inputs:
        boxes            [1, 13, 13, 5, 4]
        box_confidence   [1, 13, 13, 5, 1]
        box_class_probs  [1, 13, 13, 5, 20]
    outputs: boxes [n, 4], scores [n], classes [n]
    '''
    box_scores = box_confidence * box_class_probs  # [1, 13, 13, 5, 20]
    box_class_scores, box_classes = torch.max(box_scores, -1)  # [1, 13, 13, 5] x 2
    prediction_mask = box_class_scores >= threshold  # [1, 13, 13, 5]
    boxes = boxes[prediction_mask]              # [n, 4]
    scores = box_class_scores[prediction_mask]  # [n]
    classes = box_classes[prediction_mask]      # [n]
    return boxes, scores, classes

def nms(bboxes, scores, threshold=0.5):
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    _, order = scores.sort(0, descending=True)
    keep = []
    while order.numel() > 0:
        if order.numel() > 1:
            i = order[0]
        else:
            i = order
        keep.append(i)
        if order.numel() == 1:
            break
        xx1 = x1[order[1:]].clamp(min=x1[i])
        yy1 = y1[order[1:]].clamp(min=y1[i])
        xx2 = x2[order[1:]].clamp(max=x2[i])  # fixed: was clamp(max=x1[i])
        yy2 = y2[order[1:]].clamp(max=y2[i])  # fixed: was clamp(max=y1[i])
        w = (xx2 - xx1).clamp(min=0)
        h = (yy2 - yy1).clamp(min=0)
        inter = w * h
        ove = inter / (areas[i] + areas[order[1:]] - inter)
        ids = torch.nonzero(ove <= threshold).squeeze()
        if ids.numel() == 0:
            break
        order = order[ids + 1]
    return torch.LongTensor(keep)

def yolo_eval(yolo_outputs, image_shape=[416, 416],
              score_threshold=.6, iou_threshold=.5):
    '''
    score filter + NMS; image_shape is (h, w) of the original image
    box_xy [1,13,13,5,2], box_wh [1,13,13,5,2],
    box_confidence [1,13,13,5,1], box_class_probs [1,13,13,5,20]
    '''
    box_xy, box_wh, box_confidence, box_class_probs = yolo_outputs
    boxes = yolo_boxes_to_corners(box_xy, box_wh)  # [1, 13, 13, 5, 4] in (y, x, y, x) order
    # 1 score filter
    boxes, scores, classes = yolo_filter_boxes(
        boxes, box_confidence, box_class_probs, threshold=score_threshold)
    # map the relative boxes back onto the original image
    boxes = boxes * torch.tensor([image_shape[0], image_shape[1],
                                  image_shape[0], image_shape[1]])
    # 2 NMS
    keep = nms(boxes, scores, iou_threshold)
    return boxes[keep], scores[keep], classes[keep]

def detect_img():
    # 1 image processing
    image_name = '000015.jpg'
    image = cv2.imread('VOCdevkit/VOC2007/JPEGImages/' + image_name)  # e.g. (375, 500, 3), BGR
    h, w, _ = image.shape
    img = cv2.resize(image, (416, 416))  # (416, 416, 3)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.array(img, np.float32) / 255.
    transform = transforms.Compose([transforms.ToTensor()])
    img = transform(img)           # torch.Size([3, 416, 416])
    img = img[None, :, :, :].float()  # volatile Variables are gone; use torch.no_grad() below

    # 2 predict --> decoder
    net = yolo_body()
    net.eval()
    print('load model...')  # note: no trained weights are loaded here
    print('predicting...')
    with torch.no_grad():
        feas = net(img)
    anchors_path = 'model_data/anchors.txt'
    classes_path = 'model_data/pascal_classes.txt'
    anchors = get_anchors(anchors_path)
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    pred = yolo_decoder(feas, anchors, num_classes)
    # box_xy [1,13,13,5,2], box_wh [1,13,13,5,2], box_confidence [1,13,13,5,1], box_class_probs [1,13,13,5,20]

    # 3 filter boxes, scaling back to the original h x w image
    boxes, scores, classes = yolo_eval(pred, image_shape=[h, w])  # [n,4], [n], [n]
    print(boxes.shape, scores.shape, classes.shape)

    # 4 draw (ImageDraw needs a PIL image, not the cv2 array)
    image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    hsv_tuples = [(x / len(class_names), 1., 1.) for x in range(len(class_names))]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), colors))
    font = ImageFont.truetype(font='font/FiraMono-Medium.otf',
                              size=np.floor(3e-2 * h + 0.5).astype('int32'))
    thickness = (h + w) // 300
    for i, c in reversed(list(enumerate(classes))):
        predicted_class = class_names[c]
        box = boxes[i]
        score = scores[i]
        label = '{} {:.2f}'.format(predicted_class, score)
        draw = ImageDraw.Draw(image)
        label_size = draw.textsize(label, font)  # Pillow < 10
        top, left, bottom, right = box
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))
        print(label, (left, top), (right, bottom))
        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])
        # My kingdom for a good redistributable image drawing library.
        for t in range(thickness):  # renamed from i to avoid shadowing the outer loop variable
            draw.rectangle([left + t, top + t, right - t, bottom - t], outline=colors[c])
        draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=colors[c])
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
        del draw
    image.save(os.path.join('image', image_name), quality=90)

if __name__ == '__main__':
    detect_img()
Code download for this article:
Link: link
pwd=123a
References:
https://github.com/abeardear/pytorch-YOLO-v1
https://github.com/allanzelener/yad2k