The YOLOv5 v6.0 networks come in five sizes, scaled by depth and width: n, s, m, l and x. In most cases, to keep the model lightweight while preserving detection accuracy, YOLOv5s is chosen as the base model for improvement.
YOLOv5 consists of four main parts: the input stage (Input), the backbone network (Backbone), the neck network (Neck) and the detection head (Head). These parts work together to make the model an efficient object detector.
The backbone is the core of the model and extracts feature information from the image. The neck fuses the features extracted by the backbone to provide richer information to the detection head, which then localizes and classifies targets based on those features.
By picking a suitable model size and improving on the base model, YOLOv5 can deliver accurate and fast object detection.
Source code: https://github.com/ultralytics/yolov5
The pretrained weights can be downloaded from the official releases.
This project uses yolov5s.pt.
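As an optional sanity check (not part of the training pipeline below), the pretrained model can be pulled through torch.hub; this is a minimal sketch, and the test image URL is just an example:

```python
# Optional sanity check: load pretrained YOLOv5s via torch.hub and run one image.
# Requires the yolov5 dependencies and an internet connection.
import torch

model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # downloads yolov5s.pt on first use
results = model('https://ultralytics.com/images/zidane.jpg')  # example image URL
results.print()  # prints detected classes and counts
```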
WiderPerson is a benchmark dataset for pedestrian detection in crowded scenes; its images are no longer limited to traffic scenarios but are carefully selected from a wide variety of scenes. The dataset contains 13,382 images annotated with roughly 400k occlusion-aware labels. Following the dataset authors, 8,000, 1,000 and 4,382 images are randomly assigned to the training, validation and test sets respectively. As with the CityPersons and WIDER FACE datasets, the annotations for the test images are not released, to prevent potential cheating.
The dataset can be downloaded from: WiderPerson: A Diverse Dataset for Dense Pedestrian Detection in the Wild.
After downloading and extracting, the folder structure is as follows:
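In place of the original screenshot, the extracted dataset should look roughly like the sketch below; this layout is what the scripts in this post assume, and the Evaluation/ folder and ReadMe.txt are part of the official download:

```
WiderPerson/
├── Images/          # 13,382 jpg images
├── Annotations/     # one .txt annotation file per train/val image
├── Evaluation/
├── ReadMe.txt
├── train.txt        # image ids of the training split
├── val.txt          # image ids of the validation split
└── test.txt         # test ids (no annotations released)
```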
The txt files under the Annotations folder look like this: the first line is the number of labeled boxes in the image (not otherwise used), and the leading number of each following line is the class id. The dataset has five classes (a made-up example file is sketched after the list):
- 1 : pedestrians
- 2 : riders
- 3 : partially-visible persons
- 4 : ignore regions
- 5 : crowd
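For illustration, an annotation file follows the pattern below, with one `<class_label> <xmin> <ymin> <xmax> <ymax>` line per box (the values here are made up):

```
3
1 200 150 260 330
3 400 160 440 290
5 0 100 800 600
```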
To convert the dataset to VOC format, these txt files need to be converted into xml files; the code is as follows:
```python
import os
import shutil
import cv2
from lxml.etree import Element, SubElement, tostring


def make_voc_dir():
    # Create the VOC2007 directory tree if it does not exist yet.
    if not os.path.exists('../VOC2007/Annotations'):
        os.makedirs('../VOC2007/Annotations')
    if not os.path.exists('../VOC2007/ImageSets'):
        os.makedirs('../VOC2007/ImageSets')
        os.makedirs('../VOC2007/ImageSets/Main')
    if not os.path.exists('../VOC2007/JPEGImages'):
        os.makedirs('../VOC2007/JPEGImages')


if __name__ == '__main__':
    # Map the WiderPerson class labels (1-5) to the names written into the xml files.
    classes = {'1': 'pedestrians',
               '2': 'riders',
               '3': 'partially',
               '4': 'ignore',
               '5': 'crowd'}
    VOCRoot = '../VOC2007'
    widerDir = './WiderPerson'           # root of the dataset
    wider_path = './WiderPerson/val.txt'
    make_voc_dir()
    with open(wider_path, 'r') as f:
        imgIds = [x for x in f.read().splitlines()]

    for imgId in imgIds:
        filename = imgId + '.jpg'
        img_path = './WiderPerson/Images/' + filename
        print('Img :%s' % img_path)
        img = cv2.imread(img_path)
        width = img.shape[1]   # image width
        height = img.shape[0]  # image height

        node_root = Element('annotation')
        node_folder = SubElement(node_root, 'folder')
        node_folder.text = 'JPEGImages'
        node_filename = SubElement(node_root, 'filename')
        node_filename.text = 'VOC2007/JPEGImages/%s' % filename
        node_size = SubElement(node_root, 'size')
        node_width = SubElement(node_size, 'width')
        node_width.text = '%s' % width
        node_height = SubElement(node_size, 'height')
        node_height.text = '%s' % height
        node_depth = SubElement(node_size, 'depth')
        node_depth.text = '3'

        label_path = img_path.replace('Images', 'Annotations') + '.txt'
        with open(label_path) as file:
            line = file.readline()
            count = int(line.split('\n')[0])  # number of boxes in this image
            line = file.readline()
            while line:
                cls_id = line.split(' ')[0]
                xmin = int(line.split(' ')[1]) + 1
                ymin = int(line.split(' ')[2]) + 1
                xmax = int(line.split(' ')[3]) + 1
                ymax = int(line.split(' ')[4].split('\n')[0]) + 1
                line = file.readline()

                cls_name = classes[cls_id]

                obj_width = xmax - xmin
                obj_height = ymax - ymin

                # Mark boxes smaller than 6 px on a side as difficult.
                difficult = 0
                if obj_height <= 6 or obj_width <= 6:
                    difficult = 1

                node_object = SubElement(node_root, 'object')
                node_name = SubElement(node_object, 'name')
                node_name.text = cls_name
                node_difficult = SubElement(node_object, 'difficult')
                node_difficult.text = '%s' % difficult
                node_bndbox = SubElement(node_object, 'bndbox')
                node_xmin = SubElement(node_bndbox, 'xmin')
                node_xmin.text = '%s' % xmin
                node_ymin = SubElement(node_bndbox, 'ymin')
                node_ymin.text = '%s' % ymin
                node_xmax = SubElement(node_bndbox, 'xmax')
                node_xmax.text = '%s' % xmax
                node_ymax = SubElement(node_bndbox, 'ymax')
                node_ymax.text = '%s' % ymax
                node_pose = SubElement(node_object, 'pose')
                node_pose.text = 'Unspecified'
                node_truncated = SubElement(node_object, 'truncated')
                node_truncated.text = '0'

        xml = tostring(node_root, pretty_print=True)  # serialize the 'annotation' tree
        xml_name = filename.replace('.jpg', '.xml')
        xml_path = VOCRoot + '/Annotations/' + xml_name
        with open(xml_path, 'wb') as f:
            f.write(xml)
        shutil.copy(img_path, VOCRoot + '/JPEGImages/' + filename)
```
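Note that wider_path above points at val.txt; run the script a second time with wider_path = './WiderPerson/train.txt' so that both annotated splits end up under VOC2007. The paths also assume the script is run from the directory containing WiderPerson, with VOC2007 written one level up.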

The dataset can be visualized with the following code:
```python
# -*- coding: utf-8 -*-

import os
import cv2

if __name__ == '__main__':
    path = './WiderPerson/train.txt'
    with open(path, 'r') as f:
        img_ids = [x for x in f.read().splitlines()]

    for img_id in img_ids:  # e.g. '000040'
        img_path = './WiderPerson/Images/' + img_id + '.jpg'
        print(img_path)
        img = cv2.imread(img_path)

        im_h = img.shape[0]
        im_w = img.shape[1]
        label_path = img_path.replace('Images', 'Annotations') + '.txt'
        print(label_path)
        with open(label_path) as file:
            line = file.readline()
            count = int(line.split('\n')[0])  # number of boxes in this image
            line = file.readline()
            while line:
                cls = int(line.split(' ')[0])
                print(cls)
                # < class_label =1: pedestrians >               pedestrians
                # < class_label =2: riders >                    riders
                # < class_label =3: partially-visible persons > partially occluded persons
                # < class_label =4: ignore regions >            fake persons, e.g. people in posters
                # < class_label =5: crowd >                     dense crowd covered by one large box
                if cls == 1 or cls == 3:  # only draw pedestrians and partially-visible persons
                    xmin = float(line.split(' ')[1])
                    ymin = float(line.split(' ')[2])
                    xmax = float(line.split(' ')[3])
                    ymax = float(line.split(' ')[4].split('\n')[0])
                    img = cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 255, 0), 2)
                line = file.readline()
        cv2.imshow('result', img)
        cv2.waitKey(0)
```

The conversion code above produces the following folder structure:
Next, split the data into training and validation sets with split_train_val.py:
```python
# coding:utf-8

import os
import random
import argparse

parser = argparse.ArgumentParser()
# Path to the xml files; adjust for your own data (usually under Annotations).
parser.add_argument('--xml_path', default='./VOC2007/Annotations', type=str, help='input xml label path')
# Where to write the split lists; usually ImageSets/Main under your dataset.
parser.add_argument('--txt_path', default='./VOC2007/ImageSets/Main', type=str, help='output txt label path')
opt = parser.parse_args()

trainval_percent = 1   # fraction of data used for train+val (1 leaves test.txt empty)
train_percent = 0.9    # fraction of trainval used for training; the rest goes to val
xmlfilepath = opt.xml_path
txtsavepath = opt.txt_path
print(xmlfilepath)
total_xml = os.listdir(xmlfilepath)
if not os.path.exists(txtsavepath):
    os.makedirs(txtsavepath)

num = len(total_xml)
list_index = range(num)
tv = int(num * trainval_percent)
tr = int(tv * train_percent)
trainval = random.sample(list_index, tv)
train = random.sample(trainval, tr)

file_trainval = open(txtsavepath + '/trainval.txt', 'w')
file_test = open(txtsavepath + '/test.txt', 'w')
file_train = open(txtsavepath + '/train.txt', 'w')
file_val = open(txtsavepath + '/val.txt', 'w')

for i in list_index:
    name = total_xml[i][:-4] + '\n'  # strip the '.xml' extension
    if i in trainval:
        file_trainval.write(name)
        if i in train:
            file_train.write(name)
        else:
            file_val.write(name)
    else:
        file_test.write(name)

file_trainval.close()
file_train.close()
file_val.close()
file_test.close()
```

The generated txt files look like this:
Next, use voc_labels.py to generate YOLO-format labels, changing the classes list to your own class names; they must match the <name> values written into the xml files above:
```python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import os
from os import getcwd

sets = ['train', 'val', 'test']
# These names must match the <name> tags produced by the xml conversion script above.
classes = ["pedestrians", "riders", "partially", "ignore", "crowd"]  # change to your own classes
abs_path = os.getcwd()
print(abs_path)


def convert(size, box):
    # Convert a VOC box (xmin, xmax, ymin, ymax) into normalized YOLO format
    # (x_center, y_center, width, height).
    dw = 1. / (size[0])
    dh = 1. / (size[1])
    x = (box[0] + box[1]) / 2.0 - 1
    y = (box[2] + box[3]) / 2.0 - 1
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x * dw
    w = w * dw
    y = y * dh
    h = h * dh
    return x, y, w, h


def convert_annotation(image_id):
    in_file = open('D:/V5/VOC2007/Annotations/%s.xml' % (image_id), encoding='UTF-8')
    out_file = open('D:/V5/VOC2007/labels/%s.txt' % (image_id), 'w')
    tree = ET.parse(in_file)
    root = tree.getroot()
    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)
    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text),
             float(xmlbox.find('ymax').text))
        b1, b2, b3, b4 = b
        # Clip boxes that extend past the image border.
        if b2 > w:
            b2 = w
        if b4 > h:
            b4 = h
        b = (b1, b2, b3, b4)
        bb = convert((w, h), b)
        out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')


wd = getcwd()
for image_set in sets:
    if not os.path.exists('D:/V5/VOC2007/labels/'):
        os.makedirs('D:/V5/VOC2007/labels/')
    image_ids = open('D:/V5/VOC2007/ImageSets/Main/%s.txt' % (image_set)).read().strip().split()
    list_file = open('D:/V5/VOC2007/%s.txt' % (image_set), 'w')
    for image_id in image_ids:
        list_file.write('D:/V5/VOC2007/JPEGImages/%s.jpg\n' % (image_id))
        convert_annotation(image_id)
    list_file.close()
```
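Each line of a generated label file then has the form `class x_center y_center width height`, all normalized to [0, 1]; a made-up example line:

```
0 0.5125 0.4333 0.0750 0.3333
```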

Find the xView.yaml file in the data folder, make a copy, rename it to data.yaml, and put your own classes in it.
Before the modification:
After the modification:
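For reference, a minimal data.yaml for this project might look like the sketch below; the paths assume the D:/V5/VOC2007 layout used in voc_labels.py, so adjust them to your own setup:

```yaml
# data.yaml -- a minimal sketch, assuming the D:/V5/VOC2007 layout from above
train: D:/V5/VOC2007/train.txt   # list of training image paths
val: D:/V5/VOC2007/val.txt       # list of validation image paths

nc: 5                            # number of classes
names: ['pedestrians', 'riders', 'partially', 'ignore', 'crowd']
```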
Find yolov5s.yaml, make a copy named yolov5s_s.yaml, and change its nc parameter.
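Only the nc line near the top of the copied file needs to change:

```yaml
# yolov5s_s.yaml -- only the class count changes relative to yolov5s.yaml
nc: 5  # number of classes (80 for COCO in the original file)
```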
Modify the parameters in train.py: set weights to the downloaded pretrained weights, cfg to yolov5s_s.yaml, and data to data.yaml, and pick sensible epoch and batch_size values for your GPU.
If train.py throws errors when run, search online for the specific message and fix accordingly.
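Equivalently, the same settings can be passed on the command line; the epoch and batch values here are placeholders to tune for your GPU:

```
python train.py --weights yolov5s.pt --cfg yolov5s_s.yaml --data data.yaml --epochs 100 --batch-size 16
```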
After training finishes, run detect.py with adjusted parameters: the trained weights are saved under runs, and changing source switches between image, webcam, and video detection; the other parameters can usually be left at their defaults (see the examples below).
The detection results look like this:
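For example (the exp folder name under runs/train varies from run to run, and the source paths are placeholders):

```
python detect.py --weights runs/train/exp/weights/best.pt --source data/images   # folder of images
python detect.py --weights runs/train/exp/weights/best.pt --source 0             # webcam
python detect.py --weights runs/train/exp/weights/best.pt --source test.mp4      # video file
```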
Questions and feedback are welcome in the comments.