Background: Annotating data for object detection is extremely labor-intensive. This post uses YOLOv5 v5.0 to implement an auto-labeling tool that outputs annotations in Pascal VOC format.
Steps:
1. First train YOLOv5 on a small batch of samples, e.g. 100 images (there are plenty of tutorials online on YOLOv5 data annotation and training), to obtain a .pt weights file for those 100 images.
2. Put the auto_labelimg.py provided below into the yolov5 repository folder.
3. Edit the --weights default in auto_labelimg.py (line 176 of the original file) to point to the .pt file trained on the 100 images, and edit the --source default (line 178) to the folder of images you want to auto-label.
4. Run auto_labelimg.py; it will create an annotations folder next to the image folder, containing one .xml file per image (see the sample XML after this list).
5. Open the generated .xml files in labelImg and adjust them by hand where needed.
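For reference, each generated file is a standard Pascal VOC annotation built by the create_tree/create_object functions in the script below. A minimal sketch of what one output XML looks like (the folder/file names, the class name dog, and the box coordinates are placeholder values, not output from a real run; it is shown indented for readability, while ElementTree writes it without pretty-printing):

<annotation>
    <folder>images</folder>
    <filename>example.jpg</filename>
    <path>/path/to/images/example.jpg</path>
    <source>
        <database>Unknown</database>
    </source>
    <size>
        <width>1280</width>
        <height>720</height>
        <depth>3</depth>
    </size>
    <segmented>0</segmented>
    <object>
        <name>dog</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>100</xmin>
            <ymin>150</ymin>
            <xmax>300</xmax>
            <ymax>400</ymax>
        </bndbox>
    </object>
</annotation>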
The code for auto_labelimg.py is as follows:
import sys
from utils.datasets import LoadImages, LoadStreams, letterbox
from utils.general import check_img_size, non_max_suppression, scale_coords
from utils.torch_utils import select_device, time_synchronized
import argparse
import os
import shutil
from models.experimental import attempt_load
import cv2
import torch
import torch.backends.cudnn as cudnn
import numpy as np
from xml.etree import ElementTree as ET
import warnings
warnings.filterwarnings('ignore')
# Helper that appends a first-level <object> branch to the annotation root
def create_object(root, xi, yi, xa, ya, obj_name):  # arguments: root, xmin, ymin, xmax, ymax, class name
    # create the first-level branch <object>
    _object = ET.SubElement(root, 'object')
    # create the second-level branches
    name = ET.SubElement(_object, 'name')
    # print(obj_name)
    name.text = str(obj_name)
    pose = ET.SubElement(_object, 'pose')
    pose.text = 'Unspecified'
    truncated = ET.SubElement(_object, 'truncated')
    truncated.text = '0'
    difficult = ET.SubElement(_object, 'difficult')
    difficult.text = '0'
    # create <bndbox>
    bndbox = ET.SubElement(_object, 'bndbox')
    xmin = ET.SubElement(bndbox, 'xmin')
    xmin.text = '%s' % xi
    ymin = ET.SubElement(bndbox, 'ymin')
    ymin.text = '%s' % yi
    xmax = ET.SubElement(bndbox, 'xmax')
    xmax.text = '%s' % xa
    ymax = ET.SubElement(bndbox, 'ymax')
    ymax.text = '%s' % ya
# Build the skeleton of one Pascal VOC XML annotation and return its root element
def create_tree(sources, image_name, h, w):
    imgdir = sources.split('/')[-1]
    # create the root element <annotation>
    annotation = ET.Element('annotation')
    # first-level branch <folder>
    folder = ET.SubElement(annotation, 'folder')
    # set the folder name
    folder.text = imgdir
    # first-level branch <filename>
    filename = ET.SubElement(annotation, 'filename')
    filename.text = image_name
    # first-level branch <path>
    path = ET.SubElement(annotation, 'path')
    path.text = '{}/{}'.format(sources, image_name)  # full path of the image
    # first-level branch <source>
    source = ET.SubElement(annotation, 'source')
    # second-level branch <database> under <source>
    database = ET.SubElement(source, 'database')
    database.text = 'Unknown'
    # first-level branch <size>
    size = ET.SubElement(annotation, 'size')
    # second-level branches under <size>: image width, height and depth
    width = ET.SubElement(size, 'width')
    width.text = str(w)
    height = ET.SubElement(size, 'height')
    height.text = str(h)
    depth = ET.SubElement(size, 'depth')
    depth.text = '3'
    # first-level branch <segmented>
    segmented = ET.SubElement(annotation, 'segmented')
    segmented.text = '0'
    return annotation
def detect(opt, model, img, img0):
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA
    # img = img_transpose(img0, imgsz, 32)
    img = torch.from_numpy(img).to(device)
    img = img.half() if half else img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    pred = model(img, augment=opt.augment)[0]
    # Apply NMS
    pred = non_max_suppression(
        pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
    # Process detections
    for i, det in enumerate(pred):  # detections per image
        if det is not None and len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()
    return det
def main(opt):
    source, weights, imgsz = opt.source, opt.weights, opt.img_size
    # Initialize
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA
    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    stride = int(model.stride.max())  # model stride
    imgsz = check_img_size(imgsz, s=stride)  # check img_size
    if half:
        model.half()  # to FP16
    img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
    # run once
    _ = model(img.half() if half else img) if device.type != 'cpu' else None
    # Get names and colors
    names = model.module.names if hasattr(model, 'module') else model.names
    dataset = LoadImages(source, img_size=imgsz, stride=stride)
    # images_list = os.listdir(source)
    # images_style = ['.jpg', '.png', '.bmp']
    # images_list = [x for x in images_list if x[-4:] in images_style]
    # print(images_list)
    for path, img, im0s, vid_cap in dataset:
        image_name = os.path.split(path)[-1]
        # print('path:', path)
        # run detection on the current image
        boxes = detect(opt, model, img, im0s)
        # print(len(boxes))
        (h, w) = im0s.shape[:2]
        annotation = create_tree(source, image_name, h, w)
        # print(annotation)
        for box in boxes:
            if float(box[4]) > opt.conf_thres:
                x1, y1, x2, y2, label_id = int(box[0]), int(box[1]), int(box[2]), int(box[3]), int(box[5])
                label = names[int(label_id)]
                # print(x1, y1, x2, y2, label)
                create_object(annotation, x1, y1, x2, y2, label)
        tree = ET.ElementTree(annotation)
        # the annotations folder sits next to the image folder; create it if it does not exist
        annotation_path_root = source.replace(source.split('/')[-1], 'annotations')
        os.makedirs(annotation_path_root, exist_ok=True)
        tree.write('{}/{}.xml'.format(annotation_path_root, image_name[:-4]))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str,
                        default='weights/char_best.pt', help='model.pt path')
    # file/folder, 0 for webcam
    parser.add_argument('--source', type=str,
                        default='/data/video_image_new/char_replace_voc_data/images', help='source')
    parser.add_argument('--output', type=str, default='inference/output',
                        help='output folder')  # output folder
    parser.add_argument('--img-size', type=int, default=1280,
                        help='inference size (pixels)')
    parser.add_argument('--conf-thres', type=float,
                        default=0.4, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float,
                        default=0.5, help='IOU threshold for NMS')
    parser.add_argument('--device', default='0',
                        help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--classes', nargs='+', type=int,
                        help='filter by class')
    parser.add_argument('--augment', action='store_true',
                        help='augmented inference')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    args = parser.parse_args()
    args.img_size = check_img_size(args.img_size)
    print(args)
    with torch.no_grad():
        main(args)
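Optionally, before opening everything in labelImg (step 5), you can run a quick sanity check over the generated files with a small standalone snippet like the one below. It is not part of auto_labelimg.py, and ANN_DIR is an assumed path that you should point at the annotations folder produced in step 4:

# check_annotations.py - optional helper, not part of the original script
import glob
import os
from xml.etree import ElementTree as ET

ANN_DIR = '/data/video_image_new/char_replace_voc_data/annotations'  # adjust to your annotations folder

for xml_path in sorted(glob.glob(os.path.join(ANN_DIR, '*.xml'))):
    root = ET.parse(xml_path).getroot()
    w = int(root.find('size/width').text)
    h = int(root.find('size/height').text)
    names = [obj.find('name').text for obj in root.findall('object')]
    # print one summary line per annotation: file name, box count, class names, image size
    print('{}: {} object(s) {} ({}x{})'.format(os.path.basename(xml_path), len(names), names, w, h))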
References:
https://github.com/ultralytics/yolov5
https://blog.csdn.net/qq_43019451/article/details/108546828
Copyright notice: this is an original article by CSDN blogger "ly3848", licensed under CC 4.0 BY-SA. Please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/ly3848/article/details/118050167