Deploying YOLOv5s with Dynamic Batch Support Using the TensorRT Python API

System environment:

Ubuntu 18.04

CUDA 11.3

TensorRT 8.2.0.6

GPU: RTX 2080

PyTorch 1.10.0

onnx 1.10.2

onnx-simplifier 0.3.6

Step 1: Export the ONNX model (reference: https://github.com/shouxieai/tensorRT_Pro)

1.1 Download the yolov5 project and modify the relevant code. The goal is to reduce the complexity of the exported ONNX graph and keep only a single output, which simplifies post-processing.

# Download and enter the yolov5 project
git clone git@github.com:ultralytics/yolov5.git
cd yolov5/models
gedit yolo.py

# Modify the Detect.forward function in yolov5/models/yolo.py

# yolov5/models/yolo.py
# bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
# x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
# change to:
bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
bs = -1  # -1 keeps the batch dimension dynamic in the traced graph
ny = int(ny)
nx = int(nx)
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

# yolov5/models/yolo.py
# z.append(y.view(bs, -1, self.no))
# change to:
z.append(y.view(bs, self.na * ny * nx, self.no))

############# For yolov5-6.0 #####################
# yolov5/models/yolo.py, Detect.forward function
# if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
#    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)
# change to:
if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

# compute anchor_grid outside the if, disconnected from the module state, so the
# pytorch trace stays clean (it must be defined on every forward pass)
anchor_grid = (self.anchors[i].clone() * self.stride[i]).view(1, -1, 1, 1, 2)

# y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
# change to:
y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor_grid  # wh

# wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
# change to:
wh = (y[..., 2:4] * 2) ** 2 * anchor_grid  # wh

# return x if self.training else (torch.cat(z, 1), x)
# change to:
return torch.cat(z, 1)

1.2 Export the model

Modify yolov5/export.py so that only the batch dimension of the exported ONNX model is dynamic.

############# For yolov5-6.0 #####################

# yolov5/export.py, export_onnx function
# torch.onnx.export(model, im, f, verbose=False, opset_version=opset,
#                   training=torch.onnx.TrainingMode.TRAINING if train else torch.onnx.TrainingMode.EVAL,
#                   do_constant_folding=not train,
#                   input_names=['images'],
#                   output_names=['output'],
#                   dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # shape(1,3,640,640)
#                                 'output': {0: 'batch', 1: 'anchors'}  # shape(1,25200,85)
#                                 } if dynamic else None)
# change to:

        torch.onnx.export(model, im, f, verbose=False, opset_version=opset,
                          training=torch.onnx.TrainingMode.TRAINING if train else torch.onnx.TrainingMode.EVAL,
                          do_constant_folding=not train,
                          input_names=['images'],
                          output_names=['output'],
                          dynamic_axes={'images': {0: 'batch'},  # shape(1,3,640,640)
                                        'output': {0: 'batch'}  # shape(1,25200,85)
                                        } if dynamic else None)
cd yolov5
python export.py --weights=yolov5s.pt --dynamic --include=onnx --opset=11 --simplify

This produces the yolov5s.onnx file. Opening it with https://netron.app/ shows the model structure: the model now has only one output, with shape [Batch_size, 25200, 85].
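Before building the engine, it is worth sanity-checking that the exported model really accepts a dynamic batch. A minimal sketch, assuming onnxruntime is installed (this check is not part of the original post):

import onnxruntime as ort
import numpy as np

sess = ort.InferenceSession("yolov5s.onnx", providers=["CPUExecutionProvider"])
for n in (1, 4):
    dummy = np.zeros((n, 3, 640, 640), dtype=np.float32)
    out = sess.run(None, {"images": dummy})[0]
    print(out.shape)  # expect (n, 25200, 85) if the batch dimension is dynamic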

Step 2: Build the TensorRT engine (the official docs now recommend trtexec, but my TensorRT was installed via pip, which does not include it)

*There are quite a few pitfalls here; supporting dynamic dimensions requires adding an optimization profile.

import tensorrt as trt

# Build the logger, builder, and network
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

# Parse the onnx file and print any errors
success = parser.parse_from_file("yolov5s.onnx")
for idx in range(parser.num_errors):
    print(parser.get_error(idx))
if not success:
    raise RuntimeError('Failed to parse the ONNX model')  # further error handling goes here
print('Network construction success!!!')

# The profile tells TensorRT the min/opt/max shapes allowed for the dynamic input
profile = builder.create_optimization_profile()
profile.set_shape("images", (1, 3, 640, 640), (8, 3, 640, 640), (16, 3, 640, 640))

config = builder.create_builder_config()
config.add_optimization_profile(profile)
config.max_workspace_size = 1 << 30  # 1 GiB
serialized_engine = builder.build_serialized_network(network, config)
with open("yolov5.engine", "wb") as f:
    print('Writing the engine file...')
    f.write(serialized_engine)
    print('Engine built successfully!!!')
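Optionally, you can enable FP16 to cut memory use and usually improve throughput; a sketch, assuming your GPU has usable FP16 (the RTX 2080 does):

# add this before calling build_serialized_network
config.set_flag(trt.BuilderFlag.FP16)  # optional; may slightly change numeric results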
    

Step 3: Deserialize the engine file to rebuild the TensorRT engine

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
import time

with open("yolov5.engine", "rb") as f:
    serialized_engine = f.read()
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(serialized_engine)
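To confirm the deserialized engine kept the dynamic batch dimension, you can list its bindings; a quick check (not in the original post):

# the batch dimension of each binding should print as -1
for i in range(engine.num_bindings):
    print(engine.get_binding_name(i), engine.get_binding_shape(i), engine.get_binding_dtype(i))
# expected output, roughly:
# images (-1, 3, 640, 640) DataType.FLOAT
# output (-1, 25200, 85) DataType.FLOAT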

Step 4: Inference

4.1 Set a BATCH_SIZE for inference

BATCH_SIZE=8

4.2 Create the execution context and allocate the related buffers. Whenever the batch size changes, you must call set_binding_shape again and reallocate the buffers (see the sketch after allocate_buffers below).

context = engine.create_execution_context()
context.set_binding_shape(0, (BATCH_SIZE, 3, 640, 640))  # this line is critical: it fixes the dynamic batch dimension for this run
inputs, outputs, bindings, stream = allocate_buffers(engine, max_batch_size=BATCH_SIZE)  # allocate input/output buffers and the stream

*Helper functions and classes

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine, max_batch_size=16):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        dims = engine.get_binding_shape(binding)
        if dims[0] == -1:
            assert max_batch_size is not None
            dims[0] = max_batch_size  # replace the dynamic batch dimension with the actual batch size

        # with an explicit-batch network, trt.volume(dims) is already the full
        # element count (the legacy engine.max_batch_size is always 1 here)
        size = trt.volume(dims)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # page-locked host memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)   # device (GPU) memory
        # Append the device buffer address to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate input/output list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
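As noted in 4.2, switching to a different batch size means resetting the binding shape and reallocating the buffers. A minimal sketch (the new batch must stay within the profile range of 1 to 16; NEW_BATCH is a hypothetical value):

NEW_BATCH = 4  # any batch size within the optimization profile range
context.set_binding_shape(0, (NEW_BATCH, 3, 640, 640))
inputs, outputs, bindings, stream = allocate_buffers(engine, max_batch_size=NEW_BATCH)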

4.3 Prepare the input data; its size must match the size of the input buffers

img = cv2.imread('../dog.jpg')
batch_data = np.repeat(pre_process(img)[None], BATCH_SIZE, 0)  # stack BATCH_SIZE copies of the same image into one (8,3,640,640) batch

*Helper function

def pre_process(img):
    print('original image shape', img.shape)
    img = cv2.resize(img, (640, 640))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # img = img.transpose((2, 0, 1)).astype(np.float16)
    img = img.transpose((2, 0, 1)).astype(np.float32)
    img /= 255.0
    return img
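Note that the plain cv2.resize above distorts the aspect ratio; the official yolov5 pipeline instead letterboxes the image (scale to fit, then pad with gray). If you want that behavior, here is a minimal sketch of a letterbox helper (a hypothetical addition, not part of the original code):

def letterbox(img, new_shape=640, color=(114, 114, 114)):
    h, w = img.shape[:2]
    r = min(new_shape / h, new_shape / w)        # scale so the longer side fits
    nh, nw = int(round(h * r)), int(round(w * r))
    img = cv2.resize(img, (nw, nh))
    top = (new_shape - nh) // 2                  # pad evenly on both sides
    bottom = new_shape - nh - top
    left = (new_shape - nw) // 2
    right = new_shape - nw - left
    return cv2.copyMakeBorder(img, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=color)

If you letterbox, remember to undo the same scale and padding when mapping boxes back to the original image.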

4.4 Copy the data into the input buffers, then run inference

np.copyto(inputs[0].host, batch_data.ravel())
result = do_inference_v2(context, bindings, inputs, outputs, stream)[0]
result = np.reshape(result, [BATCH_SIZE, -1, 85])
print(result.shape)
# Result:
# (8, 25200, 85)
# One inference over 8 images yields 8 results at once
# Meaning of the 85 values: ct_x, ct_y, w, h, conf_box, conf_class0, conf_class1, ..., conf_class79
# ct_x, ct_y, w, h are given at the 640x640 scale
# The conf values have already been passed through sigmoid, so no extra activation is needed

*Helper function

def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
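Since time was imported at the top but never used, here is a rough way to measure per-batch latency with it, as a sketch (warm up first, then average over several runs):

for _ in range(5):  # warm-up runs so one-time initialization does not skew the timing
    do_inference_v2(context, bindings, inputs, outputs, stream)
N = 20
t0 = time.time()
for _ in range(N):
    do_inference_v2(context, bindings, inputs, outputs, stream)
print('avg per batch of %d images: %.2f ms' % (BATCH_SIZE, (time.time() - t0) / N * 1000))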

4.5 Draw the results and take a look

result = result[4]  # take one image's result from the batch (all 8 are identical here)
img = cv2.resize(img, (640, 640))
boxes, confs, classes = filter_boxes(result, 0.5)
boxes, confs, classes = non_max_suppression(boxes, confs, classes)
for box, conf, cls in zip(boxes, confs, classes):
    x1, y1, x2, y2 = np.int32(box)
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
cv2.imwrite('tmp.jpg', img)

*Helper functions

def filter_boxes(pred, threshold):
    # Convert (ct_x, ct_y, w, h) to (x1, y1, x2, y2) and keep boxes whose
    # objectness score exceeds the threshold
    result = pred.copy()
    result[..., :2] = result[..., :2] - result[..., 2:4] * 0.5  # top-left corner
    result[..., 2:4] = result[..., :2] + result[..., 2:4]       # bottom-right corner
    result_selected = result[np.where(result[..., 4] > threshold)]
    boxes = result_selected[..., :4]
    classes = np.argmax(result_selected[..., 5:], axis=-1)
    confs = np.max(result_selected[..., 5:], axis=-1)
    return boxes, confs, classes
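A note on filter_boxes: the confs it returns are the bare class probabilities, whereas the official yolov5 post-processing multiplies them by the objectness score. If you want the official behavior, replace the confs line with this variant (a sketch, not the blog's original code):

confs = result_selected[..., 4] * np.max(result_selected[..., 5:], axis=-1)  # objectness * class prob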

def non_max_suppression(boxes, confs, classes, iou_thres=0.6):
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1) 
    order = confs.flatten().argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where( ovr <= iou_thres)[0]
        order = order[inds + 1]
    boxes = boxes[keep]
    confs = confs[keep]
    classes = classes[keep]
    return boxes, confs, classes
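The NMS above is class-agnostic: a box of one class can suppress an overlapping box of another class. A common trick for per-class NMS is to offset each box by its class index before suppression, so boxes of different classes can never overlap; a sketch (the 4096 offset is an arbitrary constant larger than any coordinate at 640x640):

offset = classes[:, None] * 4096.0                     # shift boxes apart per class
b, c, k = non_max_suppression(boxes + offset, confs, classes)
boxes, confs, classes = b - k[:, None] * 4096.0, c, k  # undo the shift after NMS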

Copyright notice: this is an original article by CSDN blogger "fegggye", released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/u012160945/article/details/121555088
