YOLOv5 OpenVINO Quantization Deployment

First, download the YOLOv5 source code and install the Python dependencies for YOLOv5 and OpenVINO.

git clone https://github.com/ultralytics/yolov5.git
cd yolov5
pip install -r requirements.txt && pip install openvino openvino-dev
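
To confirm the OpenVINO packages installed correctly, printing the runtime version is a quick check (my addition, not required by the workflow):

python -c "from openvino.runtime import get_version; print(get_version())"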

Then, use the export.py script provided by YOLOv5 to convert the pretrained PyTorch model to an OpenVINO FP32 IR model.

python export.py --weights yolov5n.pt --imgsz 640 --batch-size 1 --include openvino
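
Export writes the IR into the yolov5n_openvino_model folder by default. As a quick sanity check (my addition, not part of the original workflow), the IR can be read back with the OpenVINO runtime:

from openvino.runtime import Core

core = Core()
model = core.read_model("./yolov5n_openvino_model/yolov5n.xml")
print("inputs:", model.inputs)
print("outputs:", model.outputs)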

The quantization code below is adapted from: https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/111-yolov5-quantization-migration

from pathlib import Path

from utils.dataloaders import create_dataloader
from utils.general import check_dataset
from export import attempt_load, yaml_save
from val import run as validation_fn
from openvino.tools import mo
from openvino.runtime import serialize
from openvino.tools.pot.api import DataLoader
from openvino.tools.pot.engines.ie_engine import IEEngine
from openvino.tools.pot.graph import load_model, save_model
from openvino.tools.pot.pipeline.initializer import create_pipeline
from openvino.tools.pot.graph.model_utils import compress_model_weights

IMAGE_SIZE = 640
MODEL_NAME = "yolov5n"
DATASET_CONFIG = "./data/coco128.yaml"


class YOLOv5POTDataLoader(DataLoader):
    """Inherit from DataLoader function and implement for YOLOv5."""

    def __init__(self, data_source):
        super().__init__({})
        self._data_loader = data_source
        self._data_iter = iter(self._data_loader)

    def __len__(self):
        return len(self._data_loader.dataset)

    def __getitem__(self, item):
        try:
            batch_data = next(self._data_iter)
        except StopIteration:
            self._data_iter = iter(self._data_loader)
            batch_data = next(self._data_iter)
        im, target, path, shape = batch_data
        im = im.float()
        im /= 255
        nb, _, height, width = im.shape
        img = im.cpu().detach().numpy()
        target = target.cpu().detach().numpy()
        annotation = dict()
        annotation["image_path"] = path
        annotation["target"] = target
        annotation["batch_size"] = nb
        annotation["shape"] = shape
        annotation["width"] = width
        annotation["height"] = height
        annotation["img"] = img
        return (item, annotation), img


if __name__ == "__main__":
    # Conversion of the YOLOv5 model to OpenVINO
    onnx_path = f"./{MODEL_NAME}.onnx"

    # fp32 IR model
    fp32_path = f"./FP32_openvino_model/{MODEL_NAME}_fp32.xml"
    print(f"Export ONNX to OpenVINO FP32 IR to: {fp32_path}")
    model = mo.convert_model(onnx_path)
    serialize(model, fp32_path)

    # fp16 IR model
    fp16_path = f"./FP16_openvino_model/{MODEL_NAME}_fp16.xml"
    print(f"Export ONNX to OpenVINO FP16 IR to: {fp16_path}")
    model = mo.convert_model(onnx_path, compress_to_fp16=True)
    serialize(model, fp16_path)

    # Prepare dataset for quantization
    data = check_dataset(DATASET_CONFIG)
    data_source = create_dataloader(data["val"], imgsz=640, batch_size=1, stride=32, pad=0.5, workers=0)[0]
    pot_data_loader = YOLOv5POTDataLoader(data_source)

    # Configure quantization pipeline
    algorithms_config = [
        {
            "name": "DefaultQuantization",
            "params": {"preset": "mixed", "stat_subset_size": 300, "target_device": "CPU"},
        }
    ]
    engine_config = {"device": "CPU"}
    model_config = {
        "model_name": f"{MODEL_NAME}",
        "model": fp32_path,
        "weights": fp32_path.replace(".xml", ".bin"),
    }
    pot_model = load_model(model_config)
    engine = IEEngine(config=engine_config, data_loader=pot_data_loader)
    pipeline = create_pipeline(algorithms_config, engine)

    # Perform model optimization
    compressed_model = pipeline.run(pot_model)
    compress_model_weights(compressed_model)
    optimized_save_dir = Path("./POT_INT8_openvino_model/")
    save_model(compressed_model, optimized_save_dir, model_config["model_name"] + "_int8")
    pot_int8_path = f"{optimized_save_dir}/{MODEL_NAME}_int8.xml"

    # Compare accuracy of the FP32 and INT8 models
    model = attempt_load(f"./{MODEL_NAME}.pt", device="cpu", inplace=True, fuse=True)
    metadata = {"stride": int(max(model.stride)), "names": model.names}  # model metadata
    yaml_save(Path(pot_int8_path).with_suffix(".yaml"), metadata)
    yaml_save(Path(fp32_path).with_suffix(".yaml"), metadata)

    print("Checking the accuracy of the original model:")
    fp32_metrics = validation_fn(
        data=DATASET_CONFIG,
        weights=Path(fp32_path).parent,
        batch_size=1,
        workers=0,
        plots=False,
        device="cpu",
        iou_thres=0.65,
    )
    fp32_ap5 = fp32_metrics[0][2]
    fp32_ap_full = fp32_metrics[0][3]
    print(f"mAP@.5 = {fp32_ap5}")
    print(f"mAP@.5:.95 = {fp32_ap_full}")

    print("Checking the accuracy of the POT int8 model:")
    int8_metrics = validation_fn(
        data=DATASET_CONFIG,
        weights=Path(pot_int8_path).parent,
        batch_size=1,
        workers=0,
        plots=False,
        device="cpu",
        iou_thres=0.65,
    )
    pot_int8_ap5 = int8_metrics[0][2]
    pot_int8_ap_full = int8_metrics[0][3]
    print(f"mAP@.5 = {pot_int8_ap5}")
    print(f"mAP@.5:.95 = {pot_int8_ap_full}")
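
Note that the POT API used above is deprecated in recent OpenVINO releases in favor of NNCF; the migration notebook linked above covers exactly this transition. A minimal sketch of the NNCF equivalent (my adaptation, reusing the data_source dataloader and file paths from the script above):

import nncf
from openvino.runtime import Core, serialize

def transform_fn(data_item):
    # reduce each (im, targets, paths, shapes) batch to a normalized float image
    images = data_item[0]
    return images.float().numpy() / 255

core = Core()
ov_model = core.read_model("./FP32_openvino_model/yolov5n_fp32.xml")
calibration_dataset = nncf.Dataset(data_source, transform_fn)
quantized_model = nncf.quantize(ov_model, calibration_dataset,
                                preset=nncf.QuantizationPreset.MIXED, subset_size=300)
serialize(quantized_model, "./NNCF_INT8_openvino_model/yolov5n_int8.xml")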

Python inference:

import cv2
import numpy as np
from openvino.inference_engine import IECore

names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
         'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
         'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
         'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
         'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
         'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
         'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
         'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
         'teddy bear', 'hair drier', 'toothbrush']
conf_thres = 0.5
nms_thres = 0.5
model_path = "yolov5n-f32.onnx"  # ONNX inference supports fp32 and fp16
model_xml = r"./POT_INT8_openvino_model/yolov5n_int8.xml"  # IR inference supports fp32, fp16 and int8
model_bin = r"./POT_INT8_openvino_model/yolov5n_int8.bin"


def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), scaleup=False, stride=32):
    shape = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)
    ratio = r  # width, height ratio
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    dw /= 2
    dh /= 2
    if shape[::-1] != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)


def iou(b1, b2):
    b1_x1, b1_y1, b1_x2, b1_y2 = b1[0], b1[1], b1[2], b1[3]
    b2_x1, b2_y1, b2_x2, b2_y2 = b2[:, 0], b2[:, 1], b2[:, 2], b2[:, 3]
    inter_rect_x1 = np.maximum(b1_x1, b2_x1)
    inter_rect_y1 = np.maximum(b1_y1, b2_y1)
    inter_rect_x2 = np.minimum(b1_x2, b2_x2)
    inter_rect_y2 = np.minimum(b1_y2, b2_y2)
    inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(inter_rect_y2 - inter_rect_y1, 0)
    area_b1 = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
    area_b2 = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
    return inter_area / np.maximum(area_b1 + area_b2 - inter_area, 1e-6)


def non_max_suppression(boxes, conf_thres=0.5, nms_thres=0.4, ratio=1, pad=(20, 20)):
    bs = np.shape(boxes)[0]  # batch size
    # xywh -> xyxy
    shape_boxes = np.zeros_like(boxes[:, :, :4])
    shape_boxes[:, :, 0] = boxes[:, :, 0] - boxes[:, :, 2] / 2
    shape_boxes[:, :, 1] = boxes[:, :, 1] - boxes[:, :, 3] / 2
    shape_boxes[:, :, 2] = boxes[:, :, 0] + boxes[:, :, 2] / 2
    shape_boxes[:, :, 3] = boxes[:, :, 1] + boxes[:, :, 3] / 2
    boxes[:, :, :4] = shape_boxes
    boxes[:, :, 5:] *= boxes[:, :, 4:5]
    # output holds the predictions per image; at inference time there is usually just one
    output = []
    for i in range(bs):
        predictions = boxes[i]  # predicted boxes in xyxy, shape == (num_boxes, 85)
        score = np.max(predictions[:, 5:], axis=-1)
        # score = predictions[:, 4]  # objectness confidence
        mask = score > conf_thres  # confidence threshold; True rows are kept, False rows dropped
        detections = predictions[mask]  # first filtering pass
        class_conf = np.expand_dims(np.max(detections[:, 5:], axis=-1), axis=-1)  # class confidence of each box
        class_pred = np.expand_dims(np.argmax(detections[:, 5:], axis=-1), axis=-1)  # class index of each box
        # stack results: (num_boxes, 4 box coords + 1 class conf + 1 class id)
        detections = np.concatenate([detections[:, :4], class_conf, class_pred], axis=-1)
        unique_class = np.unique(detections[:, -1])  # all classes present
        if len(unique_class) == 0:
            continue
        best_box = []
        for c in unique_class:
            # take the detections of class c
            cls_mask = detections[:, -1] == c
            detection = detections[cls_mask]
            # sort by class confidence, high to low
            scores = detection[:, 4]
            arg_sort = np.argsort(scores)[::-1]  # argsort returns indices
            detection = detection[arg_sort]
            while len(detection) != 0:
                best_box.append(detection[0])
                if len(detection) == 1:
                    break
                # IoU between the current best box and the remaining boxes
                ious = iou(best_box[-1], detection[1:])
                detection = detection[1:][ious < nms_thres]  # keep boxes below nms_thres; each round removes at least one
        output.append(best_box)
    boxes_loc = []
    conf_loc = []
    class_loc = []
    if len(output):
        for i in range(len(output)):
            pred = output[i]
            for det in pred:
                if len(det):
                    # map box coordinates back to the original image
                    det[0] = (det[0] - pad[0]) / ratio
                    det[2] = (det[2] - pad[0]) / ratio
                    det[1] = (det[1] - pad[1]) / ratio
                    det[3] = (det[3] - pad[1]) / ratio
                    boxes_loc.append([det[0], det[1], det[2], det[3]])
                    conf_loc.append(det[4])
                    class_loc.append(det[5])
    return boxes_loc, conf_loc, class_loc


def plot_box(img, boxes, conf, clas_id, line_thickness=3, names=None):
    # draw the bounding box
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    c1, c2 = (int(boxes[0]), int(boxes[1])), (int(boxes[2]), int(boxes[3]))
    cv2.rectangle(img, c1, c2, [0, 0, 255], thickness=tl, lineType=cv2.LINE_AA)
    # draw the label box
    label = f'{names[int(clas_id)]} {conf:.2f}'
    tf = max(tl - 1, 1)  # label font thickness
    t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
    c2 = (c1[0] + t_size[0], c1[1] - t_size[1] - 3)
    cv2.rectangle(img, c1, c2, [255, 0, 0], -1, cv2.LINE_AA)
    cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)


if __name__ == '__main__':
    ie = IECore()
    # net = ie.read_network(model=model_path)
    net = ie.read_network(model=model_xml, weights=model_bin)
    exec_net = ie.load_network(network=net, device_name="CPU")
    input_layer = next(iter(net.input_info))
    frame = cv2.imread("bus.jpg")
    img, ratio, (dw, dh) = letterbox(frame)
    blob = cv2.dnn.blobFromImage(np.ascontiguousarray(img), 1 / 255.0, (img.shape[0], img.shape[1]), swapRB=True, crop=False)
    infer_request_handle = exec_net.start_async(request_id=0, inputs={input_layer: blob})
    if infer_request_handle.wait(-1) == 0:
        res = infer_request_handle.output_blobs["output0"]
        outs = res.buffer
        boxes_loc, conf_loc, class_loc = non_max_suppression(outs, conf_thres=conf_thres, nms_thres=nms_thres, ratio=ratio, pad=(dw, dh))
        for i in range(len(boxes_loc)):
            boxes = boxes_loc[i]
            conf = conf_loc[i]
            clas_id = class_loc[i]
            plot_box(frame, boxes, conf, clas_id, line_thickness=3, names=names)
        cv2.imshow("result", frame)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
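
The script above uses the legacy openvino.inference_engine (IECore) API, which newer OpenVINO releases have replaced. A minimal sketch of the same synchronous inference on the 2.0 API (my adaptation; model_xml, blob, and the helper functions are the ones defined above):

import openvino.runtime as ov

core = ov.Core()
compiled = core.compile_model(model_xml, "CPU")
outs = compiled([blob])[compiled.output(0)]  # same array as output_blobs["output0"].buffer
boxes_loc, conf_loc, class_loc = non_max_suppression(outs, conf_thres=conf_thres, nms_thres=nms_thres, ratio=ratio, pad=(dw, dh))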

Alternatively, use the detect.py that ships with YOLOv5:

python detect.py --weights ./POT_INT8_openvino_model
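
This works because the quantization script saved the stride/names metadata YAML next to the int8 IR (the yaml_save calls above); detect.py reads that sidecar file to recover the stride and class names.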

C++ inference (based on the article "Deploying YOLOv5 Models with the OpenVINO C++ API"):

#include <iostream>
#include <string>
#include <ctime>
#include <openvino/openvino.hpp>
#include <opencv2/opencv.hpp>

/* ---------  Please modify the path of yolov5 model and image  --------- */
// Can also be yolov5n_fp16.xml, yolov5n_fp32.xml, or yolov5n-f32.onnx
// (fp32/fp16/int8 IR inference and fp32 ONNX inference are supported)
std::string model_file = "yolov5n_int8.xml";
std::string image_file = "bus.jpg";

const std::vector<std::string> class_names = {
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush" };

cv::Mat letterbox(cv::Mat& img, std::vector<float>& paddings, std::vector<int> new_shape = { 640, 640 })
{
    // Get current image shape [height, width]
    int img_h = img.rows;
    int img_w = img.cols;

    // Compute scale ratio (new / old) and target resized shape
    float scale = std::min(new_shape[1] * 1.0 / img_h, new_shape[0] * 1.0 / img_w);
    int resize_h = int(round(img_h * scale));
    int resize_w = int(round(img_w * scale));
    paddings[0] = scale;

    // Compute padding
    int pad_h = new_shape[1] - resize_h;
    int pad_w = new_shape[0] - resize_w;

    // Resize and pad image while meeting stride-multiple constraints
    cv::Mat resized_img;
    cv::resize(img, resized_img, cv::Size(resize_w, resize_h));

    // Divide padding into 2 sides
    float half_h = pad_h * 1.0 / 2;
    float half_w = pad_w * 1.0 / 2;
    paddings[1] = half_h;
    paddings[2] = half_w;

    // Compute padding border
    int top = int(round(half_h - 0.1));
    int bottom = int(round(half_h + 0.1));
    int left = int(round(half_w - 0.1));
    int right = int(round(half_w + 0.1));

    // Add border
    cv::copyMakeBorder(resized_img, resized_img, top, bottom, left, right, 0, cv::Scalar(114, 114, 114));
    return resized_img;
}

int main(int argc, char* argv[])
{
    // -------- Get OpenVINO runtime version --------
    std::cout << ov::get_openvino_version().description << ':' << ov::get_openvino_version().buildNumber << std::endl;

    // -------- Step 1. Initialize OpenVINO Runtime Core --------
    ov::Core core;

    // -------- Step 2. Compile the Model --------
    auto compiled_model = core.compile_model(model_file, "CPU");

    // -------- Step 3. Create an Inference Request --------
    ov::InferRequest infer_request = compiled_model.create_infer_request();
    clock_t start = clock();

    // -------- Step 4. Read a picture file and do the preprocess --------
    cv::Mat img = cv::imread(image_file);           // Load a picture into memory
    std::vector<float> paddings(3);                 // scale, half_h, half_w
    cv::Mat resized_img = letterbox(img, paddings); // Resize to (640,640) by letterbox
    // BGR->RGB, u8(0-255)->f32(0.0-1.0), HWC->NCHW
    cv::Mat blob = cv::dnn::blobFromImage(resized_img, 1 / 255.0, cv::Size(640, 640), cv::Scalar(0, 0, 0), true);

    // -------- Step 5. Feed the blob into the input node of YOLOv5 --------
    auto input_port = compiled_model.input(); // Get input port for model with one input
    ov::Tensor input_tensor(input_port.get_element_type(), input_port.get_shape(), blob.ptr(0)); // Create tensor from external memory
    infer_request.set_input_tensor(input_tensor); // Set input tensor for model with one input

    // -------- Step 6. Start inference --------
    for (size_t i = 0; i < 100; i++)
        infer_request.infer();

    // -------- Step 7. Get the inference result --------
    auto output = infer_request.get_output_tensor(0);
    auto output_shape = output.get_shape();
    std::cout << "The shape of output tensor:" << output_shape << std::endl;
    cv::Mat output_buffer(output_shape[1], output_shape[2], CV_32F, output.data());

    // -------- Step 8. Post-process the inference result --------
    float conf_threshold = 0.25;
    float nms_threshold = 0.5;
    std::vector<cv::Rect> boxes;
    std::vector<int> class_ids;
    std::vector<float> class_scores;
    std::vector<float> confidences;
    for (int i = 0; i < output_buffer.rows; i++) {
        float confidence = output_buffer.at<float>(i, 4);
        if (confidence < conf_threshold) continue;
        cv::Mat classes_scores = output_buffer.row(i).colRange(5, 85);
        cv::Point class_id;
        double score;
        cv::minMaxLoc(classes_scores, NULL, &score, NULL, &class_id);
        if (score > 0.25) {
            float cx = output_buffer.at<float>(i, 0);
            float cy = output_buffer.at<float>(i, 1);
            float w = output_buffer.at<float>(i, 2);
            float h = output_buffer.at<float>(i, 3);
            // Undo letterbox padding and scaling
            int left = static_cast<int>((cx - 0.5 * w - paddings[2]) / paddings[0]);
            int top = static_cast<int>((cy - 0.5 * h - paddings[1]) / paddings[0]);
            int width = static_cast<int>(w / paddings[0]);
            int height = static_cast<int>(h / paddings[0]);
            cv::Rect box;
            box.x = left;
            box.y = top;
            box.width = width;
            box.height = height;
            boxes.push_back(box);
            class_ids.push_back(class_id.x);
            class_scores.push_back(score);
            confidences.push_back(confidence);
        }
    }
    // NMS
    std::vector<int> indices;
    cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, nms_threshold, indices);
    clock_t end = clock();
    std::cout << end - start << std::endl;

    // -------- Step 9. Visualize the detection results --------
    for (size_t i = 0; i < indices.size(); i++) {
        int index = indices[i];
        int class_id = class_ids[index];
        cv::rectangle(img, boxes[index], cv::Scalar(0, 0, 255), 2, 8);
        std::string label = class_names[class_id] + ":" + std::to_string(class_scores[index]);
        cv::putText(img, label, cv::Point(boxes[index].tl().x, boxes[index].tl().y - 10), cv::FONT_HERSHEY_SIMPLEX, .5, cv::Scalar(255, 0, 0));
    }
    cv::imshow("YOLOv5 OpenVINO Inference C++ Demo", img);
    cv::waitKey(0);
    return 0;
}
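
To build the demo, link against the OpenVINO runtime and OpenCV. With CMake that is find_package(OpenVINO REQUIRED) plus find_package(OpenCV REQUIRED), then target_link_libraries(demo PRIVATE openvino::runtime ${OpenCV_LIBS}); this is the documented CMake integration, though exact paths depend on your install.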

Running the C++ demo on an i7-12700 CPU, the 100-iteration inference loop took the following times for the fp32, fp16, and int8 models (3 runs each):
yolov5n_fp32: 1599ms 2040ms 1514ms
yolov5n_fp16: 1505ms 2078ms 1514ms
yolov5n_int8: 856ms 861ms 852ms

The fp32 and fp16 models take about the same time; int8 roughly halves the inference time.
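
For completeness, a minimal Python sketch of how such a comparison can be reproduced with the OpenVINO runtime (my approximation of the C++ loop above; the paths assume the IR files generated earlier, and a random input stands in for a real image):

import time
import numpy as np
import openvino.runtime as ov

core = ov.Core()
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)  # stand-in input
for path in ["./FP32_openvino_model/yolov5n_fp32.xml",
             "./FP16_openvino_model/yolov5n_fp16.xml",
             "./POT_INT8_openvino_model/yolov5n_int8.xml"]:
    compiled = core.compile_model(path, "CPU")
    compiled([dummy])  # warm-up run, excluded from timing
    t0 = time.perf_counter()
    for _ in range(100):
        compiled([dummy])
    print(f"{path}: {(time.perf_counter() - t0) * 1000:.0f} ms / 100 runs")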

YOLOv5 TensorRT Quantization Deployment

Method 1: the hardcoded-parser approach, converting wts to trt
This relies on the excellent project https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5, which provides both Python and C++ inference code. The project is impressively thorough; with such a good wheel available, why not use it directly?
Setting up the environment also drew on the blog post "windows上配置TensorRT yolov5-6.0部署 tensorrtx视频流推理" (configuring TensorRT YOLOv5-6.0 deployment on Windows with tensorrtx video-stream inference).
I measured the single-image inference time of the various YOLOv5 models as follows (C++, RTX 3070 GPU):
yolov5n-int8: 2ms 1ms
yolov5s-int8: 2ms 1ms
yolov5m-int8: 3ms 2ms
yolov5l-int8: 4ms 3ms
yolov5x-int8: 7ms 6ms

yolov5n-fp16: 1ms 1ms
yolov5s-fp16: 2ms 2ms
yolov5m-fp16: 4ms 3ms
yolov5l-fp16: 6ms 5ms
yolov5x-fp16: 10ms 9ms

yolov5n-fp32: 424ms 2ms
yolov5s-fp32: 389ms 4ms
yolov5m-fp32: 401ms 9ms
yolov5l-fp32: 422ms 17ms
yolov5x-fp32: 30ms 28ms
On my machine the first inference of the fp32 yolov5n through yolov5l models took far longer than subsequent ones. I am not sure why, though this pattern usually points to one-off startup cost (such as CUDA context initialization) rather than steady-state speed.

Method 2: the API approach, parsing ONNX into TensorRT
The code follows the course 《深度学习-TensorRT模型部署实战》 (Deep Learning: TensorRT Model Deployment in Practice): first convert the ONNX model into a TensorRT engine, then run inference with it.
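
As an aside, the same ONNX-to-engine conversion can also be done with the trtexec tool bundled with TensorRT (a one-line sketch; add --fp16 to build a half-precision engine):

trtexec --onnx=yolov5s-7.0.onnx --saveEngine=yolov5s-7.0.trt

The full C++ route, building the engine through the parser API and then running inference, is shown below: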

#include <NvInfer.h>
#include <NvInferRuntime.h>
#include "onnx-tensorrt-release-8.0/NvOnnxParser.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <memory>
#include <functional>
#include <algorithm>
#include <opencv2/opencv.hpp>

using namespace std;

#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line) {
    if (code != cudaSuccess) {
        const char* err_name = cudaGetErrorName(code);
        const char* err_message = cudaGetErrorString(code);
        printf("runtime error %s:%d  %s failed. \n  code = %s, message = %s\n", file, line, op, err_name, err_message);
        return false;
    }
    return true;
}

inline const char* severity_string(nvinfer1::ILogger::Severity t) {
    switch (t) {
        case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
        case nvinfer1::ILogger::Severity::kERROR:   return "error";
        case nvinfer1::ILogger::Severity::kWARNING: return "warning";
        case nvinfer1::ILogger::Severity::kINFO:    return "info";
        case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
        default: return "unknow";
    }
}

static const char* cocolabels[] = {
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
    "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse",
    "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis",
    "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass",
    "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
    "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv",
    "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
    "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
    "scissors", "teddy bear", "hair drier", "toothbrush"
};

static std::tuple<uint8_t, uint8_t, uint8_t> hsv2bgr(float h, float s, float v) {
    const int h_i = static_cast<int>(h * 6);
    const float f = h * 6 - h_i;
    const float p = v * (1 - s);
    const float q = v * (1 - f * s);
    const float t = v * (1 - (1 - f) * s);
    float r, g, b;
    switch (h_i) {
        case 0: r = v; g = t; b = p; break;
        case 1: r = q; g = v; b = p; break;
        case 2: r = p; g = v; b = t; break;
        case 3: r = p; g = q; b = v; break;
        case 4: r = t; g = p; b = v; break;
        case 5: r = v; g = p; b = q; break;
        default: r = 1; g = 1; b = 1; break;
    }
    return make_tuple(static_cast<uint8_t>(b * 255), static_cast<uint8_t>(g * 255), static_cast<uint8_t>(r * 255));
}

static std::tuple<uint8_t, uint8_t, uint8_t> random_color(int id) {
    float h_plane = ((((unsigned int)id << 2) ^ 0x937151) % 100) / 100.0f;
    float s_plane = ((((unsigned int)id << 3) ^ 0x315793) % 100) / 100.0f;
    return hsv2bgr(h_plane, s_plane, 1);
}

class TRTLogger : public nvinfer1::ILogger {
public:
    virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override {
        if (severity <= Severity::kWARNING) {
            if (severity == Severity::kWARNING) {
                printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
            } else if (severity <= Severity::kERROR) {
                printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
            } else {
                printf("%s: %s\n", severity_string(severity), msg);
            }
        }
    }
} logger;

template<typename _T>
shared_ptr<_T> make_nvshared(_T* ptr) {
    return shared_ptr<_T>(ptr, [](_T* p) { p->destroy(); });
}

bool build_model() {
    TRTLogger logger;

    // The basic components needed to build an engine
    auto builder = make_nvshared(nvinfer1::createInferBuilder(logger));
    auto config = make_nvshared(builder->createBuilderConfig());
    auto network = make_nvshared(builder->createNetworkV2(1));

    // The onnx parser fills the network from the ONNX file,
    // much like adding layers by hand with addConv etc.
    auto parser = make_nvshared(nvonnxparser::createParser(*network, logger));
    if (!parser->parseFromFile("yolov5s-7.0.onnx", 1)) {
        printf("Failed to parse yolov5s-7.0.onnx\n");
        // Note: the objects created above are not released on this path (a leak);
        // a cleaner fix is left for later
        return false;
    }

    int maxBatchSize = 1;
    printf("Workspace Size = %.2f MB\n", (1 << 28) / 1024.0f / 1024.0f);
    config->setMaxWorkspaceSize(1 << 28);

    // If the model has multiple inputs, multiple profiles are required
    auto profile = builder->createOptimizationProfile();
    auto input_tensor = network->getInput(0);
    auto input_dims = input_tensor->getDimensions();

    // Configure the minimum, optimal, and maximum shape ranges
    input_dims.d[0] = 1;
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
    input_dims.d[0] = maxBatchSize;
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
    config->addOptimizationProfile(profile);

    auto engine = make_nvshared(builder->buildEngineWithConfig(*network, *config));
    if (engine == nullptr) {
        printf("Build engine failed.\n");
        return false;
    }

    // Serialize the model and save it to a file
    auto model_data = make_nvshared(engine->serialize());
    FILE* f = fopen("yolov5s-7.0.trt", "wb");
    fwrite(model_data->data(), 1, model_data->size(), f);
    fclose(f);

    // Objects are destroyed in reverse order of construction
    printf("Build Done.\n");
    return true;
}

vector<unsigned char> load_file(const string& file) {
    ifstream in(file, ios::in | ios::binary);
    if (!in.is_open())
        return {};
    in.seekg(0, ios::end);
    size_t length = in.tellg();
    std::vector<uint8_t> data;
    if (length > 0) {
        in.seekg(0, ios::beg);
        data.resize(length);
        in.read((char*)&data[0], length);
    }
    in.close();
    return data;
}

void inference() {
    TRTLogger logger;
    auto engine_data = load_file("yolov5s-7.0.trt");
    auto runtime = make_nvshared(nvinfer1::createInferRuntime(logger));
    auto engine = make_nvshared(runtime->deserializeCudaEngine(engine_data.data(), engine_data.size()));
    if (engine == nullptr) {
        printf("Deserialize cuda engine failed.\n");
        runtime->destroy();
        return;
    }
    if (engine->getNbBindings() != 2) {
        printf("Bad ONNX export: the model must have exactly 1 input and 1 output, but this one has %d outputs.\n", engine->getNbBindings() - 1);
        return;
    }

    cudaStream_t stream = nullptr;
    checkRuntime(cudaStreamCreate(&stream));
    auto execution_context = make_nvshared(engine->createExecutionContext());

    int input_batch = 1;
    int input_channel = 3;
    int input_height = 640;
    int input_width = 640;
    int input_numel = input_batch * input_channel * input_height * input_width;
    float* input_data_host = nullptr;
    float* input_data_device = nullptr;
    checkRuntime(cudaMallocHost(&input_data_host, input_numel * sizeof(float)));
    checkRuntime(cudaMalloc(&input_data_device, input_numel * sizeof(float)));

    auto image = cv::imread("bus.jpg");
    // Letterbox via an affine transform: scale and center the image on a 640x640 canvas
    float scale_x = input_width / (float)image.cols;
    float scale_y = input_height / (float)image.rows;
    float scale = std::min(scale_x, scale_y);
    float i2d[6], d2i[6];
    i2d[0] = scale; i2d[1] = 0; i2d[2] = (-scale * image.cols + input_width + scale - 1) * 0.5;
    i2d[3] = 0; i2d[4] = scale; i2d[5] = (-scale * image.rows + input_height + scale - 1) * 0.5;

    cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
    cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
    cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);

    cv::Mat input_image(input_height, input_width, CV_8UC3);
    cv::warpAffine(image, input_image, m2x3_i2d, input_image.size(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));
    cv::imwrite("input-image.jpg", input_image);

    // HWC BGR u8 -> CHW RGB f32 in [0,1]
    int image_area = input_image.cols * input_image.rows;
    unsigned char* pimage = input_image.data;
    float* phost_b = input_data_host + image_area * 0;
    float* phost_g = input_data_host + image_area * 1;
    float* phost_r = input_data_host + image_area * 2;
    for (int i = 0; i < image_area; ++i, pimage += 3) {
        *phost_r++ = pimage[0] / 255.0f;
        *phost_g++ = pimage[1] / 255.0f;
        *phost_b++ = pimage[2] / 255.0f;
    }

    checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream));

    auto output_dims = engine->getBindingDimensions(1);
    int output_numbox = output_dims.d[1];
    int output_numprob = output_dims.d[2];
    int num_classes = output_numprob - 5;
    int output_numel = input_batch * output_numbox * output_numprob;
    float* output_data_host = nullptr;
    float* output_data_device = nullptr;
    checkRuntime(cudaMallocHost(&output_data_host, sizeof(float) * output_numel));
    checkRuntime(cudaMalloc(&output_data_device, sizeof(float) * output_numel));

    auto input_dims = engine->getBindingDimensions(0);
    input_dims.d[0] = input_batch;
    execution_context->setBindingDimensions(0, input_dims);
    float* bindings[] = { input_data_device, output_data_device };
    bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
    checkRuntime(cudaMemcpyAsync(output_data_host, output_data_device, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream));
    checkRuntime(cudaStreamSynchronize(stream));

    // Decode boxes: filter by objectness * class probability
    vector<vector<float>> bboxes;
    float confidence_threshold = 0.25;
    float nms_threshold = 0.5;
    for (int i = 0; i < output_numbox; ++i) {
        float* ptr = output_data_host + i * output_numprob;
        float objness = ptr[4];
        if (objness < confidence_threshold)
            continue;
        float* pclass = ptr + 5;
        int label = std::max_element(pclass, pclass + num_classes) - pclass;
        float prob = pclass[label];
        float confidence = prob * objness;
        if (confidence < confidence_threshold)
            continue;
        float cx = ptr[0];
        float cy = ptr[1];
        float width = ptr[2];
        float height = ptr[3];
        float left = cx - width * 0.5;
        float top = cy - height * 0.5;
        float right = cx + width * 0.5;
        float bottom = cy + height * 0.5;
        // Map back to the original image with the inverse affine transform
        float image_base_left = d2i[0] * left + d2i[2];
        float image_base_right = d2i[0] * right + d2i[2];
        float image_base_top = d2i[0] * top + d2i[5];
        float image_base_bottom = d2i[0] * bottom + d2i[5];
        bboxes.push_back({ image_base_left, image_base_top, image_base_right, image_base_bottom, (float)label, confidence });
    }
    printf("decoded bboxes.size = %d\n", (int)bboxes.size());

    // NMS: sort by confidence, then suppress overlapping boxes of the same class
    std::sort(bboxes.begin(), bboxes.end(), [](vector<float>& a, vector<float>& b) { return a[5] > b[5]; });
    std::vector<bool> remove_flags(bboxes.size());
    std::vector<vector<float>> box_result;
    box_result.reserve(bboxes.size());
    auto iou = [](const vector<float>& a, const vector<float>& b) {
        float cross_left = std::max(a[0], b[0]);
        float cross_top = std::max(a[1], b[1]);
        float cross_right = std::min(a[2], b[2]);
        float cross_bottom = std::min(a[3], b[3]);
        float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
        float union_area = std::max(0.0f, a[2] - a[0]) * std::max(0.0f, a[3] - a[1])
                         + std::max(0.0f, b[2] - b[0]) * std::max(0.0f, b[3] - b[1]) - cross_area;
        if (cross_area == 0 || union_area == 0) return 0.0f;
        return cross_area / union_area;
    };
    for (int i = 0; i < bboxes.size(); ++i) {
        if (remove_flags[i]) continue;
        auto& ibox = bboxes[i];
        box_result.emplace_back(ibox);
        for (int j = i + 1; j < bboxes.size(); ++j) {
            if (remove_flags[j]) continue;
            auto& jbox = bboxes[j];
            if (ibox[4] == jbox[4]) {
                if (iou(ibox, jbox) >= nms_threshold)
                    remove_flags[j] = true;
            }
        }
    }
    printf("box_result.size = %d\n", (int)box_result.size());

    // Draw the results
    for (int i = 0; i < box_result.size(); ++i) {
        auto& ibox = box_result[i];
        float left = ibox[0];
        float top = ibox[1];
        float right = ibox[2];
        float bottom = ibox[3];
        int class_label = ibox[4];
        float confidence = ibox[5];
        cv::Scalar color;
        tie(color[0], color[1], color[2]) = random_color(class_label);
        cv::rectangle(image, cv::Point(left, top), cv::Point(right, bottom), color, 3);
        auto name = cocolabels[class_label];
        auto caption = cv::format("%s %.2f", name, confidence);
        int text_width = cv::getTextSize(caption, 0, 1, 2, nullptr).width + 10;
        cv::rectangle(image, cv::Point(left - 3, top - 33), cv::Point(left + text_width, top), color, -1);
        cv::putText(image, caption, cv::Point(left, top - 5), 0, 1, cv::Scalar::all(0), 2, 16);
    }
    cv::imwrite("image-draw.jpg", image);

    checkRuntime(cudaStreamDestroy(stream));
    checkRuntime(cudaFreeHost(input_data_host));
    checkRuntime(cudaFreeHost(output_data_host));
    checkRuntime(cudaFree(input_data_device));
    checkRuntime(cudaFree(output_data_device));
}

int main() {
    if (!build_model()) {
        return -1;
    }
    inference();
    return 0;
}
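
Also worth noting: build_model() above produces an FP32 engine. A minimal sketch of the same build with FP16 enabled, using the TensorRT 8.x Python API (my adaptation; true INT8 quantization would additionally need config.int8_calibrator and calibration data):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
with open("yolov5s-7.0.onnx", "rb") as f:
    assert parser.parse(f.read()), parser.get_error(0)
config = builder.create_builder_config()
config.max_workspace_size = 1 << 28
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)  # request half-precision kernels
engine_bytes = builder.build_serialized_network(network, config)
with open("yolov5s-7.0-fp16.trt", "wb") as f:
    f.write(engine_bytes)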

The build_model step (converting the ONNX model into a TensorRT engine) can also be replaced by the export.py that ships with yolov5-7.0:

python export.py --include 'engine'
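
If TensorRT is available in the Python environment, the exported .engine file can also be fed straight back into detect.py, which picks the backend from the weights suffix:

python detect.py --weights yolov5s.engine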
