YOLO v5 TensorRT推理

#include <assert.h>
#include <math.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <functional>
#include "iostream"
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "NvInfer.h"
#include "opencv2/opencv.hpp"
#include <fstream>

// Number of coordinates stored per bounding box.
#define LOCATIONS 4
// Upper bound on the number of detections nms() appends to its result vector.
#define MAX_DET 300
// Floats per candidate in the flat list built by nms(): x1, y1, x2, y2, conf, class.
#define PERSIZE 6
// Presumably the network input width/height in pixels -- TODO confirm against the engine.
#define INPUT_W 416
#define INPUT_H 416

/**
 * One raw prediction row. nms() uses sizeof(Detection) / sizeof(float) as the
 * row stride of the output buffer, so the field order and count must match
 * the layout the network emits.
 */
struct Detection {
    // center_x center_y w h
    float bbox[LOCATIONS];
    // Box score; nms() multiplies it with each class score to rank candidates.
    float conf;
    float class_air;
    float class_oxy;
};

/**
 * One candidate after class selection, as consumed and produced by nms().
 */
struct ResultDetection {
    // Filled by nms() with corner coordinates (x1, y1, x2, y2).
    float bbox[LOCATIONS];
    float conf;  // bbox_conf * cls_conf
    int class_id;
    // Set to 1 by nms() when a higher-scoring box overlaps this one too much.
    int suppressed;
};

/**
 * TensorRT logger that prints messages of severity kWARNING or more severe to
 * stdout and drops the rest.
 */
class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} glogger;

/**
 * Returns the size in bytes of one element of the given TensorRT data type.
 *
 * @throws std::runtime_error for any data type not listed below.
 */
unsigned int getElementSize(nvinfer1::DataType t)
{
    switch (t)
    {
        case nvinfer1::DataType::kINT32: return 4;
        case nvinfer1::DataType::kFLOAT: return 4;
        case nvinfer1::DataType::kHALF: return 2;
        case nvinfer1::DataType::kBOOL:
        case nvinfer1::DataType::kINT8: return 1;
        default: break;
    }
    throw std::runtime_error("Invalid DataType.");
}

/**
 * Returns the product of all extents in `d` (1 when nbDims is 0).
 *
 * The accumulator is seeded with an int64_t so the running product is not
 * truncated to int between steps.
 */
int64_t volume(const nvinfer1::Dims& d)
{
    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>());
}

/**
 * Reads a serialized engine from `file`, closes the stream and deserializes
 * the engine with a runtime bound to the global logger.
 *
 * @param file stream positioned at the start of the serialized engine.
 * @return the deserialized engine. In builds where assert() is compiled out
 *         this may be nullptr, so callers should check the result.
 */
nvinfer1::ICudaEngine* loadEngine(std::fstream& file){
    // A single rdbuf() insertion drains the stream to end-of-file.
    std::stringstream buffer;
    buffer << file.rdbuf();
    const std::string cacheEngine = buffer.str();
    file.close();

    // NOTE(review): the runtime is never released. Check the lifetime rules of
    // the TensorRT version in use before adding cleanup here.
    nvinfer1::IRuntime* trtRuntime = nvinfer1::createInferRuntime(glogger);
    assert(trtRuntime != nullptr);
    if (trtRuntime == nullptr) {
        return nullptr;
    }

    nvinfer1::ICudaEngine* engine =
        trtRuntime->deserializeCudaEngine(cacheEngine.data(), cacheEngine.size(), nullptr);
    assert(engine != nullptr);
    return engine;
}
/*newShape = [width, height]
float letterbox(cv::Mat& image, int* newShape, int* color, int aut=0){int width = image.cols;int height = image.rows;// 获取比较小的缩放比例double r = std::min((float)newShape[0]/width, (float)newShape[1]/height);// 计算paddingint newUnpad[2] = {(int)round(width * r), (int)round(height * r)};// 计算差值float dw = newShape[0] - newUnpad[0];float dh = newShape[1] - newUnpad[1]; if(aut){dw = (int)dw % 32;dh = (int)dh % 32;}// 取对半dw = dw / 2;dh = dh / 2;cv::resize(image, image, cv::Size(newUnpad[0], newUnpad[1]));// 计算坐标int top = (int)round(dh - 0.1);int bottom = (int)round(dh + 0.1);int left = (int)round(dw - 0.1);int right = (int)round(dw + 0.1);cv::copyMakeBorder(image,image,top,bottom,left,right,cv::BORDER_CONSTANT,cv::Scalar(color[0],color[1],color[2]));return r;
}void xywh2xyxy(float *xywh, float * xyxy){xyxy[0] = (float)(xywh[0] - xywh[2] / 2);xyxy[1] = (float)(xywh[1] - xywh[3] / 2);xyxy[2] = (float)(xywh[0] + xywh[2] / 2);xyxy[3] = (float)(xywh[1] + xywh[3] / 2);
}float iou(float lbox[4], float rbox[4]) {// float interBox[] = {//     (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left//     (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right//     (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top//     (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom// };if (lbox[2] > lbox[3] || lbox[0] > lbox[1])return 0.0f;float interBoxS = (lbox[1] - lbox[0])*(lbox[3] - lbox[2]);return interBoxS / ((lbox[1] - lbox[0])*(lbox[3] - lbox[2])+ (rbox[1] - rbox[0])*(rbox[3] - rbox[2]) - interBoxS);
}bool cmp(const ResultDetection& a, const ResultDetection& b) {return a.conf > b.conf;
}void compareAndfilter(std::vector<float>& oriList, std::vector<ResultDetection>& resultList, int max_size){// 获取原始长度int len = oriList.size() / PERSIZE;// 遍历  ===================================================================当前状态下很难会产生超过max_size的候选框,故此不做对置信度的排序 python代码-》  x = x[x[:, 4].argsort(descending=True)[:max_nms]]for(int j = 0; j < max_size; ++j){int max = 0;// for(int i = 0;i < len; ++i){for(int k = 1; k < len; ++k){if(oriList[k * PERSIZE + 4] > oriList[max * PERSIZE + 4]){max = k;}}std::cout << max << std::endl;ResultDetection temp = {oriList[max * PERSIZE + 0], oriList[max * PERSIZE + 1], oriList[max * PERSIZE + 2], oriList[max * PERSIZE + 3], oriList[max * PERSIZE + 4], (int)oriList[max * PERSIZE + 5], 0};resultList.emplace_back(temp);oriList[max  * PERSIZE + 4] = -1;}
}void nms(std::vector<ResultDetection>& res, float *output, int outSize, float conf_thresh, float nms_thresh = 0.5){int det_size = sizeof(Detection) / sizeof(float);// 筛选出第一轮结果,根据conf > conf_threshstd::vector<int> oneList;for(int i = 0; i < outSize / det_size; ++i){if(output[det_size * i + 4] > conf_thresh){// 记录下所有置信度大于conf_thresh的标签oneList.emplace_back(i);}}if(oneList.size() == 0) return;// nms数据std::vector<float> nmsList;for(int i : oneList){int class_index;float zero_class_conf = output[i*det_size + 5] * output[i*det_size + 4];float first_class_conf = output[i*det_size + 6] * output[i*det_size + 4];float xywh[4] = {output[i*det_size + 0], output[i*det_size + 1], output[i*det_size + 2], output[i*det_size + 3]};float xyxy[4];xywh2xyxy(xywh, xyxy);if(zero_class_conf > first_class_conf){nmsList.emplace_back(xyxy[0]);nmsList.emplace_back(xyxy[1]);nmsList.emplace_back(xyxy[2]);nmsList.emplace_back(xyxy[3]);nmsList.emplace_back(zero_class_conf);nmsList.emplace_back(0);}else{nmsList.emplace_back(xyxy[0]);nmsList.emplace_back(xyxy[1]);nmsList.emplace_back(xyxy[2]);nmsList.emplace_back(xyxy[3]);nmsList.emplace_back(first_class_conf);nmsList.emplace_back(1);}}std::vector<ResultDetection> resultList;if(nmsList.size()==0) return;else if ((nmsList.size()/PERSIZE) > MAX_OUTPUT_BBOX_COUNT){// 提取前MAX_OUTPUT_BBOX_COUNT个候选框compareAndfilter(nmsList, resultList, MAX_OUTPUT_BBOX_COUNT);}else{// resultList.assign(nmsList.begin(), nmsList.end());compareAndfilter(nmsList, resultList, nmsList.size() / PERSIZE);}// 简单的排序std::cout << resultList.size() << std::endl;for(int i = 0; i <  resultList.size(); i++){auto tempDet = resultList[i];if(tempDet.suppressed == 1) continue;auto ix1 = tempDet.bbox[0];auto iy1 = tempDet.bbox[1];auto ix2 = tempDet.bbox[2];auto iy2 = tempDet.bbox[3];auto iarea = (ix2 - ix1) * (iy2 - iy1);for(int j = i+1; j < resultList.size();  ++j){auto yTempDet = resultList[j];if(yTempDet.suppressed == 1) continue;auto xx1 = std::max(ix1, yTempDet.bbox[0]);auto yy1 = 
std::max(iy1, yTempDet.bbox[1]);auto xx2 = std::min(ix2, yTempDet.bbox[2]);auto yy2 = std::min(iy2, yTempDet.bbox[3]);auto w = std::max((float)0, xx2 - xx1);auto h = std::max((float)0, yy2 - yy1);auto inter = w * h;auto ovr = inter / (iarea + (yTempDet.bbox[2] - yTempDet.bbox[0])*(yTempDet.bbox[3]-yTempDet.bbox[1]) - inter);if(ovr > nms_thresh){resultList[j].suppressed = 1;}}}for(int i = 0; i < resultList.size() && res.size() < MAX_DET; ++i){ResultDetection a = resultList[i];if(!a.suppressed) res.emplace_back(a);}}float getTrueCor(float cor, float size){if(cor < 0){return 0;}else{if(cor < size){return cor;}else{return size;}}
}cv::Rect get_rect(cv::Mat& img, float bbox[4], cv::Mat& img_letter_box, float scale) {   // bbox[4]  ->  xmin, ymin, xmax, ymax// 获取原图img与加了空白的预测图片img_letter_boxfloat gain = std::min((float)img_letter_box.cols / img.cols, (float)img_letter_box.rows / img.rows);// 一般在加了letter box之后,较长的一边都会为满,则获取短边的pad即可int pad_cols = int(img_letter_box.cols - img.cols*gain) / 2;   // wint pad_rows = int(img_letter_box.rows - img.rows*gain) / 2;   // h// bbox[0] = bbox[0] - pad_cols;// bbox[2] = bbox[2] - pad_cols;// bbox[1] = bbox[1] - pad_rows;// bbox[3] = bbox[3] - pad_rows;// float scale = (float) img.cols / (img_letter_box.cols - pad_cols*2);float l, r, t, b;l = (int)(getTrueCor((bbox[0] - pad_cols)  / scale, img.cols));r = (int)(getTrueCor((bbox[2] - pad_cols)  / scale, img.cols));t = (int)(getTrueCor((bbox[1] - pad_rows)  / scale, img.rows));b = (int)(getTrueCor((bbox[3] - pad_rows) / scale, img.rows));// float l, r, t, b;// float r_w = INPUT_W / (img.cols * 1.0);// float r_h = INPUT_H / (img.rows * 1.0);// if (r_h > r_w) {//     l = bbox[0] - bbox[2] / 2.f;//     r = bbox[0] + bbox[2] / 2.f;//     t = bbox[1] - bbox[3] / 2.f - (INPUT_H - r_w * img.rows) / 2;//     b = bbox[1] + bbox[3] / 2.f - (INPUT_H - r_w * img.rows) / 2;//     // l = l / r_w;//     // r = r / r_w;//     // t = t / r_w;//     // b = b / r_w;//     l = getTrueCor(l / r_w, img.cols);//     r = getTrueCor(r / r_w, img.cols);//     t = getTrueCor(t / r_w, img.rows);//     b = getTrueCor(b / r_w, img.rows);// } else {//     l = bbox[0] - bbox[2] / 2.f - (INPUT_W - r_h * img.cols) / 2;//     r = bbox[0] + bbox[2] / 2.f - (INPUT_W - r_h * img.cols) / 2;//     t = bbox[1] - bbox[3] / 2.f;//     b = bbox[1] + bbox[3] / 2.f;//     // l = l / r_h;//     // r = r / r_h;//     // t = t / r_h;//     // b = b / r_h;//     l = getTrueCor(l / r_h, img.cols);//     r = getTrueCor(r / r_h, img.cols);//     t = getTrueCor(t / r_h, img.rows);//     b = getTrueCor(b / r_h, img.rows);// }return cv::Rect(round(l), round(t), 
round(r - l), round(b - t));
}int main(){std::string enginePath="/data/kile/202204/yolov5/log/2.engine";// 创建文件流std::fstream file(enginePath, std::ios_base::binary | std::ios_base::in);if(!file.is_open()){return 10001; // engine文件打开失败 }nvinfer1::ICudaEngine* engine = loadEngine(file);nvinfer1::IExecutionContext* context = engine->createExecutionContext();assert(context != nullptr);clock_t start, end;start = clock();void *buffers[2];std::vector<int64_t> bufferSize;int nBindings = engine->getNbBindings();bufferSize.resize(nBindings);for(int i = 0; i < nBindings; ++i){nvinfer1::Dims dims = engine->getBindingDimensions(i);nvinfer1::DataType dtype = engine->getBindingDataType(i);int64_t totalSize = volume(dims) * 1 * getElementSize(dtype);bufferSize[i] = totalSize;cudaMalloc(&buffers[i], totalSize);}int outSize = bufferSize[1] / sizeof(float);std::string imagePath = "/data/kile/202204/yolov5/result/aircraft_4.jpg";cv::Mat image = cv::imread(imagePath);cv::Mat img1 = image.clone();int size[2] = {416, 416};int color[3] = {114, 114, 114};float scale = letterbox(image, size, color);cv::cvtColor(image, image, cv::COLOR_BGR2RGB);// image.convertTo(image, CV_32F, 255.0, 0);// time_t start,end;// start = clock();std::vector<float> a;if(image.isContinuous()){// a->assign(image.datastart, image.dataend);a.assign(image.datastart, image.dataend);}float image_content[image.channels()*image.rows*image.cols];for (int i = 0; i < image.rows*image.cols; i++){   // (float)(a[i*3] / 255)image_content[i] = (float)(a[i*3] / 255);image_content[i+image.cols*image.rows] = (float)(a[i*3+1] / 255);image_content[i+2*image.cols*image.rows] = (float)(a[i*3+2] / 255);   // 0.003// std::cout << image_content[i] << std::endl;}// end = clock();// std::cout << (float) (end - start) / CLOCKS_PER_SEC << std::endl;cudaError_t flag;cudaStream_t stream;flag = cudaStreamCreate(&stream);if (flag != cudaSuccess){std::cout << "1 cudaStreamCreate error : " << flag <<  std::endl;return 40001;}flag = cudaMemcpyAsync(buffers[0],&image_content, 
bufferSize[0],cudaMemcpyHostToDevice,stream);if (flag != cudaSuccess){std::cout << "2 cudaMemcpyAsync input error : " << cudaGetErrorString(flag) << std::endl;return 40002;}bool status = context->enqueueV2(buffers, stream, nullptr);if (!status){std::cout << "4 inference error : " << status << std::endl;return 40004;}float result[outSize];flag = cudaMemcpyAsync(result, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost,stream);if (flag != cudaSuccess){   std::cout << "3 cudaMemcpyAsync output error : " << cudaGetErrorString(flag) << std::endl;return 40003;}cudaStreamSynchronize(stream);cudaStreamDestroy(stream);// std::fstream f;// f.open("/data/kile/202204/yolov5/log/imagecontent.txt", std::ios_base::out);// for (int i = 0; i < 74529; i++)// {   // (float)(a[i*3] / 255)//     f << std::to_string(result[i]) << std::endl;// }// f.close();std::vector<ResultDetection> res;nms(res, result, outSize, 0.25, 0.45);for (size_t j = 0; j < res.size(); j++) {cv::Rect r = get_rect(img1, res[j].bbox, image, scale);cv::rectangle(img1, r, cv::Scalar(0x27, 0xC1, 0x36), 2);cv::putText(img1, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);}cv::imwrite("/data/kile/202204/yolov5/log/test_c.jpg", img1);end = clock();std::cout << "hello world!! " << (float) (end - start) / CLOCKS_PER_SEC  << std::endl;getchar();return 1;
import time
from typing import List
import numpy as np
import tensorrt
import torch
from pycuda import driver
import pycuda.autoinit  # imported for its side effect only; never referenced by name
from PIL import Image, ImageDraw, ImageFont

from utils.general import non_max_suppression, scale_coords


def time_sync():
    """Return the current time, synchronising CUDA first when it is available."""
    # PyTorch-accurate time
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()


def trt_pre(batch, context, d_size, d_type):
    """Run one inference on a TensorRT execution context.

    Args:
        batch: input array; flattened before being copied to the device.
        context: execution context to run.
        d_size: number of elements in the output buffer.
        d_type: NumPy dtype of the output buffer.

    Returns:
        1-D NumPy array of length ``d_size`` holding the raw network output.
    """
    # Need to set both input and output precisions to FP16 to fully enable FP16
    output = np.empty(d_size, dtype=d_type)
    batch = batch.reshape(-1)

    d_input = driver.mem_alloc(1 * batch.nbytes)
    d_output = driver.mem_alloc(output.nbytes)
    bindings = [int(d_input), int(d_output)]
    stream = driver.Stream()

    # Transfer input data to device
    driver.memcpy_htod_async(d_input, batch, stream)
    # Execute model
    context.execute_async_v2(bindings, stream.handle, None)
    # Transfer predictions back
    driver.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize threads
    stream.synchronize()
    return output


def python_tensorrt_predict(model_path):
    """Load a serialized engine, run it on one image and save the annotated image.

    Args:
        model_path: path to the serialized TensorRT engine file.

    Raises:
        RuntimeError: if the engine cannot be deserialized, or it does not
            expose both an input and an output binding.
    """
    # Load the model
    trt_model = tensorrt.Runtime(tensorrt.Logger(tensorrt.Logger.WARNING))

    # Deserialize the engine; the context manager closes the file handle.
    with open(model_path, "rb") as engine_file:
        engine = trt_model.deserialize_cuda_engine(engine_file.read())
    if engine is None:
        raise RuntimeError(f"failed to deserialize engine: {model_path}")

    # Create the inference context
    context = engine.create_execution_context()

    # Output element count and dtype come from the output binding; the
    # network input size comes from the input binding.
    size = dtype = input_w = input_h = None
    for binding in engine:
        if not engine.binding_is_input(binding):
            size = tensorrt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = tensorrt.nptype(engine.get_binding_dtype(binding))
        else:
            input_w = engine.get_binding_shape(binding)[-1]
            input_h = engine.get_binding_shape(binding)[-2]
    if size is None or input_w is None:
        raise RuntimeError("engine must expose one input and one output binding")

    from utils.datasets import LoadImages
    # source = r"/data/kile/202204/yolov5/result/aircraft_4.jpg"
    source = r"/data/kile/202204/yolov5/video/data/data_data/cd3ed3d2cf7611eca8630050569379a7.jpg"
    start = time.perf_counter()
    dataset = LoadImages(source, img_size=[input_w, input_h], stride=32, auto=False)
    for path, im, im0s, vid_cap, s in dataset:
        image = Image.open(path)
        im = torch.from_numpy(im).to("cuda").float()
        im /= 255

        start1 = time.perf_counter()
        outputs = trt_pre(np.asarray(im.cpu(), dtype=np.float32), context, size, dtype)
        end1 = time.perf_counter()
        print(f"inference {end1 - start1}")
        # with open("log/outputs.txt", "w") as w:
        #     for i in outputs:
        #         w.write(str(i))
        #         w.write("\n")

        # Each prediction row holds 7 values.
        outputs = torch.as_tensor(outputs).reshape((-1, 7)).unsqueeze(0)
        pred = non_max_suppression(outputs, 0.25, 0.45)
        pred[0][:, :4] = scale_coords(im.shape[1:], pred[0][:, :4], im0s.shape)
        image = drawImage(image, list(pred))
        image.save(r"log/test.jpg")
    end = time.perf_counter()
    print(f"{end -start}")


def drawImage(image, class_list):
    """Draw every detection in ``class_list[0]`` onto ``image`` and return it.

    Args:
        image: PIL image to draw on; modified in place.
        class_list: sequence whose first element iterates over detections.
            The last two values of each detection form the label and the
            values before them are read as left, top, right, bottom.

    Returns:
        The same ``image`` object with boxes and labels drawn.
    """
    font = ImageFont.truetype(font='/data/kile/other/yolov3/font/FiraMono-Medium.otf',
                              size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
    thickness = (image.size[0] + image.size[1]) // 300

    for det in class_list[0]:
        if not isinstance(det, List):
            det = list(det)
        # NOTE(review): str() on tensor elements yields text such as
        # "tensor(1.)"; confirm whether plain numbers were intended.
        label = str(det[-1])+"_"+str(det[-2])
        box = det[:-2]
        left, top, right, bottom = box
        top = int(top.numpy())
        left = int(left.numpy())
        bottom = int(bottom.numpy())
        right = int(right.numpy())

        draw = ImageDraw.Draw(image)
        # NOTE(review): ImageDraw.textsize is unavailable in recent Pillow
        # releases -- confirm the Pillow version this runs against.
        label_size = draw.textsize(label, font)

        # Clamp the box to the image bounds.
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))

        # Put the label above the box when there is room, otherwise just inside it.
        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])

        # Nested one-pixel rectangles give the outline its thickness.
        for offset in range(thickness):
            draw.rectangle([left + offset, top + offset, right - offset, bottom - offset],
                           outline=(0x27, 0xC1, 0x36))
        draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)],
                       fill=(128, 0, 128))
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
        del draw
    return image


if __name__ == '__main__':
    # TensorRT model generated with the official export.py script
    # model_path = r"/data/kile/202204/yolov5/log/2.engine"
    model_path = r"/data/kile/202204/yolov5/log/1.trt"
    python_tensorrt_predict(model_path)
    from PIL import Image
    # path = r"/data/kile/data/oridata_100/n0942195117838/n0942195117838.jpeg"
    # image = cv2.imread(path)
    # im, _, _ = letterbox(image, (416,416), auto=False)
    # cv2.imwrite("/data/1.jpg", im)

使用 TensorRT C++ 推理时,速度大约是 YOLOv5 Python 推理的 2 倍,即耗时约为 Python 的 1/2。

