My own code for converting an ONNX model to TensorRT.
A few notes:

  • The model's inputs must be numpy arrays, so data taken straight from a DataLoader cannot be fed to the model as-is.
  • When the model has multiple inputs (e.g. several images at once), data can be passed in with the following pattern:

    inputs[0].host = data[0]
    inputs[1].host = data[1]
    inputs[2].host = data[2]
    inputs[3].host = data[3]
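As a runnable illustration of the two notes above: the real `inputs` come from the `allocate_buffers` function in the full script below; here a minimal stand-in class and invented shapes are used, purely for demonstration.

```python
import numpy as np

# Minimal stand-in for the HostDeviceMem objects defined later in the script;
# only the .host attribute matters for this illustration.
class _HostBuf:
    def __init__(self):
        self.host = None

inputs = [_HostBuf() for _ in range(4)]

# One batch as it might come out of a DataLoader: torch tensors converted
# with .numpy() first, since TensorRT consumes numpy arrays, not tensors.
# (Shapes here are invented for the example.)
data = [np.zeros((64, 30, 25, 3), dtype=np.float32) for _ in range(4)]

for i, arr in enumerate(data):
    inputs[i].host = arr  # one host buffer per network input
```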

[This code assumes the engine file and the onnx file have already been obtained.]

A quick way to get the engine file is to use trtexec, which ships with TensorRT; see:
https://zhuanlan.zhihu.com/p/158199822
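For reference, a typical trtexec invocation looks like the following; the model/engine paths and the FP16 flag are placeholders to adapt to your setup.

```shell
# Parse an ONNX model and serialize a TensorRT engine to disk.
# ./ConVNet.onnx and ./ConVNet.engine are placeholder paths.
trtexec --onnx=./ConVNet.onnx --saveEngine=./ConVNet.engine --fp16
```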

# -*- coding: utf-8 -*-
import os
import random
import time

import h5py
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
from torch.utils.data import Dataset, DataLoader

import model1
from torch.autograd import Variable


class NTUData(Dataset):
    def __init__(self, cv_path, transform=None):
        # Initialize the file path and the list of group names.
        self.cv_path = cv_path
        self.transform = transform
        with h5py.File(self.cv_path, 'r') as file:
            self.group_name_list = [name for name in file]  # data names in the h5 dataset
            self.num_per_frame = None  # shape[1] is the frame count of a skeleton sequence
            self.data_len = len(file.keys())  # number of skeleton sequences

    def __getitem__(self, index):
        # 1. Read one item from the file.
        # 2. Preprocess it.
        # 3. Return a data/label pair.
        index_30 = self.get_frame30_index_list(index)
        frame30_body_0, frame30_body_1, label_action = self.get_frame30_data(index, index_30)  # segment data and label
        diff_body_0, diff_body_1 = self.get_diff_data(frame30_body_0, frame30_body_1)  # skeleton motion
        frame30_body_0 = Variable(torch.FloatTensor(frame30_body_0))
        frame30_body_1 = Variable(torch.FloatTensor(frame30_body_1))
        diff_body_0 = Variable(torch.FloatTensor(diff_body_0))
        diff_body_1 = Variable(torch.FloatTensor(diff_body_1))
        label_action = torch.LongTensor(label_action).squeeze().numpy().tolist().index(1)
        return frame30_body_0, frame30_body_1, diff_body_0, diff_body_1, label_action

    def __len__(self):
        return self.data_len

    def get_frame30_index_list(self, index):
        with h5py.File(self.cv_path, 'r') as file:
            self.num_per_frame = file[self.group_name_list[index]].shape[1]  # total frame count of the sequence
        # Generate 31 boundaries to obtain 30 intervals.
        random_list = np.linspace(0, self.num_per_frame, 31, dtype=int)
        # Subtracting one takes the last index of each interval, so the first
        # index of the next interval is never picked and no frame repeats.
        frame30_list = [random.randint(random_list[i], random_list[i + 1] - 1) for i in range(30)]
        # frame30_list = sorted(random.sample(range(0, self.num_per_frame), 30))  # index list (1, 30)
        # frame30_index_list = sorted(np.arange(0, self.num_per_frame, 3))
        return frame30_list

    def get_frame30_data(self, index, frame30_list):
        with h5py.File(self.cv_path, 'r') as file:
            frame30_body_0 = file[self.group_name_list[index]][0, frame30_list, :, :]  # 30 frames, shape (30, 25, 3)
            frame30_body_1 = file[self.group_name_list[index]][1, frame30_list, :, :]  # 30 frames
            label_action = file[self.group_name_list[index]].attrs['label']  # label, shape (60, 1)
        return frame30_body_0, frame30_body_1, label_action

    def get_diff_data(self, body0, body1):
        diff_body_0 = np.diff(body0, n=1, axis=0)  # (29, 25, 3)
        diff_body_1 = np.diff(body1, n=1, axis=0)
        diff_zero = np.zeros((1, 25, 3))
        diff_body_0 = np.r_[diff_body_0, diff_zero]  # pad back to (30, 25, 3)
        diff_body_1 = np.r_[diff_body_1, diff_zero]
        return diff_body_0, diff_body_1


cv_tst_path = '../cv_tst.hdf5'
test_data = NTUData(cv_tst_path)
net = model1.ConVNet().cuda()
net.eval()
# filename = '/home/zp1/code/tensorrt/1.png'
max_batch_size = 1
onnx_model_path = "./ConVNet.onnx"
TRT_LOGGER = trt.Logger()


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """
        host_mem: cpu memory
        device_mem: gpu memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        print("the binding's name:", binding)  # bound input/output tensor
        print('binding shape:', engine.get_binding_shape(binding))  # size of the tensor
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # trt.volume computes the element count of an iterable shape
        # size = trt.volume(engine.get_binding_shape(binding))  # use this for a fixed-batch-size onnx
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # get_binding_dtype returns the binding's data type;
        # trt.nptype maps it to the equivalent numpy dtype
        # allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # page-locked host memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)  # device-side allocation
        # print(int(device_mem))  # buffer address of the binding in the graph
        bindings.append(int(device_mem))
        # append to the appropriate list
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", fp16_mode=False, save_engine=False):
    """
    params max_batch_size:   fixed in advance so GPU memory can be allocated
    params onnx_file_path:   path to the onnx file
    params engine_file_path: path where the serialized engine is (or will be) stored
    params fp16_mode:        whether to use FP16
    params save_engine:      whether to save the engine
    returns:                 ICudaEngine
    """
    # If a serialized engine already exists, deserialize it straight into a cudaEngine.
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, \
                trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize
    # else:  # build the cudaEngine from onnx
    #     # Use the logger to create a builder; the builder creates the
    #     # INetworkDefinition computation graph.
    #     # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode,
    #     # meaning that your network definition must be created with the
    #     # explicitBatch flag set. For more information, see Working With Dynamic Shapes.
    #     explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    #
    #     with trt.Builder(TRT_LOGGER) as builder, \
    #             builder.create_network(explicit_batch) as network, \
    #             trt.OnnxParser(network, TRT_LOGGER) as parser:
    #         # The parser is bound to the graph and fills it while parsing.
    #         builder.max_workspace_size = 1 << 30  # most GPU memory the ICudaEngine may use at execution
    #         builder.max_batch_size = max_batch_size  # largest batch size usable at execution
    #         builder.fp16_mode = fp16_mode
    #
    #         # Parse the onnx file and fill the computation graph.
    #         if not os.path.exists(onnx_file_path):
    #             quit("ONNX file {} not found!".format(onnx_file_path))
    #         print('loading onnx file from path {} ...'.format(onnx_file_path))
    #         with open(onnx_file_path, 'rb') as model:  # binary network structure and weights
    #             print("Beginning onnx file parsing")
    #             parser.parse(model.read())  # parse the onnx file
    #         # parser.parse_from_file(onnx_file_path)  # the parser can also read onnx straight from a file
    #
    #         print("Completed parsing of onnx file")
    #         # With the graph filled, use the builder to create the CudaEngine from it.
    #         print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
    #
    #         print(network.get_layer(network.num_layers - 1).get_output(0).shape)
    #         # network.mark_output(network.get_layer(network.num_layers - 1).get_output(0))
    #         last_layer = network.get_layer(network.num_layers - 1)
    #         network.mark_output(last_layer.get_output(0))
    #         engine = builder.build_cuda_engine(network)  # network is the filled INetworkDefinition
    #         print("Completed creating Engine")
    #         if save_engine:  # save the engine so later runs can deserialize it directly
    #             with open(engine_file_path, 'wb') as f:
    #                 f.write(engine.serialize())  # serialize
    #         return engine


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from the CPU to the GPU (host to device).
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference. When the network was created with an explicit batch size,
    # use execute_async_v2; otherwise use execute_async.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU (device to host).
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


Lr = 0.0002
Epochs = 300
Batch_Size = 64
test_loader = DataLoader(dataset=test_data, batch_size=Batch_Size, shuffle=False)
# img_np_nchw = get_img_np_nchw(filename).astype(np.float32)
# These two modes depend on the hardware
fp16_mode = False
trt_engine_path = "./ConVNet.engine"
# Build a cudaEngine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode)
# After creating the CudaEngine, set up an execution context to run it on the target device
context = engine.create_execution_context()
inputs, outputs, bindings, stream = allocate_buffers(engine)  # inputs, outputs: host buffers; bindings: device addresses
shape_of_output = (64, 4096)
# t_row and t_rt accumulate the pytorch and trt inference times
t_row = 0
t_rt = 0
# Model inputs must be numpy arrays; to time trt more fairly, all batches are
# pulled out of the DataLoader and converted up front.
l = []
for data in test_loader:
    l.append([data[0].numpy(), data[1].numpy(), data[2].numpy(), data[3].numpy()])
    # inputs[1].host = ... for multiple inputs
t1 = time.time()
for data in l:
    inputs[0].host = data[0]
    inputs[1].host = data[1]
    inputs[2].host = data[2]
    inputs[3].host = data[3]
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
    # feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
t2 = time.time()
t_rt += (t2 - t1)
t3 = time.time()
for data in test_loader:
    output = net(data[0].cuda(), data[1].cuda(), data[2].cuda(), data[3].cuda())
t4 = time.time()
t_row += t4 - t3
print('total inference time in pytorch:', t_row)
print('total inference time in trt:', t_rt)
print('TensorRT ok')
# mse = np.mean((feat - feat_2) ** 2)
# print('MSE Error = {}'.format(mse))
print('All completed!')
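The motion feature computed by `get_diff_data` in the script above can be checked in isolation: the frame-to-frame difference drops one frame, and a zero frame pads the result back to the original length. Random data stands in for a real skeleton sequence here.

```python
import numpy as np

# Fake skeleton sequence: 30 frames, 25 joints, 3 coordinates.
body = np.random.randn(30, 25, 3)

motion = np.diff(body, n=1, axis=0)           # first-order difference -> (29, 25, 3)
motion = np.r_[motion, np.zeros((1, 25, 3))]  # pad with a zero frame -> (30, 25, 3)
```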
