#!/usr/bin/env python3

import os
import numpy as np
import time

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.tools


## Model properties:
#
MODEL_PATH = os.environ['CK_ENV_TENSORRT_MODEL_FILENAME']
MODEL_PLUGIN_PATH = os.getenv('CK_ENV_TENSORRT_PLUGIN_PATH', os.getenv('ML_MODEL_TENSORRT_PLUGIN', ''))
MODEL_USE_DLA = os.getenv('ML_MODEL_USE_DLA', 'NO') in ('YES', 'yes', 'ON', 'on', '1')
MODEL_SOFTMAX_LAYER = os.getenv('CK_ENV_ONNX_MODEL_OUTPUT_LAYER_NAME',
                                os.getenv('CK_ENV_TENSORFLOW_MODEL_OUTPUT_LAYER_NAME', ''))

## Processing in batches:
#
BATCH_SIZE = int(os.getenv('CK_BATCH_SIZE', 1))


if MODEL_PLUGIN_PATH:
    import ctypes
    if not os.path.isfile(MODEL_PLUGIN_PATH):
        raise IOError("{}\n{}\n".format(
            "Failed to load library ({}).".format(MODEL_PLUGIN_PATH),
            "Please build the plugin."
        ))
    ctypes.CDLL(MODEL_PLUGIN_PATH)


def initialize_predictor():
    global pycuda_context
    global d_inputs, h_d_outputs, h_output, model_bindings, cuda_stream
    global input_volume, output_volume
    global trt_context
    global BATCH_SIZE
    global max_batch_size
    global trt_version

    # Load the TensorRT engine from file:
    pycuda_context = pycuda.tools.make_default_context()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    try:
        trt.init_libnvinfer_plugins(TRT_LOGGER, "")
        with open(MODEL_PATH, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            serialized_engine = f.read()
            trt_engine = runtime.deserialize_cuda_engine(serialized_engine)
            trt_version = [int(v) for v in trt.__version__.split('.')]
            print('[TensorRT v{}.{}] successfully loaded'.format(trt_version[0], trt_version[1]))
    except Exception:
        pycuda_context.pop()
        raise RuntimeError('TensorRT model file {} is not found or corrupted'.format(MODEL_PATH))

    max_batch_size = trt_engine.max_batch_size

    if trt_version[0] >= 7 and BATCH_SIZE > 1:
        pycuda_context.pop()
        raise RuntimeError("Desired batch_size ({}) is not yet supported in TensorRT {}".format(BATCH_SIZE, trt_version[0]))

    if BATCH_SIZE > max_batch_size:
        pycuda_context.pop()
        raise RuntimeError("Desired batch_size ({}) exceeds max_batch_size of the model ({})".format(BATCH_SIZE, max_batch_size))

    trt_context = trt_engine.create_execution_context()

    # Allocate device memory for every input/output binding of the engine:
    d_inputs, h_d_outputs, model_bindings = [], [], []
    for interface_layer in trt_engine:
        idx = trt_engine.get_binding_index(interface_layer)
        dtype = trt_engine.get_binding_dtype(interface_layer)
        shape = [abs(i) for i in trt_engine.get_binding_shape(interface_layer)]    # a mutable list, as CHW4 inputs may need their channel dimension padded below
        fmt = trt_engine.get_binding_format(idx) if trt_version[0] >= 6 else None

        if fmt and fmt == trt.TensorFormat.CHW4 and trt_engine.binding_is_input(interface_layer):
            shape[-3] = ((shape[-3] - 1) // 4 + 1) * 4    # round the channel dimension up to a multiple of 4

        size = trt.volume(shape) * max_batch_size
        dev_mem = cuda.mem_alloc(size * dtype.itemsize)
        model_bindings.append(int(dev_mem))

        if trt_engine.binding_is_input(interface_layer):
            if trt_version[0] >= 6:
                trt_context.set_binding_shape(idx, shape)
            interface_type = 'Input'
            d_inputs.append(dev_mem)
            model_input_shape = shape
        else:
            interface_type = 'Output'
            host_mem = cuda.pagelocked_empty(size, trt.nptype(dtype))
            h_d_outputs.append({'host_mem': host_mem, 'dev_mem': dev_mem})
            if MODEL_SOFTMAX_LAYER == '' or interface_layer == MODEL_SOFTMAX_LAYER:
                model_output_shape = shape
                h_output = host_mem

        print("{} layer {}: dtype={}, shape={}, elements_per_max_batch={}".format(interface_type, interface_layer, dtype, shape, size))

    cuda_stream = cuda.Stream()

    input_volume = trt.volume(model_input_shape)      # total number of monochromatic subpixels (before batching)
    output_volume = trt.volume(model_output_shape)    # total number of elements in one image prediction (before batching)
    num_layers = trt_engine.num_layers

    return pycuda_context, max_batch_size, input_volume, output_volume, num_layers


def inference_for_given_batch(batch_data):
    global d_inputs, h_d_outputs, h_output, model_bindings, cuda_stream
    global trt_context
    global max_batch_size
    global trt_version

    actual_batch_size = len(batch_data)

    if MODEL_USE_DLA and max_batch_size > actual_batch_size:
        # DLA runs the engine at a fixed batch size, so pad an incomplete batch with zero images:
        batch_data = np.pad(batch_data, ((0, max_batch_size - actual_batch_size), (0, 0), (0, 0), (0, 0)), 'constant')
        pseudo_batch_size = max_batch_size
    else:
        pseudo_batch_size = actual_batch_size

    flat_batch = np.ravel(batch_data)

    begin_inference_timestamp = time.time()

    cuda.memcpy_htod_async(d_inputs[0], flat_batch, cuda_stream)    # assuming one input layer for image classification

    if trt_version[0] >= 7:
        trt_context.execute_async_v2(bindings=model_bindings, stream_handle=cuda_stream.handle)
    else:
        trt_context.execute_async(bindings=model_bindings, batch_size=pseudo_batch_size, stream_handle=cuda_stream.handle)

    for output in h_d_outputs:
        cuda.memcpy_dtoh_async(output['host_mem'], output['dev_mem'], cuda_stream)
    cuda_stream.synchronize()

    inference_time_s = time.time() - begin_inference_timestamp

    ## The first dimension contains actual_batch_size vectors; the further format depends on the task:
    #
    trimmed_batch_results = np.split(h_output, max_batch_size)[:actual_batch_size]

    return trimmed_batch_results, inference_time_s
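

## A minimal usage sketch (not part of the original harness): it assumes the
## CK_ENV_TENSORRT_MODEL_FILENAME engine is an ImageNet-style classifier that
## takes float32 NCHW input of shape (3, 224, 224); adjust the dummy batch to
## the actual Input shape printed by initialize_predictor().
#
if __name__ == '__main__':
    initialize_predictor()

    dummy_batch = np.zeros((BATCH_SIZE, 3, 224, 224), dtype=np.float32)    # placeholder data instead of real preprocessed images
    batch_results, inference_time_s = inference_for_given_batch(dummy_batch)

    print("Inference took {:.4f} s; each of the {} result vector(s) has {} elements".format(
        inference_time_s, len(batch_results), len(batch_results[0])))

    pycuda_context.pop()    # release the CUDA context created by initialize_predictor()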