import numpy as np
from cuda import cudart
import tensorrt as trt
# Input tensor dimensions, in NCHW order.
nIn, cIn, hIn, wIn = 1, 1, 6, 9
# Pooling window size, as (height, width).
hW, wW = 2, 2
# Input data: the 3x3 pattern 1..9 (float32) tiled hIn // 3 times along H and
# wIn // 3 times along W, giving an array of shape (nIn, cIn, hIn, wIn).
data = np.tile(np.arange(1, 1 + 9, dtype=np.float32).reshape(1, 3, 3), (nIn, cIn, hIn // 3, wIn // 3))

# Wide, non-scientific printing so whole rows of the arrays fit on one line.
np.set_printoptions(precision=8, linewidth=200, suppress=True)
# NOTE(review): presumably called up front to make sure the CUDA runtime is
# initialized before TensorRT is used — confirm.
cudart.cudaDeviceSynchronize()
# Logger that reports only messages of ERROR severity.
logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
# Create the network with the EXPLICIT_BATCH creation flag set.
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
config = builder.create_builder_config()
# Single float input whose shape is the NCHW tuple defined above.
inputT0 = network.add_input('inputT0', trt.DataType.FLOAT, (nIn, cIn, hIn, wIn))
#------------------------------------------------------------------------------# replaceable section
# Max pooling over inputT0 with an (hW, wW) window.
poolLayer = network.add_pooling_nd(inputT0, trt.PoolingType.MAX, (hW, wW))
#------------------------------------------------------------------------------# replaceable section
# Expose the pooling layer's first output as the network output.
network.mark_output(poolLayer.get_output(0))
# Build the serialized engine. A failed build yields None, so fail loudly here
# instead of passing None on to the deserializer.
engineString = builder.build_serialized_network(network, config)
if engineString is None:
    raise RuntimeError("Failed to build the serialized TensorRT network")
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
if engine is None:
    raise RuntimeError("Failed to deserialize the TensorRT engine")
context = engine.create_execution_context()
# The CUDA status code returned alongside each result is discarded.
_, stream = cudart.cudaStreamCreate()
# Host buffers: the input flattened to a contiguous 1-D array, and an
# uninitialized output array shaped and typed after binding index 1
# (presumably the network's only output, following input binding 0 — confirm).
inputH0 = np.ascontiguousarray(data.reshape(-1))
outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
# Device buffers of matching byte size, allocated asynchronously on `stream`.
_, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
_, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)
# [Web-page residue, not part of the script: "Comments 0" / "Latest resources"]