import numpy as np
from cuda import cudart
import tensorrt as trt
# Problem definition for the convolution sample: tensor dimensions, input
# data, convolution weights and bias. Everything here is host-side NumPy.
nIn, cIn, hIn, wIn = 1, 1, 6, 9  # input tensor dimensions, NCHW
cOut, hW, wW = 1, 3, 3  # convolution weights: number of output channels, kernel height and width
# Input data: an hW x wW tile holding 1 .. hW*wW (row-major) repeated to fill
# every hIn x wIn image. hIn and wIn must be multiples of hW and wW, otherwise
# the final reshape fails.
data = np.tile(
    np.arange(1, 1 + hW * wW, dtype=np.float32).reshape(hW, wW),
    (cIn, hIn // hW, wIn // wW),
).reshape(1, cIn, hIn, wIn)
# Convolution weights: descending powers of ten (1e4 down to 1e-4), one per
# kernel element.
weight = np.power(10, range(4, -5, -1), dtype=np.float32).reshape(cOut, hW, wW)
bias = np.zeros(cOut, dtype=np.float32)  # convolution bias (all zeros)
np.set_printoptions(precision=8, linewidth=200, suppress=True)
# Build a TensorRT engine containing a single convolution layer and create an
# execution context for it.
# Blocks until the device is idle; called before any TensorRT object is
# created (presumably also to force CUDA initialization -- TODO confirm).
cudart.cudaDeviceSynchronize()
logger = trt.Logger(trt.Logger.ERROR)  # report errors only
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
config = builder.create_builder_config()
inputT0 = network.add_input('inputT0', trt.DataType.FLOAT, (nIn, cIn, hIn, wIn))
#------------------------------------------------------------------------------# replaceable section (begin)
convolutionLayer = network.add_convolution_nd(inputT0, cOut, (hW, wW), weight, bias)
#------------------------------------------------------------------------------# replaceable section (end)
network.mark_output(convolutionLayer.get_output(0))
engineString = builder.build_serialized_network(network, config)
# The builder reports failure by returning None; stop here with a clear
# message instead of failing later inside deserialize_cuda_engine.
if engineString is None:
    raise RuntimeError("Failed to build the serialized TensorRT engine")
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
context = engine.create_execution_context()
# Run one inference: allocate host/device buffers, upload the input, enqueue
# the engine, download the result, then release every CUDA resource.
_, stream = cudart.cudaStreamCreate()
inputH0 = np.ascontiguousarray(data.reshape(-1))  # flat, contiguous host copy of the input
# Host output buffer sized and typed from binding 1 (the network output).
outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
_, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
_, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)
cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
context.execute_async_v2([int(inputD0), int(outputD0)], stream)
# Copy the result back; without this outputH0 keeps its uninitialized
# np.empty contents.
cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
# Everything above is only enqueued on the stream; wait for it to finish
# before outputH0 is read or any buffer is released.
cudart.cudaStreamSynchronize(stream)
# Release the device buffers and the stream so nothing is leaked.
cudart.cudaFree(inputD0)
cudart.cudaFree(outputD0)
cudart.cudaStreamDestroy(stream)
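# Page-footer residue from the web page this script was copied from
# ("Comments: 0", "Latest resources"); not part of the script.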