import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import torch
import cv2
import torchvision
import time
import operator
from warpaffine import Warpaffine
class gpu_decode(object):
def __init__(self, rows, cols, confidence_threshold = 0.6,nms_threshold = 0.45,model="yolov5",stream=None):
    """Prepare launch configuration, kernel arguments and output buffers.

    Args:
        rows: Number of candidate boxes in the prediction; passed to the
            kernels as ``num_bboxes`` and used to size the decode launch.
        cols: Number of values per candidate box. The class count is
            derived as ``cols - 5`` for "yolov5" and ``cols - 4`` for
            "yolov8".
        confidence_threshold: Threshold wrapped as a float32 kernel argument.
        nms_threshold: Threshold wrapped as a float32 kernel argument.
        model: Either "yolov5" or "yolov8".
        stream: Optional CUDA stream to use; a new ``cuda.Stream`` is
            created when omitted.

    Raises:
        ValueError: If ``model`` is not one of the supported names (the
            class count could not be derived otherwise).
    """
    super(gpu_decode, self).__init__()
    self.rows = rows
    self.cols = cols
    self.model = model

    # 1-D launch configuration for the decode kernel: at most 512 threads
    # per block, with enough blocks to cover every row (ceil-division).
    threads_per_block = 512 if rows > 512 else rows
    num_blocks = (rows + threads_per_block - 1) // threads_per_block
    self.block = (threads_per_block, 1, 1)
    self.grid = (num_blocks, 1, 1)

    max_objects = 1000
    num_box_element = 7  # left, top, right, bottom, confidence, class, keepflag

    # Scalar kernel arguments are wrapped in one-element arrays because the
    # kernels receive them as pointers.
    self.num_bboxes = cuda.In(np.array([rows]).astype(np.int32))
    if self.model == "yolov5":
        self.num_classes = cuda.In(np.array([cols-5]).astype(np.int32))
    elif self.model == "yolov8":
        self.num_classes = cuda.In(np.array([cols-4]).astype(np.int32))
    else:
        # Without this, num_classes would stay undefined and the failure
        # would only surface later as an AttributeError.
        raise ValueError(
            "unsupported model %r: expected 'yolov5' or 'yolov8'" % (model,)
        )
    self.confidence_threshold = cuda.In(np.array([confidence_threshold]).astype(np.float32))
    self.nms_threshold = cuda.In(np.array([nms_threshold]).astype(np.float32))

    # Launch configuration for the NMS kernel. Integer ceil-division is
    # required here: true division would produce a float block count.
    nms_threads = 512 if max_objects > 512 else max_objects
    nms_blocks = (max_objects + nms_threads - 1) // nms_threads
    self.nms_block = (nms_threads, 1, 1)
    self.nms_grid = (nms_blocks, 1, 1)

    if stream is None:
        self.stream = cuda.Stream()
    else:
        self.stream = stream

    # self.predict_host = cuda.register_host_memory(np.ones((1,self.rows,self.cols)).astype(np.float32))
    # self.predict_device = cuda.mem_alloc(self.predict_host.nbytes)

    # Page-locked host buffer of shape (max_objects, num_box_element) in
    # float32, plus a device allocation of the same size in bytes.
    self.output_host = cuda.pagelocked_empty_like(np.ones((max_objects, num_box_element)).astype(np.float32))
    self.output_device_nbytes = self.output_host.nbytes
    self.output_device = cuda.mem_alloc(self.output_device_nbytes)

    # From here on these attributes hold the wrapped kernel arguments,
    # not the plain Python integers.
    self.max_objects = cuda.In(np.array([max_objects]).astype(np.int32))
    self.NUM_BOX_ELEMENT = cuda.In(np.array([num_box_element]).astype(np.int32))

    # Counter for the number of boxes remaining after the first filtering pass.
    self.filter_boxs = np.array([0]).astype(np.uint32)
    self.decode_kernel,self.fast_nms_kernel = self.cuda_func()
def cuda_func(self):
mod = SourceModule("""
__device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
*ox = matrix[0] * x + matrix[1] * y + matrix[2];
*oy = matrix[3] * x + matrix[4] * y + matrix[5];
}
__global__ void decode_kernelv5(
float* predict, int* num_bboxes, int* num_classes, float* confidence_threshold,
float* invert_affine_matrix, float* parray, int* max_objects, int* filter_boxs, int* NUM_BOX_ELEMENT
)
{
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= *num_bboxes) return;
float* pitem = predict + (5 + *num_classes) * position;
float objectness = pitem[4];
if(objectness < *confidence_threshold)
return;
float* class_confidence = pitem + 5;
float confidence = *class_confidence++;
int label = 0;
for(int i = 1; i < *num_classes; ++i, ++class_confidence){
if(*class_confidence > confidence){
confidence = *class_confidence;
label = i;
}
}
confidence *= objectness;
if(confidence < *confidence_threshold)
return;
int index = atomicAdd(filter_boxs, 1);
if(index >= *max_objects)
return;
float cx = *pitem++;
float cy = *pitem++;
float width = *pitem++;
float height = *pitem++;
float left = cx - width * 0.5f;
float top = cy - height * 0.5f;
float right = cx + width * 0.5f;
float bottom = cy + height * 0.5f;
affine_project(invert_affine_matrix, left, top, &left, &top);
affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
// left, top, right, bottom, confidence, class, keepflag
float* pout_item = parray + index * (*NUM_BOX_ELEMENT);
*pout_item++ = left;
*pout_item++ = top;
*pout_item++ = right;
*pout_item++ = bottom;
*pout_item++ = confidence;
*pout_item++ = label;
*pout_item++ = 1; // 1 = keep, 0 = ignore
}
__global__ void decode_kernelv8(
float* predict, int* num_bboxes, int* num_classes, float* confidence_threshold,
float* invert_affine_matrix, float* parray, int* max_objects, int* filter_boxs, int* NUM_BOX_ELEMENT
)
{
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= *num_bboxes) return;
float* pitem = predict + (4 + *num_classes) * position;
float* class_confidence = pitem + 4;
float confidence = *class_confidence++;
int label = 0;
for(int i = 1; i < *num_classes; ++i, ++class_confidence){
if(*class_confidence > confidence){
confidence = *class_confidence;
label = i;
}
}
if(confidence < *confidence_threshold)
return;
int index = atomicAdd(filter_boxs, 1);
if(index >= *max_objects)
return;
float cx = *pitem++;
float cy = *pitem++;
float width = *pitem++;
float height = *pitem++;
float left = cx - width * 0.5f;
float top = cy - height * 0.5f;
float right = cx + width * 0.5f;
float bottom = cy + height * 0.5f;
affine_project(invert_affine_matrix, left, top, &left, &top);
affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
// left, top, right, bottom, confidence, class, keepflag
float* pout_item = parray + index * (*NUM_BOX_ELEMENT);
*pout_item++ = left;
*pout_item++ = top;
*pout_item++ = right;
*pout_item++ = bottom;
*pout_item++ = confidence;
*pout_item++ = label;
*pout_item++ = 1; // 1 = keep, 0 = ignore
}
__device__ float box_iou(
float aleft, float atop, float aright, float abottom,
float bleft, float btop, float bright, float bbottom
){
float cleft = max(aleft, bleft);
float ctop = max(atop, btop);
float cright = min(aright, bright);
float cbottom = min(abottom, bbottom);
float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
if(c_area == 0.0f)
return 0.0f;
float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
return c_area / (a_area + b_area - c_area);
}
__global__ void fast_nms_kernel(float* bboxes, int*filter_boxs, int* max_objects, float* threshold, int* NUM_BOX_ELEMENT){
int position = (blockDim.x * blockIdx.x + threadIdx.x);
int count = min(*filter_boxs, *max_objects);
if (position >= count)
return;
// left, top, right, bottom, confidence, class, keepflag
float* pcurrent = bboxes + position * (*NUM_BOX_ELEMENT);
使用pycuda对图片进行预处理.zip
版权申诉
68 浏览量
2024-03-02
15:11:19
上传
评论
收藏 75.58MB ZIP 举报
博士僧小星
- 粉丝: 1923
- 资源: 5884