// armor_cuda_infer.cu #include "armor_infer.hpp" #include "letter_box.hpp" #include #include #include #include #include #include #define CUDA_CHECK(call) \ do { \ cudaError_t err = call; \ if (err != cudaSuccess) { \ fprintf( \ stderr, \ "CUDA error at %s:%d: %s\n", \ __FILE__, \ __LINE__, \ cudaGetErrorString(err) \ ); \ exit(EXIT_FAILURE); \ } \ } while (0) namespace armor_cuda_infer { __global__ void nchw_float_to_hwc_uchar4( const float* __restrict__ src, uchar4* __restrict__ dst, int W, int H, float norm ) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= W || y >= H) return; const int idx = y * W + x; const int plane = W * H; float r = __ldg(src + idx + plane * 0); float g = __ldg(src + idx + plane * 1); float b = __ldg(src + idx + plane * 2); r = fminf(fmaxf(r / norm, 0.f), 255.f); g = fminf(fmaxf(g / norm, 0.f), 255.f); b = fminf(fmaxf(b / norm, 0.f), 255.f); dst[idx] = make_uchar4((unsigned char)b, (unsigned char)g, (unsigned char)r, 255); } cv::Mat CudaInfer::tensorToMat(float* d_nchw, int W, int H, float norm, cudaStream_t stream) const { static uchar4* d_hwc = nullptr; static size_t cap = 0; const size_t need = W * H * sizeof(uchar4); if (cap < need) { if (d_hwc) cudaFree(d_hwc); cudaMalloc(&d_hwc, need); cap = need; } const dim3 block(TILE_W, TILE_H); const dim3 grid((W + block.x - 1) / block.x, (H + block.y - 1) / block.y); nchw_float_to_hwc_uchar4<<>>(d_nchw, d_hwc, W, H, norm); cv::Mat img(H, W, CV_8UC4); cudaMemcpyAsync(img.data, d_hwc, need, cudaMemcpyDeviceToHost, stream); // cudaStreamSynchronize(stream); return img; } CudaInfer::CudaInfer() = default; CudaInfer::~CudaInfer() { release(); } void CudaInfer::init(int max_src_w, int max_src_h, int input_w, int input_h) { input_w_ = input_w; input_h_ = input_h; max_src_h_ = max_src_h; max_src_w_ = max_src_w; rellocMem(); } void CudaInfer::rellocMem() { CUDA_CHECK(cudaMalloc(&d_input_bgr_, max_src_w_ * max_src_h_ * 3 * sizeof(unsigned char))); CUDA_CHECK(cudaMallocPitch( &d_input_bgr_pitched_, &input_pitch_bytes_, max_src_w_ * 3 * sizeof(unsigned char), max_src_h_ )); CUDA_CHECK(cudaMalloc(&d_nchw_, input_w_ * input_h_ * 3 * sizeof(float))); printf("Relloc memory for CudaInfer\n"); } void CudaInfer::getOutEnoughMem(int img_w, int img_h) { if (img_w > max_src_w_ || img_h > max_src_h_) { if (img_w > max_src_w_) { max_src_w_ = img_w; } if (img_h > max_src_h_) { max_src_h_ = img_h; } rellocMem(); } } void CudaInfer::release() { if (d_input_bgr_) cudaFree(d_input_bgr_), d_input_bgr_ = nullptr; if (d_input_bgr_pitched_) cudaFree(d_input_bgr_pitched_), d_input_bgr_pitched_ = nullptr; if (d_nchw_) cudaFree(d_nchw_), d_nchw_ = nullptr; } float* CudaInfer::preprocess( const unsigned char* input_bgr_host, int img_w, int img_h, float norm, bool swap_rb, Eigen::Matrix3f& tf_matrix, cudaStream_t stream ) { if (!isInitialized()) { throw std::runtime_error("CudaInfer not initialized properly."); } if (!input_bgr_host || !d_input_bgr_ || !d_nchw_) { fprintf(stderr, "[Error] Null pointer in preprocess input\n"); return nullptr; } getOutEnoughMem(img_w, img_h); float scale = fminf(input_w_ / (float)img_w, input_h_ / (float)img_h); int rw = round(img_w * scale), rh = round(img_h * scale); int pad_l = (input_w_ - rw) / 2, pad_t = (input_h_ - rh) / 2; tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1; size_t img_size = img_w * img_h * 3; CUDA_CHECK( cudaMemcpyAsync(d_input_bgr_, input_bgr_host, img_size, cudaMemcpyHostToDevice, stream) ); dim3 threads(TILE_W, TILE_H); dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H); letterbox_kernel_shared<<>>( d_input_bgr_, img_w, img_h, d_nchw_, input_w_, input_h_, scale, pad_t, pad_l, norm, swap_rb ); CUDA_CHECK(cudaGetLastError()); return d_nchw_; } float* CudaInfer::preprocess_gpu( const unsigned char* input_bgr_device, int img_w, int img_h, float norm, bool swap_rb, Eigen::Matrix3f& tf_matrix, cudaStream_t stream ) { if (!isInitialized()) { throw std::runtime_error("CudaInfer not initialized properly."); } if (!input_bgr_device || !d_nchw_) { fprintf(stderr, "[Error] Null pointer in preprocess input\n"); return nullptr; } getOutEnoughMem(img_w, img_h); float scale = fminf(input_w_ / (float)img_w, input_h_ / (float)img_h); int rw = round(img_w * scale), rh = round(img_h * scale); int pad_l = (input_w_ - rw) / 2, pad_t = (input_h_ - rh) / 2; tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1; size_t img_size = img_w * img_h * 3; dim3 threads(TILE_W, TILE_H); dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H); letterbox_kernel_shared<<>>( input_bgr_device, img_w, img_h, d_nchw_, input_w_, input_h_, scale, pad_t, pad_l, norm, swap_rb ); CUDA_CHECK(cudaGetLastError()); return d_nchw_; } float* CudaInfer::preprocess_pitched( const unsigned char* input_bgr_host, int img_w, int img_h, int host_step, float norm, bool swap_rb, Eigen::Matrix3f& tf_matrix, cudaStream_t stream ) { if (!isInitialized()) { throw std::runtime_error("CudaInfer not initialized properly."); } if (!input_bgr_host || !d_nchw_) { fprintf(stderr, "[Error] Null pointer in preprocess input\n"); return nullptr; } getOutEnoughMem(img_w, img_h); float scale = fminf((float)input_w_ / img_w, (float)input_h_ / img_h); int rw = round(img_w * scale); int rh = round(img_h * scale); int pad_l = (input_w_ - rw) / 2; int pad_t = (input_h_ - rh) / 2; tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1; CUDA_CHECK(cudaMemcpy2DAsync( d_input_bgr_pitched_, input_pitch_bytes_, input_bgr_host, host_step, img_w * 3, img_h, cudaMemcpyHostToDevice, stream )); dim3 threads(TILE_W, TILE_H); dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H); letterbox_kernel_pitched<<>>( d_input_bgr_pitched_, input_pitch_bytes_, img_w, img_h, d_nchw_, input_w_, input_h_, scale, pad_t, pad_l, norm, swap_rb ); CUDA_CHECK(cudaGetLastError()); return d_nchw_; } float* CudaInfer::preprocess_pitched_gpu( const unsigned char* input_bgr_device, int img_w, int img_h, int input_step, float norm, bool swap_rb, Eigen::Matrix3f& tf_matrix, cudaStream_t stream ) { if (!isInitialized()) { throw std::runtime_error("CudaInfer not initialized properly."); } if (!input_bgr_device || !d_nchw_) { fprintf(stderr, "[Error] Null pointer in preprocess_pitched_gpu\n"); return nullptr; } getOutEnoughMem(img_w, img_h); float scale = fminf(static_cast(input_w_) / img_w, static_cast(input_h_) / img_h); int rw = static_cast(roundf(img_w * scale)); int rh = static_cast(roundf(img_h * scale)); int pad_l = (input_w_ - rw) / 2; int pad_t = (input_h_ - rh) / 2; tf_matrix << 1.f / scale, 0.f, -pad_l / scale, 0.f, 1.f / scale, -pad_t / scale, 0.f, 0.f, 1.f; dim3 threads(TILE_W, TILE_H); dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H); letterbox_kernel_pitched<<>>( input_bgr_device, input_step, img_w, img_h, d_nchw_, input_w_, input_h_, scale, pad_t, pad_l, norm, swap_rb ); CUDA_CHECK(cudaGetLastError()); return d_nchw_; } } // namespace armor_cuda_infer