add wust typr mpc and mutipule x

2026-03-27 03:41:42 +08:00
parent 2c64655fae
commit 7dcb53bb77
192 changed files with 29571 additions and 9 deletions
--- a/wust_vision-main/cuda_infer/armor_infer.cu
+++ b/wust_vision-main/cuda_infer/armor_infer.cu
@@ -0,0 +1,323 @@
+// armor_cuda_infer.cu
+#include "armor_infer.hpp"
+#include "letter_box.hpp"
+#include <cmath>
+#include <cstdio>
+#include <cuda_fp16.h>
+#include <opencv2/core/hal/interface.h>
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#define CUDA_CHECK(call) \
+    do { \
+        cudaError_t err = call; \
+        if (err != cudaSuccess) { \
+            fprintf( \
+                stderr, \
+                "CUDA error at %s:%d: %s\n", \
+                __FILE__, \
+                __LINE__, \
+                cudaGetErrorString(err) \
+            ); \
+            exit(EXIT_FAILURE); \
+        } \
+    } while (0)
+namespace armor_cuda_infer {
+__global__ void nchw_float_to_hwc_uchar4(
+    const float* __restrict__ src,
+    uchar4* __restrict__ dst,
+    int W,
+    int H,
+    float norm
+) {
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= W || y >= H)
+        return;
+
+    const int idx = y * W + x;
+    const int plane = W * H;
+
+    float r = __ldg(src + idx + plane * 0);
+    float g = __ldg(src + idx + plane * 1);
+    float b = __ldg(src + idx + plane * 2);
+
+    r = fminf(fmaxf(r / norm, 0.f), 255.f);
+    g = fminf(fmaxf(g / norm, 0.f), 255.f);
+    b = fminf(fmaxf(b / norm, 0.f), 255.f);
+
+    dst[idx] = make_uchar4((unsigned char)b, (unsigned char)g, (unsigned char)r, 255);
+}
+
+cv::Mat CudaInfer::tensorToMat(float* d_nchw, int W, int H, float norm, cudaStream_t stream) const {
+    static uchar4* d_hwc = nullptr;
+    static size_t cap = 0;
+
+    const size_t need = W * H * sizeof(uchar4);
+    if (cap < need) {
+        if (d_hwc)
+            cudaFree(d_hwc);
+        cudaMalloc(&d_hwc, need);
+        cap = need;
+    }
+
+    const dim3 block(TILE_W, TILE_H);
+    const dim3 grid((W + block.x - 1) / block.x, (H + block.y - 1) / block.y);
+
+    nchw_float_to_hwc_uchar4<<<grid, block, 0, stream>>>(d_nchw, d_hwc, W, H, norm);
+
+    cv::Mat img(H, W, CV_8UC4);
+
+    cudaMemcpyAsync(img.data, d_hwc, need, cudaMemcpyDeviceToHost, stream);
+
+    // cudaStreamSynchronize(stream);
+    return img;
+}
+
+CudaInfer::CudaInfer() = default;
+CudaInfer::~CudaInfer() {
+    release();
+}
+
+void CudaInfer::init(int max_src_w, int max_src_h, int input_w, int input_h) {
+    input_w_ = input_w;
+    input_h_ = input_h;
+    max_src_h_ = max_src_h;
+    max_src_w_ = max_src_w;
+    rellocMem();
+}
+void CudaInfer::rellocMem() {
+    CUDA_CHECK(cudaMalloc(&d_input_bgr_, max_src_w_ * max_src_h_ * 3 * sizeof(unsigned char)));
+    CUDA_CHECK(cudaMallocPitch(
+        &d_input_bgr_pitched_,
+        &input_pitch_bytes_,
+        max_src_w_ * 3 * sizeof(unsigned char),
+        max_src_h_
+    ));
+    CUDA_CHECK(cudaMalloc(&d_nchw_, input_w_ * input_h_ * 3 * sizeof(float)));
+    printf("Relloc memory for CudaInfer\n");
+}
+void CudaInfer::getOutEnoughMem(int img_w, int img_h) {
+    if (img_w > max_src_w_ || img_h > max_src_h_) {
+        if (img_w > max_src_w_) {
+            max_src_w_ = img_w;
+        }
+        if (img_h > max_src_h_) {
+            max_src_h_ = img_h;
+        }
+        rellocMem();
+    }
+}
+
+void CudaInfer::release() {
+    if (d_input_bgr_)
+        cudaFree(d_input_bgr_), d_input_bgr_ = nullptr;
+    if (d_input_bgr_pitched_)
+        cudaFree(d_input_bgr_pitched_), d_input_bgr_pitched_ = nullptr;
+    if (d_nchw_)
+        cudaFree(d_nchw_), d_nchw_ = nullptr;
+}
+
+float* CudaInfer::preprocess(
+    const unsigned char* input_bgr_host,
+    int img_w,
+    int img_h,
+    float norm,
+    bool swap_rb,
+    Eigen::Matrix3f& tf_matrix,
+    cudaStream_t stream
+) {
+    if (!isInitialized()) {
+        throw std::runtime_error("CudaInfer not initialized properly.");
+    }
+
+    if (!input_bgr_host || !d_input_bgr_ || !d_nchw_) {
+        fprintf(stderr, "[Error] Null pointer in preprocess input\n");
+        return nullptr;
+    }
+    getOutEnoughMem(img_w, img_h);
+    float scale = fminf(input_w_ / (float)img_w, input_h_ / (float)img_h);
+    int rw = round(img_w * scale), rh = round(img_h * scale);
+    int pad_l = (input_w_ - rw) / 2, pad_t = (input_h_ - rh) / 2;
+
+    tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1;
+
+    size_t img_size = img_w * img_h * 3;
+    CUDA_CHECK(
+        cudaMemcpyAsync(d_input_bgr_, input_bgr_host, img_size, cudaMemcpyHostToDevice, stream)
+    );
+
+    dim3 threads(TILE_W, TILE_H);
+    dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H);
+
+    letterbox_kernel_shared<<<blocks, threads, 0, stream>>>(
+        d_input_bgr_,
+        img_w,
+        img_h,
+        d_nchw_,
+        input_w_,
+        input_h_,
+        scale,
+        pad_t,
+        pad_l,
+        norm,
+        swap_rb
+    );
+
+    CUDA_CHECK(cudaGetLastError());
+    return d_nchw_;
+}
+float* CudaInfer::preprocess_gpu(
+    const unsigned char* input_bgr_device,
+    int img_w,
+    int img_h,
+    float norm,
+    bool swap_rb,
+    Eigen::Matrix3f& tf_matrix,
+    cudaStream_t stream
+) {
+    if (!isInitialized()) {
+        throw std::runtime_error("CudaInfer not initialized properly.");
+    }
+
+    if (!input_bgr_device || !d_nchw_) {
+        fprintf(stderr, "[Error] Null pointer in preprocess input\n");
+        return nullptr;
+    }
+    getOutEnoughMem(img_w, img_h);
+    float scale = fminf(input_w_ / (float)img_w, input_h_ / (float)img_h);
+    int rw = round(img_w * scale), rh = round(img_h * scale);
+    int pad_l = (input_w_ - rw) / 2, pad_t = (input_h_ - rh) / 2;
+
+    tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1;
+
+    size_t img_size = img_w * img_h * 3;
+
+    dim3 threads(TILE_W, TILE_H);
+    dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H);
+
+    letterbox_kernel_shared<<<blocks, threads, 0, stream>>>(
+        input_bgr_device,
+        img_w,
+        img_h,
+        d_nchw_,
+        input_w_,
+        input_h_,
+        scale,
+        pad_t,
+        pad_l,
+        norm,
+        swap_rb
+    );
+
+    CUDA_CHECK(cudaGetLastError());
+    return d_nchw_;
+}
+float* CudaInfer::preprocess_pitched(
+    const unsigned char* input_bgr_host,
+    int img_w,
+    int img_h,
+    int host_step,
+    float norm,
+    bool swap_rb,
+    Eigen::Matrix3f& tf_matrix,
+    cudaStream_t stream
+) {
+    if (!isInitialized()) {
+        throw std::runtime_error("CudaInfer not initialized properly.");
+    }
+
+    if (!input_bgr_host || !d_nchw_) {
+        fprintf(stderr, "[Error] Null pointer in preprocess input\n");
+        return nullptr;
+    }
+    getOutEnoughMem(img_w, img_h);
+    float scale = fminf((float)input_w_ / img_w, (float)input_h_ / img_h);
+    int rw = round(img_w * scale);
+    int rh = round(img_h * scale);
+    int pad_l = (input_w_ - rw) / 2;
+    int pad_t = (input_h_ - rh) / 2;
+    tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1;
+    CUDA_CHECK(cudaMemcpy2DAsync(
+        d_input_bgr_pitched_,
+        input_pitch_bytes_,
+        input_bgr_host,
+        host_step,
+        img_w * 3,
+        img_h,
+        cudaMemcpyHostToDevice,
+        stream
+    ));
+    dim3 threads(TILE_W, TILE_H);
+    dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H);
+
+    letterbox_kernel_pitched<<<blocks, threads, 0, stream>>>(
+        d_input_bgr_pitched_,
+        input_pitch_bytes_,
+        img_w,
+        img_h,
+        d_nchw_,
+        input_w_,
+        input_h_,
+        scale,
+        pad_t,
+        pad_l,
+        norm,
+        swap_rb
+    );
+
+    CUDA_CHECK(cudaGetLastError());
+    return d_nchw_;
+}
+float* CudaInfer::preprocess_pitched_gpu(
+    const unsigned char* input_bgr_device,
+    int img_w,
+    int img_h,
+    int input_step,
+    float norm,
+    bool swap_rb,
+    Eigen::Matrix3f& tf_matrix,
+    cudaStream_t stream
+) {
+    if (!isInitialized()) {
+        throw std::runtime_error("CudaInfer not initialized properly.");
+    }
+
+    if (!input_bgr_device || !d_nchw_) {
+        fprintf(stderr, "[Error] Null pointer in preprocess_pitched_gpu\n");
+        return nullptr;
+    }
+    getOutEnoughMem(img_w, img_h);
+    float scale = fminf(static_cast<float>(input_w_) / img_w, static_cast<float>(input_h_) / img_h);
+
+    int rw = static_cast<int>(roundf(img_w * scale));
+    int rh = static_cast<int>(roundf(img_h * scale));
+
+    int pad_l = (input_w_ - rw) / 2;
+    int pad_t = (input_h_ - rh) / 2;
+
+    tf_matrix << 1.f / scale, 0.f, -pad_l / scale, 0.f, 1.f / scale, -pad_t / scale, 0.f, 0.f, 1.f;
+
+    dim3 threads(TILE_W, TILE_H);
+    dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H);
+
+    letterbox_kernel_pitched<<<blocks, threads, 0, stream>>>(
+        input_bgr_device,
+        input_step,
+        img_w,
+        img_h,
+        d_nchw_,
+        input_w_,
+        input_h_,
+        scale,
+        pad_t,
+        pad_l,
+        norm,
+        swap_rb
+    );
+
+    CUDA_CHECK(cudaGetLastError());
+
+    return d_nchw_;
+}
+
+} // namespace armor_cuda_infer