add wust type mpc and multiple x
This commit is contained in:
61
wust_vision-main/cuda_infer/CMakeLists.txt
Normal file
61
wust_vision-main/cuda_infer/CMakeLists.txt
Normal file
@@ -0,0 +1,61 @@
|
||||
# Build script for the cuda_infer static library (CUDA pre-processing kernels).
#
# find_package(CUDAToolkit) was introduced in CMake 3.17 and policy CMP0079 in
# 3.13, so 3.17 is the lowest version this script can actually run on.
cmake_minimum_required(VERSION 3.17)
cmake_policy(SET CMP0079 NEW)

project(cuda_infer LANGUAGES CXX CUDA)

# Language standards
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)

# Default to Release, but keep a build type already chosen by the user or by a
# parent project instead of silently overriding it.
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release")
endif()

# Silence warnings about deprecated APIs
add_compile_options(-Wno-deprecated-declarations)

# Disable .rsp response files (avoids nvcc errors)
set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_OBJECTS OFF)
set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES OFF)

# Locate dependencies
# NOTE(review): TensorRT_INCLUDE_DIR and the TensorRT::TensorRT target used
# below are not defined in this file; presumably the parent project provides
# them — confirm.
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)
find_package(Eigen3 REQUIRED)

# Collect sources
file(GLOB_RECURSE CUDA_INFER_SRC
    ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/*.cu
)

# Static library target
add_library(cuda_infer STATIC ${CUDA_INFER_SRC})
set_target_properties(cuda_infer PROPERTIES POSITION_INDEPENDENT_CODE ON)

# Include directories
target_include_directories(cuda_infer PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CUDAToolkit_INCLUDE_DIRS}
    ${TensorRT_INCLUDE_DIR}
    ${EIGEN3_INCLUDE_DIRS}
    ${OpenCV_INCLUDE_DIRS}
)

# CUDA compile options (applied to CUDA sources only)
target_compile_options(cuda_infer PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:
    --generate-code=arch=compute_86,code=sm_86
    -Xcompiler=-fPIC
    -O3
    -w
    -Wno-deprecated-gpu-targets
    -Wno-error=deprecated-declarations
    >
)

# Link libraries
target_link_libraries(cuda_infer PRIVATE
    ${OpenCV_LIBS}
    CUDA::cudart
    TensorRT::TensorRT
)
|
||||
323
wust_vision-main/cuda_infer/armor_infer.cu
Normal file
323
wust_vision-main/cuda_infer/armor_infer.cu
Normal file
@@ -0,0 +1,323 @@
|
||||
// armor_cuda_infer.cu
|
||||
#include "armor_infer.hpp"
|
||||
#include "letter_box.hpp"
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cuda_fp16.h>
|
||||
#include <opencv2/core/hal/interface.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/sort.h>
|
||||
// Evaluates a CUDA runtime call exactly once. On failure it prints the source
// location and the CUDA error string to stderr and terminates the process
// with EXIT_FAILURE.
//
// The argument is parenthesized and the temporary has a macro-specific name,
// so an expression that itself mentions a variable called `err`
// (e.g. `CUDA_CHECK(err)`) is not shadowed by the macro's local.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        const cudaError_t cuda_check_status_ = (call);                    \
        if (cuda_check_status_ != cudaSuccess) {                          \
            fprintf(                                                      \
                stderr,                                                   \
                "CUDA error at %s:%d: %s\n",                              \
                __FILE__,                                                 \
                __LINE__,                                                 \
                cudaGetErrorString(cuda_check_status_)                    \
            );                                                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
|
||||
namespace armor_cuda_infer {
|
||||
// Converts a planar three-channel float image into interleaved 4-byte pixels.
//
// Plane 0/1/2 of `src` are read as r/g/b. Each value is divided by `norm`,
// clamped to [0, 255], truncated to unsigned char and stored as
// (b, g, r, 255) in `dst`.
//
// Launch layout: 2D grid, one thread per pixel; threads outside W x H do
// nothing.
__global__ void nchw_float_to_hwc_uchar4(
    const float* __restrict__ src,
    uchar4* __restrict__ dst,
    int W,
    int H,
    float norm
) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < W && row < H) {
        const int pixel = row * W + col;
        const int plane_size = W * H;

        // Read-only cached loads of the three channel planes.
        const float raw_r = __ldg(src + pixel + plane_size * 0);
        const float raw_g = __ldg(src + pixel + plane_size * 1);
        const float raw_b = __ldg(src + pixel + plane_size * 2);

        const float red = fminf(fmaxf(raw_r / norm, 0.f), 255.f);
        const float green = fminf(fmaxf(raw_g / norm, 0.f), 255.f);
        const float blue = fminf(fmaxf(raw_b / norm, 0.f), 255.f);

        dst[pixel] =
            make_uchar4((unsigned char)blue, (unsigned char)green, (unsigned char)red, 255);
    }
}
|
||||
|
||||
// Converts a device-resident planar float tensor into a host cv::Mat
// (CV_8UC4, channel order b, g, r, 255) via nchw_float_to_hwc_uchar4.
//
// Parameters:
//   d_nchw  device pointer to three consecutive W*H float planes
//   W, H    image width and height in pixels
//   norm    divisor applied to every channel by the conversion kernel
//   stream  stream on which the kernel and the device-to-host copy run
//
// Returns the converted image, or an empty cv::Mat when d_nchw is null or
// W/H is not positive. The stream is synchronized before returning so the
// returned pixels are complete regardless of how the Mat memory was allocated.
// Any CUDA failure terminates the process through CUDA_CHECK.
cv::Mat CudaInfer::tensorToMat(float* d_nchw, int W, int H, float norm, cudaStream_t stream) const {
    // Device staging buffer kept across calls: grown on demand, never shrunk
    // and never freed here.
    // NOTE(review): function-local statics are shared by all instances and
    // callers; this looks unsafe for concurrent use — confirm single-threaded use.
    static uchar4* d_hwc = nullptr;
    static size_t cap = 0;

    if (!d_nchw || W <= 0 || H <= 0) {
        return cv::Mat();
    }

    // Widen before multiplying so large images cannot overflow int.
    const size_t need = static_cast<size_t>(W) * static_cast<size_t>(H) * sizeof(uchar4);
    if (cap < need) {
        if (d_hwc) {
            CUDA_CHECK(cudaFree(d_hwc));
            // Forget the old buffer first so a failed allocation below cannot
            // leave a dangling pointer paired with a stale capacity.
            d_hwc = nullptr;
            cap = 0;
        }
        CUDA_CHECK(cudaMalloc(&d_hwc, need));
        cap = need;
    }

    const dim3 block(TILE_W, TILE_H);
    const dim3 grid((W + block.x - 1) / block.x, (H + block.y - 1) / block.y);

    nchw_float_to_hwc_uchar4<<<grid, block, 0, stream>>>(d_nchw, d_hwc, W, H, norm);
    CUDA_CHECK(cudaGetLastError());

    cv::Mat img(H, W, CV_8UC4);

    CUDA_CHECK(cudaMemcpyAsync(img.data, d_hwc, need, cudaMemcpyDeviceToHost, stream));

    // The caller reads img right away, so wait for the copy to finish.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    return img;
}
|
||||
|
||||
// Construction allocates nothing; device buffers are created by init().
CudaInfer::CudaInfer() = default;

// Frees any device buffers that are still held.
CudaInfer::~CudaInfer() {
    this->release();
}
|
||||
|
||||
// Records the network input size and the initial source-image capacity, then
// allocates the device buffers through rellocMem().
//
//   max_src_w, max_src_h  initial capacity for source images, in pixels
//   input_w, input_h      size of the letterboxed network input, in pixels
void CudaInfer::init(int max_src_w, int max_src_h, int input_w, int input_h) {
    max_src_w_ = max_src_w;
    max_src_h_ = max_src_h;
    input_w_ = input_w;
    input_h_ = input_h;

    rellocMem();
}
|
||||
// (Re)allocates the three device buffers from the current member sizes:
//   d_input_bgr_          packed source image, max_src_w_ * max_src_h_ * 3 bytes
//   d_input_bgr_pitched_  pitched source image (row pitch in input_pitch_bytes_)
//   d_nchw_               network input, input_w_ * input_h_ * 3 floats
//
// Buffers from an earlier call are released first; without this every growth
// step triggered by getOutEnoughMem() leaked the previous allocations.
// The addresses of all three buffers may change across a call.
// Any allocation failure terminates the process through CUDA_CHECK.
void CudaInfer::rellocMem() {
    release();

    // Widen before multiplying so large sizes cannot overflow int.
    const size_t src_w = static_cast<size_t>(max_src_w_);
    const size_t src_h = static_cast<size_t>(max_src_h_);
    const size_t in_w = static_cast<size_t>(input_w_);
    const size_t in_h = static_cast<size_t>(input_h_);

    CUDA_CHECK(cudaMalloc(&d_input_bgr_, src_w * src_h * 3 * sizeof(unsigned char)));
    CUDA_CHECK(cudaMallocPitch(
        &d_input_bgr_pitched_,
        &input_pitch_bytes_,
        src_w * 3 * sizeof(unsigned char),
        src_h
    ));
    CUDA_CHECK(cudaMalloc(&d_nchw_, in_w * in_h * 3 * sizeof(float)));
    printf("Relloc memory for CudaInfer\n");
}
|
||||
// Grows the recorded source-image capacity when img_w x img_h exceeds it in
// either dimension, then reallocates the device buffers. Does nothing when
// the image already fits.
void CudaInfer::getOutEnoughMem(int img_w, int img_h) {
    const bool too_wide = img_w > max_src_w_;
    const bool too_tall = img_h > max_src_h_;

    if (!too_wide && !too_tall) {
        return;
    }

    max_src_w_ = too_wide ? img_w : max_src_w_;
    max_src_h_ = too_tall ? img_h : max_src_h_;
    rellocMem();
}
|
||||
|
||||
// Frees each device buffer that is currently allocated and resets its pointer
// to nullptr, so calling release() repeatedly is harmless.
// Return codes of cudaFree are not inspected.
void CudaInfer::release() {
    if (d_input_bgr_ != nullptr) {
        cudaFree(d_input_bgr_);
        d_input_bgr_ = nullptr;
    }

    if (d_input_bgr_pitched_ != nullptr) {
        cudaFree(d_input_bgr_pitched_);
        d_input_bgr_pitched_ = nullptr;
    }

    if (d_nchw_ != nullptr) {
        cudaFree(d_nchw_);
        d_nchw_ = nullptr;
    }
}
|
||||
|
||||
// Uploads a tightly packed BGR host image and letterboxes it into the
// network input buffer.
//
// Parameters:
//   input_bgr_host  host pointer to img_w * img_h * 3 bytes (no row padding)
//   img_w, img_h    source size in pixels; must be positive
//   norm            factor each output channel is multiplied by in the kernel
//   swap_rb         when true the output planes are written as R, G, B,
//                   otherwise as B, G, R
//   tf_matrix       [out] maps network-input coordinates back to source
//                   coordinates: x_src = (x_in - pad_l) / scale, same for y
//   stream          stream used for the upload and the kernel launch
//
// Returns the device pointer to the planar float network input, or nullptr
// when a pointer is null or the image size is not positive. Work is only
// enqueued on `stream`; this function does not synchronize.
// Throws std::runtime_error when init() has not been called successfully.
float* CudaInfer::preprocess(
    const unsigned char* input_bgr_host,
    int img_w,
    int img_h,
    float norm,
    bool swap_rb,
    Eigen::Matrix3f& tf_matrix,
    cudaStream_t stream
) {
    if (!isInitialized()) {
        throw std::runtime_error("CudaInfer not initialized properly.");
    }

    if (!input_bgr_host || !d_input_bgr_ || !d_nchw_) {
        fprintf(stderr, "[Error] Null pointer in preprocess input\n");
        return nullptr;
    }

    // A zero or negative size would divide by zero in the scale computation.
    if (img_w <= 0 || img_h <= 0) {
        fprintf(stderr, "[Error] Invalid image size %dx%d in preprocess\n", img_w, img_h);
        return nullptr;
    }

    // Grow the device source buffers if this image is larger than any seen so far.
    getOutEnoughMem(img_w, img_h);

    // Uniform scale that fits the source inside the network input, centred
    // with padding on the remaining sides.
    const float scale = fminf(input_w_ / (float)img_w, input_h_ / (float)img_h);
    const int rw = static_cast<int>(roundf(img_w * scale));
    const int rh = static_cast<int>(roundf(img_h * scale));
    const int pad_l = (input_w_ - rw) / 2;
    const int pad_t = (input_h_ - rh) / 2;

    tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1;

    // Widen before multiplying so large images cannot overflow int.
    const size_t img_size = static_cast<size_t>(img_w) * static_cast<size_t>(img_h) * 3;
    CUDA_CHECK(
        cudaMemcpyAsync(d_input_bgr_, input_bgr_host, img_size, cudaMemcpyHostToDevice, stream)
    );

    const dim3 threads(TILE_W, TILE_H);
    const dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H);

    letterbox_kernel_shared<<<blocks, threads, 0, stream>>>(
        d_input_bgr_,
        img_w,
        img_h,
        d_nchw_,
        input_w_,
        input_h_,
        scale,
        pad_t,
        pad_l,
        norm,
        swap_rb
    );

    CUDA_CHECK(cudaGetLastError());
    return d_nchw_;
}
|
||||
// Letterboxes a tightly packed BGR image that already lives in device memory
// into the network input buffer.
//
// Parameters:
//   input_bgr_device  device pointer to img_w * img_h * 3 bytes (no row padding)
//   img_w, img_h      source size in pixels; must be positive
//   norm              factor each output channel is multiplied by in the kernel
//   swap_rb           when true the output planes are written as R, G, B,
//                     otherwise as B, G, R
//   tf_matrix         [out] maps network-input coordinates back to source
//                     coordinates: x_src = (x_in - pad_l) / scale, same for y
//   stream            stream used for the kernel launch
//
// Returns the device pointer to the planar float network input, or nullptr
// when a pointer is null or the image size is not positive. Work is only
// enqueued on `stream`; this function does not synchronize.
// Throws std::runtime_error when init() has not been called successfully.
float* CudaInfer::preprocess_gpu(
    const unsigned char* input_bgr_device,
    int img_w,
    int img_h,
    float norm,
    bool swap_rb,
    Eigen::Matrix3f& tf_matrix,
    cudaStream_t stream
) {
    if (!isInitialized()) {
        throw std::runtime_error("CudaInfer not initialized properly.");
    }

    if (!input_bgr_device || !d_nchw_) {
        fprintf(stderr, "[Error] Null pointer in preprocess input\n");
        return nullptr;
    }

    // A zero or negative size would divide by zero in the scale computation.
    if (img_w <= 0 || img_h <= 0) {
        fprintf(stderr, "[Error] Invalid image size %dx%d in preprocess_gpu\n", img_w, img_h);
        return nullptr;
    }

    // Kept from the original flow: the internal buffers are grown even though
    // the source image is caller-owned here.
    getOutEnoughMem(img_w, img_h);

    // Uniform scale that fits the source inside the network input, centred
    // with padding on the remaining sides.
    const float scale = fminf(input_w_ / (float)img_w, input_h_ / (float)img_h);
    const int rw = static_cast<int>(roundf(img_w * scale));
    const int rh = static_cast<int>(roundf(img_h * scale));
    const int pad_l = (input_w_ - rw) / 2;
    const int pad_t = (input_h_ - rh) / 2;

    tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1;

    const dim3 threads(TILE_W, TILE_H);
    const dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H);

    letterbox_kernel_shared<<<blocks, threads, 0, stream>>>(
        input_bgr_device,
        img_w,
        img_h,
        d_nchw_,
        input_w_,
        input_h_,
        scale,
        pad_t,
        pad_l,
        norm,
        swap_rb
    );

    CUDA_CHECK(cudaGetLastError());
    return d_nchw_;
}
|
||||
// Uploads a BGR host image whose rows are `host_step` bytes apart into the
// pitched device buffer and letterboxes it into the network input buffer.
//
// Parameters:
//   input_bgr_host  host pointer to the first row of the image
//   img_w, img_h    source size in pixels; must be positive
//   host_step       distance between consecutive host rows in bytes; must be
//                   at least img_w * 3
//   norm            factor each output channel is multiplied by in the kernel
//   swap_rb         when true the output planes are written as R, G, B,
//                   otherwise as B, G, R
//   tf_matrix       [out] maps network-input coordinates back to source
//                   coordinates: x_src = (x_in - pad_l) / scale, same for y
//   stream          stream used for the upload and the kernel launch
//
// Returns the device pointer to the planar float network input, or nullptr
// when a pointer is null or the geometry is invalid. Work is only enqueued on
// `stream`; this function does not synchronize.
// Throws std::runtime_error when init() has not been called successfully.
float* CudaInfer::preprocess_pitched(
    const unsigned char* input_bgr_host,
    int img_w,
    int img_h,
    int host_step,
    float norm,
    bool swap_rb,
    Eigen::Matrix3f& tf_matrix,
    cudaStream_t stream
) {
    if (!isInitialized()) {
        throw std::runtime_error("CudaInfer not initialized properly.");
    }

    if (!input_bgr_host || !d_nchw_) {
        fprintf(stderr, "[Error] Null pointer in preprocess input\n");
        return nullptr;
    }

    // Reject sizes that would divide by zero below, and row steps too short
    // to hold one row of BGR pixels.
    if (img_w <= 0 || img_h <= 0 || host_step < img_w * 3) {
        fprintf(
            stderr,
            "[Error] Invalid image geometry %dx%d (step %d) in preprocess_pitched\n",
            img_w,
            img_h,
            host_step
        );
        return nullptr;
    }

    // Grow the device source buffers if this image is larger than any seen so far.
    getOutEnoughMem(img_w, img_h);

    // Uniform scale that fits the source inside the network input, centred
    // with padding on the remaining sides.
    const float scale = fminf((float)input_w_ / img_w, (float)input_h_ / img_h);
    const int rw = static_cast<int>(roundf(img_w * scale));
    const int rh = static_cast<int>(roundf(img_h * scale));
    const int pad_l = (input_w_ - rw) / 2;
    const int pad_t = (input_h_ - rh) / 2;

    tf_matrix << 1.f / scale, 0, -pad_l / scale, 0, 1.f / scale, -pad_t / scale, 0, 0, 1;

    // Row-by-row copy: host rows are host_step apart, device rows are
    // input_pitch_bytes_ apart; only img_w * 3 payload bytes per row move.
    CUDA_CHECK(cudaMemcpy2DAsync(
        d_input_bgr_pitched_,
        input_pitch_bytes_,
        input_bgr_host,
        static_cast<size_t>(host_step),
        static_cast<size_t>(img_w) * 3,
        static_cast<size_t>(img_h),
        cudaMemcpyHostToDevice,
        stream
    ));

    const dim3 threads(TILE_W, TILE_H);
    const dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H);

    letterbox_kernel_pitched<<<blocks, threads, 0, stream>>>(
        d_input_bgr_pitched_,
        input_pitch_bytes_,
        img_w,
        img_h,
        d_nchw_,
        input_w_,
        input_h_,
        scale,
        pad_t,
        pad_l,
        norm,
        swap_rb
    );

    CUDA_CHECK(cudaGetLastError());
    return d_nchw_;
}
|
||||
// Letterboxes a pitched BGR image that already lives in device memory into
// the network input buffer.
//
// Parameters:
//   input_bgr_device  device pointer to the first row of the image
//   img_w, img_h      source size in pixels; must be positive
//   input_step        distance between consecutive device rows in bytes; must
//                     be at least img_w * 3
//   norm              factor each output channel is multiplied by in the kernel
//   swap_rb           when true the output planes are written as R, G, B,
//                     otherwise as B, G, R
//   tf_matrix         [out] maps network-input coordinates back to source
//                     coordinates: x_src = (x_in - pad_l) / scale, same for y
//   stream            stream used for the kernel launch
//
// Returns the device pointer to the planar float network input, or nullptr
// when a pointer is null or the geometry is invalid. Work is only enqueued on
// `stream`; this function does not synchronize.
// Throws std::runtime_error when init() has not been called successfully.
float* CudaInfer::preprocess_pitched_gpu(
    const unsigned char* input_bgr_device,
    int img_w,
    int img_h,
    int input_step,
    float norm,
    bool swap_rb,
    Eigen::Matrix3f& tf_matrix,
    cudaStream_t stream
) {
    if (!isInitialized()) {
        throw std::runtime_error("CudaInfer not initialized properly.");
    }

    if (!input_bgr_device || !d_nchw_) {
        fprintf(stderr, "[Error] Null pointer in preprocess_pitched_gpu\n");
        return nullptr;
    }

    // Reject sizes that would divide by zero below, and row steps too short
    // to hold one row of BGR pixels (the kernel would read across rows).
    if (img_w <= 0 || img_h <= 0 || input_step < img_w * 3) {
        fprintf(
            stderr,
            "[Error] Invalid image geometry %dx%d (step %d) in preprocess_pitched_gpu\n",
            img_w,
            img_h,
            input_step
        );
        return nullptr;
    }

    // Kept from the original flow: the internal buffers are grown even though
    // the source image is caller-owned here.
    getOutEnoughMem(img_w, img_h);

    // Uniform scale that fits the source inside the network input, centred
    // with padding on the remaining sides.
    const float scale =
        fminf(static_cast<float>(input_w_) / img_w, static_cast<float>(input_h_) / img_h);

    const int rw = static_cast<int>(roundf(img_w * scale));
    const int rh = static_cast<int>(roundf(img_h * scale));

    const int pad_l = (input_w_ - rw) / 2;
    const int pad_t = (input_h_ - rh) / 2;

    tf_matrix << 1.f / scale, 0.f, -pad_l / scale, 0.f, 1.f / scale, -pad_t / scale, 0.f, 0.f, 1.f;

    const dim3 threads(TILE_W, TILE_H);
    const dim3 blocks((input_w_ + TILE_W - 1) / TILE_W, (input_h_ + TILE_H - 1) / TILE_H);

    letterbox_kernel_pitched<<<blocks, threads, 0, stream>>>(
        input_bgr_device,
        input_step,
        img_w,
        img_h,
        d_nchw_,
        input_w_,
        input_h_,
        scale,
        pad_t,
        pad_l,
        norm,
        swap_rb
    );

    CUDA_CHECK(cudaGetLastError());

    return d_nchw_;
}
|
||||
|
||||
} // namespace armor_cuda_infer
|
||||
78
wust_vision-main/cuda_infer/armor_infer.hpp
Normal file
78
wust_vision-main/cuda_infer/armor_infer.hpp
Normal file
@@ -0,0 +1,78 @@
|
||||
// armor_cuda_infer.hpp
|
||||
#pragma once
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
#include <opencv2/core/mat.hpp>
|
||||
#include <vector>
|
||||
|
||||
namespace armor_cuda_infer {
|
||||
|
||||
// Owns the device buffers used to turn a BGR source image into the planar
// float tensor fed to the network (letterbox resize + normalization).
// Non-copyable: the instance is the sole owner of its device allocations.
class CudaInfer {
public:
    CudaInfer();
    ~CudaInfer() noexcept;

    // Stores the sizes and allocates the device buffers.
    //   max_src_w, max_src_h  initial source-image capacity in pixels
    //   input_w, input_h      network input size in pixels
    void init(int max_src_w, int max_src_h, int input_w, int input_h);

    // Frees all device buffers and resets their pointers.
    void release();

    // True once all three device buffers have been allocated.
    bool isInitialized() const {
        return d_input_bgr_ && d_nchw_ && d_input_bgr_pitched_;
    }

    // Grows the source capacity and reallocates if img_w x img_h does not fit.
    void getOutEnoughMem(int img_w, int img_h);

    // Allocates the device buffers from the currently stored sizes.
    void rellocMem();

    // Letterboxes a tightly packed BGR host image; returns the device tensor.
    float* preprocess(
        const unsigned char* input_bgr_host,
        int img_w,
        int img_h,
        float norm,
        bool swap_rb,
        Eigen::Matrix3f& tf_matrix,
        cudaStream_t stream
    );

    // Same as preprocess() for a host image whose rows are host_step bytes apart.
    float* preprocess_pitched(
        const unsigned char* input_bgr_host,
        int img_w,
        int img_h,
        int host_step,
        float norm,
        bool swap_rb,
        Eigen::Matrix3f& tf_matrix,
        cudaStream_t stream
    );

    // Letterboxes a tightly packed BGR image already resident in device memory.
    float* preprocess_gpu(
        const unsigned char* input_bgr_device,
        int img_w,
        int img_h,
        float norm,
        bool swap_rb,
        Eigen::Matrix3f& tf_matrix,
        cudaStream_t stream
    );

    // Letterboxes a pitched BGR image already resident in device memory;
    // host_step is the byte distance between consecutive rows of that image.
    float* preprocess_pitched_gpu(
        const unsigned char* input_bgr_device,
        int img_w,
        int img_h,
        int host_step,
        float norm,
        bool swap_rb,
        Eigen::Matrix3f& tf_matrix,
        cudaStream_t stream
    );

    // Converts a planar float device tensor into a host CV_8UC4 image.
    cv::Mat tensorToMat(float* d_nchw, int W, int H, float norm, cudaStream_t stream) const;

private:
    CudaInfer(const CudaInfer&) = delete;
    CudaInfer& operator=(const CudaInfer&) = delete;

    unsigned char* d_input_bgr_ = nullptr;          // packed source image (device)
    float* d_nchw_ = nullptr;                       // planar network input (device)
    unsigned char* d_input_bgr_pitched_ = nullptr;  // pitched source image (device)
    size_t input_pitch_bytes_ = 0;                  // row pitch of d_input_bgr_pitched_

    // Zero-initialized so an instance that was never init()-ed holds defined
    // values instead of indeterminate ones.
    int input_w_ = 0;
    int input_h_ = 0;
    int max_src_w_ = 0, max_src_h_ = 0;
};
|
||||
} // namespace armor_cuda_infer
|
||||
147
wust_vision-main/cuda_infer/letter_box.cu
Normal file
147
wust_vision-main/cuda_infer/letter_box.cu
Normal file
@@ -0,0 +1,147 @@
|
||||
#include "letter_box.hpp"
|
||||
// Letterbox resize of a tightly packed BGR image into a planar float tensor,
// staging source samples in shared memory.
//
// Launch layout: 2D grid, blockDim must be (TILE_W, TILE_H); each thread
// produces one output pixel. Uses (TILE_H + 1) * (TILE_W + 1) uchar4 of
// static shared memory.
//
// Parameters:
//   input_bgr    source pixels, 3 bytes each in b, g, r order, row stride in_w * 3
//   in_w, in_h   source size in pixels
//   output_nchw  three consecutive out_w * out_h float planes
//   out_w, out_h output size in pixels
//   scale        source-to-output scale factor
//   pad_t, pad_l top/left padding in output pixels
//   norm         factor applied to every output value
//   swap_rb      true: planes written as R, G, B; false: B, G, R
//
// Samples that fall outside the source take the padding colour (114, 114, 114).
__global__ void letterbox_kernel_shared(
    const uchar* __restrict__ input_bgr,
    int in_w,
    int in_h,
    float* __restrict__ output_nchw,
    int out_w,
    int out_h,
    float scale,
    int pad_t,
    int pad_l,
    float norm,
    bool swap_rb
) {
    // Global output coordinates of this thread.
    const int x = blockIdx.x * TILE_W + threadIdx.x;
    const int y = blockIdx.y * TILE_H + threadIdx.y;

    // Shared tile with one extra row and column (halo).
    __shared__ uchar4 smem[TILE_H + 1][TILE_W + 1];

    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
    const int total_smem = (TILE_W + 1) * (TILE_H + 1);
    const int threads_per_block = blockDim.x * blockDim.y;
    const int iter = (total_smem + threads_per_block - 1) / threads_per_block;

    const float inv_scale = 1.0f / scale;
    const float block_start_x = blockIdx.x * TILE_W - pad_l;
    const float block_start_y = blockIdx.y * TILE_H - pad_t;

    // Cooperative tile load. Every thread of the block takes part, including
    // threads whose own output pixel lies outside the image: returning before
    // this loop would leave their share of the tile unwritten and would skip
    // the barrier below.
    for (int i = 0; i < iter; i++) {
        const int idx = tid + i * threads_per_block;
        if (idx < total_smem) {
            const int sx = idx % (TILE_W + 1);
            const int sy = idx / (TILE_W + 1);

            const float in_x = (block_start_x + sx) * inv_scale;
            const float in_y = (block_start_y + sy) * inv_scale;

            const int ix = floorf(in_x);
            const int iy = floorf(in_y);

            uchar4 p = make_uchar4(114, 114, 114, 0); // padding BGR
            if (ix >= 0 && iy >= 0 && ix < in_w && iy < in_h) {
                const int offset = (iy * in_w + ix) * 3;
                p.x = input_bgr[offset];     // b
                p.y = input_bgr[offset + 1]; // g
                p.z = input_bgr[offset + 2]; // r
            }

            smem[sy][sx] = p;
        }
    }
    __syncthreads();

    // Only now drop threads that have no output pixel to write.
    if (x >= out_w || y >= out_h)
        return;

    // Bilinear interpolation.
    // NOTE(review): the neighbours come from the tile entries of the adjacent
    // output pixels, which match source column/row floor(in)+1 only when
    // inv_scale is 1 — confirm this approximation is intended.
    const float in_x = (x - pad_l) * inv_scale;
    const float in_y = (y - pad_t) * inv_scale;
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const float dx = in_x - floorf(in_x);
    const float dy = in_y - floorf(in_y);
    const float dx1 = 1.0f - dx, dy1 = 1.0f - dy;

    const uchar4 p00 = smem[ty][tx];
    const uchar4 p01 = smem[ty][tx + 1];
    const uchar4 p10 = smem[ty + 1][tx];
    const uchar4 p11 = smem[ty + 1][tx + 1];

    const float out_r = dx1 * dy1 * p00.z + dx * dy1 * p01.z + dx1 * dy * p10.z + dx * dy * p11.z;
    const float out_g = dx1 * dy1 * p00.y + dx * dy1 * p01.y + dx1 * dy * p10.y + dx * dy * p11.y;
    const float out_b = dx1 * dy1 * p00.x + dx * dy1 * p01.x + dx1 * dy * p10.x + dx * dy * p11.x;

    const int out_idx = y * out_w + x;
    if (swap_rb) {
        output_nchw[out_idx + 0 * out_w * out_h] = out_r * norm;
        output_nchw[out_idx + 1 * out_w * out_h] = out_g * norm;
        output_nchw[out_idx + 2 * out_w * out_h] = out_b * norm;
    } else {
        output_nchw[out_idx + 0 * out_w * out_h] = out_b * norm;
        output_nchw[out_idx + 1 * out_w * out_h] = out_g * norm;
        output_nchw[out_idx + 2 * out_w * out_h] = out_r * norm;
    }
}
|
||||
|
||||
// Letterbox resize of a pitched BGR image into a planar float tensor using
// bilinear interpolation read straight from global memory.
//
// Launch layout: 2D grid, one thread per output pixel; threads outside
// OUT_W x OUT_H do nothing. No shared memory is used.
//
// Parameters:
//   d_input_bgr  source pixels, 3 bytes each in b, g, r order
//   pitch        byte distance between consecutive source rows
//   src_w, src_h source size in pixels
//   d_nchw       three consecutive OUT_W * OUT_H float planes
//   OUT_W, OUT_H output size in pixels
//   scale        source-to-output scale factor
//   pad_t, pad_l top/left padding in output pixels
//   norm         factor applied to every output value
//   swap_rb      true: planes written as R, G, B; false: B, G, R
//
// NOTE(review): source coordinates are clamped into the image, so output
// pixels in the padding area repeat edge pixels instead of receiving the
// constant 114 fill used by letterbox_kernel_shared — confirm this is intended.
__global__ void letterbox_kernel_pitched(
    const unsigned char* __restrict__ d_input_bgr,
    size_t pitch,
    int src_w,
    int src_h,
    float* __restrict__ d_nchw,
    int OUT_W,
    int OUT_H,
    float scale,
    int pad_t,
    int pad_l,
    float norm,
    bool swap_rb
) {
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= OUT_W || oy >= OUT_H)
        return;

    float fx = (ox - pad_l) / scale;
    float fy = (oy - pad_t) / scale;

    const int out_idx = oy * OUT_W + ox;
    const int plane = OUT_W * OUT_H;

    // Clamp coordinates so that x0 + 1 / y0 + 1 stay inside the image.
    fx = fmaxf(0.f, fminf(fx, src_w - 2.f));
    fy = fmaxf(0.f, fminf(fy, src_h - 2.f));

    const int x0 = floorf(fx), y0 = floorf(fy);
    // For a source that is a single pixel wide or high the clamp above yields
    // 0, so the "+ 1" neighbour must be clamped as well to avoid reading past
    // the row / image. For sizes >= 2 this changes nothing.
    const int x1 = min(x0 + 1, src_w - 1);
    const int y1 = min(y0 + 1, src_h - 1);

    const float dx = fx - x0, dy = fy - y0;
    const float dx1 = 1.f - dx, dy1 = 1.f - dy;

    // Row pointers: rows are `pitch` bytes apart.
    const uchar3* row0 = (const uchar3*)((const char*)d_input_bgr + y0 * pitch);
    const uchar3* row1 = (const uchar3*)((const char*)d_input_bgr + y1 * pitch);

    const uchar3 p00 = row0[x0];
    const uchar3 p01 = row0[x1];
    const uchar3 p10 = row1[x0];
    const uchar3 p11 = row1[x1];

    // Bilinear interpolation per channel (x = b, y = g, z = r).
    const float r = dx1 * dy1 * p00.z + dx * dy1 * p01.z + dx1 * dy * p10.z + dx * dy * p11.z;
    const float g = dx1 * dy1 * p00.y + dx * dy1 * p01.y + dx1 * dy * p10.y + dx * dy * p11.y;
    const float b = dx1 * dy1 * p00.x + dx * dy1 * p01.x + dx1 * dy * p10.x + dx * dy * p11.x;

    if (swap_rb) {
        d_nchw[out_idx + 0 * plane] = r * norm;
        d_nchw[out_idx + 1 * plane] = g * norm;
        d_nchw[out_idx + 2 * plane] = b * norm;
    } else {
        d_nchw[out_idx + 0 * plane] = b * norm;
        d_nchw[out_idx + 1 * plane] = g * norm;
        d_nchw[out_idx + 2 * plane] = r * norm;
    }
}
|
||||
42
wust_vision-main/cuda_infer/letter_box.hpp
Normal file
42
wust_vision-main/cuda_infer/letter_box.hpp
Normal file
@@ -0,0 +1,42 @@
|
||||
#pragma once
|
||||
#include <Eigen/Dense>
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
#include <opencv2/core/hal/interface.h>
|
||||
#include <opencv2/core/mat.hpp>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <vector>
|
||||
// Thread-block tile size, in output pixels, shared by the kernels and their
// launch sites.
static constexpr int TILE_W = 32;
static constexpr int TILE_H = 32;

// Letterbox resize of a tightly packed BGR image into a planar float tensor
// using a shared-memory tile. Launch with blockDim = (TILE_W, TILE_H) and a
// 2D grid covering out_w x out_h.
__global__ void letterbox_kernel_shared(
    const uchar* __restrict__ input_bgr, int in_w, int in_h,
    float* __restrict__ output_nchw, int out_w, int out_h,
    float scale, int pad_t, int pad_l,
    float norm, bool swap_rb
);
|
||||
// Letterbox resize of a pitched BGR image (rows `pitch` bytes apart) into a
// planar float tensor. Launch with a 2D grid covering OUT_W x OUT_H, one
// thread per output pixel.
__global__ void letterbox_kernel_pitched(
    const unsigned char* __restrict__ d_input_bgr, size_t pitch, int src_w, int src_h,
    float* __restrict__ d_nchw, int OUT_W, int OUT_H,
    float scale, int pad_t, int pad_l,
    float norm, bool swap_rb
);
|
||||
Reference in New Issue
Block a user