From 78298e56f172dac6a44ee0660d1a33887085f2c3 Mon Sep 17 00:00:00 2001 From: cyy_mac Date: Sat, 9 May 2026 17:03:40 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BD=91=E7=BB=9C=E6=B5=8B=E8=AF=95=E5=92=8C?= =?UTF-8?q?=E5=AD=A6=E4=B9=A0demo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- network_learning/01_alnet_demo.py | 260 ++++++++++ network_learning/02_ricnn_demo.py | 425 +++++++++++++++ network_learning/03_converter_demo.py | 230 +++++++++ network_learning/04_generator_fusion_demo.py | 304 +++++++++++ network_learning/05_netvlad_demo.py | 308 +++++++++++ network_learning/06_uot_demo.py | 356 +++++++++++++ network_learning/08_full_pipeline_demo.py | 516 +++++++++++++++++++ network_learning/LEARNING_GUIDE.md | 419 +++++++++++++++ network_learning/README.md | 50 ++ 9 files changed, 2868 insertions(+) create mode 100644 network_learning/01_alnet_demo.py create mode 100644 network_learning/02_ricnn_demo.py create mode 100644 network_learning/03_converter_demo.py create mode 100644 network_learning/04_generator_fusion_demo.py create mode 100644 network_learning/05_netvlad_demo.py create mode 100644 network_learning/06_uot_demo.py create mode 100644 network_learning/08_full_pipeline_demo.py create mode 100644 network_learning/LEARNING_GUIDE.md create mode 100644 network_learning/README.md diff --git a/network_learning/01_alnet_demo.py b/network_learning/01_alnet_demo.py new file mode 100644 index 0000000..5a283d3 --- /dev/null +++ b/network_learning/01_alnet_demo.py @@ -0,0 +1,260 @@ +""" +ALNet 网络结构可视化 Demo +=========================== +ALNet 是图像分支的特征提取网络,基于 ALIKE 架构。 +输入:图像 (B, 3, 192, 576) +输出:score_map (B, 1, 192, 576) + descriptor_map (B, 128, 192, 576) + +网络由以下部分组成: + block1: ConvBlock(3→16) - 保持分辨率 + pool2: MaxPool2d(2) - 下采样 2x + block2: ResBlock(16→32) - 残差块 + pool4: MaxPool2d(4) - 下采样 4x + block3: ResBlock(32→64) - 残差块 + pool4: MaxPool2d(4) - 下采样 4x + block4: ResBlock(64→128) - 残差块 + 特征聚合: 4层concat + 上采样 - 多尺度融合 + 输出头: 
Conv1x1(128→129) - score + descriptor +""" + +import torch +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') # 非交互后端,适合服务器 + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ALIKE.alnet import ALNet, ConvBlock, ResBlock + +# ============================================================ +# 配置 +# ============================================================ +OUTPUT_DIR = os.path.join(os.path.dirname(__file__), 'output') +os.makedirs(OUTPUT_DIR, exist_ok=True) + +# 使用 alike-n 配置(论文中使用) +CFG = {'c1': 16, 'c2': 32, 'c3': 64, 'c4': 128, 'dim': 128, 'single_head': True} + + +def visualize_tensor(tensor, title, save_name, cmap='viridis', n_channels=8): + """可视化特征图的多个通道""" + if tensor.dim() == 4: + tensor = tensor[0] # 取第一个batch + C, H, W = tensor.shape + n_show = min(n_channels, C) + + fig, axes = plt.subplots(2, 4, figsize=(16, 8)) + fig.suptitle(title, fontsize=14, fontweight='bold') + + for i in range(n_show): + ax = axes[i // 4, i % 4] + im = ax.imshow(tensor[i].detach().cpu().numpy(), cmap=cmap) + ax.set_title(f'Channel {i}') + ax.axis('off') + plt.colorbar(im, ax=ax, fraction=0.046) + + for i in range(n_show, 8): + axes[i // 4, i % 4].axis('off') + + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, save_name) + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_score_map(score_map, title, save_name): + """可视化得分图""" + if score_map.dim() == 4: + score_map = score_map[0, 0] + elif score_map.dim() == 3: + score_map = score_map[0] + + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + fig.suptitle(title, fontsize=14, fontweight='bold') + + im0 = axes[0].imshow(score_map.detach().cpu().numpy(), cmap='hot') + axes[0].set_title('Score Map (热力图)') + axes[0].axis('off') + plt.colorbar(im0, ax=axes[0]) + + # 直方图 + axes[1].hist(score_map.detach().cpu().numpy().flatten(), bins=50, color='steelblue', edgecolor='white') 
+ axes[1].set_title('Score 分布直方图') + axes[1].set_xlabel('Score Value') + axes[1].set_ylabel('Frequency') + + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, save_name) + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_intermediate_features(model, input_tensor): + """逐层提取并可视化中间特征图""" + print('\n' + '=' * 60) + print('ALNet 中间特征逐层可视化') + print('=' * 60) + + x = input_tensor + print(f'输入: {x.shape}') + + # Block 1: ConvBlock + x1 = model.block1(x) + print(f'block1 (ConvBlock 3→16): {x1.shape}') + visualize_tensor(x1, 'Block1: ConvBlock 输出 (16通道)', 'alnet_block1.png') + + # Pool2 + Block 2 + x2 = model.pool2(x1) + x2 = model.block2(x2) + print(f'pool2 + block2 (ResBlock 16→32): {x2.shape}') + visualize_tensor(x2, 'Block2: ResBlock 输出 (32通道) [1/2分辨率]', 'alnet_block2.png') + + # Pool4 + Block 3 + x3 = model.pool4(x2) + x3 = model.block3(x3) + print(f'pool4 + block3 (ResBlock 32→64): {x3.shape}') + visualize_tensor(x3, 'Block3: ResBlock 输出 (64通道) [1/8分辨率]', 'alnet_block3.png') + + # Pool4 + Block 4 + x4 = model.pool4(x3) + x4 = model.block4(x4) + print(f'pool4 + block4 (ResBlock 64→128): {x4.shape}') + visualize_tensor(x4, 'Block4: ResBlock 输出 (128通道) [1/32分辨率]', 'alnet_block4.png') + + # 特征聚合 + f1 = model.gate(model.conv1(x1)) # dim//4 通道 + f2 = model.gate(model.conv2(x2)) + f3 = model.gate(model.conv3(x3)) + f4 = model.gate(model.conv4(x4)) + + f2_up = model.upsample2(f2) + f3_up = model.upsample8(f3) + f4_up = model.upsample32(f4) + + print(f'特征聚合: f1={f1.shape}, f2_up={f2_up.shape}, f3_up={f3_up.shape}, f4_up={f4_up.shape}') + + fused = torch.cat([f1, f2_up, f3_up, f4_up], dim=1) + print(f'多尺度拼接后: {fused.shape}') + visualize_tensor(fused, '多尺度特征拼接 (128通道)', 'alnet_fused_features.png', n_channels=8) + + # 输出头 + output = model.convhead2(fused) + score_map = torch.sigmoid(output[:, -1:, :, :]) + descriptor_map = output[:, :-1, :, :] + + print(f'Score Map: {score_map.shape}') + print(f'Descriptor Map: 
{descriptor_map.shape}') + + visualize_score_map(score_map, 'ALNet 最终输出 Score Map', 'alnet_final_score.png') + visualize_tensor(descriptor_map, 'ALNet 最终输出 Descriptor Map (128通道)', 'alnet_final_descriptor.png') + + +def visualize_receptive_field(): + """可视化有效感受野(通过梯度反传)""" + print('\n--- 感受野分析 ---') + model = ALNet(**CFG) + model.eval() + + input_tensor = torch.randn(1, 3, 192, 576, requires_grad=True) + score_map, _ = model(input_tensor) + + # 对score_map中心点的梯度反传 + h, w = score_map.shape[2], score_map.shape[3] + score_map[0, 0, h // 2, w // 2].backward() + + grad = input_tensor.grad.abs().sum(dim=1)[0] + fig, ax = plt.subplots(figsize=(12, 4)) + im = ax.imshow(grad.detach().cpu().numpy(), cmap='hot') + ax.set_title('ALNet 有效感受野 (梯度幅度)', fontsize=14) + ax.axis('off') + plt.colorbar(im, ax=ax) + path = os.path.join(OUTPUT_DIR, 'alnet_receptive_field.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def analyze_parameters(): + """分析网络参数量""" + print('\n--- 参数量分析 ---') + model = ALNet(**CFG) + total = sum(p.numel() for p in model.parameters()) + trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print(f'总参数量: {total:,} ({total / 1e6:.2f}M)') + print(f'可训练参数: {trainable:,} ({trainable / 1e6:.2f}M)') + + # 逐模块分析 + for name, module in model.named_children(): + params = sum(p.numel() for p in module.parameters()) + print(f' {name:20s}: {params:>10,} params ({params / 1e3:.1f}K)') + + +def main(): + print('=' * 60) + print('ALNet (图像特征提取网络) 结构与特征可视化') + print('=' * 60) + + analyze_parameters() + + # 构建模型 + model = ALNet(**CFG) + model.eval() + + # 模拟输入: 裁剪后的KITTI图像 (192, 576) + input_tensor = torch.randn(1, 3, 192, 576) + + # 前向传播 + with torch.no_grad(): + score_map, descriptor_map = model(input_tensor) + + print(f'\n输入尺寸: {input_tensor.shape}') + print(f'Score Map 输出: {score_map.shape} (范围: [{score_map.min():.3f}, {score_map.max():.3f}])') + print(f'Descriptor Map 输出: {descriptor_map.shape}') + + # 
逐层可视化中间特征 + visualize_intermediate_features(model, input_tensor) + + # 感受野分析 + visualize_receptive_field() + + # 网络结构文本总结 + print('\n' + '=' * 60) + print('网络结构总结:') + print('=' * 60) + print(""" + ALNet (alike-n config): + ┌──────────────────────────────────────────────────────┐ + │ 输入: (B, 3, 192, 576) │ + │ ↓ │ + │ block1: ConvBlock(3→16) → (B, 16, 192, 576) │ + │ ↓ MaxPool2d(2) │ + │ block2: ResBlock(16→32) → (B, 32, 96, 288) │ + │ ↓ MaxPool2d(4) │ + │ block3: ResBlock(32→64) → (B, 64, 24, 72) │ + │ ↓ MaxPool2d(4) │ + │ block4: ResBlock(64→128) → (B, 128, 6, 18) │ + │ ↓ │ + │ 特征聚合: 4尺度1×1conv + 上采样 + concat → (B,128,192,576) │ + │ ↓ Conv1x1(128→129) │ + │ 输出: score(B,1,192,576) + desc(B,128,192,576) │ + └──────────────────────────────────────────────────────┘ + + block1/2/3/4 各阶段的作用: + - block1: 浅层特征(边缘、角点等低级特征) + - block2: 中层特征(纹理、局部形状) + - block3: 高层特征(语义信息、物体部件) + - block4: 最抽象特征(全局上下文) + - 多尺度融合: 结合各层信息,兼顾定位精度和语义鲁棒性 + """) + + print(f'\n所有可视化结果保存在: {OUTPUT_DIR}') + + +if __name__ == '__main__': + main() diff --git a/network_learning/02_ricnn_demo.py b/network_learning/02_ricnn_demo.py new file mode 100644 index 0000000..c5612f5 --- /dev/null +++ b/network_learning/02_ricnn_demo.py @@ -0,0 +1,425 @@ +""" +RICNN 旋转不变CNN网络结构可视化 Demo +======================================= +RICNN 是点云BEV分支的特征提取网络,核心创新是"旋转不变性"。 +与标准CNN不同,RICNN的卷积核根据像素到中心的欧氏距离分组, +使得旋转后的特征保持一致。 + +输入:BEV图像 (B, 3, 320, 320) +输出:score_map (B, 1, 320, 320) + descriptor_map (B, 128, 320, 320) + +关键组件: + RIConv2d: 旋转不变卷积(按距离分组共享权重) + RIMaxpool2d: 旋转不变最大池化(只对圆形邻域取max) + RIAvgpool2d: 旋转不变平均池化(只对圆形邻域取avg) + RIResBlock: 旋转不变残差块 +""" + +import torch +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from BEVNet import RICNN, RIConv2d, RIMaxpool2d, RIAvgpool2d, EncodePosition + +OUTPUT_DIR = os.path.join(os.path.dirname(__file__), 'output') 
+os.makedirs(OUTPUT_DIR, exist_ok=True) + + +def visualize_tensor(tensor, title, save_name, cmap='viridis', n_channels=8): + """可视化特征图""" + if tensor.dim() == 4: + tensor = tensor[0] + C, H, W = tensor.shape + n_show = min(n_channels, C) + + fig, axes = plt.subplots(2, 4, figsize=(16, 8)) + fig.suptitle(title, fontsize=14, fontweight='bold') + for i in range(n_show): + ax = axes[i // 4, i % 4] + im = ax.imshow(tensor[i].detach().cpu().numpy(), cmap=cmap) + ax.set_title(f'Channel {i}') + ax.axis('off') + plt.colorbar(im, ax=ax, fraction=0.046) + for i in range(n_show, 8): + axes[i // 4, i % 4].axis('off') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, save_name) + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_score_map(score_map, title, save_name): + """可视化得分图""" + if score_map.dim() == 4: + score_map = score_map[0, 0] + elif score_map.dim() == 3: + score_map = score_map[0] + + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + fig.suptitle(title, fontsize=14, fontweight='bold') + im0 = axes[0].imshow(score_map.detach().cpu().numpy(), cmap='hot') + axes[0].set_title('Score Map') + axes[0].axis('off') + plt.colorbar(im0, ax=axes[0]) + axes[1].hist(score_map.detach().cpu().numpy().flatten(), bins=50, color='steelblue', edgecolor='white') + axes[1].set_title('Score 分布') + axes[1].set_xlabel('Score') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, save_name) + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_ri_conv_kernel(): + """可视化旋转不变卷积核的权重分组模式""" + print('\n--- RIConv2d 卷积核分组可视化 ---') + + fig, axes = plt.subplots(1, 3, figsize=(16, 5)) + + for idx, kz in enumerate([3, 5, 7]): + # 计算距离掩码 + coords = torch.arange(kz ** 2).view(-1, 1) + row = torch.div(coords, kz, rounding_mode='floor') + col = torch.fmod(coords, kz) + coords = torch.cat([row, col], dim=1) + dis = (coords - 0.5 * (kz - 1)).norm(dim=1) + 0.5 * (kz % 2 - 1) + dis = 
dis.view(kz, kz) + dis = torch.round(dis).long() + dis[dis > 0.5 * (kz - 1)] = -1 + + ax = axes[idx] + im = ax.imshow(dis.numpy(), cmap='tab10') + ax.set_title(f'Kernel {kz}x{kz}\nDistance Groups: {dis.max().item() + 1}') + # 标注每个位置的距离值 + for i in range(kz): + for j in range(kz): + val = dis[i, j].item() + color = 'white' if val >= 0 else 'red' + ax.text(j, i, str(val), ha='center', va='center', fontsize=8, color=color) + ax.axis('off') + + plt.suptitle('RIConv2d: 按到中心距离分组的卷积核权重', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'ricnn_kernel_groups.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_ri_pooling(): + """可视化旋转不变池化的有效区域""" + print('\n--- 旋转不变池化区域可视化 ---') + + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + + # RIMaxpool2d 有效区域 (kernel_size=5) + kz = 5 + coords = torch.arange(kz ** 2).view(-1, 1) + row = torch.div(coords, kz, rounding_mode='floor') + col = torch.fmod(coords, kz) + coords = torch.cat([row, col], dim=1) + dis = (coords - 0.5 * (kz - 1)).norm(dim=1) + 0.5 * (kz % 2 - 1) + dis = dis.view(kz, kz) + dis = torch.round(dis) + dis[dis > 0.5 * (kz - 1)] = -1 + mask_ri = (dis > -1).numpy().astype(float) + + # 标准 MaxPool2d 有效区域(正方形) + mask_std = np.ones((kz, kz)) + + ax = axes[0, 0] + ax.imshow(mask_std, cmap='Blues') + ax.set_title(f'标准 MaxPool {kz}x{kz}\n有效区域: {mask_std.sum():.0f} 个像素', fontsize=12) + for i in range(kz): + for j in range(kz): + ax.text(j, i, '✓', ha='center', va='center', fontsize=10) + ax.axis('off') + + ax = axes[0, 1] + ax.imshow(mask_ri, cmap='Oranges') + ax.set_title(f'RI MaxPool {kz}x{kz}\n有效区域: {mask_ri.sum():.0f} 个像素 (圆形)', fontsize=12) + for i in range(kz): + for j in range(kz): + text = '✓' if mask_ri[i, j] else '✗' + color = 'white' if mask_ri[i, j] else 'red' + ax.text(j, i, text, ha='center', va='center', fontsize=10, color=color) + ax.axis('off') + + # 可视化旋转不变性:对比旋转前后的特征 + ax = axes[1, 0] + ax.set_title('旋转不变性原理', 
fontsize=12) + ax.text(0.5, 0.7, '标准CNN:', transform=ax.transAxes, fontsize=11, ha='center', + bbox=dict(boxstyle='round', facecolor='lightblue')) + ax.text(0.5, 0.5, '旋转图像 → 特征也旋转 → 不匹配', transform=ax.transAxes, fontsize=10, ha='center') + ax.text(0.5, 0.3, 'RICNN:', transform=ax.transAxes, fontsize=11, ha='center', + bbox=dict(boxstyle='round', facecolor='lightgreen')) + ax.text(0.5, 0.1, '旋转图像 → 特征不变 → 可以匹配', transform=ax.transAxes, fontsize=10, ha='center') + ax.axis('off') + + ax = axes[1, 1] + ax.set_title('RI vs 标准 CNN 对比', fontsize=12) + categories = ['旋转鲁棒性', '计算效率', '平移不变性', '尺度不变性'] + ri_scores = [0.9, 0.7, 0.8, 0.5] + std_scores = [0.3, 1.0, 0.8, 0.5] + x = np.arange(len(categories)) + width = 0.35 + ax.bar(x - width / 2, ri_scores, width, label='RICNN', color='orange', alpha=0.8) + ax.bar(x + width / 2, std_scores, width, label='标准CNN', color='blue', alpha=0.8) + ax.set_xticks(x) + ax.set_xticklabels(categories, fontsize=9) + ax.set_ylim(0, 1.2) + ax.legend() + ax.set_ylabel('能力评分') + + plt.suptitle('RICNN 旋转不变池化详解', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'ricnn_pooling_visualization.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def test_rotation_invariance(): + """测试旋转不变性:对比旋转前后特征差异""" + print('\n--- 旋转不变性测试 ---') + + model = RICNN() + model.eval() + + # 创建测试BEV图像(带明显特征) + bev = torch.zeros(1, 3, 320, 320) + # 添加一些矩形特征 + bev[0, 0, 100:120, 150:170] = 1.0 + bev[0, 1, 140:160, 100:140] = 0.8 + bev[0, 2, 150:170, 160:200] = 0.6 + + with torch.no_grad(): + score_orig, desc_orig = model(bev) + + # 旋转90度 + bev_rot90 = torch.rot90(bev, k=1, dims=[2, 3]) + score_rot90, desc_rot90 = model(bev_rot90) + # 旋转回去比较 + desc_rot90_back = torch.rot90(desc_rot90, k=-1, dims=[2, 3]) + + # 旋转180度 + bev_rot180 = torch.rot90(bev, k=2, dims=[2, 3]) + score_rot180, desc_rot180 = model(bev_rot180) + desc_rot180_back = torch.rot90(desc_rot180, k=-2, dims=[2, 3]) + + # 计算相似度 + 
cos_sim_90 = torch.nn.functional.cosine_similarity( + desc_orig.flatten(), desc_rot90_back.flatten(), dim=0) + cos_sim_180 = torch.nn.functional.cosine_similarity( + desc_orig.flatten(), desc_rot180_back.flatten(), dim=0) + + print(f'原始 vs 旋转90°后特征 余弦相似度: {cos_sim_90.item():.4f}') + print(f'原始 vs 旋转180°后特征 余弦相似度: {cos_sim_180.item():.4f}') + print(f'(越接近1.0说明旋转不变性越好)') + + # 可视化 + fig, axes = plt.subplots(2, 4, figsize=(18, 8)) + + axes[0, 0].imshow(bev[0].permute(1, 2, 0).numpy()) + axes[0, 0].set_title('原始BEV') + axes[0, 1].imshow(bev_rot90[0].permute(1, 2, 0).numpy()) + axes[0, 1].set_title('旋转90°') + axes[0, 2].imshow(score_orig[0, 0].numpy(), cmap='hot') + axes[0, 2].set_title('原始Score') + axes[0, 3].imshow(score_rot90[0, 0].numpy(), cmap='hot') + axes[0, 3].set_title('旋转90° Score') + + axes[1, 0].imshow(desc_orig[0, 0].numpy(), cmap='viridis') + axes[1, 0].set_title(f'原始Desc ch0') + axes[1, 1].imshow(desc_rot90_back[0, 0].numpy(), cmap='viridis') + axes[1, 1].set_title(f'旋回后Desc ch0\n相似度:{cos_sim_90.item():.3f}') + axes[1, 2].imshow((desc_orig[0, 0] - desc_rot90_back[0, 0]).abs().numpy(), cmap='Reds') + axes[1, 2].set_title('差异热图 ch0') + axes[1, 3].axis('off') + + for ax in axes.flatten(): + if ax.collections or ax.images: + continue + ax.axis('off') + + plt.suptitle('RICNN 旋转不变性测试', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'ricnn_rotation_invariance.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + return cos_sim_90.item(), cos_sim_180.item() + + +def visualize_ricnn_intermediate(): + """可视化RICNN中间层特征""" + print('\n--- RICNN 中间特征可视化 ---') + + model = RICNN() + model.eval() + + # 使用更有结构的输入 + x = torch.linspace(-1, 1, 320) + y = torch.linspace(-1, 1, 320) + grid_y, grid_x = torch.meshgrid(y, x, indexing='ij') + r = torch.sqrt(grid_x ** 2 + grid_y ** 2) + + bev = torch.zeros(1, 3, 320, 320) + bev[0, 0] = (torch.sin(grid_x * 10) * torch.cos(grid_y * 10) + 1) / 2 + 
bev[0, 1] = (torch.cos(r * 5) + 1) / 2 + bev[0, 2] = (r < 0.5).float() + + # 逐层前向 + with torch.no_grad(): + x1 = model.block1(bev) + x2 = model.pool2(x1) + x2 = model.block2(x2) + x3 = model.pool4(x2) + x3 = model.block3(x3) + x4 = model.pool4(x3) + x4 = model.block4(x4) + + print(f'输入BEV: {bev.shape}') + print(f'block1 (RIConvBlock 3→16): {x1.shape}') + print(f'pool2+block2 (RIResBlock 16→32): {x2.shape}') + print(f'pool4+block3 (RIResBlock 32→64): {x3.shape}') + print(f'pool4+block4 (RIResBlock 64→128): {x4.shape}') + + visualize_tensor(x1, 'RICNN Block1 输出 (16通道)', 'ricnn_block1.png') + visualize_tensor(x2, 'RICNN Block2 输出 (32通道)', 'ricnn_block2.png') + visualize_tensor(x3, 'RICNN Block3 输出 (64通道)', 'ricnn_block3.png') + visualize_tensor(x4, 'RICNN Block4 输出 (128通道)', 'ricnn_block4.png') + + +def visualize_position_encoding(): + """可视化位置编码模块""" + print('\n--- EncodePosition 位置编码可视化 ---') + + ep = EncodePosition(feature_size=128) + ep.eval() + + # 模拟150个BEV关键点 (B, 150, 4) — [x,y,z,intensity] + kpts = torch.randn(2, 150, 4) + kpts[:, :, :2] = kpts[:, :, :2] * 30 # x,y 在 ±30m 范围 + kpts[:, :, 2] = 0 # z=0 (BEV平面) + kpts[:, :, 3] = 1 # intensity=1 + + # 模拟特征 (B, 128, 150) + fea = torch.randn(2, 128, 150) + + with torch.no_grad(): + fea_encoded = ep(kpts, fea) + + print(f'关键点输入: {kpts.shape}') + print(f'原始特征: {fea.shape}') + print(f'位置编码后特征: {fea_encoded.shape}') + + # 可视化距离直方图 + x1 = kpts[0].unsqueeze(1) # (150, 1, 4) + x2 = kpts[0].unsqueeze(0) # (1, 150, 4) + dx = x1 - x2 + distance = dx.norm(p=2, dim=2) # (150, 150) + + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + im0 = axes[0].imshow(distance.numpy(), cmap='plasma') + axes[0].set_title('关键点间距离矩阵 (150x150)') + axes[0].set_xlabel('Keypoint j') + axes[0].set_ylabel('Keypoint i') + plt.colorbar(im0, ax=axes[0]) + + # 示例直方图 (第一个关键点) + hist = torch.histc(distance[0], bins=16, min=1, max=80) + axes[1].bar(range(16), hist.numpy(), color='steelblue') + axes[1].set_title('距离直方图 (16 bins, 1-80m)\n用于位置编码') + 
axes[1].set_xlabel('Distance Bin') + axes[1].set_ylabel('Count') + + plt.suptitle('EncodePosition 位置编码模块', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'ricnn_position_encoding.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def analyze_parameters(): + """参数量分析""" + print('\n--- 参数量分析 ---') + model = RICNN() + total = sum(p.numel() for p in model.parameters()) + print(f'总参数量: {total:,} ({total / 1e6:.2f}M)') + for name, module in model.named_children(): + params = sum(p.numel() for p in module.parameters()) + print(f' {name:20s}: {params:>10,} params ({params / 1e3:.1f}K)') + + +def main(): + print('=' * 60) + print('RICNN (旋转不变CNN) 网络结构与特征可视化') + print('=' * 60) + + analyze_parameters() + + # 1. 卷积核分组可视化 + visualize_ri_conv_kernel() + + # 2. 池化区域可视化 + visualize_ri_pooling() + + # 3. 中间特征可视化 + visualize_ricnn_intermediate() + + # 4. 旋转不变性测试 + test_rotation_invariance() + + # 5. 位置编码可视化 + visualize_position_encoding() + + print('\n' + '=' * 60) + print('网络结构总结:') + print('=' * 60) + print(""" + RICNN (Rotation-Invariant CNN): + ┌──────────────────────────────────────────────────────┐ + │ 输入: BEV图像 (B, 3, 320, 320) │ + │ ↓ │ + │ block1: RIConvBlock(3→16) → (B, 16, 320, 320) │ + │ ↓ RIMaxpool2d(2) │ + │ block2: RIResBlock(16→32) → (B, 32, 160, 160) │ + │ ↓ RIMaxpool2d(5, s=4) │ + │ block3: RIResBlock(32→64) → (B, 64, 40, 40) │ + │ ↓ RIMaxpool2d(5, s=4) │ + │ block4: RIResBlock(64→128) → (B, 128, 10, 10) │ + │ ↓ │ + │ 多尺度特征聚合 (1x1conv + 上采样 + concat) │ + │ → (B, 128, 320, 320) │ + │ ↓ Conv1x1(128→129) │ + │ 输出: score(B,1,320,320) + desc(B,128,320,320) │ + └──────────────────────────────────────────────────────┘ + + 旋转不变性的实现: + - RIConv2d: 根据kernel位置到中心的欧氏距离分组 + 同距离的位置共享权重 → 旋转后权重不变 + - RIMaxpool2d: 只在圆形邻域内取max(忽略角点) + - RIAvgpool2d: 只在圆形邻域内取mean + + EncodePosition (位置编码): + - 输入: 150个关键点的3D坐标 + - 计算150×150距离矩阵 → 直方图(16 bins) → MLP + - 残差加到特征上,增强空间感知能力 + """) + + 
print(f'\n所有可视化结果保存在: {OUTPUT_DIR}') + + +if __name__ == '__main__': + main() diff --git a/network_learning/03_converter_demo.py b/network_learning/03_converter_demo.py new file mode 100644 index 0000000..b51ccec --- /dev/null +++ b/network_learning/03_converter_demo.py @@ -0,0 +1,230 @@ +""" +Converter 跨模态特征转换器 Demo +================================ +Converter 是跨模态融合的核心组件,负责在不同模态之间转换特征: + - cvt_bev: 图像特征 → BEV空间特征 + - cvt_img: BEV特征 → 图像空间特征 + +结构: + Self-Attention (MHA) + Conv1d瓶颈残差块 + 输入: (B, 128, N) N个特征点 + 输出: (B, 128, N) 转换后的特征 + +作用: 使两个模态的特征在同一个空间中对齐,便于后续匹配和融合 +""" + +import torch +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from net import Converter + +OUTPUT_DIR = os.path.join(os.path.dirname(__file__), 'output') +os.makedirs(OUTPUT_DIR, exist_ok=True) + + +def visualize_feature_similarity(fea_before, fea_after, title, save_name): + """可视化特征转换前后的相似度矩阵""" + fig, axes = plt.subplots(2, 3, figsize=(18, 10)) + + # 转换前特征相似度 + fea_before_norm = fea_before / (fea_before.norm(dim=1, keepdim=True) + 1e-8) + sim_before = (fea_before_norm[0].T @ fea_before_norm[0]).detach().numpy() + + im0 = axes[0, 0].imshow(sim_before, cmap='RdYlBu_r', vmin=-1, vmax=1) + axes[0, 0].set_title('转换前 特征相似度矩阵') + axes[0, 0].set_xlabel('Point j'); axes[0, 0].set_ylabel('Point i') + plt.colorbar(im0, ax=axes[0, 0]) + + # 转换后特征相似度 + fea_after_norm = fea_after / (fea_after.norm(dim=1, keepdim=True) + 1e-8) + sim_after = (fea_after_norm[0].T @ fea_after_norm[0]).detach().numpy() + + im1 = axes[0, 1].imshow(sim_after, cmap='RdYlBu_r', vmin=-1, vmax=1) + axes[0, 1].set_title('转换后 特征相似度矩阵') + axes[0, 1].set_xlabel('Point j'); axes[0, 1].set_ylabel('Point i') + plt.colorbar(im1, ax=axes[0, 1]) + + # 差异 + im2 = axes[0, 2].imshow(np.abs(sim_after - sim_before), cmap='YlOrRd') + axes[0, 2].set_title('相似度变化 |差值|') + axes[0, 
2].set_xlabel('Point j'); axes[0, 2].set_ylabel('Point i') + plt.colorbar(im2, ax=axes[0, 2]) + + # 特征值分布 before + vals_before = fea_before[0].detach().numpy().flatten() + axes[1, 0].hist(vals_before, bins=50, color='steelblue', edgecolor='white', alpha=0.7) + axes[1, 0].set_title('转换前 特征值分布') + axes[1, 0].set_xlabel('Feature Value') + + # 特征值分布 after + vals_after = fea_after[0].detach().numpy().flatten() + axes[1, 1].hist(vals_after, bins=50, color='coral', edgecolor='white', alpha=0.7) + axes[1, 1].set_title('转换后 特征值分布') + axes[1, 1].set_xlabel('Feature Value') + + # 重叠对比 + axes[1, 2].hist(vals_before, bins=50, color='steelblue', edgecolor='white', + alpha=0.5, label='Before') + axes[1, 2].hist(vals_after, bins=50, color='coral', edgecolor='white', + alpha=0.5, label='After') + axes[1, 2].set_title('分布对比') + axes[1, 2].legend() + + plt.suptitle(title, fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, save_name) + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_attention(converter, fea_input): + """提取并可视化Self-Attention权重""" + b, c, n = fea_input.shape + x1 = fea_input.permute(0, 2, 1) # B, N, C + + # 手动计算attention权重 + with torch.no_grad(): + q = converter.mha.w_q(x1) + k = converter.mha.w_k(x1) + weights = torch.nn.functional.softmax( + torch.matmul(q, k.transpose(-2, -1)) / (converter.mha.d_model ** 0.5), + dim=-1 + ) + + # 可视化前几个点的attention + n_show = 6 + n = min(n, weights.shape[1]) + + fig, axes = plt.subplots(2, 3, figsize=(16, 10)) + for idx in range(min(n_show, n)): + ax = axes[idx // 3, idx % 3] + ax.bar(range(min(n, 50)), weights[0, idx, :min(n, 50)].detach().numpy(), + color='steelblue', width=1.0) + ax.set_title(f'Query Point {idx} 的 Attention') + ax.set_xlabel('Key Point') + ax.set_ylabel('Weight') + ax.axhline(y=1.0 / n, color='red', linestyle='--', alpha=0.5, label=f'平均={1/n:.3f}') + ax.legend(fontsize=8) + + for idx in range(n_show, 6): + axes[idx // 3, 
idx % 3].axis('off') + + plt.suptitle('Converter Self-Attention 权重分析', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'converter_attention.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def test_cross_modal_convert(): + """测试跨模态转换:模拟图像特征→BEV特征转换""" + print('\n--- 跨模态转换测试 ---') + + converter_bev = Converter(in_c=128) + converter_img = Converter(in_c=128) + + # 模拟两个模态的特征 + # 图像空间特征 (从图像特征图采样的N个点) + torch.manual_seed(42) + fea_img_space = torch.randn(2, 128, 100) # B=2, C=128, N=100 + + # BEV空间特征 + fea_bev_space = torch.randn(2, 128, 100) + + with torch.no_grad(): + # 图像→BEV: 将图像空间特征转换到BEV空间 + fea_to_bev = converter_bev(fea_img_space) + + # BEV→图像: 将BEV空间特征转换到图像空间 + fea_to_img = converter_img(fea_bev_space) + + print(f'图像空间特征输入: {fea_img_space.shape}') + print(f'→ cvt_bev 转换后: {fea_to_bev.shape}') + print(f'BEV空间特征输入: {fea_bev_space.shape}') + print(f'→ cvt_img 转换后: {fea_to_img.shape}') + + # 可视化转换前后 + visualize_feature_similarity( + fea_img_space, fea_to_bev, + 'cvt_bev: 图像特征 → BEV空间', + 'converter_img_to_bev.png' + ) + + visualize_feature_similarity( + fea_bev_space, fea_to_img, + 'cvt_img: BEV特征 → 图像空间', + 'converter_bev_to_img.png' + ) + + # 可视化attention + visualize_attention(converter_bev, fea_img_space) + + +def analyze_architecture(): + """分析Converter结构""" + print('\n--- Converter 架构分析 ---') + + converter = Converter(in_c=128) + total = sum(p.numel() for p in converter.parameters()) + print(f'总参数量: {total:,} ({total / 1e3:.1f}K)') + + for name, module in converter.named_children(): + params = sum(p.numel() for p in module.parameters()) + print(f' {name:15s}: {params:>10,} params') + + # 详细结构 + print(""" + Converter 内部结构: + + ┌──────────────────────────────────────────┐ + │ 输入 x: (B, 128, N) │ + │ │ │ + │ ┌─────┴─────┐ │ + │ │ 路径1: MHA │ 路径2: Conv1d瓶颈块 │ + │ │ Self-Attn │ Conv1d(128→32→128) │ + │ │ x → x2 │ x → x3 │ + │ └─────┬─────┘ │ + │ │ │ + │ concat([x2, x3]) → 
Conv1d(256→128) │ + │ │ │ + │ 输出: (B, 128, N) │ + └──────────────────────────────────────────┘ + + MHA (多头自注意力): + - d_model=128, num_heads=4 + - Q,K,V → 点积attention → FFN + - 捕捉特征点之间的全局关系 + + Conv1d瓶颈块: + - 128→32→16→32→128→128 (bottleneck) + - 逐点卷积,提取通道间的非线性关系 + + 两条路径互补: + - MHA: 全局上下文建模 + - Conv1d: 局部特征变换 + - 残差连接 + concat融合 + """) + + +def main(): + print('=' * 60) + print('Converter (跨模态特征转换器) 结构与功能可视化') + print('=' * 60) + + analyze_architecture() + test_cross_modal_convert() + + print(f'\n所有可视化结果保存在: {OUTPUT_DIR}') + + +if __name__ == '__main__': + main() diff --git a/network_learning/04_generator_fusion_demo.py b/network_learning/04_generator_fusion_demo.py new file mode 100644 index 0000000..722e655 --- /dev/null +++ b/network_learning/04_generator_fusion_demo.py @@ -0,0 +1,304 @@ +""" +Generator & FusionHead 全景生成器与融合头 Demo +============================================== +Generator: 从变长图像特征生成固定数量的全景特征 + Self-Attention → ConvTranspose1d(k3,s3) → AdaptiveMaxPool1d(150) + 输入: (B, 128, N) N可变 + 输出: (B, 128, 150) 固定150个 + +FusionHead: 融合多来源特征 + 对 [original, gen, gen_gen, kpl_gen] 四个特征 + → pair-wise Self-Attention → max聚合 → Cross-Attention → 输出 + 输入: (B, 128, 150, 4) + 输出: (B, 128, 150) 融合后特征 +""" + +import torch +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from net import Generator, FusionHead, Attention + +OUTPUT_DIR = os.path.join(os.path.dirname(__file__), 'output') +os.makedirs(OUTPUT_DIR, exist_ok=True) + + +def test_generator(): + """测试Generator: 变长→定长特征转换""" + print('\n--- Generator 全景特征生成器 ---') + + generator = Generator(in_c=128, num=150) + generator.eval() + + # 模拟变长输入 (B=2, C=128, N=可变的200) + torch.manual_seed(42) + x = torch.randn(2, 128, 200) + + with torch.no_grad(): + output = generator(x) + + print(f'输入: {x.shape} (变长,N=200)') + print(f'输出: {output.shape} (固定,K=150)') + + # 可视化输入输出特征 + 
fig, axes = plt.subplots(2, 3, figsize=(18, 10)) + + # 输入特征相似度矩阵 (前50个点) + x_norm = x[0] / (x[0].norm(dim=0, keepdim=True) + 1e-8) + sim_in = (x_norm.T[:50] @ x_norm[:, :50]).detach().numpy() + im0 = axes[0, 0].imshow(sim_in, cmap='RdYlBu_r', vmin=-1, vmax=1) + axes[0, 0].set_title('输入特征相似度 (前50点)') + plt.colorbar(im0, ax=axes[0, 0]) + + # 输出特征相似度矩阵 + out_norm = output[0] / (output[0].norm(dim=0, keepdim=True) + 1e-8) + sim_out = (out_norm.T @ out_norm).detach().numpy() + im1 = axes[0, 1].imshow(sim_out, cmap='RdYlBu_r', vmin=-1, vmax=1) + axes[0, 1].set_title('输出特征相似度 (150点)') + plt.colorbar(im1, ax=axes[0, 1]) + + # 输入特征热图 + im2 = axes[0, 2].imshow(x[0, :, :30].detach().numpy(), cmap='viridis', aspect='auto') + axes[0, 2].set_title('输入特征 (30点)') + axes[0, 2].set_xlabel('Point Index'); axes[0, 2].set_ylabel('Channel') + plt.colorbar(im2, ax=axes[0, 2]) + + # 输出特征热图 + im3 = axes[1, 0].imshow(output[0, :, :30].detach().numpy(), cmap='viridis', aspect='auto') + axes[1, 0].set_title('输出特征 (30点)') + axes[1, 0].set_xlabel('Point Index'); axes[1, 0].set_ylabel('Channel') + plt.colorbar(im3, ax=axes[1, 0]) + + # ConvTranspose + AdaptiveMaxPool 原理 + axes[1, 1].set_title('Generator 内部变换', fontsize=12) + axes[1, 1].text(0.5, 0.8, 'ConvTranspose1d(k3,s3)', transform=axes[1, 1].transAxes, + ha='center', fontsize=11, bbox=dict(boxstyle='round', facecolor='lightblue')) + axes[1, 1].text(0.5, 0.6, f'200 → 200*3 = 600', transform=axes[1, 1].transAxes, + ha='center', fontsize=10) + axes[1, 1].text(0.5, 0.4, 'AdaptiveMaxPool1d(150)', transform=axes[1, 1].transAxes, + ha='center', fontsize=11, bbox=dict(boxstyle='round', facecolor='lightgreen')) + axes[1, 1].text(0.5, 0.2, f'600 → 150', transform=axes[1, 1].transAxes, + ha='center', fontsize=10) + axes[1, 1].axis('off') + + # 特征值分布对比 + axes[1, 2].hist(x[0].detach().numpy().flatten(), bins=50, alpha=0.5, + label='Input', color='steelblue') + axes[1, 2].hist(output[0].detach().numpy().flatten(), bins=50, alpha=0.5, + label='Output', 
color='coral') + axes[1, 2].set_title('特征值分布对比') + axes[1, 2].legend() + + plt.suptitle('Generator: 变长特征→固定大小特征', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'generator_demo.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + # 测试不同输入长度 + print('\nGenerator 对不同输入长度的适应:') + for n in [50, 100, 200, 500]: + x_test = torch.randn(1, 128, n) + with torch.no_grad(): + out = generator(x_test) + print(f' N={n:4d} → 输出形状 {out.shape}') + + +def test_fusion_head(): + """测试FusionHead: 多来源特征融合""" + print('\n--- FusionHead 融合头 ---') + + fusion_head = FusionHead(in_c=128) + fusion_head.eval() + + # 模拟4种特征: + # [0]: fea_kpt_original - BEV原始关键点特征 + # [1]: fea_kpt_original_gen - Generator生成的BEV特征 + # [2]: fea_kpt_gen_gen - 双路径转换器输出 + # [3]: fea_kpl_gen - BEV→图像空间特征 + B, C, K = 2, 128, 150 + torch.manual_seed(42) + + # 让不同来源的特征有相关性但不完全相同 + base = torch.randn(B, C, K) + fea_original = base + fea_gen = base + 0.3 * torch.randn(B, C, K) + fea_gen_gen = fea_gen + 0.2 * torch.randn(B, C, K) + fea_kpl_gen = base + 0.5 * torch.randn(B, C, K) + + fea_kpts = torch.stack([fea_original, fea_gen, fea_gen_gen, fea_kpl_gen], dim=2) + print(f'输入: {fea_kpts.shape} [B, C, K, 4来源]') + + with torch.no_grad(): + fea_fused = fusion_head(fea_kpts) + + print(f'输出: {fea_fused.shape} [B, C, K] 融合特征') + + # 可视化 + fig, axes = plt.subplots(2, 3, figsize=(18, 10)) + + names = ['Original (BEV原始)', 'Generated (全景生成)', + 'Gen_Gen (双路径)', 'KPL_Gen (图像空间)'] + + for idx in range(4): + ax = axes[idx // 2, idx % 2] + sim = torch.nn.functional.cosine_similarity( + fea_kpts[0, :, :, 0].T.unsqueeze(-1), + fea_kpts[0, :, :, idx].T.unsqueeze(0), + dim=1 + ) + im = ax.imshow(sim.detach().numpy(), cmap='RdYlBu_r', vmin=-1, vmax=1) + ax.set_title(f'{names[idx]}\nvs Original 相似度') + ax.set_xlabel('Point'); ax.set_ylabel('Point') + plt.colorbar(im, ax=ax) + + # 融合特征 vs 原始特征 + ax = axes[1, 2] + sim_fused = 
torch.nn.functional.cosine_similarity( + fea_original[0].T.unsqueeze(-1), + fea_fused[0].T.unsqueeze(0), + dim=1 + ) + im = ax.imshow(sim_fused.detach().numpy(), cmap='RdYlBu_r', vmin=-1, vmax=1) + ax.set_title('Fused vs Original 相似度') + ax.set_xlabel('Point'); ax.set_ylabel('Point') + plt.colorbar(im, ax=ax) + + plt.suptitle('FusionHead: 多来源特征融合分析', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'fusion_head_demo.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_attention_detail(): + """详细可视化FusionHead中的Attention机制""" + print('\n--- FusionHead Attention 详细分析 ---') + + att = Attention(d_model=128) + att.eval() + + # 模拟3对特征的Self-Attention + B, N_pair, C = 2, 3, 128 + torch.manual_seed(42) + x = torch.randn(B * 2, N_pair, C) # 模拟batch*样本数的3对特征 + + with torch.no_grad(): + output, weights = att(x, x, x) + + print(f'Self-Attention 输入: {x.shape}') + print(f'输出: {output.shape}') + print(f'Attention权重: {weights.shape} (B, 3, 3)') + + # 可视化attention权重 + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + weights_np = weights[0].detach().numpy() + im0 = axes[0].imshow(weights_np, cmap='YlOrRd', vmin=0, vmax=1) + axes[0].set_title('Self-Attention 权重 (3对特征)') + axes[0].set_xticks(range(3)) + axes[0].set_xticklabels(['Original', 'Generated', 'Gen_Gen']) + axes[0].set_yticks(range(3)) + axes[0].set_yticklabels(['Original', 'Generated', 'Gen_Gen']) + + for i in range(3): + for j in range(3): + axes[0].text(j, i, f'{weights_np[i, j]:.3f}', ha='center', va='center', + fontsize=12, color='white' if weights_np[i, j] > 0.5 else 'black') + plt.colorbar(im0, ax=axes[0]) + + # Cross-Attention 示意图 + axes[1].set_title('FusionHead Attention 流程', fontsize=12) + steps = [ + '1. 拼接4种特征 [original, gen, gen_gen, kpl_gen]', + '2. 取前3种 [original, gen, gen_gen]', + '3. 对每个样本的3对特征做Self-Attention', + '4. max聚合 → 每样本1个特征', + '5. Cross-Attention with kpl_gen (图像空间特征)', + '6. 
concat(original, cross_out) → Conv1d → 输出' + ] + for i, step in enumerate(steps): + axes[1].text(0.1, 0.9 - i * 0.15, step, transform=axes[1].transAxes, + fontsize=10, family='monospace', + bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.7)) + axes[1].axis('off') + + plt.suptitle('FusionHead Attention 机制详解', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'fusion_attention_detail.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def analyze_parameters(): + """参数量分析""" + print('\n--- 参数量分析 ---') + + gen = Generator(in_c=128, num=150) + fusion = FusionHead(in_c=128) + + for name, model in [('Generator', gen), ('FusionHead', fusion)]: + total = sum(p.numel() for p in model.parameters()) + print(f'\n{name}: {total:,} params ({total / 1e3:.1f}K)') + for n, m in model.named_children(): + p = sum(pmt.numel() for pmt in m.parameters()) + print(f' {n:15s}: {p:>10,} params') + + +def main(): + print('=' * 60) + print('Generator & FusionHead 结构与功能可视化') + print('=' * 60) + + analyze_parameters() + test_generator() + test_fusion_head() + visualize_attention_detail() + + print('\n' + '=' * 60) + print('结构总结:') + print('=' * 60) + print(""" + Generator (全景特征生成器): + ┌──────────────────────────────────────────────┐ + │ 输入: (B, 128, N) N可变 │ + │ ↓ Self-Attention (MHA) │ + │ x2: (B, 128, N) 全局上下文特征 │ + │ ↓ ConvTranspose1d(k3,s3) │ + │ x3: (B, 128, N*3) 上采样扩展 │ + │ ↓ AdaptiveMaxPool1d(150) │ + │ 输出: (B, 128, 150) 固定K个全景特征 │ + └──────────────────────────────────────────────┘ + 作用: 将BEV中可变数量的匹配点特征压缩为固定150个, + 与BEV关键点数量对齐 + + FusionHead (跨模态融合头): + ┌──────────────────────────────────────────────┐ + │ 输入: (B, 128, 150, 4) │ + │ [original, gen, gen_gen, kpl_gen] │ + │ ↓ │ + │ 对前3对 (B*N, 3, C): │ + │ Self-Attn → max(dim=1) → (B*N, C) │ + │ ↓ reshape → (B, N, C) │ + │ Cross-Attention with kpl_gen │ + │ ↓ │ + │ concat(original, cross_out) → Conv1d(256→128) │ + │ 输出: (B, 128, 150) 融合特征 │ + 
└──────────────────────────────────────────────┘ + 作用: 整合多来源特征,增强融合表示 + """) + + print(f'\n所有可视化结果保存在: {OUTPUT_DIR}') + + +if __name__ == '__main__': + main() diff --git a/network_learning/05_netvlad_demo.py b/network_learning/05_netvlad_demo.py new file mode 100644 index 0000000..efa0e27 --- /dev/null +++ b/network_learning/05_netvlad_demo.py @@ -0,0 +1,308 @@ +""" +NetVLAD 全局描述子 Demo +======================= +NetVLAD (Vector of Locally Aggregated Descriptors) 将局部特征聚合为全局描述子。 + +原理: +1. Soft Assignment: 每个局部特征软分配到K个聚类中心 +2. Residual: 计算特征与聚类中心的残差 +3. Aggregation: 加权求和残差 +4. Normalization: 逐聚类L2归一化 + 全局L2归一化 + +论文中使用 cluster_num=16, feature_size=128 +输出: 16 × 128 = 2048 维全局描述子 +""" + +import torch +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from netvlad import NetVLAD, NetVLADLoupe + +OUTPUT_DIR = os.path.join(os.path.dirname(__file__), 'output') +os.makedirs(OUTPUT_DIR, exist_ok=True) + + +def test_netvlad_basic(): + """测试NetVLAD基本功能""" + print('\n--- NetVLAD 基本功能测试 ---') + + netvlad = NetVLAD(fea_size=128, num_clusters=16) + netvlad.eval() + + # 输入: (B=2, C=128, K=150, W=1) + torch.manual_seed(42) + features = torch.randn(2, 128, 150, 1) + + with torch.no_grad(): + vlad = netvlad(features) + + print(f'输入特征: {features.shape} [B, C, K, W]') + print(f'VLAD输出: {vlad.shape} [B, cluster_num × C = 2048]') + print(f'VLAD L2 norm: {vlad.norm(dim=1)}') # 应该是全1(已归一化) + + +def visualize_soft_assignment(): + """可视化软分配过程""" + print('\n--- 软分配可视化 ---') + + netvlad = NetVLAD(fea_size=128, num_clusters=16) + netvlad.eval() + + torch.manual_seed(42) + features = torch.randn(1, 128, 150, 1) + + # 手动提取中间结果 + with torch.no_grad(): + x = features + soft_assign = netvlad.conv(x) + soft_assign = netvlad.relu(soft_assign) + soft_assign = torch.nn.functional.softmax(soft_assign, dim=1) + + # soft_assign: (B, 16, 150, 1) + assign_np 
= soft_assign[0, :, :, 0].numpy() # (16, 150) + + fig, axes = plt.subplots(2, 3, figsize=(18, 10)) + + # 软分配矩阵 + im0 = axes[0, 0].imshow(assign_np, cmap='YlOrRd', aspect='auto') + axes[0, 0].set_title('软分配矩阵 (16 clusters × 150 points)') + axes[0, 0].set_xlabel('Point Index') + axes[0, 0].set_ylabel('Cluster') + plt.colorbar(im0, ax=axes[0, 0]) + + # 每个聚类中心的总权重 + cluster_weight = assign_np.sum(axis=1) + axes[0, 1].bar(range(16), cluster_weight, color='steelblue') + axes[0, 1].axhline(y=150 / 16, color='red', linestyle='--', + label=f'平均={150 / 16:.1f}') + axes[0, 1].set_title('每个聚类的总权重') + axes[0, 1].set_xlabel('Cluster') + axes[0, 1].legend() + + # 每个点的最大分配 + max_cluster = assign_np.argmax(axis=0) + axes[0, 2].hist(max_cluster, bins=16, color='coral', edgecolor='white') + axes[0, 2].set_title('每个点被分配到哪个聚类 (argmax)') + axes[0, 2].set_xlabel('Cluster') + axes[0, 2].set_ylabel('点数') + + # 分配熵(混乱度) + entropy = -(assign_np * np.log(assign_np + 1e-8)).sum(axis=0) + axes[1, 0].bar(range(150), entropy, color='steelblue', width=1.0) + axes[1, 0].set_title('每个点的分配熵\n(高=模糊分配, 低=确定分配)') + axes[1, 0].set_xlabel('Point Index') + axes[1, 0].set_ylabel('Entropy') + + # 前3个聚类的分配权重 + for i in range(3): + axes[1, 1].plot(assign_np[i], alpha=0.7, label=f'Cluster {i}') + axes[1, 1].set_title('前3个聚类的分配权重') + axes[1, 1].set_xlabel('Point Index') + axes[1, 1].set_ylabel('Weight') + axes[1, 1].legend(fontsize=8) + + # 聚类中心可视化 (前2维t-SNE类比) + centroids = netvlad.centroids.detach().numpy() # (16, 128) + # PCA降维到2维 + U, S, Vt = np.linalg.svd(centroids - centroids.mean(axis=0), full_matrices=False) + centroids_2d = (centroids @ Vt[:2].T) + + axes[1, 2].scatter(centroids_2d[:, 0], centroids_2d[:, 1], c=range(16), + cmap='tab20', s=200, edgecolors='black') + for i in range(16): + axes[1, 2].annotate(str(i), (centroids_2d[i, 0], centroids_2d[i, 1]), + fontsize=10, ha='center', va='center') + axes[1, 2].set_title('聚类中心 PCA 2D 可视化') + axes[1, 2].set_xlabel('PC1'); axes[1, 2].set_ylabel('PC2') + + 
plt.suptitle('NetVLAD 软分配机制', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'netvlad_soft_assignment.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_vlad_structure(): + """可视化VLAD向量结构""" + print('\n--- VLAD向量结构可视化 ---') + + netvlad = NetVLAD(fea_size=128, num_clusters=16) + netvlad.eval() + + # 两组明显不同的特征 → 应该产生不同的VLAD + torch.manual_seed(42) + fea1 = torch.randn(1, 128, 150, 1) # 场景A + fea2 = torch.randn(1, 128, 150, 1) # 场景B(不同随机种子) + + with torch.no_grad(): + vlad1 = netvlad(fea1)[0] # (2048,) + vlad2 = netvlad(fea2)[0] + + # 每组同场景特征(加噪声)→ VLAD应相似 + fea1_noisy = fea1 + 0.1 * torch.randn(1, 128, 150, 1) + with torch.no_grad(): + vlad1_noisy = netvlad(fea1_noisy)[0] + + sim_same = torch.nn.functional.cosine_similarity(vlad1, vlad1_noisy, dim=0) + sim_diff = torch.nn.functional.cosine_similarity(vlad1, vlad2, dim=0) + + print(f'同场景(加噪声) VLAD相似度: {sim_same.item():.4f}') + print(f'不同场景 VLAD相似度: {sim_diff.item():.4f}') + print(f'区分度 (同-异): {sim_same.item() - sim_diff.item():.4f}') + + fig, axes = plt.subplots(1, 3, figsize=(18, 5)) + + # VLAD向量可视化 (reshape为16x128) + vlad1_2d = vlad1.view(16, 128).numpy() + vlad2_2d = vlad2.view(16, 128).numpy() + + im0 = axes[0].imshow(vlad1_2d, cmap='RdBu_r', aspect='auto') + axes[0].set_title('VLAD场景A (16×128)') + axes[0].set_xlabel('Feature Dim'); axes[0].set_ylabel('Cluster') + plt.colorbar(im0, ax=axes[0]) + + im1 = axes[1].imshow(vlad2_2d, cmap='RdBu_r', aspect='auto') + axes[1].set_title('VLAD场景B (16×128)') + axes[1].set_xlabel('Feature Dim'); axes[1].set_ylabel('Cluster') + plt.colorbar(im1, ax=axes[1]) + + im2 = axes[2].imshow(np.abs(vlad1_2d - vlad2_2d), cmap='YlOrRd', aspect='auto') + axes[2].set_title(f'|差异| (cos_sim={sim_same.item():.3f})') + axes[2].set_xlabel('Feature Dim'); axes[2].set_ylabel('Cluster') + plt.colorbar(im2, ax=axes[2]) + + plt.suptitle('NetVLAD 全局描述子结构', fontsize=14, fontweight='bold') + 
plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'netvlad_vlad_structure.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def compare_netvlad_variants(): + """对比NetVLAD和NetVLADLoupe""" + print('\n--- NetVLAD vs NetVLADLoupe 对比 ---') + + netvlad = NetVLAD(fea_size=128, num_clusters=16) + netvlad_loupe = NetVLADLoupe(feature_size=128, cluster_size=16, output_dim=256) + + torch.manual_seed(42) + x = torch.randn(2, 128, 150, 1) # NetVLAD输入 (B,C,H,W) + x_loupe = torch.randn(2, 150, 128) # NetVLADLoupe输入 (B,N,C) + + with torch.no_grad(): + v1 = netvlad(x) + v2 = netvlad_loupe(x_loupe) + + print(f'NetVLAD: {sum(p.numel() for p in netvlad.parameters()):,} params') + print(f' 输入: {list(x.shape)} → 输出: {list(v1.shape)}') + print(f'NetVLADLoupe: {sum(p.numel() for p in netvlad_loupe.parameters()):,} params') + print(f' 输入: {list(x_loupe.shape)} → 输出: {list(v2.shape)}') + + # 示意图 + fig, axes = plt.subplots(1, 2, figsize=(16, 6)) + + # NetVLAD 流程 + axes[0].set_title('NetVLAD (论文使用)', fontsize=13, fontweight='bold') + steps_vlad = [ + '输入: (B, 128, 150, 1)', + '↓ Conv2d(128→16) + Softmax', + '软分配: (B, 16, 150, 1)', + '↓ 残差 = x - centroids', + '残差: (B, 16, 150, 128)', + '↓ sum(软分配 × 残差)', + 'VLAD: (B, 16, 128)', + '↓ L2归一化 (per cluster)', + '↓ flatten + L2归一化', + '输出: (B, 2048)' + ] + for i, s in enumerate(steps_vlad): + axes[0].text(0.1, 0.95 - i * 0.09, s, transform=axes[0].transAxes, + fontsize=10, family='monospace') + axes[0].axis('off') + + # NetVLADLoupe 流程 + axes[1].set_title('NetVLADLoupe', fontsize=13, fontweight='bold') + steps_loupe = [ + '输入: (B, N, 128)', + '↓ x @ cluster_weights', + '↓ Softmax + BatchNorm', + '软分配: (B, N, 16)', + '↓ activation @ x', + '↓ 减去中心校正项 a', + '↓ L2归一化', + '↓ MLP: 2048 → 256', + '↓ Context Gating', + '输出: (B, 256)' + ] + for i, s in enumerate(steps_loupe): + axes[1].text(0.1, 0.95 - i * 0.09, s, transform=axes[1].transAxes, + fontsize=10, family='monospace') + axes[1].axis('off') + + 
plt.suptitle('NetVLAD 两种变体对比', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'netvlad_variants.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def main(): + print('=' * 60) + print('NetVLAD 全局描述子 结构与功能可视化') + print('=' * 60) + + test_netvlad_basic() + visualize_soft_assignment() + visualize_vlad_structure() + compare_netvlad_variants() + + print('\n' + '=' * 60) + print('结构总结:') + print('=' * 60) + print(""" + NetVLAD (全局描述子聚合): + + 论文中使用: + - cluster_num: 16 + - feature_size: 128 + - 输出: 2048维全局描述子 + + VLAD计算步骤: + 1. Soft Assignment: soft_assign = Softmax(Conv2d(128→16)(x)) + 每个局部特征被软分配到16个聚类中心 + + 2. Residual: residual = x - centroids + 计算特征与每个聚类中心的残差 + + 3. VLAD Core: vlad = Σ(soft_assign × residual) / Σsoft_assign + 按聚类聚合加权残差 + + 4. Normalization: + - 逐聚类 L2 norm + - flatten + - 全局 L2 norm + + 最终VLAD融合: + vlads = sigmoid(w) × vlad_fusion + (1-sigmoid(w)) × vlad_bev + 其中 w 是可学习参数 + + VLAD vs 平均池化: + - 平均池化: 丢失空间分布信息 + - VLAD: 通过聚类保留了"哪些类型的特征在哪里出现"的信息 + """) + + print(f'\n所有可视化结果保存在: {OUTPUT_DIR}') + + +if __name__ == '__main__': + main() diff --git a/network_learning/06_uot_demo.py b/network_learning/06_uot_demo.py new file mode 100644 index 0000000..7948163 --- /dev/null +++ b/network_learning/06_uot_demo.py @@ -0,0 +1,356 @@ +""" +UOT (Unbalanced Optimal Transport) 位姿估计 Demo +================================================= +UOTHead 使用 Sinkhorn 非平衡最优传输进行特征匹配和位姿估计。 + +流程: +1. Cosine Cost Matrix: C = 1 - cosine_sim(feat1, feat2) +2. Sinkhorn Unbalanced OT: 迭代求解运输计划 T +3. Point Projection: project_kpts = T @ kpts2 / sum(T) +4. 
Weighted SVD: 从匹配点对估计刚体变换 R|t + +关键参数: +- epsilon: 熵正则化(控制运输计划的平滑度) +- gamma: 质量正则化(允许部分匹配) +- sinkhorn_iter: 5次迭代 +""" + +import torch +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from uot import UOTHead, sinkhorn_unbalanced, compute_rigid_transform + +OUTPUT_DIR = os.path.join(os.path.dirname(__file__), 'output') +os.makedirs(OUTPUT_DIR, exist_ok=True) + + +def visualize_cost_matrix(): + """可视化代价矩阵""" + print('\n--- 代价矩阵 (Cost Matrix) ---') + + torch.manual_seed(42) + # 模拟query和positive的150个关键点特征 + feat1 = torch.randn(2, 150, 128) # query + feat2 = torch.randn(2, 150, 128) # positive + + # 让部分特征相似(模拟真实闭环场景) + # 前100个特征点有对应关系 + feat2[:, :100] = feat1[:, :100] + 0.1 * torch.randn(2, 100, 128) + + # 计算cosine cost matrix + feat1_norm = feat1 / (feat1.norm(dim=2, keepdim=True) + 1e-8) + feat2_norm = feat2 / (feat2.norm(dim=2, keepdim=True) + 1e-8) + C = 1.0 - torch.bmm(feat1_norm, feat2_norm.transpose(1, 2)) + + C_np = C[0].numpy() + + fig, axes = plt.subplots(1, 3, figsize=(18, 5)) + + im0 = axes[0].imshow(C_np, cmap='YlOrRd') + axes[0].set_title('Cost Matrix C = 1 - cos_sim') + axes[0].set_xlabel('Positive Point j') + axes[0].set_ylabel('Query Point i') + plt.colorbar(im0, ax=axes[0]) + + # 缩放看前30个点(有对应关系的) + im1 = axes[1].imshow(C_np[:30, :30], cmap='YlOrRd') + axes[1].set_title('Cost Matrix (前30×30)\n有模拟对应关系') + axes[1].set_xlabel('Positive Point j') + axes[1].set_ylabel('Query Point i') + plt.colorbar(im1, ax=axes[1]) + + # 对角线cost分布 vs 非对角线 + diag_cost = np.diag(C_np) + off_diag = C_np[~np.eye(150, dtype=bool)] + + axes[2].hist(diag_cost, bins=30, alpha=0.6, label=f'对角线(匹配点)\nmean={diag_cost.mean():.3f}', + color='green') + axes[2].hist(off_diag, bins=30, alpha=0.6, label=f'非对角线\nmean={off_diag.mean():.3f}', + color='gray') + axes[2].set_title('Cost分布: 匹配 vs 非匹配') + axes[2].set_xlabel('Cost') + 
axes[2].legend(fontsize=8) + + plt.suptitle('UOT 代价矩阵分析', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'uot_cost_matrix.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_sinkhorn(): + """可视化Sinkhorn迭代过程""" + print('\n--- Sinkhorn 迭代过程 ---') + + torch.manual_seed(42) + # 构造有明显对应关系的特征 + B, N, C = 1, 50, 128 + feat1 = torch.randn(B, N, C) + feat1_norm = feat1 / (feat1.norm(dim=2, keepdim=True) + 1e-8) + + # feat2是feat1的扰动版本 + feat2 = feat1 + 0.15 * torch.randn(B, N, C) + feat2_norm = feat2 / (feat2.norm(dim=2, keepdim=True) + 1e-8) + + C = 1.0 - torch.bmm(feat1_norm, feat2_norm.transpose(1, 2)) + + epsilon = torch.tensor([0.05]) + gamma = torch.tensor([1.0]) + + # 逐步可视化Sinkhorn迭代 + K = torch.exp(-C / epsilon) + max_iter = 5 + power = gamma / (gamma + epsilon + 1e-8) + + a = torch.ones((B, N, 1)) / N + prob1 = torch.ones((B, N, 1)) / N + prob2 = torch.ones((B, N, 1)) / N + + fig, axes = plt.subplots(2, 4, figsize=(18, 9)) + + # K (初始) + K_np = K[0].numpy() + im0 = axes[0, 0].imshow(K_np, cmap='YlOrRd') + axes[0, 0].set_title('K (exp(-C/ε))\n迭代0') + axes[0, 0].set_xlabel('Positive'); axes[0, 0].set_ylabel('Query') + plt.colorbar(im0, ax=axes[0, 0]) + + for iteration in range(1, min(max_iter + 1, 7)): + # Update b + KTa = torch.bmm(K.transpose(1, 2), a) + b = torch.pow(prob2 / (KTa + 1e-8), power) + # Update a + Kb = torch.bmm(K, b) + a = torch.pow(prob1 / (Kb + 1e-8), power) + + T = torch.mul(torch.mul(a, K), b.transpose(1, 2)) + T_np = T[0].numpy() + + ax = axes[(iteration) // 4, (iteration) % 4] + im = ax.imshow(T_np, cmap='YlOrRd') + ax.set_title(f'Transport Plan T\n迭代{iteration}') + ax.set_xlabel('Positive'); ax.set_ylabel('Query') + plt.colorbar(im, ax=ax) + + # 空余位置 + for i in range(max_iter + 1, 8): + axes[i // 4, i % 4].axis('off') + + plt.suptitle('Sinkhorn 非平衡最优传输迭代过程', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 
'uot_sinkhorn_iterations.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def test_rigid_transform(): + """测试刚体变换估计""" + print('\n--- Weighted SVD 刚体变换估计 ---') + + torch.manual_seed(42) + B, N = 2, 150 + + # 真实变换 + angle = torch.tensor(0.5) # ~28.6度 + R_true = torch.tensor([ + [torch.cos(angle), -torch.sin(angle), 0], + [torch.sin(angle), torch.cos(angle), 0], + [0, 0, 1] + ]).unsqueeze(0).repeat(B, 1, 1) + t_true = torch.tensor([2.0, -1.0, 0.1]).unsqueeze(0).unsqueeze(-1).repeat(B, 1, 1) + + # query点云 + pts1 = torch.randn(B, N, 3) * 20 + + # positive点云 = R * query + t + noise + pts2 = R_true @ pts1.transpose(1, 2) + t_true + pts2 = pts2.transpose(1, 2) + 0.3 * torch.randn(B, N, 3) + + # 模拟transport weights(前80个点匹配好,后70个匹配差) + weights = torch.ones(B, N) + weights[:, 80:] = 0.1 # 降低后70个点的权重 + + # 估计变换 + transform = compute_rigid_transform(pts1, pts2, weights) + + # 评估 + R_est = transform[:, :3, :3] + t_est = transform[:, :3, 3] + + # 旋转误差 + R_err = R_est @ R_true.transpose(1, 2) + trace = torch.diagonal(R_err, dim1=1, dim2=2).sum(dim=1) + angle_err = torch.acos(torch.clamp((trace - 1) / 2, -1, 1)) * 180 / np.pi + + # 平移误差 + t_err = (t_est - t_true.squeeze(-1)).norm(dim=1) + + print(f'旋转误差: {angle_err[0].item():.2f}°') + print(f'平移误差: {t_err[0].item():.3f}m') + + # 可视化 + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + # 3D点云(XY平面投影) + pts1_2d = pts1[0, :, :2].numpy() + pts2_2d = pts2[0, :, :2].numpy() + + # 投影点 + pts1_transformed = (R_est[0] @ pts1[0].T + t_est[0].unsqueeze(-1)).T[:, :2].numpy() + + axes[0].scatter(pts1_2d[:, 0], pts1_2d[:, 1], c='blue', s=10, alpha=0.6, label='Query') + axes[0].scatter(pts2_2d[:, 0], pts2_2d[:, 1], c='red', s=10, alpha=0.6, label='Positive') + for i in range(min(20, N)): + if weights[0, i] > 0.5: + axes[0].plot([pts1_2d[i, 0], pts2_2d[i, 0]], + [pts1_2d[i, 1], pts2_2d[i, 1]], + 'gray', alpha=0.3, linewidth=0.5) + axes[0].set_title('匹配点对 (蓝色→红色)') + axes[0].set_xlabel('X (m)'); 
axes[0].set_ylabel('Y (m)') + axes[0].legend(fontsize=8) + axes[0].set_aspect('equal') + + # 变换后 + axes[1].scatter(pts1_transformed[:, 0], pts1_transformed[:, 1], + c='blue', s=10, alpha=0.6, label='Query (变换后)') + axes[1].scatter(pts2_2d[:, 0], pts2_2d[:, 1], + c='red', s=10, alpha=0.6, label='Positive (目标)') + axes[1].set_title(f'变换后对比\n旋转误差:{angle_err[0].item():.2f}° 平移误差:{t_err[0].item():.3f}m') + axes[1].set_xlabel('X (m)'); axes[1].set_ylabel('Y (m)') + axes[1].legend(fontsize=8) + axes[1].set_aspect('equal') + + plt.suptitle('Weighted SVD 刚体变换估计', fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'uot_rigid_transform.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + +def visualize_epsilon_gamma(): + """可视化epsilon和gamma参数的影响""" + print('\n--- epsilon/gamma 参数分析 ---') + + torch.manual_seed(42) + N = 50 + feat1 = torch.randn(1, N, 128) + feat1_norm = feat1 / (feat1.norm(dim=2, keepdim=True) + 1e-8) + feat2 = feat1 + 0.2 * torch.randn(1, N, 128) + feat2_norm = feat2 / (feat2.norm(dim=2, keepdim=True) + 1e-8) + + epsilons = [0.01, 0.05, 0.1, 0.5] + gammas = [0.1, 1.0, 10.0] + + fig, axes = plt.subplots(len(gammas), len(epsilons), figsize=(16, 12)) + + for gi, gamma in enumerate(gammas): + for ei, eps in enumerate(epsilons): + epsilon = torch.tensor([eps]) + gam = torch.tensor([gamma]) + T = sinkhorn_unbalanced( + feat1_norm, feat2_norm, + epsilon=epsilon, gamma=gam, + max_iter=5, matrix='cosine' + ) + ax = axes[gi, ei] + im = ax.imshow(T[0].numpy(), cmap='YlOrRd') + ax.set_title(f'ε={eps}, γ={gamma}') + ax.set_xlabel('Positive'); ax.set_ylabel('Query') + plt.colorbar(im, ax=ax) + + plt.suptitle('epsilon (熵正则) 和 gamma (质量正则) 对 Transport Plan 的影响', + fontsize=14, fontweight='bold') + plt.tight_layout() + path = os.path.join(OUTPUT_DIR, 'uot_epsilon_gamma.png') + plt.savefig(path, dpi=150, bbox_inches='tight') + plt.close() + print(f' [保存] {path}') + + print(""" + 参数解释: + - epsilon 
(ε): 熵正则化强度 + - 小ε → Transport Plan更稀疏(hard matching) + - 大ε → Transport Plan更平滑(soft matching) + - gamma (γ): 质量正则化强度 + - 小γ → 允许部分匹配(质量可增减) + - 大γ → 要求质量守恒(所有点必须匹配) + """) + + +def analyze_parameters(): + """参数量分析""" + print('\n--- 参数量分析 ---') + uot = UOTHead(nb_iter=5, name='original') + total = sum(p.numel() for p in uot.parameters()) + print(f'总参数量: {total} (仅 epsilon, gamma 两个可学习标量)') + for name, param in uot.named_parameters(): + print(f' {name}: {param.data.item():.4f}') + + +def main(): + print('=' * 60) + print('UOT (Unbalanced Optimal Transport) 位姿估计可视化') + print('=' * 60) + + analyze_parameters() + visualize_cost_matrix() + visualize_sinkhorn() + test_rigid_transform() + visualize_epsilon_gamma() + + print('\n' + '=' * 60) + print('结构总结:') + print('=' * 60) + print(""" + UOTHead (非平衡最优传输位姿估计): + + ┌──────────────────────────────────────────────────────┐ + │ 输入: feat1(B,150,128), feat2(B,150,128) │ + │ kpts1(B,150,3), kpts2(B,150,3) │ + │ │ + │ 1. Cost Matrix: C = 1 - cosine_sim(feat1, feat2) │ + │ → (B, 150, 150) │ + │ │ + │ 2. Sinkhorn Unbalanced OT (迭代5次): │ + │ K = exp(-C / epsilon) │ + │ for i in range(5): │ + │ b = (prob2 / Kᵀa)^(γ/(γ+ε)) │ + │ a = (prob1 / Kb)^(γ/(γ+ε)) │ + │ T = a ⊙ K ⊙ bᵀ │ + │ → (B, 150, 150) 运输计划 │ + │ │ + │ 3. 投影: project_kpts = T @ kpts2 / ΣT │ + │ → (B, 150, 3) query匹配点在positive空间的投影坐标 │ + │ │ + │ 4. Weighted SVD 刚体变换: │ + │ - 加权中心化 │ + │ - SVD分解协方差 │ + │ - 输出 R(3×3), t(3×1) │ + │ → transformation: (B, 3, 4) │ + └──────────────────────────────────────────────────────┘ + + 为什么用Unbalanced OT(非平衡最优传输)? 
+ - 标准OT要求两个点集大小相同且质量守恒 + - 实际场景:部分关键点在另一帧中可能被遮挡 + - Unbalanced OT允许部分匹配,更鲁棒 + + 两个可学习参数: + - epsilon (ε): 熵正则化,exp(ε)+0.03 + - gamma (γ): 质量正则化,exp(γ) + """) + + print(f'\n所有可视化结果保存在: {OUTPUT_DIR}') + + +if __name__ == '__main__': + main() diff --git a/network_learning/08_full_pipeline_demo.py b/network_learning/08_full_pipeline_demo.py new file mode 100644 index 0000000..6e77211 --- /dev/null +++ b/network_learning/08_full_pipeline_demo.py @@ -0,0 +1,516 @@ +""" +完整流水线 Demo: 端到端网络结构可视化 +===================================== +集成所有子网络,展示从输入到输出的完整数据流。 + +运行模式: + python 08_full_pipeline_demo.py --mode bev # 仅BEV分支 + python 08_full_pipeline_demo.py --mode img # 仅图像分支 + python 08_full_pipeline_demo.py --mode fusion # 完整融合模式 +""" + +import torch +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('Agg') + +import sys +import os +import argparse +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from net import Fusion, BEVHead, ImgHead, FusionHead +from BEVNet import RICNN +from ALIKE.alnet import ALNet +from netvlad import NetVLAD +from uot import UOTHead + +OUTPUT_DIR = os.path.join(os.path.dirname(__file__), 'output') +os.makedirs(OUTPUT_DIR, exist_ok=True) + + +def create_dummy_batch_dict(mode='fusion'): + """创建模拟的batch_dict""" + B = 2 # batch中1对 (query + positive) + batch_dict = { + 'batch_size': 2 * B, + } + + if mode in ('fusion', 'bev'): + batch_dict['bev'] = torch.randn(2 * B, 7, 320, 320) + batch_dict['bev'][:, :3] = torch.sigmoid(batch_dict['bev'][:, :3]) # 可视通道 + batch_dict['bev'][:, 2:3] = (batch_dict['bev'][:, 2:3] > 0.3).float() # guider mask + + if mode in ('fusion', 'img'): + batch_dict['img'] = torch.randint(0, 256, (2 * B, 5, 192, 576)).float() + + if mode == 'fusion': + # 模拟 relation: (B, max_len, K, 2) + max_len, K = 200, 11 # K=1+10: last dim is bev coord + batch_dict['relation'] = torch.zeros(2 * B, max_len, K, 2, dtype=torch.long) + for i in range(2 * B): + n_valid = 150 + 
batch_dict['relation'][i, :n_valid, :K - 1, 0] = torch.randint(0, 576, (n_valid, K - 1)) + batch_dict['relation'][i, :n_valid, :K - 1, 1] = torch.randint(0, 192, (n_valid, K - 1)) + batch_dict['relation'][i, :n_valid, K - 1, 0] = torch.randint(0, 320, (n_valid,)) + batch_dict['relation'][i, :n_valid, K - 1, 1] = torch.randint(0, 320, (n_valid,)) + + # pose_to_frame (训练时需要) + angle = 0.3 + pose = torch.eye(4).unsqueeze(0).repeat(B, 1, 1) + pose[:, 0, 0] = torch.cos(torch.tensor(angle)) + pose[:, 0, 1] = -torch.sin(torch.tensor(angle)) + pose[:, 1, 0] = torch.sin(torch.tensor(angle)) + pose[:, 1, 1] = torch.cos(torch.tensor(angle)) + pose[:, 0, 3] = 2.0 + pose[:, 1, 3] = -1.0 + batch_dict['pose_to_frame'] = pose.clone() + + batch_dict['pose_query'] = torch.eye(4).unsqueeze(0).repeat(B, 1, 1) + batch_dict['pose_positive'] = torch.eye(4).unsqueeze(0).repeat(B, 1, 1) + + batch_dict['label_score'] = torch.zeros(B, 320, 320, 2) + batch_dict['id_query'] = torch.arange(B) + batch_dict['id_positive'] = torch.arange(B) + batch_dict['sequence'] = torch.zeros(B, dtype=torch.long) + + return batch_dict + + +def run_bev_only(): + """仅BEV分支""" + print('\n' + '=' * 60) + print('模式: BEV Only (仅点云分支)') + print('=' * 60) + + cfg = { + 'flag': 'bev', + 'kpts_number_bev': 150, + 'kpts_number_img': 150, + 'cluster_num_bev': 16, + 'cluster_num_img': 16, + 'cluster_num_fusion': 16, + 'sinkhorn_iter': 5, + 'vlad_size': 256, + } + + model = Fusion(cfg) + model.eval() + total_params = sum(p.numel() for p in model.parameters()) + print(f'模型参数量: {total_params:,} ({total_params / 1e6:.2f}M)') + + batch_dict = create_dummy_batch_dict('bev') + + with torch.no_grad(): + output = model(batch_dict) + + print('\n输出:') + for k, v in output.items(): + if isinstance(v, torch.Tensor): + print(f' {k:30s}: {list(v.shape)}') + else: + print(f' {k:30s}: {v}') + + # 可视化BEV分支数据流 + fig, axes = plt.subplots(2, 4, figsize=(18, 9)) + + # BEV输入 (3个可视通道) + if 'bev' in output or 'bev' in batch_dict: + bev_in = 
batch_dict['bev'][0, :3].permute(1, 2, 0).numpy() + axes[0, 0].imshow(bev_in) + axes[0, 0].set_title('BEV输入 (3通道)') + axes[0, 0].axis('off') + + # Score Map + if 'score_bev' in output: + axes[0, 1].imshow(output['score_bev'][0].numpy(), cmap='hot') + axes[0, 1].set_title('BEV Score Map') + axes[0, 1].axis('off') + + # 关键点位置 + if 'key_points' in output and 'pixels_kpt' in output: + bev_show = batch_dict['bev'][0, :3].permute(1, 2, 0).numpy() + axes[0, 2].imshow(bev_show) + kpt = output['pixels_kpt'][0].numpy() + axes[0, 2].scatter(kpt[:, 1], kpt[:, 0], c='red', s=5, alpha=0.8) + axes[0, 2].set_title(f'BEV Top-{len(kpt)} 关键点') + axes[0, 2].axis('off') + + # Descriptor Map (第一通道) + if 'fea_bev' in output: + axes[0, 3].imshow(output['fea_bev'][0, 0].numpy(), cmap='viridis') + axes[0, 3].set_title('BEV Descriptor ch0') + axes[0, 3].axis('off') + + # 关键点特征相似度 + if 'fea_kpt_original' in output: + fea = output['fea_kpt_original'] + # query vs positive 的相似度 + B = fea.shape[0] // 2 + sim = torch.nn.functional.cosine_similarity( + fea[:B].permute(0, 2, 1).unsqueeze(-1), + fea[B:].permute(0, 2, 1).unsqueeze(-2), + dim=1 + )[0] + im = axes[1, 0].imshow(sim.numpy(), cmap='RdYlBu_r', vmin=-1, vmax=1) + axes[1, 0].set_title('Query-Positive 特征相似度') + axes[1, 0].set_xlabel('Positive'); axes[1, 0].set_ylabel('Query') + plt.colorbar(im, ax=axes[1, 0]) + + # VLAD + if 'vlads' in output: + vlad = output['vlads'][0].view(16, 128).numpy() + im = axes[1, 1].imshow(vlad, cmap='RdBu_r', aspect='auto') + axes[1, 1].set_title('VLAD描述子 (16×128)') + axes[1, 1].set_xlabel('Feature Dim'); axes[1, 1].set_ylabel('Cluster') + plt.colorbar(im, ax=axes[1, 1]) + + # 数据流图 + axes[1, 2].set_title('BEV分支数据流') + flow = [ + 'bev (7,320,320)', + '→ x = bev[:3] (可视BEV)', + '→ points = bev[3:7] (坐标)', + '→ RICNN前向', + '→ score_bev (1,320,320)', + '→ fea_bev (128,320,320)', + '→ NMS + Top-K(150)', + '→ key_points (150,4)', + '→ fea_kpt (128,150)', + '→ EncodePosition', + '→ NetVLAD → vlad_bev (2048)', + ] + for 
i, f in enumerate(flow):
+        axes[1, 2].text(0.1, 0.95 - i * 0.1, f, transform=axes[1, 2].transAxes,
+                        fontsize=9, family='monospace')
+    axes[1, 2].axis('off')
+
+    # 参数量饼图
+    axes[1, 3].set_title('BEV分支参数分布')
+    modules = dict(model.bev.feature_extractor.named_children())
+    sizes = []
+    labels = []
+    for name, mod in modules.items():
+        p = sum(pm.numel() for pm in mod.parameters())
+        if p > 0:
+            sizes.append(p)
+            labels.append(f'{name}\n({p/1e3:.0f}K)')
+    axes[1, 3].pie(sizes, labels=labels, autopct='%1.1f%%', textprops={'fontsize': 8})
+
+    plt.suptitle('BEV Only 模式: 点云分支可视化', fontsize=14, fontweight='bold')
+    plt.tight_layout()
+    path = os.path.join(OUTPUT_DIR, 'full_pipeline_bev.png')
+    plt.savefig(path, dpi=150, bbox_inches='tight')
+    plt.close()
+    print(f'[保存] {path}')
+
+
+def run_img_only():
+    """Run the image-only demo: build Fusion with flag 'img', forward a dummy batch, print outputs, save a 2x4 figure."""
+    print('\n' + '=' * 60)
+    print('模式: Image Only (仅图像分支)')
+    print('=' * 60)
+
+    cfg = {
+        'flag': 'img',
+        'kpts_number_bev': 150,
+        'kpts_number_img': 150,
+        'cluster_num_bev': 16,
+        'cluster_num_img': 16,
+        'cluster_num_fusion': 16,
+        'sinkhorn_iter': 5,
+        'vlad_size': 256,
+    }
+
+    model = Fusion(cfg)
+    model.eval()
+    total_params = sum(p.numel() for p in model.parameters())
+    print(f'模型参数量: {total_params:,} ({total_params / 1e6:.2f}M)')
+
+    batch_dict = create_dummy_batch_dict('img')
+
+    with torch.no_grad():
+        output = model(batch_dict)
+
+    print('\n输出:')
+    for k, v in output.items():
+        if isinstance(v, torch.Tensor):
+            print(f' {k:30s}: {list(v.shape)}')
+        else:
+            print(f' {k:30s}: {v}')
+
+    # Visualization: 2x4 grid; panels for missing output keys are simply left empty.
+    fig, axes = plt.subplots(2, 4, figsize=(18, 9))
+
+    # Input image (first three channels of sample 0, cast to uint8 for display)
+    img_in = batch_dict['img'][0, :3].permute(1, 2, 0).numpy().astype(np.uint8)
+    axes[0, 0].imshow(img_in)
+    axes[0, 0].set_title('图像输入 (192×576)')
+    axes[0, 0].axis('off')
+
+    # Score map
+    if 'score_img' in output:
+        axes[0, 1].imshow(output['score_img'][0, 0].numpy(), cmap='hot')
+        axes[0, 1].set_title('图像 Score Map')
+        axes[0, 1].axis('off')
+
+    # Keypoints, drawn over the input image
+    if 'key_pixels' in output:
+        axes[0, 2].imshow(img_in)
+        kpt = output['key_pixels'][0].numpy()
+        axes[0, 2].scatter(kpt[:, 1], kpt[:, 0], c='red', s=5, alpha=0.8)  # column 1 is used as x, column 0 as y
+        axes[0, 2].set_title(f'Top-{len(kpt)} 关键点')
+        axes[0, 2].axis('off')
+
+    # Descriptor map (channel 0 only)
+    if 'fea_img' in output:
+        axes[0, 3].imshow(output['fea_img'][0, 0].numpy(), cmap='viridis')
+        axes[0, 3].set_title('图像 Descriptor ch0')
+        axes[0, 3].axis('off')
+
+    # Keypoint-feature similarity between the first and second half of the batch
+    if 'fea_kpl' in output:
+        fea = output['fea_kpl']
+        B = fea.shape[0] // 2  # split point; the plot title below calls the halves Query / Positive
+        sim = torch.nn.functional.cosine_similarity(  # NOTE(review): permute assumes (2B, K, C) input; flow text says (128,150) - confirm
+            fea[:B].permute(0, 2, 1).unsqueeze(-1),
+            fea[B:].permute(0, 2, 1).unsqueeze(-2),
+            dim=1
+        )[0]
+        im = axes[1, 0].imshow(sim.numpy(), cmap='RdYlBu_r', vmin=-1, vmax=1)
+        axes[1, 0].set_title('Query-Positive 特征相似度')
+        plt.colorbar(im, ax=axes[1, 0])
+
+    # Data-flow summary rendered as monospace text
+    axes[1, 1].set_title('图像分支数据流')
+    flow = [
+        'img (5,192,576)',
+        '→ x = img[:3]/255',
+        '→ ALNet前向',
+        '→ score_img (1,192,576)',
+        '→ fea_img (128,192,576)',
+        '→ NMS(2) + Top-K(150)',
+        '→ key_pixels (150,2)',
+        '→ fea_kpl (128,150)',
+    ]
+    for i, f in enumerate(flow):
+        axes[1, 1].text(0.1, 0.95 - i * 0.11, f, transform=axes[1, 1].transAxes,
+                        fontsize=9, family='monospace')
+    axes[1, 1].axis('off')
+
+    # Parameter-count pie chart over the children of model.img.feature_extractor
+    axes[1, 2].set_title('图像分支参数分布')
+    modules = dict(model.img.feature_extractor.named_children())
+    sizes = []
+    labels = []
+    for name, mod in modules.items():
+        p = sum(pm.numel() for pm in mod.parameters())
+        if p > 0:
+            sizes.append(p)
+            labels.append(f'{name}\n({p/1e3:.0f}K)')
+    axes[1, 2].pie(sizes, labels=labels, autopct='%1.1f%%', textprops={'fontsize': 8})
+
+    axes[1, 3].axis('off')
+
+    plt.suptitle('Image Only 模式: 图像分支可视化', fontsize=14, fontweight='bold')
+    plt.tight_layout()
+    path = os.path.join(OUTPUT_DIR, 'full_pipeline_img.png')
+    plt.savefig(path, dpi=150, bbox_inches='tight')
+    plt.close()
+    print(f'[保存] {path}')
+
+
+def run_fusion():
+    """Run the full fusion demo: build Fusion with flag 'fusion', forward a dummy batch, print outputs, save a 3x4 figure."""
+    print('\n' + '=' * 60)
+    print('模式: Fusion (完整融合)')
+    print('=' * 60)
+
+    cfg = {
+        'flag': 'fusion',
+        'kpts_number_bev': 150,
+        'kpts_number_img': 150,
+        'cluster_num_bev': 16,
+        'cluster_num_img': 16,
+        'cluster_num_fusion': 16,
+        'sinkhorn_iter': 5,
+        'vlad_size': 256,
+    }
+
+    model = Fusion(cfg)
+    model.eval()
+    total_params = sum(p.numel() for p in model.parameters())
+    print(f'模型参数量: {total_params:,} ({total_params / 1e6:.2f}M)')
+
+    batch_dict = create_dummy_batch_dict('fusion')
+
+    with torch.no_grad():
+        output = model(batch_dict)
+
+    print('\n输出:')
+    for k, v in output.items():
+        if isinstance(v, torch.Tensor):
+            print(f' {k:30s}: {list(v.shape)}')
+        else:
+            print(f' {k:30s}: {v}')
+
+    # Visualize the fusion data flow on a 3x4 grid
+    fig, axes = plt.subplots(3, 4, figsize=(22, 15))
+
+    # BEV input (first three channels of sample 0)
+    bev_in = batch_dict['bev'][0, :3].permute(1, 2, 0).numpy()
+    axes[0, 0].imshow(bev_in)
+    axes[0, 0].set_title('BEV 输入 (320×320)')
+    axes[0, 0].axis('off')
+
+    # Image input (first three channels of sample 0, cast to uint8 for display)
+    img_in = batch_dict['img'][0, :3].permute(1, 2, 0).numpy().astype(np.uint8)
+    axes[0, 1].imshow(img_in)
+    axes[0, 1].set_title('图像输入 (192×576)')
+    axes[0, 1].axis('off')
+
+    # Score maps of both branches
+    if 'score_bev' in output:
+        axes[0, 2].imshow(output['score_bev'][0].numpy(), cmap='hot')
+        axes[0, 2].set_title('BEV Score')
+        axes[0, 2].axis('off')
+    if 'score_img' in output:
+        axes[0, 3].imshow(output['score_img'][0, 0].numpy(), cmap='hot')
+        axes[0, 3].set_title('Image Score')
+        axes[0, 3].axis('off')
+
+    # Similarity in the original vs. the fused keypoint-feature space
+    if 'fea_kpt_original' in output and 'fea_kpt_fusion' in output:
+        fea_orig = output['fea_kpt_original']
+        fea_fusion = output['fea_kpt_fusion']
+        B = fea_orig.shape[0] // 2  # first half of the batch is compared against the second half
+
+        sim_orig = torch.nn.functional.cosine_similarity(  # NOTE(review): permute assumes (2B, K, C) input - confirm against the model
+            fea_orig[:B].permute(0, 2, 1).unsqueeze(-1),
+            fea_orig[B:].permute(0, 2, 1).unsqueeze(-2),
+            dim=1
+        )[0].numpy()
+
+        sim_fusion = torch.nn.functional.cosine_similarity(
+            fea_fusion[:B].permute(0, 2, 1).unsqueeze(-1),
+            fea_fusion[B:].permute(0, 2, 1).unsqueeze(-2),
+            dim=1
+        )[0].numpy()
+
+        im1 = axes[1, 0].imshow(sim_orig, cmap='RdYlBu_r', vmin=-1, vmax=1)
+        axes[1, 0].set_title('原始特征 相似度 (150×150)')
+        plt.colorbar(im1, ax=axes[1, 0])
+
+        im2 = axes[1, 1].imshow(sim_fusion, cmap='RdYlBu_r', vmin=-1, vmax=1)
+        axes[1, 1].set_title('融合特征 相似度 (150×150)')
+        plt.colorbar(im2, ax=axes[1, 1])
+
+        im_diff = axes[1, 2].imshow(np.abs(sim_orig - sim_fusion), cmap='YlOrRd')  # keep the handle so the colorbar matches this panel
+        axes[1, 2].set_title('相似度变化 |差异|')
+        plt.colorbar(im_diff, ax=axes[1, 2])  # was im2, which drew the fused-similarity scale next to the difference map
+
+    # Global VLAD descriptor of sample 0, viewed as a 16x128 matrix
+    if 'vlads' in output:
+        vlad = output['vlads'][0].view(16, 128).numpy()
+        im = axes[1, 3].imshow(vlad, cmap='RdBu_r', aspect='auto')
+        axes[1, 3].set_title('VLAD 融合 (16×128)')
+        plt.colorbar(im, ax=axes[1, 3])
+
+    # Overall architecture sketch rendered as monospace text
+    axes[2, 0].set_title('完整架构')
+    arch = [
+        '┌─ BEVHead ─────────────┐',
+        '│ RICNN + EncodePos │',
+        '│ → fea_kpt_original │',
+        '│ → vlad_bev │',
+        '└───────────────────────┘',
+        '┌─ ImgHead ─────────────┐',
+        '│ ALNet + NMS │',
+        '│ → fea_kpl │',
+        '│ → fea_img │',
+        '└───────────────────────┘',
+        '┌─ FusionHead ──────────┐',
+        '│ LocalPool + Converter │',
+        '│ Generator + FusionHead│',
+        '│ → fea_kpt_fusion │',
+        '└───────────────────────────────────────────────────────┘',
+        ' VLAD = w·vlad_fusion + (1-w)·vlad_bev'
+    ]
+    for i, a in enumerate(arch):
+        axes[2, 0].text(0.05, 0.98 - i * 0.075, a, transform=axes[2, 0].transAxes,
+                        fontsize=7.5, family='monospace')
+    axes[2, 0].axis('off')
+
+    # Parameter count per top-level child module of the model
+    axes[2, 1].set_title('各模块参数量')
+    module_names = []
+    module_params = []
+    for name, mod in model.named_children():
+        p = sum(pm.numel() for pm in mod.parameters())
+        if p > 0:
+            module_names.append(name)
+            module_params.append(p)
+    colors = plt.cm.Set3(np.linspace(0, 1, len(module_names)))
+    axes[2, 1].barh(range(len(module_names)), module_params, color=colors)
+    axes[2, 1].set_yticks(range(len(module_names)))
+    axes[2, 1].set_yticklabels(module_names, fontsize=8)
+    for i, p in enumerate(module_params):
+        axes[2, 1].text(p, i, f' {p/1e3:.0f}K', va='center', fontsize=8)
+
+    # Data-flow summary rendered as monospace text
+    axes[2, 2].set_title('融合模式数据流')
+    flow = [
+        'img, bev, relation 输入',
+        '├─ ImgHead → ALNet',
+        '│ ├─ score_img',
+        '│ ├─ fea_img (密集描述子)',
+        '│ └─ fea_kpl (关键点)',
+        '├─ BEVHead → RICNN',
+        '│ ├─ score_bev',
+        '│ ├─ fea_bev (密集描述子)',
+        '│ ├─ fea_kpt_original',
+        '│ └─ vlad_bev',
+        '└─ FusionHead',
+        ' ├─ GridSample → fea_pl_dual, fea_pt_dual',
+        ' ├─ Converters → 跨模态转换',
+        ' ├─ Generator → 全景特征',
+        ' ├─ FusionHead → 融合特征',
+        ' └─ NetVLAD → vlad_fusion',
+        '最终: vlads = w·vlad_fusion + (1-w)·vlad_bev',
+        ' UOT: → transformation (位姿)',
+    ]
+    for i, f in enumerate(flow):
+        axes[2, 2].text(0.05, 0.98 - i * 0.06, f, transform=axes[2, 2].transAxes,
+                        fontsize=7.5, family='monospace')
+    axes[2, 2].axis('off')
+
+    axes[2, 3].axis('off')
+
+    plt.suptitle('Fusion 模式: 完整跨模态融合可视化', fontsize=14, fontweight='bold')
+    plt.tight_layout()
+    path = os.path.join(OUTPUT_DIR, 'full_pipeline_fusion.png')
+    plt.savefig(path, dpi=150, bbox_inches='tight')
+    plt.close()
+    print(f'[保存] {path}')
+
+
+def main():  # CLI entry point: parse --mode and run the selected demo(s); 'all' runs bev, img and fusion in that order
+    parser = argparse.ArgumentParser(description='全流水线可视化')
+    parser.add_argument('--mode', type=str, default='all',
+                        choices=['all', 'bev', 'img', 'fusion'],
+                        help='运行模式')
+    args = parser.parse_args()
+
+    if args.mode in ('all', 'bev'):
+        run_bev_only()
+    if args.mode in ('all', 'img'):
+        run_img_only()
+    if args.mode in ('all', 'fusion'):
+        run_fusion()
+
+    print(f'\n所有可视化结果保存在: {OUTPUT_DIR}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/network_learning/LEARNING_GUIDE.md b/network_learning/LEARNING_GUIDE.md
new file mode 100644
index 0000000..af68400
--- /dev/null
+++ b/network_learning/LEARNING_GUIDE.md
@@ -0,0 +1,419 @@
+# 网络结构学习指南
+
+> 论文:[Cross Fusion of Point Cloud and Learned Image for Loop Closure Detection](../Cross_Fusion_of_Point_Cloud_and_Learned_Image_for_Loop_Closure_Detection.pdf)
+
+---
+
+## 目录
+
+1. [项目总览](#1-项目总览)
+2. [网络结构全景图](#2-网络结构全景图)
+3. [ALNet — 图像特征提取器](#3-alnet--图像特征提取器)
+4. [RICNN — 旋转不变CNN](#4-ricnn--旋转不变cnn)
+5. [EncodePosition — 位置编码](#5-encodeposition--位置编码)
+6. [Converter — 跨模态特征转换器](#6-converter--跨模态特征转换器)
+7.
[Generator & FusionHead — 特征生成与融合](#7-generator--fusionhead--特征生成与融合) +8. [LocalPool — 局部特征聚合](#8-localpool--局部特征聚合) +9. [NetVLAD — 全局描述子聚合](#9-netvlad--全局描述子聚合) +10. [UOTHead — 最优传输位姿估计](#10-uothead--最优传输位姿估计) +11. [完整数据流](#11-完整数据流) +12. [学习路线建议](#12-学习路线建议) + +--- + +## 1. 项目总览 + +本项目实现**点云-图像跨模态融合的闭环检测**系统,共包含 **9 个网络结构**: + +| # | 网络 | 源文件 | 作用 | +|---|------|------|------| +| 1 | **ALNet** | `ALIKE/alnet.py` | 图像特征提取(关键点+描述子) | +| 2 | **RICNN** | `BEVNet.py` | BEV点云特征提取(旋转不变) | +| 3 | **EncodePosition** | `BEVNet.py` | 关键点空间位置编码 | +| 4 | **Converter** | `net.py` | 跨模态特征空间转换 | +| 5 | **Generator** | `net.py` | 变长特征→固定大小 | +| 6 | **FusionHead** | `net.py` | 多来源特征Attention融合 | +| 7 | **LocalPool** | `net.py` | 多像素特征→单体素聚合 | +| 8 | **NetVLAD** | `netvlad.py` | 局部特征→全局描述子 | +| 9 | **UOTHead** | `uot.py` | 最优传输→位姿估计 | + +### 运行模式 + +| flag | 含义 | 包含模块 | +|------|------|---------| +| `bev` | 仅点云 | 2, 3, 8 | +| `img` | 仅图像 | 1 | +| `fusion` | 完整融合 | 全部 1-9 | + +### 关键维度 + +| 参数 | 值 | +|------|-----| +| BEV图尺寸 (H×W) | 320×320 | +| BEV输入通道 | 7 (max_z, intensity, density, cx, cy, cz, ci) | +| 图像尺寸 (H×W) | 192×576 | +| 关键点数量 (BEV/Img) | 150 | +| 特征维度 | 128 | +| VLAD聚类数 | 16 | +| VLAD输出维度 | 2048 (=16×128) | + +--- + +## 2. 
网络结构全景图 + +``` + ┌─────────────────────────┐ + │ 输入 img + bev + relation│ + └──────┬──────────┬───────┘ + │ │ + ┌──────────┘ └──────────┐ + ▼ ▼ + ┌──────────────┐ ┌──────────────┐ + │ ImgHead │ │ BEVHead │ + │ (ALNet) │ │ (RICNN) │ + │ │ │ │ + │ score_img │ │ score_bev │ + │ fea_img │ │ fea_bev │ + │ fea_kpl │ │ fea_kpt_orig │ + │ key_pixels │ │ key_points │ + └──────┬───────┘ │ vlad_bev │ + │ └──────┬───────┘ + │ │ + │ ┌─────────────────────────┘ + │ │ + ▼ ▼ + ┌─────────────────────────────────────┐ + │ FusionHead │ + │ │ + │ LocalPool → Converter(cvt_bev) │ + │ GridSample → Converter(cvt_img) │ + │ Generator → FusionHead(Attention) │ + │ │ + │ fea_kpt_fusion (B, 128, 150) │ + └─────────────┬───────────────────────┘ + │ + ┌───────▼────────┐ + │ NetVLAD │ + │ vlad_fusion │ + └───────┬────────┘ + │ + ┌───────▼────────┐ + │ VLAD 融合 │ + │ w*fusion + │ + │ (1-w)*bev │ + └───────┬────────┘ + │ + ┌───────▼────────┐ + │ UOTHead │ + │ (仅训练时) │ + │→ transformation│ + └────────────────┘ +``` + +--- + +## 3. ALNet — 图像特征提取器 + +**源码**: `ALIKE/alnet.py` | **Demo**: `python 01_alnet_demo.py` + +### 结构 + +``` +输入: (B, 3, 192, 576) + ↓ +block1: ConvBlock(3→16) → (B, 16, 192, 576) + ↓ MaxPool2d(2) +block2: ResBlock(16→32) → (B, 32, 96, 288) + ↓ MaxPool2d(4) +block3: ResBlock(32→64) → (B, 64, 24, 72) + ↓ MaxPool2d(4) +block4: ResBlock(64→128) → (B, 128, 6, 18) + ↓ +特征聚合: 4尺度concat + 上采样 → (B, 128, 192, 576) + ↓ Conv1x1(128→129) +输出: score(B,1,192,576) + desc(B,128,192,576) +``` + +### 设计要点 + +- **多尺度特征聚合**: 4阶段特征通过1x1conv压缩后上采样拼接,兼顾浅层定位精度和深层语义 +- **共享检测+描述**: 单一骨干同时输出关键点得分和密集描述子 +- **配置(alike-n)**: c1=16, c2=32, c3=64, c4=128, dim=128 +- **关键点选择**: NMS (radius=2, 2轮) + Top-K=150 + +### 各阶段含义 + +| 阶段 | 分辨率 | 学习内容 | +|------|-------|---------| +| block1 | 原始 | 边缘、角点等低级特征 | +| block2 | 1/2 | 纹理、局部形状 | +| block3 | 1/8 | 物体部件、语义信息 | +| block4 | 1/32 | 全局上下文、场景级信息 | + +--- + +## 4. 
RICNN — 旋转不变CNN + +**源码**: `BEVNet.py` | **Demo**: `python 02_ricnn_demo.py` + +### 结构 + +``` +输入: BEV图像 (B, 3, 320, 320) + ↓ +block1: RIConvBlock(3→16) → (B, 16, 320, 320) + ↓ RIMaxpool2d(2) +block2: RIResBlock(16→32) → (B, 32, 160, 160) + ↓ RIMaxpool2d(5, stride=4) +block3: RIResBlock(32→64) → (B, 64, 40, 40) + ↓ RIMaxpool2d(5, stride=4) +block4: RIResBlock(64→128) → (B, 128, 10, 10) + ↓ +特征聚合 (同ALNet) → (B, 128, 320, 320) + ↓ Conv1x1(128→129) +输出: score(B,1,320,320) + desc(B,128,320,320) +``` + +### 旋转不变性原理(核心创新) + +BEV图像中车辆旋转时点云投影会旋转。RICNN通过以下机制保持特征不变: + +**RIConv2d**: 根据kernel位置到中心的**欧氏距离**分组,同距离共享权重 + +``` +标准 5×5 kernel (25个独立权重): RI kernel (3组共享权重): +[0 1 2 3 4] [0 1 1 1 0] +[1 2 3 4 5] [1 2 2 2 1] +[2 3 4 5 6] [1 2 3 2 1] ← 3组: dis=0,1,2 +[1 2 3 4 5] [1 2 2 2 1] +[0 1 2 3 4] [0 1 1 1 0] +``` + +**RIMaxpool2d / RIAvgpool2d**: 只取圆形区域内像素,排除对角线角点(旋转不一致) + +**推理优化**: `disable_ri()` 可将RI层转为标准CNN层 + +--- + +## 5. EncodePosition — 位置编码 + +**源码**: `BEVNet.py` | **Demo**: 包含在 `02_ricnn_demo.py` + +``` +输入: kpts (B, 150, 4), fea (B, 128, 150) + ↓ +1. 计算150×150关键点欧氏距离矩阵 +2. 距离直方图 (16 bins, range=[1,80]m) +3. 直方图归一化 +4. MLP: 16→64→64→128 +5. fea_out = fea + MLP(hist) (残差连接) +``` + +将关键点间空间关系编码到特征中,帮助网络理解"哪些关键点在物理空间中相邻"。 + +--- + +## 6. Converter — 跨模态特征转换器 + +**源码**: `net.py` | **Demo**: `python 03_converter_demo.py` + +``` +输入: x (B, 128, N) N个特征点 + ├─ 路径1: Self-Attention(MHA) → x2 (B, 128, N) + ├─ 路径2: Conv1d瓶颈(128→32→16→32→128) → x3 (B, 128, N) + └─ concat([x2,x3]) → Conv1d(256→128) → 输出 (B, 128, N) +``` + +### 两种使用 + +| 转换器 | 输入 → 输出 | 含义 | +|--------|------------|------| +| `cvt_bev` | 图像特征 → BEV空间 | 让图像特征"理解"BEV几何 | +| `cvt_img` | BEV特征 → 图像空间 | 让BEV特征"理解"图像语义 | + +双路径设计:MHA捕获全局关系,Conv1d做逐点变换,互补增强。 + +--- + +## 7. 
Generator & FusionHead — 特征生成与融合 + +**源码**: `net.py` | **Demo**: `python 04_generator_fusion_demo.py` + +### Generator (全景特征生成器) + +``` +输入: (B, 128, N) N可变 + ↓ Self-Attention + ↓ ConvTranspose1d(k3, s3) → 上采样扩展 + ↓ AdaptiveMaxPool1d(150) +输出: (B, 128, 150) 固定K=150 +``` + +将可变数量的匹配点特征压缩为固定150个,与BEV关键点对齐。 + +### FusionHead (跨模态融合头) + +``` +输入: (B, 128, 150, 4) ← [original, gen, gen_gen, kpl_gen] + ↓ +Step 1: 对前3对做 Self-Attn → max聚合 +Step 2: Cross-Attn with kpl_gen (图像空间特征) + ↓ +concat(original, cross_out) → Conv1d(256→128) +输出: (B, 128, 150) 融合特征 +``` + +4种特征来源: + +| 特征 | 来源 | 空间 | +|------|------|------| +| `original` | RICNN直接提取 | BEV | +| `gen` | Generator从图像特征生成 | 图像→BEV | +| `gen_gen` | cvt_bev(cvt_img(original)) | BEV→图像→BEV循环 | +| `kpl_gen` | cvt_img(original) | BEV→图像残留 | + +--- + +## 8. LocalPool — 局部特征聚合 + +**源码**: `net.py` | 轻量级模块,无独立demo + +``` +输入: (B, 128, N, K) N个体素,每体素K个像素(K≤100) + ↓ Conv2d(100→10, k=1) + MaxPool2d((1,10)) +输出: (B, 128, N, 1) → squeeze → (B, 128, N) +``` + +一个BEV体素对应图像上多个像素,需聚合为单个体素特征:1x1 Conv降维 + MaxPool取最显著响应。 + +--- + +## 9. NetVLAD — 全局描述子聚合 + +**源码**: `netvlad.py` | **Demo**: `python 05_netvlad_demo.py` + +``` +输入: (B, 128, 150, 1) + ↓ +1. Soft Assignment: Softmax(Conv2d(128→16)(x)) → (B, 16, 150, 1) +2. Residual: x - centroids[16,128] → (B, 16, 150, 128) +3. VLAD Core: Σ(soft_assign × residual) → (B, 16, 128) +4. 归一化: per-cluster L2 → flatten → global L2 +输出: (B, 2048) +``` + +### 为什么用VLAD + +| 方法 | 问题 | +|------|------| +| 平均池化 | 丢失空间分布信息 | +| VLAD | 通过聚类保留"哪些类型特征在哪里"的结构信息 | + +### VLAD融合 + +```python +vlads = sigmoid(w) * vlad_fusion + (1 - sigmoid(w)) * vlad_bev +``` + +--- + +## 10. UOTHead — 最优传输位姿估计 + +**源码**: `uot.py` | **Demo**: `python 06_uot_demo.py` + +``` +输入: feat1,feat2 (B,150,128), kpts1,kpts2 (B,150,3) + ↓ +1. Cost Matrix: C = 1 - cosine_sim(feat1, feat2) → (B, 150, 150) +2. 
Sinkhorn Unbalanced OT (5 iterations): + K = exp(-C/ε) where ε = exp(ε_raw)+0.03 + a, b 交替更新,γ 控制质量正则 + T = diag(a)·K·diag(b) → (B, 150, 150) +3. 投影: project_kpts = T @ kpts2 / ΣT → (B, 150, 3) +4. Weighted SVD → R, t → transformation (B, 3, 4) +``` + +### 两个可学习参数 + +| 参数 | 含义 | 效果 | +|------|------|------| +| ε | 熵正则化 | 大→平滑匹配, 小→稀疏匹配 | +| γ | 质量正则化 | 大→质量守恒, 小→允许不匹配 | + +非平衡OT允许部分点不匹配,对有遮挡的真实场景更鲁棒。 + +--- + +## 11. 完整数据流 + +### 训练时 + +``` +img → ALNet → fea_img, fea_kpl +bev → RICNN → fea_bev, fea_kpt_original, vlad_bev + +relation → grid_sample(fea_img) → fea_pl_dual + → LocalPool → cvt_bev → fea_pt_dual_gen + → Generator → fea_kpt_original_gen + + grid_sample(fea_bev) → fea_pt_dual (训练时) + → cvt_img → fea_pl_dual_gen (训练时) + +fea_kpt_original → cvt_img → fea_kpl_gen + → cvt_bev → fea_kpt_gen_gen + +FusionHead([original, gen, gen_gen, kpl_gen]) → fea_kpt_fusion + → NetVLAD → vlad_fusion + +vlads = sigmoid(w)*vlad_fusion + (1-sigmoid(w))*vlad_bev + +UOTHead(fea_kpt_original) → transformation_original +UOTHead(fea_kpt_fusion) → transformation_fusion +``` + +### 推理时简化 + +不执行 UOTHead 和 BEV→图像采样(无 `pose_to_frame`),只输出 `vlads` + 局部特征。 + +--- + +## 12. 学习路线建议 + +### 入门(约2-3小时) + +```bash +# 1. 全流程概览 +python 08_full_pipeline_demo.py --mode all + +# 2. 独立分支 +python 01_alnet_demo.py # 图像分支 +python 02_ricnn_demo.py # BEV分支(含旋转不变性测试 + 位置编码) + +# 3. 融合机制 +python 03_converter_demo.py # 跨模态转换 +python 04_generator_fusion_demo.py # 特征生成 + 融合 + +# 4. 全局描述子与位姿 +python 05_netvlad_demo.py # VLAD聚合 +python 06_uot_demo.py # 最优传输位姿估计 +``` + +### 深入 + +1. 阅读论文 Section 3 (Methodology) +2. 对照代码看每个模块的 `forward` 函数 +3. 修改demo中的参数(关键点数量、VLAD聚类数),观察变化 +4. 
加载真实checkpoint运行推理 + +### 运行环境 + +```bash +conda activate fusion_cyy +cd network_learning +``` + +所有可视化图像输出在 `network_learning/output/` 目录。 + +--- + +*基于代码版本: commit c3d268f* diff --git a/network_learning/README.md b/network_learning/README.md new file mode 100644 index 0000000..b2fba14 --- /dev/null +++ b/network_learning/README.md @@ -0,0 +1,50 @@ +# Network Learning — 网络结构可视化学习 + +论文《Cross Fusion of Point Cloud and Learned Image for Loop Closure Detection》中所有网络结构的可视化 Demo。 + +## 快速开始 + +```bash +conda activate fusion_cyy +cd network_learning + +# 依次运行各网络demo +python 01_alnet_demo.py # ALNet — 图像特征提取器 +python 02_ricnn_demo.py # RICNN — 旋转不变CNN + 位置编码 +python 03_converter_demo.py # Converter — 跨模态特征转换器 +python 04_generator_fusion_demo.py # Generator + FusionHead +python 05_netvlad_demo.py # NetVLAD — 全局描述子聚合 +python 06_uot_demo.py # UOT — 最优传输位姿估计 + +# 或一次性看完整流水线 +python 08_full_pipeline_demo.py --mode all +``` + +所有图像输出到 `output/` 目录。 + +## 文件说明 + +| 文件 | 内容 | +|------|------| +| `01_alnet_demo.py` | ALNet中间特征、感受野、参数量分析 | +| `02_ricnn_demo.py` | RICNN卷积核分组、池化区域、旋转不变性测试、位置编码 | +| `03_converter_demo.py` | 跨模态转换前后特征相似度、Attention权重 | +| `04_generator_fusion_demo.py` | Generator变长→定长、FusionHead多来源融合 | +| `05_netvlad_demo.py` | 软分配过程、VLAD结构、NetVLAD变体对比 | +| `06_uot_demo.py` | 代价矩阵、Sinkhorn迭代、刚体变换、参数影响 | +| `08_full_pipeline_demo.py` | BEV/Img/Fusion三种模式端到端可视化 | +| `LEARNING_GUIDE.md` | 完整学习文档(9个网络结构详解) | + +## 网络结构一览 + +| # | 网络 | 文件 | Demo | +|---|------|------|------| +| 1 | ALNet | `ALIKE/alnet.py` | `01_alnet_demo.py` | +| 2 | RICNN | `BEVNet.py` | `02_ricnn_demo.py` | +| 3 | EncodePosition | `BEVNet.py` | `02_ricnn_demo.py` | +| 4 | Converter | `net.py` | `03_converter_demo.py` | +| 5 | Generator | `net.py` | `04_generator_fusion_demo.py` | +| 6 | FusionHead | `net.py` | `04_generator_fusion_demo.py` | +| 7 | LocalPool | `net.py` | (轻量级,见文档) | +| 8 | NetVLAD | `netvlad.py` | `05_netvlad_demo.py` | +| 9 | UOTHead | `uot.py` | `06_uot_demo.py` |