多卡ap测试

2026-04-11 14:12:20 +08:00
parent e66077997d
commit 13e436a146
3 changed files with 186 additions and 80 deletions
--- a/auto_eval_checkpoints.py
+++ b/auto_eval_checkpoints.py
@@ -4,25 +4,26 @@
 """
 自动评估 FUSIONLCD 多个 checkpoint 的脚本

-用法示例：
+支持：
+1. 单卡串行测试：
+   --gpu 0
+2. 多卡并行测试：
+   --gpu 0,1,2,3
+
 python auto_eval_checkpoints.py \
  --project_dir /home/adlab36/chenyouyuan/FUSIONLCD \
  --config /home/adlab36/chenyouyuan/FUSIONLCD/config.yaml \
  --train_script /home/adlab36/chenyouyuan/FUSIONLCD/train.py \
  --models_dir /home/adlab36/chenyouyuan/FUSIONLCD/result/log/models \
  --result_name auto_eval \
-  --gpu 1
+  --gpu 2,3 \
+  --epochs_filter 119,139

+Candidate epochs for --epochs_filter (presumably the saved checkpoints): 099, 119, 139, 159, 179, 199
 说明：
-1. 会备份原 config.yaml 为 config.yaml.bak_auto_eval
-2. 每个 checkpoint 测试前会把 config 改成：
-   - train_flag = 0
-   - validate_flag = 0
-   - test_flag = 1
-   - load_model = 1
-   - last_model = 当前 checkpoint
-3. 每测完一个 checkpoint，会读取 result/<result_name>.txt 追加的新结果
-4. 最终输出 summary.csv
+- 多卡模式下，每个 checkpoint 会分配到一个 GPU
+- 每个子进程使用独立的临时工作目录和独立 config.yaml，避免冲突
+- 会实时输出子进程日志
 """

 from __future__ import annotations
@@ -35,8 +36,10 @@ import shutil
 import subprocess
 import sys
 import time
+import tempfile
 from pathlib import Path
 from typing import Dict, List, Tuple, Optional
+from concurrent.futures import ThreadPoolExecutor, as_completed

 import yaml

@@ -52,7 +55,7 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument("--train_script", type=str, required=True, help="train.py 路径")
    parser.add_argument("--models_dir", type=str, required=True, help="checkpoint 目录")
    parser.add_argument("--result_name", type=str, default="auto_eval", help="train.py 的 result_name")
-    parser.add_argument("--gpu", type=str, default="0", help="GPU id，例如 0 或 1")
+    parser.add_argument("--gpu", type=str, default="0", help="GPU id，例如 0 或 0,1,2,3")
    parser.add_argument("--epochs_filter", type=str, default="", help="只测试指定 epoch，逗号分隔，如 99,109,119")
    parser.add_argument("--min_epoch", type=int, default=None, help="最小 epoch 过滤")
    parser.add_argument("--max_epoch", type=int, default=None, help="最大 epoch 过滤")
@@ -101,17 +104,6 @@ def filter_checkpoints(
    return selected


-def result_txt_path(project_dir: Path, result_name: str) -> Path:
-    return project_dir / "result" / f"{result_name}.txt"
-
-
-def read_result_lines(path: Path) -> List[str]:
-    if not path.exists():
-        return []
-    with path.open("r", encoding="utf-8") as f:
-        return [line.rstrip("\n") for line in f.readlines()]
-
-
 def parse_result_file(path: Path) -> List[Dict]:
    rows: List[Dict] = []
    if not path.exists():
@@ -131,8 +123,6 @@ def parse_result_file(path: Path) -> List[Dict]:
            epoch = int(m.group(2))
            rest = m.group(3).split()

-            # 表头来自你的 log_result:
-            # AP R100 F1 R@1 R@2 R@3 R@4 R@5 R@6 R@7 R@8 R@9 R@10 R@15 R@20 R@25
            if len(rest) < 16:
                continue

@@ -163,8 +153,8 @@ def parse_result_file(path: Path) -> List[Dict]:
    return rows


-def overwrite_test_config(config_path: Path, ckpt_path: Path) -> None:
-    cfg = load_yaml(config_path)
+def make_eval_config(base_config_path: Path, ckpt_path: Path, result_name: str, temp_config_path: Path) -> None:
+    cfg = load_yaml(base_config_path)
    exp = cfg["experiment"]

    exp["train_flag"] = 0
@@ -173,10 +163,17 @@ def overwrite_test_config(config_path: Path, ckpt_path: Path) -> None:
    exp["load_model"] = 1
    exp["last_model"] = str(ckpt_path)

-    save_yaml(config_path, cfg)
+    # 保持原 path_result，不改数据库等输出位置
+    save_yaml(temp_config_path, cfg)


-def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: str) -> int:
+def run_one_eval(
+    work_dir: Path,
+    train_script: Path,
+    result_name: str,
+    gpu: str,
+    tag: str,
+) -> int:
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = gpu

@@ -188,15 +185,16 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s
        "--gpu",
        gpu,
        "--info",
-        "auto_eval",
+        f"auto_eval_{tag}",
    ]

-    print(f"[INFO] Running command: {' '.join(cmd)}")
-    print(f"[INFO] CUDA_VISIBLE_DEVICES={gpu}")
+    print(f"[INFO][{tag}] Running command: {' '.join(cmd)}")
+    print(f"[INFO][{tag}] CUDA_VISIBLE_DEVICES={gpu}")
+    print(f"[INFO][{tag}] cwd={work_dir}")

    proc = subprocess.Popen(
        cmd,
-        cwd=str(project_dir),
+        cwd=str(work_dir),
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
@@ -208,16 +206,16 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s
    try:
        assert proc.stdout is not None
        for line in proc.stdout:
-            print(line, end="")
+            print(f"[{tag}] {line}", end="")
        proc.wait()
        return proc.returncode
    except KeyboardInterrupt:
-        print("\n[WARN] 收到 Ctrl+C，正在终止当前测试子进程...")
+        print(f"\n[WARN][{tag}] 收到 Ctrl+C，正在终止当前测试子进程...")
        proc.terminate()
        try:
            proc.wait(timeout=5)
        except Exception:
-            print("[WARN] 子进程未及时退出，强制 kill")
+            print(f"[WARN][{tag}] 子进程未及时退出，强制 kill")
            proc.kill()
            proc.wait()
        raise
@@ -247,18 +245,69 @@ def save_summary_csv(path: Path, summary: List[Dict]) -> None:
        writer.writerows(summary)


+def run_single_checkpoint(
+    epoch: int,
+    ckpt: Path,
+    gpu: str,
+    args: argparse.Namespace,
+    project_dir: Path,
+    train_script: Path,
+) -> Optional[Dict]:
+    tag = f"gpu{gpu}_ep{epoch}"
+    temp_root = Path(tempfile.mkdtemp(prefix=f"auto_eval_{tag}_"))
+    try:
+        # train.py 会优先从 cwd/config.yaml 读取配置
+        temp_config = temp_root / "config.yaml"
+        make_eval_config(Path(args.config), ckpt, args.result_name, temp_config)
+
+        # train.py 会把 txt 写到 cwd/result/result_name.txt
+        (temp_root / "result").mkdir(parents=True, exist_ok=True)
+
+        ret = run_one_eval(
+            work_dir=temp_root,
+            train_script=train_script,
+            result_name=args.result_name,
+            gpu=gpu,
+            tag=tag,
+        )
+        if ret != 0:
+            print(f"[WARN][{tag}] checkpoint {epoch} 测试失败，返回码 {ret}")
+            return None
+
+        time.sleep(args.sleep_sec)
+
+        result_txt = temp_root / "result" / f"{args.result_name}.txt"
+        parsed = parse_result_file(result_txt)
+        epoch_rows = collect_epoch_rows(parsed, epoch)
+
+        if not epoch_rows:
+            print(f"[WARN][{tag}] 没有在结果文件中找到 epoch={epoch} 的记录")
+            return None
+
+        agg = aggregate_rows(epoch_rows)
+        row = {
+            "epoch": epoch,
+            "checkpoint": str(ckpt),
+            "gpu": gpu,
+            **agg,
+        }
+        print(f"[INFO][{tag}] 汇总: {row}")
+        return row
+    finally:
+        shutil.rmtree(temp_root, ignore_errors=True)
+
+
 def main() -> None:
    args = parse_args()

    project_dir = Path(args.project_dir).resolve()
-    config_path = Path(args.config).resolve()
    train_script = Path(args.train_script).resolve()
    models_dir = Path(args.models_dir).resolve()

    if not project_dir.exists():
        raise FileNotFoundError(f"project_dir 不存在: {project_dir}")
-    if not config_path.exists():
-        raise FileNotFoundError(f"config 不存在: {config_path}")
+    if not Path(args.config).exists():
+        raise FileNotFoundError(f"config 不存在: {args.config}")
    if not train_script.exists():
        raise FileNotFoundError(f"train_script 不存在: {train_script}")
    if not models_dir.exists():
@@ -270,44 +319,49 @@ def main() -> None:
    if not ckpts:
        raise RuntimeError("没有找到符合条件的 checkpoint")

-    backup_path = config_path.with_suffix(config_path.suffix + ".bak_auto_eval")
-    shutil.copy2(config_path, backup_path)
-    print(f"[INFO] 已备份配置到: {backup_path}")
+    gpu_list = [x.strip() for x in args.gpu.split(",") if x.strip()]
+    if not gpu_list:
+        raise RuntimeError("没有可用 GPU 参数")
+
+    print(f"[INFO] 使用 GPU 列表: {gpu_list}")
+    print(f"[INFO] 待测试 checkpoint 数量: {len(ckpts)}")

-    result_txt = result_txt_path(project_dir, args.result_name)
    summary_rows: List[Dict] = []

-    try:
+    # 单卡时保持串行行为
+    if len(gpu_list) == 1:
+        gpu = gpu_list[0]
        for epoch, ckpt in ckpts:
            print("=" * 100)
-            print(f"[INFO] 开始测试 checkpoint: epoch={epoch}, path={ckpt}")
+            print(f"[INFO] 开始测试 checkpoint: epoch={epoch}, path={ckpt}, gpu={gpu}")
            print("=" * 100)
+            row = run_single_checkpoint(epoch, ckpt, gpu, args, project_dir, train_script)
+            if row is not None:
+                summary_rows.append(row)
+    else:
+        # 多卡并行：round-robin 分配 checkpoint 到不同 GPU
+        futures = []
+        with ThreadPoolExecutor(max_workers=len(gpu_list)) as ex:
+            for idx, (epoch, ckpt) in enumerate(ckpts):
+                gpu = gpu_list[idx % len(gpu_list)]
+                futures.append(
+                    ex.submit(
+                        run_single_checkpoint,
+                        epoch,
+                        ckpt,
+                        gpu,
+                        args,
+                        project_dir,
+                        train_script,
+                    )
+                )

-            overwrite_test_config(config_path, ckpt)
-
-            ret = run_one_eval(project_dir, train_script, args.result_name, args.gpu)
-            if ret != 0:
-                print(f"[WARN] checkpoint {epoch} 测试失败，返回码 {ret}")
-                continue
-
-            time.sleep(args.sleep_sec)
-
-            parsed = parse_result_file(result_txt)
-            epoch_rows = collect_epoch_rows(parsed, epoch)
-
-            if not epoch_rows:
-                print(f"[WARN] 没有在结果文件中找到 epoch={epoch} 的记录")
-                continue
-
-            agg = aggregate_rows(epoch_rows)
-            row = {
-                "epoch": epoch,
-                "checkpoint": str(ckpt),
-                **agg,
-            }
+            for fut in as_completed(futures):
+                row = fut.result()
+                if row is not None:
                    summary_rows.append(row)

-            print(f"[INFO] epoch={epoch} 汇总: {row}")
+        summary_rows.sort(key=lambda x: x["epoch"])

    summary_csv = project_dir / "result" / f"{args.result_name}_summary.csv"
    save_summary_csv(summary_csv, summary_rows)
@@ -318,10 +372,6 @@ def main() -> None:
        print("\n[INFO] 最佳 checkpoint（按 mean_AP）:")
        print(best_by_ap)

-    finally:
-        shutil.copy2(backup_path, config_path)
-        print(f"[INFO] 已恢复原始配置: {config_path}")
-

 if __name__ == "__main__":
    main()
--- a/config.yaml.bak_auto_eval
+++ b/config.yaml.bak_auto_eval
@@ -0,0 +1,56 @@
+'experiment' :
+
+  # 'path_dataset' : '/mnt/data/cdy/project/dataset/FUSION'
+  # 'path_result': '/mnt/data/cdy/data2/results/FUSIONLCD'
+
+  # 'path_dataset' : 'E:\work\Project\dataset\FUSION'
+  # 'path_result' : 'E:\work\Project\results\FUSIONLCD\bev2'
+  
+  'path_dataset' : '/home/adlab36/chenyouyuan/FUSIONLCD'
+  'path_result': '/home/adlab36/chenyouyuan/FUSIONLCD/result'
+  'train_flag' : 0
+  'validate_flag' : 1
+  'test_flag' : 1
+  'flag' : 'fusion' 
+  'cuda' : 1
+  # TRAINING
+  'epochs' : 200
+  'batchsize' : 6
+  'learning_rate' : 1.e-3
+  'beta1' : 0.9
+  'beta2' : 0.999
+  'eps' : 1.e-8
+  'weight_decay' : 5.e-6
+  'load_model' : 1
+  #FUSION
+  # 'last_model' : '/data4/caodanyang/results/FUSIONLCD/08310/models/checkpoint_079.pth.tar'
+  #BEV
+  # 'last_model' : '/data4/caodanyang/results/FUSIONLCD/bev_09030/models/checkpoint_066.pth.tar'
+  #BEV+EP
+  'last_model' : '/home/adlab36/chenyouyuan/FUSIONLCD/result/log/models/checkpoint_199.pth.tar'
+  #DATASET
+  'train' : 0,5,6,7,9
+  'validate' : 8,50,54,55,56,59
+  'test' : 8,50,54,55,56,59
+  'voxel_num' : 15000
+  'voxel_max_points' : 100
+  'voxel_sample' : 'top'
+#  'bev_range' : -51.2,-51.2,-2.5,51.2,51.2,1.5
+#  'bev_resolution' : 0.16
+#  'bev_range' : -64,-64,-2.5,64,64,1.5
+#  'bev_resolution' : 0.2
+  'bev_range' : -32,-32,-2.5,32,32,1.5
+  'bev_resolution' : 0.2
+
+  # NETWORK PARAMS
+  'kpts_number_bev' : 150
+  'kpts_number_img' : 150
+  'cluster_num_bev' : 16
+  'cluster_num_img' : 16
+  'cluster_num_fusion' : 16
+  'sinkhorn_iter' : 5
+  'vlad_size' : 256
+  # LOSS
+  'loop_file' :  'loop_GT_4m'
+  'trip_margin' : 0.5
+  'negetative_selsector' : 'random'
--- a/evaluate_all_models.py
+++ b/evaluate_all_models.py