diff --git a/auto_eval_checkpoints.py b/auto_eval_checkpoints.py index 11692c0..54cec81 100644 --- a/auto_eval_checkpoints.py +++ b/auto_eval_checkpoints.py @@ -4,25 +4,26 @@ """ 自动评估 FUSIONLCD 多个 checkpoint 的脚本 -用法示例: +支持: +1. 单卡串行测试: + --gpu 0 +2. 多卡并行测试: + --gpu 0,1,2,3 + python auto_eval_checkpoints.py \ --project_dir /home/adlab36/chenyouyuan/FUSIONLCD \ --config /home/adlab36/chenyouyuan/FUSIONLCD/config.yaml \ --train_script /home/adlab36/chenyouyuan/FUSIONLCD/train.py \ --models_dir /home/adlab36/chenyouyuan/FUSIONLCD/result/log/models \ --result_name auto_eval \ - --gpu 1 + --gpu 2,3 \ + --epochs_filter 119, 139 +099, 119, 139, 159, 179, 199 说明: -1. 会备份原 config.yaml 为 config.yaml.bak_auto_eval -2. 每个 checkpoint 测试前会把 config 改成: - - train_flag = 0 - - validate_flag = 0 - - test_flag = 1 - - load_model = 1 - - last_model = 当前 checkpoint -3. 每测完一个 checkpoint,会读取 result/.txt 追加的新结果 -4. 最终输出 summary.csv +- 多卡模式下,每个 checkpoint 会分配到一个 GPU +- 每个子进程使用独立的临时工作目录和独立 config.yaml,避免冲突 +- 会实时输出子进程日志 """ from __future__ import annotations @@ -35,8 +36,10 @@ import shutil import subprocess import sys import time +import tempfile from pathlib import Path from typing import Dict, List, Tuple, Optional +from concurrent.futures import ThreadPoolExecutor, as_completed import yaml @@ -52,7 +55,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--train_script", type=str, required=True, help="train.py 路径") parser.add_argument("--models_dir", type=str, required=True, help="checkpoint 目录") parser.add_argument("--result_name", type=str, default="auto_eval", help="train.py 的 result_name") - parser.add_argument("--gpu", type=str, default="0", help="GPU id,例如 0 或 1") + parser.add_argument("--gpu", type=str, default="0", help="GPU id,例如 0 或 0,1,2,3") parser.add_argument("--epochs_filter", type=str, default="", help="只测试指定 epoch,逗号分隔,如 99,109,119") parser.add_argument("--min_epoch", type=int, default=None, help="最小 epoch 过滤") parser.add_argument("--max_epoch", type=int, default=None, help="最大 epoch 过滤") @@ -101,17 +104,6 @@ def filter_checkpoints( return selected -def result_txt_path(project_dir: Path, result_name: str) -> Path: - return project_dir / "result" / f"{result_name}.txt" - - -def read_result_lines(path: Path) -> List[str]: - if not path.exists(): - return [] - with path.open("r", encoding="utf-8") as f: - return [line.rstrip("\n") for line in f.readlines()] - - def parse_result_file(path: Path) -> List[Dict]: rows: List[Dict] = [] if not path.exists(): @@ -131,8 +123,6 @@ def parse_result_file(path: Path) -> List[Dict]: epoch = int(m.group(2)) rest = m.group(3).split() - # 表头来自你的 log_result: - # AP R100 F1 R@1 R@2 R@3 R@4 R@5 R@6 R@7 R@8 R@9 R@10 R@15 R@20 R@25 if len(rest) < 16: continue @@ -163,8 +153,8 @@ def parse_result_file(path: Path) -> List[Dict]: return rows -def overwrite_test_config(config_path: Path, ckpt_path: Path) -> None: - cfg = load_yaml(config_path) +def make_eval_config(base_config_path: Path, ckpt_path: Path, result_name: str, temp_config_path: Path) -> None: + cfg = load_yaml(base_config_path) exp = cfg["experiment"] exp["train_flag"] = 0 @@ -173,10 +163,17 @@ def overwrite_test_config(config_path: Path, ckpt_path: Path) -> None: exp["load_model"] = 1 exp["last_model"] = str(ckpt_path) - save_yaml(config_path, cfg) + # 保持原 path_result,不改数据库等输出位置 + save_yaml(temp_config_path, cfg) -def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: str) -> int: +def run_one_eval( + work_dir: Path, + train_script: Path, + result_name: str, + gpu: str, + tag: str, +) -> int: env = os.environ.copy() env["CUDA_VISIBLE_DEVICES"] = gpu @@ -188,15 +185,16 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s "--gpu", gpu, "--info", - "auto_eval", + f"auto_eval_{tag}", ] - print(f"[INFO] Running command: {' '.join(cmd)}") - print(f"[INFO] CUDA_VISIBLE_DEVICES={gpu}") + print(f"[INFO][{tag}] Running command: {' '.join(cmd)}") + print(f"[INFO][{tag}] CUDA_VISIBLE_DEVICES={gpu}") + print(f"[INFO][{tag}] cwd={work_dir}") proc = subprocess.Popen( cmd, - cwd=str(project_dir), + cwd=str(work_dir), env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, @@ -208,16 +206,16 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s try: assert proc.stdout is not None for line in proc.stdout: - print(line, end="") + print(f"[{tag}] {line}", end="") proc.wait() return proc.returncode except KeyboardInterrupt: - print("\n[WARN] 收到 Ctrl+C,正在终止当前测试子进程...") + print(f"\n[WARN][{tag}] 收到 Ctrl+C,正在终止当前测试子进程...") proc.terminate() try: proc.wait(timeout=5) except Exception: - print("[WARN] 子进程未及时退出,强制 kill") + print(f"[WARN][{tag}] 子进程未及时退出,强制 kill") proc.kill() proc.wait() raise @@ -247,18 +245,69 @@ def save_summary_csv(path: Path, summary: List[Dict]) -> None: writer.writerows(summary) +def run_single_checkpoint( + epoch: int, + ckpt: Path, + gpu: str, + args: argparse.Namespace, + project_dir: Path, + train_script: Path, +) -> Optional[Dict]: + tag = f"gpu{gpu}_ep{epoch}" + temp_root = Path(tempfile.mkdtemp(prefix=f"auto_eval_{tag}_")) + try: + # train.py 会优先从 cwd/config.yaml 读取配置 + temp_config = temp_root / "config.yaml" + make_eval_config(Path(args.config), ckpt, args.result_name, temp_config) + + # train.py 会把 txt 写到 cwd/result/result_name.txt + (temp_root / "result").mkdir(parents=True, exist_ok=True) + + ret = run_one_eval( + work_dir=temp_root, + train_script=train_script, + result_name=args.result_name, + gpu=gpu, + tag=tag, + ) + if ret != 0: + print(f"[WARN][{tag}] checkpoint {epoch} 测试失败,返回码 {ret}") + return None + + time.sleep(args.sleep_sec) + + result_txt = temp_root / "result" / f"{args.result_name}.txt" + parsed = parse_result_file(result_txt) + epoch_rows = collect_epoch_rows(parsed, epoch) + + if not epoch_rows: + print(f"[WARN][{tag}] 没有在结果文件中找到 epoch={epoch} 的记录") + return None + + agg = aggregate_rows(epoch_rows) + row = { + "epoch": epoch, + "checkpoint": str(ckpt), + "gpu": gpu, + **agg, + } + print(f"[INFO][{tag}] 汇总: {row}") + return row + finally: + shutil.rmtree(temp_root, ignore_errors=True) + + def main() -> None: args = parse_args() project_dir = Path(args.project_dir).resolve() - config_path = Path(args.config).resolve() train_script = Path(args.train_script).resolve() models_dir = Path(args.models_dir).resolve() if not project_dir.exists(): raise FileNotFoundError(f"project_dir 不存在: {project_dir}") - if not config_path.exists(): - raise FileNotFoundError(f"config 不存在: {config_path}") + if not Path(args.config).exists(): + raise FileNotFoundError(f"config 不存在: {args.config}") if not train_script.exists(): raise FileNotFoundError(f"train_script 不存在: {train_script}") if not models_dir.exists(): @@ -270,57 +319,58 @@ def main() -> None: if not ckpts: raise RuntimeError("没有找到符合条件的 checkpoint") - backup_path = config_path.with_suffix(config_path.suffix + ".bak_auto_eval") - shutil.copy2(config_path, backup_path) - print(f"[INFO] 已备份配置到: {backup_path}") + gpu_list = [x.strip() for x in args.gpu.split(",") if x.strip()] + if not gpu_list: + raise RuntimeError("没有可用 GPU 参数") + + print(f"[INFO] 使用 GPU 列表: {gpu_list}") + print(f"[INFO] 待测试 checkpoint 数量: {len(ckpts)}") - result_txt = result_txt_path(project_dir, args.result_name) summary_rows: List[Dict] = [] - try: + # 单卡时保持串行行为 + if len(gpu_list) == 1: + gpu = gpu_list[0] for epoch, ckpt in ckpts: print("=" * 100) - print(f"[INFO] 开始测试 checkpoint: epoch={epoch}, path={ckpt}") + print(f"[INFO] 开始测试 checkpoint: epoch={epoch}, path={ckpt}, gpu={gpu}") print("=" * 100) + row = run_single_checkpoint(epoch, ckpt, gpu, args, project_dir, train_script) + if row is not None: + summary_rows.append(row) + else: + # 多卡并行:round-robin 分配 checkpoint 到不同 GPU + futures = [] + with ThreadPoolExecutor(max_workers=len(gpu_list)) as ex: + for idx, (epoch, ckpt) in enumerate(ckpts): + gpu = gpu_list[idx % len(gpu_list)] + futures.append( + ex.submit( + run_single_checkpoint, + epoch, + ckpt, + gpu, + args, + project_dir, + train_script, + ) + ) - overwrite_test_config(config_path, ckpt) + for fut in as_completed(futures): + row = fut.result() + if row is not None: + summary_rows.append(row) - ret = run_one_eval(project_dir, train_script, args.result_name, args.gpu) - if ret != 0: - print(f"[WARN] checkpoint {epoch} 测试失败,返回码 {ret}") - continue + summary_rows.sort(key=lambda x: x["epoch"]) - time.sleep(args.sleep_sec) + summary_csv = project_dir / "result" / f"{args.result_name}_summary.csv" + save_summary_csv(summary_csv, summary_rows) + print(f"[INFO] 汇总结果已保存到: {summary_csv}") - parsed = parse_result_file(result_txt) - epoch_rows = collect_epoch_rows(parsed, epoch) - - if not epoch_rows: - print(f"[WARN] 没有在结果文件中找到 epoch={epoch} 的记录") - continue - - agg = aggregate_rows(epoch_rows) - row = { - "epoch": epoch, - "checkpoint": str(ckpt), - **agg, - } - summary_rows.append(row) - - print(f"[INFO] epoch={epoch} 汇总: {row}") - - summary_csv = project_dir / "result" / f"{args.result_name}_summary.csv" - save_summary_csv(summary_csv, summary_rows) - print(f"[INFO] 汇总结果已保存到: {summary_csv}") - - if summary_rows: - best_by_ap = max(summary_rows, key=lambda x: x.get("mean_AP", float("-inf"))) - print("\n[INFO] 最佳 checkpoint(按 mean_AP):") - print(best_by_ap) - - finally: - shutil.copy2(backup_path, config_path) - print(f"[INFO] 已恢复原始配置: {config_path}") + if summary_rows: + best_by_ap = max(summary_rows, key=lambda x: x.get("mean_AP", float("-inf"))) + print("\n[INFO] 最佳 checkpoint(按 mean_AP):") + print(best_by_ap) if __name__ == "__main__": diff --git a/config.yaml.bak_auto_eval b/config.yaml.bak_auto_eval new file mode 100644 index 0000000..db0031e --- /dev/null +++ b/config.yaml.bak_auto_eval @@ -0,0 +1,56 @@ +'experiment' : + + # 'path_dataset' : '/mnt/data/cdy/project/dataset/FUSION' + # 'path_result': '/mnt/data/cdy/data2/results/FUSIONLCD' + + # 'path_dataset' : 'E:\work\Project\dataset\FUSION' + # 'path_result' : 'E:\work\Project\results\FUSIONLCD\bev2' + + 'path_dataset' : '/home/adlab36/chenyouyuan/FUSIONLCD' + 'path_result': '/home/adlab36/chenyouyuan/FUSIONLCD/result' + 'train_flag' : 0 + 'validate_flag' : 1 + 'test_flag' : 1 + 'flag' : 'fusion' + 'cuda' : 1 + # TRAINING + 'epochs' : 200 + 'batchsize' : 6 + 'learning_rate' : 1.e-3 + 'beta1' : 0.9 + 'beta2' : 0.999 + 'eps' : 1.e-8 + 'weight_decay' : 5.e-6 + 'load_model' : 1 + #FUSION + # 'last_model' : '/data4/caodanyang/results/FUSIONLCD/08310/models/checkpoint_079.pth.tar' + #BEV + # 'last_model' : '/data4/caodanyang/results/FUSIONLCD/bev_09030/models/checkpoint_066.pth.tar' + #BEV+EP + 'last_model' : '/home/adlab36/chenyouyuan/FUSIONLCD/result/log/models/checkpoint_199.pth.tar' + #DATASET + 'train' : 0,5,6,7,9 + 'validate' : 8,50,54,55,56,59 + 'test' : 8,50,54,55,56,59 + 'voxel_num' : 15000 + 'voxel_max_points' : 100 + 'voxel_sample' : 'top' +# 'bev_range' : -51.2,-51.2,-2.5,51.2,51.2,1.5 +# 'bev_resolution' : 0.16 +# 'bev_range' : -64,-64,-2.5,64,64,1.5 +# 'bev_resolution' : 0.2 + 'bev_range' : -32,-32,-2.5,32,32,1.5 + 'bev_resolution' : 0.2 + + # NETWORK PARAMS + 'kpts_number_bev' : 150 + 'kpts_number_img' : 150 + 'cluster_num_bev' : 16 + 'cluster_num_img' : 16 + 'cluster_num_fusion' : 16 + 'sinkhorn_iter' : 5 + 'vlad_size' : 256 + # LOSS + 'loop_file' : 'loop_GT_4m' + 'trip_margin' : 0.5 + 'negetative_selsector' : 'random' \ No newline at end of file diff --git a/evaluate_all_models.py b/evaluate_all_models.py new file mode 100644 index 0000000..e69de29