多卡ap测试

This commit is contained in:
MobKBK
2026-04-11 14:12:20 +08:00
parent e66077997d
commit 13e436a146
3 changed files with 186 additions and 80 deletions

View File

@@ -4,25 +4,26 @@
""" """
自动评估 FUSIONLCD 多个 checkpoint 的脚本 自动评估 FUSIONLCD 多个 checkpoint 的脚本
用法示例 支持
1. 单卡串行测试:
--gpu 0
2. 多卡并行测试:
--gpu 0,1,2,3
python auto_eval_checkpoints.py \ python auto_eval_checkpoints.py \
--project_dir /home/adlab36/chenyouyuan/FUSIONLCD \ --project_dir /home/adlab36/chenyouyuan/FUSIONLCD \
--config /home/adlab36/chenyouyuan/FUSIONLCD/config.yaml \ --config /home/adlab36/chenyouyuan/FUSIONLCD/config.yaml \
--train_script /home/adlab36/chenyouyuan/FUSIONLCD/train.py \ --train_script /home/adlab36/chenyouyuan/FUSIONLCD/train.py \
--models_dir /home/adlab36/chenyouyuan/FUSIONLCD/result/log/models \ --models_dir /home/adlab36/chenyouyuan/FUSIONLCD/result/log/models \
--result_name auto_eval \ --result_name auto_eval \
--gpu 1 --gpu 2,3 \
--epochs_filter 119, 139
099, 119, 139, 159, 179, 199
说明: 说明:
1. 会备份原 config.yaml 为 config.yaml.bak_auto_eval - 多卡模式下,每个 checkpoint 会分配到一个 GPU
2. 每个 checkpoint 测试前会把 config 改成: - 每个子进程使用独立的临时工作目录和独立 config.yaml避免冲突
- train_flag = 0 - 会实时输出子进程日志
- validate_flag = 0
- test_flag = 1
- load_model = 1
- last_model = 当前 checkpoint
3. 每测完一个 checkpoint会读取 result/<result_name>.txt 追加的新结果
4. 最终输出 summary.csv
""" """
from __future__ import annotations from __future__ import annotations
@@ -35,8 +36,10 @@ import shutil
import subprocess import subprocess
import sys import sys
import time import time
import tempfile
from pathlib import Path from pathlib import Path
from typing import Dict, List, Tuple, Optional from typing import Dict, List, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import yaml import yaml
@@ -52,7 +55,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--train_script", type=str, required=True, help="train.py 路径") parser.add_argument("--train_script", type=str, required=True, help="train.py 路径")
parser.add_argument("--models_dir", type=str, required=True, help="checkpoint 目录") parser.add_argument("--models_dir", type=str, required=True, help="checkpoint 目录")
parser.add_argument("--result_name", type=str, default="auto_eval", help="train.py 的 result_name") parser.add_argument("--result_name", type=str, default="auto_eval", help="train.py 的 result_name")
parser.add_argument("--gpu", type=str, default="0", help="GPU id例如 0 或 1") parser.add_argument("--gpu", type=str, default="0", help="GPU id例如 0 或 0,1,2,3")
parser.add_argument("--epochs_filter", type=str, default="", help="只测试指定 epoch逗号分隔如 99,109,119") parser.add_argument("--epochs_filter", type=str, default="", help="只测试指定 epoch逗号分隔如 99,109,119")
parser.add_argument("--min_epoch", type=int, default=None, help="最小 epoch 过滤") parser.add_argument("--min_epoch", type=int, default=None, help="最小 epoch 过滤")
parser.add_argument("--max_epoch", type=int, default=None, help="最大 epoch 过滤") parser.add_argument("--max_epoch", type=int, default=None, help="最大 epoch 过滤")
@@ -101,17 +104,6 @@ def filter_checkpoints(
return selected return selected
def result_txt_path(project_dir: Path, result_name: str) -> Path:
return project_dir / "result" / f"{result_name}.txt"
def read_result_lines(path: Path) -> List[str]:
if not path.exists():
return []
with path.open("r", encoding="utf-8") as f:
return [line.rstrip("\n") for line in f.readlines()]
def parse_result_file(path: Path) -> List[Dict]: def parse_result_file(path: Path) -> List[Dict]:
rows: List[Dict] = [] rows: List[Dict] = []
if not path.exists(): if not path.exists():
@@ -131,8 +123,6 @@ def parse_result_file(path: Path) -> List[Dict]:
epoch = int(m.group(2)) epoch = int(m.group(2))
rest = m.group(3).split() rest = m.group(3).split()
# 表头来自你的 log_result:
# AP R100 F1 R@1 R@2 R@3 R@4 R@5 R@6 R@7 R@8 R@9 R@10 R@15 R@20 R@25
if len(rest) < 16: if len(rest) < 16:
continue continue
@@ -163,8 +153,8 @@ def parse_result_file(path: Path) -> List[Dict]:
return rows return rows
def overwrite_test_config(config_path: Path, ckpt_path: Path) -> None: def make_eval_config(base_config_path: Path, ckpt_path: Path, result_name: str, temp_config_path: Path) -> None:
cfg = load_yaml(config_path) cfg = load_yaml(base_config_path)
exp = cfg["experiment"] exp = cfg["experiment"]
exp["train_flag"] = 0 exp["train_flag"] = 0
@@ -173,10 +163,17 @@ def overwrite_test_config(config_path: Path, ckpt_path: Path) -> None:
exp["load_model"] = 1 exp["load_model"] = 1
exp["last_model"] = str(ckpt_path) exp["last_model"] = str(ckpt_path)
save_yaml(config_path, cfg) # 保持原 path_result不改数据库等输出位置
save_yaml(temp_config_path, cfg)
def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: str) -> int: def run_one_eval(
work_dir: Path,
train_script: Path,
result_name: str,
gpu: str,
tag: str,
) -> int:
env = os.environ.copy() env = os.environ.copy()
env["CUDA_VISIBLE_DEVICES"] = gpu env["CUDA_VISIBLE_DEVICES"] = gpu
@@ -188,15 +185,16 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s
"--gpu", "--gpu",
gpu, gpu,
"--info", "--info",
"auto_eval", f"auto_eval_{tag}",
] ]
print(f"[INFO] Running command: {' '.join(cmd)}") print(f"[INFO][{tag}] Running command: {' '.join(cmd)}")
print(f"[INFO] CUDA_VISIBLE_DEVICES={gpu}") print(f"[INFO][{tag}] CUDA_VISIBLE_DEVICES={gpu}")
print(f"[INFO][{tag}] cwd={work_dir}")
proc = subprocess.Popen( proc = subprocess.Popen(
cmd, cmd,
cwd=str(project_dir), cwd=str(work_dir),
env=env, env=env,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, stderr=subprocess.STDOUT,
@@ -208,16 +206,16 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s
try: try:
assert proc.stdout is not None assert proc.stdout is not None
for line in proc.stdout: for line in proc.stdout:
print(line, end="") print(f"[{tag}] {line}", end="")
proc.wait() proc.wait()
return proc.returncode return proc.returncode
except KeyboardInterrupt: except KeyboardInterrupt:
print("\n[WARN] 收到 Ctrl+C正在终止当前测试子进程...") print(f"\n[WARN][{tag}] 收到 Ctrl+C正在终止当前测试子进程...")
proc.terminate() proc.terminate()
try: try:
proc.wait(timeout=5) proc.wait(timeout=5)
except Exception: except Exception:
print("[WARN] 子进程未及时退出,强制 kill") print(f"[WARN][{tag}] 子进程未及时退出,强制 kill")
proc.kill() proc.kill()
proc.wait() proc.wait()
raise raise
@@ -247,18 +245,69 @@ def save_summary_csv(path: Path, summary: List[Dict]) -> None:
writer.writerows(summary) writer.writerows(summary)
def run_single_checkpoint(
    epoch: int,
    ckpt: Path,
    gpu: str,
    args: argparse.Namespace,
    project_dir: Path,
    train_script: Path,
) -> Optional[Dict]:
    """Evaluate one checkpoint on one GPU inside a throwaway working directory.

    A fresh temporary directory is created for the run. An evaluation
    ``config.yaml`` is written into it with ``make_eval_config``, a
    ``result/`` sub-directory is created, and ``run_one_eval`` is invoked
    with the temporary directory as its working directory. Afterwards
    ``result/<result_name>.txt`` inside that directory is parsed and the rows
    belonging to ``epoch`` are aggregated. The temporary directory is removed
    on every exit path.

    Args:
        epoch: Epoch number of the checkpoint; used for the log tag and to
            select rows from the parsed result file.
        ckpt: Path of the checkpoint file to evaluate.
        gpu: GPU id string forwarded to ``run_one_eval``.
        args: Parsed CLI namespace; ``config``, ``result_name`` and
            ``sleep_sec`` are read here.
        project_dir: Not used by this function; kept so the call signature
            stays the same for callers.
        train_script: Path of the training/evaluation script to launch.

    Returns:
        A dict with ``epoch``, ``checkpoint``, ``gpu`` and the fields produced
        by ``aggregate_rows``, or ``None`` when the subprocess exits with a
        non-zero code or no row for ``epoch`` is found in the result file.
    """
    tag = f"gpu{gpu}_ep{epoch}"
    scratch_dir = Path(tempfile.mkdtemp(prefix=f"auto_eval_{tag}_"))
    try:
        # Per the original author's note, train.py reads its configuration
        # from <cwd>/config.yaml first -- NOTE(review): confirm in train.py.
        make_eval_config(
            Path(args.config),
            ckpt,
            args.result_name,
            scratch_dir / "config.yaml",
        )

        # Per the original author's note, train.py writes its text results to
        # <cwd>/result/<result_name>.txt -- NOTE(review): confirm in train.py.
        result_dir = scratch_dir / "result"
        result_dir.mkdir(parents=True, exist_ok=True)

        ret = run_one_eval(
            work_dir=scratch_dir,
            train_script=train_script,
            result_name=args.result_name,
            gpu=gpu,
            tag=tag,
        )
        if ret != 0:
            print(f"[WARN][{tag}] checkpoint {epoch} 测试失败,返回码 {ret}")
            return None

        # Pause for the configured interval before reading the result file.
        time.sleep(args.sleep_sec)

        epoch_rows = collect_epoch_rows(
            parse_result_file(result_dir / f"{args.result_name}.txt"),
            epoch,
        )
        if not epoch_rows:
            print(f"[WARN][{tag}] 没有在结果文件中找到 epoch={epoch} 的记录")
            return None

        row = {"epoch": epoch, "checkpoint": str(ckpt), "gpu": gpu, **aggregate_rows(epoch_rows)}
        print(f"[INFO][{tag}] 汇总: {row}")
        return row
    finally:
        # Always discard the scratch directory, even on failure or interrupt.
        shutil.rmtree(scratch_dir, ignore_errors=True)
def main() -> None: def main() -> None:
args = parse_args() args = parse_args()
project_dir = Path(args.project_dir).resolve() project_dir = Path(args.project_dir).resolve()
config_path = Path(args.config).resolve()
train_script = Path(args.train_script).resolve() train_script = Path(args.train_script).resolve()
models_dir = Path(args.models_dir).resolve() models_dir = Path(args.models_dir).resolve()
if not project_dir.exists(): if not project_dir.exists():
raise FileNotFoundError(f"project_dir 不存在: {project_dir}") raise FileNotFoundError(f"project_dir 不存在: {project_dir}")
if not config_path.exists(): if not Path(args.config).exists():
raise FileNotFoundError(f"config 不存在: {config_path}") raise FileNotFoundError(f"config 不存在: {args.config}")
if not train_script.exists(): if not train_script.exists():
raise FileNotFoundError(f"train_script 不存在: {train_script}") raise FileNotFoundError(f"train_script 不存在: {train_script}")
if not models_dir.exists(): if not models_dir.exists():
@@ -270,57 +319,58 @@ def main() -> None:
if not ckpts: if not ckpts:
raise RuntimeError("没有找到符合条件的 checkpoint") raise RuntimeError("没有找到符合条件的 checkpoint")
backup_path = config_path.with_suffix(config_path.suffix + ".bak_auto_eval") gpu_list = [x.strip() for x in args.gpu.split(",") if x.strip()]
shutil.copy2(config_path, backup_path) if not gpu_list:
print(f"[INFO] 已备份配置到: {backup_path}") raise RuntimeError("没有可用 GPU 参数")
print(f"[INFO] 使用 GPU 列表: {gpu_list}")
print(f"[INFO] 待测试 checkpoint 数量: {len(ckpts)}")
result_txt = result_txt_path(project_dir, args.result_name)
summary_rows: List[Dict] = [] summary_rows: List[Dict] = []
try: # 单卡时保持串行行为
if len(gpu_list) == 1:
gpu = gpu_list[0]
for epoch, ckpt in ckpts: for epoch, ckpt in ckpts:
print("=" * 100) print("=" * 100)
print(f"[INFO] 开始测试 checkpoint: epoch={epoch}, path={ckpt}") print(f"[INFO] 开始测试 checkpoint: epoch={epoch}, path={ckpt}, gpu={gpu}")
print("=" * 100) print("=" * 100)
row = run_single_checkpoint(epoch, ckpt, gpu, args, project_dir, train_script)
if row is not None:
summary_rows.append(row)
else:
# 多卡并行round-robin 分配 checkpoint 到不同 GPU
futures = []
with ThreadPoolExecutor(max_workers=len(gpu_list)) as ex:
for idx, (epoch, ckpt) in enumerate(ckpts):
gpu = gpu_list[idx % len(gpu_list)]
futures.append(
ex.submit(
run_single_checkpoint,
epoch,
ckpt,
gpu,
args,
project_dir,
train_script,
)
)
overwrite_test_config(config_path, ckpt) for fut in as_completed(futures):
row = fut.result()
if row is not None:
summary_rows.append(row)
ret = run_one_eval(project_dir, train_script, args.result_name, args.gpu) summary_rows.sort(key=lambda x: x["epoch"])
if ret != 0:
print(f"[WARN] checkpoint {epoch} 测试失败,返回码 {ret}")
continue
time.sleep(args.sleep_sec) summary_csv = project_dir / "result" / f"{args.result_name}_summary.csv"
save_summary_csv(summary_csv, summary_rows)
print(f"[INFO] 汇总结果已保存到: {summary_csv}")
parsed = parse_result_file(result_txt) if summary_rows:
epoch_rows = collect_epoch_rows(parsed, epoch) best_by_ap = max(summary_rows, key=lambda x: x.get("mean_AP", float("-inf")))
print("\n[INFO] 最佳 checkpoint按 mean_AP:")
if not epoch_rows: print(best_by_ap)
print(f"[WARN] 没有在结果文件中找到 epoch={epoch} 的记录")
continue
agg = aggregate_rows(epoch_rows)
row = {
"epoch": epoch,
"checkpoint": str(ckpt),
**agg,
}
summary_rows.append(row)
print(f"[INFO] epoch={epoch} 汇总: {row}")
summary_csv = project_dir / "result" / f"{args.result_name}_summary.csv"
save_summary_csv(summary_csv, summary_rows)
print(f"[INFO] 汇总结果已保存到: {summary_csv}")
if summary_rows:
best_by_ap = max(summary_rows, key=lambda x: x.get("mean_AP", float("-inf")))
print("\n[INFO] 最佳 checkpoint按 mean_AP:")
print(best_by_ap)
finally:
shutil.copy2(backup_path, config_path)
print(f"[INFO] 已恢复原始配置: {config_path}")
if __name__ == "__main__": if __name__ == "__main__":

56
config.yaml.bak_auto_eval Normal file
View File

@@ -0,0 +1,56 @@
'experiment' :
# 'path_dataset' : '/mnt/data/cdy/project/dataset/FUSION'
# 'path_result': '/mnt/data/cdy/data2/results/FUSIONLCD'
# 'path_dataset' : 'E:\work\Project\dataset\FUSION'
# 'path_result' : 'E:\work\Project\results\FUSIONLCD\bev2'
'path_dataset' : '/home/adlab36/chenyouyuan/FUSIONLCD'
'path_result': '/home/adlab36/chenyouyuan/FUSIONLCD/result'
'train_flag' : 0
'validate_flag' : 1
'test_flag' : 1
'flag' : 'fusion'
'cuda' : 1
# TRAINING
'epochs' : 200
'batchsize' : 6
'learning_rate' : 1.e-3
'beta1' : 0.9
'beta2' : 0.999
'eps' : 1.e-8
'weight_decay' : 5.e-6
'load_model' : 1
#FUSION
# 'last_model' : '/data4/caodanyang/results/FUSIONLCD/08310/models/checkpoint_079.pth.tar'
#BEV
# 'last_model' : '/data4/caodanyang/results/FUSIONLCD/bev_09030/models/checkpoint_066.pth.tar'
#BEV+EP
'last_model' : '/home/adlab36/chenyouyuan/FUSIONLCD/result/log/models/checkpoint_199.pth.tar'
#DATASET
'train' : 0,5,6,7,9
'validate' : 8,50,54,55,56,59
'test' : 8,50,54,55,56,59
'voxel_num' : 15000
'voxel_max_points' : 100
'voxel_sample' : 'top'
# 'bev_range' : -51.2,-51.2,-2.5,51.2,51.2,1.5
# 'bev_resolution' : 0.16
# 'bev_range' : -64,-64,-2.5,64,64,1.5
# 'bev_resolution' : 0.2
'bev_range' : -32,-32,-2.5,32,32,1.5
'bev_resolution' : 0.2
# NETWORK PARAMS
'kpts_number_bev' : 150
'kpts_number_img' : 150
'cluster_num_bev' : 16
'cluster_num_img' : 16
'cluster_num_fusion' : 16
'sinkhorn_iter' : 5
'vlad_size' : 256
# LOSS
'loop_file' : 'loop_GT_4m'
'trip_margin' : 0.5
'negetative_selsector' : 'random'

0
evaluate_all_models.py Normal file
View File