多卡ap测试
This commit is contained in:
@@ -4,25 +4,26 @@
|
|||||||
"""
|
"""
|
||||||
自动评估 FUSIONLCD 多个 checkpoint 的脚本
|
自动评估 FUSIONLCD 多个 checkpoint 的脚本
|
||||||
|
|
||||||
用法示例:
|
支持:
|
||||||
|
1. 单卡串行测试:
|
||||||
|
--gpu 0
|
||||||
|
2. 多卡并行测试:
|
||||||
|
--gpu 0,1,2,3
|
||||||
|
|
||||||
python auto_eval_checkpoints.py \
|
python auto_eval_checkpoints.py \
|
||||||
--project_dir /home/adlab36/chenyouyuan/FUSIONLCD \
|
--project_dir /home/adlab36/chenyouyuan/FUSIONLCD \
|
||||||
--config /home/adlab36/chenyouyuan/FUSIONLCD/config.yaml \
|
--config /home/adlab36/chenyouyuan/FUSIONLCD/config.yaml \
|
||||||
--train_script /home/adlab36/chenyouyuan/FUSIONLCD/train.py \
|
--train_script /home/adlab36/chenyouyuan/FUSIONLCD/train.py \
|
||||||
--models_dir /home/adlab36/chenyouyuan/FUSIONLCD/result/log/models \
|
--models_dir /home/adlab36/chenyouyuan/FUSIONLCD/result/log/models \
|
||||||
--result_name auto_eval \
|
--result_name auto_eval \
|
||||||
--gpu 1
|
--gpu 2,3 \
|
||||||
|
--epochs_filter 119,139
|
||||||
|
|
||||||
|
099, 119, 139, 159, 179, 199
|
||||||
说明:
|
说明:
|
||||||
1. 会备份原 config.yaml 为 config.yaml.bak_auto_eval
|
- 多卡模式下,每个 checkpoint 会分配到一个 GPU
|
||||||
2. 每个 checkpoint 测试前会把 config 改成:
|
- 每个子进程使用独立的临时工作目录和独立 config.yaml,避免冲突
|
||||||
- train_flag = 0
|
- 会实时输出子进程日志
|
||||||
- validate_flag = 0
|
|
||||||
- test_flag = 1
|
|
||||||
- load_model = 1
|
|
||||||
- last_model = 当前 checkpoint
|
|
||||||
3. 每测完一个 checkpoint,会读取 result/<result_name>.txt 追加的新结果
|
|
||||||
4. 最终输出 summary.csv
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -35,8 +36,10 @@ import shutil
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Tuple, Optional
|
from typing import Dict, List, Tuple, Optional
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
@@ -52,7 +55,7 @@ def parse_args() -> argparse.Namespace:
|
|||||||
parser.add_argument("--train_script", type=str, required=True, help="train.py 路径")
|
parser.add_argument("--train_script", type=str, required=True, help="train.py 路径")
|
||||||
parser.add_argument("--models_dir", type=str, required=True, help="checkpoint 目录")
|
parser.add_argument("--models_dir", type=str, required=True, help="checkpoint 目录")
|
||||||
parser.add_argument("--result_name", type=str, default="auto_eval", help="train.py 的 result_name")
|
parser.add_argument("--result_name", type=str, default="auto_eval", help="train.py 的 result_name")
|
||||||
parser.add_argument("--gpu", type=str, default="0", help="GPU id,例如 0 或 1")
|
parser.add_argument("--gpu", type=str, default="0", help="GPU id,例如 0 或 0,1,2,3")
|
||||||
parser.add_argument("--epochs_filter", type=str, default="", help="只测试指定 epoch,逗号分隔,如 99,109,119")
|
parser.add_argument("--epochs_filter", type=str, default="", help="只测试指定 epoch,逗号分隔,如 99,109,119")
|
||||||
parser.add_argument("--min_epoch", type=int, default=None, help="最小 epoch 过滤")
|
parser.add_argument("--min_epoch", type=int, default=None, help="最小 epoch 过滤")
|
||||||
parser.add_argument("--max_epoch", type=int, default=None, help="最大 epoch 过滤")
|
parser.add_argument("--max_epoch", type=int, default=None, help="最大 epoch 过滤")
|
||||||
@@ -101,17 +104,6 @@ def filter_checkpoints(
|
|||||||
return selected
|
return selected
|
||||||
|
|
||||||
|
|
||||||
def result_txt_path(project_dir: Path, result_name: str) -> Path:
|
|
||||||
return project_dir / "result" / f"{result_name}.txt"
|
|
||||||
|
|
||||||
|
|
||||||
def read_result_lines(path: Path) -> List[str]:
|
|
||||||
if not path.exists():
|
|
||||||
return []
|
|
||||||
with path.open("r", encoding="utf-8") as f:
|
|
||||||
return [line.rstrip("\n") for line in f.readlines()]
|
|
||||||
|
|
||||||
|
|
||||||
def parse_result_file(path: Path) -> List[Dict]:
|
def parse_result_file(path: Path) -> List[Dict]:
|
||||||
rows: List[Dict] = []
|
rows: List[Dict] = []
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
@@ -131,8 +123,6 @@ def parse_result_file(path: Path) -> List[Dict]:
|
|||||||
epoch = int(m.group(2))
|
epoch = int(m.group(2))
|
||||||
rest = m.group(3).split()
|
rest = m.group(3).split()
|
||||||
|
|
||||||
# 表头来自你的 log_result:
|
|
||||||
# AP R100 F1 R@1 R@2 R@3 R@4 R@5 R@6 R@7 R@8 R@9 R@10 R@15 R@20 R@25
|
|
||||||
if len(rest) < 16:
|
if len(rest) < 16:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -163,8 +153,8 @@ def parse_result_file(path: Path) -> List[Dict]:
|
|||||||
return rows
|
return rows
|
||||||
|
|
||||||
|
|
||||||
def overwrite_test_config(config_path: Path, ckpt_path: Path) -> None:
|
def make_eval_config(base_config_path: Path, ckpt_path: Path, result_name: str, temp_config_path: Path) -> None:
|
||||||
cfg = load_yaml(config_path)
|
cfg = load_yaml(base_config_path)
|
||||||
exp = cfg["experiment"]
|
exp = cfg["experiment"]
|
||||||
|
|
||||||
exp["train_flag"] = 0
|
exp["train_flag"] = 0
|
||||||
@@ -173,10 +163,17 @@ def overwrite_test_config(config_path: Path, ckpt_path: Path) -> None:
|
|||||||
exp["load_model"] = 1
|
exp["load_model"] = 1
|
||||||
exp["last_model"] = str(ckpt_path)
|
exp["last_model"] = str(ckpt_path)
|
||||||
|
|
||||||
save_yaml(config_path, cfg)
|
# 保持原 path_result,不改数据库等输出位置
|
||||||
|
save_yaml(temp_config_path, cfg)
|
||||||
|
|
||||||
|
|
||||||
def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: str) -> int:
|
def run_one_eval(
|
||||||
|
work_dir: Path,
|
||||||
|
train_script: Path,
|
||||||
|
result_name: str,
|
||||||
|
gpu: str,
|
||||||
|
tag: str,
|
||||||
|
) -> int:
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
env["CUDA_VISIBLE_DEVICES"] = gpu
|
env["CUDA_VISIBLE_DEVICES"] = gpu
|
||||||
|
|
||||||
@@ -188,15 +185,16 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s
|
|||||||
"--gpu",
|
"--gpu",
|
||||||
gpu,
|
gpu,
|
||||||
"--info",
|
"--info",
|
||||||
"auto_eval",
|
f"auto_eval_{tag}",
|
||||||
]
|
]
|
||||||
|
|
||||||
print(f"[INFO] Running command: {' '.join(cmd)}")
|
print(f"[INFO][{tag}] Running command: {' '.join(cmd)}")
|
||||||
print(f"[INFO] CUDA_VISIBLE_DEVICES={gpu}")
|
print(f"[INFO][{tag}] CUDA_VISIBLE_DEVICES={gpu}")
|
||||||
|
print(f"[INFO][{tag}] cwd={work_dir}")
|
||||||
|
|
||||||
proc = subprocess.Popen(
|
proc = subprocess.Popen(
|
||||||
cmd,
|
cmd,
|
||||||
cwd=str(project_dir),
|
cwd=str(work_dir),
|
||||||
env=env,
|
env=env,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.STDOUT,
|
stderr=subprocess.STDOUT,
|
||||||
@@ -208,16 +206,16 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s
|
|||||||
try:
|
try:
|
||||||
assert proc.stdout is not None
|
assert proc.stdout is not None
|
||||||
for line in proc.stdout:
|
for line in proc.stdout:
|
||||||
print(line, end="")
|
print(f"[{tag}] {line}", end="")
|
||||||
proc.wait()
|
proc.wait()
|
||||||
return proc.returncode
|
return proc.returncode
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print("\n[WARN] 收到 Ctrl+C,正在终止当前测试子进程...")
|
print(f"\n[WARN][{tag}] 收到 Ctrl+C,正在终止当前测试子进程...")
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
try:
|
try:
|
||||||
proc.wait(timeout=5)
|
proc.wait(timeout=5)
|
||||||
except Exception:
|
except Exception:
|
||||||
print("[WARN] 子进程未及时退出,强制 kill")
|
print(f"[WARN][{tag}] 子进程未及时退出,强制 kill")
|
||||||
proc.kill()
|
proc.kill()
|
||||||
proc.wait()
|
proc.wait()
|
||||||
raise
|
raise
|
||||||
@@ -247,18 +245,69 @@ def save_summary_csv(path: Path, summary: List[Dict]) -> None:
|
|||||||
writer.writerows(summary)
|
writer.writerows(summary)
|
||||||
|
|
||||||
|
|
||||||
|
def run_single_checkpoint(
|
||||||
|
epoch: int,
|
||||||
|
ckpt: Path,
|
||||||
|
gpu: str,
|
||||||
|
args: argparse.Namespace,
|
||||||
|
project_dir: Path,
|
||||||
|
train_script: Path,
|
||||||
|
) -> Optional[Dict]:
|
||||||
|
tag = f"gpu{gpu}_ep{epoch}"
|
||||||
|
temp_root = Path(tempfile.mkdtemp(prefix=f"auto_eval_{tag}_"))
|
||||||
|
try:
|
||||||
|
# train.py 会优先从 cwd/config.yaml 读取配置
|
||||||
|
temp_config = temp_root / "config.yaml"
|
||||||
|
make_eval_config(Path(args.config), ckpt, args.result_name, temp_config)
|
||||||
|
|
||||||
|
# train.py 会把 txt 写到 cwd/result/result_name.txt
|
||||||
|
(temp_root / "result").mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
ret = run_one_eval(
|
||||||
|
work_dir=temp_root,
|
||||||
|
train_script=train_script,
|
||||||
|
result_name=args.result_name,
|
||||||
|
gpu=gpu,
|
||||||
|
tag=tag,
|
||||||
|
)
|
||||||
|
if ret != 0:
|
||||||
|
print(f"[WARN][{tag}] checkpoint {epoch} 测试失败,返回码 {ret}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
time.sleep(args.sleep_sec)
|
||||||
|
|
||||||
|
result_txt = temp_root / "result" / f"{args.result_name}.txt"
|
||||||
|
parsed = parse_result_file(result_txt)
|
||||||
|
epoch_rows = collect_epoch_rows(parsed, epoch)
|
||||||
|
|
||||||
|
if not epoch_rows:
|
||||||
|
print(f"[WARN][{tag}] 没有在结果文件中找到 epoch={epoch} 的记录")
|
||||||
|
return None
|
||||||
|
|
||||||
|
agg = aggregate_rows(epoch_rows)
|
||||||
|
row = {
|
||||||
|
"epoch": epoch,
|
||||||
|
"checkpoint": str(ckpt),
|
||||||
|
"gpu": gpu,
|
||||||
|
**agg,
|
||||||
|
}
|
||||||
|
print(f"[INFO][{tag}] 汇总: {row}")
|
||||||
|
return row
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_root, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
project_dir = Path(args.project_dir).resolve()
|
project_dir = Path(args.project_dir).resolve()
|
||||||
config_path = Path(args.config).resolve()
|
|
||||||
train_script = Path(args.train_script).resolve()
|
train_script = Path(args.train_script).resolve()
|
||||||
models_dir = Path(args.models_dir).resolve()
|
models_dir = Path(args.models_dir).resolve()
|
||||||
|
|
||||||
if not project_dir.exists():
|
if not project_dir.exists():
|
||||||
raise FileNotFoundError(f"project_dir 不存在: {project_dir}")
|
raise FileNotFoundError(f"project_dir 不存在: {project_dir}")
|
||||||
if not config_path.exists():
|
if not Path(args.config).exists():
|
||||||
raise FileNotFoundError(f"config 不存在: {config_path}")
|
raise FileNotFoundError(f"config 不存在: {args.config}")
|
||||||
if not train_script.exists():
|
if not train_script.exists():
|
||||||
raise FileNotFoundError(f"train_script 不存在: {train_script}")
|
raise FileNotFoundError(f"train_script 不存在: {train_script}")
|
||||||
if not models_dir.exists():
|
if not models_dir.exists():
|
||||||
@@ -270,57 +319,58 @@ def main() -> None:
|
|||||||
if not ckpts:
|
if not ckpts:
|
||||||
raise RuntimeError("没有找到符合条件的 checkpoint")
|
raise RuntimeError("没有找到符合条件的 checkpoint")
|
||||||
|
|
||||||
backup_path = config_path.with_suffix(config_path.suffix + ".bak_auto_eval")
|
gpu_list = [x.strip() for x in args.gpu.split(",") if x.strip()]
|
||||||
shutil.copy2(config_path, backup_path)
|
if not gpu_list:
|
||||||
print(f"[INFO] 已备份配置到: {backup_path}")
|
raise RuntimeError("没有可用 GPU 参数")
|
||||||
|
|
||||||
|
print(f"[INFO] 使用 GPU 列表: {gpu_list}")
|
||||||
|
print(f"[INFO] 待测试 checkpoint 数量: {len(ckpts)}")
|
||||||
|
|
||||||
result_txt = result_txt_path(project_dir, args.result_name)
|
|
||||||
summary_rows: List[Dict] = []
|
summary_rows: List[Dict] = []
|
||||||
|
|
||||||
try:
|
# 单卡时保持串行行为
|
||||||
|
if len(gpu_list) == 1:
|
||||||
|
gpu = gpu_list[0]
|
||||||
for epoch, ckpt in ckpts:
|
for epoch, ckpt in ckpts:
|
||||||
print("=" * 100)
|
print("=" * 100)
|
||||||
print(f"[INFO] 开始测试 checkpoint: epoch={epoch}, path={ckpt}")
|
print(f"[INFO] 开始测试 checkpoint: epoch={epoch}, path={ckpt}, gpu={gpu}")
|
||||||
print("=" * 100)
|
print("=" * 100)
|
||||||
|
row = run_single_checkpoint(epoch, ckpt, gpu, args, project_dir, train_script)
|
||||||
|
if row is not None:
|
||||||
|
summary_rows.append(row)
|
||||||
|
else:
|
||||||
|
# 多卡并行:round-robin 分配 checkpoint 到不同 GPU
|
||||||
|
futures = []
|
||||||
|
with ThreadPoolExecutor(max_workers=len(gpu_list)) as ex:
|
||||||
|
for idx, (epoch, ckpt) in enumerate(ckpts):
|
||||||
|
gpu = gpu_list[idx % len(gpu_list)]
|
||||||
|
futures.append(
|
||||||
|
ex.submit(
|
||||||
|
run_single_checkpoint,
|
||||||
|
epoch,
|
||||||
|
ckpt,
|
||||||
|
gpu,
|
||||||
|
args,
|
||||||
|
project_dir,
|
||||||
|
train_script,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
overwrite_test_config(config_path, ckpt)
|
for fut in as_completed(futures):
|
||||||
|
row = fut.result()
|
||||||
|
if row is not None:
|
||||||
|
summary_rows.append(row)
|
||||||
|
|
||||||
ret = run_one_eval(project_dir, train_script, args.result_name, args.gpu)
|
summary_rows.sort(key=lambda x: x["epoch"])
|
||||||
if ret != 0:
|
|
||||||
print(f"[WARN] checkpoint {epoch} 测试失败,返回码 {ret}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
time.sleep(args.sleep_sec)
|
summary_csv = project_dir / "result" / f"{args.result_name}_summary.csv"
|
||||||
|
save_summary_csv(summary_csv, summary_rows)
|
||||||
|
print(f"[INFO] 汇总结果已保存到: {summary_csv}")
|
||||||
|
|
||||||
parsed = parse_result_file(result_txt)
|
if summary_rows:
|
||||||
epoch_rows = collect_epoch_rows(parsed, epoch)
|
best_by_ap = max(summary_rows, key=lambda x: x.get("mean_AP", float("-inf")))
|
||||||
|
print("\n[INFO] 最佳 checkpoint(按 mean_AP):")
|
||||||
if not epoch_rows:
|
print(best_by_ap)
|
||||||
print(f"[WARN] 没有在结果文件中找到 epoch={epoch} 的记录")
|
|
||||||
continue
|
|
||||||
|
|
||||||
agg = aggregate_rows(epoch_rows)
|
|
||||||
row = {
|
|
||||||
"epoch": epoch,
|
|
||||||
"checkpoint": str(ckpt),
|
|
||||||
**agg,
|
|
||||||
}
|
|
||||||
summary_rows.append(row)
|
|
||||||
|
|
||||||
print(f"[INFO] epoch={epoch} 汇总: {row}")
|
|
||||||
|
|
||||||
summary_csv = project_dir / "result" / f"{args.result_name}_summary.csv"
|
|
||||||
save_summary_csv(summary_csv, summary_rows)
|
|
||||||
print(f"[INFO] 汇总结果已保存到: {summary_csv}")
|
|
||||||
|
|
||||||
if summary_rows:
|
|
||||||
best_by_ap = max(summary_rows, key=lambda x: x.get("mean_AP", float("-inf")))
|
|
||||||
print("\n[INFO] 最佳 checkpoint(按 mean_AP):")
|
|
||||||
print(best_by_ap)
|
|
||||||
|
|
||||||
finally:
|
|
||||||
shutil.copy2(backup_path, config_path)
|
|
||||||
print(f"[INFO] 已恢复原始配置: {config_path}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
56
config.yaml.bak_auto_eval
Normal file
56
config.yaml.bak_auto_eval
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
'experiment' :
|
||||||
|
|
||||||
|
# 'path_dataset' : '/mnt/data/cdy/project/dataset/FUSION'
|
||||||
|
# 'path_result': '/mnt/data/cdy/data2/results/FUSIONLCD'
|
||||||
|
|
||||||
|
# 'path_dataset' : 'E:\work\Project\dataset\FUSION'
|
||||||
|
# 'path_result' : 'E:\work\Project\results\FUSIONLCD\bev2'
|
||||||
|
|
||||||
|
'path_dataset' : '/home/adlab36/chenyouyuan/FUSIONLCD'
|
||||||
|
'path_result': '/home/adlab36/chenyouyuan/FUSIONLCD/result'
|
||||||
|
'train_flag' : 0
|
||||||
|
'validate_flag' : 1
|
||||||
|
'test_flag' : 1
|
||||||
|
'flag' : 'fusion'
|
||||||
|
'cuda' : 1
|
||||||
|
# TRAINING
|
||||||
|
'epochs' : 200
|
||||||
|
'batchsize' : 6
|
||||||
|
'learning_rate' : 1.e-3
|
||||||
|
'beta1' : 0.9
|
||||||
|
'beta2' : 0.999
|
||||||
|
'eps' : 1.e-8
|
||||||
|
'weight_decay' : 5.e-6
|
||||||
|
'load_model' : 1
|
||||||
|
#FUSION
|
||||||
|
# 'last_model' : '/data4/caodanyang/results/FUSIONLCD/08310/models/checkpoint_079.pth.tar'
|
||||||
|
#BEV
|
||||||
|
# 'last_model' : '/data4/caodanyang/results/FUSIONLCD/bev_09030/models/checkpoint_066.pth.tar'
|
||||||
|
#BEV+EP
|
||||||
|
'last_model' : '/home/adlab36/chenyouyuan/FUSIONLCD/result/log/models/checkpoint_199.pth.tar'
|
||||||
|
#DATASET
|
||||||
|
'train' : 0,5,6,7,9
|
||||||
|
'validate' : 8,50,54,55,56,59
|
||||||
|
'test' : 8,50,54,55,56,59
|
||||||
|
'voxel_num' : 15000
|
||||||
|
'voxel_max_points' : 100
|
||||||
|
'voxel_sample' : 'top'
|
||||||
|
# 'bev_range' : -51.2,-51.2,-2.5,51.2,51.2,1.5
|
||||||
|
# 'bev_resolution' : 0.16
|
||||||
|
# 'bev_range' : -64,-64,-2.5,64,64,1.5
|
||||||
|
# 'bev_resolution' : 0.2
|
||||||
|
'bev_range' : -32,-32,-2.5,32,32,1.5
|
||||||
|
'bev_resolution' : 0.2
|
||||||
|
|
||||||
|
# NETWORK PARAMS
|
||||||
|
'kpts_number_bev' : 150
|
||||||
|
'kpts_number_img' : 150
|
||||||
|
'cluster_num_bev' : 16
|
||||||
|
'cluster_num_img' : 16
|
||||||
|
'cluster_num_fusion' : 16
|
||||||
|
'sinkhorn_iter' : 5
|
||||||
|
'vlad_size' : 256
|
||||||
|
# LOSS
|
||||||
|
'loop_file' : 'loop_GT_4m'
|
||||||
|
'trip_margin' : 0.5
|
||||||
|
'negetative_selsector' : 'random'
|
||||||
0
evaluate_all_models.py
Normal file
0
evaluate_all_models.py
Normal file
Reference in New Issue
Block a user