diff --git a/auto_eval_checkpoints.py b/auto_eval_checkpoints.py index 869bb83..11692c0 100644 --- a/auto_eval_checkpoints.py +++ b/auto_eval_checkpoints.py @@ -191,17 +191,36 @@ def run_one_eval(project_dir: Path, train_script: Path, result_name: str, gpu: s "auto_eval", ] - proc = subprocess.run( + print(f"[INFO] Running command: {' '.join(cmd)}") + print(f"[INFO] CUDA_VISIBLE_DEVICES={gpu}") + + proc = subprocess.Popen( cmd, cwd=str(project_dir), env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, + bufsize=1, + universal_newlines=True, ) - print(proc.stdout) - return proc.returncode + try: + assert proc.stdout is not None + for line in proc.stdout: + print(line, end="") + proc.wait() + return proc.returncode + except KeyboardInterrupt: + print("\n[WARN] 收到 Ctrl+C,正在终止当前测试子进程...") + proc.terminate() + try: + proc.wait(timeout=5) + except Exception: + print("[WARN] 子进程未及时退出,强制 kill") + proc.kill() + proc.wait() + raise def collect_epoch_rows(all_rows: List[Dict], epoch: int) -> List[Dict]: