diff --git a/.gitignore b/.gitignore index 38c13d1..32b8b20 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ env/ # Logs / test / tooling *.log +.cache/ .pytest_cache/ .mypy_cache/ .ruff_cache/ diff --git a/mario-rl-mvp/README.md b/mario-rl-mvp/README.md index 9b4e674..e6cc17f 100644 --- a/mario-rl-mvp/README.md +++ b/mario-rl-mvp/README.md @@ -11,6 +11,7 @@ mario-rl-mvp/ train_ppo.py eval.py record_video.py + plot_model_max_x_trend.py utils.py artifacts/ models/ @@ -32,6 +33,44 @@ python -m pip install --upgrade pip setuptools wheel pip install -r requirements.txt ``` +## 2.1 环境准备(WSL / Ubuntu) + +如果系统 Python 缺少 `venv/pip`,推荐直接用 `uv` 创建环境并安装依赖: + +```bash +cd /home/roog/super-mario/mario-rl-mvp +uv venv .venv -p /usr/bin/python3.10 +uv pip install --python .venv/bin/python -r requirements.txt +``` + +如果你更倾向用系统 `venv`,先安装: + +```bash +sudo apt-get update +sudo apt-get install -y python3.10-venv python3-pip +``` + +### RTX 50 系列(如 RTX 5080)GPU 说明 + +如果你看到类似: + +```text +... CUDA capability sm_120 is not compatible with the current PyTorch installation ... +``` + +说明当前 torch wheel 不包含 `sm_120` 内核。可直接升级到 `cu128` nightly: + +```bash +cd /home/roog/super-mario/mario-rl-mvp +uv pip install --python .venv/bin/python --upgrade --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 +``` + +验证 GPU: + +```bash +.venv/bin/python -c "import torch; print(torch.__version__); print(torch.version.cuda); print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'); print(torch.cuda.get_device_capability(0) if torch.cuda.is_available() else 'N/A')" +``` + 可选系统依赖(用于 ffmpeg 转码与潜在 SDL 兼容): ```bash @@ -40,12 +79,19 @@ brew install ffmpeg sdl2 ## 3. 一条命令开始训练 -默认 CPU 训练(如果检测到可用且稳定的 MPS,会自动尝试启用,否则自动回退 CPU): +默认 `--device auto` 训练(优先 CUDA,其次 MPS,最后 CPU): ```bash python -m src.train_ppo ``` +显式指定 `--device cuda` 或 `--device mps` 时,如果该设备不可用,脚本会默认报错(避免静默回退到 CPU)。 +若你明确接受回退,可加: + +```bash +python -m src.train_ppo --device cuda --allow-device-fallback +``` + 常用覆盖参数: ```bash @@ -78,6 +124,30 @@ python -m src.train_ppo \ --total-timesteps 300000 ``` +我目前的参数 + +``` +python -m src.train_ppo \ + --init-model-path artifacts/models/latest_model.zip \ + --n-envs 16 \ + --allow-partial-init \ + --reward-mode progress \ + --movement simple \ + --ent-coef 0.001 \ + --learning-rate 2e-5 \ + --n-steps 2048 \ + --gamma 0.99 \ + --death-penalty -50 \ + --stall-penalty 0.05 \ + --stall-steps 40 \ + --backward-penalty-scale 0.01 \ + --milestone-bonus 2.0 \ + --no-progress-terminate-steps 300 \ + --no-progress-terminate-penalty 10 \ + --time-penalty -0.01 \ + --total-timesteps 1200000 +``` + ### 3.1 从已有模型继续训练(`--init-model-path`) - 用途:加载已有 `.zip` 权重后继续训练,适合“不中断实验目标但调整探索参数”。 @@ -143,13 +213,35 @@ tensorboard --logdir artifacts/logs --port 6006 加载最新模型,跑 N 个 episode,输出平均指标: ```bash -python -m src.eval --episodes 5 --stochastic +python -m src.eval \ + --model-path artifacts/models/latest_model.zip \ + --episodes 20 \ + --movement simple \ + --reward-mode progress \ + --no-progress-terminate-steps 300 \ + --no-progress-terminate-penalty 10 \ + --death-penalty -50 \ + --stall-penalty 0.05 \ + --stall-steps 40 \ + --time-penalty -0.01 \ + --epsilon 0.08 ``` 可指定模型: ```bash -python -m src.eval --model-path artifacts/models/latest_model.zip --episodes 10 --stochastic +python -m src.eval \ + --model-path artifacts/models/latest_model.zip \ + --episodes 20 \ + --movement simple \ + --reward-mode progress \ + --no-progress-terminate-steps 300 \ + --no-progress-terminate-penalty 10 \ + --death-penalty -50 \ + --stall-penalty 0.05 \ + --stall-steps 40 \ + --time-penalty -0.01 \ + --stochastic ``` 注意:`eval.py` 默认 `--movement auto`,会按模型动作维度自动匹配 `right_only/simple`,避免动作空间不一致导致 `KeyError`。 @@ -174,13 +266,41 @@ _total_timesteps = 150000 默认录制约 10 秒 mp4 到 `artifacts/videos/`: ```bash -python -m src.record_video --duration-sec 10 --fps 30 --stochastic +python -m src.record_video \ + --model-path artifacts/models/latest_model.zip \ + --movement simple \ + --reward-mode progress \ + --no-progress-terminate-steps 300 \ + --no-progress-terminate-penalty 10 \ + --death-penalty -50 \ + --stall-penalty 0.05 \ + --stall-steps 40 \ + --time-penalty -0.01 \ + --epsilon 0.08 \ + --duration-sec 30 ``` -可指定输出路径: +或者稳定版本 ```bash -python -m src.record_video --output artifacts/videos/demo.mp4 --stochastic --duration-sec 10 +python -m src.record_video \ + --model-path artifacts/models/latest_model.zip \ + --movement simple \ + --reward-mode progress \ + --no-progress-terminate-steps 300 \ + --no-progress-terminate-penalty 10 \ + --death-penalty -50 \ + --stall-penalty 0.05 \ + --stall-steps 40 \ + --time-penalty -0.01 \ + --epsilon 0.08 \ + --epsilon-random-mode uniform \ + --max-steps 6000 +``` +可选: + +```bash +--output artifacts/videos/mario_eps008.mp4 ``` 注意:`record_video.py` 默认 `--movement auto`,会按模型自动匹配动作空间。 @@ -190,6 +310,47 @@ python -m src.record_video --output artifacts/videos/demo.mp4 --stochastic --dur - 默认通过 `imageio + ffmpeg` 输出 mp4 - 若 mp4 写入失败,会自动降级保存帧序列(PNG),并打印 ffmpeg 转码命令 +## 5.1 模型趋势可视化(HTML / Markdown) + +用于可视化 `artifacts/models/` 里的模型在训练过程中的关键指标趋势,输出中文 HTML 或 Markdown 报告。 + +默认命令: + +```bash +python -m src.plot_model_max_x_trend +``` + +默认输出: + +- `artifacts/reports/model_max_x_trend.html` + +输出 Markdown 报告: + +```bash +python -m src.plot_model_max_x_trend --format markdown +``` + +Markdown 默认输出: + +- `artifacts/reports/model_max_x_trend.md` + +可选参数(自定义目录/输出): + +```bash +uv run python -m src.plot_model_max_x_trend \ + --models-dir artifacts/models \ + --logs-dir artifacts/logs \ + --format markdown \ + --output artifacts/reports/model_max_x_trend.md +``` + +报告内容: + +- 主趋势:`max_x`(最大前进距离) +- 多维趋势:平均回报、平均回合步数、通关率、无进展终止率、死亡终止率、超时终止率、硬卡死终止率 +- 模型明细表:每个 checkpoint/final 模型对应的指标值、匹配步数、来源 TensorBoard tag +- 术语解释:Run、Checkpoint、model_step、matched_step、TensorBoard Tag 等专有名词 + ## 6. 动作空间选择说明 默认 `RIGHT_ONLY`,原因: @@ -231,22 +392,23 @@ python -m src.train_ppo --reward-mode clip ```bash python -m src.train_ppo \ - --init-model-path artifacts/models/latest_model.zip \ + --init-model-path /home/roog/super-mario/mario-rl-mvp/artifacts/models/ppo_SuperMarioBros-1-1-v0_20260212_205220/ppo_mario_ckpt_100000_steps.zip \ + --n-envs 16 \ --allow-partial-init \ --reward-mode progress \ --movement simple \ - --ent-coef 0.04 \ + --ent-coef 0.01 \ --learning-rate 1e-4 \ - --n-steps 512 \ - --gamma 0.995 \ - --death-penalty -120 \ - --stall-penalty 0.2 \ - --stall-steps 20 \ - --backward-penalty-scale 0.03 \ + --n-steps 1024 \ + --gamma 0.99 \ + --death-penalty -50 \ + --stall-penalty 0.05 \ + --stall-steps 40 \ + --backward-penalty-scale 0.01 \ --milestone-bonus 2.0 \ - --no-progress-terminate-steps 80 \ - --no-progress-terminate-penalty 30 \ - --total-timesteps 150000 + --no-progress-terminate-steps 300 \ + --no-progress-terminate-penalty 10 \ + --total-timesteps 300000 ``` ## 8. 常见问题排查 @@ -312,6 +474,30 @@ python -m src.train_ppo --init-model-path artifacts/models/latest_model.zip --mo 3) 或者直接不加载旧模型,从头训练新动作空间。 +### 8.6 `cudaGetDeviceCount ... Error 304`(WSL 下 CUDA 初始化失败) + +如果训练启动时看到: + +```text +[device] cpu | CUDA unavailable, using CPU. +[device_diag] ... torch.cuda.is_available()=False ... Error 304 ... +``` + +说明不是 `device` 参数没传,而是 CUDA 运行时在当前环境初始化失败。 + +先做两步确认: + +```bash +nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader +.venv/bin/python -c "import torch; print(torch.__version__); print(torch.version.cuda); print(torch.cuda.is_available())" +``` + +常见原因是 WSL GPU 栈/驱动状态异常,而不是 PPO 代码本身。若你是临时跑通实验,可先显式 CPU: + +```bash +python -m src.train_ppo --device cpu +``` + ## 9. 最小 smoke test(按顺序执行) ```bash diff --git a/mario-rl-mvp/requirements.txt b/mario-rl-mvp/requirements.txt index a747e8f..34cf6ba 100644 --- a/mario-rl-mvp/requirements.txt +++ b/mario-rl-mvp/requirements.txt @@ -1,4 +1,5 @@ -torch==2.5.1 +# Keep torch unpinned to avoid forcing old wheels on new GPUs (e.g. RTX 50xx). +torch>=2.5.1 stable-baselines3==2.3.2 gym==0.26.2 gymnasium==0.29.1 diff --git a/mario-rl-mvp/src/env.py b/mario-rl-mvp/src/env.py index d82b6d6..6983d92 100644 --- a/mario-rl-mvp/src/env.py +++ b/mario-rl-mvp/src/env.py @@ -212,6 +212,7 @@ class ProgressRewardEnv(gym.Wrapper): truncated = True shaped_reward -= self.no_progress_terminate_penalty info["terminated_by_stall"] = True + info["done_reason"] = "no_progress" if terminated or truncated: if bool(info.get("flag_get", False)): @@ -222,6 +223,71 @@ class ProgressRewardEnv(gym.Wrapper): return obs, shaped_reward, terminated, truncated, info +class TimePenaltyHardStuckEnv(gym.Wrapper): + """Optional living cost and hard-stuck truncation based on x_pos movement.""" + + def __init__( + self, + env: gym.Env, + time_penalty: float = 0.0, + hard_stuck_steps: int = 0, + hard_stuck_epsilon: float = 1.0, + hard_stuck_penalty: float = 5.0, + ): + super().__init__(env) + if time_penalty > 0.0: + raise ValueError(f"time_penalty must be <= 0.0, got {time_penalty}") + if hard_stuck_steps < 0: + raise ValueError(f"hard_stuck_steps must be >= 0, got {hard_stuck_steps}") + if hard_stuck_epsilon < 0.0: + raise ValueError(f"hard_stuck_epsilon must be >= 0.0, got {hard_stuck_epsilon}") + if hard_stuck_penalty < 0.0: + raise ValueError(f"hard_stuck_penalty must be >= 0.0, got {hard_stuck_penalty}") + + self.time_penalty = float(time_penalty) + self.hard_stuck_steps = int(hard_stuck_steps) + self.hard_stuck_epsilon = float(hard_stuck_epsilon) + self.hard_stuck_penalty = float(hard_stuck_penalty) + self._last_x_pos: Optional[float] = None + self._hard_stuck_count = 0 + + def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): + del options + obs, info = reset_compat(self.env, seed=seed) + self._last_x_pos = float(info.get("x_pos", 0.0)) + self._hard_stuck_count = 0 + return obs, info + + def step(self, action: Any): + obs, reward, terminated, truncated, info = step_compat(self.env, action) + x_pos = float(info.get("x_pos", 0.0)) + if self._last_x_pos is None: + delta_x = 0.0 + else: + delta_x = x_pos - self._last_x_pos + self._last_x_pos = x_pos + + shaped_reward = float(reward) + if self.time_penalty != 0.0: + shaped_reward += self.time_penalty + + if self.hard_stuck_steps > 0 and not terminated and not truncated: + if abs(delta_x) < self.hard_stuck_epsilon: + self._hard_stuck_count += 1 + else: + self._hard_stuck_count = 0 + + if self._hard_stuck_count >= self.hard_stuck_steps: + shaped_reward -= self.hard_stuck_penalty + truncated = True + info["terminated_by_hard_stuck"] = True + info["done_reason"] = "hard_stuck" + else: + self._hard_stuck_count = 0 + + return obs, shaped_reward, terminated, truncated, info + + def get_action_set(name: str): name = name.lower().strip() if name == "simple": @@ -249,6 +315,10 @@ def make_mario_env( milestone_bonus: float = 1.0, no_progress_terminate_steps: int = 120, no_progress_terminate_penalty: float = 20.0, + time_penalty: float = 0.0, + hard_stuck_steps: int = 0, + hard_stuck_epsilon: float = 1.0, + hard_stuck_penalty: float = 5.0, ) -> gym.Env: kwargs: Dict[str, Any] = {} if render_mode is not None: @@ -284,6 +354,15 @@ def make_mario_env( elif mode != "raw": raise ValueError(f"Unsupported reward_mode='{reward_mode}'. Use one of: raw, clip, progress") + if time_penalty != 0.0 or hard_stuck_steps > 0: + env = TimePenaltyHardStuckEnv( + env=env, + time_penalty=time_penalty, + hard_stuck_steps=hard_stuck_steps, + hard_stuck_epsilon=hard_stuck_epsilon, + hard_stuck_penalty=hard_stuck_penalty, + ) + env = PreprocessFrame(env, width=84, height=84) env = ChannelLastFrameStack(env, num_stack=4) env = TransposeObservation(env) @@ -312,6 +391,10 @@ def make_env_fn( milestone_bonus: float, no_progress_terminate_steps: int, no_progress_terminate_penalty: float, + time_penalty: float, + hard_stuck_steps: int, + hard_stuck_epsilon: float, + hard_stuck_penalty: float, ) -> Callable[[], gym.Env]: def _thunk() -> gym.Env: return make_mario_env( @@ -332,6 +415,10 @@ def make_env_fn( milestone_bonus=milestone_bonus, no_progress_terminate_steps=no_progress_terminate_steps, no_progress_terminate_penalty=no_progress_terminate_penalty, + time_penalty=time_penalty, + hard_stuck_steps=hard_stuck_steps, + hard_stuck_epsilon=hard_stuck_epsilon, + hard_stuck_penalty=hard_stuck_penalty, ) return _thunk diff --git a/mario-rl-mvp/src/eval.py b/mario-rl-mvp/src/eval.py index b903506..d9bd076 100644 --- a/mario-rl-mvp/src/eval.py +++ b/mario-rl-mvp/src/eval.py @@ -6,6 +6,11 @@ from pathlib import Path from statistics import mean import numpy as np +from src.policy_utils import select_action +from src.runtime import configure_runtime_env + +configure_runtime_env() + from stable_baselines3 import PPO from src.env import get_action_set, make_mario_env, reset_compat, step_compat @@ -32,8 +37,51 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--milestone-bonus", type=float, default=1.0) parser.add_argument("--no-progress-terminate-steps", type=int, default=120) parser.add_argument("--no-progress-terminate-penalty", type=float, default=20.0) + parser.add_argument( + "--time-penalty", + type=float, + default=0.0, + help="Per-step living cost added to reward (<=0.0). 0.0 disables.", + ) + parser.add_argument( + "--hard-stuck-steps", + type=int, + default=0, + help="Consecutive near-stationary steps before truncation. 0 disables.", + ) + parser.add_argument( + "--hard-stuck-epsilon", + type=float, + default=1.0, + help="Treat |x_pos delta| < epsilon as no movement for hard-stuck detection.", + ) + parser.add_argument( + "--hard-stuck-penalty", + type=float, + default=5.0, + help="Extra penalty applied when hard-stuck truncation triggers.", + ) parser.add_argument("--clip-reward", action="store_true") parser.add_argument("--stochastic", action="store_true", help="Use stochastic policy (deterministic=False).") + parser.add_argument( + "--epsilon", + type=float, + default=0.0, + help="Epsilon-greedy probability for deterministic policy. Ignored when --stochastic is set.", + ) + parser.add_argument( + "--epsilon-random-mode", + type=str, + default="uniform", + choices=["uniform", "policy"], + help="Random action source for epsilon-greedy: uniform=action_space.sample, policy=model stochastic sample.", + ) + parser.add_argument( + "--random-noops", + type=int, + default=0, + help="Random no-op steps after reset (0 disables). Uses action 0 as NOOP.", + ) return parser.parse_args() @@ -87,6 +135,8 @@ def resolve_model_path(user_path: str) -> Path: def main() -> None: args = parse_args() + if args.epsilon < 0.0 or args.epsilon > 1.0: + raise ValueError(f"--epsilon must be in [0, 1], got {args.epsilon}") seed_everything(args.seed) reward_mode = "clip" if args.clip_reward and args.reward_mode == "raw" else args.reward_mode @@ -94,7 +144,12 @@ def main() -> None: print(f"[eval] model={model_path}") model = PPO.load(str(model_path)) movement = resolve_movement(args.movement, model) - print(f"[eval] movement={movement} reward_mode={reward_mode}") + print( + f"[eval] movement={movement} reward_mode={reward_mode} random_noops={args.random_noops} " + f"time_penalty={args.time_penalty} hard_stuck_steps={args.hard_stuck_steps} " + f"hard_stuck_epsilon={args.hard_stuck_epsilon} hard_stuck_penalty={args.hard_stuck_penalty} " + f"epsilon={args.epsilon} epsilon_random_mode={args.epsilon_random_mode}" + ) env = make_mario_env( env_id=args.env_id, @@ -114,6 +169,10 @@ def main() -> None: milestone_bonus=args.milestone_bonus, no_progress_terminate_steps=args.no_progress_terminate_steps, no_progress_terminate_penalty=args.no_progress_terminate_penalty, + time_penalty=args.time_penalty, + hard_stuck_steps=args.hard_stuck_steps, + hard_stuck_epsilon=args.hard_stuck_epsilon, + hard_stuck_penalty=args.hard_stuck_penalty, ) rewards = [] @@ -122,6 +181,12 @@ def main() -> None: for ep in range(1, args.episodes + 1): obs, info = reset_compat(env, seed=args.seed + ep) + if args.random_noops > 0: + noop_steps = np.random.randint(0, args.random_noops + 1) + for _ in range(noop_steps): + obs, _, terminated, truncated, info = step_compat(env, 0) + if terminated or truncated: + obs, info = reset_compat(env, seed=args.seed + ep + 1000) done = False ep_reward = 0.0 ep_max_x = float(info.get("x_pos", 0.0)) @@ -129,9 +194,14 @@ def main() -> None: step_count = 0 while not done and step_count < args.max_steps: - action, _ = model.predict(obs, deterministic=not args.stochastic) - if isinstance(action, np.ndarray): - action = int(action.item()) + action = select_action( + model=model, + obs=obs, + deterministic=not args.stochastic, + epsilon=args.epsilon, + epsilon_random_mode=args.epsilon_random_mode, + env=env, + ) obs, reward, terminated, truncated, info = step_compat(env, action) ep_reward += float(reward) ep_max_x = max(ep_max_x, float(info.get("x_pos", 0.0))) diff --git a/mario-rl-mvp/src/plot_model_max_x_trend.py b/mario-rl-mvp/src/plot_model_max_x_trend.py new file mode 100644 index 0000000..f6f8a70 --- /dev/null +++ b/mario-rl-mvp/src/plot_model_max_x_trend.py @@ -0,0 +1,677 @@ +from __future__ import annotations + +import argparse +import datetime as dt +import html +import json +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +from tensorboard.backend.event_processing.event_accumulator import EventAccumulator + +from src.utils import ensure_artifact_paths + +RUN_TS_PATTERN = re.compile(r"(\d{8}_\d{6})$") +CKPT_STEPS_PATTERN = re.compile(r"ppo_mario_ckpt_(\d+)_steps\.zip$") + + +@dataclass(frozen=True) +class ScalarPoint: + step: int + value: float + + +@dataclass(frozen=True) +class MetricSpec: + key: str + label: str + tag_candidates: Tuple[str, ...] + meaning: str + scale: float = 1.0 + decimals: int = 2 + suffix: str = "" + + +METRIC_SPECS: List[MetricSpec] = [ + MetricSpec( + key="max_x", + label="最大前进距离", + tag_candidates=("rollout/episode_max_x_pos", "episode_end/episode_max_x_pos"), + meaning="马里奥在单回合内到达的最远 x 坐标,越大通常代表策略能跑得更远。", + decimals=1, + ), + MetricSpec( + key="ep_rew", + label="平均回报", + tag_candidates=("rollout/ep_rew_mean",), + meaning="训练窗口内的平均 episode reward,会随 reward shaping 配置变化。", + decimals=2, + ), + MetricSpec( + key="ep_len", + label="平均回合步数", + tag_candidates=("rollout/ep_len_mean",), + meaning="训练窗口内平均每回合步数;若步数高但 max_x 不涨,常见于卡住。", + decimals=1, + ), + MetricSpec( + key="clear_rate", + label="通关率", + tag_candidates=("rollout/flag_get", "episode_end/flag_get"), + meaning="到达旗帜终点的比例。0 表示从未通关,1 表示全部通关。", + scale=100.0, + decimals=1, + suffix="%", + ), + MetricSpec( + key="no_progress_rate", + label="无进展终止率", + tag_candidates=("rollout/done_reason_no_progress", "episode_end/done_reason_no_progress"), + meaning="因 no-progress 规则提前结束的比例,越高说明越容易卡住。", + scale=100.0, + decimals=1, + suffix="%", + ), + MetricSpec( + key="death_rate", + label="死亡终止率", + tag_candidates=("rollout/done_reason_death", "episode_end/done_reason_death"), + meaning="因死亡结束 episode 的比例。", + scale=100.0, + decimals=1, + suffix="%", + ), + MetricSpec( + key="timeout_rate", + label="超时终止率", + tag_candidates=("rollout/done_reason_timeout", "episode_end/done_reason_timeout"), + meaning="因时间耗尽或 TimeLimit 截断的比例。", + scale=100.0, + decimals=1, + suffix="%", + ), + MetricSpec( + key="hard_stuck_rate", + label="硬卡死终止率", + tag_candidates=("rollout/done_reason_hard_stuck", "episode_end/done_reason_hard_stuck"), + meaning="触发 hard-stuck 截断的比例(仅在启用 hard_stuck 后有数据)。", + scale=100.0, + decimals=1, + suffix="%", + ), +] +METRIC_BY_KEY: Dict[str, MetricSpec] = {spec.key: spec for spec in METRIC_SPECS} + + +@dataclass(frozen=True) +class ModelItem: + run_name: str + run_time: dt.datetime + model_path: Path + model_kind: str + model_step: Optional[int] + + +@dataclass(frozen=True) +class ModelMetric: + index: int + run_name: str + model_file: str + model_path: str + model_kind: str + model_step: Optional[int] + metric_values: Dict[str, Optional[float]] + metric_steps: Dict[str, Optional[int]] + metric_tags: Dict[str, Optional[str]] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="可视化 artifacts/models 中模型在多维指标上的变化趋势,输出中文 HTML 或 Markdown 报告。" + ) + parser.add_argument("--models-dir", type=str, default="", help="模型目录,默认 artifacts/models") + parser.add_argument("--logs-dir", type=str, default="", help="日志目录,默认 artifacts/logs") + parser.add_argument( + "--format", + type=str, + default="html", + choices=("html", "markdown"), + help="输出格式:html 或 markdown", + ) + parser.add_argument( + "--output", + type=str, + default="", + help="输出报告路径。默认 html: artifacts/reports/model_max_x_trend.html,markdown: artifacts/reports/model_max_x_trend.md", + ) + return parser.parse_args() + + +def _parse_run_time(run_name: str, fallback: float) -> dt.datetime: + match = RUN_TS_PATTERN.search(run_name) + if match: + try: + return dt.datetime.strptime(match.group(1), "%Y%m%d_%H%M%S") + except ValueError: + pass + return dt.datetime.fromtimestamp(fallback) + + +def _list_model_items(models_dir: Path) -> List[ModelItem]: + items: List[ModelItem] = [] + for run_dir in sorted(path for path in models_dir.iterdir() if path.is_dir()): + run_name = run_dir.name + run_time = _parse_run_time(run_name, run_dir.stat().st_mtime) + + zip_files = sorted(path for path in run_dir.glob("*.zip") if path.is_file()) + staged: List[Tuple[int, str, Path, Optional[int], str]] = [] + for zip_path in zip_files: + name = zip_path.name + ckpt_match = CKPT_STEPS_PATTERN.match(name) + if ckpt_match: + step = int(ckpt_match.group(1)) + staged.append((0, f"{step:012d}", zip_path, step, "checkpoint")) + continue + if name == "final_model.zip": + staged.append((1, "final_model", zip_path, None, "final")) + continue + staged.append((2, name, zip_path, None, "other")) + + for _, _, model_path, model_step, model_kind in sorted(staged, key=lambda row: (row[0], row[1])): + items.append( + ModelItem( + run_name=run_name, + run_time=run_time, + model_path=model_path, + model_kind=model_kind, + model_step=model_step, + ) + ) + + items.sort( + key=lambda item: ( + item.run_time, + 10**18 if item.model_step is None and item.model_kind == "final" else (item.model_step or 0), + item.model_path.name, + ) + ) + return items + + +def _load_scalar_map_from_dir(event_dir: Path) -> Dict[str, List[ScalarPoint]]: + if not event_dir.exists(): + return {} + event_files = list(event_dir.glob("events.out.tfevents.*")) + if not event_files: + return {} + + accumulator = EventAccumulator(str(event_dir), size_guidance={"scalars": 0}) + try: + accumulator.Reload() + except Exception: + return {} + + scalar_map: Dict[str, List[ScalarPoint]] = {} + for tag in accumulator.Tags().get("scalars", []): + try: + points = [ScalarPoint(step=int(s.step), value=float(s.value)) for s in accumulator.Scalars(tag)] + except Exception: + continue + if points: + scalar_map[tag] = points + return scalar_map + + +def _load_run_scalar_map(logs_dir: Path, run_name: str) -> Dict[str, List[ScalarPoint]]: + run_dir = logs_dir / run_name + merged: Dict[str, List[ScalarPoint]] = {} + + for sub in (("tb", "ppo_1"), ("tb", "episode_end")): + event_dir = run_dir.joinpath(*sub) + part = _load_scalar_map_from_dir(event_dir) + for tag, points in part.items(): + if tag not in merged or len(points) > len(merged[tag]): + merged[tag] = points + + return merged + + +def _value_at_or_before(points: Sequence[ScalarPoint], target_step: int) -> Tuple[float, int]: + best = points[0] + for point in points: + if point.step <= target_step: + best = point + else: + break + return best.value, best.step + + +def _pick_series_for_metric( + scalar_map: Dict[str, List[ScalarPoint]], + spec: MetricSpec, +) -> Tuple[Optional[str], Optional[List[ScalarPoint]]]: + for tag in spec.tag_candidates: + points = scalar_map.get(tag) + if points: + return tag, points + return None, None + + +def _resolve_model_metric(item: ModelItem, scalar_map: Dict[str, List[ScalarPoint]], index: int) -> ModelMetric: + metric_values: Dict[str, Optional[float]] = {} + metric_steps: Dict[str, Optional[int]] = {} + metric_tags: Dict[str, Optional[str]] = {} + + for spec in METRIC_SPECS: + tag, points = _pick_series_for_metric(scalar_map, spec) + metric_tags[spec.key] = tag + if not points: + metric_values[spec.key] = None + metric_steps[spec.key] = None + continue + + if item.model_step is None: + point = points[-1] + metric_values[spec.key] = point.value + metric_steps[spec.key] = point.step + continue + + value, matched_step = _value_at_or_before(points, item.model_step) + metric_values[spec.key] = value + metric_steps[spec.key] = matched_step + + return ModelMetric( + index=index, + run_name=item.run_name, + model_file=item.model_path.name, + model_path=str(item.model_path.resolve()), + model_kind=item.model_kind, + model_step=item.model_step, + metric_values=metric_values, + metric_steps=metric_steps, + metric_tags=metric_tags, + ) + + +def _scaled_value(metric: ModelMetric, key: str) -> Optional[float]: + spec = METRIC_BY_KEY[key] + raw = metric.metric_values.get(key) + if raw is None: + return None + return raw * spec.scale + + +def _format_value(metric: ModelMetric, key: str) -> str: + spec = METRIC_BY_KEY[key] + scaled = _scaled_value(metric, key) + if scaled is None: + return "" + return f"{scaled:.{spec.decimals}f}{spec.suffix}" + + +def _render_metric_svg(metrics: Sequence[ModelMetric], key: str, width: int, height: int) -> str: + spec = METRIC_BY_KEY[key] + valid: List[Tuple[int, float]] = [] + for idx, metric in enumerate(metrics): + value = _scaled_value(metric, key) + if value is not None: + valid.append((idx, value)) + + if not valid: + return f'
{html.escape(metric.model_path)}{html.escape(payload)}
+