diff --git a/bot_bottle/cli/dashboard.py b/bot_bottle/cli/dashboard.py index a317108..02b7c2b 100644 --- a/bot_bottle/cli/dashboard.py +++ b/bot_bottle/cli/dashboard.py @@ -21,6 +21,7 @@ import subprocess import sys import tempfile import time +import traceback from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path @@ -52,7 +53,7 @@ from ..backend.docker.pipelock_apply import ( parse_allowlist_content, render_allowlist_content, ) -from ..log import info +from ..log import Die, error, info from ..manifest import Manifest from ..supervise import ( ACTION_OPERATOR_EDIT, @@ -1277,9 +1278,57 @@ def cmd_dashboard(argv: list[str]) -> int: curses.wrapper(_main_loop) except KeyboardInterrupt: return 130 + except Die as e: + # die() printed the reason to stderr, but that happened while + # curses owned the terminal — the text landed on the alternate + # screen and was wiped when the terminal was restored. Re-surface + # it now that we're back on the normal screen. + if e.message: + error(e.message) + else: + error("dashboard exited on a fatal error (no detail captured).") + return e.code if isinstance(e.code, int) else 1 + except Exception as e: + # Any other crash inside the TUI. The traceback would otherwise + # vanish with the alternate screen, so persist it and tell the + # operator where to look. + log_path = _write_crash_log(e) + error(f"dashboard crashed: {type(e).__name__}: {e}") + error(f"full traceback written to {log_path}") + return 1 return 0 +def _write_crash_log(exc: BaseException) -> Path: + """Persist `exc`'s traceback to a stable file under ~/.bot-bottle/ + and return its path. + + The dashboard runs under curses, so a crash's stderr/traceback is + painted onto the alternate screen and lost when the terminal is + restored — this leaves the operator a durable record of *why* it + died. Best-effort: falls back to a tempfile if the home dir can't + be written.""" + stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + body = "".join( + traceback.format_exception(type(exc), exc, exc.__traceback__) + ) + entry = f"=== dashboard crash {stamp} ===\n{body}\n" + try: + log_dir = _supervise.bot_bottle_root() / "logs" + log_dir.mkdir(parents=True, exist_ok=True) + path = log_dir / "dashboard-crash.log" + with path.open("a", encoding="utf-8") as fh: + fh.write(entry) + return path + except OSError: + fd, tmp = tempfile.mkstemp( + prefix="bot-bottle-dashboard-crash-", suffix=".log", + ) + with os.fdopen(fd, "w", encoding="utf-8") as fh: + fh.write(entry) + return Path(tmp) + + def _list_once() -> int: pending = discover_pending() if not pending: @@ -1407,8 +1456,19 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None: if manifest_cache[0] is None: manifest_cache[0] = Manifest.resolve(USER_CWD, missing_ok=True) return manifest_cache[0] - if not _get_manifest().bottles and not _get_manifest().agents: - status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty" + # A malformed manifest must not take the whole dashboard down — the + # operator may just be watching running bottles. Degrade to a + # status-line warning. Die is a SystemExit (not an Exception), so it + # has to be caught explicitly or it escapes the loop and crashes. + try: + _loaded = _get_manifest() + except Die as e: + status_line = f"config error: {e.message or 'malformed manifest'}" + except Exception as e: + status_line = f"config load failed: {e}" + else: + if not _loaded.bottles and not _loaded.agents: + status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty" # First-tick guard: a brand-new dashboard finds any # pre-existing queue entries on its first poll; those # shouldn't ring the bell as if they just arrived. @@ -1494,6 +1554,11 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None: # bottle running. try: manifest = _get_manifest() + except Die as e: + # Config error (Die is a SystemExit, missed by the + # except-Exception below). Surface the reason inline. + status_line = f"config error: {e.message or 'malformed manifest'}" + continue except Exception as e: status_line = f"manifest load failed: {e}" continue diff --git a/bot_bottle/log.py b/bot_bottle/log.py index 8e1abc7..b7d4204 100644 --- a/bot_bottle/log.py +++ b/bot_bottle/log.py @@ -14,11 +14,23 @@ def warn(msg: str) -> None: print(f"bot-bottle: warning: {msg}", file=sys.stderr) +def error(msg: str) -> None: + print(f"bot-bottle: error: {msg}", file=sys.stderr) + + class Die(SystemExit): """Raised by die() so callers (and tests) can distinguish a deliberate - fatal exit from an unrelated SystemExit.""" + fatal exit from an unrelated SystemExit. + + Carries the human-facing message so a caller that suppressed stderr + — e.g. the curses dashboard, whose alternate screen is wiped when the + terminal is restored — can re-surface the reason after the fact.""" + + def __init__(self, code: int = 1, message: str = "") -> None: + super().__init__(code) + self.message = message def die(msg: str) -> NoReturn: - print(f"bot-bottle: error: {msg}", file=sys.stderr) - raise Die(1) + error(msg) + raise Die(1, msg) diff --git a/tests/unit/test_dashboard_crash_logging.py b/tests/unit/test_dashboard_crash_logging.py new file mode 100644 index 0000000..56dff42 --- /dev/null +++ b/tests/unit/test_dashboard_crash_logging.py @@ -0,0 +1,140 @@ +"""Unit: dashboard launch/crash failure logging (issue #100). + +The dashboard runs under curses, so anything written to stderr while the +TUI owns the terminal is wiped when the terminal is restored. These +tests lock the recovery paths: a config error (`Die`) is re-surfaced +after the wrapper returns, and an unexpected crash is persisted to a +log file the operator can read. +""" + +from __future__ import annotations + +import contextlib +import io +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +from bot_bottle import supervise +from bot_bottle.cli import dashboard +from bot_bottle.log import Die, die + + +class TestDieCarriesMessage(unittest.TestCase): + def test_die_attaches_message_and_code(self): + buf = io.StringIO() + with contextlib.redirect_stderr(buf): + with self.assertRaises(Die) as cm: + die("bad manifest: unknown key 'foo'") + self.assertEqual("bad manifest: unknown key 'foo'", cm.exception.message) + self.assertEqual(1, cm.exception.code) + self.assertIn( + "bot-bottle: error: bad manifest: unknown key 'foo'", buf.getvalue() + ) + + def test_die_default_message_is_empty(self): + self.assertEqual("", Die(1).message) + self.assertEqual(1, Die(1).code) + + +class _FakeHomeMixin: + """Point supervise.bot_bottle_root (what _write_crash_log resolves + through) at a temp dir so the crash log doesn't touch the real + ~/.bot-bottle.""" + + def _setup_fake_home(self): + self._tmp = tempfile.TemporaryDirectory(prefix="dash-crash-test.") + self._orig_root = supervise.bot_bottle_root + self._root = Path(self._tmp.name) / ".bot-bottle" + supervise.bot_bottle_root = lambda: self._root # type: ignore[assignment] + + def _teardown_fake_home(self): + supervise.bot_bottle_root = self._orig_root # type: ignore[assignment] + self._tmp.cleanup() + + +class TestCmdDashboardErrorPaths(_FakeHomeMixin, unittest.TestCase): + def setUp(self): + self._setup_fake_home() + + def tearDown(self): + self._teardown_fake_home() + + def test_keyboard_interrupt_returns_130(self): + with mock.patch.object( + dashboard.curses, "wrapper", side_effect=KeyboardInterrupt + ): + self.assertEqual(130, dashboard.cmd_dashboard([])) + + def test_die_resurfaces_message_after_curses(self): + buf = io.StringIO() + with mock.patch.object( + dashboard.curses, "wrapper", + side_effect=Die(1, "manifest parse error at line 3"), + ): + with contextlib.redirect_stderr(buf): + rc = dashboard.cmd_dashboard([]) + self.assertEqual(1, rc) + self.assertIn("manifest parse error at line 3", buf.getvalue()) + + def test_die_without_message_has_fallback(self): + buf = io.StringIO() + with mock.patch.object(dashboard.curses, "wrapper", side_effect=Die(1)): + with contextlib.redirect_stderr(buf): + rc = dashboard.cmd_dashboard([]) + self.assertEqual(1, rc) + self.assertIn("fatal error", buf.getvalue()) + + def test_unexpected_exception_writes_crash_log(self): + buf = io.StringIO() + with mock.patch.object( + dashboard.curses, "wrapper", + side_effect=ValueError("kaboom in render"), + ): + with contextlib.redirect_stderr(buf): + rc = dashboard.cmd_dashboard([]) + self.assertEqual(1, rc) + out = buf.getvalue() + self.assertIn("dashboard crashed: ValueError: kaboom in render", out) + self.assertIn("full traceback written to", out) + log_path = self._root / "logs" / "dashboard-crash.log" + self.assertTrue(log_path.exists()) + content = log_path.read_text() + self.assertIn("kaboom in render", content) + self.assertIn("Traceback (most recent call last)", content) + + +class TestWriteCrashLog(_FakeHomeMixin, unittest.TestCase): + def setUp(self): + self._setup_fake_home() + + def tearDown(self): + self._teardown_fake_home() + + def test_appends_traceback_with_header(self): + try: + raise RuntimeError("explode") + except RuntimeError as e: + path = dashboard._write_crash_log(e) + self.assertEqual(self._root / "logs" / "dashboard-crash.log", path) + text = path.read_text() + self.assertIn("=== dashboard crash", text) + self.assertIn("RuntimeError: explode", text) + + def test_falls_back_to_tempfile_when_home_unwritable(self): + # bot_bottle_root points at a *file*, so mkdir under it raises + # OSError and the helper must fall back to a tempfile. + bad = Path(self._tmp.name) / "not-a-dir" + bad.write_text("x") + supervise.bot_bottle_root = lambda: bad # type: ignore[assignment] + try: + raise RuntimeError("explode2") + except RuntimeError as e: + path = dashboard._write_crash_log(e) + self.assertTrue(path.exists()) + self.assertIn("explode2", path.read_text()) + + +if __name__ == "__main__": + unittest.main()