fix(dashboard): surface launch/crash failures (#100)
The dashboard runs under curses.wrapper and cmd_dashboard only caught KeyboardInterrupt, so failures vanished: - die() prints to stderr, but under curses that lands on the alternate screen and is wiped on exit, so config errors gave no reason. - Die is a SystemExit, so the new-agent flow's `except Exception` never caught config errors; they crashed the TUI. - the startup manifest probe was unguarded. Now: Die carries its message (+ log.error()); cmd_dashboard re-surfaces a Die's reason once the terminal is restored and writes any other crash's traceback to ~/.bot-bottle/logs/dashboard-crash.log; the startup probe and the new-agent flow degrade a bad config to a status-line warning instead of crashing. Closes #100 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
@@ -52,7 +53,7 @@ from ..backend.docker.pipelock_apply import (
|
||||
parse_allowlist_content,
|
||||
render_allowlist_content,
|
||||
)
|
||||
from ..log import info
|
||||
from ..log import Die, error, info
|
||||
from ..manifest import Manifest
|
||||
from ..supervise import (
|
||||
ACTION_OPERATOR_EDIT,
|
||||
@@ -1277,9 +1278,57 @@ def cmd_dashboard(argv: list[str]) -> int:
|
||||
curses.wrapper(_main_loop)
|
||||
except KeyboardInterrupt:
|
||||
return 130
|
||||
except Die as e:
|
||||
# die() printed the reason to stderr, but that happened while
|
||||
# curses owned the terminal — the text landed on the alternate
|
||||
# screen and was wiped when the terminal was restored. Re-surface
|
||||
# it now that we're back on the normal screen.
|
||||
if e.message:
|
||||
error(e.message)
|
||||
else:
|
||||
error("dashboard exited on a fatal error (no detail captured).")
|
||||
return e.code if isinstance(e.code, int) else 1
|
||||
except Exception as e:
|
||||
# Any other crash inside the TUI. The traceback would otherwise
|
||||
# vanish with the alternate screen, so persist it and tell the
|
||||
# operator where to look.
|
||||
log_path = _write_crash_log(e)
|
||||
error(f"dashboard crashed: {type(e).__name__}: {e}")
|
||||
error(f"full traceback written to {log_path}")
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
def _write_crash_log(exc: BaseException) -> Path:
|
||||
"""Persist `exc`'s traceback to a stable file under ~/.bot-bottle/
|
||||
and return its path.
|
||||
|
||||
The dashboard runs under curses, so a crash's stderr/traceback is
|
||||
painted onto the alternate screen and lost when the terminal is
|
||||
restored — this leaves the operator a durable record of *why* it
|
||||
died. Best-effort: falls back to a tempfile if the home dir can't
|
||||
be written."""
|
||||
stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
body = "".join(
|
||||
traceback.format_exception(type(exc), exc, exc.__traceback__)
|
||||
)
|
||||
entry = f"=== dashboard crash {stamp} ===\n{body}\n"
|
||||
try:
|
||||
log_dir = _supervise.bot_bottle_root() / "logs"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
path = log_dir / "dashboard-crash.log"
|
||||
with path.open("a", encoding="utf-8") as fh:
|
||||
fh.write(entry)
|
||||
return path
|
||||
except OSError:
|
||||
fd, tmp = tempfile.mkstemp(
|
||||
prefix="bot-bottle-dashboard-crash-", suffix=".log",
|
||||
)
|
||||
with os.fdopen(fd, "w", encoding="utf-8") as fh:
|
||||
fh.write(entry)
|
||||
return Path(tmp)
|
||||
|
||||
|
||||
def _list_once() -> int:
|
||||
pending = discover_pending()
|
||||
if not pending:
|
||||
@@ -1407,8 +1456,19 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
|
||||
if manifest_cache[0] is None:
|
||||
manifest_cache[0] = Manifest.resolve(USER_CWD, missing_ok=True)
|
||||
return manifest_cache[0]
|
||||
if not _get_manifest().bottles and not _get_manifest().agents:
|
||||
status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty"
|
||||
# A malformed manifest must not take the whole dashboard down — the
|
||||
# operator may just be watching running bottles. Degrade to a
|
||||
# status-line warning. Die is a SystemExit (not an Exception), so it
|
||||
# has to be caught explicitly or it escapes the loop and crashes.
|
||||
try:
|
||||
_loaded = _get_manifest()
|
||||
except Die as e:
|
||||
status_line = f"config error: {e.message or 'malformed manifest'}"
|
||||
except Exception as e:
|
||||
status_line = f"config load failed: {e}"
|
||||
else:
|
||||
if not _loaded.bottles and not _loaded.agents:
|
||||
status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty"
|
||||
# First-tick guard: a brand-new dashboard finds any
|
||||
# pre-existing queue entries on its first poll; those
|
||||
# shouldn't ring the bell as if they just arrived.
|
||||
@@ -1494,6 +1554,11 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
|
||||
# bottle running.
|
||||
try:
|
||||
manifest = _get_manifest()
|
||||
except Die as e:
|
||||
# Config error (Die is a SystemExit, missed by the
|
||||
# except-Exception below). Surface the reason inline.
|
||||
status_line = f"config error: {e.message or 'malformed manifest'}"
|
||||
continue
|
||||
except Exception as e:
|
||||
status_line = f"manifest load failed: {e}"
|
||||
continue
|
||||
|
||||
+15
-3
@@ -14,11 +14,23 @@ def warn(msg: str) -> None:
|
||||
print(f"bot-bottle: warning: {msg}", file=sys.stderr)
|
||||
|
||||
|
||||
def error(msg: str) -> None:
|
||||
print(f"bot-bottle: error: {msg}", file=sys.stderr)
|
||||
|
||||
|
||||
class Die(SystemExit):
|
||||
"""Raised by die() so callers (and tests) can distinguish a deliberate
|
||||
fatal exit from an unrelated SystemExit."""
|
||||
fatal exit from an unrelated SystemExit.
|
||||
|
||||
Carries the human-facing message so a caller that suppressed stderr
|
||||
— e.g. the curses dashboard, whose alternate screen is wiped when the
|
||||
terminal is restored — can re-surface the reason after the fact."""
|
||||
|
||||
def __init__(self, code: int = 1, message: str = "") -> None:
|
||||
super().__init__(code)
|
||||
self.message = message
|
||||
|
||||
|
||||
def die(msg: str) -> NoReturn:
|
||||
print(f"bot-bottle: error: {msg}", file=sys.stderr)
|
||||
raise Die(1)
|
||||
error(msg)
|
||||
raise Die(1, msg)
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
"""Unit: dashboard launch/crash failure logging (issue #100).
|
||||
|
||||
The dashboard runs under curses, so anything written to stderr while the
|
||||
TUI owns the terminal is wiped when the terminal is restored. These
|
||||
tests lock the recovery paths: a config error (`Die`) is re-surfaced
|
||||
after the wrapper returns, and an unexpected crash is persisted to a
|
||||
log file the operator can read.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import io
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from bot_bottle import supervise
|
||||
from bot_bottle.cli import dashboard
|
||||
from bot_bottle.log import Die, die
|
||||
|
||||
|
||||
class TestDieCarriesMessage(unittest.TestCase):
|
||||
def test_die_attaches_message_and_code(self):
|
||||
buf = io.StringIO()
|
||||
with contextlib.redirect_stderr(buf):
|
||||
with self.assertRaises(Die) as cm:
|
||||
die("bad manifest: unknown key 'foo'")
|
||||
self.assertEqual("bad manifest: unknown key 'foo'", cm.exception.message)
|
||||
self.assertEqual(1, cm.exception.code)
|
||||
self.assertIn(
|
||||
"bot-bottle: error: bad manifest: unknown key 'foo'", buf.getvalue()
|
||||
)
|
||||
|
||||
def test_die_default_message_is_empty(self):
|
||||
self.assertEqual("", Die(1).message)
|
||||
self.assertEqual(1, Die(1).code)
|
||||
|
||||
|
||||
class _FakeHomeMixin:
|
||||
"""Point supervise.bot_bottle_root (what _write_crash_log resolves
|
||||
through) at a temp dir so the crash log doesn't touch the real
|
||||
~/.bot-bottle."""
|
||||
|
||||
def _setup_fake_home(self):
|
||||
self._tmp = tempfile.TemporaryDirectory(prefix="dash-crash-test.")
|
||||
self._orig_root = supervise.bot_bottle_root
|
||||
self._root = Path(self._tmp.name) / ".bot-bottle"
|
||||
supervise.bot_bottle_root = lambda: self._root # type: ignore[assignment]
|
||||
|
||||
def _teardown_fake_home(self):
|
||||
supervise.bot_bottle_root = self._orig_root # type: ignore[assignment]
|
||||
self._tmp.cleanup()
|
||||
|
||||
|
||||
class TestCmdDashboardErrorPaths(_FakeHomeMixin, unittest.TestCase):
|
||||
def setUp(self):
|
||||
self._setup_fake_home()
|
||||
|
||||
def tearDown(self):
|
||||
self._teardown_fake_home()
|
||||
|
||||
def test_keyboard_interrupt_returns_130(self):
|
||||
with mock.patch.object(
|
||||
dashboard.curses, "wrapper", side_effect=KeyboardInterrupt
|
||||
):
|
||||
self.assertEqual(130, dashboard.cmd_dashboard([]))
|
||||
|
||||
def test_die_resurfaces_message_after_curses(self):
|
||||
buf = io.StringIO()
|
||||
with mock.patch.object(
|
||||
dashboard.curses, "wrapper",
|
||||
side_effect=Die(1, "manifest parse error at line 3"),
|
||||
):
|
||||
with contextlib.redirect_stderr(buf):
|
||||
rc = dashboard.cmd_dashboard([])
|
||||
self.assertEqual(1, rc)
|
||||
self.assertIn("manifest parse error at line 3", buf.getvalue())
|
||||
|
||||
def test_die_without_message_has_fallback(self):
|
||||
buf = io.StringIO()
|
||||
with mock.patch.object(dashboard.curses, "wrapper", side_effect=Die(1)):
|
||||
with contextlib.redirect_stderr(buf):
|
||||
rc = dashboard.cmd_dashboard([])
|
||||
self.assertEqual(1, rc)
|
||||
self.assertIn("fatal error", buf.getvalue())
|
||||
|
||||
def test_unexpected_exception_writes_crash_log(self):
|
||||
buf = io.StringIO()
|
||||
with mock.patch.object(
|
||||
dashboard.curses, "wrapper",
|
||||
side_effect=ValueError("kaboom in render"),
|
||||
):
|
||||
with contextlib.redirect_stderr(buf):
|
||||
rc = dashboard.cmd_dashboard([])
|
||||
self.assertEqual(1, rc)
|
||||
out = buf.getvalue()
|
||||
self.assertIn("dashboard crashed: ValueError: kaboom in render", out)
|
||||
self.assertIn("full traceback written to", out)
|
||||
log_path = self._root / "logs" / "dashboard-crash.log"
|
||||
self.assertTrue(log_path.exists())
|
||||
content = log_path.read_text()
|
||||
self.assertIn("kaboom in render", content)
|
||||
self.assertIn("Traceback (most recent call last)", content)
|
||||
|
||||
|
||||
class TestWriteCrashLog(_FakeHomeMixin, unittest.TestCase):
|
||||
def setUp(self):
|
||||
self._setup_fake_home()
|
||||
|
||||
def tearDown(self):
|
||||
self._teardown_fake_home()
|
||||
|
||||
def test_appends_traceback_with_header(self):
|
||||
try:
|
||||
raise RuntimeError("explode")
|
||||
except RuntimeError as e:
|
||||
path = dashboard._write_crash_log(e)
|
||||
self.assertEqual(self._root / "logs" / "dashboard-crash.log", path)
|
||||
text = path.read_text()
|
||||
self.assertIn("=== dashboard crash", text)
|
||||
self.assertIn("RuntimeError: explode", text)
|
||||
|
||||
def test_falls_back_to_tempfile_when_home_unwritable(self):
|
||||
# bot_bottle_root points at a *file*, so mkdir under it raises
|
||||
# OSError and the helper must fall back to a tempfile.
|
||||
bad = Path(self._tmp.name) / "not-a-dir"
|
||||
bad.write_text("x")
|
||||
supervise.bot_bottle_root = lambda: bad # type: ignore[assignment]
|
||||
try:
|
||||
raise RuntimeError("explode2")
|
||||
except RuntimeError as e:
|
||||
path = dashboard._write_crash_log(e)
|
||||
self.assertTrue(path.exists())
|
||||
self.assertIn("explode2", path.read_text())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user