fix(dashboard): surface launch/crash failures (#100)
test / unit (pull_request) Successful in 29s
test / integration (pull_request) Successful in 43s

The dashboard runs under curses.wrapper and cmd_dashboard only caught
KeyboardInterrupt, so failures vanished:
- die() prints to stderr, but under curses that lands on the alternate
  screen and is wiped on exit, so config errors gave no reason.
- Die is a SystemExit, so the new-agent flow's `except Exception` never
  caught config errors; they crashed the TUI.
- the startup manifest probe was unguarded.

Now: Die carries its message (+ log.error()); cmd_dashboard re-surfaces
a Die's reason once the terminal is restored and writes any other
crash's traceback to ~/.bot-bottle/logs/dashboard-crash.log; the startup
probe and the new-agent flow degrade a bad config to a status-line
warning instead of crashing.

Closes #100

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 23:49:21 -04:00
parent 848515e5d4
commit 99ec267c74
3 changed files with 223 additions and 6 deletions
+68 -3
View File
@@ -21,6 +21,7 @@ import subprocess
import sys
import tempfile
import time
import traceback
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
@@ -52,7 +53,7 @@ from ..backend.docker.pipelock_apply import (
parse_allowlist_content,
render_allowlist_content,
)
from ..log import info
from ..log import Die, error, info
from ..manifest import Manifest
from ..supervise import (
ACTION_OPERATOR_EDIT,
@@ -1277,9 +1278,57 @@ def cmd_dashboard(argv: list[str]) -> int:
curses.wrapper(_main_loop)
except KeyboardInterrupt:
return 130
except Die as e:
# die() printed the reason to stderr, but that happened while
# curses owned the terminal — the text landed on the alternate
# screen and was wiped when the terminal was restored. Re-surface
# it now that we're back on the normal screen.
if e.message:
error(e.message)
else:
error("dashboard exited on a fatal error (no detail captured).")
return e.code if isinstance(e.code, int) else 1
except Exception as e:
# Any other crash inside the TUI. The traceback would otherwise
# vanish with the alternate screen, so persist it and tell the
# operator where to look.
log_path = _write_crash_log(e)
error(f"dashboard crashed: {type(e).__name__}: {e}")
error(f"full traceback written to {log_path}")
return 1
return 0
def _write_crash_log(exc: BaseException) -> Path:
"""Persist `exc`'s traceback to a stable file under ~/.bot-bottle/
and return its path.
The dashboard runs under curses, so a crash's stderr/traceback is
painted onto the alternate screen and lost when the terminal is
restored — this leaves the operator a durable record of *why* it
died. Best-effort: falls back to a tempfile if the home dir can't
be written."""
stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
body = "".join(
traceback.format_exception(type(exc), exc, exc.__traceback__)
)
entry = f"=== dashboard crash {stamp} ===\n{body}\n"
try:
log_dir = _supervise.bot_bottle_root() / "logs"
log_dir.mkdir(parents=True, exist_ok=True)
path = log_dir / "dashboard-crash.log"
with path.open("a", encoding="utf-8") as fh:
fh.write(entry)
return path
except OSError:
fd, tmp = tempfile.mkstemp(
prefix="bot-bottle-dashboard-crash-", suffix=".log",
)
with os.fdopen(fd, "w", encoding="utf-8") as fh:
fh.write(entry)
return Path(tmp)
def _list_once() -> int:
pending = discover_pending()
if not pending:
@@ -1407,8 +1456,19 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
if manifest_cache[0] is None:
manifest_cache[0] = Manifest.resolve(USER_CWD, missing_ok=True)
return manifest_cache[0]
if not _get_manifest().bottles and not _get_manifest().agents:
status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty"
# A malformed manifest must not take the whole dashboard down — the
# operator may just be watching running bottles. Degrade to a
# status-line warning. Die is a SystemExit (not an Exception), so it
# has to be caught explicitly or it escapes the loop and crashes.
try:
_loaded = _get_manifest()
except Die as e:
status_line = f"config error: {e.message or 'malformed manifest'}"
except Exception as e:
status_line = f"config load failed: {e}"
else:
if not _loaded.bottles and not _loaded.agents:
status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty"
# First-tick guard: a brand-new dashboard finds any
# pre-existing queue entries on its first poll; those
# shouldn't ring the bell as if they just arrived.
@@ -1494,6 +1554,11 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
# bottle running.
try:
manifest = _get_manifest()
except Die as e:
# Config error (Die is a SystemExit, missed by the
# except-Exception below). Surface the reason inline.
status_line = f"config error: {e.message or 'malformed manifest'}"
continue
except Exception as e:
status_line = f"manifest load failed: {e}"
continue
+15 -3
View File
@@ -14,11 +14,23 @@ def warn(msg: str) -> None:
print(f"bot-bottle: warning: {msg}", file=sys.stderr)
def error(msg: str) -> None:
print(f"bot-bottle: error: {msg}", file=sys.stderr)
class Die(SystemExit):
"""Raised by die() so callers (and tests) can distinguish a deliberate
fatal exit from an unrelated SystemExit."""
fatal exit from an unrelated SystemExit.
Carries the human-facing message so a caller that suppressed stderr
e.g. the curses dashboard, whose alternate screen is wiped when the
terminal is restored can re-surface the reason after the fact."""
def __init__(self, code: int = 1, message: str = "") -> None:
super().__init__(code)
self.message = message
def die(msg: str) -> NoReturn:
print(f"bot-bottle: error: {msg}", file=sys.stderr)
raise Die(1)
error(msg)
raise Die(1, msg)
+140
View File
@@ -0,0 +1,140 @@
"""Unit: dashboard launch/crash failure logging (issue #100).
The dashboard runs under curses, so anything written to stderr while the
TUI owns the terminal is wiped when the terminal is restored. These
tests lock the recovery paths: a config error (`Die`) is re-surfaced
after the wrapper returns, and an unexpected crash is persisted to a
log file the operator can read.
"""
from __future__ import annotations
import contextlib
import io
import tempfile
import unittest
from pathlib import Path
from unittest import mock
from bot_bottle import supervise
from bot_bottle.cli import dashboard
from bot_bottle.log import Die, die
class TestDieCarriesMessage(unittest.TestCase):
def test_die_attaches_message_and_code(self):
buf = io.StringIO()
with contextlib.redirect_stderr(buf):
with self.assertRaises(Die) as cm:
die("bad manifest: unknown key 'foo'")
self.assertEqual("bad manifest: unknown key 'foo'", cm.exception.message)
self.assertEqual(1, cm.exception.code)
self.assertIn(
"bot-bottle: error: bad manifest: unknown key 'foo'", buf.getvalue()
)
def test_die_default_message_is_empty(self):
self.assertEqual("", Die(1).message)
self.assertEqual(1, Die(1).code)
class _FakeHomeMixin:
"""Point supervise.bot_bottle_root (what _write_crash_log resolves
through) at a temp dir so the crash log doesn't touch the real
~/.bot-bottle."""
def _setup_fake_home(self):
self._tmp = tempfile.TemporaryDirectory(prefix="dash-crash-test.")
self._orig_root = supervise.bot_bottle_root
self._root = Path(self._tmp.name) / ".bot-bottle"
supervise.bot_bottle_root = lambda: self._root # type: ignore[assignment]
def _teardown_fake_home(self):
supervise.bot_bottle_root = self._orig_root # type: ignore[assignment]
self._tmp.cleanup()
class TestCmdDashboardErrorPaths(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_keyboard_interrupt_returns_130(self):
with mock.patch.object(
dashboard.curses, "wrapper", side_effect=KeyboardInterrupt
):
self.assertEqual(130, dashboard.cmd_dashboard([]))
def test_die_resurfaces_message_after_curses(self):
buf = io.StringIO()
with mock.patch.object(
dashboard.curses, "wrapper",
side_effect=Die(1, "manifest parse error at line 3"),
):
with contextlib.redirect_stderr(buf):
rc = dashboard.cmd_dashboard([])
self.assertEqual(1, rc)
self.assertIn("manifest parse error at line 3", buf.getvalue())
def test_die_without_message_has_fallback(self):
buf = io.StringIO()
with mock.patch.object(dashboard.curses, "wrapper", side_effect=Die(1)):
with contextlib.redirect_stderr(buf):
rc = dashboard.cmd_dashboard([])
self.assertEqual(1, rc)
self.assertIn("fatal error", buf.getvalue())
def test_unexpected_exception_writes_crash_log(self):
buf = io.StringIO()
with mock.patch.object(
dashboard.curses, "wrapper",
side_effect=ValueError("kaboom in render"),
):
with contextlib.redirect_stderr(buf):
rc = dashboard.cmd_dashboard([])
self.assertEqual(1, rc)
out = buf.getvalue()
self.assertIn("dashboard crashed: ValueError: kaboom in render", out)
self.assertIn("full traceback written to", out)
log_path = self._root / "logs" / "dashboard-crash.log"
self.assertTrue(log_path.exists())
content = log_path.read_text()
self.assertIn("kaboom in render", content)
self.assertIn("Traceback (most recent call last)", content)
class TestWriteCrashLog(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_appends_traceback_with_header(self):
try:
raise RuntimeError("explode")
except RuntimeError as e:
path = dashboard._write_crash_log(e)
self.assertEqual(self._root / "logs" / "dashboard-crash.log", path)
text = path.read_text()
self.assertIn("=== dashboard crash", text)
self.assertIn("RuntimeError: explode", text)
def test_falls_back_to_tempfile_when_home_unwritable(self):
# bot_bottle_root points at a *file*, so mkdir under it raises
# OSError and the helper must fall back to a tempfile.
bad = Path(self._tmp.name) / "not-a-dir"
bad.write_text("x")
supervise.bot_bottle_root = lambda: bad # type: ignore[assignment]
try:
raise RuntimeError("explode2")
except RuntimeError as e:
path = dashboard._write_crash_log(e)
self.assertTrue(path.exists())
self.assertIn("explode2", path.read_text())
if __name__ == "__main__":
unittest.main()