Compare commits

..

1 Commits

Author SHA1 Message Date
didericis-claude 99ec267c74 fix(dashboard): surface launch/crash failures (#100)
test / unit (pull_request) Successful in 29s
test / integration (pull_request) Successful in 43s
The dashboard runs under curses.wrapper and cmd_dashboard only caught
KeyboardInterrupt, so failures vanished:
- die() prints to stderr, but under curses that lands on the alternate
  screen and is wiped on exit, so config errors gave no reason.
- Die is a SystemExit, so the new-agent flow's `except Exception` never
  caught config errors; they crashed the TUI.
- the startup manifest probe was unguarded.

Now: Die carries its message (+ log.error()); cmd_dashboard re-surfaces
a Die's reason once the terminal is restored and writes any other
crash's traceback to ~/.bot-bottle/logs/dashboard-crash.log; the startup
probe and the new-agent flow degrade a bad config to a status-line
warning instead of crashing.

Closes #100

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-28 23:49:21 -04:00
3 changed files with 223 additions and 6 deletions
+67 -2
View File
@@ -21,6 +21,7 @@ import subprocess
import sys import sys
import tempfile import tempfile
import time import time
import traceback
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
@@ -52,7 +53,7 @@ from ..backend.docker.pipelock_apply import (
parse_allowlist_content, parse_allowlist_content,
render_allowlist_content, render_allowlist_content,
) )
from ..log import info from ..log import Die, error, info
from ..manifest import Manifest from ..manifest import Manifest
from ..supervise import ( from ..supervise import (
ACTION_OPERATOR_EDIT, ACTION_OPERATOR_EDIT,
@@ -1277,9 +1278,57 @@ def cmd_dashboard(argv: list[str]) -> int:
curses.wrapper(_main_loop) curses.wrapper(_main_loop)
except KeyboardInterrupt: except KeyboardInterrupt:
return 130 return 130
except Die as e:
# die() printed the reason to stderr, but that happened while
# curses owned the terminal — the text landed on the alternate
# screen and was wiped when the terminal was restored. Re-surface
# it now that we're back on the normal screen.
if e.message:
error(e.message)
else:
error("dashboard exited on a fatal error (no detail captured).")
return e.code if isinstance(e.code, int) else 1
except Exception as e:
# Any other crash inside the TUI. The traceback would otherwise
# vanish with the alternate screen, so persist it and tell the
# operator where to look.
log_path = _write_crash_log(e)
error(f"dashboard crashed: {type(e).__name__}: {e}")
error(f"full traceback written to {log_path}")
return 1
return 0 return 0
def _write_crash_log(exc: BaseException) -> Path:
"""Persist `exc`'s traceback to a stable file under ~/.bot-bottle/
and return its path.
The dashboard runs under curses, so a crash's stderr/traceback is
painted onto the alternate screen and lost when the terminal is
restored — this leaves the operator a durable record of *why* it
died. Best-effort: falls back to a tempfile if the home dir can't
be written."""
stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
body = "".join(
traceback.format_exception(type(exc), exc, exc.__traceback__)
)
entry = f"=== dashboard crash {stamp} ===\n{body}\n"
try:
log_dir = _supervise.bot_bottle_root() / "logs"
log_dir.mkdir(parents=True, exist_ok=True)
path = log_dir / "dashboard-crash.log"
with path.open("a", encoding="utf-8") as fh:
fh.write(entry)
return path
except OSError:
fd, tmp = tempfile.mkstemp(
prefix="bot-bottle-dashboard-crash-", suffix=".log",
)
with os.fdopen(fd, "w", encoding="utf-8") as fh:
fh.write(entry)
return Path(tmp)
def _list_once() -> int: def _list_once() -> int:
pending = discover_pending() pending = discover_pending()
if not pending: if not pending:
@@ -1407,7 +1456,18 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
if manifest_cache[0] is None: if manifest_cache[0] is None:
manifest_cache[0] = Manifest.resolve(USER_CWD, missing_ok=True) manifest_cache[0] = Manifest.resolve(USER_CWD, missing_ok=True)
return manifest_cache[0] return manifest_cache[0]
if not _get_manifest().bottles and not _get_manifest().agents: # A malformed manifest must not take the whole dashboard down — the
# operator may just be watching running bottles. Degrade to a
# status-line warning. Die is a SystemExit (not an Exception), so it
# has to be caught explicitly or it escapes the loop and crashes.
try:
_loaded = _get_manifest()
except Die as e:
status_line = f"config error: {e.message or 'malformed manifest'}"
except Exception as e:
status_line = f"config load failed: {e}"
else:
if not _loaded.bottles and not _loaded.agents:
status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty" status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty"
# First-tick guard: a brand-new dashboard finds any # First-tick guard: a brand-new dashboard finds any
# pre-existing queue entries on its first poll; those # pre-existing queue entries on its first poll; those
@@ -1494,6 +1554,11 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
# bottle running. # bottle running.
try: try:
manifest = _get_manifest() manifest = _get_manifest()
except Die as e:
# Config error (Die is a SystemExit, missed by the
# except-Exception below). Surface the reason inline.
status_line = f"config error: {e.message or 'malformed manifest'}"
continue
except Exception as e: except Exception as e:
status_line = f"manifest load failed: {e}" status_line = f"manifest load failed: {e}"
continue continue
+15 -3
View File
@@ -14,11 +14,23 @@ def warn(msg: str) -> None:
print(f"bot-bottle: warning: {msg}", file=sys.stderr) print(f"bot-bottle: warning: {msg}", file=sys.stderr)
def error(msg: str) -> None:
print(f"bot-bottle: error: {msg}", file=sys.stderr)
class Die(SystemExit): class Die(SystemExit):
"""Raised by die() so callers (and tests) can distinguish a deliberate """Raised by die() so callers (and tests) can distinguish a deliberate
fatal exit from an unrelated SystemExit.""" fatal exit from an unrelated SystemExit.
Carries the human-facing message so a caller that suppressed stderr
e.g. the curses dashboard, whose alternate screen is wiped when the
terminal is restored can re-surface the reason after the fact."""
def __init__(self, code: int = 1, message: str = "") -> None:
super().__init__(code)
self.message = message
def die(msg: str) -> NoReturn: def die(msg: str) -> NoReturn:
print(f"bot-bottle: error: {msg}", file=sys.stderr) error(msg)
raise Die(1) raise Die(1, msg)
+140
View File
@@ -0,0 +1,140 @@
"""Unit: dashboard launch/crash failure logging (issue #100).
The dashboard runs under curses, so anything written to stderr while the
TUI owns the terminal is wiped when the terminal is restored. These
tests lock the recovery paths: a config error (`Die`) is re-surfaced
after the wrapper returns, and an unexpected crash is persisted to a
log file the operator can read.
"""
from __future__ import annotations
import contextlib
import io
import tempfile
import unittest
from pathlib import Path
from unittest import mock
from bot_bottle import supervise
from bot_bottle.cli import dashboard
from bot_bottle.log import Die, die
class TestDieCarriesMessage(unittest.TestCase):
def test_die_attaches_message_and_code(self):
buf = io.StringIO()
with contextlib.redirect_stderr(buf):
with self.assertRaises(Die) as cm:
die("bad manifest: unknown key 'foo'")
self.assertEqual("bad manifest: unknown key 'foo'", cm.exception.message)
self.assertEqual(1, cm.exception.code)
self.assertIn(
"bot-bottle: error: bad manifest: unknown key 'foo'", buf.getvalue()
)
def test_die_default_message_is_empty(self):
self.assertEqual("", Die(1).message)
self.assertEqual(1, Die(1).code)
class _FakeHomeMixin:
"""Point supervise.bot_bottle_root (what _write_crash_log resolves
through) at a temp dir so the crash log doesn't touch the real
~/.bot-bottle."""
def _setup_fake_home(self):
self._tmp = tempfile.TemporaryDirectory(prefix="dash-crash-test.")
self._orig_root = supervise.bot_bottle_root
self._root = Path(self._tmp.name) / ".bot-bottle"
supervise.bot_bottle_root = lambda: self._root # type: ignore[assignment]
def _teardown_fake_home(self):
supervise.bot_bottle_root = self._orig_root # type: ignore[assignment]
self._tmp.cleanup()
class TestCmdDashboardErrorPaths(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_keyboard_interrupt_returns_130(self):
with mock.patch.object(
dashboard.curses, "wrapper", side_effect=KeyboardInterrupt
):
self.assertEqual(130, dashboard.cmd_dashboard([]))
def test_die_resurfaces_message_after_curses(self):
buf = io.StringIO()
with mock.patch.object(
dashboard.curses, "wrapper",
side_effect=Die(1, "manifest parse error at line 3"),
):
with contextlib.redirect_stderr(buf):
rc = dashboard.cmd_dashboard([])
self.assertEqual(1, rc)
self.assertIn("manifest parse error at line 3", buf.getvalue())
def test_die_without_message_has_fallback(self):
buf = io.StringIO()
with mock.patch.object(dashboard.curses, "wrapper", side_effect=Die(1)):
with contextlib.redirect_stderr(buf):
rc = dashboard.cmd_dashboard([])
self.assertEqual(1, rc)
self.assertIn("fatal error", buf.getvalue())
def test_unexpected_exception_writes_crash_log(self):
buf = io.StringIO()
with mock.patch.object(
dashboard.curses, "wrapper",
side_effect=ValueError("kaboom in render"),
):
with contextlib.redirect_stderr(buf):
rc = dashboard.cmd_dashboard([])
self.assertEqual(1, rc)
out = buf.getvalue()
self.assertIn("dashboard crashed: ValueError: kaboom in render", out)
self.assertIn("full traceback written to", out)
log_path = self._root / "logs" / "dashboard-crash.log"
self.assertTrue(log_path.exists())
content = log_path.read_text()
self.assertIn("kaboom in render", content)
self.assertIn("Traceback (most recent call last)", content)
class TestWriteCrashLog(_FakeHomeMixin, unittest.TestCase):
def setUp(self):
self._setup_fake_home()
def tearDown(self):
self._teardown_fake_home()
def test_appends_traceback_with_header(self):
try:
raise RuntimeError("explode")
except RuntimeError as e:
path = dashboard._write_crash_log(e)
self.assertEqual(self._root / "logs" / "dashboard-crash.log", path)
text = path.read_text()
self.assertIn("=== dashboard crash", text)
self.assertIn("RuntimeError: explode", text)
def test_falls_back_to_tempfile_when_home_unwritable(self):
# bot_bottle_root points at a *file*, so mkdir under it raises
# OSError and the helper must fall back to a tempfile.
bad = Path(self._tmp.name) / "not-a-dir"
bad.write_text("x")
supervise.bot_bottle_root = lambda: bad # type: ignore[assignment]
try:
raise RuntimeError("explode2")
except RuntimeError as e:
path = dashboard._write_crash_log(e)
self.assertTrue(path.exists())
self.assertIn("explode2", path.read_text())
if __name__ == "__main__":
unittest.main()