fix(dashboard): surface launch/crash failures (#100)
test / unit (pull_request) Successful in 29s
test / integration (pull_request) Successful in 43s

The dashboard runs under curses.wrapper and cmd_dashboard only caught
KeyboardInterrupt, so failures vanished:
- die() prints to stderr, but under curses that lands on the alternate
  screen and is wiped on exit, so config errors gave no reason.
- Die is a SystemExit, so the new-agent flow's `except Exception` never
  caught config errors; they crashed the TUI.
- the startup manifest probe was unguarded.

Now: Die carries its message (+ log.error()); cmd_dashboard re-surfaces
a Die's reason once the terminal is restored and writes any other
crash's traceback to ~/.bot-bottle/logs/dashboard-crash.log; the startup
probe and the new-agent flow degrade a bad config to a status-line
warning instead of crashing.

Closes #100

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 23:49:21 -04:00
parent 848515e5d4
commit 99ec267c74
3 changed files with 223 additions and 6 deletions
+68 -3
View File
@@ -21,6 +21,7 @@ import subprocess
import sys
import tempfile
import time
import traceback
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
@@ -52,7 +53,7 @@ from ..backend.docker.pipelock_apply import (
parse_allowlist_content,
render_allowlist_content,
)
from ..log import info
from ..log import Die, error, info
from ..manifest import Manifest
from ..supervise import (
ACTION_OPERATOR_EDIT,
@@ -1277,9 +1278,57 @@ def cmd_dashboard(argv: list[str]) -> int:
curses.wrapper(_main_loop)
except KeyboardInterrupt:
return 130
except Die as e:
# die() printed the reason to stderr, but that happened while
# curses owned the terminal — the text landed on the alternate
# screen and was wiped when the terminal was restored. Re-surface
# it now that we're back on the normal screen.
if e.message:
error(e.message)
else:
error("dashboard exited on a fatal error (no detail captured).")
return e.code if isinstance(e.code, int) else 1
except Exception as e:
# Any other crash inside the TUI. The traceback would otherwise
# vanish with the alternate screen, so persist it and tell the
# operator where to look.
log_path = _write_crash_log(e)
error(f"dashboard crashed: {type(e).__name__}: {e}")
error(f"full traceback written to {log_path}")
return 1
return 0
def _write_crash_log(exc: BaseException) -> Path:
"""Persist `exc`'s traceback to a stable file under ~/.bot-bottle/
and return its path.
The dashboard runs under curses, so a crash's stderr/traceback is
painted onto the alternate screen and lost when the terminal is
restored — this leaves the operator a durable record of *why* it
died. Best-effort: falls back to a tempfile if the home dir can't
be written."""
stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
body = "".join(
traceback.format_exception(type(exc), exc, exc.__traceback__)
)
entry = f"=== dashboard crash {stamp} ===\n{body}\n"
try:
log_dir = _supervise.bot_bottle_root() / "logs"
log_dir.mkdir(parents=True, exist_ok=True)
path = log_dir / "dashboard-crash.log"
with path.open("a", encoding="utf-8") as fh:
fh.write(entry)
return path
except OSError:
fd, tmp = tempfile.mkstemp(
prefix="bot-bottle-dashboard-crash-", suffix=".log",
)
with os.fdopen(fd, "w", encoding="utf-8") as fh:
fh.write(entry)
return Path(tmp)
def _list_once() -> int:
pending = discover_pending()
if not pending:
@@ -1407,8 +1456,19 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
if manifest_cache[0] is None:
manifest_cache[0] = Manifest.resolve(USER_CWD, missing_ok=True)
return manifest_cache[0]
if not _get_manifest().bottles and not _get_manifest().agents:
status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty"
# A malformed manifest must not take the whole dashboard down — the
# operator may just be watching running bottles. Degrade to a
# status-line warning. Die is a SystemExit (not an Exception), so it
# has to be caught explicitly or it escapes the loop and crashes.
try:
_loaded = _get_manifest()
except Die as e:
status_line = f"config error: {e.message or 'malformed manifest'}"
except Exception as e:
status_line = f"config load failed: {e}"
else:
if not _loaded.bottles and not _loaded.agents:
status_line = "warning: no bot-bottle config/agents found; new-agent picker is empty"
# First-tick guard: a brand-new dashboard finds any
# pre-existing queue entries on its first poll; those
# shouldn't ring the bell as if they just arrived.
@@ -1494,6 +1554,11 @@ def _main_loop(stdscr: "curses._CursesWindow") -> None:
# bottle running.
try:
manifest = _get_manifest()
except Die as e:
# Config error (Die is a SystemExit, missed by the
# except-Exception below). Surface the reason inline.
status_line = f"config error: {e.message or 'malformed manifest'}"
continue
except Exception as e:
status_line = f"manifest load failed: {e}"
continue
+15 -3
View File
@@ -14,11 +14,23 @@ def warn(msg: str) -> None:
print(f"bot-bottle: warning: {msg}", file=sys.stderr)
def error(msg: str) -> None:
print(f"bot-bottle: error: {msg}", file=sys.stderr)
class Die(SystemExit):
"""Raised by die() so callers (and tests) can distinguish a deliberate
fatal exit from an unrelated SystemExit."""
fatal exit from an unrelated SystemExit.
Carries the human-facing message so a caller that suppressed stderr
e.g. the curses dashboard, whose alternate screen is wiped when the
terminal is restored can re-surface the reason after the fact."""
def __init__(self, code: int = 1, message: str = "") -> None:
super().__init__(code)
self.message = message
def die(msg: str) -> NoReturn:
print(f"bot-bottle: error: {msg}", file=sys.stderr)
raise Die(1)
error(msg)
raise Die(1, msg)