Defer broken manifest parse errors to preflight #239

Merged
didericis merged 8 commits from lazy-manifest-parse-on-select into main 2026-06-22 23:59:01 -04:00
7 changed files with 277 additions and 62 deletions
Showing only changes of commit 3375df3f52 - Show all commits
+1 -1
View File
@@ -67,7 +67,7 @@ def cmd_start(argv: list[str]) -> int:
agent_name: str | None = args.name
if agent_name is None:
agent_name = tui.filter_select(
sorted(manifest.agents.keys()),
manifest.all_agent_names,
title="Select agent",
)
if agent_name is None:
+33 -6
View File
@@ -193,6 +193,9 @@ class ManifestBottle:
class Manifest:
bottles: Mapping[str, ManifestBottle]
agents: Mapping[str, ManifestAgent]
Outdated
Review

Why are ALL the bottles and ALL the agents needed for a manifest?

Why are ALL the bottles and ALL the agents needed for a manifest?
# Agents (and agents referencing broken bottles) that failed to load.
# Their errors are deferred to preflight rather than raised at load time.
broken_agents: Mapping[str, ManifestError] = field(default_factory=dict)
@classmethod
def resolve(cls, cwd: str, *, missing_ok: bool = False) -> "Manifest":
@@ -256,16 +259,23 @@ class Manifest:
name collision. A `bottles/` subdir under `cwd_dir` is
logged as a warning and ignored.
Per-file parse errors are deferred into `broken_agents` rather
than raised, so a broken bottle or agent only fails at preflight
when that specific agent is selected for launch.
Used by tests to build a Manifest from fixture directories
without touching `os.environ`."""
bottles_dir = home_dir / "bottles"
from .manifest_loader import load_agents_from_dir, load_bottles_from_dir
bottles = load_bottles_from_dir(bottles_dir)
bottles, broken_bottle_errors = load_bottles_from_dir(bottles_dir)
Outdated
Review

Don't load all the bottles at this point anymore: wait to load the full bottle manifest (andly only for the relevant bottle) and catch errors during preflight. This should just load bottle names.

Don't load all the bottles at this point anymore: wait to load the full bottle manifest (andly only for the relevant bottle) and catch errors during preflight. This should just load bottle names.
bottle_names = set(bottles.keys())
agents_dir = home_dir / "agents"
agents = load_agents_from_dir(agents_dir, bottle_names, source="$HOME")
agents, broken_agents = load_agents_from_dir(
agents_dir, bottle_names, source="$HOME",
broken_bottle_errors=broken_bottle_errors,
)
didericis marked this conversation as resolved Outdated
Outdated
Review

Also don't load the agents here anymore, just load agent names/defer loading the full agent until preflight

Also don't load the agents here anymore, just load agent names/defer loading the full agent until preflight
if cwd_dir is not None:
stale_bottles = cwd_dir / "bottles"
@@ -281,12 +291,14 @@ class Manifest:
f"(PRD 0011). Move them or delete."
)
cwd_agents_dir = cwd_dir / "agents"
cwd_agents = load_agents_from_dir(
cwd_agents_dir, bottle_names, source="$CWD"
cwd_agents, cwd_broken = load_agents_from_dir(
cwd_agents_dir, bottle_names, source="$CWD",
broken_bottle_errors=broken_bottle_errors,
)
agents = {**agents, **cwd_agents}
broken_agents = {**broken_agents, **cwd_broken}
return cls(bottles=bottles, agents=agents)
return cls(bottles=bottles, agents=agents, broken_agents=broken_agents)
@classmethod
def from_json_obj(cls, obj: object) -> "Manifest":
@@ -311,10 +323,21 @@ class Manifest:
}
return cls(bottles=bottles, agents=agents)
@property
def all_agent_names(self) -> list[str]:
"""Sorted list of all agent names, including broken ones.
Broken agents appear in the CLI selector so users can select any
agent — the error surfaces only at preflight when launch is
attempted."""
return sorted(set(self.agents.keys()) | set(self.broken_agents.keys()))
def has_agent(self, name: str) -> bool:
return name in self.agents
def require_agent(self, name: str) -> None:
if name in self.broken_agents:
raise self.broken_agents[name]
if self.has_agent(name):
return
available = ", ".join(self.agents.keys())
@@ -361,7 +384,11 @@ class Manifest:
The overlay lives here, the single point both backends call to
resolve an agent's bottle, so the docker / smolmachines git
provisioners pick up the merged identity unchanged."""
provisioners pick up the merged identity unchanged.
Raises the stored ManifestError for agents that failed to load."""
if agent_name in self.broken_agents:
raise self.broken_agents[agent_name]
bottle = self.bottles[self.agents[agent_name].bottle]
merged = self._effective_git_user(agent_name)
if merged == bottle.git_user:
+83
View File
@@ -22,6 +22,25 @@ def resolve_bottles(raws: dict[str, dict[str, object]]) -> dict[str, ManifestBot
return cache
def resolve_bottles_partial(
raws: dict[str, dict[str, object]],
) -> tuple[dict[str, ManifestBottle], dict[str, ManifestError]]:
"""Apply `extends:` chains and return `(good, broken)`.
Bottles that fail validation (schema errors, bad extends, cycles) are
collected in `broken` rather than raising, so unrelated bottles remain
usable. Errors for parent bottles propagate to all children that extend
them."""
from .manifest import ManifestError
cache: dict[str, ManifestBottle] = {}
broken: dict[str, ManifestError] = {}
for name in raws:
if name not in cache and name not in broken:
_resolve_one_bottle_partial(name, raws, cache, broken, ())
return cache, broken
def _resolve_one_bottle(
name: str,
raws: dict[str, dict[str, object]],
@@ -210,3 +229,67 @@ def _merge_egress(
routes = parent.routes + child.routes
log = child.Log if "log" in child_egress_raw else parent.Log
return ManifestEgressConfig(routes=routes, Log=log)
def _resolve_one_bottle_partial(
name: str,
raws: dict[str, dict[str, object]],
cache: dict[str, ManifestBottle],
broken: dict[str, ManifestError],
seen: tuple[str, ...],
) -> None:
"""Error-tolerant variant: on failure, adds to `broken` instead of raising."""
from .manifest import ManifestBottle, ManifestError
if name in cache or name in broken:
return
if name in seen:
chain = " -> ".join(seen + (name,))
broken[name] = ManifestError(
f"bottle '{name}' is in an extends cycle: {chain}"
)
return
raw = raws[name]
parent_name_raw = raw.get("extends")
child_raw = {k: v for k, v in raw.items() if k != "extends"}
try:
if parent_name_raw is None:
cache[name] = ManifestBottle.from_dict(name, child_raw)
return
if not isinstance(parent_name_raw, str):
broken[name] = ManifestError(
f"bottle '{name}' extends must be a string "
f"(was {type(parent_name_raw).__name__})"
)
return
parent_name: str = parent_name_raw
if parent_name == name:
broken[name] = ManifestError(
f"bottle '{name}' extends itself; remove the self-reference"
)
return
if parent_name not in raws:
avail = ", ".join(sorted(raws.keys())) or "(none)"
broken[name] = ManifestError(
f"bottle '{name}' extends '{parent_name}' which is not "
f"defined. Available bottles: {avail}"
)
return
_resolve_one_bottle_partial(parent_name, raws, cache, broken, seen + (name,))
if parent_name in broken:
broken[name] = ManifestError(
f"bottle '{name}' extends '{parent_name}' which failed to load: "
f"{broken[parent_name]}"
)
return
parent = cache[parent_name]
cache[name] = _merge_bottles(parent, child_raw, name)
except ManifestError as exc:
broken[name] = exc
+69 -31
View File
@@ -11,6 +11,7 @@ from .manifest_schema import (
validate_agent_frontmatter_keys,
validate_bottle_frontmatter_keys,
)
from .manifest_util import ManifestError
from .yaml_subset import YamlSubsetError, parse_frontmatter
if TYPE_CHECKING:
@@ -21,8 +22,6 @@ def check_stale_json(dir_path: Path, md_dir: Path, label: str) -> None:
"""Die if `<dir_path>/bot-bottle.json` exists but `md_dir` does
not. The manifest format changed in PRD 0011 and we do not want
to silently leave the JSON content unused."""
from .manifest import ManifestError
legacy = dir_path / "bot-bottle.json"
if legacy.is_file() and not md_dir.exists():
raise ManifestError(
@@ -34,15 +33,20 @@ def check_stale_json(dir_path: Path, md_dir: Path, label: str) -> None:
)
def load_bottles_from_dir(bottles_dir: Path) -> dict[str, ManifestBottle]:
def load_bottles_from_dir(
bottles_dir: Path,
) -> tuple[dict[str, ManifestBottle], dict[str, ManifestError]]:
"""Walk `<bottles_dir>/*.md`, parse each as a bottle, and return
`{name: Bottle}`. Missing dir returns an empty dict."""
from .manifest import ManifestError
from .manifest_extends import resolve_bottles
`({name: Bottle}, {name: error})`. Missing dir returns empty dicts.
Per-file errors are collected in the second dict rather than raised,
so an invalid bottle file does not block unrelated bottles or agents."""
from .manifest_extends import resolve_bottles_partial
raws: dict[str, dict[str, object]] = {}
broken: dict[str, ManifestError] = {}
if not bottles_dir.is_dir():
return {}
return {}, {}
for path in sorted(bottles_dir.glob("*.md")):
name = entity_name_from_path(path)
if name is None:
@@ -54,12 +58,21 @@ def load_bottles_from_dir(bottles_dir: Path) -> dict[str, ManifestBottle]:
try:
fm, _body = parse_frontmatter(path.read_text())
except OSError as e:
raise ManifestError(f"could not read {path}: {e}") from e
broken[name] = ManifestError(f"could not read {path}: {e}")
continue
except YamlSubsetError as e:
raise ManifestError(f"{path}: {e}") from e
validate_bottle_frontmatter_keys(path, fm.keys())
broken[name] = ManifestError(f"{path}: {e}")
continue
try:
validate_bottle_frontmatter_keys(path, fm.keys())
except ManifestError as e:
broken[name] = e
continue
raws[name] = fm
return resolve_bottles(raws)
good, resolve_broken = resolve_bottles_partial(raws)
broken.update(resolve_broken)
return good, broken
def load_agents_from_dir(
@@ -67,15 +80,26 @@ def load_agents_from_dir(
bottle_names: set[str],
*,
source: str, # noqa: F841 — unused, but required by interface
) -> dict[str, ManifestAgent]:
broken_bottle_errors: dict[str, ManifestError] | None = None,
) -> tuple[dict[str, ManifestAgent], dict[str, ManifestError]]:
"""Walk `<agents_dir>/*.md`, parse each as an agent, and return
`{name: Agent}`. The Markdown body becomes the agent's prompt.
Missing dir returns an empty dict."""
from .manifest import ManifestAgent, ManifestError
`({name: Agent}, {name: error})`. The Markdown body becomes the
agent's prompt. Missing dir returns empty dicts.
Per-file errors are collected in the second dict rather than raised.
Agents referencing a broken bottle are also moved to the error dict
so their error surfaces at preflight rather than manifest load time."""
from .manifest import ManifestAgent
broken_bottles = broken_bottle_errors or {}
# Agents may reference bottles that failed to resolve; accept those names
# during structural parsing so we can detect the broken-bottle case below.
all_known_bottles = bottle_names | set(broken_bottles.keys())
out: dict[str, ManifestAgent] = {}
broken: dict[str, ManifestError] = {}
if not agents_dir.is_dir():
return out
return out, broken
for path in sorted(agents_dir.glob("*.md")):
name = entity_name_from_path(path)
if name is None:
@@ -87,19 +111,33 @@ def load_agents_from_dir(
try:
fm, body = parse_frontmatter(path.read_text())
except OSError as e:
raise ManifestError(f"could not read {path}: {e}") from e
broken[name] = ManifestError(f"could not read {path}: {e}")
continue
except YamlSubsetError as e:
raise ManifestError(f"{path}: {e}") from e
validate_agent_frontmatter_keys(path, fm.keys())
# Build the dict Agent.from_dict expects. The body becomes
# prompt; Claude Code passthrough fields stay in fm and get
# ignored by Agent.from_dict (reads bottle/skills/git-gate/prompt).
agent_dict: dict[str, object] = {
"bottle": fm.get("bottle"),
"skills": fm.get("skills", []),
"prompt": body.strip(),
}
if "git-gate" in fm:
agent_dict["git-gate"] = fm["git-gate"]
out[name] = ManifestAgent.from_dict(name, agent_dict, bottle_names)
return out
broken[name] = ManifestError(f"{path}: {e}")
continue
try:
validate_agent_frontmatter_keys(path, fm.keys())
agent_dict: dict[str, object] = {
"bottle": fm.get("bottle"),
"skills": fm.get("skills", []),
"prompt": body.strip(),
}
if "git-gate" in fm:
agent_dict["git-gate"] = fm["git-gate"]
agent = ManifestAgent.from_dict(name, agent_dict, all_known_bottles)
except ManifestError as e:
broken[name] = e
continue
# Agent parsed fine but its bottle may have failed to resolve.
bottle_ref = agent.bottle
if bottle_ref in broken_bottles:
broken[name] = ManifestError(
f"agent '{name}' references bottle '{bottle_ref}' which "
f"failed to load: {broken_bottles[bottle_ref]}"
)
continue
out[name] = agent
return out, broken
+1
View File
@@ -20,6 +20,7 @@ from bot_bottle.backend import ActiveAgent
def _make_manifest(agent_names: list[str]):
manifest = MagicMock()
manifest.agents = {name: MagicMock() for name in agent_names}
manifest.all_agent_names = sorted(agent_names)
return manifest
+8 -2
View File
@@ -226,10 +226,16 @@ class TestAgentGitUserMdLoader(unittest.TestCase):
m.git_identity_summary("impl"),
)
def test_md_agent_repos_dies(self):
def test_md_agent_repos_deferred(self):
"""git-gate.repos on an agent is an error, but deferred into
broken_agents rather than raised at resolve time, so other agents
remain accessible."""
self._write("bottles/dev.md", _BOTTLE_DEV)
self._write("agents/impl.md", _AGENT_WITH_REPOS)
msg = _error_message(Manifest.resolve, str(self.home))
m = Manifest.resolve(str(self.home))
self.assertNotIn("impl", m.agents)
self.assertIn("impl", m.broken_agents)
msg = str(m.broken_agents["impl"])
self.assertIn("git-gate.repos", msg)
self.assertIn("bottle-only", msg)
+82 -22
View File
@@ -294,14 +294,15 @@ class TestManifestEntryPointParity(_ResolveCase):
self.assertEqual("dev", manifest.agents["implementer"].bottle)
class TestUnknownAgentKeyDies(_ResolveCase):
"""A typo'd / unknown frontmatter key on an agent file dies
rather than silently ignoring."""
class TestUnknownAgentKeyDefersToBroken(_ResolveCase):
"""A typo'd / unknown frontmatter key on an agent file is deferred
into broken_agents rather than crashing the whole manifest load.
The error surfaces when that specific agent is selected for launch."""
def test_dies(self):
def test_broken_agent_deferred(self):
_write(self.home_cb / "bottles" / "dev.md", _BOTTLE_DEV)
_write(
self.home_cb / "agents" / "implementer.md",
self.home_cb / "agents" / "bad.md",
"""
---
bottle: dev
@@ -311,17 +312,58 @@ class TestUnknownAgentKeyDies(_ResolveCase):
...
""",
)
with self.assertRaises(ManifestError):
self.resolve()
_write(self.home_cb / "agents" / "implementer.md", _AGENT_IMPL)
m = self.resolve()
# The broken agent is NOT in the valid agents dict…
self.assertNotIn("bad", m.agents)
# …but it IS captured in broken_agents.
self.assertIn("bad", m.broken_agents)
self.assertIsInstance(m.broken_agents["bad"], ManifestError)
# Unrelated agent still loads fine.
self.assertIn("implementer", m.agents)
class TestUnknownBottleKeyDies(_ResolveCase):
"""A typo'd / unknown frontmatter key on a bottle file dies
rather than silently ignoring."""
def test_dies(self):
def test_broken_agent_appears_in_all_agent_names(self):
_write(self.home_cb / "bottles" / "dev.md", _BOTTLE_DEV)
_write(
self.home_cb / "bottles" / "dev.md",
self.home_cb / "agents" / "bad.md",
"""
---
bottle: dev
skillz: [init-prd]
---
""",
)
_write(self.home_cb / "agents" / "implementer.md", _AGENT_IMPL)
m = self.resolve()
self.assertIn("bad", m.all_agent_names)
self.assertIn("implementer", m.all_agent_names)
def test_broken_agent_raises_at_preflight(self):
_write(self.home_cb / "bottles" / "dev.md", _BOTTLE_DEV)
_write(
self.home_cb / "agents" / "bad.md",
"""
---
bottle: dev
skillz: [init-prd]
---
""",
)
m = self.resolve()
with self.assertRaises(ManifestError):
m.require_agent("bad")
with self.assertRaises(ManifestError):
m.bottle_for("bad")
class TestUnknownBottleKeyDefersToBroken(_ResolveCase):
"""A typo'd / unknown frontmatter key on a bottle file is deferred
into broken_agents for agents referencing that bottle, rather than
crashing the whole manifest load."""
def test_broken_bottle_defers_agent(self):
_write(
self.home_cb / "bottles" / "bad.md",
"""
---
credproxy:
@@ -329,9 +371,24 @@ class TestUnknownBottleKeyDies(_ResolveCase):
---
""",
)
_write(self.home_cb / "bottles" / "dev.md", _BOTTLE_DEV)
_write(self.home_cb / "agents" / "implementer.md", _AGENT_IMPL)
with self.assertRaises(ManifestError):
self.resolve()
_write(
self.home_cb / "agents" / "broken-agent.md",
"""
---
bottle: bad
---
""",
)
m = self.resolve()
# Good bottle and agent still load.
self.assertIn("dev", m.bottles)
self.assertIn("implementer", m.agents)
# Broken bottle's agent is deferred.
self.assertNotIn("bad", m.bottles)
self.assertNotIn("broken-agent", m.agents)
self.assertIn("broken-agent", m.broken_agents)
class TestStaleJsonDies(_ResolveCase):
@@ -359,11 +416,11 @@ class TestNoManifestDies(_ResolveCase):
self.assertEqual({}, dict(m.agents))
class TestUnknownBottleReferenceDies(_ResolveCase):
"""An agent file naming a bottle that doesn't exist on disk
dies with the existing "bottle not defined" error."""
class TestUnknownBottleReferenceDefersToBroken(_ResolveCase):
"""An agent file naming a bottle that doesn't exist on disk is
deferred into broken_agents; other agents still load."""
def test_dies(self):
def test_stray_bottle_reference_deferred(self):
_write(self.home_cb / "bottles" / "dev.md", _BOTTLE_DEV)
_write(
self.home_cb / "agents" / "stray.md",
@@ -373,8 +430,11 @@ class TestUnknownBottleReferenceDies(_ResolveCase):
---
""",
)
with self.assertRaises(ManifestError):
self.resolve()
_write(self.home_cb / "agents" / "implementer.md", _AGENT_IMPL)
m = self.resolve()
self.assertNotIn("stray", m.agents)
self.assertIn("stray", m.broken_agents)
self.assertIn("implementer", m.agents)
class TestFilenameValidation(_ResolveCase):