Files
bot-bottle/bot_bottle/yaml_subset.py
T
didericis-codex 18e3b62b72
test / unit (pull_request) Successful in 28s
test / integration (pull_request) Successful in 40s
test / unit (push) Successful in 31s
test / integration (push) Successful in 44s
docs: rename CLAUDE.md to AGENTS.md and rebrand provider-agnostic
Delete CLAUDE.md in favor of AGENTS.md as the orientation doc, rebrand
the project from Codex-bottle to provider-agnostic bot-bottle, and
repoint every CLAUDE.md reference across PRDs, research notes, the
implementer agent example, and the yaml_subset comment.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-28 20:36:47 -04:00

583 lines
22 KiB
Python

"""Hand-rolled YAML-subset parser for bot-bottle manifest files
(PRD 0011).
Why hand-rolled: the configs we accept have a bounded shape (flat
top-level keys; values are strings / ints / bools / null / lists /
nested dicts; no anchors, no multi-line block scalars, no tags, no
implicit type coercion gotchas). A real YAML library is a much
larger dependency surface than we need. The project's stdlib-only
stance (AGENTS.md) is the load-bearing reason; the safety
properties — no Norway problem, no surprise date/octal coercion —
are the bonus.
Public API:
parse_yaml_subset(text) -> dict[str, object]
Parse a full document. Top level must be a mapping (the
shape every bot-bottle manifest file uses). Values are
str / int / bool / None / list / dict only.
parse_frontmatter(text) -> tuple[dict[str, object], str]
For a Markdown file with YAML frontmatter delimited by `---`
lines. Returns (frontmatter_dict, body_text).
What we accept (block-style):
key: value # mapping entry, value is inline
key: # mapping entry, value is block
nested_key: value
key:
- item # list under a key
- item
key:
- subkey: value1 # list item that's a mapping
subkey2: value2
- subkey: value3
What we accept (inline, scalar leaves only):
key: [a, b, "c d"]
key: {a: 1, b: 2}
What we reject (each dies with a clear pointer):
&anchor / *alias # anchors / aliases
!!tag # YAML tags
| / > # multi-line block scalars
yes / no / on / off # only true / false count as bool
ambiguous bare strings # dates, floats, hex / octal look-alikes when unquoted
tabs as indentation # spaces only
flow-style nested deeper than one level
Errors carry the line number from the source document.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
class YamlSubsetError(ValueError):
    """Signals that the input broke one of the YAML-subset rules.

    The parser itself never exits the process. Callers that want
    fatal-exit semantics (manifest loader, pipelock-apply, etc.) catch
    this at their own boundary and forward to `die`; callers running
    outside the bot-bottle CLI process (the egress sidecar's addon)
    treat it as an ordinary exception.
    """
def die(msg: str) -> None:
    """Abort parsing by raising `YamlSubsetError(msg)`.

    Module-local shorthand that keeps the parser body readable. No
    prefix is added here: the `bot-bottle: error: ` prefix comes from
    the boundary `die` in `bot_bottle.log`.
    """
    raise YamlSubsetError(msg)
# --- Tokenizer / line preprocessing ----------------------------------------
@dataclass(frozen=True)
class _Line:
"""One non-blank, non-comment line from the source. `indent` is
the column of the first non-space character; `content` is the
line text from that column onward, with trailing whitespace and
trailing `# ...` comments stripped. `lineno` is the 1-based
line in the original document."""
indent: int
content: str
lineno: int
def _strip_trailing_comment(s: str) -> str:
"""Strip ` # comment` from end of line, but only when the `#`
isn't inside a quoted string. Returns the cleaned line."""
in_single = False
in_double = False
for i, ch in enumerate(s):
if ch == "'" and not in_double:
in_single = not in_single
elif ch == '"' and not in_single:
in_double = not in_double
elif ch == "#" and not in_single and not in_double:
# `#` must be preceded by whitespace to be a comment,
# otherwise it's just a literal character.
if i == 0 or s[i - 1] in (" ", "\t"):
return s[:i].rstrip()
return s.rstrip()
def _tokenize(text: str) -> list[_Line]:
    """Split `text` into `_Line` records, one per meaningful line.

    Blank lines, comment-only lines and lines that are empty once the
    trailing comment is removed are skipped. Line numbers are 1-based
    positions in `text`. A tab anywhere in a line's leading whitespace
    is rejected via `die`.
    """
    tokens: list[_Line] = []
    for lineno, raw in enumerate(text.splitlines(), start=1):
        # Tabs in the indent render differently per editor, so they
        # are refused outright; only spaces may indent.
        after_ws = raw.lstrip(" \t")
        leading_ws = raw[: len(raw) - len(after_ws)]
        if "\t" in leading_ws:
            die(f"yaml-subset: tab character in indent on line {lineno}")
        trimmed = raw.strip()
        if trimmed == "" or trimmed.startswith("#"):
            continue
        # Indent is the count of leading spaces before the content.
        indent = len(raw) - len(raw.lstrip(" "))
        content = _strip_trailing_comment(raw[indent:])
        if content:
            tokens.append(_Line(indent=indent, content=content, lineno=lineno))
    return tokens
# --- Scalar parsing ---------------------------------------------------------
_BARE_RX = re.compile(r"^[A-Za-z_][A-Za-z0-9_.\-]*$")
_INT_RX = re.compile(r"^-?[0-9]+$")
_RESERVED_BOOL_LIKE = frozenset({"yes", "no", "on", "off", "y", "n", "Y", "N",
"YES", "NO", "ON", "OFF", "True", "False",
"TRUE", "FALSE"})
# Yaml-ish ambiguity sources that an unquoted bare token COULD be
# mistaken for: dates, octals, etc. Detected and rejected so users
# quote their strings explicitly. We don't try to enumerate every
# ambiguity; the rule is "if it looks like a non-string literal,
# either parse it as that literal (true/false/null/int) or reject
# it with a 'quote it' hint."
_DATE_RX = re.compile(r"^-?\d{4}-\d{2}-\d{2}(T\d.*)?$")
_OCTAL_RX = re.compile(r"^0o?\d+$")
_HEX_RX = re.compile(r"^0x[0-9A-Fa-f]+$")
_FLOAT_RX = re.compile(r"^-?\d+\.\d+([eE][-+]?\d+)?$")
def _parse_scalar(s: str, lineno: int) -> object:
"""Turn a stripped value string into a Python value (str, int,
bool, None). Quoted strings preserve their literal content
(with standard escapes); bare strings are accepted only when
they're unambiguous."""
s = s.strip()
if not s:
return ""
# Quoted forms first — content is whatever's between the quotes
# with the documented escapes applied.
if (s.startswith('"') and s.endswith('"')) or (
s.startswith("'") and s.endswith("'")
):
if len(s) < 2:
die(f"yaml-subset: unterminated quoted string on line {lineno}")
body = s[1:-1]
if s.startswith('"'):
# JSON-style escapes for double quotes.
try:
return body.encode("utf-8").decode("unicode_escape")
except UnicodeDecodeError as e:
die(f"yaml-subset: bad escape on line {lineno}: {e}")
else:
# Single quotes: only '' → ' (standard YAML); no other escapes.
return body.replace("''", "'")
# Reserved bool-like tokens that aren't `true` / `false` —
# always reject so users have to be explicit.
if s in _RESERVED_BOOL_LIKE:
if s in ("true", "false"):
return s == "true"
die(
f"yaml-subset: bare {s!r} on line {lineno} is ambiguous "
f"(use literal `true` / `false`, or quote it as a string)"
)
if s == "true":
return True
if s == "false":
return False
if s in ("null", "~"):
return None
if _INT_RX.match(s):
return int(s)
# Look-alikes that we reject to keep the user in control.
if _DATE_RX.match(s):
die(
f"yaml-subset: bare {s!r} on line {lineno} looks like a "
f"date — quote it as a string or use an explicit int"
)
if _OCTAL_RX.match(s):
die(
f"yaml-subset: bare {s!r} on line {lineno} looks like an "
f"octal/0-prefixed integer — quote it as a string"
)
if _HEX_RX.match(s):
die(
f"yaml-subset: bare {s!r} on line {lineno} looks like a "
f"hex integer — quote it as a string"
)
if _FLOAT_RX.match(s):
die(
f"yaml-subset: floats not supported (line {lineno}, "
f"value {s!r}); use an int or quote as a string"
)
# Bare strings: anything that matches the bare-string pattern is
# accepted as a string literal. Otherwise we hand it back as a
# string anyway — for URLs, paths, hostnames, etc. that contain
# special chars. The PRD calls for rejecting "ambiguous" strings,
# and we've already rejected the ambiguous shapes above; what's
# left is unambiguously a string.
return s
# --- Inline list / dict ----------------------------------------------------
def _parse_inline(s: str, lineno: int) -> object:
"""Inline list `[a, b]` or dict `{a: 1, b: 2}` or scalar.
Nested flow more than one level deep is rejected (PRD)."""
s = s.strip()
if s.startswith("["):
if not s.endswith("]"):
die(f"yaml-subset: unterminated `[` on line {lineno}")
body = s[1:-1].strip()
if not body:
return []
items: list[object] = []
for raw in _split_flow(body, lineno, "list"):
v = _parse_scalar(raw, lineno)
items.append(v)
return items
if s.startswith("{"):
if not s.endswith("}"):
die(f"yaml-subset: unterminated `{{` on line {lineno}")
body = s[1:-1].strip()
if not body:
return {}
out: dict[str, object] = {}
for raw in _split_flow(body, lineno, "dict"):
if ":" not in raw:
die(
f"yaml-subset: inline dict entry on line {lineno} "
f"missing `:` ({raw!r})"
)
k, _, v = raw.partition(":")
k = k.strip()
if not _BARE_RX.match(k):
die(
f"yaml-subset: inline dict key on line {lineno} "
f"must be a bare identifier ({k!r})"
)
out[k] = _parse_scalar(v.strip(), lineno)
return out
return _parse_scalar(s, lineno)
def _split_flow(body: str, lineno: int, kind: str) -> list[str]:
"""Split `a, b, c` respecting quoted strings. Rejects nested
flow (a list/dict inside the flow body) since the PRD limits
flow nesting to one level."""
items: list[str] = []
depth_b = 0
depth_c = 0
in_single = False
in_double = False
cur = []
for ch in body:
if ch == "'" and not in_double:
in_single = not in_single
elif ch == '"' and not in_single:
in_double = not in_double
elif not in_single and not in_double:
if ch in "[{":
depth_b += 1
elif ch in "]}":
depth_b -= 1
if depth_b > 0:
die(
f"yaml-subset: nested flow {kind} on line "
f"{lineno} (only one level of flow allowed)"
)
if ch == "," and depth_b == 0 and depth_c == 0:
items.append("".join(cur))
cur = []
continue
cur.append(ch)
if cur:
items.append("".join(cur))
return [s.strip() for s in items if s.strip()]
# --- Block parser ----------------------------------------------------------
def _split_key_value(content: str, lineno: int) -> tuple[str, str]:
"""Find the FIRST top-level `:` that separates a key from its
value (ignoring `:` inside quoted strings). Returns (key, value).
`value` may be empty (block-form mapping)."""
in_single = False
in_double = False
for i, ch in enumerate(content):
if ch == "'" and not in_double:
in_single = not in_single
elif ch == '"' and not in_single:
in_double = not in_double
elif ch == ":" and not in_single and not in_double:
# `:` must be followed by space or be at end-of-line to
# count as a key separator (otherwise `key:value` would
# ambiguous with URLs etc.).
if i + 1 >= len(content) or content[i + 1] in (" ", "\t"):
return content[:i].strip(), content[i + 1:].lstrip()
die(f"yaml-subset: line {lineno} missing `: ` separator: {content!r}")
def _parse_block(
    lines: list[_Line], idx: int, base_indent: int
) -> tuple[object, int]:
    """Parse the block that begins at `lines[idx]`.

    The first line must sit exactly at `base_indent`. A first line
    that is a list item (`- ...` or a bare `-`) selects the list
    parser; anything else is parsed as a mapping. Returns
    (value, new_idx), where `new_idx` is the index of the first line
    the block did not consume. Raises via `die` when `idx` is past the
    end of `lines` or the indent does not match.
    """
    if idx >= len(lines):
        die("yaml-subset: unexpected end of document")
    head = lines[idx]
    if head.indent != base_indent:
        if head.indent < base_indent:
            die(
                f"yaml-subset: line {head.lineno} indented less than "
                f"expected (got {head.indent}, expected >= {base_indent})"
            )
        die(
            f"yaml-subset: line {head.lineno} indented more than "
            f"expected (got {head.indent}, expected {base_indent})"
        )
    starts_list = head.content == "-" or head.content.startswith("- ")
    parser = _parse_block_list if starts_list else _parse_block_mapping
    return parser(lines, idx, base_indent)
def _parse_block_mapping(
    lines: list[_Line], idx: int, base_indent: int
) -> tuple[dict[str, object], int]:
    """Parse consecutive `key: ...` lines that sit at `base_indent`.

    Stops at the first line whose indent differs from `base_indent`
    and returns (mapping, index_of_that_line). A key with nothing
    after the colon takes the more-indented block that follows as its
    value, or None when no such block follows. Keys must be bare
    identifiers and must not repeat; violations, and a list item at
    mapping indent, are reported via `die`.
    """
    mapping: dict[str, object] = {}
    total = len(lines)
    while idx < total and lines[idx].indent == base_indent:
        entry = lines[idx]
        idx += 1  # `idx` now points at the line after the key line
        if entry.content.startswith("- "):
            die(
                f"yaml-subset: line {entry.lineno} unexpected list "
                f"item at mapping indent (got `-`, expected `key:`)"
            )
        key, value_text = _split_key_value(entry.content, entry.lineno)
        if not _BARE_RX.match(key):
            die(
                f"yaml-subset: line {entry.lineno} key {key!r} is not "
                f"a bare identifier"
            )
        if key in mapping:
            die(
                f"yaml-subset: line {entry.lineno} duplicate key {key!r}"
            )
        if value_text:
            # Value is written on the key's own line.
            mapping[key] = _parse_inline(value_text, entry.lineno)
        elif idx < total and lines[idx].indent > base_indent:
            # Value is a nested block; its indent is wherever its
            # first line happens to start.
            mapping[key], idx = _parse_block(lines, idx, lines[idx].indent)
        else:
            # Empty block — treat as None to match YAML.
            mapping[key] = None
    return mapping, idx
def _parse_item_value(
    lines: list[_Line], idx: int, value_text: str, lineno: int, key_col: int
) -> tuple[object, int]:
    """Resolve the value of one `key: ...` entry inside a list-item mapping.

    `idx` is the index of the line AFTER the key's line. An inline
    value wins; otherwise a following block indented deeper than
    `key_col` is the value; otherwise the value is None. Returns
    (value, index of the first unconsumed line).
    """
    if value_text:
        return _parse_inline(value_text, lineno), idx
    if idx < len(lines) and lines[idx].indent > key_col:
        return _parse_block(lines, idx, lines[idx].indent)
    return None, idx


def _parse_block_list(
    lines: list[_Line], idx: int, base_indent: int
) -> tuple[list[object], int]:
    """Parse consecutive `- ...` items whose dash sits at `base_indent`.

    Three item shapes are handled:
      * `- key: value` starts a mapping item; further keys of the same
        item must be aligned with the first key's column.
      * `- value` is an inline scalar, inline list or inline dict.
      * a bare `-` takes the more-indented block that follows as its
        value, or None when no such block follows.

    Returns (items, index of the first line that is not part of the
    list). Key problems (non-bare or duplicate keys) go through `die`.
    """
    items: list[object] = []
    while idx < len(lines) and lines[idx].indent == base_indent and (
        lines[idx].content.startswith("- ") or lines[idx].content == "-"
    ):
        line = lines[idx]
        rest = line.content[2:].strip() if line.content.startswith("- ") else ""
        idx += 1  # `idx` now points at the line after the dash line
        if rest and _looks_like_kv(rest):
            # Sibling keys must line up with the first key, i.e. with
            # the column where the content after the dash really
            # starts. That is base_indent + 2 only when exactly one
            # space follows the dash, so measure it.
            after_dash = line.content[1:]
            gap = len(after_dash) - len(after_dash.lstrip())
            content_col = base_indent + 1 + gap
            first_key, first_value_text = _split_key_value(rest, line.lineno)
            if not _BARE_RX.match(first_key):
                die(
                    f"yaml-subset: line {line.lineno} key {first_key!r} "
                    f"is not a bare identifier"
                )
            item: dict[str, object] = {}
            item[first_key], idx = _parse_item_value(
                lines, idx, first_value_text, line.lineno, content_col
            )
            # Consume additional keys at content_col.
            while idx < len(lines) and lines[idx].indent == content_col:
                ln = lines[idx]
                if ln.content.startswith("- "):
                    break  # a list item, not a sibling key
                k, v_text = _split_key_value(ln.content, ln.lineno)
                if not _BARE_RX.match(k):
                    die(
                        f"yaml-subset: line {ln.lineno} key {k!r} is "
                        f"not a bare identifier"
                    )
                if k in item:
                    die(f"yaml-subset: line {ln.lineno} duplicate key {k!r}")
                item[k], idx = _parse_item_value(
                    lines, idx + 1, v_text, ln.lineno, content_col
                )
            items.append(item)
        elif rest:
            # Inline scalar / inline list / inline dict on the dash line.
            items.append(_parse_inline(rest, line.lineno))
        elif idx < len(lines) and lines[idx].indent > base_indent:
            # Bare `-` — value is a block on subsequent lines.
            value, idx = _parse_block(lines, idx, lines[idx].indent)
            items.append(value)
        else:
            # Bare `-` with nothing nested under it.
            items.append(None)
    return items, idx
def _looks_like_kv(s: str) -> bool:
"""Heuristic: does `s` look like a mapping `key: value` line?
True if there's an unquoted `:` that's followed by space-or-EOL."""
in_single = False
in_double = False
for i, ch in enumerate(s):
if ch == "'" and not in_double:
in_single = not in_single
elif ch == '"' and not in_single:
in_double = not in_double
elif ch == ":" and not in_single and not in_double:
if i + 1 >= len(s) or s[i + 1] in (" ", "\t"):
return True
return False
# --- Public API -------------------------------------------------------------
def parse_yaml_subset(text: str) -> dict[str, object]:
"""Parse a YAML-subset document. Top level must be a mapping;
otherwise we die with a clear pointer."""
# Reject features that have no place in our schema before we
# tokenize, with line numbers from the raw text.
for n, raw in enumerate(text.splitlines(), start=1):
s = raw.strip()
if s.startswith("|") or s.startswith(">") or s.startswith("- |") or s.startswith("- >"):
die(
f"yaml-subset: line {n} uses a multi-line block "
f"scalar (`|` / `>`) — not supported. Use a quoted "
f"single-line string instead."
)
if "&" in s or "*" in s:
# Only flag when `&` or `*` is being used as anchor/alias,
# not when it's inside a quoted string. Cheap check: any
# bare `&foo:` / `*foo` at the start of a value position.
if re.search(r"(^|\s)[&*][A-Za-z0-9_]+", s):
die(
f"yaml-subset: line {n} uses anchors / aliases "
f"(`&` / `*`) — not supported."
)
if "!!" in s and not (s.count("'") % 2 or s.count('"') % 2):
die(
f"yaml-subset: line {n} uses a YAML tag (`!!`) — not "
f"supported."
)
lines = _tokenize(text)
if not lines:
return {}
base_indent = lines[0].indent
if base_indent != 0:
die(
f"yaml-subset: top-level content must start in column 0 "
f"(got column {base_indent} on line {lines[0].lineno})"
)
value, consumed = _parse_block(lines, 0, 0)
if consumed < len(lines):
die(
f"yaml-subset: trailing content starting on line "
f"{lines[consumed].lineno}"
)
if not isinstance(value, dict):
die("yaml-subset: top-level value must be a mapping")
return value
def parse_frontmatter(text: str) -> tuple[dict[str, object], str]:
    """Split a Markdown document into (frontmatter mapping, body text).

    Frontmatter is present only when the first line, stripped, is
    `---`; otherwise the result is ({}, text) with the text untouched.
    The block ends at the next line that is `---` once right-stripped.
    The lines in between are parsed as YAML subset, and the body is
    the verbatim text after the closing line (original line endings
    preserved). An opening `---` with no closing one is reported via
    `die`. Lines are delimited by "\\n" only.
    """
    if not text:
        return {}, ""
    opening, newline, _ = text.partition("\n")
    if opening.strip() != "---":
        return {}, text  # no frontmatter; whole document is body
    if newline:
        fm_start = len(opening) + 1
        cursor = fm_start
        while True:
            nl = text.find("\n", cursor)
            line_end = len(text) if nl == -1 else nl
            if text[cursor:line_end].rstrip() == "---":
                # Frontmatter is everything between the two delimiter
                # lines; slicing past the end gives "" when the closing
                # line is the last one and has no newline.
                fm = parse_yaml_subset(text[fm_start:cursor])
                return fm, text[line_end + 1:]
            if nl == -1:
                break
            cursor = nl + 1
    die("frontmatter: opening `---` has no matching closing `---`")