bot-bottle/bot_bottle/yaml_subset.py

"""Hand-rolled YAML-subset parser for bot-bottle manifest files
(PRD 0011).

Why hand-rolled: the configs we accept have a bounded shape (flat
top-level keys; values are strings / ints / bools / null / lists /
nested dicts; no anchors, no multi-line block scalars, no tags, no
implicit type coercion gotchas). A real YAML library is a much
larger dependency surface than we need. The project's stdlib-only
stance (AGENTS.md) is the load-bearing reason; the safety
properties — no Norway problem, no surprise date/octal coercion —
are the bonus.

Public API:

    parse_yaml_subset(text) -> dict[str, object]
        Parse a full document. Top level must be a mapping (the
        shape every bot-bottle manifest file uses). Values are
        str / int / bool / None / list / dict only.

    parse_frontmatter(text) -> tuple[dict[str, object], str]
        For a Markdown file with YAML frontmatter delimited by `---`
        lines. Returns (frontmatter_dict, body_text).

What we accept (block-style):

    key: value                # mapping entry, value is inline
    key:                      # mapping entry, value is block
      nested_key: value

    key:
      - item                  # list under a key
      - item

    key:
      - subkey: value1        # list item that's a mapping
        subkey2: value2
      - subkey: value3

What we accept (inline, scalar leaves only):

    key: [a, b, "c d"]
    key: {a: 1, b: 2}

What we reject (each dies with a clear pointer):

    &anchor / *alias          # anchors / aliases
    !!tag                     # YAML tags
    | / >                     # multi-line block scalars
    yes / no / on / off       # only true / false count as bool
    ambiguous bare strings    # numbers, dates, etc. when unquoted
    tabs as indentation       # spaces only
    flow-style nested deeper than one level

Errors carry the line number from the source document.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import NoReturn


class YamlSubsetError(ValueError):
    """Raised when input violates the YAML subset's rules.

    Subclasses `ValueError`, so it behaves as an ordinary exception.
    Callers that want fatal-exit semantics (manifest loader,
    pipelock-apply, etc.) catch this at their own boundary and
    forward to `die`; callers running outside the bot-bottle CLI
    process (the egress sidecar's addon) handle it as a normal
    exception."""


def die(msg: str) -> NoReturn:
    """Module-local helper so the parser body reads cleanly.

    Always raises `YamlSubsetError(msg)` and never returns, which is
    why the return type is `NoReturn`: type checkers then understand
    that code following a `die(...)` call is unreachable. The
    `bot-bottle: error: ` prefix is added by the boundary `die` in
    `bot_bottle.log`, not here.

    Raises:
        YamlSubsetError: always, carrying `msg` unchanged.
    """
    raise YamlSubsetError(msg)


# --- Tokenizer / line preprocessing ----------------------------------------


@dataclass(frozen=True)
class _Line:
    """One non-blank, non-comment line from the source.

    Instances are immutable (`frozen=True`) and are produced by
    `_tokenize`; the block parser walks a list of them by index."""

    # Column of the first non-space character (number of leading spaces).
    indent: int
    # Line text from `indent` onward, with trailing whitespace and any
    # trailing `# ...` comment stripped.
    content: str
    # 1-based line number in the original document, used in error messages.
    lineno: int


def _strip_trailing_comment(s: str) -> str:
    """Strip ` # comment` from end of line, but only when the `#`
    isn't inside a quoted string. Returns the cleaned line."""
    in_single = False
    in_double = False
    for i, ch in enumerate(s):
        if ch == "'" and not in_double:
            in_single = not in_single
        elif ch == '"' and not in_single:
            in_double = not in_double
        elif ch == "#" and not in_single and not in_double:
            # `#` must be preceded by whitespace to be a comment,
            # otherwise it's just a literal character.
            if i == 0 or s[i - 1] in (" ", "\t"):
                return s[:i].rstrip()
    return s.rstrip()


def _tokenize(text: str) -> list[_Line]:
    """Drop blank / comment lines, parse indent + content for the
    rest. Tabs in the indent area are rejected outright."""
    out: list[_Line] = []
    for n, raw in enumerate(text.splitlines(), start=1):
        # Tabs in indent are a portability footgun — different
        # editors render them differently and the spec says spaces.
        leading = len(raw) - len(raw.lstrip(" \t"))
        if "\t" in raw[:leading]:
            die(f"yaml-subset: tab character in indent on line {n}")
        stripped = raw.strip()
        if not stripped:
            continue
        if stripped.startswith("#"):
            continue
        # Whole-line position: indent before first non-space.
        indent = len(raw) - len(raw.lstrip(" "))
        content = _strip_trailing_comment(raw[indent:])
        if not content:
            continue
        out.append(_Line(indent=indent, content=content, lineno=n))
    return out


# --- Scalar parsing ---------------------------------------------------------


_BARE_RX = re.compile(r"^[A-Za-z_][A-Za-z0-9_.\-]*$")
_INT_RX = re.compile(r"^-?[0-9]+$")
_RESERVED_BOOL_LIKE = frozenset({"yes", "no", "on", "off", "y", "n", "Y", "N",
                                 "YES", "NO", "ON", "OFF", "True", "False",
                                 "TRUE", "FALSE"})
# Yaml-ish ambiguity sources that an unquoted bare token COULD be
# mistaken for: dates, octals, etc. Detected and rejected so users
# quote their strings explicitly. We don't try to enumerate every
# ambiguity; the rule is "if it looks like a non-string literal,
# either parse it as that literal (true/false/null/int) or reject
# it with a 'quote it' hint."
_DATE_RX = re.compile(r"^-?\d{4}-\d{2}-\d{2}(T\d.*)?$")
_OCTAL_RX = re.compile(r"^0o?\d+$")
_HEX_RX = re.compile(r"^0x[0-9A-Fa-f]+$")
_FLOAT_RX = re.compile(r"^-?\d+\.\d+([eE][-+]?\d+)?$")


def _parse_scalar(s: str, lineno: int) -> object:
    """Turn a stripped value string into a Python value (str, int,
    bool, None). Quoted strings preserve their literal content
    (with standard escapes); bare strings are accepted only when
    they're unambiguous."""
    s = s.strip()
    if not s:
        return ""

    # Quoted forms first — content is whatever's between the quotes
    # with the documented escapes applied.
    if (s.startswith('"') and s.endswith('"')) or (
        s.startswith("'") and s.endswith("'")
    ):
        if len(s) < 2:
            die(f"yaml-subset: unterminated quoted string on line {lineno}")
        body = s[1:-1]
        if s.startswith('"'):
            # JSON-style escapes for double quotes.
            try:
                return body.encode("utf-8").decode("unicode_escape")
            except UnicodeDecodeError as e:
                die(f"yaml-subset: bad escape on line {lineno}: {e}")
        else:
            # Single quotes: only '' → ' (standard YAML); no other escapes.
            return body.replace("''", "'")

    # Reserved bool-like tokens that aren't `true` / `false` —
    # always reject so users have to be explicit.
    if s in _RESERVED_BOOL_LIKE:
        if s in ("true", "false"):
            return s == "true"
        die(
            f"yaml-subset: bare {s!r} on line {lineno} is ambiguous "
            f"(use literal `true` / `false`, or quote it as a string)"
        )

    if s == "true":
        return True
    if s == "false":
        return False
    if s in ("null", "~"):
        return None

    if _INT_RX.match(s):
        return int(s)

    # Look-alikes that we reject to keep the user in control.
    if _DATE_RX.match(s):
        die(
            f"yaml-subset: bare {s!r} on line {lineno} looks like a "
            f"date — quote it as a string or use an explicit int"
        )
    if _OCTAL_RX.match(s):
        die(
            f"yaml-subset: bare {s!r} on line {lineno} looks like an "
            f"octal/0-prefixed integer — quote it as a string"
        )
    if _HEX_RX.match(s):
        die(
            f"yaml-subset: bare {s!r} on line {lineno} looks like a "
            f"hex integer — quote it as a string"
        )
    if _FLOAT_RX.match(s):
        die(
            f"yaml-subset: floats not supported (line {lineno}, "
            f"value {s!r}); use an int or quote as a string"
        )

    # Bare strings: anything that matches the bare-string pattern is
    # accepted as a string literal. Otherwise we hand it back as a
    # string anyway — for URLs, paths, hostnames, etc. that contain
    # special chars. The PRD calls for rejecting "ambiguous" strings,
    # and we've already rejected the ambiguous shapes above; what's
    # left is unambiguously a string.
    return s


# --- Inline list / dict ----------------------------------------------------


def _parse_inline(s: str, lineno: int) -> object:
    """Inline list `[a, b]` or dict `{a: 1, b: 2}` or scalar.
    Nested flow more than one level deep is rejected (PRD)."""
    s = s.strip()
    if s.startswith("["):
        if not s.endswith("]"):
            die(f"yaml-subset: unterminated `[` on line {lineno}")
        body = s[1:-1].strip()
        if not body:
            return []
        items: list[object] = []
        for raw in _split_flow(body, lineno, "list"):
            v = _parse_scalar(raw, lineno)
            items.append(v)
        return items
    if s.startswith("{"):
        if not s.endswith("}"):
            die(f"yaml-subset: unterminated `{{` on line {lineno}")
        body = s[1:-1].strip()
        if not body:
            return {}
        out: dict[str, object] = {}
        for raw in _split_flow(body, lineno, "dict"):
            if ":" not in raw:
                die(
                    f"yaml-subset: inline dict entry on line {lineno} "
                    f"missing `:` ({raw!r})"
                )
            k, _, v = raw.partition(":")
            k = k.strip()
            if not _BARE_RX.match(k):
                die(
                    f"yaml-subset: inline dict key on line {lineno} "
                    f"must be a bare identifier ({k!r})"
                )
            out[k] = _parse_scalar(v.strip(), lineno)
        return out
    return _parse_scalar(s, lineno)


def _split_flow(body: str, lineno: int, kind: str) -> list[str]:
    """Split `a, b, c` respecting quoted strings. Rejects nested
    flow (a list/dict inside the flow body) since the PRD limits
    flow nesting to one level."""
    items: list[str] = []
    depth_b = 0
    depth_c = 0
    in_single = False
    in_double = False
    cur = []
    for ch in body:
        if ch == "'" and not in_double:
            in_single = not in_single
        elif ch == '"' and not in_single:
            in_double = not in_double
        elif not in_single and not in_double:
            if ch in "[{":
                depth_b += 1
            elif ch in "]}":
                depth_b -= 1
            if depth_b > 0:
                die(
                    f"yaml-subset: nested flow {kind} on line "
                    f"{lineno} (only one level of flow allowed)"
                )
            if ch == "," and depth_b == 0 and depth_c == 0:
                items.append("".join(cur))
                cur = []
                continue
        cur.append(ch)
    if cur:
        items.append("".join(cur))
    return [s.strip() for s in items if s.strip()]


# --- Block parser ----------------------------------------------------------


def _split_key_value(content: str, lineno: int) -> tuple[str, str]:
    """Find the FIRST top-level `:` that separates a key from its
    value (ignoring `:` inside quoted strings). Returns (key, value).
    `value` may be empty (block-form mapping)."""
    in_single = False
    in_double = False
    for i, ch in enumerate(content):
        if ch == "'" and not in_double:
            in_single = not in_single
        elif ch == '"' and not in_single:
            in_double = not in_double
        elif ch == ":" and not in_single and not in_double:
            # `:` must be followed by space or be at end-of-line to
            # count as a key separator (otherwise `key:value` would
            # ambiguous with URLs etc.).
            if i + 1 >= len(content) or content[i + 1] in (" ", "\t"):
                return content[:i].strip(), content[i + 1:].lstrip()
    die(f"yaml-subset: line {lineno} missing `: ` separator: {content!r}")


def _parse_block(
    lines: list[_Line], idx: int, base_indent: int
) -> tuple[object, int]:
    """Parse the block whose first line is `lines[idx]`.

    The first line must sit exactly at `base_indent`. A first line
    of `-` or `- ...` makes the block a list; anything else makes it
    a mapping.

    Returns:
        `(value, new_idx)` where `new_idx` is the index of the first
        line the block did not consume.

    Raises:
        YamlSubsetError: (via `die`) if `idx` is past the end of
            `lines` or the first line's indent differs from
            `base_indent`.
    """
    if idx >= len(lines):
        die("yaml-subset: unexpected end of document")
    head = lines[idx]
    if head.indent != base_indent:
        if head.indent < base_indent:
            die(
                f"yaml-subset: line {head.lineno} indented less than "
                f"expected (got {head.indent}, expected >= {base_indent})"
            )
        die(
            f"yaml-subset: line {head.lineno} indented more than "
            f"expected (got {head.indent}, expected {base_indent})"
        )

    is_list = head.content.startswith("- ") or head.content == "-"
    parse = _parse_block_list if is_list else _parse_block_mapping
    return parse(lines, idx, base_indent)


def _parse_block_mapping(
    lines: list[_Line], idx: int, base_indent: int
) -> tuple[dict[str, object], int]:
    out: dict[str, object] = {}
    while idx < len(lines) and lines[idx].indent == base_indent:
        line = lines[idx]
        if line.content.startswith("- "):
            die(
                f"yaml-subset: line {line.lineno} unexpected list "
                f"item at mapping indent (got `-`, expected `key:`)"
            )
        key, value_text = _split_key_value(line.content, line.lineno)
        if not _BARE_RX.match(key):
            die(
                f"yaml-subset: line {line.lineno} key {key!r} is not "
                f"a bare identifier"
            )
        if key in out:
            die(
                f"yaml-subset: line {line.lineno} duplicate key {key!r}"
            )
        if value_text:
            out[key] = _parse_inline(value_text, line.lineno)
            idx += 1
        else:
            # Value is a block on subsequent lines.
            idx += 1
            if idx >= len(lines) or lines[idx].indent <= base_indent:
                # Empty block — treat as None to match YAML.
                out[key] = None
                continue
            child_indent = lines[idx].indent
            value, idx = _parse_block(lines, idx, child_indent)
            out[key] = value
    return out, idx


def _parse_block_list(
    lines: list[_Line], idx: int, base_indent: int
) -> tuple[list[object], int]:
    items: list[object] = []
    while idx < len(lines) and lines[idx].indent == base_indent and (
        lines[idx].content.startswith("- ") or lines[idx].content == "-"
    ):
        line = lines[idx]
        rest = line.content[2:] if line.content.startswith("- ") else ""
        rest = rest.strip()

        # Look ahead at the next non-empty line: if it's indented
        # more than the dash AND aligned with the rest's column,
        # we have a multi-line mapping item.
        if rest and ":" in rest and _looks_like_kv(rest):
            # The first key:value of a multi-line mapping list item.
            # Subsequent keys live at indent = base_indent + 2 (or
            # wherever the content after `- ` started).
            content_col = base_indent + 2
            first_key, first_value_text = _split_key_value(rest, line.lineno)
            if not _BARE_RX.match(first_key):
                die(
                    f"yaml-subset: line {line.lineno} key {first_key!r} "
                    f"is not a bare identifier"
                )
            item: dict[str, object] = {}
            if first_value_text:
                item[first_key] = _parse_inline(first_value_text, line.lineno)
                idx += 1
            else:
                idx += 1
                if idx < len(lines) and lines[idx].indent > content_col:
                    nested_indent = lines[idx].indent
                    value, idx = _parse_block(lines, idx, nested_indent)
                    item[first_key] = value
                else:
                    item[first_key] = None
            # Consume additional keys at content_col.
            while idx < len(lines) and lines[idx].indent == content_col:
                ln = lines[idx]
                if ln.content.startswith("- "):
                    break  # next list item, not a sibling key
                k, v_text = _split_key_value(ln.content, ln.lineno)
                if not _BARE_RX.match(k):
                    die(
                        f"yaml-subset: line {ln.lineno} key {k!r} is "
                        f"not a bare identifier"
                    )
                if k in item:
                    die(f"yaml-subset: line {ln.lineno} duplicate key {k!r}")
                if v_text:
                    item[k] = _parse_inline(v_text, ln.lineno)
                    idx += 1
                else:
                    idx += 1
                    if idx < len(lines) and lines[idx].indent > content_col:
                        nested_indent = lines[idx].indent
                        value, idx = _parse_block(lines, idx, nested_indent)
                        item[k] = value
                    else:
                        item[k] = None
            items.append(item)
        elif rest:
            # Inline scalar / inline list / inline dict on the dash line.
            items.append(_parse_inline(rest, line.lineno))
            idx += 1
        else:
            # Bare `-` — value is a block on subsequent lines.
            idx += 1
            if idx >= len(lines) or lines[idx].indent <= base_indent:
                items.append(None)
                continue
            child_indent = lines[idx].indent
            value, idx = _parse_block(lines, idx, child_indent)
            items.append(value)
    return items, idx


def _looks_like_kv(s: str) -> bool:
    """Heuristic: does `s` look like a mapping `key: value` line?
    True if there's an unquoted `:` that's followed by space-or-EOL."""
    in_single = False
    in_double = False
    for i, ch in enumerate(s):
        if ch == "'" and not in_double:
            in_single = not in_single
        elif ch == '"' and not in_single:
            in_double = not in_double
        elif ch == ":" and not in_single and not in_double:
            if i + 1 >= len(s) or s[i + 1] in (" ", "\t"):
                return True
    return False


# --- Public API -------------------------------------------------------------


def parse_yaml_subset(text: str) -> dict[str, object]:
    """Parse a YAML-subset document whose top level is a mapping.

    Before tokenizing, every line is scanned for features that have
    no place in the schema: block scalars, anchors / aliases, and
    `!!` tags. The scan ignores comments and the inside of quoted
    strings, so `note: "wow!!"` or `# see *below*` are not mistaken
    for tags or aliases.

    Args:
        text: The full document text.

    Returns:
        The parsed mapping; `{}` when the document has no content.

    Raises:
        YamlSubsetError: (via `die`) for an unsupported feature,
            top-level content not starting in column 0, trailing
            content the block parser could not consume, a top-level
            value that is not a mapping, or any error raised while
            parsing the body.
    """
    for n, raw in enumerate(text.splitlines(), start=1):
        code = _unquoted_view(raw)
        # Block scalars: a line that starts with the indicator, or a
        # `key: |` / `- >-` style value position.
        if code.startswith(("|", ">", "- |", "- >")) or (
            _BLOCK_SCALAR_VALUE_RX.match(code)
        ):
            die(
                f"yaml-subset: line {n} uses a multi-line block "
                f"scalar (`|` / `>`) — not supported. Use a quoted "
                f"single-line string instead."
            )
        # Anchors / aliases: `&name` or `*name` at the start of a token.
        if _ANCHOR_ALIAS_RX.search(code):
            die(
                f"yaml-subset: line {n} uses anchors / aliases "
                f"(`&` / `*`) — not supported."
            )
        if "!!" in code:
            die(
                f"yaml-subset: line {n} uses a YAML tag (`!!`) — not "
                f"supported."
            )

    lines = _tokenize(text)
    if not lines:
        return {}
    base_indent = lines[0].indent
    if base_indent != 0:
        die(
            f"yaml-subset: top-level content must start in column 0 "
            f"(got column {base_indent} on line {lines[0].lineno})"
        )
    value, consumed = _parse_block(lines, 0, 0)
    if consumed < len(lines):
        die(
            f"yaml-subset: trailing content starting on line "
            f"{lines[consumed].lineno}"
        )
    if not isinstance(value, dict):
        die("yaml-subset: top-level value must be a mapping")
    return value


# `key: |`, `- >-`, `- key: |2`, or a lone `|`: optional dashes, an
# optional bare key, then a block-scalar indicator (with optional
# chomping / indentation indicators) as the entire value.
_BLOCK_SCALAR_VALUE_RX = re.compile(
    r"^(?:-[ \t]+)*(?:[A-Za-z_][A-Za-z0-9_.\-]*:[ \t]+)?[|>][+\-0-9]*$"
)
# `&name` / `*name` at the start of the line or after whitespace.
_ANCHOR_ALIAS_RX = re.compile(r"(^|\s)[&*][A-Za-z0-9_]+")


def _unquoted_view(raw: str) -> str:
    """Return `raw` reduced to the text that can carry YAML syntax.

    The trailing comment is dropped, the content of quoted strings
    is removed (the quote characters themselves are kept so token
    boundaries survive), and the result is stripped. Inside double
    quotes a backslash escapes the next character.
    """
    kept: list[str] = []
    quote = ""  # the quote character we are inside, or "" when outside
    skip_next = False
    for i, ch in enumerate(raw):
        if skip_next:
            skip_next = False
        elif quote:
            if ch == "\\" and quote == '"':
                skip_next = True
            elif ch == quote:
                quote = ""
                kept.append(ch)
        elif ch in ("'", '"'):
            quote = ch
            kept.append(ch)
        elif ch == "#" and (i == 0 or raw[i - 1] in (" ", "\t")):
            break  # the rest of the line is a comment
        else:
            kept.append(ch)
    return "".join(kept).strip()


def parse_frontmatter(text: str) -> tuple[dict[str, object], str]:
    """Split a Markdown document into YAML frontmatter and body.

    Frontmatter is present only when the first line, stripped, is
    exactly `---`. It ends at the next line that is `---` after
    removing trailing whitespace (an indented `---` does not close
    it). Lines are delimited by `\\n` only, so the body keeps its
    original line endings.

    Args:
        text: The full file content.

    Returns:
        `(frontmatter, body)`. Without frontmatter this is
        `({}, text)`. Otherwise `body` is the verbatim text after
        the closing `---` line.

    Raises:
        YamlSubsetError: (via `die`) if the opening `---` has no
            closing `---`, or if the frontmatter is not valid YAML
            subset. Line numbers in parser errors refer to lines of
            `text`, not of the frontmatter alone.
    """
    first_nl = text.find("\n")
    opener = text if first_nl < 0 else text[:first_nl]
    if opener.strip() != "---":
        return {}, text  # no frontmatter; whole document is body

    if first_nl >= 0:
        fm_start = first_nl + 1
        pos = fm_start
        while True:
            nl = text.find("\n", pos)
            line_end = len(text) if nl < 0 else nl
            if text[pos:line_end].rstrip() == "---":
                # A leading blank line stands in for the opening `---`
                # so the parser's 1-based line numbers match the file.
                fm = parse_yaml_subset("\n" + text[fm_start:pos])
                body = text[line_end + 1:] if nl >= 0 else ""
                return fm, body
            if nl < 0:
                break
            pos = nl + 1
    die("frontmatter: opening `---` has no matching closing `---`")