diff --git a/.coveragerc b/.coveragerc index 1d3c78a..7dc1873 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,6 +3,16 @@ branch = True source = . [report] +# Coverage policy: see docs/decisions/0004-coverage-policy.md. +# +# `omit` is reserved for genuinely interactive entry-point shells whose +# bodies are `read_tty_line()` / curses prompt loops — there is no +# behaviour to assert that a test wouldn't have to fake wholesale, so a +# test here would inflate the number without buying confidence. This is +# NOT a place to hide subprocess/backend orchestration: that code is +# security-relevant and is measured via the integration suite instead +# (run scripts/coverage.sh for the combined unit+integration number). omit = bot_bottle/cli/tui.py + bot_bottle/cli/init.py tests/* diff --git a/.gitea/workflows/test.yml b/.gitea/workflows/test.yml index cd563f5..0dabe0f 100644 --- a/.gitea/workflows/test.yml +++ b/.gitea/workflows/test.yml @@ -70,3 +70,32 @@ jobs: - name: Run integration tests run: python3 -m unittest discover -t . -s tests/integration -v + + # Combined unit+integration coverage + the diff-coverage gate. + # See docs/decisions/0004-coverage-policy.md. The hard gate is diff + # coverage (new/changed lines >= 90%); the combined + critical reports + # are informational and degrade gracefully when the runner has no + # Docker (integration tests skip, those modules just read lower). + coverage: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dev requirements + run: python3 -m pip install -r requirements-dev.txt + + - name: Combined coverage (unit + integration) + run: PYTHON=python3 bash scripts/coverage.sh critical + + - name: Diff-coverage gate (changed lines >= 90%) + run: | + git fetch --no-tags origin main:refs/remotes/origin/main + python3 scripts/diff_coverage.py --base origin/main --min 90 diff --git a/docs/decisions/0004-coverage-policy.md b/docs/decisions/0004-coverage-policy.md new file mode 100644 index 0000000..6141e5b --- /dev/null +++ b/docs/decisions/0004-coverage-policy.md @@ -0,0 +1,90 @@ +# ADR 0004: Risk-weighted coverage, not a single global target + +- **Status:** Accepted +- **Date:** 2026-06-25 +- **Deciders:** didericis + +## Context + +bot-bottle is a security tool: it sandboxes agents, scans egress for +secret exfiltration, strips credentials, and gates git pushes. A latent +bug in that logic is expensive, so test coverage there genuinely +matters. But the repo also contains code where coverage is a poor +signal: + +- **Interactive entry-point shells** — `cli/init.py` (a `read_tty_line()` + prompt loop) and `cli/tui.py` (a curses picker). Their bodies are I/O; + a unit test has to fake the entire terminal conversation, so it + inflates the number without asserting behaviour that would otherwise + go unchecked. +- **Subprocess / backend orchestration** — the docker / smolmachines / + macos-container backends shell out to `docker`, `container`, `smolvm`. + Mock-heavy unit tests here mostly re-assert the argv you already + wrote (the test passes whether or not the real teardown works), while + many of the missed *branches* are failure paths you cannot provoke + against a real daemon on cue. + +Chasing a single global percentage (e.g. 90%) pushes the most test +effort onto the least safety-relevant code — exactly backwards — and +invites performative tests written to colour a line rather than to catch +a regression (Goodhart's law). + +## Decision + +Coverage is **risk-weighted**, measured over the **combined unit + +integration** suites, with three rules: + +1. **Critical modules target ≥ 90%.** The security/logic core — + `egress_addon{,_core}.py`, `dlp_detectors.py`, `egress.py`, + `manifest*.py`, `git_gate.py`, `git_http_backend.py`, `supervise.py`, + `yaml_subset.py`, `bottle_state.py` — is Docker-independent and + unit-testable, so it carries the high bar. We ratchet toward 90% as + these modules are touched; new gaps in them are not acceptable. + +2. **Subprocess/backend orchestration is covered by the integration + suite, not omitted.** `scripts/coverage.sh` runs unit + integration + under one coverage measurement so these modules are scored where they + are actually exercised. They stay *visible* — hiding the code that + tears down sandboxes and wires networks is the one place we will not + omit. + +3. **Interactive entry-point shells are omitted** (`.coveragerc`), with a + rationale comment. This is the only sanctioned use of `omit` besides + `tests/*`. + +The forward-looking guard is a **diff-coverage gate** +(`scripts/diff_coverage.py`): new/changed executable lines on a branch +must be ≥ 90% covered. This catches regressions where they are +introduced without forcing a back-fill crusade through legacy glue. The +gate skips lines in omitted files (there is no coverage data for them), +so the omit list cannot launder *new* logic into the dark: anything that +needs real testing must live outside the interactive shells to be +scored at all. + +The **global percentage is informational**, not a CI gate — it would +otherwise be hostage to the CI runner's Docker availability and to the +omit list. + +## Consequences + +- The number we report (`scripts/coverage.sh`) means "coverage of the + code we consider testable, across both suites" — a dip is a real + regression in code we control, not noise from added CLI glue. +- No incentive to write mock-the-mock tests for orchestration to defend + a global figure. +- The omit list needs governance: an entry must be a genuinely + interactive shell, justified in the `.coveragerc` comment and here. + `cli/init.py` and `cli/tui.py` qualify; backend orchestration does + not. +- CI must run the integration suite under coverage to score the + orchestration modules; where the runner lacks Docker those tests skip + and their modules read low — accepted, because the *enforced* gates + (critical-module standard + diff coverage) are Docker-independent. +- "We're at N%" is now a curated figure; outsiders should read the + policy, not just the badge. + +## Links + +- PRs #290 (cover the egress adapter), and the coverage-policy PR that + introduces this record. +- `.coveragerc`, `scripts/coverage.sh`, `scripts/diff_coverage.py`. diff --git a/scripts/coverage.sh b/scripts/coverage.sh new file mode 100755 index 0000000..db9556c --- /dev/null +++ b/scripts/coverage.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Combined unit + integration coverage (see docs/decisions/0004-coverage-policy.md). +# +# Runs the unit suite, then appends the integration suite (which skips +# cleanly when Docker / the backend CLIs are unavailable), and prints one +# combined report. The integration suite is what scores the subprocess / +# backend orchestration modules, so the number here is the policy's +# yardstick — not the unit-only badge. +# +# Usage: +# scripts/coverage.sh # combined report +# scripts/coverage.sh critical # also report just the critical modules +set -euo pipefail + +cd "$(dirname "$0")/.." + +PY="${PYTHON:-python3}" + +# Critical security/logic core held to the high bar by ADR 0004. +CRITICAL="bot_bottle/egress_addon.py,bot_bottle/egress_addon_core.py,\ +bot_bottle/dlp_detectors.py,bot_bottle/egress.py,bot_bottle/manifest.py,\ +bot_bottle/manifest_egress.py,bot_bottle/manifest_agent.py,\ +bot_bottle/manifest_schema.py,bot_bottle/git_gate.py,\ +bot_bottle/git_http_backend.py,bot_bottle/supervise.py,\ +bot_bottle/yaml_subset.py,bot_bottle/bottle_state.py" + +rm -f .coverage + +echo "== unit ==" >&2 +"$PY" -m coverage run -m unittest discover -t . -s tests/unit + +echo "== integration (skips without Docker) ==" >&2 +"$PY" -m coverage run --append -m unittest discover -t . -s tests/integration + +echo "== combined report ==" >&2 +"$PY" -m coverage report -m + +if [ "${1:-}" = "critical" ]; then + echo "== critical modules (ADR 0004 target: 90%) ==" >&2 + "$PY" -m coverage report --include="$CRITICAL" +fi diff --git a/scripts/diff_coverage.py b/scripts/diff_coverage.py new file mode 100755 index 0000000..546c655 --- /dev/null +++ b/scripts/diff_coverage.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Diff-coverage gate (see docs/decisions/0004-coverage-policy.md). + +Fails if too few of the *added/changed* executable lines on this branch +are covered. Stdlib-only by design — the project carries no runtime deps +and we are not adding `diff-cover` to satisfy a check. + +Reads coverage data already produced by a `coverage run` (e.g. via +`scripts/coverage.sh`): it shells out to `coverage json` for per-line +data and to `git diff` for the changed lines. Lines in omitted files +(the interactive shells) have no coverage data and are skipped, by +policy. + +Usage: + scripts/coverage.sh # produce .coverage first + python3 scripts/diff_coverage.py # gate against origin/main, min 90% + python3 scripts/diff_coverage.py --base main --min 85 +""" + +from __future__ import annotations + +import argparse +import json +import re +import subprocess +import sys +import tempfile +from pathlib import Path + +_HUNK_RE = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@") + + +def _run(cmd: list[str]) -> str: + return subprocess.run( + cmd, check=True, capture_output=True, text=True, + ).stdout + + +def added_lines_by_file(base: str) -> dict[str, set[int]]: + """Map each changed .py file to the set of line numbers added/changed + relative to `base`, parsed from a zero-context unified diff.""" + diff = _run(["git", "diff", "--unified=0", f"{base}...HEAD", "--", "*.py"]) + out: dict[str, set[int]] = {} + current: str | None = None + new_line = 0 + for line in diff.splitlines(): + if line.startswith("+++ b/"): + current = line[6:] + out.setdefault(current, set()) + continue + hunk = _HUNK_RE.match(line) + if hunk: + new_line = int(hunk.group(1)) + continue + if current is None: + continue + if line.startswith("+") and not line.startswith("+++"): + out[current].add(new_line) + new_line += 1 + elif line.startswith("-") and not line.startswith("---"): + # Deletion: does not advance the new-file cursor. + continue + return out + + +def coverage_json() -> dict[str, object]: + """Render the existing .coverage data to JSON and load it.""" + with tempfile.NamedTemporaryFile("r", suffix=".json", delete=True) as fh: + _run([sys.executable, "-m", "coverage", "json", "-o", fh.name]) + return json.load(open(fh.name, encoding="utf-8")) + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--base", default="origin/main", + help="git ref to diff against (default: origin/main)") + ap.add_argument("--min", type=float, default=90.0, + help="minimum %% of changed executable lines covered") + args = ap.parse_args() + + if not Path(".coverage").exists(): + print("diff-coverage: no .coverage data; run scripts/coverage.sh first", + file=sys.stderr) + return 2 + + added = added_lines_by_file(args.base) + files = coverage_json().get("files", {}) + if not isinstance(files, dict): + files = {} + + total = 0 + covered = 0 + misses: list[str] = [] + for path, lines in sorted(added.items()): + info = files.get(path) + if not isinstance(info, dict): + # Omitted file or not measured (e.g. a test file) — skip by policy. + continue + executed = set(info.get("executed_lines", [])) + missing = set(info.get("missing_lines", [])) + executable = lines & (executed | missing) + for ln in sorted(executable): + total += 1 + if ln in executed: + covered += 1 + else: + misses.append(f"{path}:{ln}") + + if total == 0: + print("diff-coverage: no measured changed lines to check — pass") + return 0 + + pct = 100.0 * covered / total + print(f"diff-coverage: {covered}/{total} changed lines covered ({pct:.1f}%)") + if misses: + print("uncovered changed lines:", file=sys.stderr) + for m in misses: + print(f" {m}", file=sys.stderr) + if pct + 1e-9 < args.min: + print(f"diff-coverage: below {args.min:.0f}% threshold", file=sys.stderr) + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/unit/test_cli_dispatch.py b/tests/unit/test_cli_dispatch.py new file mode 100644 index 0000000..4acbc37 --- /dev/null +++ b/tests/unit/test_cli_dispatch.py @@ -0,0 +1,82 @@ +"""Unit: top-level CLI dispatch in bot_bottle.cli.main (ADR 0004). + +`cli/__init__.py` is dispatch + exit-code mapping, not interactive I/O, +so it carries real unit tests rather than being omitted like the +`cli/init` / `cli/tui` shells.""" + +from __future__ import annotations + +import io +import unittest +from unittest.mock import patch + +import bot_bottle.cli as climod +from bot_bottle.cli import main +from bot_bottle.log import Die +from bot_bottle.manifest import ManifestError + + +class TestMainDispatch(unittest.TestCase): + def test_no_args_prints_usage_returns_2(self) -> None: + with patch("sys.stderr", io.StringIO()): + self.assertEqual(2, main([])) + + def test_help_flags_return_0(self) -> None: + with patch("sys.stderr", io.StringIO()): + self.assertEqual(0, main(["-h"])) + self.assertEqual(0, main(["--help"])) + + def test_unknown_command_dies(self) -> None: + with patch("sys.stderr", io.StringIO()): + with self.assertRaises(Die): + main(["definitely-not-a-command"]) + + def test_handler_return_code_passthrough(self) -> None: + def handler(_rest: list[str]) -> int: + return 7 + + with patch.dict(climod.COMMANDS, {"x": handler}): + self.assertEqual(7, main(["x"])) + + def test_handler_none_return_becomes_0(self) -> None: + def handler(_rest: list[str]) -> int | None: + return None + + with patch.dict(climod.COMMANDS, {"x": handler}): + self.assertEqual(0, main(["x"])) + + def test_args_forwarded_to_handler(self) -> None: + seen: list[list[str]] = [] + + def handler(rest: list[str]) -> int: + seen.append(rest) + return 0 + + with patch.dict(climod.COMMANDS, {"x": handler}): + main(["x", "a", "b"]) + self.assertEqual([["a", "b"]], seen) + + def test_manifest_error_maps_to_1(self) -> None: + def boom(_rest: list[str]) -> int: + raise ManifestError("bad manifest") + + with patch.dict(climod.COMMANDS, {"x": boom}), patch("sys.stderr", io.StringIO()): + self.assertEqual(1, main(["x"])) + + def test_die_maps_to_its_code(self) -> None: + def boom(_rest: list[str]) -> int: + raise Die(3) + + with patch.dict(climod.COMMANDS, {"x": boom}): + self.assertEqual(3, main(["x"])) + + def test_keyboard_interrupt_maps_to_130(self) -> None: + def boom(_rest: list[str]) -> int: + raise KeyboardInterrupt() + + with patch.dict(climod.COMMANDS, {"x": boom}): + self.assertEqual(130, main(["x"])) + + +if __name__ == "__main__": + unittest.main()