Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 632ab002ed |
+10
@@ -3,6 +3,16 @@ branch = True
|
||||
source = .
|
||||
|
||||
[report]
|
||||
# Coverage policy: see docs/decisions/0004-coverage-policy.md.
|
||||
#
|
||||
# `omit` is reserved for genuinely interactive entry-point shells whose
|
||||
# bodies are `read_tty_line()` / curses prompt loops — there is no
|
||||
# behaviour to assert that a test wouldn't have to fake wholesale, so a
|
||||
# test here would inflate the number without buying confidence. This is
|
||||
# NOT a place to hide subprocess/backend orchestration: that code is
|
||||
# security-relevant and is measured via the integration suite instead
|
||||
# (run scripts/coverage.sh for the combined unit+integration number).
|
||||
omit =
|
||||
bot_bottle/cli/tui.py
|
||||
bot_bottle/cli/init.py
|
||||
tests/*
|
||||
|
||||
@@ -70,3 +70,32 @@ jobs:
|
||||
|
||||
- name: Run integration tests
|
||||
run: python3 -m unittest discover -t . -s tests/integration -v
|
||||
|
||||
# Combined unit+integration coverage + the diff-coverage gate.
|
||||
# See docs/decisions/0004-coverage-policy.md. The hard gate is diff
|
||||
# coverage (new/changed lines >= 90%); the combined + critical reports
|
||||
# are informational and degrade gracefully when the runner has no
|
||||
# Docker (integration tests skip, those modules just read lower).
|
||||
coverage:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
- name: Install dev requirements
|
||||
run: python3 -m pip install -r requirements-dev.txt
|
||||
|
||||
- name: Combined coverage (unit + integration)
|
||||
run: PYTHON=python3 bash scripts/coverage.sh critical
|
||||
|
||||
- name: Diff-coverage gate (changed lines >= 90%)
|
||||
run: |
|
||||
git fetch --no-tags origin main:refs/remotes/origin/main
|
||||
python3 scripts/diff_coverage.py --base origin/main --min 90
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
# ADR 0004: Risk-weighted coverage, not a single global target
|
||||
|
||||
- **Status:** Accepted
|
||||
- **Date:** 2026-06-25
|
||||
- **Deciders:** didericis
|
||||
|
||||
## Context
|
||||
|
||||
bot-bottle is a security tool: it sandboxes agents, scans egress for
|
||||
secret exfiltration, strips credentials, and gates git pushes. A latent
|
||||
bug in that logic is expensive, so test coverage there genuinely
|
||||
matters. But the repo also contains code where coverage is a poor
|
||||
signal:
|
||||
|
||||
- **Interactive entry-point shells** — `cli/init.py` (a `read_tty_line()`
|
||||
prompt loop) and `cli/tui.py` (a curses picker). Their bodies are I/O;
|
||||
a unit test has to fake the entire terminal conversation, so it
|
||||
inflates the number without asserting behaviour that would otherwise
|
||||
go unchecked.
|
||||
- **Subprocess / backend orchestration** — the docker / smolmachines /
|
||||
macos-container backends shell out to `docker`, `container`, `smolvm`.
|
||||
Mock-heavy unit tests here mostly re-assert the argv you already
|
||||
wrote (the test passes whether or not the real teardown works), while
|
||||
many of the missed *branches* are failure paths you cannot provoke
|
||||
against a real daemon on cue.
|
||||
|
||||
Chasing a single global percentage (e.g. 90%) pushes the most test
|
||||
effort onto the least safety-relevant code — exactly backwards — and
|
||||
invites performative tests written to colour a line rather than to catch
|
||||
a regression (Goodhart's law).
|
||||
|
||||
## Decision
|
||||
|
||||
Coverage is **risk-weighted**, measured over the **combined unit +
|
||||
integration** suites, with three rules:
|
||||
|
||||
1. **Critical modules target ≥ 90%.** The security/logic core —
|
||||
`egress_addon{,_core}.py`, `dlp_detectors.py`, `egress.py`,
|
||||
`manifest*.py`, `git_gate.py`, `git_http_backend.py`, `supervise.py`,
|
||||
`yaml_subset.py`, `bottle_state.py` — is Docker-independent and
|
||||
unit-testable, so it carries the high bar. We ratchet toward 90% as
|
||||
these modules are touched; new gaps in them are not acceptable.
|
||||
|
||||
2. **Subprocess/backend orchestration is covered by the integration
|
||||
suite, not omitted.** `scripts/coverage.sh` runs unit + integration
|
||||
under one coverage measurement so these modules are scored where they
|
||||
are actually exercised. They stay *visible* — hiding the code that
|
||||
tears down sandboxes and wires networks is the one place we will not
|
||||
omit.
|
||||
|
||||
3. **Interactive entry-point shells are omitted** (`.coveragerc`), with a
|
||||
rationale comment. This is the only sanctioned use of `omit` besides
|
||||
`tests/*`.
|
||||
|
||||
The forward-looking guard is a **diff-coverage gate**
|
||||
(`scripts/diff_coverage.py`): new/changed executable lines on a branch
|
||||
must be ≥ 90% covered. This catches regressions where they are
|
||||
introduced without forcing a back-fill crusade through legacy glue. The
|
||||
gate skips lines in omitted files (there is no coverage data for them),
|
||||
so the omit list cannot launder *new* logic into the dark: anything that
|
||||
needs real testing must live outside the interactive shells to be
|
||||
scored at all.
|
||||
|
||||
The **global percentage is informational**, not a CI gate — it would
|
||||
otherwise be hostage to the CI runner's Docker availability and to the
|
||||
omit list.
|
||||
|
||||
## Consequences
|
||||
|
||||
- The number we report (`scripts/coverage.sh`) means "coverage of the
|
||||
code we consider testable, across both suites" — a dip is a real
|
||||
regression in code we control, not noise from added CLI glue.
|
||||
- No incentive to write mock-the-mock tests for orchestration to defend
|
||||
a global figure.
|
||||
- The omit list needs governance: an entry must be a genuinely
|
||||
interactive shell, justified in the `.coveragerc` comment and here.
|
||||
`cli/init.py` and `cli/tui.py` qualify; backend orchestration does
|
||||
not.
|
||||
- CI must run the integration suite under coverage to score the
|
||||
orchestration modules; where the runner lacks Docker those tests skip
|
||||
and their modules read low — accepted, because the *enforced* gates
|
||||
(critical-module standard + diff coverage) are Docker-independent.
|
||||
- "We're at N%" is now a curated figure; outsiders should read the
|
||||
policy, not just the badge.
|
||||
|
||||
## Links
|
||||
|
||||
- PRs #290 (cover the egress adapter), and the coverage-policy PR that
|
||||
introduces this record.
|
||||
- `.coveragerc`, `scripts/coverage.sh`, `scripts/diff_coverage.py`.
|
||||
Executable
+41
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env bash
|
||||
# Combined unit + integration coverage (see docs/decisions/0004-coverage-policy.md).
|
||||
#
|
||||
# Runs the unit suite, then appends the integration suite (which skips
|
||||
# cleanly when Docker / the backend CLIs are unavailable), and prints one
|
||||
# combined report. The integration suite is what scores the subprocess /
|
||||
# backend orchestration modules, so the number here is the policy's
|
||||
# yardstick — not the unit-only badge.
|
||||
#
|
||||
# Usage:
|
||||
# scripts/coverage.sh # combined report
|
||||
# scripts/coverage.sh critical # also report just the critical modules
|
||||
set -euo pipefail
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
PY="${PYTHON:-python3}"
|
||||
|
||||
# Critical security/logic core held to the high bar by ADR 0004.
|
||||
CRITICAL="bot_bottle/egress_addon.py,bot_bottle/egress_addon_core.py,\
|
||||
bot_bottle/dlp_detectors.py,bot_bottle/egress.py,bot_bottle/manifest.py,\
|
||||
bot_bottle/manifest_egress.py,bot_bottle/manifest_agent.py,\
|
||||
bot_bottle/manifest_schema.py,bot_bottle/git_gate.py,\
|
||||
bot_bottle/git_http_backend.py,bot_bottle/supervise.py,\
|
||||
bot_bottle/yaml_subset.py,bot_bottle/bottle_state.py"
|
||||
|
||||
rm -f .coverage
|
||||
|
||||
echo "== unit ==" >&2
|
||||
"$PY" -m coverage run -m unittest discover -t . -s tests/unit
|
||||
|
||||
echo "== integration (skips without Docker) ==" >&2
|
||||
"$PY" -m coverage run --append -m unittest discover -t . -s tests/integration
|
||||
|
||||
echo "== combined report ==" >&2
|
||||
"$PY" -m coverage report -m
|
||||
|
||||
if [ "${1:-}" = "critical" ]; then
|
||||
echo "== critical modules (ADR 0004 target: 90%) ==" >&2
|
||||
"$PY" -m coverage report --include="$CRITICAL"
|
||||
fi
|
||||
Executable
+126
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Diff-coverage gate (see docs/decisions/0004-coverage-policy.md).
|
||||
|
||||
Fails if too few of the *added/changed* executable lines on this branch
|
||||
are covered. Stdlib-only by design — the project carries no runtime deps
|
||||
and we are not adding `diff-cover` to satisfy a check.
|
||||
|
||||
Reads coverage data already produced by a `coverage run` (e.g. via
|
||||
`scripts/coverage.sh`): it shells out to `coverage json` for per-line
|
||||
data and to `git diff` for the changed lines. Lines in omitted files
|
||||
(the interactive shells) have no coverage data and are skipped, by
|
||||
policy.
|
||||
|
||||
Usage:
|
||||
scripts/coverage.sh # produce .coverage first
|
||||
python3 scripts/diff_coverage.py # gate against origin/main, min 90%
|
||||
python3 scripts/diff_coverage.py --base main --min 85
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
_HUNK_RE = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
|
||||
|
||||
|
||||
def _run(cmd: list[str]) -> str:
|
||||
return subprocess.run(
|
||||
cmd, check=True, capture_output=True, text=True,
|
||||
).stdout
|
||||
|
||||
|
||||
def added_lines_by_file(base: str) -> dict[str, set[int]]:
|
||||
"""Map each changed .py file to the set of line numbers added/changed
|
||||
relative to `base`, parsed from a zero-context unified diff."""
|
||||
diff = _run(["git", "diff", "--unified=0", f"{base}...HEAD", "--", "*.py"])
|
||||
out: dict[str, set[int]] = {}
|
||||
current: str | None = None
|
||||
new_line = 0
|
||||
for line in diff.splitlines():
|
||||
if line.startswith("+++ b/"):
|
||||
current = line[6:]
|
||||
out.setdefault(current, set())
|
||||
continue
|
||||
hunk = _HUNK_RE.match(line)
|
||||
if hunk:
|
||||
new_line = int(hunk.group(1))
|
||||
continue
|
||||
if current is None:
|
||||
continue
|
||||
if line.startswith("+") and not line.startswith("+++"):
|
||||
out[current].add(new_line)
|
||||
new_line += 1
|
||||
elif line.startswith("-") and not line.startswith("---"):
|
||||
# Deletion: does not advance the new-file cursor.
|
||||
continue
|
||||
return out
|
||||
|
||||
|
||||
def coverage_json() -> dict[str, object]:
|
||||
"""Render the existing .coverage data to JSON and load it."""
|
||||
with tempfile.NamedTemporaryFile("r", suffix=".json", delete=True) as fh:
|
||||
_run([sys.executable, "-m", "coverage", "json", "-o", fh.name])
|
||||
return json.load(open(fh.name, encoding="utf-8"))
|
||||
|
||||
|
||||
def main() -> int:
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--base", default="origin/main",
|
||||
help="git ref to diff against (default: origin/main)")
|
||||
ap.add_argument("--min", type=float, default=90.0,
|
||||
help="minimum %% of changed executable lines covered")
|
||||
args = ap.parse_args()
|
||||
|
||||
if not Path(".coverage").exists():
|
||||
print("diff-coverage: no .coverage data; run scripts/coverage.sh first",
|
||||
file=sys.stderr)
|
||||
return 2
|
||||
|
||||
added = added_lines_by_file(args.base)
|
||||
files = coverage_json().get("files", {})
|
||||
if not isinstance(files, dict):
|
||||
files = {}
|
||||
|
||||
total = 0
|
||||
covered = 0
|
||||
misses: list[str] = []
|
||||
for path, lines in sorted(added.items()):
|
||||
info = files.get(path)
|
||||
if not isinstance(info, dict):
|
||||
# Omitted file or not measured (e.g. a test file) — skip by policy.
|
||||
continue
|
||||
executed = set(info.get("executed_lines", []))
|
||||
missing = set(info.get("missing_lines", []))
|
||||
executable = lines & (executed | missing)
|
||||
for ln in sorted(executable):
|
||||
total += 1
|
||||
if ln in executed:
|
||||
covered += 1
|
||||
else:
|
||||
misses.append(f"{path}:{ln}")
|
||||
|
||||
if total == 0:
|
||||
print("diff-coverage: no measured changed lines to check — pass")
|
||||
return 0
|
||||
|
||||
pct = 100.0 * covered / total
|
||||
print(f"diff-coverage: {covered}/{total} changed lines covered ({pct:.1f}%)")
|
||||
if misses:
|
||||
print("uncovered changed lines:", file=sys.stderr)
|
||||
for m in misses:
|
||||
print(f" {m}", file=sys.stderr)
|
||||
if pct + 1e-9 < args.min:
|
||||
print(f"diff-coverage: below {args.min:.0f}% threshold", file=sys.stderr)
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,82 @@
|
||||
"""Unit: top-level CLI dispatch in bot_bottle.cli.main (ADR 0004).
|
||||
|
||||
`cli/__init__.py` is dispatch + exit-code mapping, not interactive I/O,
|
||||
so it carries real unit tests rather than being omitted like the
|
||||
`cli/init` / `cli/tui` shells."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
import bot_bottle.cli as climod
|
||||
from bot_bottle.cli import main
|
||||
from bot_bottle.log import Die
|
||||
from bot_bottle.manifest import ManifestError
|
||||
|
||||
|
||||
class TestMainDispatch(unittest.TestCase):
|
||||
def test_no_args_prints_usage_returns_2(self) -> None:
|
||||
with patch("sys.stderr", io.StringIO()):
|
||||
self.assertEqual(2, main([]))
|
||||
|
||||
def test_help_flags_return_0(self) -> None:
|
||||
with patch("sys.stderr", io.StringIO()):
|
||||
self.assertEqual(0, main(["-h"]))
|
||||
self.assertEqual(0, main(["--help"]))
|
||||
|
||||
def test_unknown_command_dies(self) -> None:
|
||||
with patch("sys.stderr", io.StringIO()):
|
||||
with self.assertRaises(Die):
|
||||
main(["definitely-not-a-command"])
|
||||
|
||||
def test_handler_return_code_passthrough(self) -> None:
|
||||
def handler(_rest: list[str]) -> int:
|
||||
return 7
|
||||
|
||||
with patch.dict(climod.COMMANDS, {"x": handler}):
|
||||
self.assertEqual(7, main(["x"]))
|
||||
|
||||
def test_handler_none_return_becomes_0(self) -> None:
|
||||
def handler(_rest: list[str]) -> int | None:
|
||||
return None
|
||||
|
||||
with patch.dict(climod.COMMANDS, {"x": handler}):
|
||||
self.assertEqual(0, main(["x"]))
|
||||
|
||||
def test_args_forwarded_to_handler(self) -> None:
|
||||
seen: list[list[str]] = []
|
||||
|
||||
def handler(rest: list[str]) -> int:
|
||||
seen.append(rest)
|
||||
return 0
|
||||
|
||||
with patch.dict(climod.COMMANDS, {"x": handler}):
|
||||
main(["x", "a", "b"])
|
||||
self.assertEqual([["a", "b"]], seen)
|
||||
|
||||
def test_manifest_error_maps_to_1(self) -> None:
|
||||
def boom(_rest: list[str]) -> int:
|
||||
raise ManifestError("bad manifest")
|
||||
|
||||
with patch.dict(climod.COMMANDS, {"x": boom}), patch("sys.stderr", io.StringIO()):
|
||||
self.assertEqual(1, main(["x"]))
|
||||
|
||||
def test_die_maps_to_its_code(self) -> None:
|
||||
def boom(_rest: list[str]) -> int:
|
||||
raise Die(3)
|
||||
|
||||
with patch.dict(climod.COMMANDS, {"x": boom}):
|
||||
self.assertEqual(3, main(["x"]))
|
||||
|
||||
def test_keyboard_interrupt_maps_to_130(self) -> None:
|
||||
def boom(_rest: list[str]) -> int:
|
||||
raise KeyboardInterrupt()
|
||||
|
||||
with patch.dict(climod.COMMANDS, {"x": boom}):
|
||||
self.assertEqual(130, main(["x"]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user