docs: draft PRD prd-new for strengthen-outbound-exfil-detection

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
feat(log): add leveled severity and structured context to log wrappers
2026-06-25 00:54:28 +00:00 · 2026-06-24 15:37:57 -04:00 · 2026-06-24 09:32:19 -04:00
5 changed files with 414 additions and 12 deletions
@@ -292,7 +292,10 @@ def cmd_supervise(argv: list[str]) -> int:
        return e.code if isinstance(e.code, int) else 1
    except Exception as e:  # noqa: W0718 — catch supervise crash for logging
        log_path = _write_crash_log(e)
-        error(f"supervise crashed: {type(e).__name__}: {e}")
+        error(
            f"supervise crashed: {type(e).__name__}: {e}",
            context={"error_type": type(e).__name__, "crash_log": str(log_path)},
        )
        error(f"full traceback written to {log_path}")
        return 1
    return 0
@@ -1,21 +1,107 @@
-"""Tiny logging wrappers. All output goes to stderr."""
+"""Tiny logging wrappers. All output goes to stderr.
 Two capabilities layer onto the bare wrappers (issue #252):
  - **Levels.** `debug` / `info` / `warn` / `error` carry an ordered
    severity. Output is gated by `BOT_BOTTLE_LOG_LEVEL` (debug | info |
    warn | error; default `info`). A message emits when its severity is
    at or above the threshold, so `debug` is silent by default and
    `error` always surfaces (nothing sits above it) — which keeps the
    fatal `die` path visible regardless of the configured level.
  - **Context.** Every wrapper takes an optional `context` mapping that
    renders as a parseable ` [k=v ...]` suffix (keys sorted; values with
    whitespace/quotes are quoted), so failures can be filtered and
    correlated instead of being flat strings.
 With no `context` and the default level, output is byte-identical to the
 original `bot-bottle: <msg>` / `bot-bottle: warning: <msg>` /
 `bot-bottle: error: <msg>` lines — the 100+ existing call sites are
 unaffected.
 """
 from __future__ import annotations
 import os
 import sys
-from typing import NoReturn
+from typing import Mapping, NoReturn
 # Ordered severities. Gaps left between values so intermediate levels
 # can be added later without renumbering.
 DEBUG = 10
 INFO = 20
 WARN = 30
 ERROR = 40
 # Accepted spellings of a level name mapped to their severity. Keys are
 # lower-case; "warning" is an alias for "warn".
 _LEVEL_NAMES: dict[str, int] = {
    "debug": DEBUG,
    "info": INFO,
    "warn": WARN,
    "warning": WARN,
    "error": ERROR,
 }
 # Default threshold when BOT_BOTTLE_LOG_LEVEL is unset or unrecognised.
 _DEFAULT_THRESHOLD = INFO
 # Name of the environment variable that selects the threshold.
 _LOG_LEVEL_ENV = "BOT_BOTTLE_LOG_LEVEL"
-def info(msg: str) -> None:
+def _threshold() -> int:
-    print(f"bot-bottle: {msg}", file=sys.stderr)
+    """Resolve the active level threshold from the environment.
    Read per-call (not cached) so the level can be changed at runtime
    and so tests can patch `os.environ` without a reload. Unknown values
    fall back to the default rather than raising — logging must never be
    the thing that crashes the process."""
    raw = os.environ.get(_LOG_LEVEL_ENV, "")
    return _LEVEL_NAMES.get(raw.strip().lower(), _DEFAULT_THRESHOLD)
-def warn(msg: str) -> None:
+def _format_context(context: Mapping[str, object] | None) -> str:
-    print(f"bot-bottle: warning: {msg}", file=sys.stderr)
+    """Render a context mapping as a ` [k=v k2=v2]` suffix.
    Keys are sorted for stable, diffable output. Values that are empty or
    contain whitespace or a quote are wrapped in double quotes (with inner
    quotes escaped) so each `k=v` pair stays parseable. Empty/None context
    renders as the empty string."""
    if not context:
        return ""
    parts: list[str] = []
    for key in sorted(context):
        value = str(context[key])
        if value == "" or any(ch.isspace() for ch in value) or '"' in value:
            value = '"' + value.replace('"', '\\"') + '"'
        parts.append(f"{key}={value}")
    return " [" + " ".join(parts) + "]"
-def error(msg: str) -> None:
+def _emit(
-    print(f"bot-bottle: error: {msg}", file=sys.stderr)
+    level: int,
    label: str,
    msg: str,
    context: Mapping[str, object] | None,
 ) -> None:
    if level < _threshold():
        return
    prefix = f"{label}: " if label else ""
    sys.stderr.write(f"bot-bottle: {prefix}{msg}{_format_context(context)}\n")
 def debug(msg: str, *, context: Mapping[str, object] | None = None) -> None:
    """Log `msg` at DEBUG severity under the `debug: ` label, with optional context."""
    _emit(level=DEBUG, label="debug", msg=msg, context=context)
 def info(msg: str, *, context: Mapping[str, object] | None = None) -> None:
    """Log `msg` at INFO severity with no label prefix, with optional context."""
    _emit(level=INFO, label="", msg=msg, context=context)
 def warn(msg: str, *, context: Mapping[str, object] | None = None) -> None:
    """Log `msg` at WARN severity under the `warning: ` label, with optional context."""
    _emit(level=WARN, label="warning", msg=msg, context=context)
 def error(msg: str, *, context: Mapping[str, object] | None = None) -> None:
    """Log `msg` at ERROR severity under the `error: ` label, with optional context."""
    _emit(level=ERROR, label="error", msg=msg, context=context)
 class Die(SystemExit):
@@ -31,6 +117,6 @@ class Die(SystemExit):
        self.message = message
-def die(msg: str) -> NoReturn:
+def die(msg: str, *, context: Mapping[str, object] | None = None) -> NoReturn:
-    error(msg)
+    error(msg, context=context)
    raise Die(1, msg)
@@ -0,0 +1,186 @@
 # PRD prd-new: Strengthen outbound exfiltration detection
 - **Status:** Draft
 - **Author:** claude
 - **Created:** 2026-06-25
 - **Issue:** #259
 ## Summary
 The current DLP system has two detection tiers: weak (vendor token regexes)
 and strong (exact matching of provisioned `EGRESS_TOKEN_*` values in multiple
 encodings). This PRD adds four complementary hardening layers in ranked order
 of impact:
 1. **Canary tokens** — inject a per-session fake secret into the agent's
   environment; any outbound appearance is a zero-false-positive exfil signal.
 2. **Broadened known-value scanning** — extend the strong tier beyond
   `EGRESS_TOKEN_*` via a `BOT_BOTTLE_SENSITIVE_PREFIXES` sidecar env var.
 3. **Fragmentation-resistant matching** — catch separator injection and
   long-substring leaks via alnum projection and sliding-window substring
   search.
 4. **Entropy scoring** — warn-only flag on high-entropy blobs as a
   complementary signal for encrypted exfil.
 ## Problem
 After PRD 0056 the outbound scan covers all HTTP request surfaces (host, path,
 query, headers, body). But the content-based strong tier only matches
 `EGRESS_TOKEN_*` values and their encodings. Three gaps remain:
 | Gap | How an agent exploits it |
 |-----|--------------------------|
 | No canary | There is no "can't be false-positive" signal |
 | Narrow prefix filter | Sensitive values provisioned under other env prefixes (MCP keys, API keys injected via the agent provider) are invisible to `scan_known_secrets` |
 | Fragmentation blindness | Inserting `-`, space, or `\n` between secret characters turns an exact-match into a miss |
 ## Goals / Success Criteria
 1. Each launched bottle has a unique canary token in the agent's environment
   (`BOT_BOTTLE_CANARY`) and the egress sidecar's environment
   (`EGRESS_TOKEN_CANARY`). Any outbound appearance of the canary blocks the
   request with reason `"canary token"`.
 2. `scan_known_secrets` accepts a `sensitive_prefixes` parameter (default:
   `("EGRESS_TOKEN_",)`). `scan_outbound` reads
   `BOT_BOTTLE_SENSITIVE_PREFIXES` from `environ` and merges those prefixes
   in, so operators can mark additional env vars as scanned values without
   changing the manifest schema.
 3. For every secret that the exact-match pass does not find, a secondary
   alnum-projection pass compares the secret and the text with all
   non-alphanumeric characters stripped from both. This catches
   separator-injection evasion (secret `MY-SECRET` → body contains
   `MY SECRET`; both project to `MYSECRET`).
 4. A sliding-window partial-match pass checks for long-enough contiguous
   substrings of the secret's alnum projection in the text's alnum projection.
   Any match ≥ `PARTIAL_MATCH_MIN_LEN` (12 chars) blocks with reason
   `"partial match"`.
 5. A new `scan_entropy` detector flags outbound text windows with Shannon
   entropy ≥ `ENTROPY_BLOCK_THRESHOLD` (5.5 bits/char) at **warn** severity
   only. It is registered under the new detector name `"entropy"` in
   `OUTBOUND_DETECTOR_NAMES` and disabled by default (routes must opt in).
 6. Request bodies that are not valid UTF-8 are decoded via `latin-1` instead
   of `utf-8` with `errors="replace"`, preserving every byte value and allowing
   ASCII-range secrets to be found within binary payloads.
 7. All new behaviour is unit-tested; existing tests pass unchanged.
 ## Non-goals
 - Rolling per-host buffer for split-across-requests detection (state in the
  stateless addon is complex; deferred).
 - Additional vendor regexes.
 - ML / embedding-based detection.
 - Entropy-based hard blocks (warn only per the issue).
 ## Design
 ### Canary token flow
 ```
 Egress.prepare()
  canary = secrets.token_urlsafe(32)
  EgressPlan(canary=canary, ...)
 Docker compose render:
  sidecar env: EGRESS_TOKEN_CANARY=<canary>   ← scanned by existing known-secrets detector
  agent env:   BOT_BOTTLE_CANARY=<canary>      ← visible to agent as a "secret"
 macos-container launch: same literals added to sidecar + agent env entries
 ```
 `EGRESS_TOKEN_CANARY` matches the `EGRESS_TOKEN_` prefix already scanned by
 `scan_known_secrets`, so no new matching logic is required for canary
 detection — only the injection path, plus labelling a hit on
 `EGRESS_TOKEN_CANARY` with the `"canary token"` reason that Goal 1 requires.
 ### Broadened known-value scanning
 `scan_known_secrets` gains a `sensitive_prefixes` parameter:
 ```python
 def scan_known_secrets(
    text: str,
    *,
    location: str = "body",
    env: Mapping[str, str] | None = None,
    sensitive_prefixes: tuple[str, ...] = ("EGRESS_TOKEN_",),
 ) -> ScanResult | None:
 ```
 `scan_outbound` reads `BOT_BOTTLE_SENSITIVE_PREFIXES` (comma-separated list
 of additional prefixes) from `environ` and appends them:
 ```python
 extra = tuple(
    p for p in environ.get("BOT_BOTTLE_SENSITIVE_PREFIXES", "").split(",") if p
 )
 sensitive_prefixes = ("EGRESS_TOKEN_",) + extra
 ```
 `redact_tokens` receives the same treatment for consistent redaction.
 ### Fragmentation-resistant matching
 A new helper `_alnum_projection(text)` strips all non-alphanumeric characters.
 `scan_known_secrets` runs three passes per secret:
 1. **Exact pass** — existing encoded-variant loop (unchanged).
 2. **Alnum-projection pass** — if the secret's alnum projection has ≥ 8 chars,
   check if it appears in the text's alnum projection. Match → block with
   `"fragmented match (separator injection)"` reason.
 3. **Partial-substring pass** — if the secret's alnum projection has ≥
   `PARTIAL_MATCH_MIN_LEN` chars (12), slide a window of that length across the
   secret's projection and look for each window in the text's alnum projection.
   First match → block with `"partial match"` reason.
 All three passes run only for the `"known_secrets"` detector; the token-pattern
 and entropy detectors are unchanged.
 ### Entropy scoring
 New public function:
 ```python
 def scan_entropy(
    text: str,
    *,
    location: str = "body",
    window: int = ENTROPY_WINDOW,           # 64
    threshold: float = ENTROPY_BLOCK_THRESHOLD,  # 5.5
 ) -> ScanResult | None:
 ```
 Slides a window of `window` characters across `text` in steps of `window // 2`.
 If any window's Shannon entropy is at or above `threshold` (≥, as in Goal 5), returns a **warn**-severity
 `ScanResult`. Never blocks.
 `OUTBOUND_DETECTOR_NAMES` gains `"entropy"`. Routes opt in via their `dlp`
 block; entropy scanning is **off by default** to avoid false-positive noise on
 legitimate binary payloads.
 ### Binary body handling
 In `scan_outbound`, the bytes → str decoding changes from:
 ```python
 body.decode("utf-8", errors="replace")
 ```
 to:
 ```python
 try:
    text = body.decode("utf-8")
 except UnicodeDecodeError:
    text = body.decode("latin-1")
 ```
 `latin-1` is a bijective byte↔codepoint mapping; every byte value is preserved
 as its corresponding Latin-1 code point, so ASCII-range secret strings remain
 intact and `str.find` / regex still locate them correctly. Strict UTF-8 is
 tried first so valid UTF-8 bodies are decoded faithfully; `latin-1` is only the
 fallback when strict decoding fails.
 ## Implementation
 Delivered in three commits on the same branch:
 1. **DLP detector changes** — `_alnum_projection`, fragmentation passes,
   `scan_entropy`, broadened `scan_known_secrets`, updated `scan_outbound` and
   `redact_tokens`; all accompanying unit tests.
 2. **Canary injection** — `EgressPlan.canary`, `Egress.prepare()`,
   Docker compose + macos-container backend injection.
 3. **PRD flip** — `Status: Draft → Active`.
@@ -22,7 +22,7 @@ escapes**, and **whether credentials are short-lived and scoped**.
 - Outbound: Docker containers have full internet access by default; no egress monitoring on most home networks
 - Lateral movement: compromised container can reach the LAN — NAS, other machines, internal services
 - Notable: CVE-2025-59536 (CVSS 8.7, Feb 2026) — a poisoned `.claude/settings.json` in a repo gives RCE when Claude Code opens it. `--dangerously-skip-permissions` removes the last gate.
- Supply chain: MCP servers, skills, and npm packages pulled during agent execution. ~20% of ClawHub skills were found malicious in early 2026.
+- Supply chain: MCP servers, skills, and npm packages pulled during agent execution. A Jan 2026 large-scale empirical study of a 98,380-skill snapshot confirmed 157 malicious skills, ~71% of them credential harvesters. Exfiltration was overwhelmingly naive — plaintext HTTP to hardcoded endpoints; under 10% used any code obfuscation, and concealment was mostly at the documentation level, not the code level. ([Malicious Agent Skills in the Wild](https://arxiv.org/html/2602.06547v1), arXiv:2602.06547)
 **What local topology protects:**
 - No inbound attack surface — nothing listening on a public port
@@ -0,0 +1,127 @@
 """Unit: leveled + structured logging wrappers (issue #252).
 Locks three properties of bot_bottle.log:
  - backward compatibility — default output is byte-identical to the
    original bare wrappers, so the 100+ existing single-string call
    sites are unaffected;
  - context rendering — an optional mapping becomes a parseable
    ` [k=v ...]` suffix;
  - level gating — BOT_BOTTLE_LOG_LEVEL filters by severity, debug is
    silent by default, and error always surfaces.
 """
 from __future__ import annotations
 import contextlib
 import io
 import unittest
 from typing import Callable
 from unittest import mock
 from bot_bottle import log
 def _capture(
    fn: Callable[..., None],
    *args: object,
    env: dict[str, str] | None = None,
    **kwargs: object,
 ) -> str:
    buf = io.StringIO()
    patched = mock.patch.dict("os.environ", env or {}, clear=False)
    with patched, contextlib.redirect_stderr(buf):
        fn(*args, **kwargs)
    return buf.getvalue()
 class TestBackwardCompat(unittest.TestCase):
    """Without context, at the default level, output equals the legacy lines."""
    def test_info(self):
        captured = _capture(log.info, "hello")
        self.assertEqual("bot-bottle: hello\n", captured)
    def test_warn(self):
        captured = _capture(log.warn, "careful")
        self.assertEqual("bot-bottle: warning: careful\n", captured)
    def test_error(self):
        captured = _capture(log.error, "boom")
        self.assertEqual("bot-bottle: error: boom\n", captured)
 class TestContext(unittest.TestCase):
    """A context mapping is rendered as a sorted ` [k=v ...]` suffix."""
    def test_appends_sorted_parseable_suffix(self):
        fields = {"slug": "abc123", "code": "-32603"}
        rendered = _capture(log.error, "rpc failed", context=fields)
        # "code" sorts ahead of "slug"
        self.assertEqual(
            "bot-bottle: error: rpc failed [code=-32603 slug=abc123]\n", rendered
        )
    def test_quotes_values_with_whitespace(self):
        fields = {"path": "/a b/c", "ok": "yes"}
        rendered = _capture(log.info, "did thing", context=fields)
        self.assertEqual('bot-bottle: did thing [ok=yes path="/a b/c"]\n', rendered)
    def test_empty_context_is_noop_suffix(self):
        rendered = _capture(log.info, "x", context={})
        self.assertEqual("bot-bottle: x\n", rendered)
 class TestLevels(unittest.TestCase):
    """BOT_BOTTLE_LOG_LEVEL filters output by severity."""
    def test_debug_silent_by_default(self):
        emitted = _capture(log.debug, "trace")
        self.assertEqual("", emitted)
    def test_debug_emits_when_level_lowered(self):
        lowered = {"BOT_BOTTLE_LOG_LEVEL": "debug"}
        self.assertEqual(
            "bot-bottle: debug: trace\n", _capture(log.debug, "trace", env=lowered)
        )
    def test_error_level_suppresses_info_and_warn(self):
        errors_only = {"BOT_BOTTLE_LOG_LEVEL": "error"}
        for quiet_fn, text in ((log.info, "i"), (log.warn, "w")):
            self.assertEqual("", _capture(quiet_fn, text, env=errors_only))
        # ERROR is the highest severity, so this threshold never filters it
        self.assertEqual(
            "bot-bottle: error: e\n", _capture(log.error, "e", env=errors_only)
        )
    def test_unknown_level_falls_back_to_default(self):
        # an unrecognised name resolves to the INFO default, so info prints
        bogus = {"BOT_BOTTLE_LOG_LEVEL": "loud"}
        self.assertEqual("bot-bottle: i\n", _capture(log.info, "i", env=bogus))
    def test_warning_alias_accepted(self):
        alias = {"BOT_BOTTLE_LOG_LEVEL": "warning"}
        self.assertEqual("", _capture(log.info, "i", env=alias))
        warned = _capture(log.warn, "w", env=alias)
        self.assertEqual("bot-bottle: warning: w\n", warned)
 class TestDie(unittest.TestCase):
    """`die` prints the error line and raises `Die` carrying the message."""
    def test_die_still_raises_and_prints_error(self):
        stderr = io.StringIO()
        with contextlib.redirect_stderr(stderr), self.assertRaises(log.Die) as raised:
            log.die("fatal thing")
        self.assertEqual("fatal thing", raised.exception.message)
        self.assertIn("bot-bottle: error: fatal thing", stderr.getvalue())
    def test_die_surfaces_even_at_error_level(self):
        stderr = io.StringIO()
        errors_only = mock.patch.dict("os.environ", {"BOT_BOTTLE_LOG_LEVEL": "error"})
        with errors_only, contextlib.redirect_stderr(stderr), self.assertRaises(log.Die):
            log.die("still fatal")
        self.assertIn("bot-bottle: error: still fatal", stderr.getvalue())
 # Allow running this test module directly as a script.
 if __name__ == "__main__":
    unittest.main()