feat(dlp): websocket scanning, response headers, extended encoding variants, sk-proj pattern (PRD 0053)
lint / lint (push) Successful in 1m24s
test / unit (pull_request) Successful in 32s
test / integration (pull_request) Successful in 42s

This commit is contained in:
2026-06-06 17:59:36 +00:00
parent 9954273d26
commit baf1908f76
6 changed files with 300 additions and 33 deletions
+34 -10
View File
@@ -11,6 +11,7 @@ the same try/except import shim pattern.
from __future__ import annotations
import base64
import gzip
import re
import typing
from urllib.parse import quote as url_quote
@@ -31,6 +32,7 @@ TOKEN_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = (
("GitHub fine-grained token", re.compile(r"github_pat_[A-Za-z0-9_]{82}")),
("Anthropic API key", re.compile(r"sk-ant-[A-Za-z0-9\-_]{93}")),
("OpenAI API key", re.compile(r"sk-[A-Za-z0-9]{48}")),
("OpenAI project API key", re.compile(r"sk-proj-[A-Za-z0-9_\-]{48,}")),
("Stripe live key", re.compile(r"sk_live_[A-Za-z0-9]{24}")),
("Generic Bearer JWT", re.compile(r"Bearer\s+[A-Za-z0-9._\-]{50,}")),
)
@@ -51,18 +53,40 @@ def scan_token_patterns(text: str) -> ScanResult | None:
# ---------------------------------------------------------------------------
def _encoded_variants(secret: str) -> list[str]:
"""Return the secret plus base64, URL-encoded, and hex variants."""
variants = [secret]
"""Return the secret plus common encoded variants for exfil detection."""
seen: set[str] = {secret}
variants: list[str] = [secret]
def _add(v: str) -> None:
if v not in seen:
seen.add(v)
variants.append(v)
secret_bytes = secret.encode("utf-8")
# Standard base64 — with and without padding
b64 = base64.b64encode(secret_bytes).decode("ascii")
if b64 != secret:
variants.append(b64)
url_enc = url_quote(secret, safe="")
if url_enc != secret:
variants.append(url_enc)
hex_enc = secret_bytes.hex()
if hex_enc != secret:
variants.append(hex_enc)
_add(b64)
_add(b64.rstrip("="))
# URL-safe base64 (JWT/OAuth use -_ alphabet) — with and without padding
b64url = base64.urlsafe_b64encode(secret_bytes).decode("ascii")
_add(b64url)
_add(b64url.rstrip("="))
# URL percent-encoding
_add(url_quote(secret, safe=""))
# Hex — lowercase and uppercase
_add(secret_bytes.hex())
_add(secret_bytes.hex().upper())
# Base32 (TOTP seeds, some DNS-exfil channels)
_add(base64.b32encode(secret_bytes).decode("ascii"))
# gzip + base64 (deterministic: mtime=0); recognisable by H4sI prefix
_add(base64.b64encode(gzip.compress(secret_bytes, mtime=0)).decode("ascii"))
return variants
+35 -3
View File
@@ -16,6 +16,7 @@ from mitmproxy import http # type: ignore[import-not-found]
from egress_addon_core import ( # type: ignore[import-not-found]
Route,
build_inbound_scan_text,
build_outbound_scan_text,
decide,
is_git_push_request,
@@ -148,16 +149,18 @@ class EgressAddon:
flow.request.headers["authorization"] = decision.inject_authorization
def response(self, flow: http.HTTPFlow) -> None:
"""DLP inbound scan on response bodies (PRD 0053)."""
"""DLP inbound scan on response headers and body."""
route = match_route(self.routes, flow.request.pretty_host)
if route is None:
return
if flow.response is None:
return
resp_headers = {k.lower(): v for k, v in flow.response.headers.items()}
body = flow.response.get_text(strict=False) or ""
if not body:
scan_text = build_inbound_scan_text(resp_headers, body)
if not scan_text:
return
result = scan_inbound(route, body)
result = scan_inbound(route, scan_text)
if result is None:
return
if result.severity == "block":
@@ -165,5 +168,34 @@ class EgressAddon:
elif result.severity == "warn":
sys.stderr.write(f"egress DLP warn: {result.reason}\n")
def websocket_message(self, flow: http.HTTPFlow) -> None:
"""DLP scan on WebSocket frames.
Outbound frames (from_client) are scanned for credential leakage;
inbound frames are scanned for prompt injection. On a block the
entire connection is killed — there is no HTTP response surface to
write to after the upgrade.
"""
if flow.websocket is None: # type: ignore[union-attr]
return
route = match_route(self.routes, flow.request.pretty_host)
if route is None:
return
message = flow.websocket.messages[-1] # type: ignore[union-attr]
content = message.content.decode("utf-8", errors="replace")
if message.from_client:
result = scan_outbound(route, content, os.environ)
if result is not None and result.severity == "block":
sys.stderr.write(f"egress DLP: {result.reason}\n")
flow.kill() # type: ignore[union-attr]
else:
result = scan_inbound(route, content)
if result is not None:
if result.severity == "block":
sys.stderr.write(f"egress DLP: {result.reason}\n")
flow.kill() # type: ignore[union-attr]
elif result.severity == "warn":
sys.stderr.write(f"egress DLP warn: {result.reason}\n")
addons = [EgressAddon()]
+17
View File
@@ -498,6 +498,22 @@ def build_outbound_scan_text(
return "\n".join(parts)
def build_inbound_scan_text(
headers: typing.Mapping[str, str],
body: str,
) -> str:
"""Assemble inbound response surfaces into one string for DLP scanning.
Covers all response headers plus body.
"""
parts: list[str] = []
for name, value in headers.items():
parts.append(f"{name}: {value}")
if body:
parts.append(body)
return "\n".join(parts)
def _detector_enabled(
configured: tuple[str, ...] | None,
name: str,
@@ -562,6 +578,7 @@ __all__ = [
"PathMatch",
"Route",
"ScanResult",
"build_inbound_scan_text",
"build_outbound_scan_text",
"decide",
"evaluate_matches",