feat(dlp): add 7 token patterns, Unicode normalization, CRLF injection detection (PRD 0053)
Token patterns: HuggingFace (hf_), Databricks (dapi), Slack (xox[baprs]-), npm (npm_), SendGrid (SG.x.y), PyPI (pypi-), HashiCorp Vault (hvs.).
Unicode normalization: _normalize_text applies NFKD, then strips combining marks and control characters before pattern matching, defeating fullwidth-character and combining-mark evasion.
CRLF injection: scan_crlf_injection detects %0d%0a in URLs and literal \r\n header-injection patterns; it runs unconditionally in scan_outbound, regardless of the outbound_detectors config.
This commit is contained in:
@@ -574,15 +574,25 @@ def scan_outbound(
|
||||
# at import time (the sidecar copies it flat alongside this file).
|
||||
try:
|
||||
from dlp_detectors import ( # type: ignore[import-not-found]
|
||||
scan_token_patterns, scan_known_secrets,
|
||||
scan_crlf_injection,
|
||||
scan_known_secrets,
|
||||
scan_token_patterns,
|
||||
)
|
||||
except ImportError: # pragma: no cover - host-side path
|
||||
from .dlp_detectors import ( # type: ignore[import-not-found]
|
||||
scan_token_patterns, scan_known_secrets,
|
||||
scan_crlf_injection,
|
||||
scan_known_secrets,
|
||||
scan_token_patterns,
|
||||
)
|
||||
|
||||
text = body if isinstance(body, str) else body.decode("utf-8", errors="replace")
|
||||
|
||||
# CRLF injection is never legitimate — runs unconditionally, not gated
|
||||
# by outbound_detectors config.
|
||||
result = scan_crlf_injection(text)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
if _detector_enabled(route.outbound_detectors, "token_patterns"):
|
||||
result = scan_token_patterns(text, location="body")
|
||||
if result is not None:
|
||||
|
||||
Reference in New Issue
Block a user