perf(dlp): replace O(n*m) proximity check with O(n log n) sorted scan
lint / lint (push) Failing after 1m36s
test / unit (pull_request) Successful in 37s
test / integration (pull_request) Successful in 52s

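Sort all match positions and scan them linearly instead of checking every
a-b pair. The scan exits early on overlap (gap = 0) or as soon as a gap at
or below the threshold is found, so in that case the returned value is not
necessarily the global minimum.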

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-05 21:01:56 +00:00
parent 52820278fd
commit 6d8c4d62bf
+21 -7
View File
@@ -112,19 +112,33 @@ JAILBREAK_PHRASES: tuple[re.Pattern[str], ...] = (
PROXIMITY_CHARS = 500
def _min_distance(
def _nearby(
a_matches: list[re.Match[str]],
b_matches: list[re.Match[str]],
threshold: int,
) -> int | None:
"""Smallest char distance between any pair of matches."""
"""Return the smallest char gap between any ab pair, or None if
both lists are empty. O(n log n) via sort + linear scan."""
if not a_matches or not b_matches:
return None
best = None
for a in a_matches:
for b in b_matches:
gap = max(0, max(a.start(), b.start()) - min(a.end(), b.end()))
events = sorted(
[(m.start(), m.end(), "a") for m in a_matches]
+ [(m.start(), m.end(), "b") for m in b_matches],
)
best: int | None = None
prev_end: int | None = None
prev_tag: str | None = None
for start, end, tag in events:
if prev_tag is not None and prev_tag != tag and prev_end is not None:
gap = max(0, start - prev_end)
if best is None or gap < best:
best = gap
if best == 0:
return 0
if best <= threshold:
return best
prev_end = end if prev_end is None else max(prev_end, end)
prev_tag = tag
return best
@@ -133,7 +147,7 @@ def scan_naive_injection(text: str) -> ScanResult | None:
jailbreak_hits = [m for p in JAILBREAK_PHRASES for m in p.finditer(text)]
if disclosure_hits and jailbreak_hits:
dist = _min_distance(disclosure_hits, jailbreak_hits)
dist = _nearby(disclosure_hits, jailbreak_hits, PROXIMITY_CHARS)
if dist is not None and dist <= PROXIMITY_CHARS:
return ScanResult(
severity="block",