fix(smolmachines): route agent through egress when routes declared, wait for VM warm-up
test / unit (pull_request) Successful in 26s
test / integration (pull_request) Successful in 42s

Two related bugs:

1. Auth chain bypassed egress. After the Docker-Desktop port
   pivot, the agent always dialed pipelock directly — meaning
   egress (which holds the real OAuth token and rewrites the
   Authorization header) wasn't in the request path. Bearer
   placeholder reached anthropic verbatim → 401 "Invalid bearer
   token". Fix: when the bottle declares egress.routes, the
   agent's first hop is egress (publish egress port 9099 to host
   loopback, leave pipelock bundle-internal). Without routes,
   the agent dials pipelock directly. Same hop order as the
   docker backend.

2. provision_ca's update-ca-certificates was SIGKILLed at ~100ms
   on Docker Desktop. Back-to-back `smolvm machine exec` calls
   immediately after machine_start hit a VM warm-up race in
   libkrun's exec channel; the second exec's child got
   SIGKILLed before producing more than the first line of
   stdout. The agent's trust store never got the egress MITM
   CA's hash symlink, so curl/openssl couldn't validate the
   TLS chain. Fix: 1.5s sleep after machine_start (empirically
   enough), plus fold provision_ca's chown + chmod +
   update-ca-certificates into one `sh -c` so we only pay one
   exec round trip. Bail with a clear error if update-ca-
   certificates doesn't report "1 added" (failing silently was
   how the original SIGKILL went unnoticed).

Net effect on Docker Desktop / macOS: claude's HTTPS_PROXY is
`http://127.0.0.1:<egress port>`, egress rewrites auth, pipelock
allowlists + DLPs, request reaches api.anthropic.com with a
real token. End-to-end verified.

Also drops the PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1
mitigation. The original concern (agent bypassing pipelock by
dialing egress's port on the bundle IP) doesn't apply in this
topology: the agent can only reach whatever port we publish on
host loopback, and exactly one HTTP/HTTPS chokepoint gets
published there (egress when routes are declared, pipelock
otherwise).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 15:57:18 -04:00
parent 4f136a9932
commit 5486170be1
3 changed files with 88 additions and 15 deletions
+42 -8
View File
@@ -21,6 +21,7 @@ from __future__ import annotations
import dataclasses import dataclasses
import os import os
import time
from contextlib import ExitStack, contextmanager from contextlib import ExitStack, contextmanager
from typing import Callable, Generator from typing import Callable, Generator
@@ -34,6 +35,7 @@ from ...util import expand_tilde
from ..docker.egress import ( from ..docker.egress import (
EGRESS_CA_IN_CONTAINER, EGRESS_CA_IN_CONTAINER,
EGRESS_PIPELOCK_CA_IN_CONTAINER, EGRESS_PIPELOCK_CA_IN_CONTAINER,
EGRESS_PORT as _EGRESS_PORT,
egress_tls_init, egress_tls_init,
) )
from ..docker.git_gate import ( from ..docker.git_gate import (
@@ -125,8 +127,22 @@ def launch(
# reachable from the smolvm guest on macOS — TSI uses # reachable from the smolvm guest on macOS — TSI uses
# macOS networking, and macOS sees the daemon's bridge # macOS networking, and macOS sees the daemon's bridge
# via the published-port loopback forward only. # via the published-port loopback forward only.
pipelock_host_port = _bundle.bundle_host_port(plan.slug, _PIPELOCK_PORT) #
agent_proxy_url = f"http://127.0.0.1:{pipelock_host_port}" # Proxy hop order matches the docker backend: when the
# bottle declares egress routes, the agent's first hop is
# egress (for token injection), then pipelock. Without
# routes, the agent dials pipelock directly. Whichever
# one is "agent-facing" is the daemon whose port we
# publish on host loopback; the other stays bundle-
# internal as the upstream proxy.
if plan.egress_plan.routes:
agent_facing_port = _EGRESS_PORT
else:
agent_facing_port = _PIPELOCK_PORT
agent_facing_host_port = _bundle.bundle_host_port(
plan.slug, agent_facing_port,
)
agent_proxy_url = f"http://127.0.0.1:{agent_facing_host_port}"
agent_git_gate_host = "" agent_git_gate_host = ""
if plan.git_gate_plan.upstreams: if plan.git_gate_plan.upstreams:
git_gate_host_port = _bundle.bundle_host_port( git_gate_host_port = _bundle.bundle_host_port(
@@ -189,6 +205,14 @@ def launch(
["chown", "-R", "node:node", "/home/node"], ["chown", "-R", "node:node", "/home/node"],
) )
# Wait briefly for the VM to settle. Back-to-back smolvm
# machine_exec calls immediately after machine_start
# occasionally SIGKILL the in-VM child at ~100ms (looks
# like a VM warm-up race in libkrun's exec channel).
# 1.5s is empirically enough to dodge it; provisioning
# already takes seconds so the wait is amortized.
time.sleep(1.5)
# 7. Provision (CA / prompt / skills / git / supervise). # 7. Provision (CA / prompt / skills / git / supervise).
prompt_path = provision(plan, plan.machine_name) prompt_path = provision(plan, plan.machine_name)
@@ -220,9 +244,14 @@ def _bundle_launch_spec(
env: list[str] = [] env: list[str] = []
volumes: list[tuple[str, str, bool]] = [] volumes: list[tuple[str, str, bool]] = []
# PRD 0023 chunk 3: egress binds 127.0.0.1 inside the bundle # In this Docker-Desktop-compatible topology, whichever daemon
# so TSI's IP-only allowlist can't bypass pipelock. # is "agent-facing" gets its port published on the host
env.append("EGRESS_LISTEN_HOST=127.0.0.1") # loopback (see `_ensure_smolmachine`'s discovery loop) and the
# other stays bundle-internal. The bundle is NOT reachable by
# bridge IP from the smolvm guest, so the
# PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
# isn't needed: the agent can only dial whatever daemon's
# host port we publish, period.
# --- pipelock --------------------------------------------- # --- pipelock ---------------------------------------------
pp = plan.proxy_plan pp = plan.proxy_plan
@@ -279,9 +308,14 @@ def _bundle_launch_spec(
# Container ports the agent reaches from the smolvm guest — # Container ports the agent reaches from the smolvm guest —
# published on host loopback so the guest can dial via TSI + # published on host loopback so the guest can dial via TSI +
# macOS networking. Egress is bundle-internal and never # macOS networking. The HTTP/HTTPS chokepoint is whichever
# published. # daemon's port we publish: egress when routes are declared
ports_to_publish: list[int] = [_PIPELOCK_PORT] # (token injection first, then forwards to bundle-internal
# pipelock), pipelock otherwise.
if ep.routes:
ports_to_publish: list[int] = [_EGRESS_PORT]
else:
ports_to_publish = [_PIPELOCK_PORT]
if gp.upstreams: if gp.upstreams:
ports_to_publish.append(_GIT_GATE_PORT) ports_to_publish.append(_GIT_GATE_PORT)
if sp is not None: if sp is not None:
@@ -66,8 +66,29 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
# default. The env trio (NODE_EXTRA_CA_CERTS / SSL_CERT_FILE / # default. The env trio (NODE_EXTRA_CA_CERTS / SSL_CERT_FILE /
# REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python # REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python
# `requests` / libraries that don't load the system bundle. # `requests` / libraries that don't load the system bundle.
_smolvm.machine_exec(target, ["chmod", "644", AGENT_CA_PATH]) #
_smolvm.machine_exec(target, ["update-ca-certificates"]) # chown + chmod + update-ca-certificates run in one
# `sh -c` so we only pay one machine_exec round trip; the
# `&&` chaining surfaces the first failure as the return
# code.
r = _smolvm.machine_exec(target, [
"sh", "-c",
f"chown root:root {AGENT_CA_PATH} && "
f"chmod 644 {AGENT_CA_PATH} && "
f"update-ca-certificates",
])
if r.returncode != 0 or "1 added" not in (r.stdout or ""):
# update-ca-certificates not adding our cert is fatal —
# claude-code's TLS handshake against the egress-MITM'd
# api.anthropic.com would fail downstream. Bail early
# with what we can see (output is captured by smolvm so
# we can surface it).
die(
f"update-ca-certificates didn't add the agent CA "
f"(exit {r.returncode}): "
f"stdout={(r.stdout or '').strip()!r} "
f"stderr={(r.stderr or '').strip()!r}"
)
# Stdlib SHA-256 of the cert's DER bytes — the standard # Stdlib SHA-256 of the cert's DER bytes — the standard
# fingerprint form. Never the private key. # fingerprint form. Never the private key.
+23 -5
View File
@@ -307,21 +307,38 @@ class TestProvisionCA(unittest.TestCase):
def tearDown(self): def tearDown(self):
self._tmp.cleanup() self._tmp.cleanup()
# provision_ca dies hard if update-ca-certificates' stdout
# doesn't include "1 added"; supply a stock success return
# so the bulk of the tests below exercise the happy path.
_UPDATE_OK = SmolvmRunResult(
returncode=0,
stdout="Updating certificates in /etc/ssl/certs...\n1 added, 0 removed; done.\n",
stderr="",
)
def test_pipelock_path_when_no_routes(self): def test_pipelock_path_when_no_routes(self):
plan = _plan(pipelock_ca_path=self.pipelock_ca) plan = _plan(pipelock_ca_path=self.pipelock_ca)
with patch( with patch(
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp" "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp"
) as cp, patch( ) as cp, patch(
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec" "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec",
return_value=self._UPDATE_OK,
) as ex: ) as ex:
_ca.provision_ca(plan, "claude-bottle-demo-abc12") _ca.provision_ca(plan, "claude-bottle-demo-abc12")
cp.assert_called_once_with( cp.assert_called_once_with(
str(self.pipelock_ca), str(self.pipelock_ca),
"claude-bottle-demo-abc12:" + _ca.AGENT_CA_PATH, "claude-bottle-demo-abc12:" + _ca.AGENT_CA_PATH,
) )
argvs = [c.args[1] for c in ex.call_args_list] # chmod + chown + update-ca-certificates are now folded
self.assertIn(["chmod", "644", _ca.AGENT_CA_PATH], argvs) # into one `sh -c` invocation (working around a smolvm
self.assertIn(["update-ca-certificates"], argvs) # exec warm-up SIGKILL race), so we look at the single
# exec's argv rather than expecting separate calls.
ex.assert_called_once()
argv = ex.call_args.args[1]
self.assertEqual("sh", argv[0])
self.assertEqual("-c", argv[1])
self.assertIn("chmod 644", argv[2])
self.assertIn("update-ca-certificates", argv[2])
def test_egress_path_when_routes_declared(self): def test_egress_path_when_routes_declared(self):
plan = _plan( plan = _plan(
@@ -332,7 +349,8 @@ class TestProvisionCA(unittest.TestCase):
with patch( with patch(
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp" "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp"
) as cp, patch( ) as cp, patch(
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec" "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec",
return_value=self._UPDATE_OK,
): ):
_ca.provision_ca(plan, "claude-bottle-demo-abc12") _ca.provision_ca(plan, "claude-bottle-demo-abc12")
# When routes are declared, egress is the agent's first hop, # When routes are declared, egress is the agent's first hop,