From 5486170be1399cadd48245103a9069f2ae69dbde Mon Sep 17 00:00:00 2001 From: claude Date: Wed, 27 May 2026 15:57:18 -0400 Subject: [PATCH] fix(smolmachines): route agent through egress when routes declared, wait for VM warm-up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related bugs: 1. Auth chain bypassed egress. After the Docker-Desktop port pivot, the agent always dialed pipelock directly — meaning egress (which holds the real OAuth token and rewrites the Authorization header) wasn't in the request path. Bearer placeholder reached anthropic verbatim → 401 "Invalid bearer token". Fix: when the bottle declares egress.routes, the agent's first hop is egress (publish egress port 9099 to host loopback, leave pipelock bundle-internal). Without routes, the agent dials pipelock directly. Same hop order as the docker backend. 2. provision_ca's update-ca-certificates SIGKILLed at ~100ms on Docker Desktop. Back-to-back `smolvm machine exec` calls immediately after machine_start hit a VM warm-up race in libkrun's exec channel; the second exec's child got SIGKILL'd before producing more than the first line of stdout. The agent's trust store never got the egress MITM CA's hash symlink, so curl/openssl couldn't validate the TLS chain. Fix: 1.5s sleep after machine_start (empirically enough), plus fold provision_ca's chown + chmod + update-ca-certificates into one `sh -c` so we only pay one exec round trip. Bail with a clear error if update-ca-certificates doesn't report "1 added" (failing silently was how the original SIGKILL went unnoticed). Net effect on Docker Desktop / macOS: claude's HTTPS_PROXY is `http://127.0.0.1:<egress-host-port>`, egress rewrites auth, pipelock allowlists + DLPs, request reaches api.anthropic.com with a real token. End-to-end verified. Also drops the PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation. 
The original concern (agent bypassing pipelock by dialing egress's port on the bundle IP) doesn't apply in this topology: the agent can only reach whatever port we publish on host loopback, and egress is the only HTTP/HTTPS chokepoint that gets published. Co-Authored-By: Claude Opus 4.7 --- claude_bottle/backend/smolmachines/launch.py | 50 ++++++++++++++++--- .../backend/smolmachines/provision/ca.py | 25 +++++++++- tests/unit/test_smolmachines_provision.py | 28 +++++++++-- 3 files changed, 88 insertions(+), 15 deletions(-) diff --git a/claude_bottle/backend/smolmachines/launch.py b/claude_bottle/backend/smolmachines/launch.py index 8b3f409..cf9d61b 100644 --- a/claude_bottle/backend/smolmachines/launch.py +++ b/claude_bottle/backend/smolmachines/launch.py @@ -21,6 +21,7 @@ from __future__ import annotations import dataclasses import os +import time from contextlib import ExitStack, contextmanager from typing import Callable, Generator @@ -34,6 +35,7 @@ from ...util import expand_tilde from ..docker.egress import ( EGRESS_CA_IN_CONTAINER, EGRESS_PIPELOCK_CA_IN_CONTAINER, + EGRESS_PORT as _EGRESS_PORT, egress_tls_init, ) from ..docker.git_gate import ( @@ -125,8 +127,22 @@ def launch( # reachable from the smolvm guest on macOS — TSI uses # macOS networking, and macOS sees the daemon's bridge # via the published-port loopback forward only. - pipelock_host_port = _bundle.bundle_host_port(plan.slug, _PIPELOCK_PORT) - agent_proxy_url = f"http://127.0.0.1:{pipelock_host_port}" + # + # Proxy hop order matches the docker backend: when the + # bottle declares egress routes, the agent's first hop is + # egress (for token injection), then pipelock. Without + # routes, the agent dials pipelock directly. Whichever + # one is "agent-facing" is the daemon whose port we + # publish on host loopback; the other stays bundle- + # internal as the upstream proxy. 
+ if plan.egress_plan.routes: + agent_facing_port = _EGRESS_PORT + else: + agent_facing_port = _PIPELOCK_PORT + agent_facing_host_port = _bundle.bundle_host_port( + plan.slug, agent_facing_port, + ) + agent_proxy_url = f"http://127.0.0.1:{agent_facing_host_port}" agent_git_gate_host = "" if plan.git_gate_plan.upstreams: git_gate_host_port = _bundle.bundle_host_port( @@ -189,6 +205,14 @@ def launch( ["chown", "-R", "node:node", "/home/node"], ) + # Wait briefly for the VM to settle. Back-to-back smolvm + # machine_exec calls immediately after machine_start + # occasionally SIGKILL the in-VM child at ~100ms (looks + # like a VM warm-up race in libkrun's exec channel). + # 1.5s is empirically enough to dodge it; provisioning + # already takes seconds so the wait is amortized. + time.sleep(1.5) + # 7. Provision (CA / prompt / skills / git / supervise). prompt_path = provision(plan, plan.machine_name) @@ -220,9 +244,14 @@ def _bundle_launch_spec( env: list[str] = [] volumes: list[tuple[str, str, bool]] = [] - # PRD 0023 chunk 3: egress binds 127.0.0.1 inside the bundle - # so TSI's IP-only allowlist can't bypass pipelock. - env.append("EGRESS_LISTEN_HOST=127.0.0.1") + # In this Docker-Desktop-compatible topology, whichever daemon + # is "agent-facing" gets its port published on the host + # loopback (see `_ensure_smolmachine`'s discovery loop) and the + # other stays bundle-internal. The bundle is NOT reachable by + # bridge IP from the smolvm guest, so the + # PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation + # isn't needed: the agent can only dial whatever daemon's + # host port we publish, period. # --- pipelock --------------------------------------------- pp = plan.proxy_plan @@ -279,9 +308,14 @@ def _bundle_launch_spec( # Container ports the agent reaches from the smolvm guest — # published on host loopback so the guest can dial via TSI + - # macOS networking. Egress is bundle-internal and never - # published. 
- ports_to_publish: list[int] = [_PIPELOCK_PORT] + # macOS networking. The HTTP/HTTPS chokepoint is whichever + # daemon's port we publish: egress when routes are declared + # (token injection first, then forwards to bundle-internal + # pipelock), pipelock otherwise. + if ep.routes: + ports_to_publish: list[int] = [_EGRESS_PORT] + else: + ports_to_publish = [_PIPELOCK_PORT] if gp.upstreams: ports_to_publish.append(_GIT_GATE_PORT) if sp is not None: diff --git a/claude_bottle/backend/smolmachines/provision/ca.py b/claude_bottle/backend/smolmachines/provision/ca.py index e610d30..453c725 100644 --- a/claude_bottle/backend/smolmachines/provision/ca.py +++ b/claude_bottle/backend/smolmachines/provision/ca.py @@ -66,8 +66,29 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None: # default. The env trio (NODE_EXTRA_CA_CERTS / SSL_CERT_FILE / # REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python # `requests` / libraries that don't load the system bundle. - _smolvm.machine_exec(target, ["chmod", "644", AGENT_CA_PATH]) - _smolvm.machine_exec(target, ["update-ca-certificates"]) + # + # chown + chmod + update-ca-certificates run in one + # `sh -c` so we only pay one machine_exec round trip; the + # `&&` chaining surfaces the first failure as the return + # code. + r = _smolvm.machine_exec(target, [ + "sh", "-c", + f"chown root:root {AGENT_CA_PATH} && " + f"chmod 644 {AGENT_CA_PATH} && " + f"update-ca-certificates", + ]) + if r.returncode != 0 or "1 added" not in (r.stdout or ""): + # update-ca-certificates not adding our cert is fatal — + # claude-code's TLS handshake against the egress-MITM'd + # api.anthropic.com would fail downstream. Bail early + # with what we can see (output is captured by smolvm so + # we can surface it). 
+ die( + f"update-ca-certificates didn't add the agent CA " + f"(exit {r.returncode}): " + f"stdout={(r.stdout or '').strip()!r} " + f"stderr={(r.stderr or '').strip()!r}" + ) # Stdlib SHA-256 of the cert's DER bytes — the standard # fingerprint form. Never the private key. diff --git a/tests/unit/test_smolmachines_provision.py b/tests/unit/test_smolmachines_provision.py index c3780a7..bb547ad 100644 --- a/tests/unit/test_smolmachines_provision.py +++ b/tests/unit/test_smolmachines_provision.py @@ -307,21 +307,38 @@ class TestProvisionCA(unittest.TestCase): def tearDown(self): self._tmp.cleanup() + # provision_ca dies hard if update-ca-certificates' stdout + # doesn't include "1 added"; supply a stock success return + # so the bulk of the tests below exercise the happy path. + _UPDATE_OK = SmolvmRunResult( + returncode=0, + stdout="Updating certificates in /etc/ssl/certs...\n1 added, 0 removed; done.\n", + stderr="", + ) + def test_pipelock_path_when_no_routes(self): plan = _plan(pipelock_ca_path=self.pipelock_ca) with patch( "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp" ) as cp, patch( - "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec" + "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec", + return_value=self._UPDATE_OK, ) as ex: _ca.provision_ca(plan, "claude-bottle-demo-abc12") cp.assert_called_once_with( str(self.pipelock_ca), "claude-bottle-demo-abc12:" + _ca.AGENT_CA_PATH, ) - argvs = [c.args[1] for c in ex.call_args_list] - self.assertIn(["chmod", "644", _ca.AGENT_CA_PATH], argvs) - self.assertIn(["update-ca-certificates"], argvs) + # chmod + chown + update-ca-certificates are now folded + # into one `sh -c` invocation (working around a smolvm + # exec warm-up SIGKILL race), so we look at the single + # exec's argv rather than expecting separate calls. 
+ ex.assert_called_once() + argv = ex.call_args.args[1] + self.assertEqual("sh", argv[0]) + self.assertEqual("-c", argv[1]) + self.assertIn("chmod 644", argv[2]) + self.assertIn("update-ca-certificates", argv[2]) def test_egress_path_when_routes_declared(self): plan = _plan( @@ -332,7 +349,8 @@ class TestProvisionCA(unittest.TestCase): with patch( "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp" ) as cp, patch( - "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec" + "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec", + return_value=self._UPDATE_OK, ): _ca.provision_ca(plan, "claude-bottle-demo-abc12") # When routes are declared, egress is the agent's first hop,