fix(smolmachines): route agent through egress when routes declared, wait for VM warm-up
Two related bugs: 1. Auth chain bypassed egress. After the Docker-Desktop port pivot, the agent always dialed pipelock directly — meaning egress (which holds the real OAuth token and rewrites the Authorization header) wasn't in the request path. Bearer placeholder reached anthropic verbatim → 401 "Invalid bearer token". Fix: when the bottle declares egress.routes, the agent's first hop is egress (publish egress port 9099 to host loopback, leave pipelock bundle-internal). Without routes, the agent dials pipelock directly. Same hop order as the docker backend. 2. provision_ca's update-ca-certificates SIGKILL'd at ~100ms on Docker Desktop. Back-to-back `smolvm machine exec` calls immediately after machine_start hit a VM warm-up race in libkrun's exec channel; the second exec's child got SIGKILL'd before producing more than the first line of stdout. The agent's trust store never got the egress MITM CA's hash symlink, so curl/openssl couldn't validate the TLS chain. Fix: 1.5s sleep after machine_start (empirically enough), plus fold provision_ca's chown + chmod + update-ca-certificates into one `sh -c` so we only pay one exec round trip. Bail with a clear error if update-ca-certificates doesn't report "1 added" (failing silently was how the original SIGKILL went unnoticed). Net effect on Docker Desktop / macOS: claude's HTTPS_PROXY is `http://127.0.0.1:<egress port>`, egress rewrites auth, pipelock allowlists + DLPs, request reaches api.anthropic.com with a real token. End-to-end verified. Also drops the PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation. The original concern (agent bypassing pipelock by dialing egress's port on the bundle IP) doesn't apply in this topology: the agent can only reach whatever port we publish on host loopback, and egress is the only HTTP/HTTPS chokepoint that gets published. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import os
|
||||
import time
|
||||
from contextlib import ExitStack, contextmanager
|
||||
from typing import Callable, Generator
|
||||
|
||||
@@ -34,6 +35,7 @@ from ...util import expand_tilde
|
||||
from ..docker.egress import (
|
||||
EGRESS_CA_IN_CONTAINER,
|
||||
EGRESS_PIPELOCK_CA_IN_CONTAINER,
|
||||
EGRESS_PORT as _EGRESS_PORT,
|
||||
egress_tls_init,
|
||||
)
|
||||
from ..docker.git_gate import (
|
||||
@@ -125,8 +127,22 @@ def launch(
|
||||
# reachable from the smolvm guest on macOS — TSI uses
|
||||
# macOS networking, and macOS sees the daemon's bridge
|
||||
# via the published-port loopback forward only.
|
||||
pipelock_host_port = _bundle.bundle_host_port(plan.slug, _PIPELOCK_PORT)
|
||||
agent_proxy_url = f"http://127.0.0.1:{pipelock_host_port}"
|
||||
#
|
||||
# Proxy hop order matches the docker backend: when the
|
||||
# bottle declares egress routes, the agent's first hop is
|
||||
# egress (for token injection), then pipelock. Without
|
||||
# routes, the agent dials pipelock directly. Whichever
|
||||
# one is "agent-facing" is the daemon whose port we
|
||||
# publish on host loopback; the other stays bundle-
|
||||
# internal as the upstream proxy.
|
||||
if plan.egress_plan.routes:
|
||||
agent_facing_port = _EGRESS_PORT
|
||||
else:
|
||||
agent_facing_port = _PIPELOCK_PORT
|
||||
agent_facing_host_port = _bundle.bundle_host_port(
|
||||
plan.slug, agent_facing_port,
|
||||
)
|
||||
agent_proxy_url = f"http://127.0.0.1:{agent_facing_host_port}"
|
||||
agent_git_gate_host = ""
|
||||
if plan.git_gate_plan.upstreams:
|
||||
git_gate_host_port = _bundle.bundle_host_port(
|
||||
@@ -189,6 +205,14 @@ def launch(
|
||||
["chown", "-R", "node:node", "/home/node"],
|
||||
)
|
||||
|
||||
# Wait briefly for the VM to settle. Back-to-back smolvm
|
||||
# machine_exec calls immediately after machine_start
|
||||
# occasionally SIGKILL the in-VM child at ~100ms (looks
|
||||
# like a VM warm-up race in libkrun's exec channel).
|
||||
# 1.5s is empirically enough to dodge it; provisioning
|
||||
# already takes seconds so the wait is amortized.
|
||||
time.sleep(1.5)
|
||||
|
||||
# 7. Provision (CA / prompt / skills / git / supervise).
|
||||
prompt_path = provision(plan, plan.machine_name)
|
||||
|
||||
@@ -220,9 +244,14 @@ def _bundle_launch_spec(
|
||||
env: list[str] = []
|
||||
volumes: list[tuple[str, str, bool]] = []
|
||||
|
||||
# PRD 0023 chunk 3: egress binds 127.0.0.1 inside the bundle
|
||||
# so TSI's IP-only allowlist can't bypass pipelock.
|
||||
env.append("EGRESS_LISTEN_HOST=127.0.0.1")
|
||||
# In this Docker-Desktop-compatible topology, whichever daemon
|
||||
# is "agent-facing" gets its port published on the host
|
||||
# loopback (see `_ensure_smolmachine`'s discovery loop) and the
|
||||
# other stays bundle-internal. The bundle is NOT reachable by
|
||||
# bridge IP from the smolvm guest, so the
|
||||
# PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
|
||||
# isn't needed: the agent can only dial whatever daemon's
|
||||
# host port we publish, period.
|
||||
|
||||
# --- pipelock ---------------------------------------------
|
||||
pp = plan.proxy_plan
|
||||
@@ -279,9 +308,14 @@ def _bundle_launch_spec(
|
||||
|
||||
# Container ports the agent reaches from the smolvm guest —
|
||||
# published on host loopback so the guest can dial via TSI +
|
||||
# macOS networking. Egress is bundle-internal and never
|
||||
# published.
|
||||
ports_to_publish: list[int] = [_PIPELOCK_PORT]
|
||||
# macOS networking. The HTTP/HTTPS chokepoint is whichever
|
||||
# daemon's port we publish: egress when routes are declared
|
||||
# (token injection first, then forwards to bundle-internal
|
||||
# pipelock), pipelock otherwise.
|
||||
if ep.routes:
|
||||
ports_to_publish: list[int] = [_EGRESS_PORT]
|
||||
else:
|
||||
ports_to_publish = [_PIPELOCK_PORT]
|
||||
if gp.upstreams:
|
||||
ports_to_publish.append(_GIT_GATE_PORT)
|
||||
if sp is not None:
|
||||
|
||||
@@ -66,8 +66,29 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
|
||||
# default. The env trio (NODE_EXTRA_CA_CERTS / SSL_CERT_FILE /
|
||||
# REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python
|
||||
# `requests` / libraries that don't load the system bundle.
|
||||
_smolvm.machine_exec(target, ["chmod", "644", AGENT_CA_PATH])
|
||||
_smolvm.machine_exec(target, ["update-ca-certificates"])
|
||||
#
|
||||
# chown + chmod + update-ca-certificates run in one
|
||||
# `sh -c` so we only pay one machine_exec round trip; the
|
||||
# `&&` chaining surfaces the first failure as the return
|
||||
# code.
|
||||
r = _smolvm.machine_exec(target, [
|
||||
"sh", "-c",
|
||||
f"chown root:root {AGENT_CA_PATH} && "
|
||||
f"chmod 644 {AGENT_CA_PATH} && "
|
||||
f"update-ca-certificates",
|
||||
])
|
||||
if r.returncode != 0 or "1 added" not in (r.stdout or ""):
|
||||
# update-ca-certificates not adding our cert is fatal —
|
||||
# claude-code's TLS handshake against the egress-MITM'd
|
||||
# api.anthropic.com would fail downstream. Bail early
|
||||
# with what we can see (output is captured by smolvm so
|
||||
# we can surface it).
|
||||
die(
|
||||
f"update-ca-certificates didn't add the agent CA "
|
||||
f"(exit {r.returncode}): "
|
||||
f"stdout={(r.stdout or '').strip()!r} "
|
||||
f"stderr={(r.stderr or '').strip()!r}"
|
||||
)
|
||||
|
||||
# Stdlib SHA-256 of the cert's DER bytes — the standard
|
||||
# fingerprint form. Never the private key.
|
||||
|
||||
@@ -307,21 +307,38 @@ class TestProvisionCA(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
self._tmp.cleanup()
|
||||
|
||||
# provision_ca dies hard if update-ca-certificates' stdout
|
||||
# doesn't include "1 added"; supply a stock success return
|
||||
# so the bulk of the tests below exercise the happy path.
|
||||
_UPDATE_OK = SmolvmRunResult(
|
||||
returncode=0,
|
||||
stdout="Updating certificates in /etc/ssl/certs...\n1 added, 0 removed; done.\n",
|
||||
stderr="",
|
||||
)
|
||||
|
||||
def test_pipelock_path_when_no_routes(self):
|
||||
plan = _plan(pipelock_ca_path=self.pipelock_ca)
|
||||
with patch(
|
||||
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp"
|
||||
) as cp, patch(
|
||||
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec"
|
||||
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec",
|
||||
return_value=self._UPDATE_OK,
|
||||
) as ex:
|
||||
_ca.provision_ca(plan, "claude-bottle-demo-abc12")
|
||||
cp.assert_called_once_with(
|
||||
str(self.pipelock_ca),
|
||||
"claude-bottle-demo-abc12:" + _ca.AGENT_CA_PATH,
|
||||
)
|
||||
argvs = [c.args[1] for c in ex.call_args_list]
|
||||
self.assertIn(["chmod", "644", _ca.AGENT_CA_PATH], argvs)
|
||||
self.assertIn(["update-ca-certificates"], argvs)
|
||||
# chmod + chown + update-ca-certificates are now folded
|
||||
# into one `sh -c` invocation (working around a smolvm
|
||||
# exec warm-up SIGKILL race), so we look at the single
|
||||
# exec's argv rather than expecting separate calls.
|
||||
ex.assert_called_once()
|
||||
argv = ex.call_args.args[1]
|
||||
self.assertEqual("sh", argv[0])
|
||||
self.assertEqual("-c", argv[1])
|
||||
self.assertIn("chmod 644", argv[2])
|
||||
self.assertIn("update-ca-certificates", argv[2])
|
||||
|
||||
def test_egress_path_when_routes_declared(self):
|
||||
plan = _plan(
|
||||
@@ -332,7 +349,8 @@ class TestProvisionCA(unittest.TestCase):
|
||||
with patch(
|
||||
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp"
|
||||
) as cp, patch(
|
||||
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec"
|
||||
"claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec",
|
||||
return_value=self._UPDATE_OK,
|
||||
):
|
||||
_ca.provision_ca(plan, "claude-bottle-demo-abc12")
|
||||
# When routes are declared, egress is the agent's first hop,
|
||||
|
||||
Reference in New Issue
Block a user