diff --git a/claude_bottle/backend/smolmachines/launch.py b/claude_bottle/backend/smolmachines/launch.py index 8b3f409..cf9d61b 100644 --- a/claude_bottle/backend/smolmachines/launch.py +++ b/claude_bottle/backend/smolmachines/launch.py @@ -21,6 +21,7 @@ from __future__ import annotations import dataclasses import os +import time from contextlib import ExitStack, contextmanager from typing import Callable, Generator @@ -34,6 +35,7 @@ from ...util import expand_tilde from ..docker.egress import ( EGRESS_CA_IN_CONTAINER, EGRESS_PIPELOCK_CA_IN_CONTAINER, + EGRESS_PORT as _EGRESS_PORT, egress_tls_init, ) from ..docker.git_gate import ( @@ -125,8 +127,22 @@ def launch( # reachable from the smolvm guest on macOS — TSI uses # macOS networking, and macOS sees the daemon's bridge # via the published-port loopback forward only. - pipelock_host_port = _bundle.bundle_host_port(plan.slug, _PIPELOCK_PORT) - agent_proxy_url = f"http://127.0.0.1:{pipelock_host_port}" + # + # Proxy hop order matches the docker backend: when the + # bottle declares egress routes, the agent's first hop is + # egress (for token injection), then pipelock. Without + # routes, the agent dials pipelock directly. Whichever + # one is "agent-facing" is the daemon whose port we + # publish on host loopback; the other stays bundle- + # internal as the upstream proxy. + if plan.egress_plan.routes: + agent_facing_port = _EGRESS_PORT + else: + agent_facing_port = _PIPELOCK_PORT + agent_facing_host_port = _bundle.bundle_host_port( + plan.slug, agent_facing_port, + ) + agent_proxy_url = f"http://127.0.0.1:{agent_facing_host_port}" agent_git_gate_host = "" if plan.git_gate_plan.upstreams: git_gate_host_port = _bundle.bundle_host_port( @@ -189,6 +205,14 @@ def launch( ["chown", "-R", "node:node", "/home/node"], ) + # Wait briefly for the VM to settle. Back-to-back smolvm + # machine_exec calls immediately after machine_start + # occasionally SIGKILL the in-VM child at ~100ms (looks + # like a VM warm-up race in libkrun's exec channel). + # 1.5s is empirically enough to dodge it; provisioning + # already takes seconds so the wait is amortized. + time.sleep(1.5) + # 7. Provision (CA / prompt / skills / git / supervise). prompt_path = provision(plan, plan.machine_name) @@ -220,9 +244,14 @@ def _bundle_launch_spec( env: list[str] = [] volumes: list[tuple[str, str, bool]] = [] - # PRD 0023 chunk 3: egress binds 127.0.0.1 inside the bundle - # so TSI's IP-only allowlist can't bypass pipelock. - env.append("EGRESS_LISTEN_HOST=127.0.0.1") + # In this Docker-Desktop-compatible topology, whichever daemon + # is "agent-facing" gets its port published on the host + # loopback (see `_ensure_smolmachine`'s discovery loop) and the + # other stays bundle-internal. The bundle is NOT reachable by + # bridge IP from the smolvm guest, so the + # PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation + # isn't needed: the agent can only dial whatever daemon's + # host port we publish, period. # --- pipelock --------------------------------------------- pp = plan.proxy_plan @@ -279,9 +308,14 @@ def _bundle_launch_spec( # Container ports the agent reaches from the smolvm guest — # published on host loopback so the guest can dial via TSI + - # macOS networking. Egress is bundle-internal and never - # published. - ports_to_publish: list[int] = [_PIPELOCK_PORT] + # macOS networking. The HTTP/HTTPS chokepoint is whichever + # daemon's port we publish: egress when routes are declared + # (token injection first, then forwards to bundle-internal + # pipelock), pipelock otherwise. + if ep.routes: + ports_to_publish: list[int] = [_EGRESS_PORT] + else: + ports_to_publish = [_PIPELOCK_PORT] if gp.upstreams: ports_to_publish.append(_GIT_GATE_PORT) if sp is not None: diff --git a/claude_bottle/backend/smolmachines/provision/ca.py b/claude_bottle/backend/smolmachines/provision/ca.py index e610d30..453c725 100644 --- a/claude_bottle/backend/smolmachines/provision/ca.py +++ b/claude_bottle/backend/smolmachines/provision/ca.py @@ -66,8 +66,29 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None: # default. The env trio (NODE_EXTRA_CA_CERTS / SSL_CERT_FILE / # REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python # `requests` / libraries that don't load the system bundle. - _smolvm.machine_exec(target, ["chmod", "644", AGENT_CA_PATH]) - _smolvm.machine_exec(target, ["update-ca-certificates"]) + # + # chown + chmod + update-ca-certificates run in one + # `sh -c` so we only pay one machine_exec round trip; the + # `&&` chaining surfaces the first failure as the return + # code. + r = _smolvm.machine_exec(target, [ + "sh", "-c", + f"chown root:root {AGENT_CA_PATH} && " + f"chmod 644 {AGENT_CA_PATH} && " + f"update-ca-certificates", + ]) + if r.returncode != 0 or "1 added" not in (r.stdout or ""): + # update-ca-certificates not adding our cert is fatal — + # claude-code's TLS handshake against the egress-MITM'd + # api.anthropic.com would fail downstream. Bail early + # with what we can see (output is captured by smolvm so + # we can surface it). + die( + f"update-ca-certificates didn't add the agent CA " + f"(exit {r.returncode}): " + f"stdout={(r.stdout or '').strip()!r} " + f"stderr={(r.stderr or '').strip()!r}" + ) # Stdlib SHA-256 of the cert's DER bytes — the standard # fingerprint form. Never the private key. diff --git a/tests/unit/test_smolmachines_provision.py b/tests/unit/test_smolmachines_provision.py index c3780a7..bb547ad 100644 --- a/tests/unit/test_smolmachines_provision.py +++ b/tests/unit/test_smolmachines_provision.py @@ -307,21 +307,38 @@ class TestProvisionCA(unittest.TestCase): def tearDown(self): self._tmp.cleanup() + # provision_ca dies hard if update-ca-certificates' stdout + # doesn't include "1 added"; supply a stock success return + # so the bulk of the tests below exercise the happy path. + _UPDATE_OK = SmolvmRunResult( + returncode=0, + stdout="Updating certificates in /etc/ssl/certs...\n1 added, 0 removed; done.\n", + stderr="", + ) + def test_pipelock_path_when_no_routes(self): plan = _plan(pipelock_ca_path=self.pipelock_ca) with patch( "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp" ) as cp, patch( - "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec" + "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec", + return_value=self._UPDATE_OK, ) as ex: _ca.provision_ca(plan, "claude-bottle-demo-abc12") cp.assert_called_once_with( str(self.pipelock_ca), "claude-bottle-demo-abc12:" + _ca.AGENT_CA_PATH, ) - argvs = [c.args[1] for c in ex.call_args_list] - self.assertIn(["chmod", "644", _ca.AGENT_CA_PATH], argvs) - self.assertIn(["update-ca-certificates"], argvs) + # chmod + chown + update-ca-certificates are now folded + # into one `sh -c` invocation (working around a smolvm + # exec warm-up SIGKILL race), so we look at the single + # exec's argv rather than expecting separate calls. + ex.assert_called_once() + argv = ex.call_args.args[1] + self.assertEqual("sh", argv[0]) + self.assertEqual("-c", argv[1]) + self.assertIn("chmod 644", argv[2]) + self.assertIn("update-ca-certificates", argv[2]) def test_egress_path_when_routes_declared(self): plan = _plan( @@ -332,7 +349,8 @@ class TestProvisionCA(unittest.TestCase): with patch( "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp" ) as cp, patch( - "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec" + "claude_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec", + return_value=self._UPDATE_OK, ): _ca.provision_ca(plan, "claude-bottle-demo-abc12") # When routes are declared, egress is the agent's first hop,