4f136a9932
Claude hung on outbound network calls under CLAUDE_BOTTLE_BACKEND=smolmachines: Unable to connect to API (FailedToOpenSocket) Root cause: the PRD-0023 design pinned the bundle at a docker bridge IP (192.168.X.2) and set the smolvm guest's TSI allowlist to `<bundle-ip>/32`. On native Linux this works — host shares the docker bridge's network namespace, TSI's syscall impersonation reaches the bridge IP directly. On Docker Desktop (macOS), the daemon runs in its own Linux VM and docker bridge IPs aren't reachable from macOS networking, so the smolvm guest's TSI requests die "Network is unreachable" before they hit pipelock. Fix: publish each agent-facing bundle daemon's port on host loopback (-p 127.0.0.1::PORT), discover the random host-side ports after start, and route the agent through `127.0.0.1:<host port>` instead of the bridge IP. macOS loopback is the surface Docker Desktop's gvproxy forwards into the daemon's VM, so the chain (guest TSI -> macOS loopback -> daemon VM port-forward -> bundle container) works on both Docker Desktop and native Linux. Concrete changes: - BundleLaunchSpec: add `ports_to_publish` so start_bundle adds `-p 127.0.0.1::PORT` for the agent-facing ports (pipelock always; git-gate when upstreams declared; supervise when enabled). Egress's port stays bundle-internal. - sidecar_bundle.bundle_host_port(): wrap `docker port <bundle> <container_port>/tcp` so launch can look up the random host-side mapping after start. - launch.py: discover the host ports, build URLs of the form `http://127.0.0.1:<host port>` / `git://127.0.0.1:<host port>`, stamp onto guest_env + new agent_*_url fields on the plan. - launch.py: TSI allow_cidrs flips to `["127.0.0.1/32"]`. The bundle IP is no longer the agent's target. - prepare.py: stop synthesizing HTTPS_PROXY / GIT_GATE_URL / MCP_SUPERVISE_URL at prepare time — launch owns those now (the values depend on a port docker hasn't assigned yet). - provision_git: gate_host from plan.agent_git_gate_host. 
- provision_supervise: URL from plan.agent_supervise_url. End-to-end verified on Docker Desktop / macOS: guest dials pipelock through TSI, pipelock forwards to api.anthropic.com, the API responds with 401 (i.e. it received the request). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
313 lines
12 KiB
Python
313 lines
12 KiB
Python
"""End-to-end launch flow for the smolmachines backend
|
|
(PRD 0023 chunks 2d + 4b).
|
|
|
|
Brings up the per-bottle docker bridge + sidecar bundle (with
real daemons + their config files), publishes the agent-facing
daemon ports on host loopback, creates + starts the smolvm
guest restricted by TSI's `--allow-cidr 127.0.0.1/32` allowlist
and pointed at those `127.0.0.1:<host port>` forwards, yields a
`SmolmachinesBottle` handle, tears everything down on context
exit.
|
|
|
|
The bundle's daemons consume the inner Plans the docker backend
|
|
already produces: pipelock reads its yaml + CA from the
|
|
PipelockProxyPlan; egress reads routes + CAs from the EgressPlan
|
|
+ EGRESS_UPSTREAM_PROXY pointing at `127.0.0.1:8888` (bundle
|
|
local), since the agent dials pipelock first (not egress) on the
|
|
smolmachines path. Git-gate + supervise plumb through the same
|
|
plans the docker backend uses, minus the docker-network fields
|
|
that don't apply here."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import dataclasses
|
|
import os
|
|
from contextlib import ExitStack, contextmanager
|
|
from typing import Callable, Generator
|
|
|
|
from ...egress import EGRESS_ROUTES_IN_CONTAINER, egress_resolve_token_values
|
|
from ...pipelock import (
|
|
PIPELOCK_CA_CERT_IN_CONTAINER,
|
|
PIPELOCK_CA_KEY_IN_CONTAINER,
|
|
)
|
|
from ...supervise import QUEUE_DIR_IN_CONTAINER, SUPERVISE_PORT
|
|
from ...util import expand_tilde
|
|
from ..docker.egress import (
|
|
EGRESS_CA_IN_CONTAINER,
|
|
EGRESS_PIPELOCK_CA_IN_CONTAINER,
|
|
egress_tls_init,
|
|
)
|
|
from ..docker.git_gate import (
|
|
GIT_GATE_ACCESS_HOOK_IN_CONTAINER,
|
|
GIT_GATE_CREDS_DIR_IN_CONTAINER,
|
|
GIT_GATE_ENTRYPOINT_IN_CONTAINER,
|
|
GIT_GATE_HOOK_IN_CONTAINER,
|
|
GIT_GATE_PORT as _GIT_GATE_PORT,
|
|
)
|
|
from ..docker.pipelock import (
|
|
BUNDLE_LOCAL_PIPELOCK_URL,
|
|
PIPELOCK_PORT as _PIPELOCK_PORT_STR,
|
|
pipelock_tls_init,
|
|
)
|
|
from . import sidecar_bundle as _bundle
|
|
from . import smolvm as _smolvm
|
|
from .bottle import SmolmachinesBottle
|
|
from .bottle_plan import SmolmachinesBottlePlan
|
|
|
|
|
|
# Container-internal listening ports for the bundle daemons the
# agent talks to. The bundle publishes each agent-facing port on a
# random host loopback port (see `_bundle.start_bundle`), and
# `_bundle.bundle_host_port` looks the host-side port up post-start.

# Pipelock's port is an env-overridable string in docker.pipelock;
# coerce it to int once here so it can sit in the same
# `list[int]` of ports-to-publish as the other daemons' ports.
_PIPELOCK_PORT = int(_PIPELOCK_PORT_STR)
# Module-private alias, so the port constants used below follow one
# naming scheme (`_PIPELOCK_PORT`, `_GIT_GATE_PORT`, `_SUPERVISE_PORT`).
_SUPERVISE_PORT = SUPERVISE_PORT
|
|
|
|
|
|
@contextmanager
def launch(
    plan: SmolmachinesBottlePlan,
    *,
    provision: Callable[[SmolmachinesBottlePlan, str], str | None],
) -> Generator[SmolmachinesBottle, None, None]:
    """Bring the bottle up, yield a handle to it, tear it down on exit.

    Bringup order: docker bridge, per-bottle CAs, sidecar bundle,
    host-port discovery, smolvm guest, /home/node chown, then the
    caller's `provision` hook. Each teardown step is registered on an
    ExitStack right after the resource it undoes comes up, so an error
    partway through bringup unwinds whatever already exists, newest
    first.

    `provision` is called with the fully-resolved plan and the machine
    name; its return value is handed to the yielded
    `SmolmachinesBottle` as `prompt_path`."""
    with ExitStack() as teardown:
        # 1. Per-bottle docker bridge.
        bridge = _bundle.bundle_network_name(plan.slug)
        _bundle.create_bundle_network(
            bridge, plan.bundle_subnet, plan.bundle_gateway,
        )
        teardown.callback(_bundle.remove_bundle_network, bridge)

        # 2. Per-bottle CAs, with the inner Plans updated to point at
        #    the freshly minted files.
        plan = _with_minted_cas(plan)

        # 3. Sidecar bundle. The spec (daemon subset, env, bind-mounts,
        #    ports to publish) is derived from the now-resolved inner
        #    Plans. Token values ride in the subprocess env rather than
        #    on the spec itself.
        spec = _bundle_launch_spec(plan, bridge)
        resolved_tokens = _resolve_token_env(plan, os.environ)
        _bundle.start_bundle(spec, env={**os.environ, **resolved_tokens})
        teardown.callback(_bundle.stop_bundle, plan.slug)

        # 4. Host-side ports docker picked for the published container
        #    ports, turned into the agent's 127.0.0.1 URLs.
        plan = _with_agent_endpoints(plan)

        # 5. smolvm VM. --from carries the pre-packed .smolmachine
        #    artifact (built by prepare); --allow-cidr + -e carry the
        #    per-bottle TSI allowlist + env. The allowlist is
        #    `127.0.0.1/32` because every bundle daemon the agent
        #    reaches is fronted by a host loopback port-forward.
        #    Smolfile isn't usable here — smolvm 0.8.0 makes `--from`
        #    and `--smolfile` mutually exclusive.
        machine = plan.machine_name
        _smolvm.machine_create(
            machine,
            from_path=plan.agent_from_path,
            allow_cidrs=["127.0.0.1/32"],
            env=plan.guest_env,
        )
        teardown.callback(_smolvm.machine_delete, machine)
        _smolvm.machine_start(machine)
        # Registered after delete, so on unwind the stop runs first.
        teardown.callback(_smolvm.machine_stop, machine)

        # 6. Reclaim /home/node for the node user. smolvm's pack
        #    process remaps OCI-layer ownership to the host invoker's
        #    uid (501 on macOS) rather than preserving the image's
        #    uid 1000 — so without this chown, node can't write its
        #    own dotfiles (claude appendFileSync on ~/.claude.json
        #    bails with ENOENT/EPERM and the TUI hangs without
        #    surfacing the error).
        _smolvm.machine_exec(
            machine, ["chown", "-R", "node:node", "/home/node"],
        )

        # 7. Provision (CA / prompt / skills / git / supervise).
        provisioned_prompt = provision(plan, machine)

        yield SmolmachinesBottle(
            machine,
            prompt_path=provisioned_prompt,
            guest_env=plan.guest_env,
        )


def _with_minted_cas(plan: SmolmachinesBottlePlan) -> SmolmachinesBottlePlan:
    """Mint the per-bottle CAs and return a plan whose inner Plans
    carry the launch-time host paths.

    pipelock always runs in the bundle, so its CA is always minted.
    egress's CA is only minted when the bottle declares routes
    (otherwise egress runs idle without MITM and the CA files would
    be unused)."""
    pipelock_cert, pipelock_key = pipelock_tls_init(
        plan.proxy_plan.yaml_path.parent,
    )
    proxy = dataclasses.replace(
        plan.proxy_plan,
        ca_cert_host_path=pipelock_cert,
        ca_key_host_path=pipelock_key,
    )

    egress = plan.egress_plan
    if egress.routes:
        mitm_ca, mitm_ca_cert_only = egress_tls_init(egress.routes_path.parent)
        egress = dataclasses.replace(
            egress,
            mitmproxy_ca_host_path=mitm_ca,
            mitmproxy_ca_cert_only_host_path=mitm_ca_cert_only,
            pipelock_ca_host_path=pipelock_cert,
            # On smolmachines, egress's upstream is pipelock on the
            # bundle's localhost — they're in the same container's
            # network namespace.
            pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
        )

    return dataclasses.replace(plan, proxy_plan=proxy, egress_plan=egress)


def _with_agent_endpoints(plan: SmolmachinesBottlePlan) -> SmolmachinesBottlePlan:
    """Look up the bundle's published host ports and return a plan
    stamped with the agent's `127.0.0.1:<host port>` endpoints.

    Docker container IPs (192.168.x.x in the daemon's bridge) aren't
    reachable from the smolvm guest on macOS — TSI uses macOS
    networking, and macOS sees the daemon's bridge via the
    published-port loopback forward only. provision_git and
    provision_supervise read the plan fields; the agent reads
    guest_env on every exec_claude."""
    slug = plan.slug

    proxy_url = (
        f"http://127.0.0.1:{_bundle.bundle_host_port(slug, _PIPELOCK_PORT)}"
    )
    # Empty string means "daemon not running in this bottle".
    git_gate_host = (
        f"127.0.0.1:{_bundle.bundle_host_port(slug, _GIT_GATE_PORT)}"
        if plan.git_gate_plan.upstreams
        else ""
    )
    supervise_url = (
        f"http://127.0.0.1:{_bundle.bundle_host_port(slug, _SUPERVISE_PORT)}/"
        if plan.supervise_plan is not None
        else ""
    )

    guest_env = {
        **plan.guest_env,
        "HTTPS_PROXY": proxy_url,
        "HTTP_PROXY": proxy_url,
    }
    if git_gate_host:
        guest_env["GIT_GATE_URL"] = f"git://{git_gate_host}"
    if supervise_url:
        guest_env["MCP_SUPERVISE_URL"] = supervise_url

    return dataclasses.replace(
        plan,
        guest_env=guest_env,
        agent_proxy_url=proxy_url,
        agent_git_gate_host=git_gate_host,
        agent_supervise_url=supervise_url,
    )
|
|
|
|
|
|
def _bundle_launch_spec(
    plan: SmolmachinesBottlePlan, network: str
) -> _bundle.BundleLaunchSpec:
    """Build a BundleLaunchSpec from the resolved inner Plans.

    Daemons in the CSV:
      - egress + pipelock are always present (pipelock is the
        agent's first hop; egress is its upstream).
      - git-gate is conditional on plan.git_gate_plan.upstreams.
      - supervise is conditional on plan.supervise_plan.

    Env + volumes are the union of the four daemons' needs, with
    daemon-private values only (HTTPS_PROXY is scoped to the
    egress process by egress_entrypoint.sh — see PRD 0024's bundle
    bind-address PR).

    Args:
        plan: bottle plan whose proxy/egress Plans already carry the
            host paths of their CA files (they are bind-mounted here).
        network: docker network name, stored as the spec's
            `network_name`.

    Returns:
        The spec handed to `_bundle.start_bundle`. `ports_to_publish`
        lists only the container ports the agent dials from the
        guest: pipelock always, git-gate and supervise when enabled."""
    daemons: list[str] = ["egress", "pipelock"]
    # PRD 0023 chunk 3: egress binds 127.0.0.1 inside the bundle
    # so TSI's IP-only allowlist can't bypass pipelock.
    env: list[str] = ["EGRESS_LISTEN_HOST=127.0.0.1"]
    # (host path, in-container path, flag) triples. The flag is
    # presumably "read-only" (only the supervise queue dir passes
    # False) — confirm against BundleLaunchSpec.
    volumes: list[tuple[str, str, bool]] = []
    # Container ports the agent reaches from the smolvm guest —
    # published on host loopback so the guest can dial via TSI +
    # macOS networking. Egress is bundle-internal and never
    # published.
    ports_to_publish: list[int] = [_PIPELOCK_PORT]

    # --- pipelock ---------------------------------------------
    pp = plan.proxy_plan
    volumes += [
        (str(pp.yaml_path), "/etc/pipelock.yaml", True),
        (str(pp.ca_cert_host_path), PIPELOCK_CA_CERT_IN_CONTAINER, True),
        (str(pp.ca_key_host_path), PIPELOCK_CA_KEY_IN_CONTAINER, True),
    ]

    # --- egress -----------------------------------------------
    ep = plan.egress_plan
    if ep.routes:
        env += [
            f"EGRESS_UPSTREAM_PROXY={ep.pipelock_proxy_url}",
            f"EGRESS_UPSTREAM_CA={EGRESS_PIPELOCK_CA_IN_CONTAINER}",
        ]
        volumes += [
            (str(ep.routes_path), EGRESS_ROUTES_IN_CONTAINER, True),
            (str(ep.mitmproxy_ca_host_path), EGRESS_CA_IN_CONTAINER, True),
            (str(ep.pipelock_ca_host_path), EGRESS_PIPELOCK_CA_IN_CONTAINER, True),
        ]
        # Bare-name entries for upstream-token slots. Their values
        # come from the docker-run subprocess env (inherited from
        # the operator's shell), never landing on argv. Sorted so
        # the resulting spec is deterministic.
        env.extend(sorted(ep.token_env_map))

    # --- git-gate ---------------------------------------------
    gp = plan.git_gate_plan
    if gp.upstreams:
        daemons.append("git-gate")
        ports_to_publish.append(_GIT_GATE_PORT)
        volumes += [
            (str(gp.entrypoint_script), GIT_GATE_ENTRYPOINT_IN_CONTAINER, True),
            (str(gp.hook_script), GIT_GATE_HOOK_IN_CONTAINER, True),
            (str(gp.access_hook_script), GIT_GATE_ACCESS_HOOK_IN_CONTAINER, True),
        ]
        # One identity key per upstream, mounted under the creds dir
        # as `<upstream name>-key`.
        volumes += [
            (
                expand_tilde(u.identity_file),
                f"{GIT_GATE_CREDS_DIR_IN_CONTAINER}/{u.name}-key",
                True,
            )
            for u in gp.upstreams
        ]

    # --- supervise --------------------------------------------
    sp = plan.supervise_plan
    if sp is not None:
        daemons.append("supervise")
        ports_to_publish.append(_SUPERVISE_PORT)
        env += [
            f"SUPERVISE_BOTTLE_SLUG={plan.slug}",
            f"SUPERVISE_QUEUE_DIR={QUEUE_DIR_IN_CONTAINER}",
            f"SUPERVISE_PORT={SUPERVISE_PORT}",
        ]
        volumes.append((str(sp.queue_dir), QUEUE_DIR_IN_CONTAINER, False))

    return _bundle.BundleLaunchSpec(
        slug=plan.slug,
        network_name=network,
        subnet=plan.bundle_subnet,
        gateway=plan.bundle_gateway,
        bundle_ip=plan.bundle_ip,
        daemons_csv=",".join(daemons),
        environment=tuple(env),
        volumes=tuple(volumes),
        ports_to_publish=tuple(ports_to_publish),
    )
|
|
|
|
|
|
def _resolve_token_env(
    plan: SmolmachinesBottlePlan, host_env: object
) -> dict[str, str]:
    """Look up the egress token env-var values in the host environ.

    The result is merged into the env of the subprocess that starts
    the bundle, so the values reach the bundle's process env via
    docker's `-e NAME` inheritance. `host_env` is copied with
    `dict(...)` before being passed on, so it must be something
    `dict()` accepts (the caller passes `os.environ`).

    Returns an empty dict when the egress plan declares no routes."""
    egress = plan.egress_plan
    if egress.routes:
        return egress_resolve_token_values(egress.token_env_map, dict(host_env))
    return {}
|