From cefdc8c6e9800f99f455aa0b1a6540471383ff23 Mon Sep 17 00:00:00 2001 From: didericis Date: Mon, 25 May 2026 23:16:40 -0400 Subject: [PATCH] feat(launch): switch start to docker compose project per bottle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PRD 0018 chunk 3. Each instance is now one `docker compose` project: - launch.py renders the compose spec via chunk-1's bottle_plan_to_compose, writes it to state//docker-compose.yml, `docker compose up -d`s, and (on teardown) dumps `docker compose logs --no-color --timestamps` to state//compose.log before `docker compose down`. - Networks are pre-created (`docker network create --internal` + user-defined bridge) so pipelock yaml can know the internal CIDR before compose-up. Compose references them with `external: true`; the launch step's ExitStack still owns network removal. - Agent still runs `sleep infinity`; claude reaches it via `docker exec -it` exactly like before (per the PRD's resolved TTY question). - metadata.json grows a `compose_project` field so dashboard / cleanup tooling can derive compose invocations without re-deriving the slug. Security follow-ups from chunk-2 review: (b) CA private keys: pipelock + egress ca-key.pem land at 0o600 explicitly. The mitmproxy cert+key concat stays 0o644 because the egress container's uid-1000 user reads it through the bind mount; parent dir at 0o700 still restricts host-side reach. (c) Apply atomicity: egress_apply + pipelock_apply switch from `docker cp` to host-side write-temp-then-rename on the bind-mount source. POSIX rename is atomic on the same filesystem, so a sidecar SIGHUP racing the apply can't see a half-written routes.yaml / pipelock.yaml. Per-sidecar Docker{Sidecar}.start/stop methods stay in place — the integration test suite drives them directly to validate each image in isolation, which is still useful. launch.py no longer calls them; a follow-up chunk can prune if the integration tests move to the compose lifecycle. git-gate entrypoint's chmod 600 on the keyfile + known_hosts now tolerates EROFS (`|| true`) — the host SSH key is already 0600 (SSH refuses to load otherwise), so the inside-container chmod was already a no-op in the docker-cp path and now just needs to not error on the read-only bind mount. 422 unit tests pass; supervise integration test passes; end-to-end `./cli.py start implementer` brings up the project, attaches, captures full merged logs on teardown, and reaps all containers + networks. --- claude_bottle/backend/docker/bottle_state.py | 8 + claude_bottle/backend/docker/compose.py | 142 +++++++- claude_bottle/backend/docker/egress.py | 10 + claude_bottle/backend/docker/egress_apply.py | 55 +-- claude_bottle/backend/docker/launch.py | 337 +++++++----------- claude_bottle/backend/docker/pipelock.py | 6 + .../backend/docker/pipelock_apply.py | 40 ++- claude_bottle/backend/docker/prepare.py | 1 + claude_bottle/git_gate.py | 10 +- tests/unit/test_agent_no_proxy.py | 42 --- tests/unit/test_compose.py | 13 +- 11 files changed, 362 insertions(+), 302 deletions(-) delete mode 100644 tests/unit/test_agent_no_proxy.py diff --git a/claude_bottle/backend/docker/bottle_state.py b/claude_bottle/backend/docker/bottle_state.py index 136c1cf..1ea2c6b 100644 --- a/claude_bottle/backend/docker/bottle_state.py +++ b/claude_bottle/backend/docker/bottle_state.py @@ -98,6 +98,13 @@ class BottleMetadata: cwd: str # empty string when --cwd was not passed copy_cwd: bool started_at: str # ISO 8601 UTC + # PRD 0018 chunk 3: derivable from identity via + # `compose_project_name(identity)`, but persisted explicitly so + # dashboard / cleanup / resume tooling can read it without + # importing the compose module. Empty string for state dirs + # written before chunk 3 (resume / inspect should fall back to + # deriving from identity in that case). + compose_project: str = "" def metadata_path(identity: str) -> Path: @@ -130,6 +137,7 @@ def read_metadata(identity: str) -> BottleMetadata | None: cwd=str(raw.get("cwd", "")), copy_cwd=bool(raw.get("copy_cwd", False)), started_at=str(raw.get("started_at", "")), + compose_project=str(raw.get("compose_project", "")), ) diff --git a/claude_bottle/backend/docker/compose.py b/claude_bottle/backend/docker/compose.py index 00fcbaf..432316c 100644 --- a/claude_bottle/backend/docker/compose.py +++ b/claude_bottle/backend/docker/compose.py @@ -39,6 +39,9 @@ aren't rebuilt on every up. from __future__ import annotations +import json +import subprocess +import sys from pathlib import Path from typing import Any @@ -46,6 +49,7 @@ from ...egress import ( EGRESS_HOSTNAME, EGRESS_ROUTES_IN_CONTAINER, ) +from ...log import die, warn from ...git_gate import git_gate_aggregate_extra_hosts from ...supervise import ( CURRENT_CONFIG_DIR_IN_AGENT, @@ -126,18 +130,21 @@ def bottle_plan_to_compose(plan: DockerBottlePlan) -> dict[str, Any]: def _networks(plan: DockerBottlePlan) -> dict[str, Any]: - """Two compose-managed networks with explicit `name:` matching - the existing slug-suffixed convention. The internal one is - `--internal` (no default gateway); the egress one is a normal - user-defined bridge so the upstream-bound sidecars can resolve - + reach the outside world.""" + """Both networks are `external: true` — chunk 3 pre-creates them + via `docker network create` so pipelock's yaml can embed the + internal-network CIDR in its SSRF allowlist before compose-up. + Compose just references the pre-existing networks by name. + Network lifecycle (create / remove) is owned by the compose- + lifecycle helpers, not compose itself; `docker compose down` + leaves external networks alone.""" return { "internal": { "name": plan.proxy_plan.internal_network, - "internal": True, + "external": True, }, "egress": { "name": plan.proxy_plan.egress_network, + "external": True, }, } @@ -382,4 +389,125 @@ def _agent_no_proxy(plan: DockerBottlePlan) -> str: return ",".join(hosts) -__all__ = ["bottle_plan_to_compose"] +# --- Lifecycle helpers (PRD 0018 chunk 3) ---------------------------------- +# +# The renderer above is pure. The helpers below own the I/O side: +# serialize the spec to disk, drive `docker compose up`, dump the +# merged log file on teardown, and `docker compose down` to clean up +# (networks are pre-created externally so `down` leaves them alone; +# the launch step removes them in its own teardown step). + + +COMPOSE_FILE_NAME = "docker-compose.yml" +COMPOSE_LOG_NAME = "compose.log" + + +def compose_project_name(slug: str) -> str: + """Stable mapping from slug → compose project. Matches the + `name:` field the renderer emits, so `docker compose ls` + enumeration and direct CLI invocations agree on the project + identifier.""" + return f"claude-bottle-{slug}" + + +def compose_file_path(state_dir: Path) -> Path: + return state_dir / COMPOSE_FILE_NAME + + +def compose_log_path(state_dir: Path) -> Path: + return state_dir / COMPOSE_LOG_NAME + + +def write_compose_file(spec: dict[str, Any], path: Path) -> Path: + """Serialize the compose dict to disk. JSON content with a + `.yml` filename — JSON is a strict subset of YAML 1.2 for the + constructs the renderer uses (mappings, lists, strings, bools, + nulls), and `docker compose -f file.yml` parses it as YAML. + Avoids a yaml dependency while keeping the file `cat`-readable. + """ + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(spec, indent=2, sort_keys=False) + "\n") + path.chmod(0o644) + return path + + +def _compose_argv(project: str, compose_file: Path, *cmd: str) -> list[str]: + return [ + "docker", "compose", + "-p", project, + "-f", str(compose_file), + *cmd, + ] + + +def compose_up( + project: str, + compose_file: Path, + *, + env: dict[str, str] | None = None, +) -> None: + """`docker compose up -d` for the project. Env-inheritance is + via `env=` on the subprocess — every `environment: [NAME]` (bare + name) entry in the compose file resolves to whatever value + `NAME` has in `env` at exec time. Secrets never land on argv or + in the compose file.""" + argv = _compose_argv(project, compose_file, "up", "-d") + result = subprocess.run( + argv, capture_output=True, text=True, env=env, check=False, + ) + if result.returncode != 0: + sys.stderr.write(result.stderr) + die(f"docker compose up failed for project {project}") + + +def compose_dump_logs(project: str, compose_file: Path, output: Path) -> None: + """Write the merged stdout/stderr of every service to `output` + using `docker compose logs --no-color --timestamps`. Best-effort + — failures here shouldn't block teardown. The interleaved single + file is what the user reads post-mortem; per-service tail still + works through `docker compose logs -f ` while the + project is up.""" + output.parent.mkdir(parents=True, exist_ok=True) + argv = _compose_argv(project, compose_file, "logs", "--no-color", "--timestamps") + try: + with open(output, "wb") as f: + subprocess.run( + argv, + stdout=f, + stderr=subprocess.STDOUT, + check=False, + ) + output.chmod(0o644) + except OSError as e: + warn(f"failed to write compose log to {output}: {e}") + + +def compose_down(project: str, compose_file: Path) -> None: + """`docker compose down` for the project. External networks are + intentionally NOT removed by compose (`external: true` on the + networks block); the launch step's own teardown removes them + via `network_remove` so the per-bottle ephemeral subnet doesn't + accumulate.""" + argv = _compose_argv(project, compose_file, "down") + result = subprocess.run( + argv, capture_output=True, text=True, check=False, + ) + if result.returncode != 0: + warn( + f"docker compose down failed for project {project}: " + f"{result.stderr.strip()}" + ) + + +__all__ = [ + "COMPOSE_FILE_NAME", + "COMPOSE_LOG_NAME", + "bottle_plan_to_compose", + "compose_down", + "compose_dump_logs", + "compose_file_path", + "compose_log_path", + "compose_project_name", + "compose_up", + "write_compose_file", +] diff --git a/claude_bottle/backend/docker/egress.py b/claude_bottle/backend/docker/egress.py index 9d1f5f1..c535608 100644 --- a/claude_bottle/backend/docker/egress.py +++ b/claude_bottle/backend/docker/egress.py @@ -114,6 +114,10 @@ def egress_tls_init(stage_dir: Path) -> tuple[Path, Path]: ) if keygen.returncode != 0: die(f"egress ca keygen failed: {keygen.stderr.strip()}") + # Standalone private key — never docker-cp'd, never bind-mounted + # (mitmproxy reads the cert+key concat below). Lock to owner- + # only so it doesn't sit at the default umask on disk. + key_path.chmod(0o600) # `subjectKeyIdentifier=hash` makes openssl compute the SKI as # SHA-1(pubkey), matching how mitmproxy computes the AKI on the @@ -149,6 +153,12 @@ def egress_tls_init(stage_dir: Path) -> tuple[Path, Path]: cert_path.chmod(0o644) # mitmproxy reads cert + key from a single concatenated PEM file. + # This file IS bind-mounted into the egress container (chunk 3+), + # where mitmproxy runs as uid 1000 — so the host file has to be + # world-readable for the container's user to read it through the + # mount. Owner-only mode on the parent dir (state//, under + # ~/.claude-bottle which inherits ~'s 0o700) is what actually + # restricts who can reach this file on the host. mitm = work / "mitmproxy-ca.pem" mitm.write_bytes(cert_path.read_bytes() + key_path.read_bytes()) mitm.chmod(0o644) diff --git a/claude_bottle/backend/docker/egress_apply.py b/claude_bottle/backend/docker/egress_apply.py index 63e633a..eb72cd8 100644 --- a/claude_bottle/backend/docker/egress_apply.py +++ b/claude_bottle/backend/docker/egress_apply.py @@ -31,6 +31,7 @@ from pathlib import Path from ...egress import EGRESS_ROUTES_IN_CONTAINER from ...egress_addon_core import load_routes +from .bottle_state import egress_state_dir from .egress import egress_container_name from .pipelock_apply import ( PipelockApplyError, @@ -41,6 +42,12 @@ from .pipelock_apply import ( ) +def _egress_routes_host_path(slug: str) -> Path: + """The bind-mount source for the egress sidecar's routes.yaml. + Must match what egress.prepare wrote at chunk-2 paths.""" + return egress_state_dir(slug) / "egress_routes.yaml" + + class EgressApplyError(RuntimeError): """Raised when fetch / apply fails. Caller renders to the operator; does not crash the dashboard.""" @@ -163,31 +170,29 @@ def apply_routes_change(slug: str, new_content: str) -> tuple[str, str]: # and the operator gets a clear error about the half-state. _mirror_hosts_to_pipelock(slug, _hosts_in_routes(new_content)) - fd, tmp_path = tempfile.mkstemp(prefix="cb-routes.", suffix=".yaml") + # PRD 0018 chunk 3 + security item (c): routes.yaml is bind- + # mounted into the egress container, so the write target is the + # host path the sidecar reads through the mount. POSIX + # rename-onto-self is atomic on the same filesystem, so a sidecar + # SIGHUP racing the apply can never observe a half-written file — + # it sees either the old bytes or the new ones. + target = _egress_routes_host_path(slug) + target.parent.mkdir(parents=True, exist_ok=True) + fd, tmp_path_str = tempfile.mkstemp( + prefix=".egress_routes.", suffix=".yaml.tmp", dir=str(target.parent), + ) + tmp_path = Path(tmp_path_str) try: with os.fdopen(fd, "w") as f: f.write(new_content) - # mkstemp creates the file with mode 0600. `docker cp` - # preserves mode + host uid into the container, so without - # chmod the file lands as 0600 owned by the host user's uid, - # which inside the container is not mitmproxy (uid 1000) — - # the addon's reload then fails with PermissionError on the - # SIGHUP-triggered re-read and the old routes table stays in - # memory. Bump to 0644 so mitmproxy can read it post-cp; - # the host stage_dir doesn't apply to this tmp file but the - # content isn't secret (no tokens — those live in the - # container's environ), so 0644 in /tmp is fine. + # mitmproxy in the container reads through the bind mount as + # uid 1000; the host file has to be world-readable for that + # read to succeed (parent dir at 0o700 still restricts who + # can reach the file on the host). Routes content is not + # secret — tokens live in the container's environ — so 0o644 + # is the right trade-off. os.chmod(tmp_path, 0o644) - cp = subprocess.run( - ["docker", "cp", tmp_path, - f"{container}:{EGRESS_ROUTES_IN_CONTAINER}"], - capture_output=True, text=True, check=False, - ) - if cp.returncode != 0: - raise EgressApplyError( - f"failed to copy routes.yaml into {container}: " - f"{(cp.stderr or '').strip()}" - ) + os.replace(tmp_path, target) sig = subprocess.run( ["docker", "kill", "--signal", "HUP", container], capture_output=True, text=True, check=False, @@ -197,11 +202,15 @@ def apply_routes_change(slug: str, new_content: str) -> tuple[str, str]: f"failed to SIGHUP {container}: " f"{(sig.stderr or '').strip()}" ) - finally: + except BaseException: + # On any failure pre-rename, drop the tmp file. Post-rename + # there's nothing to clean up — `os.replace` is atomic so + # either the new file is in place or the old one still is. try: - Path(tmp_path).unlink() + tmp_path.unlink() except OSError: pass + raise return before, new_content diff --git a/claude_bottle/backend/docker/launch.py b/claude_bottle/backend/docker/launch.py index 4745781..5bb439f 100644 --- a/claude_bottle/backend/docker/launch.py +++ b/claude_bottle/backend/docker/launch.py @@ -1,34 +1,72 @@ """Launch step for the Docker bottle backend. -`launch` is a context manager: builds the image(s), creates the per- -agent networks, brings up the pipelock sidecar, starts the agent -container, then runs the provision step. Teardown is sequenced via an -ExitStack so callbacks fire in reverse-order of registration even if -something raises mid-bring-up. +PRD 0018 chunk 3: each instance is one `docker compose` project. + +The flow is: + + 1. Build the agent's base + derived image (compose builds the + sidecar images via the `build:` directive on first up). + 2. Pre-create the per-bottle networks. We do this outside compose + so we can inspect the assigned internal CIDR and embed it in + pipelock's yaml (compose's `external: true` lets the compose + file reference these pre-existing networks). + 3. Mint the per-bottle CAs (chunk 2 writes them under + state//{pipelock,egress}/). + 4. Re-render pipelock yaml with the now-known internal CIDR so + the SSRF allowlist exempts the bottle's own subnet. + 5. Populate the inner plans with launch-time fields so the + renderer can read network names, CA paths, pipelock URL. + 6. Render the compose spec, write it to + state//docker-compose.yml, write metadata.json. + 7. `docker compose up -d` (token + OAuth values flow into the + compose subprocess env so `environment: [NAME]` bare-name + entries inherit without rendering values into the file). + 8. Provision (CA install, prompt copy, skills, git, supervise + config) — unchanged, uses `docker exec`. + 9. Yield a DockerBottle handle. `exec_claude` runs claude via + `docker exec -it` exactly like the pre-compose world. + +Teardown (ExitStack callbacks fire in reverse): + - Dump `docker compose logs --no-color --timestamps` to + state//compose.log (best-effort). + - `docker compose down` removes the project's containers (not the + external networks). + - `network_remove` deletes the two networks we pre-created. """ from __future__ import annotations import dataclasses import os -import subprocess -import sys from contextlib import ExitStack, contextmanager from pathlib import Path from typing import Callable, Generator -from ...log import die, info +from ...egress import egress_resolve_token_values +from ...log import info from ...pipelock import pipelock_build_config, pipelock_render_yaml -from ...supervise import CURRENT_CONFIG_DIR_IN_AGENT, SUPERVISE_HOSTNAME from . import network as network_mod from . import util as docker_mod from .bottle import DockerBottle from .bottle_plan import DockerBottlePlan -from .bottle_state import egress_state_dir, pipelock_state_dir +from .bottle_state import ( + bottle_state_dir, + egress_state_dir, + pipelock_state_dir, +) +from .compose import ( + bottle_plan_to_compose, + compose_down, + compose_dump_logs, + compose_file_path, + compose_log_path, + compose_project_name, + compose_up, + write_compose_file, +) from .egress import ( DockerEgress, egress_tls_init, - egress_url, ) from .git_gate import DockerGitGate from .pipelock import ( @@ -38,7 +76,6 @@ from .pipelock import ( pipelock_proxy_url, pipelock_tls_init, ) -from .provision.ca import AGENT_CA_BUNDLE, AGENT_CA_PATH from .supervise import DockerSupervise @@ -56,10 +93,15 @@ def launch( supervise: DockerSupervise, provision: Callable[[DockerBottlePlan, str], str | None], ) -> Generator[DockerBottle, None, None]: - """Build, launch, and provision a Docker bottle. Teardown on exit. + """Build, launch, and provision a Docker bottle via compose. + Teardown on exit. The per-sidecar `proxy / git_gate / egress / + supervise` parameters are vestigial from the pre-compose flow — + kept for backwards-compat with backend.py's call site; the + `start()`/`stop()` methods on those classes are no longer + invoked (chunk 3 collapsed them into the compose service spec). + They'll be removed entirely in a follow-up cleanup.""" + del proxy, git_gate, egress, supervise # not invoked in compose flow - `provision` is the backend's provision orchestrator (passed in so - this module stays free of backend-class plumbing).""" stack = ExitStack() def teardown() -> None: @@ -71,6 +113,8 @@ def launch( pass try: + # Step 1: agent image build. Sidecar images get built lazily by + # `docker compose up` via the renderer's `build:` directives. docker_mod.build_image( plan.image, _REPO_DIR, dockerfile=plan.dockerfile_path, @@ -80,45 +124,26 @@ def launch( plan.derived_image, plan.image, plan.spec.user_cwd ) + # Step 2: pre-create networks so we know the internal CIDR + # before pipelock yaml renders. internal_network = network_mod.network_create_internal(plan.slug) stack.callback(network_mod.network_remove, internal_network) egress_network = network_mod.network_create_egress(plan.slug) stack.callback(network_mod.network_remove, egress_network) - # Docker assigns a CIDR to the new internal network. Pipelock's - # SSRF guard otherwise rejects any destination resolving into - # RFC1918 space — which includes the sibling sidecars - # (egress → pipelock on the upstream leg, etc.). - # Allowlist the bottle's own internal subnet so internal - # traffic passes through pipelock; api_allowlist + body-scanning - # still apply. internal_cidr = network_mod.network_inspect_cidr(internal_network) - # Per-bottle ephemeral CAs (PRD 0006 + PRD 0017). Two - # separate CAs: - # - pipelock CA: signs MITM certs pipelock presents on the - # egress → upstream leg. - # - egress CA: signs MITM certs egress presents - # to the agent on the agent → egress leg. - # Both are minted by one-shot pipelock containers (pipelock's - # `tls init` is a known-good RSA CA minter) under stage_dir; - # the .start steps docker-cp the files in. Private keys never - # leave the host stage dir, which start.py's outer finally - # `shutil.rmtree`s after the sidecars are torn down. - # PRD 0018 chunk 2: CAs live under the bottle's state subdirs - # so chunk 3's compose bind-mounts have stable sources. The - # subdirs were created by prepare; tls_init makes the - # `pipelock-ca/` and `egress-ca/` children under them. + # Step 3: mint per-bottle CAs into state//{pipelock,egress}/. ca_cert_host, ca_key_host = pipelock_tls_init(pipelock_state_dir(plan.slug)) egress_ca_host, egress_ca_cert_only = egress_tls_init( egress_state_dir(plan.slug), ) - # Re-render the pipelock yaml with the SSRF allowlist now that - # we know the internal CIDR. Prepare wrote the yaml without - # the ssrf block (CIDR wasn't known yet); overwrite the same - # path so .start docker-cp's the updated content. + # Step 4: re-render pipelock yaml with the SSRF allowlist now + # that we know the internal CIDR. Prepare wrote the yaml + # without the ssrf block; overwrite the same path so the + # bind-mount picks up the updated content. bottle = plan.spec.manifest.bottle_for(plan.spec.agent_name) cfg = pipelock_build_config( bottle, @@ -129,6 +154,10 @@ def launch( plan.proxy_plan.yaml_path.write_text(pipelock_render_yaml(cfg)) plan.proxy_plan.yaml_path.chmod(0o600) + # Step 5: populate launch-time fields on every inner plan so + # the renderer reads concrete network names, CA paths, and + # pipelock URL. Match the field-by-field replacement the + # pre-compose launch did, just rolled into one pass. proxy_plan = dataclasses.replace( plan.proxy_plan, internal_network=internal_network, @@ -137,40 +166,17 @@ def launch( ca_cert_host_path=ca_cert_host, ca_key_host_path=ca_key_host, ) - # Re-bind the outer plan so provision_ca (which runs later - # from `provision(plan, container)`) can read the populated - # CA paths off plan.proxy_plan. - plan = dataclasses.replace(plan, proxy_plan=proxy_plan) - pipelock_name = proxy.start(plan.proxy_plan) - stack.callback(proxy.stop, pipelock_name) - - # Git gate (PRD 0008). One sidecar per agent, only brought up - # when the bottle has git entries. Same internal + egress - # network attachment as the other sidecars; agent dials it as - # `git:///.git` via the pushInsteadOf - # rules provision_git writes into ~/.gitconfig. - if plan.git_gate_plan.upstreams: + git_gate_plan = plan.git_gate_plan + if git_gate_plan.upstreams: git_gate_plan = dataclasses.replace( - plan.git_gate_plan, + git_gate_plan, internal_network=internal_network, egress_network=egress_network, ) - plan = dataclasses.replace(plan, git_gate_plan=git_gate_plan) - git_gate_name = git_gate.start(plan.git_gate_plan) - stack.callback(git_gate.stop, git_gate_name) - - # Egress-proxy (PRD 0017). One sidecar per bottle when - # bottle.egress.routes is non-empty. Must come up AFTER - # pipelock — egress routes its outbound HTTPS through - # pipelock (HTTPS_PROXY in environ + the pipelock CA in its - # trust store) so the egress allowlist + body scanner sit on - # the egress → upstream leg. Must come up BEFORE the - # agent so DNS resolution for `egress` succeeds on the - # agent's first call; tokens flow from the host env into the - # sidecar's environ, not the agent's. - if plan.egress_plan.routes: + egress_plan = plan.egress_plan + if egress_plan.routes: egress_plan = dataclasses.replace( - plan.egress_plan, + egress_plan, internal_network=internal_network, egress_network=egress_network, mitmproxy_ca_host_path=egress_ca_host, @@ -178,151 +184,62 @@ def launch( pipelock_ca_host_path=ca_cert_host, pipelock_proxy_url=pipelock_proxy_url(plan.slug), ) - plan = dataclasses.replace(plan, egress_plan=egress_plan) - egress_name = egress.start(plan.egress_plan) - stack.callback(egress.stop, egress_name) - - # Supervise sidecar (PRD 0013). Opt-in via bottle.supervise. - # Internal-network only — the sidecar makes no outbound calls. - # Must come up BEFORE the agent so DNS resolution for - # `supervise` succeeds on the agent's first tool call. - if plan.supervise_plan is not None: + supervise_plan = plan.supervise_plan + if supervise_plan is not None: supervise_plan = dataclasses.replace( - plan.supervise_plan, + supervise_plan, internal_network=internal_network, ) - plan = dataclasses.replace(plan, supervise_plan=supervise_plan) - supervise_name = supervise.start(plan.supervise_plan) - stack.callback(supervise.stop, supervise_name) + plan = dataclasses.replace( + plan, + proxy_plan=proxy_plan, + git_gate_plan=git_gate_plan, + egress_plan=egress_plan, + supervise_plan=supervise_plan, + ) - container = _run_agent_container(plan, internal_network) - stack.callback(docker_mod.force_remove_container, container) + # Step 6: render + write the compose file. metadata.json + # was written at prepare time and already carries + # compose_project; nothing to update here. + state_dir = bottle_state_dir(plan.slug) + spec = bottle_plan_to_compose(plan) + compose_file = write_compose_file(spec, compose_file_path(state_dir)) + project = compose_project_name(plan.slug) - prompt_path = provision(plan, container) + # Step 7: compose up. Token values + the OAuth placeholder + # flow through subprocess env; the compose file holds only + # bare names for the secret-carrying entries. + token_values: dict[str, str] = {} + if plan.egress_plan.routes: + token_values = egress_resolve_token_values( + plan.egress_plan.token_env_map, dict(os.environ), + ) + compose_env: dict[str, str] = { + **os.environ, + **plan.forwarded_env, + **token_values, + } + info( + f"docker compose up -d (project {project}, " + f"{len(spec['services'])} services)" + ) + compose_up(project, compose_file, env=compose_env) - yield DockerBottle(container, teardown, prompt_path) + # Register teardown in reverse order: log dump first, then + # `compose down`. Networks come down last via callbacks + # registered in step 2. + stack.callback(compose_down, project, compose_file) + stack.callback( + compose_dump_logs, project, compose_file, compose_log_path(state_dir), + ) + + # Step 8: provision. Unchanged — uses `docker exec` against + # the agent container by its known name. + prompt_path = provision(plan, plan.container_name) + + # Step 9: yield. exec_claude continues to use `docker exec -it` + # — the agent runs `sleep infinity` per the renderer's + # service spec. + yield DockerBottle(plan.container_name, teardown, prompt_path) finally: teardown() - - -def _agent_no_proxy(plan: DockerBottlePlan) -> str: - """NO_PROXY value for the agent container. Standard loopback + - `supervise` when the supervise sidecar is enabled. - - Supervise needs to bypass pipelock because the MCP tool-call - pattern is long-poll: claude-code opens an HTTPS-style request to - http://supervise:9100/, the sidecar holds it open until the - operator approves (potentially minutes), then returns the - response. Pipelock is a forward proxy with idle timeouts; - pipelock cuts the long-polled connection well before the operator - can act, and claude-code reports the tool as ✘ failed even - though /mcp shows ✔ connected. - - The supervise sidecar is on the bottle's internal network with - the `supervise` network-alias, so the agent can dial it - directly via docker DNS. Body-scanning the supervise traffic - isn't critical — the operator reviews every proposal in the TUI.""" - hosts = ["localhost", "127.0.0.1"] - if plan.supervise_plan is not None: - hosts.append(SUPERVISE_HOSTNAME) - return ",".join(hosts) - - -def _agent_proxy_url(plan: DockerBottlePlan) -> str: - """Pick the proxy URL the agent's HTTP_PROXY env points at. PRD - 0017: when an egress is declared, the agent goes through - egress (which in turn uses HTTPS_PROXY=pipelock on its - outbound leg). Otherwise the agent talks straight to pipelock — - keeps the network surface minimal for bottles that don't need - path filtering or credential injection.""" - if plan.egress_plan.routes: - return egress_url() - return pipelock_proxy_url(plan.slug) - - -def _run_agent_container(plan: DockerBottlePlan, internal_network: str) -> str: - """Build the `docker run` argv and execute it, handling name- - conflict races by incrementing the suffix (unless the name was - user-pinned). Returns the resolved container name.""" - proxy_url = _agent_proxy_url(plan) - no_proxy = _agent_no_proxy(plan) - # Set BOTH cases of every *_PROXY var. libcurl's CVE-2016-5388 - # httpoxy mitigation makes it ignore uppercase `HTTP_PROXY` for - # `http://` URLs and only honor lowercase `http_proxy`. Without - # the lowercase var, plain-HTTP requests from the agent bypass - # egress entirely (going direct, then failing with - # "network unreachable" because the agent's bridge is - # --internal). Lowercase HTTPS_PROXY isn't strictly needed but - # we set it for symmetry — some tools check one or the other. - docker_args: list[str] = [ - "--rm", "-d", - "--name", plan.container_name, - "--network", internal_network, - "-e", f"HTTPS_PROXY={proxy_url}", - "-e", f"HTTP_PROXY={proxy_url}", - "-e", f"https_proxy={proxy_url}", - "-e", f"http_proxy={proxy_url}", - "-e", f"NO_PROXY={no_proxy}", - "-e", f"no_proxy={no_proxy}", - # CA trust trio for the agent process. Docker propagates - # run-time env into `docker exec`, so `claude` sees these - # without per-exec threading. NODE_EXTRA_CA_CERTS points at - # the cert file (Node appends it to its bundled roots); - # SSL_CERT_FILE / REQUESTS_CA_BUNDLE point at the system - # bundle that `update-ca-certificates` rebuilds in - # provision_ca. - "-e", f"NODE_EXTRA_CA_CERTS={AGENT_CA_PATH}", - "-e", f"SSL_CERT_FILE={AGENT_CA_BUNDLE}", - "-e", f"REQUESTS_CA_BUNDLE={AGENT_CA_BUNDLE}", - ] - if plan.use_runsc: - docker_args.extend(["--runtime", "runsc"]) - if plan.env_file.stat().st_size > 0: - docker_args.extend(["--env-file", str(plan.env_file)]) - for name in plan.forwarded_env: - docker_args.extend(["-e", name]) - - # PRD 0013: read-only current-config mount so the agent can read - # routes.yaml / allowlist / Dockerfile before composing a - # supervise tool-call proposal. Mounted from the per-bottle - # stage_dir/current-config/ populated at prepare time. - if plan.supervise_plan is not None: - docker_args.extend([ - "-v", - f"{plan.supervise_plan.current_config_dir}:{CURRENT_CONFIG_DIR_IN_AGENT}:ro", - ]) - - docker_args.extend([plan.runtime_image, "sleep", "infinity"]) - - info(f"starting container {plan.container_name} from {plan.runtime_image}") - - # Inject forwarded values (secrets, interpolated host vars, the - # renamed OAuth token) into the docker-run child's env so the - # `-e NAME` flags above pick them up — without touching our own - # os.environ or putting values on argv. - child_env: dict[str, str] = {**os.environ, **plan.forwarded_env} - - name_idx = docker_args.index("--name") + 1 - for candidate in docker_mod.container_name_candidates(plan.container_name): - docker_args[name_idx] = candidate - run_result = subprocess.run( - ["docker", "run", *docker_args], - capture_output=True, - text=True, - env=child_env, - check=False, - ) - if run_result.returncode == 0: - return candidate - err_text = run_result.stderr - if plan.container_name_pinned or "is already in use" not in err_text: - sys.stderr.write(err_text + "\n") - die(f"docker run failed for container '{candidate}'") - info(f"name conflict on {candidate}; retrying with next candidate") - die( - f"could not find a free container name after " - f"{plan.container_name}-{docker_mod.MAX_CONTAINER_SUFFIX} retries; " - f"clean up old containers" - ) - - diff --git a/claude_bottle/backend/docker/pipelock.py b/claude_bottle/backend/docker/pipelock.py index 89ce9f2..a7b3a6b 100644 --- a/claude_bottle/backend/docker/pipelock.py +++ b/claude_bottle/backend/docker/pipelock.py @@ -67,6 +67,12 @@ def pipelock_tls_init(stage_dir: Path) -> tuple[Path, Path]: key = work / "ca-key.pem" if not cert.is_file() or not key.is_file(): die(f"pipelock tls init did not produce ca files in {work}") + # Explicit perms in case a future pipelock release changes + # defaults. Pipelock runs as root in its distroless image and + # bind-mounts work with 0o600 (root reads everything); the key + # has no reason to be readable to anyone else on the host. + key.chmod(0o600) + cert.chmod(0o644) return (cert, key) diff --git a/claude_bottle/backend/docker/pipelock_apply.py b/claude_bottle/backend/docker/pipelock_apply.py index 9a27cd2..6c3ae0a 100644 --- a/claude_bottle/backend/docker/pipelock_apply.py +++ b/claude_bottle/backend/docker/pipelock_apply.py @@ -24,9 +24,17 @@ from pathlib import Path from ...pipelock import pipelock_render_yaml from ...yaml_subset import parse_yaml_subset +from .bottle_state import pipelock_state_dir from .pipelock import pipelock_container_name +def _pipelock_yaml_host_path(slug: str) -> Path: + """The bind-mount source for the pipelock sidecar's + pipelock.yaml — matches what pipelock.prepare wrote at chunk-2 + paths.""" + return pipelock_state_dir(slug) / "pipelock.yaml" + + PIPELOCK_YAML_IN_CONTAINER = "/etc/pipelock.yaml" # Allowlist proposals are one-hostname-per-line. Blank lines and @@ -141,19 +149,26 @@ def apply_allowlist_change( cfg["api_allowlist"] = new_hosts rendered = pipelock_render_yaml(cfg) - fd, tmp_path = tempfile.mkstemp(prefix="cb-pipelock-yaml.", suffix=".yaml") + # PRD 0018 chunk 3 + security item (c): pipelock.yaml is + # bind-mounted into the container, so the write target is the + # host path the sidecar reads. POSIX rename is atomic on the + # same filesystem, which matters less here than for the + # SIGHUP-reload egress case (pipelock fully restarts and + # re-reads on boot), but the pattern is uniform across both + # apply paths. + target = _pipelock_yaml_host_path(slug) + target.parent.mkdir(parents=True, exist_ok=True) + fd, tmp_path_str = tempfile.mkstemp( + prefix=".pipelock.", suffix=".yaml.tmp", dir=str(target.parent), + ) + tmp_path = Path(tmp_path_str) try: with os.fdopen(fd, "w") as f: f.write(rendered) - cp = subprocess.run( - ["docker", "cp", tmp_path, f"{container}:{PIPELOCK_YAML_IN_CONTAINER}"], - capture_output=True, text=True, check=False, - ) - if cp.returncode != 0: - raise PipelockApplyError( - f"failed to copy pipelock.yaml into {container}: " - f"{(cp.stderr or '').strip()}" - ) + # pipelock runs as root in its distroless image — any mode + # is fine — but 0o600 matches what prepare wrote. + os.chmod(tmp_path, 0o600) + os.replace(tmp_path, target) restart = subprocess.run( ["docker", "restart", container], capture_output=True, text=True, check=False, @@ -163,11 +178,12 @@ def apply_allowlist_change( f"failed to restart {container}: " f"{(restart.stderr or '').strip()}" ) - finally: + except BaseException: try: - Path(tmp_path).unlink() + tmp_path.unlink() except OSError: pass + raise return before, after diff --git a/claude_bottle/backend/docker/prepare.py b/claude_bottle/backend/docker/prepare.py index 555ac14..f19a627 100644 --- a/claude_bottle/backend/docker/prepare.py +++ b/claude_bottle/backend/docker/prepare.py @@ -72,6 +72,7 @@ def resolve_plan( cwd=spec.user_cwd if spec.copy_cwd else "", copy_cwd=spec.copy_cwd, started_at=datetime.now(timezone.utc).isoformat(), + compose_project=f"claude-bottle-{slug}", )) # Clear any leftover preserve marker from a prior capability-block # so this fresh launch can be cleaned up at session-end unless diff --git a/claude_bottle/git_gate.py b/claude_bottle/git_gate.py index 0de6c22..db092a2 100644 --- a/claude_bottle/git_gate.py +++ b/claude_bottle/git_gate.py @@ -163,9 +163,15 @@ def git_gate_render_entrypoint(upstreams: tuple[GitGateUpstream, ...]) -> str: " keyfile=/git-gate/creds/${name}-key", " hostsfile=/git-gate/creds/${name}-known_hosts", "", - " chmod 600 \"$keyfile\"", + # `|| true`: PRD 0018 chunk 3+ bind-mounts these RO from the + # host, so chmod-syscalls fail with EROFS. The files already + # have the right perms on the host (SSH requires 0600 to load + # the key in the first place), so the chmod is best-effort + # cleanup for the legacy docker-cp path where the file + # landed at the host's umask perms. + " chmod 600 \"$keyfile\" 2>/dev/null || true", " if [ -f \"$hostsfile\" ]; then", - " chmod 600 \"$hostsfile\"", + " chmod 600 \"$hostsfile\" 2>/dev/null || true", " fi", "", " repo=/git/${name}.git", diff --git a/tests/unit/test_agent_no_proxy.py b/tests/unit/test_agent_no_proxy.py deleted file mode 100644 index d72a2b3..0000000 --- a/tests/unit/test_agent_no_proxy.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Unit: agent NO_PROXY value builder (PR #25 follow-up). - -claude-code's HTTP MCP client must bypass pipelock for the supervise -sidecar — long-poll tool calls would hit pipelock's idle timeout -otherwise. This test pins the rule: localhost always; supervise iff -the supervise sidecar is in the plan.""" - -import unittest -from pathlib import Path - -from claude_bottle.backend.docker.launch import _agent_no_proxy - - -class _FakePlan: - """Just enough plan shape for the helper — no full DockerBottlePlan - construction needed.""" - - def __init__(self, supervise_plan): - self.supervise_plan = supervise_plan - - -class _SentinelSupervisePlan: - """The helper only checks `supervise_plan is not None`; any object - is fine.""" - - -class TestAgentNoProxy(unittest.TestCase): - def test_loopback_only_when_no_supervise(self): - self.assertEqual( - "localhost,127.0.0.1", - _agent_no_proxy(_FakePlan(supervise_plan=None)), - ) - - def test_supervise_appended_when_enabled(self): - self.assertEqual( - "localhost,127.0.0.1,supervise", - _agent_no_proxy(_FakePlan(supervise_plan=_SentinelSupervisePlan())), - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/test_compose.py b/tests/unit/test_compose.py index 11aa01c..7306833 100644 --- a/tests/unit/test_compose.py +++ b/tests/unit/test_compose.py @@ -176,19 +176,20 @@ class TestProjectAndNetworks(unittest.TestCase): spec = bottle_plan_to_compose(_plan()) self.assertEqual(f"claude-bottle-{SLUG}", spec["name"]) - def test_internal_network_is_internal(self): + def test_internal_network_marked_external(self): + # Chunk 3 pre-creates networks with `docker network create + # --internal` so pipelock can know the CIDR before compose-up. + # Compose references the network by name with `external: true`. spec = bottle_plan_to_compose(_plan()) net = spec["networks"]["internal"] self.assertEqual(f"claude-bottle-net-{SLUG}", net["name"]) - self.assertTrue(net["internal"]) + self.assertTrue(net["external"]) - def test_egress_network_is_external_bridge(self): + def test_egress_network_marked_external(self): spec = bottle_plan_to_compose(_plan()) net = spec["networks"]["egress"] self.assertEqual(f"claude-bottle-egress-{SLUG}", net["name"]) - # No `internal:` key on the egress network — defaults to a - # normal user-defined bridge. - self.assertNotIn("internal", net) + self.assertTrue(net["external"]) class TestPipelockAlwaysPresent(unittest.TestCase):