feat(launch): switch start to docker compose project per bottle

PRD 0018 chunk 3. Each instance is now one `docker compose` project:

- launch.py renders the compose spec via chunk-1's bottle_plan_to_compose, writes it to state/<slug>/docker-compose.yml, `docker compose up -d`s, and (on teardown) dumps `docker compose logs --no-color --timestamps` to state/<slug>/compose.log before `docker compose down`.
- Networks are pre-created (`docker network create --internal` + user-defined bridge) so pipelock yaml can know the internal CIDR before compose-up. Compose references them with `external: true`; the launch step's ExitStack still owns network removal.
- Agent still runs `sleep infinity`; claude reaches it via `docker exec -it` exactly like before (per the PRD's resolved TTY question).
- metadata.json grows a `compose_project` field so dashboard / cleanup tooling can derive compose invocations without re-deriving the slug.

Security follow-ups from chunk-2 review:

(b) CA private keys: pipelock + egress ca-key.pem land at 0o600 explicitly. The mitmproxy cert+key concat stays 0o644 because the egress container's uid-1000 user reads it through the bind mount; parent dir at 0o700 still restricts host-side reach.
(c) Apply atomicity: egress_apply + pipelock_apply switch from `docker cp` to host-side write-temp-then-rename on the bind-mount source. POSIX rename is atomic on the same filesystem, so a sidecar SIGHUP racing the apply can't see a half-written routes.yaml / pipelock.yaml.

Per-sidecar Docker{Sidecar}.start/stop methods stay in place — the integration test suite drives them directly to validate each image in isolation, which is still useful. launch.py no longer calls them; a follow-up chunk can prune if the integration tests move to the compose lifecycle.
git-gate entrypoint's chmod 600 on the keyfile + known_hosts now tolerates EROFS (`|| true`) — the host SSH key is already 0600 (SSH refuses to load otherwise), so the inside-container chmod was already a no-op in the docker-cp path and now just needs to not error on the read-only bind mount. 422 unit tests pass; supervise integration test passes; end-to-end `./cli.py start implementer` brings up the project, attaches, captures full merged logs on teardown, and reaps all containers + networks.
2026-05-25 23:16:40 -04:00
parent b9f6889d09
commit cefdc8c6e9
11 changed files with 362 additions and 302 deletions
@@ -31,6 +31,7 @@ from pathlib import Path

 from ...egress import EGRESS_ROUTES_IN_CONTAINER
 from ...egress_addon_core import load_routes
+from .bottle_state import egress_state_dir
 from .egress import egress_container_name
 from .pipelock_apply import (
    PipelockApplyError,
@@ -41,6 +42,12 @@ from .pipelock_apply import (
 )


+def _egress_routes_host_path(slug: str) -> Path:
+    """The bind-mount source for the egress sidecar's routes.yaml.
+    Must match what egress.prepare wrote at chunk-2 paths."""
+    return egress_state_dir(slug) / "egress_routes.yaml"
+
+
 class EgressApplyError(RuntimeError):
    """Raised when fetch / apply fails. Caller renders to the
    operator; does not crash the dashboard."""
@@ -163,31 +170,29 @@ def apply_routes_change(slug: str, new_content: str) -> tuple[str, str]:
    # and the operator gets a clear error about the half-state.
    _mirror_hosts_to_pipelock(slug, _hosts_in_routes(new_content))

-    fd, tmp_path = tempfile.mkstemp(prefix="cb-routes.", suffix=".yaml")
+    # PRD 0018 chunk 3 + security item (c): routes.yaml is bind-
+    # mounted into the egress container, so the write target is the
+    # host path the sidecar reads through the mount. A POSIX rename
+    # over the target is atomic on the same filesystem, so a sidecar
+    # SIGHUP racing the apply can never observe a half-written file —
+    # it sees either the old bytes or the new ones.
+    target = _egress_routes_host_path(slug)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp_path_str = tempfile.mkstemp(
+        prefix=".egress_routes.", suffix=".yaml.tmp", dir=str(target.parent),
+    )
+    tmp_path = Path(tmp_path_str)
    try:
        with os.fdopen(fd, "w") as f:
            f.write(new_content)
-        # mkstemp creates the file with mode 0600. `docker cp`
-        # preserves mode + host uid into the container, so without
-        # chmod the file lands as 0600 owned by the host user's uid,
-        # which inside the container is not mitmproxy (uid 1000) —
-        # the addon's reload then fails with PermissionError on the
-        # SIGHUP-triggered re-read and the old routes table stays in
-        # memory. Bump to 0644 so mitmproxy can read it post-cp;
-        # the host stage_dir doesn't apply to this tmp file but the
-        # content isn't secret (no tokens — those live in the
-        # container's environ), so 0644 in /tmp is fine.
+        # mitmproxy in the container reads through the bind mount as
+        # uid 1000; the host file has to be world-readable for that
+        # read to succeed (parent dir at 0o700 still restricts who
+        # can reach the file on the host). Routes content is not
+        # secret — tokens live in the container's environ — so 0o644
+        # is the right trade-off.
        os.chmod(tmp_path, 0o644)
-        cp = subprocess.run(
-            ["docker", "cp", tmp_path,
-             f"{container}:{EGRESS_ROUTES_IN_CONTAINER}"],
-            capture_output=True, text=True, check=False,
-        )
-        if cp.returncode != 0:
-            raise EgressApplyError(
-                f"failed to copy routes.yaml into {container}: "
-                f"{(cp.stderr or '').strip()}"
-            )
+        os.replace(tmp_path, target)
        sig = subprocess.run(
            ["docker", "kill", "--signal", "HUP", container],
            capture_output=True, text=True, check=False,
@@ -197,11 +202,15 @@ def apply_routes_change(slug: str, new_content: str) -> tuple[str, str]:
                f"failed to SIGHUP {container}: "
                f"{(sig.stderr or '').strip()}"
            )
-    finally:
+    except BaseException:
+        # On any failure pre-rename, drop the tmp file. Post-rename
+        # there's nothing to clean up — `os.replace` is atomic so
+        # either the new file is in place or the old one still is.
        try:
-            Path(tmp_path).unlink()
+            tmp_path.unlink()
        except OSError:
            pass
+        raise

    return before, new_content