feat(launch): switch start to docker compose project per bottle
PRD 0018 chunk 3. Each instance is now one `docker compose` project:
- launch.py renders the compose spec via chunk-1's
bottle_plan_to_compose, writes it to state/<slug>/docker-compose.yml,
`docker compose up -d`s, and (on teardown) dumps
`docker compose logs --no-color --timestamps` to
state/<slug>/compose.log before `docker compose down`.
- Networks are pre-created (`docker network create --internal` +
user-defined bridge) so pipelock yaml can know the internal CIDR
before compose-up. Compose references them with `external: true`;
the launch step's ExitStack still owns network removal.
- Agent still runs `sleep infinity`; claude reaches it via
`docker exec -it` exactly like before (per the PRD's resolved
TTY question).
- metadata.json grows a `compose_project` field so dashboard /
cleanup tooling can derive compose invocations without
re-deriving the slug.
Security follow-ups from chunk-2 review:
(b) CA private keys: pipelock + egress ca-key.pem land at 0o600
explicitly. The mitmproxy cert+key concat stays 0o644 because
the egress container's uid-1000 user reads it through the
bind mount; parent dir at 0o700 still restricts host-side
reach.
(c) Apply atomicity: egress_apply + pipelock_apply switch from
`docker cp` to host-side write-temp-then-rename on the
bind-mount source. POSIX rename is atomic on the same
filesystem, so a sidecar SIGHUP racing the apply can't see
a half-written routes.yaml / pipelock.yaml.
Per-sidecar Docker{Sidecar}.start/stop methods stay in place — the
integration test suite drives them directly to validate each image
in isolation, which is still useful. launch.py no longer calls
them; a follow-up chunk can prune if the integration tests move to
the compose lifecycle.
git-gate entrypoint's chmod 600 on the keyfile + known_hosts now
tolerates EROFS (`|| true`) — the host SSH key is already 0600
(SSH refuses to load otherwise), so the inside-container chmod
was already a no-op in the docker-cp path and now just needs to
not error on the read-only bind mount.
422 unit tests pass; supervise integration test passes; end-to-end
`./cli.py start implementer` brings up the project, attaches,
captures full merged logs on teardown, and reaps all containers +
networks.
This commit is contained in:
@@ -31,6 +31,7 @@ from pathlib import Path
|
||||
|
||||
from ...egress import EGRESS_ROUTES_IN_CONTAINER
|
||||
from ...egress_addon_core import load_routes
|
||||
from .bottle_state import egress_state_dir
|
||||
from .egress import egress_container_name
|
||||
from .pipelock_apply import (
|
||||
PipelockApplyError,
|
||||
@@ -41,6 +42,12 @@ from .pipelock_apply import (
|
||||
)
|
||||
|
||||
|
||||
def _egress_routes_host_path(slug: str) -> Path:
|
||||
"""The bind-mount source for the egress sidecar's routes.yaml.
|
||||
Must match what egress.prepare wrote at chunk-2 paths."""
|
||||
return egress_state_dir(slug) / "egress_routes.yaml"
|
||||
|
||||
|
||||
class EgressApplyError(RuntimeError):
|
||||
"""Raised when fetch / apply fails. Caller renders to the
|
||||
operator; does not crash the dashboard."""
|
||||
@@ -163,31 +170,29 @@ def apply_routes_change(slug: str, new_content: str) -> tuple[str, str]:
|
||||
# and the operator gets a clear error about the half-state.
|
||||
_mirror_hosts_to_pipelock(slug, _hosts_in_routes(new_content))
|
||||
|
||||
fd, tmp_path = tempfile.mkstemp(prefix="cb-routes.", suffix=".yaml")
|
||||
# PRD 0018 chunk 3 + security item (c): routes.yaml is bind-
|
||||
# mounted into the egress container, so the write target is the
|
||||
# host path the sidecar reads through the mount. POSIX
|
||||
# rename-onto-self is atomic on the same filesystem, so a sidecar
|
||||
# SIGHUP racing the apply can never observe a half-written file —
|
||||
# it sees either the old bytes or the new ones.
|
||||
target = _egress_routes_host_path(slug)
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
fd, tmp_path_str = tempfile.mkstemp(
|
||||
prefix=".egress_routes.", suffix=".yaml.tmp", dir=str(target.parent),
|
||||
)
|
||||
tmp_path = Path(tmp_path_str)
|
||||
try:
|
||||
with os.fdopen(fd, "w") as f:
|
||||
f.write(new_content)
|
||||
# mkstemp creates the file with mode 0600. `docker cp`
|
||||
# preserves mode + host uid into the container, so without
|
||||
# chmod the file lands as 0600 owned by the host user's uid,
|
||||
# which inside the container is not mitmproxy (uid 1000) —
|
||||
# the addon's reload then fails with PermissionError on the
|
||||
# SIGHUP-triggered re-read and the old routes table stays in
|
||||
# memory. Bump to 0644 so mitmproxy can read it post-cp;
|
||||
# the host stage_dir doesn't apply to this tmp file but the
|
||||
# content isn't secret (no tokens — those live in the
|
||||
# container's environ), so 0644 in /tmp is fine.
|
||||
# mitmproxy in the container reads through the bind mount as
|
||||
# uid 1000; the host file has to be world-readable for that
|
||||
# read to succeed (parent dir at 0o700 still restricts who
|
||||
# can reach the file on the host). Routes content is not
|
||||
# secret — tokens live in the container's environ — so 0o644
|
||||
# is the right trade-off.
|
||||
os.chmod(tmp_path, 0o644)
|
||||
cp = subprocess.run(
|
||||
["docker", "cp", tmp_path,
|
||||
f"{container}:{EGRESS_ROUTES_IN_CONTAINER}"],
|
||||
capture_output=True, text=True, check=False,
|
||||
)
|
||||
if cp.returncode != 0:
|
||||
raise EgressApplyError(
|
||||
f"failed to copy routes.yaml into {container}: "
|
||||
f"{(cp.stderr or '').strip()}"
|
||||
)
|
||||
os.replace(tmp_path, target)
|
||||
sig = subprocess.run(
|
||||
["docker", "kill", "--signal", "HUP", container],
|
||||
capture_output=True, text=True, check=False,
|
||||
@@ -197,11 +202,15 @@ def apply_routes_change(slug: str, new_content: str) -> tuple[str, str]:
|
||||
f"failed to SIGHUP {container}: "
|
||||
f"{(sig.stderr or '').strip()}"
|
||||
)
|
||||
finally:
|
||||
except BaseException:
|
||||
# On any failure pre-rename, drop the tmp file. Post-rename
|
||||
# there's nothing to clean up — `os.replace` is atomic so
|
||||
# either the new file is in place or the old one still is.
|
||||
try:
|
||||
Path(tmp_path).unlink()
|
||||
tmp_path.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
raise
|
||||
|
||||
return before, new_content
|
||||
|
||||
|
||||
Reference in New Issue
Block a user