diff --git a/claude_bottle/backend/docker/pipelock_apply.py b/claude_bottle/backend/docker/pipelock_apply.py index cfb7f27..a271398 100644 --- a/claude_bottle/backend/docker/pipelock_apply.py +++ b/claude_bottle/backend/docker/pipelock_apply.py @@ -5,7 +5,9 @@ Used by the supervise dashboard when the operator approves a pipelock-block proposal (or runs the operator-initiated `pipelock edit ` verb). Fetches the current pipelock.yaml via `docker exec`, parses it, swaps the api_allowlist with the proposed hosts, -re-renders, writes back via `docker cp`, then `docker restart` so +re-renders, writes back via the bind-mount path, then signals the +bundle supervisor to restart the pipelock daemon (`docker kill +--signal USR1`) so pipelock picks up the new config. v1 uses restart, not SIGHUP — pipelock has no in-process reload @@ -130,19 +132,17 @@ def apply_allowlist_change( 2. Fetch + parse current pipelock.yaml. 3. Replace api_allowlist with the proposed hosts; re-render. 4. Write the new yaml to the bind-mount source. - 5. `docker restart` the bundle so pipelock reloads. - - The restart bounces ALL four daemons inside the bundle, not - just pipelock — pipelock has no in-process reload and the - bundle init re-spawns the four daemons on container restart. - Per-daemon reload would need a supervisor IPC channel (PRD - 0024 open question 1's "eventually" path); the bundle-wide - restart is the v1 trade-off. + 5. `docker kill --signal USR1 ` so the supervisor + restarts the pipelock daemon in place (leaving egress, + git-gate, and supervise running). Pipelock has no + in-process reload; the supervisor's per-daemon restart + keeps the agent's MCP socket alive — a whole-bundle + `docker restart` would bounce supervise too. Returns (before, after) where both are one-per-line allowlist strings (operator-facing format). Raises PipelockApplyError on any failure; the sidecar's existing config stays in place until - the host write succeeds, and the restart is what makes it + the host write succeeds, and the SIGUSR1 is what makes it live.""" new_hosts = parse_allowlist_content(new_allowlist_content) container = sidecar_bundle_container_name(slug) @@ -167,9 +167,9 @@ def apply_allowlist_change( # FILE — same Docker single-file inode issue as egress_apply: # write-temp-then-rename swaps the host inode and leaves the # container's mount pointing at the orphaned old one. Write - # in-place. `docker restart` below picks up the new content - # (and pipelock has no in-process reload anyway, so the - # restart is what makes it live regardless of write atomicity). + # in-place. The SIGUSR1 below makes the new content live + # (pipelock has no in-process reload, so the supervisor + # restarts the pipelock daemon in response). target = _pipelock_yaml_host_path(slug) target.parent.mkdir(parents=True, exist_ok=True) target.write_text(rendered) @@ -177,12 +177,12 @@ def apply_allowlist_change( # fine — but 0o600 matches what prepare wrote. target.chmod(0o600) restart = subprocess.run( - ["docker", "restart", container], + ["docker", "kill", "--signal", "USR1", container], capture_output=True, text=True, check=False, ) if restart.returncode != 0: raise PipelockApplyError( - f"failed to restart {container}: " + f"failed to signal {container} for pipelock restart: " f"{(restart.stderr or '').strip()}" ) diff --git a/claude_bottle/sidecar_init.py b/claude_bottle/sidecar_init.py index 9b08951..88a3a40 100644 --- a/claude_bottle/sidecar_init.py +++ b/claude_bottle/sidecar_init.py @@ -224,6 +224,52 @@ class _Supervisor: return True return False + def restart_daemon(self, daemon_name: str, *, grace: float = 5.0) -> bool: + """Terminate one named child and spawn a fresh one, leaving + the other daemons running. Used by the pipelock-apply path: + pipelock has no in-process reload, so apply_allowlist_change + runs `docker kill --signal USR1 ` after writing the + new yaml; the supervisor catches SIGUSR1 and calls this. + + Behavior: SIGTERM → wait up to `grace` seconds → SIGKILL if + still alive → spawn a replacement under the same DaemonSpec. + The `procs` slot is updated in place so subsequent + forward_signal / shutdown calls reach the new pid. + + Returns True iff a daemon by that name was running and a + replacement spawned; False if no such daemon (the + compose-renderer subset said this bottle doesn't run it).""" + if self.shutdown_at is not None: + _log(f"restart {daemon_name} skipped; supervisor is shutting down") + return False + for idx, (spec, p) in enumerate(self.procs): + if spec.name != daemon_name: + continue + _log(f"restarting {daemon_name}") + if p.poll() is None: + try: + p.terminate() + except ProcessLookupError: + pass + try: + p.wait(timeout=grace) + except subprocess.TimeoutExpired: + _log( + f"{daemon_name} did not exit within {grace:.0f}s " + f"of SIGTERM; SIGKILL" + ) + try: + p.kill() + except ProcessLookupError: + pass + p.wait() + self._logged_dead.discard(daemon_name) + new_proc = _spawn(spec) + self.procs[idx] = (spec, new_proc) + _log(f"{daemon_name} restarted (pid {new_proc.pid})") + return True + return False + def main(argv: Sequence[str] | None = None) -> int: del argv # no flags yet; env-driven only @@ -237,14 +283,18 @@ def main(argv: Sequence[str] | None = None) -> int: signal.signal(signal.SIGTERM, lambda *_: sup.request_shutdown("SIGTERM")) signal.signal(signal.SIGINT, lambda *_: sup.request_shutdown("SIGINT")) - # SIGHUP reload path: routes.yaml + pipelock.yaml hot-reloads - # in egress_apply.py / pipelock_apply.py issue `docker kill - # --signal HUP `. The kernel delivers SIGHUP to PID 1 - # (this supervisor); we forward it to the daemon that knows - # how to reload (egress = mitmdump-reload-addons; pipelock has - # no in-process reload, so the pipelock-apply path uses - # `docker restart` instead). + # SIGHUP reload path: egress_apply.py runs `docker kill + # --signal HUP ` after writing routes.yaml. The kernel + # delivers SIGHUP to PID 1 (this supervisor); forward it to + # mitmdump so it reloads its addon. signal.signal(signal.SIGHUP, lambda *_: sup.forward_signal(signal.SIGHUP, "egress")) + # SIGUSR1 pipelock-restart path: pipelock_apply.py runs + # `docker kill --signal USR1 ` after writing + # pipelock.yaml. Pipelock has no in-process reload, so the + # supervisor restarts the pipelock daemon in place (other + # daemons keep running — specifically supervise, whose MCP + # socket would drop on a whole-container `docker restart`). + signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock")) while not sup.tick(): time.sleep(_POLL_INTERVAL) diff --git a/tests/integration/test_pipelock_apply.py b/tests/integration/test_pipelock_apply.py index 86fbb9c..54523a3 100644 --- a/tests/integration/test_pipelock_apply.py +++ b/tests/integration/test_pipelock_apply.py @@ -35,8 +35,6 @@ from claude_bottle.backend.docker.network import ( from claude_bottle.backend.docker.pipelock import ( PIPELOCK_CA_CERT_IN_CONTAINER, PIPELOCK_CA_KEY_IN_CONTAINER, - PIPELOCK_IMAGE, - PIPELOCK_PORT, DockerPipelockProxy, pipelock_tls_init, ) @@ -47,6 +45,7 @@ from claude_bottle.backend.docker.pipelock_apply import ( fetch_current_yaml, ) from claude_bottle.backend.docker.sidecar_bundle import ( + SIDECAR_BUNDLE_IMAGE, sidecar_bundle_container_name, ) from claude_bottle.yaml_subset import parse_yaml_subset @@ -85,14 +84,12 @@ class TestPipelockApply(unittest.TestCase): shutil.rmtree(pipelock_state_dir(self.slug), ignore_errors=True) def _bring_up(self) -> None: - """Replicates the pre-chunk-3 bring-up sequence (create on - internal network → bind-mount yaml + CAs → attach egress - network → docker start) without going through the deleted - `DockerPipelockProxy.start` helper. The same sequence is - what `docker compose up` does for the pipelock service in - production; this test path keeps the standalone-pipelock - smoke alive so `apply_allowlist_change`'s host-side - write + docker-restart loop has integration coverage. + """Brings up the bundle image with only the pipelock daemon + selected. The bundle's Python supervisor is PID 1, which is + what apply_allowlist_change targets via `docker kill + --signal USR1` — pipelock alone as PID 1 wouldn't survive + SIGUSR1 (default disposition = terminate). This shape is + what runs in production minus the other three daemons. The yaml stages into the production-real `pipelock_state_dir(slug)` (not a private temp dir) so the @@ -109,24 +106,28 @@ class TestPipelockApply(unittest.TestCase): self.egress_net = network_create_egress(self.slug) ca_cert_host, ca_key_host = pipelock_tls_init(state_dir) - # apply_allowlist_change targets sidecar_bundle_container_name - # (chunk 5 flipped the bundle to the only shape). Bringing the - # standalone pipelock up under that name keeps this test - # exercising the real production code path; the bundle's - # other three daemons aren't running here, but the - # apply/fetch code only touches /etc/pipelock.yaml + the - # pipelock binary, so the lighter setup is fine. + # Ensure the bundle image is built. compose normally builds + # this lazily; we go through `docker run` here so we have to + # do it ourselves. Idempotent — cached layers make repeats + # fast. + repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + subprocess.run( + ["docker", "build", + "-t", SIDECAR_BUNDLE_IMAGE, + "-f", "Dockerfile.sidecars", "."], + cwd=repo_root, check=True, capture_output=True, + ) + self.sidecar_name = sidecar_bundle_container_name(self.slug) subprocess.run( ["docker", "create", "--name", self.sidecar_name, "--network", self.internal_net, + "-e", "CLAUDE_BOTTLE_SIDECAR_DAEMONS=pipelock", "-v", f"{prep.yaml_path}:/etc/pipelock.yaml:ro", "-v", f"{ca_cert_host}:{PIPELOCK_CA_CERT_IN_CONTAINER}:ro", "-v", f"{ca_key_host}:{PIPELOCK_CA_KEY_IN_CONTAINER}:ro", - PIPELOCK_IMAGE, - "run", "--config", "/etc/pipelock.yaml", - "--listen", f"0.0.0.0:{PIPELOCK_PORT}"], + SIDECAR_BUNDLE_IMAGE], check=True, capture_output=True, ) subprocess.run( diff --git a/tests/unit/test_sidecar_init.py b/tests/unit/test_sidecar_init.py index 239bb0c..b750605 100644 --- a/tests/unit/test_sidecar_init.py +++ b/tests/unit/test_sidecar_init.py @@ -227,6 +227,53 @@ class TestSupervisor(unittest.TestCase): sup.request_shutdown(reason="cleanup") self._drive(sup) + def test_restart_daemon_replaces_in_place(self): + # pipelock_apply.py sends SIGUSR1 to the bundle, supervisor + # restarts the pipelock daemon, supervise (the other + # daemon's MCP server in production) stays up. + specs = [ + _DaemonSpec("pipelock", ("/bin/sleep", "30")), + _DaemonSpec("supervise", ("/bin/sleep", "30")), + ] + sup = _Supervisor(specs) + sup.start_all() + time.sleep(0.1) + old_pipelock_pid = sup.procs[0][1].pid + supervise_pid = sup.procs[1][1].pid + + ok = sup.restart_daemon("pipelock", grace=2.0) + self.assertTrue(ok) + + # Pipelock got a fresh PID — different process. + new_pipelock_pid = sup.procs[0][1].pid + self.assertNotEqual(old_pipelock_pid, new_pipelock_pid) + # Supervise's PID is unchanged — it was NOT restarted. + self.assertEqual(supervise_pid, sup.procs[1][1].pid) + self.assertIsNone(sup.procs[1][1].poll(), + "supervise should still be running") + + sup.request_shutdown(reason="cleanup") + self._drive(sup) + + def test_restart_unknown_daemon_no_op(self): + specs = [_DaemonSpec("a", ("/bin/sleep", "30"))] + sup = _Supervisor(specs) + sup.start_all() + ok = sup.restart_daemon("ghost") + self.assertFalse(ok) + sup.request_shutdown(reason="cleanup") + self._drive(sup) + + def test_restart_during_shutdown_is_no_op(self): + specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))] + sup = _Supervisor(specs) + sup.start_all() + sup.request_shutdown(reason="test") + ok = sup.restart_daemon("pipelock") + self.assertFalse(ok, + "must not respawn a daemon during teardown") + self._drive(sup) + def test_shutdown_after_start_terminates_children(self): # Two long-running children. Caller requests shutdown; # both should receive SIGTERM and exit. exit_code() is