Merge pull request 'fix(sidecars): per-daemon pipelock restart keeps supervise socket alive' (#61) from fix-pipelock-restart-keeps-bundle-up into main
test / unit (push) Successful in 21s
test / integration (push) Successful in 42s

This commit was merged in pull request #61.
This commit is contained in:
2026-05-27 02:14:33 -04:00
4 changed files with 140 additions and 42 deletions
+15 -15
View File
@@ -5,7 +5,9 @@ Used by the supervise dashboard when the operator approves a
pipelock-block proposal (or runs the operator-initiated `pipelock pipelock-block proposal (or runs the operator-initiated `pipelock
edit <bottle>` verb). Fetches the current pipelock.yaml via `docker edit <bottle>` verb). Fetches the current pipelock.yaml via `docker
exec`, parses it, swaps the api_allowlist with the proposed hosts, exec`, parses it, swaps the api_allowlist with the proposed hosts,
re-renders, writes back via `docker cp`, then `docker restart` so re-renders, writes back via the bind-mount path, then signals the
bundle supervisor to restart the pipelock daemon (`docker kill
--signal USR1`) so
pipelock picks up the new config. pipelock picks up the new config.
v1 uses restart, not SIGHUP — pipelock has no in-process reload v1 uses restart, not SIGHUP — pipelock has no in-process reload
@@ -130,19 +132,17 @@ def apply_allowlist_change(
2. Fetch + parse current pipelock.yaml. 2. Fetch + parse current pipelock.yaml.
3. Replace api_allowlist with the proposed hosts; re-render. 3. Replace api_allowlist with the proposed hosts; re-render.
4. Write the new yaml to the bind-mount source. 4. Write the new yaml to the bind-mount source.
5. `docker restart` the bundle so pipelock reloads. 5. `docker kill --signal USR1 <bundle>` so the supervisor
restarts the pipelock daemon in place (leaving egress,
The restart bounces ALL four daemons inside the bundle, not git-gate, and supervise running). Pipelock has no
just pipelock — pipelock has no in-process reload and the in-process reload; the supervisor's per-daemon restart
bundle init re-spawns the four daemons on container restart. keeps the agent's MCP socket alive — a whole-bundle
Per-daemon reload would need a supervisor IPC channel (PRD `docker restart` would bounce supervise too.
0024 open question 1's "eventually" path); the bundle-wide
restart is the v1 trade-off.
Returns (before, after) where both are one-per-line allowlist Returns (before, after) where both are one-per-line allowlist
strings (operator-facing format). Raises PipelockApplyError on strings (operator-facing format). Raises PipelockApplyError on
any failure; the sidecar's existing config stays in place until any failure; the sidecar's existing config stays in place until
the host write succeeds, and the restart is what makes it the host write succeeds, and the SIGUSR1 is what makes it
live.""" live."""
new_hosts = parse_allowlist_content(new_allowlist_content) new_hosts = parse_allowlist_content(new_allowlist_content)
container = sidecar_bundle_container_name(slug) container = sidecar_bundle_container_name(slug)
@@ -167,9 +167,9 @@ def apply_allowlist_change(
# FILE — same Docker single-file inode issue as egress_apply: # FILE — same Docker single-file inode issue as egress_apply:
# write-temp-then-rename swaps the host inode and leaves the # write-temp-then-rename swaps the host inode and leaves the
# container's mount pointing at the orphaned old one. Write # container's mount pointing at the orphaned old one. Write
# in-place. `docker restart` below picks up the new content # in-place. The SIGUSR1 below makes the new content live
# (and pipelock has no in-process reload anyway, so the # (pipelock has no in-process reload, so the supervisor
# restart is what makes it live regardless of write atomicity). # restarts the pipelock daemon in response).
target = _pipelock_yaml_host_path(slug) target = _pipelock_yaml_host_path(slug)
target.parent.mkdir(parents=True, exist_ok=True) target.parent.mkdir(parents=True, exist_ok=True)
target.write_text(rendered) target.write_text(rendered)
@@ -177,12 +177,12 @@ def apply_allowlist_change(
# fine — but 0o600 matches what prepare wrote. # fine — but 0o600 matches what prepare wrote.
target.chmod(0o600) target.chmod(0o600)
restart = subprocess.run( restart = subprocess.run(
["docker", "restart", container], ["docker", "kill", "--signal", "USR1", container],
capture_output=True, text=True, check=False, capture_output=True, text=True, check=False,
) )
if restart.returncode != 0: if restart.returncode != 0:
raise PipelockApplyError( raise PipelockApplyError(
f"failed to restart {container}: " f"failed to signal {container} for pipelock restart: "
f"{(restart.stderr or '').strip()}" f"{(restart.stderr or '').strip()}"
) )
+57 -7
View File
@@ -224,6 +224,52 @@ class _Supervisor:
return True return True
return False return False
def restart_daemon(self, daemon_name: str, *, grace: float = 5.0) -> bool:
"""Terminate one named child and spawn a fresh one, leaving
the other daemons running. Used by the pipelock-apply path:
pipelock has no in-process reload, so apply_allowlist_change
runs `docker kill --signal USR1 <bundle>` after writing the
new yaml; the supervisor catches SIGUSR1 and calls this.
Behavior: SIGTERM → wait up to `grace` seconds → SIGKILL if
still alive → spawn a replacement under the same DaemonSpec.
The `procs` slot is updated in place so subsequent
forward_signal / shutdown calls reach the new pid.
Returns True iff a daemon by that name was running and a
replacement spawned; False if no such daemon (the
compose-renderer subset said this bottle doesn't run it)."""
if self.shutdown_at is not None:
_log(f"restart {daemon_name} skipped; supervisor is shutting down")
return False
for idx, (spec, p) in enumerate(self.procs):
if spec.name != daemon_name:
continue
_log(f"restarting {daemon_name}")
if p.poll() is None:
try:
p.terminate()
except ProcessLookupError:
pass
try:
p.wait(timeout=grace)
except subprocess.TimeoutExpired:
_log(
f"{daemon_name} did not exit within {grace:.0f}s "
f"of SIGTERM; SIGKILL"
)
try:
p.kill()
except ProcessLookupError:
pass
p.wait()
self._logged_dead.discard(daemon_name)
new_proc = _spawn(spec)
self.procs[idx] = (spec, new_proc)
_log(f"{daemon_name} restarted (pid {new_proc.pid})")
return True
return False
def main(argv: Sequence[str] | None = None) -> int: def main(argv: Sequence[str] | None = None) -> int:
del argv # no flags yet; env-driven only del argv # no flags yet; env-driven only
@@ -237,14 +283,18 @@ def main(argv: Sequence[str] | None = None) -> int:
signal.signal(signal.SIGTERM, lambda *_: sup.request_shutdown("SIGTERM")) signal.signal(signal.SIGTERM, lambda *_: sup.request_shutdown("SIGTERM"))
signal.signal(signal.SIGINT, lambda *_: sup.request_shutdown("SIGINT")) signal.signal(signal.SIGINT, lambda *_: sup.request_shutdown("SIGINT"))
# SIGHUP reload path: routes.yaml + pipelock.yaml hot-reloads # SIGHUP reload path: egress_apply.py runs `docker kill
# in egress_apply.py / pipelock_apply.py issue `docker kill # --signal HUP <bundle>` after writing routes.yaml. The kernel
# --signal HUP <bundle>`. The kernel delivers SIGHUP to PID 1 # delivers SIGHUP to PID 1 (this supervisor); forward it to
# (this supervisor); we forward it to the daemon that knows # mitmdump so it reloads its addon.
# how to reload (egress = mitmdump-reload-addons; pipelock has
# no in-process reload, so the pipelock-apply path uses
# `docker restart` instead).
signal.signal(signal.SIGHUP, lambda *_: sup.forward_signal(signal.SIGHUP, "egress")) signal.signal(signal.SIGHUP, lambda *_: sup.forward_signal(signal.SIGHUP, "egress"))
# SIGUSR1 pipelock-restart path: pipelock_apply.py runs
# `docker kill --signal USR1 <bundle>` after writing
# pipelock.yaml. Pipelock has no in-process reload, so the
# supervisor restarts the pipelock daemon in place (other
# daemons keep running — specifically supervise, whose MCP
# socket would drop on a whole-container `docker restart`).
signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock"))
while not sup.tick(): while not sup.tick():
time.sleep(_POLL_INTERVAL) time.sleep(_POLL_INTERVAL)
+21 -20
View File
@@ -35,8 +35,6 @@ from claude_bottle.backend.docker.network import (
from claude_bottle.backend.docker.pipelock import ( from claude_bottle.backend.docker.pipelock import (
PIPELOCK_CA_CERT_IN_CONTAINER, PIPELOCK_CA_CERT_IN_CONTAINER,
PIPELOCK_CA_KEY_IN_CONTAINER, PIPELOCK_CA_KEY_IN_CONTAINER,
PIPELOCK_IMAGE,
PIPELOCK_PORT,
DockerPipelockProxy, DockerPipelockProxy,
pipelock_tls_init, pipelock_tls_init,
) )
@@ -47,6 +45,7 @@ from claude_bottle.backend.docker.pipelock_apply import (
fetch_current_yaml, fetch_current_yaml,
) )
from claude_bottle.backend.docker.sidecar_bundle import ( from claude_bottle.backend.docker.sidecar_bundle import (
SIDECAR_BUNDLE_IMAGE,
sidecar_bundle_container_name, sidecar_bundle_container_name,
) )
from claude_bottle.yaml_subset import parse_yaml_subset from claude_bottle.yaml_subset import parse_yaml_subset
@@ -85,14 +84,12 @@ class TestPipelockApply(unittest.TestCase):
shutil.rmtree(pipelock_state_dir(self.slug), ignore_errors=True) shutil.rmtree(pipelock_state_dir(self.slug), ignore_errors=True)
def _bring_up(self) -> None: def _bring_up(self) -> None:
"""Replicates the pre-chunk-3 bring-up sequence (create on """Brings up the bundle image with only the pipelock daemon
internal network → bind-mount yaml + CAs → attach egress selected. The bundle's Python supervisor is PID 1, which is
network → docker start) without going through the deleted what apply_allowlist_change targets via `docker kill
`DockerPipelockProxy.start` helper. The same sequence is --signal USR1` — pipelock alone as PID 1 wouldn't survive
what `docker compose up` does for the pipelock service in SIGUSR1 (default disposition = terminate). This shape is
production; this test path keeps the standalone-pipelock what runs in production minus the other three daemons.
smoke alive so `apply_allowlist_change`'s host-side
write + docker-restart loop has integration coverage.
The yaml stages into the production-real The yaml stages into the production-real
`pipelock_state_dir(slug)` (not a private temp dir) so the `pipelock_state_dir(slug)` (not a private temp dir) so the
@@ -109,24 +106,28 @@ class TestPipelockApply(unittest.TestCase):
self.egress_net = network_create_egress(self.slug) self.egress_net = network_create_egress(self.slug)
ca_cert_host, ca_key_host = pipelock_tls_init(state_dir) ca_cert_host, ca_key_host = pipelock_tls_init(state_dir)
# apply_allowlist_change targets sidecar_bundle_container_name # Ensure the bundle image is built. compose normally builds
# (chunk 5 flipped the bundle to the only shape). Bringing the # this lazily; we go through `docker run` here so we have to
# standalone pipelock up under that name keeps this test # do it ourselves. Idempotent — cached layers make repeats
# exercising the real production code path; the bundle's # fast.
# other three daemons aren't running here, but the repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
# apply/fetch code only touches /etc/pipelock.yaml + the subprocess.run(
# pipelock binary, so the lighter setup is fine. ["docker", "build",
"-t", SIDECAR_BUNDLE_IMAGE,
"-f", "Dockerfile.sidecars", "."],
cwd=repo_root, check=True, capture_output=True,
)
self.sidecar_name = sidecar_bundle_container_name(self.slug) self.sidecar_name = sidecar_bundle_container_name(self.slug)
subprocess.run( subprocess.run(
["docker", "create", ["docker", "create",
"--name", self.sidecar_name, "--name", self.sidecar_name,
"--network", self.internal_net, "--network", self.internal_net,
"-e", "CLAUDE_BOTTLE_SIDECAR_DAEMONS=pipelock",
"-v", f"{prep.yaml_path}:/etc/pipelock.yaml:ro", "-v", f"{prep.yaml_path}:/etc/pipelock.yaml:ro",
"-v", f"{ca_cert_host}:{PIPELOCK_CA_CERT_IN_CONTAINER}:ro", "-v", f"{ca_cert_host}:{PIPELOCK_CA_CERT_IN_CONTAINER}:ro",
"-v", f"{ca_key_host}:{PIPELOCK_CA_KEY_IN_CONTAINER}:ro", "-v", f"{ca_key_host}:{PIPELOCK_CA_KEY_IN_CONTAINER}:ro",
PIPELOCK_IMAGE, SIDECAR_BUNDLE_IMAGE],
"run", "--config", "/etc/pipelock.yaml",
"--listen", f"0.0.0.0:{PIPELOCK_PORT}"],
check=True, capture_output=True, check=True, capture_output=True,
) )
subprocess.run( subprocess.run(
+47
View File
@@ -227,6 +227,53 @@ class TestSupervisor(unittest.TestCase):
sup.request_shutdown(reason="cleanup") sup.request_shutdown(reason="cleanup")
self._drive(sup) self._drive(sup)
def test_restart_daemon_replaces_in_place(self):
# pipelock_apply.py sends SIGUSR1 to the bundle, supervisor
# restarts the pipelock daemon, supervise (the other
# daemon's MCP server in production) stays up.
specs = [
_DaemonSpec("pipelock", ("/bin/sleep", "30")),
_DaemonSpec("supervise", ("/bin/sleep", "30")),
]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
old_pipelock_pid = sup.procs[0][1].pid
supervise_pid = sup.procs[1][1].pid
ok = sup.restart_daemon("pipelock", grace=2.0)
self.assertTrue(ok)
# Pipelock got a fresh PID — different process.
new_pipelock_pid = sup.procs[0][1].pid
self.assertNotEqual(old_pipelock_pid, new_pipelock_pid)
# Supervise's PID is unchanged — it was NOT restarted.
self.assertEqual(supervise_pid, sup.procs[1][1].pid)
self.assertIsNone(sup.procs[1][1].poll(),
"supervise should still be running")
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_restart_unknown_daemon_no_op(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
ok = sup.restart_daemon("ghost")
self.assertFalse(ok)
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_restart_during_shutdown_is_no_op(self):
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
sup.request_shutdown(reason="test")
ok = sup.restart_daemon("pipelock")
self.assertFalse(ok,
"must not respawn a daemon during teardown")
self._drive(sup)
def test_shutdown_after_start_terminates_children(self): def test_shutdown_after_start_terminates_children(self):
# Two long-running children. Caller requests shutdown; # Two long-running children. Caller requests shutdown;
# both should receive SIGTERM and exit. exit_code() is # both should receive SIGTERM and exit. exit_code() is