fix(sidecars): per-daemon pipelock restart keeps supervise socket alive
test / unit (pull_request) Successful in 21s
test / integration (pull_request) Successful in 43s

`apply_allowlist_change` used `docker restart <bundle>` to make
pipelock reload, which bounced ALL four daemons — including
supervise, whose MCP socket the agent's claude-code client had
open. That dropped the connection. A second apply works because
supervise has come back up by then.

Fix: per-daemon restart via SIGUSR1.

- New `_Supervisor.restart_daemon(name)` terminates one named
  child and spawns a replacement in place. Other daemons keep
  running.
- main() wires SIGUSR1 → `restart_daemon("pipelock")`. Pipelock
  has no in-process reload, so this is its analog of egress's
  SIGHUP-reload-addon path. Pipelock is the only daemon that
  currently needs hot-config reload via restart; if others
  acquire the need, add a new signal.
- `apply_allowlist_change` now `docker kill --signal USR1
  <bundle>` instead of `docker restart`. Supervise / egress /
  git-gate keep running across the apply.

Tests:
- New `_Supervisor.restart_daemon` cases: replaces in place
  (different pid post-restart, sibling daemon unchanged),
  unknown name is a no-op, restart-during-shutdown is a no-op.
- `test_pipelock_apply` rewritten to bring up the bundle image
  with `CLAUDE_BOTTLE_SIDECAR_DAEMONS=pipelock` so the
  supervisor is PID 1 and handles SIGUSR1. The previous
  standalone-pipelock setup wouldn't survive SIGUSR1 (pipelock
  default disposition is terminate). Test builds the bundle
  image in setUpClass (cached layers make repeat runs fast).

531 tests passing locally (unit + integration).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 02:12:37 -04:00
parent c48f791d7d
commit 5b9ceaaaee
4 changed files with 140 additions and 42 deletions
+15 -15
View File
@@ -5,7 +5,9 @@ Used by the supervise dashboard when the operator approves a
pipelock-block proposal (or runs the operator-initiated `pipelock
edit <bottle>` verb). Fetches the current pipelock.yaml via `docker
exec`, parses it, swaps the api_allowlist with the proposed hosts,
re-renders, writes back via `docker cp`, then `docker restart` so
re-renders, writes back via the bind-mount path, then signals the
bundle supervisor to restart the pipelock daemon (`docker kill
--signal USR1`) so
pipelock picks up the new config.
v1 uses restart, not SIGHUP — pipelock has no in-process reload
@@ -130,19 +132,17 @@ def apply_allowlist_change(
2. Fetch + parse current pipelock.yaml.
3. Replace api_allowlist with the proposed hosts; re-render.
4. Write the new yaml to the bind-mount source.
5. `docker restart` the bundle so pipelock reloads.
The restart bounces ALL four daemons inside the bundle, not
just pipelock — pipelock has no in-process reload and the
bundle init re-spawns the four daemons on container restart.
Per-daemon reload would need a supervisor IPC channel (PRD
0024 open question 1's "eventually" path); the bundle-wide
restart is the v1 trade-off.
5. `docker kill --signal USR1 <bundle>` so the supervisor
restarts the pipelock daemon in place (leaving egress,
git-gate, and supervise running). Pipelock has no
in-process reload; the supervisor's per-daemon restart
keeps the agent's MCP socket alive — a whole-bundle
`docker restart` would bounce supervise too.
Returns (before, after) where both are one-per-line allowlist
strings (operator-facing format). Raises PipelockApplyError on
any failure; the sidecar's existing config stays in place until
the host write succeeds, and the restart is what makes it
the host write succeeds, and the SIGUSR1 is what makes it
live."""
new_hosts = parse_allowlist_content(new_allowlist_content)
container = sidecar_bundle_container_name(slug)
@@ -167,9 +167,9 @@ def apply_allowlist_change(
# FILE — same Docker single-file inode issue as egress_apply:
# write-temp-then-rename swaps the host inode and leaves the
# container's mount pointing at the orphaned old one. Write
# in-place. `docker restart` below picks up the new content
# (and pipelock has no in-process reload anyway, so the
# restart is what makes it live regardless of write atomicity).
# in-place. The SIGUSR1 below makes the new content live
# (pipelock has no in-process reload, so the supervisor
# restarts the pipelock daemon in response).
target = _pipelock_yaml_host_path(slug)
target.parent.mkdir(parents=True, exist_ok=True)
target.write_text(rendered)
@@ -177,12 +177,12 @@ def apply_allowlist_change(
# fine — but 0o600 matches what prepare wrote.
target.chmod(0o600)
restart = subprocess.run(
["docker", "restart", container],
["docker", "kill", "--signal", "USR1", container],
capture_output=True, text=True, check=False,
)
if restart.returncode != 0:
raise PipelockApplyError(
f"failed to restart {container}: "
f"failed to signal {container} for pipelock restart: "
f"{(restart.stderr or '').strip()}"
)
+57 -7
View File
@@ -224,6 +224,52 @@ class _Supervisor:
return True
return False
def restart_daemon(self, daemon_name: str, *, grace: float = 5.0) -> bool:
"""Terminate one named child and spawn a fresh one, leaving
the other daemons running. Used by the pipelock-apply path:
pipelock has no in-process reload, so apply_allowlist_change
runs `docker kill --signal USR1 <bundle>` after writing the
new yaml; the supervisor catches SIGUSR1 and calls this.
Behavior: SIGTERM → wait up to `grace` seconds → SIGKILL if
still alive → spawn a replacement under the same DaemonSpec.
The `procs` slot is updated in place so subsequent
forward_signal / shutdown calls reach the new pid.
Returns True iff a daemon by that name was running and a
replacement spawned; False if no such daemon (the
compose-renderer subset said this bottle doesn't run it)."""
if self.shutdown_at is not None:
_log(f"restart {daemon_name} skipped; supervisor is shutting down")
return False
for idx, (spec, p) in enumerate(self.procs):
if spec.name != daemon_name:
continue
_log(f"restarting {daemon_name}")
if p.poll() is None:
try:
p.terminate()
except ProcessLookupError:
pass
try:
p.wait(timeout=grace)
except subprocess.TimeoutExpired:
_log(
f"{daemon_name} did not exit within {grace:.0f}s "
f"of SIGTERM; SIGKILL"
)
try:
p.kill()
except ProcessLookupError:
pass
p.wait()
self._logged_dead.discard(daemon_name)
new_proc = _spawn(spec)
self.procs[idx] = (spec, new_proc)
_log(f"{daemon_name} restarted (pid {new_proc.pid})")
return True
return False
def main(argv: Sequence[str] | None = None) -> int:
del argv # no flags yet; env-driven only
@@ -237,14 +283,18 @@ def main(argv: Sequence[str] | None = None) -> int:
signal.signal(signal.SIGTERM, lambda *_: sup.request_shutdown("SIGTERM"))
signal.signal(signal.SIGINT, lambda *_: sup.request_shutdown("SIGINT"))
# SIGHUP reload path: routes.yaml + pipelock.yaml hot-reloads
# in egress_apply.py / pipelock_apply.py issue `docker kill
# --signal HUP <bundle>`. The kernel delivers SIGHUP to PID 1
# (this supervisor); we forward it to the daemon that knows
# how to reload (egress = mitmdump-reload-addons; pipelock has
# no in-process reload, so the pipelock-apply path uses
# `docker restart` instead).
# SIGHUP reload path: egress_apply.py runs `docker kill
# --signal HUP <bundle>` after writing routes.yaml. The kernel
# delivers SIGHUP to PID 1 (this supervisor); forward it to
# mitmdump so it reloads its addon.
signal.signal(signal.SIGHUP, lambda *_: sup.forward_signal(signal.SIGHUP, "egress"))
# SIGUSR1 pipelock-restart path: pipelock_apply.py runs
# `docker kill --signal USR1 <bundle>` after writing
# pipelock.yaml. Pipelock has no in-process reload, so the
# supervisor restarts the pipelock daemon in place (other
# daemons keep running — specifically supervise, whose MCP
# socket would drop on a whole-container `docker restart`).
signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock"))
while not sup.tick():
time.sleep(_POLL_INTERVAL)
+21 -20
View File
@@ -35,8 +35,6 @@ from claude_bottle.backend.docker.network import (
from claude_bottle.backend.docker.pipelock import (
PIPELOCK_CA_CERT_IN_CONTAINER,
PIPELOCK_CA_KEY_IN_CONTAINER,
PIPELOCK_IMAGE,
PIPELOCK_PORT,
DockerPipelockProxy,
pipelock_tls_init,
)
@@ -47,6 +45,7 @@ from claude_bottle.backend.docker.pipelock_apply import (
fetch_current_yaml,
)
from claude_bottle.backend.docker.sidecar_bundle import (
SIDECAR_BUNDLE_IMAGE,
sidecar_bundle_container_name,
)
from claude_bottle.yaml_subset import parse_yaml_subset
@@ -85,14 +84,12 @@ class TestPipelockApply(unittest.TestCase):
shutil.rmtree(pipelock_state_dir(self.slug), ignore_errors=True)
def _bring_up(self) -> None:
"""Replicates the pre-chunk-3 bring-up sequence (create on
internal network → bind-mount yaml + CAs → attach egress
network → docker start) without going through the deleted
`DockerPipelockProxy.start` helper. The same sequence is
what `docker compose up` does for the pipelock service in
production; this test path keeps the standalone-pipelock
smoke alive so `apply_allowlist_change`'s host-side
write + docker-restart loop has integration coverage.
"""Brings up the bundle image with only the pipelock daemon
selected. The bundle's Python supervisor is PID 1, which is
what apply_allowlist_change targets via `docker kill
--signal USR1` — pipelock alone as PID 1 wouldn't survive
SIGUSR1 (default disposition = terminate). This shape is
what runs in production minus the other three daemons.
The yaml stages into the production-real
`pipelock_state_dir(slug)` (not a private temp dir) so the
@@ -109,24 +106,28 @@ class TestPipelockApply(unittest.TestCase):
self.egress_net = network_create_egress(self.slug)
ca_cert_host, ca_key_host = pipelock_tls_init(state_dir)
# apply_allowlist_change targets sidecar_bundle_container_name
# (chunk 5 flipped the bundle to the only shape). Bringing the
# standalone pipelock up under that name keeps this test
# exercising the real production code path; the bundle's
# other three daemons aren't running here, but the
# apply/fetch code only touches /etc/pipelock.yaml + the
# pipelock binary, so the lighter setup is fine.
# Ensure the bundle image is built. compose normally builds
# this lazily; we go through `docker run` here so we have to
# do it ourselves. Idempotent — cached layers make repeats
# fast.
repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
subprocess.run(
["docker", "build",
"-t", SIDECAR_BUNDLE_IMAGE,
"-f", "Dockerfile.sidecars", "."],
cwd=repo_root, check=True, capture_output=True,
)
self.sidecar_name = sidecar_bundle_container_name(self.slug)
subprocess.run(
["docker", "create",
"--name", self.sidecar_name,
"--network", self.internal_net,
"-e", "CLAUDE_BOTTLE_SIDECAR_DAEMONS=pipelock",
"-v", f"{prep.yaml_path}:/etc/pipelock.yaml:ro",
"-v", f"{ca_cert_host}:{PIPELOCK_CA_CERT_IN_CONTAINER}:ro",
"-v", f"{ca_key_host}:{PIPELOCK_CA_KEY_IN_CONTAINER}:ro",
PIPELOCK_IMAGE,
"run", "--config", "/etc/pipelock.yaml",
"--listen", f"0.0.0.0:{PIPELOCK_PORT}"],
SIDECAR_BUNDLE_IMAGE],
check=True, capture_output=True,
)
subprocess.run(
+47
View File
@@ -227,6 +227,53 @@ class TestSupervisor(unittest.TestCase):
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_restart_daemon_replaces_in_place(self):
# pipelock_apply.py sends SIGUSR1 to the bundle, supervisor
# restarts the pipelock daemon, supervise (the other
# daemon's MCP server in production) stays up.
specs = [
_DaemonSpec("pipelock", ("/bin/sleep", "30")),
_DaemonSpec("supervise", ("/bin/sleep", "30")),
]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
old_pipelock_pid = sup.procs[0][1].pid
supervise_pid = sup.procs[1][1].pid
ok = sup.restart_daemon("pipelock", grace=2.0)
self.assertTrue(ok)
# Pipelock got a fresh PID — different process.
new_pipelock_pid = sup.procs[0][1].pid
self.assertNotEqual(old_pipelock_pid, new_pipelock_pid)
# Supervise's PID is unchanged — it was NOT restarted.
self.assertEqual(supervise_pid, sup.procs[1][1].pid)
self.assertIsNone(sup.procs[1][1].poll(),
"supervise should still be running")
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_restart_unknown_daemon_no_op(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
ok = sup.restart_daemon("ghost")
self.assertFalse(ok)
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_restart_during_shutdown_is_no_op(self):
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
sup.request_shutdown(reason="test")
ok = sup.restart_daemon("pipelock")
self.assertFalse(ok,
"must not respawn a daemon during teardown")
self._drive(sup)
def test_shutdown_after_start_terminates_children(self):
# Two long-running children. Caller requests shutdown;
# both should receive SIGTERM and exit. exit_code() is