fix(sidecars): per-daemon pipelock restart keeps supervise socket alive
`apply_allowlist_change` used `docker restart <bundle>` to make
pipelock reload, which bounced ALL four daemons — including
supervise, whose MCP socket the agent's claude-code client had
open. That dropped the connection. A second apply works because
supervise has come back up by then.
Fix: per-daemon restart via SIGUSR1.
- New `_Supervisor.restart_daemon(name)` terminates one named
child and spawns a replacement in place. Other daemons keep
running.
- main() wires SIGUSR1 → `restart_daemon("pipelock")`. Pipelock
has no in-process reload, so this is its analog of egress's
SIGHUP-reload-addon path. Pipelock is the only daemon that
currently needs hot-config reload via restart; if others
acquire the need, add a new signal.
- `apply_allowlist_change` now runs `docker kill --signal USR1
<bundle>` instead of `docker restart <bundle>`. Supervise / egress /
git-gate keep running across the apply.
Tests:
- New `_Supervisor.restart_daemon` cases: replaces in place
(different pid post-restart, sibling daemon unchanged),
unknown name is a no-op, restart-during-shutdown is a no-op.
- `test_pipelock_apply` rewritten to bring up the bundle image
with `CLAUDE_BOTTLE_SIDECAR_DAEMONS=pipelock` so the
supervisor is PID 1 and handles SIGUSR1. The previous
standalone-pipelock setup wouldn't survive SIGUSR1 (pipelock's
default disposition for that signal is to terminate). The test
builds the bundle image during bring-up (cached layers make repeat runs fast).
531 tests passing locally (unit + integration).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -35,8 +35,6 @@ from claude_bottle.backend.docker.network import (
|
||||
from claude_bottle.backend.docker.pipelock import (
|
||||
PIPELOCK_CA_CERT_IN_CONTAINER,
|
||||
PIPELOCK_CA_KEY_IN_CONTAINER,
|
||||
PIPELOCK_IMAGE,
|
||||
PIPELOCK_PORT,
|
||||
DockerPipelockProxy,
|
||||
pipelock_tls_init,
|
||||
)
|
||||
@@ -47,6 +45,7 @@ from claude_bottle.backend.docker.pipelock_apply import (
|
||||
fetch_current_yaml,
|
||||
)
|
||||
from claude_bottle.backend.docker.sidecar_bundle import (
|
||||
SIDECAR_BUNDLE_IMAGE,
|
||||
sidecar_bundle_container_name,
|
||||
)
|
||||
from claude_bottle.yaml_subset import parse_yaml_subset
|
||||
@@ -85,14 +84,12 @@ class TestPipelockApply(unittest.TestCase):
|
||||
shutil.rmtree(pipelock_state_dir(self.slug), ignore_errors=True)
|
||||
|
||||
def _bring_up(self) -> None:
|
||||
"""Replicates the pre-chunk-3 bring-up sequence (create on
|
||||
internal network → bind-mount yaml + CAs → attach egress
|
||||
network → docker start) without going through the deleted
|
||||
`DockerPipelockProxy.start` helper. The same sequence is
|
||||
what `docker compose up` does for the pipelock service in
|
||||
production; this test path keeps the standalone-pipelock
|
||||
smoke alive so `apply_allowlist_change`'s host-side
|
||||
write + docker-restart loop has integration coverage.
|
||||
"""Brings up the bundle image with only the pipelock daemon
|
||||
selected. The bundle's Python supervisor is PID 1, which is
|
||||
what apply_allowlist_change targets via `docker kill
|
||||
--signal USR1` — pipelock alone as PID 1 wouldn't survive
|
||||
SIGUSR1 (default disposition = terminate). This shape is
|
||||
what runs in production minus the other three daemons.
|
||||
|
||||
The yaml stages into the production-real
|
||||
`pipelock_state_dir(slug)` (not a private temp dir) so the
|
||||
@@ -109,24 +106,28 @@ class TestPipelockApply(unittest.TestCase):
|
||||
self.egress_net = network_create_egress(self.slug)
|
||||
ca_cert_host, ca_key_host = pipelock_tls_init(state_dir)
|
||||
|
||||
# apply_allowlist_change targets sidecar_bundle_container_name
|
||||
# (chunk 5 flipped the bundle to the only shape). Bringing the
|
||||
# standalone pipelock up under that name keeps this test
|
||||
# exercising the real production code path; the bundle's
|
||||
# other three daemons aren't running here, but the
|
||||
# apply/fetch code only touches /etc/pipelock.yaml + the
|
||||
# pipelock binary, so the lighter setup is fine.
|
||||
# Ensure the bundle image is built. compose normally builds
|
||||
# this lazily; we go through `docker run` here so we have to
|
||||
# do it ourselves. Idempotent — cached layers make repeats
|
||||
# fast.
|
||||
repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
|
||||
subprocess.run(
|
||||
["docker", "build",
|
||||
"-t", SIDECAR_BUNDLE_IMAGE,
|
||||
"-f", "Dockerfile.sidecars", "."],
|
||||
cwd=repo_root, check=True, capture_output=True,
|
||||
)
|
||||
|
||||
self.sidecar_name = sidecar_bundle_container_name(self.slug)
|
||||
subprocess.run(
|
||||
["docker", "create",
|
||||
"--name", self.sidecar_name,
|
||||
"--network", self.internal_net,
|
||||
"-e", "CLAUDE_BOTTLE_SIDECAR_DAEMONS=pipelock",
|
||||
"-v", f"{prep.yaml_path}:/etc/pipelock.yaml:ro",
|
||||
"-v", f"{ca_cert_host}:{PIPELOCK_CA_CERT_IN_CONTAINER}:ro",
|
||||
"-v", f"{ca_key_host}:{PIPELOCK_CA_KEY_IN_CONTAINER}:ro",
|
||||
PIPELOCK_IMAGE,
|
||||
"run", "--config", "/etc/pipelock.yaml",
|
||||
"--listen", f"0.0.0.0:{PIPELOCK_PORT}"],
|
||||
SIDECAR_BUNDLE_IMAGE],
|
||||
check=True, capture_output=True,
|
||||
)
|
||||
subprocess.run(
|
||||
|
||||
@@ -227,6 +227,53 @@ class TestSupervisor(unittest.TestCase):
|
||||
sup.request_shutdown(reason="cleanup")
|
||||
self._drive(sup)
|
||||
|
||||
def test_restart_daemon_replaces_in_place(self):
    """restart_daemon("pipelock") swaps that child and leaves its sibling alone.

    Mirrors the production flow described for this change:
    pipelock_apply.py sends SIGUSR1 to the bundle, the supervisor
    restarts only the pipelock daemon, and the supervise daemon (which
    hosts the MCP server in production) keeps running.
    """
    specs = [
        _DaemonSpec("pipelock", ("/bin/sleep", "30")),
        _DaemonSpec("supervise", ("/bin/sleep", "30")),
    ]
    sup = _Supervisor(specs)
    sup.start_all()
    try:
        # Brief pause before sampling PIDs; presumably gives the
        # children time to finish spawning.
        time.sleep(0.1)
        # sup.procs entries are indexed in spec order here; element [1]
        # of each entry exposes .pid / .poll() (assumed Popen-like).
        old_pipelock_pid = sup.procs[0][1].pid
        supervise_pid = sup.procs[1][1].pid

        ok = sup.restart_daemon("pipelock", grace=2.0)
        self.assertTrue(ok)

        # Pipelock got a fresh PID, i.e. it is a different process.
        new_pipelock_pid = sup.procs[0][1].pid
        self.assertNotEqual(old_pipelock_pid, new_pipelock_pid)
        # Supervise's PID is unchanged: it was NOT restarted.
        self.assertEqual(supervise_pid, sup.procs[1][1].pid)
        self.assertIsNone(sup.procs[1][1].poll(),
                          "supervise should still be running")
    finally:
        # Always tear down, even when an assertion above fails;
        # otherwise the 30-second sleep children outlive the test.
        sup.request_shutdown(reason="cleanup")
        self._drive(sup)
|
||||
|
||||
def test_restart_unknown_daemon_no_op(self):
    """restart_daemon returns a falsy result for a name no spec declares."""
    specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
    sup = _Supervisor(specs)
    sup.start_all()
    try:
        # "ghost" matches none of the configured daemon names.
        ok = sup.restart_daemon("ghost")
        self.assertFalse(ok)
    finally:
        # Run teardown even if the assertion fails so the sleep child
        # is not left behind.
        sup.request_shutdown(reason="cleanup")
        self._drive(sup)
|
||||
|
||||
def test_restart_during_shutdown_is_no_op(self):
    """restart_daemon must refuse to respawn once shutdown was requested."""
    specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
    sup = _Supervisor(specs)
    sup.start_all()
    # Shutdown is requested first; the restart below is issued while
    # teardown is already pending.
    sup.request_shutdown(reason="test")
    try:
        ok = sup.restart_daemon("pipelock")
        self.assertFalse(ok,
                         "must not respawn a daemon during teardown")
    finally:
        # Drive the supervisor to completion regardless of the
        # assertion outcome so the child is not left running.
        self._drive(sup)
|
||||
|
||||
def test_shutdown_after_start_terminates_children(self):
|
||||
# Two long-running children. Caller requests shutdown;
|
||||
# both should receive SIGTERM and exit. exit_code() is
|
||||
|
||||
Reference in New Issue
Block a user