Merge pull request 'fix(sidecars): per-daemon pipelock restart keeps supervise socket alive' (#61) from fix-pipelock-restart-keeps-bundle-up into main
This commit was merged in pull request #61.
This commit is contained in:
@@ -5,7 +5,9 @@ Used by the supervise dashboard when the operator approves a
|
||||
pipelock-block proposal (or runs the operator-initiated `pipelock
|
||||
edit <bottle>` verb). Fetches the current pipelock.yaml via `docker
|
||||
exec`, parses it, swaps the api_allowlist with the proposed hosts,
|
||||
re-renders, writes back via `docker cp`, then `docker restart` so
|
||||
re-renders, writes back via the bind-mount path, then signals the
|
||||
bundle supervisor to restart the pipelock daemon (`docker kill
|
||||
--signal USR1`) so
|
||||
pipelock picks up the new config.
|
||||
|
||||
v1 uses restart, not SIGHUP — pipelock has no in-process reload
|
||||
@@ -130,19 +132,17 @@ def apply_allowlist_change(
|
||||
2. Fetch + parse current pipelock.yaml.
|
||||
3. Replace api_allowlist with the proposed hosts; re-render.
|
||||
4. Write the new yaml to the bind-mount source.
|
||||
5. `docker restart` the bundle so pipelock reloads.
|
||||
|
||||
The restart bounces ALL four daemons inside the bundle, not
|
||||
just pipelock — pipelock has no in-process reload and the
|
||||
bundle init re-spawns the four daemons on container restart.
|
||||
Per-daemon reload would need a supervisor IPC channel (PRD
|
||||
0024 open question 1's "eventually" path); the bundle-wide
|
||||
restart is the v1 trade-off.
|
||||
5. `docker kill --signal USR1 <bundle>` so the supervisor
|
||||
restarts the pipelock daemon in place (leaving egress,
|
||||
git-gate, and supervise running). Pipelock has no
|
||||
in-process reload; the supervisor's per-daemon restart
|
||||
keeps the agent's MCP socket alive — a whole-bundle
|
||||
`docker restart` would bounce supervise too.
|
||||
|
||||
Returns (before, after) where both are one-per-line allowlist
|
||||
strings (operator-facing format). Raises PipelockApplyError on
|
||||
any failure; the sidecar's existing config stays in place until
|
||||
the host write succeeds, and the restart is what makes it
|
||||
the host write succeeds, and the SIGUSR1 is what makes it
|
||||
live."""
|
||||
new_hosts = parse_allowlist_content(new_allowlist_content)
|
||||
container = sidecar_bundle_container_name(slug)
|
||||
@@ -167,9 +167,9 @@ def apply_allowlist_change(
|
||||
# FILE — same Docker single-file inode issue as egress_apply:
|
||||
# write-temp-then-rename swaps the host inode and leaves the
|
||||
# container's mount pointing at the orphaned old one. Write
|
||||
# in-place. `docker restart` below picks up the new content
|
||||
# (and pipelock has no in-process reload anyway, so the
|
||||
# restart is what makes it live regardless of write atomicity).
|
||||
# in-place. The SIGUSR1 below makes the new content live
|
||||
# (pipelock has no in-process reload, so the supervisor
|
||||
# restarts the pipelock daemon in response).
|
||||
target = _pipelock_yaml_host_path(slug)
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
target.write_text(rendered)
|
||||
@@ -177,12 +177,12 @@ def apply_allowlist_change(
|
||||
# fine — but 0o600 matches what prepare wrote.
|
||||
target.chmod(0o600)
|
||||
restart = subprocess.run(
|
||||
["docker", "restart", container],
|
||||
["docker", "kill", "--signal", "USR1", container],
|
||||
capture_output=True, text=True, check=False,
|
||||
)
|
||||
if restart.returncode != 0:
|
||||
raise PipelockApplyError(
|
||||
f"failed to restart {container}: "
|
||||
f"failed to signal {container} for pipelock restart: "
|
||||
f"{(restart.stderr or '').strip()}"
|
||||
)
|
||||
|
||||
|
||||
@@ -224,6 +224,52 @@ class _Supervisor:
|
||||
return True
|
||||
return False
|
||||
|
||||
def restart_daemon(self, daemon_name: str, *, grace: float = 5.0) -> bool:
|
||||
"""Terminate one named child and spawn a fresh one, leaving
|
||||
the other daemons running. Used by the pipelock-apply path:
|
||||
pipelock has no in-process reload, so apply_allowlist_change
|
||||
runs `docker kill --signal USR1 <bundle>` after writing the
|
||||
new yaml; the supervisor catches SIGUSR1 and calls this.
|
||||
|
||||
Behavior: SIGTERM → wait up to `grace` seconds → SIGKILL if
|
||||
still alive → spawn a replacement under the same DaemonSpec.
|
||||
The `procs` slot is updated in place so subsequent
|
||||
forward_signal / shutdown calls reach the new pid.
|
||||
|
||||
Returns True iff a daemon by that name was running and a
|
||||
replacement spawned; False if no such daemon (the
|
||||
compose-renderer subset said this bottle doesn't run it)."""
|
||||
if self.shutdown_at is not None:
|
||||
_log(f"restart {daemon_name} skipped; supervisor is shutting down")
|
||||
return False
|
||||
for idx, (spec, p) in enumerate(self.procs):
|
||||
if spec.name != daemon_name:
|
||||
continue
|
||||
_log(f"restarting {daemon_name}")
|
||||
if p.poll() is None:
|
||||
try:
|
||||
p.terminate()
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
try:
|
||||
p.wait(timeout=grace)
|
||||
except subprocess.TimeoutExpired:
|
||||
_log(
|
||||
f"{daemon_name} did not exit within {grace:.0f}s "
|
||||
f"of SIGTERM; SIGKILL"
|
||||
)
|
||||
try:
|
||||
p.kill()
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
p.wait()
|
||||
self._logged_dead.discard(daemon_name)
|
||||
new_proc = _spawn(spec)
|
||||
self.procs[idx] = (spec, new_proc)
|
||||
_log(f"{daemon_name} restarted (pid {new_proc.pid})")
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def main(argv: Sequence[str] | None = None) -> int:
|
||||
del argv # no flags yet; env-driven only
|
||||
@@ -237,14 +283,18 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
signal.signal(signal.SIGTERM, lambda *_: sup.request_shutdown("SIGTERM"))
|
||||
signal.signal(signal.SIGINT, lambda *_: sup.request_shutdown("SIGINT"))
|
||||
# SIGHUP reload path: routes.yaml + pipelock.yaml hot-reloads
|
||||
# in egress_apply.py / pipelock_apply.py issue `docker kill
|
||||
# --signal HUP <bundle>`. The kernel delivers SIGHUP to PID 1
|
||||
# (this supervisor); we forward it to the daemon that knows
|
||||
# how to reload (egress = mitmdump-reload-addons; pipelock has
|
||||
# no in-process reload, so the pipelock-apply path uses
|
||||
# `docker restart` instead).
|
||||
# SIGHUP reload path: egress_apply.py runs `docker kill
|
||||
# --signal HUP <bundle>` after writing routes.yaml. The kernel
|
||||
# delivers SIGHUP to PID 1 (this supervisor); forward it to
|
||||
# mitmdump so it reloads its addon.
|
||||
signal.signal(signal.SIGHUP, lambda *_: sup.forward_signal(signal.SIGHUP, "egress"))
|
||||
# SIGUSR1 pipelock-restart path: pipelock_apply.py runs
|
||||
# `docker kill --signal USR1 <bundle>` after writing
|
||||
# pipelock.yaml. Pipelock has no in-process reload, so the
|
||||
# supervisor restarts the pipelock daemon in place (other
|
||||
# daemons keep running — specifically supervise, whose MCP
|
||||
# socket would drop on a whole-container `docker restart`).
|
||||
signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock"))
|
||||
|
||||
while not sup.tick():
|
||||
time.sleep(_POLL_INTERVAL)
|
||||
|
||||
@@ -35,8 +35,6 @@ from claude_bottle.backend.docker.network import (
|
||||
from claude_bottle.backend.docker.pipelock import (
|
||||
PIPELOCK_CA_CERT_IN_CONTAINER,
|
||||
PIPELOCK_CA_KEY_IN_CONTAINER,
|
||||
PIPELOCK_IMAGE,
|
||||
PIPELOCK_PORT,
|
||||
DockerPipelockProxy,
|
||||
pipelock_tls_init,
|
||||
)
|
||||
@@ -47,6 +45,7 @@ from claude_bottle.backend.docker.pipelock_apply import (
|
||||
fetch_current_yaml,
|
||||
)
|
||||
from claude_bottle.backend.docker.sidecar_bundle import (
|
||||
SIDECAR_BUNDLE_IMAGE,
|
||||
sidecar_bundle_container_name,
|
||||
)
|
||||
from claude_bottle.yaml_subset import parse_yaml_subset
|
||||
@@ -85,14 +84,12 @@ class TestPipelockApply(unittest.TestCase):
|
||||
shutil.rmtree(pipelock_state_dir(self.slug), ignore_errors=True)
|
||||
|
||||
def _bring_up(self) -> None:
|
||||
"""Replicates the pre-chunk-3 bring-up sequence (create on
|
||||
internal network → bind-mount yaml + CAs → attach egress
|
||||
network → docker start) without going through the deleted
|
||||
`DockerPipelockProxy.start` helper. The same sequence is
|
||||
what `docker compose up` does for the pipelock service in
|
||||
production; this test path keeps the standalone-pipelock
|
||||
smoke alive so `apply_allowlist_change`'s host-side
|
||||
write + docker-restart loop has integration coverage.
|
||||
"""Brings up the bundle image with only the pipelock daemon
|
||||
selected. The bundle's Python supervisor is PID 1, which is
|
||||
what apply_allowlist_change targets via `docker kill
|
||||
--signal USR1` — pipelock alone as PID 1 wouldn't survive
|
||||
SIGUSR1 (default disposition = terminate). This shape is
|
||||
what runs in production minus the other three daemons.
|
||||
|
||||
The yaml stages into the production-real
|
||||
`pipelock_state_dir(slug)` (not a private temp dir) so the
|
||||
@@ -109,24 +106,28 @@ class TestPipelockApply(unittest.TestCase):
|
||||
self.egress_net = network_create_egress(self.slug)
|
||||
ca_cert_host, ca_key_host = pipelock_tls_init(state_dir)
|
||||
|
||||
# apply_allowlist_change targets sidecar_bundle_container_name
|
||||
# (chunk 5 flipped the bundle to the only shape). Bringing the
|
||||
# standalone pipelock up under that name keeps this test
|
||||
# exercising the real production code path; the bundle's
|
||||
# other three daemons aren't running here, but the
|
||||
# apply/fetch code only touches /etc/pipelock.yaml + the
|
||||
# pipelock binary, so the lighter setup is fine.
|
||||
# Ensure the bundle image is built. compose normally builds
|
||||
# this lazily; we go through `docker run` here so we have to
|
||||
# do it ourselves. Idempotent — cached layers make repeats
|
||||
# fast.
|
||||
repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
|
||||
subprocess.run(
|
||||
["docker", "build",
|
||||
"-t", SIDECAR_BUNDLE_IMAGE,
|
||||
"-f", "Dockerfile.sidecars", "."],
|
||||
cwd=repo_root, check=True, capture_output=True,
|
||||
)
|
||||
|
||||
self.sidecar_name = sidecar_bundle_container_name(self.slug)
|
||||
subprocess.run(
|
||||
["docker", "create",
|
||||
"--name", self.sidecar_name,
|
||||
"--network", self.internal_net,
|
||||
"-e", "CLAUDE_BOTTLE_SIDECAR_DAEMONS=pipelock",
|
||||
"-v", f"{prep.yaml_path}:/etc/pipelock.yaml:ro",
|
||||
"-v", f"{ca_cert_host}:{PIPELOCK_CA_CERT_IN_CONTAINER}:ro",
|
||||
"-v", f"{ca_key_host}:{PIPELOCK_CA_KEY_IN_CONTAINER}:ro",
|
||||
PIPELOCK_IMAGE,
|
||||
"run", "--config", "/etc/pipelock.yaml",
|
||||
"--listen", f"0.0.0.0:{PIPELOCK_PORT}"],
|
||||
SIDECAR_BUNDLE_IMAGE],
|
||||
check=True, capture_output=True,
|
||||
)
|
||||
subprocess.run(
|
||||
|
||||
@@ -227,6 +227,53 @@ class TestSupervisor(unittest.TestCase):
|
||||
sup.request_shutdown(reason="cleanup")
|
||||
self._drive(sup)
|
||||
|
||||
def test_restart_daemon_replaces_in_place(self):
|
||||
# pipelock_apply.py sends SIGUSR1 to the bundle, supervisor
|
||||
# restarts the pipelock daemon, supervise (the other
|
||||
# daemon's MCP server in production) stays up.
|
||||
specs = [
|
||||
_DaemonSpec("pipelock", ("/bin/sleep", "30")),
|
||||
_DaemonSpec("supervise", ("/bin/sleep", "30")),
|
||||
]
|
||||
sup = _Supervisor(specs)
|
||||
sup.start_all()
|
||||
time.sleep(0.1)
|
||||
old_pipelock_pid = sup.procs[0][1].pid
|
||||
supervise_pid = sup.procs[1][1].pid
|
||||
|
||||
ok = sup.restart_daemon("pipelock", grace=2.0)
|
||||
self.assertTrue(ok)
|
||||
|
||||
# Pipelock got a fresh PID — different process.
|
||||
new_pipelock_pid = sup.procs[0][1].pid
|
||||
self.assertNotEqual(old_pipelock_pid, new_pipelock_pid)
|
||||
# Supervise's PID is unchanged — it was NOT restarted.
|
||||
self.assertEqual(supervise_pid, sup.procs[1][1].pid)
|
||||
self.assertIsNone(sup.procs[1][1].poll(),
|
||||
"supervise should still be running")
|
||||
|
||||
sup.request_shutdown(reason="cleanup")
|
||||
self._drive(sup)
|
||||
|
||||
def test_restart_unknown_daemon_no_op(self):
|
||||
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
|
||||
sup = _Supervisor(specs)
|
||||
sup.start_all()
|
||||
ok = sup.restart_daemon("ghost")
|
||||
self.assertFalse(ok)
|
||||
sup.request_shutdown(reason="cleanup")
|
||||
self._drive(sup)
|
||||
|
||||
def test_restart_during_shutdown_is_no_op(self):
|
||||
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
|
||||
sup = _Supervisor(specs)
|
||||
sup.start_all()
|
||||
sup.request_shutdown(reason="test")
|
||||
ok = sup.restart_daemon("pipelock")
|
||||
self.assertFalse(ok,
|
||||
"must not respawn a daemon during teardown")
|
||||
self._drive(sup)
|
||||
|
||||
def test_shutdown_after_start_terminates_children(self):
|
||||
# Two long-running children. Caller requests shutdown;
|
||||
# both should receive SIGTERM and exit. exit_code() is
|
||||
|
||||
Reference in New Issue
Block a user