PRD 0034: Sidecar Restart and Shutdown Semantics #127

Merged
didericis merged 3 commits from prd-0034-sidecar-restart-shutdown-semantics into main 2026-06-02 03:55:06 -04:00
3 changed files with 135 additions and 33 deletions
Showing only changes of commit 31708abfad - Show all commits
+49 -6
View File
@@ -163,6 +163,10 @@ class _Supervisor:
# Names of children that have been logged as having exited # Names of children that have been logged as having exited
# so we only log each death once across watch-loop ticks. # so we only log each death once across watch-loop ticks.
self._logged_dead: set[str] = set() self._logged_dead: set[str] = set()
# Signal handlers add daemon names here and return quickly.
# The main watch loop drains the set, so repeated restart
# requests for one daemon coalesce into one restart.
self._restart_requested: set[str] = set()
def start_all(self) -> None: def start_all(self) -> None:
for spec in self.specs: for spec in self.specs:
@@ -173,6 +177,7 @@ class _Supervisor:
if self.shutdown_at is not None: if self.shutdown_at is not None:
return return
self.shutdown_at = time.monotonic() self.shutdown_at = time.monotonic()
self._restart_requested.clear()
_log(f"shutting down ({reason}); forwarding SIGTERM") _log(f"shutting down ({reason}); forwarding SIGTERM")
for _, p in self.procs: for _, p in self.procs:
if p.poll() is None: if p.poll() is None:
@@ -181,6 +186,24 @@ class _Supervisor:
except ProcessLookupError: except ProcessLookupError:
pass pass
def request_restart(self, daemon_name: str) -> bool:
"""Queue a daemon restart for the main loop to process.
Signal handlers use this non-blocking path instead of doing
subprocess lifecycle work directly. Requests coalesce by
daemon name: one pending restart is enough to make the daemon
reread the latest config from disk.
Returns True iff a daemon by that name is known to the
supervisor and shutdown has not started."""
if self.shutdown_at is not None:
_log(f"restart {daemon_name} skipped; supervisor is shutting down")
return False
if not any(spec.name == daemon_name for spec, _ in self.procs):
return False
self._restart_requested.add(daemon_name)
return True
def tick(self) -> bool: def tick(self) -> bool:
"""One iteration of the watch loop. Returns True when every """One iteration of the watch loop. Returns True when every
child has exited and the supervisor can return. child has exited and the supervisor can return.
@@ -188,6 +211,8 @@ class _Supervisor:
A child dying unexpectedly is logged but does NOT initiate A child dying unexpectedly is logged but does NOT initiate
shutdown — see the module docstring's failure-policy shutdown — see the module docstring's failure-policy
section. Shutdown is signal-driven only.""" section. Shutdown is signal-driven only."""
self._drain_restart_requests()
for spec, p in self.procs: for spec, p in self.procs:
rc = p.poll() rc = p.poll()
if rc is None or spec.name in self._logged_dead: if rc is None or spec.name in self._logged_dead:
@@ -223,11 +248,29 @@ class _Supervisor:
return all(p.poll() is not None for _, p in self.procs) return all(p.poll() is not None for _, p in self.procs)
def exit_code(self) -> int: def exit_code(self) -> int:
"""Worst child returncode wins. On graceful shutdown every """Positive child failures win; otherwise report success.
child is signal-killed (negative returncode) and max()
returns 0; if some child crashed nonzero before the signal Python represents signal-terminated children as negative
the operator gets that code on container exit.""" return codes. A signal-only graceful shutdown should not leak
return max((p.returncode for _, p in self.procs), default=0) that platform-specific detail into the container exit status,
but a positive crash before shutdown should remain visible."""
positives = [
p.returncode for _, p in self.procs
if p.returncode is not None and p.returncode > 0
]
return max(positives, default=0)
def _drain_restart_requests(self) -> None:
if self.shutdown_at is not None:
self._restart_requested.clear()
return
requested = tuple(sorted(self._restart_requested))
self._restart_requested.clear()
for daemon_name in requested:
if self.shutdown_at is not None:
self._restart_requested.clear()
return
self.restart_daemon(daemon_name)
def forward_signal(self, sig: int, daemon_name: str) -> bool: def forward_signal(self, sig: int, daemon_name: str) -> bool:
"""Forward a signal to one named child. Used by the SIGHUP """Forward a signal to one named child. Used by the SIGHUP
@@ -323,7 +366,7 @@ def main(argv: Sequence[str] | None = None) -> int:
# supervisor restarts the pipelock daemon in place (other # supervisor restarts the pipelock daemon in place (other
# daemons keep running — specifically supervise, whose MCP # daemons keep running — specifically supervise, whose MCP
# socket would drop on a whole-container `docker restart`). # socket would drop on a whole-container `docker restart`).
signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock")) signal.signal(signal.SIGUSR1, lambda *_: sup.request_restart("pipelock"))
while not sup.tick(): while not sup.tick():
time.sleep(_POLL_INTERVAL) time.sleep(_POLL_INTERVAL)
@@ -100,11 +100,11 @@ loop should continue to call `tick()` and sleep on `_POLL_INTERVAL`; `tick()`
then performs the actual `restart_daemon("pipelock")` work while normal Python then performs the actual `restart_daemon("pipelock")` work while normal Python
control flow is in the supervisor loop. control flow is in the supervisor loop.
Repeated restart requests should not overlap. Either coalescing or FIFO Repeated restart requests should not overlap. Restart requests coalesce by
serialization is acceptable, but the PRD prefers coalescing by daemon name: if daemon name: if three SIGUSR1 signals arrive before the next loop turn, one
three SIGUSR1 signals arrive before the next loop turn, one pipelock restart is pipelock restart is enough because each restart rereads the latest
enough because each restart rereads the latest `pipelock.yaml` from disk. `pipelock.yaml` from disk. This treats SIGUSR1 as "make pipelock reflect the
Document this because it is a semantic choice. current config" rather than "run exactly one restart per signal."
Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is
pending, the supervisor should drop the pending restart and terminate live pending, the supervisor should drop the pending restart and terminate live
@@ -116,9 +116,9 @@ between bytecodes and cannot interrupt a single blocking `wait()` until control
returns to Python. returns to Python.
Exit-code behavior should be documented as "positive failures win, otherwise Exit-code behavior should be documented as "positive failures win, otherwise
return the maximum observed child return code." That matches the current intent: return zero." Positive process failures remain visible, while a clean shutdown
positive process failures remain visible, while a clean shutdown of only of only zero-exit or signal-terminated children returns zero instead of leaking
signal-terminated children does not hide an earlier crash. platform-specific negative signal return codes to the container exit status.
## Implementation Chunks ## Implementation Chunks
@@ -148,10 +148,4 @@ Also run the full unit suite before merge:
## Open Questions ## Open Questions
- Should repeated restart requests be coalesced by daemon name, or should the None.
supervisor preserve every queued request? Coalescing is simpler and appears
sufficient because pipelock rereads the latest config on restart.
- Should exit-code handling clamp all negative signal return codes to zero
when no positive child failure occurred, or should it continue returning the
maximum raw child return code? The current tests tolerate platform-specific
negative signal codes; tightening this would be a behavior change.
+77 -12
View File
@@ -301,6 +301,64 @@ class TestSupervisor(unittest.TestCase):
sup.request_shutdown(reason="cleanup") sup.request_shutdown(reason="cleanup")
self._drive(sup) self._drive(sup)
def test_request_restart_is_drained_by_tick(self):
specs = [
_DaemonSpec("pipelock", ("/bin/sleep", "30")),
_DaemonSpec("supervise", ("/bin/sleep", "30")),
]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
old_pipelock_pid = sup.procs[0][1].pid
supervise_pid = sup.procs[1][1].pid
ok = sup.request_restart("pipelock")
self.assertTrue(ok)
# The non-blocking request path only records intent.
self.assertEqual(old_pipelock_pid, sup.procs[0][1].pid)
done = sup.tick()
self.assertFalse(done)
self.assertNotEqual(old_pipelock_pid, sup.procs[0][1].pid)
self.assertEqual(supervise_pid, sup.procs[1][1].pid)
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_repeated_restart_requests_coalesce(self):
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
self.assertTrue(sup.request_restart("pipelock"))
self.assertTrue(sup.request_restart("pipelock"))
self.assertEqual({"pipelock"}, sup._restart_requested)
old_pid = sup.procs[0][1].pid
sup.tick()
first_restarted_pid = sup.procs[0][1].pid
self.assertNotEqual(old_pid, first_restarted_pid)
# A second tick should not restart again; the coalesced
# request was consumed by the first tick.
sup.tick()
self.assertEqual(first_restarted_pid, sup.procs[0][1].pid)
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_request_restart_unknown_daemon_no_op(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
ok = sup.request_restart("ghost")
self.assertFalse(ok)
self.assertEqual(set(), sup._restart_requested)
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_restart_unknown_daemon_no_op(self): def test_restart_unknown_daemon_no_op(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))] specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
sup = _Supervisor(specs) sup = _Supervisor(specs)
@@ -320,12 +378,24 @@ class TestSupervisor(unittest.TestCase):
"must not respawn a daemon during teardown") "must not respawn a daemon during teardown")
self._drive(sup) self._drive(sup)
def test_pending_restart_dropped_during_shutdown(self):
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
old_pid = sup.procs[0][1].pid
self.assertTrue(sup.request_restart("pipelock"))
sup.request_shutdown(reason="test")
self.assertEqual(set(), sup._restart_requested)
self._drive(sup)
self.assertEqual(old_pid, sup.procs[0][1].pid)
def test_shutdown_after_start_terminates_children(self): def test_shutdown_after_start_terminates_children(self):
# Two long-running children. Caller requests shutdown; # Two long-running children. Caller requests shutdown;
# both should receive SIGTERM and exit. exit_code() is # both should receive SIGTERM and exit. Signal-only
# max of (returncodes) — both signal-killed (negative), # shutdown clamps to a zero supervisor exit code.
# so max() picks 0 in the typical case (or the
# platform-specific signal returncode).
specs = [ specs = [
_DaemonSpec("a", ("/bin/sleep", "60")), _DaemonSpec("a", ("/bin/sleep", "60")),
_DaemonSpec("b", ("/bin/sleep", "60")), _DaemonSpec("b", ("/bin/sleep", "60")),
@@ -335,7 +405,7 @@ class TestSupervisor(unittest.TestCase):
time.sleep(0.2) # let them actually start time.sleep(0.2) # let them actually start
sup.request_shutdown(reason="test") sup.request_shutdown(reason="test")
rc = self._drive(sup) rc = self._drive(sup)
self.assertIsNotNone(rc) self.assertEqual(0, rc)
# Both children got the signal — neither survived past # Both children got the signal — neither survived past
# the grace deadline. # the grace deadline.
for _, p in sup.procs: for _, p in sup.procs:
@@ -360,7 +430,7 @@ class TestSupervisor(unittest.TestCase):
# Process was SIGKILL'd → returncode -9 on POSIX. # Process was SIGKILL'd → returncode -9 on POSIX.
self.assertEqual(-9, sup.procs[0][1].returncode) self.assertEqual(-9, sup.procs[0][1].returncode)
self.assertIsNotNone(rc) self.assertEqual(0, rc)
def test_idempotent_shutdown_requests(self): def test_idempotent_shutdown_requests(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "60"))] specs = [_DaemonSpec("a", ("/bin/sleep", "60"))]
@@ -426,12 +496,7 @@ class TestMainEndToEnd(unittest.TestCase):
self.assertIn("starting alpha", out) self.assertIn("starting alpha", out)
self.assertIn("starting beta", out) self.assertIn("starting beta", out)
self.assertIn("forwarding SIGTERM", out) self.assertIn("forwarding SIGTERM", out)
# Sleep terminated by SIGTERM exits with returncode -15; self.assertEqual(0, rc)
# supervisor surfaces that via max(...) and main()
# returns -15 → process exit becomes 256-15 = 241.
# On macOS bash may convert to 143. Either way, nonzero
# AND the child finished — we don't pin the exact code.
self.assertNotEqual(0, rc)
def test_empty_daemon_set_exits_zero_immediately(self): def test_empty_daemon_set_exits_zero_immediately(self):
# Use a sentinel value that filters out both alpha+beta. # Use a sentinel value that filters out both alpha+beta.