From 31708abfaddce53104658ac06ae9909bbea9ed65 Mon Sep 17 00:00:00 2001 From: codex Date: Tue, 2 Jun 2026 07:52:19 +0000 Subject: [PATCH] fix(sidecar): queue restart signals --- bot_bottle/sidecar_init.py | 55 ++++++++++-- ...0034-sidecar-restart-shutdown-semantics.md | 24 ++--- tests/unit/test_sidecar_init.py | 89 ++++++++++++++++--- 3 files changed, 135 insertions(+), 33 deletions(-) diff --git a/bot_bottle/sidecar_init.py b/bot_bottle/sidecar_init.py index 1681c66..8bfa984 100644 --- a/bot_bottle/sidecar_init.py +++ b/bot_bottle/sidecar_init.py @@ -163,6 +163,10 @@ class _Supervisor: # Names of children that have been logged as having exited # so we only log each death once across watch-loop ticks. self._logged_dead: set[str] = set() + # Signal handlers add daemon names here and return quickly. + # The main watch loop drains the set, so repeated restart + # requests for one daemon coalesce into one restart. + self._restart_requested: set[str] = set() def start_all(self) -> None: for spec in self.specs: @@ -173,6 +177,7 @@ class _Supervisor: if self.shutdown_at is not None: return self.shutdown_at = time.monotonic() + self._restart_requested.clear() _log(f"shutting down ({reason}); forwarding SIGTERM") for _, p in self.procs: if p.poll() is None: @@ -181,6 +186,24 @@ class _Supervisor: except ProcessLookupError: pass + def request_restart(self, daemon_name: str) -> bool: + """Queue a daemon restart for the main loop to process. + + Signal handlers use this non-blocking path instead of doing + subprocess lifecycle work directly. Requests coalesce by + daemon name: one pending restart is enough to make the daemon + reread the latest config from disk. + + Returns True iff a daemon by that name is known to the + supervisor and shutdown has not started.""" + if self.shutdown_at is not None: + _log(f"restart {daemon_name} skipped; supervisor is shutting down") + return False + if not any(spec.name == daemon_name for spec, _ in self.procs): + return False + self._restart_requested.add(daemon_name) + return True + def tick(self) -> bool: """One iteration of the watch loop. Returns True when every child has exited and the supervisor can return. @@ -188,6 +211,8 @@ class _Supervisor: A child dying unexpectedly is logged but does NOT initiate shutdown — see the module docstring's failure-policy section. Shutdown is signal-driven only.""" + self._drain_restart_requests() + for spec, p in self.procs: rc = p.poll() if rc is None or spec.name in self._logged_dead: @@ -223,11 +248,29 @@ class _Supervisor: return all(p.poll() is not None for _, p in self.procs) def exit_code(self) -> int: - """Worst child returncode wins. On graceful shutdown every - child is signal-killed (negative returncode) and max() - returns 0; if some child crashed nonzero before the signal - the operator gets that code on container exit.""" - return max((p.returncode for _, p in self.procs), default=0) + """Positive child failures win; otherwise report success. + + Python represents signal-terminated children as negative + return codes. A signal-only graceful shutdown should not leak + that platform-specific detail into the container exit status, + but a positive crash before shutdown should remain visible.""" + positives = [ + p.returncode for _, p in self.procs + if p.returncode is not None and p.returncode > 0 + ] + return max(positives, default=0) + + def _drain_restart_requests(self) -> None: + if self.shutdown_at is not None: + self._restart_requested.clear() + return + requested = tuple(sorted(self._restart_requested)) + self._restart_requested.clear() + for daemon_name in requested: + if self.shutdown_at is not None: + self._restart_requested.clear() + return + self.restart_daemon(daemon_name) def forward_signal(self, sig: int, daemon_name: str) -> bool: """Forward a signal to one named child. Used by the SIGHUP @@ -323,7 +366,7 @@ def main(argv: Sequence[str] | None = None) -> int: # supervisor restarts the pipelock daemon in place (other # daemons keep running — specifically supervise, whose MCP # socket would drop on a whole-container `docker restart`). - signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock")) + signal.signal(signal.SIGUSR1, lambda *_: sup.request_restart("pipelock")) while not sup.tick(): time.sleep(_POLL_INTERVAL) diff --git a/docs/prds/0034-sidecar-restart-shutdown-semantics.md b/docs/prds/0034-sidecar-restart-shutdown-semantics.md index 3ac997c..d291e57 100644 --- a/docs/prds/0034-sidecar-restart-shutdown-semantics.md +++ b/docs/prds/0034-sidecar-restart-shutdown-semantics.md @@ -100,11 +100,11 @@ loop should continue to call `tick()` and sleep on `_POLL_INTERVAL`; `tick()` then performs the actual `restart_daemon("pipelock")` work while normal Python control flow is in the supervisor loop. -Repeated restart requests should not overlap. Either coalescing or FIFO -serialization is acceptable, but the PRD prefers coalescing by daemon name: if -three SIGUSR1 signals arrive before the next loop turn, one pipelock restart is -enough because each restart rereads the latest `pipelock.yaml` from disk. -Document this because it is a semantic choice. +Repeated restart requests should not overlap. Restart requests coalesce by +daemon name: if three SIGUSR1 signals arrive before the next loop turn, one +pipelock restart is enough because each restart rereads the latest +`pipelock.yaml` from disk. This treats SIGUSR1 as "make pipelock reflect the +current config" rather than "run exactly one restart per signal." Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is pending, the supervisor should drop the pending restart and terminate live @@ -116,9 +116,9 @@ between bytecodes and cannot interrupt a single blocking `wait()` until control returns to Python. Exit-code behavior should be documented as "positive failures win, otherwise -return the maximum observed child return code." That matches the current intent: -positive process failures remain visible, while a clean shutdown of only -signal-terminated children does not hide an earlier crash. +return zero." Positive process failures remain visible, while a clean shutdown +of only zero-exit or signal-terminated children returns zero instead of leaking +platform-specific negative signal return codes to the container exit status. ## Implementation Chunks @@ -148,10 +148,4 @@ Also run the full unit suite before merge: ## Open Questions -- Should repeated restart requests be coalesced by daemon name, or should the - supervisor preserve every queued request? Coalescing is simpler and appears - sufficient because pipelock rereads the latest config on restart. -- Should exit-code handling clamp all negative signal return codes to zero - when no positive child failure occurred, or should it continue returning the - maximum raw child return code? The current tests tolerate platform-specific - negative signal codes; tightening this would be a behavior change. +None. diff --git a/tests/unit/test_sidecar_init.py b/tests/unit/test_sidecar_init.py index 569de07..88f0b9b 100644 --- a/tests/unit/test_sidecar_init.py +++ b/tests/unit/test_sidecar_init.py @@ -301,6 +301,64 @@ class TestSupervisor(unittest.TestCase): sup.request_shutdown(reason="cleanup") self._drive(sup) + def test_request_restart_is_drained_by_tick(self): + specs = [ + _DaemonSpec("pipelock", ("/bin/sleep", "30")), + _DaemonSpec("supervise", ("/bin/sleep", "30")), + ] + sup = _Supervisor(specs) + sup.start_all() + time.sleep(0.1) + old_pipelock_pid = sup.procs[0][1].pid + supervise_pid = sup.procs[1][1].pid + + ok = sup.request_restart("pipelock") + self.assertTrue(ok) + # The non-blocking request path only records intent. + self.assertEqual(old_pipelock_pid, sup.procs[0][1].pid) + + done = sup.tick() + self.assertFalse(done) + + self.assertNotEqual(old_pipelock_pid, sup.procs[0][1].pid) + self.assertEqual(supervise_pid, sup.procs[1][1].pid) + + sup.request_shutdown(reason="cleanup") + self._drive(sup) + + def test_repeated_restart_requests_coalesce(self): + specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))] + sup = _Supervisor(specs) + sup.start_all() + time.sleep(0.1) + + self.assertTrue(sup.request_restart("pipelock")) + self.assertTrue(sup.request_restart("pipelock")) + self.assertEqual({"pipelock"}, sup._restart_requested) + + old_pid = sup.procs[0][1].pid + sup.tick() + first_restarted_pid = sup.procs[0][1].pid + self.assertNotEqual(old_pid, first_restarted_pid) + + # A second tick should not restart again; the coalesced + # request was consumed by the first tick. + sup.tick() + self.assertEqual(first_restarted_pid, sup.procs[0][1].pid) + + sup.request_shutdown(reason="cleanup") + self._drive(sup) + + def test_request_restart_unknown_daemon_no_op(self): + specs = [_DaemonSpec("a", ("/bin/sleep", "30"))] + sup = _Supervisor(specs) + sup.start_all() + ok = sup.request_restart("ghost") + self.assertFalse(ok) + self.assertEqual(set(), sup._restart_requested) + sup.request_shutdown(reason="cleanup") + self._drive(sup) + def test_restart_unknown_daemon_no_op(self): specs = [_DaemonSpec("a", ("/bin/sleep", "30"))] sup = _Supervisor(specs) @@ -320,12 +378,24 @@ class TestSupervisor(unittest.TestCase): "must not respawn a daemon during teardown") self._drive(sup) + def test_pending_restart_dropped_during_shutdown(self): + specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))] + sup = _Supervisor(specs) + sup.start_all() + time.sleep(0.1) + old_pid = sup.procs[0][1].pid + + self.assertTrue(sup.request_restart("pipelock")) + sup.request_shutdown(reason="test") + self.assertEqual(set(), sup._restart_requested) + self._drive(sup) + + self.assertEqual(old_pid, sup.procs[0][1].pid) + def test_shutdown_after_start_terminates_children(self): # Two long-running children. Caller requests shutdown; - # both should receive SIGTERM and exit. exit_code() is - # max of (returncodes) — both signal-killed (negative), - # so max() picks 0 in the typical case (or the - # platform-specific signal returncode). + # both should receive SIGTERM and exit. Signal-only + # shutdown clamps to a zero supervisor exit code. specs = [ _DaemonSpec("a", ("/bin/sleep", "60")), _DaemonSpec("b", ("/bin/sleep", "60")), @@ -335,7 +405,7 @@ class TestSupervisor(unittest.TestCase): time.sleep(0.2) # let them actually start sup.request_shutdown(reason="test") rc = self._drive(sup) - self.assertIsNotNone(rc) + self.assertEqual(0, rc) # Both children got the signal — neither survived past # the grace deadline. for _, p in sup.procs: @@ -360,7 +430,7 @@ class TestSupervisor(unittest.TestCase): # Process was SIGKILL'd → returncode -9 on POSIX. self.assertEqual(-9, sup.procs[0][1].returncode) - self.assertIsNotNone(rc) + self.assertEqual(0, rc) def test_idempotent_shutdown_requests(self): specs = [_DaemonSpec("a", ("/bin/sleep", "60"))] @@ -426,12 +496,7 @@ class TestMainEndToEnd(unittest.TestCase): self.assertIn("starting alpha", out) self.assertIn("starting beta", out) self.assertIn("forwarding SIGTERM", out) - # Sleep terminated by SIGTERM exits with returncode -15; - # supervisor surfaces that via max(...) and main() - # returns -15 → process exit becomes 256-15 = 241. - # On macOS bash may convert to 143. Either way, nonzero - # AND the child finished — we don't pin the exact code. - self.assertNotEqual(0, rc) + self.assertEqual(0, rc) def test_empty_daemon_set_exits_zero_immediately(self): # Use a sentinel value that filters out both alpha+beta.