From 1b34b1df8548cb27c4bc07c698f8a4721a5e6cca Mon Sep 17 00:00:00 2001 From: codex Date: Tue, 2 Jun 2026 07:43:34 +0000 Subject: [PATCH 1/3] docs(prd): add sidecar restart semantics --- ...0034-sidecar-restart-shutdown-semantics.md | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 docs/prds/0034-sidecar-restart-shutdown-semantics.md diff --git a/docs/prds/0034-sidecar-restart-shutdown-semantics.md b/docs/prds/0034-sidecar-restart-shutdown-semantics.md new file mode 100644 index 0000000..3ac997c --- /dev/null +++ b/docs/prds/0034-sidecar-restart-shutdown-semantics.md @@ -0,0 +1,157 @@ +# PRD 0034: Sidecar Restart and Shutdown Semantics + +- **Status:** Draft +- **Author:** didericis-codex +- **Created:** 2026-06-02 +- **Issue:** #126 + +## Summary + +Make the sidecar bundle supervisor's signal, restart, and exit-code behavior +explicit and easier to reason about. In particular, move pipelock restart work +out of direct SIGUSR1 handler execution while preserving the caller-visible +`docker kill --signal USR1 ` contract used by pipelock apply flows. + +## Problem + +`bot_bottle/sidecar_init.py` is PID 1 for the bundled sidecar container. It +starts egress, pipelock, git-gate/git-http, and supervise; forwards shutdown +signals; forwards SIGHUP to egress; and restarts pipelock on SIGUSR1 after +allowlist changes. + +The current SIGUSR1 handler calls `sup.restart_daemon("pipelock")` directly. +That restart path can terminate a child, wait up to a grace timeout, SIGKILL a +stubborn child, spawn a replacement with `subprocess.Popen`, and start a new +pump thread. In CPython signal handlers run between bytecodes in the main +thread, so this is not the same as POSIX async-signal-unsafe C code, but it +still means signal handling can block the supervisor loop for the restart grace +window and makes stacked signals harder to reason about. + +The exit-code contract is also easy to misread. `_Supervisor.exit_code()` +returns the maximum child return code. That preserves a positive crash code +when a child failed before graceful shutdown, but the docstring currently +frames graceful shutdown as returning zero because signal-killed children have +negative return codes. The implementation is reasonable; the contract needs to +be deliberate and tested around crash-then-shutdown behavior. + +## Goals / Success Criteria + +- Preserve the external signal contract: + - SIGTERM/SIGINT requests bundle shutdown. + - SIGHUP still forwards to the live egress child. + - SIGUSR1 still requests an in-place pipelock restart. +- Keep signal handlers small: handlers should record intent and return, not + perform blocking subprocess lifecycle work directly. +- Process queued restart requests from the supervisor's main loop so restart + behavior is serialized with `tick()` and shutdown state. +- Avoid respawning children after shutdown has started. +- Coalesce or serialize repeated pipelock restart requests in a documented way + so stacked SIGUSR1 delivery cannot overlap restarts. +- Clarify and test aggregate exit-code semantics: + - clean unattended exits return zero when every child exits zero. + - signal-only shutdown does not invent a positive failure code. + - a positive child crash before shutdown remains visible on supervisor exit. +- Keep the implementation stdlib-only. + +## Non-goals + +- No new process supervisor dependency such as supervisord, s6, or runit. +- No automatic restart policy for arbitrary unexpected child death. +- No changes to the bundle's daemon set, daemon argv, env filtering, or Docker + compose contract. +- No changes to egress route reload semantics beyond preserving SIGHUP + forwarding. +- No user-facing CLI changes. + +## Scope + +In scope: + +- Internal signal handling and supervisor event-loop structure in + `bot_bottle/sidecar_init.py`. +- Tests in `tests/unit/test_sidecar_init.py` for queued restart behavior, + shutdown/restart ordering, repeated restart requests, and exit-code + semantics. +- Docstring/comment updates that describe the concrete signal and exit-code + contracts. + +Out of scope: + +- Changing pipelock itself to reload config in process. +- Restarting egress, git-gate, git-http, or supervise on demand. +- Reporting restart events to the supervise MCP plane. +- Changing the interim policy that unexpected child death leaves surviving + daemons running. + +## Design + +Keep `_Supervisor` as the owner of child process state, but add an explicit +pending-action boundary between signal delivery and subprocess lifecycle work. +The exact API can be small, for example: + +- `request_shutdown(reason)` keeps its existing idempotent behavior. +- `request_restart(daemon_name)` records a pending restart request unless + shutdown is already in progress. +- `tick()` drains pending restart work before or after child-death logging in a + documented order. + +The SIGUSR1 handler should call only the non-blocking request method. The main +loop should continue to call `tick()` and sleep on `_POLL_INTERVAL`; `tick()` +then performs the actual `restart_daemon("pipelock")` work while normal Python +control flow is in the supervisor loop. + +Repeated restart requests should not overlap. Either coalescing or FIFO +serialization is acceptable, but the PRD prefers coalescing by daemon name: if +three SIGUSR1 signals arrive before the next loop turn, one pipelock restart is +enough because each restart rereads the latest `pipelock.yaml` from disk. +Document this because it is a semantic choice. + +Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is +pending, the supervisor should drop the pending restart and terminate live +children. If shutdown starts while `restart_daemon` is already executing in the +main loop, the existing restart operation may finish, but no additional queued +restart should start after shutdown state is set. A simpler implementation may +check shutdown only before each queued restart, because signal handlers execute +between bytecodes and cannot interrupt a single blocking `wait()` until control +returns to Python. + +Exit-code behavior should be documented as "positive failures win, otherwise +return the maximum observed child return code." That matches the current intent: +positive process failures remain visible, while a clean shutdown of only +signal-terminated children does not hide an earlier crash. + +## Implementation Chunks + +1. Add characterization tests for SIGUSR1 queuing, repeated restart coalescing, + shutdown dropping pending restarts, and crash-then-shutdown exit codes. +2. Add a pending restart request structure to `_Supervisor` and a + non-blocking request method. +3. Change the SIGUSR1 handler in `main()` to enqueue the pipelock restart + instead of calling `restart_daemon` directly. +4. Drain pending restarts from `tick()` with shutdown checks and documented + ordering. +5. Update docstrings and comments around signal handling and `exit_code()`. + +## Testing Strategy + +Run the existing sidecar unit tests: + +- `python3 -m unittest tests.unit.test_sidecar_init` + +Add focused unit tests that avoid process-wide signal handler races where +possible by driving `_Supervisor` directly. End-to-end signal tests can remain +limited to `main()` behavior that cannot be exercised otherwise. + +Also run the full unit suite before merge: + +- `python3 -m unittest discover -s tests/unit` + +## Open Questions + +- Should repeated restart requests be coalesced by daemon name, or should the + supervisor preserve every queued request? Coalescing is simpler and appears + sufficient because pipelock rereads the latest config on restart. +- Should exit-code handling clamp all negative signal return codes to zero + when no positive child failure occurred, or should it continue returning the + maximum raw child return code? The current tests tolerate platform-specific + negative signal codes; tightening this would be a behavior change. -- 2.52.0 From 31708abfaddce53104658ac06ae9909bbea9ed65 Mon Sep 17 00:00:00 2001 From: codex Date: Tue, 2 Jun 2026 07:52:19 +0000 Subject: [PATCH 2/3] fix(sidecar): queue restart signals --- bot_bottle/sidecar_init.py | 55 ++++++++++-- ...0034-sidecar-restart-shutdown-semantics.md | 24 ++--- tests/unit/test_sidecar_init.py | 89 ++++++++++++++++--- 3 files changed, 135 insertions(+), 33 deletions(-) diff --git a/bot_bottle/sidecar_init.py b/bot_bottle/sidecar_init.py index 1681c66..8bfa984 100644 --- a/bot_bottle/sidecar_init.py +++ b/bot_bottle/sidecar_init.py @@ -163,6 +163,10 @@ class _Supervisor: # Names of children that have been logged as having exited # so we only log each death once across watch-loop ticks. self._logged_dead: set[str] = set() + # Signal handlers add daemon names here and return quickly. + # The main watch loop drains the set, so repeated restart + # requests for one daemon coalesce into one restart. + self._restart_requested: set[str] = set() def start_all(self) -> None: for spec in self.specs: @@ -173,6 +177,7 @@ class _Supervisor: if self.shutdown_at is not None: return self.shutdown_at = time.monotonic() + self._restart_requested.clear() _log(f"shutting down ({reason}); forwarding SIGTERM") for _, p in self.procs: if p.poll() is None: @@ -181,6 +186,24 @@ class _Supervisor: except ProcessLookupError: pass + def request_restart(self, daemon_name: str) -> bool: + """Queue a daemon restart for the main loop to process. + + Signal handlers use this non-blocking path instead of doing + subprocess lifecycle work directly. Requests coalesce by + daemon name: one pending restart is enough to make the daemon + reread the latest config from disk. + + Returns True iff a daemon by that name is known to the + supervisor and shutdown has not started.""" + if self.shutdown_at is not None: + _log(f"restart {daemon_name} skipped; supervisor is shutting down") + return False + if not any(spec.name == daemon_name for spec, _ in self.procs): + return False + self._restart_requested.add(daemon_name) + return True + def tick(self) -> bool: """One iteration of the watch loop. Returns True when every child has exited and the supervisor can return. @@ -188,6 +211,8 @@ class _Supervisor: A child dying unexpectedly is logged but does NOT initiate shutdown — see the module docstring's failure-policy section. Shutdown is signal-driven only.""" + self._drain_restart_requests() + for spec, p in self.procs: rc = p.poll() if rc is None or spec.name in self._logged_dead: @@ -223,11 +248,29 @@ class _Supervisor: return all(p.poll() is not None for _, p in self.procs) def exit_code(self) -> int: - """Worst child returncode wins. On graceful shutdown every - child is signal-killed (negative returncode) and max() - returns 0; if some child crashed nonzero before the signal - the operator gets that code on container exit.""" - return max((p.returncode for _, p in self.procs), default=0) + """Positive child failures win; otherwise report success. + + Python represents signal-terminated children as negative + return codes. A signal-only graceful shutdown should not leak + that platform-specific detail into the container exit status, + but a positive crash before shutdown should remain visible.""" + positives = [ + p.returncode for _, p in self.procs + if p.returncode is not None and p.returncode > 0 + ] + return max(positives, default=0) + + def _drain_restart_requests(self) -> None: + if self.shutdown_at is not None: + self._restart_requested.clear() + return + requested = tuple(sorted(self._restart_requested)) + self._restart_requested.clear() + for daemon_name in requested: + if self.shutdown_at is not None: + self._restart_requested.clear() + return + self.restart_daemon(daemon_name) def forward_signal(self, sig: int, daemon_name: str) -> bool: """Forward a signal to one named child. Used by the SIGHUP @@ -323,7 +366,7 @@ def main(argv: Sequence[str] | None = None) -> int: # supervisor restarts the pipelock daemon in place (other # daemons keep running — specifically supervise, whose MCP # socket would drop on a whole-container `docker restart`). - signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock")) + signal.signal(signal.SIGUSR1, lambda *_: sup.request_restart("pipelock")) while not sup.tick(): time.sleep(_POLL_INTERVAL) diff --git a/docs/prds/0034-sidecar-restart-shutdown-semantics.md b/docs/prds/0034-sidecar-restart-shutdown-semantics.md index 3ac997c..d291e57 100644 --- a/docs/prds/0034-sidecar-restart-shutdown-semantics.md +++ b/docs/prds/0034-sidecar-restart-shutdown-semantics.md @@ -100,11 +100,11 @@ loop should continue to call `tick()` and sleep on `_POLL_INTERVAL`; `tick()` then performs the actual `restart_daemon("pipelock")` work while normal Python control flow is in the supervisor loop. -Repeated restart requests should not overlap. Either coalescing or FIFO -serialization is acceptable, but the PRD prefers coalescing by daemon name: if -three SIGUSR1 signals arrive before the next loop turn, one pipelock restart is -enough because each restart rereads the latest `pipelock.yaml` from disk. -Document this because it is a semantic choice. +Repeated restart requests should not overlap. Restart requests coalesce by +daemon name: if three SIGUSR1 signals arrive before the next loop turn, one +pipelock restart is enough because each restart rereads the latest +`pipelock.yaml` from disk. This treats SIGUSR1 as "make pipelock reflect the +current config" rather than "run exactly one restart per signal." Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is pending, the supervisor should drop the pending restart and terminate live @@ -116,9 +116,9 @@ between bytecodes and cannot interrupt a single blocking `wait()` until control returns to Python. Exit-code behavior should be documented as "positive failures win, otherwise -return the maximum observed child return code." That matches the current intent: -positive process failures remain visible, while a clean shutdown of only -signal-terminated children does not hide an earlier crash. +return zero." Positive process failures remain visible, while a clean shutdown +of only zero-exit or signal-terminated children returns zero instead of leaking +platform-specific negative signal return codes to the container exit status. ## Implementation Chunks @@ -148,10 +148,4 @@ Also run the full unit suite before merge: ## Open Questions -- Should repeated restart requests be coalesced by daemon name, or should the - supervisor preserve every queued request? Coalescing is simpler and appears - sufficient because pipelock rereads the latest config on restart. -- Should exit-code handling clamp all negative signal return codes to zero - when no positive child failure occurred, or should it continue returning the - maximum raw child return code? The current tests tolerate platform-specific - negative signal codes; tightening this would be a behavior change. +None. diff --git a/tests/unit/test_sidecar_init.py b/tests/unit/test_sidecar_init.py index 569de07..88f0b9b 100644 --- a/tests/unit/test_sidecar_init.py +++ b/tests/unit/test_sidecar_init.py @@ -301,6 +301,64 @@ class TestSupervisor(unittest.TestCase): sup.request_shutdown(reason="cleanup") self._drive(sup) + def test_request_restart_is_drained_by_tick(self): + specs = [ + _DaemonSpec("pipelock", ("/bin/sleep", "30")), + _DaemonSpec("supervise", ("/bin/sleep", "30")), + ] + sup = _Supervisor(specs) + sup.start_all() + time.sleep(0.1) + old_pipelock_pid = sup.procs[0][1].pid + supervise_pid = sup.procs[1][1].pid + + ok = sup.request_restart("pipelock") + self.assertTrue(ok) + # The non-blocking request path only records intent. + self.assertEqual(old_pipelock_pid, sup.procs[0][1].pid) + + done = sup.tick() + self.assertFalse(done) + + self.assertNotEqual(old_pipelock_pid, sup.procs[0][1].pid) + self.assertEqual(supervise_pid, sup.procs[1][1].pid) + + sup.request_shutdown(reason="cleanup") + self._drive(sup) + + def test_repeated_restart_requests_coalesce(self): + specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))] + sup = _Supervisor(specs) + sup.start_all() + time.sleep(0.1) + + self.assertTrue(sup.request_restart("pipelock")) + self.assertTrue(sup.request_restart("pipelock")) + self.assertEqual({"pipelock"}, sup._restart_requested) + + old_pid = sup.procs[0][1].pid + sup.tick() + first_restarted_pid = sup.procs[0][1].pid + self.assertNotEqual(old_pid, first_restarted_pid) + + # A second tick should not restart again; the coalesced + # request was consumed by the first tick. + sup.tick() + self.assertEqual(first_restarted_pid, sup.procs[0][1].pid) + + sup.request_shutdown(reason="cleanup") + self._drive(sup) + + def test_request_restart_unknown_daemon_no_op(self): + specs = [_DaemonSpec("a", ("/bin/sleep", "30"))] + sup = _Supervisor(specs) + sup.start_all() + ok = sup.request_restart("ghost") + self.assertFalse(ok) + self.assertEqual(set(), sup._restart_requested) + sup.request_shutdown(reason="cleanup") + self._drive(sup) + def test_restart_unknown_daemon_no_op(self): specs = [_DaemonSpec("a", ("/bin/sleep", "30"))] sup = _Supervisor(specs) @@ -320,12 +378,24 @@ class TestSupervisor(unittest.TestCase): "must not respawn a daemon during teardown") self._drive(sup) + def test_pending_restart_dropped_during_shutdown(self): + specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))] + sup = _Supervisor(specs) + sup.start_all() + time.sleep(0.1) + old_pid = sup.procs[0][1].pid + + self.assertTrue(sup.request_restart("pipelock")) + sup.request_shutdown(reason="test") + self.assertEqual(set(), sup._restart_requested) + self._drive(sup) + + self.assertEqual(old_pid, sup.procs[0][1].pid) + def test_shutdown_after_start_terminates_children(self): # Two long-running children. Caller requests shutdown; - # both should receive SIGTERM and exit. exit_code() is - # max of (returncodes) — both signal-killed (negative), - # so max() picks 0 in the typical case (or the - # platform-specific signal returncode). + # both should receive SIGTERM and exit. Signal-only + # shutdown clamps to a zero supervisor exit code. specs = [ _DaemonSpec("a", ("/bin/sleep", "60")), _DaemonSpec("b", ("/bin/sleep", "60")), @@ -335,7 +405,7 @@ class TestSupervisor(unittest.TestCase): time.sleep(0.2) # let them actually start sup.request_shutdown(reason="test") rc = self._drive(sup) - self.assertIsNotNone(rc) + self.assertEqual(0, rc) # Both children got the signal — neither survived past # the grace deadline. for _, p in sup.procs: @@ -360,7 +430,7 @@ class TestSupervisor(unittest.TestCase): # Process was SIGKILL'd → returncode -9 on POSIX. self.assertEqual(-9, sup.procs[0][1].returncode) - self.assertIsNotNone(rc) + self.assertEqual(0, rc) def test_idempotent_shutdown_requests(self): specs = [_DaemonSpec("a", ("/bin/sleep", "60"))] @@ -426,12 +496,7 @@ class TestMainEndToEnd(unittest.TestCase): self.assertIn("starting alpha", out) self.assertIn("starting beta", out) self.assertIn("forwarding SIGTERM", out) - # Sleep terminated by SIGTERM exits with returncode -15; - # supervisor surfaces that via max(...) and main() - # returns -15 → process exit becomes 256-15 = 241. - # On macOS bash may convert to 143. Either way, nonzero - # AND the child finished — we don't pin the exact code. - self.assertNotEqual(0, rc) + self.assertEqual(0, rc) def test_empty_daemon_set_exits_zero_immediately(self): # Use a sentinel value that filters out both alpha+beta. -- 2.52.0 From fe6059e4a67ad540aa2431ab14c48ff07c03e261 Mon Sep 17 00:00:00 2001 From: codex Date: Tue, 2 Jun 2026 07:52:38 +0000 Subject: [PATCH 3/3] complete(prd): mark PRD 0034 active --- docs/prds/0034-sidecar-restart-shutdown-semantics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/prds/0034-sidecar-restart-shutdown-semantics.md b/docs/prds/0034-sidecar-restart-shutdown-semantics.md index d291e57..9985b1f 100644 --- a/docs/prds/0034-sidecar-restart-shutdown-semantics.md +++ b/docs/prds/0034-sidecar-restart-shutdown-semantics.md @@ -1,6 +1,6 @@ # PRD 0034: Sidecar Restart and Shutdown Semantics -- **Status:** Draft +- **Status:** Active - **Author:** didericis-codex - **Created:** 2026-06-02 - **Issue:** #126 -- 2.52.0