PRD 0034: Sidecar Restart and Shutdown Semantics #127

Merged
didericis merged 3 commits from prd-0034-sidecar-restart-shutdown-semantics into main 2026-06-02 03:55:06 -04:00
3 changed files with 277 additions and 18 deletions
+49 -6
View File
@@ -163,6 +163,10 @@ class _Supervisor:
# Names of children that have been logged as having exited # Names of children that have been logged as having exited
# so we only log each death once across watch-loop ticks. # so we only log each death once across watch-loop ticks.
self._logged_dead: set[str] = set() self._logged_dead: set[str] = set()
# Signal handlers add daemon names here and return quickly.
# The main watch loop drains the set, so repeated restart
# requests for one daemon coalesce into one restart.
self._restart_requested: set[str] = set()
def start_all(self) -> None: def start_all(self) -> None:
for spec in self.specs: for spec in self.specs:
@@ -173,6 +177,7 @@ class _Supervisor:
if self.shutdown_at is not None: if self.shutdown_at is not None:
return return
self.shutdown_at = time.monotonic() self.shutdown_at = time.monotonic()
self._restart_requested.clear()
_log(f"shutting down ({reason}); forwarding SIGTERM") _log(f"shutting down ({reason}); forwarding SIGTERM")
for _, p in self.procs: for _, p in self.procs:
if p.poll() is None: if p.poll() is None:
@@ -181,6 +186,24 @@ class _Supervisor:
except ProcessLookupError: except ProcessLookupError:
pass pass
def request_restart(self, daemon_name: str) -> bool:
"""Queue a daemon restart for the main loop to process.
Signal handlers use this non-blocking path instead of doing
subprocess lifecycle work directly. Requests coalesce by
daemon name: one pending restart is enough to make the daemon
reread the latest config from disk.
Returns True iff a daemon by that name is known to the
supervisor and shutdown has not started."""
if self.shutdown_at is not None:
_log(f"restart {daemon_name} skipped; supervisor is shutting down")
return False
if not any(spec.name == daemon_name for spec, _ in self.procs):
return False
self._restart_requested.add(daemon_name)
return True
def tick(self) -> bool: def tick(self) -> bool:
"""One iteration of the watch loop. Returns True when every """One iteration of the watch loop. Returns True when every
child has exited and the supervisor can return. child has exited and the supervisor can return.
@@ -188,6 +211,8 @@ class _Supervisor:
A child dying unexpectedly is logged but does NOT initiate A child dying unexpectedly is logged but does NOT initiate
shutdown — see the module docstring's failure-policy shutdown — see the module docstring's failure-policy
section. Shutdown is signal-driven only.""" section. Shutdown is signal-driven only."""
self._drain_restart_requests()
for spec, p in self.procs: for spec, p in self.procs:
rc = p.poll() rc = p.poll()
if rc is None or spec.name in self._logged_dead: if rc is None or spec.name in self._logged_dead:
@@ -223,11 +248,29 @@ class _Supervisor:
return all(p.poll() is not None for _, p in self.procs) return all(p.poll() is not None for _, p in self.procs)
def exit_code(self) -> int: def exit_code(self) -> int:
"""Worst child returncode wins. On graceful shutdown every """Positive child failures win; otherwise report success.
child is signal-killed (negative returncode) and max()
returns 0; if some child crashed nonzero before the signal Python represents signal-terminated children as negative
the operator gets that code on container exit.""" return codes. A signal-only graceful shutdown should not leak
return max((p.returncode for _, p in self.procs), default=0) that platform-specific detail into the container exit status,
but a positive crash before shutdown should remain visible."""
positives = [
p.returncode for _, p in self.procs
if p.returncode is not None and p.returncode > 0
]
return max(positives, default=0)
def _drain_restart_requests(self) -> None:
if self.shutdown_at is not None:
self._restart_requested.clear()
return
requested = tuple(sorted(self._restart_requested))
self._restart_requested.clear()
for daemon_name in requested:
if self.shutdown_at is not None:
self._restart_requested.clear()
return
self.restart_daemon(daemon_name)
def forward_signal(self, sig: int, daemon_name: str) -> bool: def forward_signal(self, sig: int, daemon_name: str) -> bool:
"""Forward a signal to one named child. Used by the SIGHUP """Forward a signal to one named child. Used by the SIGHUP
@@ -323,7 +366,7 @@ def main(argv: Sequence[str] | None = None) -> int:
# supervisor restarts the pipelock daemon in place (other # supervisor restarts the pipelock daemon in place (other
# daemons keep running — specifically supervise, whose MCP # daemons keep running — specifically supervise, whose MCP
# socket would drop on a whole-container `docker restart`). # socket would drop on a whole-container `docker restart`).
signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock")) signal.signal(signal.SIGUSR1, lambda *_: sup.request_restart("pipelock"))
while not sup.tick(): while not sup.tick():
time.sleep(_POLL_INTERVAL) time.sleep(_POLL_INTERVAL)
@@ -0,0 +1,151 @@
# PRD 0034: Sidecar Restart and Shutdown Semantics
- **Status:** Active
- **Author:** didericis-codex
- **Created:** 2026-06-02
- **Issue:** #126
## Summary
Make the sidecar bundle supervisor's signal, restart, and exit-code behavior
explicit and easier to reason about. In particular, move pipelock restart work
out of direct SIGUSR1 handler execution while preserving the caller-visible
`docker kill --signal USR1 <bundle>` contract used by pipelock apply flows.
## Problem
`bot_bottle/sidecar_init.py` is PID 1 for the bundled sidecar container. It
starts egress, pipelock, git-gate/git-http, and supervise; forwards shutdown
signals; forwards SIGHUP to egress; and restarts pipelock on SIGUSR1 after
allowlist changes.
The current SIGUSR1 handler calls `sup.restart_daemon("pipelock")` directly.
That restart path can terminate a child, wait up to a grace timeout, SIGKILL a
stubborn child, spawn a replacement with `subprocess.Popen`, and start a new
pump thread. In CPython signal handlers run between bytecodes in the main
thread, so this is not the same as POSIX async-signal-unsafe C code, but it
still means signal handling can block the supervisor loop for the restart grace
window and makes stacked signals harder to reason about.
The exit-code contract is also easy to misread. `_Supervisor.exit_code()`
returns the maximum child return code. That preserves a positive crash code
when a child failed before graceful shutdown, but the docstring currently
frames graceful shutdown as returning zero because signal-killed children have
negative return codes. The implementation is reasonable; the contract needs to
be deliberate and tested around crash-then-shutdown behavior.
## Goals / Success Criteria
- Preserve the external signal contract:
- SIGTERM/SIGINT requests bundle shutdown.
- SIGHUP still forwards to the live egress child.
- SIGUSR1 still requests an in-place pipelock restart.
- Keep signal handlers small: handlers should record intent and return, not
perform blocking subprocess lifecycle work directly.
- Process queued restart requests from the supervisor's main loop so restart
behavior is serialized with `tick()` and shutdown state.
- Avoid respawning children after shutdown has started.
- Coalesce or serialize repeated pipelock restart requests in a documented way
so stacked SIGUSR1 delivery cannot overlap restarts.
- Clarify and test aggregate exit-code semantics:
- clean unattended exits return zero when every child exits zero.
- signal-only shutdown does not invent a positive failure code.
- a positive child crash before shutdown remains visible on supervisor exit.
- Keep the implementation stdlib-only.
## Non-goals
- No new process supervisor dependency such as supervisord, s6, or runit.
- No automatic restart policy for arbitrary unexpected child death.
- No changes to the bundle's daemon set, daemon argv, env filtering, or Docker
compose contract.
- No changes to egress route reload semantics beyond preserving SIGHUP
forwarding.
- No user-facing CLI changes.
## Scope
In scope:
- Internal signal handling and supervisor event-loop structure in
`bot_bottle/sidecar_init.py`.
- Tests in `tests/unit/test_sidecar_init.py` for queued restart behavior,
shutdown/restart ordering, repeated restart requests, and exit-code
semantics.
- Docstring/comment updates that describe the concrete signal and exit-code
contracts.
Out of scope:
- Changing pipelock itself to reload config in process.
- Restarting egress, git-gate, git-http, or supervise on demand.
- Reporting restart events to the supervise MCP plane.
- Changing the interim policy that unexpected child death leaves surviving
daemons running.
## Design
Keep `_Supervisor` as the owner of child process state, but add an explicit
pending-action boundary between signal delivery and subprocess lifecycle work.
The exact API can be small, for example:
- `request_shutdown(reason)` keeps its existing idempotent behavior.
- `request_restart(daemon_name)` records a pending restart request unless
shutdown is already in progress.
- `tick()` drains pending restart work before or after child-death logging in a
documented order.
The SIGUSR1 handler should call only the non-blocking request method. The main
loop should continue to call `tick()` and sleep on `_POLL_INTERVAL`; `tick()`
then performs the actual `restart_daemon("pipelock")` work while normal Python
control flow is in the supervisor loop.
Repeated restart requests should not overlap. Restart requests coalesce by
daemon name: if three SIGUSR1 signals arrive before the next loop turn, one
pipelock restart is enough because each restart rereads the latest
`pipelock.yaml` from disk. This treats SIGUSR1 as "make pipelock reflect the
current config" rather than "run exactly one restart per signal."
Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is
pending, the supervisor should drop the pending restart and terminate live
children. If shutdown starts while `restart_daemon` is already executing in the
main loop, the existing restart operation may finish, but no additional queued
restart should start after shutdown state is set. A simpler implementation may
check shutdown only before each queued restart, because signal handlers execute
between bytecodes and cannot interrupt a single blocking `wait()` until control
returns to Python.
Exit-code behavior should be documented as "positive failures win, otherwise
return zero." Positive process failures remain visible, while a clean shutdown
of only zero-exit or signal-terminated children returns zero instead of leaking
platform-specific negative signal return codes to the container exit status.
## Implementation Chunks
1. Add characterization tests for SIGUSR1 queuing, repeated restart coalescing,
shutdown dropping pending restarts, and crash-then-shutdown exit codes.
2. Add a pending restart request structure to `_Supervisor` and a
non-blocking request method.
3. Change the SIGUSR1 handler in `main()` to enqueue the pipelock restart
instead of calling `restart_daemon` directly.
4. Drain pending restarts from `tick()` with shutdown checks and documented
ordering.
5. Update docstrings and comments around signal handling and `exit_code()`.
## Testing Strategy
Run the existing sidecar unit tests:
- `python3 -m unittest tests.unit.test_sidecar_init`
Add focused unit tests that avoid process-wide signal handler races where
possible by driving `_Supervisor` directly. End-to-end signal tests can remain
limited to `main()` behavior that cannot be exercised otherwise.
Also run the full unit suite before merge:
- `python3 -m unittest discover -s tests/unit`
## Open Questions
None.
+77 -12
View File
@@ -301,6 +301,64 @@ class TestSupervisor(unittest.TestCase):
sup.request_shutdown(reason="cleanup") sup.request_shutdown(reason="cleanup")
self._drive(sup) self._drive(sup)
def test_request_restart_is_drained_by_tick(self):
specs = [
_DaemonSpec("pipelock", ("/bin/sleep", "30")),
_DaemonSpec("supervise", ("/bin/sleep", "30")),
]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
old_pipelock_pid = sup.procs[0][1].pid
supervise_pid = sup.procs[1][1].pid
ok = sup.request_restart("pipelock")
self.assertTrue(ok)
# The non-blocking request path only records intent.
self.assertEqual(old_pipelock_pid, sup.procs[0][1].pid)
done = sup.tick()
self.assertFalse(done)
self.assertNotEqual(old_pipelock_pid, sup.procs[0][1].pid)
self.assertEqual(supervise_pid, sup.procs[1][1].pid)
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_repeated_restart_requests_coalesce(self):
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
self.assertTrue(sup.request_restart("pipelock"))
self.assertTrue(sup.request_restart("pipelock"))
self.assertEqual({"pipelock"}, sup._restart_requested)
old_pid = sup.procs[0][1].pid
sup.tick()
first_restarted_pid = sup.procs[0][1].pid
self.assertNotEqual(old_pid, first_restarted_pid)
# A second tick should not restart again; the coalesced
# request was consumed by the first tick.
sup.tick()
self.assertEqual(first_restarted_pid, sup.procs[0][1].pid)
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_request_restart_unknown_daemon_no_op(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
ok = sup.request_restart("ghost")
self.assertFalse(ok)
self.assertEqual(set(), sup._restart_requested)
sup.request_shutdown(reason="cleanup")
self._drive(sup)
def test_restart_unknown_daemon_no_op(self): def test_restart_unknown_daemon_no_op(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))] specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
sup = _Supervisor(specs) sup = _Supervisor(specs)
@@ -320,12 +378,24 @@ class TestSupervisor(unittest.TestCase):
"must not respawn a daemon during teardown") "must not respawn a daemon during teardown")
self._drive(sup) self._drive(sup)
def test_pending_restart_dropped_during_shutdown(self):
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
old_pid = sup.procs[0][1].pid
self.assertTrue(sup.request_restart("pipelock"))
sup.request_shutdown(reason="test")
self.assertEqual(set(), sup._restart_requested)
self._drive(sup)
self.assertEqual(old_pid, sup.procs[0][1].pid)
def test_shutdown_after_start_terminates_children(self): def test_shutdown_after_start_terminates_children(self):
# Two long-running children. Caller requests shutdown; # Two long-running children. Caller requests shutdown;
# both should receive SIGTERM and exit. exit_code() is # both should receive SIGTERM and exit. Signal-only
# max of (returncodes) — both signal-killed (negative), # shutdown clamps to a zero supervisor exit code.
# so max() picks 0 in the typical case (or the
# platform-specific signal returncode).
specs = [ specs = [
_DaemonSpec("a", ("/bin/sleep", "60")), _DaemonSpec("a", ("/bin/sleep", "60")),
_DaemonSpec("b", ("/bin/sleep", "60")), _DaemonSpec("b", ("/bin/sleep", "60")),
@@ -335,7 +405,7 @@ class TestSupervisor(unittest.TestCase):
time.sleep(0.2) # let them actually start time.sleep(0.2) # let them actually start
sup.request_shutdown(reason="test") sup.request_shutdown(reason="test")
rc = self._drive(sup) rc = self._drive(sup)
self.assertIsNotNone(rc) self.assertEqual(0, rc)
# Both children got the signal — neither survived past # Both children got the signal — neither survived past
# the grace deadline. # the grace deadline.
for _, p in sup.procs: for _, p in sup.procs:
@@ -360,7 +430,7 @@ class TestSupervisor(unittest.TestCase):
# Process was SIGKILL'd → returncode -9 on POSIX. # Process was SIGKILL'd → returncode -9 on POSIX.
self.assertEqual(-9, sup.procs[0][1].returncode) self.assertEqual(-9, sup.procs[0][1].returncode)
self.assertIsNotNone(rc) self.assertEqual(0, rc)
def test_idempotent_shutdown_requests(self): def test_idempotent_shutdown_requests(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "60"))] specs = [_DaemonSpec("a", ("/bin/sleep", "60"))]
@@ -426,12 +496,7 @@ class TestMainEndToEnd(unittest.TestCase):
self.assertIn("starting alpha", out) self.assertIn("starting alpha", out)
self.assertIn("starting beta", out) self.assertIn("starting beta", out)
self.assertIn("forwarding SIGTERM", out) self.assertIn("forwarding SIGTERM", out)
# Sleep terminated by SIGTERM exits with returncode -15; self.assertEqual(0, rc)
# supervisor surfaces that via max(...) and main()
# returns -15 → process exit becomes 256-15 = 241.
# On macOS bash may convert to 143. Either way, nonzero
# AND the child finished — we don't pin the exact code.
self.assertNotEqual(0, rc)
def test_empty_daemon_set_exits_zero_immediately(self): def test_empty_daemon_set_exits_zero_immediately(self):
# Use a sentinel value that filters out both alpha+beta. # Use a sentinel value that filters out both alpha+beta.