PRD 0034: Sidecar Restart and Shutdown Semantics #127
@@ -163,6 +163,10 @@ class _Supervisor:
|
||||
# Names of children that have been logged as having exited
|
||||
# so we only log each death once across watch-loop ticks.
|
||||
self._logged_dead: set[str] = set()
|
||||
# Signal handlers add daemon names here and return quickly.
|
||||
# The main watch loop drains the set, so repeated restart
|
||||
# requests for one daemon coalesce into one restart.
|
||||
self._restart_requested: set[str] = set()
|
||||
|
||||
def start_all(self) -> None:
|
||||
for spec in self.specs:
|
||||
@@ -173,6 +177,7 @@ class _Supervisor:
|
||||
if self.shutdown_at is not None:
|
||||
return
|
||||
self.shutdown_at = time.monotonic()
|
||||
self._restart_requested.clear()
|
||||
_log(f"shutting down ({reason}); forwarding SIGTERM")
|
||||
for _, p in self.procs:
|
||||
if p.poll() is None:
|
||||
@@ -181,6 +186,24 @@ class _Supervisor:
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
|
||||
def request_restart(self, daemon_name: str) -> bool:
|
||||
"""Queue a daemon restart for the main loop to process.
|
||||
|
||||
Signal handlers use this non-blocking path instead of doing
|
||||
subprocess lifecycle work directly. Requests coalesce by
|
||||
daemon name: one pending restart is enough to make the daemon
|
||||
reread the latest config from disk.
|
||||
|
||||
Returns True iff a daemon by that name is known to the
|
||||
supervisor and shutdown has not started."""
|
||||
if self.shutdown_at is not None:
|
||||
_log(f"restart {daemon_name} skipped; supervisor is shutting down")
|
||||
return False
|
||||
if not any(spec.name == daemon_name for spec, _ in self.procs):
|
||||
return False
|
||||
self._restart_requested.add(daemon_name)
|
||||
return True
|
||||
|
||||
def tick(self) -> bool:
|
||||
"""One iteration of the watch loop. Returns True when every
|
||||
child has exited and the supervisor can return.
|
||||
@@ -188,6 +211,8 @@ class _Supervisor:
|
||||
A child dying unexpectedly is logged but does NOT initiate
|
||||
shutdown — see the module docstring's failure-policy
|
||||
section. Shutdown is signal-driven only."""
|
||||
self._drain_restart_requests()
|
||||
|
||||
for spec, p in self.procs:
|
||||
rc = p.poll()
|
||||
if rc is None or spec.name in self._logged_dead:
|
||||
@@ -223,11 +248,29 @@ class _Supervisor:
|
||||
return all(p.poll() is not None for _, p in self.procs)
|
||||
|
||||
def exit_code(self) -> int:
|
||||
"""Worst child returncode wins. On graceful shutdown every
|
||||
child is signal-killed (negative returncode) and max()
|
||||
returns 0; if some child crashed nonzero before the signal
|
||||
the operator gets that code on container exit."""
|
||||
return max((p.returncode for _, p in self.procs), default=0)
|
||||
"""Positive child failures win; otherwise report success.
|
||||
|
||||
Python represents signal-terminated children as negative
|
||||
return codes. A signal-only graceful shutdown should not leak
|
||||
that platform-specific detail into the container exit status,
|
||||
but a positive crash before shutdown should remain visible."""
|
||||
positives = [
|
||||
p.returncode for _, p in self.procs
|
||||
if p.returncode is not None and p.returncode > 0
|
||||
]
|
||||
return max(positives, default=0)
|
||||
|
||||
def _drain_restart_requests(self) -> None:
|
||||
if self.shutdown_at is not None:
|
||||
self._restart_requested.clear()
|
||||
return
|
||||
requested = tuple(sorted(self._restart_requested))
|
||||
self._restart_requested.clear()
|
||||
for daemon_name in requested:
|
||||
if self.shutdown_at is not None:
|
||||
self._restart_requested.clear()
|
||||
return
|
||||
self.restart_daemon(daemon_name)
|
||||
|
||||
def forward_signal(self, sig: int, daemon_name: str) -> bool:
|
||||
"""Forward a signal to one named child. Used by the SIGHUP
|
||||
@@ -323,7 +366,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
# supervisor restarts the pipelock daemon in place (other
|
||||
# daemons keep running — specifically supervise, whose MCP
|
||||
# socket would drop on a whole-container `docker restart`).
|
||||
signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock"))
|
||||
signal.signal(signal.SIGUSR1, lambda *_: sup.request_restart("pipelock"))
|
||||
|
||||
while not sup.tick():
|
||||
time.sleep(_POLL_INTERVAL)
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
# PRD 0034: Sidecar Restart and Shutdown Semantics
|
||||
|
||||
- **Status:** Active
|
||||
- **Author:** didericis-codex
|
||||
- **Created:** 2026-06-02
|
||||
- **Issue:** #126
|
||||
|
||||
## Summary
|
||||
|
||||
Make the sidecar bundle supervisor's signal, restart, and exit-code behavior
|
||||
explicit and easier to reason about. In particular, move pipelock restart work
|
||||
out of direct SIGUSR1 handler execution while preserving the caller-visible
|
||||
`docker kill --signal USR1 <bundle>` contract used by pipelock apply flows.
|
||||
|
||||
## Problem
|
||||
|
||||
`bot_bottle/sidecar_init.py` is PID 1 for the bundled sidecar container. It
|
||||
starts egress, pipelock, git-gate/git-http, and supervise; forwards shutdown
|
||||
signals; forwards SIGHUP to egress; and restarts pipelock on SIGUSR1 after
|
||||
allowlist changes.
|
||||
|
||||
The current SIGUSR1 handler calls `sup.restart_daemon("pipelock")` directly.
|
||||
That restart path can terminate a child, wait up to a grace timeout, SIGKILL a
|
||||
stubborn child, spawn a replacement with `subprocess.Popen`, and start a new
|
||||
pump thread. In CPython signal handlers run between bytecodes in the main
|
||||
thread, so this is not the same as POSIX async-signal-unsafe C code, but it
|
||||
still means signal handling can block the supervisor loop for the restart grace
|
||||
window and makes stacked signals harder to reason about.
|
||||
|
||||
The exit-code contract is also easy to misread. `_Supervisor.exit_code()`
|
||||
returns the maximum child return code. That preserves a positive crash code
|
||||
when a child failed before graceful shutdown, but the docstring currently
|
||||
frames graceful shutdown as returning zero because signal-killed children have
|
||||
negative return codes. The implementation is reasonable; the contract needs to
|
||||
be deliberate and tested around crash-then-shutdown behavior.
|
||||
|
||||
## Goals / Success Criteria
|
||||
|
||||
- Preserve the external signal contract:
|
||||
- SIGTERM/SIGINT requests bundle shutdown.
|
||||
- SIGHUP still forwards to the live egress child.
|
||||
- SIGUSR1 still requests an in-place pipelock restart.
|
||||
- Keep signal handlers small: handlers should record intent and return, not
|
||||
perform blocking subprocess lifecycle work directly.
|
||||
- Process queued restart requests from the supervisor's main loop so restart
|
||||
behavior is serialized with `tick()` and shutdown state.
|
||||
- Avoid respawning children after shutdown has started.
|
||||
- Coalesce or serialize repeated pipelock restart requests in a documented way
|
||||
so stacked SIGUSR1 delivery cannot overlap restarts.
|
||||
- Clarify and test aggregate exit-code semantics:
|
||||
- clean unattended exits return zero when every child exits zero.
|
||||
- signal-only shutdown does not invent a positive failure code.
|
||||
- a positive child crash before shutdown remains visible on supervisor exit.
|
||||
- Keep the implementation stdlib-only.
|
||||
|
||||
## Non-goals
|
||||
|
||||
- No new process supervisor dependency such as supervisord, s6, or runit.
|
||||
- No automatic restart policy for arbitrary unexpected child death.
|
||||
- No changes to the bundle's daemon set, daemon argv, env filtering, or Docker
|
||||
compose contract.
|
||||
- No changes to egress route reload semantics beyond preserving SIGHUP
|
||||
forwarding.
|
||||
- No user-facing CLI changes.
|
||||
|
||||
## Scope
|
||||
|
||||
In scope:
|
||||
|
||||
- Internal signal handling and supervisor event-loop structure in
|
||||
`bot_bottle/sidecar_init.py`.
|
||||
- Tests in `tests/unit/test_sidecar_init.py` for queued restart behavior,
|
||||
shutdown/restart ordering, repeated restart requests, and exit-code
|
||||
semantics.
|
||||
- Docstring/comment updates that describe the concrete signal and exit-code
|
||||
contracts.
|
||||
|
||||
Out of scope:
|
||||
|
||||
- Changing pipelock itself to reload config in process.
|
||||
- Restarting egress, git-gate, git-http, or supervise on demand.
|
||||
- Reporting restart events to the supervise MCP plane.
|
||||
- Changing the interim policy that unexpected child death leaves surviving
|
||||
daemons running.
|
||||
|
||||
## Design
|
||||
|
||||
Keep `_Supervisor` as the owner of child process state, but add an explicit
|
||||
pending-action boundary between signal delivery and subprocess lifecycle work.
|
||||
The exact API can be small, for example:
|
||||
|
||||
- `request_shutdown(reason)` keeps its existing idempotent behavior.
|
||||
- `request_restart(daemon_name)` records a pending restart request unless
|
||||
shutdown is already in progress.
|
||||
- `tick()` drains pending restart work before or after child-death logging in a
|
||||
documented order.
|
||||
|
||||
The SIGUSR1 handler should call only the non-blocking request method. The main
|
||||
loop should continue to call `tick()` and sleep on `_POLL_INTERVAL`; `tick()`
|
||||
then performs the actual `restart_daemon("pipelock")` work while normal Python
|
||||
control flow is in the supervisor loop.
|
||||
|
||||
Repeated restart requests should not overlap. Restart requests coalesce by
|
||||
daemon name: if three SIGUSR1 signals arrive before the next loop turn, one
|
||||
pipelock restart is enough because each restart rereads the latest
|
||||
`pipelock.yaml` from disk. This treats SIGUSR1 as "make pipelock reflect the
|
||||
current config" rather than "run exactly one restart per signal."
|
||||
|
||||
Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is
|
||||
pending, the supervisor should drop the pending restart and terminate live
|
||||
children. If shutdown starts while `restart_daemon` is already executing in the
|
||||
main loop, the existing restart operation may finish, but no additional queued
|
||||
restart should start after shutdown state is set. A simpler implementation may
|
||||
check shutdown only before each queued restart, because signal handlers execute
|
||||
between bytecodes and cannot interrupt a single blocking `wait()` until control
|
||||
returns to Python.
|
||||
|
||||
Exit-code behavior should be documented as "positive failures win, otherwise
|
||||
return zero." Positive process failures remain visible, while a clean shutdown
|
||||
of only zero-exit or signal-terminated children returns zero instead of leaking
|
||||
platform-specific negative signal return codes to the container exit status.
|
||||
|
||||
## Implementation Chunks
|
||||
|
||||
1. Add characterization tests for SIGUSR1 queuing, repeated restart coalescing,
|
||||
shutdown dropping pending restarts, and crash-then-shutdown exit codes.
|
||||
2. Add a pending restart request structure to `_Supervisor` and a
|
||||
non-blocking request method.
|
||||
3. Change the SIGUSR1 handler in `main()` to enqueue the pipelock restart
|
||||
instead of calling `restart_daemon` directly.
|
||||
4. Drain pending restarts from `tick()` with shutdown checks and documented
|
||||
ordering.
|
||||
5. Update docstrings and comments around signal handling and `exit_code()`.
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
Run the existing sidecar unit tests:
|
||||
|
||||
- `python3 -m unittest tests.unit.test_sidecar_init`
|
||||
|
||||
Add focused unit tests that avoid process-wide signal handler races where
|
||||
possible by driving `_Supervisor` directly. End-to-end signal tests can remain
|
||||
limited to `main()` behavior that cannot be exercised otherwise.
|
||||
|
||||
Also run the full unit suite before merge:
|
||||
|
||||
- `python3 -m unittest discover -s tests/unit`
|
||||
|
||||
## Open Questions
|
||||
|
||||
None.
|
||||
@@ -301,6 +301,64 @@ class TestSupervisor(unittest.TestCase):
|
||||
sup.request_shutdown(reason="cleanup")
|
||||
self._drive(sup)
|
||||
|
||||
def test_request_restart_is_drained_by_tick(self):
|
||||
specs = [
|
||||
_DaemonSpec("pipelock", ("/bin/sleep", "30")),
|
||||
_DaemonSpec("supervise", ("/bin/sleep", "30")),
|
||||
]
|
||||
sup = _Supervisor(specs)
|
||||
sup.start_all()
|
||||
time.sleep(0.1)
|
||||
old_pipelock_pid = sup.procs[0][1].pid
|
||||
supervise_pid = sup.procs[1][1].pid
|
||||
|
||||
ok = sup.request_restart("pipelock")
|
||||
self.assertTrue(ok)
|
||||
# The non-blocking request path only records intent.
|
||||
self.assertEqual(old_pipelock_pid, sup.procs[0][1].pid)
|
||||
|
||||
done = sup.tick()
|
||||
self.assertFalse(done)
|
||||
|
||||
self.assertNotEqual(old_pipelock_pid, sup.procs[0][1].pid)
|
||||
self.assertEqual(supervise_pid, sup.procs[1][1].pid)
|
||||
|
||||
sup.request_shutdown(reason="cleanup")
|
||||
self._drive(sup)
|
||||
|
||||
def test_repeated_restart_requests_coalesce(self):
|
||||
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
|
||||
sup = _Supervisor(specs)
|
||||
sup.start_all()
|
||||
time.sleep(0.1)
|
||||
|
||||
self.assertTrue(sup.request_restart("pipelock"))
|
||||
self.assertTrue(sup.request_restart("pipelock"))
|
||||
self.assertEqual({"pipelock"}, sup._restart_requested)
|
||||
|
||||
old_pid = sup.procs[0][1].pid
|
||||
sup.tick()
|
||||
first_restarted_pid = sup.procs[0][1].pid
|
||||
self.assertNotEqual(old_pid, first_restarted_pid)
|
||||
|
||||
# A second tick should not restart again; the coalesced
|
||||
# request was consumed by the first tick.
|
||||
sup.tick()
|
||||
self.assertEqual(first_restarted_pid, sup.procs[0][1].pid)
|
||||
|
||||
sup.request_shutdown(reason="cleanup")
|
||||
self._drive(sup)
|
||||
|
||||
def test_request_restart_unknown_daemon_no_op(self):
|
||||
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
|
||||
sup = _Supervisor(specs)
|
||||
sup.start_all()
|
||||
ok = sup.request_restart("ghost")
|
||||
self.assertFalse(ok)
|
||||
self.assertEqual(set(), sup._restart_requested)
|
||||
sup.request_shutdown(reason="cleanup")
|
||||
self._drive(sup)
|
||||
|
||||
def test_restart_unknown_daemon_no_op(self):
|
||||
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
|
||||
sup = _Supervisor(specs)
|
||||
@@ -320,12 +378,24 @@ class TestSupervisor(unittest.TestCase):
|
||||
"must not respawn a daemon during teardown")
|
||||
self._drive(sup)
|
||||
|
||||
def test_pending_restart_dropped_during_shutdown(self):
|
||||
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
|
||||
sup = _Supervisor(specs)
|
||||
sup.start_all()
|
||||
time.sleep(0.1)
|
||||
old_pid = sup.procs[0][1].pid
|
||||
|
||||
self.assertTrue(sup.request_restart("pipelock"))
|
||||
sup.request_shutdown(reason="test")
|
||||
self.assertEqual(set(), sup._restart_requested)
|
||||
self._drive(sup)
|
||||
|
||||
self.assertEqual(old_pid, sup.procs[0][1].pid)
|
||||
|
||||
def test_shutdown_after_start_terminates_children(self):
|
||||
# Two long-running children. Caller requests shutdown;
|
||||
# both should receive SIGTERM and exit. exit_code() is
|
||||
# max of (returncodes) — both signal-killed (negative),
|
||||
# so max() picks 0 in the typical case (or the
|
||||
# platform-specific signal returncode).
|
||||
# both should receive SIGTERM and exit. Signal-only
|
||||
# shutdown clamps to a zero supervisor exit code.
|
||||
specs = [
|
||||
_DaemonSpec("a", ("/bin/sleep", "60")),
|
||||
_DaemonSpec("b", ("/bin/sleep", "60")),
|
||||
@@ -335,7 +405,7 @@ class TestSupervisor(unittest.TestCase):
|
||||
time.sleep(0.2) # let them actually start
|
||||
sup.request_shutdown(reason="test")
|
||||
rc = self._drive(sup)
|
||||
self.assertIsNotNone(rc)
|
||||
self.assertEqual(0, rc)
|
||||
# Both children got the signal — neither survived past
|
||||
# the grace deadline.
|
||||
for _, p in sup.procs:
|
||||
@@ -360,7 +430,7 @@ class TestSupervisor(unittest.TestCase):
|
||||
|
||||
# Process was SIGKILL'd → returncode -9 on POSIX.
|
||||
self.assertEqual(-9, sup.procs[0][1].returncode)
|
||||
self.assertIsNotNone(rc)
|
||||
self.assertEqual(0, rc)
|
||||
|
||||
def test_idempotent_shutdown_requests(self):
|
||||
specs = [_DaemonSpec("a", ("/bin/sleep", "60"))]
|
||||
@@ -426,12 +496,7 @@ class TestMainEndToEnd(unittest.TestCase):
|
||||
self.assertIn("starting alpha", out)
|
||||
self.assertIn("starting beta", out)
|
||||
self.assertIn("forwarding SIGTERM", out)
|
||||
# Sleep terminated by SIGTERM exits with returncode -15;
|
||||
# supervisor surfaces that via max(...) and main()
|
||||
# returns -15 → process exit becomes 256-15 = 241.
|
||||
# On macOS bash may convert to 143. Either way, nonzero
|
||||
# AND the child finished — we don't pin the exact code.
|
||||
self.assertNotEqual(0, rc)
|
||||
self.assertEqual(0, rc)
|
||||
|
||||
def test_empty_daemon_set_exits_zero_immediately(self):
|
||||
# Use a sentinel value that filters out both alpha+beta.
|
||||
|
||||
Reference in New Issue
Block a user