fix(sidecar): queue restart signals
This commit is contained in:
@@ -163,6 +163,10 @@ class _Supervisor:
|
|||||||
# Names of children that have been logged as having exited
|
# Names of children that have been logged as having exited
|
||||||
# so we only log each death once across watch-loop ticks.
|
# so we only log each death once across watch-loop ticks.
|
||||||
self._logged_dead: set[str] = set()
|
self._logged_dead: set[str] = set()
|
||||||
|
# Signal handlers add daemon names here and return quickly.
|
||||||
|
# The main watch loop drains the set, so repeated restart
|
||||||
|
# requests for one daemon coalesce into one restart.
|
||||||
|
self._restart_requested: set[str] = set()
|
||||||
|
|
||||||
def start_all(self) -> None:
|
def start_all(self) -> None:
|
||||||
for spec in self.specs:
|
for spec in self.specs:
|
||||||
@@ -173,6 +177,7 @@ class _Supervisor:
|
|||||||
if self.shutdown_at is not None:
|
if self.shutdown_at is not None:
|
||||||
return
|
return
|
||||||
self.shutdown_at = time.monotonic()
|
self.shutdown_at = time.monotonic()
|
||||||
|
self._restart_requested.clear()
|
||||||
_log(f"shutting down ({reason}); forwarding SIGTERM")
|
_log(f"shutting down ({reason}); forwarding SIGTERM")
|
||||||
for _, p in self.procs:
|
for _, p in self.procs:
|
||||||
if p.poll() is None:
|
if p.poll() is None:
|
||||||
@@ -181,6 +186,24 @@ class _Supervisor:
|
|||||||
except ProcessLookupError:
|
except ProcessLookupError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def request_restart(self, daemon_name: str) -> bool:
|
||||||
|
"""Queue a daemon restart for the main loop to process.
|
||||||
|
|
||||||
|
Signal handlers use this non-blocking path instead of doing
|
||||||
|
subprocess lifecycle work directly. Requests coalesce by
|
||||||
|
daemon name: one pending restart is enough to make the daemon
|
||||||
|
reread the latest config from disk.
|
||||||
|
|
||||||
|
Returns True iff a daemon by that name is known to the
|
||||||
|
supervisor and shutdown has not started."""
|
||||||
|
if self.shutdown_at is not None:
|
||||||
|
_log(f"restart {daemon_name} skipped; supervisor is shutting down")
|
||||||
|
return False
|
||||||
|
if not any(spec.name == daemon_name for spec, _ in self.procs):
|
||||||
|
return False
|
||||||
|
self._restart_requested.add(daemon_name)
|
||||||
|
return True
|
||||||
|
|
||||||
def tick(self) -> bool:
|
def tick(self) -> bool:
|
||||||
"""One iteration of the watch loop. Returns True when every
|
"""One iteration of the watch loop. Returns True when every
|
||||||
child has exited and the supervisor can return.
|
child has exited and the supervisor can return.
|
||||||
@@ -188,6 +211,8 @@ class _Supervisor:
|
|||||||
A child dying unexpectedly is logged but does NOT initiate
|
A child dying unexpectedly is logged but does NOT initiate
|
||||||
shutdown — see the module docstring's failure-policy
|
shutdown — see the module docstring's failure-policy
|
||||||
section. Shutdown is signal-driven only."""
|
section. Shutdown is signal-driven only."""
|
||||||
|
self._drain_restart_requests()
|
||||||
|
|
||||||
for spec, p in self.procs:
|
for spec, p in self.procs:
|
||||||
rc = p.poll()
|
rc = p.poll()
|
||||||
if rc is None or spec.name in self._logged_dead:
|
if rc is None or spec.name in self._logged_dead:
|
||||||
@@ -223,11 +248,29 @@ class _Supervisor:
|
|||||||
return all(p.poll() is not None for _, p in self.procs)
|
return all(p.poll() is not None for _, p in self.procs)
|
||||||
|
|
||||||
def exit_code(self) -> int:
|
def exit_code(self) -> int:
|
||||||
"""Worst child returncode wins. On graceful shutdown every
|
"""Positive child failures win; otherwise report success.
|
||||||
child is signal-killed (negative returncode) and max()
|
|
||||||
returns 0; if some child crashed nonzero before the signal
|
Python represents signal-terminated children as negative
|
||||||
the operator gets that code on container exit."""
|
return codes. A signal-only graceful shutdown should not leak
|
||||||
return max((p.returncode for _, p in self.procs), default=0)
|
that platform-specific detail into the container exit status,
|
||||||
|
but a positive crash before shutdown should remain visible."""
|
||||||
|
positives = [
|
||||||
|
p.returncode for _, p in self.procs
|
||||||
|
if p.returncode is not None and p.returncode > 0
|
||||||
|
]
|
||||||
|
return max(positives, default=0)
|
||||||
|
|
||||||
|
def _drain_restart_requests(self) -> None:
|
||||||
|
if self.shutdown_at is not None:
|
||||||
|
self._restart_requested.clear()
|
||||||
|
return
|
||||||
|
requested = tuple(sorted(self._restart_requested))
|
||||||
|
self._restart_requested.clear()
|
||||||
|
for daemon_name in requested:
|
||||||
|
if self.shutdown_at is not None:
|
||||||
|
self._restart_requested.clear()
|
||||||
|
return
|
||||||
|
self.restart_daemon(daemon_name)
|
||||||
|
|
||||||
def forward_signal(self, sig: int, daemon_name: str) -> bool:
|
def forward_signal(self, sig: int, daemon_name: str) -> bool:
|
||||||
"""Forward a signal to one named child. Used by the SIGHUP
|
"""Forward a signal to one named child. Used by the SIGHUP
|
||||||
@@ -323,7 +366,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
# supervisor restarts the pipelock daemon in place (other
|
# supervisor restarts the pipelock daemon in place (other
|
||||||
# daemons keep running — specifically supervise, whose MCP
|
# daemons keep running — specifically supervise, whose MCP
|
||||||
# socket would drop on a whole-container `docker restart`).
|
# socket would drop on a whole-container `docker restart`).
|
||||||
signal.signal(signal.SIGUSR1, lambda *_: sup.restart_daemon("pipelock"))
|
signal.signal(signal.SIGUSR1, lambda *_: sup.request_restart("pipelock"))
|
||||||
|
|
||||||
while not sup.tick():
|
while not sup.tick():
|
||||||
time.sleep(_POLL_INTERVAL)
|
time.sleep(_POLL_INTERVAL)
|
||||||
|
|||||||
@@ -100,11 +100,11 @@ loop should continue to call `tick()` and sleep on `_POLL_INTERVAL`; `tick()`
|
|||||||
then performs the actual `restart_daemon("pipelock")` work while normal Python
|
then performs the actual `restart_daemon("pipelock")` work while normal Python
|
||||||
control flow is in the supervisor loop.
|
control flow is in the supervisor loop.
|
||||||
|
|
||||||
Repeated restart requests should not overlap. Either coalescing or FIFO
|
Repeated restart requests should not overlap. Restart requests coalesce by
|
||||||
serialization is acceptable, but the PRD prefers coalescing by daemon name: if
|
daemon name: if three SIGUSR1 signals arrive before the next loop turn, one
|
||||||
three SIGUSR1 signals arrive before the next loop turn, one pipelock restart is
|
pipelock restart is enough because each restart rereads the latest
|
||||||
enough because each restart rereads the latest `pipelock.yaml` from disk.
|
`pipelock.yaml` from disk. This treats SIGUSR1 as "make pipelock reflect the
|
||||||
Document this because it is a semantic choice.
|
current config" rather than "run exactly one restart per signal."
|
||||||
|
|
||||||
Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is
|
Shutdown wins over restart. If SIGTERM/SIGINT is received while a restart is
|
||||||
pending, the supervisor should drop the pending restart and terminate live
|
pending, the supervisor should drop the pending restart and terminate live
|
||||||
@@ -116,9 +116,9 @@ between bytecodes and cannot interrupt a single blocking `wait()` until control
|
|||||||
returns to Python.
|
returns to Python.
|
||||||
|
|
||||||
Exit-code behavior should be documented as "positive failures win, otherwise
|
Exit-code behavior should be documented as "positive failures win, otherwise
|
||||||
return the maximum observed child return code." That matches the current intent:
|
return zero." Positive process failures remain visible, while a clean shutdown
|
||||||
positive process failures remain visible, while a clean shutdown of only
|
of only zero-exit or signal-terminated children returns zero instead of leaking
|
||||||
signal-terminated children does not hide an earlier crash.
|
platform-specific negative signal return codes to the container exit status.
|
||||||
|
|
||||||
## Implementation Chunks
|
## Implementation Chunks
|
||||||
|
|
||||||
@@ -148,10 +148,4 @@ Also run the full unit suite before merge:
|
|||||||
|
|
||||||
## Open Questions
|
## Open Questions
|
||||||
|
|
||||||
- Should repeated restart requests be coalesced by daemon name, or should the
|
None.
|
||||||
supervisor preserve every queued request? Coalescing is simpler and appears
|
|
||||||
sufficient because pipelock rereads the latest config on restart.
|
|
||||||
- Should exit-code handling clamp all negative signal return codes to zero
|
|
||||||
when no positive child failure occurred, or should it continue returning the
|
|
||||||
maximum raw child return code? The current tests tolerate platform-specific
|
|
||||||
negative signal codes; tightening this would be a behavior change.
|
|
||||||
|
|||||||
@@ -301,6 +301,64 @@ class TestSupervisor(unittest.TestCase):
|
|||||||
sup.request_shutdown(reason="cleanup")
|
sup.request_shutdown(reason="cleanup")
|
||||||
self._drive(sup)
|
self._drive(sup)
|
||||||
|
|
||||||
|
def test_request_restart_is_drained_by_tick(self):
|
||||||
|
specs = [
|
||||||
|
_DaemonSpec("pipelock", ("/bin/sleep", "30")),
|
||||||
|
_DaemonSpec("supervise", ("/bin/sleep", "30")),
|
||||||
|
]
|
||||||
|
sup = _Supervisor(specs)
|
||||||
|
sup.start_all()
|
||||||
|
time.sleep(0.1)
|
||||||
|
old_pipelock_pid = sup.procs[0][1].pid
|
||||||
|
supervise_pid = sup.procs[1][1].pid
|
||||||
|
|
||||||
|
ok = sup.request_restart("pipelock")
|
||||||
|
self.assertTrue(ok)
|
||||||
|
# The non-blocking request path only records intent.
|
||||||
|
self.assertEqual(old_pipelock_pid, sup.procs[0][1].pid)
|
||||||
|
|
||||||
|
done = sup.tick()
|
||||||
|
self.assertFalse(done)
|
||||||
|
|
||||||
|
self.assertNotEqual(old_pipelock_pid, sup.procs[0][1].pid)
|
||||||
|
self.assertEqual(supervise_pid, sup.procs[1][1].pid)
|
||||||
|
|
||||||
|
sup.request_shutdown(reason="cleanup")
|
||||||
|
self._drive(sup)
|
||||||
|
|
||||||
|
def test_repeated_restart_requests_coalesce(self):
|
||||||
|
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
|
||||||
|
sup = _Supervisor(specs)
|
||||||
|
sup.start_all()
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
self.assertTrue(sup.request_restart("pipelock"))
|
||||||
|
self.assertTrue(sup.request_restart("pipelock"))
|
||||||
|
self.assertEqual({"pipelock"}, sup._restart_requested)
|
||||||
|
|
||||||
|
old_pid = sup.procs[0][1].pid
|
||||||
|
sup.tick()
|
||||||
|
first_restarted_pid = sup.procs[0][1].pid
|
||||||
|
self.assertNotEqual(old_pid, first_restarted_pid)
|
||||||
|
|
||||||
|
# A second tick should not restart again; the coalesced
|
||||||
|
# request was consumed by the first tick.
|
||||||
|
sup.tick()
|
||||||
|
self.assertEqual(first_restarted_pid, sup.procs[0][1].pid)
|
||||||
|
|
||||||
|
sup.request_shutdown(reason="cleanup")
|
||||||
|
self._drive(sup)
|
||||||
|
|
||||||
|
def test_request_restart_unknown_daemon_no_op(self):
|
||||||
|
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
|
||||||
|
sup = _Supervisor(specs)
|
||||||
|
sup.start_all()
|
||||||
|
ok = sup.request_restart("ghost")
|
||||||
|
self.assertFalse(ok)
|
||||||
|
self.assertEqual(set(), sup._restart_requested)
|
||||||
|
sup.request_shutdown(reason="cleanup")
|
||||||
|
self._drive(sup)
|
||||||
|
|
||||||
def test_restart_unknown_daemon_no_op(self):
|
def test_restart_unknown_daemon_no_op(self):
|
||||||
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
|
specs = [_DaemonSpec("a", ("/bin/sleep", "30"))]
|
||||||
sup = _Supervisor(specs)
|
sup = _Supervisor(specs)
|
||||||
@@ -320,12 +378,24 @@ class TestSupervisor(unittest.TestCase):
|
|||||||
"must not respawn a daemon during teardown")
|
"must not respawn a daemon during teardown")
|
||||||
self._drive(sup)
|
self._drive(sup)
|
||||||
|
|
||||||
|
def test_pending_restart_dropped_during_shutdown(self):
|
||||||
|
specs = [_DaemonSpec("pipelock", ("/bin/sleep", "30"))]
|
||||||
|
sup = _Supervisor(specs)
|
||||||
|
sup.start_all()
|
||||||
|
time.sleep(0.1)
|
||||||
|
old_pid = sup.procs[0][1].pid
|
||||||
|
|
||||||
|
self.assertTrue(sup.request_restart("pipelock"))
|
||||||
|
sup.request_shutdown(reason="test")
|
||||||
|
self.assertEqual(set(), sup._restart_requested)
|
||||||
|
self._drive(sup)
|
||||||
|
|
||||||
|
self.assertEqual(old_pid, sup.procs[0][1].pid)
|
||||||
|
|
||||||
def test_shutdown_after_start_terminates_children(self):
|
def test_shutdown_after_start_terminates_children(self):
|
||||||
# Two long-running children. Caller requests shutdown;
|
# Two long-running children. Caller requests shutdown;
|
||||||
# both should receive SIGTERM and exit. exit_code() is
|
# both should receive SIGTERM and exit. Signal-only
|
||||||
# max of (returncodes) — both signal-killed (negative),
|
# shutdown clamps to a zero supervisor exit code.
|
||||||
# so max() picks 0 in the typical case (or the
|
|
||||||
# platform-specific signal returncode).
|
|
||||||
specs = [
|
specs = [
|
||||||
_DaemonSpec("a", ("/bin/sleep", "60")),
|
_DaemonSpec("a", ("/bin/sleep", "60")),
|
||||||
_DaemonSpec("b", ("/bin/sleep", "60")),
|
_DaemonSpec("b", ("/bin/sleep", "60")),
|
||||||
@@ -335,7 +405,7 @@ class TestSupervisor(unittest.TestCase):
|
|||||||
time.sleep(0.2) # let them actually start
|
time.sleep(0.2) # let them actually start
|
||||||
sup.request_shutdown(reason="test")
|
sup.request_shutdown(reason="test")
|
||||||
rc = self._drive(sup)
|
rc = self._drive(sup)
|
||||||
self.assertIsNotNone(rc)
|
self.assertEqual(0, rc)
|
||||||
# Both children got the signal — neither survived past
|
# Both children got the signal — neither survived past
|
||||||
# the grace deadline.
|
# the grace deadline.
|
||||||
for _, p in sup.procs:
|
for _, p in sup.procs:
|
||||||
@@ -360,7 +430,7 @@ class TestSupervisor(unittest.TestCase):
|
|||||||
|
|
||||||
# Process was SIGKILL'd → returncode -9 on POSIX.
|
# Process was SIGKILL'd → returncode -9 on POSIX.
|
||||||
self.assertEqual(-9, sup.procs[0][1].returncode)
|
self.assertEqual(-9, sup.procs[0][1].returncode)
|
||||||
self.assertIsNotNone(rc)
|
self.assertEqual(0, rc)
|
||||||
|
|
||||||
def test_idempotent_shutdown_requests(self):
|
def test_idempotent_shutdown_requests(self):
|
||||||
specs = [_DaemonSpec("a", ("/bin/sleep", "60"))]
|
specs = [_DaemonSpec("a", ("/bin/sleep", "60"))]
|
||||||
@@ -426,12 +496,7 @@ class TestMainEndToEnd(unittest.TestCase):
|
|||||||
self.assertIn("starting alpha", out)
|
self.assertIn("starting alpha", out)
|
||||||
self.assertIn("starting beta", out)
|
self.assertIn("starting beta", out)
|
||||||
self.assertIn("forwarding SIGTERM", out)
|
self.assertIn("forwarding SIGTERM", out)
|
||||||
# Sleep terminated by SIGTERM exits with returncode -15;
|
self.assertEqual(0, rc)
|
||||||
# supervisor surfaces that via max(...) and main()
|
|
||||||
# returns -15 → process exit becomes 256-15 = 241.
|
|
||||||
# On macOS bash may convert to 143. Either way, nonzero
|
|
||||||
# AND the child finished — we don't pin the exact code.
|
|
||||||
self.assertNotEqual(0, rc)
|
|
||||||
|
|
||||||
def test_empty_daemon_set_exits_zero_immediately(self):
|
def test_empty_daemon_set_exits_zero_immediately(self):
|
||||||
# Use a sentinel value that filters out both alpha+beta.
|
# Use a sentinel value that filters out both alpha+beta.
|
||||||
|
|||||||
Reference in New Issue
Block a user