From 62109a1caf6012517dc3ad9b7054953e21b6b415 Mon Sep 17 00:00:00 2001 From: didericis Date: Wed, 27 May 2026 00:19:50 -0400 Subject: [PATCH] fix(sidecars): child death no longer tears down the bundle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverses chunk 1's "any unexpected child death tears down the rest" policy. New behavior: a daemon dying is logged but does NOT initiate shutdown — the surviving daemons keep running and whatever the dead one served starts failing visibly on the agent side. The supervisor exits only when (a) it receives SIGTERM/SIGINT, or (b) every child has died on its own. Eventual design is restart-the-dead-daemon plus a notification to the supervise sidecar so the operator sees the event explicitly; this commit ships only the "log and leave alone" half. PRD 0024 open question 1 updated to reflect the new intent. Tests updated: replaced "crash propagates exit code via auto-teardown" with three cases that exercise the new policy (crash without shutdown leaves survivors up, crash-then-signal surfaces the nonzero code, all-children-die-unattended still converges the loop). Co-Authored-By: Claude Opus 4.7 --- claude_bottle/sidecar_init.py | 58 +++++++------ docs/prds/0024-consolidate-sidecar-bundle.md | 26 +++--- tests/unit/test_sidecar_init.py | 91 +++++++++++++++----- 3 files changed, 117 insertions(+), 58 deletions(-) diff --git a/claude_bottle/sidecar_init.py b/claude_bottle/sidecar_init.py index ab36753..f00e062 100644 --- a/claude_bottle/sidecar_init.py +++ b/claude_bottle/sidecar_init.py @@ -2,15 +2,21 @@ PID 1 inside the `claude-bottle-sidecars` bundle image. Spawns the configured daemons (egress, pipelock, git-gate, supervise), -forwards SIGTERM/SIGINT to each child, propagates per-daemon -stdout+stderr to the container log with a `[name] ` prefix, and -exits with the first unexpected child exit code. If every child -exits 0 after a graceful shutdown was requested, exit 0. +forwards SIGTERM/SIGINT to each child, and propagates per-daemon +stdout+stderr to the container log with a `[name] ` prefix. -Per the PRD's "init failure semantics" open question, this -implementation goes with "any unexpected child death tears down -the rest" — bundling means the daemons share fate. Restart-just- -this-one logic can land later if operators hit pain. +Failure policy (interim): when a child dies unexpectedly, the +supervisor logs the death and leaves the surviving children +running. The bundle stays up; whatever the dead daemon served +will start failing, surfacing in the agent's own error path. +The supervisor itself exits only when (a) the operator/compose +sends SIGTERM/SIGINT, or (b) every child has died. + +Failure policy (eventual): on unexpected death, the supervisor +restarts the daemon and emits a notification to the supervise +sidecar so the operator sees the event. That lands in a later +PR; the interim policy is "don't take the bundle down for one +sick daemon." Daemon subset is env-driven. The compose renderer narrows it via `CLAUDE_BOTTLE_SIDECAR_DAEMONS=egress,pipelock` for bottles that @@ -118,8 +124,9 @@ class _Supervisor: self.specs = tuple(specs) self.procs: list[tuple[_DaemonSpec, subprocess.Popen]] = [] self.shutdown_at: float | None = None - self.first_unexpected_rc: int | None = None - self.first_unexpected_name: str | None = None + # Names of children that have been logged as having exited + # so we only log each death once across watch-loop ticks. + self._logged_dead: set[str] = set() def start_all(self) -> None: for spec in self.specs: @@ -140,21 +147,24 @@ class _Supervisor: def tick(self) -> bool: """One iteration of the watch loop. Returns True when every - child has exited and the supervisor can return.""" + child has exited and the supervisor can return. + + A child dying unexpectedly is logged but does NOT initiate + shutdown — see the module docstring's failure-policy + section. Shutdown is signal-driven only.""" for spec, p in self.procs: rc = p.poll() - if rc is None: + if rc is None or spec.name in self._logged_dead: continue - if self.first_unexpected_rc is None and self.shutdown_at is None: - # First exit BEFORE we asked for shutdown: that's - # the failure signal. Record it and tear down. - self.first_unexpected_rc = rc - self.first_unexpected_name = spec.name + self._logged_dead.add(spec.name) + if self.shutdown_at is None: _log( - f"{spec.name} exited unexpectedly with code {rc}; " - "tearing down" + f"{spec.name} exited with code {rc}; leaving " + f"surviving daemons running (operator-visible " + f"via agent-side failure)" ) - self.request_shutdown(reason="child_exit") + else: + _log(f"{spec.name} exited with code {rc}") if self.shutdown_at is not None: elapsed = time.monotonic() - self.shutdown_at @@ -177,10 +187,10 @@ class _Supervisor: return all(p.poll() is not None for _, p in self.procs) def exit_code(self) -> int: - if self.first_unexpected_rc is not None: - return self.first_unexpected_rc - # Graceful shutdown — surface the worst child code so a - # daemon that died nonzero during teardown is visible. + """Worst child returncode wins. On graceful shutdown every + child is signal-killed (negative returncode) and max() + returns 0; if some child crashed nonzero before the signal + the operator gets that code on container exit.""" return max((p.returncode for _, p in self.procs), default=0) diff --git a/docs/prds/0024-consolidate-sidecar-bundle.md b/docs/prds/0024-consolidate-sidecar-bundle.md index 8f140bb..4e6be07 100644 --- a/docs/prds/0024-consolidate-sidecar-bundle.md +++ b/docs/prds/0024-consolidate-sidecar-bundle.md @@ -390,17 +390,21 @@ rewrite. ## Open questions 1. **Init failure semantics.** When one daemon crashes mid-run, - should the bundle exit (killing the bottle) or restart just - that daemon? Today, with four separate containers, docker - restarts the crashed one and the bottle stays up. Default - for this PRD: bundle exits on any child death; the bottle - tears down. Restart logic can land later if operators hit - it in practice. -2. **Exit-code propagation.** If multiple daemons die in quick - succession (likely under SIGTERM), which exit code wins? - First-to-die is simplest. Worst-case (highest nonzero - exit code) gives clearest signal in logs. Default to - first-to-die unless an operator scenario disagrees. + the bundle does NOT tear down the survivors — the failure is + logged, the surviving daemons keep running, and whatever the + dead one served starts failing in a way the agent surfaces. + The eventual design is restart-the-dead-daemon plus a + notification to the supervise sidecar so the operator sees + the event explicitly; chunk 1 ships only the "log and leave + alone" half. Tear-down-the-bundle was considered and + rejected: one sick daemon shouldn't take the bottle offline. +2. **Exit-code propagation.** When the supervisor finally exits + (signal-driven shutdown, or every child having died on its + own), the container exits with `max(child returncodes)` — + the worst nonzero code wins. On graceful shutdown every child + is signal-killed (negative returncode) so the max is 0; a + crashed-before-signal daemon's nonzero code wins and reaches + the operator on container exit. 3. **Image pin policy.** Pin `claude-bottle-sidecars` by tag (`:latest` rebuilt per-release) or by digest written into a `CLAUDE_BOTTLE_SIDECAR_IMAGE` env var like the existing diff --git a/tests/unit/test_sidecar_init.py b/tests/unit/test_sidecar_init.py index 7d3ea11..1112ada 100644 --- a/tests/unit/test_sidecar_init.py +++ b/tests/unit/test_sidecar_init.py @@ -98,11 +98,10 @@ class TestSupervisor(unittest.TestCase): return sup.exit_code() def test_all_children_succeed_returns_zero(self): - # `sh -c :` exits 0 immediately. With no shutdown request, - # the FIRST child to exit counts as "unexpected" — that's - # the design (children are supposed to be long-lived - # daemons). But all of them exit with 0, so the recorded - # first-unexpected rc is 0, and exit_code() returns 0. + # `sh -c :` exits 0 immediately. With the new failure + # policy a child dying doesn't trigger shutdown, so the + # loop only converges once BOTH have exited on their own. + # Both exit 0 → max(0, 0) = 0. specs = [ _DaemonSpec("a", ("/bin/sh", "-c", ":")), _DaemonSpec("b", ("/bin/sh", "-c", ":")), @@ -112,25 +111,75 @@ class TestSupervisor(unittest.TestCase): rc = self._drive(sup) self.assertEqual(0, rc) - def test_child_crash_propagates_exit_code(self): - # `sh -c "exit 1"` exits 1. /bin/sleep 60 would still be - # running when it exits — the supervisor must tear it down - # and surface the crasher's exit code. + def test_child_crash_does_not_initiate_shutdown(self): + # Failure policy (PRD 0024, interim): a child dying + # unexpectedly is logged but the supervisor does NOT tear + # down the survivors. Verified by giving the crasher + # ~0.3s to die, then asserting the long-runner is still + # up and the supervisor never set shutdown_at. specs = [ _DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")), - _DaemonSpec("longrun", ("/bin/sleep", "60")), + _DaemonSpec("longrun", ("/bin/sleep", "30")), + ] + sup = _Supervisor(specs) + sup.start_all() + # Drive ticks for a while; crasher should die, longrun + # should survive. + deadline = time.monotonic() + 1.0 + while time.monotonic() < deadline: + done = sup.tick() + self.assertFalse(done, "loop converged with a child still alive") + if sup.procs[0][1].poll() is not None: + break + time.sleep(0.05) + + self.assertEqual(1, sup.procs[0][1].returncode, + "crasher should have exited 1") + self.assertIsNone(sup.procs[1][1].poll(), + "longrun should still be running") + self.assertIsNone(sup.shutdown_at, + "supervisor must not initiate shutdown on child death") + + # Clean up — explicit signal-driven shutdown. + sup.request_shutdown(reason="test-teardown") + self._drive(sup) + + def test_crash_then_signal_surfaces_nonzero_exit_code(self): + # The crasher's exit code is what reaches the container + # exit even though shutdown was triggered by SIGTERM. + # exit_code() = max(child returncodes) → 1 wins over the + # signal-killed longrun's negative returncode. + specs = [ + _DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")), + _DaemonSpec("longrun", ("/bin/sleep", "30")), + ] + sup = _Supervisor(specs) + sup.start_all() + time.sleep(0.3) # let crasher die + sup.request_shutdown(reason="test") + rc = self._drive(sup) + self.assertEqual(1, rc) + + def test_all_children_die_unattended_loop_converges(self): + # If nobody sends a signal but every child eventually + # dies on its own, the supervisor still exits — nothing + # left to supervise. + specs = [ + _DaemonSpec("a", ("/bin/sh", "-c", "exit 0")), + _DaemonSpec("b", ("/bin/sh", "-c", "exit 2")), ] sup = _Supervisor(specs) sup.start_all() rc = self._drive(sup) - self.assertEqual(1, rc) - self.assertEqual("crasher", sup.first_unexpected_name) + self.assertEqual(2, rc) + self.assertIsNone(sup.shutdown_at) def test_shutdown_after_start_terminates_children(self): # Two long-running children. Caller requests shutdown; - # both should receive SIGTERM, exit cleanly, and the - # supervisor reports exit 0 (graceful path, no recorded - # unexpected death). + # both should receive SIGTERM and exit. exit_code() is + # max of (returncodes) — both signal-killed (negative), + # so max() picks 0 in the typical case (or the + # platform-specific signal returncode). specs = [ _DaemonSpec("a", ("/bin/sleep", "60")), _DaemonSpec("b", ("/bin/sleep", "60")), @@ -140,15 +189,11 @@ class TestSupervisor(unittest.TestCase): time.sleep(0.2) # let them actually start sup.request_shutdown(reason="test") rc = self._drive(sup) - # /bin/sleep on Linux returns 130 (= 128 + SIGINT) or - # similar nonzero on signal-induced exit; on macOS it - # may be -15. The exit_code() path returns max() which - # may be negative or positive depending. We don't pin - # the value here — just confirm the supervisor exited - # AND that no child was recorded as having died - # unexpectedly (request_shutdown was first). - self.assertIsNone(sup.first_unexpected_name) self.assertIsNotNone(rc) + # Both children got the signal — neither survived past + # the grace deadline. + for _, p in sup.procs: + self.assertIsNotNone(p.returncode) def test_grace_period_escalates_to_sigkill(self): # A child that ignores SIGTERM. The supervisor's