2026-05-27 00:37:56 -04:00
3 changed files with 117 additions and 58 deletions
@@ -2,15 +2,21 @@
 PID 1 inside the `claude-bottle-sidecars` bundle image. Spawns
 the configured daemons (egress, pipelock, git-gate, supervise),
-forwards SIGTERM/SIGINT to each child, propagates per-daemon
+forwards SIGTERM/SIGINT to each child, and propagates per-daemon
-stdout+stderr to the container log with a `[name] ` prefix, and
+stdout+stderr to the container log with a `[name] ` prefix.
 exits with the first unexpected child exit code. If every child
 exits 0 after a graceful shutdown was requested, exit 0.
-Per the PRD's "init failure semantics" open question, this
+Failure policy (interim): when a child dies unexpectedly, the
-implementation goes with "any unexpected child death tears down
+supervisor logs the death and leaves the surviving children
-the rest" — bundling means the daemons share fate. Restart-just-
+running. The bundle stays up; whatever the dead daemon served
-this-one logic can land later if operators hit pain.
+will start failing, surfacing in the agent's own error path.
 The supervisor itself exits only when (a) the operator/compose
 sends SIGTERM/SIGINT, or (b) every child has died.
 Failure policy (eventual): on unexpected death, the supervisor
 restarts the daemon and emits a notification to the supervise
 sidecar so the operator sees the event. That lands in a later
 PR; the interim policy is "don't take the bundle down for one
 sick daemon."
 Daemon subset is env-driven. The compose renderer narrows it via
 `CLAUDE_BOTTLE_SIDECAR_DAEMONS=egress,pipelock` for bottles that
@@ -118,8 +124,9 @@ class _Supervisor:
        self.specs = tuple(specs)
        self.procs: list[tuple[_DaemonSpec, subprocess.Popen]] = []
        self.shutdown_at: float | None = None
-        self.first_unexpected_rc: int | None = None
+        # Names of children that have been logged as having exited
-        self.first_unexpected_name: str | None = None
+        # so we only log each death once across watch-loop ticks.
        self._logged_dead: set[str] = set()
    def start_all(self) -> None:
        for spec in self.specs:
@@ -140,21 +147,24 @@ class _Supervisor:
    def tick(self) -> bool:
        """One iteration of the watch loop. Returns True when every
-        child has exited and the supervisor can return."""
+        child has exited and the supervisor can return.
        A child dying unexpectedly is logged but does NOT initiate
        shutdown — see the module docstring's failure-policy
        section. Shutdown is signal-driven only."""
        for spec, p in self.procs:
            rc = p.poll()
-            if rc is None:
+            if rc is None or spec.name in self._logged_dead:
                continue
-            if self.first_unexpected_rc is None and self.shutdown_at is None:
+            self._logged_dead.add(spec.name)
-                # First exit BEFORE we asked for shutdown: that's
+            if self.shutdown_at is None:
                # the failure signal. Record it and tear down.
                self.first_unexpected_rc = rc
                self.first_unexpected_name = spec.name
                _log(
-                    f"{spec.name} exited unexpectedly with code {rc}; "
+                    f"{spec.name} exited with code {rc}; leaving "
-                    "tearing down"
+                    f"surviving daemons running (operator-visible "
                    f"via agent-side failure)"
                )
-                self.request_shutdown(reason="child_exit")
+            else:
                _log(f"{spec.name} exited with code {rc}")
        if self.shutdown_at is not None:
            elapsed = time.monotonic() - self.shutdown_at
@@ -177,10 +187,10 @@ class _Supervisor:
        return all(p.poll() is not None for _, p in self.procs)
    def exit_code(self) -> int:
-        if self.first_unexpected_rc is not None:
+        """Worst child returncode wins. On graceful shutdown every
-            return self.first_unexpected_rc
+        child is signal-killed (negative returncode) and max()
-        # Graceful shutdown — surface the worst child code so a
+        returns 0; if some child crashed nonzero before the signal
-        # daemon that died nonzero during teardown is visible.
+        the operator gets that code on container exit."""
        return max((p.returncode for _, p in self.procs), default=0)
@@ -390,17 +390,21 @@ rewrite.
 ## Open questions
 1. **Init failure semantics.** When one daemon crashes mid-run,
-   should the bundle exit (killing the bottle) or restart just
+   the bundle does NOT tear down the survivors — the failure is
-   that daemon? Today, with four separate containers, docker
+   logged, the surviving daemons keep running, and whatever the
-   restarts the crashed one and the bottle stays up. Default
+   dead one served starts failing in a way the agent surfaces.
-   for this PRD: bundle exits on any child death; the bottle
+   The eventual design is restart-the-dead-daemon plus a
-   tears down. Restart logic can land later if operators hit
+   notification to the supervise sidecar so the operator sees
-   it in practice.
+   the event explicitly; chunk 1 ships only the "log and leave
-2. **Exit-code propagation.** If multiple daemons die in quick
+   alone" half. Tear-down-the-bundle was considered and
-   succession (likely under SIGTERM), which exit code wins?
+   rejected: one sick daemon shouldn't take the bottle offline.
-   First-to-die is simplest. Worst-case (highest nonzero
+2. **Exit-code propagation.** When the supervisor finally exits
-   exit code) gives clearest signal in logs. Default to
+   (signal-driven shutdown, or every child having died on its
-   first-to-die unless an operator scenario disagrees.
+   own), the container exits with `max(child returncodes)` —
   the worst nonzero code wins. On graceful shutdown every child
   is signal-killed (negative returncode) so the max is 0; a
   crashed-before-signal daemon's nonzero code wins and reaches
   the operator on container exit.
 3. **Image pin policy.** Pin `claude-bottle-sidecars` by tag
   (`:latest` rebuilt per-release) or by digest written into a
   `CLAUDE_BOTTLE_SIDECAR_IMAGE` env var like the existing
@@ -98,11 +98,10 @@ class TestSupervisor(unittest.TestCase):
        return sup.exit_code()
    def test_all_children_succeed_returns_zero(self):
-        # `sh -c :` exits 0 immediately. With no shutdown request,
+        # `sh -c :` exits 0 immediately. With the new failure
-        # the FIRST child to exit counts as "unexpected" — that's
+        # policy a child dying doesn't trigger shutdown, so the
-        # the design (children are supposed to be long-lived
+        # loop only converges once BOTH have exited on their own.
-        # daemons). But all of them exit with 0, so the recorded
+        # Both exit 0 → max(0, 0) = 0.
        # first-unexpected rc is 0, and exit_code() returns 0.
        specs = [
            _DaemonSpec("a", ("/bin/sh", "-c", ":")),
            _DaemonSpec("b", ("/bin/sh", "-c", ":")),
@@ -112,25 +111,75 @@ class TestSupervisor(unittest.TestCase):
        rc = self._drive(sup)
        self.assertEqual(0, rc)
-    def test_child_crash_propagates_exit_code(self):
+    def test_child_crash_does_not_initiate_shutdown(self):
-        # `sh -c "exit 1"` exits 1. /bin/sleep 60 would still be
+        # Failure policy (PRD 0024, interim): a child dying
-        # running when it exits — the supervisor must tear it down
+        # unexpectedly is logged but the supervisor does NOT tear
-        # and surface the crasher's exit code.
+        # down the survivors. Verified by giving the crasher
        # ~0.3s to die, then asserting the long-runner is still
        # up and the supervisor never set shutdown_at.
        specs = [
            _DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
-            _DaemonSpec("longrun", ("/bin/sleep", "60")),
+            _DaemonSpec("longrun", ("/bin/sleep", "30")),
        ]
        sup = _Supervisor(specs)
        sup.start_all()
        # Drive ticks for a while; crasher should die, longrun
        # should survive.
        deadline = time.monotonic() + 1.0
        while time.monotonic() < deadline:
            done = sup.tick()
            self.assertFalse(done, "loop converged with a child still alive")
            if sup.procs[0][1].poll() is not None:
                break
            time.sleep(0.05)
        self.assertEqual(1, sup.procs[0][1].returncode,
                         "crasher should have exited 1")
        self.assertIsNone(sup.procs[1][1].poll(),
                          "longrun should still be running")
        self.assertIsNone(sup.shutdown_at,
                          "supervisor must not initiate shutdown on child death")
        # Clean up — explicit signal-driven shutdown.
        sup.request_shutdown(reason="test-teardown")
        self._drive(sup)
    def test_crash_then_signal_surfaces_nonzero_exit_code(self):
        # The crasher's exit code is what reaches the container
        # exit even though shutdown was triggered by SIGTERM.
        # exit_code() = max(child returncodes) → 1 wins over the
        # signal-killed longrun's negative returncode.
        specs = [
            _DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
            _DaemonSpec("longrun", ("/bin/sleep", "30")),
        ]
        sup = _Supervisor(specs)
        sup.start_all()
        time.sleep(0.3)  # let crasher die
        sup.request_shutdown(reason="test")
        rc = self._drive(sup)
        self.assertEqual(1, rc)
    def test_all_children_die_unattended_loop_converges(self):
        # If nobody sends a signal but every child eventually
        # dies on its own, the supervisor still exits — nothing
        # left to supervise.
        specs = [
            _DaemonSpec("a", ("/bin/sh", "-c", "exit 0")),
            _DaemonSpec("b", ("/bin/sh", "-c", "exit 2")),
        ]
        sup = _Supervisor(specs)
        sup.start_all()
        rc = self._drive(sup)
-        self.assertEqual(1, rc)
+        self.assertEqual(2, rc)
-        self.assertEqual("crasher", sup.first_unexpected_name)
+        self.assertIsNone(sup.shutdown_at)
    def test_shutdown_after_start_terminates_children(self):
        # Two long-running children. Caller requests shutdown;
-        # both should receive SIGTERM, exit cleanly, and the
+        # both should receive SIGTERM and exit. exit_code() is
-        # supervisor reports exit 0 (graceful path, no recorded
+        # max of (returncodes) — both signal-killed (negative),
-        # unexpected death).
+        # so max() picks 0 in the typical case (or the
        # platform-specific signal returncode).
        specs = [
            _DaemonSpec("a", ("/bin/sleep", "60")),
            _DaemonSpec("b", ("/bin/sleep", "60")),
@@ -140,15 +189,11 @@ class TestSupervisor(unittest.TestCase):
        time.sleep(0.2)  # let them actually start
        sup.request_shutdown(reason="test")
        rc = self._drive(sup)
        # /bin/sleep on Linux returns 130 (= 128 + SIGINT) or
        # similar nonzero on signal-induced exit; on macOS it
        # may be -15. The exit_code() path returns max() which
        # may be negative or positive depending. We don't pin
        # the value here — just confirm the supervisor exited
        # AND that no child was recorded as having died
        # unexpectedly (request_shutdown was first).
        self.assertIsNone(sup.first_unexpected_name)
        self.assertIsNotNone(rc)
        # Both children got the signal — neither survived past
        # the grace deadline.
        for _, p in sup.procs:
            self.assertIsNotNone(p.returncode)
    def test_grace_period_escalates_to_sigkill(self):
        # A child that ignores SIGTERM. The supervisor's