feat(sidecars): bundle image + init supervisor (PRD 0024 chunk 1) #55

Merged
didericis merged 4 commits from prd-0024-chunk-1-bundle-image into main 2026-05-27 00:37:56 -04:00
3 changed files with 117 additions and 58 deletions
Showing only changes of commit 62109a1caf - Show all commits
+34 -24
View File
@@ -2,15 +2,21 @@
PID 1 inside the `claude-bottle-sidecars` bundle image. Spawns PID 1 inside the `claude-bottle-sidecars` bundle image. Spawns
the configured daemons (egress, pipelock, git-gate, supervise), the configured daemons (egress, pipelock, git-gate, supervise),
forwards SIGTERM/SIGINT to each child, propagates per-daemon forwards SIGTERM/SIGINT to each child, and propagates per-daemon
stdout+stderr to the container log with a `[name] ` prefix, and stdout+stderr to the container log with a `[name] ` prefix.
exits with the first unexpected child exit code. If every child
exits 0 after a graceful shutdown was requested, exit 0.
Per the PRD's "init failure semantics" open question, this Failure policy (interim): when a child dies unexpectedly, the
implementation goes with "any unexpected child death tears down supervisor logs the death and leaves the surviving children
the rest" — bundling means the daemons share fate. Restart-just- running. The bundle stays up; whatever the dead daemon served
this-one logic can land later if operators hit pain. will start failing, surfacing in the agent's own error path.
The supervisor itself exits only when (a) the operator/compose
sends SIGTERM/SIGINT, or (b) every child has died.
Failure policy (eventual): on unexpected death, the supervisor
restarts the daemon and emits a notification to the supervise
sidecar so the operator sees the event. That lands in a later
PR; the interim policy is "don't take the bundle down for one
sick daemon."
Daemon subset is env-driven. The compose renderer narrows it via Daemon subset is env-driven. The compose renderer narrows it via
`CLAUDE_BOTTLE_SIDECAR_DAEMONS=egress,pipelock` for bottles that `CLAUDE_BOTTLE_SIDECAR_DAEMONS=egress,pipelock` for bottles that
@@ -118,8 +124,9 @@ class _Supervisor:
self.specs = tuple(specs) self.specs = tuple(specs)
self.procs: list[tuple[_DaemonSpec, subprocess.Popen]] = [] self.procs: list[tuple[_DaemonSpec, subprocess.Popen]] = []
self.shutdown_at: float | None = None self.shutdown_at: float | None = None
self.first_unexpected_rc: int | None = None # Names of children that have been logged as having exited
self.first_unexpected_name: str | None = None # so we only log each death once across watch-loop ticks.
self._logged_dead: set[str] = set()
def start_all(self) -> None: def start_all(self) -> None:
for spec in self.specs: for spec in self.specs:
@@ -140,21 +147,24 @@ class _Supervisor:
def tick(self) -> bool: def tick(self) -> bool:
"""One iteration of the watch loop. Returns True when every """One iteration of the watch loop. Returns True when every
child has exited and the supervisor can return.""" child has exited and the supervisor can return.
A child dying unexpectedly is logged but does NOT initiate
shutdown — see the module docstring's failure-policy
section. Shutdown is signal-driven only."""
for spec, p in self.procs: for spec, p in self.procs:
rc = p.poll() rc = p.poll()
if rc is None: if rc is None or spec.name in self._logged_dead:
continue continue
if self.first_unexpected_rc is None and self.shutdown_at is None: self._logged_dead.add(spec.name)
# First exit BEFORE we asked for shutdown: that's if self.shutdown_at is None:
# the failure signal. Record it and tear down.
self.first_unexpected_rc = rc
self.first_unexpected_name = spec.name
_log( _log(
f"{spec.name} exited unexpectedly with code {rc}; " f"{spec.name} exited with code {rc}; leaving "
"tearing down" f"surviving daemons running (operator-visible "
f"via agent-side failure)"
) )
self.request_shutdown(reason="child_exit") else:
_log(f"{spec.name} exited with code {rc}")
if self.shutdown_at is not None: if self.shutdown_at is not None:
elapsed = time.monotonic() - self.shutdown_at elapsed = time.monotonic() - self.shutdown_at
@@ -177,10 +187,10 @@ class _Supervisor:
return all(p.poll() is not None for _, p in self.procs) return all(p.poll() is not None for _, p in self.procs)
def exit_code(self) -> int: def exit_code(self) -> int:
if self.first_unexpected_rc is not None: """Worst child returncode wins. On graceful shutdown every
return self.first_unexpected_rc child is signal-killed (negative returncode) and max()
# Graceful shutdown — surface the worst child code so a returns 0; if some child crashed nonzero before the signal
# daemon that died nonzero during teardown is visible. the operator gets that code on container exit."""
return max((p.returncode for _, p in self.procs), default=0) return max((p.returncode for _, p in self.procs), default=0)
+15 -11
View File
@@ -390,17 +390,21 @@ rewrite.
## Open questions ## Open questions
1. **Init failure semantics.** When one daemon crashes mid-run, 1. **Init failure semantics.** When one daemon crashes mid-run,
should the bundle exit (killing the bottle) or restart just the bundle does NOT tear down the survivors — the failure is
that daemon? Today, with four separate containers, docker logged, the surviving daemons keep running, and whatever the
restarts the crashed one and the bottle stays up. Default dead one served starts failing in a way the agent surfaces.
for this PRD: bundle exits on any child death; the bottle The eventual design is restart-the-dead-daemon plus a
tears down. Restart logic can land later if operators hit notification to the supervise sidecar so the operator sees
it in practice. the event explicitly; chunk 1 ships only the "log and leave
2. **Exit-code propagation.** If multiple daemons die in quick alone" half. Tear-down-the-bundle was considered and
succession (likely under SIGTERM), which exit code wins? rejected: one sick daemon shouldn't take the bottle offline.
First-to-die is simplest. Worst-case (highest nonzero 2. **Exit-code propagation.** When the supervisor finally exits
exit code) gives clearest signal in logs. Default to (signal-driven shutdown, or every child having died on its
first-to-die unless an operator scenario disagrees. own), the container exits with `max(child returncodes)` —
the worst nonzero code wins. On graceful shutdown every child
is signal-killed (negative returncode) so the max is 0; a
crashed-before-signal daemon's nonzero code wins and reaches
the operator on container exit.
3. **Image pin policy.** Pin `claude-bottle-sidecars` by tag 3. **Image pin policy.** Pin `claude-bottle-sidecars` by tag
(`:latest` rebuilt per-release) or by digest written into a (`:latest` rebuilt per-release) or by digest written into a
`CLAUDE_BOTTLE_SIDECAR_IMAGE` env var like the existing `CLAUDE_BOTTLE_SIDECAR_IMAGE` env var like the existing
+68 -23
View File
@@ -98,11 +98,10 @@ class TestSupervisor(unittest.TestCase):
return sup.exit_code() return sup.exit_code()
def test_all_children_succeed_returns_zero(self): def test_all_children_succeed_returns_zero(self):
# `sh -c :` exits 0 immediately. With no shutdown request, # `sh -c :` exits 0 immediately. With the new failure
# the FIRST child to exit counts as "unexpected" — that's # policy a child dying doesn't trigger shutdown, so the
# the design (children are supposed to be long-lived # loop only converges once BOTH have exited on their own.
# daemons). But all of them exit with 0, so the recorded # Both exit 0 → max(0, 0) = 0.
# first-unexpected rc is 0, and exit_code() returns 0.
specs = [ specs = [
_DaemonSpec("a", ("/bin/sh", "-c", ":")), _DaemonSpec("a", ("/bin/sh", "-c", ":")),
_DaemonSpec("b", ("/bin/sh", "-c", ":")), _DaemonSpec("b", ("/bin/sh", "-c", ":")),
@@ -112,25 +111,75 @@ class TestSupervisor(unittest.TestCase):
rc = self._drive(sup) rc = self._drive(sup)
self.assertEqual(0, rc) self.assertEqual(0, rc)
def test_child_crash_propagates_exit_code(self): def test_child_crash_does_not_initiate_shutdown(self):
# `sh -c "exit 1"` exits 1. /bin/sleep 60 would still be # Failure policy (PRD 0024, interim): a child dying
# running when it exits — the supervisor must tear it down # unexpectedly is logged but the supervisor does NOT tear
# and surface the crasher's exit code. # down the survivors. Verified by giving the crasher
# ~0.3s to die, then asserting the long-runner is still
# up and the supervisor never set shutdown_at.
specs = [ specs = [
_DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")), _DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
_DaemonSpec("longrun", ("/bin/sleep", "60")), _DaemonSpec("longrun", ("/bin/sleep", "30")),
]
sup = _Supervisor(specs)
sup.start_all()
# Drive ticks for a while; crasher should die, longrun
# should survive.
deadline = time.monotonic() + 1.0
while time.monotonic() < deadline:
done = sup.tick()
self.assertFalse(done, "loop converged with a child still alive")
if sup.procs[0][1].poll() is not None:
break
time.sleep(0.05)
self.assertEqual(1, sup.procs[0][1].returncode,
"crasher should have exited 1")
self.assertIsNone(sup.procs[1][1].poll(),
"longrun should still be running")
self.assertIsNone(sup.shutdown_at,
"supervisor must not initiate shutdown on child death")
# Clean up — explicit signal-driven shutdown.
sup.request_shutdown(reason="test-teardown")
self._drive(sup)
def test_crash_then_signal_surfaces_nonzero_exit_code(self):
# The crasher's exit code is what reaches the container
# exit even though shutdown was triggered by SIGTERM.
# exit_code() = max(child returncodes) → 1 wins over the
# signal-killed longrun's negative returncode.
specs = [
_DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
_DaemonSpec("longrun", ("/bin/sleep", "30")),
]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.3) # let crasher die
sup.request_shutdown(reason="test")
rc = self._drive(sup)
self.assertEqual(1, rc)
def test_all_children_die_unattended_loop_converges(self):
# If nobody sends a signal but every child eventually
# dies on its own, the supervisor still exits — nothing
# left to supervise.
specs = [
_DaemonSpec("a", ("/bin/sh", "-c", "exit 0")),
_DaemonSpec("b", ("/bin/sh", "-c", "exit 2")),
] ]
sup = _Supervisor(specs) sup = _Supervisor(specs)
sup.start_all() sup.start_all()
rc = self._drive(sup) rc = self._drive(sup)
self.assertEqual(1, rc) self.assertEqual(2, rc)
self.assertEqual("crasher", sup.first_unexpected_name) self.assertIsNone(sup.shutdown_at)
def test_shutdown_after_start_terminates_children(self): def test_shutdown_after_start_terminates_children(self):
# Two long-running children. Caller requests shutdown; # Two long-running children. Caller requests shutdown;
# both should receive SIGTERM, exit cleanly, and the # both should receive SIGTERM and exit. exit_code() is
# supervisor reports exit 0 (graceful path, no recorded # max of (returncodes) — both signal-killed (negative),
# unexpected death). # so max() picks 0 in the typical case (or the
# platform-specific signal returncode).
specs = [ specs = [
_DaemonSpec("a", ("/bin/sleep", "60")), _DaemonSpec("a", ("/bin/sleep", "60")),
_DaemonSpec("b", ("/bin/sleep", "60")), _DaemonSpec("b", ("/bin/sleep", "60")),
@@ -140,15 +189,11 @@ class TestSupervisor(unittest.TestCase):
time.sleep(0.2) # let them actually start time.sleep(0.2) # let them actually start
sup.request_shutdown(reason="test") sup.request_shutdown(reason="test")
rc = self._drive(sup) rc = self._drive(sup)
# /bin/sleep on Linux returns 130 (= 128 + SIGINT) or
# similar nonzero on signal-induced exit; on macOS it
# may be -15. The exit_code() path returns max() which
# may be negative or positive depending. We don't pin
# the value here — just confirm the supervisor exited
# AND that no child was recorded as having died
# unexpectedly (request_shutdown was first).
self.assertIsNone(sup.first_unexpected_name)
self.assertIsNotNone(rc) self.assertIsNotNone(rc)
# Both children got the signal — neither survived past
# the grace deadline.
for _, p in sup.procs:
self.assertIsNotNone(p.returncode)
def test_grace_period_escalates_to_sigkill(self): def test_grace_period_escalates_to_sigkill(self):
# A child that ignores SIGTERM. The supervisor's # A child that ignores SIGTERM. The supervisor's