fix(sidecars): child death no longer tears down the bundle
Reverses chunk 1's "any unexpected child death tears down the rest" policy. New behavior: a daemon dying is logged but does NOT initiate shutdown — the surviving daemons keep running and whatever the dead one served starts failing visibly on the agent side. The supervisor exits only when (a) it receives SIGTERM/SIGINT, or (b) every child has died on its own. Eventual design is restart-the-dead-daemon plus a notification to the supervise sidecar so the operator sees the event explicitly; this commit ships only the "log and leave alone" half. PRD 0024 open question 1 updated to reflect the new intent. Tests updated: replaced "crash propagates exit code via auto-teardown" with three cases that exercise the new policy (crash without shutdown leaves survivors up, crash-then-signal surfaces the nonzero code, all-children-die-unattended still converges the loop). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -2,15 +2,21 @@
|
|||||||
|
|
||||||
PID 1 inside the `claude-bottle-sidecars` bundle image. Spawns
|
PID 1 inside the `claude-bottle-sidecars` bundle image. Spawns
|
||||||
the configured daemons (egress, pipelock, git-gate, supervise),
|
the configured daemons (egress, pipelock, git-gate, supervise),
|
||||||
forwards SIGTERM/SIGINT to each child, propagates per-daemon
|
forwards SIGTERM/SIGINT to each child, and propagates per-daemon
|
||||||
stdout+stderr to the container log with a `[name] ` prefix, and
|
stdout+stderr to the container log with a `[name] ` prefix.
|
||||||
exits with the first unexpected child exit code. If every child
|
|
||||||
exits 0 after a graceful shutdown was requested, exit 0.
|
|
||||||
|
|
||||||
Per the PRD's "init failure semantics" open question, this
|
Failure policy (interim): when a child dies unexpectedly, the
|
||||||
implementation goes with "any unexpected child death tears down
|
supervisor logs the death and leaves the surviving children
|
||||||
the rest" — bundling means the daemons share fate. Restart-just-
|
running. The bundle stays up; whatever the dead daemon served
|
||||||
this-one logic can land later if operators hit pain.
|
will start failing, surfacing in the agent's own error path.
|
||||||
|
The supervisor itself exits only when (a) the operator/compose
|
||||||
|
sends SIGTERM/SIGINT, or (b) every child has died.
|
||||||
|
|
||||||
|
Failure policy (eventual): on unexpected death, the supervisor
|
||||||
|
restarts the daemon and emits a notification to the supervise
|
||||||
|
sidecar so the operator sees the event. That lands in a later
|
||||||
|
PR; the interim policy is "don't take the bundle down for one
|
||||||
|
sick daemon."
|
||||||
|
|
||||||
Daemon subset is env-driven. The compose renderer narrows it via
|
Daemon subset is env-driven. The compose renderer narrows it via
|
||||||
`CLAUDE_BOTTLE_SIDECAR_DAEMONS=egress,pipelock` for bottles that
|
`CLAUDE_BOTTLE_SIDECAR_DAEMONS=egress,pipelock` for bottles that
|
||||||
@@ -118,8 +124,9 @@ class _Supervisor:
|
|||||||
self.specs = tuple(specs)
|
self.specs = tuple(specs)
|
||||||
self.procs: list[tuple[_DaemonSpec, subprocess.Popen]] = []
|
self.procs: list[tuple[_DaemonSpec, subprocess.Popen]] = []
|
||||||
self.shutdown_at: float | None = None
|
self.shutdown_at: float | None = None
|
||||||
self.first_unexpected_rc: int | None = None
|
# Names of children that have been logged as having exited
|
||||||
self.first_unexpected_name: str | None = None
|
# so we only log each death once across watch-loop ticks.
|
||||||
|
self._logged_dead: set[str] = set()
|
||||||
|
|
||||||
def start_all(self) -> None:
|
def start_all(self) -> None:
|
||||||
for spec in self.specs:
|
for spec in self.specs:
|
||||||
@@ -140,21 +147,24 @@ class _Supervisor:
|
|||||||
|
|
||||||
def tick(self) -> bool:
|
def tick(self) -> bool:
|
||||||
"""One iteration of the watch loop. Returns True when every
|
"""One iteration of the watch loop. Returns True when every
|
||||||
child has exited and the supervisor can return."""
|
child has exited and the supervisor can return.
|
||||||
|
|
||||||
|
A child dying unexpectedly is logged but does NOT initiate
|
||||||
|
shutdown — see the module docstring's failure-policy
|
||||||
|
section. Shutdown is signal-driven only."""
|
||||||
for spec, p in self.procs:
|
for spec, p in self.procs:
|
||||||
rc = p.poll()
|
rc = p.poll()
|
||||||
if rc is None:
|
if rc is None or spec.name in self._logged_dead:
|
||||||
continue
|
continue
|
||||||
if self.first_unexpected_rc is None and self.shutdown_at is None:
|
self._logged_dead.add(spec.name)
|
||||||
# First exit BEFORE we asked for shutdown: that's
|
if self.shutdown_at is None:
|
||||||
# the failure signal. Record it and tear down.
|
|
||||||
self.first_unexpected_rc = rc
|
|
||||||
self.first_unexpected_name = spec.name
|
|
||||||
_log(
|
_log(
|
||||||
f"{spec.name} exited unexpectedly with code {rc}; "
|
f"{spec.name} exited with code {rc}; leaving "
|
||||||
"tearing down"
|
f"surviving daemons running (operator-visible "
|
||||||
|
f"via agent-side failure)"
|
||||||
)
|
)
|
||||||
self.request_shutdown(reason="child_exit")
|
else:
|
||||||
|
_log(f"{spec.name} exited with code {rc}")
|
||||||
|
|
||||||
if self.shutdown_at is not None:
|
if self.shutdown_at is not None:
|
||||||
elapsed = time.monotonic() - self.shutdown_at
|
elapsed = time.monotonic() - self.shutdown_at
|
||||||
@@ -177,10 +187,10 @@ class _Supervisor:
|
|||||||
return all(p.poll() is not None for _, p in self.procs)
|
return all(p.poll() is not None for _, p in self.procs)
|
||||||
|
|
||||||
def exit_code(self) -> int:
|
def exit_code(self) -> int:
|
||||||
if self.first_unexpected_rc is not None:
|
"""Worst child returncode wins. On graceful shutdown every
|
||||||
return self.first_unexpected_rc
|
child is signal-killed (negative returncode) and max()
|
||||||
# Graceful shutdown — surface the worst child code so a
|
returns 0; if some child crashed nonzero before the signal
|
||||||
# daemon that died nonzero during teardown is visible.
|
the operator gets that code on container exit."""
|
||||||
return max((p.returncode for _, p in self.procs), default=0)
|
return max((p.returncode for _, p in self.procs), default=0)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -390,17 +390,21 @@ rewrite.
|
|||||||
## Open questions
|
## Open questions
|
||||||
|
|
||||||
1. **Init failure semantics.** When one daemon crashes mid-run,
|
1. **Init failure semantics.** When one daemon crashes mid-run,
|
||||||
should the bundle exit (killing the bottle) or restart just
|
the bundle does NOT tear down the survivors — the failure is
|
||||||
that daemon? Today, with four separate containers, docker
|
logged, the surviving daemons keep running, and whatever the
|
||||||
restarts the crashed one and the bottle stays up. Default
|
dead one served starts failing in a way the agent surfaces.
|
||||||
for this PRD: bundle exits on any child death; the bottle
|
The eventual design is restart-the-dead-daemon plus a
|
||||||
tears down. Restart logic can land later if operators hit
|
notification to the supervise sidecar so the operator sees
|
||||||
it in practice.
|
the event explicitly; chunk 1 ships only the "log and leave
|
||||||
2. **Exit-code propagation.** If multiple daemons die in quick
|
alone" half. Tear-down-the-bundle was considered and
|
||||||
succession (likely under SIGTERM), which exit code wins?
|
rejected: one sick daemon shouldn't take the bottle offline.
|
||||||
First-to-die is simplest. Worst-case (highest nonzero
|
2. **Exit-code propagation.** When the supervisor finally exits
|
||||||
exit code) gives clearest signal in logs. Default to
|
(signal-driven shutdown, or every child having died on its
|
||||||
first-to-die unless an operator scenario disagrees.
|
own), the container exits with `max(child returncodes)` —
|
||||||
|
the worst nonzero code wins. On graceful shutdown every child
|
||||||
|
is signal-killed (negative returncode) so the max is 0; a
|
||||||
|
crashed-before-signal daemon's nonzero code wins and reaches
|
||||||
|
the operator on container exit.
|
||||||
3. **Image pin policy.** Pin `claude-bottle-sidecars` by tag
|
3. **Image pin policy.** Pin `claude-bottle-sidecars` by tag
|
||||||
(`:latest` rebuilt per-release) or by digest written into a
|
(`:latest` rebuilt per-release) or by digest written into a
|
||||||
`CLAUDE_BOTTLE_SIDECAR_IMAGE` env var like the existing
|
`CLAUDE_BOTTLE_SIDECAR_IMAGE` env var like the existing
|
||||||
|
|||||||
@@ -98,11 +98,10 @@ class TestSupervisor(unittest.TestCase):
|
|||||||
return sup.exit_code()
|
return sup.exit_code()
|
||||||
|
|
||||||
def test_all_children_succeed_returns_zero(self):
|
def test_all_children_succeed_returns_zero(self):
|
||||||
# `sh -c :` exits 0 immediately. With no shutdown request,
|
# `sh -c :` exits 0 immediately. With the new failure
|
||||||
# the FIRST child to exit counts as "unexpected" — that's
|
# policy a child dying doesn't trigger shutdown, so the
|
||||||
# the design (children are supposed to be long-lived
|
# loop only converges once BOTH have exited on their own.
|
||||||
# daemons). But all of them exit with 0, so the recorded
|
# Both exit 0 → max(0, 0) = 0.
|
||||||
# first-unexpected rc is 0, and exit_code() returns 0.
|
|
||||||
specs = [
|
specs = [
|
||||||
_DaemonSpec("a", ("/bin/sh", "-c", ":")),
|
_DaemonSpec("a", ("/bin/sh", "-c", ":")),
|
||||||
_DaemonSpec("b", ("/bin/sh", "-c", ":")),
|
_DaemonSpec("b", ("/bin/sh", "-c", ":")),
|
||||||
@@ -112,25 +111,75 @@ class TestSupervisor(unittest.TestCase):
|
|||||||
rc = self._drive(sup)
|
rc = self._drive(sup)
|
||||||
self.assertEqual(0, rc)
|
self.assertEqual(0, rc)
|
||||||
|
|
||||||
def test_child_crash_propagates_exit_code(self):
|
def test_child_crash_does_not_initiate_shutdown(self):
|
||||||
# `sh -c "exit 1"` exits 1. /bin/sleep 60 would still be
|
# Failure policy (PRD 0024, interim): a child dying
|
||||||
# running when it exits — the supervisor must tear it down
|
# unexpectedly is logged but the supervisor does NOT tear
|
||||||
# and surface the crasher's exit code.
|
# down the survivors. Verified by giving the crasher
|
||||||
|
# ~0.3s to die, then asserting the long-runner is still
|
||||||
|
# up and the supervisor never set shutdown_at.
|
||||||
specs = [
|
specs = [
|
||||||
_DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
|
_DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
|
||||||
_DaemonSpec("longrun", ("/bin/sleep", "60")),
|
_DaemonSpec("longrun", ("/bin/sleep", "30")),
|
||||||
|
]
|
||||||
|
sup = _Supervisor(specs)
|
||||||
|
sup.start_all()
|
||||||
|
# Drive ticks for a while; crasher should die, longrun
|
||||||
|
# should survive.
|
||||||
|
deadline = time.monotonic() + 1.0
|
||||||
|
while time.monotonic() < deadline:
|
||||||
|
done = sup.tick()
|
||||||
|
self.assertFalse(done, "loop converged with a child still alive")
|
||||||
|
if sup.procs[0][1].poll() is not None:
|
||||||
|
break
|
||||||
|
time.sleep(0.05)
|
||||||
|
|
||||||
|
self.assertEqual(1, sup.procs[0][1].returncode,
|
||||||
|
"crasher should have exited 1")
|
||||||
|
self.assertIsNone(sup.procs[1][1].poll(),
|
||||||
|
"longrun should still be running")
|
||||||
|
self.assertIsNone(sup.shutdown_at,
|
||||||
|
"supervisor must not initiate shutdown on child death")
|
||||||
|
|
||||||
|
# Clean up — explicit signal-driven shutdown.
|
||||||
|
sup.request_shutdown(reason="test-teardown")
|
||||||
|
self._drive(sup)
|
||||||
|
|
||||||
|
def test_crash_then_signal_surfaces_nonzero_exit_code(self):
|
||||||
|
# The crasher's exit code is what reaches the container
|
||||||
|
# exit even though shutdown was triggered by SIGTERM.
|
||||||
|
# exit_code() = max(child returncodes) → 1 wins over the
|
||||||
|
# signal-killed longrun's negative returncode.
|
||||||
|
specs = [
|
||||||
|
_DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
|
||||||
|
_DaemonSpec("longrun", ("/bin/sleep", "30")),
|
||||||
|
]
|
||||||
|
sup = _Supervisor(specs)
|
||||||
|
sup.start_all()
|
||||||
|
time.sleep(0.3) # let crasher die
|
||||||
|
sup.request_shutdown(reason="test")
|
||||||
|
rc = self._drive(sup)
|
||||||
|
self.assertEqual(1, rc)
|
||||||
|
|
||||||
|
def test_all_children_die_unattended_loop_converges(self):
|
||||||
|
# If nobody sends a signal but every child eventually
|
||||||
|
# dies on its own, the supervisor still exits — nothing
|
||||||
|
# left to supervise.
|
||||||
|
specs = [
|
||||||
|
_DaemonSpec("a", ("/bin/sh", "-c", "exit 0")),
|
||||||
|
_DaemonSpec("b", ("/bin/sh", "-c", "exit 2")),
|
||||||
]
|
]
|
||||||
sup = _Supervisor(specs)
|
sup = _Supervisor(specs)
|
||||||
sup.start_all()
|
sup.start_all()
|
||||||
rc = self._drive(sup)
|
rc = self._drive(sup)
|
||||||
self.assertEqual(1, rc)
|
self.assertEqual(2, rc)
|
||||||
self.assertEqual("crasher", sup.first_unexpected_name)
|
self.assertIsNone(sup.shutdown_at)
|
||||||
|
|
||||||
def test_shutdown_after_start_terminates_children(self):
|
def test_shutdown_after_start_terminates_children(self):
|
||||||
# Two long-running children. Caller requests shutdown;
|
# Two long-running children. Caller requests shutdown;
|
||||||
# both should receive SIGTERM, exit cleanly, and the
|
# both should receive SIGTERM and exit. exit_code() is
|
||||||
# supervisor reports exit 0 (graceful path, no recorded
|
# max of (returncodes) — both signal-killed (negative),
|
||||||
# unexpected death).
|
# so max() picks 0 in the typical case (or the
|
||||||
|
# platform-specific signal returncode).
|
||||||
specs = [
|
specs = [
|
||||||
_DaemonSpec("a", ("/bin/sleep", "60")),
|
_DaemonSpec("a", ("/bin/sleep", "60")),
|
||||||
_DaemonSpec("b", ("/bin/sleep", "60")),
|
_DaemonSpec("b", ("/bin/sleep", "60")),
|
||||||
@@ -140,15 +189,11 @@ class TestSupervisor(unittest.TestCase):
|
|||||||
time.sleep(0.2) # let them actually start
|
time.sleep(0.2) # let them actually start
|
||||||
sup.request_shutdown(reason="test")
|
sup.request_shutdown(reason="test")
|
||||||
rc = self._drive(sup)
|
rc = self._drive(sup)
|
||||||
# /bin/sleep on Linux returns 130 (= 128 + SIGINT) or
|
|
||||||
# similar nonzero on signal-induced exit; on macOS it
|
|
||||||
# may be -15. The exit_code() path returns max() which
|
|
||||||
# may be negative or positive depending. We don't pin
|
|
||||||
# the value here — just confirm the supervisor exited
|
|
||||||
# AND that no child was recorded as having died
|
|
||||||
# unexpectedly (request_shutdown was first).
|
|
||||||
self.assertIsNone(sup.first_unexpected_name)
|
|
||||||
self.assertIsNotNone(rc)
|
self.assertIsNotNone(rc)
|
||||||
|
# Both children got the signal — neither survived past
|
||||||
|
# the grace deadline.
|
||||||
|
for _, p in sup.procs:
|
||||||
|
self.assertIsNotNone(p.returncode)
|
||||||
|
|
||||||
def test_grace_period_escalates_to_sigkill(self):
|
def test_grace_period_escalates_to_sigkill(self):
|
||||||
# A child that ignores SIGTERM. The supervisor's
|
# A child that ignores SIGTERM. The supervisor's
|
||||||
|
|||||||
Reference in New Issue
Block a user