feat(sidecars): bundle image + init supervisor (PRD 0024 chunk 1) #55

Merged
didericis merged 4 commits from prd-0024-chunk-1-bundle-image into main 2026-05-27 00:37:56 -04:00
6 changed files with 798 additions and 11 deletions
+108
View File
@@ -0,0 +1,108 @@
# Per-bottle sidecar bundle image (PRD 0024).
#
# Collapses the four prior per-sidecar images (pipelock, egress,
# git-gate, supervise) into one. A small stdlib-Python init
# supervisor at /app/sidecar_init.py spawns all four daemons,
# forwards SIGTERM, and propagates per-daemon stdout/stderr to the
# container log with a `[name]` prefix. See PRD 0024 for the
# rationale.
#
# Layout (preserved verbatim from the prior four Dockerfiles so the
# compose renderer's bind-mount paths and docker-cp targets keep
# working):
#
# /usr/local/bin/pipelock pipelock binary
# /usr/bin/gitleaks gitleaks binary
# /app/egress_addon.py + siblings mitmproxy addon (egress)
# /app/egress-entrypoint.sh mitmdump launcher
# /app/supervise_server.py + .py supervise MCP server
# /app/sidecar_init.py PID 1 supervisor
# /etc/pipelock.yaml bind-mounted at run time
# /etc/egress/routes.yaml bind-mounted at run time
# /etc/git-gate/pre-receive docker-cp'd at start time
# /git-gate-entrypoint.sh docker-cp'd at start time
# /git-gate/creds/* docker-cp'd at start time
# /git/* bare repos, populated at runtime
# /run/supervise/queue/ bind-mounted at run time
# /home/mitmproxy/.mitmproxy/ mitmproxy CA dir
#
# Exposed ports inside the container:
# 8888 pipelock (HTTPS_PROXY)
# 9099 egress (mitmproxy, pipelock's upstream — not externally
# addressed by the agent)
# 9418 git-gate (git-daemon)
# 9100 supervise (MCP HTTP)
# Stage 1: pipelock binary. The upstream pipelock image is a
# scratch image with the binary at /pipelock (entrypoint).
# Pinned by digest in lockstep with
# claude_bottle/backend/docker/pipelock.py:PIPELOCK_IMAGE.
FROM ghcr.io/luckypipewrench/pipelock@sha256:3b1a39417b98406ddc5dc2d8fcb42865ddc0c68a43d355db55f0f8cb06bc6de9 AS pipelock-src
# Stage 2: gitleaks binary. The upstream gitleaks image is alpine
# with the binary at /usr/bin/gitleaks. Pinned by digest in lockstep
# with Dockerfile.git-gate's prior base (now deleted at chunk 3).
FROM zricethezav/gitleaks@sha256:c00b6bd0aeb3071cbcb79009cb16a60dd9e0a7c60e2be9ab65d25e6bc8abbb7f AS gitleaks-src
# Stage 3: assembly. mitmproxy/mitmproxy is debian-slim-based with
# Python + mitmdump pre-installed — heavier than the others, so
# this stage starts there and pulls the standalone binaries in.
FROM mitmproxy/mitmproxy:11.1.3
# Run as root inside the bundle. The bundle is the isolation
# boundary; per-daemon user separation inside it is not load-bearing
# and complicates the supervisor's spawn path.
USER root
# Runtime system deps:
# git supplies the `git daemon` subcommand (no separate package)
# plus the core `git` binary the pre-receive hook invokes.
# openssh-client supplies the upstream SSH transport the
# pre-receive hook uses to forward accepted refs.
# ca-certificates is needed for both pipelock and mitmdump
# upstream TLS (the base image already has it; listed for
# explicitness).
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
git openssh-client ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Pull the standalone binaries into the final image.
COPY --from=pipelock-src /pipelock /usr/local/bin/pipelock
COPY --from=gitleaks-src /usr/bin/gitleaks /usr/bin/gitleaks
# Project Python: addon + server modules + the init supervisor.
# Kept flat under /app/ so mitmdump's loader resolves them as
# top-level siblings (absolute imports), matching the prior
# Dockerfile.egress / Dockerfile.supervise layout.
COPY claude_bottle/egress_addon_core.py /app/egress_addon_core.py
COPY claude_bottle/egress_addon.py /app/egress_addon.py
COPY claude_bottle/yaml_subset.py /app/yaml_subset.py
COPY claude_bottle/supervise.py /app/supervise.py
COPY claude_bottle/supervise_server.py /app/supervise_server.py
COPY claude_bottle/sidecar_init.py /app/sidecar_init.py
COPY claude_bottle/egress_entrypoint.sh /app/egress-entrypoint.sh
RUN chmod +x /app/egress-entrypoint.sh
# Pre-create runtime directories the compose renderer + start
# step expect to exist. `docker cp` does not create intermediate
# dirs, and bind mounts won't either if the parent is missing.
RUN mkdir -p \
/etc/egress \
/etc/git-gate \
/git-gate/creds \
/git \
/run/supervise/queue \
/home/mitmproxy/.mitmproxy
# Documentation only — the compose renderer publishes whichever
# subset the bottle uses.
EXPOSE 8888 9099 9418 9100
# WORKDIR matches Dockerfile.supervise's prior layout so the
# in-app same-dir import in supervise_server.py stays deterministic.
WORKDIR /app
# PID 1 is the supervisor. It owns signal handling and exit-code
# propagation; no `exec` chain in the entrypoint itself.
ENTRYPOINT ["python3", "/app/sidecar_init.py"]
+35
View File
@@ -0,0 +1,35 @@
#!/bin/sh
# Egress daemon entrypoint inside the sidecar bundle (PRD 0024).
#
# Extracted verbatim from Dockerfile.egress's prior inline `sh -c`
# ENTRYPOINT so the supervisor in claude_bottle/sidecar_init.py can
# call it as a normal child. Behavior is unchanged:
#
# * Upstream proxy: when EGRESS_UPSTREAM_PROXY is set, switch
# to `--mode upstream:URL` to forward all post-MITM traffic
# through pipelock. mitmproxy does NOT honor HTTPS_PROXY on
# its outbound side, so the upstream wiring has to be the
# mitmproxy mode flag, not env.
# * Upstream trust: when EGRESS_UPSTREAM_CA is set, build a
# combined trust bundle (system roots + pipelock CA) and point
# mitmproxy at it. The option REPLACES mitmproxy's default
# trust store, so passing pipelock's CA alone would break
# pipelock-passthrough hosts (api.anthropic.com etc.).
# * `-s /app/egress_addon.py` loads the addon that reads
# /etc/egress/routes.yaml.
set -e
MODE="--mode regular@9099"
if [ -n "$EGRESS_UPSTREAM_PROXY" ]; then
MODE="--mode upstream:$EGRESS_UPSTREAM_PROXY --listen-port 9099"
fi
TRUST_FLAG=""
if [ -n "$EGRESS_UPSTREAM_CA" ] && [ -f "$EGRESS_UPSTREAM_CA" ]; then
COMBINED=/home/mitmproxy/.mitmproxy/combined-trust.pem
cat /etc/ssl/certs/ca-certificates.crt "$EGRESS_UPSTREAM_CA" > "$COMBINED"
TRUST_FLAG="--set ssl_verify_upstream_trusted_ca=$COMBINED"
fi
exec mitmdump $MODE $TRUST_FLAG -s /app/egress_addon.py
+219
View File
@@ -0,0 +1,219 @@
"""Per-bottle sidecar supervisor (PRD 0024 chunk 1).
PID 1 inside the `claude-bottle-sidecars` bundle image. Spawns
the configured daemons (egress, pipelock, git-gate, supervise),
forwards SIGTERM/SIGINT to each child, and propagates per-daemon
stdout+stderr to the container log with a `[name] ` prefix.
Failure policy (interim): when a child dies unexpectedly, the
supervisor logs the death and leaves the surviving children
running. The bundle stays up; whatever the dead daemon served
will start failing, surfacing in the agent's own error path.
The supervisor itself exits only when (a) the operator/compose
sends SIGTERM/SIGINT, or (b) every child has died.
Failure policy (eventual): on unexpected death, the supervisor
restarts the daemon and emits a notification to the supervise
sidecar so the operator sees the event. That lands in a later
PR; the interim policy is "don't take the bundle down for one
sick daemon."
Daemon subset is env-driven. The compose renderer narrows it via
`CLAUDE_BOTTLE_SIDECAR_DAEMONS=egress,pipelock` for bottles that
don't use git-gate or supervise. Default: all four.
Stdlib-only by design — adding supervisord/s6/runit for four
daemons is heavier than this script.
"""
from __future__ import annotations
import os
import signal
import subprocess
import sys
import threading
import time
from dataclasses import dataclass
from typing import IO, Sequence
# Below compose's default 10s `stop_grace_period`. After this many
# seconds past SIGTERM, escalate to SIGKILL on any still-running
# child.
_GRACE_SECONDS = 8.0
# Tight enough that exits and signals propagate without lag; loose
# enough that the main loop isn't a CPU hog.
_POLL_INTERVAL = 0.1
@dataclass(frozen=True)
class _DaemonSpec:
name: str
argv: Sequence[str]
# Order matters only for first-launch race-window reasons: egress
# starts first so pipelock's upstream connect succeeds during
# pipelock's own startup. git-gate and supervise are independent.
_DAEMONS: tuple[_DaemonSpec, ...] = (
_DaemonSpec("egress", ("/bin/sh", "/app/egress-entrypoint.sh")),
_DaemonSpec(
"pipelock",
("/usr/local/bin/pipelock", "run", "--config", "/etc/pipelock.yaml"),
),
_DaemonSpec("git-gate", ("/bin/sh", "/git-gate-entrypoint.sh")),
_DaemonSpec("supervise", ("python3", "/app/supervise_server.py")),
)
def _selected_daemons(
env: dict[str, str],
all_daemons: Sequence[_DaemonSpec] | None = None,
) -> tuple[_DaemonSpec, ...]:
"""Filter the daemon set by the CLAUDE_BOTTLE_SIDECAR_DAEMONS env
var. Unknown names in the list are ignored — the renderer is the
source of truth for which daemons are wired.
`all_daemons` defaults to `_DAEMONS` resolved at call time (not
at definition time), so tests can monkey-patch the module-level
`_DAEMONS` and have the new value take effect."""
if all_daemons is None:
all_daemons = _DAEMONS
raw = env.get("CLAUDE_BOTTLE_SIDECAR_DAEMONS", "").strip()
if not raw:
return tuple(all_daemons)
wanted = {n.strip() for n in raw.split(",") if n.strip()}
return tuple(d for d in all_daemons if d.name in wanted)
def _log(msg: str) -> None:
sys.stdout.write(f"sidecar-init: {msg}\n")
sys.stdout.flush()
def _pump(name: str, stream: IO[bytes]) -> None:
"""Read lines from `stream`, prefix with `[name]`, write to
stdout. Runs in its own thread per child; daemon=True so a
blocked read doesn't keep the process alive after main exits."""
for raw in iter(stream.readline, b""):
line = raw.decode("utf-8", errors="replace").rstrip("\n")
sys.stdout.write(f"[{name}] {line}\n")
sys.stdout.flush()
def _spawn(spec: _DaemonSpec) -> subprocess.Popen:
proc = subprocess.Popen(
list(spec.argv),
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
bufsize=0,
)
threading.Thread(
target=_pump, args=(spec.name, proc.stdout), daemon=True
).start()
return proc
class _Supervisor:
"""Holds the running children + shutdown state. Pulled out so
the test suite can drive it with fake commands."""
def __init__(self, specs: Sequence[_DaemonSpec]):
self.specs = tuple(specs)
self.procs: list[tuple[_DaemonSpec, subprocess.Popen]] = []
self.shutdown_at: float | None = None
# Names of children that have been logged as having exited
# so we only log each death once across watch-loop ticks.
self._logged_dead: set[str] = set()
def start_all(self) -> None:
for spec in self.specs:
_log(f"starting {spec.name}")
self.procs.append((spec, _spawn(spec)))
def request_shutdown(self, reason: str) -> None:
if self.shutdown_at is not None:
return
self.shutdown_at = time.monotonic()
_log(f"shutting down ({reason}); forwarding SIGTERM")
for _, p in self.procs:
if p.poll() is None:
try:
p.terminate()
except ProcessLookupError:
pass
def tick(self) -> bool:
"""One iteration of the watch loop. Returns True when every
child has exited and the supervisor can return.
A child dying unexpectedly is logged but does NOT initiate
shutdown — see the module docstring's failure-policy
section. Shutdown is signal-driven only."""
for spec, p in self.procs:
rc = p.poll()
if rc is None or spec.name in self._logged_dead:
continue
self._logged_dead.add(spec.name)
if self.shutdown_at is None:
_log(
f"{spec.name} exited with code {rc}; leaving "
f"surviving daemons running (operator-visible "
f"via agent-side failure)"
)
else:
_log(f"{spec.name} exited with code {rc}")
if self.shutdown_at is not None:
elapsed = time.monotonic() - self.shutdown_at
if elapsed > _GRACE_SECONDS:
still_running = [
spec.name for spec, p in self.procs if p.poll() is None
]
if still_running:
_log(
f"grace ({_GRACE_SECONDS:.0f}s) elapsed; SIGKILL on "
f"{', '.join(still_running)}"
)
for _, p in self.procs:
if p.poll() is None:
try:
p.kill()
except ProcessLookupError:
pass
return all(p.poll() is not None for _, p in self.procs)
def exit_code(self) -> int:
"""Worst child returncode wins. On graceful shutdown every
child is signal-killed (negative returncode) and max()
returns 0; if some child crashed nonzero before the signal
the operator gets that code on container exit."""
return max((p.returncode for _, p in self.procs), default=0)
def main(argv: Sequence[str] | None = None) -> int:
del argv # no flags yet; env-driven only
specs = _selected_daemons(dict(os.environ))
if not specs:
_log("no daemons selected; nothing to do")
return 0
sup = _Supervisor(specs)
sup.start_all()
signal.signal(signal.SIGTERM, lambda *_: sup.request_shutdown("SIGTERM"))
signal.signal(signal.SIGINT, lambda *_: sup.request_shutdown("SIGINT"))
while not sup.tick():
time.sleep(_POLL_INTERVAL)
rc = sup.exit_code()
_log(f"exit {rc}")
return rc
if __name__ == "__main__":
sys.exit(main())
+15 -11
View File
@@ -390,17 +390,21 @@ rewrite.
## Open questions
1. **Init failure semantics.** When one daemon crashes mid-run,
should the bundle exit (killing the bottle) or restart just
that daemon? Today, with four separate containers, docker
restarts the crashed one and the bottle stays up. Default
for this PRD: bundle exits on any child death; the bottle
tears down. Restart logic can land later if operators hit
it in practice.
2. **Exit-code propagation.** If multiple daemons die in quick
succession (likely under SIGTERM), which exit code wins?
First-to-die is simplest. Worst-case (highest nonzero
exit code) gives clearest signal in logs. Default to
first-to-die unless an operator scenario disagrees.
the bundle does NOT tear down the survivors — the failure is
logged, the surviving daemons keep running, and whatever the
dead one served starts failing in a way the agent surfaces.
The eventual design is restart-the-dead-daemon plus a
notification to the supervise sidecar so the operator sees
the event explicitly; chunk 1 ships only the "log and leave
alone" half. Tear-down-the-bundle was considered and
rejected: one sick daemon shouldn't take the bottle offline.
2. **Exit-code propagation.** When the supervisor finally exits
(signal-driven shutdown, or every child having died on its
own), the container exits with `max(child returncodes)` —
the worst nonzero code wins. On graceful shutdown every child
is signal-killed (negative returncode) so the max is 0; a
crashed-before-signal daemon's nonzero code wins and reaches
the operator on container exit.
3. **Image pin policy.** Pin `claude-bottle-sidecars` by tag
(`:latest` rebuilt per-release) or by digest written into a
`CLAUDE_BOTTLE_SIDECAR_IMAGE` env var like the existing
@@ -0,0 +1,122 @@
"""Integration: PRD 0024 chunk 1 — the sidecar bundle image builds
and the four daemon binaries are present + executable inside it.
This test does NOT exercise the daemons running against real
config (pipelock.yaml, routes.yaml, etc) — that lands in chunk 2
when the renderer wires the bundle into compose. What we verify
here is the chunk-1 contract:
- Dockerfile.sidecars builds (multi-stage works, base layers
pull, COPYs resolve).
- pipelock, gitleaks, mitmdump are at the documented paths and
answer `--version`.
- The Python init at /app/sidecar_init.py runs and prints the
expected "no daemons selected" line when the supervisor is
pointed at an empty daemon set.
Skips cleanly when docker is unavailable, or under act_runner
where the host bind-mount topology breaks multi-stage builds
that pull large bases.
"""
from __future__ import annotations
import os
import subprocess
import unittest
from tests._docker import skip_unless_docker
_IMAGE = "claude-bottle-sidecars-test:chunk1"
_DOCKERFILE = "Dockerfile.sidecars"
@skip_unless_docker()
@unittest.skipIf(
os.environ.get("GITEA_ACTIONS") == "true",
"skipped under act_runner: multi-stage build pulls a 200+MB "
"mitmproxy base + two upstream sidecar images; runner storage "
"+ time budget make this an interactive-only test",
)
class TestSidecarBundleImage(unittest.TestCase):
"""Builds the image once for the class, then runs a few
`docker run` probes against it."""
@classmethod
def setUpClass(cls) -> None:
repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
proc = subprocess.run(
["docker", "build", "-t", _IMAGE,
"-f", _DOCKERFILE, "."],
cwd=repo_root,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
)
if proc.returncode != 0:
raise unittest.SkipTest(
f"docker build failed; skipping image probes.\n"
f"{proc.stdout.decode('utf-8', errors='replace')[-2000:]}"
)
@classmethod
def tearDownClass(cls) -> None:
subprocess.run(
["docker", "image", "rm", "-f", _IMAGE],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
)
def _run_in_image(self, *cmd: str, timeout: float = 30.0) -> tuple[int, str]:
proc = subprocess.run(
["docker", "run", "--rm", "--entrypoint", cmd[0], _IMAGE,
*cmd[1:]],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
timeout=timeout,
)
return proc.returncode, proc.stdout.decode("utf-8", errors="replace")
def test_pipelock_binary_present_and_versioned(self):
rc, out = self._run_in_image("/usr/local/bin/pipelock", "version")
self.assertEqual(0, rc, msg=out)
self.assertIn("pipelock version", out)
def test_gitleaks_binary_present_and_versioned(self):
rc, out = self._run_in_image("/usr/bin/gitleaks", "version")
self.assertEqual(0, rc, msg=out)
# gitleaks prints a bare version string like "v8.x.y".
self.assertRegex(out, r"v?\d+\.\d+")
def test_mitmdump_binary_present_and_versioned(self):
rc, out = self._run_in_image("mitmdump", "--version")
self.assertEqual(0, rc, msg=out)
self.assertIn("Mitmproxy", out)
def test_python_imports_supervise_module(self):
# The bundle's supervise daemon imports `supervise` as a
# same-directory sibling of `supervise_server`. Probe the
# import resolves with `python3 -c` from /app (the
# Dockerfile's WORKDIR).
rc, out = self._run_in_image(
"python3", "-c",
"import supervise; import supervise_server; print('ok')",
)
self.assertEqual(0, rc, msg=out)
self.assertIn("ok", out)
def test_init_supervisor_runs_with_no_daemons(self):
# `nothing` matches no canonical daemon → supervisor exits 0
# immediately with the documented message. Confirms the
# ENTRYPOINT wiring works.
proc = subprocess.run(
["docker", "run", "--rm",
"-e", "CLAUDE_BOTTLE_SIDECAR_DAEMONS=nothing",
_IMAGE],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
timeout=10.0,
)
out = proc.stdout.decode("utf-8", errors="replace")
self.assertEqual(0, proc.returncode, msg=out)
self.assertIn("no daemons selected", out)
if __name__ == "__main__":
unittest.main()
+299
View File
@@ -0,0 +1,299 @@
"""Unit: sidecar bundle init supervisor (PRD 0024 chunk 1).
Tests both the helper functions in `claude_bottle.sidecar_init`
and the supervisor's end-to-end signal / exit-code behavior. The
end-to-end tests use real subprocesses (`/bin/sleep`,
`/bin/sh -c '...'`) — short-lived, no docker required — so they
run under `tests/unit/` rather than `tests/integration/`."""
from __future__ import annotations
import os
import signal
import subprocess
import sys
import time
import unittest
from pathlib import Path
from unittest.mock import patch
from claude_bottle.sidecar_init import (
_DaemonSpec,
_Supervisor,
_selected_daemons,
)
class TestSelectedDaemons(unittest.TestCase):
"""Env-var subset filtering. The compose renderer is the source
of truth for which daemons are wired; the supervisor just
honors what it's told."""
_DAEMONS = (
_DaemonSpec("egress", ("/bin/sh", "-c", ":")),
_DaemonSpec("pipelock", ("/bin/sh", "-c", ":")),
_DaemonSpec("git-gate", ("/bin/sh", "-c", ":")),
_DaemonSpec("supervise", ("/bin/sh", "-c", ":")),
)
def test_unset_returns_all(self):
got = _selected_daemons({}, all_daemons=self._DAEMONS)
self.assertEqual([d.name for d in got],
["egress", "pipelock", "git-gate", "supervise"])
def test_empty_returns_all(self):
got = _selected_daemons({"CLAUDE_BOTTLE_SIDECAR_DAEMONS": ""},
all_daemons=self._DAEMONS)
self.assertEqual(4, len(got))
def test_whitespace_only_returns_all(self):
got = _selected_daemons({"CLAUDE_BOTTLE_SIDECAR_DAEMONS": " "},
all_daemons=self._DAEMONS)
self.assertEqual(4, len(got))
def test_explicit_subset(self):
got = _selected_daemons(
{"CLAUDE_BOTTLE_SIDECAR_DAEMONS": "egress,pipelock"},
all_daemons=self._DAEMONS,
)
self.assertEqual([d.name for d in got], ["egress", "pipelock"])
def test_preserves_canonical_order(self):
# Order in the env var doesn't matter; the result follows
# the canonical _DAEMONS order so egress starts before
# pipelock (race-window reason).
got = _selected_daemons(
{"CLAUDE_BOTTLE_SIDECAR_DAEMONS": "supervise,pipelock,egress"},
all_daemons=self._DAEMONS,
)
self.assertEqual([d.name for d in got],
["egress", "pipelock", "supervise"])
def test_unknown_names_ignored(self):
got = _selected_daemons(
{"CLAUDE_BOTTLE_SIDECAR_DAEMONS": "egress,bogus"},
all_daemons=self._DAEMONS,
)
self.assertEqual([d.name for d in got], ["egress"])
def test_whitespace_in_names_stripped(self):
got = _selected_daemons(
{"CLAUDE_BOTTLE_SIDECAR_DAEMONS": " egress , pipelock "},
all_daemons=self._DAEMONS,
)
self.assertEqual([d.name for d in got], ["egress", "pipelock"])
class TestSupervisor(unittest.TestCase):
"""End-to-end: drive `_Supervisor` directly with fake commands.
We don't go through `main()` because main installs signal
handlers process-wide, which collides with the test runner."""
def _drive(self, sup: _Supervisor, max_wait_s: float = 6.0) -> int:
deadline = time.monotonic() + max_wait_s
while not sup.tick():
if time.monotonic() > deadline:
self.fail("supervisor watch loop did not converge in time")
time.sleep(0.05)
return sup.exit_code()
def test_all_children_succeed_returns_zero(self):
# `sh -c :` exits 0 immediately. With the new failure
# policy a child dying doesn't trigger shutdown, so the
# loop only converges once BOTH have exited on their own.
# Both exit 0 → max(0, 0) = 0.
specs = [
_DaemonSpec("a", ("/bin/sh", "-c", ":")),
_DaemonSpec("b", ("/bin/sh", "-c", ":")),
]
sup = _Supervisor(specs)
sup.start_all()
rc = self._drive(sup)
self.assertEqual(0, rc)
def test_child_crash_does_not_initiate_shutdown(self):
# Failure policy (PRD 0024, interim): a child dying
# unexpectedly is logged but the supervisor does NOT tear
# down the survivors. Verified by giving the crasher
# ~0.3s to die, then asserting the long-runner is still
# up and the supervisor never set shutdown_at.
specs = [
_DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
_DaemonSpec("longrun", ("/bin/sleep", "30")),
]
sup = _Supervisor(specs)
sup.start_all()
# Drive ticks for a while; crasher should die, longrun
# should survive.
deadline = time.monotonic() + 1.0
while time.monotonic() < deadline:
done = sup.tick()
self.assertFalse(done, "loop converged with a child still alive")
if sup.procs[0][1].poll() is not None:
break
time.sleep(0.05)
self.assertEqual(1, sup.procs[0][1].returncode,
"crasher should have exited 1")
self.assertIsNone(sup.procs[1][1].poll(),
"longrun should still be running")
self.assertIsNone(sup.shutdown_at,
"supervisor must not initiate shutdown on child death")
# Clean up — explicit signal-driven shutdown.
sup.request_shutdown(reason="test-teardown")
self._drive(sup)
def test_crash_then_signal_surfaces_nonzero_exit_code(self):
# The crasher's exit code is what reaches the container
# exit even though shutdown was triggered by SIGTERM.
# exit_code() = max(child returncodes) → 1 wins over the
# signal-killed longrun's negative returncode.
specs = [
_DaemonSpec("crasher", ("/bin/sh", "-c", "exit 1")),
_DaemonSpec("longrun", ("/bin/sleep", "30")),
]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.3) # let crasher die
sup.request_shutdown(reason="test")
rc = self._drive(sup)
self.assertEqual(1, rc)
def test_all_children_die_unattended_loop_converges(self):
# If nobody sends a signal but every child eventually
# dies on its own, the supervisor still exits — nothing
# left to supervise.
specs = [
_DaemonSpec("a", ("/bin/sh", "-c", "exit 0")),
_DaemonSpec("b", ("/bin/sh", "-c", "exit 2")),
]
sup = _Supervisor(specs)
sup.start_all()
rc = self._drive(sup)
self.assertEqual(2, rc)
self.assertIsNone(sup.shutdown_at)
def test_shutdown_after_start_terminates_children(self):
# Two long-running children. Caller requests shutdown;
# both should receive SIGTERM and exit. exit_code() is
# max of (returncodes) — both signal-killed (negative),
# so max() picks 0 in the typical case (or the
# platform-specific signal returncode).
specs = [
_DaemonSpec("a", ("/bin/sleep", "60")),
_DaemonSpec("b", ("/bin/sleep", "60")),
]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.2) # let them actually start
sup.request_shutdown(reason="test")
rc = self._drive(sup)
self.assertIsNotNone(rc)
# Both children got the signal — neither survived past
# the grace deadline.
for _, p in sup.procs:
self.assertIsNotNone(p.returncode)
def test_grace_period_escalates_to_sigkill(self):
# A child that ignores SIGTERM. The supervisor's
# _GRACE_SECONDS is 8s globally; we patch it to 0.3 so the
# test stays fast.
ignore_term = (
"/bin/sh", "-c",
"trap '' TERM; sleep 30",
)
specs = [_DaemonSpec("stubborn", ignore_term)]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.3) # let `trap` register
sup.request_shutdown(reason="test")
with patch("claude_bottle.sidecar_init._GRACE_SECONDS", 0.3):
rc = self._drive(sup, max_wait_s=4.0)
# Process was SIGKILL'd → returncode -9 on POSIX.
self.assertEqual(-9, sup.procs[0][1].returncode)
self.assertIsNotNone(rc)
def test_idempotent_shutdown_requests(self):
specs = [_DaemonSpec("a", ("/bin/sleep", "60"))]
sup = _Supervisor(specs)
sup.start_all()
time.sleep(0.1)
first_at = None
sup.request_shutdown(reason="first")
first_at = sup.shutdown_at
# Second call must NOT reset the deadline (otherwise the
# grace timer is gameable by a noisy signal).
sup.request_shutdown(reason="second")
self.assertEqual(first_at, sup.shutdown_at)
self._drive(sup)
class TestMainEndToEnd(unittest.TestCase):
"""Run sidecar_init.py as a real subprocess to cover the
signal-handler installation path. Skipped on platforms
without /bin/sleep + /bin/sh."""
@classmethod
def setUpClass(cls):
for p in ("/bin/sh", "/bin/sleep"):
if not Path(p).exists():
raise unittest.SkipTest(f"missing {p}")
def _run(self, daemons_csv: str, send_signal: int | None,
wait_before_signal: float = 0.4,
overall_timeout: float = 6.0) -> tuple[int, str]:
"""Spawn sidecar_init.main() in a child process with the
DAEMONS list patched to harmless `sleep 30` commands.
Returns (returncode, captured stdout)."""
helper = (
"import os, runpy, sys\n"
"from claude_bottle import sidecar_init as si\n"
"si._DAEMONS = (\n"
" si._DaemonSpec('alpha', ('/bin/sleep','30')),\n"
" si._DaemonSpec('beta', ('/bin/sleep','30')),\n"
")\n"
"sys.exit(si.main([]))\n"
)
env = {**os.environ, "CLAUDE_BOTTLE_SIDECAR_DAEMONS": daemons_csv}
proc = subprocess.Popen(
[sys.executable, "-c", helper],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
env=env,
)
try:
if send_signal is not None:
time.sleep(wait_before_signal)
proc.send_signal(send_signal)
out_b, _ = proc.communicate(timeout=overall_timeout)
except subprocess.TimeoutExpired:
proc.kill()
out_b, _ = proc.communicate()
self.fail("sidecar_init main() did not exit before timeout")
return proc.returncode, out_b.decode("utf-8", errors="replace")
def test_sigterm_clean_shutdown(self):
rc, out = self._run("alpha,beta", signal.SIGTERM)
self.assertIn("starting alpha", out)
self.assertIn("starting beta", out)
self.assertIn("forwarding SIGTERM", out)
# Sleep terminated by SIGTERM exits with returncode -15;
# supervisor surfaces that via max(...) and main()
# returns -15 → process exit becomes 256-15 = 241.
# On macOS bash may convert to 143. Either way, nonzero
# AND the child finished — we don't pin the exact code.
self.assertNotEqual(0, rc)
def test_empty_daemon_set_exits_zero_immediately(self):
# Use a sentinel value that filters out both alpha+beta.
rc, out = self._run("nothing", send_signal=None,
overall_timeout=2.0)
self.assertEqual(0, rc)
self.assertIn("no daemons selected", out)
if __name__ == "__main__":
unittest.main()