refactor!: rename project to bot-bottle

Assisted-by: Codex
This commit is contained in:
2026-05-28 17:56:14 -04:00
parent 8875d8cc17
commit c08b09dc9f
200 changed files with 1271 additions and 1271 deletions
+439
View File
@@ -0,0 +1,439 @@
"""Per-backend bottle factories.
A bottle is a running, isolated environment with claude inside. Each
backend exposes five methods:
prepare(spec, stage_dir=...) -> BottlePlan
Resolves names, validates host-side prerequisites, and writes
scratch files. No remote/runtime resources are created yet.
Safe to call before the y/N preflight.
launch(plan) -> ContextManager[Bottle]
Brings up the container (or VM, or remote machine), provisions
it, yields a Bottle handle, and tears everything down on exit.
prepare_cleanup() -> BottleCleanupPlan
Enumerates orphaned resources left behind by previous bottles
(containers, networks, ...). Idempotent; no side effects.
cleanup(plan) -> None
Actually removes everything described by the cleanup plan.
enumerate_active() -> Sequence[ActiveAgent]
Return every currently-running bottle on this backend, with
enough metadata for callers (CLI `list active`, dashboard
agents pane) to render a row.
Selection is driven by `--backend` on `start` or
BOT_BOTTLE_BACKEND (env var; default "docker"). Per PRD 0003 the
manifest does not carry a backend field; the host picks.
"""
from __future__ import annotations
import os
from abc import ABC, abstractmethod
from contextlib import AbstractContextManager
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Generic, Sequence, TypeVar
from ..log import die
from ..manifest import GitEntry, Manifest
from ..util import expand_tilde
from .util import host_skill_dir
@dataclass(frozen=True)
class BottleSpec:
"""CLI-supplied intent. Backend-agnostic — each backend's prepare
step consumes it and produces its own backend-specific plan.
Resolved values (image names, container name, scratch paths, runsc
availability) live on the plan, not the spec."""
manifest: Manifest
agent_name: str
copy_cwd: bool
user_cwd: str
# PRD 0016 follow-up: when set, the backend's prepare step uses
# this identity instead of minting a fresh one — the resume path
# (`cli.py resume <identity>`) sets this to continue an existing
# bottle's state. Empty string for a fresh `start`.
identity: str = ""
@dataclass(frozen=True)
class BottlePlan(ABC):
"""Base output of a backend's prepare step. Concrete subclasses
(e.g. DockerBottlePlan) add backend-specific resolved fields and
implement `print`."""
spec: BottleSpec
stage_dir: Path
@abstractmethod
def print(self, *, remote_control: bool) -> None:
"""Render the y/N preflight summary to stderr."""
@dataclass(frozen=True)
class BottleCleanupPlan(ABC):
"""Base output of a backend's prepare_cleanup step. Concrete
subclasses (e.g. DockerBottleCleanupPlan) carry backend-specific
lists of resources to be removed and implement `print` + `empty`."""
@abstractmethod
def print(self) -> None:
"""Render the cleanup y/N summary to stderr."""
@property
@abstractmethod
def empty(self) -> bool:
"""True iff there is nothing to clean up; the CLI uses this to
short-circuit before showing the y/N."""
@dataclass(frozen=True)
class ExecResult:
"""Captured result of `Bottle.exec`. Backend-neutral: the Docker
impl populates it from a `subprocess.CompletedProcess`, but a
future fly/smolmachines backend could populate it from any source
that produces a returncode + captured streams."""
returncode: int
stdout: str
stderr: str
@dataclass(frozen=True)
class ActiveAgent:
"""One currently-running agent, as the CLI `list active` and
dashboard agents pane render it. ("Agent" is the project's
consistent name for the thing running inside a bottle — the
bottle is the container, the agent is what runs in it.)
Fields are deliberately backend-neutral. `services` is the set
of sidecar daemons currently up for this bottle (`pipelock`,
`egress`, `git-gate`, `supervise`); the dashboard uses it to
gate edit verbs. `backend_name` is the matching key in
`_BACKENDS` (`docker` / `smolmachines`) — used by the active-
list rendering to disambiguate and by the dashboard's
re-attach path."""
backend_name: str
slug: str
agent_name: str # from metadata.json; "?" if missing
started_at: str # ISO 8601 from metadata.json; "" if missing
services: tuple[str, ...] # alphabetical
class Bottle(ABC):
"""Handle to a running bottle. Yielded by a backend's launch step.
`exec_claude` runs `claude` inside the bottle and blocks until the
session ends. `exec` runs a POSIX shell script inside the bottle
and returns the captured result. `cp_in` copies a host path into
the bottle. `close` is an idempotent alias for context-manager
teardown.
"""
name: str
@abstractmethod
def claude_argv(
self, argv: list[str], *, tty: bool = True,
) -> list[str]:
"""Return the host-side argv that runs `claude <argv>`
inside the bottle. Used by `exec_claude` for foreground
handoffs and by the dashboard's tmux `respawn-pane` flow,
which needs the argv up front (it spawns claude in a tmux
pane rather than as a child of the current process).
Implementations transparently inject
`--append-system-prompt-file` when the bottle was launched
with a provisioned prompt path."""
...
@abstractmethod
def exec_claude(self, argv: list[str], *, tty: bool = True) -> int: ...
@abstractmethod
def exec(self, script: str, *, user: str = "node") -> ExecResult:
"""Run `script` as a POSIX shell script inside the bottle as
`user` (default `node`, matching the agent image's USER
directive) and return the captured stdout/stderr/returncode.
The bottle's environment (including HTTPS_PROXY pointing at
the pipelock sidecar) is inherited by the child. Non-zero
exit does not raise — callers inspect `returncode`
themselves.
Pass `user="root"` for shell-outs that need privileged file
writes / package install — provisioning calls that need root
bypass `Bottle.exec` and use the backend-specific raw
machine-exec helper, but the tests have a legitimate use
case for arbitrary-user runs."""
@abstractmethod
def cp_in(self, host_path: str, container_path: str) -> None: ...
@abstractmethod
def close(self) -> None: ...
PlanT = TypeVar("PlanT", bound=BottlePlan)
CleanupT = TypeVar("CleanupT", bound=BottleCleanupPlan)
class BottleBackend(ABC, Generic[PlanT, CleanupT]):
"""Abstract base for selectable bottle backends. Concrete subclasses
(e.g. DockerBottleBackend) own their own prepare/launch impls.
Parameterized over the backend's concrete plan + cleanup-plan types
so subclass methods get the narrow type without isinstance
boilerplate."""
name: str
def prepare(self, spec: BottleSpec, *, stage_dir: Path) -> PlanT:
"""Template method: run cross-backend host-side validation, then
delegate to the subclass's `_resolve_plan` for the
backend-specific resolution (names, scratch files, etc.). The
validation step is enforced here so a future backend cannot
accidentally skip it. No remote/runtime resources are created."""
self._validate(spec)
return self._resolve_plan(spec, stage_dir=stage_dir)
def _validate(self, spec: BottleSpec) -> None:
"""Cross-backend pre-launch checks. Confirms the agent exists,
the named skills are present on the host, and every git
IdentityFile resolves. Subclasses with additional preconditions
should override and call `super()._validate(spec)` first."""
manifest = spec.manifest
manifest.require_agent(spec.agent_name)
agent = manifest.agents[spec.agent_name]
bottle = manifest.bottle_for(spec.agent_name)
self._validate_skills(agent.skills)
self._validate_git_entries(bottle.git)
self._validate_agent_provider_dockerfile(spec)
def _validate_skills(self, skills: Sequence[str]) -> None:
"""Each named skill must be a directory under the host's
`~/.claude/skills/`. The check is purely host-side, so the
default impl covers every backend."""
for name in skills:
path = host_skill_dir(name)
if not os.path.isdir(path):
die(
f"skill '{name}' not found on host at {path}. "
f"Create it under ~/.claude/skills/, then re-run."
)
def _validate_git_entries(self, entries: Sequence[GitEntry]) -> None:
"""Each entry's IdentityFile must exist on the host (after
expanding leading ~) — the git-gate copies it in at start time
to authenticate the upstream push (PRD 0008). Shape is already
enforced by Manifest validation; this only checks presence."""
for entry in entries:
key = expand_tilde(entry.IdentityFile)
if not os.path.isfile(key):
die(f"git upstream key file not found for '{entry.Name}': {key}")
def _validate_agent_provider_dockerfile(self, spec: BottleSpec) -> None:
bottle = spec.manifest.bottle_for(spec.agent_name)
dockerfile = bottle.agent_provider.dockerfile
if not dockerfile:
return
path = Path(expand_tilde(dockerfile))
if not path.is_absolute():
path = Path(spec.user_cwd) / path
if not path.is_file():
die(
f"agent_provider.dockerfile for bottle "
f"'{spec.manifest.agents[spec.agent_name].bottle}' not found: {path}"
)
@abstractmethod
def _resolve_plan(self, spec: BottleSpec, *, stage_dir: Path) -> PlanT:
"""Backend-specific plan resolution: image/container names,
env-file, prompt-file, proxy plan, runtime detection. Called by
`prepare` after `_validate` succeeds."""
@abstractmethod
def launch(self, plan: PlanT) -> AbstractContextManager[Bottle]:
"""Build/run the bottle and yield a handle; tear down on exit."""
def provision(self, plan: PlanT, target: str) -> str | None:
"""Copy host-side files (CA cert, prompt, skills, .git) into
the running bottle. Called from `launch` after the container
/ machine is up. `target` identifies the running instance in
backend-specific terms (Docker: resolved container name; fly:
machine id). Returns the in-container prompt path if a prompt
was provisioned, else None — the Bottle handle uses it to
decide whether to add --append-system-prompt-file to claude's
argv.
Default orchestration: ca → prompt → skills → git →
supervise. CA install runs first so the agent's trust store
is rebuilt before anything inside the agent makes a TLS call.
Subclasses typically don't override this; they implement the
sub-methods below.
PRD 0017: cred-proxy's agent-side dotfile rewrites (~/.npmrc,
~/.gitconfig insteadOf, tea config) are gone. Egress-proxy is
on the agent's HTTP_PROXY path so every tool that respects
HTTPS_PROXY (claude-code, git over HTTPS, npm, curl) is
intercepted without per-tool reconfiguration."""
self.provision_ca(plan, target)
prompt_path = self.provision_prompt(plan, target)
self.provision_skills(plan, target)
self.provision_git(plan, target)
self.provision_supervise(plan, target)
return prompt_path
def provision_ca(self, plan: PlanT, target: str) -> None:
"""Install the per-bottle CA into the agent's trust store so
the agent trusts the bumped CONNECT cert egress (was
pipelock, pre-PRD-0017) presents. Default impl is a no-op so
backends that don't yet support TLS interception (every backend
except Docker today) aren't forced to implement it. The Docker
backend overrides to docker-cp the cert in and run
`update-ca-certificates`."""
@abstractmethod
def provision_prompt(self, plan: PlanT, target: str) -> str | None:
"""Copy the prompt file into the running bottle. Returns the
in-container path iff the agent has a non-empty prompt;
callers use the return value to decide whether to add
--append-system-prompt-file to claude's argv."""
@abstractmethod
def provision_skills(self, plan: PlanT, target: str) -> None:
"""Copy the agent's named skills from the host into the
running bottle. No-op when the agent has no skills."""
@abstractmethod
def provision_git(self, plan: PlanT, target: str) -> None:
"""Copy the host's cwd `.git` directory into the running
bottle if the user requested --cwd. No-op otherwise."""
def provision_supervise(self, plan: PlanT, target: str) -> None:
"""Write the in-bottle Claude Code MCP config so the agent
discovers the per-bottle supervise sidecar (PRD 0013).
No-op when bottle.supervise is False or the backend doesn't
support the supervise sidecar yet. The Docker backend
overrides."""
@abstractmethod
def prepare_cleanup(self) -> CleanupT:
"""Enumerate orphaned resources from previous bottles. No side
effects; safe to call before the y/N."""
@abstractmethod
def cleanup(self, plan: CleanupT) -> None:
"""Remove everything described by the cleanup plan."""
@abstractmethod
def enumerate_active(self) -> Sequence[ActiveAgent]:
"""Return every currently-running agent on this backend.
Empty when none. Backend-specific: docker queries `docker
compose ls`; smolmachines queries `smolvm machine ls --json`
+ cross-references its bundle container."""
@classmethod
@abstractmethod
def is_available(cls) -> bool:
"""Whether this backend's runtime prerequisites are satisfied
on the current host. Docker → `docker` on PATH; smolmachines
→ `smolvm` on PATH. Used by the cross-backend
`enumerate_active_agents` / `cmd_cleanup` to skip backends
the operator hasn't installed, so a docker-only host
doesn't fail when `cli.py list active` walks past
smolmachines."""
# Import concrete backend classes AFTER the base types are defined, so
# each backend module can pull BottleSpec / BottlePlan / BottleBackend
# via `from . import ...` without hitting a partially-initialized module.
from .docker import DockerBottleBackend # noqa: E402
from .smolmachines import SmolmachinesBottleBackend # noqa: E402
# The dict is heterogeneous: each value is a BottleBackend specialized
# over its own plan type. Concrete plan types are erased here because
# the registry is selected at runtime and the CLI only needs the
# unparameterized methods (prepare → plan → launch(plan), cleanup, etc.).
_BACKENDS: dict[str, BottleBackend[Any, Any]] = {
"docker": DockerBottleBackend(),
"smolmachines": SmolmachinesBottleBackend(),
}
def get_bottle_backend(
name: str | None = None,
) -> BottleBackend[Any, Any]:
"""Resolve the bottle backend.
`name` precedence:
1. explicit arg (CLI `--backend=<name>` passes through here)
2. BOT_BOTTLE_BACKEND env var
3. default `docker`
Dies with a pointer at the known backends if the chosen name
isn't implemented."""
resolved = name or os.environ.get("BOT_BOTTLE_BACKEND") or "docker"
if resolved not in _BACKENDS:
known = ", ".join(sorted(_BACKENDS))
die(f"unknown backend {resolved!r}; known backends: {known}")
return _BACKENDS[resolved]
def known_backend_names() -> tuple[str, ...]:
"""Sorted tuple of all backend keys in `_BACKENDS`. Used by
argparse (`--backend` choices) and the dashboard's backend
picker."""
return tuple(sorted(_BACKENDS))
def has_backend(name: str) -> bool:
"""Whether the named backend's runtime prerequisites are
available on the current host. Cross-backend callers (list,
cleanup) skip unavailable backends so a docker-only host
doesn't fail when the smolmachines backend isn't installed,
and vice versa.
Returns False for unknown names so callers can pass
arbitrary input without separate validation."""
if name not in _BACKENDS:
return False
return _BACKENDS[name].is_available()
def enumerate_active_agents() -> list[ActiveAgent]:
"""All currently-running agents, across every available
backend. Used by CLI `list active` and the dashboard's agents
pane so neither has to know which backends exist. Skips
backends whose `is_available()` reports False. Ordered by
backend name, then by whatever each backend's
`enumerate_active` returns."""
out: list[ActiveAgent] = []
for name in known_backend_names():
if not has_backend(name):
continue
out.extend(_BACKENDS[name].enumerate_active())
return out
__all__ = [
"ActiveAgent",
"Bottle",
"BottleBackend",
"BottleCleanupPlan",
"BottlePlan",
"BottleSpec",
"ExecResult",
"enumerate_active_agents",
"get_bottle_backend",
"has_backend",
"known_backend_names",
]
+33
View File
@@ -0,0 +1,33 @@
"""Docker bottle backend.
The bulk of the implementation lives in sibling modules:
- util: thin Docker subprocess wrappers
- network: Docker network plumbing
- pipelock: DockerPipelockProxy lifecycle
- bottle_plan: DockerBottlePlan
- bottle_cleanup_plan: DockerBottleCleanupPlan
- bottle: DockerBottle handle
- prepare: host-side resolution into a DockerBottlePlan
- launch: bring-up + teardown context manager
- cleanup: orphan enumeration, removal, active listing
- backend: DockerBottleBackend façade wiring the above
This file only re-exports the public names so
`from bot_bottle.backend.docker import DockerBottleBackend` keeps
working.
"""
from __future__ import annotations
from .backend import DockerBottleBackend
from .bottle import DockerBottle
from .bottle_cleanup_plan import DockerBottleCleanupPlan
from .bottle_plan import DockerBottlePlan
__all__ = [
"DockerBottle",
"DockerBottleBackend",
"DockerBottleCleanupPlan",
"DockerBottlePlan",
]
+81
View File
@@ -0,0 +1,81 @@
"""DockerBottleBackend — the Docker implementation of BottleBackend.
This module is a thin façade. The real work lives in four siblings:
- prepare.py — host-side resolution into a DockerBottlePlan
- launch.py — bring-up + teardown context manager
- cleanup.py — orphan enumeration + removal
- enumerate.py — active-agent listing
The base class's `prepare` template runs cross-backend host-side
validation before calling `_resolve_plan` here.
"""
from __future__ import annotations
import shutil
from contextlib import contextmanager
from pathlib import Path
from typing import Generator, Sequence
from .. import ActiveAgent, BottleBackend, BottleSpec
from . import cleanup as _cleanup
from . import enumerate as _enumerate
from . import launch as _launch
from . import prepare as _prepare
from .bottle import DockerBottle
from .bottle_cleanup_plan import DockerBottleCleanupPlan
from .bottle_plan import DockerBottlePlan
from .provision import ca as _ca
from .provision import git as _git
from .provision import prompt as _prompt
from .provision import skills as _skills
from .provision import supervise as _supervise_prov
class DockerBottleBackend(BottleBackend["DockerBottlePlan", "DockerBottleCleanupPlan"]):
"""Docker backend implementation. Selected by BOT_BOTTLE_BACKEND
(default)."""
name = "docker"
@classmethod
def is_available(cls) -> bool:
"""`docker` on PATH is sufficient; we don't probe `docker info`
eagerly because the cross-backend enumerator runs this on
every `list active` and we'd pay a subprocess per call. A
broken daemon will surface its own error during prepare /
launch."""
return shutil.which("docker") is not None
def _resolve_plan(self, spec: BottleSpec, *, stage_dir: Path) -> DockerBottlePlan:
return _prepare.resolve_plan(spec, stage_dir=stage_dir)
@contextmanager
def launch(self, plan: DockerBottlePlan) -> Generator[DockerBottle, None, None]:
with _launch.launch(plan, provision=self.provision) as bottle:
yield bottle
def provision_ca(self, plan: DockerBottlePlan, target: str) -> None:
_ca.provision_ca(plan, target)
def provision_prompt(self, plan: DockerBottlePlan, target: str) -> str | None:
return _prompt.provision_prompt(plan, target)
def provision_skills(self, plan: DockerBottlePlan, target: str) -> None:
_skills.provision_skills(plan, target)
def provision_git(self, plan: DockerBottlePlan, target: str) -> None:
_git.provision_git(plan, target)
def provision_supervise(self, plan: DockerBottlePlan, target: str) -> None:
_supervise_prov.provision_supervise(plan, target)
def prepare_cleanup(self) -> DockerBottleCleanupPlan:
return _cleanup.prepare_cleanup()
def cleanup(self, plan: DockerBottleCleanupPlan) -> None:
_cleanup.cleanup(plan)
def enumerate_active(self) -> Sequence[ActiveAgent]:
return _enumerate.enumerate_active()
+84
View File
@@ -0,0 +1,84 @@
"""DockerBottle — concrete Bottle handle yielded by DockerBottleBackend."""
from __future__ import annotations
import subprocess
from typing import Callable
from ...agent_provider import prompt_args
from .. import Bottle, ExecResult
class DockerBottle(Bottle):
"""Concrete Bottle for Docker."""
def __init__(
self,
container: str,
teardown: Callable[[], None],
prompt_path_in_container: str | None,
*,
agent_command: str = "claude",
agent_prompt_mode: str = "claude_append_file",
):
self.name = container
self._teardown = teardown
self._prompt_path = prompt_path_in_container
self._agent_command = agent_command
self._agent_prompt_mode = agent_prompt_mode
self.agent_command = agent_command
self.agent_provider_template = (
"codex" if agent_command == "codex" else "claude"
)
self._closed = False
def claude_argv(
self, argv: list[str], *, tty: bool = True,
) -> list[str]:
full_argv = list(argv)
full_argv.extend(
prompt_args(self._agent_prompt_mode, self._prompt_path, argv=full_argv)
)
cmd = ["docker", "exec"]
if tty:
cmd.append("-it")
cmd.extend([self.name, self._agent_command, *full_argv])
return cmd
def exec_claude(self, argv: list[str], *, tty: bool = True) -> int:
return subprocess.run(
self.claude_argv(argv, tty=tty), check=False,
).returncode
def exec(self, script: str, *, user: str = "node") -> ExecResult:
# Pipe via stdin to `sh -s` so the caller never has to worry
# about quoting; the script source lands inside the container
# without crossing argv. `-u <user>` overrides the image's
# default USER — defaults to `node` which is already the
# image's USER, so the explicit flag is a no-op there but
# keeps the cross-backend contract uniform.
result = subprocess.run(
["docker", "exec", "-u", user, "-i", self.name, "sh", "-s"],
input=script,
capture_output=True,
text=True,
check=False,
)
return ExecResult(
returncode=result.returncode,
stdout=result.stdout,
stderr=result.stderr,
)
def cp_in(self, host_path: str, container_path: str) -> None:
subprocess.run(
["docker", "cp", host_path, f"{self.name}:{container_path}"],
stdout=subprocess.DEVNULL,
check=True,
)
def close(self) -> None:
if self._closed:
return
self._closed = True
self._teardown()
@@ -0,0 +1,59 @@
"""DockerBottleCleanupPlan — concrete subclass of BottleCleanupPlan.
PRD 0018 chunk 4: cleanup is centered on compose projects. `docker
compose ls` is the source of truth for what's running; the plan
carries the projects to `compose down`, plus three fallback buckets
for legacy / orphan resources:
- stray_containers: pre-compose `bot-bottle-*` containers not
attached to any compose project. Cleared via `docker rm -f`.
- stray_networks: same idea for networks. Cleared via
`docker network rm`.
- orphan_state_dirs: per-bottle state dirs under
~/.bot-bottle/state/ that have no live compose project AND
no `.preserve` marker. Reaped via `shutil.rmtree`.
Compose-managed networks are removed by `compose down --volumes`,
so they don't appear in stray_networks for a normal project — only
truly leftover ones.
"""
from __future__ import annotations
import sys
from dataclasses import dataclass
from ...log import info
from .. import BottleCleanupPlan
@dataclass(frozen=True)
class DockerBottleCleanupPlan(BottleCleanupPlan):
"""Resources DockerBottleBackend.cleanup will remove. Produced by
`prepare_cleanup`; sorted so the y/N output is stable."""
projects: tuple[str, ...]
stray_containers: tuple[str, ...]
stray_networks: tuple[str, ...]
orphan_state_dirs: tuple[str, ...]
@property
def empty(self) -> bool:
return (
not self.projects
and not self.stray_containers
and not self.stray_networks
and not self.orphan_state_dirs
)
def print(self) -> None:
print(file=sys.stderr)
for name in self.projects:
info(f"compose project: {name}")
for name in self.stray_containers:
info(f"stray container: {name}")
for name in self.stray_networks:
info(f"stray network: {name}")
for name in self.orphan_state_dirs:
info(f"orphan state: {name}")
print(file=sys.stderr)
+97
View File
@@ -0,0 +1,97 @@
"""DockerBottlePlan — concrete subclass of BottlePlan.
Carries the Docker-specific resolved fields produced by
DockerBottleBackend.prepare. The launch step consumes it without
further resolution; show_plan-style rendering is the `print` method.
"""
from __future__ import annotations
import sys
from dataclasses import dataclass, field
from pathlib import Path
from ...egress import EgressPlan
from ...git_gate import GitGatePlan
from ...log import info
from ...pipelock import PipelockProxyPlan
from ...supervise import SupervisePlan
from .. import BottlePlan
from ..print_util import print_multi
@dataclass(frozen=True)
class DockerBottlePlan(BottlePlan):
"""Docker-specific resolved fields produced by
DockerBottleBackend.prepare. Inherits `spec` and `stage_dir` from
BottlePlan."""
slug: str
container_name: str
container_name_pinned: bool
image: str
derived_image: str # "" -> no derived image
runtime_image: str # image actually launched (derived or base)
# Absolute path to the Dockerfile that builds `image`. Empty means
# use the repo's default Dockerfile. Populated to a per-bottle
# state file (~/.bot-bottle/state/<slug>/Dockerfile) after a
# capability-block remediation (PRD 0016).
dockerfile_path: str
env_file: Path # docker --env-file: NAME=VALUE literals
# name -> value for vars forwarded into the docker-run child process
# via subprocess env (so values never land on argv or in a file).
# repr=False keeps secret/interpolated/OAuth values out of any
# accidental log of the plan dataclass.
forwarded_env: dict[str, str] = field(repr=False)
prompt_file: Path
proxy_plan: PipelockProxyPlan
git_gate_plan: GitGatePlan
egress_plan: EgressPlan
# None when bottle.supervise is False. PRD 0013 supervise sidecar
# is opt-in via the manifest's bottle.supervise field.
supervise_plan: SupervisePlan | None
use_runsc: bool
agent_command: str = "claude"
agent_prompt_mode: str = "claude_append_file"
agent_provider_template: str = "claude"
def print(self, *, remote_control: bool) -> None:
"""Render the y/N preflight summary to stderr — compact form
intended to fit on screen without scrolling. The full
structured shape (image, container, runtime, etc.) lives on
this dataclass for tooling that wants to introspect it."""
del remote_control # not surfaced in the compact summary
spec = self.spec
manifest = spec.manifest
agent = manifest.agents[spec.agent_name]
bottle = manifest.bottle_for(spec.agent_name)
# The agent sees the union of literal env names (rendered into
# --env-file) and forwarded env names (`-e NAME` with the
# value arriving via subprocess env). The forwarded set holds
# the OAuth token (CLAUDE_CODE_OAUTH_TOKEN) and any host-env
# interpolations from the manifest; egress holds
# upstream tokens in its own environ, so no token forwarding
# from the agent to the proxy is needed.
env_names = sorted(set(bottle.env.keys()) | set(self.forwarded_env.keys()))
print(file=sys.stderr)
info(f"agent : {spec.agent_name}")
info(f"provider : {self.agent_provider_template}")
print_multi("env ", env_names)
print_multi("skills ", list(agent.skills))
info(f"bottle : {agent.bottle}")
git_lines = [
f"{u.upstream_host}:{u.upstream_port}"
for u in self.git_gate_plan.upstreams
]
if git_lines:
print_multi(" git gate ", git_lines)
if self.egress_plan.routes:
egress_lines = []
for r in self.egress_plan.routes:
auth = f" [auth:{r.auth_scheme}]" if r.auth_scheme else ""
egress_lines.append(f"{r.host}{auth}")
print_multi(" egress ", egress_lines)
print(file=sys.stderr)
+328
View File
@@ -0,0 +1,328 @@
"""Per-bottle persistent state (PRD 0016).
Holds the per-bottle Dockerfile override that capability-block
remediation writes, the transcript snapshot the state-preservation
helper saves before teardown, and the launch metadata that lets
`cli.py resume <identity>` reconstruct a bottle's spec. State
lives at:
~/.bot-bottle/state/<identity>/
metadata.json — agent_name + cwd + started_at (for resume)
Dockerfile — per-bottle override (absent → use repo's)
transcript/ — last snapshotted agent state (best-effort)
When the per-bottle Dockerfile is present, the launch step builds
the agent image with a per-bottle tag (bot-bottle-rebuilt-<id>)
from this file rather than the repo's. The build context is still
the repo root so the Dockerfile can COPY bot_bottle source files
the same way the original does.
Identity model:
- Every `cli.py start <agent>` mints a fresh identity via
`bottle_identity(agent_name)`: slug-prefix for readability plus a
5-char random suffix for parallel-safe uniqueness. The metadata
written at launch time pins (agent_name, cwd) to that identity.
- `cli.py resume <identity>` reads the metadata and re-launches a
bottle pinned to the same identity, picking up any per-bottle
Dockerfile and transcript snapshot.
"""
from __future__ import annotations
import dataclasses
import json
import secrets
import string
from dataclasses import dataclass
from pathlib import Path
from ... import supervise as _supervise
from . import util as docker_mod
# Directory layout: ~/.bot-bottle/state/<identity>/...
_STATE_SUBDIR = "state"
_PER_BOTTLE_DOCKERFILE_NAME = "Dockerfile"
_TRANSCRIPT_SUBDIR = "transcript"
# Per-sidecar scratch subdirs. PRD 0018 chunk 2: bind-mount sources
# live here so chunk 3's `docker compose up` can find them at stable
# paths. Each sidecar's `prepare()` writes config + CAs into its own
# subdir; the launch step is unchanged today (still `docker cp`).
_PIPELOCK_SUBDIR = "pipelock"
_EGRESS_SUBDIR = "egress"
_GIT_GATE_SUBDIR = "git-gate"
_SUPERVISE_SUBDIR = "supervise"
_AGENT_SUBDIR = "agent"
_METADATA_NAME = "metadata.json"
# Live-config dir bind-mounted into the supervise sidecar (read-only).
# Host's apply paths keep these files fresh so supervise's
# `list-pipelock-allowlist` / `list-egress-routes` MCP tools
# return the current state — not a snapshot from launch time.
_LIVE_CONFIG_SUBDIR = "live-config"
LIVE_CONFIG_ROUTES_NAME = "routes.yaml"
LIVE_CONFIG_ALLOWLIST_NAME = "allowlist"
# Empty marker file. capability_apply writes it before teardown so
# cli.py's session-end cleanup knows to preserve the state dir for
# `cli.py resume <identity>`. Absent = clean up.
_PRESERVE_MARKER = ".preserve"
# 5 chars of base36 alphabet ≈ 60M combinations. Plenty for human
# operators starting bottles by hand; collision-free in practice.
_RANDOM_SUFFIX_LEN = 5
_SUFFIX_ALPHABET = string.ascii_lowercase + string.digits
def bottle_identity(agent_name: str) -> str:
"""Mint a fresh per-launch bottle identity. The slug-prefix is
`slugify(agent_name)` for readability; the suffix is 5 random
base36 chars so two simultaneous `start <agent>` invocations
don't collide on container/network names.
Every call produces a different identity (non-deterministic).
To continue an existing bottle's state, use the recorded
identity from BottleMetadata via `cli.py resume <identity>`,
not this function."""
slug = docker_mod.slugify(agent_name)
suffix = "".join(secrets.choice(_SUFFIX_ALPHABET) for _ in range(_RANDOM_SUFFIX_LEN))
return f"{slug}-{suffix}"
@dataclass(frozen=True)
class BottleMetadata:
"""Persistent record of how a bottle was launched, written at
start time and read by `cli.py resume`. Lives at
~/.bot-bottle/state/<identity>/metadata.json."""
identity: str
agent_name: str
cwd: str # empty string when --cwd was not passed
copy_cwd: bool
started_at: str # ISO 8601 UTC
# PRD 0018 chunk 3: derivable from identity via
# `compose_project_name(identity)`, but persisted explicitly so
# dashboard / cleanup / resume tooling can read it without
# importing the compose module. Empty string for state dirs
# written before chunk 3 (resume / inspect should fall back to
# deriving from identity in that case).
compose_project: str = ""
def metadata_path(identity: str) -> Path:
return bottle_state_dir(identity) / _METADATA_NAME
def write_metadata(metadata: BottleMetadata) -> Path:
"""Persist `metadata` to ~/.bot-bottle/state/<identity>/metadata.json.
Mode 0o644 — no secrets, just (agent_name, cwd, timestamp)."""
path = metadata_path(metadata.identity)
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(dataclasses.asdict(metadata), indent=2) + "\n")
path.chmod(0o644)
return path
def read_metadata(identity: str) -> BottleMetadata | None:
"""Return the metadata for `identity`, or None if no state has
been recorded for it. Used by `cli.py resume` to reconstruct
the launch spec."""
path = metadata_path(identity)
if not path.is_file():
return None
raw = json.loads(path.read_text())
if not isinstance(raw, dict):
return None
return BottleMetadata(
identity=str(raw.get("identity", identity)),
agent_name=str(raw.get("agent_name", "")),
cwd=str(raw.get("cwd", "")),
copy_cwd=bool(raw.get("copy_cwd", False)),
started_at=str(raw.get("started_at", "")),
compose_project=str(raw.get("compose_project", "")),
)
def bottle_state_dir(identity: str) -> Path:
"""Per-bottle state directory on the host. Created lazily by the
write helpers; readers tolerate its absence."""
return _supervise.bot_bottle_root() / _STATE_SUBDIR / identity
def per_bottle_dockerfile_path(identity: str) -> Path:
return bottle_state_dir(identity) / _PER_BOTTLE_DOCKERFILE_NAME
def per_bottle_dockerfile(identity: str) -> str | None:
"""Return the per-bottle Dockerfile content if present, else
None. None means: use the repo's Dockerfile (the original
pre-capability-block behavior)."""
p = per_bottle_dockerfile_path(identity)
if p.is_file():
return p.read_text()
return None
def write_per_bottle_dockerfile(identity: str, content: str) -> Path:
p = per_bottle_dockerfile_path(identity)
p.parent.mkdir(parents=True, exist_ok=True)
p.write_text(content)
p.chmod(0o644)
return p
def per_bottle_image_tag(identity: str) -> str:
"""Image tag for a rebuilt bottle. Distinct from the base
bot-bottle-claude:latest so per-bottle rebuilds don't collide in
the docker image cache."""
return f"bot-bottle-rebuilt-{identity}:latest"
def live_config_dir(identity: str) -> Path:
"""Per-bottle live-config dir. Bind-mounted read-only into the
supervise sidecar; the host's apply paths refresh the files on
every operator approval so the agent's `list-*` MCP tools always
return current state."""
return bottle_state_dir(identity) / _LIVE_CONFIG_SUBDIR
def live_routes_path(identity: str) -> Path:
return live_config_dir(identity) / LIVE_CONFIG_ROUTES_NAME
def live_allowlist_path(identity: str) -> Path:
return live_config_dir(identity) / LIVE_CONFIG_ALLOWLIST_NAME
def write_live_config(
identity: str, *, routes: str = "", allowlist: str = "",
) -> Path:
"""Initialise (or refresh) the live-config dir. Empty-string args
leave the existing file alone (caller passes only what it knows).
Returns the live-config dir path."""
d = live_config_dir(identity)
d.mkdir(parents=True, exist_ok=True)
if routes:
p = live_routes_path(identity)
p.write_text(routes)
p.chmod(0o644)
if allowlist:
p = live_allowlist_path(identity)
p.write_text(allowlist)
p.chmod(0o644)
return d
def transcript_snapshot_dir(identity: str) -> Path:
"""Where capability_apply stashes the agent's transcript before
teardown, so the next `cli.py start <agent>` can offer to
resume from it."""
return bottle_state_dir(identity) / _TRANSCRIPT_SUBDIR
# --- Per-sidecar scratch subdirs (PRD 0018 chunk 2) ------------------------
#
# Each sidecar gets its own subdir under the bottle's state dir for
# bind-mount sources (config, CAs, hooks, etc.). Prepare-time writes
# land here; the state dir's normal cleanup (`cleanup_state`) reaps
# them along with everything else when the bottle session ends and
# nothing requested preservation.
def pipelock_state_dir(identity: str) -> Path:
"""State subdir for the pipelock sidecar: pipelock.yaml + the
per-bottle CA cert/key. Bind-mount source from chunk 3 onward."""
return bottle_state_dir(identity) / _PIPELOCK_SUBDIR
def egress_state_dir(identity: str) -> Path:
"""State subdir for the egress sidecar: routes.yaml + the
per-bottle mitmproxy CA. Bind-mount source from chunk 3 onward."""
return bottle_state_dir(identity) / _EGRESS_SUBDIR
def git_gate_state_dir(identity: str) -> Path:
"""State subdir for the git-gate sidecar: entrypoint + hooks +
per-upstream known_hosts. Bind-mount source from chunk 3
onward."""
return bottle_state_dir(identity) / _GIT_GATE_SUBDIR
def supervise_state_dir(identity: str) -> Path:
"""State subdir for the supervise sidecar's current-config dir
(bind-mounted into the agent at /etc/bot-bottle/current-config).
The queue dir is intentionally NOT under here — it lives at
~/.bot-bottle/queue/<slug>/ alongside the audit logs, so it
survives state-dir cleanup."""
return bottle_state_dir(identity) / _SUPERVISE_SUBDIR
def agent_state_dir(identity: str) -> Path:
"""State subdir for the agent's prepare-time scratch files: the
env file (docker --env-file source) and the prompt file."""
return bottle_state_dir(identity) / _AGENT_SUBDIR
# --- Preserve-on-close marker ----------------------------------------------
def preserve_marker_path(identity: str) -> Path:
return bottle_state_dir(identity) / _PRESERVE_MARKER
def mark_preserved(identity: str) -> Path:
"""Mark this bottle's state for preservation across session
teardown. Written by capability_apply.apply_capability_change so
cli.py's session-end cleanup leaves the state dir intact for a
subsequent `cli.py resume`."""
path = preserve_marker_path(identity)
path.parent.mkdir(parents=True, exist_ok=True)
path.touch()
return path
def is_preserved(identity: str) -> bool:
return preserve_marker_path(identity).exists()
def clear_preserve_marker(identity: str) -> None:
"""Idempotent removal. Called at fresh launch (start or resume)
so a marker left from a prior capability-block doesn't keep
state alive past the next normal session-end."""
try:
preserve_marker_path(identity).unlink()
except FileNotFoundError:
pass
def cleanup_state(identity: str) -> None:
"""Remove the per-bottle state dir entirely. Called by cli.py
when a bottle session ends and is_preserved(identity) is False.
Idempotent — missing dir is success."""
import shutil
state_dir = bottle_state_dir(identity)
if state_dir.is_dir():
shutil.rmtree(state_dir, ignore_errors=True)
__all__ = [
"BottleMetadata",
"agent_state_dir",
"bottle_identity",
"bottle_state_dir",
"cleanup_state",
"clear_preserve_marker",
"egress_state_dir",
"git_gate_state_dir",
"is_preserved",
"mark_preserved",
"metadata_path",
"per_bottle_dockerfile",
"per_bottle_dockerfile_path",
"per_bottle_image_tag",
"pipelock_state_dir",
"preserve_marker_path",
"read_metadata",
"supervise_state_dir",
"transcript_snapshot_dir",
"write_metadata",
"write_per_bottle_dockerfile",
]
@@ -0,0 +1,220 @@
"""capability_apply — host-side orchestrator for capability-block
remediation (PRD 0016).
On approval of a capability-block proposal, the dashboard calls
apply_capability_change(slug, new_dockerfile) which:
1. Snapshots the agent's transcript dir to
~/.bot-bottle/state/<slug>/transcript/ (best-effort).
2. Pushes the agent's working tree via `git push` (best-effort —
no upstream / no commits / no git repo all skip with a log).
3. Writes the new Dockerfile to
~/.bot-bottle/state/<slug>/Dockerfile (PRD 0016 Phase 1
state). The next `cli.py start <agent>` picks it up.
4. Force-removes the agent container + all sidecars + the
per-bottle networks. Idempotent — missing resources are not
errors.
Returns (before, after) Dockerfile contents so the dashboard can
record / render the diff. (capability-block has no audit log per
PRD 0013 — the per-bottle Dockerfile state is its own record.)
This is "fire-and-forget" from the agent's perspective: by the time
the dashboard writes the response file the supervise sidecar is
gone, so the agent's tool call connection drops without ever
receiving the response. The replacement agent (next manual
`cli.py start`) sees the new Dockerfile and starts from there.
v1 does not auto-relaunch — see PRD 0016's capability-block return
semantics open question.
"""
from __future__ import annotations
import os
import shutil
import subprocess
from pathlib import Path
from ...log import info, warn
from .bottle_state import (
mark_preserved,
per_bottle_dockerfile,
per_bottle_dockerfile_path,
transcript_snapshot_dir,
write_per_bottle_dockerfile,
)
from .sidecar_bundle import sidecar_bundle_container_name
# Agent home inside the container (per the repo Dockerfile's
# `USER node` + `WORKDIR /home/node`). Used to locate the transcript
# dir + the workspace dir for git push.
_AGENT_HOME_IN_CONTAINER = "/home/node"
_AGENT_TRANSCRIPT_IN_CONTAINER = f"{_AGENT_HOME_IN_CONTAINER}/.claude"
_AGENT_WORKSPACE_IN_CONTAINER = f"{_AGENT_HOME_IN_CONTAINER}/workspace"
# Per-bottle resource name patterns (mirroring prepare.py).
def _agent_container_name(slug: str) -> str:
return f"bot-bottle-{slug}"
def _per_bottle_container_names(slug: str) -> list[str]:
"""All container names that belong to this bottle. Missing
containers are silently skipped by the teardown helper, so it's
fine to include names that don't exist for a given bottle."""
return [
_agent_container_name(slug),
sidecar_bundle_container_name(slug),
]
def _per_bottle_network_names(slug: str) -> list[str]:
return [
f"bot-bottle-net-{slug}",
f"bot-bottle-egress-{slug}",
]
class CapabilityApplyError(RuntimeError):
"""Raised when the apply fails in a way that should keep the
proposal pending (so the operator can retry). Best-effort
failures (transcript snapshot, git push) do not raise — they
just log and proceed."""
# --- Public helpers --------------------------------------------------------
def fetch_current_dockerfile(slug: str) -> str:
"""Return the Dockerfile content the next `cli.py start <agent>`
would use for this bottle. If a per-bottle override exists, that
one; otherwise the repo's Dockerfile.
Used by the operator-edit verb to show the current source of
truth, and by apply_capability_change for the before-diff."""
override = per_bottle_dockerfile(slug)
if override is not None:
return override
repo_dockerfile = _repo_dockerfile_path()
if repo_dockerfile.is_file():
return repo_dockerfile.read_text()
raise CapabilityApplyError(
f"no per-bottle Dockerfile for {slug} and no repo Dockerfile at "
f"{repo_dockerfile}"
)
def apply_capability_change(slug: str, new_dockerfile: str) -> tuple[str, str]:
"""End-to-end capability-block remediation. See module docstring
for the sequence. Returns (before, after) Dockerfile content."""
if not new_dockerfile.strip():
raise CapabilityApplyError("proposed Dockerfile is empty")
before = fetch_current_dockerfile(slug)
snapshot_transcript(slug)
_push_working_tree(slug)
write_per_bottle_dockerfile(slug, new_dockerfile)
# Set the preserve marker BEFORE teardown so cli.py's session-end
# cleanup sees it and keeps the state dir intact for the
# operator's `cli.py resume <identity>`. Without the marker the
# state dir would be deleted as part of normal session end.
mark_preserved(slug)
_teardown_bottle(slug)
return before, new_dockerfile
# --- Internals -------------------------------------------------------------
def _repo_dockerfile_path() -> Path:
"""Path to the repo's Claude Dockerfile (one dir above this module's
package root). Resolved at call time so the path is correct
regardless of where this module is imported from."""
# bot_bottle/backend/docker/capability_apply.py -> repo root
return Path(__file__).resolve().parent.parent.parent.parent / "Dockerfile.claude"
def snapshot_transcript(slug: str) -> None:
"""`docker cp` /home/node/.claude out of the agent container into
~/.bot-bottle/state/<slug>/transcript/. Best-effort: missing
container, missing dir, or cp error all log a warning and return.
The transcript is what `claude --resume` reads to pick up where
the agent left off.
Called from two places:
- capability-apply, before tearing the bottle down.
- cli.py's session-end path, before the launch context closes,
so a crash or normal exit also leaves a transcript on disk
(deleted along with the state dir on clean exit, kept on
crash or capability-block per the preserve marker)."""
container = _agent_container_name(slug)
dest = transcript_snapshot_dir(slug)
if dest.exists():
# Remove any prior snapshot so the new one is a clean copy.
shutil.rmtree(dest, ignore_errors=True)
dest.parent.mkdir(parents=True, exist_ok=True)
r = subprocess.run(
["docker", "cp", f"{container}:{_AGENT_TRANSCRIPT_IN_CONTAINER}", str(dest)],
capture_output=True, text=True, check=False,
)
if r.returncode != 0:
warn(
f"transcript snapshot skipped "
f"({(r.stderr or '').strip() or 'no transcript dir in container?'})"
)
return
info(f"transcript snapshotted to {dest}")
def _push_working_tree(slug: str) -> None:
"""`docker exec <agent> git push` from /home/node/workspace.
Best-effort: not-a-git-repo, no upstream, nothing-to-push, no
network all log a warning and return. The replacement bottle
will pick up whatever's actually upstream."""
container = _agent_container_name(slug)
r = subprocess.run(
[
"docker", "exec", container, "sh", "-c",
f"cd {_AGENT_WORKSPACE_IN_CONTAINER} && "
f"git rev-parse --is-inside-work-tree >/dev/null 2>&1 && "
f"git push origin HEAD 2>&1 || true",
],
capture_output=True, text=True, check=False,
)
if r.returncode != 0:
warn(
f"capability-apply: git push skipped "
f"({(r.stderr or '').strip() or 'docker exec failed'})"
)
return
output = (r.stdout or "").strip()
if output:
info(f"capability-apply: git push: {output}")
else:
info("capability-apply: git push ran (no output — likely not a git workspace)")
def _teardown_bottle(slug: str) -> None:
"""Force-remove all per-bottle docker resources. Idempotent —
`docker rm -f` / `docker network rm` silently ignore missing
names, so this can be called even mid-rebuild."""
info(f"capability-apply: tearing down bottle {slug}")
for name in _per_bottle_container_names(slug):
subprocess.run(
["docker", "rm", "-f", name],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False,
)
for net in _per_bottle_network_names(slug):
subprocess.run(
["docker", "network", "rm", net],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False,
)
__all__ = [
"CapabilityApplyError",
"apply_capability_change",
"fetch_current_dockerfile",
"snapshot_transcript",
]
+180
View File
@@ -0,0 +1,180 @@
"""Cleanup for the Docker bottle backend.
PRD 0018 chunk 4: cleanup is centered on `docker compose ls`.
Pre-compose code paths could leave bare containers / networks
without a compose project; those still show up via the prefix
scan, just as a fallback bucket alongside the project list.
`prepare_cleanup` enumerates:
- Live compose projects whose name starts with `bot-bottle-`.
- `bot-bottle-*` containers that aren't part of any compose
project (legacy orphans).
- `bot-bottle-*` networks that aren't tied to a compose
project (legacy orphans; compose-managed networks come down
with `compose down --volumes` and don't appear here).
- State dirs under ~/.bot-bottle/state/<identity>/ with no
live compose project AND no `.preserve` marker.
`cleanup` removes everything in the plan.
Active-agent enumeration lives in `backend/docker/enumerate.py`
(mirror of `backend/smolmachines/enumerate.py`).
"""
from __future__ import annotations
import shutil
import subprocess
from ... import supervise as _supervise
from ...log import info, warn
from . import util as docker_mod
from .bottle_cleanup_plan import DockerBottleCleanupPlan
from .bottle_state import bottle_state_dir, is_preserved
from .compose import COMPOSE_PROJECT_PREFIX, list_compose_projects
def _list_prefixed_containers() -> list[str]:
"""All bot-bottle-prefixed containers, running or stopped."""
result = subprocess.run(
["docker", "ps", "-a",
"--filter", f"name=^{COMPOSE_PROJECT_PREFIX}",
"--format", "{{.Names}}\t{{.Label \"com.docker.compose.project\"}}"],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
warn(f"docker ps failed: {result.stderr.strip()}")
return []
out: list[str] = []
for line in (result.stdout or "").splitlines():
if not line:
continue
name, _, project = line.partition("\t")
# Stray = no compose label. Compose-managed containers carry
# `com.docker.compose.project=<name>`; we'll reap those via
# `compose down`, not via container rm.
if not project:
out.append(name)
return sorted(set(out))
def _list_prefixed_networks() -> list[str]:
"""All bot-bottle-prefixed networks not currently attached
to a compose project. Compose-managed networks have a
`com.docker.compose.project` label; bare ones (from pre-compose
code paths) don't."""
result = subprocess.run(
["docker", "network", "ls",
"--filter", f"name={COMPOSE_PROJECT_PREFIX}",
"--format", "{{.Name}}\t{{.Label \"com.docker.compose.project\"}}"],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
warn(f"docker network ls failed: {result.stderr.strip()}")
return []
out: list[str] = []
for line in (result.stdout or "").splitlines():
if not line:
continue
name, _, project = line.partition("\t")
if not project:
out.append(name)
return sorted(set(out))
def _list_orphan_state_dirs(
live_projects: set[str], protected_identities: set[str],
) -> list[str]:
"""State identities whose compose project isn't running and
that don't have a `.preserve` marker. `.preserve` means the
user (or an auto-preserve-on-crash) wants the state kept for
`resume`.
`protected_identities` is the set of slugs that are live in
ANY backend — used so this docker-side check doesn't reap a
running smolmachines bottle's state dir (the layout is shared
across both backends)."""
state_root = _supervise.bot_bottle_root() / "state"
if not state_root.is_dir():
return []
orphans: list[str] = []
for child in sorted(state_root.iterdir()):
if not child.is_dir():
continue
identity = child.name
project = f"{COMPOSE_PROJECT_PREFIX}{identity}"
if project in live_projects:
continue
if identity in protected_identities:
continue
if is_preserved(identity):
continue
orphans.append(identity)
return orphans
def prepare_cleanup() -> DockerBottleCleanupPlan:
"""Enumerate everything cleanup will touch. No removals.
Pulls the union of live identities across backends via
`enumerate_active_agents()` so the orphan-state-dir bucket
doesn't include slugs whose smolmachines VM is still up."""
docker_mod.require_docker()
projects = list_compose_projects()
project_set = set(projects)
# Late import to avoid a circular at module-load time —
# the backend package's __init__ imports this module.
from .. import enumerate_active_agents
protected = {a.slug for a in enumerate_active_agents()}
return DockerBottleCleanupPlan(
projects=tuple(projects),
stray_containers=tuple(_list_prefixed_containers()),
stray_networks=tuple(_list_prefixed_networks()),
orphan_state_dirs=tuple(
_list_orphan_state_dirs(project_set, protected),
),
)
def cleanup(plan: DockerBottleCleanupPlan) -> None:
"""Remove everything in the plan. Projects first (whose `compose
down` reaps their containers + networks atomically), then stray
legacy resources, then orphan state dirs."""
for project in plan.projects:
info(f"docker compose down ({project})")
result = subprocess.run(
["docker", "compose", "-p", project, "down", "--volumes"],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
warn(
f"compose down failed for {project}: "
f"{result.stderr.strip()}"
)
for name in plan.stray_containers:
info(f"removing stray container {name}")
subprocess.run(
["docker", "rm", "-f", name],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
)
for name in plan.stray_networks:
info(f"removing stray network {name}")
subprocess.run(
["docker", "network", "rm", name],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
)
for identity in plan.orphan_state_dirs:
path = bottle_state_dir(identity)
info(f"removing orphan state dir {path}")
try:
shutil.rmtree(path, ignore_errors=True)
except OSError as e:
warn(f"failed to remove {path}: {e}")
+528
View File
@@ -0,0 +1,528 @@
"""Compose-spec rendering for a Docker bottle (PRD 0018, chunk 1).
`bottle_plan_to_compose(plan)` returns a Compose v2 spec dict
describing the per-bottle container topology — one project per
bottle instance, services for the agent + every applicable sidecar,
two networks, no named volumes.
Pure function. No I/O, no subprocess. Expects every launch-time
field (network names, CA host paths, etc.) on the plan's inner
plans to be populated; chunks 2+3 own that ordering. Chunk 1 just
encodes the translation so it can be unit-tested in isolation.
Conditional services follow the plan content (matches the
SDK-call branching in `launch.py` today):
- pipelock + agent: always.
- git-gate: iff plan.git_gate_plan.upstreams.
- egress: iff plan.egress_plan.routes.
- supervise: iff plan.supervise_plan is not None.
Naming:
- Compose project: `bot-bottle-<slug>`.
- Service names (inside the file): `agent`, `pipelock`,
`egress`, `git-gate`, `supervise`.
- `container_name:` matches today's pattern
(`bot-bottle-<service>-<slug>`) so dashboard/cleanup discovery
via the prefix scan keeps working through the transition.
- Network aliases preserve the current dial-by-shortname pattern
for `egress` / `supervise`, and add the long container-name as
an internal-network alias for `pipelock` / `git-gate` so any
caller still referencing the long name resolves.
Sidecars that are built (egress, git-gate, supervise) get a
compose `build:` block pointing at the repo Dockerfile; the
`image:` tag is set explicitly so cached images on the daemon
aren't rebuilt on every up.
"""
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
from typing import Any
from ...egress import (
EGRESS_HOSTNAME,
EGRESS_ROUTES_IN_CONTAINER,
)
from ...git_gate import GIT_GATE_HOSTNAME, git_gate_aggregate_extra_hosts
from ...log import die, warn
from ...pipelock import PIPELOCK_HOSTNAME
from ...supervise import (
CURRENT_CONFIG_DIR_IN_AGENT,
QUEUE_DIR_IN_CONTAINER,
SUPERVISE_HOSTNAME,
SUPERVISE_PORT,
)
from ...util import expand_tilde
from .bottle_plan import DockerBottlePlan
from .egress import (
EGRESS_CA_IN_CONTAINER,
EGRESS_PIPELOCK_CA_IN_CONTAINER,
)
from .git_gate import (
GIT_GATE_ACCESS_HOOK_IN_CONTAINER,
GIT_GATE_CREDS_DIR_IN_CONTAINER,
GIT_GATE_ENTRYPOINT_IN_CONTAINER,
GIT_GATE_HOOK_IN_CONTAINER,
)
from .pipelock import (
PIPELOCK_CA_CERT_IN_CONTAINER,
PIPELOCK_CA_KEY_IN_CONTAINER,
PIPELOCK_PORT,
)
from .provision.ca import AGENT_CA_BUNDLE, AGENT_CA_PATH
from .sidecar_bundle import (
SIDECAR_BUNDLE_DOCKERFILE,
SIDECAR_BUNDLE_IMAGE,
sidecar_bundle_container_name,
)
# Repo root, used as the build context for the bundle Dockerfile.
_REPO_DIR = str(Path(__file__).resolve().parent.parent.parent.parent)
def bottle_plan_to_compose(plan: DockerBottlePlan) -> dict[str, Any]:
"""Render a Compose v2 spec dict from a fully-resolved
DockerBottlePlan.
The plan must have its inner plans (`proxy_plan`,
`git_gate_plan`, `egress_plan`, `supervise_plan`) populated
with launch-time fields — network names, CA host paths,
pipelock_proxy_url. The renderer doesn't validate; callers
feed it a fully-resolved plan or get an incomplete compose
spec back.
"""
project = f"bot-bottle-{plan.slug}"
services: dict[str, Any] = {
"sidecars": _sidecar_bundle_service(plan),
"agent": _agent_service(plan),
}
return {
"name": project,
"services": services,
"networks": _networks(plan),
}
def _networks(plan: DockerBottlePlan) -> dict[str, Any]:
"""Compose-managed networks with explicit `name:` matching the
existing slug-suffixed convention. Compose creates them on `up`
and destroys them on `down`. The internal one is `--internal`
(no default gateway); the egress one is a normal user-defined
bridge."""
return {
"internal": {
"name": plan.proxy_plan.internal_network,
"internal": True,
},
"egress": {
"name": plan.proxy_plan.egress_network,
},
}
def _bind(host: str | Path, target: str, *, read_only: bool = True) -> dict[str, Any]:
"""One bind-mount entry in the long-form `volumes:` shape.
Long form is preferred over `host:target:ro` strings because
it's easier to inspect in tests and survives whitespace in
host paths."""
return {
"type": "bind",
"source": str(host),
"target": target,
"read_only": read_only,
}
def _sidecar_bundle_service(plan: DockerBottlePlan) -> dict[str, Any]:
"""The `sidecars` service: one container per bottle, bundle
image, all four daemons under a Python init supervisor.
Mechanics:
- Daemon subset narrows via `BOT_BOTTLE_SIDECAR_DAEMONS`
env. pipelock is always present; egress / git-gate /
supervise are conditional on the plan.
- Volumes are the union of the four daemons' bind-mounts,
preserving the same in-container paths so each daemon
finds its config / hooks / CA where it expects.
- Environment is the union of *daemon-private* env vars
(EGRESS_UPSTREAM_PROXY, SUPERVISE_BOTTLE_SLUG, etc).
HTTPS_PROXY is NOT propagated here — see the comment in
egress_entrypoint.sh; setting it at the container level
would route git-gate's git fetches through pipelock,
which is wrong.
- Network aliases register every legacy short/long
hostname (pipelock, egress, git-gate, supervise plus
their `bot-bottle-<service>-<slug>` long forms) so
the agent's HTTPS_PROXY URL and any other inter-service
reference resolves to the bundle.
"""
daemons: list[str] = ["egress", "pipelock"]
if plan.git_gate_plan.upstreams:
daemons.append("git-gate")
if plan.supervise_plan is not None:
daemons.append("supervise")
env: list[str] = [f"BOT_BOTTLE_SIDECAR_DAEMONS={','.join(daemons)}"]
volumes: list[dict[str, Any]] = []
# --- pipelock ----------------------------------------------------
pp = plan.proxy_plan
volumes += [
_bind(pp.yaml_path, "/etc/pipelock.yaml"),
_bind(pp.ca_cert_host_path, PIPELOCK_CA_CERT_IN_CONTAINER),
_bind(pp.ca_key_host_path, PIPELOCK_CA_KEY_IN_CONTAINER),
]
# --- egress (always part of the bundle; the EGRESS_UPSTREAM_*
# env vars + ca bind-mounts are needed iff routes exist; when
# the bottle has no routes the egress daemon falls back to its
# `regular@9099` mode and is unused) -----------------------------
ep = plan.egress_plan
if ep.routes:
env.append(f"EGRESS_UPSTREAM_PROXY={ep.pipelock_proxy_url}")
env.append(f"EGRESS_UPSTREAM_CA={EGRESS_PIPELOCK_CA_IN_CONTAINER}")
volumes += [
_bind(ep.routes_path, EGRESS_ROUTES_IN_CONTAINER),
_bind(ep.mitmproxy_ca_host_path, EGRESS_CA_IN_CONTAINER),
_bind(ep.pipelock_ca_host_path, EGRESS_PIPELOCK_CA_IN_CONTAINER),
]
for token_env in sorted(ep.token_env_map.keys()):
env.append(token_env)
# --- git-gate ----------------------------------------------------
extra_hosts: list[str] = []
gp = plan.git_gate_plan
if gp.upstreams:
volumes += [
_bind(gp.entrypoint_script, GIT_GATE_ENTRYPOINT_IN_CONTAINER),
_bind(gp.hook_script, GIT_GATE_HOOK_IN_CONTAINER),
_bind(gp.access_hook_script, GIT_GATE_ACCESS_HOOK_IN_CONTAINER),
]
for u in gp.upstreams:
keypath = expand_tilde(u.identity_file)
volumes.append(_bind(
keypath,
f"{GIT_GATE_CREDS_DIR_IN_CONTAINER}/{u.name}-key",
))
extra_map = git_gate_aggregate_extra_hosts(gp.upstreams)
extra_hosts = [f"{host}:{ip}" for host, ip in sorted(extra_map.items())]
# --- supervise ---------------------------------------------------
sp = plan.supervise_plan
if sp is not None:
env += [
f"SUPERVISE_BOTTLE_SLUG={plan.slug}",
f"SUPERVISE_QUEUE_DIR={QUEUE_DIR_IN_CONTAINER}",
f"SUPERVISE_PORT={SUPERVISE_PORT}",
]
volumes.append({
"type": "bind",
"source": str(sp.queue_dir),
"target": QUEUE_DIR_IN_CONTAINER,
"read_only": False,
})
# Internal-network aliases: the agent reaches each daemon through
# its short name (pipelock / egress / git-gate / supervise) which
# the bundle answers as if it were the daemon itself.
internal_aliases = [
PIPELOCK_HOSTNAME,
EGRESS_HOSTNAME,
]
if gp.upstreams:
internal_aliases.append(GIT_GATE_HOSTNAME)
if sp is not None:
internal_aliases.append(SUPERVISE_HOSTNAME)
service: dict[str, Any] = {
"image": SIDECAR_BUNDLE_IMAGE,
"build": {
"context": _REPO_DIR,
"dockerfile": SIDECAR_BUNDLE_DOCKERFILE,
},
"container_name": sidecar_bundle_container_name(plan.slug),
"networks": {
"internal": {"aliases": internal_aliases},
"egress": None,
},
"environment": env,
"volumes": volumes,
}
if extra_hosts:
service["extra_hosts"] = extra_hosts
return service
def _agent_service(plan: DockerBottlePlan) -> dict[str, Any]:
"""Agent container. Runs `sleep infinity`; claude is `docker
exec -it`'d into it later. No TTY at the container level —
interactivity is per-exec. HTTP_PROXY/HTTPS_PROXY point at the
egress short-alias when an egress is declared, otherwise
straight at pipelock's container name. CA trust trio matches
the existing launch.py wiring."""
proxy_url = _agent_proxy_url(plan)
no_proxy = _agent_no_proxy(plan)
env: list[str] = [
f"HTTPS_PROXY={proxy_url}",
f"HTTP_PROXY={proxy_url}",
f"https_proxy={proxy_url}",
f"http_proxy={proxy_url}",
f"NO_PROXY={no_proxy}",
f"no_proxy={no_proxy}",
f"NODE_EXTRA_CA_CERTS={AGENT_CA_PATH}",
f"SSL_CERT_FILE={AGENT_CA_BUNDLE}",
f"REQUESTS_CA_BUNDLE={AGENT_CA_BUNDLE}",
]
# Forwarded vars (OAuth token, manifest host-interpolations):
# bare name → inherits from compose-up process env, value
# never lands on argv or in the compose file.
for name in sorted(plan.forwarded_env.keys()):
env.append(name)
service: dict[str, Any] = {
"image": plan.runtime_image,
"container_name": plan.container_name,
"command": ["sleep", "infinity"],
"networks": {"internal": None},
"environment": env,
}
if plan.use_runsc:
service["runtime"] = "runsc"
if plan.env_file and plan.env_file.exists() and plan.env_file.stat().st_size > 0:
service["env_file"] = [str(plan.env_file)]
volumes: list[dict[str, Any]] = []
if plan.supervise_plan is not None:
volumes.append(_bind(
plan.supervise_plan.current_config_dir,
CURRENT_CONFIG_DIR_IN_AGENT,
))
if volumes:
service["volumes"] = volumes
# The init supervisor inside the bundle owns intra-bundle
# daemon ordering, so the agent only waits for the bundle
# container itself.
service["depends_on"] = ["sidecars"]
return service
def _agent_proxy_url(plan: DockerBottlePlan) -> str:
"""Pick the agent's HTTP_PROXY. With egress declared, the agent
goes through egress (which in turn HTTPS_PROXYs to pipelock on
its outbound leg). Without egress, the agent talks straight to
pipelock."""
if plan.egress_plan.routes:
from .egress import EGRESS_PORT
return f"http://{EGRESS_HOSTNAME}:{EGRESS_PORT}"
return f"http://{PIPELOCK_HOSTNAME}:{PIPELOCK_PORT}"
def _agent_no_proxy(plan: DockerBottlePlan) -> str:
"""NO_PROXY for the agent. Matches the launch.py rules:
loopback always, supervise hostname when the supervise sidecar
is up (the MCP long-poll pattern needs to bypass pipelock's
idle timeout)."""
hosts = ["localhost", "127.0.0.1"]
if plan.supervise_plan is not None:
hosts.append(SUPERVISE_HOSTNAME)
return ",".join(hosts)
# --- Lifecycle helpers (PRD 0018 chunk 3) ----------------------------------
#
# The renderer above is pure. The helpers below own the I/O side:
# serialize the spec to disk, drive `docker compose up`, dump the
# merged log file on teardown, and `docker compose down` to clean up
# (networks are pre-created externally so `down` leaves them alone;
# the launch step removes them in its own teardown step).
COMPOSE_FILE_NAME = "docker-compose.yml"
COMPOSE_LOG_NAME = "compose.log"
COMPOSE_PROJECT_PREFIX = "bot-bottle-"
def compose_project_name(slug: str) -> str:
"""Stable mapping from slug → compose project. Matches the
`name:` field the renderer emits, so `docker compose ls`
enumeration and direct CLI invocations agree on the project
identifier."""
return f"{COMPOSE_PROJECT_PREFIX}{slug}"
def slug_from_compose_project(project: str) -> str:
"""Inverse of `compose_project_name`: strip the prefix to get
the underlying slug. Returns empty string if the project name
doesn't start with the expected prefix."""
if not project.startswith(COMPOSE_PROJECT_PREFIX):
return ""
return project[len(COMPOSE_PROJECT_PREFIX):]
def list_compose_projects(*, include_stopped: bool = True) -> list[str]:
"""All compose project names starting with `bot-bottle-`.
`include_stopped=True` (default) runs `docker compose ls --all`
so exited projects appear too; pass False to get only projects
with at least one running container.
Returns [] on docker daemon errors or malformed output rather
than raising — callers should treat the empty list as "no
projects discoverable", not "no projects exist"."""
argv = ["docker", "compose", "ls", "--format", "json"]
if include_stopped:
argv.insert(3, "--all")
try:
result = subprocess.run(
argv, capture_output=True, text=True, check=False,
)
except FileNotFoundError:
# docker binary not on PATH — same shape as a daemon-down
# error from the caller's POV: no projects discoverable.
return []
if result.returncode != 0:
warn(f"docker compose ls failed: {result.stderr.strip()}")
return []
try:
projects = json.loads(result.stdout or "[]")
except json.JSONDecodeError as e:
warn(f"docker compose ls returned malformed JSON: {e}")
return []
names: list[str] = []
for p in projects:
if not isinstance(p, dict):
continue
name = str(p.get("Name", ""))
if name.startswith(COMPOSE_PROJECT_PREFIX):
names.append(name)
return sorted(set(names))
def list_active_slugs(*, include_stopped: bool = False) -> list[str]:
"""Slugs (project name minus prefix) of currently-running
bottles. Used by the dashboard's operator-edit verbs to choose
a bottle to apply a config edit to."""
return sorted(
slug for slug in (
slug_from_compose_project(p)
for p in list_compose_projects(include_stopped=include_stopped)
) if slug
)
def compose_file_path(state_dir: Path) -> Path:
return state_dir / COMPOSE_FILE_NAME
def compose_log_path(state_dir: Path) -> Path:
return state_dir / COMPOSE_LOG_NAME
def write_compose_file(spec: dict[str, Any], path: Path) -> Path:
"""Serialize the compose dict to disk. JSON content with a
`.yml` filename — JSON is a strict subset of YAML 1.2 for the
constructs the renderer uses (mappings, lists, strings, bools,
nulls), and `docker compose -f file.yml` parses it as YAML.
Avoids a yaml dependency while keeping the file `cat`-readable.
"""
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(spec, indent=2, sort_keys=False) + "\n")
path.chmod(0o644)
return path
def _compose_argv(project: str, compose_file: Path, *cmd: str) -> list[str]:
return [
"docker", "compose",
"-p", project,
"-f", str(compose_file),
*cmd,
]
def compose_up(
project: str,
compose_file: Path,
*,
env: dict[str, str] | None = None,
) -> None:
"""`docker compose up -d` for the project. Env-inheritance is
via `env=` on the subprocess — every `environment: [NAME]` (bare
name) entry in the compose file resolves to whatever value
`NAME` has in `env` at exec time. Secrets never land on argv or
in the compose file."""
argv = _compose_argv(project, compose_file, "up", "-d")
result = subprocess.run(
argv, capture_output=True, text=True, env=env, check=False,
)
if result.returncode != 0:
sys.stderr.write(result.stderr)
die(f"docker compose up failed for project {project}")
def compose_dump_logs(project: str, compose_file: Path, output: Path) -> None:
"""Write the merged stdout/stderr of every service to `output`
using `docker compose logs --no-color --timestamps`. Best-effort
— failures here shouldn't block teardown. The interleaved single
file is what the user reads post-mortem; per-service tail still
works through `docker compose logs -f <service>` while the
project is up."""
output.parent.mkdir(parents=True, exist_ok=True)
argv = _compose_argv(project, compose_file, "logs", "--no-color", "--timestamps")
try:
with open(output, "wb") as f:
subprocess.run(
argv,
stdout=f,
stderr=subprocess.STDOUT,
check=False,
)
output.chmod(0o644)
except OSError as e:
warn(f"failed to write compose log to {output}: {e}")
def compose_down(project: str, compose_file: Path) -> None:
"""`docker compose down` for the project. External networks are
intentionally NOT removed by compose (`external: true` on the
networks block); the launch step's own teardown removes them
via `network_remove` so the per-bottle ephemeral subnet doesn't
accumulate."""
argv = _compose_argv(project, compose_file, "down")
result = subprocess.run(
argv, capture_output=True, text=True, check=False,
)
if result.returncode != 0:
warn(
f"docker compose down failed for project {project}: "
f"{result.stderr.strip()}"
)
__all__ = [
"COMPOSE_FILE_NAME",
"COMPOSE_LOG_NAME",
"COMPOSE_PROJECT_PREFIX",
"bottle_plan_to_compose",
"compose_down",
"compose_dump_logs",
"compose_file_path",
"compose_log_path",
"compose_project_name",
"compose_up",
"list_active_slugs",
"list_compose_projects",
"slug_from_compose_project",
"write_compose_file",
]
+123
View File
@@ -0,0 +1,123 @@
"""Docker-side egress helpers: port pin, in-container CA paths,
container naming, and the host-side mitmproxy CA mint. The
prepare-time routes-yaml rendering itself lives on the
platform-neutral `Egress` ABC — backends instantiate it directly.
The per-container `.start()` / `.stop()` lifecycle was removed in
PRD 0024 chunk 3; the sidecar bundle (PRD 0024) runs egress
under its python init supervisor."""
from __future__ import annotations
import os
import subprocess
from pathlib import Path
from ...log import die
# Listening port the egress daemon binds inside the bundle. The
# agent's HTTP_PROXY env var resolves to `http://egress:<port>`,
# and the bundle's network aliases route `egress` to itself.
EGRESS_PORT = int(os.environ.get("BOT_BOTTLE_EGRESS_PORT", "9099"))
# In-container path for mitmproxy's CA. The format is a single PEM
# file holding BOTH the cert and the private key, concatenated. The
# upstream-trust CA (pipelock's, so egress trusts the upstream
# leg) is a separate file because pipelock keeps a different CA on
# its end.
EGRESS_CA_IN_CONTAINER = "/home/mitmproxy/.mitmproxy/mitmproxy-ca.pem"
EGRESS_PIPELOCK_CA_IN_CONTAINER = (
"/home/mitmproxy/.mitmproxy/pipelock-ca.pem"
)
def egress_tls_init(stage_dir: Path) -> tuple[Path, Path]:
"""Mint the per-bottle egress MITM CA via host `openssl req`.
Returns `(mitmproxy_pem, cert_only_pem)`:
- `mitmproxy_pem` is the single-PEM concat (cert + key)
mitmproxy reads from `~/.mitmproxy/mitmproxy-ca.pem`.
- `cert_only_pem` is the cert alone — installed into the agent's
trust store by `provision_ca` so the agent trusts the bumped
CONNECT cert egress presents.
Why openssl req (not the pipelock binary's `tls init`):
pipelock's CA generator stamps a non-standard `Subject Key
Identifier` on the CA (random rather than SHA-1 of the pubkey).
mitmproxy computes the `Authority Key Identifier` on each leaf
it mints as SHA-1(issuer's pubkey). openssl's chain validator
uses the leaf's AKI to find the issuer cert by SKI; pipelock's
SKI doesn't match → openssl reports "unable to get local issuer
certificate" even though the CA is right there in the trust
store. openssl req's `subjectKeyIdentifier=hash` extension uses
SHA-1(pubkey), matching mitmproxy's computation.
Both files live under `<stage_dir>/egress-ca/` (mode 644 —
`docker cp` preserves the mode into the container, where the
mitmproxy user (uid 1000) reads them; the host stage_dir is
mode 700 so the private key isn't world-exposed)."""
work = stage_dir / "egress-ca"
work.mkdir(exist_ok=True)
key_path = work / "ca-key.pem"
cert_path = work / "ca.pem"
cnf_path = work / "ca.cnf"
# RSA-2048 — broad mitmproxy compatibility (its default leaf-cert
# config matches RSA CAs without surprise), and openssl req's
# default behavior here is exactly what we want.
keygen = subprocess.run(
["openssl", "genrsa", "-out", str(key_path), "2048"],
capture_output=True, text=True, check=False,
)
if keygen.returncode != 0:
die(f"egress ca keygen failed: {keygen.stderr.strip()}")
# Standalone private key — never docker-cp'd, never bind-mounted
# (mitmproxy reads the cert+key concat below). Lock to owner-
# only so it doesn't sit at the default umask on disk.
key_path.chmod(0o600)
# `subjectKeyIdentifier=hash` makes openssl compute the SKI as
# SHA-1(pubkey), matching how mitmproxy computes the AKI on the
# leaves it later mints. Without this, chain validation breaks
# despite the CA being present in the trust store.
cnf_path.write_text(
"[req]\n"
"distinguished_name = req_dn\n"
"prompt = no\n"
"x509_extensions = v3_ca\n"
"\n"
"[req_dn]\n"
"O = bot-bottle\n"
"CN = bot-bottle egress CA\n"
"\n"
"[v3_ca]\n"
"basicConstraints = critical, CA:TRUE\n"
"keyUsage = critical, keyCertSign, cRLSign\n"
"subjectKeyIdentifier = hash\n"
)
cnf_path.chmod(0o644)
req = subprocess.run(
["openssl", "req", "-x509", "-new", "-nodes",
"-key", str(key_path),
"-sha256", "-days", "365",
"-config", str(cnf_path),
"-out", str(cert_path)],
capture_output=True, text=True, check=False,
)
if req.returncode != 0:
die(f"egress ca cert generation failed: {req.stderr.strip()}")
cert_path.chmod(0o644)
# mitmproxy reads cert + key from a single concatenated PEM file.
# This file IS bind-mounted into the egress container (chunk 3+),
# where mitmproxy runs as uid 1000 — so the host file has to be
# world-readable for the container's user to read it through the
# mount. Owner-only mode on the parent dir (state/<slug>/, under
# ~/.bot-bottle which inherits ~'s 0o700) is what actually
# restricts who can reach this file on the host.
mitm = work / "mitmproxy-ca.pem"
mitm.write_bytes(cert_path.read_bytes() + key_path.read_bytes())
mitm.chmod(0o644)
return (mitm, cert_path)
+343
View File
@@ -0,0 +1,343 @@
"""Host-side helper to apply a routes.yaml change to a running
egress sidecar (PRD 0014 retargeted by PRD 0017 chunk 3).
Used by the supervise dashboard when the operator approves an
egress-block proposal (or runs the operator-initiated
`routes edit <bottle>` verb). Fetches the current routes.yaml via
`docker exec cat`, validates the new content, writes it into the
sidecar via `docker cp`, then `docker kill --signal HUP` to make
the addon reload without dropping connections.
Also mirrors the new route hosts into pipelock's hostname allowlist
so the downstream leg lets them through — egress enforces
the path-aware allowlist on the agent leg, pipelock enforces the
hostname allowlist + DLP body scan on the upstream leg, and a
host added to one must be in the other or the request 403s
somewhere along the chain.
Raises EgressApplyError on any failure — the dashboard
surfaces the message and keeps the proposal pending so the
operator can retry.
"""
from __future__ import annotations
import json
import re
import subprocess
from pathlib import Path
from ...egress import EGRESS_ROUTES_IN_CONTAINER
from ...egress_addon_core import load_routes
from ...yaml_subset import YamlSubsetError, parse_yaml_subset
from .bottle_state import egress_state_dir
from .sidecar_bundle import sidecar_bundle_container_name
from .pipelock_apply import (
PipelockApplyError,
apply_allowlist_change,
fetch_current_allowlist,
parse_allowlist_content,
render_allowlist_content,
)
def _render_routes_payload(routes_list: list[dict[str, object]]) -> str:
"""Render a list-of-dicts routes payload as YAML matching the
shape `egress_render_routes` produces. The apply path
round-trips current routes.yaml through this so the file the
sidecar sees stays in the YAML format the addon expects."""
if not routes_list:
return "routes: []\n"
lines: list[str] = ["routes:"]
for entry in routes_list:
host = str(entry.get("host", ""))
lines.append(f' - host: "{host}"')
auth_scheme = entry.get("auth_scheme")
token_env = entry.get("token_env")
if auth_scheme and token_env:
lines.append(f' auth_scheme: "{auth_scheme}"')
lines.append(f' token_env: "{token_env}"')
paths = entry.get("path_allowlist") or []
if paths:
lines.append(" path_allowlist:")
for p in paths:
lines.append(f' - "{p}"')
return "\n".join(lines) + "\n"
def _egress_routes_host_path(slug: str) -> Path:
"""The bind-mount source for the egress sidecar's routes.yaml.
Must match what egress.prepare wrote at chunk-2 paths."""
return egress_state_dir(slug) / "egress_routes.yaml"
class EgressApplyError(RuntimeError):
"""Raised when fetch / apply fails. Caller renders to the
operator; does not crash the dashboard."""
def fetch_current_routes(slug: str) -> str:
"""Read the live routes.yaml from the running egress sidecar
for `slug`. Returns the file content as a string. Raises
EgressApplyError if the sidecar isn't reachable or the read
fails."""
container = sidecar_bundle_container_name(slug)
r = subprocess.run(
["docker", "exec", container, "cat", EGRESS_ROUTES_IN_CONTAINER],
capture_output=True, text=True, check=False,
)
if r.returncode != 0:
raise EgressApplyError(
f"could not read routes.yaml from {container}: "
f"{(r.stderr or '').strip() or 'container not running?'}"
)
return r.stdout
def validate_routes_content(content: str) -> None:
"""Syntactic check before SIGHUP — the addon's reload also
validates, but failing here keeps the old routes live and gives
the operator a clearer error than the addon's stderr line."""
try:
load_routes(content)
except ValueError as e:
raise EgressApplyError(
f"proposed routes.yaml is not valid: {e}"
) from e
def _hosts_in_routes(content: str) -> list[str]:
"""Extract the host list from a routes.yaml content string.
Uses the addon's own parser so any host the addon will match on
also lands in pipelock's allowlist. Returns sorted+deduped."""
try:
routes = load_routes(content)
except ValueError as e:
raise EgressApplyError(
f"proposed routes.yaml is not valid: {e}"
) from e
return sorted({r.host for r in routes if r.host})
# Pipelock's allowlist parser accepts only literal hostnames:
# `[A-Za-z0-9_.-]+`. Anything else (wildcards, IPv6 literals,
# stray characters) is silently dropped from the mirror so the
# pipelock apply doesn't fail parse before the new yaml is even
# written. The dropped hosts stay on egress's route table —
# but the addon does exact-host match only, so they'll never
# match anything either. (Wildcard host matching was removed —
# see `match_route` in egress_addon_core for the rationale.)
_PIPELOCK_HOST_RE = re.compile(r"^[A-Za-z0-9_.-]+$")
def _pipelock_safe_hosts(hosts: list[str]) -> list[str]:
"""Drop any host pipelock's allowlist parser would reject.
Order preserved."""
return [h for h in hosts if _PIPELOCK_HOST_RE.match(h)]
def _mirror_hosts_to_pipelock(slug: str, hosts: list[str]) -> None:
"""Ensure every pipelock-compatible `hosts` entry is on
pipelock's allowlist. Fetches pipelock's current allowlist,
merges, re-applies. Hosts pipelock can't represent (wildcards,
etc.) are silently skipped — they stay live on egress
but aren't enforced at pipelock. No-op if every host is already
present (apply still restarts pipelock if any host is new).
Raises EgressApplyError on pipelock failures so the
caller's diff/audit reflects the half-state."""
safe_hosts = _pipelock_safe_hosts(hosts)
try:
current = fetch_current_allowlist(slug)
existing = parse_allowlist_content(current)
merged = sorted(set(existing) | set(safe_hosts))
if merged == sorted(existing):
return # nothing to add
apply_allowlist_change(slug, render_allowlist_content(merged))
except PipelockApplyError as e:
# Mirror runs BEFORE the egress write, so egress
# is unchanged on this failure path. Report it as a
# pipelock-side problem so the operator looks in the right
# place; their `pipelock edit` flow can repair manually.
raise EgressApplyError(
f"pipelock allowlist mirror failed (egress NOT "
f"updated): {e}. Fix pipelock's allowlist manually with "
f"`pipelock edit <bottle>` then retry the proposal."
) from e
def apply_routes_change(slug: str, new_content: str) -> tuple[str, str]:
"""Apply `new_content` to the egress sidecar for `slug`:
1. Fetch current routes.yaml (for the before-diff).
2. Validate the new content via the addon's own parser.
3. Mirror the route hosts onto pipelock's allowlist (so the
downstream hostname gate lets them through).
4. Write to a temp file, `docker cp` into the egress
sidecar.
5. `docker kill --signal HUP` so the addon reloads.
Order matters: pipelock first, then egress. If the
pipelock step fails, egress hasn't been touched and the
old routes stay live. If the egress step fails after
pipelock succeeded, pipelock has the host in its allowlist but
egress doesn't enforce it yet — harmless extra-permissive
state at pipelock, and a re-approval will land the egress
side.
Returns (before, after) where `after` == `new_content`. Raises
EgressApplyError on any step."""
container = sidecar_bundle_container_name(slug)
before = fetch_current_routes(slug)
validate_routes_content(new_content)
# Pipelock mirror first — if it fails, egress stays intact
# and the operator gets a clear error about the half-state.
_mirror_hosts_to_pipelock(slug, _hosts_in_routes(new_content))
# routes.yaml is bind-mounted into the egress container as a
# SINGLE FILE. Docker single-file bind mounts pin the source
# inode at mount time; write-temp-then-rename swaps the inode
# on the host, which leaves the container's mount pointing at
# the now-orphaned old inode (so the SIGHUP'd reload re-reads
# unchanged content). Write in-place instead. Lose file-level
# atomicity, but the apply path issues SIGHUP only AFTER the
# write returns, and the addon's `load_routes` raises
# `ValueError` on a partial read and keeps the previous
# in-memory routes — so a SIGHUP that hypothetically raced an
# in-flight write is non-disruptive.
target = _egress_routes_host_path(slug)
target.parent.mkdir(parents=True, exist_ok=True)
target.write_text(new_content)
# mitmproxy in the container reads through the bind mount as
# uid 1000; the host file has to be world-readable for that
# read to succeed (parent dir at 0o700 still restricts who
# can reach the file on the host). Routes content is not
# secret — tokens live in the container's environ — so 0o644
# is the right trade-off.
target.chmod(0o644)
sig = subprocess.run(
["docker", "kill", "--signal", "HUP", container],
capture_output=True, text=True, check=False,
)
if sig.returncode != 0:
raise EgressApplyError(
f"failed to SIGHUP {container}: "
f"{(sig.stderr or '').strip()}"
)
return before, new_content
def _merge_single_route(
current_yaml: str, new_route: dict[str, object],
) -> str:
"""Merge a single proposed route into the current routes.yaml
content, returning the merged YAML string.
Behavior:
- If `new_route['host']` is NOT in the current routes →
append the route.
- If the host IS already present → union the path_allowlist
entries (proposed existing). The existing `auth_scheme`
and `token_env` are preserved — agent-proposed auth changes
on an existing host are ignored, matching the tool's
documented semantics.
Round-trips the file through `yaml_subset` (the same parser
the addon uses), so the merged output is in the YAML format
the sidecar reads. Token VALUES never appear here; the routes
file carries only env-var slot NAMES."""
try:
cfg = parse_yaml_subset(current_yaml)
except YamlSubsetError as e:
raise EgressApplyError(
f"current routes.yaml is not valid YAML: {e}"
) from e
routes = cfg.get("routes")
if not isinstance(routes, list):
raise EgressApplyError(
"current routes.yaml: 'routes' is not a list"
)
new_host = str(new_route.get("host", "")).lower()
if not new_host:
raise EgressApplyError(
"proposed route is missing 'host'"
)
proposed_paths = list(new_route.get("path_allowlist") or [])
# Look for an existing entry with the same host (case-insensitive).
for entry in routes:
if not isinstance(entry, dict):
continue
if str(entry.get("host", "")).lower() == new_host:
# Merge path_allowlist: union proposed + existing, ordered
# by first-seen so existing paths stay in original order.
existing_paths: list[str] = list(entry.get("path_allowlist") or [])
seen = {p: None for p in existing_paths}
for p in proposed_paths:
seen.setdefault(p, None)
merged_paths = list(seen.keys())
if merged_paths:
entry["path_allowlist"] = merged_paths
# Preserve existing auth — tool description says agent-
# proposed auth on an existing host is ignored.
break
else:
# Host not present; build a new route entry from the
# proposed fields. Need to assign a token_env slot if
# `auth` was proposed (otherwise the addon's parser rejects
# a half-set auth pair). Slots: count existing slots, pick
# the next free index.
entry = {"host": new_route["host"]}
if proposed_paths:
entry["path_allowlist"] = proposed_paths
auth = new_route.get("auth")
if isinstance(auth, dict) and auth.get("scheme") and auth.get("token_ref"):
existing_slots = sorted({
str(r.get("token_env"))
for r in routes
if isinstance(r, dict) and r.get("token_env")
})
next_idx = len(existing_slots)
entry["auth_scheme"] = str(auth["scheme"])
entry["token_env"] = f"EGRESS_TOKEN_{next_idx}"
# NOTE: the addon reads token VALUES from its container's
# environ keyed by token_env. A newly-added auth route at
# runtime points at a slot that has no env value → the
# addon will 403 with "token env unset" until the operator
# arranges for the value to land in the container's env.
# Recording this here so the operator-facing diff carries
# the slot name they'll need to provision.
routes.append(entry)
return _render_routes_payload(routes)
def add_route(slug: str, proposed_route_json: str) -> tuple[str, str]:
"""Apply a single-route addition to the egress. Parses the
agent's proposed route, fetches the current routes file, merges,
and applies via `apply_routes_change`. Returns (before, after)
full-file content for the audit log."""
try:
proposed = json.loads(proposed_route_json)
except json.JSONDecodeError as e:
raise EgressApplyError(
f"proposed route is not valid JSON: {e}"
) from e
if not isinstance(proposed, dict):
raise EgressApplyError(
"proposed route must be a JSON object"
)
current = fetch_current_routes(slug)
merged = _merge_single_route(current, proposed)
return apply_routes_change(slug, merged)
__all__ = [
"EgressApplyError",
"add_route",
"apply_routes_change",
"fetch_current_routes",
"validate_routes_content",
]
+80
View File
@@ -0,0 +1,80 @@
"""Active-agent enumeration for the docker backend.
Mirrors `backend/smolmachines/enumerate.py`: returns
`ActiveAgent` records the CLI `list active` command and the
dashboard agents pane consume. Empty when docker isn't reachable
— gated by `has_backend('docker')` at the cross-backend caller
so this module trusts that docker is available when called.
The parser (`_parse_services_by_project`) is exposed for direct
unit testing; the docker `docker ps` invocation is in
`_query_services_by_project`."""
from __future__ import annotations
import subprocess
from .. import ActiveAgent
from .bottle_state import read_metadata
from .compose import compose_project_name, list_active_slugs
def enumerate_active() -> list[ActiveAgent]:
"""All currently-running docker-backed agents. Caller is
responsible for gating on `has_backend('docker')` if it
matters; if docker is missing the `docker ps` call below
returns an empty list silently."""
slugs = list_active_slugs(include_stopped=False)
if not slugs:
return []
services_by_project = _query_services_by_project()
out: list[ActiveAgent] = []
for slug in slugs:
project = compose_project_name(slug)
services = services_by_project.get(project, set())
metadata = read_metadata(slug)
out.append(ActiveAgent(
backend_name="docker",
slug=slug,
agent_name=metadata.agent_name if metadata else "?",
started_at=metadata.started_at if metadata else "",
services=tuple(sorted(services)),
))
return out
def _parse_services_by_project(stdout: str) -> dict[str, set[str]]:
"""Parse `docker ps` output formatted as
`<project-label>\\t<service-label>` (one line per container)
into a `{project: {service, ...}}` mapping. Pure function for
testing — the docker invocation is in `_query_services_by_project`."""
out: dict[str, set[str]] = {}
for line in stdout.splitlines():
project, _, service = line.partition("\t")
if not project or not service:
continue
out.setdefault(project, set()).add(service)
return out
def _query_services_by_project() -> dict[str, set[str]]:
"""One `docker ps` call → `{project: {service, ...}}`. Used
by the CLI's `list active` and the dashboard's agents pane —
one subprocess per refresh tick, not one per bottle."""
try:
r = subprocess.run(
[
"docker", "ps",
"--filter", "label=com.docker.compose.project",
"--format",
'{{.Label "com.docker.compose.project"}}'
"\t"
'{{.Label "com.docker.compose.service"}}',
],
capture_output=True, text=True, check=False,
)
except FileNotFoundError:
return {}
if r.returncode != 0:
return {}
return _parse_services_by_project(r.stdout or "")
+16
View File
@@ -0,0 +1,16 @@
"""Docker-side git-gate constants: in-container paths the renderer's
bind-mounts target + the listening port. The prepare-time entrypoint
/ hook render lives on the platform-neutral `GitGate` ABC — backends
instantiate it directly. The git-gate daemon's container lifecycle
is owned by the sidecar bundle (PRD 0024)."""
from __future__ import annotations
GIT_GATE_ENTRYPOINT_IN_CONTAINER = "/git-gate-entrypoint.sh"
GIT_GATE_HOOK_IN_CONTAINER = "/etc/git-gate/pre-receive"
GIT_GATE_ACCESS_HOOK_IN_CONTAINER = "/etc/git-gate/access-hook"
GIT_GATE_CREDS_DIR_IN_CONTAINER = "/git-gate/creds"
# git daemon's default listening port.
GIT_GATE_PORT = 9418
+218
View File
@@ -0,0 +1,218 @@
"""Launch step for the Docker bottle backend.
PRD 0018 chunk 3: each instance is one `docker compose` project.
The flow is:
1. Build the agent's base + derived image (compose builds the
sidecar images via the `build:` directive on first up).
2. Pre-create the per-bottle networks. We do this outside compose
so we can inspect the assigned internal CIDR and embed it in
pipelock's yaml (compose's `external: true` lets the compose
file reference these pre-existing networks).
3. Mint the per-bottle CAs (chunk 2 writes them under
state/<slug>/{pipelock,egress}/).
4. Re-render pipelock yaml with the now-known internal CIDR so
the SSRF allowlist exempts the bottle's own subnet.
5. Populate the inner plans with launch-time fields so the
renderer can read network names, CA paths, pipelock URL.
6. Render the compose spec, write it to
state/<slug>/docker-compose.yml, write metadata.json.
7. `docker compose up -d` (token + OAuth values flow into the
compose subprocess env so `environment: [NAME]` bare-name
entries inherit without rendering values into the file).
8. Provision (CA install, prompt copy, skills, git, supervise
config) — unchanged, uses `docker exec`.
9. Yield a DockerBottle handle. `exec_claude` runs claude via
`docker exec -it` exactly like the pre-compose world.
Teardown (ExitStack callbacks fire in reverse):
- Dump `docker compose logs --no-color --timestamps` to
state/<slug>/compose.log (best-effort).
- `docker compose down` removes the project's containers (not the
external networks).
- `network_remove` deletes the two networks we pre-created.
"""
from __future__ import annotations
import dataclasses
import os
from contextlib import ExitStack, contextmanager
from pathlib import Path
from typing import Callable, Generator
from ...egress import egress_resolve_token_values
from ...log import info
from . import network as network_mod
from . import util as docker_mod
from .bottle import DockerBottle
from .bottle_plan import DockerBottlePlan
from .bottle_state import (
bottle_state_dir,
egress_state_dir,
pipelock_state_dir,
)
from .compose import (
bottle_plan_to_compose,
compose_down,
compose_dump_logs,
compose_file_path,
compose_log_path,
compose_project_name,
compose_up,
write_compose_file,
)
from .egress import egress_tls_init
from .pipelock import (
BUNDLE_LOCAL_PIPELOCK_URL,
pipelock_tls_init,
)
# Where the repo root lives, for `docker build` context. Computed once.
_REPO_DIR = str(Path(__file__).resolve().parent.parent.parent.parent)
@contextmanager
def launch(
plan: DockerBottlePlan,
*,
provision: Callable[[DockerBottlePlan, str], str | None],
) -> Generator[DockerBottle, None, None]:
"""Build, launch, and provision a Docker bottle via compose.
Teardown on exit."""
stack = ExitStack()
def teardown() -> None:
try:
stack.close()
except BaseException:
# Teardown must not raise; swallow so the caller's
# __exit__ path can still propagate the original error.
pass
try:
# Step 1: agent image build. Sidecar images get built lazily by
# `docker compose up` via the renderer's `build:` directives.
docker_mod.build_image(
plan.image, _REPO_DIR,
dockerfile=plan.dockerfile_path,
)
if plan.derived_image:
docker_mod.build_image_with_cwd(
plan.derived_image, plan.image, plan.spec.user_cwd
)
# Networks: compose-managed. The names are derived
# deterministically from the slug so the renderer can put
# them on the services and `compose up` creates them with
# those names. The empirical spike confirmed pipelock's
# SSRF guard only checks proxied-request destinations, not
# source IPs — so the bottle's own internal CIDR doesn't
# need to be in `ssrf.ip_allowlist`. Pre-create + CIDR
# introspection are gone; compose owns the network
# lifecycle.
internal_network = network_mod.network_name_for_slug(plan.slug)
egress_network = network_mod.network_egress_name_for_slug(plan.slug)
# Mint per-bottle CAs into state/<slug>/{pipelock,egress}/.
ca_cert_host, ca_key_host = pipelock_tls_init(pipelock_state_dir(plan.slug))
egress_ca_host, egress_ca_cert_only = egress_tls_init(
egress_state_dir(plan.slug),
)
# Populate launch-time fields on every inner plan so the
# renderer reads concrete network names, CA paths, and
# pipelock URL.
proxy_plan = dataclasses.replace(
plan.proxy_plan,
internal_network=internal_network,
internal_network_cidr="",
egress_network=egress_network,
ca_cert_host_path=ca_cert_host,
ca_key_host_path=ca_key_host,
)
git_gate_plan = plan.git_gate_plan
if git_gate_plan.upstreams:
git_gate_plan = dataclasses.replace(
git_gate_plan,
internal_network=internal_network,
egress_network=egress_network,
)
egress_plan = plan.egress_plan
if egress_plan.routes:
egress_plan = dataclasses.replace(
egress_plan,
internal_network=internal_network,
egress_network=egress_network,
mitmproxy_ca_host_path=egress_ca_host,
mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
pipelock_ca_host_path=ca_cert_host,
pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
)
supervise_plan = plan.supervise_plan
if supervise_plan is not None:
supervise_plan = dataclasses.replace(
supervise_plan,
internal_network=internal_network,
)
plan = dataclasses.replace(
plan,
proxy_plan=proxy_plan,
git_gate_plan=git_gate_plan,
egress_plan=egress_plan,
supervise_plan=supervise_plan,
)
# Step 6: render + write the compose file. metadata.json
# was written at prepare time and already carries
# compose_project; nothing to update here.
state_dir = bottle_state_dir(plan.slug)
spec = bottle_plan_to_compose(plan)
compose_file = write_compose_file(spec, compose_file_path(state_dir))
project = compose_project_name(plan.slug)
# Step 7: compose up. Token values + the OAuth placeholder
# flow through subprocess env; the compose file holds only
# bare names for the secret-carrying entries.
token_values: dict[str, str] = {}
if plan.egress_plan.routes:
token_values = egress_resolve_token_values(
plan.egress_plan.token_env_map, dict(os.environ),
)
compose_env: dict[str, str] = {
**os.environ,
**plan.forwarded_env,
**token_values,
}
info(
f"docker compose up -d (project {project}, "
f"{len(spec['services'])} services)"
)
compose_up(project, compose_file, env=compose_env)
# Register teardown in reverse order: log dump first, then
# `compose down`. Networks come down last via callbacks
# registered in step 2.
stack.callback(compose_down, project, compose_file)
stack.callback(
compose_dump_logs, project, compose_file, compose_log_path(state_dir),
)
# Step 8: provision. Unchanged — uses `docker exec` against
# the agent container by its known name.
prompt_path = provision(plan, plan.container_name)
# Step 9: yield. exec_claude continues to use `docker exec -it`
# — the agent runs `sleep infinity` per the renderer's
# service spec.
yield DockerBottle(
plan.container_name,
teardown,
prompt_path,
agent_command=plan.agent_command,
agent_prompt_mode=plan.agent_prompt_mode,
)
finally:
teardown()
+133
View File
@@ -0,0 +1,133 @@
"""Docker network plumbing for the per-agent egress topology.
The agent container sits on a Docker `--internal` network (no default
gateway). Pipelock straddles that network and a per-agent user-defined
bridge for upstream egress. We deliberately do NOT use Docker's legacy
`bridge` network because only user-defined bridges run Docker's
embedded DNS resolver, which pipelock needs to resolve api.anthropic.com
and similar upstream hostnames.
Naming: bot-bottle-net-<slug> (internal),
bot-bottle-egress-<slug> (egress). Numeric suffix on conflict
(-2, -3, ..., capped at 100).
"""
from __future__ import annotations
import subprocess
from ...log import die, info, warn
def network_name_for_slug(slug: str) -> str:
return f"bot-bottle-net-{slug}"
def network_egress_name_for_slug(slug: str) -> str:
return f"bot-bottle-egress-{slug}"
def network_exists(name: str) -> bool:
"""Uses `docker network inspect`, not `docker network ls -f name=...`,
because the latter does substring matching."""
return (
subprocess.run(
["docker", "network", "inspect", name],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
).returncode
== 0
)
def _network_create_with_prefix(base: str, internal: bool) -> str:
"""Create a per-agent Docker network whose name is <base> (with
-2, -3, ... appended on conflict, capped at 100). Returns the
resolved name."""
name = base
suffix = 2
while network_exists(name):
name = f"{base}-{suffix}"
suffix += 1
if suffix > 100:
die(
f"could not find a free network name after {base}-99; "
f"clean up old networks with 'docker network rm <name>'"
)
kind = "internal" if internal else "bridge (egress)"
args = ["docker", "network", "create"]
if internal:
args.append("--internal")
args.append(name)
info(f"creating {kind} network {name}")
result = subprocess.run(args, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, check=False)
if result.returncode != 0:
flag = " --internal" if internal else ""
die(f"docker network create{flag} {name} failed")
return name
def network_create_internal(slug: str) -> str:
"""Create a Docker `--internal` network for the agent. Returns the
resolved name."""
return _network_create_with_prefix(network_name_for_slug(slug), internal=True)
def network_create_egress(slug: str) -> str:
"""Create a per-agent user-defined bridge (NOT the legacy `bridge`)
so the pipelock sidecar has working DNS for upstream hostnames."""
return _network_create_with_prefix(network_egress_name_for_slug(slug), internal=False)
def network_inspect_cidr(name: str) -> str:
"""Return the IPv4 CIDR Docker assigned to a user-defined network.
Used by pipelock's SSRF guard exception: the bottle's internal
network sits in RFC1918 space, so pipelock's `internal:` list
would block any agent request whose destination resolves there
including the cred-proxy sidecar's address. Adding the
network's CIDR to pipelock's `ssrf.ip_allowlist` lets traffic
targeted at the bottle's own sidecars through while pipelock
still body-scans and api_allowlist-gates as usual."""
result = subprocess.run(
["docker", "network", "inspect",
"--format", "{{range .IPAM.Config}}{{.Subnet}}{{end}}", name],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
die(f"docker network inspect {name} failed: {result.stderr.strip()}")
cidr = result.stdout.strip()
if not cidr:
die(f"network {name!r} has no IPAM subnet configured")
return cidr
def network_attach(network: str, container: str) -> None:
result = subprocess.run(
["docker", "network", "connect", network, container],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
)
if result.returncode != 0:
die(f"docker network connect {network} {container} failed")
def network_remove(name: str) -> bool:
"""Idempotent: a missing network is treated as success so this can
be called from a teardown trap. Returns True if removal succeeded
(or the network was already gone)."""
if not network_exists(name):
return True
result = subprocess.run(
["docker", "network", "rm", name],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
)
if result.returncode != 0:
warn(f"failed to remove network {name}; clean up with 'docker network rm {name}'")
return False
return True
+81
View File
@@ -0,0 +1,81 @@
"""Docker-side pipelock helpers: image pin, container naming, and
the one-shot `pipelock tls init` host-side CA mint. The
prepare-time YAML rendering itself lives on the platform-neutral
`PipelockProxy` ABC backends instantiate it directly.
The per-container `.start()` / `.stop()` lifecycle was deleted in
PRD 0024 chunk 3; compose-up owns the container lifecycle (PRD
0018) and the bundle path (PRD 0024) collapses pipelock + egress
+ git-gate + supervise into one container."""
from __future__ import annotations
import os
import subprocess
from pathlib import Path
from ...log import die
# Re-exported for the compose renderer + smolmachines launch step
# (they used to import these from this module before they moved to
# the platform-neutral pipelock module).
from ...pipelock import ( # noqa: F401
PIPELOCK_CA_CERT_IN_CONTAINER,
PIPELOCK_CA_KEY_IN_CONTAINER,
)
# Pipelock image, pinned by digest. The digest is the multi-arch image
# index for ghcr.io/luckypipewrench/pipelock:2.3.0.
PIPELOCK_IMAGE = os.environ.get(
"BOT_BOTTLE_PIPELOCK_IMAGE",
"ghcr.io/luckypipewrench/pipelock@sha256:3b1a39417b98406ddc5dc2d8fcb42865ddc0c68a43d355db55f0f8cb06bc6de9",
)
# Listening port for pipelock's forward proxy.
PIPELOCK_PORT = os.environ.get("BOT_BOTTLE_PIPELOCK_PORT", "8888")
# The URL egress dials for its upstream HTTPS_PROXY. egress and
# pipelock share the same container's network namespace inside the
# sidecar bundle, so loopback reaches pipelock directly — no docker
# DNS aliases involved.
BUNDLE_LOCAL_PIPELOCK_URL = f"http://127.0.0.1:{PIPELOCK_PORT}"
def pipelock_tls_init(stage_dir: Path) -> tuple[Path, Path]:
"""Generate a fresh per-bottle CA via a one-shot pipelock container.
Runs `pipelock tls init` against a host-mounted scratch dir, leaving
`ca.pem` (public cert, mode 600) and `ca-key.pem` (private key, mode
600) under `<stage_dir>/pipelock-ca/`. Returns the two host paths.
The image is pinned (same digest the running sidecar uses) so the
generated CA matches what the sidecar expects. Output is owned by
whatever UID the one-shot ran as; the compose renderer's
bind-mounts pin the files in place at runtime, so ownership
inside the running sidecar (root in pipelock's distroless image)
is independent."""
work = stage_dir / "pipelock-ca"
work.mkdir(exist_ok=True)
result = subprocess.run(
["docker", "run", "--rm",
"-v", f"{work}:/h",
"-e", "PIPELOCK_HOME=/h",
PIPELOCK_IMAGE, "tls", "init"],
capture_output=True,
text=True,
check=False,
)
if result.returncode != 0:
die(f"pipelock tls init failed: {result.stderr.strip()}")
cert = work / "ca.pem"
key = work / "ca-key.pem"
if not cert.is_file() or not key.is_file():
die(f"pipelock tls init did not produce ca files in {work}")
# Explicit perms in case a future pipelock release changes
# defaults. Pipelock runs as root in its distroless image and
# bind-mounts work with 0o600 (root reads everything); the key
# has no reason to be readable to anyone else on the host.
key.chmod(0o600)
cert.chmod(0o644)
return (cert, key)
+200
View File
@@ -0,0 +1,200 @@
"""pipelock_apply — host-side helper to apply an api_allowlist
change to a running pipelock sidecar (PRD 0015).
Used by the supervise dashboard when the operator approves a
pipelock-block proposal (or runs the operator-initiated `pipelock
edit <bottle>` verb). Fetches the current pipelock.yaml via `docker
exec`, parses it, swaps the api_allowlist with the proposed hosts,
re-renders, writes back via the bind-mount path, then signals the
bundle supervisor to restart the pipelock daemon (`docker kill
--signal USR1`) so
pipelock picks up the new config.
v1 uses restart, not SIGHUP pipelock has no in-process reload
hook and adding one is the "SIGHUP reload for pipelock" open
question in PRD 0015. Restart drops in-flight outbound calls; the
agent's HTTP client retries pick up against the restarted proxy.
"""
from __future__ import annotations
import os
import re
import subprocess
import tempfile
from pathlib import Path
from ...pipelock import pipelock_render_yaml
from ...yaml_subset import YamlSubsetError, parse_yaml_subset
from .bottle_state import pipelock_state_dir
from .sidecar_bundle import sidecar_bundle_container_name
def _pipelock_yaml_host_path(slug: str) -> Path:
"""The bind-mount source for the pipelock sidecar's
pipelock.yaml matches what pipelock.prepare wrote at chunk-2
paths."""
return pipelock_state_dir(slug) / "pipelock.yaml"
PIPELOCK_YAML_IN_CONTAINER = "/etc/pipelock.yaml"
# Allowlist proposals are one-hostname-per-line. Blank lines and
# `#`-prefixed comments are ignored. The character set matches the
# supervise sidecar's syntactic check on the agent's pipelock-block
# proposal (alphanumerics + dot/dash/underscore).
_HOST_OK = re.compile(r"^[A-Za-z0-9_.-]+$")
class PipelockApplyError(RuntimeError):
"""Raised when fetch / parse / apply fails. The dashboard renders
the message and keeps the proposal pending never crashes."""
def parse_allowlist_content(content: str) -> list[str]:
"""One hostname per line. Blanks and `#` comments are ignored.
Raises PipelockApplyError if a line has a disallowed character."""
hosts: list[str] = []
for i, raw_line in enumerate(content.splitlines(), start=1):
line = raw_line.strip()
if not line or line.startswith("#"):
continue
if not _HOST_OK.match(line):
raise PipelockApplyError(
f"allowlist line {i}: {line!r} has disallowed characters"
)
hosts.append(line)
return hosts
def render_allowlist_content(hosts: list[str]) -> str:
"""Hosts → one-per-line string (the operator-facing format)."""
if not hosts:
return ""
return "\n".join(hosts) + "\n"
def fetch_current_yaml(slug: str) -> str:
"""Read the live /etc/pipelock.yaml from the sidecar bundle.
Uses `docker cp` because pipelock inside the bundle is the
distroless pipelock binary with no shell, and `docker cp` is a
daemon-API tarball copy that works regardless of what's
available inside the container.
Raises PipelockApplyError if the read fails."""
container = sidecar_bundle_container_name(slug)
fd, tmp_path = tempfile.mkstemp(prefix="cb-pipelock-fetch.", suffix=".yaml")
os.close(fd)
try:
r = subprocess.run(
[
"docker", "cp",
f"{container}:{PIPELOCK_YAML_IN_CONTAINER}", tmp_path,
],
capture_output=True, text=True, check=False,
)
if r.returncode != 0:
raise PipelockApplyError(
f"could not fetch pipelock.yaml from {container}: "
f"{(r.stderr or '').strip() or 'container not running?'}"
)
return Path(tmp_path).read_text()
finally:
try:
Path(tmp_path).unlink()
except OSError:
pass
def fetch_current_allowlist(slug: str) -> str:
"""Fetch the live yaml, extract api_allowlist, render as one-per-
line the operator-facing format for the TUI / agent's
current-config mount."""
yaml = fetch_current_yaml(slug)
try:
cfg = parse_yaml_subset(yaml)
except YamlSubsetError as e:
raise PipelockApplyError(f"running pipelock yaml: {e}") from e
hosts = cfg.get("api_allowlist", [])
if not isinstance(hosts, list):
raise PipelockApplyError(
"running pipelock yaml: api_allowlist is not a list"
)
return render_allowlist_content([str(h) for h in hosts])
def apply_allowlist_change(
slug: str, new_allowlist_content: str,
) -> tuple[str, str]:
"""Apply `new_allowlist_content` to the sidecar bundle:
1. Parse the proposed hosts (one per line).
2. Fetch + parse current pipelock.yaml.
3. Replace api_allowlist with the proposed hosts; re-render.
4. Write the new yaml to the bind-mount source.
5. `docker kill --signal USR1 <bundle>` so the supervisor
restarts the pipelock daemon in place (leaving egress,
git-gate, and supervise running). Pipelock has no
in-process reload; the supervisor's per-daemon restart
keeps the agent's MCP socket alive — a whole-bundle
`docker restart` would bounce supervise too.
Returns (before, after) where both are one-per-line allowlist
strings (operator-facing format). Raises PipelockApplyError on
any failure; the sidecar's existing config stays in place until
the host write succeeds, and the SIGUSR1 is what makes it
live."""
new_hosts = parse_allowlist_content(new_allowlist_content)
container = sidecar_bundle_container_name(slug)
current_yaml = fetch_current_yaml(slug)
try:
cfg = parse_yaml_subset(current_yaml)
except YamlSubsetError as e:
raise PipelockApplyError(f"running pipelock yaml: {e}") from e
current_hosts = cfg.get("api_allowlist", [])
if not isinstance(current_hosts, list):
raise PipelockApplyError(
"running pipelock yaml: api_allowlist is not a list"
)
before = render_allowlist_content([str(h) for h in current_hosts])
after = render_allowlist_content(new_hosts)
cfg["api_allowlist"] = new_hosts
rendered = pipelock_render_yaml(cfg)
# pipelock.yaml is bind-mounted into the container as a SINGLE
# FILE — same Docker single-file inode issue as egress_apply:
# write-temp-then-rename swaps the host inode and leaves the
# container's mount pointing at the orphaned old one. Write
# in-place. The SIGUSR1 below makes the new content live
# (pipelock has no in-process reload, so the supervisor
# restarts the pipelock daemon in response).
target = _pipelock_yaml_host_path(slug)
target.parent.mkdir(parents=True, exist_ok=True)
target.write_text(rendered)
# pipelock runs as root in its distroless image — any mode is
# fine — but 0o600 matches what prepare wrote.
target.chmod(0o600)
restart = subprocess.run(
["docker", "kill", "--signal", "USR1", container],
capture_output=True, text=True, check=False,
)
if restart.returncode != 0:
raise PipelockApplyError(
f"failed to signal {container} for pipelock restart: "
f"{(restart.stderr or '').strip()}"
)
return before, after
__all__ = [
"PIPELOCK_YAML_IN_CONTAINER",
"PipelockApplyError",
"apply_allowlist_change",
"fetch_current_allowlist",
"fetch_current_yaml",
"parse_allowlist_content",
"render_allowlist_content",
]
+272
View File
@@ -0,0 +1,272 @@
"""Prepare step for the Docker bottle backend.
`resolve_plan` does all host-side resolution (image and container
names, env-file, prompt-file, proxy plan, runtime detection) and
returns a frozen DockerBottlePlan. No Docker resources are created;
the only side effects are scratch files under `stage_dir` and a probe
of `docker info`. Cross-backend host-side validation has already run
via the base class's `prepare` template before this is called.
"""
from __future__ import annotations
import os
from datetime import datetime, timezone
from pathlib import Path
from ...agent_provider import runtime_for
from ...egress import Egress
from ...env import ResolvedEnv, resolve_env
from ...git_gate import GitGate
from ...log import die
from ...pipelock import PipelockProxy
from ...supervise import Supervise
from .. import BottleSpec
from . import util as docker_mod
from .bottle_plan import DockerBottlePlan
from .bottle_state import (
BottleMetadata,
agent_state_dir,
bottle_identity,
clear_preserve_marker,
egress_state_dir,
git_gate_state_dir,
per_bottle_dockerfile,
per_bottle_dockerfile_path,
per_bottle_image_tag,
pipelock_state_dir,
supervise_state_dir,
write_metadata,
)
from .sidecar_bundle import sidecar_bundle_container_name
def resolve_plan(
spec: BottleSpec,
*,
stage_dir: Path,
) -> DockerBottlePlan:
"""Resolve Docker-specific names and write scratch files. Trusts
that the agent and its skills/git-gate keys are present
validation already ran in the base class."""
docker_mod.require_docker()
proxy = PipelockProxy()
git_gate = GitGate()
egress = Egress()
supervise = Supervise()
manifest = spec.manifest
agent = manifest.agents[spec.agent_name]
bottle = manifest.bottle_for(spec.agent_name)
provider = bottle.agent_provider
provider_runtime = runtime_for(provider.template)
# PRD 0016 follow-up: identity, not bare slug. A fresh `start`
# mints a random-suffixed identity (so parallel runs of the same
# agent in the same cwd don't collide on container/network
# names); a `resume` passes the recorded identity in via
# spec.identity to continue an existing bottle's state.
slug = spec.identity or bottle_identity(spec.agent_name)
# Record the launch metadata so `cli.py resume <identity>` can
# reconstruct the spec. Idempotent — re-writes on resume with a
# refreshed started_at.
write_metadata(BottleMetadata(
identity=slug,
agent_name=spec.agent_name,
cwd=spec.user_cwd if spec.copy_cwd else "",
copy_cwd=spec.copy_cwd,
started_at=datetime.now(timezone.utc).isoformat(),
compose_project=f"bot-bottle-{slug}",
))
# Clear any leftover preserve marker from a prior capability-block
# so this fresh launch can be cleaned up at session-end unless
# the agent triggers another capability-block.
clear_preserve_marker(slug)
# PRD 0016 capability-block: if a per-bottle Dockerfile has been
# written (via apply_capability_change), the base image becomes
# per_bottle_image_tag(slug) built from that file. --cwd still
# layers a derived image on top.
dockerfile_path = ""
if per_bottle_dockerfile(slug) is not None:
image_default = per_bottle_image_tag(slug)
dockerfile_path = str(per_bottle_dockerfile_path(slug))
elif provider.dockerfile:
image_default = f"bot-bottle-{provider.template}:{slug}"
dockerfile_path = _resolve_manifest_dockerfile(provider.dockerfile, spec)
elif provider_runtime.dockerfile:
image_default = provider_runtime.image
dockerfile_path = provider_runtime.dockerfile
else:
image_default = provider_runtime.image
image = os.environ.get("BOT_BOTTLE_IMAGE", image_default)
derived_image = ""
runtime_image = image
if spec.copy_cwd:
derived_image = os.environ.get(
"BOT_BOTTLE_DERIVED_IMAGE", f"bot-bottle-cwd:{slug}"
)
runtime_image = derived_image
default_container = f"bot-bottle-{slug}"
pinned_container = os.environ.get("BOT_BOTTLE_CONTAINER", "")
container_name_pinned = bool(pinned_container)
if container_name_pinned:
container_name = pinned_container
if docker_mod.container_exists(container_name):
die(
f"container '{container_name}' already exists "
f"(pinned via BOT_BOTTLE_CONTAINER). "
f"Remove it with 'docker rm -f {container_name}' or unset the override."
)
else:
container_name = ""
for candidate in docker_mod.container_name_candidates(default_container):
if not docker_mod.container_exists(candidate):
container_name = candidate
break
if not container_name:
die(
f"could not find a free container name after "
f"{default_container}-{docker_mod.MAX_CONTAINER_SUFFIX}; "
f"clean up old containers with 'docker rm -f <name>'"
)
# Probe the sidecar-bundle container name for an orphan from a
# previous run. Otherwise a stale bundle surfaces as a
# docker-create conflict deep inside launch() with no actionable
# hint; failing fast here points at the cleanup command.
bundle_name = sidecar_bundle_container_name(slug)
if docker_mod.container_exists(bundle_name):
die(
f"sidecar bundle container '{bundle_name}' already exists. "
f"This is an orphan from a previous run; clean it up with "
f"'./cli.py cleanup' (or 'docker rm -f {bundle_name}') and "
f"retry."
)
# PRD 0018 chunk 2: prepare-time scratch files live under
# ~/.bot-bottle/state/<slug>/<service>/ so chunk 3's compose
# bind-mounts can point at stable paths. The state subdirs are
# cleaned up by start.py's session-end teardown unless something
# explicitly preserves the state dir (capability-block, crash).
agent_dir = agent_state_dir(slug)
agent_dir.mkdir(parents=True, exist_ok=True)
env_file = agent_dir / "agent.env"
prompt_file = agent_dir / "prompt.txt"
prompt_file.write_text("")
prompt_file.chmod(0o600)
pipelock_dir = pipelock_state_dir(slug)
pipelock_dir.mkdir(parents=True, exist_ok=True)
proxy_plan = proxy.prepare(bottle, slug, pipelock_dir)
git_gate_dir = git_gate_state_dir(slug)
git_gate_dir.mkdir(parents=True, exist_ok=True)
git_gate_plan = git_gate.prepare(bottle, slug, git_gate_dir)
egress_dir = egress_state_dir(slug)
egress_dir.mkdir(parents=True, exist_ok=True)
egress_plan = egress.prepare(bottle, slug, egress_dir)
supervise_plan = None
if bottle.supervise:
# Current Dockerfile for the agent image. Read from the repo
# root; for `--cwd` derived images the base Dockerfile is what
# the agent should propose changes against (the derived layer
# is just a workspace copy).
# (routes.yaml + pipelock allowlist used to land here too but
# PRD 0017 chunk 3 moved them behind the
# `list-egress-routes` MCP tool so the agent gets live
# state rather than a launch-time snapshot.)
supervise_dockerfile_path = (
Path(dockerfile_path)
if dockerfile_path
else Path(__file__).resolve().parent.parent.parent.parent / "Dockerfile.claude"
)
dockerfile_content = (
supervise_dockerfile_path.read_text()
if supervise_dockerfile_path.is_file()
else ""
)
supervise_dir = supervise_state_dir(slug)
supervise_dir.mkdir(parents=True, exist_ok=True)
supervise_plan = supervise.prepare(
slug, supervise_dir,
dockerfile_content=dockerfile_content,
)
resolved = resolve_env(manifest, spec.agent_name)
# Everything that should reach the bottle by-name (so its value
# never lands on argv or in env_file) goes into one dict. Nothing
# mutates the host os.environ.
forwarded_env: dict[str, str] = dict(resolved.forwarded)
# When the bottle declares an egress route with the
# `claude_code_oauth` role marker, claude-code's outbound
# Authorization gets stripped + re-injected by egress. The
# agent's environ still needs *something* claude-code recognises
# as a credential or it refuses to start; ship a non-secret
# placeholder. The placeholder isn't any real token value, so
# leaking it would tell an attacker only that egress is in
# front. Manifest validation enforces singleton on this role.
has_provider_auth = any(
provider_runtime.auth_role in r.roles for r in egress_plan.routes
)
if has_provider_auth:
forwarded_env[provider_runtime.placeholder_env] = "egress-placeholder"
if provider.template == "claude" and has_provider_auth:
# Belt-and-braces: turn off telemetry endpoints (statsig,
# error reporting) that egress can't gate by auth.
forwarded_env.setdefault("CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC", "1")
forwarded_env.setdefault("DISABLE_ERROR_REPORTING", "1")
_write_env_file(resolved, env_file)
prompt_file.write_text(agent.prompt)
use_runsc = docker_mod.runsc_available()
return DockerBottlePlan(
spec=spec,
stage_dir=stage_dir,
slug=slug,
container_name=container_name,
container_name_pinned=container_name_pinned,
image=image,
derived_image=derived_image,
runtime_image=runtime_image,
dockerfile_path=dockerfile_path,
env_file=env_file,
forwarded_env=forwarded_env,
prompt_file=prompt_file,
proxy_plan=proxy_plan,
git_gate_plan=git_gate_plan,
egress_plan=egress_plan,
supervise_plan=supervise_plan,
use_runsc=use_runsc,
agent_command=provider_runtime.command,
agent_prompt_mode=provider_runtime.prompt_mode,
agent_provider_template=provider.template,
)
def _write_env_file(resolved: ResolvedEnv, env_file: Path) -> None:
"""Serialize the literal portion of a ResolvedEnv into docker's
`--env-file` syntax (NAME=VALUE per line, mode 600 since the file
may carry verbatim values from the manifest). Forwarded names ride
on the plan as a structured tuple instead."""
env_lines: list[str] = []
for name, value in resolved.literals.items():
if "\n" in value:
die(
f"env entry {name} (literal) contains a newline; "
f"docker --env-file cannot represent multi-line values."
)
env_lines.append(f"{name}={value}")
env_file.write_text("\n".join(env_lines) + ("\n" if env_lines else ""))
env_file.chmod(0o600)
def _resolve_manifest_dockerfile(path_value: str, spec: BottleSpec) -> str:
path = Path(os.path.expanduser(path_value))
if not path.is_absolute():
path = Path(spec.user_cwd) / path
return str(path)
@@ -0,0 +1,8 @@
"""Per-provisioner modules for the Docker backend.
Each module exports one top-level function:
provision_<thing>(plan: DockerBottlePlan, target: str) -> ...
`DockerBottleBackend.provision_*` methods delegate to these. The
abstract `BottleBackend.provision_*` surface is unchanged; this
subpackage exists only to keep `backend.py` from being a god-file."""
+103
View File
@@ -0,0 +1,103 @@
"""Install the per-bottle MITM CA into the agent container's trust
store.
Post-PRD-0017 the CA depends on the agent's HTTP_PROXY target:
- Bottle declares `egress.routes[]` agent's HTTP_PROXY
points at egress; the cert the agent must trust is the
one egress mints leaf certs with (the egress CA).
- No egress routes agent's HTTP_PROXY points straight at
pipelock; the cert the agent must trust is pipelock's CA (the
pre-cutover behavior).
By the time this provisioner runs, the corresponding `tls_init`
helper has generated the chosen CA under `plan.stage_dir`, and the
sidecar (pipelock or egress) is up referencing the
in-container CA paths.
Cert lands on Debian's standard source path
(`/usr/local/share/ca-certificates/`); `update-ca-certificates`
rebuilds `/etc/ssl/certs/ca-certificates.crt`, which is what curl,
Python `ssl`, and OpenSSL-based tools all read by default. The env
trio set on the agent's `docker run` covers Node
(`NODE_EXTRA_CA_CERTS`) and Python `requests` /
`SSL_CERT_FILE`-honoring libraries that don't load the system
bundle.
The fingerprint is computed via stdlib (`ssl.PEM_cert_to_DER_cert`
+ `hashlib.sha256`) and logged once to stderr. The private key
stays on the host (under `stage_dir`) until teardown wipes the
stage dir; nothing in the agent ever sees it."""
from __future__ import annotations
import hashlib
import ssl
import subprocess
from pathlib import Path
from ....log import info
from ..bottle_plan import DockerBottlePlan
# Debian-family path for sources that `update-ca-certificates` reads.
# Bundle path is what the command rebuilds and what every standard
# TLS consumer in the image reads.
AGENT_CA_PATH = "/usr/local/share/ca-certificates/bot-bottle-mitm-ca.crt"
AGENT_CA_BUNDLE = "/etc/ssl/certs/ca-certificates.crt"
def _select_ca_cert(plan: DockerBottlePlan) -> tuple[Path, str]:
"""Pick the CA cert (and a short label for the log line) that
matches the proxy the agent's HTTP_PROXY points at. Egress-proxy
wins when the bottle declares any routes (it sits in front of
pipelock); else pipelock."""
if plan.egress_plan.routes:
cert = plan.egress_plan.mitmproxy_ca_cert_only_host_path
if cert == Path() or not cert.is_file():
from ....log import die
die(
f"egress CA cert missing at {cert or '(empty)'}; "
f"launch must have called egress_tls_init and "
f"re-bound the plan before provision"
)
return cert, "egress"
cert = plan.proxy_plan.ca_cert_host_path
if not cert or not cert.is_file():
from ....log import die
die(
f"pipelock CA cert missing at {cert or '(empty)'}; "
f"launch must have called pipelock_tls_init and re-bound "
f"the plan before provision"
)
return cert, "pipelock"
def provision_ca(plan: DockerBottlePlan, target: str) -> None:
"""Copy the agent-facing CA cert into the agent, rebuild the
trust bundle, emit a one-line fingerprint log. Called from
`BottleBackend.provision` after the agent container is up."""
container = target
cert_host_path, label = _select_ca_cert(plan)
subprocess.run(
["docker", "cp", str(cert_host_path), f"{container}:{AGENT_CA_PATH}"],
stdout=subprocess.DEVNULL,
check=True,
)
subprocess.run(
["docker", "exec", "-u", "0", container, "chmod", "644", AGENT_CA_PATH],
stdout=subprocess.DEVNULL,
check=True,
)
subprocess.run(
["docker", "exec", "-u", "0", container, "update-ca-certificates"],
stdout=subprocess.DEVNULL,
check=True,
)
# Stdlib SHA-256 of the cert's DER bytes — the standard
# fingerprint form. Never the private key.
der = ssl.PEM_cert_to_DER_cert(cert_host_path.read_text())
fingerprint = hashlib.sha256(der).hexdigest()
info(f"{label} ca fingerprint: sha256:{fingerprint[:32]}...")
+121
View File
@@ -0,0 +1,121 @@
"""Git provisioning inside a running Docker bottle.
Three concerns, all about git in the agent:
1. If --cwd was passed AND the host cwd has a .git, copy that .git
into /home/node/workspace/.git so the agent operates on the
user's repo.
2. If the bottle declares `git` entries (PRD 0008), write a
~/.gitconfig with insteadOf rules so every git operation
against a declared upstream (push, fetch, clone, pull,
ls-remote) transparently hits the per-agent git-gate. The
gate mirrors the upstream in both directions, so URL
rewriting is symmetric.
3. If the bottle declares `git.user` (issue #86), set
`git config --global user.{name,email}` inside the bottle so
the agent's commits are attributed to that identity.
"""
from __future__ import annotations
import os
import subprocess
from pathlib import Path
from ....git_gate import GIT_GATE_HOSTNAME, git_gate_render_gitconfig
from ....log import info
from .. import util as docker_mod
from ..bottle_plan import DockerBottlePlan
def provision_git(plan: DockerBottlePlan, target: str) -> None:
"""Set up git inside the bottle. Runs all three subcases; each
no-ops when its condition isn't met."""
_provision_cwd_git(plan, target)
_provision_git_gate_config(plan, target)
_provision_git_user(plan, target)
def _provision_cwd_git(plan: DockerBottlePlan, target: str) -> None:
"""If --cwd was set and the host cwd has a .git directory, copy
it into /home/node/workspace/.git and fix ownership. No-op
otherwise."""
if not (plan.spec.copy_cwd and Path(plan.spec.user_cwd, ".git").is_dir()):
return
container = target
info(f"copying {plan.spec.user_cwd}/.git -> {container}:/home/node/workspace/.git")
subprocess.run(
["docker", "cp", f"{plan.spec.user_cwd}/.git", f"{container}:/home/node/workspace/.git"],
stdout=subprocess.DEVNULL,
check=True,
)
subprocess.run(
[
"docker", "exec", "-u", "0", container,
"chown", "-R", "node:node", "/home/node/workspace/.git",
],
stdout=subprocess.DEVNULL,
check=True,
)
def _provision_git_gate_config(plan: DockerBottlePlan, target: str) -> None:
"""Write ~/.gitconfig in the bottle with the git-gate
insteadOf rules. No-op when the bottle has no `git` entries."""
bottle = plan.spec.manifest.bottle_for(plan.spec.agent_name)
if not bottle.git:
return
container = target
container_home = os.environ.get("BOT_BOTTLE_CONTAINER_HOME", "/home/node")
container_gitconfig = f"{container_home}/.gitconfig"
content = git_gate_render_gitconfig(bottle.git, GIT_GATE_HOSTNAME)
config_file = plan.stage_dir / "agent_gitconfig"
config_file.write_text(content)
config_file.chmod(0o600)
info(f"writing {container_gitconfig} with {len(bottle.git)} insteadOf rule(s)")
subprocess.run(
["docker", "cp", str(config_file), f"{container}:{container_gitconfig}"],
stdout=subprocess.DEVNULL,
check=True,
)
docker_mod.docker_exec_root(container, ["chown", "node:node", container_gitconfig])
docker_mod.docker_exec_root(container, ["chmod", "644", container_gitconfig])
def _provision_git_user(plan: DockerBottlePlan, target: str) -> None:
"""Apply `git config --global user.{name,email}` inside the
bottle so the agent's commits are attributed to the operator-
chosen identity instead of the agent image's default
(which is no user git would refuse to commit at all
until the agent ran its own `git config`).
Runs as the `node` user so `--global` lands in
`/home/node/.gitconfig` (matching the existing
`_provision_git_gate_config` write location). No-op when the
bottle didn't declare `git.user`.
Each field set independently name-only or email-only
configs only run the `git config` line for the field
present."""
bottle = plan.spec.manifest.bottle_for(plan.spec.agent_name)
gu = bottle.git_user
if gu.is_empty():
return
if gu.name:
info(f"git config --global user.name = {gu.name!r}")
subprocess.run(
["docker", "exec", "-u", "node", target,
"git", "config", "--global", "user.name", gu.name],
stdout=subprocess.DEVNULL,
check=True,
)
if gu.email:
info(f"git config --global user.email = {gu.email!r}")
subprocess.run(
["docker", "exec", "-u", "node", target,
"git", "config", "--global", "user.email", gu.email],
stdout=subprocess.DEVNULL,
check=True,
)
@@ -0,0 +1,43 @@
"""Copy the agent prompt into a running Docker bottle.
The prompt file is always copied (so the in-container path always
exists) but `--append-system-prompt-file` only fires when the agent
actually has a prompt the return value signals which case."""
from __future__ import annotations
import os
import subprocess
from ..bottle_plan import DockerBottlePlan
def provision_prompt(plan: DockerBottlePlan, target: str) -> str | None:
"""Copy the prompt file into the container, fix ownership/mode.
Returns the in-container path if the agent has a non-empty
prompt (drives --append-system-prompt-file), else None. The
file is copied either way so the path always exists."""
container = target
container_home = os.environ.get("BOT_BOTTLE_CONTAINER_HOME", "/home/node")
in_container_prompt_path = f"{container_home}/.bot-bottle-prompt.txt"
subprocess.run(
["docker", "cp", str(plan.prompt_file), f"{container}:{in_container_prompt_path}"],
stdout=subprocess.DEVNULL,
check=True,
)
# `docker cp` preserves host UID; re-own/mode as root so node
# can read its own mode-600 prompt regardless of host UID.
subprocess.run(
["docker", "exec", "-u", "0", container, "chown", "node:node", in_container_prompt_path],
stdout=subprocess.DEVNULL,
check=True,
)
subprocess.run(
["docker", "exec", "-u", "0", container, "chmod", "600", in_container_prompt_path],
stdout=subprocess.DEVNULL,
check=True,
)
agent = plan.spec.manifest.agents[plan.spec.agent_name]
return in_container_prompt_path if agent.prompt else None
@@ -0,0 +1,62 @@
"""Copy host-side skill directories into a running Docker bottle.
Skills are validated on the host before launch by the base class's
`BottleBackend._validate_skills` (called from `prepare`); this module
assumes that validation has already run. A skill disappearing between
validation and copy still dies loudly rather than silently producing
a partial container."""
from __future__ import annotations
import os
import subprocess
from ....log import die, info
from ...util import host_skill_dir
from ..bottle_plan import DockerBottlePlan
def provision_skills(plan: DockerBottlePlan, target: str) -> None:
"""Copy each of the agent's named skills from the host's
~/.claude/skills/<name>/ into the container's equivalent path.
For each skill: ensure parent dir, wipe any prior copy, then
`docker cp <host>/. <container>:<dst>/` so the contents are
copied into a freshly-created destination dir. No-op when the
agent has no skills."""
agent = plan.spec.manifest.agents[plan.spec.agent_name]
if not agent.skills:
return
container = target
container_home = os.environ.get("BOT_BOTTLE_CONTAINER_HOME", "/home/node")
skills_dir = os.environ.get(
"BOT_BOTTLE_CONTAINER_SKILLS_DIR", f"{container_home}/.claude/skills"
)
subprocess.run(
["docker", "exec", container, "mkdir", "-p", skills_dir],
stdout=subprocess.DEVNULL,
check=True,
)
for n in agent.skills:
src = host_skill_dir(n)
if not os.path.isdir(src):
die(f"skill '{n}' disappeared from host between validation and copy at {src}.")
dst = f"{skills_dir}/{n}"
info(f"copying skill {n} into {container}:{dst}")
subprocess.run(
["docker", "exec", container, "rm", "-rf", dst],
stdout=subprocess.DEVNULL,
check=True,
)
subprocess.run(
["docker", "exec", container, "mkdir", "-p", dst],
stdout=subprocess.DEVNULL,
check=True,
)
subprocess.run(
["docker", "cp", f"{src}/.", f"{container}:{dst}/"],
stdout=subprocess.DEVNULL,
check=True,
)
@@ -0,0 +1,65 @@
"""Supervise sidecar provisioning inside a running Docker bottle
(PRD 0013).
Registers the per-bottle supervise sidecar as an HTTP MCP server in
the agent's claude-code config so the agent discovers the three
stuck-recovery MCP tools (cred-proxy-block, pipelock-block,
capability-block) at startup.
Uses `claude mcp add` rather than writing JSON directly. claude-code
owns the on-disk config format (`~/.claude.json` `mcpServers` shape,
field names, scope semantics) and changes it between versions; the
official command handles whatever the installed version expects.
No-op when bottle.supervise is False bottles that haven't opted
into the supervise sidecar shouldn't get an MCP entry pointing at a
sidecar that isn't running.
"""
from __future__ import annotations
import subprocess
from ....log import info, warn
from ....supervise import SUPERVISE_HOSTNAME, SUPERVISE_PORT
from ..bottle_plan import DockerBottlePlan
_SUPERVISE_MCP_NAME = "supervise"
def supervise_mcp_url() -> str:
return f"http://{SUPERVISE_HOSTNAME}:{SUPERVISE_PORT}/"
def provision_supervise(plan: DockerBottlePlan, target: str) -> None:
"""Run `claude mcp add` inside the agent container to register
the supervise sidecar in claude-code's user config. No-op when
bottle.supervise is False.
Failure is logged but not fatal: the bottle still works (you
just can't call supervise tools from the agent until the entry
is added manually). The operator sees the warning at launch."""
if plan.supervise_plan is None:
return
url = supervise_mcp_url()
argv = [
"docker", "exec", "-u", "node", target,
"claude", "mcp", "add",
"--scope", "user",
"--transport", "http",
_SUPERVISE_MCP_NAME,
url,
]
info(f"registering supervise MCP server in agent claude config → {url}")
r = subprocess.run(argv, capture_output=True, text=True, check=False)
if r.returncode != 0:
warn(
f"`claude mcp add supervise` failed (exit {r.returncode}): "
f"{(r.stderr or r.stdout or '').strip()}. Inside the bottle, "
f"register manually with: "
f"claude mcp add --scope user --transport http supervise {url}"
)
__all__ = ["provision_supervise", "supervise_mcp_url"]
@@ -0,0 +1,31 @@
"""Sidecar bundle constants + helpers for the Docker backend
(PRD 0024).
The bundle image (built by Dockerfile.sidecars, PRD 0024 chunk 1)
runs pipelock + egress + git-gate + supervise as one container
per bottle under a small Python init supervisor. As of chunk 5
the bundle is the only shape the legacy four-sidecar topology
and its `BOT_BOTTLE_SIDECAR_BUNDLE` feature flag are gone."""
from __future__ import annotations
import os
# Bundle image. Defaults to a built-locally tag (built from the
# repo's Dockerfile.sidecars via compose `build:`). Operators
# pinning to a published digest can override via env, matching
# the existing `BOT_BOTTLE_PIPELOCK_IMAGE` shape.
SIDECAR_BUNDLE_IMAGE = os.environ.get(
"BOT_BOTTLE_SIDECAR_IMAGE",
"bot-bottle-sidecars:latest",
)
SIDECAR_BUNDLE_DOCKERFILE = "Dockerfile.sidecars"
def sidecar_bundle_container_name(slug: str) -> str:
"""`bot-bottle-sidecars-<slug>`. Same prefix scheme as the
per-sidecar containers it replaces, so the dashboard's
discovery-by-prefix logic keeps working."""
return f"bot-bottle-sidecars-{slug}"
+186
View File
@@ -0,0 +1,186 @@
"""Docker host-side primitives used by DockerBottleBackend: probing
for docker on PATH, slugifying agent names, checking image/container
existence, and building images."""
from __future__ import annotations
import re
import shutil
import subprocess
from typing import Iterable, Iterator
from ...log import die, info
# Cap on the suffix the container-name conflict logic will try before
# giving up: base, base-2, ..., base-MAX_CONTAINER_SUFFIX.
MAX_CONTAINER_SUFFIX = 100
def container_name_candidates(base: str) -> Iterator[str]:
"""Yield `base`, then `base-2`, `base-3`, ... up to
`base-MAX_CONTAINER_SUFFIX`. Both the prepare-time probe and the
launch-time race retry walk this sequence."""
yield base
for suffix in range(2, MAX_CONTAINER_SUFFIX + 1):
yield f"{base}-{suffix}"
def runsc_available() -> bool:
"""Return True if the Docker daemon has the gVisor (`runsc`) runtime
registered. Called once per prepare; the result lives on the plan."""
r = subprocess.run(
["docker", "info", "--format", "{{json .Runtimes}}"],
capture_output=True,
text=True,
check=False,
)
return r.returncode == 0 and "runsc" in r.stdout
def require_docker() -> None:
"""Fail with an install pointer if `docker` is not on PATH."""
if shutil.which("docker") is None:
info("Docker is required but was not found on PATH.")
info("macOS: install Docker Desktop https://docs.docker.com/desktop/install/mac-install/")
info("Linux: install Docker Engine https://docs.docker.com/engine/install/")
die("docker not found")
def image_exists(ref: str) -> bool:
return _silent_run(["docker", "image", "inspect", ref]) == 0
def container_exists(name: str) -> bool:
"""Returns True if a container (running or stopped) with the given
name exists. Uses `docker ps -a -q -f name=^<name>$` so substring
matches don't false-positive."""
result = subprocess.run(
["docker", "ps", "-a", "-q", "-f", f"name=^{name}$"],
capture_output=True,
text=True,
check=True,
)
return bool(result.stdout.strip())
def force_remove_container(name: str) -> None:
"""`docker rm -f` the named container if it exists. No-op if it
doesn't — and the rm itself is best-effort (errors swallowed) so
this is safe to register as a teardown callback."""
if container_exists(name):
subprocess.run(
["docker", "rm", "-f", name],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
)
def docker_exec_root(container: str, argv: list[str]) -> None:
"""Run `docker exec -u 0` in the named container, check=True. Used
by SSH provisioning to chown/chmod files that need root."""
subprocess.run(
["docker", "exec", "-u", "0", container, *argv],
stdout=subprocess.DEVNULL,
check=True,
)
_SLUG_RE = re.compile(r"[^a-z0-9]+")
def slugify(name: str) -> str:
"""Lowercase, non-alnum runs → '-', trimmed. Dies on empty result."""
if not name:
die("slugify: missing name")
slug = _SLUG_RE.sub("-", name.lower()).strip("-")
if not slug:
die(f"name '{name}' produced an empty slug; use alphanumeric characters")
return slug
def build_image(ref: str, context: str, *, dockerfile: str = "") -> None:
"""Invokes `docker build` every call. Layer cache makes no-change
rebuilds cheap; running every time means Dockerfile edits land
without manual `docker rmi`.
`dockerfile` is an optional path (relative to `context`, or
absolute) for callers that need to build from a non-default
Dockerfile in the same context e.g. `Dockerfile.git-gate`."""
info(f"building image {ref} from {context} (layer cache keeps repeat builds fast)")
args = ["docker", "build", "-t", ref]
if dockerfile:
args.extend(["-f", dockerfile])
args.append(context)
subprocess.run(args, check=True)
_TRUST_DIALOG_NODE_SCRIPT = (
'const fs=require("fs"),p=process.env.HOME+"/.claude.json",'
'c=JSON.parse(fs.readFileSync(p,"utf8"));'
'c.projects=c.projects||{};'
'c.projects[process.env.HOME+"/workspace"]={hasTrustDialogAccepted:true};'
'fs.writeFileSync(p,JSON.stringify(c,null,2));'
)
def build_image_with_cwd(derived: str, base: str, cwd: str) -> None:
"""Build a thin derived image that copies <cwd> into
/home/node/workspace and adds a trust-dialog entry for it."""
import os
if not os.path.isdir(cwd):
die(f"cwd not found at {cwd}")
info(f"building image {derived} from {base} with {cwd} -> /home/node/workspace")
dockerfile = (
f"FROM {base}\n"
f"COPY --chown=node:node . /home/node/workspace\n"
f"RUN node -e '{_TRUST_DIALOG_NODE_SCRIPT}'\n"
f"WORKDIR /home/node/workspace\n"
)
subprocess.run(
["docker", "build", "-t", derived, "-f", "-", cwd],
input=dockerfile,
text=True,
check=True,
)
def image_id(ref: str) -> str:
"""Return the content-addressed image ID (e.g.
`sha256:abcd...`) for `ref`. The smolmachines backend keys its
`.smolmachine` artifact cache on this, so a Dockerfile change
that produces a new image automatically invalidates the cache."""
r = subprocess.run(
["docker", "image", "inspect", "--format", "{{.Id}}", ref],
capture_output=True,
text=True,
check=False,
)
if r.returncode != 0:
die(
f"docker image inspect for {ref!r} failed: "
f"{(r.stderr or '').strip() or '<no stderr>'}"
)
return r.stdout.strip()
def save(ref: str, output: str) -> None:
"""`docker save REF -o OUTPUT`. Writes a tarball of the image
layers + manifest to the host path. Used by smolmachines
prepare to hand the agent image to a containerized crane that
pushes it to the ephemeral registry bypassing the docker
daemon's `docker push` (which on Docker Desktop can't reach a
host-loopback registry and refuses plain-HTTP pushes to
non-loopback hosts)."""
subprocess.run(["docker", "save", ref, "-o", output], check=True)
def _silent_run(cmd: Iterable[str]) -> int:
return subprocess.run(
list(cmd),
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
).returncode
+28
View File
@@ -0,0 +1,28 @@
"""Shared print helpers for BottlePlan.print implementations.
Lifts the multi-value label printer out of DockerBottlePlan so the
smolmachines backend (and any future backend) renders the same
two-column scannable preflight without duplicating the indent
math."""
from __future__ import annotations
from typing import Sequence
from ..log import info
def print_multi(label: str, values: Sequence[str]) -> None:
"""Print `label: <value>` with continuation lines indented to
align under the first value. Empty `values` renders `(none)`.
Used by every backend's `BottlePlan.print` for env / skills /
git / egress one item per line keeps the preflight summary
scannable when an agent has many of any of these."""
if not values:
info(f"{label}: (none)")
return
info(f"{label}: {values[0]}")
indent = " " * (len(label) + 2)
for v in values[1:]:
info(f"{indent}{v}")
@@ -0,0 +1,15 @@
"""smolmachines bottle backend (PRD 0023).
Selectable via `BOT_BOTTLE_BACKEND=smolmachines`. Runs each
bottle inside a per-agent microVM (libkrun / Hypervisor.framework
on macOS) with a userspace gvproxy gateway as the egress
primitive. The sidecar bundle (PRD 0024) runs as a host-side
docker container reached only through gvproxy's port-forward list.
Chunk 1 (this commit) ships the backend skeleton + Smolfile +
gvproxy renderers + preflight check. VM lifecycle, sidecar
bringup, and provisioning land in later chunks."""
from .backend import SmolmachinesBottleBackend # noqa: F401
__all__ = ["SmolmachinesBottleBackend"]
@@ -0,0 +1,86 @@
"""SmolmachinesBottleBackend — the smolmachines implementation of
BottleBackend (PRD 0023)."""
from __future__ import annotations
from contextlib import contextmanager
from pathlib import Path
from typing import Generator, Sequence
from .. import ActiveAgent, BottleBackend, BottleSpec
from . import cleanup as _cleanup
from . import enumerate as _enumerate
from . import launch as _launch
from . import prepare as _prepare
from . import smolvm as _smolvm
from .bottle import SmolmachinesBottle
from .bottle_cleanup_plan import SmolmachinesBottleCleanupPlan
from .bottle_plan import SmolmachinesBottlePlan
from .provision import ca as _ca
from .provision import git as _git
from .provision import prompt as _prompt
from .provision import skills as _skills
from .provision import supervise as _supervise
class SmolmachinesBottleBackend(
BottleBackend["SmolmachinesBottlePlan", "SmolmachinesBottleCleanupPlan"]
):
"""smolmachines backend. Selected by
`BOT_BOTTLE_BACKEND=smolmachines`."""
name = "smolmachines"
@classmethod
def is_available(cls) -> bool:
"""`smolvm` on PATH. The backend additionally needs macOS
for libkrun + TSI, but `enumerate_active` / `cleanup` are
host-shell ops that gracefully no-op on Linux too the
runtime check happens at `prepare`."""
return _smolvm.is_available()
def _resolve_plan(
self, spec: BottleSpec, *, stage_dir: Path
) -> SmolmachinesBottlePlan:
return _prepare.resolve_plan(spec, stage_dir=stage_dir)
@contextmanager
def launch(
self, plan: SmolmachinesBottlePlan
) -> Generator[SmolmachinesBottle, None, None]:
with _launch.launch(plan, provision=self.provision) as bottle:
yield bottle
def provision_ca(
self, plan: SmolmachinesBottlePlan, target: str
) -> None:
_ca.provision_ca(plan, target)
def provision_prompt(
self, plan: SmolmachinesBottlePlan, target: str
) -> str | None:
return _prompt.provision_prompt(plan, target)
def provision_skills(
self, plan: SmolmachinesBottlePlan, target: str
) -> None:
_skills.provision_skills(plan, target)
def provision_git(
self, plan: SmolmachinesBottlePlan, target: str
) -> None:
_git.provision_git(plan, target)
def provision_supervise(
self, plan: SmolmachinesBottlePlan, target: str
) -> None:
_supervise.provision_supervise(plan, target)
def prepare_cleanup(self) -> SmolmachinesBottleCleanupPlan:
return _cleanup.prepare_cleanup()
def cleanup(self, plan: SmolmachinesBottleCleanupPlan) -> None:
_cleanup.cleanup(plan)
def enumerate_active(self) -> Sequence[ActiveAgent]:
return _enumerate.enumerate_active()
+179
View File
@@ -0,0 +1,179 @@
"""SmolmachinesBottle — running-instance handle (PRD 0023 chunk 2d).
Routes `exec_claude` / `exec` / `cp_in` through `smolvm machine
exec` / `smolvm machine cp`. The handle is yielded by `launch`
and torn down via the surrounding ExitStack on context exit;
`close` is a no-op idempotent alias so the BottleBackend ABC's
context-manager contract is satisfied.
User context: `smolvm machine exec` runs commands as root in the
VM, but the agent image's USER is `node` and claude-code refuses
to run as root with `--dangerously-skip-permissions`. Both
`exec_claude` and `exec` switch to the requested user (default
`node`) via `runuser -u <user> --` and set `HOME` / `USER`
through `smolvm -e` avoiding `runuser -l`'s login-shell wiring
(PAM session setup, /etc/profile sourcing) which can hang on a
minimal Debian VM with no PAM session config."""
from __future__ import annotations
import subprocess
import sys
from typing import Mapping
from ...agent_provider import prompt_args
from .. import Bottle, ExecResult
from . import pty_resize as _pty_resize
from . import smolvm as _smolvm
# Absolute path to the pty_resize wrapper. Invoke as
# `python <path>` rather than `python -m <dotted-path>` so the
# wrapper runs regardless of cwd / sys.path — it has no
# bot_bottle.* imports, so it's self-contained.
_PTY_RESIZE_SCRIPT = _pty_resize.__file__
# Per-user env the agent image's USER (node) expects. claude
# reads ~/.claude.json + writes session state under ~/.claude/;
# bare `runuser -u` inherits root's HOME=/root, which claude
# can't write to. Set HOME / USER explicitly through smolvm -e
# so the child process sees them.
_HOME_FOR = {
"node": "/home/node",
"root": "/root",
}
def _env_flags_for(user: str) -> list[str]:
home = _HOME_FOR.get(user, f"/home/{user}")
return ["-e", f"HOME={home}", "-e", f"USER={user}"]
def _guest_env_flags(env: Mapping[str, str]) -> list[str]:
"""Render `{K: V}` into a flat `-e K=V` argv slice for
`smolvm machine exec`. `smolvm machine create -e` set env
on PID 1 but it doesn't propagate to fresh exec process
trees, so we have to re-pass them every call."""
out: list[str] = []
for k, v in env.items():
out += ["-e", f"{k}={v}"]
return out
class SmolmachinesBottle(Bottle):
"""Handle returned by `SmolmachinesBottleBackend.launch`. The
underlying VM lifecycle (create / start / stop / delete) lives
on the launch ExitStack this class only routes runtime
operations to the right `smolvm machine ...` subcommand."""
def __init__(
self,
machine_name: str,
*,
prompt_path: str | None = None,
guest_env: Mapping[str, str] | None = None,
agent_command: str = "claude",
agent_prompt_mode: str = "claude_append_file",
) -> None:
self.name = machine_name
# In-VM path to the agent's prompt file. None when the
# agent declared no prompt (file still exists; we just
# don't pass --append-system-prompt-file).
self._prompt_path = prompt_path
# Env vars the agent process needs (HTTPS_PROXY,
# CLAUDE_CODE_OAUTH_TOKEN, manifest-declared bottle env, …).
# Forwarded on every `smolvm machine exec` via `-e K=V`
# because exec doesn't inherit from machine_create's env.
self._guest_env = dict(guest_env or {})
self._agent_command = agent_command
self._agent_prompt_mode = agent_prompt_mode
self.agent_command = agent_command
self.agent_provider_template = (
"codex" if agent_command == "codex" else "claude"
)
def claude_argv(
self, argv: list[str], *, tty: bool = True,
) -> list[str]:
flags = ["smolvm", "machine", "exec", "--name", self.name]
if tty:
flags += ["-i", "-t"]
flags += _env_flags_for("node")
flags += _guest_env_flags(self._guest_env)
claude_tail = [self._agent_command]
provider_prompt_args = prompt_args(
self._agent_prompt_mode, self._prompt_path, argv=argv,
)
if self._agent_prompt_mode == "codex_read_prompt_file":
claude_tail += argv
claude_tail += provider_prompt_args
else:
claude_tail += provider_prompt_args
claude_tail += argv
flags += ["--", "runuser", "-u", "node", "--", *claude_tail]
if not tty:
# No PTY allocated — no SIGWINCH to forward, no resize
# bridge needed. Skip the wrapper so non-interactive
# exec paths (e.g., provisioning shell-outs that
# happen to go through this method) stay light.
return flags
return [
sys.executable, _PTY_RESIZE_SCRIPT,
self.name, "--", *flags,
]
def exec_claude(self, argv: list[str], *, tty: bool = True) -> int:
"""Run `claude` interactively inside the VM as the `node`
user. Inherits the operator's terminal (stdin / stdout /
stderr) so the session feels native. Blocks until claude
exits; returns the in-VM exit code.
We bypass the captured-output `machine_exec` helper here
because that one wraps stdout/stderr in pipes fine for
scripted exec, wrong for an interactive shell. Drop down
to `subprocess.run` with the TTY inherited.
UID switches via `runuser -u node --` (not `-l`) so we
avoid login-shell wiring. HOME / USER come from `smolvm
-e` instead, which sets them on the process env."""
return subprocess.run(
self.claude_argv(argv, tty=tty), check=False,
).returncode
def exec(self, script: str, *, user: str = "node") -> ExecResult:
"""Run a POSIX shell script as `user` (default `node`) and
capture the result. Matches the docker backend's `exec`,
which defaults to the image's USER (also node) — so test
helpers / provision shell-outs run with the same identity
on both backends. Pass `user="root"` for tests that need
root.
`runuser -u <user> -- /bin/sh -c <script>` switches UID
without invoking a login shell; HOME / USER are set via
`smolvm -e` (see `_env_flags_for`)."""
argv = (
_env_flags_for(user)
+ _guest_env_flags(self._guest_env)
+ ["--", "runuser", "-u", user, "--", "/bin/sh", "-c", script]
)
# _smolvm.machine_exec expects argv (the bit after `--`);
# the -e flags go before, so call smolvm directly.
r = subprocess.run(
["smolvm", "machine", "exec", "--name", self.name] + argv,
capture_output=True, text=True, check=False,
)
return ExecResult(
returncode=r.returncode,
stdout=r.stdout or "",
stderr=r.stderr or "",
)
def cp_in(self, host_path: str, container_path: str) -> None:
"""Copy a host path into the guest at `container_path`."""
_smolvm.machine_cp(host_path, f"{self.name}:{container_path}")
def close(self) -> None:
# Real teardown lives on the launch ExitStack; this is just
# the idempotent alias the BottleBackend ABC expects.
pass
@@ -0,0 +1,55 @@
"""SmolmachinesBottleCleanupPlan — concrete BottleCleanupPlan (issue #77).
Tracks the resources `SmolmachinesBottleBackend.cleanup` will
remove:
- machines: smolvm machines whose name starts with
`bot-bottle-` (running or stopped). Stopped +
deleted via `smolvm machine stop` + `machine delete -f`.
- bundles: docker containers `bot-bottle-sidecars-<slug>`
left over from a smolmachines bottle (the bundle's
port-forwards stay published on lo0 aliases until
the container is gone). Removed via `docker rm -f`.
- networks: docker networks `bot-bottle-bundle-<slug>`
attached to the bundles. Removed via
`docker network rm`.
Smolmachines state dirs live under the same `~/.bot-bottle/state/`
path the docker backend uses; the docker backend's
`prepare_cleanup` already enumerates orphan state dirs and is the
single source of truth for that bucket (consults
`enumerate_active_bottles()` so it doesn't reap a live
smolmachines bottle's dir)."""
from __future__ import annotations
import sys
from dataclasses import dataclass
from ...log import info
from .. import BottleCleanupPlan
@dataclass(frozen=True)
class SmolmachinesBottleCleanupPlan(BottleCleanupPlan):
"""Resources SmolmachinesBottleBackend.cleanup will remove.
Produced by `prepare_cleanup`; sorted so the y/N output is
stable."""
machines: tuple[str, ...] = ()
bundles: tuple[str, ...] = ()
networks: tuple[str, ...] = ()
@property
def empty(self) -> bool:
return not self.machines and not self.bundles and not self.networks
def print(self) -> None:
print(file=sys.stderr)
for name in self.machines:
info(f"smolvm machine: {name}")
for name in self.bundles:
info(f"bundle container:{name}")
for name in self.networks:
info(f"bundle network: {name}")
print(file=sys.stderr)
@@ -0,0 +1,128 @@
"""SmolmachinesBottlePlan — concrete BottlePlan for the smolmachines
backend (PRD 0023).
Slug + bundle docker subnet / gateway / pinned IP + smolvm
machine name + agent `.smolmachine` artifact + per-bottle guest
env. Provisioning fields (CA cert path, prompt path, etc.) land
in chunk 4."""
from __future__ import annotations
import sys
from dataclasses import dataclass
from pathlib import Path
from ...egress import EgressPlan
from ...git_gate import GitGatePlan
from ...log import info
from ...pipelock import PipelockProxyPlan
from ...supervise import SupervisePlan
from .. import BottlePlan
from ..print_util import print_multi
@dataclass(frozen=True)
class SmolmachinesBottlePlan(BottlePlan):
"""Resolved fields the launch step needs to bring up the bottle.
Inherits `spec` and `stage_dir` from BottlePlan."""
slug: str
# Per-bottle docker subnet for the sidecar bundle container.
# The bundle runs at `bundle_ip` (always `.2`); the gateway is
# at `.1`. smolvm's TSI allowlist is set to `bundle_ip/32`.
bundle_subnet: str
bundle_gateway: str
bundle_ip: str
# smolvm machine name + agent image source. machine_create
# boots from a packed `.smolmachine` artifact (pre-baked at
# prepare time via `smolvm pack create`); using `--from`
# instead of `--image` avoids the registry-pull race we hit
# when machine_start tried to fetch on-demand and the libkrun
# agent's network attempt got refused by macOS.
#
# Chunk 2d ships with a public placeholder image (alpine)
# since bot-bottle-claude:latest lives in the operator's local
# docker daemon and smolvm's crane backend can't read from
# there; chunk 4 resolves the agent-image-conversion gap
# (push to a registry first, or smolvm grows a docker-daemon
# transport).
machine_name: str
# Agent image ref (docker tag). `launch` runs the
# build → save → registry push → smolvm pack pipeline against
# this and feeds the resulting `.smolmachine` artifact to
# `machine_create --from`. The pipeline runs at launch time
# (not prepare time) so the docker build output doesn't garble
# the dashboard's preflight modal.
agent_image_ref: str
# In-guest env vars (HTTPS_PROXY etc) — IP-literal URLs since
# the guest has no DNS resolver inside the TSI allowlist.
# Passed to `smolvm machine create` as `-e K=V` flags.
# Smolfile-rendering is gone (smolvm 0.8.0's
# `--smolfile` is mutually exclusive with `--from`, and
# `--from` is the path that avoids the registry-pull race).
guest_env: dict[str, str]
# Path to the agent's prompt file on the host. Always written
# (mode 0o600) so the in-VM path always exists; the file is
# empty when the agent has no prompt — claude-code reads it
# via --append-system-prompt-file only when non-empty.
prompt_file: Path
# Inner Plans for the four bundle daemons. The same shape the
# docker backend uses — same `.prepare()` calls produced
# them — but our launch step doesn't populate the
# docker-specific network fields (internal_network,
# egress_network) because the smolmachines bundle isn't on
# docker's `--internal` + egress bridge topology; it's on a
# per-bottle bridge with a pinned IP. The unused fields stay
# at their dataclass defaults.
proxy_plan: PipelockProxyPlan
git_gate_plan: GitGatePlan
egress_plan: EgressPlan
# None when bottle.supervise is False, matching the docker
# backend's convention.
supervise_plan: SupervisePlan | None
# Agent-side endpoints. On Docker Desktop the docker bridge
# IPs aren't reachable from the smolvm guest (TSI uses macOS
# networking; docker container IPs live in the daemon's VM),
# so the agent dials the bundle via host loopback +
# docker-published random ports. Empty at prepare time;
# launch populates these after bundle bringup via
# `dataclasses.replace`. Format: a `host:port` for git-gate
# (insteadOf URL prefix) + full URLs for proxy / supervise.
agent_proxy_url: str = ""
agent_git_gate_host: str = ""
agent_supervise_url: str = ""
agent_command: str = "claude"
agent_prompt_mode: str = "claude_append_file"
agent_provider_template: str = "claude"
agent_dockerfile_path: str = ""
def print(self, *, remote_control: bool) -> None:
"""Compact y/N preflight. Same shape as the Docker
backend's so operators see one format across backends."""
del remote_control # not surfaced in the compact summary
spec = self.spec
manifest = spec.manifest
agent = manifest.agents[spec.agent_name]
bottle = manifest.bottle_for(spec.agent_name)
env_names = sorted(bottle.env.keys())
upstreams = [
f"{g.Name}{g.Upstream}" for g in bottle.git
]
# Use the resolved egress_plan (lowercase `host` on the
# plan-level EgressRoute) rather than `bottle.egress.routes`,
# which is the manifest's capitalized-attr form.
routes = [r.host for r in self.egress_plan.routes]
print(file=sys.stderr)
info(f"agent : {spec.agent_name}")
info(f"provider : {self.agent_provider_template}")
print_multi("env ", env_names)
print_multi("skills ", list(agent.skills))
info(f"bottle : {agent.bottle}")
if upstreams:
print_multi(" git gate ", upstreams)
if routes:
print_multi(" egress ", routes)
print(file=sys.stderr)
+159
View File
@@ -0,0 +1,159 @@
"""Cleanup + active-listing for the smolmachines backend (issue #77).
`prepare_cleanup` enumerates leftover smolmachines resources:
- smolvm machines (`smolvm machine ls --json`) whose name starts
with `bot-bottle-`.
- bundle docker containers (`bot-bottle-sidecars-<slug>`).
- bundle docker networks (`bot-bottle-bundle-<slug>`).
State dirs live under `~/.bot-bottle/state/<identity>/`
shared layout with the docker backend, which has the single
orphan-state-dir enumerator (it already consults
`enumerate_active_agents()` so a live smolmachines bottle's dir
is preserved).
`cleanup` removes everything in the plan: stop + delete each VM,
force-rm each container, rm each network. Each step is
best-effort a failure on one resource doesn't block the others."""
from __future__ import annotations
import json
import subprocess
from ...log import info, warn
from . import sidecar_bundle as _bundle
from . import smolvm as _smolvm
from .bottle_cleanup_plan import SmolmachinesBottleCleanupPlan
# Both names start with the same prefix the launcher uses.
_VM_PREFIX = "bot-bottle-"
_BUNDLE_PREFIX = _bundle.bundle_container_name("") # `bot-bottle-sidecars-`
_NETWORK_PREFIX = _bundle.bundle_network_name("") # `bot-bottle-bundle-`
def prepare_cleanup() -> SmolmachinesBottleCleanupPlan:
"""Enumerate every smolmachines-owned resource on the host.
No side effects. Returns an empty plan when smolvm isn't on
PATH (no machines to reap) `cleanup` is a no-op in that
case too."""
machines = _list_bot_bottle_machines()
bundles = _list_bundle_containers()
networks = _list_bundle_networks()
return SmolmachinesBottleCleanupPlan(
machines=tuple(sorted(machines)),
bundles=tuple(sorted(bundles)),
networks=tuple(sorted(networks)),
)
def cleanup(plan: SmolmachinesBottleCleanupPlan) -> None:
"""Remove everything in the plan. Order matters: stop VMs
first (they hold ports on lo0 aliases via libkrun), then the
bundle containers (which hold the host port-forwards), then
the networks (which docker won't reap until the containers
are gone)."""
for name in plan.machines:
info(f"stopping smolvm machine {name}")
subprocess.run(
["smolvm", "machine", "stop", "--name", name],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
check=False,
)
info(f"deleting smolvm machine {name}")
r = subprocess.run(
["smolvm", "machine", "delete", "-f", name],
capture_output=True, text=True, check=False,
)
if r.returncode != 0:
warn(
f"smolvm machine delete -f {name} failed: "
f"{(r.stderr or '').strip()}"
)
for name in plan.bundles:
info(f"removing bundle container {name}")
subprocess.run(
["docker", "rm", "-f", name],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
check=False,
)
for name in plan.networks:
info(f"removing bundle network {name}")
r = subprocess.run(
["docker", "network", "rm", name],
capture_output=True, text=True, check=False,
)
if r.returncode != 0 and "no such network" not in (r.stderr or "").lower():
warn(
f"docker network rm {name} failed: "
f"{(r.stderr or '').strip()}"
)
def _list_bot_bottle_machines() -> list[str]:
"""All smolvm machines named `bot-bottle-*`, regardless of
state (running / stopped / created). Empty when smolvm isn't
installed."""
if not _smolvm.is_available():
return []
r = subprocess.run(
["smolvm", "machine", "ls", "--json"],
capture_output=True, text=True, check=False,
)
if r.returncode != 0:
return []
try:
machines = json.loads(r.stdout or "[]")
except json.JSONDecodeError:
return []
return [
m["name"] for m in machines
if isinstance(m, dict)
and m.get("name", "").startswith(_VM_PREFIX)
]
def _list_bundle_containers() -> list[str]:
"""All docker containers named `bot-bottle-sidecars-*`,
running or stopped. Empty when docker isn't installed."""
# Late import: `backend/__init__` imports this module
# transitively via the smolmachines backend.
from .. import has_backend
if not has_backend("docker"):
return []
r = subprocess.run(
["docker", "ps", "-a",
"--filter", f"name=^{_BUNDLE_PREFIX}",
"--format", "{{.Names}}"],
capture_output=True, text=True, check=False,
)
if r.returncode != 0:
return []
return [
line for line in (r.stdout or "").splitlines()
if line and line.startswith(_BUNDLE_PREFIX)
]
def _list_bundle_networks() -> list[str]:
"""All docker networks named `bot-bottle-bundle-*`. Empty
when docker isn't installed."""
from .. import has_backend
if not has_backend("docker"):
return []
r = subprocess.run(
["docker", "network", "ls",
"--filter", f"name={_NETWORK_PREFIX}",
"--format", "{{.Name}}"],
capture_output=True, text=True, check=False,
)
if r.returncode != 0:
return []
return [
line for line in (r.stdout or "").splitlines()
if line and line.startswith(_NETWORK_PREFIX)
]
@@ -0,0 +1,121 @@
"""Active-agent enumeration for the smolmachines backend (PRD
0023 chunk 4 follow-up + issue #77).
Returns a list of `ActiveAgent` records same shape the docker
backend produces so CLI `list active` and the dashboard agents
pane render both backends through one code path.
A smolmachines agent is "active" when its smolvm guest is
running. We cross-reference against the per-bottle sidecar
bundle container to populate the `services` field (which daemons
are up in the bundle); without a bundle we still surface the VM
so the operator can see + clean it up.
The cross-backend caller gates on `has_backend("smolmachines")`
and `has_backend("docker")`, so this module assumes both are
available when called. Both subprocess calls below still
tolerate "command not on PATH" defensively, but the gate is the
intended access pattern."""
from __future__ import annotations
import json
import subprocess
from .. import ActiveAgent
from ..docker.bottle_state import read_metadata
from . import sidecar_bundle as _bundle
# Smolvm VM names produced by prepare are `bot-bottle-<slug>`,
# matching the bundle container name pattern. We use the prefix
# both as a filter and to strip back to the slug.
_VM_NAME_PREFIX = "bot-bottle-"
def enumerate_active() -> list[ActiveAgent]:
"""All currently-running smolmachines-backed agents. Empty
list when no matching VMs are running. Caller is responsible
for gating on `has_backend('smolmachines')` if needed; if
smolvm is missing the `smolvm machine ls` call below returns
nothing silently."""
result = subprocess.run(
["smolvm", "machine", "ls", "--json"],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
return []
try:
machines = json.loads(result.stdout or "[]")
except json.JSONDecodeError:
return []
services_by_slug = _query_bundle_services()
out: list[ActiveAgent] = []
for m in machines:
name = m.get("name") or ""
state = m.get("state") or ""
if state != "running" or not name.startswith(_VM_NAME_PREFIX):
continue
slug = name[len(_VM_NAME_PREFIX):]
metadata = read_metadata(slug)
out.append(ActiveAgent(
backend_name="smolmachines",
slug=slug,
agent_name=metadata.agent_name if metadata else "?",
started_at=metadata.started_at if metadata else "",
services=services_by_slug.get(slug, ()),
))
return out
def _query_bundle_services() -> dict[str, tuple[str, ...]]:
"""`{slug: ('egress', 'pipelock', ...)}` from each running
bundle container's `BOT_BOTTLE_SIDECAR_DAEMONS` env var.
Smolmachines bundles all run the PRD-0024 image with the
same daemon set declared via env, so one inspect per bundle
gets us the picture without exec'ing into the container.
Returns an empty mapping when the docker backend isn't
available the bundle services field on each ActiveAgent
just shows up empty, matching the docker backend's "starting"
state."""
# Late import: `has_backend` lives on the backend package's
# __init__, which imports this module transitively. Pulling
# the name in at call time sidesteps the cycle.
from .. import has_backend
if not has_backend("docker"):
return {}
ps = subprocess.run(
["docker", "ps",
"--filter", "name=" + _bundle.bundle_container_name(""),
"--format", "{{.Names}}"],
capture_output=True, text=True, check=False,
)
if ps.returncode != 0:
return {}
out: dict[str, tuple[str, ...]] = {}
for line in (ps.stdout or "").splitlines():
name = line.strip()
if not name:
continue
slug = name.removeprefix(_bundle.bundle_container_name(""))
if not slug:
continue
inspect = subprocess.run(
["docker", "inspect", name, "--format", "{{json .Config.Env}}"],
capture_output=True, text=True, check=False,
)
if inspect.returncode != 0:
continue
try:
env_list = json.loads(inspect.stdout or "[]")
except json.JSONDecodeError:
continue
for entry in env_list:
key, _, value = entry.partition("=")
if key == "BOT_BOTTLE_SIDECAR_DAEMONS":
out[slug] = tuple(sorted(
d for d in value.split(",") if d
))
break
return out
+468
View File
@@ -0,0 +1,468 @@
"""End-to-end launch flow for the smolmachines backend
(PRD 0023 chunks 2d + 4b).
Brings up the per-bottle docker bridge + sidecar bundle (with
real daemons + their config files), creates + starts the smolvm
guest pointed at the bundle's pinned IP via TSI's
`--allow-cidr <bundle-ip>/32` allowlist, yields a
`SmolmachinesBottle` handle, tears everything down on context
exit.
The bundle's daemons consume the inner Plans the docker backend
already produces: pipelock reads its yaml + CA from the
PipelockProxyPlan; egress reads routes + CAs from the EgressPlan
+ EGRESS_UPSTREAM_PROXY pointing at `127.0.0.1:8888` (bundle
local), since the agent dials pipelock first (not egress) on the
smolmachines path. Git-gate + supervise plumb through the same
plans the docker backend uses, minus the docker-network fields
that don't apply here."""
from __future__ import annotations
import dataclasses
import os
import time
from contextlib import ExitStack, contextmanager
from pathlib import Path
from typing import Callable, Generator
from ...egress import EGRESS_ROUTES_IN_CONTAINER, egress_resolve_token_values
from ...pipelock import (
PIPELOCK_CA_CERT_IN_CONTAINER,
PIPELOCK_CA_KEY_IN_CONTAINER,
)
from ...supervise import QUEUE_DIR_IN_CONTAINER, SUPERVISE_PORT
from ...util import expand_tilde
from ..docker import util as docker_mod
from ..docker.egress import (
EGRESS_CA_IN_CONTAINER,
EGRESS_PIPELOCK_CA_IN_CONTAINER,
EGRESS_PORT as _EGRESS_PORT,
egress_tls_init,
)
from ..docker.git_gate import (
GIT_GATE_ACCESS_HOOK_IN_CONTAINER,
GIT_GATE_CREDS_DIR_IN_CONTAINER,
GIT_GATE_ENTRYPOINT_IN_CONTAINER,
GIT_GATE_HOOK_IN_CONTAINER,
GIT_GATE_PORT as _GIT_GATE_PORT,
)
from ..docker.pipelock import (
BUNDLE_LOCAL_PIPELOCK_URL,
PIPELOCK_PORT as _PIPELOCK_PORT_STR,
pipelock_tls_init,
)
from . import loopback_alias as _loopback
from . import sidecar_bundle as _bundle
from . import smolvm as _smolvm
from .bottle import SmolmachinesBottle
from .bottle_plan import SmolmachinesBottlePlan
from .local_registry import crane_push_tarball, ephemeral_registry
# Repo root, used as the `docker build` context for the agent image.
_REPO_DIR = str(Path(__file__).resolve().parent.parent.parent.parent)
# Per-host cache for `smolvm pack create` outputs. Keyed by the
# docker image ID so a Dockerfile change automatically invalidates
# the cache. `pack create` is idempotent on the smolvm side but
# takes several seconds even on a no-op rebuild.
_SMOLMACHINE_CACHE_DIR = Path.home() / ".cache" / "bot-bottle" / "smolmachines"
# Container-internal listening ports for each bundle daemon. The
# bundle publishes each one on a random host loopback port (see
# `_bundle.start_bundle`), and `_bundle.bundle_host_port` looks
# them up post-start. Pipelock's port is an env-overridable string
# in docker.pipelock; coerce to int here.
_PIPELOCK_PORT = int(_PIPELOCK_PORT_STR)
_SUPERVISE_PORT = SUPERVISE_PORT
@contextmanager
def launch(
plan: SmolmachinesBottlePlan,
*,
provision: Callable[[SmolmachinesBottlePlan, str], str | None],
) -> Generator[SmolmachinesBottle, None, None]:
"""Build + run the bottle and yield a handle; tear everything
down on exit. Errors during bringup unwind any partial state
via the ExitStack."""
stack = ExitStack()
try:
# 1. Reserve a loopback alias for this bottle. macOS only
# routes 127.0.0.1 by default; the per-bottle alias is
# what bundles the docker port-publishes and TSI allowlist
# against, so this bottle can't reach other bottles' (or
# other host services') ports on the loopback. Lazy
# sudo-driven on first use per boot. No-op on Linux.
_loopback.ensure_pool()
loopback_ip = _loopback.allocate(plan.slug)
# 2. Per-bottle docker bridge.
network = _bundle.bundle_network_name(plan.slug)
_bundle.create_bundle_network(network, plan.bundle_subnet, plan.bundle_gateway)
stack.callback(_bundle.remove_bundle_network, network)
# 2. Mint per-bottle CAs and update the inner Plans with
# their launch-time paths. pipelock always runs in the
# bundle; egress's CA is only minted when the bottle
# declares routes (otherwise egress runs idle without
# MITM and the CA files would be unused).
ca_cert_host, ca_key_host = pipelock_tls_init(plan.proxy_plan.yaml_path.parent)
proxy_plan = dataclasses.replace(
plan.proxy_plan,
ca_cert_host_path=ca_cert_host,
ca_key_host_path=ca_key_host,
)
egress_plan = plan.egress_plan
if egress_plan.routes:
egress_ca_host, egress_ca_cert_only = egress_tls_init(
plan.egress_plan.routes_path.parent,
)
egress_plan = dataclasses.replace(
egress_plan,
mitmproxy_ca_host_path=egress_ca_host,
mitmproxy_ca_cert_only_host_path=egress_ca_cert_only,
pipelock_ca_host_path=ca_cert_host,
# On smolmachines, egress's upstream is pipelock
# on the bundle's localhost — they're in the same
# container's network namespace.
pipelock_proxy_url=BUNDLE_LOCAL_PIPELOCK_URL,
)
plan = dataclasses.replace(
plan, proxy_plan=proxy_plan, egress_plan=egress_plan,
)
# 3. Build the BundleLaunchSpec from the (now-resolved)
# inner Plans: daemon subset, env, bind-mounts, and the
# loopback alias to bind published ports against. The
# spec's ports_to_publish list expands depending on which
# daemons the agent needs to reach from the smolvm guest.
bundle_spec = _bundle_launch_spec(plan, network, loopback_ip)
token_env = _resolve_token_env(plan, os.environ)
_bundle.start_bundle(bundle_spec, env={**os.environ, **token_env})
stack.callback(_bundle.stop_bundle, plan.slug)
# 4. Discover the host-side ports docker assigned for the
# bundle's published container ports, and bind the
# agent's URLs to `<loopback_ip>:<host port>`. Docker
# container IPs (192.168.x.x in the daemon's bridge)
# aren't reachable from the smolvm guest on macOS — TSI
# uses macOS networking, and macOS sees the daemon's
# bridge via the published-port loopback forward only.
#
# Proxy hop order matches the docker backend: when the
# bottle declares egress routes, the agent's first hop is
# egress (for token injection), then pipelock. Without
# routes, the agent dials pipelock directly. Whichever
# one is "agent-facing" is the daemon whose port we
# publish on host loopback; the other stays bundle-
# internal as the upstream proxy.
if plan.egress_plan.routes:
agent_facing_port = _EGRESS_PORT
else:
agent_facing_port = _PIPELOCK_PORT
agent_facing_host_port = _bundle.bundle_host_port(
plan.slug, agent_facing_port, host_ip=loopback_ip,
)
agent_proxy_url = f"http://{loopback_ip}:{agent_facing_host_port}"
agent_git_gate_host = ""
if plan.git_gate_plan.upstreams:
git_gate_host_port = _bundle.bundle_host_port(
plan.slug, _GIT_GATE_PORT, host_ip=loopback_ip,
)
agent_git_gate_host = f"{loopback_ip}:{git_gate_host_port}"
agent_supervise_url = ""
if plan.supervise_plan is not None:
supervise_host_port = _bundle.bundle_host_port(
plan.slug, _SUPERVISE_PORT, host_ip=loopback_ip,
)
agent_supervise_url = f"http://{loopback_ip}:{supervise_host_port}/"
# Stamp the URLs onto the plan + guest_env. provision_git
# and provision_supervise read the plan fields; the agent
# reads guest_env on every exec_claude.
#
# NO_PROXY has to include the per-bottle loopback alias —
# otherwise claude's HTTPS_PROXY catches direct calls to
# the supervise URL (`http://<alias>:<port>/`) and proxies
# them through egress, which has no route for the alias
# and rejects with "Failed to connect". The git-gate URL
# uses git://, not affected by HTTP_PROXY, so the alias
# only has to be in NO_PROXY for the MCP / supervise
# path. Append rather than overwrite so prepare.py's
# `localhost,127.0.0.1` baseline stays in place.
existing_no_proxy = plan.guest_env.get("NO_PROXY", "localhost,127.0.0.1")
guest_env = {
**plan.guest_env,
"HTTPS_PROXY": agent_proxy_url,
"HTTP_PROXY": agent_proxy_url,
"NO_PROXY": f"{existing_no_proxy},{loopback_ip}",
}
if agent_git_gate_host:
guest_env["GIT_GATE_URL"] = f"git://{agent_git_gate_host}"
if agent_supervise_url:
guest_env["MCP_SUPERVISE_URL"] = agent_supervise_url
plan = dataclasses.replace(
plan,
guest_env=guest_env,
agent_proxy_url=agent_proxy_url,
agent_git_gate_host=agent_git_gate_host,
agent_supervise_url=agent_supervise_url,
)
# 5. Build the agent image and pack it into a
# `.smolmachine` artifact (or hit the per-Dockerfile-digest
# cache). Runs here, not in prepare, so the docker-build
# output doesn't garble the dashboard's preflight modal:
# both the curses-endwin path and the tmux pane-routing
# path redirect stderr around `launch` already.
agent_from_path = _ensure_smolmachine(
plan.agent_image_ref,
dockerfile=plan.agent_dockerfile_path,
)
# smolvm VM. --from carries the pre-packed .smolmachine
# artifact; --allow-cidr + -e carry the per-bottle TSI
# allowlist + env. The allowlist is the per-bottle
# loopback alias — narrowing it to one /32 keeps the
# agent from reaching other host loopback services or
# other bottles' published ports. Smolfile isn't usable
# here — smolvm 0.8.0 makes `--from` and `--smolfile`
# mutually exclusive.
_smolvm.machine_create(
plan.machine_name,
from_path=agent_from_path,
allow_cidrs=[f"{loopback_ip}/32"],
env=plan.guest_env,
)
stack.callback(_smolvm.machine_delete, plan.machine_name)
# Workaround smolvm 0.8.0: `--allow-cidr` is silently
# dropped when combined with `--from`. Patch the persisted
# state DB to set the allowlist before start so the booted
# VM's TSI actually enforces. See loopback_alias's module
# docstring for the investigation that led here.
_loopback.force_allowlist(plan.machine_name, [f"{loopback_ip}/32"])
_smolvm.machine_start(plan.machine_name)
stack.callback(_smolvm.machine_stop, plan.machine_name)
# 6. Repair filesystem ownership + perms that smolvm's
# pack process remapped to the host invoker's uid (501
# on macOS) rather than preserving the image's expected
# ownership.
#
# - /home/node → node:node so the node user can write
# its own dotfiles (claude appendFileSync on
# ~/.claude.json otherwise bails with ENOENT/EPERM
# and the TUI hangs without surfacing the error).
# - /tmp + /var/tmp → root:root mode 1777 so non-root
# processes can create their per-uid scratch dirs
# (claude-code creates /tmp/claude-<uid>/ as soon as
# it spawns a Bash tool call).
#
# All folded into one sh -c so we only pay one
# machine_exec round trip — back-to-back exec calls
# right after machine_start hit a SIGKILL race in
# libkrun's exec channel (see provision_ca for the
# other half of this same workaround).
_smolvm.machine_exec(plan.machine_name, [
"sh", "-c",
"chown -R node:node /home/node && "
"chown root:root /tmp /var/tmp && "
"chmod 1777 /tmp /var/tmp",
])
# Wait briefly for the VM to settle. Back-to-back smolvm
# machine_exec calls immediately after machine_start
# occasionally SIGKILL the in-VM child at ~100ms (looks
# like a VM warm-up race in libkrun's exec channel).
# 1.5s is empirically enough to dodge it; provisioning
# already takes seconds so the wait is amortized.
time.sleep(1.5)
# 7. Provision (CA / prompt / skills / git / supervise).
prompt_path = provision(plan, plan.machine_name)
yield SmolmachinesBottle(
plan.machine_name,
prompt_path=prompt_path,
guest_env=plan.guest_env,
agent_command=plan.agent_command,
agent_prompt_mode=plan.agent_prompt_mode,
)
finally:
stack.close()
def _bundle_launch_spec(
plan: SmolmachinesBottlePlan, network: str, loopback_ip: str,
) -> _bundle.BundleLaunchSpec:
"""Build a BundleLaunchSpec from the resolved inner Plans.
Daemons in the CSV:
- egress + pipelock are always present (pipelock is the
agent's first hop; egress is its upstream).
- git-gate is conditional on plan.git_gate_plan.upstreams.
- supervise is conditional on plan.supervise_plan.
Env + volumes are the union of the four daemons' needs, with
daemon-private values only (HTTPS_PROXY is scoped to the
egress process by egress_entrypoint.sh see PRD 0024's bundle
bind-address PR)."""
daemons: list[str] = ["egress", "pipelock"]
env: list[str] = []
volumes: list[tuple[str, str, bool]] = []
# In this Docker-Desktop-compatible topology, whichever daemon
# is "agent-facing" gets its port published on the host
# loopback (see `_ensure_smolmachine`'s discovery loop) and the
# other stays bundle-internal. The bundle is NOT reachable by
# bridge IP from the smolvm guest, so the
# PRD-0023-chunk-3 EGRESS_LISTEN_HOST=127.0.0.1 mitigation
# isn't needed: the agent can only dial whatever daemon's
# host port we publish, period.
# --- pipelock ---------------------------------------------
pp = plan.proxy_plan
volumes += [
(str(pp.yaml_path), "/etc/pipelock.yaml", True),
(str(pp.ca_cert_host_path), PIPELOCK_CA_CERT_IN_CONTAINER, True),
(str(pp.ca_key_host_path), PIPELOCK_CA_KEY_IN_CONTAINER, True),
]
# --- egress -----------------------------------------------
ep = plan.egress_plan
if ep.routes:
env.append(f"EGRESS_UPSTREAM_PROXY={ep.pipelock_proxy_url}")
env.append(f"EGRESS_UPSTREAM_CA={EGRESS_PIPELOCK_CA_IN_CONTAINER}")
volumes += [
(str(ep.routes_path), EGRESS_ROUTES_IN_CONTAINER, True),
(str(ep.mitmproxy_ca_host_path), EGRESS_CA_IN_CONTAINER, True),
(str(ep.pipelock_ca_host_path), EGRESS_PIPELOCK_CA_IN_CONTAINER, True),
]
# Bare-name entries for upstream-token slots. Their values
# come from the docker-run subprocess env (inherited from
# the operator's shell), never landing on argv.
for token_env in sorted(ep.token_env_map.keys()):
env.append(token_env)
# --- git-gate ---------------------------------------------
extra_hosts: list[str] = []
gp = plan.git_gate_plan
if gp.upstreams:
daemons.append("git-gate")
volumes += [
(str(gp.entrypoint_script), GIT_GATE_ENTRYPOINT_IN_CONTAINER, True),
(str(gp.hook_script), GIT_GATE_HOOK_IN_CONTAINER, True),
(str(gp.access_hook_script), GIT_GATE_ACCESS_HOOK_IN_CONTAINER, True),
]
for u in gp.upstreams:
keypath = expand_tilde(u.identity_file)
volumes.append((
keypath,
f"{GIT_GATE_CREDS_DIR_IN_CONTAINER}/{u.name}-key",
True,
))
# --- supervise --------------------------------------------
sp = plan.supervise_plan
if sp is not None:
daemons.append("supervise")
env += [
f"SUPERVISE_BOTTLE_SLUG={plan.slug}",
f"SUPERVISE_QUEUE_DIR={QUEUE_DIR_IN_CONTAINER}",
f"SUPERVISE_PORT={SUPERVISE_PORT}",
]
volumes.append((str(sp.queue_dir), QUEUE_DIR_IN_CONTAINER, False))
# Container ports the agent reaches from the smolvm guest —
# published on host loopback so the guest can dial via TSI +
# macOS networking. The HTTP/HTTPS chokepoint is whichever
# daemon's port we publish: egress when routes are declared
# (token injection first, then forwards to bundle-internal
# pipelock), pipelock otherwise.
if ep.routes:
ports_to_publish: list[int] = [_EGRESS_PORT]
else:
ports_to_publish = [_PIPELOCK_PORT]
if gp.upstreams:
ports_to_publish.append(_GIT_GATE_PORT)
if sp is not None:
ports_to_publish.append(_SUPERVISE_PORT)
return _bundle.BundleLaunchSpec(
slug=plan.slug,
network_name=network,
subnet=plan.bundle_subnet,
gateway=plan.bundle_gateway,
bundle_ip=plan.bundle_ip,
daemons_csv=",".join(daemons),
environment=tuple(env),
volumes=tuple(volumes),
ports_to_publish=tuple(ports_to_publish),
publish_host_ip=loopback_ip,
)
def _resolve_token_env(
plan: SmolmachinesBottlePlan, host_env: object
) -> dict[str, str]:
"""Resolve the egress token env-var values from the host's
environ so they reach the bundle's process env via docker's
`-e NAME` inheritance. Empty when no routes declare auth."""
ep = plan.egress_plan
if not ep.routes:
return {}
return egress_resolve_token_values(ep.token_env_map, dict(host_env))
def _ensure_smolmachine(image_ref: str, *, dockerfile: str = "") -> Path:
"""Build the agent docker image and convert it into a
`.smolmachine` artifact, caching the result under
`~/.cache/bot-bottle/smolmachines/` keyed by the docker image
ID (so a Dockerfile change automatically invalidates the cache).
Returns the `.smolmachine.smolmachine` sidecar path that's
the file `machine create --from` consumes (pack create produces
a launcher binary at `.smolmachine` plus the sidecar alongside
it; the sidecar is the actual artifact).
Conversion path: `docker build` (the existing layer cache
makes no-change rebuilds cheap) `docker save` to a tarball
spin up an ephemeral registry on a private docker network
`crane push --insecure` from a one-shot container on the same
network `smolvm pack create --image localhost:<host port>/...`
tear down the registry + network. The crane push detour
sidesteps the Docker-Desktop daemon's HTTPS preference for
non-loopback registries see the `local_registry` module
docstring for the gory details.
Each pack-create costs several seconds even on a hot cache,
so we skip the whole pipeline when the cached sidecar is
already on disk for this image ID."""
_SMOLMACHINE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
docker_mod.build_image(image_ref, _REPO_DIR, dockerfile=dockerfile)
# `sha256:abcd...` -> `abcd...` first 16 chars: short enough to
# keep filenames manageable, long enough to make collisions
# astronomically unlikely.
digest = docker_mod.image_id(image_ref).split(":", 1)[-1][:16]
binary = _SMOLMACHINE_CACHE_DIR / f"{digest}.smolmachine"
sidecar = _SMOLMACHINE_CACHE_DIR / f"{digest}.smolmachine.smolmachine"
if sidecar.is_file():
return sidecar
tarball = _SMOLMACHINE_CACHE_DIR / f"{digest}.image.tar"
docker_mod.save(image_ref, str(tarball))
try:
with ephemeral_registry() as handle:
push_ref = f"{handle.push_endpoint}/bot-bottle:{digest}"
pack_ref = f"{handle.pull_endpoint}/bot-bottle:{digest}"
crane_push_tarball(handle, str(tarball), push_ref)
_smolvm.pack_create(pack_ref, binary)
finally:
# Tarball is ~500MB-1GB for the agent image; reclaim once
# the smolmachine artifact exists. The artifact itself is
# the long-lived cache entry.
tarball.unlink(missing_ok=True)
return sidecar
@@ -0,0 +1,234 @@
"""Ephemeral local OCI registry for the smolmachines agent-image
conversion path (PRD 0023 chunk 4c).
`smolvm pack create --image <ref>` only accepts OCI registry refs
it can't read the local docker daemon's image cache, an OCI
layout directory, or a `docker save` tarball. To convert the
agent's Dockerfile-built image into a `.smolmachine` artifact we
spin up a short-lived `registry:2.8.3` container alongside a
`crane` helper container on a private docker network, push via
`crane push --insecure <tarball> <registry-container>:5000/...`,
and let smolvm pull from the registry's published host port. The
network + both containers are torn down after the pack completes.
Why this two-container dance instead of plain `docker push`:
- Docker Desktop's daemon runs in its own Linux VM, so its
`localhost` is not the host's loopback. A registry bound to
the host's 127.0.0.1 is unreachable from the daemon side.
- `host.docker.internal` is reachable from the daemon but isn't
in Docker's default insecure-registries CIDRs (only `::1/128`
and `127.0.0.0/8` are), so `docker push` to it tries HTTPS,
hits a plain-HTTP registry, and dies with
`http: server gave HTTP response to HTTPS client`. Adding
`host.docker.internal` to daemon.json works but is a one-time
manual step the user has to do in Docker Desktop's UI.
- Going through a docker network sidesteps the host-vs-daemon
loopback mismatch (crane and registry containers see each
other on the network) AND the HTTPS preference (crane has an
`--insecure` flag that forces plain HTTP).
The registry is also published on a random host port so smolvm
a host process can pull from `localhost:<port>` via Docker's
port-forward. smolvm's bundled crane auto-falls-back to HTTP for
localhost addresses, so no insecure-registries config is needed
on that side either."""
from __future__ import annotations
import os
import socket
import subprocess
import time
import uuid
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Iterator
from ...log import die
# registry:2.8.3, pinned by digest. Same env-override pattern as the
# pipelock image pin in bot_bottle/backend/docker/pipelock.py.
REGISTRY_IMAGE = os.environ.get(
"BOT_BOTTLE_REGISTRY_IMAGE",
"registry@sha256:a3d8aaa63ed8681a604f1dea0aa03f100d5895b6a58ace528858a7b332415373",
)
# gcr.io/go-containerregistry/crane:latest, pinned by digest. ~10MB,
# stable upstream from Google; we only invoke `crane push --insecure`
# against a localhost-equivalent registry, so the trust surface is
# narrow.
CRANE_IMAGE = os.environ.get(
"BOT_BOTTLE_CRANE_IMAGE",
"gcr.io/go-containerregistry/crane@sha256:0ae17ecb34315aa7cbff28f6eddee3b7adae0b2f90101260d990804db1eb0084",
)
# Internal port the registry binds to inside its container — fixed
# by the registry:2 image. The host-side mapping is random.
_REGISTRY_CONTAINER_PORT = "5000"
# How long to wait for the registry's HTTP layer to bind before
# giving up. Two seconds is empirically enough; 10s leaves headroom
# for slow CI runners without making the failure mode chatty.
_READY_TIMEOUT_S = 10.0
@dataclass(frozen=True)
class RegistryHandle:
"""Everything callers need to push to + pull from the ephemeral
registry.
`network` is the per-session docker network a `crane push`
container has to join it to reach the registry by name.
`push_endpoint` is the `<host>:<port>` form to embed in image
refs given to the crane push container (resolves via docker
network DNS). `pull_endpoint` is the `<host>:<port>` form a
host process (smolvm) uses; the registry's host port mapping
backs this."""
network: str
push_endpoint: str
pull_endpoint: str
@contextmanager
def ephemeral_registry() -> Iterator[RegistryHandle]:
"""Bring up a per-session docker network + a `registry:2.8.3`
container on it (published on a random host port), yield a
`RegistryHandle`, force-remove both on exit.
The container is started with `--rm` so a clean exit cleans up
on its own; the `finally` block force-removes on abnormal exit
(the calling process crashes between yield and close)."""
session_id = uuid.uuid4().hex[:12]
network = f"bot-bottle-registry-net-{session_id}"
registry_name = f"bot-bottle-registry-{session_id}"
subprocess.run(
["docker", "network", "create", network],
check=True,
capture_output=True,
)
try:
subprocess.run(
[
"docker", "run", "-d", "--rm",
"--name", registry_name,
"--network", network,
# `-p :5000` (no IP prefix) binds the container's
# port 5000 on a random host port across all
# interfaces. The host side reaches the registry
# via this port — smolvm's `pack create` pulls from
# `localhost:<port>` and the docker port-forward
# routes there.
"-p", _REGISTRY_CONTAINER_PORT,
REGISTRY_IMAGE,
],
check=True,
capture_output=True,
)
try:
port = _host_port(registry_name)
_wait_ready(port)
yield RegistryHandle(
network=network,
push_endpoint=f"{registry_name}:{_REGISTRY_CONTAINER_PORT}",
pull_endpoint=f"localhost:{port}",
)
finally:
subprocess.run(
["docker", "rm", "-f", registry_name],
check=False,
capture_output=True,
)
finally:
subprocess.run(
["docker", "network", "rm", network],
check=False,
capture_output=True,
)
def crane_push_tarball(handle: RegistryHandle, tarball_path: str, ref: str) -> None:
"""Run `crane push --insecure <tarball> <ref>` inside a one-shot
container on the registry's docker network. `ref` should
reference the registry by `handle.push_endpoint` so the crane
container resolves it via docker network DNS.
Doesn't go through `docker push` to avoid the Docker-Desktop
daemon's HTTPS preference for non-loopback hostnames — crane's
`--insecure` flag forces plain HTTP, which is what the
registry container speaks."""
r = subprocess.run(
[
"docker", "run", "--rm",
"--network", handle.network,
"-v", f"{tarball_path}:/img.tar:ro",
CRANE_IMAGE,
"push", "--insecure", "/img.tar", ref,
],
capture_output=True,
text=True,
check=False,
)
if r.returncode != 0:
die(
f"crane push of {tarball_path!r} to {ref!r} failed: "
f"{(r.stderr or r.stdout or '').strip() or '<no output>'}"
)
def _host_port(name: str) -> int:
"""Resolve the host-side port docker mapped to the registry's
container port. `docker port <name> 5000/tcp` returns one or
more `host:port` lines (one per address family) we take the
first."""
r = subprocess.run(
["docker", "port", name, f"{_REGISTRY_CONTAINER_PORT}/tcp"],
capture_output=True,
text=True,
check=False,
)
if r.returncode != 0:
die(
f"docker port {name} {_REGISTRY_CONTAINER_PORT}/tcp failed: "
f"{(r.stderr or '').strip() or '<no stderr>'}"
)
# `0.0.0.0:54321\n[::]:54321\n` — split on the last colon to
# handle either IPv4 or IPv6 host syntax.
line = (r.stdout or "").splitlines()[0].strip()
_, _, port_str = line.rpartition(":")
try:
return int(port_str)
except ValueError:
die(f"unexpected `docker port` output: {line!r}")
return -1 # unreachable; die() never returns
def _wait_ready(port: int) -> None:
"""Block until the registry's HTTP layer accepts a TCP
connection on `127.0.0.1:<port>`, or `_READY_TIMEOUT_S`
elapses.
A successful TCP connect is sufficient registry:2.8.3 binds
after it's ready to serve `/v2/` requests, so the push that
follows will land on a working server. We probe loopback
specifically (not via the docker network) because this helper
runs on the host."""
deadline = time.monotonic() + _READY_TIMEOUT_S
last_err: Exception | None = None
while time.monotonic() < deadline:
try:
with socket.create_connection(("127.0.0.1", port), timeout=0.5):
return
except OSError as e:
last_err = e
time.sleep(0.1)
die(
f"local registry on 127.0.0.1:{port} did not accept "
f"connections within {_READY_TIMEOUT_S:.0f}s "
f"(last error: {last_err})"
)
@@ -0,0 +1,254 @@
"""Per-bottle loopback alias allocation + TSI allowlist
enforcement (PRD 0023, follow-up to PR #74).
After the pivot to host-loopback port-forwards, the smolmachines
TSI allowlist was `127.0.0.1/32` which meant the agent VM could
reach **any** service bound to macOS's loopback, not just the
bundle's published ports. Real downgrade from the docker
backend's `--internal` network isolation.
This module narrows the allowlist by allocating each bottle a
unique loopback alias (`127.0.0.16` .. `127.0.0.31`). The
bundle's port-forwards bind to that alias, and the alias's /32
is what TSI allows.
**Smolvm 0.8.0 quirk + workaround.** `smolvm machine create
--from <smolmachine> --net --allow-cidr X/32` silently drops the
flag verified empirically that the agent process's allowlist
ends up `null` in smolvm's persistent state DB (`~/Library/
Application Support/smolvm/server/smolvm.db`, `vms` table,
`data` BLOB), and the booted VM reaches all of `127.0.0.0/8`
regardless of what we passed. Workaround: after machine_create,
open the SQLite DB and patch the row's `allowed_cidrs` field
directly. Smolvm reads the DB at machine_start, so the patched
value takes effect on boot. Tested: enforcement is real the
guest's connect to a non-allowlisted IP fails with `Permission
denied`. Other paths we tried (machine update, stop-edit-
agent.config.json-restart, --smolfile, --image localhost:N/...)
were dead ends.
macOS only configures `127.0.0.1` on `lo0` by default; the
additional aliases require `sudo ifconfig lo0 alias`. We lazily
sudo-add the missing pool on first use per boot the aliases
persist on `lo0` until reboot, so subsequent launches don't
prompt.
Linux native daemons share the host's network namespace; the
whole `127.0.0.0/8` is reachable by default and aliases are
unnecessary. The pool logic detects native-Linux and skips sudo
entirely; the DB patch is also gated on macOS.
Allocation is coordinated by inspecting running bundle
containers' published host IPs — each bottle's bundle owns the
alias appearing in its port bindings. The lowest-numbered free
alias gets handed to a new bottle."""
from __future__ import annotations
import json
import os
import platform
import re
import sqlite3
import subprocess
from pathlib import Path
from typing import Iterable
from ...log import die, info
# smolvm's persistent VM state on macOS — a SQLite DB whose `vms`
# table holds one JSON BLOB per machine. The Linux path is
# different, but smolmachines is macOS-only in v1 (PRD 0023) so
# we hard-code this. If the file moves under us we'll see a
# clear FileNotFoundError; not worth defensive cross-platform
# detection until the backend actually needs Linux.
_SMOLVM_DB_PATH = (
Path.home()
/ "Library"
/ "Application Support"
/ "smolvm"
/ "server"
/ "smolvm.db"
)
# Sixteen aliases by default. Tunable for hosts that want more
# concurrent bottles (each bottle reserves one alias for its
# bundle bringup). The range is chosen to avoid the reserved
# 127.0.0.1/2/3 ports (1 is the default, 2 is sometimes used by
# CUPS, 3 by other macOS services) and stay well clear of
# 127.0.0.53 (systemd-resolved) and 127.0.0.54 (libvirt).
_POOL_START = 16
_POOL_END = 31 # inclusive
# Loopback aliases pool: 127.0.0.<start>..127.0.0.<end>.
def _pool_addresses() -> list[str]:
return [f"127.0.0.{i}" for i in range(_POOL_START, _POOL_END + 1)]
def _is_macos() -> bool:
return platform.system() == "Darwin"
def ensure_pool() -> None:
"""Make sure each address in the pool is up on `lo0`. Lazily
runs `sudo ifconfig lo0 alias <ip>/32 up` for missing entries
(sudo prompts once, then the aliases persist on lo0 until
reboot). No-op on non-macOS hosts."""
if not _is_macos():
return
missing = [ip for ip in _pool_addresses() if not _alias_present(ip)]
if not missing:
return
info(
f"smolmachines needs {len(missing)} loopback alias(es) on lo0 "
f"({', '.join(missing[:3])}{', ...' if len(missing) > 3 else ''}) "
f"to scope per-bottle TSI allowlists. sudo will prompt once; "
f"aliases persist until reboot."
)
for ip in missing:
result = subprocess.run(
["sudo", "-p", "bot-bottle (loopback alias): ",
"ifconfig", "lo0", "alias", f"{ip}/32", "up"],
check=False,
)
if result.returncode != 0:
die(
f"sudo ifconfig lo0 alias {ip} failed (exit "
f"{result.returncode}). Re-run with sudo available, "
f"or add manually: sudo ifconfig lo0 alias {ip}/32 up"
)
def force_allowlist(machine_name: str, allowed_cidrs: list[str]) -> None:
"""Patch smolvm's persistent VM-state DB to set the machine's
`allowed_cidrs` to the given list. Workaround for smolvm
0.8.0's silent-drop of `--allow-cidr` when used with `--from`.
Must run AFTER `smolvm machine create` (the row has to
exist) and BEFORE `smolvm machine start` (smolvm reads the
row on start; in-flight VMs don't pick up changes). Once
smolvm honors the CLI flag upstream this whole function is
redundant flag-respecting create + remove this call from
launch.
No-op on non-macOS the DB path differs and the Linux
smolmachines code path isn't exercised in v1."""
if not _is_macos():
return
if not _SMOLVM_DB_PATH.is_file():
die(
f"smolvm state DB not found at {_SMOLVM_DB_PATH}. "
f"smolvm 0.8.0 expected? `smolvm --version` to check."
)
con = sqlite3.connect(str(_SMOLVM_DB_PATH))
try:
cur = con.cursor()
row = cur.execute(
"SELECT data FROM vms WHERE name = ?", (machine_name,),
).fetchone()
if row is None:
die(
f"smolvm DB has no row for machine {machine_name!r}"
f"machine_create must run before force_allowlist."
)
cfg = json.loads(row[0])
cfg["allowed_cidrs"] = list(allowed_cidrs)
# Write as BLOB (the column type smolvm uses) — passing a
# plain str makes sqlite store it as Text and smolvm then
# fails to read it.
cur.execute(
"UPDATE vms SET data = ? WHERE name = ?",
(sqlite3.Binary(json.dumps(cfg).encode()), machine_name),
)
con.commit()
finally:
con.close()
def allocate(slug: str) -> str:
"""Pick the lowest-numbered alias from the pool not already
in use by a running smolmachines bundle. Bails when the pool
is exhausted the caller should report the limit to the
operator. `slug` is logged for traceability; not otherwise
used (no on-disk reservation, allocation is purely
docker-state-driven).
On non-macOS the whole `127.0.0.0/8` is loopback by default;
`127.0.0.1` is fine to share and we skip the alias dance.
This still returns a deterministic address so launch.py's
callers don't have to branch on platform."""
if not _is_macos():
return "127.0.0.1"
in_use = _aliases_in_use()
for ip in _pool_addresses():
if ip not in in_use:
return ip
die(
f"smolmachines loopback alias pool exhausted "
f"({_POOL_END - _POOL_START + 1} aliases, all in use). "
f"Stop a running bottle (`smolvm machine ls --json`) or "
f"raise _POOL_END in loopback_alias.py."
)
return "" # unreachable; die() never returns
def _alias_present(ip: str) -> bool:
"""True iff `ifconfig lo0` shows `<ip>` as an inet address.
Exact-match `127.0.0.1` shouldn't match `127.0.0.16`."""
result = subprocess.run(
["/sbin/ifconfig", "lo0"],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
return False
pattern = re.compile(rf"\binet {re.escape(ip)}\b")
return bool(pattern.search(result.stdout or ""))
def _aliases_in_use() -> set[str]:
"""Aliases already bound by another smolmachines bundle's
published-port mappings. We inspect every container whose
name matches the smolmachines bundle prefix and pull the
`HostIp` out of its port bindings."""
result = subprocess.run(
["docker", "ps", "--format", "{{.Names}}",
"--filter", "name=bot-bottle-sidecars-"],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
return set()
names = [n.strip() for n in (result.stdout or "").splitlines() if n.strip()]
in_use: set[str] = set()
for name in names:
in_use.update(_host_ips_for_container(name))
return in_use
def _host_ips_for_container(name: str) -> Iterable[str]:
"""Yield the `HostIp` values across all port bindings on
container `name`. A bundle binds three or four ports and
they all share the same HostIp, so callers can take any."""
result = subprocess.run(
["docker", "inspect", name,
"--format", "{{json .HostConfig.PortBindings}}"],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
return ()
try:
bindings = json.loads(result.stdout or "{}")
except json.JSONDecodeError:
return ()
seen: set[str] = set()
for _port, mappings in (bindings or {}).items():
for m in mappings or []:
host_ip = m.get("HostIp") or ""
if host_ip:
seen.add(host_ip)
return seen
__all__ = ["allocate", "ensure_pool", "force_allowlist"]
+192
View File
@@ -0,0 +1,192 @@
"""smolmachines `_resolve_plan` (PRD 0023 chunks 2d + 4c).
Resolves the per-bottle docker subnet + bundle IP and assembles
the guest env. The agent's docker image build → smolmachine
pack pipeline runs in `launch.launch`, not here, so the
dashboard's preflight modal isn't garbled by docker-build output
before the operator has confirmed.
No VM bringup that's `launch.launch`'s job."""
from __future__ import annotations
import os
from datetime import datetime, timezone
from pathlib import Path
from ...agent_provider import runtime_for
from ...backend import BottleSpec
from ...backend.docker.bottle_state import (
BottleMetadata,
agent_state_dir,
bottle_identity,
egress_state_dir,
git_gate_state_dir,
pipelock_state_dir,
supervise_state_dir,
write_metadata,
)
from ...egress import Egress
from ...git_gate import GitGate
from ...pipelock import PipelockProxy
from ...supervise import Supervise
from .bottle_plan import SmolmachinesBottlePlan
from .util import smolmachines_bundle_subnet, smolmachines_preflight
# Gateway ports the bundle exposes inside its container — pipelock
# HTTPS proxy, git-gate's git-daemon, supervise's MCP. The agent
# inside the smolvm guest dials these on the bundle's pinned IP.
_BUNDLE_PIPELOCK_PORT = 8888
_BUNDLE_GIT_GATE_PORT = 9418
_BUNDLE_SUPERVISE_PORT = 9100
def resolve_plan(
spec: BottleSpec, *, stage_dir: Path
) -> SmolmachinesBottlePlan:
"""Materialize the smolmachines plan. The bundle's docker
subnet + pinned IP are derived from the slug; the agent's
`.smolmachine` artifact is built (or cache-hit) here so
launch's `machine create --from` boots without a registry
pull. Per-bottle guest env + the TSI allow_cidrs land on the
plan for launch to pass straight through to
`machine create` flags."""
smolmachines_preflight()
manifest = spec.manifest
bottle = manifest.bottle_for(spec.agent_name)
provider = bottle.agent_provider
provider_runtime = runtime_for(provider.template)
slug = spec.identity or bottle_identity(spec.agent_name)
# Record minimal metadata so `cli.py resume` can recover the
# slug. Same schema as the docker backend.
write_metadata(BottleMetadata(
identity=slug,
agent_name=spec.agent_name,
cwd=spec.user_cwd if spec.copy_cwd else "",
copy_cwd=spec.copy_cwd,
started_at=datetime.now(timezone.utc).isoformat(),
# No compose project for smolmachines bottles; chunk 4
# will give dashboard discovery a backend-specific path.
compose_project="",
))
subnet, gateway, bundle_ip = smolmachines_bundle_subnet(slug)
# Agent's env: the prepare-time view doesn't yet know the
# host loopback ports the bundle's daemons get published on
# (those come from docker AFTER `docker run` returns), so
# HTTPS_PROXY / GIT_GATE_URL / MCP_SUPERVISE_URL are
# populated in launch.py and stamped onto guest_env there.
# What we set here is the part that doesn't depend on
# bundle bringup — bottle.env literals, the empty-NO_PROXY
# safe default, and the TLS trust env trio
# (NODE_EXTRA_CA_CERTS / SSL_CERT_FILE / REQUESTS_CA_BUNDLE)
# pointing at Debian's update-ca-certificates output bundle.
guest_env: dict[str, str] = {
**bottle.env,
"NO_PROXY": "localhost,127.0.0.1",
"NODE_EXTRA_CA_CERTS": "/etc/ssl/certs/ca-certificates.crt",
"SSL_CERT_FILE": "/etc/ssl/certs/ca-certificates.crt",
"REQUESTS_CA_BUNDLE": "/etc/ssl/certs/ca-certificates.crt",
}
# Inner Plans for the four bundle daemons. The ABCs are
# platform-neutral — `.prepare()` writes config files + returns
# a Plan dataclass with no backend-specific assumptions. State
# dirs are still keyed by slug under the docker backend's
# bottle_state layout (shared on-host convention; not a docker
# dependency).
pipelock_dir = pipelock_state_dir(slug)
pipelock_dir.mkdir(parents=True, exist_ok=True)
proxy_plan = PipelockProxy().prepare(bottle, slug, pipelock_dir)
git_gate_dir = git_gate_state_dir(slug)
git_gate_dir.mkdir(parents=True, exist_ok=True)
git_gate_plan = GitGate().prepare(bottle, slug, git_gate_dir)
egress_dir = egress_state_dir(slug)
egress_dir.mkdir(parents=True, exist_ok=True)
egress_plan = Egress().prepare(bottle, slug, egress_dir)
# Claude-code refuses to start without *something* it
# recognises as a credential. When the bottle has an egress
# route carrying the `claude_code_oauth` role marker, egress
# strips + re-injects the real Authorization header on the
# outbound leg using a token held in egress's own environ — so
# the agent gets a non-secret placeholder here (matches the
# docker backend's forwarded_env logic in
# bot_bottle/backend/docker/prepare.py).
has_provider_auth = any(
provider_runtime.auth_role in r.roles for r in egress_plan.routes
)
if has_provider_auth:
guest_env[provider_runtime.placeholder_env] = "egress-placeholder"
if provider.template == "claude" and has_provider_auth:
guest_env.setdefault("CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC", "1")
guest_env.setdefault("DISABLE_ERROR_REPORTING", "1")
supervise_plan = None
if bottle.supervise:
supervise_dir = supervise_state_dir(slug)
supervise_dir.mkdir(parents=True, exist_ok=True)
supervise_plan = Supervise().prepare(slug, supervise_dir)
# Prompt file is always written (mode 0o600) so the in-VM
# path always exists. Content is the agent's `prompt`
# field (markdown body) — empty for agents with no prompt.
# claude-code reads it via --append-system-prompt-file only
# when non-empty, but the file must exist either way to
# match the docker backend's contract.
agent_dir = agent_state_dir(slug)
agent_dir.mkdir(parents=True, exist_ok=True)
prompt_file = agent_dir / "prompt.txt"
agent = manifest.agents[spec.agent_name]
prompt_file.write_text(agent.prompt or "")
prompt_file.chmod(0o600)
machine_name = f"bot-bottle-{slug}"
# Stash the agent image ref — `launch.launch` runs the
# build → pack pipeline at bringup. Honors BOT_BOTTLE_IMAGE
# to match the docker backend's `resolve_plan` default.
agent_dockerfile_path = ""
if provider.dockerfile:
agent_dockerfile_path = _resolve_manifest_dockerfile(provider.dockerfile, spec)
image_default = f"bot-bottle-{provider.template}:{slug}"
elif provider_runtime.dockerfile:
agent_dockerfile_path = provider_runtime.dockerfile
image_default = provider_runtime.image
else:
image_default = provider_runtime.image
agent_image_ref = os.environ.get("BOT_BOTTLE_IMAGE", image_default)
return SmolmachinesBottlePlan(
spec=spec,
stage_dir=stage_dir,
slug=slug,
bundle_subnet=subnet,
bundle_gateway=gateway,
bundle_ip=bundle_ip,
machine_name=machine_name,
agent_image_ref=agent_image_ref,
guest_env=guest_env,
prompt_file=prompt_file,
proxy_plan=proxy_plan,
git_gate_plan=git_gate_plan,
egress_plan=egress_plan,
supervise_plan=supervise_plan,
agent_command=provider_runtime.command,
agent_prompt_mode=provider_runtime.prompt_mode,
agent_provider_template=provider.template,
agent_dockerfile_path=agent_dockerfile_path,
)
def _resolve_manifest_dockerfile(path_value: str, spec: BottleSpec) -> str:
path = Path(os.path.expanduser(path_value))
if not path.is_absolute():
path = Path(spec.user_cwd) / path
return str(path)
@@ -0,0 +1,14 @@
"""Provisioning helpers for the smolmachines backend (PRD 0023
chunk 4).
Each method maps onto one of `BottleBackend`'s `provision_*`
overrides. They run after the VM is up + the bundle is reachable
and copy host-side state (prompt, skills, .git, CA cert,
supervise MCP config) into the guest via `smolvm machine cp` /
`smolvm machine exec`.
Chunk 4a ships `provision_prompt` and `provision_skills` the
two that don't depend on agent-image tooling (claude-code,
update-ca-certificates) beyond `cp` and `mkdir`. provision_ca /
provision_git / provision_supervise land once the agent-image
gap is solved."""
@@ -0,0 +1,104 @@
"""Install the per-bottle MITM CA into the smolmachines guest's
trust store (PRD 0023 chunk 4d).
Mirrors `backend.docker.provision.ca`: select the right CA (egress
when the bottle has routes, else pipelock), `smolvm machine cp` it
to Debian's `/usr/local/share/ca-certificates/` path,
`update-ca-certificates` to rebuild the trust bundle, and log the
fingerprint once. The selected cert depends on the agent's
HTTP_PROXY target same logic as the docker backend, since the
agent dials the same daemons through the same bundle.
`smolvm machine exec` runs commands as root in the VM (no `-u`
flag exists; the VM init is root), so we don't need the explicit
`-u 0` the docker backend uses on its `docker exec` calls."""
from __future__ import annotations
import hashlib
import ssl
from pathlib import Path
from ....log import die, info
from ...docker.provision.ca import AGENT_CA_BUNDLE, AGENT_CA_PATH
from .. import smolvm as _smolvm
from ..bottle_plan import SmolmachinesBottlePlan
def _select_ca_cert(plan: SmolmachinesBottlePlan) -> tuple[Path, str]:
"""Pick the CA cert (and a short label for the log line) that
matches the proxy the agent's HTTP_PROXY points at. Egress-proxy
wins when the bottle declares any routes; else pipelock.
The launch step minted both CAs (pipelock always; egress when
routes are declared) and stored their host paths back into the
inner Plans via `dataclasses.replace`. If those paths are empty
here something has gone wrong in launch's bringup."""
if plan.egress_plan.routes:
cert = plan.egress_plan.mitmproxy_ca_cert_only_host_path
if cert == Path() or not cert.is_file():
die(
f"egress CA cert missing at {cert or '(empty)'}; "
f"launch must have called egress_tls_init and "
f"re-bound the plan before provision"
)
return cert, "egress"
cert = plan.proxy_plan.ca_cert_host_path
if not cert or not cert.is_file():
die(
f"pipelock CA cert missing at {cert or '(empty)'}; "
f"launch must have called pipelock_tls_init and re-bound "
f"the plan before provision"
)
return cert, "pipelock"
def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
"""Copy the agent-facing CA cert into the guest, rebuild the
trust bundle, emit a one-line fingerprint log. Called from
`BottleBackend.provision` after the smolvm guest is up."""
cert_host_path, label = _select_ca_cert(plan)
_smolvm.machine_cp(str(cert_host_path), f"{target}:{AGENT_CA_PATH}")
# Mode 0644 — readable to non-root tools in the guest.
# update-ca-certificates rebuilds the bundle at AGENT_CA_BUNDLE,
# which is what curl / Python ssl / OpenSSL-based tools read by
# default. The env trio (NODE_EXTRA_CA_CERTS / SSL_CERT_FILE /
# REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python
# `requests` / libraries that don't load the system bundle.
#
# chown + chmod + update-ca-certificates run in one
# `sh -c` so we only pay one machine_exec round trip; the
# `&&` chaining surfaces the first failure as the return
# code.
r = _smolvm.machine_exec(target, [
"sh", "-c",
f"chown root:root {AGENT_CA_PATH} && "
f"chmod 644 {AGENT_CA_PATH} && "
f"update-ca-certificates",
])
if r.returncode != 0 or "1 added" not in (r.stdout or ""):
# update-ca-certificates not adding our cert is fatal —
# claude-code's TLS handshake against the egress-MITM'd
# api.anthropic.com would fail downstream. Bail early
# with what we can see (output is captured by smolvm so
# we can surface it).
die(
f"update-ca-certificates didn't add the agent CA "
f"(exit {r.returncode}): "
f"stdout={(r.stdout or '').strip()!r} "
f"stderr={(r.stderr or '').strip()!r}"
)
# Stdlib SHA-256 of the cert's DER bytes — the standard
# fingerprint form. Never the private key.
der = ssl.PEM_cert_to_DER_cert(cert_host_path.read_text())
fingerprint = hashlib.sha256(der).hexdigest()
info(f"{label} ca fingerprint: sha256:{fingerprint[:32]}...")
# Re-exported for the launch/provision_ca caller + tests. The path
# constants come from the docker module because they're tied to
# Debian's `update-ca-certificates` layout — same in both backends
# since both guest images are Debian-family.
__all__ = ["AGENT_CA_BUNDLE", "AGENT_CA_PATH", "provision_ca"]
@@ -0,0 +1,141 @@
"""Git provisioning inside a running smolmachines bottle
(PRD 0023 chunk 4d).
Three concerns, all about git in the agent:
1. If --cwd was passed AND the host cwd has a .git, copy that
.git into /home/node/workspace/.git so the agent operates on
the user's repo.
2. If the bottle declares `git` entries (PRD 0008), write a
~/.gitconfig with insteadOf rules so every git operation
against a declared upstream transparently hits the per-bottle
git-gate. The gate mirrors the upstream in both directions,
so URL rewriting is symmetric.
3. If the bottle declares `git.user` (issue #86), set
`git config --global user.{name,email}` inside the guest so
the agent's commits are attributed to that identity.
Differs from `backend.docker.provision.git` in one address detail:
the TSI-allowlisted guest can only reach the bundle's pinned IP
(no DNS resolver in the /32 allowlist), so the insteadOf URLs
are `git://<bundle_ip>:<port>/<name>.git` rather than the
docker backend's `git://git-gate/<name>.git`. The render itself
is the shared `git_gate_render_gitconfig` on the platform-neutral
git_gate module."""
from __future__ import annotations
import os
import tempfile
from pathlib import Path
from ....git_gate import git_gate_render_gitconfig
from ....log import info
from .. import smolvm as _smolvm
from ..bottle_plan import SmolmachinesBottlePlan
# `node` is the agent user from the repo Dockerfile. Override via
# BOT_BOTTLE_GUEST_HOME mirrors the docker backend's
# BOT_BOTTLE_CONTAINER_HOME knob — same purpose, different
# transport.
_DEFAULT_GUEST_HOME = "/home/node"
def _guest_home() -> str:
return os.environ.get("BOT_BOTTLE_GUEST_HOME", _DEFAULT_GUEST_HOME)
def provision_git(plan: SmolmachinesBottlePlan, target: str) -> None:
"""Set up git inside the guest. Runs all three subcases; each
no-ops when its condition isn't met."""
_provision_cwd_git(plan, target)
_provision_git_gate_config(plan, target)
_provision_git_user(plan, target)
def _provision_cwd_git(plan: SmolmachinesBottlePlan, target: str) -> None:
"""If --cwd was set and the host cwd has a .git directory, copy
it into <guest_home>/workspace/.git and fix ownership. No-op
otherwise."""
if not (plan.spec.copy_cwd and Path(plan.spec.user_cwd, ".git").is_dir()):
return
guest_workspace_git = f"{_guest_home()}/workspace/.git"
info(f"copying {plan.spec.user_cwd}/.git -> {target}:{guest_workspace_git}")
# mkdir -p the workspace dir so `machine cp` lands the .git
# directly there even on first-time bottles.
_smolvm.machine_exec(target, ["mkdir", "-p", f"{_guest_home()}/workspace"])
_smolvm.machine_cp(
f"{plan.spec.user_cwd}/.git", f"{target}:{guest_workspace_git}",
)
# `machine cp` lands files as root; the agent runs as node so
# the workspace tree must be chowned over.
_smolvm.machine_exec(
target, ["chown", "-R", "node:node", guest_workspace_git],
)
def _provision_git_gate_config(plan: SmolmachinesBottlePlan, target: str) -> None:
"""Write ~/.gitconfig in the guest with the git-gate insteadOf
rules. No-op when the bottle has no `git` entries."""
bottle = plan.spec.manifest.bottle_for(plan.spec.agent_name)
if not bottle.git:
return
# `127.0.0.1:<host port>` form: the bundle's git-gate port
# is published on host loopback at launch time so the
# smolvm guest (which can only reach macOS networking via
# TSI, not the docker bridge IP) can dial it. launch.py
# populates `plan.agent_git_gate_host` after bundle bringup.
content = git_gate_render_gitconfig(bottle.git, plan.agent_git_gate_host)
guest_gitconfig = f"{_guest_home()}/.gitconfig"
# Stage the file under the plan's stage_dir so `machine cp`
# has a stable host path. The plan's stage_dir is cleaned up
# by start.py's session-end teardown.
with tempfile.NamedTemporaryFile(
"w", dir=str(plan.stage_dir), prefix="gitconfig.",
delete=False,
) as f:
f.write(content)
config_file = Path(f.name)
os.chmod(config_file, 0o600)
info(f"writing {guest_gitconfig} with {len(bottle.git)} insteadOf rule(s)")
_smolvm.machine_cp(str(config_file), f"{target}:{guest_gitconfig}")
_smolvm.machine_exec(target, ["chown", "node:node", guest_gitconfig])
_smolvm.machine_exec(target, ["chmod", "644", guest_gitconfig])
def _provision_git_user(
plan: SmolmachinesBottlePlan, target: str,
) -> None:
"""Apply `git config --global user.{name,email}` inside the
guest as the node user so --global lands in the same
`/home/node/.gitconfig` that `_provision_git_gate_config`
writes to. No-op when the bottle didn't declare `git.user`.
Runs via `runuser -u node --`; HOME is forced via smolvm's
`-e` flag because runuser (without -l) inherits root's
HOME=/root, which would put --global in the wrong file."""
bottle = plan.spec.manifest.bottle_for(plan.spec.agent_name)
gu = bottle.git_user
if gu.is_empty():
return
env = {"HOME": _guest_home(), "USER": "node"}
if gu.name:
info(f"git config --global user.name = {gu.name!r}")
_smolvm.machine_exec(
target,
["runuser", "-u", "node", "--",
"git", "config", "--global", "user.name", gu.name],
env=env,
)
if gu.email:
info(f"git config --global user.email = {gu.email!r}")
_smolvm.machine_exec(
target,
["runuser", "-u", "node", "--",
"git", "config", "--global", "user.email", gu.email],
env=env,
)
@@ -0,0 +1,42 @@
"""Copy the agent prompt into a running smolmachines bottle.
The prompt file is always copied (so the in-guest path always
exists) but `--append-system-prompt-file` only fires when the
agent actually has a prompt the return value signals which
case, mirroring the docker backend's contract.
`smolvm machine cp` lands files as root inside the VM; the claude
process runs as `node`, so we chown + chmod the prompt after the
copy. Same flow as the docker backend's provision_prompt."""
from __future__ import annotations
import os
from .. import smolvm as _smolvm
from ..bottle_plan import SmolmachinesBottlePlan
# `node` is the agent user from the repo Dockerfile.
# BOT_BOTTLE_GUEST_HOME mirrors the docker backend's
# BOT_BOTTLE_CONTAINER_HOME knob.
_DEFAULT_GUEST_HOME = "/home/node"
def provision_prompt(plan: SmolmachinesBottlePlan, target: str) -> str | None:
"""Copy the prompt file into the running smolvm guest, fix
ownership/mode. Returns the in-guest path if the agent has a
non-empty prompt (drives --append-system-prompt-file), else
None. The file is copied either way so the path always
exists mirrors the docker backend's behavior."""
guest_home = os.environ.get("BOT_BOTTLE_GUEST_HOME", _DEFAULT_GUEST_HOME)
in_guest_prompt_path = f"{guest_home}/.bot-bottle-prompt.txt"
_smolvm.machine_cp(str(plan.prompt_file), f"{target}:{in_guest_prompt_path}")
# machine cp lands as root, source's 0o600 mode is preserved —
# node can't read its own prompt without these two.
_smolvm.machine_exec(target, ["chown", "node:node", in_guest_prompt_path])
_smolvm.machine_exec(target, ["chmod", "600", in_guest_prompt_path])
agent = plan.spec.manifest.agents[plan.spec.agent_name]
return in_guest_prompt_path if agent.prompt else None
@@ -0,0 +1,63 @@
"""Copy host-side skill directories into a running smolmachines
bottle.
Skills are validated on the host before launch by
`BottleBackend._validate_skills`; this module assumes that
validation has already run. A skill that disappears between
validation and copy still dies loudly rather than silently
producing a partial guest."""
from __future__ import annotations
import os
from ....log import die, info
from ...util import host_skill_dir
from .. import smolvm as _smolvm
from ..bottle_plan import SmolmachinesBottlePlan
# In-guest path mirrors the docker backend's claude-skills
# convention (~/.claude/skills/<name>/) under the node user's
# home — same path as the real bot-bottle image's
# /home/node/.claude/skills (pre-created in the Dockerfile).
_DEFAULT_SKILLS_DIR = "/home/node/.claude/skills"
def provision_skills(plan: SmolmachinesBottlePlan, target: str) -> None:
"""Copy each of the agent's named skills from the host's
~/.claude/skills/<name>/ into the guest's equivalent path.
For each skill: `mkdir -p` the destination, `smolvm machine cp`
the host source dir over, then chown the result to node:node so
the agent can read it. No-op when the agent has no skills.
smolvm machine cp on a directory copies recursively (same
semantics as `cp -r`); unlike docker cp's trailing-slash
convention, smolvm doesn't need the `/.` suffix dance.
machine cp lands files as root inside the VM, so we chown each
skill tree over to node:node after the copy same pattern as
the docker backend's provision_prompt."""
agent = plan.spec.manifest.agents[plan.spec.agent_name]
if not agent.skills:
return
skills_dir = os.environ.get(
"BOT_BOTTLE_GUEST_SKILLS_DIR", _DEFAULT_SKILLS_DIR,
)
_smolvm.machine_exec(target, ["mkdir", "-p", skills_dir])
for name in agent.skills:
src = host_skill_dir(name)
if not os.path.isdir(src):
die(
f"skill {name!r} disappeared from host between "
f"validation and copy at {src}."
)
dst = f"{skills_dir}/{name}"
info(f"copying skill {name} into {target}:{dst}")
# Wipe any prior copy so re-runs don't accumulate.
_smolvm.machine_exec(target, ["rm", "-rf", dst])
_smolvm.machine_cp(src, f"{target}:{dst}")
_smolvm.machine_exec(target, ["chown", "-R", "node:node", dst])
@@ -0,0 +1,67 @@
"""Supervise sidecar provisioning inside a running smolmachines
bottle (PRD 0023 chunk 4d; PRD 0013 supervise plane).
Registers the per-bottle supervise sidecar as an HTTP MCP server
in the agent's claude-code config so the agent discovers the
stuck-recovery MCP tools (pipelock-block, capability-block) at
startup.
Mirrors `backend.docker.provision.supervise` same `claude mcp
add` call, just dispatched via `smolvm machine exec` instead of
`docker exec`, and against `<bundle_ip>:<port>` instead of the
short `supervise` alias (no DNS in the TSI-allowlisted guest)."""
from __future__ import annotations
from ....log import info, warn
from .. import smolvm as _smolvm
from ..bottle_plan import SmolmachinesBottlePlan
_SUPERVISE_MCP_NAME = "supervise"
def provision_supervise(plan: SmolmachinesBottlePlan, target: str) -> None:
"""Run `claude mcp add` inside the guest to register the
supervise sidecar in claude-code's user config. No-op when
bottle.supervise is False.
The URL is the agent-side endpoint launch.py populated after
bundle bringup `http://127.0.0.1:<host port>/` rather than
the bundle's docker bridge IP, because that bridge isn't
reachable from the smolvm guest on macOS.
Failure is logged but not fatal: the bottle still works (you
just can't call supervise tools from the agent until the entry
is added manually). The operator sees the warning at launch."""
if plan.supervise_plan is None:
return
url = plan.agent_supervise_url
info(f"registering supervise MCP server in agent claude config → {url}")
# `claude mcp add --scope user` writes to ~/.claude.json. The
# agent is the `node` user; smolvm machine_exec runs as root
# by default, so we have to switch user explicitly and set
# HOME so the config lands in /home/node/.claude.json (where
# the agent's claude actually reads it from).
r = _smolvm.machine_exec(
target,
[
"runuser", "-u", "node", "--",
"env", "HOME=/home/node",
"claude", "mcp", "add",
"--scope", "user",
"--transport", "http",
_SUPERVISE_MCP_NAME,
url,
],
)
if r.returncode != 0:
warn(
f"`claude mcp add supervise` failed (exit {r.returncode}): "
f"{(r.stderr or r.stdout or '').strip()}. Inside the bottle, "
f"register manually with: "
f"claude mcp add --scope user --transport http supervise {url}"
)
__all__ = ["provision_supervise"]
@@ -0,0 +1,149 @@
"""Host-side SIGWINCH → in-VM PTY resize bridge (issue #82).
smolvm 0.8.0 `machine exec -t` allocates an in-VM PTY but never
forwards the host terminal's window size (TIOCSWINSZ) to it. The
PTY's initial size is `0 0`, and any host-side resize during the
session goes unnoticed the in-VM claude TUI keeps rendering for
whatever (typically tiny) box it last saw, ignoring the operator's
tmux pane resize. `docker exec -it` does this forwarding
automatically; smolvm doesn't.
This module wraps `smolvm machine exec` with a thin parent
process that:
1. Spawns the original argv as a child (it gets the inherited
TTY, so claude's stdin/stdout/stderr work unchanged).
2. On startup + every host SIGWINCH, reads the host terminal
size via TIOCGWINSZ on stdin (or stderr if stdin isn't a
TTY tmux respawn-pane gives us a TTY on stdout/stderr)
and pushes it into the VM with a side-channel
`smolvm machine exec -- sh -c 'for f in /dev/pts/*; do
stty -F $f cols X rows Y; done'`. The kernel delivers
SIGWINCH to the foreground process group on the slave end
automatically, so claude picks up the new size without
extra signalling.
3. Waits on the child and exits with its returncode.
The dashboard's tmux pane respawn calls `bottle.claude_argv`
which now prepends `[sys.executable, -m, ..., <machine>, --, ...]`
to the smolvm argv. Foreground handoff (curses endwin
subprocess.run) goes through the same path so behavior is
identical.
Removable once smolvm grows native SIGWINCH forwarding (upstream
follow-up tracked separately)."""
from __future__ import annotations
import fcntl
import signal
import struct
import subprocess
import sys
import termios
import threading
# How long to wait after the main exec starts before pushing the
# initial size. Concurrent `smolvm machine exec` invocations race
# libkrun's per-exec OCI config write during the main exec's
# bringup window; the side-channel firing immediately corrupts
# `config.json` and the main exec dies with SIGKILL (rc=137) or
# libkrun's "parse error: trailing garbage" depending on
# scheduling. Two seconds is well past the bringup window on a
# warm VM, well under the operator's "this is unresponsive"
# threshold, and short enough that claude's initial render
# almost always fires after the size has been set.
_STARTUP_SYNC_DELAY_SEC = 2.0
def _read_winsize() -> tuple[int, int] | None:
"""Return `(rows, cols)` from whichever of stdin / stdout /
stderr is a TTY, or None if none are. Different invocation
surfaces give us different TTYs:
- foreground handoff (curses endwin subprocess.run): all
three are the operator's terminal.
- tmux respawn-pane: tmux sets all three to the pane's PTY.
- non-TTY (someone piped stdin in tests): none are; the
sync just no-ops, which is the right behavior."""
for fd in (sys.stdin.fileno(), sys.stdout.fileno(), sys.stderr.fileno()):
try:
data = fcntl.ioctl(fd, termios.TIOCGWINSZ, b"\x00" * 8)
except OSError:
continue
rows, cols, _, _ = struct.unpack("hhhh", data)
if rows > 0 and cols > 0:
return rows, cols
return None
def _push_size(machine: str, rows: int, cols: int) -> None:
"""Side-channel `smolvm machine exec` that sets the size of
every PTY in the VM. The shell `for` loop covers the case of
multiple concurrent interactive sessions (rare but cheap to
handle); `stty -F` returns silently on PTYs that don't apply.
Best-effort: swallow failures. A failed resize doesn't break
the session it just leaves the in-VM PTY at its old size.
`stdin=DEVNULL` is load-bearing: under tmux, inheriting the
pane PTY here means two concurrent smolvm processes (this one
and the agent session the wrapper is shepherding) share the
PTY's foreground-process-group / input plumbing, and smolvm
bails with an internal config-parse error or SIGKILL within
~100ms of the side-channel firing. Outside tmux the same
pattern survived, presumably because iTerm's PTY plumbing is
more forgiving than tmux's, but the DEVNULL is the right
default either way the side-channel never needs stdin."""
subprocess.run(
["smolvm", "machine", "exec", "--name", machine, "--",
"sh", "-c",
f"for f in /dev/pts/*; do "
f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; "
f"done"],
stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
check=False,
)
def main(argv: list[str]) -> int:
"""Entry point. `argv` shape: `<machine> -- <smolvm-argv...>`.
We don't use argparse — the `--` separator is the contract and
everything past it is forwarded verbatim. Keeps the wrapper
transparent for callers building argv programmatically."""
if len(argv) < 3 or argv[1] != "--":
sys.stderr.write(
"usage: python -m bot_bottle.backend.smolmachines.pty_resize "
"<machine> -- <smolvm-argv...>\n"
)
return 2
machine = argv[0]
inner = argv[2:]
def sync(*_args) -> None:
size = _read_winsize()
if size is None:
return
_push_size(machine, *size)
signal.signal(signal.SIGWINCH, sync)
proc = subprocess.Popen(inner)
# Initial sync is deferred — see _STARTUP_SYNC_DELAY_SEC.
# daemon=True so the timer doesn't block exit when the child
# finishes before the delay elapses.
timer = threading.Timer(_STARTUP_SYNC_DELAY_SEC, sync)
timer.daemon = True
timer.start()
while True:
try:
return proc.wait()
except KeyboardInterrupt:
proc.send_signal(signal.SIGINT)
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
@@ -0,0 +1,221 @@
"""Per-bottle sidecar bundle bringup for the smolmachines backend
(PRD 0023).
Two docker resources per bottle live here:
- **A dedicated bridge network**, subnet derived from the slug.
The bundle container gets a pinned IP at `<subnet>.2` so the
smolvm guest's TSI allowlist (`<bundle-ip>/32`) has a stable
target. Without pinning, we'd have to inspect the container's
assigned IP after start and feed it back into the Smolfile
a race we can sidestep with `--ip`.
- **The bundle container itself**, running the PRD 0024 bundle
image (`bot-bottle-sidecars:latest` by default). Same
image, same daemons, same daemon-private env / bind-mounts
as the docker backend.
This module ships the lifecycle primitives only create
network, start bundle, stop bundle, remove network wrapped
around `subprocess.run(["docker", ...])`. Wiring them into the
launch flow + populating the `BundleLaunchSpec` from the inner
Plans (PipelockProxyPlan, EgressPlan, ) lands in chunk 2d."""
from __future__ import annotations
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
from typing import Sequence
from ...log import die, warn
from ..docker.sidecar_bundle import SIDECAR_BUNDLE_IMAGE
def bundle_network_name(slug: str) -> str:
"""`bot-bottle-bundle-<slug>` — distinct from the docker
backend's `bot-bottle-net-<slug>` so a smolmachines bottle
and a docker bottle for the same agent don't collide on
network name."""
return f"bot-bottle-bundle-{slug}"
def bundle_container_name(slug: str) -> str:
"""`bot-bottle-sidecars-<slug>` — same name shape the docker
backend uses for the bundle (PRD 0024 chunk 5). The dashboard's
prefix-based discovery covers both backends with one filter."""
return f"bot-bottle-sidecars-{slug}"
@dataclass(frozen=True)
class BundleLaunchSpec:
"""Everything `start_bundle` needs to bring up one bundle
container. Populated by chunk-2d's launch flow from the inner
Plans the prepare step already produces."""
slug: str
network_name: str
subnet: str
gateway: str
bundle_ip: str
image: str = SIDECAR_BUNDLE_IMAGE
# Daemon subset CSV for BOT_BOTTLE_SIDECAR_DAEMONS. The
# supervisor inside the bundle reads it to skip
# bottle-irrelevant daemons (e.g. supervise=False bottles).
daemons_csv: str = "egress,pipelock"
# Plain "KEY=VALUE" strings + "KEY" bare names (the bare-name
# form inherits the value from the docker-run subprocess env,
# matching the docker backend's compose-up secret-forwarding
# pattern).
environment: Sequence[str] = field(default_factory=tuple)
# (host_path, container_path, read_only) bind mounts.
volumes: Sequence[tuple[str, str, bool]] = field(default_factory=tuple)
# Container ports to publish on `publish_host_ip`, random
# host-side port per entry. The smolvm guest's TSI talks via
# macOS networking, so docker container IPs (192.168.x.x in
# the daemon's bridge) aren't directly reachable from the
# guest — host-loopback port-forwards are. Egress's port
# is bundle-internal and never published.
ports_to_publish: Sequence[int] = field(default_factory=tuple)
# Loopback IP to bind published ports against. Per-bottle
# loopback aliases (`127.0.0.16` etc., added via sudo
# ifconfig lo0 alias) narrow the TSI allowlist so a bottle
# can't reach other bottles' (or other host services') ports
# via 127.0.0.1.
publish_host_ip: str = "127.0.0.1"
def create_bundle_network(network_name: str, subnet: str, gateway: str) -> None:
"""`docker network create` with an explicit subnet + gateway
so the bundle's `--ip` lands on the address the Smolfile's
TSI allowlist points at. Idempotent on the caller's side —
`start_bundle` catches the "network exists" error and treats
it as success (chunk-2d teardown is paired with each create).
"""
result = subprocess.run(
["docker", "network", "create",
"--subnet", subnet, "--gateway", gateway,
network_name],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
# Already-exists is fine on a resume path; everything else
# is fatal — the bundle won't have an addressable network.
if "already exists" in (result.stderr or "").lower():
return
die(
f"docker network create {network_name} failed: "
f"{(result.stderr or '').strip()}"
)
def remove_bundle_network(network_name: str) -> None:
"""Idempotent: a missing network returns success."""
result = subprocess.run(
["docker", "network", "rm", network_name],
capture_output=True, text=True, check=False,
)
if result.returncode == 0:
return
if "no such network" in (result.stderr or "").lower():
return
# Network with attached containers is the common non-fatal
# case during a partial teardown — warn but don't die.
warn(
f"docker network rm {network_name} failed: "
f"{(result.stderr or '').strip()}"
)
def start_bundle(spec: BundleLaunchSpec, *,
env: dict[str, str] | None = None) -> None:
"""Bring the bundle container up on the per-bottle bridge with
the pinned IP. Argv is built deterministically from `spec`;
`env` is the host subprocess env (forwarded values for any
bare-name entries in `spec.environment`)."""
container = bundle_container_name(spec.slug)
argv = [
"docker", "run",
"--name", container,
"--detach",
"--rm",
"--network", spec.network_name,
"--ip", spec.bundle_ip,
"-e", f"BOT_BOTTLE_SIDECAR_DAEMONS={spec.daemons_csv}",
]
for entry in spec.environment:
argv += ["-e", entry]
for host_path, container_path, read_only in spec.volumes:
suffix = ":ro" if read_only else ""
argv += ["-v", f"{host_path}:{container_path}{suffix}"]
# Loopback-only host port-forwards — the smolvm guest's TSI
# uses macOS networking, and macOS loopback is the only host
# surface that round-trips into Docker Desktop's daemon VM.
# Binds to the per-bottle alias so TSI's IP-only allowlist
# narrows reachability to this bottle's bundle only.
for port in spec.ports_to_publish:
argv += ["-p", f"{spec.publish_host_ip}::{port}"]
argv.append(spec.image)
result = subprocess.run(
argv, capture_output=True, text=True,
env=dict(env) if env is not None else None, check=False,
)
if result.returncode != 0:
die(
f"docker run for bundle {container} failed: "
f"{(result.stderr or '').strip()}"
)
def bundle_host_port(
slug: str, container_port: int, *, host_ip: str = "127.0.0.1",
) -> int:
"""`docker port <bundle> <container_port>/tcp` → the random
host-side port docker assigned for the binding on `host_ip`.
Called after `start_bundle` on each container port listed in
`BundleLaunchSpec.ports_to_publish` so the launch step can
build the agent's HTTPS_PROXY / GIT_GATE / SUPERVISE URLs in
`<host_ip>:<host port>` form."""
container = bundle_container_name(slug)
result = subprocess.run(
["docker", "port", container, f"{container_port}/tcp"],
capture_output=True, text=True, check=False,
)
if result.returncode != 0:
die(
f"docker port {container} {container_port}/tcp failed: "
f"{(result.stderr or '').strip() or '<no stderr>'}"
)
# Each line looks like `127.0.0.16:54321` — one per address
# family / host IP. Match on the expected host_ip prefix so
# bottles bound to per-bottle aliases pick the right line.
for raw in (result.stdout or "").splitlines():
line = raw.strip()
if line.startswith(f"{host_ip}:"):
_, _, port_str = line.rpartition(":")
try:
return int(port_str)
except ValueError:
die(f"unexpected `docker port` output: {line!r}")
die(
f"no port mapping on {host_ip} for {container} "
f"{container_port}/tcp; got: {(result.stdout or '').strip()!r}"
)
return -1 # unreachable; die() never returns
def stop_bundle(slug: str) -> None:
"""Idempotent: a missing container returns success."""
container = bundle_container_name(slug)
result = subprocess.run(
["docker", "rm", "-f", container],
capture_output=True, text=True, check=False,
)
if result.returncode == 0:
return
if "no such container" in (result.stderr or "").lower():
return
warn(
f"docker rm -f {container} failed: "
f"{(result.stderr or '').strip()}"
)
+217
View File
@@ -0,0 +1,217 @@
"""Thin subprocess wrapper around the `smolvm` CLI (PRD 0023).
One thin Python function per smolvm subcommand the launch flow
needs. Two design choices worth flagging:
- **No daemon, no SDK.** smolvm 0.8.0 ships a `smolvm serve`
HTTP API as the long-term-clean integration target. The
project's stdlib-first ethos + the lower-overhead CLI calls
push v1 to shell out via `subprocess.run`. If a future
smolvm release makes `serve` mandatory (or significantly
faster), revisit.
- **Two return shapes.** `SmolvmRunResult` (returncode + stdout
+ stderr captured) is returned by `machine_exec` because the
caller cares about the in-VM command's exit status, and by
test helpers that introspect output. The other calls
(`machine_start`, `machine_stop`, `pack_create`, etc.) raise
`SmolvmError` on non-zero exit failure to start a VM is
fatal to the launch flow, not something callers want to
branch on.
The wrapper is unit-tested with `subprocess.run` mocked; the
integration smoke test (chunk 2d) exercises against a real
smolvm binary."""
from __future__ import annotations
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Mapping, Sequence
_SMOLVM = "smolvm"
@dataclass(frozen=True)
class SmolvmRunResult:
"""Captured result of an in-VM command. Mirrors the structure
`Bottle.exec` returns so callers can hand it straight through."""
returncode: int
stdout: str
stderr: str
class SmolvmError(RuntimeError):
"""Raised when a smolvm subprocess returns non-zero on a path
where the caller has no useful branch to take (start failed,
pack failed, etc.). Carries the captured stderr for the
operator-facing log line."""
def __init__(self, argv: Sequence[str], result: subprocess.CompletedProcess):
self.argv = list(argv)
self.returncode = result.returncode
self.stdout = result.stdout
self.stderr = result.stderr
cmd = " ".join(self.argv)
super().__init__(
f"{cmd!r} failed (exit {result.returncode}): "
f"{(result.stderr or '').strip() or '<no stderr>'}"
)
def _smolvm(*args: str, env: Mapping[str, str] | None = None,
check: bool = True) -> subprocess.CompletedProcess:
"""One subprocess call into the smolvm CLI. `check=True`
raises SmolvmError on non-zero; `check=False` returns the
CompletedProcess for the caller to inspect."""
argv = [_SMOLVM, *args]
result = subprocess.run(
argv,
capture_output=True,
text=True,
env=dict(env) if env is not None else None,
check=False,
)
if check and result.returncode != 0:
raise SmolvmError(argv, result)
return result
# --- Pack ----------------------------------------------------------------
def pack_create(image: str, output: Path) -> None:
"""`smolvm pack create --image <image> -o <output>`. Converts
an OCI image into a self-contained `.smolmachine` artifact
smolvm can boot via `machine create --from`. Idempotent on the
smolvm side re-running with the same image+output rebuilds
from layer cache."""
_smolvm("pack", "create", "--image", image, "-o", str(output))
# --- Machine lifecycle ---------------------------------------------------
def machine_create(
name: str,
*,
image: str | None = None,
from_path: Path | None = None,
allow_cidrs: Sequence[str] = (),
env: Mapping[str, str] | None = None,
) -> None:
"""`smolvm machine create NAME [--image IMG | --from PATH]
[--allow-cidr CIDR ...] [-e K=V ...]`. NAME is positional
(the CLI's exception to the `--name` pattern other
subcommands use).
`image` (registry ref like `alpine:latest`) and `from_path`
(a `.smolmachine` artifact) are mutually exclusive one or
the other tells smolvm what to boot. The wrapper doesn't
enforce exclusivity; smolvm errors clearly enough.
`allow_cidrs` and `env` are passed as CLI flags instead of a
Smolfile because `--from` and `--smolfile` are themselves
mutually exclusive in smolvm 0.8.0 and we want `--from`'s
no-pull-at-start property. The flag form gives the same
result without the Smolfile complication.
`--net` is sent explicitly when `allow_cidrs` is non-empty.
smolvm 0.8.0's docs say `--allow-cidr` implies `--net`, but
empirically the implication only fires when no `--from` is
set `--from PATH --allow-cidr X/32` silently produces a
machine with `network: false` and no routes in the guest, so
the agent can't reach the bundle's pinned IP."""
args: list[str] = ["machine", "create"]
if image is not None:
args += ["--image", image]
if from_path is not None:
args += ["--from", str(from_path)]
if allow_cidrs:
args.append("--net")
for cidr in allow_cidrs:
args += ["--allow-cidr", cidr]
if env:
for k, v in env.items():
args += ["-e", f"{k}={v}"]
args.append(name)
_smolvm(*args)
def machine_start(name: str) -> None:
"""`smolvm machine start --name NAME`."""
_smolvm("machine", "start", "--name", name)
def machine_stop(name: str) -> None:
"""`smolvm machine stop --name NAME`. Idempotent against
already-stopped machines: smolvm prints a notice and exits 0
in that case, so no special handling here."""
_smolvm("machine", "stop", "--name", name)
def machine_delete(name: str) -> None:
"""`smolvm machine delete -f NAME`. NAME is positional. `-f`
skips the interactive confirmation required for
non-interactive teardown."""
_smolvm("machine", "delete", "-f", name)
def machine_exec(
name: str,
argv: Sequence[str],
*,
env: Mapping[str, str] | None = None,
workdir: str | None = None,
timeout: str | None = None,
) -> SmolvmRunResult:
"""`smolvm machine exec --name NAME [-w DIR] [--timeout DUR]
[-e K=V ...] -- ARGV...`. Returns the captured result rather
than raising callers (including `Bottle.exec`) care about
the in-VM command's exit code, not just whether smolvm ran.
`env` here is in-VM env vars (`-e K=V`), not the host
subprocess env smolvm's own argv carries them through the
VMM."""
flags: list[str] = ["machine", "exec", "--name", name]
if workdir is not None:
flags += ["-w", workdir]
if timeout is not None:
flags += ["--timeout", timeout]
if env:
for k, v in env.items():
flags += ["-e", f"{k}={v}"]
# `--` separator before the command. smolvm's CLI requires it
# so its own flag parser doesn't grab argv items that look
# like flags.
flags.append("--")
flags += list(argv)
result = _smolvm(*flags, check=False)
return SmolvmRunResult(
returncode=result.returncode,
stdout=result.stdout or "",
stderr=result.stderr or "",
)
def machine_cp(src: str, dst: str) -> None:
"""`smolvm machine cp SRC DST`. Path syntax: `machine:path` to
reference a path inside the VM, bare path for the host. Both
SRC and DST are positional; either side can be machine: or
bare. Empty path is a no-op (returns immediately without
invoking smolvm)."""
if not src or not dst:
return
_smolvm("machine", "cp", src, dst)
# --- Discovery -----------------------------------------------------------
def is_available() -> bool:
"""True iff `smolvm` is on PATH. Used by the integration test
suite's skip-guards."""
return shutil.which(_SMOLVM) is not None
+48
View File
@@ -0,0 +1,48 @@
"""Slug / preflight / subnet helpers for the smolmachines backend
(PRD 0023). Kept in its own module so the renderers can be
unit-tested without importing the docker subprocess paths."""
from __future__ import annotations
import hashlib
import shutil
from ...log import die
def smolmachines_preflight() -> None:
"""Ensure `smolvm` is on PATH before the launch flow runs.
Called from `_resolve_plan`; gives the operator a clear
install pointer rather than a cryptic FileNotFoundError
later. `gvproxy` is no longer required see the PRD's design
pivot section."""
if shutil.which("smolvm") is not None:
return
die(
"BOT_BOTTLE_BACKEND=smolmachines requires `smolvm` on "
"PATH. Install with: "
"curl -sSL https://smolmachines.com/install.sh | sh"
)
def smolmachines_bundle_subnet(slug: str) -> tuple[str, str, str]:
"""Derive a per-bottle docker subnet + gateway IP + bundle IP
from the slug.
Returns `(subnet_cidr, gateway_ip, bundle_ip)`. The third
octet comes from SHA-256 of the slug mod 254 (skipping 17 to
avoid the docker-default bridge), so parallel bottles get
distinct /24s and `resume` reuses the same /24. The bundle
container always lands at `.2`; gateway is `.1`; the smolvm
Smolfile's `allow_cidrs` is `<bundle_ip>/32`."""
digest = hashlib.sha256(slug.encode("utf-8")).digest()
octet = (digest[0] % 254) + 1
# Skip the docker-default bridge to dodge the most common
# collision (operators with `docker0` at 172.17.x.x or a
# 192.168.17.x VPN client).
if octet == 17:
octet = 18
subnet = f"192.168.{octet}.0/24"
gateway = f"192.168.{octet}.1"
bundle_ip = f"192.168.{octet}.2"
return subnet, gateway, bundle_ip
+18
View File
@@ -0,0 +1,18 @@
"""Cross-backend utility helpers — host-side primitives shared by
every backend implementation. Backend-specific helpers live one level
deeper (e.g. bot_bottle/backend/docker/util.py)."""
from __future__ import annotations
import os
from ..log import die
def host_skill_dir(name: str) -> str:
"""Return the host-side path for a named skill:
`$HOME/.claude/skills/<name>`. Dies if HOME is unset."""
home = os.environ.get("HOME")
if not home:
die("HOME not set")
return f"{home}/.claude/skills/{name}"