fix(smolmachines): retry CA install after exec SIGKILL
test / unit (push) Successful in 38s
test / integration (push) Successful in 54s

This commit is contained in:
2026-06-01 23:27:21 -04:00
parent 36e3443d2e
commit 2dd8113f7c
2 changed files with 52 additions and 11 deletions
+32 -11
View File
@@ -15,6 +15,8 @@ flag exists; the VM init is root), so we don't need the explicit
from __future__ import annotations
import time
from ....log import die
from ...util import (
AGENT_CA_BUNDLE,
@@ -26,6 +28,9 @@ from .. import smolvm as _smolvm
from ..bottle_plan import SmolmachinesBottlePlan
# Exit status reported for an exec that was terminated by SIGKILL:
# 128 plus the signal number (SIGKILL is signal 9), i.e. 137.
_SIGKILL_EXIT = 128 + 9
def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
"""Copy the agent-facing CA cert into the guest, rebuild the
trust bundle, emit a one-line fingerprint log. Called from
@@ -40,17 +45,16 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
# REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python
# `requests` / libraries that don't load the system bundle.
#
# chown + chmod + update-ca-certificates run in one
# `sh -c` so we only pay one machine_exec round trip; the
# `&&` chaining surfaces the first failure as the return
# code.
r = _smolvm.machine_exec(target, [
"sh", "-c",
f"chown root:root {AGENT_CA_PATH} && "
f"chmod 644 {AGENT_CA_PATH} && "
f"update-ca-certificates",
])
if r.returncode != 0 or "1 added" not in (r.stdout or ""):
r = _install_ca(target)
if r.returncode == _SIGKILL_EXIT:
# smolvm/libkrun can SIGKILL an otherwise-normal exec
# during early-VM provisioning. `update-ca-certificates`
# is idempotent, so retry the same install once after a
# short settle delay before treating it as fatal.
time.sleep(1.0)
r = _install_ca(target)
if r.returncode != 0:
# update-ca-certificates not adding our cert is fatal —
# claude-code's TLS handshake against the egress-MITM'd
# api.anthropic.com would fail downstream. Bail early
@@ -66,6 +70,23 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
log_ca_fingerprint(cert_host_path, label)
def _install_ca(target: str) -> _smolvm.SmolvmRunResult:
    """Finalize the agent CA inside the guest using a single exec.

    Fixes ownership and mode on the cert, rebuilds the trust bundle,
    then verifies the cert against the rebuilt bundle. Every step is
    chained with `&&` inside one `sh -c`, so only one machine_exec
    round trip is paid and the first failing step decides the return
    code.

    Success is judged by the final verify step rather than by looking
    for "1 added" in stdout: a retry after a partially completed first
    run may legitimately report "0 added" even though the cert is
    already installed.

    Returns the result of the `machine_exec` call unchanged.
    """
    steps = (
        f"chown root:root {AGENT_CA_PATH}",
        f"chmod 644 {AGENT_CA_PATH}",
        "update-ca-certificates",
        f"openssl verify -CAfile {AGENT_CA_BUNDLE} {AGENT_CA_PATH}",
    )
    script = " && ".join(steps)
    return _smolvm.machine_exec(target, ["sh", "-c", script])
# Re-exported for the launch/provision_ca caller + tests. The path
# constants live in the shared `backend.util` (Debian's
# `update-ca-certificates` layout is the same in both backends).
+20
View File
@@ -622,6 +622,26 @@ class TestProvisionCA(unittest.TestCase):
"bot-bottle-demo-abc12:" + _ca.AGENT_CA_PATH,
)
def test_retries_smolvm_sigkill_during_update_ca(self):
    # The first exec comes back with 137 (128 + SIGKILL) and the
    # second returns the class's `_UPDATE_OK` result. provision_ca
    # is expected to sleep once and run the install a second time.
    module = "bot_bottle.backend.smolmachines.provision.ca"
    sigkilled = SmolvmRunResult(
        returncode=137,
        stdout="Updating certificates in /etc/ssl/certs...\n",
        stderr="",
    )
    exec_results = [sigkilled, self._UPDATE_OK]
    plan = _plan(pipelock_ca_path=self.pipelock_ca)

    with patch(
        "bot_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp"
    ), patch(
        "bot_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec",
        side_effect=exec_results,
    ) as exec_mock, patch(
        "bot_bottle.backend.smolmachines.provision.ca.time.sleep"
    ) as sleep_mock:
        _ca.provision_ca(plan, "bot-bottle-demo-abc12")

    # Exactly one retry, preceded by exactly one settle delay.
    self.assertEqual(2, exec_mock.call_count)
    sleep_mock.assert_called_once_with(1.0)
def test_dies_when_selected_cert_missing(self):
# Plan claims a pipelock cert at a path that doesn't exist —
# something went wrong in launch's pipelock_tls_init.