fix(smolmachines): retry CA install after exec SIGKILL
test / unit (push) Successful in 38s
test / integration (push) Successful in 54s

This commit is contained in:
2026-06-01 23:27:21 -04:00
parent 36e3443d2e
commit 2dd8113f7c
2 changed files with 52 additions and 11 deletions
+32 -11
View File
@@ -15,6 +15,8 @@ flag exists; the VM init is root), so we don't need the explicit
from __future__ import annotations from __future__ import annotations
import time
from ....log import die from ....log import die
from ...util import ( from ...util import (
AGENT_CA_BUNDLE, AGENT_CA_BUNDLE,
@@ -26,6 +28,9 @@ from .. import smolvm as _smolvm
from ..bottle_plan import SmolmachinesBottlePlan from ..bottle_plan import SmolmachinesBottlePlan
_SIGKILL_EXIT = 128 + 9
def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None: def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
"""Copy the agent-facing CA cert into the guest, rebuild the """Copy the agent-facing CA cert into the guest, rebuild the
trust bundle, emit a one-line fingerprint log. Called from trust bundle, emit a one-line fingerprint log. Called from
@@ -40,17 +45,16 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
# REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python # REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python
# `requests` / libraries that don't load the system bundle. # `requests` / libraries that don't load the system bundle.
# #
# chown + chmod + update-ca-certificates run in one r = _install_ca(target)
# `sh -c` so we only pay one machine_exec round trip; the if r.returncode == _SIGKILL_EXIT:
# `&&` chaining surfaces the first failure as the return # smolvm/libkrun can SIGKILL an otherwise-normal exec
# code. # during early-VM provisioning. `update-ca-certificates`
r = _smolvm.machine_exec(target, [ # is idempotent, so retry the same install once after a
"sh", "-c", # short settle delay before treating it as fatal.
f"chown root:root {AGENT_CA_PATH} && " time.sleep(1.0)
f"chmod 644 {AGENT_CA_PATH} && " r = _install_ca(target)
f"update-ca-certificates",
]) if r.returncode != 0:
if r.returncode != 0 or "1 added" not in (r.stdout or ""):
# update-ca-certificates not adding our cert is fatal — # update-ca-certificates not adding our cert is fatal —
# claude-code's TLS handshake against the egress-MITM'd # claude-code's TLS handshake against the egress-MITM'd
# api.anthropic.com would fail downstream. Bail early # api.anthropic.com would fail downstream. Bail early
@@ -66,6 +70,23 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
log_ca_fingerprint(cert_host_path, label) log_ca_fingerprint(cert_host_path, label)
def _install_ca(target: str) -> _smolvm.SmolvmRunResult:
    """Normalise the CA cert's ownership/mode in the guest, rebuild
    the trust bundle, and verify the cert against that bundle.

    Returns the `machine_exec` result for the combined command; the
    caller inspects `returncode` to decide whether to retry or die.
    """
    # Every step is joined with `&&` and sent through a single
    # `sh -c`, so there is only one machine_exec round trip and the
    # first failing step determines the return code.
    #
    # The trailing `openssl verify` replaces the older check for
    # "1 added" in stdout: when a retry follows a partially-completed
    # first run, update-ca-certificates may legitimately report
    # "0 added" even though the cert is already installed.
    steps = (
        f"chown root:root {AGENT_CA_PATH}",
        f"chmod 644 {AGENT_CA_PATH}",
        "update-ca-certificates",
        f"openssl verify -CAfile {AGENT_CA_BUNDLE} {AGENT_CA_PATH}",
    )
    script = " && ".join(steps)
    return _smolvm.machine_exec(target, ["sh", "-c", script])
# Re-exported for the launch/provision_ca caller + tests. The path # Re-exported for the launch/provision_ca caller + tests. The path
# constants live in the shared `backend.util` (Debian's # constants live in the shared `backend.util` (Debian's
# `update-ca-certificates` layout is the same in both backends). # `update-ca-certificates` layout is the same in both backends).
+20
View File
@@ -622,6 +622,26 @@ class TestProvisionCA(unittest.TestCase):
"bot-bottle-demo-abc12:" + _ca.AGENT_CA_PATH, "bot-bottle-demo-abc12:" + _ca.AGENT_CA_PATH,
) )
def test_retries_smolvm_sigkill_during_update_ca(self):
    # The first exec reports exit code 137 (128 + 9, i.e. SIGKILL);
    # the second returns the canned success result. provision_ca is
    # expected to sleep once and run the install a second time.
    ca_module = "bot_bottle.backend.smolmachines.provision.ca"
    plan = _plan(pipelock_ca_path=self.pipelock_ca)
    sigkilled = SmolvmRunResult(
        returncode=137,
        stdout="Updating certificates in /etc/ssl/certs...\n",
        stderr="",
    )
    with patch(ca_module + "._smolvm.machine_cp"):
        with patch(
            ca_module + "._smolvm.machine_exec",
            side_effect=[sigkilled, self._UPDATE_OK],
        ) as exec_mock:
            with patch(ca_module + ".time.sleep") as sleep_mock:
                _ca.provision_ca(plan, "bot-bottle-demo-abc12")
    self.assertEqual(2, exec_mock.call_count)
    sleep_mock.assert_called_once_with(1.0)
def test_dies_when_selected_cert_missing(self): def test_dies_when_selected_cert_missing(self):
# Plan claims a pipelock cert at a path that doesn't exist — # Plan claims a pipelock cert at a path that doesn't exist —
# something went wrong in launch's pipelock_tls_init. # something went wrong in launch's pipelock_tls_init.