fix(smolmachines): retry CA install after exec SIGKILL
This commit is contained in:
@@ -15,6 +15,8 @@ flag exists; the VM init is root), so we don't need the explicit
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
|
||||
from ....log import die
|
||||
from ...util import (
|
||||
AGENT_CA_BUNDLE,
|
||||
@@ -26,6 +28,9 @@ from .. import smolvm as _smolvm
|
||||
from ..bottle_plan import SmolmachinesBottlePlan
|
||||
|
||||
|
||||
_SIGKILL_EXIT = 128 + 9
|
||||
|
||||
|
||||
def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
|
||||
"""Copy the agent-facing CA cert into the guest, rebuild the
|
||||
trust bundle, emit a one-line fingerprint log. Called from
|
||||
@@ -40,17 +45,16 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
|
||||
# REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python
|
||||
# `requests` / libraries that don't load the system bundle.
|
||||
#
|
||||
# chown + chmod + update-ca-certificates run in one
|
||||
# `sh -c` so we only pay one machine_exec round trip; the
|
||||
# `&&` chaining surfaces the first failure as the return
|
||||
# code.
|
||||
r = _smolvm.machine_exec(target, [
|
||||
"sh", "-c",
|
||||
f"chown root:root {AGENT_CA_PATH} && "
|
||||
f"chmod 644 {AGENT_CA_PATH} && "
|
||||
f"update-ca-certificates",
|
||||
])
|
||||
if r.returncode != 0 or "1 added" not in (r.stdout or ""):
|
||||
r = _install_ca(target)
|
||||
if r.returncode == _SIGKILL_EXIT:
|
||||
# smolvm/libkrun can SIGKILL an otherwise-normal exec
|
||||
# during early-VM provisioning. `update-ca-certificates`
|
||||
# is idempotent, so retry the same install once after a
|
||||
# short settle delay before treating it as fatal.
|
||||
time.sleep(1.0)
|
||||
r = _install_ca(target)
|
||||
|
||||
if r.returncode != 0:
|
||||
# update-ca-certificates not adding our cert is fatal —
|
||||
# claude-code's TLS handshake against the egress-MITM'd
|
||||
# api.anthropic.com would fail downstream. Bail early
|
||||
@@ -66,6 +70,23 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
|
||||
log_ca_fingerprint(cert_host_path, label)
|
||||
|
||||
|
||||
def _install_ca(target: str) -> _smolvm.SmolvmRunResult:
|
||||
# chown + chmod + update-ca-certificates + bundle
|
||||
# verification run in one `sh -c` so we only pay one
|
||||
# machine_exec round trip; the `&&` chaining surfaces the
|
||||
# first failure as the return code. The verify check is more
|
||||
# stable than requiring "1 added" in stdout: a retry after a
|
||||
# partially-completed first run may legitimately report "0
|
||||
# added" while the cert is already installed.
|
||||
return _smolvm.machine_exec(target, [
|
||||
"sh", "-c",
|
||||
f"chown root:root {AGENT_CA_PATH} && "
|
||||
f"chmod 644 {AGENT_CA_PATH} && "
|
||||
f"update-ca-certificates && "
|
||||
f"openssl verify -CAfile {AGENT_CA_BUNDLE} {AGENT_CA_PATH}",
|
||||
])
|
||||
|
||||
|
||||
# Re-exported for the launch/provision_ca caller + tests. The path
|
||||
# constants live in the shared `backend.util` (Debian's
|
||||
# `update-ca-certificates` layout is the same in both backends).
|
||||
|
||||
@@ -622,6 +622,26 @@ class TestProvisionCA(unittest.TestCase):
|
||||
"bot-bottle-demo-abc12:" + _ca.AGENT_CA_PATH,
|
||||
)
|
||||
|
||||
def test_retries_smolvm_sigkill_during_update_ca(self):
|
||||
plan = _plan(pipelock_ca_path=self.pipelock_ca)
|
||||
killed = SmolvmRunResult(
|
||||
returncode=137,
|
||||
stdout="Updating certificates in /etc/ssl/certs...\n",
|
||||
stderr="",
|
||||
)
|
||||
with patch(
|
||||
"bot_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp"
|
||||
), patch(
|
||||
"bot_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec",
|
||||
side_effect=[killed, self._UPDATE_OK],
|
||||
) as ex, patch(
|
||||
"bot_bottle.backend.smolmachines.provision.ca.time.sleep"
|
||||
) as sleep:
|
||||
_ca.provision_ca(plan, "bot-bottle-demo-abc12")
|
||||
|
||||
self.assertEqual(2, ex.call_count)
|
||||
sleep.assert_called_once_with(1.0)
|
||||
|
||||
def test_dies_when_selected_cert_missing(self):
|
||||
# Plan claims a pipelock cert at a path that doesn't exist —
|
||||
# something went wrong in launch's pipelock_tls_init.
|
||||
|
||||
Reference in New Issue
Block a user