fix(smolmachines): retry CA install after exec SIGKILL
This commit is contained in:
@@ -15,6 +15,8 @@ flag exists; the VM init is root), so we don't need the explicit
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
from ....log import die
|
from ....log import die
|
||||||
from ...util import (
|
from ...util import (
|
||||||
AGENT_CA_BUNDLE,
|
AGENT_CA_BUNDLE,
|
||||||
@@ -26,6 +28,9 @@ from .. import smolvm as _smolvm
|
|||||||
from ..bottle_plan import SmolmachinesBottlePlan
|
from ..bottle_plan import SmolmachinesBottlePlan
|
||||||
|
|
||||||
|
|
||||||
|
# Exit status that follows the conventional "128 + signal number" encoding
# for a process terminated by SIGKILL (signal 9), i.e. 137.
_SIGKILL_EXIT = 128 + 9
|
||||||
|
|
||||||
|
|
||||||
def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
|
def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
|
||||||
"""Copy the agent-facing CA cert into the guest, rebuild the
|
"""Copy the agent-facing CA cert into the guest, rebuild the
|
||||||
trust bundle, emit a one-line fingerprint log. Called from
|
trust bundle, emit a one-line fingerprint log. Called from
|
||||||
@@ -40,17 +45,16 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
|
|||||||
# REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python
|
# REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python
|
||||||
# `requests` / libraries that don't load the system bundle.
|
# `requests` / libraries that don't load the system bundle.
|
||||||
#
|
#
|
||||||
# chown + chmod + update-ca-certificates run in one
|
r = _install_ca(target)
|
||||||
# `sh -c` so we only pay one machine_exec round trip; the
|
if r.returncode == _SIGKILL_EXIT:
|
||||||
# `&&` chaining surfaces the first failure as the return
|
# smolvm/libkrun can SIGKILL an otherwise-normal exec
|
||||||
# code.
|
# during early-VM provisioning. `update-ca-certificates`
|
||||||
r = _smolvm.machine_exec(target, [
|
# is idempotent, so retry the same install once after a
|
||||||
"sh", "-c",
|
# short settle delay before treating it as fatal.
|
||||||
f"chown root:root {AGENT_CA_PATH} && "
|
time.sleep(1.0)
|
||||||
f"chmod 644 {AGENT_CA_PATH} && "
|
r = _install_ca(target)
|
||||||
f"update-ca-certificates",
|
|
||||||
])
|
if r.returncode != 0:
|
||||||
if r.returncode != 0 or "1 added" not in (r.stdout or ""):
|
|
||||||
# update-ca-certificates not adding our cert is fatal —
|
# update-ca-certificates not adding our cert is fatal —
|
||||||
# claude-code's TLS handshake against the egress-MITM'd
|
# claude-code's TLS handshake against the egress-MITM'd
|
||||||
# api.anthropic.com would fail downstream. Bail early
|
# api.anthropic.com would fail downstream. Bail early
|
||||||
@@ -66,6 +70,23 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None:
|
|||||||
log_ca_fingerprint(cert_host_path, label)
|
log_ca_fingerprint(cert_host_path, label)
|
||||||
|
|
||||||
|
|
||||||
|
def _install_ca(target: str) -> _smolvm.SmolvmRunResult:
    """Install the agent CA into the guest trust store and verify it.

    chown + chmod + update-ca-certificates + bundle verification run in
    one `sh -c` so we only pay one machine_exec round trip; the `&&`
    chaining surfaces the first failure as the return code.

    The `openssl verify` check is more stable than requiring "1 added"
    in stdout: a retry after a partially-completed first run may
    legitimately report "0 added" while the cert is already installed.

    Args:
        target: Machine identifier forwarded to `_smolvm.machine_exec`.

    Returns:
        The `_smolvm.machine_exec` result for the chained command.
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    # Quote the paths so the shell always sees each one as a single word.
    # For ordinary paths (no whitespace or shell metacharacters)
    # `shlex.quote` returns the input unchanged, so the command is the
    # same as before.
    ca_path = shlex.quote(str(AGENT_CA_PATH))
    ca_bundle = shlex.quote(str(AGENT_CA_BUNDLE))

    script = " && ".join([
        f"chown root:root {ca_path}",
        f"chmod 644 {ca_path}",
        "update-ca-certificates",
        f"openssl verify -CAfile {ca_bundle} {ca_path}",
    ])
    return _smolvm.machine_exec(target, ["sh", "-c", script])
|
||||||
|
|
||||||
|
|
||||||
# Re-exported for the launch/provision_ca caller + tests. The path
|
# Re-exported for the launch/provision_ca caller + tests. The path
|
||||||
# constants live in the shared `backend.util` (Debian's
|
# constants live in the shared `backend.util` (Debian's
|
||||||
# `update-ca-certificates` layout is the same in both backends).
|
# `update-ca-certificates` layout is the same in both backends).
|
||||||
|
|||||||
@@ -622,6 +622,26 @@ class TestProvisionCA(unittest.TestCase):
|
|||||||
"bot-bottle-demo-abc12:" + _ca.AGENT_CA_PATH,
|
"bot-bottle-demo-abc12:" + _ca.AGENT_CA_PATH,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_retries_smolvm_sigkill_during_update_ca(self):
    """An exec that exits 137 is retried once after a 1.0s sleep."""
    ca_module = "bot_bottle.backend.smolmachines.provision.ca"
    plan = _plan(pipelock_ca_path=self.pipelock_ca)
    # First exec attempt: exit 137 with only the banner line on stdout.
    sigkilled = SmolvmRunResult(
        returncode=137,
        stdout="Updating certificates in /etc/ssl/certs...\n",
        stderr="",
    )
    exec_results = [sigkilled, self._UPDATE_OK]

    with patch(
        f"{ca_module}._smolvm.machine_cp"
    ), patch(
        f"{ca_module}._smolvm.machine_exec",
        side_effect=exec_results,
    ) as exec_mock, patch(
        f"{ca_module}.time.sleep"
    ) as sleep_mock:
        _ca.provision_ca(plan, "bot-bottle-demo-abc12")

    # One killed attempt plus one successful retry, separated by a
    # single settle delay.
    self.assertEqual(2, exec_mock.call_count)
    sleep_mock.assert_called_once_with(1.0)
|
||||||
|
|
||||||
def test_dies_when_selected_cert_missing(self):
|
def test_dies_when_selected_cert_missing(self):
|
||||||
# Plan claims a pipelock cert at a path that doesn't exist —
|
# Plan claims a pipelock cert at a path that doesn't exist —
|
||||||
# something went wrong in launch's pipelock_tls_init.
|
# something went wrong in launch's pipelock_tls_init.
|
||||||
|
|||||||
Reference in New Issue
Block a user