diff --git a/bot_bottle/backend/smolmachines/provision/ca.py b/bot_bottle/backend/smolmachines/provision/ca.py index e9f8b84..15c7751 100644 --- a/bot_bottle/backend/smolmachines/provision/ca.py +++ b/bot_bottle/backend/smolmachines/provision/ca.py @@ -15,6 +15,8 @@ flag exists; the VM init is root), so we don't need the explicit from __future__ import annotations +import time + from ....log import die from ...util import ( AGENT_CA_BUNDLE, @@ -26,6 +28,9 @@ from .. import smolvm as _smolvm from ..bottle_plan import SmolmachinesBottlePlan +_SIGKILL_EXIT = 128 + 9 + + def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None: """Copy the agent-facing CA cert into the guest, rebuild the trust bundle, emit a one-line fingerprint log. Called from @@ -40,17 +45,16 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None: # REQUESTS_CA_BUNDLE) on the guest_env covers Node + Python # `requests` / libraries that don't load the system bundle. # - # chown + chmod + update-ca-certificates run in one - # `sh -c` so we only pay one machine_exec round trip; the - # `&&` chaining surfaces the first failure as the return - # code. - r = _smolvm.machine_exec(target, [ - "sh", "-c", - f"chown root:root {AGENT_CA_PATH} && " - f"chmod 644 {AGENT_CA_PATH} && " - f"update-ca-certificates", - ]) - if r.returncode != 0 or "1 added" not in (r.stdout or ""): + r = _install_ca(target) + if r.returncode == _SIGKILL_EXIT: + # smolvm/libkrun can SIGKILL an otherwise-normal exec + # during early-VM provisioning. `update-ca-certificates` + # is idempotent, so retry the same install once after a + # short settle delay before treating it as fatal. + time.sleep(1.0) + r = _install_ca(target) + + if r.returncode != 0: # update-ca-certificates not adding our cert is fatal — # claude-code's TLS handshake against the egress-MITM'd # api.anthropic.com would fail downstream. Bail early @@ -66,6 +70,23 @@ def provision_ca(plan: SmolmachinesBottlePlan, target: str) -> None: log_ca_fingerprint(cert_host_path, label) +def _install_ca(target: str) -> _smolvm.SmolvmRunResult: + # chown + chmod + update-ca-certificates + bundle + # verification run in one `sh -c` so we only pay one + # machine_exec round trip; the `&&` chaining surfaces the + # first failure as the return code. The verify check is more + # stable than requiring "1 added" in stdout: a retry after a + # partially-completed first run may legitimately report "0 + # added" while the cert is already installed. + return _smolvm.machine_exec(target, [ + "sh", "-c", + f"chown root:root {AGENT_CA_PATH} && " + f"chmod 644 {AGENT_CA_PATH} && " + f"update-ca-certificates && " + f"openssl verify -CAfile {AGENT_CA_BUNDLE} {AGENT_CA_PATH}", + ]) + + # Re-exported for the launch/provision_ca caller + tests. The path # constants live in the shared `backend.util` (Debian's # `update-ca-certificates` layout is the same in both backends). diff --git a/tests/unit/test_smolmachines_provision.py b/tests/unit/test_smolmachines_provision.py index edbb992..b32d20f 100644 --- a/tests/unit/test_smolmachines_provision.py +++ b/tests/unit/test_smolmachines_provision.py @@ -622,6 +622,26 @@ class TestProvisionCA(unittest.TestCase): "bot-bottle-demo-abc12:" + _ca.AGENT_CA_PATH, ) + def test_retries_smolvm_sigkill_during_update_ca(self): + plan = _plan(pipelock_ca_path=self.pipelock_ca) + killed = SmolvmRunResult( + returncode=137, + stdout="Updating certificates in /etc/ssl/certs...\n", + stderr="", + ) + with patch( + "bot_bottle.backend.smolmachines.provision.ca._smolvm.machine_cp" + ), patch( + "bot_bottle.backend.smolmachines.provision.ca._smolvm.machine_exec", + side_effect=[killed, self._UPDATE_OK], + ) as ex, patch( + "bot_bottle.backend.smolmachines.provision.ca.time.sleep" + ) as sleep: + _ca.provision_ca(plan, "bot-bottle-demo-abc12") + + self.assertEqual(2, ex.call_count) + sleep.assert_called_once_with(1.0) + def test_dies_when_selected_cert_missing(self): # Plan claims a pipelock cert at a path that doesn't exist — # something went wrong in launch's pipelock_tls_init.