From aa5aa1f031304a1f6f0fc163640bf65544b0ccee Mon Sep 17 00:00:00 2001 From: claude Date: Wed, 27 May 2026 20:55:00 -0400 Subject: [PATCH] fix(smolmachines): defer pty_resize startup sync to dodge libkrun's bringup race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The b9853ae stdin=DEVNULL fix wasn't sufficient. End-to-end testing against a live VM in tmux revealed a second crash path: libkrun spits "load \`config.json\`: parse error: trailing garbage { \"ociVersion\": \"1.0.2\", ... }" and the main exec dies (rc=1 or SIGKILL/rc=137, depending on race scheduling). Root cause: each `smolvm machine exec` writes a per-invocation OCI config.json to the same smolvm state dir during its bringup. The wrapper's startup sync() fires within 1ms of Popen-ing the main exec — both invocations write config.json concurrently, libkrun loads one mid-write, and gets garbage. Trivial inner commands (`sh -c "echo hi"`) finished before the overlap mattered, masking the race in earlier tests. claude's slower startup hits the race every time, and only inside tmux because the outside-tmux foreground-handoff path takes a different bringup sequence that happens to dodge the window. Fix: schedule the initial sync on a 2-second `threading.Timer` instead of calling it synchronously. By 2s the main exec is past its bringup window, so the side-channel's config.json write doesn't collide. Daemon thread so the timer doesn't block exit when the child finishes quickly. Trade-off: the in-VM PTY uses smolvm's default size for the first ~2s, then snaps to the host pane size when the timer fires. Verified end-to-end against a live VM in tmux: claude renders at the default size during bringup, then redraws at full pane width once the deferred sync lands. Operator-driven resizes (SIGWINCH) still bridge in real time via the already-installed signal handler. Also drop the diagnostic log added in 9c83ea6 — we have the fix. Regression test: `TestStartupSyncDeferred.test_main_schedules_timer_does_not_ call_sync_synchronously` mocks Popen + Timer + _push_size and asserts `main()` schedules the timer with the documented delay constant and never invokes _push_size synchronously. Catches a "let's just inline the sync() call" regression immediately. 638 unit tests pass. Co-Authored-By: Claude Opus 4.7 --- .../backend/smolmachines/pty_resize.py | 64 ++++++++++--------- tests/unit/test_smolmachines_pty_resize.py | 34 ++++++++++ 2 files changed, 68 insertions(+), 30 deletions(-) diff --git a/claude_bottle/backend/smolmachines/pty_resize.py b/claude_bottle/backend/smolmachines/pty_resize.py index 2d27ab8..ae22804 100644 --- a/claude_bottle/backend/smolmachines/pty_resize.py +++ b/claude_bottle/backend/smolmachines/pty_resize.py @@ -35,28 +35,26 @@ follow-up tracked separately).""" from __future__ import annotations -import datetime import fcntl -import os import signal import struct import subprocess import sys import termios -import traceback +import threading -_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log") - - -def _log(msg: str) -> None: - try: - os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True) - with open(_DEBUG_LOG_PATH, "a") as f: - ts = datetime.datetime.now().isoformat(timespec="milliseconds") - f.write(f"[{ts} pid={os.getpid()}] {msg}\n") - except OSError: - pass +# How long to wait after the main exec starts before pushing the +# initial size. Concurrent `smolvm machine exec` invocations race +# libkrun's per-exec OCI config write during the main exec's +# bringup window; the side-channel firing immediately corrupts +# `config.json` and the main exec dies with SIGKILL (rc=137) or +# libkrun's "parse error: trailing garbage" depending on +# scheduling. Two seconds is well past the bringup window on a +# warm VM, well under the operator's "this is unresponsive" +# threshold, and short enough that claude's initial render +# almost always fires after the size has been set. +_STARTUP_SYNC_DELAY_SEC = 2.0 def _read_winsize() -> tuple[int, int] | None: @@ -116,42 +114,48 @@ def main(argv: list[str]) -> int: We don't use argparse — the `--` separator is the contract and everything past it is forwarded verbatim. Keeps the wrapper transparent for callers building argv programmatically.""" - _log(f"start argv={argv!r} TMUX={os.environ.get('TMUX','')!r} " - f"ppid={os.getppid()}") - if len(argv) < 3 or argv[1] != "--": sys.stderr.write( "usage: python -m claude_bottle.backend.smolmachines.pty_resize " " -- \n" ) - _log("exit=2 (bad argv)") return 2 machine = argv[0] inner = argv[2:] def sync(*_args) -> None: size = _read_winsize() - _log(f"sync size={size!r}") if size is None: return _push_size(machine, *size) signal.signal(signal.SIGWINCH, sync) - try: - proc = subprocess.Popen(inner) - except BaseException: - _log("Popen failed:\n" + traceback.format_exc()) - raise - _log(f"child pid={proc.pid}") - sync() # push initial size — VM PTY starts at 0 0. + proc = subprocess.Popen(inner) + # Defer the initial sync. Firing it immediately races + # libkrun's per-exec OCI config write: both `smolvm machine + # exec` invocations stash a config.json in the same smolvm + # state dir during their bringup window, libkrun loads one + # mid-write, and the main exec dies with SIGKILL (rc=137) + # or libkrun's "parse error: trailing garbage" depending on + # scheduling. Trivial inner commands finish before the + # overlap matters; claude's slower startup hits the race + # every time, only inside tmux (the outside-tmux foreground + # handoff path takes a different bringup sequence that + # happens to dodge the window). + # + # A 2s timer is past the bringup window on a warm VM, so + # the side-channel writes a fresh config.json without + # collision, and the in-VM PTY is sized before claude has + # finished rendering its first frame. daemon=True so the + # timer doesn't block exit when the child finishes quickly. + timer = threading.Timer(_STARTUP_SYNC_DELAY_SEC, sync) + timer.daemon = True + timer.start() while True: try: - rc = proc.wait() - _log(f"child exit rc={rc}") - return rc + return proc.wait() except KeyboardInterrupt: - _log("KeyboardInterrupt → forward SIGINT to child") proc.send_signal(signal.SIGINT) diff --git a/tests/unit/test_smolmachines_pty_resize.py b/tests/unit/test_smolmachines_pty_resize.py index 7722f4a..6624674 100644 --- a/tests/unit/test_smolmachines_pty_resize.py +++ b/tests/unit/test_smolmachines_pty_resize.py @@ -10,6 +10,7 @@ from __future__ import annotations import io import unittest +import unittest.mock from unittest.mock import patch from claude_bottle.backend.smolmachines import pty_resize @@ -126,5 +127,38 @@ class TestMainArgvParsing(unittest.TestCase): self.assertEqual(2, pty_resize.main(["m", "--"])) +class TestStartupSyncDeferred(unittest.TestCase): + """Regression: the initial sync MUST be deferred (timer), not + called synchronously between Popen + wait. Calling it + immediately races libkrun's per-exec OCI config write during + the main exec's bringup and crashes the child (rc=137 or + 'parse error: trailing garbage').""" + + def test_main_schedules_timer_does_not_call_sync_synchronously(self): + # Fake Popen + wait so main returns immediately. Patch + # Timer to record args without spawning a real thread. + # _push_size patched so any rogue synchronous call would + # be observable. + fake_proc = unittest.mock.MagicMock() + fake_proc.wait.return_value = 0 + with patch.object( + pty_resize.subprocess, "Popen", return_value=fake_proc, + ), patch.object( + pty_resize.threading, "Timer", + ) as timer_cls, patch.object( + pty_resize, "_push_size", + ) as push: + rc = pty_resize.main(["machine-name", "--", "echo", "hi"]) + + self.assertEqual(0, rc) + # Timer scheduled with the documented delay constant. + timer_cls.assert_called_once() + delay, callback = timer_cls.call_args.args + self.assertEqual(pty_resize._STARTUP_SYNC_DELAY_SEC, delay) + # _push_size never called synchronously — the only path to + # it is via the (mocked) timer's callback firing. + push.assert_not_called() + + if __name__ == "__main__": unittest.main()