fix(smolmachines): defer pty_resize startup sync to dodge libkrun's bringup race
test / unit (pull_request) Successful in 26s
test / integration (pull_request) Successful in 41s
test / unit (push) Successful in 27s
test / integration (push) Successful in 45s

The b9853ae stdin=DEVNULL fix wasn't sufficient. End-to-end
testing against a live VM in tmux revealed a second crash path:
libkrun spits "load \`config.json\`: parse error: trailing
garbage { \"ociVersion\": \"1.0.2\", ... }" and the main exec
dies (rc=1 or SIGKILL/rc=137, depending on race scheduling).

Root cause: each `smolvm machine exec` writes a per-invocation
OCI config.json to the same smolvm state dir during its bringup.
The wrapper's startup sync() fires within 1ms of Popen-ing the
main exec — both invocations write config.json concurrently,
libkrun loads one mid-write, and gets garbage. Trivial inner
commands (`sh -c "echo hi"`) finished before the overlap
mattered, masking the race in earlier tests. claude's slower
startup hits the race every time, and only inside tmux because
the outside-tmux foreground-handoff path takes a different
bringup sequence that happens to dodge the window.

Fix: schedule the initial sync on a 2-second `threading.Timer`
instead of calling it synchronously. By 2s the main exec is
past its bringup window, so the side-channel's config.json
write doesn't collide. Daemon thread so the timer doesn't
block exit when the child finishes quickly.

Trade-off: the in-VM PTY uses smolvm's default size for the
first ~2s, then snaps to the host pane size when the timer
fires. Verified end-to-end against a live VM in tmux: claude
renders at the default size during bringup, then redraws at
full pane width once the deferred sync lands. Operator-driven
resizes (SIGWINCH) still bridge in real time via the
already-installed signal handler.

Also drop the diagnostic log added in 9c83ea6 — we have the
fix.

Regression test:
`TestStartupSyncDeferred.test_main_schedules_timer_does_not_
call_sync_synchronously` mocks Popen + Timer + _push_size and
asserts `main()` schedules the timer with the documented
delay constant and never invokes _push_size synchronously.
Catches a "let's just inline the sync() call" regression
immediately.

638 unit tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit was merged in pull request #83.
This commit is contained in:
2026-05-27 20:55:00 -04:00
parent 9c83ea6428
commit aa5aa1f031
2 changed files with 68 additions and 30 deletions
@@ -35,28 +35,26 @@ follow-up tracked separately)."""
from __future__ import annotations from __future__ import annotations
import datetime
import fcntl import fcntl
import os
import signal import signal
import struct import struct
import subprocess import subprocess
import sys import sys
import termios import termios
import traceback import threading
_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log") # How long to wait after the main exec starts before pushing the
# initial size. Concurrent `smolvm machine exec` invocations race
# libkrun's per-exec OCI config write during the main exec's
def _log(msg: str) -> None: # bringup window; the side-channel firing immediately corrupts
try: # `config.json` and the main exec dies with SIGKILL (rc=137) or
os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True) # libkrun's "parse error: trailing garbage" depending on
with open(_DEBUG_LOG_PATH, "a") as f: # scheduling. Two seconds is well past the bringup window on a
ts = datetime.datetime.now().isoformat(timespec="milliseconds") # warm VM, well under the operator's "this is unresponsive"
f.write(f"[{ts} pid={os.getpid()}] {msg}\n") # threshold, and short enough that claude's initial render
except OSError: # almost always fires after the size has been set.
pass _STARTUP_SYNC_DELAY_SEC = 2.0
def _read_winsize() -> tuple[int, int] | None: def _read_winsize() -> tuple[int, int] | None:
@@ -116,42 +114,48 @@ def main(argv: list[str]) -> int:
We don't use argparse — the `--` separator is the contract and We don't use argparse — the `--` separator is the contract and
everything past it is forwarded verbatim. Keeps the wrapper everything past it is forwarded verbatim. Keeps the wrapper
transparent for callers building argv programmatically.""" transparent for callers building argv programmatically."""
_log(f"start argv={argv!r} TMUX={os.environ.get('TMUX','<unset>')!r} "
f"ppid={os.getppid()}")
if len(argv) < 3 or argv[1] != "--": if len(argv) < 3 or argv[1] != "--":
sys.stderr.write( sys.stderr.write(
"usage: python -m claude_bottle.backend.smolmachines.pty_resize " "usage: python -m claude_bottle.backend.smolmachines.pty_resize "
"<machine> -- <smolvm-argv...>\n" "<machine> -- <smolvm-argv...>\n"
) )
_log("exit=2 (bad argv)")
return 2 return 2
machine = argv[0] machine = argv[0]
inner = argv[2:] inner = argv[2:]
def sync(*_args) -> None: def sync(*_args) -> None:
size = _read_winsize() size = _read_winsize()
_log(f"sync size={size!r}")
if size is None: if size is None:
return return
_push_size(machine, *size) _push_size(machine, *size)
signal.signal(signal.SIGWINCH, sync) signal.signal(signal.SIGWINCH, sync)
try: proc = subprocess.Popen(inner)
proc = subprocess.Popen(inner) # Defer the initial sync. Firing it immediately races
except BaseException: # libkrun's per-exec OCI config write: both `smolvm machine
_log("Popen failed:\n" + traceback.format_exc()) # exec` invocations stash a config.json in the same smolvm
raise # state dir during their bringup window, libkrun loads one
_log(f"child pid={proc.pid}") # mid-write, and the main exec dies with SIGKILL (rc=137)
sync() # push initial size — VM PTY starts at 0 0. # or libkrun's "parse error: trailing garbage" depending on
# scheduling. Trivial inner commands finish before the
# overlap matters; claude's slower startup hits the race
# every time, only inside tmux (the outside-tmux foreground
# handoff path takes a different bringup sequence that
# happens to dodge the window).
#
# A 2s timer is past the bringup window on a warm VM, so
# the side-channel writes a fresh config.json without
# collision, and the in-VM PTY is sized before claude has
# finished rendering its first frame. daemon=True so the
# timer doesn't block exit when the child finishes quickly.
timer = threading.Timer(_STARTUP_SYNC_DELAY_SEC, sync)
timer.daemon = True
timer.start()
while True: while True:
try: try:
rc = proc.wait() return proc.wait()
_log(f"child exit rc={rc}")
return rc
except KeyboardInterrupt: except KeyboardInterrupt:
_log("KeyboardInterrupt → forward SIGINT to child")
proc.send_signal(signal.SIGINT) proc.send_signal(signal.SIGINT)
@@ -10,6 +10,7 @@ from __future__ import annotations
import io import io
import unittest import unittest
import unittest.mock
from unittest.mock import patch from unittest.mock import patch
from claude_bottle.backend.smolmachines import pty_resize from claude_bottle.backend.smolmachines import pty_resize
@@ -126,5 +127,38 @@ class TestMainArgvParsing(unittest.TestCase):
self.assertEqual(2, pty_resize.main(["m", "--"])) self.assertEqual(2, pty_resize.main(["m", "--"]))
class TestStartupSyncDeferred(unittest.TestCase):
"""Regression: the initial sync MUST be deferred (timer), not
called synchronously between Popen + wait. Calling it
immediately races libkrun's per-exec OCI config write during
the main exec's bringup and crashes the child (rc=137 or
'parse error: trailing garbage')."""
def test_main_schedules_timer_does_not_call_sync_synchronously(self):
# Fake Popen + wait so main returns immediately. Patch
# Timer to record args without spawning a real thread.
# _push_size patched so any rogue synchronous call would
# be observable.
fake_proc = unittest.mock.MagicMock()
fake_proc.wait.return_value = 0
with patch.object(
pty_resize.subprocess, "Popen", return_value=fake_proc,
), patch.object(
pty_resize.threading, "Timer",
) as timer_cls, patch.object(
pty_resize, "_push_size",
) as push:
rc = pty_resize.main(["machine-name", "--", "echo", "hi"])
self.assertEqual(0, rc)
# Timer scheduled with the documented delay constant.
timer_cls.assert_called_once()
delay, callback = timer_cls.call_args.args
self.assertEqual(pty_resize._STARTUP_SYNC_DELAY_SEC, delay)
# _push_size never called synchronously — the only path to
# it is via the (mocked) timer's callback firing.
push.assert_not_called()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()