fix(smolmachines): defer pty_resize startup sync to dodge libkrun's bringup race
Theb9853aestdin=DEVNULL fix wasn't sufficient. End-to-end testing against a live VM in tmux revealed a second crash path: libkrun spits "load \`config.json\`: parse error: trailing garbage { \"ociVersion\": \"1.0.2\", ... }" and the main exec dies (rc=1 or SIGKILL/rc=137, depending on race scheduling). Root cause: each `smolvm machine exec` writes a per-invocation OCI config.json to the same smolvm state dir during its bringup. The wrapper's startup sync() fires within 1ms of Popen-ing the main exec — both invocations write config.json concurrently, libkrun loads one mid-write, and gets garbage. Trivial inner commands (`sh -c "echo hi"`) finished before the overlap mattered, masking the race in earlier tests. claude's slower startup hits the race every time, and only inside tmux because the outside-tmux foreground-handoff path takes a different bringup sequence that happens to dodge the window. Fix: schedule the initial sync on a 2-second `threading.Timer` instead of calling it synchronously. By 2s the main exec is past its bringup window, so the side-channel's config.json write doesn't collide. Daemon thread so the timer doesn't block exit when the child finishes quickly. Trade-off: the in-VM PTY uses smolvm's default size for the first ~2s, then snaps to the host pane size when the timer fires. Verified end-to-end against a live VM in tmux: claude renders at the default size during bringup, then redraws at full pane width once the deferred sync lands. Operator-driven resizes (SIGWINCH) still bridge in real time via the already-installed signal handler. Also drop the diagnostic log added in9c83ea6— we have the fix. Regression test: `TestStartupSyncDeferred.test_main_schedules_timer_does_not_ call_sync_synchronously` mocks Popen + Timer + _push_size and asserts `main()` schedules the timer with the documented delay constant and never invokes _push_size synchronously. Catches a "let's just inline the sync() call" regression immediately. 638 unit tests pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit was merged in pull request #83.
This commit is contained in:
@@ -35,28 +35,26 @@ follow-up tracked separately)."""
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import datetime
|
|
||||||
import fcntl
|
import fcntl
|
||||||
import os
|
|
||||||
import signal
|
import signal
|
||||||
import struct
|
import struct
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import termios
|
import termios
|
||||||
import traceback
|
import threading
|
||||||
|
|
||||||
|
|
||||||
_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log")
|
# How long to wait after the main exec starts before pushing the
|
||||||
|
# initial size. Concurrent `smolvm machine exec` invocations race
|
||||||
|
# libkrun's per-exec OCI config write during the main exec's
|
||||||
def _log(msg: str) -> None:
|
# bringup window; the side-channel firing immediately corrupts
|
||||||
try:
|
# `config.json` and the main exec dies with SIGKILL (rc=137) or
|
||||||
os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True)
|
# libkrun's "parse error: trailing garbage" depending on
|
||||||
with open(_DEBUG_LOG_PATH, "a") as f:
|
# scheduling. Two seconds is well past the bringup window on a
|
||||||
ts = datetime.datetime.now().isoformat(timespec="milliseconds")
|
# warm VM, well under the operator's "this is unresponsive"
|
||||||
f.write(f"[{ts} pid={os.getpid()}] {msg}\n")
|
# threshold, and short enough that claude's initial render
|
||||||
except OSError:
|
# almost always fires after the size has been set.
|
||||||
pass
|
_STARTUP_SYNC_DELAY_SEC = 2.0
|
||||||
|
|
||||||
|
|
||||||
def _read_winsize() -> tuple[int, int] | None:
|
def _read_winsize() -> tuple[int, int] | None:
|
||||||
@@ -116,42 +114,48 @@ def main(argv: list[str]) -> int:
|
|||||||
We don't use argparse — the `--` separator is the contract and
|
We don't use argparse — the `--` separator is the contract and
|
||||||
everything past it is forwarded verbatim. Keeps the wrapper
|
everything past it is forwarded verbatim. Keeps the wrapper
|
||||||
transparent for callers building argv programmatically."""
|
transparent for callers building argv programmatically."""
|
||||||
_log(f"start argv={argv!r} TMUX={os.environ.get('TMUX','<unset>')!r} "
|
|
||||||
f"ppid={os.getppid()}")
|
|
||||||
|
|
||||||
if len(argv) < 3 or argv[1] != "--":
|
if len(argv) < 3 or argv[1] != "--":
|
||||||
sys.stderr.write(
|
sys.stderr.write(
|
||||||
"usage: python -m claude_bottle.backend.smolmachines.pty_resize "
|
"usage: python -m claude_bottle.backend.smolmachines.pty_resize "
|
||||||
"<machine> -- <smolvm-argv...>\n"
|
"<machine> -- <smolvm-argv...>\n"
|
||||||
)
|
)
|
||||||
_log("exit=2 (bad argv)")
|
|
||||||
return 2
|
return 2
|
||||||
machine = argv[0]
|
machine = argv[0]
|
||||||
inner = argv[2:]
|
inner = argv[2:]
|
||||||
|
|
||||||
def sync(*_args) -> None:
|
def sync(*_args) -> None:
|
||||||
size = _read_winsize()
|
size = _read_winsize()
|
||||||
_log(f"sync size={size!r}")
|
|
||||||
if size is None:
|
if size is None:
|
||||||
return
|
return
|
||||||
_push_size(machine, *size)
|
_push_size(machine, *size)
|
||||||
|
|
||||||
signal.signal(signal.SIGWINCH, sync)
|
signal.signal(signal.SIGWINCH, sync)
|
||||||
|
|
||||||
try:
|
proc = subprocess.Popen(inner)
|
||||||
proc = subprocess.Popen(inner)
|
# Defer the initial sync. Firing it immediately races
|
||||||
except BaseException:
|
# libkrun's per-exec OCI config write: both `smolvm machine
|
||||||
_log("Popen failed:\n" + traceback.format_exc())
|
# exec` invocations stash a config.json in the same smolvm
|
||||||
raise
|
# state dir during their bringup window, libkrun loads one
|
||||||
_log(f"child pid={proc.pid}")
|
# mid-write, and the main exec dies with SIGKILL (rc=137)
|
||||||
sync() # push initial size — VM PTY starts at 0 0.
|
# or libkrun's "parse error: trailing garbage" depending on
|
||||||
|
# scheduling. Trivial inner commands finish before the
|
||||||
|
# overlap matters; claude's slower startup hits the race
|
||||||
|
# every time, only inside tmux (the outside-tmux foreground
|
||||||
|
# handoff path takes a different bringup sequence that
|
||||||
|
# happens to dodge the window).
|
||||||
|
#
|
||||||
|
# A 2s timer is past the bringup window on a warm VM, so
|
||||||
|
# the side-channel writes a fresh config.json without
|
||||||
|
# collision, and the in-VM PTY is sized before claude has
|
||||||
|
# finished rendering its first frame. daemon=True so the
|
||||||
|
# timer doesn't block exit when the child finishes quickly.
|
||||||
|
timer = threading.Timer(_STARTUP_SYNC_DELAY_SEC, sync)
|
||||||
|
timer.daemon = True
|
||||||
|
timer.start()
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
rc = proc.wait()
|
return proc.wait()
|
||||||
_log(f"child exit rc={rc}")
|
|
||||||
return rc
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
_log("KeyboardInterrupt → forward SIGINT to child")
|
|
||||||
proc.send_signal(signal.SIGINT)
|
proc.send_signal(signal.SIGINT)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import io
|
import io
|
||||||
import unittest
|
import unittest
|
||||||
|
import unittest.mock
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from claude_bottle.backend.smolmachines import pty_resize
|
from claude_bottle.backend.smolmachines import pty_resize
|
||||||
@@ -126,5 +127,38 @@ class TestMainArgvParsing(unittest.TestCase):
|
|||||||
self.assertEqual(2, pty_resize.main(["m", "--"]))
|
self.assertEqual(2, pty_resize.main(["m", "--"]))
|
||||||
|
|
||||||
|
|
||||||
|
class TestStartupSyncDeferred(unittest.TestCase):
|
||||||
|
"""Regression: the initial sync MUST be deferred (timer), not
|
||||||
|
called synchronously between Popen + wait. Calling it
|
||||||
|
immediately races libkrun's per-exec OCI config write during
|
||||||
|
the main exec's bringup and crashes the child (rc=137 or
|
||||||
|
'parse error: trailing garbage')."""
|
||||||
|
|
||||||
|
def test_main_schedules_timer_does_not_call_sync_synchronously(self):
|
||||||
|
# Fake Popen + wait so main returns immediately. Patch
|
||||||
|
# Timer to record args without spawning a real thread.
|
||||||
|
# _push_size patched so any rogue synchronous call would
|
||||||
|
# be observable.
|
||||||
|
fake_proc = unittest.mock.MagicMock()
|
||||||
|
fake_proc.wait.return_value = 0
|
||||||
|
with patch.object(
|
||||||
|
pty_resize.subprocess, "Popen", return_value=fake_proc,
|
||||||
|
), patch.object(
|
||||||
|
pty_resize.threading, "Timer",
|
||||||
|
) as timer_cls, patch.object(
|
||||||
|
pty_resize, "_push_size",
|
||||||
|
) as push:
|
||||||
|
rc = pty_resize.main(["machine-name", "--", "echo", "hi"])
|
||||||
|
|
||||||
|
self.assertEqual(0, rc)
|
||||||
|
# Timer scheduled with the documented delay constant.
|
||||||
|
timer_cls.assert_called_once()
|
||||||
|
delay, callback = timer_cls.call_args.args
|
||||||
|
self.assertEqual(pty_resize._STARTUP_SYNC_DELAY_SEC, delay)
|
||||||
|
# _push_size never called synchronously — the only path to
|
||||||
|
# it is via the (mocked) timer's callback firing.
|
||||||
|
push.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user