fix(smolmachines): bridge host SIGWINCH into the VM PTY (issue #82) #83

Merged
didericis merged 6 commits from smolmachines-pty-resize-issue-82 into main 2026-05-27 21:03:17 -04:00
2 changed files with 68 additions and 30 deletions
Showing only changes of commit aa5aa1f031 - Show all commits
@@ -35,28 +35,26 @@ follow-up tracked separately)."""
from __future__ import annotations
import datetime
import fcntl
import os
import signal
import struct
import subprocess
import sys
import termios
import traceback
import threading
_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log")
def _log(msg: str) -> None:
try:
os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True)
with open(_DEBUG_LOG_PATH, "a") as f:
ts = datetime.datetime.now().isoformat(timespec="milliseconds")
f.write(f"[{ts} pid={os.getpid()}] {msg}\n")
except OSError:
pass
# How long to wait after the main exec starts before pushing the
# initial size. Concurrent `smolvm machine exec` invocations race
# libkrun's per-exec OCI config write during the main exec's
# bringup window; the side-channel firing immediately corrupts
# `config.json` and the main exec dies with SIGKILL (rc=137) or
# libkrun's "parse error: trailing garbage" depending on
# scheduling. Two seconds is well past the bringup window on a
# warm VM, well under the operator's "this is unresponsive"
# threshold, and short enough that claude's initial render
# almost always fires after the size has been set.
_STARTUP_SYNC_DELAY_SEC = 2.0
def _read_winsize() -> tuple[int, int] | None:
@@ -116,42 +114,48 @@ def main(argv: list[str]) -> int:
We don't use argparse — the `--` separator is the contract and
everything past it is forwarded verbatim. Keeps the wrapper
transparent for callers building argv programmatically."""
_log(f"start argv={argv!r} TMUX={os.environ.get('TMUX','<unset>')!r} "
f"ppid={os.getppid()}")
if len(argv) < 3 or argv[1] != "--":
sys.stderr.write(
"usage: python -m claude_bottle.backend.smolmachines.pty_resize "
"<machine> -- <smolvm-argv...>\n"
)
_log("exit=2 (bad argv)")
return 2
machine = argv[0]
inner = argv[2:]
def sync(*_args) -> None:
size = _read_winsize()
_log(f"sync size={size!r}")
if size is None:
return
_push_size(machine, *size)
signal.signal(signal.SIGWINCH, sync)
try:
proc = subprocess.Popen(inner)
except BaseException:
_log("Popen failed:\n" + traceback.format_exc())
raise
_log(f"child pid={proc.pid}")
sync() # push initial size — VM PTY starts at 0 0.
proc = subprocess.Popen(inner)
# Defer the initial sync. Firing it immediately races
# libkrun's per-exec OCI config write: both `smolvm machine
# exec` invocations stash a config.json in the same smolvm
# state dir during their bringup window, libkrun loads one
# mid-write, and the main exec dies with SIGKILL (rc=137)
# or libkrun's "parse error: trailing garbage" depending on
# scheduling. Trivial inner commands finish before the
# overlap matters; claude's slower startup hits the race
# every time, only inside tmux (the outside-tmux foreground
# handoff path takes a different bringup sequence that
# happens to dodge the window).
#
# A 2s timer is past the bringup window on a warm VM, so
# the side-channel writes a fresh config.json without
# collision, and the in-VM PTY is sized before claude has
# finished rendering its first frame. daemon=True so the
# timer doesn't block exit when the child finishes quickly.
Review

comment is redundant with comment for the constant

comment is redundant with comment for the constant
Review

Right — collapsed in b3c6d66 to a one-liner pointing at _STARTUP_SYNC_DELAY_SEC plus the daemon=True operational note. The libkrun-race + warm-VM-timing explanation now lives only on the constant.

Right — collapsed in b3c6d66 to a one-liner pointing at `_STARTUP_SYNC_DELAY_SEC` plus the `daemon=True` operational note. The libkrun-race + warm-VM-timing explanation now lives only on the constant.
timer = threading.Timer(_STARTUP_SYNC_DELAY_SEC, sync)
timer.daemon = True
timer.start()
while True:
try:
rc = proc.wait()
_log(f"child exit rc={rc}")
return rc
return proc.wait()
except KeyboardInterrupt:
_log("KeyboardInterrupt → forward SIGINT to child")
proc.send_signal(signal.SIGINT)
@@ -10,6 +10,7 @@ from __future__ import annotations
import io
import unittest
import unittest.mock
from unittest.mock import patch
from claude_bottle.backend.smolmachines import pty_resize
@@ -126,5 +127,38 @@ class TestMainArgvParsing(unittest.TestCase):
self.assertEqual(2, pty_resize.main(["m", "--"]))
class TestStartupSyncDeferred(unittest.TestCase):
"""Regression: the initial sync MUST be deferred (timer), not
called synchronously between Popen + wait. Calling it
immediately races libkrun's per-exec OCI config write during
the main exec's bringup and crashes the child (rc=137 or
'parse error: trailing garbage')."""
def test_main_schedules_timer_does_not_call_sync_synchronously(self):
# Fake Popen + wait so main returns immediately. Patch
# Timer to record args without spawning a real thread.
# _push_size patched so any rogue synchronous call would
# be observable.
fake_proc = unittest.mock.MagicMock()
fake_proc.wait.return_value = 0
with patch.object(
pty_resize.subprocess, "Popen", return_value=fake_proc,
), patch.object(
pty_resize.threading, "Timer",
) as timer_cls, patch.object(
pty_resize, "_push_size",
) as push:
rc = pty_resize.main(["machine-name", "--", "echo", "hi"])
self.assertEqual(0, rc)
# Timer scheduled with the documented delay constant.
timer_cls.assert_called_once()
delay, callback = timer_cls.call_args.args
self.assertEqual(pty_resize._STARTUP_SYNC_DELAY_SEC, delay)
# _push_size never called synchronously — the only path to
# it is via the (mocked) timer's callback firing.
push.assert_not_called()
if __name__ == "__main__":
unittest.main()