fix(smolmachines): bridge host SIGWINCH into the VM PTY (issue #82) #83

Merged
didericis merged 6 commits from smolmachines-pty-resize-issue-82 into main 2026-05-27 21:03:17 -04:00
2 changed files with 68 additions and 30 deletions
Showing only changes of commit aa5aa1f031 - Show all commits
@@ -35,28 +35,26 @@ follow-up tracked separately)."""
from __future__ import annotations from __future__ import annotations
import datetime
import fcntl import fcntl
import os
import signal import signal
import struct import struct
import subprocess import subprocess
import sys import sys
import termios import termios
import traceback import threading
_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log") # How long to wait after the main exec starts before pushing the
# initial size. Concurrent `smolvm machine exec` invocations race
# libkrun's per-exec OCI config write during the main exec's
def _log(msg: str) -> None: # bringup window; the side-channel firing immediately corrupts
try: # `config.json` and the main exec dies with SIGKILL (rc=137) or
os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True) # libkrun's "parse error: trailing garbage" depending on
with open(_DEBUG_LOG_PATH, "a") as f: # scheduling. Two seconds is well past the bringup window on a
ts = datetime.datetime.now().isoformat(timespec="milliseconds") # warm VM, well under the operator's "this is unresponsive"
f.write(f"[{ts} pid={os.getpid()}] {msg}\n") # threshold, and short enough that claude's initial render
except OSError: # almost always fires after the size has been set.
pass _STARTUP_SYNC_DELAY_SEC = 2.0
def _read_winsize() -> tuple[int, int] | None: def _read_winsize() -> tuple[int, int] | None:
@@ -116,42 +114,48 @@ def main(argv: list[str]) -> int:
We don't use argparse — the `--` separator is the contract and We don't use argparse — the `--` separator is the contract and
everything past it is forwarded verbatim. Keeps the wrapper everything past it is forwarded verbatim. Keeps the wrapper
transparent for callers building argv programmatically.""" transparent for callers building argv programmatically."""
_log(f"start argv={argv!r} TMUX={os.environ.get('TMUX','<unset>')!r} "
f"ppid={os.getppid()}")
if len(argv) < 3 or argv[1] != "--": if len(argv) < 3 or argv[1] != "--":
sys.stderr.write( sys.stderr.write(
"usage: python -m claude_bottle.backend.smolmachines.pty_resize " "usage: python -m claude_bottle.backend.smolmachines.pty_resize "
"<machine> -- <smolvm-argv...>\n" "<machine> -- <smolvm-argv...>\n"
) )
_log("exit=2 (bad argv)")
return 2 return 2
machine = argv[0] machine = argv[0]
inner = argv[2:] inner = argv[2:]
def sync(*_args) -> None: def sync(*_args) -> None:
size = _read_winsize() size = _read_winsize()
_log(f"sync size={size!r}")
if size is None: if size is None:
return return
_push_size(machine, *size) _push_size(machine, *size)
signal.signal(signal.SIGWINCH, sync) signal.signal(signal.SIGWINCH, sync)
try: proc = subprocess.Popen(inner)
proc = subprocess.Popen(inner) # Defer the initial sync. Firing it immediately races
except BaseException: # libkrun's per-exec OCI config write: both `smolvm machine
_log("Popen failed:\n" + traceback.format_exc()) # exec` invocations stash a config.json in the same smolvm
raise # state dir during their bringup window, libkrun loads one
_log(f"child pid={proc.pid}") # mid-write, and the main exec dies with SIGKILL (rc=137)
sync() # push initial size — VM PTY starts at 0 0. # or libkrun's "parse error: trailing garbage" depending on
# scheduling. Trivial inner commands finish before the
# overlap matters; claude's slower startup hits the race
# every time, only inside tmux (the outside-tmux foreground
# handoff path takes a different bringup sequence that
# happens to dodge the window).
#
# A 2s timer is past the bringup window on a warm VM, so
# the side-channel writes a fresh config.json without
# collision, and the in-VM PTY is sized before claude has
# finished rendering its first frame. daemon=True so the
# timer doesn't block exit when the child finishes quickly.
Review

comment is redundant with comment for the constant

comment is redundant with comment for the constant
Review

Right — collapsed in b3c6d66 to a one-liner pointing at _STARTUP_SYNC_DELAY_SEC plus the daemon=True operational note. The libkrun-race + warm-VM-timing explanation now lives only on the constant.

Right — collapsed in b3c6d66 to a one-liner pointing at `_STARTUP_SYNC_DELAY_SEC` plus the `daemon=True` operational note. The libkrun-race + warm-VM-timing explanation now lives only on the constant.
timer = threading.Timer(_STARTUP_SYNC_DELAY_SEC, sync)
timer.daemon = True
timer.start()
while True: while True:
try: try:
rc = proc.wait() return proc.wait()
_log(f"child exit rc={rc}")
return rc
except KeyboardInterrupt: except KeyboardInterrupt:
_log("KeyboardInterrupt → forward SIGINT to child")
proc.send_signal(signal.SIGINT) proc.send_signal(signal.SIGINT)
@@ -10,6 +10,7 @@ from __future__ import annotations
import io import io
import unittest import unittest
import unittest.mock
from unittest.mock import patch from unittest.mock import patch
from claude_bottle.backend.smolmachines import pty_resize from claude_bottle.backend.smolmachines import pty_resize
@@ -126,5 +127,38 @@ class TestMainArgvParsing(unittest.TestCase):
self.assertEqual(2, pty_resize.main(["m", "--"])) self.assertEqual(2, pty_resize.main(["m", "--"]))
class TestStartupSyncDeferred(unittest.TestCase):
"""Regression: the initial sync MUST be deferred (timer), not
called synchronously between Popen + wait. Calling it
immediately races libkrun's per-exec OCI config write during
the main exec's bringup and crashes the child (rc=137 or
'parse error: trailing garbage')."""
def test_main_schedules_timer_does_not_call_sync_synchronously(self):
# Fake Popen + wait so main returns immediately. Patch
# Timer to record args without spawning a real thread.
# _push_size patched so any rogue synchronous call would
# be observable.
fake_proc = unittest.mock.MagicMock()
fake_proc.wait.return_value = 0
with patch.object(
pty_resize.subprocess, "Popen", return_value=fake_proc,
), patch.object(
pty_resize.threading, "Timer",
) as timer_cls, patch.object(
pty_resize, "_push_size",
) as push:
rc = pty_resize.main(["machine-name", "--", "echo", "hi"])
self.assertEqual(0, rc)
# Timer scheduled with the documented delay constant.
timer_cls.assert_called_once()
delay, callback = timer_cls.call_args.args
self.assertEqual(pty_resize._STARTUP_SYNC_DELAY_SEC, delay)
# _push_size never called synchronously — the only path to
# it is via the (mocked) timer's callback firing.
push.assert_not_called()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()