fix(smolmachines): bridge host SIGWINCH into the VM PTY (issue #82) #83

Merged
didericis merged 6 commits from smolmachines-pty-resize-issue-82 into main 2026-05-27 21:03:17 -04:00
2 changed files with 27 additions and 38 deletions
Showing only changes of commit b9853ae0c7 - Show all commits
@@ -35,33 +35,12 @@ follow-up tracked separately)."""
from __future__ import annotations from __future__ import annotations
import datetime
import fcntl import fcntl
import os
import signal import signal
import struct import struct
import subprocess import subprocess
import sys import sys
import termios import termios
import traceback
# Debug log so we can diagnose tmux-pane crashes that happen in
# pane respawn — the dashboard's curses surface eats stderr, and
# `tmux respawn-pane`'s default remain-on-exit is off. Always-on
# (small overhead) so a user reporting a crash can just share the
# file. Append-mode, per-pid line prefix.
_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log")
def _log(msg: str) -> None:
try:
os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True)
with open(_DEBUG_LOG_PATH, "a") as f:
ts = datetime.datetime.now().isoformat(timespec="milliseconds")
f.write(f"[{ts} pid={os.getpid()}] {msg}\n")
except OSError:
pass
def _read_winsize() -> tuple[int, int] | None: def _read_winsize() -> tuple[int, int] | None:
@@ -92,13 +71,24 @@ def _push_size(machine: str, rows: int, cols: int) -> None:
handle); `stty -F` returns silently on PTYs that don't apply. handle); `stty -F` returns silently on PTYs that don't apply.
Best-effort: swallow failures. A failed resize doesn't break Best-effort: swallow failures. A failed resize doesn't break
the session — it just leaves the in-VM PTY at its old size.""" the session — it just leaves the in-VM PTY at its old size.
`stdin=DEVNULL` is load-bearing: under tmux, inheriting the
pane PTY here means two concurrent smolvm processes (this one
and the agent session the wrapper is shepherding) share the
PTY's foreground-process-group / input plumbing, and smolvm
bails with an internal config-parse error or SIGKILL within
~100ms of the side-channel firing. Outside tmux the same
pattern survived, presumably because iTerm's PTY plumbing is
more forgiving than tmux's, but the DEVNULL is the right
default either way — the side-channel never needs stdin."""
subprocess.run( subprocess.run(
["smolvm", "machine", "exec", "--name", machine, "--", ["smolvm", "machine", "exec", "--name", machine, "--",
"sh", "-c", "sh", "-c",
f"for f in /dev/pts/*; do " f"for f in /dev/pts/*; do "
f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; " f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; "
f"done"], f"done"],
stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
check=False, check=False,
) )
@@ -110,23 +100,17 @@ def main(argv: list[str]) -> int:
We don't use argparse — the `--` separator is the contract and We don't use argparse — the `--` separator is the contract and
everything past it is forwarded verbatim. Keeps the wrapper everything past it is forwarded verbatim. Keeps the wrapper
transparent for callers building argv programmatically.""" transparent for callers building argv programmatically."""
_log(f"start argv={argv!r} cwd={os.getcwd()!r} "
f"PATH={os.environ.get('PATH','')!r} "
f"TMUX={os.environ.get('TMUX','<unset>')!r}")
if len(argv) < 3 or argv[1] != "--": if len(argv) < 3 or argv[1] != "--":
sys.stderr.write( sys.stderr.write(
"usage: python -m claude_bottle.backend.smolmachines.pty_resize " "usage: python -m claude_bottle.backend.smolmachines.pty_resize "
"<machine> -- <smolvm-argv...>\n" "<machine> -- <smolvm-argv...>\n"
) )
_log("exit=2 (bad argv)")
return 2 return 2
machine = argv[0] machine = argv[0]
inner = argv[2:] inner = argv[2:]
def sync(*_args) -> None: def sync(*_args) -> None:
size = _read_winsize() size = _read_winsize()
_log(f"sync size={size!r}")
if size is None: if size is None:
return return
_push_size(machine, *size) _push_size(machine, *size)
@@ -136,23 +120,15 @@ def main(argv: list[str]) -> int:
# is caught even if it races the initial sync. # is caught even if it races the initial sync.
signal.signal(signal.SIGWINCH, sync) signal.signal(signal.SIGWINCH, sync)
try:
proc = subprocess.Popen(inner) proc = subprocess.Popen(inner)
except BaseException:
_log("Popen failed:\n" + traceback.format_exc())
raise
_log(f"child pid={proc.pid}")
sync() # push initial size — VM PTY starts at 0 0. sync() # push initial size — VM PTY starts at 0 0.
while True: while True:
try: try:
rc = proc.wait() return proc.wait()
_log(f"child exit rc={rc}")
return rc
except KeyboardInterrupt: except KeyboardInterrupt:
# Ctrl-C in the operator's terminal → forward to the # Ctrl-C in the operator's terminal → forward to the
# child once, then keep waiting. claude handles its # child once, then keep waiting. claude handles its
# own interrupt cleanup. # own interrupt cleanup.
_log("KeyboardInterrupt → forward SIGINT to child")
proc.send_signal(signal.SIGINT) proc.send_signal(signal.SIGINT)
@@ -34,6 +34,19 @@ class TestPushSize(unittest.TestCase):
self.assertIn("rows 50", argv[8]) self.assertIn("rows 50", argv[8])
self.assertIn("for f in /dev/pts/*", argv[8]) self.assertIn("for f in /dev/pts/*", argv[8])
def test_side_channel_uses_devnull_stdin(self):
# Load-bearing regression: under tmux, inheriting the
# pane PTY as the side-channel's stdin makes smolvm crash
# within ~100ms (concurrent smolvm processes sharing the
# PTY's FG-PG / input plumbing). DEVNULL stdin sidesteps
# the interaction.
with patch.object(pty_resize.subprocess, "run") as run:
pty_resize._push_size("claude-bottle-m", 24, 80)
self.assertEqual(
pty_resize.subprocess.DEVNULL,
run.call_args.kwargs.get("stdin"),
)
def test_swallows_subprocess_failures(self): def test_swallows_subprocess_failures(self):
# `check=False` + DEVNULL streams: a side-channel failure # `check=False` + DEVNULL streams: a side-channel failure
# mustn't break the operator's session. # mustn't break the operator's session.