fix(smolmachines): bridge host SIGWINCH into the VM PTY (issue #82) #83

Merged
didericis merged 6 commits from smolmachines-pty-resize-issue-82 into main 2026-05-27 21:03:17 -04:00
4 changed files with 404 additions and 19 deletions
+23 -1
View File
@@ -18,12 +18,25 @@ minimal Debian VM with no PAM session config."""
from __future__ import annotations
import subprocess
import sys
from typing import Mapping
from .. import Bottle, ExecResult
from . import pty_resize as _pty_resize
from . import smolvm as _smolvm
# Absolute path to the pty_resize wrapper script. The wrapper is
# self-contained (it has no claude_bottle.* imports), so invoking
# it by absolute path runs it as a standalone script regardless
# of the caller's cwd — unlike `python -m <dotted-path>`, which
# needs the package to be importable from wherever it is run.
Review

This wasn't the core issue, so I don't know if this logic is true. But it doesn't hurt/might be true, I think?

This wasn't the core issue, so I don't know if this logic is true. But it doesn't hurt/might be true, I think?
Review

Fair call. The tmux-pane-cwd hypothesis was just a guess that turned out not to be the actual bug (libkrun's bringup race was), and I never went back to verify it. Rewrote the comment in b3c6d66 to drop the speculation and just state the operative reason: the wrapper is self-contained (no claude_bottle.* imports), so an absolute-path invocation works regardless of cwd. Same one-line _PTY_RESIZE_SCRIPT = _pty_resize.__file__ line, leaner justification.

Fair call. The tmux-pane-cwd hypothesis was just a guess that turned out not to be the actual bug (libkrun's bringup race was), and I never went back to verify it. Rewrote the comment in b3c6d66 to drop the speculation and just state the operative reason: the wrapper is self-contained (no `claude_bottle.*` imports), so an absolute-path invocation works regardless of cwd. Same one-line `_PTY_RESIZE_SCRIPT = _pty_resize.__file__` line, leaner justification.
_PTY_RESIZE_SCRIPT = _pty_resize.__file__
# Per-user env the agent image's USER (node) expects. claude
# reads ~/.claude.json + writes session state under ~/.claude/;
# bare `runuser -u` inherits root's HOME=/root, which claude
@@ -88,7 +101,16 @@ class SmolmachinesBottle(Bottle):
claude_tail += ["--append-system-prompt-file", self._prompt_path]
claude_tail += argv
flags += ["--", "runuser", "-u", "node", "--", *claude_tail]
return flags
if not tty:
# No PTY allocated — no SIGWINCH to forward, no resize
# bridge needed. Skip the wrapper so non-interactive
# exec paths (e.g., provisioning shell-outs that
# happen to go through this method) stay light.
return flags
return [
sys.executable, _PTY_RESIZE_SCRIPT,
self.name, "--", *flags,
]
def exec_claude(self, argv: list[str], *, tty: bool = True) -> int:
"""Run `claude` interactively inside the VM as the `node`
@@ -0,0 +1,163 @@
"""Host-side SIGWINCH → in-VM PTY resize bridge (issue #82).
smolvm 0.8.0 `machine exec -t` allocates an in-VM PTY but never
forwards the host terminal's window size (TIOCSWINSZ) to it. The
PTY's initial size is `0 0`, and any host-side resize during the
session goes unnoticed — the in-VM claude TUI keeps rendering for
whatever (typically tiny) box it last saw, ignoring the operator's
tmux pane resize. `docker exec -it` does this forwarding
automatically; smolvm doesn't.
This module wraps `smolvm machine exec` with a thin parent
process that:
1. Spawns the original argv as a child (it gets the inherited
TTY, so claude's stdin/stdout/stderr work unchanged).
2. Shortly after startup (the first sync is deferred by
`_STARTUP_SYNC_DELAY_SEC`) and on every host SIGWINCH, reads
the host terminal size via TIOCGWINSZ from the first of
stdin / stdout / stderr that reports a non-zero size, and
pushes it into the VM with a side-channel
`smolvm machine exec -- sh -c 'for f in /dev/pts/*; do
stty -F $f cols X rows Y; done'`. The kernel delivers
SIGWINCH to the foreground process group on the slave end
automatically, so claude picks up the new size without
extra signalling.
3. Waits on the child and exits with its returncode.
The dashboard's tmux pane respawn calls `bottle.claude_argv`,
which now prepends
`[sys.executable, <absolute path to this file>, <machine>, --]`
to the smolvm argv (TTY mode only). Foreground handoff (curses
endwin → subprocess.run) goes through the same path, so behavior
is identical.
Removable once smolvm grows native SIGWINCH forwarding (upstream
follow-up tracked separately)."""
from __future__ import annotations
import fcntl
import signal
import struct
import subprocess
import sys
import termios
import threading
# Delay, in seconds, between launching the main exec and pushing
# the initial window size; `main` schedules the first sync on a
# `threading.Timer` with this value.
#
# Why it exists: concurrent `smolvm machine exec` invocations race
# libkrun's per-exec OCI config write during the main exec's
# bringup window; the side-channel firing immediately corrupts
# `config.json` and the main exec dies with SIGKILL (rc=137) or
# libkrun's "parse error: trailing garbage" depending on
# scheduling.
#
# Why two seconds: it is well past the bringup window on a warm
# VM, well under the operator's "this is unresponsive" threshold,
# and short enough that claude's initial render almost always
# fires after the size has been set.
_STARTUP_SYNC_DELAY_SEC = 2.0
def _read_winsize() -> tuple[int, int] | None:
    """Return the host terminal size as `(rows, cols)`, or None.

    Probes stdin, stdout and stderr, in that order, with
    TIOCGWINSZ and returns the first size whose rows and cols are
    both non-zero. Different invocation surfaces give us different
    TTYs:

    - foreground handoff (curses endwin → subprocess.run): all
      three are the operator's terminal.
    - tmux respawn-pane: tmux sets all three to the pane's PTY.
    - non-TTY (someone piped stdin in tests): none are; the
      caller's sync just no-ops, which is the right behavior.

    A stream that cannot supply a descriptor at all (replaced by
    an in-memory object, closed, or set to None) is skipped the
    same way as one that is not a TTY, so one unusable stream
    never hides a usable one further down the list.
    """
    for stream in (sys.stdin, sys.stdout, sys.stderr):
        try:
            # fileno() raises ValueError on a closed file,
            # io.UnsupportedOperation (an OSError) on pseudo-files,
            # and the attribute is missing when the stream is None.
            fd = stream.fileno()
            data = fcntl.ioctl(fd, termios.TIOCGWINSZ, b"\x00" * 8)
        except (AttributeError, OSError, ValueError):
            continue
        # struct winsize is four unsigned shorts:
        # ws_row, ws_col, ws_xpixel, ws_ypixel.
        rows, cols, _xpixel, _ypixel = struct.unpack("HHHH", data)
        if rows > 0 and cols > 0:
            return rows, cols
    return None
def _push_size(machine: str, rows: int, cols: int) -> None:
    """Resize every PTY inside VM `machine` to `rows` x `cols`.

    Runs a side-channel `smolvm machine exec` whose shell loop
    applies `stty -F` to each entry under /dev/pts. Looping over
    all of them covers multiple concurrent interactive sessions
    (rare but cheap to handle); per-PTY stty errors are discarded
    via `2>/dev/null`.

    Best-effort with respect to the side-channel's outcome:
    `check=False` means a non-zero exit status is ignored, so a
    failed resize just leaves the in-VM PTY at its old size. A
    failure to launch `smolvm` at all still surfaces as the
    OSError raised by `subprocess.run`.

    `stdin=DEVNULL` is load-bearing: under tmux, inheriting the
    pane PTY here means two concurrent smolvm processes (this one
    and the agent session the wrapper is shepherding) share the
    PTY's foreground-process-group / input plumbing, and smolvm
    bails with an internal config-parse error or SIGKILL within
    ~100ms of the side-channel firing. The side-channel never
    needs stdin, so DEVNULL is the right default everywhere.
    """
    resize_script = (
        "for f in /dev/pts/*; do "
        f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; "
        "done"
    )
    command = [
        "smolvm", "machine", "exec", "--name", machine, "--",
        "sh", "-c", resize_script,
    ]
    null = subprocess.DEVNULL
    subprocess.run(
        command,
        stdin=null,
        stdout=null,
        stderr=null,
        check=False,
    )
def main(argv: list[str]) -> int:
    """Run the wrapped command and keep the in-VM PTY size in sync.

    `argv` shape: `<machine> -- <smolvm-argv...>`. We don't use
    argparse — the `--` separator is the contract and everything
    past it is forwarded verbatim. Keeps the wrapper transparent
    for callers building argv programmatically.

    Returns 2 (after printing a usage line to stderr) when `argv`
    doesn't match that shape; otherwise returns whatever
    `Popen.wait()` reports for the child.

    Size syncs are best-effort and are suppressed until the
    startup delay has elapsed: a SIGWINCH that arrives while the
    main exec is still coming up must not fire the side-channel
    early, and nothing is lost by dropping it because the deferred
    first sync reads the terminal size at the moment it runs.
    """
    if len(argv) < 3 or argv[1] != "--":
        sys.stderr.write(
            "usage: python -m claude_bottle.backend.smolmachines.pty_resize "
            "<machine> -- <smolvm-argv...>\n"
        )
        return 2
    machine = argv[0]
    inner = argv[2:]

    # Set once the startup delay has passed; until then the
    # SIGWINCH handler is a no-op.
    startup_done = threading.Event()

    def sync(*_args) -> None:
        if not startup_done.is_set():
            return
        try:
            size = _read_winsize()
            if size is not None:
                _push_size(machine, *size)
        except OSError:
            # Best-effort: a failed resize must never take down the
            # wrapper (and with it the operator's session). This
            # runs as a signal handler, so an escaping exception
            # would surface inside `proc.wait()` below.
            pass

    def initial_sync() -> None:
        startup_done.set()
        sync()

    signal.signal(signal.SIGWINCH, sync)
    proc = subprocess.Popen(inner)
    # Deferred first sync — see `_STARTUP_SYNC_DELAY_SEC` for why it
    # can't fire immediately. daemon=True so the timer doesn't block
    # exit when the child finishes quickly.
    timer = threading.Timer(_STARTUP_SYNC_DELAY_SEC, initial_sync)
    timer.daemon = True
    timer.start()
    try:
        while True:
            try:
                return proc.wait()
            except KeyboardInterrupt:
                # Pass Ctrl-C on to the child and keep waiting for it.
                proc.send_signal(signal.SIGINT)
    finally:
        # Don't let a pending first sync fire after the child is gone.
        timer.cancel()
if __name__ == "__main__":
    # Script entry point: the process exit status is main()'s result.
    raise SystemExit(main(sys.argv[1:]))
+54 -18
View File
@@ -5,12 +5,18 @@ directly (it spawns claude inside a tmux pane rather than as a
child of the current process), so the argv shape is the
non-trivial part. `exec_claude` is a thin wrapper around the same
builder + `subprocess.run`; we lock the shape here.
The TTY-mode argv is wrapped in the pty_resize helper (issue #82
workaround); we assert both the wrapper presence and the wrapped
smolvm argv shape. Non-TTY mode skips the wrapper.
"""
from __future__ import annotations
import sys
import unittest
from claude_bottle.backend.smolmachines import pty_resize as _pty_resize
from claude_bottle.backend.smolmachines.bottle import SmolmachinesBottle
@@ -22,9 +28,32 @@ def _bottle(prompt_path: str | None = None, **env: str) -> SmolmachinesBottle:
)
class TestClaudeArgv(unittest.TestCase):
def test_minimal_argv_no_prompt(self):
def _unwrap(argv: list[str]) -> list[str]:
"""Strip the pty_resize wrapper from the front of a TTY-mode
argv, return the inner smolvm argv. Mirrors what the kernel
sees inside the wrapper's `subprocess.Popen`."""
idx = argv.index("--")
return argv[idx + 1:]
class TestClaudeArgvWrapped(unittest.TestCase):
"""TTY-mode argv: pty_resize wrapper + inner smolvm exec."""
def test_pty_resize_wrapper_prefix(self):
argv = _bottle().claude_argv([])
# Absolute script path (not `-m <dotted>`) so the tmux
# pane's cwd doesn't matter — see the `_PTY_RESIZE_SCRIPT`
# docstring in bottle.py.
self.assertEqual(
[
sys.executable, _pty_resize.__file__,
"claude-bottle-dev-abc", "--",
],
argv[:4],
)
def test_minimal_inner_argv_no_prompt(self):
argv = _unwrap(_bottle().claude_argv([]))
self.assertEqual(
[
"smolvm", "machine", "exec", "--name",
@@ -40,19 +69,19 @@ class TestClaudeArgv(unittest.TestCase):
)
def test_appends_passed_args_after_claude(self):
argv = _bottle().claude_argv(
argv = _unwrap(_bottle().claude_argv(
["--dangerously-skip-permissions", "--continue"],
)
# The claude tail is at the end of the argv, after the
# `runuser -u node --` switch.
))
self.assertEqual(
["claude", "--dangerously-skip-permissions", "--continue"],
argv[argv.index("claude"):],
)
def test_appends_prompt_file_flag_when_set(self):
argv = _bottle("/home/node/.claude-bottle-prompt.txt").claude_argv(
["--dangerously-skip-permissions"],
argv = _unwrap(
_bottle("/home/node/.claude-bottle-prompt.txt").claude_argv(
["--dangerously-skip-permissions"],
)
)
self.assertEqual(
[
@@ -72,20 +101,12 @@ class TestClaudeArgv(unittest.TestCase):
argv = _bottle("").claude_argv(["--continue"])
self.assertNotIn("--append-system-prompt-file", argv)
def test_tty_false_drops_it_flags(self):
argv = _bottle().claude_argv([], tty=False)
self.assertNotIn("-i", argv)
self.assertNotIn("-t", argv)
def test_guest_env_forwarded_as_e_flags(self):
argv = _bottle(
argv = _unwrap(_bottle(
None,
HTTPS_PROXY="http://127.0.0.1:1234",
NO_PROXY="localhost",
).claude_argv([])
# `-e K=V` pairs land before the `--`. Order isn't
# guaranteed across dict iterations on older Pythons, but
# both must appear.
).claude_argv([]))
self.assertIn("-e", argv)
self.assertIn("HTTPS_PROXY=http://127.0.0.1:1234", argv)
self.assertIn("NO_PROXY=localhost", argv)
@@ -103,5 +124,20 @@ class TestClaudeArgv(unittest.TestCase):
)
class TestClaudeArgvNoTTY(unittest.TestCase):
    """With `tty=False` the argv is the bare smolvm command: no
    PTY is allocated, so there is no SIGWINCH to bridge and the
    pty_resize wrapper is left out."""

    def test_no_wrapper_when_tty_false(self):
        plain = _bottle().claude_argv([], tty=False)
        self.assertEqual("smolvm", plain[0])
        wrapper_args = [arg for arg in plain if "pty_resize" in arg]
        self.assertFalse(wrapper_args)

    def test_tty_false_drops_it_flags(self):
        plain = _bottle().claude_argv([], tty=False)
        for flag in ("-i", "-t"):
            self.assertNotIn(flag, plain)
if __name__ == "__main__":
unittest.main()
+164
View File
@@ -0,0 +1,164 @@
"""Unit: smolmachines pty_resize bridge (issue #82).
Locks down the parts of the wrapper we can test without spawning
real children or signalling — argument parsing, the side-channel
`smolvm machine exec` argv shape, and TTY-resolution fallback
across stdin/stdout/stderr.
"""
from __future__ import annotations
import io
import unittest
import unittest.mock
from unittest.mock import patch
from claude_bottle.backend.smolmachines import pty_resize
class TestPushSize(unittest.TestCase):
    """Shape and stream wiring of the side-channel
    `smolvm machine exec` that `_push_size` launches."""

    def test_emits_for_loop_over_all_pts_devices(self):
        # The shell `for f in /dev/pts/*` handles multiple
        # interactive sessions in the same VM (rare but cheap).
        # Per-PTY `stty -F ... 2>/dev/null` swallows EBADF when a
        # session has already exited.
        with patch.object(pty_resize.subprocess, "run") as run:
            pty_resize._push_size("claude-bottle-m", 50, 200)
        argv = run.call_args.args[0]
        self.assertEqual(
            ["smolvm", "machine", "exec", "--name",
             "claude-bottle-m", "--", "sh", "-c"],
            argv[:8],
        )
        # cols / rows land in the order stty wants them.
        self.assertIn("cols 200", argv[8])
        self.assertIn("rows 50", argv[8])
        self.assertIn("for f in /dev/pts/*", argv[8])

    def test_side_channel_uses_devnull_stdin(self):
        # Load-bearing regression: under tmux, inheriting the
        # pane PTY as the side-channel's stdin makes smolvm crash
        # within ~100ms (concurrent smolvm processes sharing the
        # PTY's FG-PG / input plumbing). DEVNULL stdin sidesteps
        # the interaction.
        with patch.object(pty_resize.subprocess, "run") as run:
            pty_resize._push_size("claude-bottle-m", 24, 80)
        self.assertEqual(
            pty_resize.subprocess.DEVNULL,
            run.call_args.kwargs.get("stdin"),
        )

    def test_side_channel_is_silent_and_unchecked(self):
        # A resize that exits non-zero must neither raise
        # (`check=False`) nor write into the operator's terminal
        # (stdout / stderr go to DEVNULL).
        with patch.object(pty_resize.subprocess, "run") as run:
            pty_resize._push_size("claude-bottle-m", 24, 80)
        kwargs = run.call_args.kwargs
        self.assertIs(False, kwargs.get("check"))
        self.assertEqual(pty_resize.subprocess.DEVNULL, kwargs.get("stdout"))
        self.assertEqual(pty_resize.subprocess.DEVNULL, kwargs.get("stderr"))

    def test_spawn_failure_propagates_oserror(self):
        # `check=False` only covers a non-zero exit status. If the
        # side-channel can't be launched at all, `subprocess.run`
        # raises OSError and `_push_size` does not catch it. This
        # pins that contract so callers know the error reaches them.
        with patch.object(
            pty_resize.subprocess, "run",
            side_effect=OSError("boom"),
        ):
            with self.assertRaises(OSError):
                pty_resize._push_size("m", 24, 80)
class TestReadWinsize(unittest.TestCase):
    """TTY-resolution fallback across stdin / stdout / stderr.

    The three standard streams are replaced with stand-ins that
    report fixed descriptors, so the tests behave the same under
    any runner (including ones that swap `sys.stdin` for an object
    with no usable `fileno()`) and never touch a real terminal."""

    # Descriptor numbers are arbitrary; they only need to be
    # distinct so the fake ioctl can tell the streams apart.
    STDIN_FD, STDOUT_FD, STDERR_FD = 100, 101, 102

    def setUp(self):
        fake_streams = (
            ("stdin", self.STDIN_FD),
            ("stdout", self.STDOUT_FD),
            ("stderr", self.STDERR_FD),
        )
        for name, fd in fake_streams:
            stream = unittest.mock.MagicMock()
            stream.fileno.return_value = fd
            patcher = patch.object(pty_resize.sys, name, new=stream)
            patcher.start()
            self.addCleanup(patcher.stop)

    def test_returns_none_when_no_tty(self):
        # ioctl failing on every descriptor simulates the case
        # where none of stdin/stdout/stderr is a TTY (e.g., tests,
        # piped automation).
        with patch.object(
            pty_resize.fcntl, "ioctl",
            side_effect=OSError("ENOTTY"),
        ):
            self.assertIsNone(pty_resize._read_winsize())

    def test_returns_first_tty_size(self):
        # First fd that responds with a non-zero size wins, and
        # probing stops there — stderr is never consulted.
        import struct
        calls: list[int] = []

        def fake_ioctl(fd, req, buf):
            calls.append(fd)
            if fd == self.STDIN_FD:
                raise OSError("stdin not a tty")
            return struct.pack("hhhh", 42, 137, 0, 0)

        with patch.object(pty_resize.fcntl, "ioctl", side_effect=fake_ioctl):
            self.assertEqual((42, 137), pty_resize._read_winsize())
        self.assertEqual([self.STDIN_FD, self.STDOUT_FD], calls)

    def test_skips_zero_sizes(self):
        # A TTY that reports `0 0` (the smolvm-allocated PTY's
        # initial state, ironically) shouldn't be used as the
        # source of truth — keep probing fallback fds.
        import struct
        by_fd = {
            self.STDIN_FD: struct.pack("hhhh", 0, 0, 0, 0),    # zero
            self.STDOUT_FD: struct.pack("hhhh", 24, 80, 0, 0),  # real
        }

        def fake_ioctl(fd, req, buf):
            return by_fd[fd]

        with patch.object(pty_resize.fcntl, "ioctl", side_effect=fake_ioctl):
            self.assertEqual((24, 80), pty_resize._read_winsize())
class TestMainArgvParsing(unittest.TestCase):
    """`main` rejects any argv that isn't `<machine> -- <argv...>`
    with exit code 2 and a usage line on stderr."""

    def test_missing_separator_returns_error_exit_code(self):
        # No `--` between machine name and inner argv.
        captured = io.StringIO()
        with patch.object(pty_resize.sys, "stderr", new=captured):
            status = pty_resize.main(["claude-bottle-m", "smolvm", "machine"])
        self.assertEqual(2, status)
        self.assertIn("usage:", captured.getvalue())

    def test_too_few_args_returns_error_exit_code(self):
        # Empty argv, machine only, and machine + bare separator
        # all fall short of the three-element minimum.
        short_argvs = ([], ["m"], ["m", "--"])
        with patch.object(pty_resize.sys, "stderr", new=io.StringIO()):
            for short in short_argvs:
                self.assertEqual(2, pty_resize.main(short))
class TestStartupSyncDeferred(unittest.TestCase):
    """Regression: the initial sync MUST be deferred (timer), not
    called synchronously between Popen + wait. Calling it
    immediately races libkrun's per-exec OCI config write during
    the main exec's bringup and crashes the child (rc=137 or
    'parse error: trailing garbage')."""

    def test_main_schedules_timer_does_not_call_sync_synchronously(self):
        # Fake Popen + wait so main returns immediately. Patch
        # Timer to record args without spawning a real thread.
        # Patch signal.signal so main doesn't leave a real SIGWINCH
        # handler installed in the test process. _push_size is
        # patched so any rogue synchronous call would be
        # observable.
        fake_proc = unittest.mock.MagicMock()
        fake_proc.wait.return_value = 0
        with patch.object(
            pty_resize.subprocess, "Popen", return_value=fake_proc,
        ) as popen, patch.object(
            pty_resize.threading, "Timer",
        ) as timer_cls, patch.object(
            pty_resize.signal, "signal",
        ) as install_handler, patch.object(
            pty_resize, "_push_size",
        ) as push:
            rc = pty_resize.main(["machine-name", "--", "echo", "hi"])
        self.assertEqual(0, rc)
        # Everything past `--` is forwarded verbatim to the child.
        popen.assert_called_once_with(["echo", "hi"])
        # A handler is registered for host-side resizes.
        install_handler.assert_called_once()
        self.assertEqual(
            pty_resize.signal.SIGWINCH,
            install_handler.call_args.args[0],
        )
        # Timer scheduled with the documented delay constant, and
        # actually started.
        timer_cls.assert_called_once()
        delay, callback = timer_cls.call_args.args
        self.assertEqual(pty_resize._STARTUP_SYNC_DELAY_SEC, delay)
        self.assertTrue(callable(callback))
        timer_cls.return_value.start.assert_called_once_with()
        # _push_size never called synchronously — the only path to
        # it is via the (mocked) timer's callback firing.
        push.assert_not_called()
if __name__ == "__main__":
unittest.main()