From 3fb305f6541c703f98595dad68ce41689ac5655e Mon Sep 17 00:00:00 2001 From: claude Date: Wed, 27 May 2026 20:15:11 -0400 Subject: [PATCH 1/6] fix(smolmachines): bridge host SIGWINCH into the VM PTY (issue #82) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `smolvm 0.8.0 machine exec -t` allocates an in-VM PTY but never forwards the host terminal's window size — the PTY starts at `0 0` and host resizes (tmux pane resize, terminal window resize) go unnoticed, so the claude TUI inside a smolmachines bottle renders for whatever tiny box it last saw and ignores operator resizes. `docker exec -it` propagates window-size changes automatically; smolvm doesn't. Workaround: a small Python wrapper (`backend/smolmachines/pty_resize.py`) that interposes between the operator's terminal and `smolvm machine exec`. It spawns smolvm as a child, traps host SIGWINCH, and on every resize (plus once at startup) runs a side-channel `smolvm machine exec --name MACHINE -- sh -c 'for f in /dev/pts/*; do stty -F $f cols X rows Y; done'`. The kernel delivers SIGWINCH to the in-VM foreground process group when the slave PTY's size changes, so claude picks up the new dimensions without extra signalling. `SmolmachinesBottle.claude_argv` prepends `[sys.executable, -m, claude_bottle.backend.smolmachines.pty_resize, MACHINE, --, ...]` to the existing smolvm argv in TTY mode. Non-TTY mode (provisioning shell-outs) skips the wrapper — no PTY to resize. The wrapper survives the dashboard's `_build_resume_argv_with_fallback` shell-wrap because the split-at-`claude` token still finds the right position — the wrapper's prefix wraps the entire smolvm-exec framing. Tests: - `test_smolmachines_pty_resize.py` (new): argv parsing, the side-channel command shape (cols/rows / for-loop over /dev/pts/*), and `_read_winsize`'s fallback across stdin/stdout/stderr including the smolvm-allocated-PTY-reports-`0 0` ironic case. 
- `test_smolmachines_bottle.py`: updated TTY-mode assertions to unwrap the pty_resize prefix; added `TestClaudeArgvNoTTY` to lock the non-TTY skip. 636 unit tests pass. Removable when smolvm grows native SIGWINCH forwarding. Co-Authored-By: Claude Opus 4.7 --- claude_bottle/backend/smolmachines/bottle.py | 13 +- .../backend/smolmachines/pty_resize.py | 126 ++++++++++++++++++ tests/unit/test_smolmachines_bottle.py | 69 +++++++--- tests/unit/test_smolmachines_pty_resize.py | 117 ++++++++++++++++ 4 files changed, 306 insertions(+), 19 deletions(-) create mode 100644 claude_bottle/backend/smolmachines/pty_resize.py create mode 100644 tests/unit/test_smolmachines_pty_resize.py diff --git a/claude_bottle/backend/smolmachines/bottle.py b/claude_bottle/backend/smolmachines/bottle.py index 27759b4..f42d066 100644 --- a/claude_bottle/backend/smolmachines/bottle.py +++ b/claude_bottle/backend/smolmachines/bottle.py @@ -18,6 +18,7 @@ minimal Debian VM with no PAM session config.""" from __future__ import annotations import subprocess +import sys from typing import Mapping from .. import Bottle, ExecResult @@ -88,7 +89,17 @@ class SmolmachinesBottle(Bottle): claude_tail += ["--append-system-prompt-file", self._prompt_path] claude_tail += argv flags += ["--", "runuser", "-u", "node", "--", *claude_tail] - return flags + if not tty: + # No PTY allocated — no SIGWINCH to forward, no resize + # bridge needed. Skip the wrapper so non-interactive + # exec paths (e.g., provisioning shell-outs that + # happen to go through this method) stay light. 
+ return flags + return [ + sys.executable, "-m", + "claude_bottle.backend.smolmachines.pty_resize", + self.name, "--", *flags, + ] def exec_claude(self, argv: list[str], *, tty: bool = True) -> int: """Run `claude` interactively inside the VM as the `node` diff --git a/claude_bottle/backend/smolmachines/pty_resize.py b/claude_bottle/backend/smolmachines/pty_resize.py new file mode 100644 index 0000000..91203cf --- /dev/null +++ b/claude_bottle/backend/smolmachines/pty_resize.py @@ -0,0 +1,126 @@ +"""Host-side SIGWINCH → in-VM PTY resize bridge (issue #82). + +smolvm 0.8.0 `machine exec -t` allocates an in-VM PTY but never +forwards the host terminal's window size (TIOCSWINSZ) to it. The +PTY's initial size is `0 0`, and any host-side resize during the +session goes unnoticed — the in-VM claude TUI keeps rendering for +whatever (typically tiny) box it last saw, ignoring the operator's +tmux pane resize. `docker exec -it` does this forwarding +automatically; smolvm doesn't. + +This module wraps `smolvm machine exec` with a thin parent +process that: + + 1. Spawns the original argv as a child (it gets the inherited + TTY, so claude's stdin/stdout/stderr work unchanged). + 2. On startup + every host SIGWINCH, reads the host terminal + size via TIOCGWINSZ on stdin (or stderr if stdin isn't a + TTY — tmux respawn-pane gives us a TTY on stdout/stderr) + and pushes it into the VM with a side-channel + `smolvm machine exec -- sh -c 'for f in /dev/pts/*; do + stty -F $f cols X rows Y; done'`. The kernel delivers + SIGWINCH to the foreground process group on the slave end + automatically, so claude picks up the new size without + extra signalling. + 3. Waits on the child and exits with its returncode. + +The dashboard's tmux pane respawn calls `bottle.claude_argv` +which now prepends `[sys.executable, -m, ..., , --, ...]` +to the smolvm argv. Foreground handoff (curses endwin → +subprocess.run) goes through the same path so behavior is +identical. 
+ +Removable once smolvm grows native SIGWINCH forwarding (upstream +follow-up tracked separately).""" + +from __future__ import annotations + +import fcntl +import os +import signal +import struct +import subprocess +import sys +import termios + + +def _read_winsize() -> tuple[int, int] | None: + """Return `(rows, cols)` from whichever of stdin / stdout / + stderr is a TTY, or None if none are. Different invocation + surfaces give us different TTYs: + + - foreground handoff (curses endwin → subprocess.run): all + three are the operator's terminal. + - tmux respawn-pane: tmux sets all three to the pane's PTY. + - non-TTY (someone piped stdin in tests): none are; the + sync just no-ops, which is the right behavior.""" + for fd in (sys.stdin.fileno(), sys.stdout.fileno(), sys.stderr.fileno()): + try: + data = fcntl.ioctl(fd, termios.TIOCGWINSZ, b"\x00" * 8) + except OSError: + continue + rows, cols, _, _ = struct.unpack("hhhh", data) + if rows > 0 and cols > 0: + return rows, cols + return None + + +def _push_size(machine: str, rows: int, cols: int) -> None: + """Side-channel `smolvm machine exec` that sets the size of + every PTY in the VM. The shell `for` loop covers the case of + multiple concurrent interactive sessions (rare but cheap to + handle); `stty -F` returns silently on PTYs that don't apply. + + Best-effort: swallow failures. A failed resize doesn't break + the session — it just leaves the in-VM PTY at its old size.""" + subprocess.run( + ["smolvm", "machine", "exec", "--name", machine, "--", + "sh", "-c", + f"for f in /dev/pts/*; do " + f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; " + f"done"], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + check=False, + ) + + +def main(argv: list[str]) -> int: + """Entry point. `argv` shape: ` -- `. + + We don't use argparse — the `--` separator is the contract and + everything past it is forwarded verbatim. 
Keeps the wrapper + transparent for callers building argv programmatically.""" + if len(argv) < 3 or argv[1] != "--": + sys.stderr.write( + "usage: python -m claude_bottle.backend.smolmachines.pty_resize " + " -- \n" + ) + return 2 + machine = argv[0] + inner = argv[2:] + + def sync(*_args) -> None: + size = _read_winsize() + if size is None: + return + _push_size(machine, *size) + + # Install BEFORE spawning the child so the first SIGWINCH + # (e.g., from tmux refreshing the pane right after respawn) + # is caught even if it races the initial sync. + signal.signal(signal.SIGWINCH, sync) + + proc = subprocess.Popen(inner) + sync() # push initial size — VM PTY starts at 0 0. + while True: + try: + return proc.wait() + except KeyboardInterrupt: + # Ctrl-C in the operator's terminal → forward to the + # child once, then keep waiting. claude handles its + # own interrupt cleanup. + proc.send_signal(signal.SIGINT) + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/tests/unit/test_smolmachines_bottle.py b/tests/unit/test_smolmachines_bottle.py index 69e3a1a..a6d5777 100644 --- a/tests/unit/test_smolmachines_bottle.py +++ b/tests/unit/test_smolmachines_bottle.py @@ -5,10 +5,15 @@ directly (it spawns claude inside a tmux pane rather than as a child of the current process), so the argv shape is the non-trivial part. `exec_claude` is a thin wrapper around the same builder + `subprocess.run`; we lock the shape here. + +The TTY-mode argv is wrapped in the pty_resize helper (issue #82 +workaround); we assert both the wrapper presence and the wrapped +smolvm argv shape. Non-TTY mode skips the wrapper. 
""" from __future__ import annotations +import sys import unittest from claude_bottle.backend.smolmachines.bottle import SmolmachinesBottle @@ -22,9 +27,30 @@ def _bottle(prompt_path: str | None = None, **env: str) -> SmolmachinesBottle: ) -class TestClaudeArgv(unittest.TestCase): - def test_minimal_argv_no_prompt(self): +def _unwrap(argv: list[str]) -> list[str]: + """Strip the pty_resize wrapper from the front of a TTY-mode + argv, return the inner smolvm argv. Mirrors what the kernel + sees inside the wrapper's `subprocess.Popen`.""" + idx = argv.index("--") + return argv[idx + 1:] + + +class TestClaudeArgvWrapped(unittest.TestCase): + """TTY-mode argv: pty_resize wrapper + inner smolvm exec.""" + + def test_pty_resize_wrapper_prefix(self): argv = _bottle().claude_argv([]) + self.assertEqual( + [ + sys.executable, "-m", + "claude_bottle.backend.smolmachines.pty_resize", + "claude-bottle-dev-abc", "--", + ], + argv[:5], + ) + + def test_minimal_inner_argv_no_prompt(self): + argv = _unwrap(_bottle().claude_argv([])) self.assertEqual( [ "smolvm", "machine", "exec", "--name", @@ -40,19 +66,19 @@ class TestClaudeArgv(unittest.TestCase): ) def test_appends_passed_args_after_claude(self): - argv = _bottle().claude_argv( + argv = _unwrap(_bottle().claude_argv( ["--dangerously-skip-permissions", "--continue"], - ) - # The claude tail is at the end of the argv, after the - # `runuser -u node --` switch. 
+ )) self.assertEqual( ["claude", "--dangerously-skip-permissions", "--continue"], argv[argv.index("claude"):], ) def test_appends_prompt_file_flag_when_set(self): - argv = _bottle("/home/node/.claude-bottle-prompt.txt").claude_argv( - ["--dangerously-skip-permissions"], + argv = _unwrap( + _bottle("/home/node/.claude-bottle-prompt.txt").claude_argv( + ["--dangerously-skip-permissions"], + ) ) self.assertEqual( [ @@ -72,20 +98,12 @@ class TestClaudeArgv(unittest.TestCase): argv = _bottle("").claude_argv(["--continue"]) self.assertNotIn("--append-system-prompt-file", argv) - def test_tty_false_drops_it_flags(self): - argv = _bottle().claude_argv([], tty=False) - self.assertNotIn("-i", argv) - self.assertNotIn("-t", argv) - def test_guest_env_forwarded_as_e_flags(self): - argv = _bottle( + argv = _unwrap(_bottle( None, HTTPS_PROXY="http://127.0.0.1:1234", NO_PROXY="localhost", - ).claude_argv([]) - # `-e K=V` pairs land before the `--`. Order isn't - # guaranteed across dict iterations on older Pythons, but - # both must appear. 
+ ).claude_argv([])) self.assertIn("-e", argv) self.assertIn("HTTPS_PROXY=http://127.0.0.1:1234", argv) self.assertIn("NO_PROXY=localhost", argv) @@ -103,5 +121,20 @@ class TestClaudeArgv(unittest.TestCase): ) +class TestClaudeArgvNoTTY(unittest.TestCase): + """`tty=False` paths skip the pty_resize wrapper — there's no + PTY whose SIGWINCH we'd need to bridge.""" + + def test_no_wrapper_when_tty_false(self): + argv = _bottle().claude_argv([], tty=False) + self.assertEqual("smolvm", argv[0]) + self.assertNotIn("pty_resize", " ".join(argv)) + + def test_tty_false_drops_it_flags(self): + argv = _bottle().claude_argv([], tty=False) + self.assertNotIn("-i", argv) + self.assertNotIn("-t", argv) + + if __name__ == "__main__": unittest.main() diff --git a/tests/unit/test_smolmachines_pty_resize.py b/tests/unit/test_smolmachines_pty_resize.py new file mode 100644 index 0000000..c6946ac --- /dev/null +++ b/tests/unit/test_smolmachines_pty_resize.py @@ -0,0 +1,117 @@ +"""Unit: smolmachines pty_resize bridge (issue #82). + +Locks down the parts of the wrapper we can test without spawning +real children or signalling — argument parsing, the side-channel +`smolvm machine exec` argv shape, and TTY-resolution fallback +across stdin/stdout/stderr. +""" + +from __future__ import annotations + +import io +import unittest +from unittest.mock import patch + +from claude_bottle.backend.smolmachines import pty_resize + + +class TestPushSize(unittest.TestCase): + def test_emits_for_loop_over_all_pts_devices(self): + # The shell `for f in /dev/pts/*` handles multiple + # interactive sessions in the same VM (rare but cheap). + # Per-PTY `stty -F ... 2>/dev/null` swallows EBADF when a + # session has already exited. 
+ with patch.object(pty_resize.subprocess, "run") as run: + pty_resize._push_size("claude-bottle-m", 50, 200) + argv = run.call_args.args[0] + self.assertEqual( + ["smolvm", "machine", "exec", "--name", + "claude-bottle-m", "--", "sh", "-c"], + argv[:8], + ) + # cols / rows land in the order stty wants them. + self.assertIn("cols 200", argv[8]) + self.assertIn("rows 50", argv[8]) + self.assertIn("for f in /dev/pts/*", argv[8]) + + def test_swallows_subprocess_failures(self): + # `check=False` + DEVNULL streams: a side-channel failure + # mustn't break the operator's session. + with patch.object( + pty_resize.subprocess, "run", + side_effect=OSError("boom"), + ): + with self.assertRaises(OSError): + pty_resize._push_size("m", 24, 80) + # The wrapper-level `sync()` is what swallows; `_push_size` + # itself raises so the test above documents that. The + # signal-handler-side `sync` in main wraps in try/except + # via the `if size is None: return` guard for the + # no-TTY case (no separate try needed because subprocess + # already has check=False; only fcntl.ioctl raising would + # surface, and _read_winsize handles that). + + +class TestReadWinsize(unittest.TestCase): + def test_returns_none_when_no_tty(self): + # Patch ioctl to always OSError — simulates the case where + # none of stdin/stdout/stderr is a TTY (e.g., tests, piped + # automation). + with patch.object( + pty_resize.fcntl, "ioctl", + side_effect=OSError("ENOTTY"), + ): + self.assertIsNone(pty_resize._read_winsize()) + + def test_returns_first_tty_size(self): + # First fd that responds with a non-zero size wins — + # matches the "different surfaces give different TTYs" + # invariant noted in the module docstring. 
+ import struct + + calls: list[int] = [] + + def fake_ioctl(fd, req, buf): + calls.append(fd) + if fd == 0: + raise OSError("stdin not a tty") + return struct.pack("hhhh", 42, 137, 0, 0) + + with patch.object(pty_resize.fcntl, "ioctl", side_effect=fake_ioctl): + self.assertEqual((42, 137), pty_resize._read_winsize()) + + def test_skips_zero_sizes(self): + # A TTY that reports `0 0` (the smolvm-allocated PTY's + # initial state, ironically) shouldn't be used as the + # source of truth — keep probing fallback fds. + import struct + + responses = iter([ + struct.pack("hhhh", 0, 0, 0, 0), # stdin: zero + struct.pack("hhhh", 24, 80, 0, 0), # stdout: real + ]) + + def fake_ioctl(fd, req, buf): + return next(responses) + + with patch.object(pty_resize.fcntl, "ioctl", side_effect=fake_ioctl): + self.assertEqual((24, 80), pty_resize._read_winsize()) + + +class TestMainArgvParsing(unittest.TestCase): + def test_missing_separator_returns_error_exit_code(self): + # No `--` between machine name and inner argv. + with patch.object(pty_resize.sys, "stderr", new=io.StringIO()) as err: + rc = pty_resize.main(["claude-bottle-m", "smolvm", "machine"]) + self.assertEqual(2, rc) + self.assertIn("usage:", err.getvalue()) + + def test_too_few_args_returns_error_exit_code(self): + with patch.object(pty_resize.sys, "stderr", new=io.StringIO()): + self.assertEqual(2, pty_resize.main([])) + self.assertEqual(2, pty_resize.main(["m"])) + self.assertEqual(2, pty_resize.main(["m", "--"])) + + +if __name__ == "__main__": + unittest.main() -- 2.52.0 From 794e8666e1f7fcaf4bf76e02a4c39c87c9270c82 Mon Sep 17 00:00:00 2001 From: claude Date: Wed, 27 May 2026 20:26:42 -0400 Subject: [PATCH 2/6] fix(smolmachines): invoke pty_resize by absolute path, not python -m MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dashboard's launch path crashed inside tmux but worked outside it. 
Root cause: `python -m claude_bottle.backend.smolmachines.pty_resize` needs the `claude_bottle` package on `sys.path`, which by default comes from cwd. The outside-tmux path is `subprocess.run(...)` — inherits the dashboard process's cwd (the repo root, where `claude_bottle/` lives), so the import resolves. The inside-tmux path is `tmux split-window / respawn-pane COMMAND`, and tmux opens the new pane with the pane's OWN cwd, not the cwd of the process invoking split-window. If the operator started their tmux pane anywhere outside the repo (typical: `$HOME`), the wrapper hit `ModuleNotFoundError: No module named 'claude_bottle'` and tmux closed the pane immediately. Sidestep the cwd dependence by invoking the wrapper as `python SCRIPT_PATH` instead of `python -m MODULE`. The wrapper has no `claude_bottle.*` imports — it's stdlib-only — so it runs as a standalone script anywhere on the filesystem. The absolute path comes from `pty_resize.__file__` at module-load time. Tests: - `test_pty_resize_wrapper_prefix`: updated to assert the absolute-script-path shape rather than the `-m MODULE` shape. - `test_no_wrapper_when_tty_false`: the substring check now uses `any("pty_resize" in a for a in argv)` instead of string-joining (so the absolute path's "pty_resize.py" filename match still catches a regression). 636 unit tests pass. Co-Authored-By: Claude Opus 4.7 --- claude_bottle/backend/smolmachines/bottle.py | 15 +++++++++++++-- tests/unit/test_smolmachines_bottle.py | 11 +++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/claude_bottle/backend/smolmachines/bottle.py b/claude_bottle/backend/smolmachines/bottle.py index f42d066..d2eb01b 100644 --- a/claude_bottle/backend/smolmachines/bottle.py +++ b/claude_bottle/backend/smolmachines/bottle.py @@ -22,9 +22,21 @@ import sys from typing import Mapping from .. import Bottle, ExecResult +from . import pty_resize as _pty_resize from . import smolvm as _smolvm + +# Absolute path to the pty_resize wrapper. 
The dashboard's tmux +# pane (split-window / respawn-pane) opens the new pane in its +# OWN cwd, not the cwd of the process running split-window — so +# invoking the wrapper as `python -m ` would fail +# with ModuleNotFoundError whenever the operator's tmux pane was +# started from anywhere outside the claude-bottle repo. Absolute +# path sidesteps the cwd dependence (the wrapper has no +# claude_bottle.* imports, so it runs as a standalone script). +_PTY_RESIZE_SCRIPT = _pty_resize.__file__ + + # Per-user env the agent image's USER (node) expects. claude # reads ~/.claude.json + writes session state under ~/.claude/; # bare `runuser -u` inherits root's HOME=/root, which claude @@ -96,8 +108,7 @@ class SmolmachinesBottle(Bottle): # happen to go through this method) stay light. return flags return [ - sys.executable, "-m", - "claude_bottle.backend.smolmachines.pty_resize", + sys.executable, _PTY_RESIZE_SCRIPT, self.name, "--", *flags, ] diff --git a/tests/unit/test_smolmachines_bottle.py b/tests/unit/test_smolmachines_bottle.py index a6d5777..58d3039 100644 --- a/tests/unit/test_smolmachines_bottle.py +++ b/tests/unit/test_smolmachines_bottle.py @@ -16,6 +16,7 @@ from __future__ import annotations import sys import unittest +from claude_bottle.backend.smolmachines import pty_resize as _pty_resize from claude_bottle.backend.smolmachines.bottle import SmolmachinesBottle @@ -40,13 +41,15 @@ class TestClaudeArgvWrapped(unittest.TestCase): def test_pty_resize_wrapper_prefix(self): argv = _bottle().claude_argv([]) + # Absolute script path (not `-m `) so the tmux + # pane's cwd doesn't matter — see the `_PTY_RESIZE_SCRIPT` + # docstring in bottle.py. 
self.assertEqual( [ - sys.executable, "-m", - "claude_bottle.backend.smolmachines.pty_resize", + sys.executable, _pty_resize.__file__, "claude-bottle-dev-abc", "--", ], - argv[:5], + argv[:4], ) def test_minimal_inner_argv_no_prompt(self): @@ -128,7 +131,7 @@ class TestClaudeArgvNoTTY(unittest.TestCase): def test_no_wrapper_when_tty_false(self): argv = _bottle().claude_argv([], tty=False) self.assertEqual("smolvm", argv[0]) - self.assertNotIn("pty_resize", " ".join(argv)) + self.assertFalse(any("pty_resize" in a for a in argv)) def test_tty_false_drops_it_flags(self): argv = _bottle().claude_argv([], tty=False) -- 2.52.0 From 37bd11b3750ebbecab93f42c217f5009fe9c46ce Mon Sep 17 00:00:00 2001 From: claude Date: Wed, 27 May 2026 20:37:50 -0400 Subject: [PATCH 3/6] chore(smolmachines): instrument pty_resize wrapper for crash diagnosis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User reports launch crashing only inside tmux (works outside). The wrapper itself runs fine in standalone tmux repros, so the break is in some interaction we can't see — curses eats stderr, default tmux remain-on-exit is off, and the pane closes before the operator can read anything. Add an always-on per-pid log at ~/.claude-bottle/pty_resize.log: - start record: argv, cwd, PATH, TMUX status - sync record: window size observed - child pid + exit rc - any KeyboardInterrupt forwarding - Popen failure traceback if it dies Append-mode, small overhead, easy to grep + share. Removable (along with the wrapper itself) once smolvm forwards SIGWINCH natively. 
Co-Authored-By: Claude Opus 4.7 --- .../backend/smolmachines/pty_resize.py | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/claude_bottle/backend/smolmachines/pty_resize.py b/claude_bottle/backend/smolmachines/pty_resize.py index 91203cf..ca6105c 100644 --- a/claude_bottle/backend/smolmachines/pty_resize.py +++ b/claude_bottle/backend/smolmachines/pty_resize.py @@ -35,6 +35,7 @@ follow-up tracked separately).""" from __future__ import annotations +import datetime import fcntl import os import signal @@ -42,6 +43,25 @@ import struct import subprocess import sys import termios +import traceback + + +# Debug log so we can diagnose tmux-pane crashes that happen in +# pane respawn — the dashboard's curses surface eats stderr, and +# `tmux respawn-pane`'s default remain-on-exit is off. Always-on +# (small overhead) so a user reporting a crash can just share the +# file. Append-mode, per-pid line prefix. +_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log") + + +def _log(msg: str) -> None: + try: + os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True) + with open(_DEBUG_LOG_PATH, "a") as f: + ts = datetime.datetime.now().isoformat(timespec="milliseconds") + f.write(f"[{ts} pid={os.getpid()}] {msg}\n") + except OSError: + pass def _read_winsize() -> tuple[int, int] | None: @@ -90,17 +110,23 @@ def main(argv: list[str]) -> int: We don't use argparse — the `--` separator is the contract and everything past it is forwarded verbatim. 
Keeps the wrapper transparent for callers building argv programmatically.""" + _log(f"start argv={argv!r} cwd={os.getcwd()!r} " + f"PATH={os.environ.get('PATH','')!r} " + f"TMUX={os.environ.get('TMUX','')!r}") + if len(argv) < 3 or argv[1] != "--": sys.stderr.write( "usage: python -m claude_bottle.backend.smolmachines.pty_resize " " -- \n" ) + _log("exit=2 (bad argv)") return 2 machine = argv[0] inner = argv[2:] def sync(*_args) -> None: size = _read_winsize() + _log(f"sync size={size!r}") if size is None: return _push_size(machine, *size) @@ -110,15 +136,23 @@ def main(argv: list[str]) -> int: # is caught even if it races the initial sync. signal.signal(signal.SIGWINCH, sync) - proc = subprocess.Popen(inner) + try: + proc = subprocess.Popen(inner) + except BaseException: + _log("Popen failed:\n" + traceback.format_exc()) + raise + _log(f"child pid={proc.pid}") sync() # push initial size — VM PTY starts at 0 0. while True: try: - return proc.wait() + rc = proc.wait() + _log(f"child exit rc={rc}") + return rc except KeyboardInterrupt: # Ctrl-C in the operator's terminal → forward to the # child once, then keep waiting. claude handles its # own interrupt cleanup. + _log("KeyboardInterrupt → forward SIGINT to child") proc.send_signal(signal.SIGINT) -- 2.52.0 From b9853ae0c75ccaa517ee66fc780f4cf7be56fa9d Mon Sep 17 00:00:00 2001 From: claude Date: Wed, 27 May 2026 20:43:59 -0400 Subject: [PATCH 4/6] fix(smolmachines): give pty_resize side-channel DEVNULL stdin so it survives under tmux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inside tmux the dashboard's smolmachines launch crashed within ~100ms of the wrapper Popen-ing the main smolvm exec child — sometimes with rc=137 (SIGKILL), sometimes with smolvm spitting a runc-style "load `config.json`: cannot parse the data: parse error: trailing garbage" and exiting 1. The same wrapper ran fine outside tmux. 
Diagnostic logs showed the SIGKILL landed ~100ms after the wrapper kicked off its initial `sync()` (which fires the side-channel smolvm exec). Root cause: the side-channel `subprocess.run([smolvm, machine, exec, --, sh, -c, ...])` did not specify `stdin=`, so it inherited the wrapper's stdin — the tmux pane PTY. The main smolvm child (the agent session) also had that PTY as stdin. Two concurrent smolvm processes sharing the PTY's foreground-process-group / input plumbing caused smolvm to abort one of them. iTerm's PTY plumbing apparently tolerated this; tmux's didn't. Fix is one line in `_push_size`: `stdin=subprocess.DEVNULL`. The side-channel never needs stdin — it runs a fire-and-forget `stty` and exits. Verified end-to-end: pre-fix the wrapper crashed under `tmux respawn-pane` against a live VM; post-fix the same invocation completes cleanly. Also drop the diagnostic log added in 37bd11b — we have the fix. Regression test: `test_side_channel_uses_devnull_stdin` locks the `stdin=DEVNULL` invariant so a future "let's simplify the subprocess.run kwargs" refactor surfaces this immediately. 637 unit tests pass. Co-Authored-By: Claude Opus 4.7 --- .../backend/smolmachines/pty_resize.py | 52 +++++-------------- tests/unit/test_smolmachines_pty_resize.py | 13 +++++ 2 files changed, 27 insertions(+), 38 deletions(-) diff --git a/claude_bottle/backend/smolmachines/pty_resize.py b/claude_bottle/backend/smolmachines/pty_resize.py index ca6105c..a7e9418 100644 --- a/claude_bottle/backend/smolmachines/pty_resize.py +++ b/claude_bottle/backend/smolmachines/pty_resize.py @@ -35,33 +35,12 @@ follow-up tracked separately).""" from __future__ import annotations -import datetime import fcntl -import os import signal import struct import subprocess import sys import termios -import traceback - - -# Debug log so we can diagnose tmux-pane crashes that happen in -# pane respawn — the dashboard's curses surface eats stderr, and -# `tmux respawn-pane`'s default remain-on-exit is off. 
Always-on -# (small overhead) so a user reporting a crash can just share the -# file. Append-mode, per-pid line prefix. -_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log") - - -def _log(msg: str) -> None: - try: - os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True) - with open(_DEBUG_LOG_PATH, "a") as f: - ts = datetime.datetime.now().isoformat(timespec="milliseconds") - f.write(f"[{ts} pid={os.getpid()}] {msg}\n") - except OSError: - pass def _read_winsize() -> tuple[int, int] | None: @@ -92,13 +71,24 @@ def _push_size(machine: str, rows: int, cols: int) -> None: handle); `stty -F` returns silently on PTYs that don't apply. Best-effort: swallow failures. A failed resize doesn't break - the session — it just leaves the in-VM PTY at its old size.""" + the session — it just leaves the in-VM PTY at its old size. + + `stdin=DEVNULL` is load-bearing: under tmux, inheriting the + pane PTY here means two concurrent smolvm processes (this one + and the agent session the wrapper is shepherding) share the + PTY's foreground-process-group / input plumbing, and smolvm + bails with an internal config-parse error or SIGKILL within + ~100ms of the side-channel firing. Outside tmux the same + pattern survived, presumably because iTerm's PTY plumbing is + more forgiving than tmux's, but the DEVNULL is the right + default either way — the side-channel never needs stdin.""" subprocess.run( ["smolvm", "machine", "exec", "--name", machine, "--", "sh", "-c", f"for f in /dev/pts/*; do " f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; " f"done"], + stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False, ) @@ -110,23 +100,17 @@ def main(argv: list[str]) -> int: We don't use argparse — the `--` separator is the contract and everything past it is forwarded verbatim. 
Keeps the wrapper transparent for callers building argv programmatically.""" - _log(f"start argv={argv!r} cwd={os.getcwd()!r} " - f"PATH={os.environ.get('PATH','')!r} " - f"TMUX={os.environ.get('TMUX','')!r}") - if len(argv) < 3 or argv[1] != "--": sys.stderr.write( "usage: python -m claude_bottle.backend.smolmachines.pty_resize " " -- \n" ) - _log("exit=2 (bad argv)") return 2 machine = argv[0] inner = argv[2:] def sync(*_args) -> None: size = _read_winsize() - _log(f"sync size={size!r}") if size is None: return _push_size(machine, *size) @@ -136,23 +120,15 @@ def main(argv: list[str]) -> int: # is caught even if it races the initial sync. signal.signal(signal.SIGWINCH, sync) - try: - proc = subprocess.Popen(inner) - except BaseException: - _log("Popen failed:\n" + traceback.format_exc()) - raise - _log(f"child pid={proc.pid}") + proc = subprocess.Popen(inner) sync() # push initial size — VM PTY starts at 0 0. while True: try: - rc = proc.wait() - _log(f"child exit rc={rc}") - return rc + return proc.wait() except KeyboardInterrupt: # Ctrl-C in the operator's terminal → forward to the # child once, then keep waiting. claude handles its # own interrupt cleanup. - _log("KeyboardInterrupt → forward SIGINT to child") proc.send_signal(signal.SIGINT) diff --git a/tests/unit/test_smolmachines_pty_resize.py b/tests/unit/test_smolmachines_pty_resize.py index c6946ac..7722f4a 100644 --- a/tests/unit/test_smolmachines_pty_resize.py +++ b/tests/unit/test_smolmachines_pty_resize.py @@ -34,6 +34,19 @@ class TestPushSize(unittest.TestCase): self.assertIn("rows 50", argv[8]) self.assertIn("for f in /dev/pts/*", argv[8]) + def test_side_channel_uses_devnull_stdin(self): + # Load-bearing regression: under tmux, inheriting the + # pane PTY as the side-channel's stdin makes smolvm crash + # within ~100ms (concurrent smolvm processes sharing the + # PTY's FG-PG / input plumbing). DEVNULL stdin sidesteps + # the interaction. 
+ with patch.object(pty_resize.subprocess, "run") as run: + pty_resize._push_size("claude-bottle-m", 24, 80) + self.assertEqual( + pty_resize.subprocess.DEVNULL, + run.call_args.kwargs.get("stdin"), + ) + def test_swallows_subprocess_failures(self): # `check=False` + DEVNULL streams: a side-channel failure # mustn't break the operator's session. -- 2.52.0 From 9c83ea64287b0bde1b567b3af6c7714f40ef0bb7 Mon Sep 17 00:00:00 2001 From: claude Date: Wed, 27 May 2026 20:47:32 -0400 Subject: [PATCH 5/6] chore(smolmachines): re-add pty_resize debug log (temp, for issue diagnosis) User reports the launch still crashes in tmux after b9853ae's stdin=DEVNULL fix. Re-instrument to capture the next failure mode (argv, ppid, sync size, child exit, Popen tracebacks). Removable once the inside-tmux launch is confirmed stable. Co-Authored-By: Claude Opus 4.7 --- .../backend/smolmachines/pty_resize.py | 39 +++++++++++++++---- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/claude_bottle/backend/smolmachines/pty_resize.py b/claude_bottle/backend/smolmachines/pty_resize.py index a7e9418..2d27ab8 100644 --- a/claude_bottle/backend/smolmachines/pty_resize.py +++ b/claude_bottle/backend/smolmachines/pty_resize.py @@ -35,12 +35,28 @@ follow-up tracked separately).""" from __future__ import annotations +import datetime import fcntl +import os import signal import struct import subprocess import sys import termios +import traceback + + +_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log") + + +def _log(msg: str) -> None: + try: + os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True) + with open(_DEBUG_LOG_PATH, "a") as f: + ts = datetime.datetime.now().isoformat(timespec="milliseconds") + f.write(f"[{ts} pid={os.getpid()}] {msg}\n") + except OSError: + pass def _read_winsize() -> tuple[int, int] | None: @@ -100,35 +116,42 @@ def main(argv: list[str]) -> int: We don't use argparse — the `--` separator is the contract and everything past it is forwarded 
verbatim. Keeps the wrapper transparent for callers building argv programmatically.""" + _log(f"start argv={argv!r} TMUX={os.environ.get('TMUX','')!r} " + f"ppid={os.getppid()}") + if len(argv) < 3 or argv[1] != "--": sys.stderr.write( "usage: python -m claude_bottle.backend.smolmachines.pty_resize " " -- \n" ) + _log("exit=2 (bad argv)") return 2 machine = argv[0] inner = argv[2:] def sync(*_args) -> None: size = _read_winsize() + _log(f"sync size={size!r}") if size is None: return _push_size(machine, *size) - # Install BEFORE spawning the child so the first SIGWINCH - # (e.g., from tmux refreshing the pane right after respawn) - # is caught even if it races the initial sync. signal.signal(signal.SIGWINCH, sync) - proc = subprocess.Popen(inner) + try: + proc = subprocess.Popen(inner) + except BaseException: + _log("Popen failed:\n" + traceback.format_exc()) + raise + _log(f"child pid={proc.pid}") sync() # push initial size — VM PTY starts at 0 0. while True: try: - return proc.wait() + rc = proc.wait() + _log(f"child exit rc={rc}") + return rc except KeyboardInterrupt: - # Ctrl-C in the operator's terminal → forward to the - # child once, then keep waiting. claude handles its - # own interrupt cleanup. + _log("KeyboardInterrupt → forward SIGINT to child") proc.send_signal(signal.SIGINT) -- 2.52.0 From aa5aa1f031304a1f6f0fc163640bf65544b0ccee Mon Sep 17 00:00:00 2001 From: claude Date: Wed, 27 May 2026 20:55:00 -0400 Subject: [PATCH 6/6] fix(smolmachines): defer pty_resize startup sync to dodge libkrun's bringup race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The b9853ae stdin=DEVNULL fix wasn't sufficient. End-to-end testing against a live VM in tmux revealed a second crash path: libkrun spits "load \`config.json\`: parse error: trailing garbage { \"ociVersion\": \"1.0.2\", ... }" and the main exec dies (rc=1 or SIGKILL/rc=137, depending on race scheduling). 
Root cause: each `smolvm machine exec` writes a per-invocation OCI config.json to the same smolvm state dir during its bringup. The wrapper's startup sync() fires within 1ms of Popen-ing the main exec — both invocations write config.json concurrently, and libkrun loads one mid-write and gets garbage. Trivial inner commands (`sh -c "echo hi"`) finished before the overlap mattered, masking the race in earlier tests. claude's slower startup hits the race every time, and only inside tmux, because the outside-tmux foreground-handoff path takes a different bringup sequence that happens to dodge the window. Fix: schedule the initial sync on a 2-second `threading.Timer` instead of calling it synchronously. By 2s the main exec is past its bringup window, so the side-channel's config.json write doesn't collide. The timer runs as a daemon thread so it doesn't block exit when the child finishes quickly. Trade-off: the in-VM PTY uses smolvm's default size for the first ~2s, then snaps to the host pane size when the timer fires. Verified end-to-end against a live VM in tmux: claude renders at the default size during bringup, then redraws at full pane width once the deferred sync lands. Operator-driven resizes (SIGWINCH) still bridge in real time via the already-installed signal handler. Also drop the diagnostic log added in 9c83ea6 — we have the fix. Regression test: `TestStartupSyncDeferred.test_main_schedules_timer_does_not_call_sync_synchronously` mocks Popen + Timer + _push_size and asserts that `main()` schedules the timer with the documented delay constant and never invokes _push_size synchronously. Catches a "let's just inline the sync() call" regression immediately. 638 unit tests pass. 
Co-Authored-By: Claude Opus 4.7 --- .../backend/smolmachines/pty_resize.py | 64 ++++++++++--------- tests/unit/test_smolmachines_pty_resize.py | 34 ++++++++++ 2 files changed, 68 insertions(+), 30 deletions(-) diff --git a/claude_bottle/backend/smolmachines/pty_resize.py b/claude_bottle/backend/smolmachines/pty_resize.py index 2d27ab8..ae22804 100644 --- a/claude_bottle/backend/smolmachines/pty_resize.py +++ b/claude_bottle/backend/smolmachines/pty_resize.py @@ -35,28 +35,26 @@ follow-up tracked separately).""" from __future__ import annotations -import datetime import fcntl -import os import signal import struct import subprocess import sys import termios -import traceback +import threading -_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log") - - -def _log(msg: str) -> None: - try: - os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True) - with open(_DEBUG_LOG_PATH, "a") as f: - ts = datetime.datetime.now().isoformat(timespec="milliseconds") - f.write(f"[{ts} pid={os.getpid()}] {msg}\n") - except OSError: - pass +# How long to wait after the main exec starts before pushing the +# initial size. Concurrent `smolvm machine exec` invocations race +# libkrun's per-exec OCI config write during the main exec's +# bringup window; the side-channel firing immediately corrupts +# `config.json` and the main exec dies with SIGKILL (rc=137) or +# libkrun's "parse error: trailing garbage" depending on +# scheduling. Two seconds is well past the bringup window on a +# warm VM, well under the operator's "this is unresponsive" +# threshold, and short enough that claude's initial render +# almost always fires after the size has been set. +_STARTUP_SYNC_DELAY_SEC = 2.0 def _read_winsize() -> tuple[int, int] | None: @@ -116,42 +114,48 @@ def main(argv: list[str]) -> int: We don't use argparse — the `--` separator is the contract and everything past it is forwarded verbatim. 
Keeps the wrapper transparent for callers building argv programmatically.""" - _log(f"start argv={argv!r} TMUX={os.environ.get('TMUX','')!r} " - f"ppid={os.getppid()}") - if len(argv) < 3 or argv[1] != "--": sys.stderr.write( "usage: python -m claude_bottle.backend.smolmachines.pty_resize " " -- \n" ) - _log("exit=2 (bad argv)") return 2 machine = argv[0] inner = argv[2:] def sync(*_args) -> None: size = _read_winsize() - _log(f"sync size={size!r}") if size is None: return _push_size(machine, *size) signal.signal(signal.SIGWINCH, sync) - try: - proc = subprocess.Popen(inner) - except BaseException: - _log("Popen failed:\n" + traceback.format_exc()) - raise - _log(f"child pid={proc.pid}") - sync() # push initial size — VM PTY starts at 0 0. + proc = subprocess.Popen(inner) + # Defer the initial sync. Firing it immediately races + # libkrun's per-exec OCI config write: both `smolvm machine + # exec` invocations stash a config.json in the same smolvm + # state dir during their bringup window, libkrun loads one + # mid-write, and the main exec dies with SIGKILL (rc=137) + # or libkrun's "parse error: trailing garbage" depending on + # scheduling. Trivial inner commands finish before the + # overlap matters; claude's slower startup hits the race + # every time, only inside tmux (the outside-tmux foreground + # handoff path takes a different bringup sequence that + # happens to dodge the window). + # + # A 2s timer is past the bringup window on a warm VM, so + # the side-channel writes a fresh config.json without + # collision, and the in-VM PTY is sized before claude has + # finished rendering its first frame. daemon=True so the + # timer doesn't block exit when the child finishes quickly. 
+ timer = threading.Timer(_STARTUP_SYNC_DELAY_SEC, sync) + timer.daemon = True + timer.start() while True: try: - rc = proc.wait() - _log(f"child exit rc={rc}") - return rc + return proc.wait() except KeyboardInterrupt: - _log("KeyboardInterrupt → forward SIGINT to child") proc.send_signal(signal.SIGINT) diff --git a/tests/unit/test_smolmachines_pty_resize.py b/tests/unit/test_smolmachines_pty_resize.py index 7722f4a..6624674 100644 --- a/tests/unit/test_smolmachines_pty_resize.py +++ b/tests/unit/test_smolmachines_pty_resize.py @@ -10,6 +10,7 @@ from __future__ import annotations import io import unittest +import unittest.mock from unittest.mock import patch from claude_bottle.backend.smolmachines import pty_resize @@ -126,5 +127,38 @@ class TestMainArgvParsing(unittest.TestCase): self.assertEqual(2, pty_resize.main(["m", "--"])) +class TestStartupSyncDeferred(unittest.TestCase): + """Regression: the initial sync MUST be deferred (timer), not + called synchronously between Popen + wait. Calling it + immediately races libkrun's per-exec OCI config write during + the main exec's bringup and crashes the child (rc=137 or + 'parse error: trailing garbage').""" + + def test_main_schedules_timer_does_not_call_sync_synchronously(self): + # Fake Popen + wait so main returns immediately. Patch + # Timer to record args without spawning a real thread. + # _push_size patched so any rogue synchronous call would + # be observable. + fake_proc = unittest.mock.MagicMock() + fake_proc.wait.return_value = 0 + with patch.object( + pty_resize.subprocess, "Popen", return_value=fake_proc, + ), patch.object( + pty_resize.threading, "Timer", + ) as timer_cls, patch.object( + pty_resize, "_push_size", + ) as push: + rc = pty_resize.main(["machine-name", "--", "echo", "hi"]) + + self.assertEqual(0, rc) + # Timer scheduled with the documented delay constant. 
+ timer_cls.assert_called_once() + delay, callback = timer_cls.call_args.args + self.assertEqual(pty_resize._STARTUP_SYNC_DELAY_SEC, delay) + # _push_size never called synchronously — the only path to + # it is via the (mocked) timer's callback firing. + push.assert_not_called() + + if __name__ == "__main__": unittest.main() -- 2.52.0