fix(smolmachines): give pty_resize side-channel DEVNULL stdin so it survives under tmux
test / unit (pull_request) Successful in 26s
test / integration (pull_request) Successful in 40s

Inside tmux the dashboard's smolmachines launch crashed within
~100ms of the wrapper Popen-ing the main smolvm exec child —
sometimes with rc=137 (SIGKILL), sometimes with smolvm
spitting a runc-style "load `config.json`: cannot parse the
data: parse error: trailing garbage" and exiting 1. The same
wrapper ran fine outside tmux. Diagnostic logs showed the
SIGKILL landed ~100ms after the wrapper kicked off its
initial `sync()` (which fires the side-channel smolvm exec).

Root cause: the side-channel `subprocess.run([smolvm, machine,
exec, --, sh, -c, ...])` did not specify `stdin=`, so it
inherited the wrapper's stdin — the tmux pane PTY. The main
smolvm child (the agent session) also had that PTY as stdin.
Two concurrent smolvm processes sharing the PTY's
foreground-process-group / input plumbing caused smolvm to
abort one of them. iTerm's PTY plumbing apparently tolerated
this; tmux's didn't.

Fix is one line in `_push_size`: `stdin=subprocess.DEVNULL`.
The side-channel never needs stdin — it runs a fire-and-forget
`stty` and exits. Verified end-to-end: pre-fix the wrapper
crashed under `tmux respawn-pane` against a live VM; post-fix
the same invocation completes cleanly.

Also drop the diagnostic log added in 37bd11b — we have the
fix.

Regression test:
`test_side_channel_uses_devnull_stdin` locks the
`stdin=DEVNULL` invariant so a future "let's simplify the
subprocess.run kwargs" refactor surfaces this immediately.

637 unit tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 20:43:59 -04:00
parent 37bd11b375
commit b9853ae0c7
2 changed files with 27 additions and 38 deletions
@@ -35,33 +35,12 @@ follow-up tracked separately)."""
from __future__ import annotations from __future__ import annotations
import datetime
import fcntl import fcntl
import os
import signal import signal
import struct import struct
import subprocess import subprocess
import sys import sys
import termios import termios
import traceback
# Debug log so we can diagnose tmux-pane crashes that happen in
# pane respawn — the dashboard's curses surface eats stderr, and
# `tmux respawn-pane`'s default remain-on-exit is off. Always-on
# (small overhead) so a user reporting a crash can just share the
# file. Append-mode, per-pid line prefix.
_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log")
def _log(msg: str) -> None:
try:
os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True)
with open(_DEBUG_LOG_PATH, "a") as f:
ts = datetime.datetime.now().isoformat(timespec="milliseconds")
f.write(f"[{ts} pid={os.getpid()}] {msg}\n")
except OSError:
pass
def _read_winsize() -> tuple[int, int] | None: def _read_winsize() -> tuple[int, int] | None:
@@ -92,13 +71,24 @@ def _push_size(machine: str, rows: int, cols: int) -> None:
handle); `stty -F` returns silently on PTYs that don't apply. handle); `stty -F` returns silently on PTYs that don't apply.
Best-effort: swallow failures. A failed resize doesn't break Best-effort: swallow failures. A failed resize doesn't break
the session — it just leaves the in-VM PTY at its old size.""" the session — it just leaves the in-VM PTY at its old size.
`stdin=DEVNULL` is load-bearing: under tmux, inheriting the
pane PTY here means two concurrent smolvm processes (this one
and the agent session the wrapper is shepherding) share the
PTY's foreground-process-group / input plumbing, and smolvm
bails with an internal config-parse error or SIGKILL within
~100ms of the side-channel firing. Outside tmux the same
pattern survived, presumably because iTerm's PTY plumbing is
more forgiving than tmux's, but the DEVNULL is the right
default either way — the side-channel never needs stdin."""
subprocess.run( subprocess.run(
["smolvm", "machine", "exec", "--name", machine, "--", ["smolvm", "machine", "exec", "--name", machine, "--",
"sh", "-c", "sh", "-c",
f"for f in /dev/pts/*; do " f"for f in /dev/pts/*; do "
f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; " f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; "
f"done"], f"done"],
stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
check=False, check=False,
) )
@@ -110,23 +100,17 @@ def main(argv: list[str]) -> int:
We don't use argparse — the `--` separator is the contract and We don't use argparse — the `--` separator is the contract and
everything past it is forwarded verbatim. Keeps the wrapper everything past it is forwarded verbatim. Keeps the wrapper
transparent for callers building argv programmatically.""" transparent for callers building argv programmatically."""
_log(f"start argv={argv!r} cwd={os.getcwd()!r} "
f"PATH={os.environ.get('PATH','')!r} "
f"TMUX={os.environ.get('TMUX','<unset>')!r}")
if len(argv) < 3 or argv[1] != "--": if len(argv) < 3 or argv[1] != "--":
sys.stderr.write( sys.stderr.write(
"usage: python -m claude_bottle.backend.smolmachines.pty_resize " "usage: python -m claude_bottle.backend.smolmachines.pty_resize "
"<machine> -- <smolvm-argv...>\n" "<machine> -- <smolvm-argv...>\n"
) )
_log("exit=2 (bad argv)")
return 2 return 2
machine = argv[0] machine = argv[0]
inner = argv[2:] inner = argv[2:]
def sync(*_args) -> None: def sync(*_args) -> None:
size = _read_winsize() size = _read_winsize()
_log(f"sync size={size!r}")
if size is None: if size is None:
return return
_push_size(machine, *size) _push_size(machine, *size)
@@ -136,23 +120,15 @@ def main(argv: list[str]) -> int:
# is caught even if it races the initial sync. # is caught even if it races the initial sync.
signal.signal(signal.SIGWINCH, sync) signal.signal(signal.SIGWINCH, sync)
try: proc = subprocess.Popen(inner)
proc = subprocess.Popen(inner)
except BaseException:
_log("Popen failed:\n" + traceback.format_exc())
raise
_log(f"child pid={proc.pid}")
sync() # push initial size — VM PTY starts at 0 0. sync() # push initial size — VM PTY starts at 0 0.
while True: while True:
try: try:
rc = proc.wait() return proc.wait()
_log(f"child exit rc={rc}")
return rc
except KeyboardInterrupt: except KeyboardInterrupt:
# Ctrl-C in the operator's terminal → forward to the # Ctrl-C in the operator's terminal → forward to the
# child once, then keep waiting. claude handles its # child once, then keep waiting. claude handles its
# own interrupt cleanup. # own interrupt cleanup.
_log("KeyboardInterrupt → forward SIGINT to child")
proc.send_signal(signal.SIGINT) proc.send_signal(signal.SIGINT)
@@ -34,6 +34,19 @@ class TestPushSize(unittest.TestCase):
self.assertIn("rows 50", argv[8]) self.assertIn("rows 50", argv[8])
self.assertIn("for f in /dev/pts/*", argv[8]) self.assertIn("for f in /dev/pts/*", argv[8])
def test_side_channel_uses_devnull_stdin(self):
# Load-bearing regression: under tmux, inheriting the
# pane PTY as the side-channel's stdin makes smolvm crash
# within ~100ms (concurrent smolvm processes sharing the
# PTY's FG-PG / input plumbing). DEVNULL stdin sidesteps
# the interaction.
with patch.object(pty_resize.subprocess, "run") as run:
pty_resize._push_size("claude-bottle-m", 24, 80)
self.assertEqual(
pty_resize.subprocess.DEVNULL,
run.call_args.kwargs.get("stdin"),
)
def test_swallows_subprocess_failures(self): def test_swallows_subprocess_failures(self):
# `check=False` + DEVNULL streams: a side-channel failure # `check=False` + DEVNULL streams: a side-channel failure
# mustn't break the operator's session. # mustn't break the operator's session.