fix(smolmachines): give pty_resize side-channel DEVNULL stdin so it survives under tmux
Inside tmux the dashboard's smolmachines launch crashed within
~100ms of the wrapper Popen-ing the main smolvm exec child —
sometimes with rc=137 (SIGKILL), sometimes with smolvm
spitting a runc-style "load `config.json`: cannot parse the
data: parse error: trailing garbage" and exiting 1. The same
wrapper ran fine outside tmux. Diagnostic logs showed the
SIGKILL landed ~100ms after the wrapper kicked off its
initial `sync()` (which fires the side-channel smolvm exec).
Root cause: the side-channel `subprocess.run([smolvm, machine,
exec, --, sh, -c, ...])` did not specify `stdin=`, so it
inherited the wrapper's stdin — the tmux pane PTY. The main
smolvm child (the agent session) also had that PTY as stdin.
Two concurrent smolvm processes sharing the PTY's
foreground-process-group / input plumbing caused smolvm to
abort one of them. iTerm's PTY plumbing apparently tolerated
this; tmux's didn't.
Fix is one line in `_push_size`: `stdin=subprocess.DEVNULL`.
The side-channel never needs stdin — it runs a fire-and-forget
`stty` and exits. Verified end-to-end: pre-fix the wrapper
crashed under `tmux respawn-pane` against a live VM; post-fix
the same invocation completes cleanly.
Also drop the diagnostic log added in 37bd11b — we have the
fix.
Regression test:
`test_side_channel_uses_devnull_stdin` locks the
`stdin=DEVNULL` invariant so a future "let's simplify the
subprocess.run kwargs" refactor surfaces this immediately.
637 unit tests pass.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -35,33 +35,12 @@ follow-up tracked separately)."""
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import datetime
|
|
||||||
import fcntl
|
import fcntl
|
||||||
import os
|
|
||||||
import signal
|
import signal
|
||||||
import struct
|
import struct
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import termios
|
import termios
|
||||||
import traceback
|
|
||||||
|
|
||||||
|
|
||||||
# Debug log so we can diagnose tmux-pane crashes that happen in
|
|
||||||
# pane respawn — the dashboard's curses surface eats stderr, and
|
|
||||||
# `tmux respawn-pane`'s default remain-on-exit is off. Always-on
|
|
||||||
# (small overhead) so a user reporting a crash can just share the
|
|
||||||
# file. Append-mode, per-pid line prefix.
|
|
||||||
_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log")
|
|
||||||
|
|
||||||
|
|
||||||
def _log(msg: str) -> None:
|
|
||||||
try:
|
|
||||||
os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True)
|
|
||||||
with open(_DEBUG_LOG_PATH, "a") as f:
|
|
||||||
ts = datetime.datetime.now().isoformat(timespec="milliseconds")
|
|
||||||
f.write(f"[{ts} pid={os.getpid()}] {msg}\n")
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def _read_winsize() -> tuple[int, int] | None:
|
def _read_winsize() -> tuple[int, int] | None:
|
||||||
@@ -92,13 +71,24 @@ def _push_size(machine: str, rows: int, cols: int) -> None:
|
|||||||
handle); `stty -F` returns silently on PTYs that don't apply.
|
handle); `stty -F` returns silently on PTYs that don't apply.
|
||||||
|
|
||||||
Best-effort: swallow failures. A failed resize doesn't break
|
Best-effort: swallow failures. A failed resize doesn't break
|
||||||
the session — it just leaves the in-VM PTY at its old size."""
|
the session — it just leaves the in-VM PTY at its old size.
|
||||||
|
|
||||||
|
`stdin=DEVNULL` is load-bearing: under tmux, inheriting the
|
||||||
|
pane PTY here means two concurrent smolvm processes (this one
|
||||||
|
and the agent session the wrapper is shepherding) share the
|
||||||
|
PTY's foreground-process-group / input plumbing, and smolvm
|
||||||
|
bails with an internal config-parse error or SIGKILL within
|
||||||
|
~100ms of the side-channel firing. Outside tmux the same
|
||||||
|
pattern survived, presumably because iTerm's PTY plumbing is
|
||||||
|
more forgiving than tmux's, but the DEVNULL is the right
|
||||||
|
default either way — the side-channel never needs stdin."""
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
["smolvm", "machine", "exec", "--name", machine, "--",
|
["smolvm", "machine", "exec", "--name", machine, "--",
|
||||||
"sh", "-c",
|
"sh", "-c",
|
||||||
f"for f in /dev/pts/*; do "
|
f"for f in /dev/pts/*; do "
|
||||||
f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; "
|
f"stty -F \"$f\" cols {cols} rows {rows} 2>/dev/null; "
|
||||||
f"done"],
|
f"done"],
|
||||||
|
stdin=subprocess.DEVNULL,
|
||||||
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
|
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
|
||||||
check=False,
|
check=False,
|
||||||
)
|
)
|
||||||
@@ -110,23 +100,17 @@ def main(argv: list[str]) -> int:
|
|||||||
We don't use argparse — the `--` separator is the contract and
|
We don't use argparse — the `--` separator is the contract and
|
||||||
everything past it is forwarded verbatim. Keeps the wrapper
|
everything past it is forwarded verbatim. Keeps the wrapper
|
||||||
transparent for callers building argv programmatically."""
|
transparent for callers building argv programmatically."""
|
||||||
_log(f"start argv={argv!r} cwd={os.getcwd()!r} "
|
|
||||||
f"PATH={os.environ.get('PATH','')!r} "
|
|
||||||
f"TMUX={os.environ.get('TMUX','<unset>')!r}")
|
|
||||||
|
|
||||||
if len(argv) < 3 or argv[1] != "--":
|
if len(argv) < 3 or argv[1] != "--":
|
||||||
sys.stderr.write(
|
sys.stderr.write(
|
||||||
"usage: python -m claude_bottle.backend.smolmachines.pty_resize "
|
"usage: python -m claude_bottle.backend.smolmachines.pty_resize "
|
||||||
"<machine> -- <smolvm-argv...>\n"
|
"<machine> -- <smolvm-argv...>\n"
|
||||||
)
|
)
|
||||||
_log("exit=2 (bad argv)")
|
|
||||||
return 2
|
return 2
|
||||||
machine = argv[0]
|
machine = argv[0]
|
||||||
inner = argv[2:]
|
inner = argv[2:]
|
||||||
|
|
||||||
def sync(*_args) -> None:
|
def sync(*_args) -> None:
|
||||||
size = _read_winsize()
|
size = _read_winsize()
|
||||||
_log(f"sync size={size!r}")
|
|
||||||
if size is None:
|
if size is None:
|
||||||
return
|
return
|
||||||
_push_size(machine, *size)
|
_push_size(machine, *size)
|
||||||
@@ -136,23 +120,15 @@ def main(argv: list[str]) -> int:
|
|||||||
# is caught even if it races the initial sync.
|
# is caught even if it races the initial sync.
|
||||||
signal.signal(signal.SIGWINCH, sync)
|
signal.signal(signal.SIGWINCH, sync)
|
||||||
|
|
||||||
try:
|
proc = subprocess.Popen(inner)
|
||||||
proc = subprocess.Popen(inner)
|
|
||||||
except BaseException:
|
|
||||||
_log("Popen failed:\n" + traceback.format_exc())
|
|
||||||
raise
|
|
||||||
_log(f"child pid={proc.pid}")
|
|
||||||
sync() # push initial size — VM PTY starts at 0 0.
|
sync() # push initial size — VM PTY starts at 0 0.
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
rc = proc.wait()
|
return proc.wait()
|
||||||
_log(f"child exit rc={rc}")
|
|
||||||
return rc
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
# Ctrl-C in the operator's terminal → forward to the
|
# Ctrl-C in the operator's terminal → forward to the
|
||||||
# child once, then keep waiting. claude handles its
|
# child once, then keep waiting. claude handles its
|
||||||
# own interrupt cleanup.
|
# own interrupt cleanup.
|
||||||
_log("KeyboardInterrupt → forward SIGINT to child")
|
|
||||||
proc.send_signal(signal.SIGINT)
|
proc.send_signal(signal.SIGINT)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,19 @@ class TestPushSize(unittest.TestCase):
|
|||||||
self.assertIn("rows 50", argv[8])
|
self.assertIn("rows 50", argv[8])
|
||||||
self.assertIn("for f in /dev/pts/*", argv[8])
|
self.assertIn("for f in /dev/pts/*", argv[8])
|
||||||
|
|
||||||
|
def test_side_channel_uses_devnull_stdin(self):
|
||||||
|
# Load-bearing regression: under tmux, inheriting the
|
||||||
|
# pane PTY as the side-channel's stdin makes smolvm crash
|
||||||
|
# within ~100ms (concurrent smolvm processes sharing the
|
||||||
|
# PTY's FG-PG / input plumbing). DEVNULL stdin sidesteps
|
||||||
|
# the interaction.
|
||||||
|
with patch.object(pty_resize.subprocess, "run") as run:
|
||||||
|
pty_resize._push_size("claude-bottle-m", 24, 80)
|
||||||
|
self.assertEqual(
|
||||||
|
pty_resize.subprocess.DEVNULL,
|
||||||
|
run.call_args.kwargs.get("stdin"),
|
||||||
|
)
|
||||||
|
|
||||||
def test_swallows_subprocess_failures(self):
|
def test_swallows_subprocess_failures(self):
|
||||||
# `check=False` + DEVNULL streams: a side-channel failure
|
# `check=False` + DEVNULL streams: a side-channel failure
|
||||||
# mustn't break the operator's session.
|
# mustn't break the operator's session.
|
||||||
|
|||||||
Reference in New Issue
Block a user