chore(smolmachines): instrument pty_resize wrapper for crash diagnosis
test / unit (pull_request) Successful in 28s
test / integration (pull_request) Successful in 41s

User reports launch crashing only inside tmux (works outside).
The wrapper itself runs fine in standalone tmux repros, so the
break is in some interaction we can't see — curses eats stderr,
default tmux remain-on-exit is off, and the pane closes before
the operator can read anything.

Add an always-on log at ~/.claude-bottle/pty_resize.log (one shared file, each line prefixed with a timestamp and pid):

  - start record: argv, cwd, PATH, TMUX status
  - sync record: window size observed
  - child pid + exit rc
  - any KeyboardInterrupt forwarding
  - Popen failure traceback if it dies

Append-mode, small overhead, easy to grep + share.

Removable (along with the wrapper itself) once smolvm forwards
SIGWINCH natively.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 20:37:50 -04:00
parent 794e8666e1
commit 37bd11b375
@@ -35,6 +35,7 @@ follow-up tracked separately)."""
from __future__ import annotations
import datetime
import fcntl
import os
import signal
@@ -42,6 +43,25 @@ import struct
import subprocess
import sys
import termios
import traceback
# Debug log for diagnosing crashes that happen when the wrapper runs
# in a respawned tmux pane: the dashboard's curses surface eats
# stderr, and `tmux respawn-pane` leaves remain-on-exit off by
# default, so the pane closes before anything can be read. Always-on
# (small overhead) so a user reporting a crash can just share the
# file. Append-mode; every line is prefixed with a timestamp and pid.
_DEBUG_LOG_PATH = os.path.expanduser("~/.claude-bottle/pty_resize.log")


def _log(msg: str) -> None:
    """Append one timestamped, pid-tagged line to the debug log.

    Best-effort by design: the log exists only to diagnose crashes, so a
    failure to write it (missing home directory, read-only filesystem,
    full disk, ...) is swallowed rather than allowed to break the wrapper.

    Args:
        msg: Text to record. Written after a ``[<iso-timestamp> pid=<pid>]``
            prefix and followed by a newline.
    """
    try:
        os.makedirs(os.path.dirname(_DEBUG_LOG_PATH), exist_ok=True)
        ts = datetime.datetime.now().isoformat(timespec="milliseconds")
        # Pin the encoding instead of inheriting the locale's: messages
        # contain non-ASCII text (e.g. "→"), and under a non-UTF-8 locale
        # the default encoder raises UnicodeEncodeError — a ValueError, not
        # an OSError — which would escape this function and crash the very
        # process we are trying to diagnose. `backslashreplace` additionally
        # keeps unencodable code points (lone surrogates from undecodable
        # bytes) from raising; they are written as escape sequences instead.
        with open(
            _DEBUG_LOG_PATH, "a", encoding="utf-8", errors="backslashreplace"
        ) as f:
            f.write(f"[{ts} pid={os.getpid()}] {msg}\n")
    except OSError:
        # Deliberate silent drop: diagnostics must never take the wrapper down.
        pass
def _read_winsize() -> tuple[int, int] | None:
@@ -90,17 +110,23 @@ def main(argv: list[str]) -> int:
We don't use argparse — the `--` separator is the contract and
everything past it is forwarded verbatim. Keeps the wrapper
transparent for callers building argv programmatically."""
_log(f"start argv={argv!r} cwd={os.getcwd()!r} "
f"PATH={os.environ.get('PATH','')!r} "
f"TMUX={os.environ.get('TMUX','<unset>')!r}")
if len(argv) < 3 or argv[1] != "--":
sys.stderr.write(
"usage: python -m claude_bottle.backend.smolmachines.pty_resize "
"<machine> -- <smolvm-argv...>\n"
)
_log("exit=2 (bad argv)")
return 2
machine = argv[0]
inner = argv[2:]
def sync(*_args) -> None:
size = _read_winsize()
_log(f"sync size={size!r}")
if size is None:
return
_push_size(machine, *size)
@@ -110,15 +136,23 @@ def main(argv: list[str]) -> int:
# is caught even if it races the initial sync.
signal.signal(signal.SIGWINCH, sync)
proc = subprocess.Popen(inner)
try:
proc = subprocess.Popen(inner)
except BaseException:
_log("Popen failed:\n" + traceback.format_exc())
raise
_log(f"child pid={proc.pid}")
sync() # push initial size — VM PTY starts at 0 0.
while True:
try:
return proc.wait()
rc = proc.wait()
_log(f"child exit rc={rc}")
return rc
except KeyboardInterrupt:
# Ctrl-C in the operator's terminal → forward to the
# child once, then keep waiting. claude handles its
# own interrupt cleanup.
_log("KeyboardInterrupt → forward SIGINT to child")
proc.send_signal(signal.SIGINT)