fix(apply): write routes/pipelock yaml in place, not via rename
PRD 0018 chunk 3's atomicity fix used write-temp-then-rename to
update bind-mounted config files. POSIX rename atomically swaps
the inode at the host path — but Docker single-file bind mounts
on Linux pin the source inode at mount time, so post-rename the
container's mount points at the now-orphaned old inode and never
sees the new content. The egress sidecar's SIGHUP-driven reload
re-reads the same stale file → "egress route updates aren't
updatable via the supervisor anymore".
Switch egress_apply + pipelock_apply to write in place (same
inode, truncated + rewritten). Lose file-level POSIX atomicity,
but:
- egress: SIGHUP fires only AFTER the write returns; the
addon's `load_routes` raises `ValueError` on a partial read
and keeps the previous in-memory routes, so the in-process
race window (already narrow) is non-disruptive.
- pipelock: applies via `docker restart` rather than SIGHUP;
restart serializes after the host write completes, so the
container reads the fully-written file on next boot.
macOS Docker Desktop's file-sharing layer (virtiofs / osxfs)
silently re-resolves the path on rename, which is why this bug
didn't surface in dev tests on macOS. Linux native Docker is
the strict reading; the fix works on both.
This commit is contained in:
@@ -23,10 +23,8 @@ operator can retry.
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from ...egress import EGRESS_ROUTES_IN_CONTAINER
|
||||
@@ -195,47 +193,36 @@ def apply_routes_change(slug: str, new_content: str) -> tuple[str, str]:
|
||||
# and the operator gets a clear error about the half-state.
|
||||
_mirror_hosts_to_pipelock(slug, _hosts_in_routes(new_content))
|
||||
|
||||
# PRD 0018 chunk 3 + security item (c): routes.yaml is bind-
|
||||
# mounted into the egress container, so the write target is the
|
||||
# host path the sidecar reads through the mount. POSIX
|
||||
# rename-onto-self is atomic on the same filesystem, so a sidecar
|
||||
# SIGHUP racing the apply can never observe a half-written file —
|
||||
# it sees either the old bytes or the new ones.
|
||||
# routes.yaml is bind-mounted into the egress container as a
|
||||
# SINGLE FILE. Docker single-file bind mounts pin the source
|
||||
# inode at mount time; write-temp-then-rename swaps the inode
|
||||
# on the host, which leaves the container's mount pointing at
|
||||
# the now-orphaned old inode (so the SIGHUP'd reload re-reads
|
||||
# unchanged content). Write in-place instead. Lose file-level
|
||||
# atomicity, but the apply path issues SIGHUP only AFTER the
|
||||
# write returns, and the addon's `load_routes` raises
|
||||
# `ValueError` on a partial read and keeps the previous
|
||||
# in-memory routes — so a SIGHUP that hypothetically raced an
|
||||
# in-flight write is non-disruptive.
|
||||
target = _egress_routes_host_path(slug)
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
fd, tmp_path_str = tempfile.mkstemp(
|
||||
prefix=".egress_routes.", suffix=".yaml.tmp", dir=str(target.parent),
|
||||
target.write_text(new_content)
|
||||
# mitmproxy in the container reads through the bind mount as
|
||||
# uid 1000; the host file has to be world-readable for that
|
||||
# read to succeed (parent dir at 0o700 still restricts who
|
||||
# can reach the file on the host). Routes content is not
|
||||
# secret — tokens live in the container's environ — so 0o644
|
||||
# is the right trade-off.
|
||||
target.chmod(0o644)
|
||||
sig = subprocess.run(
|
||||
["docker", "kill", "--signal", "HUP", container],
|
||||
capture_output=True, text=True, check=False,
|
||||
)
|
||||
tmp_path = Path(tmp_path_str)
|
||||
try:
|
||||
with os.fdopen(fd, "w") as f:
|
||||
f.write(new_content)
|
||||
# mitmproxy in the container reads through the bind mount as
|
||||
# uid 1000; the host file has to be world-readable for that
|
||||
# read to succeed (parent dir at 0o700 still restricts who
|
||||
# can reach the file on the host). Routes content is not
|
||||
# secret — tokens live in the container's environ — so 0o644
|
||||
# is the right trade-off.
|
||||
os.chmod(tmp_path, 0o644)
|
||||
os.replace(tmp_path, target)
|
||||
sig = subprocess.run(
|
||||
["docker", "kill", "--signal", "HUP", container],
|
||||
capture_output=True, text=True, check=False,
|
||||
if sig.returncode != 0:
|
||||
raise EgressApplyError(
|
||||
f"failed to SIGHUP {container}: "
|
||||
f"{(sig.stderr or '').strip()}"
|
||||
)
|
||||
if sig.returncode != 0:
|
||||
raise EgressApplyError(
|
||||
f"failed to SIGHUP {container}: "
|
||||
f"{(sig.stderr or '').strip()}"
|
||||
)
|
||||
except BaseException:
|
||||
# On any failure pre-rename, drop the tmp file. Post-rename
|
||||
# there's nothing to clean up — `os.replace` is atomic so
|
||||
# either the new file is in place or the old one still is.
|
||||
try:
|
||||
tmp_path.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
raise
|
||||
|
||||
return before, new_content
|
||||
|
||||
|
||||
Reference in New Issue
Block a user