fix(codex): make host-credential bottles actually authenticate
test / unit (pull_request) Successful in 37s
test / integration (pull_request) Successful in 45s

Debugging a live codex smolmachines bottle surfaced three independent
failures past the sign-in screen; fix each so forward_host_credentials
works end to end:

- codex_auth: dummy access/id tokens now inherit the *real* host token's
  exp instead of now+1h. Codex (0.135) refreshes when its local token's
  JWT exp lapses; with a placeholder refresh_token that refresh fails and
  drops to the sign-in screen. Aligning exp tracks the real token's life.

- prepare: set CODEX_CA_CERTIFICATE to the agent CA bundle for codex
  bottles. Codex is rustls and ignores the system store / NODE_EXTRA_CA_
  CERTS; it reads CODEX_CA_CERTIFICATE (fallback SSL_CERT_FILE) for custom
  roots across HTTPS + wss, so it must be pointed at the egress MITM CA or
  injection can't work without tls_passthrough.

- pipelock: auto tls_passthrough the Codex API hosts when
  forward_host_credentials is on. Egress injects the bearer before
  pipelock, whose header DLP then flags the JWT ("request header contains
  secret") and the retry storm trips its 429. passthrough host-gates the
  CONNECT but skips decrypt+rescan of egress-owned auth. The auto-added
  routes aren't in bottle.egress.routes, so the hosts are added explicitly.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-01 16:38:34 -04:00
parent 237afce844
commit 8e5262b539
11 changed files with 197 additions and 58 deletions
+15 -24
View File
@@ -45,19 +45,11 @@ _HOME_FOR = {
}
def _env_flags_for(user: str) -> list[str]:
def _env_assignments_for(user: str, env: Mapping[str, str]) -> list[str]:
home = _HOME_FOR.get(user, f"/home/{user}")
return ["-e", f"HOME={home}", "-e", f"USER={user}"]
def _guest_env_flags(env: Mapping[str, str]) -> list[str]:
"""Render `{K: V}` into a flat `-e K=V` argv slice for
`smolvm machine exec`. `smolvm machine create -e` set env
on PID 1 but it doesn't propagate to fresh exec process
trees, so we have to re-pass them every call."""
out: list[str] = []
out = [f"HOME={home}", f"USER={user}"]
for k, v in env.items():
out += ["-e", f"{k}={v}"]
out.append(f"{k}={v}")
return out
@@ -98,9 +90,8 @@ class SmolmachinesBottle(Bottle):
flags = ["smolvm", "machine", "exec", "--name", self.name]
if tty:
flags += ["-i", "-t"]
flags += _env_flags_for("node")
flags += _guest_env_flags(self._guest_env)
agent_tail = [self.agent_command]
agent_tail = ["env", *_env_assignments_for("node", self._guest_env),
self.agent_command]
provider_prompt_args = prompt_args(
self._agent_prompt_mode, self._prompt_path, argv=argv,
)
@@ -148,16 +139,16 @@ class SmolmachinesBottle(Bottle):
on both backends. Pass `user="root"` for tests that need
root.
`runuser -u <user> -- /bin/sh -c <script>` switches UID
without invoking a login shell; HOME / USER are set via
`smolvm -e` (see `_env_flags_for`)."""
argv = (
_env_flags_for(user)
+ _guest_env_flags(self._guest_env)
+ ["--", "runuser", "-u", user, "--", "/bin/sh", "-c", script]
)
# _smolvm.machine_exec expects argv (the bit after `--`);
# the -e flags go before, so call smolvm directly.
`runuser -u <user> -- env ... /bin/sh -c <script>` switches UID
without invoking a login shell, then sets HOME / USER and the
bottle env in the child process."""
argv = [
"--", "runuser", "-u", user, "--",
"env", *_env_assignments_for(user, self._guest_env),
"/bin/sh", "-c", script,
]
# Call smolvm directly because this path needs the host-side
# subprocess capture shape used by the Docker backend.
r = subprocess.run(
["smolvm", "machine", "exec", "--name", self.name] + argv,
capture_output=True, text=True, check=False,
@@ -129,6 +129,24 @@ def resolve_plan(
if provider.template == "claude" and has_provider_auth:
guest_env.setdefault("CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC", "1")
guest_env.setdefault("DISABLE_ERROR_REPORTING", "1")
if provider.template == "codex":
# Codex is a Rust/rustls client: unlike the Node agents it does
# NOT consult the system trust store or honor NODE_EXTRA_CA_CERTS.
# It reads CODEX_CA_CERTIFICATE (falling back to SSL_CERT_FILE)
# for custom roots, across HTTPS *and* the wss responses channel.
# Point it at the bundle update-ca-certificates rebuilt with the
# egress MITM CA so Codex trusts the proxy and egress can inject
# the host bearer — without this, codex bottles need
# pipelock tls_passthrough, which disables auth injection.
guest_env["CODEX_CA_CERTIFICATE"] = (
"/etc/ssl/certs/ca-certificates.crt"
)
if provider.template == "codex" and provider.forward_host_credentials:
# Smolvm exec process trees do not reliably inherit the image
# user's login environment. Pin CODEX_HOME to the same path
# provision_provider_auth writes so Codex never falls back to a
# root or unset home and shows the sign-in picker.
guest_env["CODEX_HOME"] = "/home/node/.codex"
supervise_plan = None
if bottle.supervise:
@@ -4,6 +4,7 @@ from __future__ import annotations
import os
from ....log import die
from .. import smolvm as _smolvm
from ..bottle_plan import SmolmachinesBottlePlan
@@ -28,3 +29,21 @@ def provision_provider_auth(plan: SmolmachinesBottlePlan, target: str) -> None:
_smolvm.machine_cp(str(plan.codex_auth_file), f"{target}:{auth_path}")
_smolvm.machine_exec(target, ["chown", "node:node", auth_path])
_smolvm.machine_exec(target, ["chmod", "600", auth_path])
result = _smolvm.machine_exec(
target,
[
"runuser", "-u", "node", "--",
"env",
f"HOME={guest_home}",
f"CODEX_HOME={auth_dir}",
"codex", "login", "status",
],
)
if result.returncode != 0:
detail = (result.stderr or result.stdout).strip()
if detail:
detail = f": {detail}"
die(
"codex host credentials: dummy auth was copied into the "
f"smolmachine, but Codex did not accept it{detail}"
)
+45 -19
View File
@@ -75,11 +75,22 @@ def codex_dummy_auth_json(
now: datetime | None = None,
) -> str:
"""Return a non-secret `auth.json` that keeps Codex in the host's
auth branch while egress owns the real bearer token."""
auth branch while egress owns the real bearer token.
The dummy access/id tokens carry the *host* token's real `exp` so
Codex's proactive refresh lifecycle (it refreshes when its local
access token is at/past expiry) tracks the real token instead of
firing after an artificial TTL. Codex cannot refresh inside the
bottle — the refresh token is a placeholder and the OpenAI token
endpoint is off-route — so a shorter dummy exp would drop Codex to
the sign-in screen the moment it lapsed, even while egress still
holds a valid bearer."""
path = codex_auth_path(host_env)
codex_host_access_token(host_env, now=now)
access = codex_host_access_token(host_env, now=now)
raw = _read_auth_object(path)
dummy = _redact_codex_auth(deepcopy(raw), now=now)
host_exp = _jwt_exp(access)
exp_ts = int(host_exp.timestamp()) if host_exp is not None else None
dummy = _redact_codex_auth(deepcopy(raw), now=now, exp_ts=exp_ts)
return json.dumps(dummy, indent=2, sort_keys=True) + "\n"
@@ -104,29 +115,35 @@ def _read_auth_object(path: Path) -> dict:
return raw
def _dummy_jwt(now: datetime | None = None) -> str:
def _dummy_exp(now: datetime | None, exp_ts: int | None) -> int:
if exp_ts is not None:
return exp_ts
check_now = now or datetime.now(timezone.utc)
exp = int(check_now.timestamp()) + 3600
return int(check_now.timestamp()) + 3600
def _dummy_jwt(now: datetime | None = None, *, exp_ts: int | None = None) -> str:
return _encode_dummy_jwt({
"exp": exp,
"exp": _dummy_exp(now, exp_ts),
"sub": "bot-bottle-placeholder",
})
def _dummy_jwt_from_host(value: object, *, now: datetime | None = None) -> str:
def _dummy_jwt_from_host(
value: object, *, now: datetime | None = None, exp_ts: int | None = None,
) -> str:
if not isinstance(value, str):
return _dummy_jwt(now)
return _dummy_jwt(now, exp_ts=exp_ts)
parts = value.split(".")
if len(parts) < 2:
return _dummy_jwt(now)
return _dummy_jwt(now, exp_ts=exp_ts)
try:
payload = json.loads(_b64url_decode(parts[1]))
except (ValueError, json.JSONDecodeError):
return _dummy_jwt(now)
return _dummy_jwt(now, exp_ts=exp_ts)
if not isinstance(payload, dict):
return _dummy_jwt(now)
return _encode_dummy_jwt(_redact_jwt_payload(payload, now=now))
return _dummy_jwt(now, exp_ts=exp_ts)
return _encode_dummy_jwt(_redact_jwt_payload(payload, now=now, exp_ts=exp_ts))
def _encode_dummy_jwt(payload: dict) -> str:
@@ -141,12 +158,12 @@ def _redact_jwt_payload(
payload: dict,
*,
now: datetime | None = None,
exp_ts: int | None = None,
) -> dict:
check_now = now or datetime.now(timezone.utc)
out = _redact_claims(payload)
if not isinstance(out, dict):
out = {}
out["exp"] = int(check_now.timestamp()) + 3600
out["exp"] = _dummy_exp(now, exp_ts)
out.setdefault("sub", "bot-bottle-placeholder")
return out
@@ -195,6 +212,11 @@ def _redact_auth_claim(value: object) -> dict:
lower = key.lower()
if lower == "chatgpt_plan_type" and isinstance(inner, str) and inner:
out[key] = inner
elif lower == "chatgpt_account_id" and isinstance(inner, str) and inner:
# Current Codex uses the selected account id when building
# ChatGPT requests. Keep that non-secret identifier aligned
# with the host while egress owns the real bearer token.
out[key] = inner
elif lower == "localhost" and isinstance(inner, bool):
out[key] = inner
elif isinstance(inner, bool):
@@ -212,7 +234,9 @@ def _redact_auth_claim(value: object) -> dict:
return out
def _redact_codex_auth(value: object, *, now: datetime | None = None) -> object:
def _redact_codex_auth(
value: object, *, now: datetime | None = None, exp_ts: int | None = None,
) -> object:
if isinstance(value, dict):
out: dict[str, object] = {}
for key, inner in value.items():
@@ -220,18 +244,20 @@ def _redact_codex_auth(value: object, *, now: datetime | None = None) -> object:
if lower == "openai_api_key":
out[key] = None
elif lower == "tokens":
out[key] = _redact_codex_auth(inner, now=now)
out[key] = _redact_codex_auth(inner, now=now, exp_ts=exp_ts)
elif lower in {"access_token", "id_token"}:
out[key] = _dummy_jwt_from_host(inner, now=now)
out[key] = _dummy_jwt_from_host(inner, now=now, exp_ts=exp_ts)
elif "token" in lower or "secret" in lower or lower.endswith("_key"):
out[key] = "bot-bottle-placeholder"
elif lower == "account_id" and isinstance(inner, str) and inner:
out[key] = inner
elif lower in {"account_id", "user_id", "email"}:
out[key] = "bot-bottle-placeholder"
else:
out[key] = _redact_codex_auth(inner, now=now)
out[key] = _redact_codex_auth(inner, now=now, exp_ts=exp_ts)
return out
if isinstance(value, list):
return [_redact_codex_auth(v, now=now) for v in value]
return [_redact_codex_auth(v, now=now, exp_ts=exp_ts) for v in value]
return value
+18 -1
View File
@@ -21,7 +21,11 @@ from dataclasses import dataclass
from pathlib import Path
from typing import cast
from .egress import EGRESS_HOSTNAME, egress_routes_for_bottle
from .egress import (
CODEX_HOST_CREDENTIAL_HOSTS,
EGRESS_HOSTNAME,
egress_routes_for_bottle,
)
from .supervise import SUPERVISE_HOSTNAME
from .manifest import Bottle
@@ -111,6 +115,19 @@ def pipelock_effective_tls_passthrough(bottle: Bottle) -> list[str]:
for route in bottle.egress.routes:
if route.Pipelock.TlsPassthrough:
seen.setdefault(route.Host, None)
# forward_host_credentials makes egress inject the host ChatGPT bearer
# on the Codex API hosts AFTER the agent boundary. Pipelock sits
# downstream of egress and DLP-scans request headers; left to MITM
# these routes it flags the injected JWT as a leaked secret
# ("request header contains secret") and blocks. Pass them through so
# pipelock still enforces the host allowlist on CONNECT but does not
# decrypt + rescan egress-owned auth. The auto-added routes live in
# egress_routes_for_bottle, not bottle.egress.routes, so add the
# hosts explicitly here.
provider = bottle.agent_provider
if provider.forward_host_credentials and provider.template == "codex":
for host in CODEX_HOST_CREDENTIAL_HOSTS:
seen.setdefault(host, None)
return sorted(seen.keys())