diff --git a/.gitea/workflows/canaries.yml b/.gitea/workflows/canaries.yml
new file mode 100644
index 0000000..0085b76
--- /dev/null
+++ b/.gitea/workflows/canaries.yml
@@ -0,0 +1,37 @@
+# Weekly canary suite. Catches upstream regressions (broken pipelock
+# image packaging at the pinned digest, etc.) without coupling every
+# dev push to upstream registry availability.
+#
+# Opt-in via CLAUDE_BOTTLE_RUN_CANARIES=1 so the same files can be run
+# locally with the same gating.
+
+name: canaries
+
+on:
+  schedule:
+    # 12:00 UTC every Monday.
+    - cron: "0 12 * * 1"
+  workflow_dispatch:
+
+jobs:
+  canaries:
+    runs-on: ubuntu-latest
+    env:
+      CLAUDE_BOTTLE_RUN_CANARIES: "1"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      # The canary test is decorated with skip_unless_docker, so without
+      # a reachable daemon it would skip and this job would pass without
+      # checking anything. Fail fast instead.
+      - name: Require Docker
+        run: docker info
+
+      - name: Run canaries
+        run: python3 -m unittest discover -t . -s tests/canaries -v
diff --git a/.gitea/workflows/test.yml b/.gitea/workflows/test.yml
index f08e038..ee2d590 100644
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
@@ -1,10 +1,14 @@
-# Run the project's full test suite on every PR push and on push to main.
+# Run the project's test suite on every PR push and on push to main.
 #
-# The suite uses stdlib `unittest` (see tests/run_tests.py) — no external
-# Python dependencies are required to execute it. Integration tests need a
-# reachable Docker daemon; if Docker is unavailable on the runner those
-# tests skip cleanly via tests/_docker.py:skip_unless_docker, so the job
-# still passes (with skips visible in the run output).
+# The suite uses stdlib `unittest` discovery — no external Python
+# dependencies are required to execute it. Tests are split by directory:
+#
+#   tests/unit/         — pure unit tests; always run
+#   tests/integration/  — need a reachable Docker daemon; skip cleanly
+#                         (via tests/_docker.py:skip_unless_docker) when
+#                         Docker isn't available on the runner
+#   tests/canaries/     — upstream regression canaries; run on a separate
+#                         schedule (see canaries.yml), not here
 #
 # This workflow assumes the Gitea Actions runner exposes the host Docker
 # socket to the job container so `docker` commands inside the job can
@@ -20,8 +24,21 @@ on:
   pull_request:
 
 jobs:
-  test:
-    name: run tests/run_tests.py
+  unit:  # tests/unit only — pure unit tests, always run
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Run unit tests
+        run: python3 -m unittest discover -t . -s tests/unit -v
+
+  integration:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
@@ -41,5 +58,5 @@ jobs:
             echo "docker not on PATH — integration tests will skip"
           fi
 
-      - name: Run full test suite
-        run: python3 tests/run_tests.py
+      - name: Run integration tests
+        run: python3 -m unittest discover -t . -s tests/integration -v
diff --git a/tests/README.md b/tests/README.md
index 150e669..9e8ef91 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -8,47 +8,58 @@ tests need Docker and skip cleanly otherwise.
 
 ```
 tests/
-  run_tests.py                    # entry point
-  fixtures.py                     # JSON manifest builders
-  _docker.py                      # docker-availability skip helper
-  test_pipelock_naming.py         # unit
-  test_pipelock_classify.py       # unit
-  test_pipelock_allowlist.py      # unit
-  test_pipelock_yaml.py           # unit
-  test_pipelock_image.py          # integration
-  test_pipelock_sidecar_smoke.py  # integration
-  test_dry_run_plan.py            # integration
-  test_orphan_cleanup.py          # integration
+  fixtures.py                       # JSON manifest builders (shared)
+  _docker.py                        # docker-availability skip helper (shared)
+  unit/
+    test_pipelock_classify.py
+    test_pipelock_allowlist.py
+    test_pipelock_yaml.py
+    test_manifest_runtime.py
+  integration/
+    test_pipelock_sidecar_smoke.py
+    test_dry_run_plan.py
+    test_orphan_cleanup.py
+  canaries/
+    test_pipelock_image.py          # opt-in; see below
 ```
 
+Classification falls out of the directory — no hand-maintained list to
+keep in sync.
+
 ## Running
 
 ```bash
-tests/run_tests.py                                  # everything
-tests/run_tests.py unit                             # unit only
-tests/run_tests.py integration                      # integration only
-tests/run_tests.py tests/test_pipelock_yaml.py      # one file
+python -m unittest discover -t . -s tests/unit -v         # unit only
+python -m unittest discover -t . -s tests/integration -v  # integration only
+python -m unittest discover -t . -s tests -v              # all dirs; canaries skip unless opted in
+python -m unittest tests.unit.test_pipelock_yaml          # one file
 ```
 
-You can also run via `python -m unittest`:
-
-```bash
-python -m unittest discover -s tests
-python -m unittest tests.test_pipelock_yaml
-```
+Discovery is invoked with `-t .` (top-level dir = repo root) so the repo
+root lands on `sys.path` and `claude_bottle` / `tests.*` imports resolve.
 
 ## What the integration tests cover
 
-- `test_pipelock_image.py` — the pinned digest is reachable, ENTRYPOINT
-  is `/pipelock`, and `CMD` includes `run`.
-- `test_pipelock_sidecar_smoke.py` — `docker create` + `docker cp` the
-  generated YAML to `/etc/pipelock.yaml` + `docker start`, then probe
-  `/health`.
-- `test_dry_run_plan.py` — `cli.py start --dry-run` shows the resolved
-  egress allowlist and creates zero docker resources.
-- `test_orphan_cleanup.py` — network_remove and pipelock_stop are
-  idempotent against missing resources, so the EXIT trap can call them
-  unconditionally.
+- `test_pipelock_sidecar_smoke.py` — drives `DockerPipelockProxy.prepare`
+  + `.start` (the production code path) against a real Docker daemon and
+  probes the sidecar's `/health` from an in-network curl container.
+- `test_dry_run_plan.py` — `cli.py start --dry-run --format=json` emits
+  a structured plan that contains the resolved egress allowlist and
+  the bottle's runtime, and creates zero Docker resources.
+- `test_orphan_cleanup.py` — `network_remove` and `PipelockProxy.stop`
+  are idempotent against missing resources, so the EXIT trap can call
+  them unconditionally.
+
+## Canaries
+
+`tests/canaries/` holds upstream-regression checks (e.g. the pinned
+pipelock digest's binary still runs). These are gated on
+`CLAUDE_BOTTLE_RUN_CANARIES=1` and not part of the per-push suite.
+They're invoked by the scheduled `canaries` workflow.
+
+```bash
+CLAUDE_BOTTLE_RUN_CANARIES=1 python -m unittest discover -t . -s tests/canaries -v
+```
 
 ## What's NOT covered
 
@@ -60,9 +71,10 @@ python -m unittest tests.test_pipelock_yaml
 
 ## Adding a test
 
-1. Pick a filename: `test_<topic>.py`. Add it to `INTEGRATION_NAMES`
-   in `run_tests.py` if it needs Docker.
-2. Boilerplate:
+1. Pick the directory: `tests/unit/` for a pure unit test,
+   `tests/integration/` for one that needs Docker.
+2. Filename: `test_<topic>.py`.
+3. Boilerplate:
    ```python
    import unittest
 
@@ -75,5 +87,5 @@ python -m unittest tests.test_pipelock_yaml
    if __name__ == "__main__":
        unittest.main()
    ```
-3. For Docker-dependent tests, decorate the class with
+4. For Docker-dependent tests, decorate the class with
    `@skip_unless_docker()` from `tests._docker`.
diff --git a/tests/canaries/__init__.py b/tests/canaries/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_pipelock_image.py b/tests/canaries/test_pipelock_image.py
similarity index 62%
rename from tests/test_pipelock_image.py
rename to tests/canaries/test_pipelock_image.py
index ffb23b1..72a54cf 100644
--- a/tests/test_pipelock_image.py
+++ b/tests/canaries/test_pipelock_image.py
@@ -1,7 +1,13 @@
-"""Integration: the pinned pipelock image's binary actually runs.
-Catches a broken upstream packaging at the pinned digest. Requires
-docker."""
+"""Canary: the pinned pipelock image's binary actually runs.
 
+This test exists to catch a broken upstream packaging at the pinned
+digest. It is NOT part of the per-push suite — that would couple every
+dev push to upstream registry availability. Set
+CLAUDE_BOTTLE_RUN_CANARIES=1 to opt in (a scheduled CI workflow does
+this; humans can run it ad-hoc the same way).
+"""
+
+import os
 import subprocess
 import unittest
 
@@ -9,6 +15,10 @@ from claude_bottle.backend.docker.pipelock import PIPELOCK_IMAGE
 from tests._docker import skip_unless_docker
 
 
+@unittest.skipUnless(
+    os.environ.get("CLAUDE_BOTTLE_RUN_CANARIES") == "1",
+    "canary suite is opt-in; set CLAUDE_BOTTLE_RUN_CANARIES=1 to run",
+)
 @skip_unless_docker()
 class TestPipelockImage(unittest.TestCase):
     @classmethod
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_dry_run_plan.py b/tests/integration/test_dry_run_plan.py
similarity index 100%
rename from tests/test_dry_run_plan.py
rename to tests/integration/test_dry_run_plan.py
diff --git a/tests/test_orphan_cleanup.py b/tests/integration/test_orphan_cleanup.py
similarity index 100%
rename from tests/test_orphan_cleanup.py
rename to tests/integration/test_orphan_cleanup.py
diff --git a/tests/test_pipelock_sidecar_smoke.py b/tests/integration/test_pipelock_sidecar_smoke.py
similarity index 100%
rename from tests/test_pipelock_sidecar_smoke.py
rename to tests/integration/test_pipelock_sidecar_smoke.py
diff --git a/tests/run_tests.py b/tests/run_tests.py
deleted file mode 100755
index f8edfab..0000000
--- a/tests/run_tests.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/env python3
-"""Test runner. Wraps unittest's discovery so we can split unit /
-integration the same way the bash runner did.
-
-Usage:
-  tests/run_tests.py             # unit + integration
-  tests/run_tests.py unit        # unit only (no docker)
-  tests/run_tests.py integration # integration only (need docker)
-  tests/run_tests.py tests/test_x.py  # one specific file (or path)
-
-Tests are auto-classified as integration when their filename matches
-one of INTEGRATION_NAMES below; everything else is a unit test.
-"""
-
-from __future__ import annotations
-
-import sys
-import unittest
-from pathlib import Path
-
-REPO_ROOT = Path(__file__).resolve().parent.parent
-TESTS_DIR = REPO_ROOT / "tests"
-
-INTEGRATION_NAMES = {
-    "test_dry_run_plan.py",
-    "test_orphan_cleanup.py",
-    "test_pipelock_image.py",
-    "test_pipelock_sidecar_smoke.py",
-}
-
-
-def _all_test_files() -> list[Path]:
-    return sorted(TESTS_DIR.glob("test_*.py"))
-
-
-def _classify(path: Path) -> str:
-    return "integration" if path.name in INTEGRATION_NAMES else "unit"
-
-
-def _modname(path: Path) -> str:
-    return f"tests.{path.stem}"
-
-
-def _build_suite(files: list[Path]) -> unittest.TestSuite:
-    loader = unittest.TestLoader()
-    suite = unittest.TestSuite()
-    for f in files:
-        suite.addTests(loader.loadTestsFromName(_modname(f)))
-    return suite
-
-
-def usage() -> None:
-    sys.stderr.write(
-        "usage: tests/run_tests.py [unit|integration|path/to/test.py]\n"
-    )
-
-
-def main(argv: list[str]) -> int:
-    sys.path.insert(0, str(REPO_ROOT))
-
-    if not argv:
-        files = _all_test_files()
-    else:
-        arg = argv[0]
-        if arg in ("-h", "--help"):
-            usage()
-            return 0
-        if arg == "unit":
-            files = [f for f in _all_test_files() if _classify(f) == "unit"]
-        elif arg == "integration":
-            files = [f for f in _all_test_files() if _classify(f) == "integration"]
-        else:
-            p = Path(arg).resolve()
-            if not p.is_file():
-                sys.stderr.write(f"no such file: {arg}\n")
-                usage()
-                return 2
-            files = [p]
-
-    if not files:
-        sys.stderr.write("no test files found\n")
-        return 2
-
-    suite = _build_suite(files)
-    runner = unittest.TextTestRunner(verbosity=2)
-    result = runner.run(suite)
-    return 0 if result.wasSuccessful() else 1
-
-
-if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
diff --git a/tests/test_manifest_runtime.py b/tests/test_manifest_runtime.py
deleted file mode 100644
index 963fabf..0000000
--- a/tests/test_manifest_runtime.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""Unit: bottle 'runtime' field is no longer supported (PRD 0003).
-
-gVisor is now auto-detected by the Docker factory. A manifest carrying
-the legacy 'runtime' field must fail loudly with a message pointing the
-user at the auto-detect behavior, rather than silently ignoring."""
-
-import io
-import sys
-import unittest
-
-from claude_bottle.log import Die
-from claude_bottle.manifest import Bottle, Manifest
-
-
-_ABSENT = object()
-
-
-def _manifest(runtime_value: object) -> dict:
-    """Build a minimal manifest JSON shape with one bottle whose runtime
-    field is set (or absent if `runtime_value is _ABSENT`)."""
-    bottle: dict = {}
-    if runtime_value is not _ABSENT:
-        bottle["runtime"] = runtime_value
-    return {
-        "bottles": {"dev": bottle},
-        "agents": {"demo": {"skills": [], "prompt": "", "bottle": "dev"}},
-    }
-
-
-class TestManifestRuntimeRemoved(unittest.TestCase):
-    def test_loads_when_runtime_absent(self):
-        m = Manifest.from_json_obj(_manifest(_ABSENT))
-        self.assertIn("dev", m.bottles)
-
-    def test_bottle_dataclass_has_no_runtime_attribute(self):
-        """Structural check: the field has been removed from the dataclass."""
-        b = Bottle()
-        self.assertFalse(hasattr(b, "runtime"))
-
-    def test_rejects_runsc_value_with_helpful_message(self):
-        captured = io.StringIO()
-        old_stderr = sys.stderr
-        sys.stderr = captured
-        try:
-            with self.assertRaises(Die):
-                Manifest.from_json_obj(_manifest("runsc"))
-        finally:
-            sys.stderr = old_stderr
-        msg = captured.getvalue()
-        self.assertIn("'runtime'", msg, "error names the field")
-        self.assertIn("auto-detect", msg, "error points at the new behavior")
-
-    def test_rejects_runc_value(self):
-        with self.assertRaises(Die):
-            Manifest.from_json_obj(_manifest("runc"))
-
-    def test_rejects_unknown_value(self):
-        with self.assertRaises(Die):
-            Manifest.from_json_obj(_manifest("kata-runtime"))
-
-    def test_rejects_non_string(self):
-        """Any presence of the field is an error; type is not consulted."""
-        with self.assertRaises(Die):
-            Manifest.from_json_obj(_manifest(42))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_manifest_runtime.py b/tests/unit/test_manifest_runtime.py
new file mode 100644
index 0000000..b365862
--- /dev/null
+++ b/tests/unit/test_manifest_runtime.py
@@ -0,0 +1,39 @@
+"""Unit: bottle 'runtime' field is no longer supported (PRD 0003).
+
+gVisor is now auto-detected by the Docker factory. A manifest carrying
+the legacy 'runtime' field must fail, regardless of value, rather than
+silently ignoring."""
+
+import unittest
+
+from claude_bottle.log import Die
+from claude_bottle.manifest import Bottle, Manifest
+
+
+def _manifest_with_runtime(value: object) -> dict:
+    # Minimal manifest: one agent bound to a bottle carrying `runtime`.
+    legacy_bottle = {"runtime": value}
+    demo_agent = {"skills": [], "prompt": "", "bottle": "dev"}
+    return {"bottles": {"dev": legacy_bottle}, "agents": {"demo": demo_agent}}
+
+
+class TestManifestRuntimeRemoved(unittest.TestCase):
+    def test_loads_when_runtime_absent(self):
+        # A bottle with no `runtime` key still loads.
+        agents = {"demo": {"skills": [], "prompt": "", "bottle": "dev"}}
+        raw = {"bottles": {"dev": {}}, "agents": agents}
+        loaded = Manifest.from_json_obj(raw)
+        self.assertIn("dev", loaded.bottles)
+
+    def test_bottle_dataclass_has_no_runtime_attribute(self):
+        self.assertFalse(hasattr(Bottle(), "runtime"))
+
+    def test_any_runtime_value_is_rejected(self):
+        legacy_values = ("runsc", "runc", "kata-runtime", "", 42, None)
+        for value in legacy_values:
+            with self.subTest(value=value), self.assertRaises(Die):
+                Manifest.from_json_obj(_manifest_with_runtime(value))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_pipelock_allowlist.py b/tests/unit/test_pipelock_allowlist.py
similarity index 100%
rename from tests/test_pipelock_allowlist.py
rename to tests/unit/test_pipelock_allowlist.py
diff --git a/tests/test_pipelock_classify.py b/tests/unit/test_pipelock_classify.py
similarity index 100%
rename from tests/test_pipelock_classify.py
rename to tests/unit/test_pipelock_classify.py
diff --git a/tests/test_pipelock_yaml.py b/tests/unit/test_pipelock_yaml.py
similarity index 100%
rename from tests/test_pipelock_yaml.py
rename to tests/unit/test_pipelock_yaml.py