"""Watchdog: freeze runs whose agent exited without signalling done.

`sweep(now)` is the pure, testable core: any `running` record whose
`last_checkin_at` is older than the timeout is frozen as
done-without-self-report and returned so provenance can flag it.
`Watchdog.start()` runs `sweep` on a daemon thread once a minute.
"""

from __future__ import annotations

import logging
import threading
from datetime import datetime, timedelta

from .model import STATUS_FROZEN, STATUS_RUNNING, RunRecord
from .runner import BottleRunner
from .store import StateStore

logger = logging.getLogger(__name__)

# Seconds between background sweeps; also the upper bound `stop()` waits
# for the thread to finish.
_TICK_SECS = 60.0


def _parse(ts: str) -> datetime | None:
    """Parse an ISO-format timestamp.

    Returns None when `ts` is malformed (ValueError) or not a string at all
    (TypeError, e.g. None), so callers can skip the record instead of
    crashing.
    """
    try:
        return datetime.fromisoformat(ts)
    except (ValueError, TypeError):
        return None


class Watchdog:
    """Freezes `running` records whose last check-in is older than a timeout."""

    def __init__(
        self,
        *,
        store: StateStore,
        runner: BottleRunner,
        timeout_secs: int,
    ) -> None:
        """Store collaborators and the staleness threshold.

        Args:
            store: Source of run records; updated records are written back
                via `upsert`.
            runner: Used to freeze a run by its slug.
            timeout_secs: Maximum age of `last_checkin_at`, in seconds,
                before a running record is considered stale.
        """
        self._store = store
        self._runner = runner
        self._timeout = timedelta(seconds=timeout_secs)
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None

    def sweep(self, now: datetime) -> list[RunRecord]:
        """Freeze stale running records. Returns the ones fired.

        A record is stale when it is `running` and `now - last_checkin_at`
        is strictly greater than the timeout. Records whose timestamp cannot
        be parsed, or cannot be compared with `now` (one is timezone-naive
        and the other timezone-aware), are skipped rather than aborting the
        sweep for every other record.

        Errors raised by the runner or the store propagate to the caller.
        """
        fired: list[RunRecord] = []
        for record in self._store.all():
            if record.status != STATUS_RUNNING:
                continue
            checkin = _parse(record.last_checkin_at)
            if checkin is None:
                continue
            try:
                stale = now - checkin > self._timeout
            except TypeError:
                # Subtracting a naive datetime from an aware one (or vice
                # versa) raises TypeError. We cannot know which zone the
                # naive value was meant to be in, so do not guess: skip it
                # and make the data problem visible.
                logger.warning(
                    "watchdog: skipping %s: last_checkin_at %r is not "
                    "comparable with sweep time %r",
                    record.slug,
                    record.last_checkin_at,
                    now,
                )
                continue
            if not stale:
                continue
            # Freeze first, then persist the new status, so the store never
            # claims a run is frozen when the freeze call itself failed.
            self._runner.freeze(record.slug)
            record.status = STATUS_FROZEN
            self._store.upsert(record)
            fired.append(record)
        return fired

    def start(self) -> None:
        """Start the background sweep thread.

        Calling `start()` while the thread is already alive is a no-op, so
        two sweepers never run concurrently against the same store. The stop
        event is cleared first so the watchdog can be started again after
        `stop()`.
        """
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop.clear()
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Signal the sweep thread to exit and wait up to one tick for it."""
        self._stop.set()
        if self._thread is not None:
            self._thread.join(timeout=_TICK_SECS)

    def _loop(self) -> None:
        """Run `sweep` every `_TICK_SECS` seconds until `stop()` is called."""
        # Event.wait returns True as soon as the stop event is set, which
        # ends the loop; a False return means the tick elapsed normally.
        while not self._stop.wait(_TICK_SECS):
            try:
                self.sweep(datetime.now().astimezone())
            except Exception:
                # Thread boundary: a store or runner failure during one
                # sweep must not end the watchdog permanently. Log it with
                # the traceback and try again on the next tick.
                logger.exception(
                    "watchdog sweep failed; will retry on next tick"
                )