314dc03b0d
Moves the orchestrator into bot_bottle/orchestrator/ so one install gets everything. Entry point is now `python -m bot_bottle.orchestrator run`. - Add bot_bottle/orchestrator/ with all 14 modules (verbatim move; internal imports were already relative, so no changes inside orchestrator modules) - Rewrite bootstrap.py: remove the lazy bot_bottle import guard, use direct relative imports from ..contrib.* - Add bot_bottle/contrib/forge/base.py: ScopedForge (read-anywhere / write-scoped) - Add bot_bottle/contrib/gitea/client.py: GiteaClient + GiteaForge (urllib.request only) - Add bot_bottle/contrib/gitea/forge_state.py: ForgeState + SqliteForgeStateStore - Add tests/unit/orchestrator/ (82 tests: 63 migrated + 19 new for contrib modules) Closes #321
69 lines
2.0 KiB
Python
69 lines
2.0 KiB
Python
"""Watchdog: freeze runs whose agent exited without signalling done.
|
|
|
|
`sweep(now)` is the pure, testable core: any `running` record whose
|
|
`last_checkin_at` is older than the timeout is frozen as
|
|
done-without-self-report and returned so provenance can flag it.
|
|
`Watchdog.start()` runs `sweep` on a daemon thread once a minute.
|
|
"""
|
|
|
|
from __future__ import annotations

import logging
import threading
from datetime import datetime, timedelta

from .model import STATUS_FROZEN, STATUS_RUNNING, RunRecord
from .runner import BottleRunner
from .store import StateStore
|
|
|
|
# Seconds the background loop waits between sweeps; also used as the
# join timeout when the watchdog thread is stopped.
_TICK_SECS = 60.0
|
|
|
|
|
|
def _parse(ts: str) -> datetime | None:
|
|
try:
|
|
return datetime.fromisoformat(ts)
|
|
except (ValueError, TypeError):
|
|
return None
|
|
|
|
|
|
class Watchdog:
    """Freeze ``running`` records whose last check-in is older than a timeout.

    ``sweep`` performs one synchronous pass over the store.  ``start`` runs
    ``sweep`` every ``_TICK_SECS`` seconds on a daemon thread until ``stop``
    is called.
    """

    # Logger used to report problems found while sweeping, including from
    # the background thread where nobody would otherwise see them.
    _log = logging.getLogger(__name__)

    def __init__(
        self,
        *,
        store: StateStore,
        runner: BottleRunner,
        timeout_secs: int,
    ) -> None:
        """Bind the collaborators and the staleness threshold.

        Args:
            store: Supplies records through ``all()`` and persists changed
                records through ``upsert()``.
            runner: Freezes a run, identified by its slug, through
                ``freeze()``.
            timeout_secs: Seconds a running record may go without a
                check-in before ``sweep`` freezes it.
        """
        self._store = store
        self._runner = runner
        self._timeout = timedelta(seconds=timeout_secs)
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None

    def sweep(self, now: datetime) -> list[RunRecord]:
        """Freeze stale running records. Returns the ones fired.

        A record is stale when its status is ``STATUS_RUNNING`` and
        ``now`` minus its ``last_checkin_at`` exceeds the timeout.  Records
        whose timestamp cannot be parsed are skipped.  Records whose
        timestamp cannot be subtracted from ``now`` (one is timezone-aware,
        the other naive) are skipped with a warning rather than aborting
        the whole pass; no timezone is guessed for them.

        Args:
            now: The instant to measure staleness against.

        Returns:
            The records that were frozen during this pass, in store order.

        Raises:
            Whatever ``runner.freeze`` or ``store.upsert`` raise; records
            fired before the failure have already been frozen and saved.
        """
        fired: list[RunRecord] = []
        for record in self._store.all():
            if record.status != STATUS_RUNNING:
                continue
            checkin = _parse(record.last_checkin_at)
            if checkin is None:
                continue
            try:
                age = now - checkin
            except TypeError:
                # Subtracting a naive datetime from an aware one (or the
                # reverse) raises TypeError; one bad record must not stop
                # the remaining records from being checked.
                self._log.warning(
                    "watchdog: skipping %s: last_checkin_at %r is not "
                    "comparable with sweep time %r (naive/aware mismatch)",
                    record.slug,
                    record.last_checkin_at,
                    now,
                )
                continue
            if age <= self._timeout:
                continue
            # Freeze first, then persist the new status, so the store never
            # claims a run is frozen before the runner was asked to do it.
            self._runner.freeze(record.slug)
            record.status = STATUS_FROZEN
            self._store.upsert(record)
            fired.append(record)
        return fired

    def start(self) -> None:
        """Start the background sweep loop.

        Calling this while the loop thread is alive is a no-op, so a second
        call does not spawn a duplicate loop.  Calling it after ``stop``
        starts a fresh loop.
        """
        # stop() leaves the event set; clear it so the loop does not exit
        # at its first wait when the watchdog is started again.
        self._stop.clear()
        if self._thread is not None and self._thread.is_alive():
            return
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Signal the loop to exit and wait up to one tick for the thread."""
        self._stop.set()
        if self._thread is not None:
            self._thread.join(timeout=_TICK_SECS)

    def _loop(self) -> None:
        """Run ``sweep`` every ``_TICK_SECS`` seconds until ``stop`` is called."""
        # Event.wait returns True as soon as stop() sets the event, which
        # ends the loop; False means the tick elapsed and a sweep is due.
        while not self._stop.wait(_TICK_SECS):
            try:
                self.sweep(datetime.now().astimezone())
            except Exception:
                # Thread boundary: an unhandled error here would end the
                # daemon thread and disable the watchdog for the rest of
                # the process, so log it and try again on the next tick.
                self._log.exception(
                    "watchdog sweep failed; retrying on next tick"
                )
|