kin: auto-commit after pipeline

This commit is contained in:
Gros Frumos 2026-03-17 15:59:43 +02:00
parent 396f5193d3
commit 18160de45e
9 changed files with 449 additions and 0 deletions

View file

@ -619,6 +619,9 @@ def _migrate(conn: sqlite3.Connection):
if "department" not in pipeline_cols:
conn.execute("ALTER TABLE pipelines ADD COLUMN department TEXT")
conn.commit()
if "pid" not in pipeline_cols:
conn.execute("ALTER TABLE pipelines ADD COLUMN pid INTEGER")
conn.commit()
# Create department_handoffs table (KIN-098)
if "department_handoffs" not in existing_tables:

View file

@ -496,6 +496,7 @@ def update_pipeline(
total_cost_usd: float | None = None,
total_tokens: int | None = None,
total_duration_seconds: int | None = None,
pid: int | None = None,
) -> dict:
"""Update pipeline status and stats."""
fields: dict[str, Any] = {}
@ -509,6 +510,8 @@ def update_pipeline(
fields["total_tokens"] = total_tokens
if total_duration_seconds is not None:
fields["total_duration_seconds"] = total_duration_seconds
if pid is not None:
fields["pid"] = pid
if fields:
sets = ", ".join(f"{k} = ?" for k in fields)
vals = list(fields.values()) + [id]
@ -520,6 +523,14 @@ def update_pipeline(
return _row_to_dict(row)
def get_running_pipelines_with_pid(conn: sqlite3.Connection) -> list[dict]:
    """Return all running pipelines that have a known PID (used by watchdog)."""
    cursor = conn.execute(
        "SELECT id, task_id, pid FROM pipelines WHERE status = 'running' AND pid IS NOT NULL"
    )
    return _rows_to_list(cursor.fetchall())
# ---------------------------------------------------------------------------
# Support
# ---------------------------------------------------------------------------

77
core/watchdog.py Normal file
View file

@ -0,0 +1,77 @@
"""
Kin pipeline watchdog background thread that detects dead subprocesses.
Every `interval` seconds: check all running pipelines with a known PID.
If the PID is dead (ProcessLookupError / ESRCH), mark pipeline as failed
and task as blocked with a descriptive reason.
"""
import logging
import os
import sqlite3
import threading
import time
from pathlib import Path
from core import models
from core.db import init_db
_logger = logging.getLogger("kin.watchdog")
def _check_dead_pipelines(db_path: Path) -> None:
    """Single watchdog pass: open a fresh connection, scan running pipelines.

    Opens its own short-lived SQLite connection (the watchdog runs on its own
    background thread, so it must not share a connection with the main app).
    Every running pipeline with a recorded PID is probed with signal 0; if the
    process no longer exists, the pipeline is marked failed and its task
    blocked with a descriptive reason.

    Never raises: any failure is logged and the pass simply ends, so the
    watchdog thread survives to try again on the next interval.
    """
    try:
        conn = sqlite3.connect(str(db_path), check_same_thread=False)
        conn.row_factory = sqlite3.Row
        try:
            try:
                running = models.get_running_pipelines_with_pid(conn)
            except Exception as exc:
                # Table may not exist yet on very first startup
                _logger.debug("Watchdog: could not query pipelines (%s)", exc)
                return
            for row in running:
                pid = row["pid"]
                pipeline_id = row["id"]
                task_id = row["task_id"]
                try:
                    # NOTE: signal 0 only checks existence; PID reuse could
                    # make a dead pipeline look alive until the next restart.
                    os.kill(pid, 0)  # signal 0 = existence check
                except ProcessLookupError:
                    reason = f"Process died unexpectedly (PID {pid})"
                    _logger.warning(
                        "Watchdog: pipeline %s PID %s is dead — marking blocked (%s)",
                        pipeline_id, pid, task_id,
                    )
                    try:
                        models.update_pipeline(conn, pipeline_id, status="failed")
                        models.update_task(conn, task_id, status="blocked", blocked_reason=reason)
                    except Exception as upd_exc:
                        _logger.error("Watchdog: failed to update pipeline/task: %s", upd_exc)
                except PermissionError:
                    # Process exists but we can't signal it (e.g. different user) — skip
                    pass
        finally:
            # Fix: previously the connection leaked when os.kill raised an
            # unexpected exception (e.g. OverflowError on a corrupt PID) —
            # the outer handler caught it but the close was never reached.
            conn.close()
    except Exception as exc:
        _logger.error("Watchdog pass failed: %s", exc)
def _watchdog_loop(db_path: Path, interval: int) -> None:
    """Body of the daemon thread: wait `interval` seconds, run one pass, repeat forever."""
    _logger.info("Watchdog started (interval=%ds, db=%s)", interval, db_path)
    # Sleep first so a freshly-started process isn't scanned immediately.
    while True:
        time.sleep(interval)
        _check_dead_pipelines(db_path)
def start_watchdog(db_path: Path, interval: int = 30) -> None:
    """Start the background watchdog thread (daemon=True, so it dies with the process)."""
    worker = threading.Thread(
        name="kin-watchdog",
        target=_watchdog_loop,
        args=(db_path, interval),
        daemon=True,
    )
    worker.start()
    _logger.info("Watchdog thread launched (pid=%d)", os.getpid())