kin: KIN-091 Улучшения из исследования рынка: (1) Revise button с feedback loop, (2) auto-test before review — агент сам прогоняет тесты и фиксит до review, (3) spec-driven workflow для новых проектов — constitution → spec → plan → tasks, (4) git worktrees для параллельных агентов без конфликтов, (5) auto-trigger pipeline при создании задачи с label auto
This commit is contained in:
parent
0cc063d47a
commit
0ccd451b4b
14 changed files with 1660 additions and 18 deletions
353
agents/runner.py
353
agents/runner.py
|
|
@ -54,6 +54,19 @@ def _build_claude_env() -> dict:
|
|||
seen.add(d)
|
||||
deduped.append(d)
|
||||
env["PATH"] = ":".join(deduped)
|
||||
|
||||
# Ensure SSH agent is available for agents that connect via SSH.
|
||||
# Under launchd, SSH_AUTH_SOCK is not inherited — detect macOS system socket.
|
||||
if "SSH_AUTH_SOCK" not in env:
|
||||
import glob
|
||||
socks = glob.glob("/private/tmp/com.apple.launchd.*/Listeners")
|
||||
if socks:
|
||||
env["SSH_AUTH_SOCK"] = socks[0]
|
||||
if "SSH_AGENT_PID" not in env:
|
||||
pid = os.environ.get("SSH_AGENT_PID")
|
||||
if pid:
|
||||
env["SSH_AGENT_PID"] = pid
|
||||
|
||||
return env
|
||||
|
||||
|
||||
|
|
@ -127,6 +140,7 @@ def run_agent(
|
|||
dry_run: bool = False,
|
||||
allow_write: bool = False,
|
||||
noninteractive: bool = False,
|
||||
working_dir_override: str | None = None,
|
||||
) -> dict:
|
||||
"""Run a single Claude Code agent as a subprocess.
|
||||
|
||||
|
|
@ -161,7 +175,9 @@ def run_agent(
|
|||
working_dir = None
|
||||
# Operations projects have no local path — sysadmin works via SSH
|
||||
is_operations = project and project.get("project_type") == "operations"
|
||||
if not is_operations and project and role in ("debugger", "frontend_dev", "backend_dev", "tester", "security"):
|
||||
if working_dir_override:
|
||||
working_dir = working_dir_override
|
||||
elif not is_operations and project and role in ("debugger", "frontend_dev", "backend_dev", "tester", "security", "constitution", "spec", "task_decomposer"):
|
||||
project_path = Path(project["path"]).expanduser()
|
||||
if project_path.is_dir():
|
||||
working_dir = str(project_path)
|
||||
|
|
@ -685,6 +701,151 @@ def _save_sysadmin_output(
|
|||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Auto-test: detect test failure in agent output
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_TEST_FAILURE_PATTERNS = [
|
||||
r"\bFAILED\b",
|
||||
r"\bFAIL\b",
|
||||
r"\d+\s+failed",
|
||||
r"test(?:s)?\s+failed",
|
||||
r"assert(?:ion)?\s*(error|failed)",
|
||||
r"exception(?:s)?\s+occurred",
|
||||
r"returncode\s*[!=]=\s*0",
|
||||
r"Error:\s",
|
||||
r"ERRORS?\b",
|
||||
]
|
||||
|
||||
_TEST_SUCCESS_PATTERNS = [
|
||||
r"no\s+failures",
|
||||
r"all\s+tests?\s+pass",
|
||||
r"0\s+failed",
|
||||
r"passed.*no\s+errors",
|
||||
]
|
||||
|
||||
|
||||
def _is_test_failure(result: dict) -> bool:
|
||||
"""Return True if agent output indicates test failures.
|
||||
|
||||
Checks for failure keywords, guards against false positives from
|
||||
explicit success phrases (e.g. 'no failures').
|
||||
"""
|
||||
output = result.get("raw_output") or result.get("output") or ""
|
||||
if not isinstance(output, str):
|
||||
output = json.dumps(output, ensure_ascii=False)
|
||||
|
||||
for p in _TEST_SUCCESS_PATTERNS:
|
||||
if re.search(p, output, re.IGNORECASE):
|
||||
return False
|
||||
|
||||
for p in _TEST_FAILURE_PATTERNS:
|
||||
if re.search(p, output, re.IGNORECASE):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Auto-test runner: run project tests via `make test`
# ---------------------------------------------------------------------------

# Roles that trigger auto-test when project.auto_test_enabled is set
_AUTO_TEST_ROLES = {"backend_dev", "frontend_dev"}


def _run_project_tests(project_path: str, timeout: int = 120) -> dict:
    """Run `make test` in project_path. Returns {success, output, returncode}.

    Never raises — all errors are captured and returned in output.

    Args:
        project_path: Directory containing the project's Makefile.
        timeout: Seconds to wait before killing the test run (default 120).

    Returns:
        dict with keys:
            success (bool): True iff `make test` exited with code 0.
            output (str): Combined stdout+stderr, or an error description.
            returncode (int): Exit code; 124 on timeout, 127 when make is
                missing, -1 for any other unexpected error.
    """
    env = _build_claude_env()
    # Fix: use env.get("PATH") instead of env["PATH"] — if the built env has
    # no PATH entry, the old code raised KeyError, violating the
    # "never raises" contract. shutil.which(path=None) falls back to
    # os.environ["PATH"] on its own.
    make_cmd = shutil.which("make", path=env.get("PATH")) or "make"
    try:
        result = subprocess.run(
            [make_cmd, "test"],
            cwd=project_path,
            capture_output=True,
            text=True,
            timeout=timeout,
            env=env,
        )
        output = (result.stdout or "") + (result.stderr or "")
        return {"success": result.returncode == 0, "output": output, "returncode": result.returncode}
    except subprocess.TimeoutExpired:
        return {"success": False, "output": f"make test timed out after {timeout}s", "returncode": 124}
    except FileNotFoundError:
        return {"success": False, "output": "make not found — no Makefile or make not in PATH", "returncode": 127}
    except Exception as exc:
        return {"success": False, "output": f"Test run error: {exc}", "returncode": -1}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Decomposer output: create child tasks from task_decomposer JSON
# ---------------------------------------------------------------------------

def _save_decomposer_output(
    conn: sqlite3.Connection,
    project_id: str,
    parent_task_id: str,
    result: dict,
) -> dict:
    """Parse task_decomposer output and create child tasks in DB.

    Expected output format: {tasks: [{title, brief, priority, category, acceptance_criteria}]}
    Idempotent: skips tasks with same parent_task_id + title (case-insensitive).
    Returns {created: int, skipped: int}.
    """
    payload = result.get("raw_output") or result.get("output") or ""
    if isinstance(payload, (dict, list)):
        payload = json.dumps(payload, ensure_ascii=False)

    data = _try_parse_json(payload)
    if not isinstance(data, dict):
        return {"created": 0, "skipped": 0, "error": "non-JSON decomposer output"}

    items = data.get("tasks", [])
    if not isinstance(items, list):
        return {"created": 0, "skipped": 0, "error": "invalid tasks format"}

    counts = {"created": 0, "skipped": 0}
    for entry in items:
        if not isinstance(entry, dict):
            continue
        title = (entry.get("title") or "").strip()
        if not title:
            continue

        # Idempotency: skip if same parent + title already exists
        row = conn.execute(
            """SELECT id FROM tasks
               WHERE parent_task_id = ? AND lower(trim(title)) = lower(trim(?))""",
            (parent_task_id, title),
        ).fetchone()
        if row is not None:
            counts["skipped"] += 1
            continue

        # Normalize category; anything outside the known set becomes None.
        category = (entry.get("category") or "").strip().upper()
        if category not in models.TASK_CATEGORIES:
            category = None

        new_task_id = models.next_task_id(conn, project_id, category=category)
        models.create_task(
            conn,
            new_task_id,
            project_id,
            title,
            priority=entry.get("priority", 5),
            brief={"text": entry.get("brief") or "", "source": f"decomposer:{parent_task_id}"},
            category=category,
            acceptance_criteria=entry.get("acceptance_criteria"),
            parent_task_id=parent_task_id,
        )
        counts["created"] += 1

    return counts
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Auto-learning: extract decisions from pipeline results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -866,6 +1027,26 @@ def run_pipeline(
|
|||
model = step.get("model", "sonnet")
|
||||
brief = step.get("brief")
|
||||
|
||||
# Worktree isolation: opt-in per project, for write-capable roles
|
||||
_WORKTREE_ROLES = {"backend_dev", "frontend_dev", "debugger"}
|
||||
worktree_path = None
|
||||
project_for_wt = models.get_project(conn, task["project_id"]) if not dry_run else None
|
||||
use_worktree = (
|
||||
not dry_run
|
||||
and role in _WORKTREE_ROLES
|
||||
and project_for_wt
|
||||
and project_for_wt.get("worktrees_enabled")
|
||||
and project_for_wt.get("path")
|
||||
)
|
||||
if use_worktree:
|
||||
try:
|
||||
from core.worktree import create_worktree, ensure_gitignore
|
||||
p_path = str(Path(project_for_wt["path"]).expanduser())
|
||||
ensure_gitignore(p_path)
|
||||
worktree_path = create_worktree(p_path, task_id, role)
|
||||
except Exception:
|
||||
worktree_path = None # Fall back to normal execution
|
||||
|
||||
try:
|
||||
result = run_agent(
|
||||
conn, role, task_id, project_id,
|
||||
|
|
@ -875,6 +1056,7 @@ def run_pipeline(
|
|||
dry_run=dry_run,
|
||||
allow_write=allow_write,
|
||||
noninteractive=noninteractive,
|
||||
working_dir_override=worktree_path,
|
||||
)
|
||||
except Exception as exc:
|
||||
exc_msg = f"Step {i+1}/{len(steps)} ({role}) raised exception: {exc}"
|
||||
|
|
@ -999,6 +1181,44 @@ def run_pipeline(
|
|||
"pipeline_id": pipeline["id"] if pipeline else None,
|
||||
}
|
||||
|
||||
# Worktree merge/cleanup after successful step
|
||||
if worktree_path and result["success"] and not dry_run:
|
||||
try:
|
||||
from core.worktree import merge_worktree, cleanup_worktree
|
||||
p_path = str(Path(project_for_wt["path"]).expanduser())
|
||||
merge_result = merge_worktree(worktree_path, p_path)
|
||||
if not merge_result["success"]:
|
||||
conflicts = merge_result.get("conflicts", [])
|
||||
conflict_msg = f"Worktree merge conflict in files: {', '.join(conflicts)}" if conflicts else "Worktree merge failed"
|
||||
models.update_task(conn, task_id, status="blocked", blocked_reason=conflict_msg)
|
||||
cleanup_worktree(worktree_path, p_path)
|
||||
if pipeline:
|
||||
models.update_pipeline(conn, pipeline["id"], status="failed",
|
||||
total_cost_usd=total_cost,
|
||||
total_tokens=total_tokens,
|
||||
total_duration_seconds=total_duration)
|
||||
return {
|
||||
"success": False,
|
||||
"error": conflict_msg,
|
||||
"steps_completed": i,
|
||||
"results": results,
|
||||
"total_cost_usd": total_cost,
|
||||
"total_tokens": total_tokens,
|
||||
"total_duration_seconds": total_duration,
|
||||
"pipeline_id": pipeline["id"] if pipeline else None,
|
||||
}
|
||||
cleanup_worktree(worktree_path, p_path)
|
||||
except Exception:
|
||||
pass # Worktree errors must never block pipeline
|
||||
elif worktree_path and not dry_run:
|
||||
# Step failed — cleanup worktree without merging
|
||||
try:
|
||||
from core.worktree import cleanup_worktree
|
||||
p_path = str(Path(project_for_wt["path"]).expanduser())
|
||||
cleanup_worktree(worktree_path, p_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
results.append(result)
|
||||
|
||||
# Semantic blocked: agent ran successfully but returned status='blocked'
|
||||
|
|
@ -1056,6 +1276,137 @@ def run_pipeline(
|
|||
except Exception:
|
||||
pass # Never block pipeline on sysadmin save errors
|
||||
|
||||
# Save decomposer output: create child tasks from task_decomposer JSON
|
||||
if role == "task_decomposer" and result["success"] and not dry_run:
|
||||
try:
|
||||
_save_decomposer_output(conn, project_id, task_id, result)
|
||||
except Exception:
|
||||
pass # Never block pipeline on decomposer save errors
|
||||
|
||||
# Project-level auto-test: run `make test` after backend_dev/frontend_dev steps.
|
||||
# Enabled per project via auto_test_enabled flag (opt-in).
|
||||
# On failure, loop fixer up to KIN_AUTO_TEST_MAX_ATTEMPTS times, then block.
|
||||
if (
|
||||
not dry_run
|
||||
and role in _AUTO_TEST_ROLES
|
||||
and result["success"]
|
||||
and project_for_wt
|
||||
and project_for_wt.get("auto_test_enabled")
|
||||
and project_for_wt.get("path")
|
||||
):
|
||||
max_auto_test_attempts = int(os.environ.get("KIN_AUTO_TEST_MAX_ATTEMPTS") or 3)
|
||||
p_path_str = str(Path(project_for_wt["path"]).expanduser())
|
||||
test_run = _run_project_tests(p_path_str)
|
||||
results.append({"role": "_auto_test", "success": test_run["success"],
|
||||
"output": test_run["output"], "_project_test": True})
|
||||
auto_test_attempt = 0
|
||||
while not test_run["success"] and auto_test_attempt < max_auto_test_attempts:
|
||||
auto_test_attempt += 1
|
||||
fix_context = (
|
||||
f"Automated project test run (make test) failed after your changes.\n"
|
||||
f"Test output:\n{test_run['output'][:4000]}\n"
|
||||
f"Fix the failing tests. Do NOT modify test files."
|
||||
)
|
||||
fix_result = run_agent(
|
||||
conn, role, task_id, project_id,
|
||||
model=model,
|
||||
previous_output=fix_context,
|
||||
dry_run=False,
|
||||
allow_write=allow_write,
|
||||
noninteractive=noninteractive,
|
||||
)
|
||||
total_cost += fix_result.get("cost_usd") or 0
|
||||
total_tokens += fix_result.get("tokens_used") or 0
|
||||
total_duration += fix_result.get("duration_seconds") or 0
|
||||
results.append({**fix_result, "_auto_test_fix_attempt": auto_test_attempt})
|
||||
test_run = _run_project_tests(p_path_str)
|
||||
results.append({"role": "_auto_test", "success": test_run["success"],
|
||||
"output": test_run["output"], "_project_test": True,
|
||||
"_attempt": auto_test_attempt})
|
||||
if not test_run["success"]:
|
||||
block_reason = (
|
||||
f"Auto-test (make test) failed after {auto_test_attempt} fix attempt(s). "
|
||||
f"Last output: {test_run['output'][:500]}"
|
||||
)
|
||||
models.update_task(conn, task_id, status="blocked", blocked_reason=block_reason)
|
||||
if pipeline:
|
||||
models.update_pipeline(conn, pipeline["id"], status="failed",
|
||||
total_cost_usd=total_cost,
|
||||
total_tokens=total_tokens,
|
||||
total_duration_seconds=total_duration)
|
||||
return {
|
||||
"success": False,
|
||||
"error": block_reason,
|
||||
"steps_completed": i,
|
||||
"results": results,
|
||||
"total_cost_usd": total_cost,
|
||||
"total_tokens": total_tokens,
|
||||
"total_duration_seconds": total_duration,
|
||||
"pipeline_id": pipeline["id"] if pipeline else None,
|
||||
}
|
||||
|
||||
# Auto-test loop: if tester step has auto_fix=true and tests failed,
|
||||
# call fix_role agent and re-run tester up to max_attempts times.
|
||||
if (
|
||||
not dry_run
|
||||
and step.get("auto_fix")
|
||||
and role == "tester"
|
||||
and result["success"]
|
||||
and _is_test_failure(result)
|
||||
):
|
||||
max_attempts = int(step.get("max_attempts", 3))
|
||||
fix_role = step.get("fix_role", "backend_dev")
|
||||
fix_model = step.get("fix_model", model)
|
||||
attempt = 0
|
||||
while attempt < max_attempts and _is_test_failure(result):
|
||||
attempt += 1
|
||||
tester_output = result.get("raw_output") or result.get("output") or ""
|
||||
if isinstance(tester_output, (dict, list)):
|
||||
tester_output = json.dumps(tester_output, ensure_ascii=False)
|
||||
|
||||
# Run fixer
|
||||
fix_result = run_agent(
|
||||
conn, fix_role, task_id, project_id,
|
||||
model=fix_model,
|
||||
previous_output=tester_output,
|
||||
dry_run=False,
|
||||
allow_write=allow_write,
|
||||
noninteractive=noninteractive,
|
||||
)
|
||||
total_cost += fix_result.get("cost_usd") or 0
|
||||
total_tokens += fix_result.get("tokens_used") or 0
|
||||
total_duration += fix_result.get("duration_seconds") or 0
|
||||
results.append({**fix_result, "_auto_fix_attempt": attempt})
|
||||
|
||||
# Re-run tester
|
||||
fix_output = fix_result.get("raw_output") or fix_result.get("output") or ""
|
||||
if isinstance(fix_output, (dict, list)):
|
||||
fix_output = json.dumps(fix_output, ensure_ascii=False)
|
||||
retest = run_agent(
|
||||
conn, role, task_id, project_id,
|
||||
model=model,
|
||||
previous_output=fix_output,
|
||||
dry_run=False,
|
||||
allow_write=allow_write,
|
||||
noninteractive=noninteractive,
|
||||
)
|
||||
total_cost += retest.get("cost_usd") or 0
|
||||
total_tokens += retest.get("tokens_used") or 0
|
||||
total_duration += retest.get("duration_seconds") or 0
|
||||
result = retest
|
||||
results.append({**result, "_auto_retest_attempt": attempt})
|
||||
|
||||
# Save final test result regardless of outcome
|
||||
try:
|
||||
final_output = result.get("raw_output") or result.get("output") or ""
|
||||
models.update_task(conn, task_id, test_result={
|
||||
"output": final_output if isinstance(final_output, str) else str(final_output),
|
||||
"auto_fix_attempts": attempt,
|
||||
"passed": not _is_test_failure(result),
|
||||
})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Chain output to next step
|
||||
previous_output = result.get("raw_output") or result.get("output")
|
||||
if isinstance(previous_output, (dict, list)):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue