From cc4133f5ce080ddff966a0b6a07245f04d4e3be4 Mon Sep 17 00:00:00 2001
From: Gros Frumos
Date: Tue, 17 Mar 2026 14:52:04 +0200
Subject: [PATCH] kin: auto-commit after pipeline

---
 tests/test_qa_gaps.py | 564 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 564 insertions(+)
 create mode 100644 tests/test_qa_gaps.py

diff --git a/tests/test_qa_gaps.py b/tests/test_qa_gaps.py
new file mode 100644
index 0000000..586e1c6
--- /dev/null
+++ b/tests/test_qa_gaps.py
@@ -0,0 +1,564 @@
"""QA gap tests: issues identified during tester-brief review 2026-03-17.

Covers:
1. (medium) Issue 1 — blocked_reason from dept head sub-pipeline failure NOT
   passed to update_task().
   When _execute_department_head_step runs a sub-pipeline and a worker fails,
   the specific worker error is LOST at two levels:
     (a) _execute_department_head_step does not forward sub_result['error']
         in its return dict
     (b) run_pipeline at line 1529 uses generic 'Department X sub-pipeline failed'
         instead of dept_result.get('error') or dept_result.get('blocked_reason')

   Note: when the dept head itself returns status=blocked (not the workers),
   the reason IS propagated correctly via the _parse_agent_blocked path
   (line 1435-1479). The bug only fires when sub-pipeline workers fail.

2. (low) Issue 2 — dept_infra and dept_research routes MISSING.
   specialists.yaml has 7 departments but only 6 dept_* routes.
   infra_head and research_head have no dedicated dept_* route.
   test_department_routes_exist uses >= 6 so it passes; this test documents
   the gap explicitly.

3. (low) Issue 3 — pm.md line 134: 'Valid values for status' appears after a
   JSON example that does NOT contain a status field. The PM is told to include
   status in output but the main example contradicts this.

4. (low) Issue 4 — context_builder.py line 335: duplicate task = context.get('task').
   Already assigned on line 256. No functional bug, but redundant code.
+""" + +import json +import pytest +from unittest.mock import patch, MagicMock + +from core.db import init_db +from core import models +from agents.runner import run_pipeline, _execute_department_head_step + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def conn(): + c = init_db(":memory:") + models.create_project(c, "proj", "TestProject", "~/projects/test", + tech_stack=["python", "vue3"]) + models.create_task(c, "PROJ-001", "proj", "Full-stack feature", + brief={"route_type": "dept_feature"}) + yield c + c.close() + + +def _mock_claude_success(output_data): + mock = MagicMock() + mock.stdout = json.dumps(output_data) if isinstance(output_data, dict) else output_data + mock.stderr = "" + mock.returncode = 0 + return mock + + +def _mock_claude_failure(error_msg="error"): + mock = MagicMock() + mock.stdout = "" + mock.stderr = error_msg + mock.returncode = 1 + return mock + + +def _dept_head_output(sub_pipeline, artifacts=None, handoff_notes="", status="done"): + return { + "status": status, + "sub_pipeline": sub_pipeline, + "artifacts": artifacts or {"files_changed": ["api.py"], "notes": "done"}, + "handoff_notes": handoff_notes, + } + + +# --------------------------------------------------------------------------- +# Issue 1: blocked_reason from dept sub-pipeline worker failure NOT passed +# to update_task() in the parent run_pipeline +# (medium severity) +# --------------------------------------------------------------------------- + +class TestBlockedReasonPropagation: + """Issue 1 — agents/runner.py ~line 1529. + + When a department head's sub-pipeline worker fails, the specific error + from the worker is NOT propagated to the parent run_pipeline's update_task call. + + The bug path: + run_pipeline (parent) + → run_agent(backend_head) succeeds + → _execute_department_head_step(...) 
+ → run_pipeline (child sub-pipeline) + → run_agent(backend_dev) FAILS with specific error + → sub_result has error = "Step 1/1 (backend_dev) failed: " + → BUT _execute_department_head_step does NOT include sub_result['error'] + in its return dict (returns only success=False, output=summary) + → dept_result has no 'blocked_reason' or 'error' key + → Parent run_pipeline at line 1529 writes generic: + "Department backend_head sub-pipeline failed" + (ignores dept_result entirely) + → update_task(blocked_reason="Department backend_head sub-pipeline failed") + (specific worker error is LOST) + + These tests verify the bug exists and serve as regression tests when fixed. + """ + + @patch("agents.runner._run_autocommit") + @patch("agents.runner.subprocess.run") + def test_worker_failure_specific_error_lost_in_task_blocked_reason( + self, mock_run, mock_autocommit, conn + ): + """When sub-pipeline worker fails with specific error, that specific error + is NOT preserved in task.blocked_reason. Only a generic message is stored. + + This test DOCUMENTS the bug. The task.blocked_reason gets: + 'Department backend_head sub-pipeline failed' + instead of something containing the actual worker error. 
+ """ + specific_worker_error = "Cannot connect to database: connection refused at port 5432" + call_count = [0] + + def side_effect(*args, **kwargs): + call_count[0] += 1 + if call_count[0] == 1: + # backend_head plans work successfully + return _mock_claude_success(_dept_head_output( + sub_pipeline=[{"role": "backend_dev", "brief": "Implement"}], + )) + # backend_dev (worker) fails with specific error + return _mock_claude_failure(specific_worker_error) + + mock_run.side_effect = side_effect + + steps = [{"role": "backend_head", "model": "opus", "brief": "Do backend"}] + result = run_pipeline(conn, "PROJ-001", steps) + + assert result["success"] is False + + task = models.get_task(conn, "PROJ-001") + assert task["status"] == "blocked" + + # BUG DOCUMENTED: The specific worker error is NOT in task.blocked_reason. + # Current behavior: task.blocked_reason == 'Department backend_head sub-pipeline failed' + # Desired behavior: specific_worker_error should appear in task.blocked_reason + assert specific_worker_error not in (task["blocked_reason"] or ""), ( + f"Unexpected: specific worker error was propagated to task.blocked_reason. " + f"This means Issue 1 has been fixed — update this test!" + ) + + # Verify: generic message IS what gets stored (documents current behavior) + assert "backend_head" in (task["blocked_reason"] or ""), ( + "Expected task.blocked_reason to contain 'backend_head' (the generic message)" + ) + + @patch("agents.runner._run_autocommit") + @patch("agents.runner.subprocess.run") + def test_execute_department_head_step_does_not_include_sub_error_in_return( + self, mock_run, mock_autocommit, conn + ): + """_execute_department_head_step returns success=False when worker fails, + but the returned dict does NOT contain the specific error from the sub-pipeline. + + This verifies the first level of the bug: the function silently drops the error. 
+ """ + mock_run.return_value = _mock_claude_failure("disk full: cannot write to /tmp") + + output = _dept_head_output( + sub_pipeline=[{"role": "backend_dev", "brief": "Implement"}], + ) + pipeline = models.create_pipeline(conn, "PROJ-001", "proj", "dept_feature", + [{"role": "backend_head"}]) + + dept_result = _execute_department_head_step( + conn, "PROJ-001", "proj", + parent_pipeline_id=pipeline["id"], + step={"role": "backend_head", "brief": "Do backend"}, + dept_head_result={"raw_output": json.dumps(output)}, + ) + + assert dept_result["success"] is False + # BUG: 'error' key is missing — sub_result['error'] is not forwarded + assert "error" not in dept_result, ( + f"Unexpected: 'error' key is now in dept_result={dept_result}. " + "Issue 1 (first level) may be fixed — update this test!" + ) + # BUG: 'blocked_reason' key is also missing + assert "blocked_reason" not in dept_result, ( + f"Unexpected: 'blocked_reason' key is now in dept_result={dept_result}. " + "Issue 1 (first level) may be fixed — update this test!" + ) + + @patch("agents.runner._run_autocommit") + @patch("agents.runner.subprocess.run") + def test_run_pipeline_uses_generic_error_not_dept_result_details( + self, mock_run, mock_autocommit, conn + ): + """run_pipeline uses 'Department X sub-pipeline failed' at line 1529, + ignoring any error/blocked_reason in dept_result. + + This verifies the second level of the bug. 
+ """ + call_count = [0] + + def side_effect(*args, **kwargs): + call_count[0] += 1 + if call_count[0] == 1: + return _mock_claude_success(_dept_head_output( + sub_pipeline=[{"role": "backend_dev", "brief": "Implement"}], + )) + return _mock_claude_failure("import error: module 'mymodule' not found") + + mock_run.side_effect = side_effect + + steps = [{"role": "backend_head", "model": "opus", "brief": "Do backend"}] + result = run_pipeline(conn, "PROJ-001", steps) + + assert result["success"] is False + + # BUG VERIFIED: result['error'] is the generic message, not the worker error + expected_generic = "Department backend_head sub-pipeline failed" + assert result["error"] == expected_generic, ( + f"Expected generic error '{expected_generic}', " + f"got '{result.get('error')}'. " + "If this fails, Issue 1 has been fixed — update this test!" + ) + + +# --------------------------------------------------------------------------- +# Issue 2: infra_head and research_head missing dept_* routes +# (low severity — documentation gap) +# --------------------------------------------------------------------------- + +class TestMissingDeptRoutes: + """Issue 2 — agents/specialists.yaml routes section. + + The specialists.yaml defines 7 departments: + backend, frontend, qa, security, infra, research, marketing + + But there are only 6 dept_* routes: + dept_feature, dept_fullstack, dept_security_audit, dept_backend, + dept_frontend, dept_marketing + + infra_head (infra department) and research_head (research department) + have NO dedicated standalone dept_* route. This means the PM cannot + route a task exclusively to the infra or research department via route template. + """ + + def test_infra_head_has_no_dedicated_dept_route(self): + """Confirms infra_head is NOT reachable via any standalone dept_infra route. + + This test documents the gap. It will FAIL once a dept_infra route is added. 
+ """ + import yaml + with open("agents/specialists.yaml") as f: + data = yaml.safe_load(f) + + routes = data["routes"] + dept_routes = {k: v for k, v in routes.items() if k.startswith("dept_")} + + # Check no route exclusively using infra_head as its only step + infra_only_routes = [ + name for name, route in dept_routes.items() + if route["steps"] == ["infra_head"] + ] + assert len(infra_only_routes) == 0, ( + f"Found unexpected dept_infra route(s): {infra_only_routes}. " + "Update this test if route was intentionally added." + ) + + def test_research_head_has_no_dedicated_dept_route(self): + """Confirms research_head is NOT reachable via any standalone dept_research route. + + This test documents the gap. It will FAIL once a dept_research route is added. + """ + import yaml + with open("agents/specialists.yaml") as f: + data = yaml.safe_load(f) + + routes = data["routes"] + dept_routes = {k: v for k, v in routes.items() if k.startswith("dept_")} + + research_only_routes = [ + name for name, route in dept_routes.items() + if route["steps"] == ["research_head"] + ] + assert len(research_only_routes) == 0, ( + f"Found unexpected dept_research route(s): {research_only_routes}. " + "Update this test if route was intentionally added." + ) + + def test_infra_and_research_heads_missing_from_all_routes(self): + """Both infra_head and research_head appear in NO route in specialists.yaml. + + This is the core gap: the PM cannot use these department heads. + When fixed (routes added), this test will fail and should be updated/removed. + """ + import yaml + with open("agents/specialists.yaml") as f: + data = yaml.safe_load(f) + + routes = data["routes"] + all_route_roles = set() + for route in routes.values(): + all_route_roles.update(route.get("steps", [])) + + # Verify the gap exists + assert "infra_head" not in all_route_roles, ( + "infra_head now appears in a route. Issue 2 partially fixed. " + "Update this test." 
+ ) + assert "research_head" not in all_route_roles, ( + "research_head now appears in a route. Issue 2 partially fixed. " + "Update this test." + ) + + def test_6_dept_routes_currently_exist(self): + """Documents that exactly 6 dept_* routes exist (not 7 or 8). + + The existing test test_department_routes_exist uses >= 6, which passes. + This test pins the CURRENT state. When dept_infra and dept_research are added, + this count will become 8 and this test should be updated. + """ + import yaml + with open("agents/specialists.yaml") as f: + data = yaml.safe_load(f) + + routes = data["routes"] + dept_routes = {k: v for k, v in routes.items() if k.startswith("dept_")} + + assert len(dept_routes) == 6, ( + f"Expected 6 dept_* routes (documenting current state), " + f"found {len(dept_routes)}: {list(dept_routes.keys())}. " + "If this changed, update the count and remove gap tests above." + ) + + +# --------------------------------------------------------------------------- +# Issue 3: pm.md status field inconsistency +# (low severity — documentation/prompt clarity) +# --------------------------------------------------------------------------- + +class TestPmPromptStatusFieldConsistency: + """Issue 3 — agents/prompts/pm.md line 134. + + Line 134 says: + 'Valid values for status: "done", "blocked".' + + This line appears AFTER the main JSON output example (lines 109-131) which + does NOT contain a 'status' field at all. The PM receives contradictory + instructions: the primary example shows no 'status', but line 134 implies + status should be present. + + The correct example DOES contain a status field in the blocked-only JSON + at line 143, but the primary output example on lines 109-131 omits it. + """ + + def test_pm_prompt_main_example_does_not_contain_status_field(self): + """The main JSON example in pm.md does NOT include a 'status' field. + + This is an inconsistency: line 134 says 'Valid values for status' + but the example itself omits status. 
Documents the structural ambiguity. + """ + with open("agents/prompts/pm.md") as f: + content = f.read() + + # Find the main output example block + # It starts after '## Output format' and ends before '## Blocked Protocol' + output_section_start = content.find("## Output format") + blocked_section_start = content.find("## Blocked Protocol") + assert output_section_start != -1, "Section '## Output format' not found in pm.md" + assert blocked_section_start != -1, "Section '## Blocked Protocol' not found in pm.md" + + output_section = content[output_section_start:blocked_section_start] + + # The main example JSON block (the first ```json...``` block in this section) + import re + json_blocks = re.findall(r"```json\s*(.*?)```", output_section, re.DOTALL) + assert len(json_blocks) >= 1, "No JSON example found in ## Output format section" + + main_example_json_text = json_blocks[0].strip() + + # The main example must NOT have a status field (confirming the inconsistency) + # If this test suddenly fails, status was added to the example — the bug is fixed + try: + main_example = json.loads(main_example_json_text) + has_status = "status" in main_example + except json.JSONDecodeError: + # Example may not be parseable (it has comments/placeholders) + has_status = '"status"' in main_example_json_text + + assert not has_status, ( + "The main JSON example NOW contains a 'status' field. " + "Issue 3 has been fixed — update or remove this test." + ) + + def test_pm_prompt_mentions_status_values_after_example(self): + """Line after main example mentions 'Valid values for status: done, blocked'. + + Documents that this note appears AFTER the main example, causing ambiguity. + """ + with open("agents/prompts/pm.md") as f: + content = f.read() + + assert 'Valid values for `status`' in content, ( + "Expected 'Valid values for `status`' in pm.md — did the prompt change?" 
+ ) + + # Confirm it references done and blocked + assert '"done"' in content + assert '"blocked"' in content + + def test_pm_blocked_protocol_example_uses_reason_not_blocked_reason(self): + """The Blocked Protocol JSON example uses 'reason' (not 'blocked_reason'). + + But _parse_agent_blocked in runner.py accepts both 'reason' and 'blocked_reason'. + The pm.md blocked example and the inline status note use different field names, + adding to the overall inconsistency. + """ + with open("agents/prompts/pm.md") as f: + content = f.read() + + blocked_section_start = content.find("## Blocked Protocol") + assert blocked_section_start != -1 + + blocked_section = content[blocked_section_start:] + + import re + json_blocks = re.findall(r"```json\s*(.*?)```", blocked_section, re.DOTALL) + assert len(json_blocks) >= 1, "No JSON example in ## Blocked Protocol" + + blocked_example_text = json_blocks[0].strip() + # The blocked protocol example uses "reason", not "blocked_reason" + assert '"reason"' in blocked_example_text, ( + "Expected 'reason' field in blocked protocol JSON example" + ) + # And should NOT use "blocked_reason" (different from the inline note) + assert '"blocked_reason"' not in blocked_example_text, ( + "'blocked_reason' appeared in blocked protocol example — field name is now consistent. " + "Update this test." + ) + + +# --------------------------------------------------------------------------- +# Issue 4: context_builder.py duplicate task assignment +# (low severity — redundant code) +# --------------------------------------------------------------------------- + +class TestContextBuilderDuplicateAssignment: + """Issue 4 — core/context_builder.py line 335. 
+ + task = context.get('task') is assigned twice inside format_prompt(): + - Line 256: task = context.get('task') (first assignment, used for task info) + - Line 335: task = context.get('task') (duplicate, used for revise_comment) + + No functional bug — both assignments produce the same result since the context + dict is not mutated between them. The duplicate is dead code and misleading. + """ + + def test_duplicate_task_assignment_does_not_affect_revise_comment_output(self): + """format_prompt includes revise_comment correctly despite the duplicate + task = context.get('task') on line 335. + + This test verifies the duplicate causes no observable functional regression. + """ + from core.context_builder import format_prompt + + task_data = { + "id": "PROJ-001", + "title": "Test task", + "status": "in_progress", + "priority": "normal", + "revise_comment": "Please add error handling to the API endpoint", + } + context = { + "task": task_data, + "project": { + "id": "proj", + "name": "TestProject", + "path": "~/projects/test", + "language": "ru", + }, + } + + # Use a simple inline template to avoid file I/O + result = format_prompt(context, "backend_dev", + prompt_template="You are backend_dev.") + + # revise_comment should appear in the formatted output + assert "Director's revision request:" in result + assert "Please add error handling to the API endpoint" in result + + def test_duplicate_task_assignment_does_not_affect_task_info_section(self): + """format_prompt includes task info correctly (first use of 'task' on line 256). + + Verifies the first assignment is not broken by the duplicate on line 335. 
+ """ + from core.context_builder import format_prompt + + task_data = { + "id": "PROJ-001", + "title": "My test task", + "status": "in_progress", + "priority": "high", + "brief": {"route_type": "debug"}, + } + context = { + "task": task_data, + "project": { + "id": "proj", + "name": "TestProject", + "path": "~/projects/test", + "language": "ru", + }, + } + + result = format_prompt(context, "debugger", + prompt_template="You are debugger.") + + # Task section should be populated (uses first assignment at line 256) + assert "PROJ-001" in result + assert "My test task" in result + + def test_source_code_has_duplicate_task_assignment(self): + """Documents that the duplicate assignment still exists in the source. + + This test will FAIL once the duplicate is removed (cleanup completed). + """ + import ast + import pathlib + + source = pathlib.Path("core/context_builder.py").read_text() + tree = ast.parse(source) + + # Count assignments of the form: task = context.get('task') + task_assign_count = 0 + for node in ast.walk(tree): + if isinstance(node, ast.Assign): + # Check target is 'task' + targets_are_task = any( + isinstance(t, ast.Name) and t.id == "task" + for t in node.targets + ) + if not targets_are_task: + continue + # Check value is context.get('task') or context.get("task") + val = node.value + if ( + isinstance(val, ast.Call) + and isinstance(val.func, ast.Attribute) + and val.func.attr == "get" + and isinstance(val.func.value, ast.Name) + and val.func.value.id == "context" + and len(val.args) == 1 + and isinstance(val.args[0], ast.Constant) + and val.args[0].value == "task" + ): + task_assign_count += 1 + + assert task_assign_count == 2, ( + f"Expected exactly 2 duplicate 'task = context.get(\"task\")' assignments " + f"(documenting Issue 4), found {task_assign_count}. " + "If count is 1, the duplicate was removed — this test can be deleted." + )