kin/tests/test_qa_gaps.py
2026-03-17 16:02:47 +02:00

556 lines
23 KiB
Python

"""QA gap tests: issues identified during tester-brief review 2026-03-17.
Covers:
1. (medium) Issue 1 — blocked_reason from dept head sub-pipeline failure NOT passed to update_task().
When _execute_department_head_step runs a sub-pipeline and a worker fails, the specific
worker error is LOST at two levels:
(a) _execute_department_head_step does not forward sub_result['error'] in its return dict
(b) run_pipeline at line 1529 uses generic 'Department X sub-pipeline failed'
instead of dept_result.get('error') or dept_result.get('blocked_reason')
Note: when the dept head itself returns status=blocked (not the workers), the reason
IS propagated correctly via the _parse_agent_blocked path (line 1435-1479).
The bug only fires when sub-pipeline workers fail.
2. (low) Issue 2 — dept_infra and dept_research routes MISSING.
specialists.yaml has 7 departments but only 6 dept_* routes.
infra_head and research_head have no dedicated dept_* route.
test_department_routes_exist uses >= 6 so it passes; this test documents the gap explicitly.
3. (low) Issue 3 — pm.md line 134: 'Valid values for status' appears after JSON example
that does NOT contain a status field. The PM is told to include status in output but
the main example contradicts this.
4. (low) Issue 4 — context_builder.py line 335: duplicate task = context.get('task').
Already assigned on line 256. No functional bug, but redundant code.
"""
import json
import pytest
from unittest.mock import patch, MagicMock
from core.db import init_db
from core import models
from agents.runner import run_pipeline, _execute_department_head_step
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def conn():
c = init_db(":memory:")
models.create_project(c, "proj", "TestProject", "~/projects/test",
tech_stack=["python", "vue3"])
models.create_task(c, "PROJ-001", "proj", "Full-stack feature",
brief={"route_type": "dept_feature"})
yield c
c.close()
def _mock_claude_success(output_data):
mock = MagicMock()
mock.stdout = json.dumps(output_data) if isinstance(output_data, dict) else output_data
mock.stderr = ""
mock.returncode = 0
return mock
def _mock_claude_failure(error_msg="error"):
mock = MagicMock()
mock.stdout = ""
mock.stderr = error_msg
mock.returncode = 1
return mock
def _dept_head_output(sub_pipeline, artifacts=None, handoff_notes="", status="done"):
return {
"status": status,
"sub_pipeline": sub_pipeline,
"artifacts": artifacts or {"files_changed": ["api.py"], "notes": "done"},
"handoff_notes": handoff_notes,
}
# ---------------------------------------------------------------------------
# Issue 1: blocked_reason from dept sub-pipeline worker failure NOT passed
# to update_task() in the parent run_pipeline
# (medium severity)
# ---------------------------------------------------------------------------
class TestBlockedReasonPropagation:
"""Issue 1 — agents/runner.py ~line 1529.
When a department head's sub-pipeline worker fails, the specific error
from the worker is NOT propagated to the parent run_pipeline's update_task call.
The bug path:
run_pipeline (parent)
→ run_agent(backend_head) succeeds
→ _execute_department_head_step(...)
→ run_pipeline (child sub-pipeline)
→ run_agent(backend_dev) FAILS with specific error
→ sub_result has error = "Step 1/1 (backend_dev) failed: <specific_error>"
→ BUT _execute_department_head_step does NOT include sub_result['error']
in its return dict (returns only success=False, output=summary)
→ dept_result has no 'blocked_reason' or 'error' key
→ Parent run_pipeline at line 1529 writes generic:
"Department backend_head sub-pipeline failed"
(ignores dept_result entirely)
→ update_task(blocked_reason="Department backend_head sub-pipeline failed")
(specific worker error is LOST)
These tests verify the bug exists and serve as regression tests when fixed.
"""
@patch("agents.runner._run_autocommit")
@patch("agents.runner.subprocess.run")
def test_worker_failure_specific_error_lost_in_task_blocked_reason(
self, mock_run, mock_autocommit, conn
):
"""When sub-pipeline worker fails with specific error, that specific error
is NOT preserved in task.blocked_reason. Only a generic message is stored.
This test DOCUMENTS the bug. The task.blocked_reason gets:
'Department backend_head sub-pipeline failed'
instead of something containing the actual worker error.
"""
specific_worker_error = "Cannot connect to database: connection refused at port 5432"
call_count = [0]
def side_effect(*args, **kwargs):
call_count[0] += 1
if call_count[0] == 1:
# backend_head plans work successfully
return _mock_claude_success(_dept_head_output(
sub_pipeline=[{"role": "backend_dev", "brief": "Implement"}],
))
# backend_dev (worker) fails with specific error
return _mock_claude_failure(specific_worker_error)
mock_run.side_effect = side_effect
steps = [{"role": "backend_head", "model": "opus", "brief": "Do backend"}]
result = run_pipeline(conn, "PROJ-001", steps)
assert result["success"] is False
task = models.get_task(conn, "PROJ-001")
assert task["status"] == "blocked"
# BUG DOCUMENTED: The specific worker error is NOT in task.blocked_reason.
# Current behavior: task.blocked_reason == 'Department backend_head sub-pipeline failed'
# Desired behavior: specific_worker_error should appear in task.blocked_reason
assert specific_worker_error not in (task["blocked_reason"] or ""), (
f"Unexpected: specific worker error was propagated to task.blocked_reason. "
f"This means Issue 1 has been fixed — update this test!"
)
# Verify: generic message IS what gets stored (documents current behavior)
assert "backend_head" in (task["blocked_reason"] or ""), (
"Expected task.blocked_reason to contain 'backend_head' (the generic message)"
)
@patch("agents.runner._run_autocommit")
@patch("agents.runner.subprocess.run")
def test_execute_department_head_step_does_not_include_sub_error_in_return(
self, mock_run, mock_autocommit, conn
):
"""_execute_department_head_step returns success=False when worker fails,
but the returned dict does NOT contain the specific error from the sub-pipeline.
This verifies the first level of the bug: the function silently drops the error.
"""
mock_run.return_value = _mock_claude_failure("disk full: cannot write to /tmp")
output = _dept_head_output(
sub_pipeline=[{"role": "backend_dev", "brief": "Implement"}],
)
pipeline = models.create_pipeline(conn, "PROJ-001", "proj", "dept_feature",
[{"role": "backend_head"}])
dept_result = _execute_department_head_step(
conn, "PROJ-001", "proj",
parent_pipeline_id=pipeline["id"],
step={"role": "backend_head", "brief": "Do backend"},
dept_head_result={"raw_output": json.dumps(output)},
)
assert dept_result["success"] is False
# BUG: 'error' key is missing — sub_result['error'] is not forwarded
assert "error" not in dept_result, (
f"Unexpected: 'error' key is now in dept_result={dept_result}. "
"Issue 1 (first level) may be fixed — update this test!"
)
# BUG: 'blocked_reason' key is also missing
assert "blocked_reason" not in dept_result, (
f"Unexpected: 'blocked_reason' key is now in dept_result={dept_result}. "
"Issue 1 (first level) may be fixed — update this test!"
)
@patch("agents.runner._run_autocommit")
@patch("agents.runner.subprocess.run")
def test_run_pipeline_uses_generic_error_not_dept_result_details(
self, mock_run, mock_autocommit, conn
):
"""run_pipeline uses 'Department X sub-pipeline failed' at line 1529,
ignoring any error/blocked_reason in dept_result.
This verifies the second level of the bug.
"""
call_count = [0]
def side_effect(*args, **kwargs):
call_count[0] += 1
if call_count[0] == 1:
return _mock_claude_success(_dept_head_output(
sub_pipeline=[{"role": "backend_dev", "brief": "Implement"}],
))
return _mock_claude_failure("import error: module 'mymodule' not found")
mock_run.side_effect = side_effect
steps = [{"role": "backend_head", "model": "opus", "brief": "Do backend"}]
result = run_pipeline(conn, "PROJ-001", steps)
assert result["success"] is False
# BUG VERIFIED: result['error'] is the generic message, not the worker error
expected_generic = "Department backend_head sub-pipeline failed"
assert result["error"] == expected_generic, (
f"Expected generic error '{expected_generic}', "
f"got '{result.get('error')}'. "
"If this fails, Issue 1 has been fixed — update this test!"
)
# ---------------------------------------------------------------------------
# Issue 2: infra_head and research_head missing dept_* routes
# (low severity — documentation gap)
# ---------------------------------------------------------------------------
class TestMissingDeptRoutes:
"""Issue 2 — agents/specialists.yaml routes section.
The specialists.yaml defines 7 departments:
backend, frontend, qa, security, infra, research, marketing
But there are only 6 dept_* routes:
dept_feature, dept_fullstack, dept_security_audit, dept_backend,
dept_frontend, dept_marketing
infra_head (infra department) and research_head (research department)
have NO dedicated standalone dept_* route. This means the PM cannot
route a task exclusively to the infra or research department via route template.
"""
def test_infra_head_has_dedicated_dept_route(self):
"""dept_infra route exists in specialists.yaml with steps: [infra_head]."""
import yaml
with open("agents/specialists.yaml") as f:
data = yaml.safe_load(f)
routes = data["routes"]
dept_routes = {k: v for k, v in routes.items() if k.startswith("dept_")}
infra_only_routes = [
name for name, route in dept_routes.items()
if route["steps"] == ["infra_head"]
]
assert len(infra_only_routes) == 1, (
f"Expected exactly one dept_infra route with steps=[infra_head], "
f"found: {infra_only_routes}"
)
assert "dept_infra" in infra_only_routes
def test_research_head_has_no_dedicated_dept_route(self):
"""Confirms research_head is NOT reachable via any standalone dept_research route.
This test documents the gap. It will FAIL once a dept_research route is added.
"""
import yaml
with open("agents/specialists.yaml") as f:
data = yaml.safe_load(f)
routes = data["routes"]
dept_routes = {k: v for k, v in routes.items() if k.startswith("dept_")}
research_only_routes = [
name for name, route in dept_routes.items()
if route["steps"] == ["research_head"]
]
assert len(research_only_routes) == 0, (
f"Found unexpected dept_research route(s): {research_only_routes}. "
"Update this test if route was intentionally added."
)
def test_infra_and_research_heads_missing_from_all_routes(self):
"""Both infra_head and research_head appear in NO route in specialists.yaml.
This is the core gap: the PM cannot use these department heads.
When fixed (routes added), this test will fail and should be updated/removed.
"""
import yaml
with open("agents/specialists.yaml") as f:
data = yaml.safe_load(f)
routes = data["routes"]
all_route_roles = set()
for route in routes.values():
all_route_roles.update(route.get("steps", []))
# Verify the gap exists
assert "infra_head" not in all_route_roles, (
"infra_head now appears in a route. Issue 2 partially fixed. "
"Update this test."
)
assert "research_head" not in all_route_roles, (
"research_head now appears in a route. Issue 2 partially fixed. "
"Update this test."
)
def test_6_dept_routes_currently_exist(self):
"""Documents that exactly 6 dept_* routes exist (not 7 or 8).
The existing test test_department_routes_exist uses >= 6, which passes.
This test pins the CURRENT state. When dept_infra and dept_research are added,
this count will become 8 and this test should be updated.
"""
import yaml
with open("agents/specialists.yaml") as f:
data = yaml.safe_load(f)
routes = data["routes"]
dept_routes = {k: v for k, v in routes.items() if k.startswith("dept_")}
assert len(dept_routes) == 6, (
f"Expected 6 dept_* routes (documenting current state), "
f"found {len(dept_routes)}: {list(dept_routes.keys())}. "
"If this changed, update the count and remove gap tests above."
)
# ---------------------------------------------------------------------------
# Issue 3: pm.md status field inconsistency
# (low severity — documentation/prompt clarity)
# ---------------------------------------------------------------------------
class TestPmPromptStatusFieldConsistency:
"""Issue 3 — agents/prompts/pm.md line 134.
Line 134 says:
'Valid values for status: "done", "blocked".'
This line appears AFTER the main JSON output example (lines 109-131) which
does NOT contain a 'status' field at all. The PM receives contradictory
instructions: the primary example shows no 'status', but line 134 implies
status should be present.
The correct example DOES contain a status field in the blocked-only JSON
at line 143, but the primary output example on lines 109-131 omits it.
"""
def test_pm_prompt_main_example_does_not_contain_status_field(self):
"""The main JSON example in pm.md does NOT include a 'status' field.
This is an inconsistency: line 134 says 'Valid values for status'
but the example itself omits status. Documents the structural ambiguity.
"""
with open("agents/prompts/pm.md") as f:
content = f.read()
# Find the main output example block
# It starts after '## Output format' and ends before '## Blocked Protocol'
output_section_start = content.find("## Output format")
blocked_section_start = content.find("## Blocked Protocol")
assert output_section_start != -1, "Section '## Output format' not found in pm.md"
assert blocked_section_start != -1, "Section '## Blocked Protocol' not found in pm.md"
output_section = content[output_section_start:blocked_section_start]
# The main example JSON block (the first ```json...``` block in this section)
import re
json_blocks = re.findall(r"```json\s*(.*?)```", output_section, re.DOTALL)
assert len(json_blocks) >= 1, "No JSON example found in ## Output format section"
main_example_json_text = json_blocks[0].strip()
# The main example must NOT have a status field (confirming the inconsistency)
# If this test suddenly fails, status was added to the example — the bug is fixed
try:
main_example = json.loads(main_example_json_text)
has_status = "status" in main_example
except json.JSONDecodeError:
# Example may not be parseable (it has comments/placeholders)
has_status = '"status"' in main_example_json_text
assert not has_status, (
"The main JSON example NOW contains a 'status' field. "
"Issue 3 has been fixed — update or remove this test."
)
def test_pm_prompt_does_not_mention_status_values_after_example(self):
"""Orphaned 'Valid values for status' line was removed from pm.md.
Issue 3 fixed: the ambiguous note that appeared after the main JSON example
and referenced a 'status' field absent from that example has been deleted.
"""
with open("agents/prompts/pm.md") as f:
content = f.read()
assert 'Valid values for `status`' not in content, (
"Orphaned 'Valid values for `status`' line was re-introduced in pm.md"
)
def test_pm_blocked_protocol_example_uses_reason_not_blocked_reason(self):
"""The Blocked Protocol JSON example uses 'reason' (not 'blocked_reason').
But _parse_agent_blocked in runner.py accepts both 'reason' and 'blocked_reason'.
The pm.md blocked example and the inline status note use different field names,
adding to the overall inconsistency.
"""
with open("agents/prompts/pm.md") as f:
content = f.read()
blocked_section_start = content.find("## Blocked Protocol")
assert blocked_section_start != -1
blocked_section = content[blocked_section_start:]
import re
json_blocks = re.findall(r"```json\s*(.*?)```", blocked_section, re.DOTALL)
assert len(json_blocks) >= 1, "No JSON example in ## Blocked Protocol"
blocked_example_text = json_blocks[0].strip()
# The blocked protocol example uses "reason", not "blocked_reason"
assert '"reason"' in blocked_example_text, (
"Expected 'reason' field in blocked protocol JSON example"
)
# And should NOT use "blocked_reason" (different from the inline note)
assert '"blocked_reason"' not in blocked_example_text, (
"'blocked_reason' appeared in blocked protocol example — field name is now consistent. "
"Update this test."
)
# ---------------------------------------------------------------------------
# Issue 4: context_builder.py duplicate task assignment
# (low severity — redundant code)
# ---------------------------------------------------------------------------
class TestContextBuilderDuplicateAssignment:
"""Issue 4 — core/context_builder.py line 335.
task = context.get('task') is assigned twice inside format_prompt():
- Line 256: task = context.get('task') (first assignment, used for task info)
- Line 335: task = context.get('task') (duplicate, used for revise_comment)
No functional bug — both assignments produce the same result since the context
dict is not mutated between them. The duplicate is dead code and misleading.
"""
def test_duplicate_task_assignment_does_not_affect_revise_comment_output(self):
"""format_prompt includes revise_comment correctly despite the duplicate
task = context.get('task') on line 335.
This test verifies the duplicate causes no observable functional regression.
"""
from core.context_builder import format_prompt
task_data = {
"id": "PROJ-001",
"title": "Test task",
"status": "in_progress",
"priority": "normal",
"revise_comment": "Please add error handling to the API endpoint",
}
context = {
"task": task_data,
"project": {
"id": "proj",
"name": "TestProject",
"path": "~/projects/test",
"language": "ru",
},
}
# Use a simple inline template to avoid file I/O
result = format_prompt(context, "backend_dev",
prompt_template="You are backend_dev.")
# revise_comment should appear in the formatted output
assert "Director's revision request:" in result
assert "Please add error handling to the API endpoint" in result
def test_duplicate_task_assignment_does_not_affect_task_info_section(self):
"""format_prompt includes task info correctly (first use of 'task' on line 256).
Verifies the first assignment is not broken by the duplicate on line 335.
"""
from core.context_builder import format_prompt
task_data = {
"id": "PROJ-001",
"title": "My test task",
"status": "in_progress",
"priority": "high",
"brief": {"route_type": "debug"},
}
context = {
"task": task_data,
"project": {
"id": "proj",
"name": "TestProject",
"path": "~/projects/test",
"language": "ru",
},
}
result = format_prompt(context, "debugger",
prompt_template="You are debugger.")
# Task section should be populated (uses first assignment at line 256)
assert "PROJ-001" in result
assert "My test task" in result
def test_source_code_has_single_task_assignment(self):
"""Duplicate assignment was removed from context_builder.py.
Issue 4 fixed: task = context.get('task') now appears exactly once
inside format_prompt().
"""
import ast
import pathlib
source = pathlib.Path("core/context_builder.py").read_text()
tree = ast.parse(source)
# Count assignments of the form: task = context.get('task')
task_assign_count = 0
for node in ast.walk(tree):
if isinstance(node, ast.Assign):
targets_are_task = any(
isinstance(t, ast.Name) and t.id == "task"
for t in node.targets
)
if not targets_are_task:
continue
val = node.value
if (
isinstance(val, ast.Call)
and isinstance(val.func, ast.Attribute)
and val.func.attr == "get"
and isinstance(val.func.value, ast.Name)
and val.func.value.id == "context"
and len(val.args) == 1
and isinstance(val.args[0], ast.Constant)
and val.args[0].value == "task"
):
task_assign_count += 1
assert task_assign_count == 1, (
f"Expected exactly 1 'task = context.get(\"task\")' assignment, "
f"found {task_assign_count}. Duplicate may have been re-introduced."
)