Fix bootstrap: deep scan, CLAUDE.md fallback, noise filtering

1. Tech stack: recursive file search (depth 3) + CLAUDE.md text fallback
   when config files are on remote server (detects nodejs, postgresql, etc.)
2. Modules: scan */src/ patterns in top-level dirs (frontend/src/, backend-pg/src/)
3. Decisions: filter out unrelated sections (Jitsi, Nextcloud, Prosody, GOIP),
   filter noise (commit hashes, shell commands, external service paths).
   Noise filtering also applied to Obsidian decisions.

Tested on vdolipoperek: 4 tech stack entries, 5 modules, 9 clean decisions, 24 Obsidian tasks.
61 tests, all passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
johnfrum1234 2026-03-15 13:37:42 +02:00
parent da4a8aae72
commit e5444114bd
3 changed files with 344 additions and 94 deletions

View file

@ -42,54 +42,91 @@ _FILE_MARKERS = {
}
# Directory names excluded from every recursive scan: dependency install
# trees, VCS metadata, build output, caches, and virtualenvs.
_SKIP_DIRS = {"node_modules", ".git", "dist", ".next", ".nuxt", "__pycache__", ".venv", "venv"}
def detect_tech_stack(project_path: Path) -> list[str]:
    """Detect the project's tech stack from files on disk.

    Walks the tree up to depth 3 (skipping node_modules/.git/dist, see
    _SKIP_DIRS), matching known config-file markers and parsing
    package.json / requirements.txt / go.mod for framework hints.
    If nothing is found (e.g. the real project files live on a remote
    server), falls back to keyword heuristics over CLAUDE.md.

    Returns a sorted list of lowercase tech labels (e.g. ["nodejs", "vue3"]).
    """
    stack: set[str] = set()
    # Recursive search for config files and manifests (depth <= 3).
    for fpath in _walk_files(project_path, max_depth=3):
        fname = fpath.name
        if fname in _FILE_MARKERS:
            stack.add(_FILE_MARKERS[fname])
        if fname == "package.json":
            stack.update(_parse_package_json(fpath))
        if fname == "requirements.txt":
            stack.update(_parse_requirements_txt(fpath))
        if fname == "go.mod":
            stack.add("go")
            try:
                text = fpath.read_text(errors="replace")
            except OSError:
                pass  # unreadable go.mod: keep the bare "go" label
            else:
                # Framework hints appear in go.mod dependency lines.
                if "gin-gonic" in text:
                    stack.add("gin")
                if "fiber" in text:
                    stack.add("fiber")
    # Fallback: extract tech hints from CLAUDE.md if no config files found.
    if not stack:
        stack.update(_detect_stack_from_claude_md(project_path))
    return sorted(stack)
def _find_package_jsons(root: Path) -> list[Path]:
"""Find package.json files (root + immediate subdirs, skip node_modules)."""
results = []
pj = root / "package.json"
if pj.exists():
results.append(pj)
for sub in root.iterdir():
if sub.is_dir() and sub.name != "node_modules" and not sub.name.startswith("."):
pj = sub / "package.json"
if pj.exists():
results.append(pj)
return results
# CLAUDE.md text → tech labels (for fallback when project files are on a remote server)
_CLAUDE_MD_TECH_HINTS = {
r"(?i)vue[\s.]?3": "vue3", r"(?i)vue[\s.]?2": "vue2",
r"(?i)\bnuxt\b": "nuxt3", r"(?i)\breact\b": "react",
r"(?i)\btypescript\b": "typescript", r"(?i)\bvite\b": "vite",
r"(?i)\btailwind": "tailwind",
r"(?i)node\.?js": "nodejs", r"(?i)\bexpress\b": "express",
r"(?i)postgresql|postgres": "postgresql",
r"(?i)\bsqlite\b": "sqlite", r"(?i)\bmysql\b": "mysql",
r"(?i)\bdocker\b": "docker",
r"(?i)\bpython\b": "python", r"(?i)\bfastapi\b": "fastapi",
r"(?i)\bdjango\b": "django", r"(?i)\bflask\b": "flask",
r"(?i)\bgo\b.*(?:gin|fiber|module)": "go",
r"(?i)\bnginx\b": "nginx",
r"(?i)\bpinia\b": "pinia", r"(?i)\bvuex\b": "vuex",
}
def _detect_stack_from_claude_md(project_path: Path) -> list[str]:
    """Fallback detection: scan CLAUDE.md prose for known tech keywords.

    Used when no config files were found locally (project lives on a
    remote server).  Only the first 5 KB of the file is inspected.
    Returns labels in _CLAUDE_MD_TECH_HINTS order; empty list when the
    file is absent or unreadable.
    """
    try:
        # FileNotFoundError is an OSError, so a missing file yields [] too.
        text = (project_path / "CLAUDE.md").read_text(errors="replace")[:5000]
    except OSError:
        return []
    return [
        tech
        for pattern, tech in _CLAUDE_MD_TECH_HINTS.items()
        if re.search(pattern, text)
    ]
def _walk_files(root: Path, max_depth: int = 3, _depth: int = 0):
    """Yield every file under *root*, descending at most *max_depth* levels.

    Skips directories named in _SKIP_DIRS and hidden (dot-prefixed)
    directories; unreadable directories are silently ignored.
    *_depth* is internal recursion state — callers should not pass it.
    """
    if _depth > max_depth:
        return
    try:
        entries = sorted(root.iterdir())  # deterministic traversal order
    except OSError:  # covers PermissionError, which is an OSError subclass
        return
    for entry in entries:
        if entry.is_file():
            yield entry
        elif entry.is_dir() and entry.name not in _SKIP_DIRS and not entry.name.startswith("."):
            yield from _walk_files(entry, max_depth, _depth + 1)
def _parse_package_json(path: Path) -> list[str]:
@ -140,26 +177,40 @@ _BACKEND_MARKERS = {"express", "fastify", "koa", "router", "controller", "middle
def detect_modules(project_path: Path) -> list[dict]:
"""Scan src/ (or app/, lib/, frontend/, backend/) for modules."""
"""Scan for modules: checks root subdirs, */src/ patterns, standard names.
Strategy:
1. Find all "source root" dirs (src/, app/, lib/ at root or inside top-level dirs)
2. Each first-level subdir of a source root = a module candidate
3. Top-level dirs with their own src/ are treated as component roots
(e.g. frontend/, backend-pg/) scan THEIR src/ for modules
"""
modules = []
scan_dirs = []
scan_dirs: list[tuple[Path, str | None]] = [] # (dir, prefix_hint)
# Prioritized source dirs
for name in ("src", "app", "lib", "frontend", "backend", "server", "client"):
# Direct source dirs in root
for name in ("src", "app", "lib"):
d = project_path / name
if d.is_dir():
scan_dirs.append(d)
scan_dirs.append((d, None))
# Also check frontend/src, backend/src patterns
for name in ("frontend/src", "backend/src", "backend-pg/src"):
d = project_path / name
if d.is_dir():
scan_dirs.append(d)
# Top-level component dirs (frontend/, backend/, backend-pg/, server/, client/)
# These get scanned for src/ inside, or directly if they contain source files
for child in sorted(project_path.iterdir()):
if not child.is_dir() or child.name in _SKIP_DIRS or child.name.startswith("."):
continue
child_src = child / "src"
if child_src.is_dir():
# e.g. frontend/src/, backend-pg/src/ — scan their subdirs
scan_dirs.append((child_src, child.name))
elif child.name in ("frontend", "backend", "server", "client", "web", "api"):
# No src/ but it's a known component dir — scan it directly
scan_dirs.append((child, child.name))
seen = set()
for scan_dir in scan_dirs:
for scan_dir, prefix in scan_dirs:
for child in sorted(scan_dir.iterdir()):
if not child.is_dir() or child.name.startswith(".") or child.name == "node_modules":
if not child.is_dir() or child.name in _SKIP_DIRS or child.name.startswith("."):
continue
mod = _analyze_module(child, project_path)
key = (mod["name"], mod["path"])
@ -230,7 +281,7 @@ def _guess_module_type(dir_path: Path, exts: set[str], files: list[Path]) -> str
_DECISION_PATTERNS = [
(r"(?i)\b(GOTCHA|ВАЖНО|WARNING|ВНИМАНИЕ)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
(r"(?i)\b(WORKAROUND|ОБХОДНОЙ|ХАК)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "workaround"),
(r"(?i)\b(FIXME|TODO|БАГИ?)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
(r"(?i)\b(FIXME|БАГИ?)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
(r"(?i)\b(РЕШЕНИЕ|DECISION)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "decision"),
(r"(?i)\b(CONVENTION|СОГЛАШЕНИЕ|ПРАВИЛО)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "convention"),
]
@ -238,13 +289,83 @@ _DECISION_PATTERNS = [
# Section headers that likely contain decisions
_DECISION_SECTIONS = [
r"(?i)known\s+issues?", r"(?i)workaround", r"(?i)gotcha",
r"(?i)решени[яе]", r"(?i)грабл[ия]", r"(?i)важно",
r"(?i)решени[яе]", r"(?i)грабл[ия]",
r"(?i)conventions?", r"(?i)правила", r"(?i)нюансы",
]
# Section headers about UNRELATED services — skip these entirely
_UNRELATED_SECTION_PATTERNS = [
r"(?i)jitsi", r"(?i)nextcloud", r"(?i)prosody",
r"(?i)coturn", r"(?i)turn\b", r"(?i)asterisk",
r"(?i)ghost\s+блог", r"(?i)onlyoffice",
r"(?i)git\s+sync", r"(?i)\.env\s+добав",
r"(?i)goip\s+watcher", r"(?i)tbank\s+monitor", # monitoring services
r"(?i)фикс\s+удален", # commit-level fixes (not decisions)
]
def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
"""Parse CLAUDE.md for decisions, gotchas, workarounds."""
# Noise patterns — individual items that look like noise, not decisions
_NOISE_PATTERNS = [
r"^[0-9a-f]{6,40}$", # commit hashes
r"^\s*(docker|ssh|scp|git|curl|sudo)\s", # shell commands
r"^`[^`]+`$", # inline code-only items
r"(?i)(prosody|jitsi|jicofo|jvb|coturn|nextcloud|onlyoffice|ghost)", # unrelated services
r"(?i)\.jitsi-meet-cfg", # jitsi config paths
r"(?i)(meet\.jitsi|sitemeet\.org)", # jitsi domains
r"(?i)(cloud\.vault\.red|office\.vault)", # nextcloud domains
r"(?i)JWT_APP_(ID|SECRET)", # jwt config lines
r"(?i)XMPP_", # prosody config
r"\(коммит\s+`?[0-9a-f]+`?\)", # "(коммит `a33c2b9`)" references
r"(?i)known_uids|idle_loop|reconnect", # goip-watcher internals
]
def _is_noise(text: str) -> bool:
    """Return True when *text* matches any known noise pattern.

    Markdown bold markers are stripped before matching so that e.g.
    "**a33c2b9**" is still recognized as a commit hash.
    """
    stripped = re.sub(r"\*\*([^*]*)\*\*", r"\1", text).strip()
    for pattern in _NOISE_PATTERNS:
        if re.search(pattern, stripped):
            return True
    return False
def _split_into_sections(text: str) -> list[tuple[str, str]]:
"""Split markdown into (header, body) pairs by ## headers.
Returns list of (header_text, body_text) tuples.
Anything before the first ## is returned with header="".
"""
parts = re.split(r"(?m)^(##\s+.+)$", text)
sections = []
current_header = ""
current_body = parts[0] if parts else ""
for i in range(1, len(parts), 2):
if current_header or current_body.strip():
sections.append((current_header, current_body))
current_header = parts[i].strip()
current_body = parts[i + 1] if i + 1 < len(parts) else ""
if current_header or current_body.strip():
sections.append((current_header, current_body))
return sections
def _is_unrelated_section(header: str) -> bool:
    """Return True if *header* names an unrelated service (Jitsi, Nextcloud, ...)."""
    for pattern in _UNRELATED_SECTION_PATTERNS:
        if re.search(pattern, header):
            return True
    return False
def extract_decisions_from_claude_md(
project_path: Path,
project_id: str | None = None,
project_name: str | None = None,
) -> list[dict]:
"""Parse CLAUDE.md for decisions, gotchas, workarounds.
Filters out:
- Sections about unrelated services (Jitsi, Nextcloud, Prosody, etc.)
- Noise: commit hashes, docker/ssh commands, paths to external services
- If CLAUDE.md has multi-project sections, only extracts for current project
"""
claude_md = project_path / "CLAUDE.md"
if not claude_md.exists():
return []
@ -254,20 +375,30 @@ def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
except OSError:
return []
# Split into sections and filter out unrelated ones
sections = _split_into_sections(text)
relevant_text = []
for header, body in sections:
if _is_unrelated_section(header):
continue
relevant_text.append(header + "\n" + body)
filtered_text = "\n".join(relevant_text)
decisions = []
seen_titles = set()
# Pattern-based extraction
# Pattern-based extraction from relevant sections only
for pattern, dec_type in _DECISION_PATTERNS:
for m in re.finditer(pattern, text, re.DOTALL):
label = m.group(1).strip()
for m in re.finditer(pattern, filtered_text, re.DOTALL):
body = m.group(2).strip()
if not body or len(body) < 10:
continue
# First line as title, rest as description
lines = body.split("\n")
title = lines[0].strip().rstrip(".")[:100]
desc = body
if _is_noise(title) or _is_noise(desc):
continue
if title not in seen_titles:
seen_titles.add(title)
decisions.append({
@ -277,26 +408,36 @@ def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
"category": _guess_category(title + " " + desc),
})
# Section-based extraction: find headers matching decision sections
sections = re.split(r"(?m)^(#{1,4}\s+.*?)$", text)
for i, section in enumerate(sections):
# Section-based extraction: find ### or #### headers matching decision patterns
sub_sections = re.split(r"(?m)^(#{1,4}\s+.*?)$", filtered_text)
for i, section in enumerate(sub_sections):
if any(re.search(pat, section) for pat in _DECISION_SECTIONS):
# The content is in the next section
if i + 1 < len(sections):
content = sections[i + 1].strip()
# Extract bullet points
if i + 1 < len(sub_sections):
content = sub_sections[i + 1].strip()
for line in content.split("\n"):
line = line.strip()
if line.startswith(("- ", "* ", "")):
# Numbered items (1. **text**) or bullet items
item = None
if re.match(r"^\d+\.\s+", line):
item = re.sub(r"^\d+\.\s+", "", line).strip()
elif line.startswith(("- ", "* ", "")):
item = line.lstrip("-*• ").strip()
if item and len(item) > 10 and item[:80] not in seen_titles:
seen_titles.add(item[:80])
decisions.append({
"type": "decision",
"title": item[:100],
"description": item,
"category": _guess_category(item),
})
if not item or len(item) < 10:
continue
# Clean bold markers for title
clean = re.sub(r"\*\*([^*]+)\*\*", r"\1", item)
if _is_noise(clean):
continue
title = clean[:100]
if title not in seen_titles:
seen_titles.add(title)
decisions.append({
"type": "gotcha",
"title": title,
"description": item,
"category": _guess_category(item),
})
return decisions
@ -414,28 +555,34 @@ def _extract_obsidian_decisions(text: str, source: str, decisions: list[dict]):
for pattern, dec_type in _DECISION_PATTERNS:
for m in re.finditer(pattern, text, re.DOTALL):
body = m.group(2).strip()
if body and len(body) > 10:
title = body.split("\n")[0].strip()[:100]
decisions.append({
"type": dec_type,
"title": title,
"description": body,
"category": _guess_category(body),
"source": source,
})
# Also look for ВАЖНО/GOTCHA/FIXME inline markers not caught above
for m in re.finditer(r"(?i)\*\*(ВАЖНО|GOTCHA|FIXME)\*\*[:\s]*(.*?)(?=\n|$)", text):
body = m.group(2).strip()
if body and len(body) > 10:
if not body or len(body) < 10:
continue
title = body.split("\n")[0].strip()[:100]
if _is_noise(title) or _is_noise(body):
continue
decisions.append({
"type": "gotcha",
"title": body[:100],
"type": dec_type,
"title": title,
"description": body,
"category": _guess_category(body),
"source": source,
})
# Also look for ВАЖНО/GOTCHA/FIXME inline markers not caught above
for m in re.finditer(r"(?i)\*\*(ВАЖНО|GOTCHA|FIXME)\*\*[:\s]*(.*?)(?=\n|$)", text):
body = m.group(2).strip()
if not body or len(body) < 10:
continue
if _is_noise(body):
continue
decisions.append({
"type": "gotcha",
"title": body[:100],
"description": body,
"category": _guess_category(body),
"source": source,
})
# ---------------------------------------------------------------------------
# Formatting for CLI preview