Fix bootstrap: deep scan, CLAUDE.md fallback, noise filtering
1. Tech stack: recursive file search (depth 3) plus a CLAUDE.md text fallback for when config files live on a remote server (detects nodejs, postgresql, etc.).
2. Modules: scan */src/ patterns in top-level dirs (frontend/src/, backend-pg/src/).
3. Decisions: skip sections about unrelated services (Jitsi, Nextcloud, Prosody, GOIP) and filter noise (commit hashes, shell commands, external service paths). The same noise filtering is applied to Obsidian decisions.

Tested on vdolipoperek: 4 tech entries, 5 modules, 9 clean decisions, 24 Obsidian tasks. 61 tests, all passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent da4a8aae72 · commit e5444114bd
3 changed files with 344 additions and 94 deletions
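For orientation, the three entry points this commit changes are plain functions over a project path. A minimal usage sketch (the import path and project location are hypothetical; the names, signatures, and return shapes come from the hunks below):

from pathlib import Path

# Hypothetical import path; the functions are defined in the scanner module patched below.
from scanner import detect_tech_stack, detect_modules, extract_decisions_from_claude_md

project = Path("/opt/vdolipoperek")  # hypothetical checkout location
print(detect_tech_stack(project))    # sorted list, e.g. ["express", "nodejs", "vue3"]
print(detect_modules(project))       # list of {"name", "path", "type", ...} dicts
print(extract_decisions_from_claude_md(project, "vdol", "vdolipoperek"))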
@@ -42,54 +42,91 @@ _FILE_MARKERS = {
 }
 
 
+_SKIP_DIRS = {"node_modules", ".git", "dist", ".next", ".nuxt", "__pycache__", ".venv", "venv"}
 
 
 def detect_tech_stack(project_path: Path) -> list[str]:
-    """Detect tech stack from project files."""
+    """Detect tech stack from project files.
+
+    Searches recursively up to depth 3, skipping node_modules/.git/dist.
+    Falls back to CLAUDE.md heuristics if no files found.
+    """
     stack: set[str] = set()
 
-    # Config file markers
-    for fname, tech in _FILE_MARKERS.items():
-        # Check root and one level deep
-        if (project_path / fname).exists():
-            stack.add(tech)
-        for sub in ("frontend", "backend", "server", "client", "app"):
-            if (project_path / sub / fname).exists():
-                stack.add(tech)
-
-    # package.json (root + subdirs)
-    for pj_path in _find_package_jsons(project_path):
-        stack.update(_parse_package_json(pj_path))
-
-    # requirements.txt
-    for req_path in project_path.glob("**/requirements.txt"):
-        if _is_inside_node_modules(req_path, project_path):
-            continue
-        stack.update(_parse_requirements_txt(req_path))
-
-    # go.mod
-    go_mod = project_path / "go.mod"
-    if go_mod.exists():
-        stack.add("go")
-        text = go_mod.read_text(errors="replace")
-        if "gin-gonic" in text:
-            stack.add("gin")
-        if "fiber" in text:
-            stack.add("fiber")
+    # Recursive search for config files and package.json (depth ≤ 3)
+    for fpath in _walk_files(project_path, max_depth=3):
+        fname = fpath.name
+        if fname in _FILE_MARKERS:
+            stack.add(_FILE_MARKERS[fname])
+        if fname == "package.json":
+            stack.update(_parse_package_json(fpath))
+        if fname == "requirements.txt":
+            stack.update(_parse_requirements_txt(fpath))
+        if fname == "go.mod":
+            stack.add("go")
+            try:
+                text = fpath.read_text(errors="replace")
+                if "gin-gonic" in text:
+                    stack.add("gin")
+                if "fiber" in text:
+                    stack.add("fiber")
+            except OSError:
+                pass
+
+    # Fallback: extract tech hints from CLAUDE.md if no config files found
+    if not stack:
+        stack.update(_detect_stack_from_claude_md(project_path))
 
     return sorted(stack)
 
 
-def _find_package_jsons(root: Path) -> list[Path]:
-    """Find package.json files (root + immediate subdirs, skip node_modules)."""
-    results = []
-    pj = root / "package.json"
-    if pj.exists():
-        results.append(pj)
-    for sub in root.iterdir():
-        if sub.is_dir() and sub.name != "node_modules" and not sub.name.startswith("."):
-            pj = sub / "package.json"
-            if pj.exists():
-                results.append(pj)
-    return results
+# CLAUDE.md text → tech labels (for fallback when project files are on a remote server)
+_CLAUDE_MD_TECH_HINTS = {
+    r"(?i)vue[\s.]?3": "vue3", r"(?i)vue[\s.]?2": "vue2",
+    r"(?i)\bnuxt\b": "nuxt3", r"(?i)\breact\b": "react",
+    r"(?i)\btypescript\b": "typescript", r"(?i)\bvite\b": "vite",
+    r"(?i)\btailwind": "tailwind",
+    r"(?i)node\.?js": "nodejs", r"(?i)\bexpress\b": "express",
+    r"(?i)postgresql|postgres": "postgresql",
+    r"(?i)\bsqlite\b": "sqlite", r"(?i)\bmysql\b": "mysql",
+    r"(?i)\bdocker\b": "docker",
+    r"(?i)\bpython\b": "python", r"(?i)\bfastapi\b": "fastapi",
+    r"(?i)\bdjango\b": "django", r"(?i)\bflask\b": "flask",
+    r"(?i)\bgo\b.*(?:gin|fiber|module)": "go",
+    r"(?i)\bnginx\b": "nginx",
+    r"(?i)\bpinia\b": "pinia", r"(?i)\bvuex\b": "vuex",
+}
+
+
+def _detect_stack_from_claude_md(project_path: Path) -> list[str]:
+    """Fallback: infer tech stack from CLAUDE.md text when no config files exist."""
+    claude_md = project_path / "CLAUDE.md"
+    if not claude_md.exists():
+        return []
+    try:
+        text = claude_md.read_text(errors="replace")[:5000]  # First 5KB is enough
+    except OSError:
+        return []
+    stack = []
+    for pattern, tech in _CLAUDE_MD_TECH_HINTS.items():
+        if re.search(pattern, text):
+            stack.append(tech)
+    return stack
+
+
+def _walk_files(root: Path, max_depth: int = 3, _depth: int = 0):
+    """Yield files up to max_depth, skipping node_modules/dist/.git."""
+    if _depth > max_depth:
+        return
+    try:
+        entries = sorted(root.iterdir())
+    except (OSError, PermissionError):
+        return
+    for entry in entries:
+        if entry.is_file():
+            yield entry
+        elif entry.is_dir() and entry.name not in _SKIP_DIRS and not entry.name.startswith("."):
+            yield from _walk_files(entry, max_depth, _depth + 1)
 
 
 def _parse_package_json(path: Path) -> list[str]:
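A quick check of the depth-limited walk above (a sketch, assuming `_walk_files` is importable from the module patched in this hunk):

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "frontend" / "src").mkdir(parents=True)
    (root / "frontend" / "package.json").write_text("{}")
    (root / "node_modules" / "pkg").mkdir(parents=True)
    (root / "node_modules" / "pkg" / "package.json").write_text("{}")

    found = [p.relative_to(root) for p in _walk_files(root, max_depth=3)]
    assert Path("frontend/package.json") in found             # nested file is reached
    assert all("node_modules" not in p.parts for p in found)  # skip-dir pruned entirely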
@@ -140,26 +177,40 @@ _BACKEND_MARKERS = {"express", "fastify", "koa", "router", "controller", "middle
 
 
 def detect_modules(project_path: Path) -> list[dict]:
-    """Scan src/ (or app/, lib/, frontend/, backend/) for modules."""
+    """Scan for modules: checks root subdirs, */src/ patterns, standard names.
+
+    Strategy:
+    1. Find all "source root" dirs (src/, app/, lib/ at root or inside top-level dirs)
+    2. Each first-level subdir of a source root = a module candidate
+    3. Top-level dirs with their own src/ are treated as component roots
+       (e.g. frontend/, backend-pg/) — scan THEIR src/ for modules
+    """
     modules = []
-    scan_dirs = []
+    scan_dirs: list[tuple[Path, str | None]] = []  # (dir, prefix_hint)
 
-    # Prioritized source dirs
-    for name in ("src", "app", "lib", "frontend", "backend", "server", "client"):
+    # Direct source dirs in root
+    for name in ("src", "app", "lib"):
         d = project_path / name
         if d.is_dir():
-            scan_dirs.append(d)
+            scan_dirs.append((d, None))
 
-    # Also check frontend/src, backend/src patterns
-    for name in ("frontend/src", "backend/src", "backend-pg/src"):
-        d = project_path / name
-        if d.is_dir():
-            scan_dirs.append(d)
+    # Top-level component dirs (frontend/, backend/, backend-pg/, server/, client/)
+    # These get scanned for src/ inside, or directly if they contain source files
+    for child in sorted(project_path.iterdir()):
+        if not child.is_dir() or child.name in _SKIP_DIRS or child.name.startswith("."):
+            continue
+        child_src = child / "src"
+        if child_src.is_dir():
+            # e.g. frontend/src/, backend-pg/src/ — scan their subdirs
+            scan_dirs.append((child_src, child.name))
+        elif child.name in ("frontend", "backend", "server", "client", "web", "api"):
+            # No src/ but it's a known component dir — scan it directly
+            scan_dirs.append((child, child.name))
 
     seen = set()
-    for scan_dir in scan_dirs:
+    for scan_dir, prefix in scan_dirs:
         for child in sorted(scan_dir.iterdir()):
-            if not child.is_dir() or child.name.startswith(".") or child.name == "node_modules":
+            if not child.is_dir() or child.name in _SKIP_DIRS or child.name.startswith("."):
                 continue
             mod = _analyze_module(child, project_path)
             key = (mod["name"], mod["path"])
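Illustrative output for the monorepo shape the docstring describes (a sketch; the `name`/`path`/`type` keys come from `_analyze_module` as used above, the path format and exact values are indicative only):

from pathlib import Path

for m in detect_modules(Path("/opt/vdolipoperek")):  # hypothetical path
    print(m["name"], m["path"], m["type"])
# Expected shape (indicative):
#   views       frontend/src/views        frontend
#   components  frontend/src/components   frontend
#   services    backend-pg/src/services   backend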
@@ -230,7 +281,7 @@ def _guess_module_type(dir_path: Path, exts: set[str], files: list[Path]) -> str
 _DECISION_PATTERNS = [
     (r"(?i)\b(GOTCHA|ВАЖНО|WARNING|ВНИМАНИЕ)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
     (r"(?i)\b(WORKAROUND|ОБХОДНОЙ|ХАК)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "workaround"),
-    (r"(?i)\b(FIXME|TODO|БАГИ?)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
+    (r"(?i)\b(FIXME|БАГИ?)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
     (r"(?i)\b(РЕШЕНИЕ|DECISION)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "decision"),
     (r"(?i)\b(CONVENTION|СОГЛАШЕНИЕ|ПРАВИЛО)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "convention"),
 ]
@@ -238,13 +289,83 @@ _DECISION_PATTERNS = [
 # Section headers that likely contain decisions
 _DECISION_SECTIONS = [
     r"(?i)known\s+issues?", r"(?i)workaround", r"(?i)gotcha",
-    r"(?i)решени[яе]", r"(?i)грабл[ия]", r"(?i)важно",
+    r"(?i)решени[яе]", r"(?i)грабл[ия]",
     r"(?i)conventions?", r"(?i)правила", r"(?i)нюансы",
 ]
 
+# Section headers about UNRELATED services — skip these entirely
+_UNRELATED_SECTION_PATTERNS = [
+    r"(?i)jitsi", r"(?i)nextcloud", r"(?i)prosody",
+    r"(?i)coturn", r"(?i)turn\b", r"(?i)asterisk",
+    r"(?i)ghost\s+блог", r"(?i)onlyoffice",
+    r"(?i)git\s+sync", r"(?i)\.env\s+добав",
+    r"(?i)goip\s+watcher", r"(?i)tbank\s+monitor",  # monitoring services
+    r"(?i)фикс\s+удален",  # commit-level fixes (not decisions)
+]
 
-def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
-    """Parse CLAUDE.md for decisions, gotchas, workarounds."""
+# Noise patterns — individual items that look like noise, not decisions
+_NOISE_PATTERNS = [
+    r"^[0-9a-f]{6,40}$",  # commit hashes
+    r"^\s*(docker|ssh|scp|git|curl|sudo)\s",  # shell commands
+    r"^`[^`]+`$",  # inline code-only items
+    r"(?i)(prosody|jitsi|jicofo|jvb|coturn|nextcloud|onlyoffice|ghost)",  # unrelated services
+    r"(?i)\.jitsi-meet-cfg",  # jitsi config paths
+    r"(?i)(meet\.jitsi|sitemeet\.org)",  # jitsi domains
+    r"(?i)(cloud\.vault\.red|office\.vault)",  # nextcloud domains
+    r"(?i)JWT_APP_(ID|SECRET)",  # jwt config lines
+    r"(?i)XMPP_",  # prosody config
+    r"\(коммит\s+`?[0-9a-f]+`?\)",  # "(коммит `a33c2b9`)" references
+    r"(?i)known_uids|idle_loop|reconnect",  # goip-watcher internals
+]
+
+
+def _is_noise(text: str) -> bool:
+    """Check if a decision candidate is noise."""
+    # Clean markdown bold for matching
+    clean = re.sub(r"\*\*([^*]*)\*\*", r"\1", text).strip()
+    return any(re.search(p, clean) for p in _NOISE_PATTERNS)
+
+
+def _split_into_sections(text: str) -> list[tuple[str, str]]:
+    """Split markdown into (header, body) pairs by ## headers.
+
+    Returns list of (header_text, body_text) tuples.
+    Anything before the first ## is returned with header="".
+    """
+    parts = re.split(r"(?m)^(##\s+.+)$", text)
+    sections = []
+    current_header = ""
+    current_body = parts[0] if parts else ""
+
+    for i in range(1, len(parts), 2):
+        if current_header or current_body.strip():
+            sections.append((current_header, current_body))
+        current_header = parts[i].strip()
+        current_body = parts[i + 1] if i + 1 < len(parts) else ""
+
+    if current_header or current_body.strip():
+        sections.append((current_header, current_body))
+
+    return sections
+
+
+def _is_unrelated_section(header: str) -> bool:
+    """Check if a section header is about an unrelated service."""
+    return any(re.search(p, header) for p in _UNRELATED_SECTION_PATTERNS)
+
+
+def extract_decisions_from_claude_md(
+    project_path: Path,
+    project_id: str | None = None,
+    project_name: str | None = None,
+) -> list[dict]:
+    """Parse CLAUDE.md for decisions, gotchas, workarounds.
+
+    Filters out:
+    - Sections about unrelated services (Jitsi, Nextcloud, Prosody, etc.)
+    - Noise: commit hashes, docker/ssh commands, paths to external services
+    - If CLAUDE.md has multi-project sections, only extracts for current project
+    """
     claude_md = project_path / "CLAUDE.md"
     if not claude_md.exists():
         return []
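The noise filter in action (a sketch, assuming `_is_noise` is importable; each line maps to one of the `_NOISE_PATTERNS` above):

assert _is_noise("a33c2b9f01")                         # bare commit hash
assert _is_noise("docker exec -it prosody bash")       # shell command
assert _is_noise("**JWT_APP_SECRET** must be synced")  # bold stripped, then JWT pattern
assert not _is_noise("module.exports = pool (НЕ { pool })")  # real decision survives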
@@ -254,20 +375,30 @@ def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
     except OSError:
         return []
 
+    # Split into sections and filter out unrelated ones
+    sections = _split_into_sections(text)
+    relevant_text = []
+    for header, body in sections:
+        if _is_unrelated_section(header):
+            continue
+        relevant_text.append(header + "\n" + body)
+
+    filtered_text = "\n".join(relevant_text)
+
     decisions = []
     seen_titles = set()
 
-    # Pattern-based extraction
+    # Pattern-based extraction from relevant sections only
     for pattern, dec_type in _DECISION_PATTERNS:
-        for m in re.finditer(pattern, text, re.DOTALL):
-            label = m.group(1).strip()
+        for m in re.finditer(pattern, filtered_text, re.DOTALL):
             body = m.group(2).strip()
             if not body or len(body) < 10:
                 continue
-            # First line as title, rest as description
             lines = body.split("\n")
             title = lines[0].strip().rstrip(".")[:100]
             desc = body
+            if _is_noise(title) or _is_noise(desc):
+                continue
             if title not in seen_titles:
                 seen_titles.add(title)
                 decisions.append({
|
||||||
"category": _guess_category(title + " " + desc),
|
"category": _guess_category(title + " " + desc),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Section-based extraction: find headers matching decision sections
|
# Section-based extraction: find ### or #### headers matching decision patterns
|
||||||
sections = re.split(r"(?m)^(#{1,4}\s+.*?)$", text)
|
sub_sections = re.split(r"(?m)^(#{1,4}\s+.*?)$", filtered_text)
|
||||||
for i, section in enumerate(sections):
|
for i, section in enumerate(sub_sections):
|
||||||
if any(re.search(pat, section) for pat in _DECISION_SECTIONS):
|
if any(re.search(pat, section) for pat in _DECISION_SECTIONS):
|
||||||
# The content is in the next section
|
if i + 1 < len(sub_sections):
|
||||||
if i + 1 < len(sections):
|
content = sub_sections[i + 1].strip()
|
||||||
content = sections[i + 1].strip()
|
|
||||||
# Extract bullet points
|
|
||||||
for line in content.split("\n"):
|
for line in content.split("\n"):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line.startswith(("- ", "* ", "• ")):
|
# Numbered items (1. **text**) or bullet items
|
||||||
|
item = None
|
||||||
|
if re.match(r"^\d+\.\s+", line):
|
||||||
|
item = re.sub(r"^\d+\.\s+", "", line).strip()
|
||||||
|
elif line.startswith(("- ", "* ", "• ")):
|
||||||
item = line.lstrip("-*• ").strip()
|
item = line.lstrip("-*• ").strip()
|
||||||
if item and len(item) > 10 and item[:80] not in seen_titles:
|
|
||||||
seen_titles.add(item[:80])
|
if not item or len(item) < 10:
|
||||||
|
continue
|
||||||
|
# Clean bold markers for title
|
||||||
|
clean = re.sub(r"\*\*([^*]+)\*\*", r"\1", item)
|
||||||
|
if _is_noise(clean):
|
||||||
|
continue
|
||||||
|
title = clean[:100]
|
||||||
|
if title not in seen_titles:
|
||||||
|
seen_titles.add(title)
|
||||||
decisions.append({
|
decisions.append({
|
||||||
"type": "decision",
|
"type": "gotcha",
|
||||||
"title": item[:100],
|
"title": title,
|
||||||
"description": item,
|
"description": item,
|
||||||
"category": _guess_category(item),
|
"category": _guess_category(item),
|
||||||
})
|
})
|
||||||
|
|
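A worked example of the numbered-item handling above (the input line is taken from the test fixture later in this commit):

import re

line = "1. **Hotel ID mismatch** — Sletat GetTours vs GetHotels разные ID"
item = re.sub(r"^\d+\.\s+", "", line).strip()        # strip "1. "
clean = re.sub(r"\*\*([^*]+)\*\*", r"\1", item)      # strip bold markers
assert clean == "Hotel ID mismatch — Sletat GetTours vs GetHotels разные ID"
# clean[:100] becomes the decision title; the raw item stays as the description.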
@@ -414,8 +555,11 @@ def _extract_obsidian_decisions(text: str, source: str, decisions: list[dict]):
     for pattern, dec_type in _DECISION_PATTERNS:
         for m in re.finditer(pattern, text, re.DOTALL):
             body = m.group(2).strip()
-            if body and len(body) > 10:
+            if not body or len(body) < 10:
+                continue
             title = body.split("\n")[0].strip()[:100]
+            if _is_noise(title) or _is_noise(body):
+                continue
             decisions.append({
                 "type": dec_type,
                 "title": title,
@@ -427,7 +571,10 @@ def _extract_obsidian_decisions(text: str, source: str, decisions: list[dict]):
     # Also look for ВАЖНО/GOTCHA/FIXME inline markers not caught above
     for m in re.finditer(r"(?i)\*\*(ВАЖНО|GOTCHA|FIXME)\*\*[:\s]*(.*?)(?=\n|$)", text):
         body = m.group(2).strip()
-        if body and len(body) > 10:
+        if not body or len(body) < 10:
+            continue
+        if _is_noise(body):
+            continue
         decisions.append({
             "type": "gotcha",
             "title": body[:100],
@@ -435,7 +435,7 @@ def bootstrap(ctx, path, project_id, name, vault_path, yes):
     click.echo(f"Scanning {project_path} ...")
     tech_stack = detect_tech_stack(project_path)
     modules = detect_modules(project_path)
-    decisions = extract_decisions_from_claude_md(project_path)
+    decisions = extract_decisions_from_claude_md(project_path, project_id, name)
 
     # Obsidian
     obsidian = None
@@ -67,6 +67,27 @@ def test_detect_monorepo(tmp_path):
     assert "fastapi" in stack
 
 
+def test_detect_deep_monorepo(tmp_path):
+    """Test that files nested 2-3 levels deep are found (like vdolipoperek)."""
+    fe = tmp_path / "frontend" / "src"
+    fe.mkdir(parents=True)
+    (tmp_path / "frontend" / "package.json").write_text(json.dumps({
+        "dependencies": {"vue": "^3.4"},
+        "devDependencies": {"vite": "^5.0", "tailwindcss": "^3.4"},
+    }))
+    (tmp_path / "frontend" / "vite.config.js").write_text("export default {}")
+    (tmp_path / "frontend" / "tailwind.config.js").write_text("module.exports = {}")
+
+    be = tmp_path / "backend-pg" / "src"
+    be.mkdir(parents=True)
+    (be / "index.js").write_text("const express = require('express');")
+
+    stack = detect_tech_stack(tmp_path)
+    assert "vue3" in stack
+    assert "vite" in stack
+    assert "tailwind" in stack
+
+
 def test_detect_empty_dir(tmp_path):
     assert detect_tech_stack(tmp_path) == []
@@ -104,6 +125,36 @@ def test_detect_modules_backend_pg(tmp_path):
     assert any(m["name"] == "services" for m in modules)
 
 
+def test_detect_modules_monorepo(tmp_path):
+    """Full monorepo: frontend/src/ + backend-pg/src/."""
+    # Frontend
+    fe_views = tmp_path / "frontend" / "src" / "views"
+    fe_views.mkdir(parents=True)
+    (fe_views / "Hotel.vue").write_text("<template></template>")
+    fe_comp = tmp_path / "frontend" / "src" / "components"
+    fe_comp.mkdir(parents=True)
+    (fe_comp / "Search.vue").write_text("<template></template>")
+
+    # Backend
+    be_svc = tmp_path / "backend-pg" / "src" / "services"
+    be_svc.mkdir(parents=True)
+    (be_svc / "db.js").write_text("const express = require('express');")
+    be_routes = tmp_path / "backend-pg" / "src" / "routes"
+    be_routes.mkdir(parents=True)
+    (be_routes / "api.js").write_text("const router = require('express').Router();")
+
+    modules = detect_modules(tmp_path)
+    names = {m["name"] for m in modules}
+    assert "views" in names
+    assert "components" in names
+    assert "services" in names
+    assert "routes" in names
+    # Check types
+    types = {m["name"]: m["type"] for m in modules}
+    assert types["views"] == "frontend"
+    assert types["components"] == "frontend"
+
+
 # ---------------------------------------------------------------------------
 # Decisions from CLAUDE.md
 # ---------------------------------------------------------------------------
@@ -124,7 +175,7 @@ FIXME: race condition in useSearch composable
 - CSS grid fallback для IE11 (но мы его не поддерживаем)
 """)
 
-    decisions = extract_decisions_from_claude_md(tmp_path)
+    decisions = extract_decisions_from_claude_md(tmp_path, "myproj", "My Project")
     assert len(decisions) >= 4
 
     types = {d["type"] for d in decisions}
@@ -136,6 +187,58 @@ def test_extract_decisions_no_claude_md(tmp_path):
     assert extract_decisions_from_claude_md(tmp_path) == []
 
 
+def test_extract_decisions_filters_unrelated_sections(tmp_path):
+    """Sections about Jitsi, Nextcloud, Prosody should be skipped."""
+    (tmp_path / "CLAUDE.md").write_text("""# vdolipoperek
+
+## Known Issues
+1. **Hotel ID mismatch** — Sletat GetTours vs GetHotels разные ID
+2. **db.js export** — module.exports = pool (НЕ { pool })
+
+## Jitsi + Nextcloud интеграция (2026-03-04)
+
+ВАЖНО: JWT_APP_SECRET must be synced between Prosody and Nextcloud
+GOTCHA: focus.meet.jitsi must be pinned in custom-config.js
+
+## Prosody config
+
+ВАЖНО: conf.d files принадлежат root → писать через docker exec
+
+## Git Sync (2026-03-03)
+
+ВАЖНО: Все среды синхронизированы на коммите 4ee5603
+""")
+
+    decisions = extract_decisions_from_claude_md(tmp_path, "vdol", "vdolipoperek")
+
+    titles = [d["title"] for d in decisions]
+    # Should have the real known issues
+    assert any("Hotel ID mismatch" in t for t in titles)
+    assert any("db.js export" in t for t in titles)
+    # Should NOT have Jitsi/Prosody/Nextcloud noise
+    assert not any("JWT_APP_SECRET" in t for t in titles)
+    assert not any("focus.meet.jitsi" in t for t in titles)
+    assert not any("conf.d files" in t for t in titles)
+
+
+def test_extract_decisions_filters_noise(tmp_path):
+    """Commit hashes and shell commands should not be decisions."""
+    (tmp_path / "CLAUDE.md").write_text("""# Project
+
+## Known Issues
+1. **Real bug** — actual architectural issue that matters
+- docker exec -it prosody bash
+- ssh dev "cd /opt/project && git pull"
+""")
+
+    decisions = extract_decisions_from_claude_md(tmp_path)
+    titles = [d["title"] for d in decisions]
+    assert any("Real bug" in t for t in titles)
+    # Shell commands should be filtered
+    assert not any("docker exec" in t for t in titles)
+    assert not any("ssh dev" in t for t in titles)
+
+
 # ---------------------------------------------------------------------------
 # Obsidian vault
 # ---------------------------------------------------------------------------