Fix bootstrap: deep scan, CLAUDE.md fallback, noise filtering

1. Tech stack: recursive file search (depth 3) + CLAUDE.md text fallback
   when config files are on remote server (detects nodejs, postgresql, etc.)
2. Modules: scan */src/ patterns in top-level dirs (frontend/src/, backend-pg/src/)
3. Decisions: filter out unrelated sections (Jitsi, Nextcloud, Prosody, GOIP),
   filter noise (commit hashes, shell commands, external service paths).
   Noise filtering also applied to Obsidian decisions.

Tested on vdolipoperek: 4 tech stack entries, 5 modules, 9 clean decisions, 24 Obsidian tasks.
61 tests, all passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
johnfrum1234 2026-03-15 13:37:42 +02:00
parent da4a8aae72
commit e5444114bd
3 changed files with 344 additions and 94 deletions

View file

@ -42,54 +42,91 @@ _FILE_MARKERS = {
}
# Directory names excluded from every recursive scan: dependency install
# trees, VCS metadata, build output, caches, and virtualenvs.
_SKIP_DIRS = {"node_modules", ".git", "dist", ".next", ".nuxt", "__pycache__", ".venv", "venv"}
def detect_tech_stack(project_path: Path) -> list[str]:
    """Detect the project's tech stack from files on disk.

    Walks the tree up to depth 3 (skipping node_modules/.git/dist, see
    _SKIP_DIRS), matching known config-file markers and parsing
    package.json / requirements.txt / go.mod for framework hints.
    If nothing is found (e.g. the real project files live on a remote
    server), falls back to keyword heuristics over CLAUDE.md.

    Returns a sorted list of lowercase tech labels (e.g. ["nodejs", "vue3"]).
    """
    stack: set[str] = set()
    # Recursive search for config files and manifests (depth <= 3).
    for fpath in _walk_files(project_path, max_depth=3):
        fname = fpath.name
        if fname in _FILE_MARKERS:
            stack.add(_FILE_MARKERS[fname])
        if fname == "package.json":
            stack.update(_parse_package_json(fpath))
        if fname == "requirements.txt":
            stack.update(_parse_requirements_txt(fpath))
        if fname == "go.mod":
            stack.add("go")
            try:
                text = fpath.read_text(errors="replace")
            except OSError:
                pass  # unreadable go.mod: keep the bare "go" label
            else:
                # Framework hints appear in go.mod dependency lines.
                if "gin-gonic" in text:
                    stack.add("gin")
                if "fiber" in text:
                    stack.add("fiber")
    # Fallback: extract tech hints from CLAUDE.md if no config files found.
    if not stack:
        stack.update(_detect_stack_from_claude_md(project_path))
    return sorted(stack)
def _find_package_jsons(root: Path) -> list[Path]:
"""Find package.json files (root + immediate subdirs, skip node_modules)."""
results = []
pj = root / "package.json"
if pj.exists():
results.append(pj)
for sub in root.iterdir():
if sub.is_dir() and sub.name != "node_modules" and not sub.name.startswith("."):
pj = sub / "package.json"
if pj.exists():
results.append(pj)
return results
# CLAUDE.md text → tech labels (for fallback when project files are on a remote server)
_CLAUDE_MD_TECH_HINTS = {
r"(?i)vue[\s.]?3": "vue3", r"(?i)vue[\s.]?2": "vue2",
r"(?i)\bnuxt\b": "nuxt3", r"(?i)\breact\b": "react",
r"(?i)\btypescript\b": "typescript", r"(?i)\bvite\b": "vite",
r"(?i)\btailwind": "tailwind",
r"(?i)node\.?js": "nodejs", r"(?i)\bexpress\b": "express",
r"(?i)postgresql|postgres": "postgresql",
r"(?i)\bsqlite\b": "sqlite", r"(?i)\bmysql\b": "mysql",
r"(?i)\bdocker\b": "docker",
r"(?i)\bpython\b": "python", r"(?i)\bfastapi\b": "fastapi",
r"(?i)\bdjango\b": "django", r"(?i)\bflask\b": "flask",
r"(?i)\bgo\b.*(?:gin|fiber|module)": "go",
r"(?i)\bnginx\b": "nginx",
r"(?i)\bpinia\b": "pinia", r"(?i)\bvuex\b": "vuex",
}
def _detect_stack_from_claude_md(project_path: Path) -> list[str]:
    """Fallback detection: scan CLAUDE.md prose for known tech keywords.

    Used when no config files were found locally (project lives on a
    remote server).  Only the first 5 KB of the file is inspected.
    Returns labels in _CLAUDE_MD_TECH_HINTS order; empty list when the
    file is absent or unreadable.
    """
    try:
        # FileNotFoundError is an OSError, so a missing file yields [] too.
        text = (project_path / "CLAUDE.md").read_text(errors="replace")[:5000]
    except OSError:
        return []
    return [
        tech
        for pattern, tech in _CLAUDE_MD_TECH_HINTS.items()
        if re.search(pattern, text)
    ]
def _walk_files(root: Path, max_depth: int = 3, _depth: int = 0):
    """Yield every file under *root*, descending at most *max_depth* levels.

    Skips directories named in _SKIP_DIRS and hidden (dot-prefixed)
    directories; unreadable directories are silently ignored.
    *_depth* is internal recursion state — callers should not pass it.
    """
    if _depth > max_depth:
        return
    try:
        entries = sorted(root.iterdir())  # deterministic traversal order
    except OSError:  # covers PermissionError, which is an OSError subclass
        return
    for entry in entries:
        if entry.is_file():
            yield entry
        elif entry.is_dir() and entry.name not in _SKIP_DIRS and not entry.name.startswith("."):
            yield from _walk_files(entry, max_depth, _depth + 1)
def _parse_package_json(path: Path) -> list[str]:
@ -140,26 +177,40 @@ _BACKEND_MARKERS = {"express", "fastify", "koa", "router", "controller", "middle
def detect_modules(project_path: Path) -> list[dict]:
"""Scan src/ (or app/, lib/, frontend/, backend/) for modules."""
"""Scan for modules: checks root subdirs, */src/ patterns, standard names.
Strategy:
1. Find all "source root" dirs (src/, app/, lib/ at root or inside top-level dirs)
2. Each first-level subdir of a source root = a module candidate
3. Top-level dirs with their own src/ are treated as component roots
(e.g. frontend/, backend-pg/) scan THEIR src/ for modules
"""
modules = []
scan_dirs = []
scan_dirs: list[tuple[Path, str | None]] = [] # (dir, prefix_hint)
# Prioritized source dirs
for name in ("src", "app", "lib", "frontend", "backend", "server", "client"):
# Direct source dirs in root
for name in ("src", "app", "lib"):
d = project_path / name
if d.is_dir():
scan_dirs.append(d)
scan_dirs.append((d, None))
# Also check frontend/src, backend/src patterns
for name in ("frontend/src", "backend/src", "backend-pg/src"):
d = project_path / name
if d.is_dir():
scan_dirs.append(d)
# Top-level component dirs (frontend/, backend/, backend-pg/, server/, client/)
# These get scanned for src/ inside, or directly if they contain source files
for child in sorted(project_path.iterdir()):
if not child.is_dir() or child.name in _SKIP_DIRS or child.name.startswith("."):
continue
child_src = child / "src"
if child_src.is_dir():
# e.g. frontend/src/, backend-pg/src/ — scan their subdirs
scan_dirs.append((child_src, child.name))
elif child.name in ("frontend", "backend", "server", "client", "web", "api"):
# No src/ but it's a known component dir — scan it directly
scan_dirs.append((child, child.name))
seen = set()
for scan_dir in scan_dirs:
for scan_dir, prefix in scan_dirs:
for child in sorted(scan_dir.iterdir()):
if not child.is_dir() or child.name.startswith(".") or child.name == "node_modules":
if not child.is_dir() or child.name in _SKIP_DIRS or child.name.startswith("."):
continue
mod = _analyze_module(child, project_path)
key = (mod["name"], mod["path"])
@ -230,7 +281,7 @@ def _guess_module_type(dir_path: Path, exts: set[str], files: list[Path]) -> str
_DECISION_PATTERNS = [
(r"(?i)\b(GOTCHA|ВАЖНО|WARNING|ВНИМАНИЕ)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
(r"(?i)\b(WORKAROUND|ОБХОДНОЙ|ХАК)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "workaround"),
(r"(?i)\b(FIXME|TODO|БАГИ?)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
(r"(?i)\b(FIXME|БАГИ?)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
(r"(?i)\b(РЕШЕНИЕ|DECISION)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "decision"),
(r"(?i)\b(CONVENTION|СОГЛАШЕНИЕ|ПРАВИЛО)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "convention"),
]
@ -238,13 +289,83 @@ _DECISION_PATTERNS = [
# Section headers that likely contain decisions
_DECISION_SECTIONS = [
r"(?i)known\s+issues?", r"(?i)workaround", r"(?i)gotcha",
r"(?i)решени[яе]", r"(?i)грабл[ия]", r"(?i)важно",
r"(?i)решени[яе]", r"(?i)грабл[ия]",
r"(?i)conventions?", r"(?i)правила", r"(?i)нюансы",
]
# Section headers about UNRELATED services — skip these entirely
_UNRELATED_SECTION_PATTERNS = [
r"(?i)jitsi", r"(?i)nextcloud", r"(?i)prosody",
r"(?i)coturn", r"(?i)turn\b", r"(?i)asterisk",
r"(?i)ghost\s+блог", r"(?i)onlyoffice",
r"(?i)git\s+sync", r"(?i)\.env\s+добав",
r"(?i)goip\s+watcher", r"(?i)tbank\s+monitor", # monitoring services
r"(?i)фикс\s+удален", # commit-level fixes (not decisions)
]
def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
"""Parse CLAUDE.md for decisions, gotchas, workarounds."""
# Noise patterns — individual items that look like noise, not decisions
_NOISE_PATTERNS = [
r"^[0-9a-f]{6,40}$", # commit hashes
r"^\s*(docker|ssh|scp|git|curl|sudo)\s", # shell commands
r"^`[^`]+`$", # inline code-only items
r"(?i)(prosody|jitsi|jicofo|jvb|coturn|nextcloud|onlyoffice|ghost)", # unrelated services
r"(?i)\.jitsi-meet-cfg", # jitsi config paths
r"(?i)(meet\.jitsi|sitemeet\.org)", # jitsi domains
r"(?i)(cloud\.vault\.red|office\.vault)", # nextcloud domains
r"(?i)JWT_APP_(ID|SECRET)", # jwt config lines
r"(?i)XMPP_", # prosody config
r"\(коммит\s+`?[0-9a-f]+`?\)", # "(коммит `a33c2b9`)" references
r"(?i)known_uids|idle_loop|reconnect", # goip-watcher internals
]
def _is_noise(text: str) -> bool:
    """Return True when *text* matches any known noise pattern.

    Markdown bold markers are stripped before matching so that e.g.
    "**a33c2b9**" is still recognized as a commit hash.
    """
    stripped = re.sub(r"\*\*([^*]*)\*\*", r"\1", text).strip()
    for pattern in _NOISE_PATTERNS:
        if re.search(pattern, stripped):
            return True
    return False
def _split_into_sections(text: str) -> list[tuple[str, str]]:
"""Split markdown into (header, body) pairs by ## headers.
Returns list of (header_text, body_text) tuples.
Anything before the first ## is returned with header="".
"""
parts = re.split(r"(?m)^(##\s+.+)$", text)
sections = []
current_header = ""
current_body = parts[0] if parts else ""
for i in range(1, len(parts), 2):
if current_header or current_body.strip():
sections.append((current_header, current_body))
current_header = parts[i].strip()
current_body = parts[i + 1] if i + 1 < len(parts) else ""
if current_header or current_body.strip():
sections.append((current_header, current_body))
return sections
def _is_unrelated_section(header: str) -> bool:
    """Return True if *header* names an unrelated service (Jitsi, Nextcloud, ...)."""
    for pattern in _UNRELATED_SECTION_PATTERNS:
        if re.search(pattern, header):
            return True
    return False
def extract_decisions_from_claude_md(
project_path: Path,
project_id: str | None = None,
project_name: str | None = None,
) -> list[dict]:
"""Parse CLAUDE.md for decisions, gotchas, workarounds.
Filters out:
- Sections about unrelated services (Jitsi, Nextcloud, Prosody, etc.)
- Noise: commit hashes, docker/ssh commands, paths to external services
- If CLAUDE.md has multi-project sections, only extracts for current project
"""
claude_md = project_path / "CLAUDE.md"
if not claude_md.exists():
return []
@ -254,20 +375,30 @@ def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
except OSError:
return []
# Split into sections and filter out unrelated ones
sections = _split_into_sections(text)
relevant_text = []
for header, body in sections:
if _is_unrelated_section(header):
continue
relevant_text.append(header + "\n" + body)
filtered_text = "\n".join(relevant_text)
decisions = []
seen_titles = set()
# Pattern-based extraction
# Pattern-based extraction from relevant sections only
for pattern, dec_type in _DECISION_PATTERNS:
for m in re.finditer(pattern, text, re.DOTALL):
label = m.group(1).strip()
for m in re.finditer(pattern, filtered_text, re.DOTALL):
body = m.group(2).strip()
if not body or len(body) < 10:
continue
# First line as title, rest as description
lines = body.split("\n")
title = lines[0].strip().rstrip(".")[:100]
desc = body
if _is_noise(title) or _is_noise(desc):
continue
if title not in seen_titles:
seen_titles.add(title)
decisions.append({
@ -277,26 +408,36 @@ def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
"category": _guess_category(title + " " + desc),
})
# Section-based extraction: find headers matching decision sections
sections = re.split(r"(?m)^(#{1,4}\s+.*?)$", text)
for i, section in enumerate(sections):
# Section-based extraction: find ### or #### headers matching decision patterns
sub_sections = re.split(r"(?m)^(#{1,4}\s+.*?)$", filtered_text)
for i, section in enumerate(sub_sections):
if any(re.search(pat, section) for pat in _DECISION_SECTIONS):
# The content is in the next section
if i + 1 < len(sections):
content = sections[i + 1].strip()
# Extract bullet points
if i + 1 < len(sub_sections):
content = sub_sections[i + 1].strip()
for line in content.split("\n"):
line = line.strip()
if line.startswith(("- ", "* ", "")):
# Numbered items (1. **text**) or bullet items
item = None
if re.match(r"^\d+\.\s+", line):
item = re.sub(r"^\d+\.\s+", "", line).strip()
elif line.startswith(("- ", "* ", "")):
item = line.lstrip("-*• ").strip()
if item and len(item) > 10 and item[:80] not in seen_titles:
seen_titles.add(item[:80])
decisions.append({
"type": "decision",
"title": item[:100],
"description": item,
"category": _guess_category(item),
})
if not item or len(item) < 10:
continue
# Clean bold markers for title
clean = re.sub(r"\*\*([^*]+)\*\*", r"\1", item)
if _is_noise(clean):
continue
title = clean[:100]
if title not in seen_titles:
seen_titles.add(title)
decisions.append({
"type": "gotcha",
"title": title,
"description": item,
"category": _guess_category(item),
})
return decisions
@ -414,28 +555,34 @@ def _extract_obsidian_decisions(text: str, source: str, decisions: list[dict]):
for pattern, dec_type in _DECISION_PATTERNS:
for m in re.finditer(pattern, text, re.DOTALL):
body = m.group(2).strip()
if body and len(body) > 10:
title = body.split("\n")[0].strip()[:100]
decisions.append({
"type": dec_type,
"title": title,
"description": body,
"category": _guess_category(body),
"source": source,
})
# Also look for ВАЖНО/GOTCHA/FIXME inline markers not caught above
for m in re.finditer(r"(?i)\*\*(ВАЖНО|GOTCHA|FIXME)\*\*[:\s]*(.*?)(?=\n|$)", text):
body = m.group(2).strip()
if body and len(body) > 10:
if not body or len(body) < 10:
continue
title = body.split("\n")[0].strip()[:100]
if _is_noise(title) or _is_noise(body):
continue
decisions.append({
"type": "gotcha",
"title": body[:100],
"type": dec_type,
"title": title,
"description": body,
"category": _guess_category(body),
"source": source,
})
# Also look for ВАЖНО/GOTCHA/FIXME inline markers not caught above
for m in re.finditer(r"(?i)\*\*(ВАЖНО|GOTCHA|FIXME)\*\*[:\s]*(.*?)(?=\n|$)", text):
body = m.group(2).strip()
if not body or len(body) < 10:
continue
if _is_noise(body):
continue
decisions.append({
"type": "gotcha",
"title": body[:100],
"description": body,
"category": _guess_category(body),
"source": source,
})
# ---------------------------------------------------------------------------
# Formatting for CLI preview