Fix bootstrap: deep scan, CLAUDE.md fallback, noise filtering
1. Tech stack: recursive file search (depth 3) + CLAUDE.md text fallback when config files are on remote server (detects nodejs, postgresql, etc.) 2. Modules: scan */src/ patterns in top-level dirs (frontend/src/, backend-pg/src/) 3. Decisions: filter out unrelated sections (Jitsi, Nextcloud, Prosody, GOIP), filter noise (commit hashes, shell commands, external service paths). Noise filtering also applied to Obsidian decisions. Tested on vdolipoperek: 4 tech, 5 modules, 9 clean decisions, 24 Obsidian tasks. 61 tests, all passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
da4a8aae72
commit
e5444114bd
3 changed files with 344 additions and 94 deletions
|
|
@ -42,54 +42,91 @@ _FILE_MARKERS = {
|
|||
}
|
||||
|
||||
|
||||
# Directory names never descended into during project scans (checked by
# _walk_files and the module-scanning loops): package caches, VCS metadata,
# build output, and virtualenvs.
_SKIP_DIRS = {"node_modules", ".git", "dist", ".next", ".nuxt", "__pycache__", ".venv", "venv"}
|
||||
|
||||
|
||||
def detect_tech_stack(project_path: Path) -> list[str]:
|
||||
"""Detect tech stack from project files."""
|
||||
"""Detect tech stack from project files.
|
||||
|
||||
Searches recursively up to depth 3, skipping node_modules/.git/dist.
|
||||
Falls back to CLAUDE.md heuristics if no files found.
|
||||
"""
|
||||
stack: set[str] = set()
|
||||
|
||||
# Config file markers
|
||||
for fname, tech in _FILE_MARKERS.items():
|
||||
# Check root and one level deep
|
||||
if (project_path / fname).exists():
|
||||
stack.add(tech)
|
||||
for sub in ("frontend", "backend", "server", "client", "app"):
|
||||
if (project_path / sub / fname).exists():
|
||||
stack.add(tech)
|
||||
# Recursive search for config files and package.json (depth ≤ 3)
|
||||
for fpath in _walk_files(project_path, max_depth=3):
|
||||
fname = fpath.name
|
||||
if fname in _FILE_MARKERS:
|
||||
stack.add(_FILE_MARKERS[fname])
|
||||
if fname == "package.json":
|
||||
stack.update(_parse_package_json(fpath))
|
||||
if fname == "requirements.txt":
|
||||
stack.update(_parse_requirements_txt(fpath))
|
||||
if fname == "go.mod":
|
||||
stack.add("go")
|
||||
try:
|
||||
text = fpath.read_text(errors="replace")
|
||||
if "gin-gonic" in text:
|
||||
stack.add("gin")
|
||||
if "fiber" in text:
|
||||
stack.add("fiber")
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# package.json (root + subdirs)
|
||||
for pj_path in _find_package_jsons(project_path):
|
||||
stack.update(_parse_package_json(pj_path))
|
||||
|
||||
# requirements.txt
|
||||
for req_path in project_path.glob("**/requirements.txt"):
|
||||
if _is_inside_node_modules(req_path, project_path):
|
||||
continue
|
||||
stack.update(_parse_requirements_txt(req_path))
|
||||
|
||||
# go.mod
|
||||
go_mod = project_path / "go.mod"
|
||||
if go_mod.exists():
|
||||
stack.add("go")
|
||||
text = go_mod.read_text(errors="replace")
|
||||
if "gin-gonic" in text:
|
||||
stack.add("gin")
|
||||
if "fiber" in text:
|
||||
stack.add("fiber")
|
||||
# Fallback: extract tech hints from CLAUDE.md if no config files found
|
||||
if not stack:
|
||||
stack.update(_detect_stack_from_claude_md(project_path))
|
||||
|
||||
return sorted(stack)
|
||||
|
||||
|
||||
def _find_package_jsons(root: Path) -> list[Path]:
|
||||
"""Find package.json files (root + immediate subdirs, skip node_modules)."""
|
||||
results = []
|
||||
pj = root / "package.json"
|
||||
if pj.exists():
|
||||
results.append(pj)
|
||||
for sub in root.iterdir():
|
||||
if sub.is_dir() and sub.name != "node_modules" and not sub.name.startswith("."):
|
||||
pj = sub / "package.json"
|
||||
if pj.exists():
|
||||
results.append(pj)
|
||||
return results
|
||||
# CLAUDE.md text → tech labels (for fallback when project files are on a remote server)
|
||||
_CLAUDE_MD_TECH_HINTS = {
|
||||
r"(?i)vue[\s.]?3": "vue3", r"(?i)vue[\s.]?2": "vue2",
|
||||
r"(?i)\bnuxt\b": "nuxt3", r"(?i)\breact\b": "react",
|
||||
r"(?i)\btypescript\b": "typescript", r"(?i)\bvite\b": "vite",
|
||||
r"(?i)\btailwind": "tailwind",
|
||||
r"(?i)node\.?js": "nodejs", r"(?i)\bexpress\b": "express",
|
||||
r"(?i)postgresql|postgres": "postgresql",
|
||||
r"(?i)\bsqlite\b": "sqlite", r"(?i)\bmysql\b": "mysql",
|
||||
r"(?i)\bdocker\b": "docker",
|
||||
r"(?i)\bpython\b": "python", r"(?i)\bfastapi\b": "fastapi",
|
||||
r"(?i)\bdjango\b": "django", r"(?i)\bflask\b": "flask",
|
||||
r"(?i)\bgo\b.*(?:gin|fiber|module)": "go",
|
||||
r"(?i)\bnginx\b": "nginx",
|
||||
r"(?i)\bpinia\b": "pinia", r"(?i)\bvuex\b": "vuex",
|
||||
}
|
||||
|
||||
|
||||
def _detect_stack_from_claude_md(project_path: Path) -> list[str]:
|
||||
"""Fallback: infer tech stack from CLAUDE.md text when no config files exist."""
|
||||
claude_md = project_path / "CLAUDE.md"
|
||||
if not claude_md.exists():
|
||||
return []
|
||||
try:
|
||||
text = claude_md.read_text(errors="replace")[:5000] # First 5KB is enough
|
||||
except OSError:
|
||||
return []
|
||||
stack = []
|
||||
for pattern, tech in _CLAUDE_MD_TECH_HINTS.items():
|
||||
if re.search(pattern, text):
|
||||
stack.append(tech)
|
||||
return stack
|
||||
|
||||
|
||||
def _walk_files(root: Path, max_depth: int = 3, _depth: int = 0):
    """Yield files under *root*, descending at most *max_depth* levels.

    Skips directories named in _SKIP_DIRS (node_modules, dist, .git, …) and
    hidden (dot-prefixed) directories. Directories that cannot be listed
    (missing, unreadable) are silently skipped. Entries are yielded in
    sorted order for deterministic results.
    """
    if _depth > max_depth:
        return
    try:
        entries = sorted(root.iterdir())
    except OSError:
        # Covers PermissionError too (it is a subclass of OSError);
        # listing the old (OSError, PermissionError) pair was redundant.
        return
    for entry in entries:
        if entry.is_file():
            yield entry
        elif entry.is_dir() and entry.name not in _SKIP_DIRS and not entry.name.startswith("."):
            yield from _walk_files(entry, max_depth, _depth + 1)
|
||||
|
||||
|
||||
def _parse_package_json(path: Path) -> list[str]:
|
||||
|
|
@ -140,26 +177,40 @@ _BACKEND_MARKERS = {"express", "fastify", "koa", "router", "controller", "middle
|
|||
|
||||
|
||||
def detect_modules(project_path: Path) -> list[dict]:
|
||||
"""Scan src/ (or app/, lib/, frontend/, backend/) for modules."""
|
||||
"""Scan for modules: checks root subdirs, */src/ patterns, standard names.
|
||||
|
||||
Strategy:
|
||||
1. Find all "source root" dirs (src/, app/, lib/ at root or inside top-level dirs)
|
||||
2. Each first-level subdir of a source root = a module candidate
|
||||
3. Top-level dirs with their own src/ are treated as component roots
|
||||
(e.g. frontend/, backend-pg/) — scan THEIR src/ for modules
|
||||
"""
|
||||
modules = []
|
||||
scan_dirs = []
|
||||
scan_dirs: list[tuple[Path, str | None]] = [] # (dir, prefix_hint)
|
||||
|
||||
# Prioritized source dirs
|
||||
for name in ("src", "app", "lib", "frontend", "backend", "server", "client"):
|
||||
# Direct source dirs in root
|
||||
for name in ("src", "app", "lib"):
|
||||
d = project_path / name
|
||||
if d.is_dir():
|
||||
scan_dirs.append(d)
|
||||
scan_dirs.append((d, None))
|
||||
|
||||
# Also check frontend/src, backend/src patterns
|
||||
for name in ("frontend/src", "backend/src", "backend-pg/src"):
|
||||
d = project_path / name
|
||||
if d.is_dir():
|
||||
scan_dirs.append(d)
|
||||
# Top-level component dirs (frontend/, backend/, backend-pg/, server/, client/)
|
||||
# These get scanned for src/ inside, or directly if they contain source files
|
||||
for child in sorted(project_path.iterdir()):
|
||||
if not child.is_dir() or child.name in _SKIP_DIRS or child.name.startswith("."):
|
||||
continue
|
||||
child_src = child / "src"
|
||||
if child_src.is_dir():
|
||||
# e.g. frontend/src/, backend-pg/src/ — scan their subdirs
|
||||
scan_dirs.append((child_src, child.name))
|
||||
elif child.name in ("frontend", "backend", "server", "client", "web", "api"):
|
||||
# No src/ but it's a known component dir — scan it directly
|
||||
scan_dirs.append((child, child.name))
|
||||
|
||||
seen = set()
|
||||
for scan_dir in scan_dirs:
|
||||
for scan_dir, prefix in scan_dirs:
|
||||
for child in sorted(scan_dir.iterdir()):
|
||||
if not child.is_dir() or child.name.startswith(".") or child.name == "node_modules":
|
||||
if not child.is_dir() or child.name in _SKIP_DIRS or child.name.startswith("."):
|
||||
continue
|
||||
mod = _analyze_module(child, project_path)
|
||||
key = (mod["name"], mod["path"])
|
||||
|
|
@ -230,7 +281,7 @@ def _guess_module_type(dir_path: Path, exts: set[str], files: list[Path]) -> str
|
|||
_DECISION_PATTERNS = [
|
||||
(r"(?i)\b(GOTCHA|ВАЖНО|WARNING|ВНИМАНИЕ)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
|
||||
(r"(?i)\b(WORKAROUND|ОБХОДНОЙ|ХАК)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "workaround"),
|
||||
(r"(?i)\b(FIXME|TODO|БАГИ?)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
|
||||
(r"(?i)\b(FIXME|БАГИ?)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "gotcha"),
|
||||
(r"(?i)\b(РЕШЕНИЕ|DECISION)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "decision"),
|
||||
(r"(?i)\b(CONVENTION|СОГЛАШЕНИЕ|ПРАВИЛО)[:\s]+(.*?)(?=\n[#\-]|\n\n|\Z)", "convention"),
|
||||
]
|
||||
|
|
@ -238,13 +289,83 @@ _DECISION_PATTERNS = [
|
|||
# Section headers that likely contain decisions
|
||||
_DECISION_SECTIONS = [
|
||||
r"(?i)known\s+issues?", r"(?i)workaround", r"(?i)gotcha",
|
||||
r"(?i)решени[яе]", r"(?i)грабл[ия]", r"(?i)важно",
|
||||
r"(?i)решени[яе]", r"(?i)грабл[ия]",
|
||||
r"(?i)conventions?", r"(?i)правила", r"(?i)нюансы",
|
||||
]
|
||||
|
||||
# Section headers about UNRELATED services — skip these entirely.
# Matched with re.search against "## ..." headers (see _is_unrelated_section);
# sections whose header matches are dropped before decision extraction.
_UNRELATED_SECTION_PATTERNS = [
    # External/co-hosted services whose notes are not project decisions
    r"(?i)jitsi", r"(?i)nextcloud", r"(?i)prosody",
    r"(?i)coturn", r"(?i)turn\b", r"(?i)asterisk",
    r"(?i)ghost\s+блог", r"(?i)onlyoffice",
    # Maintenance chatter rather than architecture
    r"(?i)git\s+sync", r"(?i)\.env\s+добав",
    r"(?i)goip\s+watcher", r"(?i)tbank\s+monitor",  # monitoring services
    r"(?i)фикс\s+удален",  # commit-level fixes (not decisions)
]
|
||||
|
||||
def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
|
||||
"""Parse CLAUDE.md for decisions, gotchas, workarounds."""
|
||||
# Noise patterns — individual items that look like noise, not decisions.
# Applied by _is_noise via re.search after stripping **bold** markers, so a
# match anywhere in the candidate text rejects it.
_NOISE_PATTERNS = [
    r"^[0-9a-f]{6,40}$",  # commit hashes
    r"^\s*(docker|ssh|scp|git|curl|sudo)\s",  # shell commands
    r"^`[^`]+`$",  # inline code-only items
    r"(?i)(prosody|jitsi|jicofo|jvb|coturn|nextcloud|onlyoffice|ghost)",  # unrelated services
    r"(?i)\.jitsi-meet-cfg",  # jitsi config paths
    r"(?i)(meet\.jitsi|sitemeet\.org)",  # jitsi domains
    r"(?i)(cloud\.vault\.red|office\.vault)",  # nextcloud domains
    r"(?i)JWT_APP_(ID|SECRET)",  # jwt config lines
    r"(?i)XMPP_",  # prosody config
    r"\(коммит\s+`?[0-9a-f]+`?\)",  # "(коммит `a33c2b9`)" references
    r"(?i)known_uids|idle_loop|reconnect",  # goip-watcher internals
]
|
||||
|
||||
|
||||
def _is_noise(text: str) -> bool:
    """Check if a decision candidate is noise."""
    # Strip markdown **bold** markers so patterns see the bare text.
    stripped = re.sub(r"\*\*([^*]*)\*\*", r"\1", text).strip()
    for pattern in _NOISE_PATTERNS:
        if re.search(pattern, stripped):
            return True
    return False
|
||||
|
||||
|
||||
def _split_into_sections(text: str) -> list[tuple[str, str]]:
|
||||
"""Split markdown into (header, body) pairs by ## headers.
|
||||
|
||||
Returns list of (header_text, body_text) tuples.
|
||||
Anything before the first ## is returned with header="".
|
||||
"""
|
||||
parts = re.split(r"(?m)^(##\s+.+)$", text)
|
||||
sections = []
|
||||
current_header = ""
|
||||
current_body = parts[0] if parts else ""
|
||||
|
||||
for i in range(1, len(parts), 2):
|
||||
if current_header or current_body.strip():
|
||||
sections.append((current_header, current_body))
|
||||
current_header = parts[i].strip()
|
||||
current_body = parts[i + 1] if i + 1 < len(parts) else ""
|
||||
|
||||
if current_header or current_body.strip():
|
||||
sections.append((current_header, current_body))
|
||||
|
||||
return sections
|
||||
|
||||
|
||||
def _is_unrelated_section(header: str) -> bool:
    """Check if a section header is about an unrelated service."""
    for pat in _UNRELATED_SECTION_PATTERNS:
        if re.search(pat, header):
            return True
    return False
|
||||
|
||||
|
||||
def extract_decisions_from_claude_md(
|
||||
project_path: Path,
|
||||
project_id: str | None = None,
|
||||
project_name: str | None = None,
|
||||
) -> list[dict]:
|
||||
"""Parse CLAUDE.md for decisions, gotchas, workarounds.
|
||||
|
||||
Filters out:
|
||||
- Sections about unrelated services (Jitsi, Nextcloud, Prosody, etc.)
|
||||
- Noise: commit hashes, docker/ssh commands, paths to external services
|
||||
- If CLAUDE.md has multi-project sections, only extracts for current project
|
||||
"""
|
||||
claude_md = project_path / "CLAUDE.md"
|
||||
if not claude_md.exists():
|
||||
return []
|
||||
|
|
@ -254,20 +375,30 @@ def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
|
|||
except OSError:
|
||||
return []
|
||||
|
||||
# Split into sections and filter out unrelated ones
|
||||
sections = _split_into_sections(text)
|
||||
relevant_text = []
|
||||
for header, body in sections:
|
||||
if _is_unrelated_section(header):
|
||||
continue
|
||||
relevant_text.append(header + "\n" + body)
|
||||
|
||||
filtered_text = "\n".join(relevant_text)
|
||||
|
||||
decisions = []
|
||||
seen_titles = set()
|
||||
|
||||
# Pattern-based extraction
|
||||
# Pattern-based extraction from relevant sections only
|
||||
for pattern, dec_type in _DECISION_PATTERNS:
|
||||
for m in re.finditer(pattern, text, re.DOTALL):
|
||||
label = m.group(1).strip()
|
||||
for m in re.finditer(pattern, filtered_text, re.DOTALL):
|
||||
body = m.group(2).strip()
|
||||
if not body or len(body) < 10:
|
||||
continue
|
||||
# First line as title, rest as description
|
||||
lines = body.split("\n")
|
||||
title = lines[0].strip().rstrip(".")[:100]
|
||||
desc = body
|
||||
if _is_noise(title) or _is_noise(desc):
|
||||
continue
|
||||
if title not in seen_titles:
|
||||
seen_titles.add(title)
|
||||
decisions.append({
|
||||
|
|
@ -277,26 +408,36 @@ def extract_decisions_from_claude_md(project_path: Path) -> list[dict]:
|
|||
"category": _guess_category(title + " " + desc),
|
||||
})
|
||||
|
||||
# Section-based extraction: find headers matching decision sections
|
||||
sections = re.split(r"(?m)^(#{1,4}\s+.*?)$", text)
|
||||
for i, section in enumerate(sections):
|
||||
# Section-based extraction: find ### or #### headers matching decision patterns
|
||||
sub_sections = re.split(r"(?m)^(#{1,4}\s+.*?)$", filtered_text)
|
||||
for i, section in enumerate(sub_sections):
|
||||
if any(re.search(pat, section) for pat in _DECISION_SECTIONS):
|
||||
# The content is in the next section
|
||||
if i + 1 < len(sections):
|
||||
content = sections[i + 1].strip()
|
||||
# Extract bullet points
|
||||
if i + 1 < len(sub_sections):
|
||||
content = sub_sections[i + 1].strip()
|
||||
for line in content.split("\n"):
|
||||
line = line.strip()
|
||||
if line.startswith(("- ", "* ", "• ")):
|
||||
# Numbered items (1. **text**) or bullet items
|
||||
item = None
|
||||
if re.match(r"^\d+\.\s+", line):
|
||||
item = re.sub(r"^\d+\.\s+", "", line).strip()
|
||||
elif line.startswith(("- ", "* ", "• ")):
|
||||
item = line.lstrip("-*• ").strip()
|
||||
if item and len(item) > 10 and item[:80] not in seen_titles:
|
||||
seen_titles.add(item[:80])
|
||||
decisions.append({
|
||||
"type": "decision",
|
||||
"title": item[:100],
|
||||
"description": item,
|
||||
"category": _guess_category(item),
|
||||
})
|
||||
|
||||
if not item or len(item) < 10:
|
||||
continue
|
||||
# Clean bold markers for title
|
||||
clean = re.sub(r"\*\*([^*]+)\*\*", r"\1", item)
|
||||
if _is_noise(clean):
|
||||
continue
|
||||
title = clean[:100]
|
||||
if title not in seen_titles:
|
||||
seen_titles.add(title)
|
||||
decisions.append({
|
||||
"type": "gotcha",
|
||||
"title": title,
|
||||
"description": item,
|
||||
"category": _guess_category(item),
|
||||
})
|
||||
|
||||
return decisions
|
||||
|
||||
|
|
@ -414,28 +555,34 @@ def _extract_obsidian_decisions(text: str, source: str, decisions: list[dict]):
|
|||
for pattern, dec_type in _DECISION_PATTERNS:
|
||||
for m in re.finditer(pattern, text, re.DOTALL):
|
||||
body = m.group(2).strip()
|
||||
if body and len(body) > 10:
|
||||
title = body.split("\n")[0].strip()[:100]
|
||||
decisions.append({
|
||||
"type": dec_type,
|
||||
"title": title,
|
||||
"description": body,
|
||||
"category": _guess_category(body),
|
||||
"source": source,
|
||||
})
|
||||
|
||||
# Also look for ВАЖНО/GOTCHA/FIXME inline markers not caught above
|
||||
for m in re.finditer(r"(?i)\*\*(ВАЖНО|GOTCHA|FIXME)\*\*[:\s]*(.*?)(?=\n|$)", text):
|
||||
body = m.group(2).strip()
|
||||
if body and len(body) > 10:
|
||||
if not body or len(body) < 10:
|
||||
continue
|
||||
title = body.split("\n")[0].strip()[:100]
|
||||
if _is_noise(title) or _is_noise(body):
|
||||
continue
|
||||
decisions.append({
|
||||
"type": "gotcha",
|
||||
"title": body[:100],
|
||||
"type": dec_type,
|
||||
"title": title,
|
||||
"description": body,
|
||||
"category": _guess_category(body),
|
||||
"source": source,
|
||||
})
|
||||
|
||||
# Also look for ВАЖНО/GOTCHA/FIXME inline markers not caught above
|
||||
for m in re.finditer(r"(?i)\*\*(ВАЖНО|GOTCHA|FIXME)\*\*[:\s]*(.*?)(?=\n|$)", text):
|
||||
body = m.group(2).strip()
|
||||
if not body or len(body) < 10:
|
||||
continue
|
||||
if _is_noise(body):
|
||||
continue
|
||||
decisions.append({
|
||||
"type": "gotcha",
|
||||
"title": body[:100],
|
||||
"description": body,
|
||||
"category": _guess_category(body),
|
||||
"source": source,
|
||||
})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Formatting for CLI preview
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue