Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

analyze_sources.py 9.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
# /// script
# requires-python = ">=3.10"
# dependencies = []
# ///
"""Analyze source documents for the distillation generator.

Enumerates files from paths/folders/globs, computes sizes and token estimates,
detects document types from naming conventions, and suggests groupings for
related documents (e.g., a brief paired with its discovery notes).

Accepts: file paths, folder paths (scanned recursively for
.md/.txt/.yaml/.yml/.json), or glob patterns. Folder scans skip node_modules,
.git, __pycache__, .venv, venv, .claude, _bmad-output, .cursor and .vscode.

Output JSON structure:
  status: "ok" | "error"
  files[]: path, filename, size_bytes, estimated_tokens, doc_type
  summary: total_files, total_size_bytes, total_estimated_tokens
  groups[]: group_key, files[] with role (primary/companion/standalone)
    - Groups related docs by naming convention (e.g., brief + discovery-notes)
  routing: recommendation ("single" | "fan-out"), reason
    - single: ≤3 files AND ≤15K estimated tokens
    - fan-out: >3 files OR >15K estimated tokens
  split_prediction: prediction ("likely" | "unlikely"), reason, estimated_distillate_tokens
    - Estimates distillate at ~1/3 source size; splits if >5K tokens
"""
  23. from __future__ import annotations
  24. import argparse
  25. import glob
  26. import json
  27. import os
  28. import re
  29. import sys
  30. from pathlib import Path
  31. # Extensions to include when scanning folders
  32. INCLUDE_EXTENSIONS = {".md", ".txt", ".yaml", ".yml", ".json"}
  33. # Directories to skip when scanning folders
  34. SKIP_DIRS = {
  35. "node_modules", ".git", "__pycache__", ".venv", "venv",
  36. ".claude", "_bmad-output", ".cursor", ".vscode",
  37. }
  38. # Approximate chars per token for estimation
  39. CHARS_PER_TOKEN = 4
  40. # Thresholds
  41. SINGLE_COMPRESSOR_MAX_TOKENS = 15_000
  42. SINGLE_DISTILLATE_MAX_TOKENS = 5_000
  43. # Naming patterns for document type detection
  44. DOC_TYPE_PATTERNS = [
  45. (r"discovery[_-]notes", "discovery-notes"),
  46. (r"product[_-]brief", "product-brief"),
  47. (r"research[_-]report", "research-report"),
  48. (r"architecture", "architecture-doc"),
  49. (r"prd", "prd"),
  50. (r"distillate", "distillate"),
  51. (r"changelog", "changelog"),
  52. (r"readme", "readme"),
  53. (r"spec", "specification"),
  54. (r"requirements", "requirements"),
  55. (r"design[_-]doc", "design-doc"),
  56. (r"meeting[_-]notes", "meeting-notes"),
  57. (r"brainstorm", "brainstorming"),
  58. (r"interview", "interview-notes"),
  59. ]
  60. # Patterns for grouping related documents
  61. GROUP_PATTERNS = [
  62. # base document + discovery notes
  63. (r"^(.+?)(?:-discovery-notes|-discovery_notes)\.(\w+)$", r"\1.\2"),
  64. # base document + appendix
  65. (r"^(.+?)(?:-appendix|-addendum)(?:-\w+)?\.(\w+)$", r"\1.\2"),
  66. # base document + review/feedback
  67. (r"^(.+?)(?:-review|-feedback)\.(\w+)$", r"\1.\2"),
  68. ]
  69. def resolve_inputs(inputs: list[str]) -> list[Path]:
  70. """Resolve input arguments to a flat list of file paths."""
  71. files: list[Path] = []
  72. for inp in inputs:
  73. path = Path(inp)
  74. if path.is_file():
  75. files.append(path.resolve())
  76. elif path.is_dir():
  77. for root, dirs, filenames in os.walk(path):
  78. dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
  79. for fn in sorted(filenames):
  80. fp = Path(root) / fn
  81. if fp.suffix.lower() in INCLUDE_EXTENSIONS:
  82. files.append(fp.resolve())
  83. else:
  84. # Try as glob
  85. matches = glob.glob(inp, recursive=True)
  86. for m in sorted(matches):
  87. mp = Path(m)
  88. if mp.is_file() and mp.suffix.lower() in INCLUDE_EXTENSIONS:
  89. files.append(mp.resolve())
  90. # Deduplicate while preserving order
  91. seen: set[Path] = set()
  92. deduped: list[Path] = []
  93. for f in files:
  94. if f not in seen:
  95. seen.add(f)
  96. deduped.append(f)
  97. return deduped
  98. def detect_doc_type(filename: str) -> str:
  99. """Detect document type from filename."""
  100. name_lower = filename.lower()
  101. for pattern, doc_type in DOC_TYPE_PATTERNS:
  102. if re.search(pattern, name_lower):
  103. return doc_type
  104. return "unknown"
  105. def suggest_groups(files: list[Path]) -> list[dict]:
  106. """Suggest document groupings based on naming conventions."""
  107. groups: dict[str, list[dict]] = {}
  108. ungrouped: list[dict] = []
  109. file_map = {f.name: f for f in files}
  110. assigned: set[str] = set()
  111. for f in files:
  112. if f.name in assigned:
  113. continue
  114. matched = False
  115. for pattern, base_pattern in GROUP_PATTERNS:
  116. m = re.match(pattern, f.name, re.IGNORECASE)
  117. if m:
  118. # This file is a companion — find its base
  119. base_name = re.sub(pattern, base_pattern, f.name, flags=re.IGNORECASE)
  120. group_key = base_name
  121. if group_key not in groups:
  122. groups[group_key] = []
  123. # Add the base file if it exists
  124. if base_name in file_map and base_name not in assigned:
  125. groups[group_key].append({
  126. "path": str(file_map[base_name]),
  127. "filename": base_name,
  128. "role": "primary",
  129. })
  130. assigned.add(base_name)
  131. groups[group_key].append({
  132. "path": str(f),
  133. "filename": f.name,
  134. "role": "companion",
  135. })
  136. assigned.add(f.name)
  137. matched = True
  138. break
  139. if not matched:
  140. # Check if this file is a base that already has companions
  141. if f.name in groups:
  142. continue # Already added as primary
  143. ungrouped.append({
  144. "path": str(f),
  145. "filename": f.name,
  146. })
  147. result = []
  148. for group_key, members in groups.items():
  149. result.append({
  150. "group_key": group_key,
  151. "files": members,
  152. })
  153. for ug in ungrouped:
  154. if ug["filename"] not in assigned:
  155. result.append({
  156. "group_key": ug["filename"],
  157. "files": [{"path": ug["path"], "filename": ug["filename"], "role": "standalone"}],
  158. })
  159. return result
  160. def analyze(inputs: list[str], output_path: str | None = None) -> None:
  161. """Main analysis function."""
  162. files = resolve_inputs(inputs)
  163. if not files:
  164. result = {
  165. "status": "error",
  166. "error": "No readable files found from provided inputs",
  167. "inputs": inputs,
  168. }
  169. output_json(result, output_path)
  170. return
  171. # Analyze each file
  172. file_details = []
  173. total_chars = 0
  174. for f in files:
  175. size = f.stat().st_size
  176. total_chars += size
  177. file_details.append({
  178. "path": str(f),
  179. "filename": f.name,
  180. "size_bytes": size,
  181. "estimated_tokens": size // CHARS_PER_TOKEN,
  182. "doc_type": detect_doc_type(f.name),
  183. })
  184. total_tokens = total_chars // CHARS_PER_TOKEN
  185. groups = suggest_groups(files)
  186. # Routing recommendation
  187. if len(files) <= 3 and total_tokens <= SINGLE_COMPRESSOR_MAX_TOKENS:
  188. routing = "single"
  189. routing_reason = (
  190. f"{len(files)} file(s), ~{total_tokens:,} estimated tokens — "
  191. f"within single compressor threshold"
  192. )
  193. else:
  194. routing = "fan-out"
  195. routing_reason = (
  196. f"{len(files)} file(s), ~{total_tokens:,} estimated tokens — "
  197. f"exceeds single compressor threshold "
  198. f"({'>' + str(SINGLE_COMPRESSOR_MAX_TOKENS) + ' tokens' if total_tokens > SINGLE_COMPRESSOR_MAX_TOKENS else '> 3 files'})"
  199. )
  200. # Split prediction
  201. estimated_distillate_tokens = total_tokens // 3 # rough: distillate is ~1/3 of source
  202. if estimated_distillate_tokens > SINGLE_DISTILLATE_MAX_TOKENS:
  203. split_prediction = "likely"
  204. split_reason = (
  205. f"Estimated distillate ~{estimated_distillate_tokens:,} tokens "
  206. f"exceeds {SINGLE_DISTILLATE_MAX_TOKENS:,} threshold"
  207. )
  208. else:
  209. split_prediction = "unlikely"
  210. split_reason = (
  211. f"Estimated distillate ~{estimated_distillate_tokens:,} tokens "
  212. f"within {SINGLE_DISTILLATE_MAX_TOKENS:,} threshold"
  213. )
  214. result = {
  215. "status": "ok",
  216. "files": file_details,
  217. "summary": {
  218. "total_files": len(files),
  219. "total_size_bytes": total_chars,
  220. "total_estimated_tokens": total_tokens,
  221. },
  222. "groups": groups,
  223. "routing": {
  224. "recommendation": routing,
  225. "reason": routing_reason,
  226. },
  227. "split_prediction": {
  228. "prediction": split_prediction,
  229. "reason": split_reason,
  230. "estimated_distillate_tokens": estimated_distillate_tokens,
  231. },
  232. }
  233. output_json(result, output_path)
  234. def output_json(data: dict, output_path: str | None) -> None:
  235. """Write JSON to file or stdout."""
  236. json_str = json.dumps(data, indent=2)
  237. if output_path:
  238. Path(output_path).parent.mkdir(parents=True, exist_ok=True)
  239. Path(output_path).write_text(json_str + "\n")
  240. print(f"Results written to {output_path}", file=sys.stderr)
  241. else:
  242. print(json_str)
  243. def main() -> None:
  244. parser = argparse.ArgumentParser(
  245. description=__doc__,
  246. formatter_class=argparse.RawDescriptionHelpFormatter,
  247. )
  248. parser.add_argument(
  249. "inputs",
  250. nargs="+",
  251. help="File paths, folder paths, or glob patterns to analyze",
  252. )
  253. parser.add_argument(
  254. "-o", "--output",
  255. help="Output JSON to file instead of stdout",
  256. )
  257. args = parser.parse_args()
  258. analyze(args.inputs, args.output)
  259. sys.exit(0)
  260. if __name__ == "__main__":
  261. main()