#!/usr/bin/env python3 """ Generate a tour skeleton from repo documentation (README, CONTRIBUTING, docs/). Reads README.md (and optionally CONTRIBUTING.md, docs/) to extract: - File and directory references - Architecture / structure sections - Setup instructions (becomes an orientation step) - External links (becomes uri steps) Outputs a skeleton .tour JSON that the code-tour skill fills in with descriptions. The skill reads this skeleton and enriches it — it does NOT replace the skill's judgment. Usage: python generate_from_docs.py [--repo-root ] [--persona ] [--output ] Examples: python generate_from_docs.py python generate_from_docs.py --persona new-joiner --output .tours/from-readme.tour python generate_from_docs.py --repo-root /path/to/repo --persona vibecoder """ import json import re import sys import os from pathlib import Path from typing import Optional # ── Markdown extraction helpers ────────────────────────────────────────────── # Matches inline code that looks like a file/directory path _CODE_PATH = re.compile(r"`([^`]{2,80})`") # Matches headings _HEADING = re.compile(r"^(#{1,3})\s+(.+)$", re.MULTILINE) # Matches markdown links: [text](url) _LINK = re.compile(r"\[([^\]]+)\]$(https?://[^)]+)$") # Patterns that suggest a path (contains / or . with extension) _LOOKS_LIKE_PATH = re.compile(r"^\.?[\w\-]+(/[\w\-\.]+)+$|^\./|^[\w]+\.[a-z]{1,5}$") # Architecture / structure section keywords _STRUCT_KEYWORDS = re.compile( r"\b(structure|architecture|layout|overview|directory|folder|module|component|" r"design|system|organization|getting.started|quick.start|setup|installation)\b", re.IGNORECASE, ) def _extract_paths_from_text(text: str, repo_root: Path) -> list[str]: """Extract inline code that looks like real file/directory paths.""" candidates = _CODE_PATH.findall(text) found = [] for c in candidates: c = c.strip().lstrip("./") if not c: continue if not _LOOKS_LIKE_PATH.match(c) and "/" not in c and "." not in c: continue # check if path actually exists full = repo_root / c if full.exists(): found.append(c) return found def _extract_external_links(text: str) -> list[tuple[str, str]]: """Extract [label](url) pairs for URI steps.""" links = _LINK.findall(text) # filter out image links and very generic anchors return [ (label, url) for label, url in links if not url.endswith((".png", ".jpg", ".gif", ".svg")) and label.lower() not in ("here", "this", "link", "click", "see") ] def _split_into_sections(text: str) -> list[tuple[str, str]]: """Split markdown into (heading, body) pairs.""" headings = list(_HEADING.finditer(text)) sections = [] for i, m in enumerate(headings): heading = m.group(2).strip() start = m.end() end = headings[i + 1].start() if i + 1 < len(headings) else len(text) body = text[start:end].strip() sections.append((heading, body)) return sections def _is_structure_section(heading: str) -> bool: return bool(_STRUCT_KEYWORDS.search(heading)) # ── Step builders ───────────────────────────────────────────────────────────── def _make_content_step(title: str, hint: str) -> dict: return { "title": title, "description": f"[TODO: {hint}]", } def _make_file_step(path: str, hint: str = "") -> dict: step = { "file": path, "title": f"[TODO: title for {path}]", "description": f"[TODO: {hint or 'explain this file for the persona'}]", } return step def _make_dir_step(path: str, hint: str = "") -> dict: return { "directory": path, "title": f"[TODO: title for {path}/]", "description": f"[TODO: {hint or 'explain what lives here'}]", } def _make_uri_step(url: str, label: str) -> dict: return { "uri": url, "title": label, "description": "[TODO: explain why this link is relevant and what the reader should notice]", } # ── Core generator ──────────────────────────────────────────────────────────── def generate_skeleton(repo_root: str = ".", persona: str = "new-joiner") -> dict: repo = Path(repo_root).resolve() # ── Read documentation files ───────────────────────────────────────── doc_files = ["README.md", "readme.md", "Readme.md"] extra_docs = ["CONTRIBUTING.md", "ARCHITECTURE.md", "docs/architecture.md", "docs/README.md"] readme_text = "" for name in doc_files: p = repo / name if p.exists(): readme_text = p.read_text(errors="replace") break extra_texts = [] for name in extra_docs: p = repo / name if p.exists(): extra_texts.append((name, p.read_text(errors="replace"))) all_text = readme_text + "\n".join(t for _, t in extra_texts) # ── Collect steps ───────────────────────────────────────────────────── steps = [] seen_paths: set[str] = set() # 1. Intro step steps.append( _make_content_step( "Welcome", f"Introduce the repo: what it does, who this {persona} tour is for, what they'll understand after finishing.", ) ) # 2. Parse README sections if readme_text: sections = _split_into_sections(readme_text) for heading, body in sections: # structure / architecture sections → directory steps if _is_structure_section(heading): paths = _extract_paths_from_text(body, repo) for p in paths: if p in seen_paths: continue seen_paths.add(p) full = repo / p if full.is_dir(): steps.append(_make_dir_step(p, f"mentioned under '{heading}' in README")) elif full.is_file(): steps.append(_make_file_step(p, f"mentioned under '{heading}' in README")) # 3. Scan all text for file/dir references not yet captured all_paths = _extract_paths_from_text(all_text, repo) for p in all_paths: if p in seen_paths: continue seen_paths.add(p) full = repo / p if full.is_dir(): steps.append(_make_dir_step(p)) elif full.is_file(): steps.append(_make_file_step(p)) # 4. If very few file steps found, fall back to top-level directory scan file_and_dir_steps = [s for s in steps if "file" in s or "directory" in s] if len(file_and_dir_steps) < 3: # add top-level directories for item in sorted(repo.iterdir()): if item.name.startswith(".") or item.name in ("node_modules", "__pycache__", ".git"): continue rel = str(item.relative_to(repo)) if rel in seen_paths: continue seen_paths.add(rel) if item.is_dir(): steps.append(_make_dir_step(rel, "top-level directory")) elif item.is_file() and item.suffix in (".ts", ".js", ".py", ".go", ".rs", ".java", ".rb"): steps.append(_make_file_step(rel, "top-level source file")) # 5. URI steps from external links in README links = _extract_external_links(readme_text) # Only include links that look like architecture / design references for label, url in links[:3]: # cap at 3 to avoid noise steps.append(_make_uri_step(url, label)) # 6. Closing step steps.append( _make_content_step( "What to Explore Next", "Summarize what the reader now understands. List 2–3 follow-up tours they should read next.", ) ) # Deduplicate steps by (file/directory/uri key) seen_keys: set = set() deduped = [] for s in steps: key = s.get("file") or s.get("directory") or s.get("uri") or s.get("title") if key in seen_keys: continue seen_keys.add(key) deduped.append(s) return { "$schema": "https://aka.ms/codetour-schema", "title": f"[TODO: descriptive title for {persona} tour]", "description": f"[TODO: one sentence — who this is for and what they'll understand]", "_skeleton_generated_by": "generate_from_docs.py", "_instructions": ( "This is a skeleton. Fill in every [TODO: ...] with real content. " "Read each referenced file before writing its description. " "Remove this _skeleton_generated_by and _instructions field before saving." ), "steps": deduped, } def main(): args = sys.argv[1:] if "--help" in args or "-h" in args: print(__doc__) sys.exit(0) repo_root = "." persona = "new-joiner" output: Optional[str] = None i = 0 while i < len(args): if args[i] == "--repo-root" and i + 1 < len(args): repo_root = args[i + 1] i += 2 elif args[i] == "--persona" and i + 1 < len(args): persona = args[i + 1] i += 2 elif args[i] == "--output" and i + 1 < len(args): output = args[i + 1] i += 2 else: i += 1 skeleton = generate_skeleton(repo_root, persona) out_json = json.dumps(skeleton, indent=2) if output: Path(output).parent.mkdir(parents=True, exist_ok=True) Path(output).write_text(out_json) print(f"✅ Skeleton written to {output}") print(f" {len(skeleton['steps'])} steps generated from docs") print(f" Fill in all [TODO: ...] entries before sharing") else: print(out_json) if __name__ == "__main__": main()