Files
awesome-copilot/skills/acquire-codebase-knowledge/scripts/scan.py
Satya K b8f3822748 Adds a new Agent Skill - Acquire-Codebase-Knowledge (#1373)
* feat(skill): add acquire-codebase-knowledge skill documentation

* feat(templates): add architecture, concerns, conventions, integrations, stack, structure, and testing documentation templates

* feat(references): add inquiry checkpoints and stack detection documentation

* feat(scan): add script to collect project discovery information for acquire-codebase-knowledge skill

* feat(skills): add acquire-codebase-knowledge skill for codebase mapping and documentation

* feat(scan): enhance scan script with absolute path handling and improved output variable validation

* feat(scan): replace bash script with Python script for project discovery information collection

* feat(skills): update acquire-codebase-knowledge skill to replace scan.sh with scan.py
2026-04-14 10:29:57 +10:00

713 lines
23 KiB
Python

#!/usr/bin/env python3
"""
scan.py — Collect project discovery information for the acquire-codebase-knowledge skill.
Run from the project root directory.
Usage: python3 scan.py [OPTIONS]
Options:
--output FILE Write output to FILE instead of stdout
--help Show this message and exit
Exit codes:
0 Success
1 Usage error
"""
import os
import sys
import argparse
import subprocess
import json
from pathlib import Path
from typing import List, Set
import re
# --- Scan limits -------------------------------------------------------------
TREE_LIMIT = 200              # max entries emitted in the directory tree
TREE_MAX_DEPTH = 3            # how deep the tree walk descends
TODO_LIMIT = 60               # max TODO/FIXME/HACK hits reported
MANIFEST_PREVIEW_LINES = 80   # lines shown when previewing a manifest file
RECENT_COMMITS_LIMIT = 20     # git log entries shown
CHURN_LIMIT = 20              # high-churn files shown

# Directories that are never descended into (VCS internals, build output,
# caches, vendored dependencies).
EXCLUDE_DIRS = {
    "node_modules", ".git", "dist", "build", "out", ".next", ".nuxt",
    "__pycache__", ".venv", "venv", ".tox", "target", "vendor",
    "coverage", ".nyc_output", "generated", ".cache", ".turbo",
    ".yarn", ".pnp", "bin", "obj"
}

# Manifest / lockfile names (entries containing "*" are glob patterns).
MANIFESTS = [
    # JavaScript/Node.js
    "package.json", "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "bun.lockb",
    "deno.json", "deno.jsonc",
    # Python
    "requirements.txt", "Pipfile", "Pipfile.lock", "pyproject.toml", "setup.py", "setup.cfg",
    "poetry.lock", "pdm.lock", "uv.lock",
    # Go
    "go.mod", "go.sum",
    # Rust
    "Cargo.toml", "Cargo.lock",
    # Java/Kotlin
    "pom.xml", "build.gradle", "build.gradle.kts", "settings.gradle", "settings.gradle.kts",
    "gradle.properties",
    # PHP/Composer
    "composer.json", "composer.lock",
    # Ruby
    "Gemfile", "Gemfile.lock", "*.gemspec",
    # Elixir
    "mix.exs", "mix.lock",
    # Dart/Flutter
    "pubspec.yaml", "pubspec.lock",
    # .NET/C#
    "*.csproj", "*.sln", "*.slnx", "global.json", "packages.config",
    # Swift
    "Package.swift", "Package.resolved",
    # Scala
    "build.sbt", "scala-cli.yml",
    # Haskell
    "*.cabal", "stack.yaml", "cabal.project", "cabal.project.local",
    # OCaml
    "dune-project", "opam", "opam.lock",
    # Nim
    "*.nimble", "nim.cfg",
    # Crystal
    "shard.yml", "shard.lock",
    # R
    "DESCRIPTION", "renv.lock",
    # Julia
    "Project.toml", "Manifest.toml",
    # Build systems
    "CMakeLists.txt", "Makefile", "GNUmakefile",
    "SConstruct", "build.xml",
    "BUILD", "BUILD.bazel", "WORKSPACE", "bazel.lock",
    "justfile", ".justfile", "Taskfile.yml",
    "tox.ini", "Vagrantfile"
]

# Conventional entry-point file locations per ecosystem.
ENTRY_CANDIDATES = [
    # JavaScript/Node.js/TypeScript
    "src/index.ts", "src/index.js", "src/index.mjs",
    "src/main.ts", "src/main.js", "src/main.py",
    "src/app.ts", "src/app.js",
    "src/server.ts", "src/server.js",
    "index.ts", "index.js", "app.ts", "app.js",
    "lib/index.ts", "lib/index.js",
    # Go
    "main.go", "cmd/main.go", "cmd/*/main.go",
    # Python
    "main.py", "app.py", "server.py", "run.py", "cli.py",
    "src/main.py", "src/__main__.py",
    # .NET/C#
    "Program.cs", "src/Program.cs", "Main.cs",
    # Java
    "Main.java", "Application.java", "App.java",
    "src/main/java/Main.java",
    # Kotlin
    "Main.kt", "Application.kt", "App.kt",
    # Rust
    "src/main.rs", "src/lib.rs",
    # Swift
    "main.swift", "Package.swift", "Sources/main.swift",
    # Ruby
    "app.rb", "main.rb", "lib/app.rb",
    # PHP
    "index.php", "app.php", "public/index.php",
    # Go
    "cmd/*/main.go",
    # Scala
    "src/main/scala/Main.scala",
    # Haskell
    "Main.hs", "app/Main.hs",
    # Clojure
    "src/core.clj", "-main.clj",
    # Elixir
    "lib/application.ex", "mix.exs",
]

# Lint / format / typecheck configuration files.
LINT_FILES = [
    ".eslintrc", ".eslintrc.json", ".eslintrc.js", ".eslintrc.cjs", ".eslintrc.yml", ".eslintrc.yaml",
    "eslint.config.js", "eslint.config.mjs", "eslint.config.cjs",
    ".prettierrc", ".prettierrc.json", ".prettierrc.js", ".prettierrc.yml",
    "prettier.config.js", "prettier.config.mjs",
    ".editorconfig",
    "tsconfig.json", "tsconfig.base.json", "tsconfig.build.json",
    ".golangci.yml", ".golangci.yaml",
    "setup.cfg", ".flake8", ".pylintrc", "mypy.ini",
    ".rubocop.yml", "phpcs.xml", "phpstan.neon",
    "biome.json", "biome.jsonc"
]

# Environment-variable template files worth previewing.
ENV_TEMPLATES = [".env.example", ".env.template", ".env.sample", ".env.defaults", ".env.local.example"]

# Extensions considered "source" for TODO scanning and line counting.
# (The original list contained duplicate "vim" and "ml" entries; deduplicated —
# membership semantics are unchanged.)
SOURCE_EXTS = [
    "ts", "tsx", "js", "jsx", "mjs", "cjs",
    "py", "go", "java", "kt", "rb", "php",
    "rs", "cs", "cpp", "c", "h", "ex", "exs",
    "swift", "scala", "clj", "cljs", "lua",
    "vim", "hs", "ml", "nim", "cr",
    "r", "jl", "groovy", "gradle", "xml", "json"
]

# Monorepo tool config files and conventional sub-package directory names.
MONOREPO_FILES = ["pnpm-workspace.yaml", "lerna.json", "nx.json", "rush.json", "turbo.json", "moon.yml"]
MONOREPO_DIRS = ["packages", "apps", "libs", "services", "modules"]

# CI/CD config path -> human-readable pipeline name.
CI_CD_CONFIGS = {
    ".github/workflows": "GitHub Actions",
    ".gitlab-ci.yml": "GitLab CI",
    "Jenkinsfile": "Jenkins",
    ".circleci/config.yml": "CircleCI",
    ".travis.yml": "Travis CI",
    "azure-pipelines.yml": "Azure Pipelines",
    "appveyor.yml": "AppVeyor",
    ".drone.yml": "Drone CI",
    ".woodpecker.yml": "Woodpecker CI",
    "bitbucket-pipelines.yml": "Bitbucket Pipelines"
}

# Container / orchestration markers (files or directories).
CONTAINER_FILES = [
    "Dockerfile", "docker-compose.yml", "docker-compose.yaml",
    ".dockerignore", "Dockerfile.*",
    "k8s", "kustomization.yaml", "Chart.yaml",
    "Vagrantfile", "podman-compose.yml"
]

# Security / compliance configuration markers.
SECURITY_CONFIGS = [
    ".snyk", "security.txt", "SECURITY.md",
    ".dependabot.yml", ".whitesource",
    "sbom.json", "sbom.spdx", ".bandit.yaml"
]

# Performance-testing / profiling markers (files or directories).
PERFORMANCE_MARKERS = [
    "benchmark", "bench", "perf.data", ".prof",
    "k6.js", "locustfile.py", "jmeter.jmx"
]
def parse_args():
    """Parse and return the command-line arguments for the scanner."""
    parser = argparse.ArgumentParser(
        description="Scan the current directory (project root) and output discovery information "
                    "for the acquire-codebase-knowledge skill.",
        add_help=True
    )
    # Single optional flag: redirect the report to a file instead of stdout.
    parser.add_argument("--output", type=str,
                        help="Write output to FILE instead of stdout")
    return parser.parse_args()
def should_exclude(path: Path) -> bool:
    """Return True when any component of *path* is an excluded directory name."""
    for component in path.parts:
        if component in EXCLUDE_DIRS:
            return True
    return False
def get_directory_tree(max_depth: int = TREE_MAX_DEPTH) -> List[str]:
    """Return up to TREE_LIMIT relative paths under the cwd, at most *max_depth* deep."""
    root = Path.cwd()
    collected: List[str] = []

    def _walk(current: Path, depth: int) -> None:
        # Stop at the depth limit or when inside an excluded directory.
        if depth > max_depth or should_exclude(current):
            return
        try:
            for entry in sorted(current.iterdir()):
                if should_exclude(entry):
                    continue
                collected.append(str(entry.relative_to(root)))
                if entry.is_dir():
                    _walk(entry, depth + 1)
        except (PermissionError, OSError):
            # Unreadable directories are skipped silently.
            pass

    _walk(root, 0)
    return collected[:TREE_LIMIT]
def find_manifest_files() -> List[str]:
    """Return sorted, de-duplicated manifest filenames found in the project root."""
    root = Path.cwd()
    hits: Set[str] = set()
    for pattern in MANIFESTS:
        if "*" in pattern:
            # Patterns such as "*.csproj" are glob-expanded in the root only.
            hits.update(
                match.name
                for match in root.glob(pattern)
                if match.is_file() and not should_exclude(match)
            )
        elif (root / pattern).is_file():
            hits.add(pattern)
    return sorted(hits)
def read_file_preview(filepath: Path, max_lines: int = MANIFEST_PREVIEW_LINES) -> str:
    """Return up to *max_lines* lines of *filepath*, with a truncation note.

    Returns "None found." for an empty file, and an error marker string
    (never raises) when the file cannot be read.
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as handle:
            all_lines = handle.readlines()
    except Exception as exc:
        return f"[Error reading file: {exc}]"
    if not all_lines:
        return "None found."
    preview = ''.join(all_lines[:max_lines])
    if len(all_lines) > max_lines:
        preview += f"\n[TRUNCATED] Showing first {max_lines} of {len(all_lines)} lines."
    return preview
def find_entry_points() -> List[str]:
    """Return entry-point candidate paths that exist in the project.

    Literal candidates are checked directly. Candidates containing "*"
    (e.g. "cmd/*/main.go") are glob-expanded — the original tested the raw
    pattern with Path.exists(), which is always False, so those candidates
    could never match. Results are de-duplicated, preserving order.
    """
    found: List[str] = []
    for candidate in ENTRY_CANDIDATES:
        if "*" in candidate:
            for match in sorted(Path(".").glob(candidate)):
                rel = str(match)
                if rel not in found:
                    found.append(rel)
        elif Path(candidate).exists() and candidate not in found:
            found.append(candidate)
    return found
def find_lint_config() -> List[str]:
    """Return linting/formatting config filenames present in the project root."""
    return [name for name in LINT_FILES if Path(name).exists()]
def find_env_templates() -> List[tuple]:
    """Return (filename, Path) pairs for environment templates that exist."""
    hits: List[tuple] = []
    for name in ENV_TEMPLATES:
        candidate = Path(name)
        if candidate.exists():
            hits.append((name, candidate))
    return hits
def search_todos() -> List[str]:
    """Search source files for TODO/FIXME/HACK comments.

    Walks the cwd, skipping EXCLUDE_DIRS plus test-related directories, and
    returns at most TODO_LIMIT entries formatted "path:line: text".

    Fixes over the original: the unused local ``exclude_dirs_str`` is
    removed, and a line containing two markers (e.g. both TODO and FIXME)
    is now reported once instead of twice.
    """
    markers = ("TODO", "FIXME", "HACK")
    test_dirs = {"test", "tests", "__tests__", "spec", "__mocks__", "fixtures"}
    skip_dirs = EXCLUDE_DIRS | test_dirs
    root = Path.cwd()
    todos: List[str] = []
    try:
        for dirpath, dirnames, filenames in os.walk(root):
            # Prune in place so os.walk never descends into skipped dirs.
            dirnames[:] = [d for d in dirnames if d not in skip_dirs]
            for name in filenames:
                # Only scan recognized source extensions.
                if Path(name).suffix.lstrip('.') not in SOURCE_EXTS:
                    continue
                filepath = Path(dirpath) / name
                try:
                    with open(filepath, 'r', encoding='utf-8', errors='replace') as handle:
                        rel_path = filepath.relative_to(root)
                        for line_num, line in enumerate(handle, 1):
                            if any(marker in line for marker in markers):
                                todos.append(f"{rel_path}:{line_num}: {line.strip()}")
                except Exception:
                    pass  # unreadable file: best-effort scan, skip it
            if len(todos) >= TODO_LIMIT:
                break  # enough hits collected; result is sliced below anyway
    except Exception:
        pass
    return todos[:TODO_LIMIT]
def get_git_commits() -> List[str]:
    """Return the most recent git commits in --oneline format (empty on failure)."""
    try:
        proc = subprocess.run(
            ["git", "log", "--oneline", "-n", str(RECENT_COMMITS_LIMIT)],
            capture_output=True,
            text=True,
            cwd=Path.cwd()
        )
    except Exception:
        return []
    if proc.returncode != 0:
        return []
    output = proc.stdout.strip()
    return output.split('\n') if output else []
def get_git_churn() -> List[str]:
    """Return the most frequently changed files over the last 90 days.

    Each entry is "<count> <path>", most-changed first, limited to
    CHURN_LIMIT entries. Returns [] when git fails or is unavailable.
    """
    try:
        result = subprocess.run(
            ["git", "log", "--since=90 days ago", "--name-only", "--pretty=format:"],
            capture_output=True,
            text=True,
            cwd=Path.cwd()
        )
        if result.returncode != 0:
            return []
        from collections import Counter
        counts = Counter(f.strip() for f in result.stdout.split('\n') if f.strip())
        # The original dropped the filename and printed a literal "(unknown)"
        # placeholder; include the path so the churn report is usable.
        return [
            f"{count:4d} {filename}"
            for filename, count in counts.most_common(CHURN_LIMIT)
        ]
    except Exception:
        return []
def is_git_repo() -> bool:
    """Return True if the current directory is inside a git repository.

    Fix: subprocess.run does not raise on a non-zero exit status, so the
    original unconditional ``return True`` reported every directory as a
    git repo whenever the ``git`` binary existed. The exit code must be
    checked explicitly.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            capture_output=True,
            cwd=Path.cwd(),
            timeout=2
        )
        return result.returncode == 0
    except Exception:
        # git missing, timeout, etc. — treat as "not a repo".
        return False
def detect_monorepo() -> List[str]:
    """Detect monorepo signals: tool configs, sub-package dirs, npm workspaces."""
    signals: List[str] = []
    for filename in MONOREPO_FILES:
        if Path(filename).exists():
            # Name the detected tool config (the original printed a literal
            # "(unknown)" placeholder here).
            signals.append(f"Monorepo tool detected: {filename}")
    for dirname in MONOREPO_DIRS:
        if Path(dirname).is_dir():
            signals.append(f"Sub-package directory found: {dirname}/")
    # npm/yarn workspaces are declared inside package.json, not a separate file.
    if Path("package.json").exists():
        try:
            with open("package.json", 'r') as f:
                if '"workspaces"' in f.read():
                    signals.append("package.json has 'workspaces' field (npm/yarn workspaces monorepo)")
        except Exception:
            pass
    return signals
def detect_ci_cd_pipelines() -> List[str]:
    """Detect CI/CD pipeline configurations from known files and directories."""
    detected: List[str] = []
    for config_path, pipeline_name in CI_CD_CONFIGS.items():
        location = Path(config_path)
        if location.is_file():
            detected.append(f"CI/CD: {pipeline_name}")
            continue
        if not location.is_dir():
            continue
        # A workflow directory (e.g. .github/workflows) counts only when it
        # actually contains YAML files.
        try:
            has_yaml = bool(list(location.glob("*.yml")) or list(location.glob("*.yaml")))
        except Exception:
            has_yaml = False
        if has_yaml:
            detected.append(f"CI/CD: {pipeline_name}")
    return detected
def detect_containers() -> List[str]:
    """Detect containerization and orchestration configs."""
    findings: List[str] = []
    for config in CONTAINER_FILES:
        path = Path(config)
        if path.is_file():
            if "Dockerfile" in config:
                findings.append("Container: Docker found")
            elif "docker-compose" in config:
                findings.append("Orchestration: Docker Compose found")
            elif config.endswith((".yaml", ".yml")):
                findings.append(f"Container/Orchestration: {config}")
        elif path.is_dir():
            # NOTE(review): a k8s/ directory containing YAML is reported
            # twice (named message + generic message) — preserving the
            # original behavior as-is.
            if config in ("k8s", "kubernetes"):
                findings.append("Orchestration: Kubernetes configs found")
            try:
                if list(path.glob("*.yml")) or list(path.glob("*.yaml")):
                    findings.append(f"Container/Orchestration: {config}/ directory found")
            except Exception:
                pass
    return findings
def detect_security_configs() -> List[str]:
    """Detect security and compliance configuration files."""
    findings: List[str] = []
    for config in SECURITY_CONFIGS:
        if not Path(config).exists():
            continue
        # Display label: strip a YAML suffix and any leading dots.
        label = config.replace(".yml", "").replace(".yaml", "").lstrip(".")
        findings.append(f"Security: {label}")
    return findings
def detect_performance_markers() -> List[str]:
    """Detect performance testing and profiling markers.

    Fix: ``is_dir()`` is checked before ``exists()`` — ``exists()`` is also
    true for directories, so the original "directory found" branch was
    unreachable and directories were reported with the file message.
    """
    findings: List[str] = []
    for marker in PERFORMANCE_MARKERS:
        path = Path(marker)
        try:
            if path.is_dir():
                findings.append(f"Performance: {marker}/ directory found")
            elif path.exists():
                findings.append(f"Performance: {marker} found")
        except Exception:
            pass
    return findings
def collect_code_metrics() -> dict:
    """Collect code metrics: file counts by extension/language, total LOC,
    and the ten largest files (by byte size) under the cwd."""
    metrics = {
        "total_files": 0,
        "by_extension": {},
        "by_language": {},
        "total_lines": 0,
        "largest_files": []
    }
    # Extension -> human-readable language label; unknown extensions -> "Other".
    lang_map = {
        "ts": "TypeScript", "tsx": "TypeScript/React", "js": "JavaScript",
        "jsx": "JavaScript/React", "py": "Python", "go": "Go",
        "java": "Java", "kt": "Kotlin", "rs": "Rust",
        "cs": "C#", "rb": "Ruby", "php": "PHP",
        "swift": "Swift", "scala": "Scala", "ex": "Elixir",
        "cpp": "C++", "c": "C", "h": "C Header",
        "clj": "Clojure", "lua": "Lua", "hs": "Haskell"
    }
    root = Path.cwd()
    sizes = []
    try:
        for dirpath, dirnames, filenames in os.walk(root):
            # Prune excluded directories in place.
            dirnames[:] = [d for d in dirnames if d not in EXCLUDE_DIRS]
            for name in filenames:
                filepath = Path(dirpath) / name
                ext = filepath.suffix.lstrip('.')
                # Skip extensionless files and compiled artifacts.
                if not ext or ext in {"pyc", "o", "a", "so"}:
                    continue
                try:
                    size = filepath.stat().st_size
                except Exception:
                    continue
                sizes.append((filepath.relative_to(root), size))
                metrics["total_files"] += 1
                metrics["by_extension"][ext] = metrics["by_extension"].get(ext, 0) + 1
                label = lang_map.get(ext, "Other")
                metrics["by_language"][label] = metrics["by_language"].get(label, 0) + 1
                # Count lines only for known source extensions, skipping
                # files of 1 MB or more.
                if ext in SOURCE_EXTS and size < 1_000_000:
                    try:
                        with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
                            metrics["total_lines"] += sum(1 for _ in handle)
                    except Exception:
                        pass
        sizes.sort(key=lambda item: item[1], reverse=True)
        metrics["largest_files"] = [
            f"{str(p)}: {s/1024:.1f}KB" for p, s in sizes[:10]
        ]
    except Exception:
        pass
    return metrics
def print_section(title: str, content: List[str], output_file=None) -> None:
    """Emit a titled report section to *output_file*, or stdout when None.

    A list renders one item per line (or "None found." when empty); a plain
    string is emitted as a single line under the title.
    """
    body = [f"\n=== {title} ==="]
    if isinstance(content, list):
        body.extend(content or ["None found."])
    elif isinstance(content, str):
        body.append(content)
    rendered = '\n'.join(body) + '\n'
    if output_file:
        output_file.write(rendered)
    else:
        print(rendered, end='')
def main():
    """Run the full project scan and emit every report section.

    Returns 0 on success, 1 on error (exit-code convention of the script).
    Fix: the environment-template section previously printed a literal
    "--- (unknown) ---" header for every template; it now shows the filename.
    """
    args = parse_args()
    output_file = None
    if args.output:
        # Ensure the destination directory exists before opening the file.
        Path(args.output).parent.mkdir(parents=True, exist_ok=True)
        output_file = open(args.output, 'w', encoding='utf-8')
        print(f"Writing output to: {args.output}", file=sys.stderr)
    try:
        # Directory tree
        print_section(
            f"DIRECTORY TREE (max depth {TREE_MAX_DEPTH}, source files only)",
            get_directory_tree(),
            output_file
        )

        # Stack detection: preview every manifest found in the root.
        manifests = find_manifest_files()
        if manifests:
            manifest_content = [""]
            for manifest in manifests:
                manifest_content.append(f"--- {manifest} ---")
                if manifest == "bun.lockb":
                    # Binary lockfile — no useful text preview.
                    manifest_content.append("[Binary lockfile — see package.json for dependency details.]")
                else:
                    manifest_content.append(read_file_preview(Path(manifest)))
            print_section("STACK DETECTION (manifest files)", manifest_content, output_file)
        else:
            print_section("STACK DETECTION (manifest files)", ["No recognized manifest files found in project root."], output_file)

        # Entry points
        entries = find_entry_points()
        if entries:
            print_section("ENTRY POINTS", [f"Found: {e}" for e in entries], output_file)
        else:
            print_section("ENTRY POINTS", ["No common entry points found. Check 'main' or 'scripts.start' in manifest files above."], output_file)

        # Linting config
        lint = find_lint_config()
        if lint:
            print_section("LINTING AND FORMATTING CONFIG", [f"Found: {name}" for name in lint], output_file)
        else:
            print_section("LINTING AND FORMATTING CONFIG", ["No linting or formatting config files found in project root."], output_file)

        # Environment templates
        envs = find_env_templates()
        if envs:
            env_content = []
            for filename, filepath in envs:
                # Header names the template file (was a "(unknown)" placeholder).
                env_content.append(f"--- {filename} ---")
                env_content.append(read_file_preview(filepath))
            print_section("ENVIRONMENT VARIABLE TEMPLATES", env_content, output_file)
        else:
            print_section("ENVIRONMENT VARIABLE TEMPLATES", ["No .env.example or .env.template found. Identify required environment variables by searching the code and config for environment variable reads."], output_file)

        # TODOs
        todos = search_todos()
        print_section(
            "TODO / FIXME / HACK (production code only, test dirs excluded)",
            todos if todos else ["None found."],
            output_file
        )

        # Git info
        if is_git_repo():
            commits = get_git_commits()
            print_section("GIT RECENT COMMITS (last 20)",
                          commits if commits else ["No commits found."], output_file)
            churn = get_git_churn()
            print_section("HIGH-CHURN FILES (last 90 days, top 20)",
                          churn if churn else ["None found."], output_file)
        else:
            print_section("GIT RECENT COMMITS (last 20)", ["Not a git repository or no commits yet."], output_file)
            print_section("HIGH-CHURN FILES (last 90 days, top 20)", ["Not a git repository."], output_file)

        # Monorepo detection
        monorepo = detect_monorepo()
        print_section("MONOREPO SIGNALS",
                      monorepo if monorepo else ["No monorepo signals detected."], output_file)

        # Code metrics
        metrics = collect_code_metrics()
        metrics_output = [
            f"Total files scanned: {metrics['total_files']}",
            f"Total lines of code: {metrics['total_lines']}",
            ""
        ]
        if metrics["by_language"]:
            metrics_output.append("Files by language:")
            for lang, count in sorted(metrics["by_language"].items(), key=lambda x: x[1], reverse=True):
                metrics_output.append(f"  {lang}: {count}")
        if metrics["largest_files"]:
            metrics_output.append("")
            metrics_output.append("Top 10 largest files:")
            metrics_output.extend(metrics["largest_files"])
        print_section("CODE METRICS", metrics_output, output_file)

        # CI/CD detection
        ci_cd = detect_ci_cd_pipelines()
        print_section("CI/CD PIPELINES",
                      ci_cd if ci_cd else ["No CI/CD pipelines detected."], output_file)

        # Container detection
        containers = detect_containers()
        print_section("CONTAINERS & ORCHESTRATION",
                      containers if containers else ["No containerization configs detected."], output_file)

        # Security configs
        security = detect_security_configs()
        print_section("SECURITY & COMPLIANCE",
                      security if security else ["No security configs detected."], output_file)

        # Performance markers
        performance = detect_performance_markers()
        print_section("PERFORMANCE & TESTING",
                      performance if performance else ["No performance testing configs detected."], output_file)

        # Final message
        final_msg = "\n=== SCAN COMPLETE ===\n"
        if output_file:
            output_file.write(final_msg)
        else:
            print(final_msg, end='')
        return 0
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    finally:
        if output_file:
            output_file.close()


if __name__ == "__main__":
    sys.exit(main())