Files
awesome-copilot/skills/acquire-codebase-knowledge/scripts/scan.py
Satya K b8f3822748 Adds a new Agent Skill - Acquire-Codebase-Knowledge (#1373)
* feat(skill): add acquire-codebase-knowledge skill documentation

* feat(templates): add architecture, concerns, conventions, integrations, stack, structure, and testing documentation templates

* feat(references): add inquiry checkpoints and stack detection documentation

* feat(scan): add script to collect project discovery information for acquire-codebase-knowledge skill

* feat(skills): add acquire-codebase-knowledge skill for codebase mapping and documentation

* feat(scan): enhance scan script with absolute path handling and improved output variable validation

* feat(scan): replace bash script with Python script for project discovery information collection

* feat(skills): update acquire-codebase-knowledge skill to replace scan.sh with scan.py
2026-04-14 10:29:57 +10:00

713 lines
23 KiB
Python

#!/usr/bin/env python3
"""
scan.py — Collect project discovery information for the acquire-codebase-knowledge skill.
Run from the project root directory.
Usage: python3 scan.py [OPTIONS]
Options:
--output FILE Write output to FILE instead of stdout
--help Show this message and exit
Exit codes:
0 Success
1 Usage error
"""
import os
import sys
import argparse
import subprocess
import json
from pathlib import Path
from typing import List, Set
import re
# --- Scan limits -------------------------------------------------------------
TREE_LIMIT = 200              # max entries emitted in the directory tree
TREE_MAX_DEPTH = 3            # how deep the tree walk descends
TODO_LIMIT = 60               # max TODO/FIXME/HACK hits reported
MANIFEST_PREVIEW_LINES = 80   # lines shown when previewing a manifest file
RECENT_COMMITS_LIMIT = 20     # git log entries shown
CHURN_LIMIT = 20              # high-churn files shown

# Directories that are never descended into (VCS internals, build output,
# caches, vendored dependencies).
EXCLUDE_DIRS = {
    "node_modules", ".git", "dist", "build", "out", ".next", ".nuxt",
    "__pycache__", ".venv", "venv", ".tox", "target", "vendor",
    "coverage", ".nyc_output", "generated", ".cache", ".turbo",
    ".yarn", ".pnp", "bin", "obj"
}

# Manifest / lockfile names (entries containing "*" are glob patterns).
MANIFESTS = [
    # JavaScript/Node.js
    "package.json", "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "bun.lockb",
    "deno.json", "deno.jsonc",
    # Python
    "requirements.txt", "Pipfile", "Pipfile.lock", "pyproject.toml", "setup.py", "setup.cfg",
    "poetry.lock", "pdm.lock", "uv.lock",
    # Go
    "go.mod", "go.sum",
    # Rust
    "Cargo.toml", "Cargo.lock",
    # Java/Kotlin
    "pom.xml", "build.gradle", "build.gradle.kts", "settings.gradle", "settings.gradle.kts",
    "gradle.properties",
    # PHP/Composer
    "composer.json", "composer.lock",
    # Ruby
    "Gemfile", "Gemfile.lock", "*.gemspec",
    # Elixir
    "mix.exs", "mix.lock",
    # Dart/Flutter
    "pubspec.yaml", "pubspec.lock",
    # .NET/C#
    "*.csproj", "*.sln", "*.slnx", "global.json", "packages.config",
    # Swift
    "Package.swift", "Package.resolved",
    # Scala
    "build.sbt", "scala-cli.yml",
    # Haskell
    "*.cabal", "stack.yaml", "cabal.project", "cabal.project.local",
    # OCaml
    "dune-project", "opam", "opam.lock",
    # Nim
    "*.nimble", "nim.cfg",
    # Crystal
    "shard.yml", "shard.lock",
    # R
    "DESCRIPTION", "renv.lock",
    # Julia
    "Project.toml", "Manifest.toml",
    # Build systems
    "CMakeLists.txt", "Makefile", "GNUmakefile",
    "SConstruct", "build.xml",
    "BUILD", "BUILD.bazel", "WORKSPACE", "bazel.lock",
    "justfile", ".justfile", "Taskfile.yml",
    "tox.ini", "Vagrantfile"
]

# Conventional entry-point file locations per ecosystem.
ENTRY_CANDIDATES = [
    # JavaScript/Node.js/TypeScript
    "src/index.ts", "src/index.js", "src/index.mjs",
    "src/main.ts", "src/main.js", "src/main.py",
    "src/app.ts", "src/app.js",
    "src/server.ts", "src/server.js",
    "index.ts", "index.js", "app.ts", "app.js",
    "lib/index.ts", "lib/index.js",
    # Go
    "main.go", "cmd/main.go", "cmd/*/main.go",
    # Python
    "main.py", "app.py", "server.py", "run.py", "cli.py",
    "src/main.py", "src/__main__.py",
    # .NET/C#
    "Program.cs", "src/Program.cs", "Main.cs",
    # Java
    "Main.java", "Application.java", "App.java",
    "src/main/java/Main.java",
    # Kotlin
    "Main.kt", "Application.kt", "App.kt",
    # Rust
    "src/main.rs", "src/lib.rs",
    # Swift
    "main.swift", "Package.swift", "Sources/main.swift",
    # Ruby
    "app.rb", "main.rb", "lib/app.rb",
    # PHP
    "index.php", "app.php", "public/index.php",
    # Go
    "cmd/*/main.go",
    # Scala
    "src/main/scala/Main.scala",
    # Haskell
    "Main.hs", "app/Main.hs",
    # Clojure
    "src/core.clj", "-main.clj",
    # Elixir
    "lib/application.ex", "mix.exs",
]

# Lint / format / typecheck configuration files.
LINT_FILES = [
    ".eslintrc", ".eslintrc.json", ".eslintrc.js", ".eslintrc.cjs", ".eslintrc.yml", ".eslintrc.yaml",
    "eslint.config.js", "eslint.config.mjs", "eslint.config.cjs",
    ".prettierrc", ".prettierrc.json", ".prettierrc.js", ".prettierrc.yml",
    "prettier.config.js", "prettier.config.mjs",
    ".editorconfig",
    "tsconfig.json", "tsconfig.base.json", "tsconfig.build.json",
    ".golangci.yml", ".golangci.yaml",
    "setup.cfg", ".flake8", ".pylintrc", "mypy.ini",
    ".rubocop.yml", "phpcs.xml", "phpstan.neon",
    "biome.json", "biome.jsonc"
]

# Environment-variable template files worth previewing.
ENV_TEMPLATES = [".env.example", ".env.template", ".env.sample", ".env.defaults", ".env.local.example"]

# Extensions considered "source" for TODO scanning and line counting.
# (The original list contained duplicate "vim" and "ml" entries; deduplicated —
# membership semantics are unchanged.)
SOURCE_EXTS = [
    "ts", "tsx", "js", "jsx", "mjs", "cjs",
    "py", "go", "java", "kt", "rb", "php",
    "rs", "cs", "cpp", "c", "h", "ex", "exs",
    "swift", "scala", "clj", "cljs", "lua",
    "vim", "hs", "ml", "nim", "cr",
    "r", "jl", "groovy", "gradle", "xml", "json"
]

# Monorepo tool config files and conventional sub-package directory names.
MONOREPO_FILES = ["pnpm-workspace.yaml", "lerna.json", "nx.json", "rush.json", "turbo.json", "moon.yml"]
MONOREPO_DIRS = ["packages", "apps", "libs", "services", "modules"]

# CI/CD config path -> human-readable pipeline name.
CI_CD_CONFIGS = {
    ".github/workflows": "GitHub Actions",
    ".gitlab-ci.yml": "GitLab CI",
    "Jenkinsfile": "Jenkins",
    ".circleci/config.yml": "CircleCI",
    ".travis.yml": "Travis CI",
    "azure-pipelines.yml": "Azure Pipelines",
    "appveyor.yml": "AppVeyor",
    ".drone.yml": "Drone CI",
    ".woodpecker.yml": "Woodpecker CI",
    "bitbucket-pipelines.yml": "Bitbucket Pipelines"
}

# Container / orchestration markers (files or directories).
CONTAINER_FILES = [
    "Dockerfile", "docker-compose.yml", "docker-compose.yaml",
    ".dockerignore", "Dockerfile.*",
    "k8s", "kustomization.yaml", "Chart.yaml",
    "Vagrantfile", "podman-compose.yml"
]

# Security / compliance configuration markers.
SECURITY_CONFIGS = [
    ".snyk", "security.txt", "SECURITY.md",
    ".dependabot.yml", ".whitesource",
    "sbom.json", "sbom.spdx", ".bandit.yaml"
]

# Performance-testing / profiling markers (files or directories).
PERFORMANCE_MARKERS = [
    "benchmark", "bench", "perf.data", ".prof",
    "k6.js", "locustfile.py", "jmeter.jmx"
]
def parse_args():
    """Parse and return the command-line arguments for the scanner."""
    parser = argparse.ArgumentParser(
        description="Scan the current directory (project root) and output discovery information "
                    "for the acquire-codebase-knowledge skill.",
        add_help=True
    )
    # Single optional flag: redirect the report to a file instead of stdout.
    parser.add_argument("--output", type=str,
                        help="Write output to FILE instead of stdout")
    return parser.parse_args()
def should_exclude(path: Path) -> bool:
    """Return True when any component of *path* is an excluded directory name."""
    for component in path.parts:
        if component in EXCLUDE_DIRS:
            return True
    return False
def get_directory_tree(max_depth: int = TREE_MAX_DEPTH) -> List[str]:
    """Return up to TREE_LIMIT relative paths under the cwd, at most *max_depth* deep."""
    root = Path.cwd()
    collected: List[str] = []

    def _walk(current: Path, depth: int) -> None:
        # Stop at the depth limit or when inside an excluded directory.
        if depth > max_depth or should_exclude(current):
            return
        try:
            for entry in sorted(current.iterdir()):
                if should_exclude(entry):
                    continue
                collected.append(str(entry.relative_to(root)))
                if entry.is_dir():
                    _walk(entry, depth + 1)
        except (PermissionError, OSError):
            # Unreadable directories are skipped silently.
            pass

    _walk(root, 0)
    return collected[:TREE_LIMIT]
def find_manifest_files() -> List[str]:
    """Return sorted, de-duplicated manifest filenames found in the project root."""
    root = Path.cwd()
    hits: Set[str] = set()
    for pattern in MANIFESTS:
        if "*" in pattern:
            # Patterns such as "*.csproj" are glob-expanded in the root only.
            hits.update(
                match.name
                for match in root.glob(pattern)
                if match.is_file() and not should_exclude(match)
            )
        elif (root / pattern).is_file():
            hits.add(pattern)
    return sorted(hits)
def read_file_preview(filepath: Path, max_lines: int = MANIFEST_PREVIEW_LINES) -> str:
    """Return up to *max_lines* lines of *filepath*, with a truncation note.

    Returns "None found." for an empty file, and an error marker string
    (never raises) when the file cannot be read.
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as handle:
            all_lines = handle.readlines()
    except Exception as exc:
        return f"[Error reading file: {exc}]"
    if not all_lines:
        return "None found."
    preview = ''.join(all_lines[:max_lines])
    if len(all_lines) > max_lines:
        preview += f"\n[TRUNCATED] Showing first {max_lines} of {len(all_lines)} lines."
    return preview
def find_entry_points() -> List[str]:
    """Return entry-point candidate paths that exist in the project.

    Literal candidates are checked directly. Candidates containing "*"
    (e.g. "cmd/*/main.go") are glob-expanded — the original tested the raw
    pattern with Path.exists(), which is always False, so those candidates
    could never match. Results are de-duplicated, preserving order.
    """
    found: List[str] = []
    for candidate in ENTRY_CANDIDATES:
        if "*" in candidate:
            for match in sorted(Path(".").glob(candidate)):
                rel = str(match)
                if rel not in found:
                    found.append(rel)
        elif Path(candidate).exists() and candidate not in found:
            found.append(candidate)
    return found
def find_lint_config() -> List[str]:
    """Return linting/formatting config filenames present in the project root."""
    return [name for name in LINT_FILES if Path(name).exists()]
def find_env_templates() -> List[tuple]:
    """Return (filename, Path) pairs for environment templates that exist."""
    hits: List[tuple] = []
    for name in ENV_TEMPLATES:
        candidate = Path(name)
        if candidate.exists():
            hits.append((name, candidate))
    return hits
def search_todos() -> List[str]:
    """Search source files for TODO/FIXME/HACK comments.

    Walks the cwd, skipping EXCLUDE_DIRS plus test-related directories, and
    returns at most TODO_LIMIT entries formatted "path:line: text".

    Fixes over the original: the unused local ``exclude_dirs_str`` is
    removed, and a line containing two markers (e.g. both TODO and FIXME)
    is now reported once instead of twice.
    """
    markers = ("TODO", "FIXME", "HACK")
    test_dirs = {"test", "tests", "__tests__", "spec", "__mocks__", "fixtures"}
    skip_dirs = EXCLUDE_DIRS | test_dirs
    root = Path.cwd()
    todos: List[str] = []
    try:
        for dirpath, dirnames, filenames in os.walk(root):
            # Prune in place so os.walk never descends into skipped dirs.
            dirnames[:] = [d for d in dirnames if d not in skip_dirs]
            for name in filenames:
                # Only scan recognized source extensions.
                if Path(name).suffix.lstrip('.') not in SOURCE_EXTS:
                    continue
                filepath = Path(dirpath) / name
                try:
                    with open(filepath, 'r', encoding='utf-8', errors='replace') as handle:
                        rel_path = filepath.relative_to(root)
                        for line_num, line in enumerate(handle, 1):
                            if any(marker in line for marker in markers):
                                todos.append(f"{rel_path}:{line_num}: {line.strip()}")
                except Exception:
                    pass  # unreadable file: best-effort scan, skip it
            if len(todos) >= TODO_LIMIT:
                break  # enough hits collected; result is sliced below anyway
    except Exception:
        pass
    return todos[:TODO_LIMIT]
def get_git_commits() -> List[str]:
    """Return the most recent git commits in --oneline format (empty on failure)."""
    try:
        proc = subprocess.run(
            ["git", "log", "--oneline", "-n", str(RECENT_COMMITS_LIMIT)],
            capture_output=True,
            text=True,
            cwd=Path.cwd()
        )
    except Exception:
        return []
    if proc.returncode != 0:
        return []
    output = proc.stdout.strip()
    return output.split('\n') if output else []
def get_git_churn() -> List[str]:
    """Return the most frequently changed files over the last 90 days.

    Each entry is "<count> <path>", most-changed first, limited to
    CHURN_LIMIT entries. Returns [] when git fails or is unavailable.
    """
    try:
        result = subprocess.run(
            ["git", "log", "--since=90 days ago", "--name-only", "--pretty=format:"],
            capture_output=True,
            text=True,
            cwd=Path.cwd()
        )
        if result.returncode != 0:
            return []
        from collections import Counter
        counts = Counter(f.strip() for f in result.stdout.split('\n') if f.strip())
        # The original dropped the filename and printed a literal "(unknown)"
        # placeholder; include the path so the churn report is usable.
        return [
            f"{count:4d} {filename}"
            for filename, count in counts.most_common(CHURN_LIMIT)
        ]
    except Exception:
        return []
def is_git_repo() -> bool:
    """Return True if the current directory is inside a git repository.

    Fix: subprocess.run does not raise on a non-zero exit status, so the
    original unconditional ``return True`` reported every directory as a
    git repo whenever the ``git`` binary existed. The exit code must be
    checked explicitly.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            capture_output=True,
            cwd=Path.cwd(),
            timeout=2
        )
        return result.returncode == 0
    except Exception:
        # git missing, timeout, etc. — treat as "not a repo".
        return False
def detect_monorepo() -> List[str]:
    """Detect monorepo signals: tool configs, sub-package dirs, npm workspaces."""
    signals: List[str] = []
    for filename in MONOREPO_FILES:
        if Path(filename).exists():
            # Name the detected tool config (the original printed a literal
            # "(unknown)" placeholder here).
            signals.append(f"Monorepo tool detected: {filename}")
    for dirname in MONOREPO_DIRS:
        if Path(dirname).is_dir():
            signals.append(f"Sub-package directory found: {dirname}/")
    # npm/yarn workspaces are declared inside package.json, not a separate file.
    if Path("package.json").exists():
        try:
            with open("package.json", 'r') as f:
                if '"workspaces"' in f.read():
                    signals.append("package.json has 'workspaces' field (npm/yarn workspaces monorepo)")
        except Exception:
            pass
    return signals
def detect_ci_cd_pipelines() -> List[str]:
    """Detect CI/CD pipeline configurations from known files and directories."""
    detected: List[str] = []
    for config_path, pipeline_name in CI_CD_CONFIGS.items():
        location = Path(config_path)
        if location.is_file():
            detected.append(f"CI/CD: {pipeline_name}")
            continue
        if not location.is_dir():
            continue
        # A workflow directory (e.g. .github/workflows) counts only when it
        # actually contains YAML files.
        try:
            has_yaml = bool(list(location.glob("*.yml")) or list(location.glob("*.yaml")))
        except Exception:
            has_yaml = False
        if has_yaml:
            detected.append(f"CI/CD: {pipeline_name}")
    return detected
def detect_containers() -> List[str]:
    """Detect containerization and orchestration configs."""
    findings: List[str] = []
    for config in CONTAINER_FILES:
        path = Path(config)
        if path.is_file():
            if "Dockerfile" in config:
                findings.append("Container: Docker found")
            elif "docker-compose" in config:
                findings.append("Orchestration: Docker Compose found")
            elif config.endswith((".yaml", ".yml")):
                findings.append(f"Container/Orchestration: {config}")
        elif path.is_dir():
            # NOTE(review): a k8s/ directory containing YAML is reported
            # twice (named message + generic message) — preserving the
            # original behavior as-is.
            if config in ("k8s", "kubernetes"):
                findings.append("Orchestration: Kubernetes configs found")
            try:
                if list(path.glob("*.yml")) or list(path.glob("*.yaml")):
                    findings.append(f"Container/Orchestration: {config}/ directory found")
            except Exception:
                pass
    return findings
def detect_security_configs() -> List[str]:
    """Detect security and compliance configuration files."""
    findings: List[str] = []
    for config in SECURITY_CONFIGS:
        if not Path(config).exists():
            continue
        # Display label: strip a YAML suffix and any leading dots.
        label = config.replace(".yml", "").replace(".yaml", "").lstrip(".")
        findings.append(f"Security: {label}")
    return findings
def detect_performance_markers() -> List[str]:
    """Detect performance testing and profiling markers.

    Fix: ``is_dir()`` is checked before ``exists()`` — ``exists()`` is also
    true for directories, so the original "directory found" branch was
    unreachable and directories were reported with the file message.
    """
    findings: List[str] = []
    for marker in PERFORMANCE_MARKERS:
        path = Path(marker)
        try:
            if path.is_dir():
                findings.append(f"Performance: {marker}/ directory found")
            elif path.exists():
                findings.append(f"Performance: {marker} found")
        except Exception:
            pass
    return findings
def collect_code_metrics() -> dict:
    """Collect code metrics: file counts by extension/language, total LOC,
    and the ten largest files (by byte size) under the cwd."""
    metrics = {
        "total_files": 0,
        "by_extension": {},
        "by_language": {},
        "total_lines": 0,
        "largest_files": []
    }
    # Extension -> human-readable language label; unknown extensions -> "Other".
    lang_map = {
        "ts": "TypeScript", "tsx": "TypeScript/React", "js": "JavaScript",
        "jsx": "JavaScript/React", "py": "Python", "go": "Go",
        "java": "Java", "kt": "Kotlin", "rs": "Rust",
        "cs": "C#", "rb": "Ruby", "php": "PHP",
        "swift": "Swift", "scala": "Scala", "ex": "Elixir",
        "cpp": "C++", "c": "C", "h": "C Header",
        "clj": "Clojure", "lua": "Lua", "hs": "Haskell"
    }
    root = Path.cwd()
    sizes = []
    try:
        for dirpath, dirnames, filenames in os.walk(root):
            # Prune excluded directories in place.
            dirnames[:] = [d for d in dirnames if d not in EXCLUDE_DIRS]
            for name in filenames:
                filepath = Path(dirpath) / name
                ext = filepath.suffix.lstrip('.')
                # Skip extensionless files and compiled artifacts.
                if not ext or ext in {"pyc", "o", "a", "so"}:
                    continue
                try:
                    size = filepath.stat().st_size
                except Exception:
                    continue
                sizes.append((filepath.relative_to(root), size))
                metrics["total_files"] += 1
                metrics["by_extension"][ext] = metrics["by_extension"].get(ext, 0) + 1
                label = lang_map.get(ext, "Other")
                metrics["by_language"][label] = metrics["by_language"].get(label, 0) + 1
                # Count lines only for known source extensions, skipping
                # files of 1 MB or more.
                if ext in SOURCE_EXTS and size < 1_000_000:
                    try:
                        with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
                            metrics["total_lines"] += sum(1 for _ in handle)
                    except Exception:
                        pass
        sizes.sort(key=lambda item: item[1], reverse=True)
        metrics["largest_files"] = [
            f"{str(p)}: {s/1024:.1f}KB" for p, s in sizes[:10]
        ]
    except Exception:
        pass
    return metrics
def print_section(title: str, content: List[str], output_file=None) -> None:
    """Emit a titled report section to *output_file*, or stdout when None.

    A list renders one item per line (or "None found." when empty); a plain
    string is emitted as a single line under the title.
    """
    body = [f"\n=== {title} ==="]
    if isinstance(content, list):
        body.extend(content or ["None found."])
    elif isinstance(content, str):
        body.append(content)
    rendered = '\n'.join(body) + '\n'
    if output_file:
        output_file.write(rendered)
    else:
        print(rendered, end='')
def main():
    """Run the full project scan and emit every report section.

    Returns 0 on success, 1 on error (exit-code convention of the script).
    Fix: the environment-template section previously printed a literal
    "--- (unknown) ---" header for every template; it now shows the filename.
    """
    args = parse_args()
    output_file = None
    if args.output:
        # Ensure the destination directory exists before opening the file.
        Path(args.output).parent.mkdir(parents=True, exist_ok=True)
        output_file = open(args.output, 'w', encoding='utf-8')
        print(f"Writing output to: {args.output}", file=sys.stderr)
    try:
        # Directory tree
        print_section(
            f"DIRECTORY TREE (max depth {TREE_MAX_DEPTH}, source files only)",
            get_directory_tree(),
            output_file
        )

        # Stack detection: preview every manifest found in the root.
        manifests = find_manifest_files()
        if manifests:
            manifest_content = [""]
            for manifest in manifests:
                manifest_content.append(f"--- {manifest} ---")
                if manifest == "bun.lockb":
                    # Binary lockfile — no useful text preview.
                    manifest_content.append("[Binary lockfile — see package.json for dependency details.]")
                else:
                    manifest_content.append(read_file_preview(Path(manifest)))
            print_section("STACK DETECTION (manifest files)", manifest_content, output_file)
        else:
            print_section("STACK DETECTION (manifest files)", ["No recognized manifest files found in project root."], output_file)

        # Entry points
        entries = find_entry_points()
        if entries:
            print_section("ENTRY POINTS", [f"Found: {e}" for e in entries], output_file)
        else:
            print_section("ENTRY POINTS", ["No common entry points found. Check 'main' or 'scripts.start' in manifest files above."], output_file)

        # Linting config
        lint = find_lint_config()
        if lint:
            print_section("LINTING AND FORMATTING CONFIG", [f"Found: {name}" for name in lint], output_file)
        else:
            print_section("LINTING AND FORMATTING CONFIG", ["No linting or formatting config files found in project root."], output_file)

        # Environment templates
        envs = find_env_templates()
        if envs:
            env_content = []
            for filename, filepath in envs:
                # Header names the template file (was a "(unknown)" placeholder).
                env_content.append(f"--- {filename} ---")
                env_content.append(read_file_preview(filepath))
            print_section("ENVIRONMENT VARIABLE TEMPLATES", env_content, output_file)
        else:
            print_section("ENVIRONMENT VARIABLE TEMPLATES", ["No .env.example or .env.template found. Identify required environment variables by searching the code and config for environment variable reads."], output_file)

        # TODOs
        todos = search_todos()
        print_section(
            "TODO / FIXME / HACK (production code only, test dirs excluded)",
            todos if todos else ["None found."],
            output_file
        )

        # Git info
        if is_git_repo():
            commits = get_git_commits()
            print_section("GIT RECENT COMMITS (last 20)",
                          commits if commits else ["No commits found."], output_file)
            churn = get_git_churn()
            print_section("HIGH-CHURN FILES (last 90 days, top 20)",
                          churn if churn else ["None found."], output_file)
        else:
            print_section("GIT RECENT COMMITS (last 20)", ["Not a git repository or no commits yet."], output_file)
            print_section("HIGH-CHURN FILES (last 90 days, top 20)", ["Not a git repository."], output_file)

        # Monorepo detection
        monorepo = detect_monorepo()
        print_section("MONOREPO SIGNALS",
                      monorepo if monorepo else ["No monorepo signals detected."], output_file)

        # Code metrics
        metrics = collect_code_metrics()
        metrics_output = [
            f"Total files scanned: {metrics['total_files']}",
            f"Total lines of code: {metrics['total_lines']}",
            ""
        ]
        if metrics["by_language"]:
            metrics_output.append("Files by language:")
            for lang, count in sorted(metrics["by_language"].items(), key=lambda x: x[1], reverse=True):
                metrics_output.append(f"  {lang}: {count}")
        if metrics["largest_files"]:
            metrics_output.append("")
            metrics_output.append("Top 10 largest files:")
            metrics_output.extend(metrics["largest_files"])
        print_section("CODE METRICS", metrics_output, output_file)

        # CI/CD detection
        ci_cd = detect_ci_cd_pipelines()
        print_section("CI/CD PIPELINES",
                      ci_cd if ci_cd else ["No CI/CD pipelines detected."], output_file)

        # Container detection
        containers = detect_containers()
        print_section("CONTAINERS & ORCHESTRATION",
                      containers if containers else ["No containerization configs detected."], output_file)

        # Security configs
        security = detect_security_configs()
        print_section("SECURITY & COMPLIANCE",
                      security if security else ["No security configs detected."], output_file)

        # Performance markers
        performance = detect_performance_markers()
        print_section("PERFORMANCE & TESTING",
                      performance if performance else ["No performance testing configs detected."], output_file)

        # Final message
        final_msg = "\n=== SCAN COMPLETE ===\n"
        if output_file:
            output_file.write(final_msg)
        else:
            print(final_msg, end='')
        return 0
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    finally:
        if output_file:
            output_file.close()


if __name__ == "__main__":
    sys.exit(main())