add mini-context-graph skill (#1580)

* add mini-context-graph skill

* remove pycache files

* filename case update to SKILL.md

* update readme
Nixon Kurian
2026-05-05 09:34:37 +05:30
committed by GitHub
parent 1f96bce626
commit 746ba555b6
16 changed files with 2343 additions and 0 deletions
config.py
@@ -0,0 +1,23 @@
"""
config.py — Global configuration constants for the Context Graph Skill.
Data directories are resolved from environment variables so the skill can be
used from any project without writing data inside the skill package itself.
MINI_CONTEXT_GRAPH_DATA_DIR — where graph.json, index.json, etc. live
MINI_CONTEXT_GRAPH_WIKI_DIR — where wiki pages, index.md, and log.md live
Both default to subdirectories of the current working directory when the env
vars are not set, so data ends up in the consuming project's directory.
"""
import os
from pathlib import Path
_BASE = Path(os.environ.get("MINI_CONTEXT_GRAPH_BASE", str(Path.cwd())))
DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(_BASE / "data")))
WIKI_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_WIKI_DIR", str(_BASE / "wiki")))
MAX_GRAPH_DEPTH: int = 2
MIN_CONFIDENCE: float = 0.6
MAX_NODES: int = 50
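# Illustrative override (a sketch only; the paths are invented): the consuming
# project can point the skill at its own directories before the skill modules
# are imported, either from the shell or from Python:
#
#   export MINI_CONTEXT_GRAPH_BASE=/path/to/my-project
#   # or, finer-grained:
#   export MINI_CONTEXT_GRAPH_DATA_DIR=/path/to/my-project/.context-graph/data
#   export MINI_CONTEXT_GRAPH_WIKI_DIR=/path/to/my-project/.context-graph/wiki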
contextgraph.py
@@ -0,0 +1,296 @@
"""
contextgraph.py — Main interface for the Context Graph Skill.
This file is orchestration-only. All LLM reasoning lives in the .md files.
Python here only wires together the deterministic storage and retrieval tools.
Agent usage:
- ingest(): agent reads ingestion.md + ontology.md, extracts entities/relations,
then calls the tool methods directly.
- query(): agent reads retrieval.md, calls index_store.search + retrieval_engine.retrieve,
then calls graph_store.get_subgraph and returns the result.
"""
from __future__ import annotations
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
import config
from tools import graph_store, index_store, ontology_store, retrieval_engine, documents_store
class ContextGraphSkill:
def ingest(self, documents: list[str]) -> None:
"""
Orchestration entry point for ingesting documents into the context graph.
The agent (Copilot) MUST:
1. Read ingestion.md to understand entity/relation extraction rules.
2. Read ontology.md to apply type normalization.
3. For each document, produce a JSON with entities + relations.
4. For each entity:
- ontology_store.add_type(entity["type"])
- node_id = graph_store.add_node(entity["name"], entity["type"])
- index_store.add_entity(entity["name"], node_id)
5. For each relation (if confidence >= MIN_CONFIDENCE):
- ontology_store.add_relation(relation["type"])
- source_id = graph_store.find_node_by_name(relation["source"])
- target_id = graph_store.find_node_by_name(relation["target"])
- graph_store.add_edge(source_id, target_id, relation["type"], relation["confidence"])
This method does NOT call any LLM. It documents the agent contract only.
"""
raise NotImplementedError(
"ingest() must be driven by the Copilot agent following ingestion.md. "
"Call the tool methods directly after LLM extraction."
)
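# Illustrative agent-side loop (a sketch, not part of the contract): assuming
# `skill` is a ContextGraphSkill instance and `extraction` is the JSON the LLM
# produced from ingestion.md + ontology.md, the agent can persist it with the
# convenience wrappers below:
#
#   for entity in extraction["entities"]:
#       skill.add_node(entity["name"], entity["type"])
#   for rel in extraction["relations"]:
#       skill.add_edge(rel["source"], rel["target"], rel["type"], rel["confidence"])
#
# add_edge() silently skips relations below MIN_CONFIDENCE and relations whose
# endpoints are not in the graph yet.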
def query(self, query: str) -> dict:
"""
Orchestration entry point for retrieving a subgraph for a query.
The agent (Copilot) MUST:
1. Read retrieval.md to understand the retrieval strategy.
2. Call index_store.search(query) to get seed node_ids.
3. Call retrieval_engine.retrieve(seed_ids, depth=MAX_GRAPH_DEPTH) to expand.
4. Call graph_store.get_subgraph(node_ids) to build the result.
5. Return the subgraph dict.
This method does NOT call any LLM; the retrieval below is deterministic and can be called directly.
Returns an empty subgraph when no seed nodes match the query.
"""
seed_ids = index_store.search(query)
if not seed_ids:
return {"nodes": {}, "edges": []}
node_ids = retrieval_engine.retrieve(
seed_ids,
depth=config.MAX_GRAPH_DEPTH,
min_confidence=config.MIN_CONFIDENCE,
max_nodes=config.MAX_NODES,
)
return graph_store.get_subgraph(node_ids)
# ------------------------------------------------------------------
# Convenience wrappers — agents may call these directly
# ------------------------------------------------------------------
def add_node(self, name: str, node_type: str) -> str:
"""Add a node to the graph and index. Returns node_id."""
canonical_type = ontology_store.normalize_type(node_type)
ontology_store.add_type(canonical_type)
node_id = graph_store.add_node(name, canonical_type)
index_store.add_entity(name, node_id)
return node_id
def add_edge(
self, source_name: str, target_name: str, relation: str, confidence: float
) -> None:
"""Add an edge between two nodes (by name) if both exist and confidence qualifies."""
if confidence < config.MIN_CONFIDENCE:
return
source_id = graph_store.find_node_by_name(source_name)
target_id = graph_store.find_node_by_name(target_name)
if source_id is None or target_id is None:
return
canonical_relation = ontology_store.normalize_relation(relation)
ontology_store.add_relation(canonical_relation)
graph_store.add_edge(source_id, target_id, canonical_relation, confidence)
# ------------------------------------------------------------------
# LLM Wiki + RAG methods — store raw content & provenance
# ------------------------------------------------------------------
def ingest_with_content(
self,
doc_id: str,
title: str,
source: str,
raw_content: str,
entities: list[dict],
relations: list[dict],
) -> dict:
"""
Full RAG ingestion: stores raw document + chunks, then wires provenance
links from each graph node/edge back to source chunks.
The agent MUST:
1. Read the raw_content.
2. Read ingestion.md and ontology.md for extraction rules.
3. Extract entities and relations (LLM reasoning step).
4. Call this method with the results.
Args:
doc_id: Stable document identifier (e.g. "doc_001").
title: Human-readable document title.
source: Origin path or URL (immutable, never modified).
raw_content: Full text of the document.
entities: List of dicts: [{name, type, supporting_text?}, ...]
relations: List of dicts: [{source, target, type, confidence,
supporting_text?, chunk_hint?}, ...]
Returns:
Summary dict: {doc_id, chunk_count, nodes_added, edges_added}
"""
# Step 1: Store raw document and auto-chunk
doc = documents_store.add_document(doc_id, title, source, raw_content)
chunks = doc["chunks"]
def _find_best_chunk(text: str) -> str | None:
"""Find the chunk whose text most overlaps with the given span."""
if not text or not chunks:
return None
text_lower = text.lower()
best_chunk_id = None
best_score = 0
for chunk in chunks:
if text_lower in chunk["text"].lower():
return chunk["chunk_id"]
# Fallback: count overlapping words
words_text = set(text_lower.split())
words_chunk = set(chunk["text"].lower().split())
score = len(words_text & words_chunk)
if score > best_score:
best_score = score
best_chunk_id = chunk["chunk_id"]
return best_chunk_id
nodes_added = 0
# Step 2: Ingest entities with provenance
for entity in entities:
supporting = entity.get("supporting_text", "")
chunk_id = _find_best_chunk(supporting)
chunk_ids = [chunk_id] if chunk_id else []
canonical_type = ontology_store.normalize_type(entity["type"])
ontology_store.add_type(canonical_type)
node_id = graph_store.add_node(
entity["name"],
canonical_type,
source_document=doc_id,
source_chunks=chunk_ids,
)
index_store.add_entity(entity["name"], node_id)
nodes_added += 1
edges_added = 0
# Step 3: Ingest relations with provenance
for rel in relations:
if rel.get("confidence", 0) < config.MIN_CONFIDENCE:
continue
supporting = rel.get("supporting_text", "")
chunk_id = _find_best_chunk(supporting) or rel.get("chunk_hint")
source_id = graph_store.find_node_by_name(rel["source"])
target_id = graph_store.find_node_by_name(rel["target"])
if source_id is None or target_id is None:
continue
canonical_relation = ontology_store.normalize_relation(rel["type"])
ontology_store.add_relation(canonical_relation)
graph_store.add_edge(
source_id,
target_id,
canonical_relation,
rel["confidence"],
source_document=doc_id,
supporting_text=supporting or None,
chunk_id=chunk_id,
)
edges_added += 1
return {
"doc_id": doc_id,
"chunk_count": len(chunks),
"nodes_added": nodes_added,
"edges_added": edges_added,
}
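# Illustrative call (a sketch only; ids, titles and text spans are invented, and
# `report_text` stands for the raw document text the agent has already read):
#
#   skill.ingest_with_content(
#       doc_id="doc_001",
#       title="Crash report",
#       source="reports/crash.md",
#       raw_content=report_text,
#       entities=[
#           {"name": "memory leak", "type": "issue",
#            "supporting_text": "memory leaks occur when objects are not released"},
#           {"name": "system crash", "type": "issue",
#            "supporting_text": "system crashes due to memory leaks"},
#       ],
#       relations=[
#           {"source": "memory leak", "target": "system crash", "type": "causes",
#            "confidence": 0.95,
#            "supporting_text": "system crashes due to memory leaks"},
#       ],
#   )
#   # -> {"doc_id": "doc_001", "chunk_count": 1, "nodes_added": 2, "edges_added": 1}
#   #    (chunk_count depends on the length of report_text)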
def query_with_evidence(self, query: str) -> dict:
"""
Query the graph and return the subgraph together with supporting
source documents and chunks (evidence chain).
Returns:
{
"query": str,
"subgraph": {"nodes": {...}, "edges": [...]},
"supporting_documents": [
{
"doc_id": str,
"doc_title": str,
"supporting_chunks": [{"chunk_id": str, "text": str}, ...]
}
],
"evidence_chain": str # human-readable summary path
}
"""
subgraph = self.query(query)
if not subgraph["nodes"]:
return {
"query": query,
"subgraph": subgraph,
"supporting_documents": [],
"evidence_chain": "No matching nodes found.",
}
# Collect all provenance pointers from nodes and edges
docs_chunks: dict[str, list[str]] = {} # doc_id -> [chunk_ids]
for node in subgraph["nodes"].values():
doc_id = node.get("source_document")
if doc_id:
docs_chunks.setdefault(doc_id, [])
docs_chunks[doc_id].extend(node.get("source_chunks") or [])
for edge in subgraph["edges"]:
doc_id = edge.get("source_document")
if doc_id:
docs_chunks.setdefault(doc_id, [])
if edge.get("chunk_id"):
docs_chunks[doc_id].append(edge["chunk_id"])
# Resolve chunk texts from documents_store
supporting_documents = []
for doc_id, chunk_ids in docs_chunks.items():
doc = documents_store.get_document(doc_id)
if doc is None:
continue
seen = set()
chunks_out = []
for cid in chunk_ids:
if cid in seen:
continue
seen.add(cid)
chunk = documents_store.get_chunk(cid)
if chunk:
chunks_out.append({"chunk_id": cid, "text": chunk["text"]})
if chunks_out:
supporting_documents.append({
"doc_id": doc_id,
"doc_title": doc["title"],
"supporting_chunks": chunks_out,
})
# Build a simple evidence chain string
chain_parts = []
for edge in subgraph["edges"]:
src_node = subgraph["nodes"].get(edge["source"], {})
tgt_node = subgraph["nodes"].get(edge["target"], {})
src_name = src_node.get("name", edge["source"])
tgt_name = tgt_node.get("name", edge["target"])
chain_parts.append(f"{src_name} --[{edge['type']}]--> {tgt_name}")
evidence_chain = " | ".join(chain_parts) if chain_parts else "No edges in subgraph."
return {
"query": query,
"subgraph": subgraph,
"supporting_documents": supporting_documents,
"evidence_chain": evidence_chain,
}
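# Illustrative end-to-end sketch (names are invented; assumes a document was
# previously ingested via ingest_with_content so provenance exists):
#
#   skill = ContextGraphSkill()
#   result = skill.query_with_evidence("why does the system crash?")
#   print(result["evidence_chain"])   # e.g. "memory leak --[causes]--> system crash"
#   for doc in result["supporting_documents"]:
#       print(doc["doc_title"], [c["chunk_id"] for c in doc["supporting_chunks"]])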
template_agent_workflow.py
@@ -0,0 +1,198 @@
"""
template_agent_workflow.py — Template agent script for ingesting + querying the context graph.
This script demonstrates the complete workflow an agent should follow:
1. Read markdown guidance files
2. Extract entities/relations via LLM reasoning
3. Call Python methods to persist
4. Query the graph
5. Handle errors gracefully
Copy and adapt this template for your agent implementation.
"""
import json
import sys
from pathlib import Path
# Add the skill directory to the import path so contextgraph can be imported
sys.path.insert(0, str(Path(__file__).parent))
from contextgraph import ContextGraphSkill
def ingest_document(skill: ContextGraphSkill, document: str) -> dict:
"""
Step 1: Agent reads ingestion.md and ontology.md
Step 2: Agent uses LLM to extract entities and relations
Step 3: Call Python methods to persist the results (the extraction itself is mocked below with a static result)
In a real agent, replace the static extraction with LLM calls.
"""
print(f"\n[INGEST] Processing document:\n{document}\n")
# --- STEP 1 & 2: LLM EXTRACTION PHASE (Guided by ingestion.md + ontology.md) ---
# In a real agent, this would use LLM reasoning.
# For now, we'll mock an extraction result:
extraction_result = {
"entities": [
{"name": "memory leak", "type": "issue"},
{"name": "system crash", "type": "issue"},
{"name": "object", "type": "component"},
],
"relations": [
{
"source": "memory leak",
"target": "system crash",
"type": "causes",
"confidence": 1.0,
},
{
"source": "object",
"target": "memory leak",
"type": "contributes to",
"confidence": 0.9,
},
],
}
print(f"[LLM] Extracted entities + relations:")
print(json.dumps(extraction_result, indent=2))
# --- STEP 3: PERSIST PHASE (Call Python methods) ---
errors = []
added_nodes = {}
edges_added = 0
for entity in extraction_result["entities"]:
try:
node_id = skill.add_node(entity["name"], entity["type"])
added_nodes[entity["name"]] = node_id
print(f" ✓ Added node: {entity['name']} (id: {node_id}, type: {entity['type']})")
except Exception as e:
errors.append(f"Failed to add node {entity['name']}: {e}")
print(f" ✗ Error adding node {entity['name']}: {e}")
for relation in extraction_result["relations"]:
# Validate both endpoints exist
if relation["source"] not in added_nodes or relation["target"] not in added_nodes:
error_msg = f"Cannot add edge: source or target missing"
errors.append(error_msg)
print(f" ✗ Skip edge {relation['source']}{relation['target']}: {error_msg}")
continue
# Validate confidence threshold
if relation["confidence"] < 0.6:
error_msg = f"Confidence {relation['confidence']} < 0.6 (minimum threshold)"
errors.append(error_msg)
print(f" ✗ Skip edge {relation['source']}{relation['target']}: {error_msg}")
continue
try:
skill.add_edge(
source_name=relation["source"],
target_name=relation["target"],
relation=relation["type"],
confidence=relation["confidence"],
)
edges_added += 1
print(
f" ✓ Added edge: {relation['source']} "
f"--[{relation['type']}]→ {relation['target']} "
f"(confidence: {relation['confidence']})"
)
except Exception as e:
errors.append(f"Failed to add edge {relation['source']} → {relation['target']}: {e}")
print(f" ✗ Error adding edge: {e}")
return {
"success": len(errors) == 0,
"nodes_added": len(added_nodes),
"edges_added": edges_added,
"errors": errors,
}
def query_graph(skill: ContextGraphSkill, query: str) -> dict:
"""
Query the graph for context to answer the user's question.
Step 1: Read retrieval.md
Step 2: Call skill.query() which internally handles BFS + subgraph extraction
Step 3: Return structured context
"""
print(f"\n[QUERY] {query}\n")
try:
subgraph = skill.query(query)
if not subgraph["nodes"]:
print(" No relevant entities found in graph.")
return {
"success": True,
"query": query,
"subgraph": subgraph,
"nodes_found": 0,
"edges_found": 0,
}
print(f" ✓ Retrieved subgraph with {len(subgraph['nodes'])} nodes, {len(subgraph['edges'])} edges")
print(f"\n Nodes:")
for node_id, node in subgraph["nodes"].items():
print(f" - {node['name']} (type: {node['type']}, id: {node_id})")
print(f"\n Edges:")
for edge in subgraph["edges"]:
source_name = subgraph["nodes"][edge["source"]]["name"]
target_name = subgraph["nodes"][edge["target"]]["name"]
print(
f" - {source_name} --[{edge['type']}]→ {target_name} "
f"(confidence: {edge['confidence']})"
)
return {
"success": True,
"query": query,
"subgraph": subgraph,
"nodes_found": len(subgraph["nodes"]),
"edges_found": len(subgraph["edges"]),
}
except Exception as e:
error_msg = f"Query failed: {e}"
print(f"{error_msg}")
return {"success": False, "query": query, "error": error_msg}
def main():
"""Demo: ingest a document, then query the graph."""
skill = ContextGraphSkill()
# ===== INGESTION =====
document = """
System crashes due to memory leaks.
Memory leaks occur when objects are not released.
"""
result = ingest_document(skill, document)
print(f"\n[INGEST RESULT] Nodes added: {result['nodes_added']}, " f"Edges added: {result['edges_added']}")
if result["errors"]:
print(f"Errors: {result['errors']}")
# ===== RETRIEVAL =====
queries = [
"Why does the system crash?",
"What causes memory leaks?",
]
for query in queries:
result = query_graph(skill, query)
if result["success"]:
print(f" Nodes found: {result['nodes_found']}, Edges found: {result['edges_found']}")
else:
print(f" Error: {result['error']}")
if __name__ == "__main__":
main()
documents_store.py
@@ -0,0 +1,191 @@
"""
documents_store.py — Persistent storage for raw documents and chunks (RAG layer).
Inspired by Karpathy's LLM Wiki pattern: raw sources are immutable and stored
as the ground truth. Chunks are the retrieval unit; provenance links tie graph
nodes/edges back to specific chunks.
Handles:
- Storing raw documents with metadata
- Chunking documents into overlapping text windows
- Retrieving chunks by id or by keyword search
- Persisting to data/documents.json
"""
from __future__ import annotations
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import config
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_DOCS_FILE = _DATA_DIR / "documents.json"
_CHUNK_SIZE = 500 # characters per chunk
_CHUNK_OVERLAP = 100 # overlap between consecutive chunks
_STOPWORDS = frozenset([
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
"have", "has", "had", "do", "does", "did", "will", "would", "could",
"should", "may", "might", "shall", "can", "to", "of", "in", "on",
"at", "by", "for", "with", "from", "and", "or", "but", "not", "it",
"its", "this", "that", "these", "those", "i", "you", "he", "she",
"we", "they", "what", "which", "who", "how", "why", "when", "where",
])
def _load() -> dict:
if _DOCS_FILE.exists():
with open(_DOCS_FILE, "r") as f:
return json.load(f)
return {"documents": {}}
def _save(store: dict) -> None:
_DATA_DIR.mkdir(parents=True, exist_ok=True)
with open(_DOCS_FILE, "w") as f:
json.dump(store, f, indent=2)
def _tokenize(text: str) -> list[str]:
tokens = re.findall(r"[a-z0-9]+", text.lower())
return [t for t in tokens if t not in _STOPWORDS and len(t) > 1]
def _chunk_text(content: str, chunk_size: int = _CHUNK_SIZE, overlap: int = _CHUNK_OVERLAP) -> list[str]:
"""Split content into overlapping character windows."""
chunks = []
start = 0
while start < len(content):
end = start + chunk_size
chunks.append(content[start:end].strip())
if end >= len(content):
break
start += chunk_size - overlap
return [c for c in chunks if c]
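# Worked example (illustrative): a 1,200-character document produces windows
# starting at offsets 0, 400 and 800 (chunk_size 500, overlap 100), i.e. three
# chunks of at most 500 characters, with 100 characters shared between
# consecutive chunks.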
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def add_document(
doc_id: str,
title: str,
source: str,
content: str,
) -> dict:
"""
Store a raw document and auto-generate chunks.
Args:
doc_id: Caller-supplied stable identifier (e.g. "doc_001" or a filename).
title: Human-readable title.
source: Origin path/URL (immutable provenance pointer).
content: Full raw text to store and chunk.
Returns:
The stored document dict including generated chunk_ids.
"""
store = _load()
# Idempotent: return existing doc if already stored
if doc_id in store["documents"]:
return store["documents"][doc_id]
raw_chunks = _chunk_text(content)
chunks = []
for i, text in enumerate(raw_chunks):
chunks.append({
"chunk_id": f"{doc_id}_chunk_{i:03d}",
"index": i,
"text": text,
})
doc = {
"id": doc_id,
"title": title,
"source": source,
"content": content,
"chunks": chunks,
"ingestion_date": datetime.now(timezone.utc).isoformat(),
}
store["documents"][doc_id] = doc
_save(store)
return doc
def get_document(doc_id: str) -> dict | None:
"""Return the full document record or None if not found."""
store = _load()
return store["documents"].get(doc_id)
def get_chunk(chunk_id: str) -> dict | None:
"""Return a specific chunk by its chunk_id (searches across all documents)."""
store = _load()
for doc in store["documents"].values():
for chunk in doc["chunks"]:
if chunk["chunk_id"] == chunk_id:
return chunk
return None
def get_chunks_for_document(doc_id: str) -> list[dict]:
"""Return all chunks for a document."""
doc = get_document(doc_id)
if doc is None:
return []
return doc["chunks"]
def search_chunks(query: str, top_k: int = 5) -> list[dict]:
"""
Keyword search over chunk text. Returns top_k matching chunks sorted by
term overlap (simple TF-style scoring, no embeddings required).
Returns list of dicts with keys: chunk_id, doc_id, score, text.
"""
store = _load()
query_tokens = set(_tokenize(query))
if not query_tokens:
return []
scored: list[tuple[float, dict]] = []
for doc in store["documents"].values():
for chunk in doc["chunks"]:
chunk_tokens = set(_tokenize(chunk["text"]))
overlap = len(query_tokens & chunk_tokens)
if overlap > 0:
score = overlap / len(query_tokens)
scored.append((score, {
"chunk_id": chunk["chunk_id"],
"doc_id": doc["id"],
"doc_title": doc["title"],
"score": round(score, 4),
"text": chunk["text"],
}))
scored.sort(key=lambda x: x[0], reverse=True)
return [item for _, item in scored[:top_k]]
def list_documents() -> list[dict]:
"""Return a summary list of all stored documents (no content, no chunks)."""
store = _load()
return [
{
"id": doc["id"],
"title": doc["title"],
"source": doc["source"],
"chunk_count": len(doc["chunks"]),
"ingestion_date": doc["ingestion_date"],
}
for doc in store["documents"].values()
]
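# Illustrative usage (doc id, title, source and text are invented for the example):
#
#   doc = add_document("doc_001", "Crash report", "reports/crash.md",
#                      "System crashes due to memory leaks. Memory leaks occur "
#                      "when objects are not released.")
#   doc["chunks"][0]["chunk_id"]        # -> "doc_001_chunk_000"
#   search_chunks("memory leak", top_k=3)
#   # -> [{"chunk_id": "doc_001_chunk_000", "doc_id": "doc_001",
#   #      "doc_title": "Crash report", "score": ..., "text": "..."}]
#   # Note: matching is on exact tokens, so "leak" does not match "leaks".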
graph_store.py
@@ -0,0 +1,202 @@
"""
graph_store.py — Persistent storage for graph nodes and edges.
Handles:
- Adding/deduplicating nodes
- Adding edges with confidence
- Fetching neighbors
- Persisting to graph.json
"""
from __future__ import annotations
import json
import os
import sys
import uuid
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import config
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_GRAPH_FILE = _DATA_DIR / "graph.json"
def _load() -> dict:
if _GRAPH_FILE.exists():
with open(_GRAPH_FILE, "r") as f:
return json.load(f)
return {"nodes": {}, "edges": []}
def _save(graph: dict) -> None:
_DATA_DIR.mkdir(parents=True, exist_ok=True)
with open(_GRAPH_FILE, "w") as f:
json.dump(graph, f, indent=2)
def add_node(
name: str,
node_type: str,
source_document: str | None = None,
source_chunks: list[str] | None = None,
) -> str:
"""
Add a node if it doesn't exist. Returns node_id.
Args:
source_document: doc_id from documents_store (provenance pointer).
source_chunks: list of chunk_ids that mention this entity.
"""
graph = _load()
name_lower = name.strip().lower()
# Deduplication: search by normalized name
for node_id, node in graph["nodes"].items():
if node["name"] == name_lower:
# Merge provenance if new info provided
changed = False
if source_document and node.get("source_document") is None:
node["source_document"] = source_document
changed = True
if source_chunks:
existing = set(node.get("source_chunks") or [])
new_chunks = set(source_chunks) - existing
if new_chunks:
node["source_chunks"] = sorted(existing | new_chunks)
changed = True
if changed:
_save(graph)
return node_id
node_id = str(uuid.uuid4())[:8]
graph["nodes"][node_id] = {
"name": name_lower,
"type": node_type.strip().lower(),
"source_document": source_document,
"source_chunks": source_chunks or [],
}
_save(graph)
return node_id
def add_edge(
source_id: str,
target_id: str,
relation: str,
confidence: float,
source_document: str | None = None,
supporting_text: str | None = None,
chunk_id: str | None = None,
) -> None:
"""
Add a directed edge between two nodes.
Args:
source_document: doc_id from documents_store (provenance pointer).
supporting_text: The exact text span that supports this relation.
chunk_id: The specific chunk_id the supporting text came from.
"""
graph = _load()
# Deduplicate edges by source + target + relation
relation_lower = relation.strip().lower()
for edge in graph["edges"]:
if (
edge["source"] == source_id
and edge["target"] == target_id
and edge["type"] == relation_lower
):
changed = False
if confidence > edge["confidence"]:
edge["confidence"] = confidence
changed = True
if source_document and edge.get("source_document") is None:
edge["source_document"] = source_document
changed = True
if supporting_text and edge.get("supporting_text") is None:
edge["supporting_text"] = supporting_text
changed = True
if chunk_id and edge.get("chunk_id") is None:
edge["chunk_id"] = chunk_id
changed = True
if changed:
_save(graph)
return
graph["edges"].append({
"source": source_id,
"target": target_id,
"type": relation_lower,
"confidence": confidence,
"source_document": source_document,
"supporting_text": supporting_text,
"chunk_id": chunk_id,
})
_save(graph)
def get_neighbors(node_id: str, min_confidence: float = 0.0) -> list[str]:
"""Return node_ids of all neighbors reachable from node_id."""
graph = _load()
neighbors = []
for edge in graph["edges"]:
if edge["confidence"] < min_confidence:
continue
if edge["source"] == node_id:
neighbors.append(edge["target"])
elif edge["target"] == node_id:
neighbors.append(edge["source"])
return list(set(neighbors))
def get_node(node_id: str) -> dict | None:
"""Fetch a single node by ID."""
graph = _load()
return graph["nodes"].get(node_id)
def get_subgraph(node_ids: list[str]) -> dict:
"""Return nodes and edges induced by the given node_ids."""
graph = _load()
node_id_set = set(node_ids)
nodes = {nid: graph["nodes"][nid] for nid in node_ids if nid in graph["nodes"]}
edges = [
e
for e in graph["edges"]
if e["source"] in node_id_set and e["target"] in node_id_set
]
return {"nodes": nodes, "edges": edges}
def find_node_by_name(name: str) -> str | None:
"""Return node_id for a given normalized name, or None."""
graph = _load()
name_lower = name.strip().lower()
for node_id, node in graph["nodes"].items():
if node["name"] == name_lower:
return node_id
return None
def link_node_to_source(node_id: str, doc_id: str, chunk_ids: list[str]) -> None:
"""Attach provenance (doc_id + chunk_ids) to an existing node."""
graph = _load()
if node_id not in graph["nodes"]:
return
node = graph["nodes"][node_id]
node["source_document"] = doc_id
existing = set(node.get("source_chunks") or [])
node["source_chunks"] = list(existing | set(chunk_ids))
_save(graph)
def get_node_sources(node_id: str) -> dict:
"""Return provenance info (source_document + source_chunks) for a node."""
graph = _load()
node = graph["nodes"].get(node_id, {})
return {
"source_document": node.get("source_document"),
"source_chunks": node.get("source_chunks", []),
}
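# Illustrative usage (a sketch; node ids are random 8-character strings):
#
#   leak_id = add_node("Memory Leak", "issue")    # name is stored lowercased
#   crash_id = add_node("System Crash", "issue")
#   add_node("memory leak", "issue")              # dedup: returns leak_id again
#   add_edge(leak_id, crash_id, "causes", 0.9)
#   get_subgraph([leak_id, crash_id])
#   # -> {"nodes": {leak_id: {...}, crash_id: {...}},
#   #     "edges": [{"source": leak_id, "target": crash_id, "type": "causes", ...}]}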
index_store.py
@@ -0,0 +1,90 @@
"""
index_store.py — Maintains entity and keyword indexes for fast lookup.
Handles:
- Entity index: name → [node_ids]
- Keyword index: token → [node_ids]
- Persist to index.json
"""
from __future__ import annotations
import json
import os
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import config
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_INDEX_FILE = _DATA_DIR / "index.json"
_STOPWORDS = frozenset(
[
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
"have", "has", "had", "do", "does", "did", "will", "would", "could",
"should", "may", "might", "shall", "can", "to", "of", "in", "on",
"at", "by", "for", "with", "from", "and", "or", "but", "not", "it",
"its", "this", "that", "these", "those", "i", "you", "he", "she",
"we", "they", "what", "which", "who", "how", "why", "when", "where",
]
)
def _load() -> dict:
if _INDEX_FILE.exists():
with open(_INDEX_FILE, "r") as f:
return json.load(f)
return {"entity_index": {}, "keyword_index": {}}
def _save(index: dict) -> None:
_DATA_DIR.mkdir(parents=True, exist_ok=True)
with open(_INDEX_FILE, "w") as f:
json.dump(index, f, indent=2)
def _tokenize(text: str) -> list[str]:
"""Split text into lowercase tokens, removing stopwords and short tokens."""
tokens = re.findall(r"[a-z0-9]+", text.lower())
return [t for t in tokens if t not in _STOPWORDS and len(t) > 1]
def add_entity(name: str, node_id: str) -> None:
"""Register an entity name → node_id in both entity and keyword indexes."""
index = _load()
name_lower = name.strip().lower()
# Entity index
if name_lower not in index["entity_index"]:
index["entity_index"][name_lower] = []
if node_id not in index["entity_index"][name_lower]:
index["entity_index"][name_lower].append(node_id)
# Keyword index
for token in _tokenize(name_lower):
if token not in index["keyword_index"]:
index["keyword_index"][token] = []
if node_id not in index["keyword_index"][token]:
index["keyword_index"][token].append(node_id)
_save(index)
def search(query: str) -> list[str]:
"""Search for node_ids matching the query via entity name or keywords."""
index = _load()
query_lower = query.strip().lower()
matched_ids: set[str] = set()
# Exact entity name match
if query_lower in index["entity_index"]:
matched_ids.update(index["entity_index"][query_lower])
# Keyword match
for token in _tokenize(query_lower):
if token in index["keyword_index"]:
matched_ids.update(index["keyword_index"][token])
return list(matched_ids)
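# Illustrative usage (the node id is invented):
#
#   add_entity("system crash", "a1b2c3d4")
#   search("system crash")                   # exact entity match -> ["a1b2c3d4"]
#   search("Why does the system crash?")     # keyword match on "system" / "crash"
#   # stopwords such as "why", "does" and "the" are ignored by the tokenizer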
ontology_store.py
@@ -0,0 +1,175 @@
"""
ontology_store.py — Tracks entity types and relation types.
Handles:
- Registering types and relations with usage counts
- Normalizing types and relations via synonym mapping
- Persisting to ontology.json
NOTE: No LLM logic here. Normalization is rule-based (lowercase + synonym map).
"""
import json
import os
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import config
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_ONTOLOGY_FILE = _DATA_DIR / "ontology.json"
# Synonym maps — lowercase variants map to canonical forms
_ENTITY_TYPE_MAP: dict[str, str] = {
"component": "component",
"module": "component",
"class": "component",
"function": "component",
"method": "component",
"bug": "issue",
"defect": "issue",
"fault": "issue",
"error": "issue",
"failure": "issue",
"problem": "issue",
"crash": "issue",
"server": "infrastructure",
"host": "infrastructure",
"machine": "infrastructure",
"node": "infrastructure",
"user": "actor",
"person": "actor",
"operator": "actor",
"admin": "actor",
"administrator": "actor",
"actor": "actor",
"app": "software",
"application": "software",
"service": "software",
"program": "software",
"software": "software",
"database": "storage",
"datastore": "storage",
"db": "storage",
"storage": "storage",
"api": "interface",
"endpoint": "interface",
"interface": "interface",
"connection": "interface",
"event": "event",
"incident": "event",
"occurrence": "event",
"trigger": "event",
"concept": "concept",
"idea": "concept",
"principle": "concept",
"theory": "concept",
"process": "process",
"thread": "process",
"task": "process",
"job": "process",
"workflow": "process",
"object": "component",
"resource": "component",
"memory": "resource",
"cpu": "resource",
"system": "system",
"platform": "system",
"framework": "system",
"library": "software",
"package": "software",
}
_RELATION_TYPE_MAP: dict[str, str] = {
"causes": "causes",
"triggers": "causes",
"leads to": "causes",
"results in": "causes",
"produces": "causes",
"is part of": "contains",
"belongs to": "contains",
"lives in": "contains",
"sits in": "contains",
"contains": "contains",
"depends on": "depends on",
"requires": "depends on",
"needs": "depends on",
"uses": "uses",
"calls": "uses",
"invokes": "uses",
"consumes": "uses",
"affects": "affects",
"impacts": "affects",
"influences": "affects",
"creates": "creates",
"instantiates": "creates",
"spawns": "creates",
"connects to": "connects to",
"links to": "connects to",
"references": "connects to",
"inherits from": "extends",
"extends": "extends",
"subclasses": "extends",
"reads from": "reads from",
"queries": "reads from",
"fetches": "reads from",
"writes to": "writes to",
"stores in": "writes to",
"persists to": "writes to",
"contributes to": "contributes to",
"allocated by": "allocated by",
"released by": "released by",
"not released": "not released",
}
def _load() -> dict:
if _ONTOLOGY_FILE.exists():
with open(_ONTOLOGY_FILE, "r") as f:
return json.load(f)
return {"entity_types": {}, "relation_types": {}}
def _save(ontology: dict) -> None:
_DATA_DIR.mkdir(parents=True, exist_ok=True)
with open(_ONTOLOGY_FILE, "w") as f:
json.dump(ontology, f, indent=2)
def normalize_type(type_name: str) -> str:
"""Return the canonical form of an entity type."""
key = type_name.strip().lower().replace("-", " ").replace("_", " ")
return _ENTITY_TYPE_MAP.get(key, key)
def normalize_relation(relation_name: str) -> str:
"""Return the canonical form of a relation type."""
key = relation_name.strip().lower().replace("-", " ").replace("_", " ")
return _RELATION_TYPE_MAP.get(key, key)
def add_type(type_name: str) -> None:
"""Register an entity type, incrementing its usage count."""
ontology = _load()
canonical = normalize_type(type_name)
ontology["entity_types"][canonical] = ontology["entity_types"].get(canonical, 0) + 1
_save(ontology)
def add_relation(relation_name: str) -> None:
"""Register a relation type, incrementing its usage count."""
ontology = _load()
canonical = normalize_relation(relation_name)
ontology["relation_types"][canonical] = ontology["relation_types"].get(canonical, 0) + 1
_save(ontology)
def get_all_types() -> dict[str, int]:
"""Return all registered entity types with counts."""
return _load()["entity_types"]
def get_all_relations() -> dict[str, int]:
"""Return all registered relation types with counts."""
return _load()["relation_types"]
retrieval_engine.py
@@ -0,0 +1,58 @@
"""
retrieval_engine.py — BFS-based graph traversal for context retrieval.
Input: seed node_ids + depth
Output: list of node_ids within traversal depth filtered by min_confidence
"""
from __future__ import annotations
import sys
from pathlib import Path
from collections import deque
# Allow imports from parent package
sys.path.insert(0, str(Path(__file__).parent.parent))
from tools import graph_store
import config
def retrieve(
seed_node_ids: list[str],
depth: int = config.MAX_GRAPH_DEPTH,
min_confidence: float = config.MIN_CONFIDENCE,
max_nodes: int = config.MAX_NODES,
) -> list[str]:
"""
BFS from seed nodes up to `depth` hops.
Returns a list of node_ids (including seeds) within the traversal,
filtered by min_confidence on edges and capped at max_nodes.
"""
visited: set[str] = set()
# Queue items: (node_id, current_depth)
queue: deque[tuple[str, int]] = deque()
for seed in seed_node_ids:
if seed not in visited:
visited.add(seed)
queue.append((seed, 0))
while queue:
if len(visited) >= max_nodes:
break
node_id, current_depth = queue.popleft()
if current_depth >= depth:
continue
neighbors = graph_store.get_neighbors(node_id, min_confidence=min_confidence)
for neighbor in neighbors:
if neighbor not in visited:
visited.add(neighbor)
queue.append((neighbor, current_depth + 1))
if len(visited) >= max_nodes:
break
return list(visited)
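# Worked example (illustrative graph, made-up ids): with edges A --causes--> B (0.9),
# B --affects--> C (0.7) and C --uses--> D (0.5), retrieve(["A"], depth=2,
# min_confidence=0.6) visits A (depth 0), B (depth 1) and C (depth 2). D stays out:
# the C->D edge is below min_confidence, and C already sits at the depth limit.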
wiki_store.py
@@ -0,0 +1,294 @@
"""
wiki_store.py — Manages the persistent wiki layer.
Inspired by Karpathy's LLM Wiki pattern: the wiki is a directory of LLM-generated
markdown pages that the agent writes and maintains. This module provides the
deterministic file I/O and index/log management so the agent can focus on
reasoning, not bookkeeping.
Wiki structure (relative to project root):
wiki/
index.md ← content-oriented catalog of all pages
log.md ← chronological append-only operation log
entities/ ← one page per entity (person, concept, system, etc.)
summaries/ ← source document summary pages
topics/ ← cross-cutting synthesis and topic pages
The agent WRITES pages; this module handles the filesystem + index + log.
"""
from __future__ import annotations
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import config
_WIKI_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_WIKI_DIR", str(config.WIKI_DIR)))
_INDEX_FILE = _WIKI_DIR / "index.md"
_LOG_FILE = _WIKI_DIR / "log.md"
_CATEGORY_DIRS = {
"entity": _WIKI_DIR / "entities",
"summary": _WIKI_DIR / "summaries",
"topic": _WIKI_DIR / "topics",
}
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _ensure_dirs() -> None:
_WIKI_DIR.mkdir(parents=True, exist_ok=True)
for d in _CATEGORY_DIRS.values():
d.mkdir(parents=True, exist_ok=True)
def _now_iso() -> str:
return datetime.now(timezone.utc).strftime("%Y-%m-%d")
def _slug(title: str) -> str:
"""Convert a title to a filesystem-safe slug."""
slug = title.lower().strip()
slug = re.sub(r"[^a-z0-9]+", "-", slug)
return slug.strip("-")
def _page_path(category: str, slug: str) -> Path:
base = _CATEGORY_DIRS.get(category, _WIKI_DIR)
return base / f"{slug}.md"
# ---------------------------------------------------------------------------
# Index management
# ---------------------------------------------------------------------------
def _load_index() -> list[dict]:
"""Parse index.md into a list of entry dicts."""
if not _INDEX_FILE.exists():
return []
entries = []
for line in _INDEX_FILE.read_text().splitlines():
# Expected table row: | [[slug]] | category | summary | date |
if line.startswith("| [["):
parts = [p.strip() for p in line.split("|") if p.strip()]
if len(parts) >= 3:
link = parts[0] # [[slug]]
category = parts[1] if len(parts) > 1 else ""
summary = parts[2] if len(parts) > 2 else ""
date = parts[3] if len(parts) > 3 else ""
slug = re.sub(r"\[\[|\]\]", "", link)
entries.append({
"slug": slug,
"category": category,
"summary": summary,
"date": date,
})
return entries
def _save_index(entries: list[dict]) -> None:
"""Rewrite index.md from the entries list."""
_ensure_dirs()
lines = [
"# Wiki Index\n",
"_Auto-managed by wiki_store. Do not edit the table manually._\n\n",
"| Page | Category | Summary | Date |\n",
"|------|----------|---------|------|\n",
]
for e in entries:
lines.append(
f"| [[{e['slug']}]] | {e['category']} | {e['summary']} | {e['date']} |\n"
)
_INDEX_FILE.write_text("".join(lines))
def _append_log(operation: str, detail: str) -> None:
"""Append a timestamped entry to log.md."""
_ensure_dirs()
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
entry = f"\n## [{timestamp}] {operation} | {detail}\n"
with open(_LOG_FILE, "a") as f:
f.write(entry)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def write_page(
category: str,
title: str,
content: str,
summary: str = "",
) -> str:
"""
Write (or overwrite) a wiki page.
The agent provides the full markdown content. This method handles:
- Writes the .md file to the appropriate category subfolder.
- Updates index.md with a one-line entry.
- Appends an entry to log.md.
Args:
category: One of "entity", "summary", "topic".
title: Human-readable page title (used for slug + index).
content: Full markdown content the agent wrote.
summary: One-line summary for the index (optional; auto-extracted if empty).
Returns:
Relative path from wiki root (e.g. "entities/memory-leak.md").
"""
_ensure_dirs()
slug = _slug(title)
path = _page_path(category, slug)
# Auto-extract first non-heading, non-empty line as summary if not provided
if not summary:
for line in content.splitlines():
stripped = line.strip()
if stripped and not stripped.startswith("#"):
summary = stripped[:100]
break
path.write_text(content)
# Update index
entries = _load_index()
existing = next((e for e in entries if e["slug"] == slug), None)
if existing:
existing["summary"] = summary
existing["date"] = _now_iso()
else:
entries.append({
"slug": slug,
"category": category,
"summary": summary,
"date": _now_iso(),
})
_save_index(entries)
_append_log("write", title)
return str(path.relative_to(_WIKI_DIR))
def read_page(category: str, title: str) -> str | None:
"""Read a wiki page's content. Returns None if not found."""
slug = _slug(title)
path = _page_path(category, slug)
if not path.exists():
return None
return path.read_text()
def read_page_by_slug(slug: str) -> str | None:
"""Read a wiki page by slug, searching across all categories."""
for d in list(_CATEGORY_DIRS.values()) + [_WIKI_DIR]:
path = d / f"{slug}.md"
if path.exists():
return path.read_text()
return None
def search_wiki(query: str) -> list[dict]:
"""
Simple keyword search over all wiki pages.
Returns list of {slug, category, path, snippet} sorted by relevance.
"""
query_tokens = set(re.findall(r"[a-z0-9]+", query.lower()))
if not query_tokens:
return []
results = []
for category, base_dir in _CATEGORY_DIRS.items():
if not base_dir.exists():
continue
for page_path in base_dir.glob("*.md"):
content = page_path.read_text().lower()
content_tokens = set(re.findall(r"[a-z0-9]+", content))
overlap = len(query_tokens & content_tokens)
if overlap > 0:
# Extract a short snippet around first match
first_token = next(iter(query_tokens & content_tokens), "")
idx = content.find(first_token)
snippet = content[max(0, idx - 30):idx + 80].replace("\n", " ").strip()
results.append({
"slug": page_path.stem,
"category": category,
"path": str(page_path.relative_to(_WIKI_DIR)),
"score": overlap,
"snippet": snippet,
})
results.sort(key=lambda x: x["score"], reverse=True)
return results
def list_pages(category: str | None = None) -> list[dict]:
"""List all wiki pages, optionally filtered by category."""
entries = _load_index()
if category:
return [e for e in entries if e["category"] == category]
return entries
def get_log(last_n: int = 20) -> list[str]:
"""Return the last N log entries from log.md."""
if not _LOG_FILE.exists():
return []
lines = _LOG_FILE.read_text().splitlines()
entries = [l for l in lines if l.startswith("## [")]
return entries[-last_n:]
def lint_wiki() -> dict:
"""
Health-check the wiki as described in Karpathy's LLM Wiki pattern.
Checks for:
- Orphan pages (in directory but not in index)
- Missing pages (in index but file deleted)
- Broken wikilinks ([[slug]] pointing to non-existent file)
- Pages with no wikilinks (isolated pages)
Returns:
{
"orphan_pages": [...],
"missing_pages": [...],
"broken_wikilinks": {slug: [broken_links]},
"isolated_pages": [...],
}
"""
index_entries = {e["slug"] for e in _load_index()}
file_slugs: dict[str, Path] = {}
for d in _CATEGORY_DIRS.values():
if d.exists():
for p in d.glob("*.md"):
file_slugs[p.stem] = p
orphans = [s for s in file_slugs if s not in index_entries]
missing = [s for s in index_entries if s not in file_slugs]
broken_wikilinks: dict[str, list[str]] = {}
isolated: list[str] = []
all_slugs = set(file_slugs.keys())
for slug, path in file_slugs.items():
content = path.read_text()
links = re.findall(r"\[\[([^\]]+)\]\]", content)
if not links:
isolated.append(slug)
broken = [lnk for lnk in links if _slug(lnk) not in all_slugs]
if broken:
broken_wikilinks[slug] = broken
return {
"orphan_pages": orphans,
"missing_pages": missing,
"broken_wikilinks": broken_wikilinks,
"isolated_pages": isolated,
}
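# Illustrative usage (page title and content are invented for the example):
#
#   write_page("entity", "Memory Leak", "# Memory Leak\n\nUnreleased objects that grow heap usage.")
#   # -> "entities/memory-leak.md"; index.md and log.md are updated automatically
#   search_wiki("memory")   # -> [{"slug": "memory-leak", "category": "entity", ...}]
#   lint_wiki()             # -> {"orphan_pages": [...], "missing_pages": [...], ...}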