awesome-copilot/skills/mini-context-graph/scripts/tools/index_store.py
"""
index_store.py — Maintains entity and keyword indexes for fast lookup.
Handles:
- Entity index: name → [node_ids]
- Keyword index: token → [node_ids]
- Persist to index.json
"""
from __future__ import annotations
import json
import os
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import config
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_INDEX_FILE = _DATA_DIR / "index.json"
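# On-disk shape of index.json (mirrors _load's empty default below), e.g.:
#   {"entity_index": {"rest endpoint": ["node-002"]},
#    "keyword_index": {"rest": ["node-002"], "endpoint": ["node-002"]}}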
_STOPWORDS = frozenset(
    [
        "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "shall", "can", "to", "of", "in", "on",
        "at", "by", "for", "with", "from", "and", "or", "but", "not", "it",
        "its", "this", "that", "these", "those", "i", "you", "he", "she",
        "we", "they", "what", "which", "who", "how", "why", "when", "where",
    ]
)


def _load() -> dict:
    """Load the index from disk, or return an empty index if none exists."""
    if _INDEX_FILE.exists():
        with open(_INDEX_FILE, "r") as f:
            return json.load(f)
    return {"entity_index": {}, "keyword_index": {}}


def _save(index: dict) -> None:
    """Write the index to disk, creating the data directory if needed."""
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(_INDEX_FILE, "w") as f:
        json.dump(index, f, indent=2)


def _tokenize(text: str) -> list[str]:
    """Split text into lowercase tokens, removing stopwords and short tokens."""
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    return [t for t in tokens if t not in _STOPWORDS and len(t) > 1]
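# Example: _tokenize("The GraphQL API") -> ["graphql", "api"]
# ("the" is a stopword; single-character tokens are dropped as well).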


def add_entity(name: str, node_id: str) -> None:
    """Register an entity name → node_id in both entity and keyword indexes."""
    index = _load()
    name_lower = name.strip().lower()
    # Entity index: exact lowercase name -> node ids
    if name_lower not in index["entity_index"]:
        index["entity_index"][name_lower] = []
    if node_id not in index["entity_index"][name_lower]:
        index["entity_index"][name_lower].append(node_id)
    # Keyword index: each non-stopword token in the name -> node ids
    for token in _tokenize(name_lower):
        if token not in index["keyword_index"]:
            index["keyword_index"][token] = []
        if node_id not in index["keyword_index"][token]:
            index["keyword_index"][token].append(node_id)
    _save(index)
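# Note: each add_entity call re-reads and rewrites the whole index file
# (read-modify-write), which keeps the store simple at the cost of one
# full load/save per insertion.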


def search(query: str) -> list[str]:
    """Search for node_ids matching the query via entity name or keywords."""
    index = _load()
    query_lower = query.strip().lower()
    matched_ids: set[str] = set()
    # Exact entity name match
    if query_lower in index["entity_index"]:
        matched_ids.update(index["entity_index"][query_lower])
    # Keyword match
    for token in _tokenize(query_lower):
        if token in index["keyword_index"]:
            matched_ids.update(index["keyword_index"][token])
    return list(matched_ids)
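

# Minimal usage sketch (illustrative only: the entity names and node ids
# below are hypothetical, and writes go to the configured _DATA_DIR):
if __name__ == "__main__":
    add_entity("GraphQL API", "node-001")
    add_entity("REST endpoint", "node-002")
    print(sorted(search("graphql")))        # keyword match -> ['node-001']
    print(sorted(search("rest endpoint")))  # exact name match -> ['node-002']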