awesome-copilot/skills/mini-context-graph/scripts/tools/index_store.py
"""
index_store.py — Maintains entity and keyword indexes for fast lookup.
Handles:
- Entity index: name → [node_ids]
- Keyword index: token → [node_ids]
- Persist to index.json
"""
from __future__ import annotations
import json
import os
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import config
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))
_INDEX_FILE = _DATA_DIR / "index.json"
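# On-disk shape of index.json (mirrors _load's empty default below), e.g.:
#   {"entity_index": {"rest endpoint": ["node-002"]},
#    "keyword_index": {"rest": ["node-002"], "endpoint": ["node-002"]}}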
_STOPWORDS = frozenset(
    [
        "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "shall", "can", "to", "of", "in", "on",
        "at", "by", "for", "with", "from", "and", "or", "but", "not", "it",
        "its", "this", "that", "these", "those", "i", "you", "he", "she",
        "we", "they", "what", "which", "who", "how", "why", "when", "where",
    ]
)


def _load() -> dict:
    """Load the index from disk, or return an empty index if none exists."""
    if _INDEX_FILE.exists():
        with open(_INDEX_FILE, "r") as f:
            return json.load(f)
    return {"entity_index": {}, "keyword_index": {}}


def _save(index: dict) -> None:
    """Write the index to disk, creating the data directory if needed."""
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    with open(_INDEX_FILE, "w") as f:
        json.dump(index, f, indent=2)


def _tokenize(text: str) -> list[str]:
    """Split text into lowercase tokens, removing stopwords and short tokens."""
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    return [t for t in tokens if t not in _STOPWORDS and len(t) > 1]
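# Example: _tokenize("The GraphQL API") -> ["graphql", "api"]
# ("the" is a stopword; single-character tokens are dropped as well).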


def add_entity(name: str, node_id: str) -> None:
    """Register an entity name → node_id in both entity and keyword indexes."""
    index = _load()
    name_lower = name.strip().lower()
    # Entity index: exact lowercase name -> node ids
    if name_lower not in index["entity_index"]:
        index["entity_index"][name_lower] = []
    if node_id not in index["entity_index"][name_lower]:
        index["entity_index"][name_lower].append(node_id)
    # Keyword index: each non-stopword token in the name -> node ids
    for token in _tokenize(name_lower):
        if token not in index["keyword_index"]:
            index["keyword_index"][token] = []
        if node_id not in index["keyword_index"][token]:
            index["keyword_index"][token].append(node_id)
    _save(index)
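# Note: each add_entity call re-reads and rewrites the whole index file
# (read-modify-write), which keeps the store simple at the cost of one
# full load/save per insertion.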


def search(query: str) -> list[str]:
    """Search for node_ids matching the query via entity name or keywords."""
    index = _load()
    query_lower = query.strip().lower()
    matched_ids: set[str] = set()
    # Exact entity name match
    if query_lower in index["entity_index"]:
        matched_ids.update(index["entity_index"][query_lower])
    # Keyword match
    for token in _tokenize(query_lower):
        if token in index["keyword_index"]:
            matched_ids.update(index["keyword_index"][token])
    return list(matched_ids)
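

# Minimal usage sketch (illustrative only: the entity names and node ids
# below are hypothetical, and writes go to the configured _DATA_DIR):
if __name__ == "__main__":
    add_entity("GraphQL API", "node-001")
    add_entity("REST endpoint", "node-002")
    print(sorted(search("graphql")))        # keyword match -> ['node-001']
    print(sorted(search("rest endpoint")))  # exact name match -> ['node-002']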