mirror of
https://github.com/github/awesome-copilot.git
synced 2026-05-05 14:42:12 +00:00
746ba555b6
* add mini-context-graph skill * remove pycache files * filename case update to SKILL.md * update readme
91 lines
2.8 KiB
Python
91 lines
2.8 KiB
Python
"""
|
|
index_store.py — Maintains entity and keyword indexes for fast lookup.
|
|
|
|
Handles:
|
|
- Entity index: name → [node_ids]
|
|
- Keyword index: token → [node_ids]
|
|
- Persist to index.json
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
import config
|
|
|
|
# Data directory for persisted indexes; the environment variable lets tests
# (or deployments) redirect storage without touching config.DATA_DIR.
_DATA_DIR = Path(os.environ.get("MINI_CONTEXT_GRAPH_DATA_DIR", str(config.DATA_DIR)))

# On-disk location of the combined entity/keyword index.
_INDEX_FILE = _DATA_DIR / "index.json"
|
|
|
|
_STOPWORDS = frozenset(
|
|
[
|
|
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
|
|
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
|
"should", "may", "might", "shall", "can", "to", "of", "in", "on",
|
|
"at", "by", "for", "with", "from", "and", "or", "but", "not", "it",
|
|
"its", "this", "that", "these", "those", "i", "you", "he", "she",
|
|
"we", "they", "what", "which", "who", "how", "why", "when", "where",
|
|
]
|
|
)
|
|
|
|
|
|
def _load() -> dict:
    """Load the index from disk, or return a fresh empty index.

    Returns:
        dict with "entity_index" (name → [node_ids]) and "keyword_index"
        (token → [node_ids]) keys; both keys are guaranteed present even
        if the on-disk file predates one of them or was hand-edited.
    """
    if _INDEX_FILE.exists():
        # Explicit UTF-8 so reads don't depend on the platform locale.
        with open(_INDEX_FILE, "r", encoding="utf-8") as f:
            index = json.load(f)
        # Guard against partial/hand-edited files missing a top-level key,
        # which would otherwise KeyError in add_entity()/search().
        index.setdefault("entity_index", {})
        index.setdefault("keyword_index", {})
        return index
    return {"entity_index": {}, "keyword_index": {}}
|
|
|
|
|
|
def _save(index: dict) -> None:
    """Persist *index* to index.json, creating the data directory if needed.

    Args:
        index: dict with "entity_index" and "keyword_index" mappings,
            as produced by _load().
    """
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 so the file is portable regardless of platform locale.
    with open(_INDEX_FILE, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)
|
|
|
|
|
|
def _tokenize(text: str) -> list[str]:
    """Break *text* into lowercase alphanumeric tokens worth indexing.

    Stopwords and single-character tokens are discarded.
    """
    candidates = re.findall(r"[a-z0-9]+", text.lower())
    return [tok for tok in candidates if len(tok) > 1 and tok not in _STOPWORDS]
|
|
|
|
|
|
def add_entity(name: str, node_id: str) -> None:
    """Register an entity name → node_id in both entity and keyword indexes.

    Idempotent: registering the same (name, node_id) pair twice leaves
    the index unchanged. The updated index is persisted immediately.
    """
    index = _load()
    key = name.strip().lower()

    # Full-name lookup table: exact (lowercased) entity name → node ids.
    entity_bucket = index["entity_index"].setdefault(key, [])
    if node_id not in entity_bucket:
        entity_bucket.append(node_id)

    # Token-level lookup table so partial/keyword queries can find the node.
    for tok in _tokenize(key):
        keyword_bucket = index["keyword_index"].setdefault(tok, [])
        if node_id not in keyword_bucket:
            keyword_bucket.append(node_id)

    _save(index)
|
|
|
|
|
|
def search(query: str) -> list[str]:
    """Search for node_ids matching the query via entity name or keywords.

    Results from an exact (lowercased) entity-name hit and from per-token
    keyword hits are merged; order of the returned list is unspecified.
    """
    index = _load()
    normalized = query.strip().lower()
    hits: set[str] = set()

    # Exact entity-name match.
    hits.update(index["entity_index"].get(normalized, ()))

    # Keyword matches for each indexable token of the query.
    keyword_index = index["keyword_index"]
    for tok in _tokenize(normalized):
        hits.update(keyword_index.get(tok, ()))

    return list(hits)
|