# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
Bring-Your-Own Vector Store¶
Dieses Notebook zeigt, wie ein benutzerdefinierter Vektorspeicher implementiert und für die Verwendung mit GraphRAG registriert wird.
Übersicht¶
GraphRAG verwendet eine Plug-and-Play-Architektur, die mithilfe des Factory-Entwurfsmusters die einfache Integration benutzerdefinierter Vektorspeicher (über die nativ unterstützten hinaus) ermöglicht. Damit können Sie:
- Funktionalität erweitern: Unterstützung für neue Vektordatenbank-Backends hinzufügen
- Verhalten anpassen: Spezialisierte Suchlogik oder Datenstrukturen implementieren
- Bestehende Systeme integrieren: GraphRAG mit Ihrer vorhandenen Vektordatenbank-Infrastruktur verbinden
Was Sie lernen werden¶
- Verständnis der BaseVectorStore-Schnittstelle
- Implementierung einer benutzerdefinierten Vektorspeicherklasse
- Registrierung Ihres Vektorspeichers bei der VectorStoreFactory
- Testen und Validieren Ihrer Implementierung
- Konfiguration von GraphRAG zur Verwendung Ihres benutzerdefinierten Vektorspeichers
Legen wir los!
Schritt 1: Benötigte Abhängigkeiten importieren¶
Importieren wir zunächst die notwendigen GraphRAG-Komponenten und andere Abhängigkeiten, die wir benötigen werden.
pip install graphrag
from typing import Any
import numpy as np
import yaml
from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
from graphrag.data_model.types import TextEmbedder
# GraphRAG vector store components
from graphrag.vector_stores.base import (
BaseVectorStore,
VectorStoreDocument,
VectorStoreSearchResult,
)
from graphrag.vector_stores.factory import VectorStoreFactory
Schritt 2: Verstehen der BaseVectorStore-Schnittstelle¶
Bevor wir einen benutzerdefinierten Vektorspeicher implementieren, untersuchen wir die BaseVectorStore-Schnittstelle, um zu verstehen, welche Methoden implementiert werden müssen.
# List every abstract method of BaseVectorStore together with its signature
import inspect

print("BaseVectorStore Abstract Methods:")
print("=" * 40)

# A member counts as abstract when it carries the `__isabstractmethod__` flag.
abstract_methods = [
    f"• {member_name}{inspect.signature(member)}"
    for member_name, member in inspect.getmembers(
        BaseVectorStore, predicate=inspect.isfunction
    )
    if getattr(member, "__isabstractmethod__", False)
]
for entry in abstract_methods:
    print(entry)

print(f"\nTotal abstract methods to implement: {len(abstract_methods)}")
BaseVectorStore Abstract Methods: ======================================== • connect(self, **kwargs: Any) -> None • filter_by_id(self, include_ids: list[str] | list[int]) -> Any • load_documents(self, documents: list[graphrag.vector_stores.base.VectorStoreDocument], overwrite: bool = True) -> None • search_by_id(self, id: str) -> graphrag.vector_stores.base.VectorStoreDocument • similarity_search_by_text(self, text: str, text_embedder: collections.abc.Callable[[str], list[float]], k: int = 10, **kwargs: Any) -> list[graphrag.vector_stores.base.VectorStoreSearchResult] • similarity_search_by_vector(self, query_embedding: list[float], k: int = 10, **kwargs: Any) -> list[graphrag.vector_stores.base.VectorStoreSearchResult] Total abstract methods to implement: 6
Schritt 3: Implementieren eines benutzerdefinierten Vektorspeichers¶
Implementieren wir nun als Beispiel einen einfachen In-Memory-Vektorspeicher. Dieser Vektorspeicher wird
- Dokumente und Vektoren im Speicher mithilfe von Python-Datenstrukturen speichern
- Alle erforderlichen BaseVectorStore-Methoden unterstützen
Hinweis: Dies ist ein vereinfachtes Beispiel zu Demonstrationszwecken. Produktionsreife Vektorspeicher würden typischerweise optimierte Bibliotheken wie FAISS, fortschrittlichere Indizierung und persistente Speicherung verwenden.
class SimpleInMemoryVectorStore(BaseVectorStore):
    """A simple in-memory vector store implementation for demonstration purposes.

    Documents and their embeddings are kept in plain dictionaries keyed by the
    stringified document id. Similarity search is a brute-force cosine
    similarity scan over every stored vector.

    WARNING: This is for demonstration only - not suitable for production use.
    For production, consider using optimized vector databases like LanceDB,
    Azure AI Search, or other specialized vector stores.
    """

    # Internal storage for documents and vectors, both keyed by str(doc.id)
    documents: dict[str, VectorStoreDocument]
    vectors: dict[str, np.ndarray]
    # Set to True by connect(); load/search operations refuse to run before that
    connected: bool

    def __init__(self, **kwargs: Any):
        """Initialize the in-memory vector store.

        Args:
            **kwargs: Forwarded unchanged to ``BaseVectorStore.__init__``.
        """
        super().__init__(**kwargs)
        self.documents = {}
        self.vectors = {}
        self.connected = False
        print(f"🚀 SimpleInMemoryVectorStore initialized for index: {self.index_name}")

    def _ensure_connected(self) -> None:
        """Raise ``RuntimeError`` unless ``connect()`` has been called."""
        if not self.connected:
            msg = "Vector store not connected. Call connect() first."
            raise RuntimeError(msg)

    def connect(self, **kwargs: Any) -> None:
        """Connect to the vector storage (no-op for in-memory store)."""
        self.connected = True
        print(f"✅ Connected to in-memory vector store: {self.index_name}")

    def load_documents(
        self, documents: list[VectorStoreDocument], overwrite: bool = True
    ) -> None:
        """Load documents into the vector store.

        Documents whose ``vector`` is ``None`` are skipped.

        Args:
            documents: Documents to store.
            overwrite: When True, all previously stored documents and vectors
                are discarded before loading; otherwise entries are added to
                (or replace same-id entries in) the existing content.

        Raises:
            RuntimeError: If the store is not connected.
        """
        self._ensure_connected()

        if overwrite:
            self.documents.clear()
            self.vectors.clear()

        loaded_count = 0
        for doc in documents:
            if doc.vector is None:
                continue
            doc_id = str(doc.id)
            self.documents[doc_id] = doc
            self.vectors[doc_id] = np.array(doc.vector, dtype=np.float32)
            loaded_count += 1

        print(f"📚 Loaded {loaded_count} documents into vector store")

    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Calculate cosine similarity between two vectors.

        Returns 0.0 when either vector has zero norm, avoiding a division by
        zero.
        """
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return float(np.dot(vec1, vec2) / (norm1 * norm2))

    def similarity_search_by_vector(
        self, query_embedding: list[float], k: int = 10, **kwargs: Any
    ) -> list[VectorStoreSearchResult]:
        """Perform similarity search using a query vector.

        Args:
            query_embedding: The query vector.
            k: Maximum number of results to return. Values <= 0 yield an empty
                list.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            Up to ``k`` results ordered by descending cosine similarity.

        Raises:
            RuntimeError: If the store is not connected.
        """
        self._ensure_connected()

        # Guard k explicitly: a negative k would otherwise slice `[:k]` and
        # return "all but the last |k|" results instead of nothing.
        if k <= 0 or not self.vectors:
            return []

        query_vec = np.array(query_embedding, dtype=np.float32)

        # Score every stored vector against the query
        similarities = [
            (doc_id, self._cosine_similarity(query_vec, stored_vec))
            for doc_id, stored_vec in self.vectors.items()
        ]

        # Sort by similarity (descending) and keep the top k
        similarities.sort(key=lambda pair: pair[1], reverse=True)

        return [
            VectorStoreSearchResult(document=self.documents[doc_id], score=score)
            for doc_id, score in similarities[:k]
        ]

    def similarity_search_by_text(
        self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any
    ) -> list[VectorStoreSearchResult]:
        """Perform similarity search using text (which gets embedded first).

        Args:
            text: The query text.
            text_embedder: Callable that turns ``text`` into an embedding.
            k: Maximum number of results to return.
            **kwargs: Forwarded to ``similarity_search_by_vector``.

        Returns:
            The vector-search results for the embedded text, or an empty list
            when the embedder produced no embedding.

        Raises:
            RuntimeError: If the store is not connected.
        """
        query_embedding = text_embedder(text)

        # An embedder that yields nothing cannot be searched with; feeding
        # None/[] into NumPy would give NaN scores or a shape error.
        if query_embedding is None or len(query_embedding) == 0:
            self._ensure_connected()
            return []

        return self.similarity_search_by_vector(query_embedding, k, **kwargs)

    def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:
        """Build a query filter to filter documents by id.

        For this simple implementation, we return the list of IDs (as strings)
        as the filter.
        """
        return [str(id_) for id_ in include_ids]

    def search_by_id(self, id: str) -> VectorStoreDocument:
        """Search for a document by id.

        Raises:
            KeyError: If no document with the given id is stored.
        """
        doc_id = str(id)
        if doc_id not in self.documents:
            msg = f"Document with id '{id}' not found"
            raise KeyError(msg)

        return self.documents[doc_id]

    def get_stats(self) -> dict[str, Any]:
        """Get statistics about the vector store (custom method).

        ``vector_dimension`` is the length of the first stored vector, or 0
        when the store is empty.
        """
        return {
            "index_name": self.index_name,
            "document_count": len(self.documents),
            "vector_count": len(self.vectors),
            "connected": self.connected,
            "vector_dimension": len(next(iter(self.vectors.values())))
            if self.vectors
            else 0,
        }


print("✅ SimpleInMemoryVectorStore class defined!")
✅ SimpleInMemoryVectorStore class defined!
Schritt 4: Registrieren des benutzerdefinierten Vektorspeichers¶
Lassen Sie uns nun unseren benutzerdefinierten Vektorspeicher bei der VectorStoreFactory registrieren, damit er in GraphRAG verwendet werden kann.
# Unique identifier under which the custom store is exposed to the factory
CUSTOM_VECTOR_STORE_TYPE = "simple_memory"

# Make the class discoverable through VectorStoreFactory
VectorStoreFactory.register(CUSTOM_VECTOR_STORE_TYPE, SimpleInMemoryVectorStore)
print(f"✅ Registered custom vector store with type: '{CUSTOM_VECTOR_STORE_TYPE}'")

# Confirm the registration took effect
available_types = VectorStoreFactory.get_vector_store_types()
print(f"\n📋 Available vector store types: {available_types}")

is_supported = VectorStoreFactory.is_supported_type(CUSTOM_VECTOR_STORE_TYPE)
print(f"🔍 Is our custom type supported? {is_supported}")
✅ Registered custom vector store with type: 'simple_memory' 📋 Available vector store types: ['lancedb', 'azure_ai_search', 'cosmosdb', 'simple_memory'] 🔍 Is our custom type supported? True
Schritt 5: Testen des benutzerdefinierten Vektorspeichers¶
Lassen Sie uns einige Beispieldaten erstellen und unseren benutzerdefinierten Vektorspeicher testen.
# Helper producing mock embeddings for the sample documents
def create_mock_embedding(dimension: int = 384) -> list[float]:
    """Return ``dimension`` samples from a standard normal distribution as a list.

    Intended for testing only: values are drawn from NumPy's global random
    state.
    """
    samples = np.random.normal(loc=0, scale=1, size=dimension)
    return samples.tolist()
# Sample documents, described as (id, text, category, source) rows
_sample_rows = [
    (
        "doc_1",
        "GraphRAG is a powerful knowledge graph extraction and reasoning framework.",
        "technology",
        "documentation",
    ),
    (
        "doc_2",
        "Vector stores enable efficient similarity search over high-dimensional data.",
        "technology",
        "research",
    ),
    (
        "doc_3",
        "Machine learning models can process and understand natural language text.",
        "AI",
        "article",
    ),
    (
        "doc_4",
        "Custom implementations allow for specialized behavior and integration.",
        "development",
        "tutorial",
    ),
]

# Each document gets its own freshly generated mock embedding, in row order.
sample_documents = [
    VectorStoreDocument(
        id=row_id,
        text=row_text,
        vector=create_mock_embedding(),
        attributes={"category": row_category, "source": row_source},
    )
    for row_id, row_text, row_category, row_source in _sample_rows
]

print(f"📝 Created {len(sample_documents)} sample documents")
📝 Created 4 sample documents
# Build a store through the factory rather than calling the class directly
schema = VectorStoreSchemaConfig(index_name="test_collection")

# The factory resolves the registered type name to our class
vector_store = VectorStoreFactory.create_vector_store(
    CUSTOM_VECTOR_STORE_TYPE,
    vector_store_schema_config=schema,
)

created_type_name = type(vector_store).__name__
print(f"✅ Created vector store instance: {created_type_name}")

initial_stats = vector_store.get_stats()
print(f"📊 Initial stats: {initial_stats}")
🚀 SimpleInMemoryVectorStore initialized for index: test_collection
✅ Created vector store instance: SimpleInMemoryVectorStore
📊 Initial stats: {'index_name': 'test_collection', 'document_count': 0, 'vector_count': 0, 'connected': False, 'vector_dimension': 0}
# Open the store first; loading is rejected while it is disconnected
vector_store.connect()
vector_store.load_documents(sample_documents)

updated_stats = vector_store.get_stats()
print(f"📊 Updated stats: {updated_stats}")
✅ Connected to in-memory vector store: test_collection
📚 Loaded 4 documents into vector store
📊 Updated stats: {'index_name': 'test_collection', 'document_count': 4, 'vector_count': 4, 'connected': True, 'vector_dimension': 384}
# Run a top-3 similarity search with a random query vector
query_vector = create_mock_embedding()  # Random query vector for testing
search_results = vector_store.similarity_search_by_vector(query_vector, k=3)

print(f"🔍 Found {len(search_results)} similar documents:\n")

# Results arrive ordered by descending similarity; number them from 1
for rank, hit in enumerate(search_results, start=1):
    matched = hit.document
    print(f"{rank}. ID: {matched.id}")
    print(f" Text: {matched.text[:60]}...")
    print(f" Similarity Score: {hit.score:.4f}")
    print(f" Category: {matched.attributes.get('category', 'N/A')}")
    print()
🔍 Found 3 similar documents: 1. ID: doc_1 Text: GraphRAG is a powerful knowledge graph extraction and reason... Similarity Score: 0.0373 Category: technology 2. ID: doc_4 Text: Custom implementations allow for specialized behavior and in... Similarity Score: -0.0061 Category: development 3. ID: doc_2 Text: Vector stores enable efficient similarity search over high-d... Similarity Score: -0.0230 Category: technology
# Look a document up by its id; a missing id surfaces as KeyError
try:
    found_doc = vector_store.search_by_id("doc_2")
except KeyError as lookup_error:
    print(f"❌ Error: {lookup_error}")
else:
    print("✅ Found document by ID:")
    print(f" ID: {found_doc.id}")
    print(f" Text: {found_doc.text}")
    print(f" Attributes: {found_doc.attributes}")

# Build an id filter for two of the sample documents
wanted_ids = ["doc_1", "doc_3"]
id_filter = vector_store.filter_by_id(wanted_ids)
print(f"\n🔧 ID filter result: {id_filter}")
✅ Found document by ID:
ID: doc_2
Text: Vector stores enable efficient similarity search over high-dimensional data.
Attributes: {'category': 'technology', 'source': 'research'}
🔧 ID filter result: ['doc_1', 'doc_3']
Schritt 6: Konfiguration für GraphRAG¶
Sehen wir uns nun an, wie Sie GraphRAG in einer Einstellungsdatei zur Verwendung Ihres benutzerdefinierten Vektorspeichers konfigurieren würden.
# Example GraphRAG yaml settings, assembled section by section
_vector_store_section = {
    "type": CUSTOM_VECTOR_STORE_TYPE,  # "simple_memory"
    "collection_name": "graphrag_entities",
    # Add any custom parameters your vector store needs
    "custom_parameter": "custom_value",
}

_embedding_model_section = {
    "type": "openai_embedding",
    "model": "text-embedding-3-small",
}

example_settings = {
    "vector_store": {"default_vector_store": _vector_store_section},
    # Other GraphRAG configuration...
    "models": {"default_embedding_model": _embedding_model_section},
}

# Serialize in block style, as it would appear in settings.yml
yaml_config = yaml.dump(example_settings, default_flow_style=False, indent=2)

print("📄 Example settings.yml configuration:")
print("=" * 40)
print(yaml_config)
📄 Example settings.yml configuration:
========================================
models:
default_embedding_model:
model: text-embedding-3-small
type: openai_embedding
vector_store:
default_vector_store:
collection_name: graphrag_entities
custom_parameter: custom_value
type: simple_memory
Schritt 7: Integration in die GraphRAG-Pipeline¶
So würde Ihr benutzerdefinierter Vektorspeicher in einer typischen GraphRAG-Pipeline verwendet werden.
# Walk-through of a GraphRAG-style create / index / query cycle
def simulate_graphrag_pipeline():
    """Run a mock create-load-search cycle against the custom vector store.

    Creates a store through the factory, loads ten synthetic entity documents
    with mock embeddings, searches for the five nearest to a mock query
    vector, and returns the documents behind those results.
    """
    print("🚀 Simulating GraphRAG pipeline with custom vector store...\n")

    # 1. GraphRAG creates vector store using factory
    entities_schema = VectorStoreSchemaConfig(index_name="graphrag_entities")
    store = VectorStoreFactory.create_vector_store(
        CUSTOM_VECTOR_STORE_TYPE,
        vector_store_schema_config=entities_schema,
        similarity_threshold=0.3,
    )
    store.connect()
    print("✅ Step 1: Vector store created and connected")

    # 2. During indexing, GraphRAG loads extracted entities
    entity_documents = []
    for idx in range(10):
        entity_documents.append(
            VectorStoreDocument(
                id=f"entity_{idx}",
                text=f"Entity {idx} description: Important concept in the knowledge graph",
                vector=create_mock_embedding(),
                # importance cycles through 1, 2, 3
                attributes={"type": "entity", "importance": idx % 3 + 1},
            )
        )
    store.load_documents(entity_documents)
    print(f"✅ Step 2: Loaded {len(entity_documents)} entity documents")

    # 3. During query time, GraphRAG searches for relevant entities
    query_embedding = create_mock_embedding()
    relevant_entities = store.similarity_search_by_vector(query_embedding, k=5)
    print(f"✅ Step 3: Found {len(relevant_entities)} relevant entities for query")

    # 4. GraphRAG uses these entities for context building
    context_entities = [hit.document for hit in relevant_entities]
    print("✅ Step 4: Context built using retrieved entities")

    final_stats = store.get_stats()
    print(f"📊 Final stats: {final_stats}")
    return context_entities


# Run the simulation
context = simulate_graphrag_pipeline()
retrieved_count = len(context)
print(f"\n🎯 Retrieved {retrieved_count} entities for context building")
🚀 Simulating GraphRAG pipeline with custom vector store...
🚀 SimpleInMemoryVectorStore initialized for index: graphrag_entities
✅ Connected to in-memory vector store: graphrag_entities
✅ Step 1: Vector store created and connected
📚 Loaded 10 documents into vector store
✅ Step 2: Loaded 10 entity documents
✅ Step 3: Found 5 relevant entities for query
✅ Step 4: Context built using retrieved entities
📊 Final stats: {'index_name': 'graphrag_entities', 'document_count': 10, 'vector_count': 10, 'connected': True, 'vector_dimension': 384}
🎯 Retrieved 5 entities for context building
Schritt 8: Testen und Validieren¶
Erstellen wir eine umfassende Testsuite, um sicherzustellen, dass unser Vektorspeicher korrekt funktioniert.
def test_custom_vector_store():
    """Comprehensive test suite for the custom vector store.

    Covers, in order: loading documents, similarity search, lookup by id,
    id filtering, and the errors raised when the store is used before
    ``connect()`` is called.

    Raises:
        AssertionError: If any check fails.
    """
    print("🧪 Running comprehensive vector store tests...\n")

    # Test 1: Basic functionality
    print("Test 1: Basic functionality")
    store = VectorStoreFactory.create_vector_store(
        CUSTOM_VECTOR_STORE_TYPE,
        vector_store_schema_config=VectorStoreSchemaConfig(index_name="test"),
    )
    store.connect()

    # Load test documents
    test_docs = sample_documents[:2]
    store.load_documents(test_docs)

    assert len(store.documents) == 2, "Should have 2 documents"
    assert len(store.vectors) == 2, "Should have 2 vectors"
    print("✅ Basic functionality test passed")

    # Test 2: Search functionality
    print("\nTest 2: Search functionality")
    query_vec = create_mock_embedding()
    # k exceeds the number of stored documents on purpose
    results = store.similarity_search_by_vector(query_vec, k=5)

    assert len(results) <= 2, "Should not return more results than documents"
    assert all(isinstance(r, VectorStoreSearchResult) for r in results), (
        "Should return VectorStoreSearchResult objects"
    )
    assert all(-1 <= r.score <= 1 for r in results), (
        "Similarity scores should be between -1 and 1"
    )
    print("✅ Search functionality test passed")

    # Test 3: Search by ID
    print("\nTest 3: Search by ID")
    found_doc = store.search_by_id("doc_1")
    assert found_doc.id == "doc_1", "Should find correct document"

    # Negative checks raise AssertionError explicitly instead of using
    # `assert False`, which is removed when Python runs with -O and would let
    # a missing exception go unnoticed.
    try:
        store.search_by_id("nonexistent")
    except KeyError:
        pass  # Expected
    else:
        raise AssertionError("Should raise KeyError for nonexistent ID")

    print("✅ Search by ID test passed")

    # Test 4: Filter functionality
    print("\nTest 4: Filter functionality")
    filter_result = store.filter_by_id(["doc_1", "doc_2"])
    assert filter_result == ["doc_1", "doc_2"], "Should return filtered IDs"
    print("✅ Filter functionality test passed")

    # Test 5: Error handling
    print("\nTest 5: Error handling")
    # Deliberately never connected
    disconnected_store = VectorStoreFactory.create_vector_store(
        CUSTOM_VECTOR_STORE_TYPE,
        vector_store_schema_config=VectorStoreSchemaConfig(index_name="test2"),
    )

    try:
        disconnected_store.load_documents(test_docs)
    except RuntimeError:
        pass  # Expected
    else:
        raise AssertionError("Should raise error when not connected")

    try:
        disconnected_store.similarity_search_by_vector(query_vec)
    except RuntimeError:
        pass  # Expected
    else:
        raise AssertionError("Should raise error when not connected")

    print("✅ Error handling test passed")

    print("\n🎉 All tests passed! Your custom vector store is working correctly.")


# Run the tests
test_custom_vector_store()
🧪 Running comprehensive vector store tests... Test 1: Basic functionality 🚀 SimpleInMemoryVectorStore initialized for index: test ✅ Connected to in-memory vector store: test 📚 Loaded 2 documents into vector store ✅ Basic functionality test passed Test 2: Search functionality ✅ Search functionality test passed Test 3: Search by ID ✅ Search by ID test passed Test 4: Filter functionality ✅ Filter functionality test passed Test 5: Error handling 🚀 SimpleInMemoryVectorStore initialized for index: test2 ✅ Error handling test passed 🎉 All tests passed! Your custom vector store is working correctly.
Zusammenfassung und nächste Schritte¶
Herzlichen Glückwunsch! Sie haben erfolgreich gelernt, wie Sie einen benutzerdefinierten Vektorspeicher mit GraphRAG implementieren und registrieren. Hier ist, was Sie erreicht haben:
Was Sie gebaut haben¶
- ✅ Benutzerdefinierte Vektorspeicherklasse: SimpleInMemoryVectorStore mit allen erforderlichen Methoden implementiert
- ✅ Factory-Integration: Ihren Vektorspeicher bei der VectorStoreFactory registriert
- ✅ Umfassendes Testen: Funktionalität mit einer vollständigen Testsuite validiert
- ✅ Konfigurationsbeispiele: Gelernt, wie GraphRAG zur Verwendung Ihres Vektorspeichers konfiguriert wird
Wichtigste Erkenntnisse¶
- Schnittstellenkonformität: Implementieren Sie immer alle Methoden von BaseVectorStore
- Factory-Muster: Verwenden Sie VectorStoreFactory.register(), um Ihren Vektorspeicher verfügbar zu machen
- Konfiguration: Vektorspeicher werden in GraphRAG-Einstellungsdateien konfiguriert
- Testen: Testen Sie die gesamte Funktionalität gründlich vor der Bereitstellung
Nächste Schritte¶
Schauen Sie sich das Notebook "API-Übersicht" an, um zu erfahren, wie Sie Daten über die GraphRAG-API indizieren und abfragen.
Ressourcen¶
Viel Spaß beim Aufbauen! 🚀