"""Vector Store Provider Discovery and Management.

This module provides utilities for discovering, comparing, and configuring
vector store providers within the Haive framework. It offers comprehensive
information about all available vector store backends.

Examples:
    Basic discovery::

        from haive.core.engine.vectorstore.discovery import get_vectorstore_providers

        providers = get_vectorstore_providers()
        print(f"Available: {list(providers.keys())}")

    Get provider recommendations::

        from haive.core.engine.vectorstore.discovery import recommend_vectorstore

        # For development
        dev_stores = recommend_vectorstore("development")
        print(f"For development: {dev_stores}")

        # For production
        prod_stores = recommend_vectorstore("production")
        print(f"For production: {prod_stores}")
"""
from dataclasses import dataclass, field
from enum import Enum
class VectorStoreType(str, Enum):
    """Category a vector store provider is filed under.

    Members mix in ``str``, so each member compares equal to its plain
    string value (for example ``VectorStoreType.LOCAL == "local"``).
    """

    # Examples: FAISS, Annoy, InMemory
    LOCAL = "local"
    # Examples: Pinecone, Weaviate Cloud
    CLOUD = "cloud"
    # Examples: Chroma, Qdrant, Weaviate
    HYBRID = "hybrid"
    # Examples: PGVector, ClickHouse
    DATABASE = "database"
    # Examples: Elasticsearch, OpenSearch
    SEARCH_ENGINE = "search"
class CostTier(str, Enum):
    """Pricing bucket a vector store provider is filed under.

    Members mix in ``str``, so each member compares equal to its plain
    string value (for example ``CostTier.FREE == "free"``).
    """

    # Examples: FAISS, InMemory, Chroma (local)
    FREE = "free"
    # Examples: Pinecone, Qdrant Cloud
    FREEMIUM = "freemium"
    # Most cloud offerings
    PAID = "paid"
    # Enterprise-only features
    ENTERPRISE = "enterprise"
def _default_index_types() -> list[str]:
    """Return a fresh default list for ``VectorStoreInfo.index_types``."""
    return ["flat", "hnsw"]


def _default_distance_metrics() -> list[str]:
    """Return a fresh default list for ``VectorStoreInfo.distance_metrics``."""
    return ["cosine", "euclidean", "dot_product"]


def _default_use_cases() -> list[str]:
    """Return a fresh default list for ``VectorStoreInfo.popular_use_cases``."""
    return ["semantic_search", "rag", "similarity_matching"]


@dataclass
class VectorStoreInfo:
    """Comprehensive information about a vector store provider.

    The list-valued fields get their defaults from ``default_factory`` so
    every instance owns its own list and the ``list[str]`` annotations are
    accurate. Passing ``None`` explicitly for any of them is still accepted
    and is replaced by the same default in ``__post_init__``.
    """

    name: str
    description: str
    type: VectorStoreType
    cost: CostTier
    auth_required: bool
    setup_complexity: str  # "easy", "medium", "complex"
    performance_tier: str  # "basic", "good", "excellent"

    # Capabilities
    supports_metadata_filtering: bool = True
    supports_hybrid_search: bool = False
    supports_multi_tenancy: bool = False
    supports_real_time_updates: bool = True

    # Technical specs
    max_dimensions: int | None = None
    index_types: list[str] = field(default_factory=_default_index_types)
    distance_metrics: list[str] = field(default_factory=_default_distance_metrics)

    # Requirements
    python_packages: list[str] = field(default_factory=list)
    external_dependencies: list[str] = field(default_factory=list)

    # Usage info
    popular_use_cases: list[str] = field(default_factory=_default_use_cases)
    best_for: str = ""

    def __post_init__(self) -> None:
        """Replace list fields explicitly passed as ``None`` with defaults.

        Kept for backward compatibility: ``None`` used to be the declared
        default of these fields, so callers may still pass it.
        """
        if self.index_types is None:
            self.index_types = _default_index_types()
        if self.distance_metrics is None:
            self.distance_metrics = _default_distance_metrics()
        if self.python_packages is None:
            self.python_packages = []
        if self.external_dependencies is None:
            self.external_dependencies = []
        if self.popular_use_cases is None:
            self.popular_use_cases = _default_use_cases()
def get_vectorstore_providers() -> dict[str, VectorStoreInfo]:
    """Build the catalogue of known vector store providers.

    A new dictionary with newly constructed ``VectorStoreInfo`` objects is
    produced on every call.

    Returns:
        Dictionary mapping each provider's name to its ``VectorStoreInfo``,
        in the order: local stores, hybrid stores, cloud stores, database
        extensions, search engines.

    Examples:
        Get all providers::

            providers = get_vectorstore_providers()
            for name, info in providers.items():
                print(f"{name}: {info.description}")
    """
    local_stores = [
        VectorStoreInfo(
            name="FAISS",
            description="Facebook AI Similarity Search - High-performance local vector search",
            type=VectorStoreType.LOCAL,
            cost=CostTier.FREE,
            auth_required=False,
            setup_complexity="easy",
            performance_tier="excellent",
            supports_metadata_filtering=False,
            supports_hybrid_search=False,
            supports_multi_tenancy=False,
            max_dimensions=None,  # No limit
            index_types=["flat", "ivf", "hnsw", "pq"],
            # NOTE(review): faiss-cpu and faiss-gpu look like alternative
            # builds rather than a pair to install together - confirm.
            python_packages=["faiss-cpu", "faiss-gpu"],
            best_for="High-performance local search, research, prototyping",
            popular_use_cases=["research", "prototyping", "high_throughput_search"],
        ),
        VectorStoreInfo(
            name="InMemory",
            description="Simple in-memory vector storage for testing and development",
            type=VectorStoreType.LOCAL,
            cost=CostTier.FREE,
            auth_required=False,
            setup_complexity="easy",
            performance_tier="basic",
            supports_metadata_filtering=True,
            max_dimensions=10000,  # Practical memory limit
            python_packages=[],  # Built-in
            best_for="Testing, development, small datasets",
            popular_use_cases=["testing", "development", "demos"],
        ),
        VectorStoreInfo(
            name="Annoy",
            description="Approximate Nearest Neighbors library by Spotify",
            type=VectorStoreType.LOCAL,
            cost=CostTier.FREE,
            auth_required=False,
            setup_complexity="easy",
            performance_tier="good",
            supports_metadata_filtering=False,
            supports_real_time_updates=False,  # Read-only after build
            index_types=["angular", "euclidean"],
            python_packages=["annoy"],
            best_for="Read-heavy workloads, music/content recommendation",
            popular_use_cases=["recommendation_systems", "content_similarity"],
        ),
    ]

    hybrid_stores = [
        VectorStoreInfo(
            name="Chroma",
            description="Open-source embedding database with local and server modes",
            type=VectorStoreType.HYBRID,
            cost=CostTier.FREE,
            auth_required=False,
            setup_complexity="easy",
            performance_tier="good",
            supports_metadata_filtering=True,
            supports_hybrid_search=False,
            supports_multi_tenancy=True,
            python_packages=["chromadb"],
            external_dependencies=["docker (optional)"],
            best_for="Development to production, easy scaling, rich metadata",
            popular_use_cases=["rag", "semantic_search", "document_analysis"],
        ),
        VectorStoreInfo(
            name="Qdrant",
            description="Vector database with advanced filtering and geo-search",
            type=VectorStoreType.HYBRID,
            cost=CostTier.FREEMIUM,
            auth_required=False,  # For local deployment
            setup_complexity="medium",
            performance_tier="excellent",
            supports_metadata_filtering=True,
            supports_hybrid_search=True,
            supports_multi_tenancy=True,
            index_types=["hnsw"],
            distance_metrics=["cosine", "euclidean", "dot_product", "manhattan"],
            python_packages=["qdrant-client"],
            external_dependencies=["docker", "qdrant-server"],
            best_for="Advanced filtering, geo-search, high-performance applications",
            popular_use_cases=[
                "location_based_search",
                "advanced_rag",
                "recommendation",
            ],
        ),
        VectorStoreInfo(
            name="Weaviate",
            description="Open-source vector database with GraphQL API",
            type=VectorStoreType.HYBRID,
            cost=CostTier.FREEMIUM,
            auth_required=False,  # For local
            setup_complexity="medium",
            performance_tier="excellent",
            supports_metadata_filtering=True,
            supports_hybrid_search=True,
            supports_multi_tenancy=True,
            python_packages=["weaviate-client"],
            external_dependencies=["docker", "weaviate-server"],
            best_for="GraphQL integration, hybrid search, complex schemas",
            popular_use_cases=[
                "knowledge_graphs",
                "hybrid_search",
                "enterprise_search",
            ],
        ),
    ]

    cloud_stores = [
        VectorStoreInfo(
            name="Pinecone",
            description="Fully managed vector database service",
            type=VectorStoreType.CLOUD,
            cost=CostTier.FREEMIUM,
            auth_required=True,
            setup_complexity="easy",
            performance_tier="excellent",
            supports_metadata_filtering=True,
            supports_hybrid_search=False,
            supports_multi_tenancy=True,
            supports_real_time_updates=True,
            python_packages=["pinecone-client"],
            best_for="Production apps, zero-ops, auto-scaling",
            popular_use_cases=[
                "production_rag",
                "recommendation_systems",
                "similarity_search",
            ],
        ),
    ]

    database_extensions = [
        VectorStoreInfo(
            name="PGVector",
            description="PostgreSQL extension for vector similarity search",
            type=VectorStoreType.DATABASE,
            cost=CostTier.FREE,
            auth_required=True,  # Database credentials
            setup_complexity="medium",
            performance_tier="good",
            supports_metadata_filtering=True,
            supports_hybrid_search=False,
            supports_multi_tenancy=True,
            index_types=["ivfflat", "hnsw"],
            python_packages=["psycopg2-binary", "pgvector"],
            external_dependencies=["postgresql", "pgvector-extension"],
            best_for="Existing PostgreSQL users, ACID compliance, complex queries",
            popular_use_cases=["enterprise_rag", "data_warehousing", "analytics"],
        ),
        VectorStoreInfo(
            name="ClickHouse",
            description="Column-oriented database with vector search capabilities",
            type=VectorStoreType.DATABASE,
            cost=CostTier.FREE,
            auth_required=True,
            setup_complexity="complex",
            performance_tier="excellent",
            supports_metadata_filtering=True,
            supports_hybrid_search=False,
            max_dimensions=65536,
            python_packages=["clickhouse-connect"],
            external_dependencies=["clickhouse-server"],
            best_for="Analytics workloads, large-scale data, time-series",
            popular_use_cases=["analytics", "time_series", "large_scale_search"],
        ),
    ]

    search_engines = [
        VectorStoreInfo(
            name="Elasticsearch",
            description="Search engine with vector search capabilities",
            type=VectorStoreType.SEARCH_ENGINE,
            cost=CostTier.FREEMIUM,
            auth_required=False,  # For local
            setup_complexity="medium",
            performance_tier="excellent",
            supports_metadata_filtering=True,
            supports_hybrid_search=True,
            supports_multi_tenancy=True,
            python_packages=["elasticsearch"],
            external_dependencies=["elasticsearch-server"],
            best_for="Full-text + vector search, existing Elasticsearch users",
            popular_use_cases=["hybrid_search", "enterprise_search", "log_analysis"],
        ),
    ]

    catalogue = (
        local_stores
        + hybrid_stores
        + cloud_stores
        + database_extensions
        + search_engines
    )
    # Every entry is keyed by its own ``name``.
    return {entry.name: entry for entry in catalogue}
def filter_vectorstores(
    type_filter: VectorStoreType | None = None,
    cost_filter: CostTier | None = None,
    auth_required: bool | None = None,
    supports_metadata: bool | None = None,
    supports_hybrid: bool | None = None,
    setup_complexity: str | None = None,
) -> dict[str, VectorStoreInfo]:
    """Select the providers that satisfy every supplied criterion.

    Criteria left unset are ignored; a provider is kept only if it matches
    all the others exactly.

    Args:
        type_filter: Required store type; ignored when falsy.
        cost_filter: Required cost tier; ignored when falsy.
        auth_required: Required auth flag; ignored when ``None``.
        supports_metadata: Required metadata-filtering flag; ignored when
            ``None``.
        supports_hybrid: Required hybrid-search flag; ignored when ``None``.
        setup_complexity: Required setup complexity; ignored when falsy.

    Returns:
        Dictionary of matching providers keyed by name, in catalogue order.

    Examples:
        Get free, easy-setup stores::

            stores = filter_vectorstores(
                cost_filter=CostTier.FREE,
                setup_complexity="easy"
            )

        Get hybrid search capable stores::

            stores = filter_vectorstores(supports_hybrid=True)
    """
    # (attribute on VectorStoreInfo, required value) for each active criterion.
    required: list[tuple[str, object]] = []
    if type_filter:
        required.append(("type", type_filter))
    if cost_filter:
        required.append(("cost", cost_filter))
    # Boolean criteria use an explicit None check so that False is a valid
    # requirement rather than "not set".
    if auth_required is not None:
        required.append(("auth_required", auth_required))
    if supports_metadata is not None:
        required.append(("supports_metadata_filtering", supports_metadata))
    if supports_hybrid is not None:
        required.append(("supports_hybrid_search", supports_hybrid))
    if setup_complexity:
        required.append(("setup_complexity", setup_complexity))

    return {
        provider: details
        for provider, details in get_vectorstore_providers().items()
        if all(getattr(details, attr) == wanted for attr, wanted in required)
    }
def recommend_vectorstore(use_case: str) -> list[str]:
    """Get vector store recommendations for specific use cases.

    ``"development"``, ``"production"``, ``"research"`` and ``"enterprise"``
    return fixed, curated lists. ``"free_only"``, ``"local_only"`` and
    ``"no_auth"`` are computed by filtering the provider catalogue. Any
    other value falls back to a general-purpose list.

    Args:
        use_case: Use case ("development", "production", "research",
            "enterprise", "free_only", "local_only", "no_auth").

    Returns:
        List of recommended vector store names (a new list on every call).

    Examples:
        Get development recommendations::

            dev_stores = recommend_vectorstore("development")
            # Returns: ["Chroma", "InMemory", "FAISS"]

        Get production recommendations::

            prod_stores = recommend_vectorstore("production")
            # Returns: ["Pinecone", "Qdrant", "Weaviate", "Chroma"]
    """
    # Built per call so callers can freely mutate the list they receive.
    curated = {
        "development": ["Chroma", "InMemory", "FAISS"],
        "production": ["Pinecone", "Qdrant", "Weaviate", "Chroma"],
        "research": ["FAISS", "Annoy", "Chroma", "Qdrant"],
        "enterprise": ["PGVector", "Elasticsearch", "Weaviate", "Qdrant"],
    }
    if use_case in curated:
        return curated[use_case]

    # Catalogue-derived recommendations: names of every matching provider.
    if use_case == "free_only":
        return list(filter_vectorstores(cost_filter=CostTier.FREE))
    if use_case == "local_only":
        return list(filter_vectorstores(type_filter=VectorStoreType.LOCAL))
    if use_case == "no_auth":
        return list(filter_vectorstores(auth_required=False))

    # Default general recommendations
    return ["Chroma", "Pinecone", "FAISS", "Qdrant"]
def get_setup_instructions(provider_name: str) -> str:
    """Render a short setup guide for one provider.

    The guide lists the provider's description, type, cost and setup
    complexity, followed by a ``pip install`` line, external dependencies,
    what the provider is best for, and its popular use cases. Sections
    whose underlying list is empty are omitted.

    Args:
        provider_name: Name of the vector store provider, as used for the
            keys of ``get_vectorstore_providers()``.

    Returns:
        The formatted guide, or a "not found" message when the name is not
        in the catalogue.

    Examples:
        Get Chroma setup::

            instructions = get_setup_instructions("Chroma")
            print(instructions)
    """
    entry = get_vectorstore_providers().get(provider_name)
    if not entry:
        return f"Provider '{provider_name}' not found."

    # Collect the pieces and join once at the end.
    chunks = [
        f"# {entry.name} Setup Instructions\n\n",
        f"Description: {entry.description}\n",
        f"Type: {entry.type.value}\n",
        f"Cost: {entry.cost.value}\n",
        f"Setup Complexity: {entry.setup_complexity}\n\n",
    ]

    if entry.python_packages:
        chunks.append("## Python Installation\n")
        chunks.append(f"pip install {' '.join(entry.python_packages)}\n\n")

    if entry.external_dependencies:
        chunks.append("## External Dependencies\n")
        chunks.extend(f"- {dep}\n" for dep in entry.external_dependencies)
        chunks.append("\n")

    chunks.append(f"## Best For\n{entry.best_for}\n\n")

    if entry.popular_use_cases:
        chunks.append("## Popular Use Cases\n")
        chunks.extend(f"- {item}\n" for item in entry.popular_use_cases)

    return "".join(chunks)
def compare_vectorstores(provider_names: list[str]) -> str:
    """Render a side-by-side text comparison of the named providers.

    Names not present in the catalogue are ignored, and rows appear in
    catalogue order rather than in the order of ``provider_names``.

    Args:
        provider_names: List of provider names to compare.

    Returns:
        A summary table followed by a per-provider capability list, or a
        fixed message when none of the names is known.

    Examples:
        Compare popular stores::

            comparison = compare_vectorstores(["Chroma", "Pinecone", "FAISS"])
            print(comparison)
    """
    chosen = {
        key: entry
        for key, entry in get_vectorstore_providers().items()
        if key in provider_names
    }
    if not chosen:
        return "No valid providers found for comparison."

    def mark(flag: bool) -> str:
        """Return a tick for a truthy capability flag, a cross otherwise."""
        return "✓" if flag else "✗"

    # Title, then the table header and its rule.
    pieces = [
        "Vector Store Comparison\n",
        "=" * 50 + "\n\n",
        f"{'Provider':<15} {'Type':<10} {'Cost':<10} {'Auth':<6} {'Setup':<8} {'Perf':<8}\n",
        "-" * 65 + "\n",
    ]

    # One table row per selected provider.
    for key, entry in chosen.items():
        auth_label = "Yes" if entry.auth_required else "No"
        pieces.append(
            f"{key:<15} {entry.type.value:<10} {entry.cost.value:<10} {auth_label:<6} "
            f"{entry.setup_complexity:<8} {entry.performance_tier:<8}\n"
        )
    pieces.append("\n")

    # Detailed capabilities section.
    pieces.append("Capabilities:\n")
    pieces.append("-" * 20 + "\n")
    for key, entry in chosen.items():
        pieces.extend(
            [
                f"\n{key}:\n",
                f" Metadata Filtering: {mark(entry.supports_metadata_filtering)}\n",
                f" Hybrid Search: {mark(entry.supports_hybrid_search)}\n",
                f" Multi-tenancy: {mark(entry.supports_multi_tenancy)}\n",
                f" Real-time Updates: {mark(entry.supports_real_time_updates)}\n",
                f" Best For: {entry.best_for}\n",
            ]
        )

    return "".join(pieces)