Source code for haive.core.engine.retriever.providers.QdrantSparseVectorRetrieverConfig

"""Qdrant Sparse Vector Retriever implementation for the Haive framework.

This module provides a configuration class for the Qdrant Sparse Vector retriever,
which uses Qdrant's sparse vector capabilities for keyword-based and hybrid search.
Qdrant supports both dense and sparse vectors, enabling efficient text search
using sparse embeddings like BM25 or TF-IDF representations.

The QdrantSparseVectorRetriever works by:
1. Connecting to a Qdrant instance
2. Using sparse vector representations for text search
3. Supporting efficient keyword matching and retrieval
4. Enabling hybrid dense + sparse vector search

This retriever is particularly useful when you:
- Need efficient keyword-based search with Qdrant
- Want to combine dense and sparse vector search
- Are building hybrid retrieval systems
- Use Qdrant for production vector search
- Need high-performance text matching

The implementation integrates with LangChain's QdrantSparseVectorRetriever while
providing a consistent Haive configuration interface with secure API key management.
"""

from typing import Any

from langchain_core.documents import Document
from pydantic import Field, SecretStr

from haive.core.common.mixins.secure_config import SecureConfigMixin
from haive.core.engine.retriever.retriever import BaseRetrieverConfig
from haive.core.engine.retriever.types import RetrieverType



[docs]
@BaseRetrieverConfig.register(RetrieverType.QDRANT_SPARSE_VECTOR)
class QdrantSparseVectorRetrieverConfig(SecureConfigMixin, BaseRetrieverConfig):
    """Configuration for Qdrant Sparse Vector retriever in the Haive framework.

    This retriever uses Qdrant's sparse vector capabilities to provide efficient
    keyword-based search and hybrid dense + sparse vector retrieval.

    Attributes:
        retriever_type (RetrieverType): The type of retriever (always QDRANT_SPARSE_VECTOR).
        qdrant_url (str): Qdrant instance URL.
        collection_name (str): Name of the Qdrant collection.
        api_key (Optional[SecretStr]): Qdrant API key (auto-resolved from QDRANT_API_KEY).
        provider (str): Provider name used for API key resolution (default "qdrant").
        k (int): Number of documents to retrieve (1-100, default 10).
        sparse_vector_name (str): Name of the sparse vector field.
        enable_hybrid_search (bool): Whether to combine with dense vectors.
        dense_vector_name (str): Name of the dense vector field; only forwarded
            when ``enable_hybrid_search`` is true.
        hybrid_fusion_method (str): Fusion method for hybrid search ('rrf' or
            'linear'); only forwarded when ``enable_hybrid_search`` is true.
        filter_conditions (Optional[dict]): Qdrant filter conditions; forwarded
            only when non-empty.
        score_threshold (Optional[float]): Minimum score threshold (0.0-1.0);
            forwarded whenever it is not ``None``, including an explicit 0.0.
        sparse_encoder_model (Optional[str]): Sparse encoder model name;
            forwarded only when non-empty.
        timeout (Optional[float]): Request timeout in seconds (1-300, default 60).

    Examples:
        >>> from haive.core.engine.retriever import QdrantSparseVectorRetrieverConfig
        >>>
        >>> # Create the Qdrant sparse vector retriever config
        >>> config = QdrantSparseVectorRetrieverConfig(
        ...     name="qdrant_sparse_retriever",
        ...     qdrant_url="https://my-cluster.qdrant.tech",
        ...     collection_name="documents",
        ...     k=10,
        ...     sparse_vector_name="sparse_text",
        ...     enable_hybrid_search=False
        ... )
        >>>
        >>> # Instantiate and use the retriever
        >>> retriever = config.instantiate()
        >>> docs = retriever.invoke("machine learning algorithms")
        >>>
        >>> # Example with hybrid search
        >>> hybrid_config = QdrantSparseVectorRetrieverConfig(
        ...     name="qdrant_hybrid_retriever",
        ...     qdrant_url="https://my-cluster.qdrant.tech",
        ...     collection_name="documents",
        ...     enable_hybrid_search=True,
        ...     hybrid_fusion_method="rrf"
        ... )
    """

    retriever_type: RetrieverType = Field(
        default=RetrieverType.QDRANT_SPARSE_VECTOR, description="The type of retriever"
    )

    # Qdrant connection configuration
    qdrant_url: str = Field(
        ..., description="Qdrant instance URL (e.g., 'https://my-cluster.qdrant.tech')"
    )

    collection_name: str = Field(
        ..., description="Name of the Qdrant collection to search"
    )

    # API configuration with SecureConfigMixin
    api_key: SecretStr | None = Field(
        default=None, description="Qdrant API key (auto-resolved from QDRANT_API_KEY)"
    )

    # Provider for SecureConfigMixin
    provider: str = Field(
        default="qdrant", description="Provider name for API key resolution"
    )

    # Search parameters
    k: int = Field(
        default=10, ge=1, le=100, description="Number of documents to retrieve"
    )

    sparse_vector_name: str = Field(
        default="sparse_text", description="Name of the sparse vector field in Qdrant"
    )

    # Hybrid search configuration
    enable_hybrid_search: bool = Field(
        default=False,
        description="Whether to combine sparse vectors with dense vectors",
    )

    dense_vector_name: str = Field(
        default="dense",
        description="Name of the dense vector field (used in hybrid search)",
    )

    hybrid_fusion_method: str = Field(
        default="rrf",
        description="Fusion method for hybrid search: 'rrf' (Reciprocal Rank Fusion), 'linear'",
    )

    # Search filtering
    filter_conditions: dict[str, Any] | None = Field(
        default=None, description="Qdrant filter conditions for search results"
    )

    # Advanced parameters
    score_threshold: float | None = Field(
        default=None, ge=0.0, le=1.0, description="Minimum score threshold for results"
    )

    sparse_encoder_model: str | None = Field(
        default=None, description="Sparse encoder model name (e.g., 'splade++', 'bm25')"
    )

    # Connection parameters
    timeout: float | None = Field(
        default=60.0, ge=1.0, le=300.0, description="Request timeout in seconds"
    )

    def get_input_fields(self) -> dict[str, tuple[type, Any]]:
        """Return input field definitions for Qdrant Sparse Vector retriever.

        Returns:
            A mapping with a single ``"query"`` entry: a ``(str, Field)`` pair
            describing the search query text.
        """
        return {
            "query": (str, Field(description="Sparse vector search query for Qdrant")),
        }

    def get_output_fields(self) -> dict[str, tuple[type, Any]]:
        """Return output field definitions for Qdrant Sparse Vector retriever.

        Returns:
            A mapping with a single ``"documents"`` entry: a
            ``(list[Document], Field)`` pair whose default is an empty list.
        """
        return {
            "documents": (
                list[Document],
                Field(
                    default_factory=list,
                    description="Documents from Qdrant sparse vector search",
                ),
            ),
        }

    def instantiate(self) -> Any:
        """Create a Qdrant Sparse Vector retriever from this configuration.

        Builds a ``QdrantClient`` from ``qdrant_url``, ``timeout`` and the
        resolved API key (if any), then passes the client and the search
        settings to ``QdrantSparseVectorRetriever``. Optional settings are
        forwarded only when they are set.

        Returns:
            QdrantSparseVectorRetriever: Instantiated retriever ready for sparse vector search.

        Raises:
            ImportError: If ``langchain-qdrant`` or ``qdrant-client`` cannot be
                imported. The original import failure is chained as the cause.

        Note:
            This method performs no validation of its own; any other error
            (for example a rejected keyword argument) originates from
            ``get_api_key``, ``QdrantClient`` or the retriever constructor.
        """
        try:
            # NOTE(review): confirm this import path against the installed
            # langchain-qdrant release; the retriever class may instead live in
            # langchain_community.retrievers — TODO verify.
            from langchain_qdrant.retrievers import QdrantSparseVectorRetriever
            from qdrant_client import QdrantClient
        except ImportError as err:
            # Chain the cause so the traceback shows which package is missing.
            raise ImportError(
                "QdrantSparseVectorRetriever requires langchain-qdrant and qdrant-client packages. "
                "Install with: pip install langchain-qdrant qdrant-client"
            ) from err

        # Get API key using SecureConfigMixin
        api_key = self.get_api_key()

        # Create Qdrant client; the API key is only passed when one was resolved
        client_kwargs: dict[str, Any] = {
            "url": self.qdrant_url,
            "timeout": self.timeout,
        }
        if api_key:
            client_kwargs["api_key"] = api_key

        client = QdrantClient(**client_kwargs)

        # Prepare retriever configuration
        config: dict[str, Any] = {
            "client": client,
            "collection_name": self.collection_name,
            "sparse_vector_name": self.sparse_vector_name,
            "k": self.k,
        }

        # Add hybrid search configuration
        # NOTE(review): assumes the retriever accepts these keyword names —
        # verify against the retriever class's declared fields.
        if self.enable_hybrid_search:
            config["enable_hybrid"] = True
            config["dense_vector_name"] = self.dense_vector_name
            config["fusion_method"] = self.hybrid_fusion_method

        # Add optional parameters
        if self.filter_conditions:
            config["filter"] = self.filter_conditions

        # Compare against None: 0.0 is a valid, explicitly configured threshold
        # (the field allows ge=0.0) and must not be dropped as falsy.
        if self.score_threshold is not None:
            config["score_threshold"] = self.score_threshold

        if self.sparse_encoder_model:
            config["sparse_encoder"] = self.sparse_encoder_model

        return QdrantSparseVectorRetriever(**config)