"""Web Loaders for Document Engine.
This module implements specialized web loaders for different types of web content
including GitHub, ArXiv, Wikipedia, and general web pages.
"""
import logging
from typing import Any
from urllib.parse import urlparse
from langchain_core.document_loaders.base import BaseLoader
from haive.core.engine.document.loaders.sources.implementation import (
CredentialType,
WebUrlSource,
)
logger = logging.getLogger(__name__)
class GitHubSource(WebUrlSource):
    """GitHub repository and content source.

    Depending on configuration, loads either repository file contents
    (via ``GitLoader``) or issues / pull requests (via
    ``GitHubIssuesLoader``).
    """

    def __init__(
        self,
        repo_url: str,
        file_filter: list[str] | None = None,
        include_issues: bool = False,
        include_pull_requests: bool = False,
        **kwargs,
    ):
        """Initialize the GitHub source.

        Args:
            repo_url: Repository URL, e.g. ``https://github.com/owner/repo``.
            file_filter: Substring patterns; only files whose path contains
                at least one pattern are loaded. Empty/None loads all files.
            include_issues: If True, load issues instead of repository files.
            include_pull_requests: When loading issues, also include PRs.
            **kwargs: Forwarded to ``WebUrlSource``.
        """
        super().__init__(source_path=repo_url, **kwargs)
        self.repo_url = repo_url
        self.file_filter = file_filter or []
        self.include_issues = include_issues
        self.include_pull_requests = include_pull_requests
        self.allowed_domains = ["github.com", "api.github.com"]

    def can_handle(self, path: str) -> bool:
        """Check if this is a GitHub URL (github.com or a subdomain)."""
        try:
            host = urlparse(path).netloc.lower()
            # Exact host / subdomain match; a plain substring test would
            # also accept look-alike hosts such as "notgithub.com".
            return host == "github.com" or host.endswith(".github.com")
        except Exception:
            return False

    def get_confidence_score(self, path: str) -> float:
        """Get confidence score for GitHub URLs."""
        if not self.can_handle(path):
            return 0.0
        return 0.95

    def requires_authentication(self) -> bool:
        """GitHub may require authentication for private repos."""
        return True

    def get_credential_requirements(self) -> list[CredentialType]:
        """GitHub needs an access token or API key."""
        return [CredentialType.ACCESS_TOKEN, CredentialType.API_KEY]

    def create_loader(self) -> BaseLoader | None:
        """Create a GitHub loader.

        Returns:
            A ``GitHubIssuesLoader`` when ``include_issues`` is set,
            otherwise a ``GitLoader`` for repository content; ``None`` when
            dependencies are missing or loader creation fails.
        """
        try:
            import tempfile
            from pathlib import Path

            from langchain_community.document_loaders import (
                GitHubIssuesLoader,
                GitLoader,
            )

            # Get GitHub token if available (needed for private repos and
            # higher API rate limits).
            github_token = None
            if self.credential_manager:
                cred = self.credential_manager.get_credential("github")
                if cred and cred.credential_type in [
                    CredentialType.ACCESS_TOKEN,
                    CredentialType.API_KEY,
                ]:
                    github_token = cred.value

            # Parse GitHub URL to extract "owner/name" from the path.
            parsed = urlparse(self.repo_url)
            path_parts = parsed.path.strip("/").split("/")
            if len(path_parts) < 2:
                raise ValueError(f"Invalid GitHub URL format: {self.repo_url}")
            repo_owner, repo_name = path_parts[0], path_parts[1]

            if self.include_issues:
                return GitHubIssuesLoader(
                    repo=f"{repo_owner}/{repo_name}",
                    access_token=github_token,
                    include_prs=self.include_pull_requests,
                )

            # Use GitLoader for repository content; clone into the platform
            # temp directory rather than a hard-coded "/tmp" path.
            clone_dir = (
                Path(tempfile.gettempdir()) / "git_repos" / f"{repo_owner}_{repo_name}"
            )
            return GitLoader(
                clone_url=self.repo_url,
                repo_path=str(clone_dir),
                file_filter=lambda file_path: (
                    any(pattern in file_path for pattern in self.file_filter)
                    if self.file_filter
                    else True
                ),
            )
        except ImportError:
            logger.warning(
                "GitHub loaders not available. Install with: pip install pygithub gitpython"
            )
            return None
        except Exception as e:
            logger.exception(f"Failed to create GitHub loader: {e}")
            return None
class ArXivSource(WebUrlSource):
    """ArXiv research paper source.

    Loads either a single paper by ArXiv identifier or the results of a
    free-text search query.
    """

    def __init__(
        self,
        query: str | None = None,
        paper_id: str | None = None,
        max_results: int = 10,
        **kwargs,
    ):
        """Initialize the ArXiv source.

        Args:
            query: Free-text search query (used when ``paper_id`` is not set).
            paper_id: ArXiv paper identifier, e.g. ``"2301.00001"``.
            max_results: Maximum number of papers to load for a search query.
            **kwargs: Forwarded to ``WebUrlSource``.
        """
        # Synthesize a stable pseudo-path so the base class has a source id.
        source_path = f"arxiv:{paper_id}" if paper_id else f"arxiv:search:{query}"
        super().__init__(source_path=source_path, **kwargs)
        self.query = query
        self.paper_id = paper_id
        self.max_results = max_results

    def can_handle(self, path: str) -> bool:
        """Check if this is an ArXiv identifier ("arxiv:...") or URL."""
        try:
            if path.startswith("arxiv:"):
                return True
            host = urlparse(path).netloc.lower()
            # Exact host / subdomain match avoids look-alike domains.
            return host == "arxiv.org" or host.endswith(".arxiv.org")
        except Exception:
            return False

    def get_confidence_score(self, path: str) -> float:
        """Get confidence score for ArXiv sources."""
        if not self.can_handle(path):
            return 0.0
        return 0.9

    def create_loader(self) -> BaseLoader | None:
        """Create an ArXiv loader.

        Returns:
            An ``ArxivLoader`` configured for the paper or search query, or
            ``None`` when dependencies are missing or creation fails.
        """
        try:
            from langchain_community.document_loaders import ArxivLoader

            if self.paper_id:
                # Load specific paper (ArxivLoader treats the id as a query).
                return ArxivLoader(query=self.paper_id, load_max_docs=1)
            if self.query:
                # Search for papers.
                return ArxivLoader(query=self.query, load_max_docs=self.max_results)
            raise ValueError("Either paper_id or query must be provided")
        except ImportError:
            logger.warning("ArxivLoader not available. Install with: pip install arxiv")
            return None
        except Exception as e:
            logger.exception(f"Failed to create ArXiv loader: {e}")
            return None
class WikipediaSource(WebUrlSource):
    """Wikipedia article source.

    Loads articles by page title or search query in a given language.
    """

    def __init__(
        self,
        query: str | None = None,
        page_title: str | None = None,
        lang: str = "en",
        load_max_docs: int = 1,
        **kwargs,
    ):
        """Initialize the Wikipedia source.

        Args:
            query: Free-text search query (used when ``page_title`` is not set).
            page_title: Exact article title; takes precedence over ``query``.
            lang: Wikipedia language edition code (default ``"en"``).
            load_max_docs: Maximum number of articles to load.
            **kwargs: Forwarded to ``WebUrlSource``.
        """
        # Synthesize a stable pseudo-path so the base class has a source id.
        source_path = f"wikipedia:{lang}:{page_title or query}"
        super().__init__(source_path=source_path, **kwargs)
        self.query = query
        self.page_title = page_title
        self.lang = lang
        self.load_max_docs = load_max_docs

    def can_handle(self, path: str) -> bool:
        """Check if this is a Wikipedia URL or "wikipedia:..." identifier."""
        try:
            if path.startswith("wikipedia:"):
                return True
            host = urlparse(path).netloc.lower()
            # Exact host / subdomain match (e.g. "en.wikipedia.org") avoids
            # accepting look-alike domains.
            return host == "wikipedia.org" or host.endswith(".wikipedia.org")
        except Exception:
            return False

    def get_confidence_score(self, path: str) -> float:
        """Get confidence score for Wikipedia sources."""
        if not self.can_handle(path):
            return 0.0
        return 0.9

    def create_loader(self) -> BaseLoader | None:
        """Create a Wikipedia loader.

        Returns:
            A ``WikipediaLoader`` for the title or query, or ``None`` when
            dependencies are missing or creation fails.
        """
        try:
            from langchain_community.document_loaders import WikipediaLoader

            search_query = self.page_title or self.query
            if not search_query:
                raise ValueError("Either page_title or query must be provided")
            return WikipediaLoader(
                query=search_query,
                lang=self.lang,
                load_max_docs=self.load_max_docs,
            )
        except ImportError:
            logger.warning(
                "WikipediaLoader not available. Install with: pip install wikipedia"
            )
            return None
        except Exception as e:
            logger.exception(f"Failed to create Wikipedia loader: {e}")
            return None
class PlaywrightWebSource(WebUrlSource):
    """Advanced web source using Playwright for JavaScript-heavy sites."""

    def __init__(
        self,
        urls: list[str],
        wait_until: str = "networkidle",
        headless: bool = True,
        **kwargs,
    ):
        """Initialize the Playwright web source.

        Args:
            urls: List of page URLs to load; the first one is used as the
                source path for the base class.
            wait_until: Playwright navigation wait condition
                (e.g. "load", "networkidle").
            headless: Run the browser without a visible window.
            **kwargs: Forwarded to ``WebUrlSource``.
        """
        super().__init__(source_path=urls[0] if urls else "", **kwargs)
        self.urls = urls
        # NOTE(review): wait_until is stored but never passed to the loader
        # in create_loader() — confirm whether it should be wired through.
        self.wait_until = wait_until
        self.headless = headless

    def can_handle(self, path: str) -> bool:
        """Check if this is a web URL suitable for Playwright."""
        try:
            parsed = urlparse(path)
            return parsed.scheme in ["http", "https"]
        except Exception:
            return False

    def get_confidence_score(self, path: str) -> float:
        """Get confidence score for web URLs (lower priority than basic web)."""
        if not self.can_handle(path):
            return 0.0
        return 0.6  # Lower than basic web loader for auto-selection

    def create_loader(self) -> BaseLoader | None:
        """Create a Playwright web loader.

        Returns:
            A ``PlaywrightURLLoader`` that strips common chrome (header,
            footer, nav, sidebar), or ``None`` when Playwright is missing
            or creation fails.
        """
        try:
            from langchain_community.document_loaders import PlaywrightURLLoader

            return PlaywrightURLLoader(
                urls=self.urls,
                remove_selectors=["header", "footer", "nav", ".sidebar"],
                continue_on_failure=True,
                headless=self.headless,
            )
        except ImportError:
            logger.warning(
                "PlaywrightURLLoader not available. Install with: pip install playwright"
            )
            return None
        except Exception as e:
            logger.exception(f"Failed to create Playwright loader: {e}")
            return None
class BasicWebSource(WebUrlSource):
    """Basic web source for simple (server-rendered) HTML pages."""

    def __init__(
        self,
        web_paths: list[str],
        requests_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ):
        """Initialize the basic web source.

        Args:
            web_paths: List of page URLs to load; the first one is used as
                the source path for the base class.
            requests_kwargs: Extra keyword arguments forwarded to the
                underlying ``requests`` calls (headers, timeout, ...).
            **kwargs: Forwarded to ``WebUrlSource``.
        """
        super().__init__(source_path=web_paths[0] if web_paths else "", **kwargs)
        self.web_paths = web_paths
        self.requests_kwargs = requests_kwargs or {}

    def can_handle(self, path: str) -> bool:
        """Check if this is a web URL."""
        try:
            parsed = urlparse(path)
            return parsed.scheme in ["http", "https"]
        except Exception:
            return False

    def get_confidence_score(self, path: str) -> float:
        """Get confidence score for web URLs."""
        if not self.can_handle(path):
            return 0.0
        return 0.7

    def create_loader(self) -> BaseLoader | None:
        """Create a basic web loader.

        Returns:
            A ``WebBaseLoader`` over the configured URLs, or ``None`` when
            dependencies are missing or creation fails.
        """
        try:
            from langchain_community.document_loaders import WebBaseLoader

            return WebBaseLoader(
                web_paths=self.web_paths,
                requests_kwargs=self.requests_kwargs,
            )
        except ImportError:
            logger.warning("WebBaseLoader not available")
            return None
        except Exception as e:
            logger.exception(f"Failed to create web loader: {e}")
            return None
# Export web sources: the public API of this module is the set of
# specialized WebUrlSource subclasses defined above (alphabetical order).
__all__ = [
"ArXivSource",
"BasicWebSource",
"GitHubSource",
"PlaywrightWebSource",
"WikipediaSource",
]