# Source code for haive.core.engine.document.loaders.specific.web

"""Web Loaders for Document Engine.

This module implements specialized web loaders for different types of web content
including GitHub, ArXiv, Wikipedia, and general web pages.
"""

import logging
from typing import Any
from urllib.parse import urlparse

from langchain_core.document_loaders.base import BaseLoader

from haive.core.engine.document.loaders.sources.implementation import (
    CredentialType,
    WebUrlSource,
)

logger = logging.getLogger(__name__)


class GitHubSource(WebUrlSource):
    """GitHub repository and content source.

    Depending on ``include_issues``, :meth:`create_loader` builds either a
    ``GitHubIssuesLoader`` (issues, optionally pull requests) or a
    ``GitLoader`` (repository files).
    """

    def __init__(
        self,
        repo_url: str,
        file_filter: list[str] | None = None,
        include_issues: bool = False,
        include_pull_requests: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize the source for a GitHub repository URL.

        Args:
            repo_url: URL of the repository. Its path must start with
                ``owner/repo`` for :meth:`create_loader` to succeed.
            file_filter: Substrings matched against file paths when loading
                repository content; a file is kept when any of them occurs
                in its path. ``None`` or an empty list keeps every file.
            include_issues: When True, :meth:`create_loader` returns an
                issues loader instead of a repository-content loader.
            include_pull_requests: Forwarded as ``include_prs`` to the issues
                loader; only consulted when ``include_issues`` is True.
            **kwargs: Passed through unchanged to the base class initializer.
        """
        super().__init__(source_path=repo_url, **kwargs)
        self.repo_url = repo_url
        self.file_filter = file_filter or []
        self.include_issues = include_issues
        self.include_pull_requests = include_pull_requests
        self.allowed_domains = ["github.com", "api.github.com"]

    def can_handle(self, path: str) -> bool:
        """Check if this is a GitHub URL.

        The host must be exactly ``github.com`` or one of its subdomains.
        A plain substring test would also accept look-alike hosts such as
        ``evilgithub.com`` or ``github.com.evil.example``.

        Args:
            path: Candidate URL.

        Returns:
            True when the URL's host is ``github.com`` or a subdomain of it;
            False otherwise, including when the value cannot be parsed.
        """
        try:
            # ``hostname`` is lower-cased and excludes userinfo and port.
            host = urlparse(path).hostname or ""
            return host == "github.com" or host.endswith(".github.com")
        except (AttributeError, TypeError, ValueError):
            return False

    def get_confidence_score(self, path: str) -> float:
        """Get confidence score for GitHub URLs.

        Returns:
            0.95 when :meth:`can_handle` accepts ``path``, otherwise 0.0.
        """
        if not self.can_handle(path):
            return 0.0
        return 0.95

    def requires_authentication(self) -> bool:
        """GitHub may require authentication for private repos.

        Returns:
            Always True.
        """
        return True

    def get_credential_requirements(self) -> list[CredentialType]:
        """GitHub needs API token.

        Returns:
            The credential types accepted for GitHub: access token and API key.
        """
        return [CredentialType.ACCESS_TOKEN, CredentialType.API_KEY]

    def create_loader(self) -> BaseLoader | None:
        """Create a GitHub loader.

        Returns:
            A ``GitHubIssuesLoader`` when ``include_issues`` is set, otherwise
            a ``GitLoader`` for the repository content. ``None`` is returned
            (after logging) when the optional dependencies are missing, the
            URL does not contain ``owner/repo``, or loader construction fails.
        """
        try:
            from langchain_community.document_loaders import (
                GitHubIssuesLoader,
                GitLoader,
            )

            # Get GitHub token if available
            github_token = None
            if self.credential_manager:
                cred = self.credential_manager.get_credential("github")
                if cred and cred.credential_type in (
                    CredentialType.ACCESS_TOKEN,
                    CredentialType.API_KEY,
                ):
                    github_token = cred.value

            # Parse GitHub URL to extract repo info. Empty segments (from
            # leading, trailing or doubled slashes) are ignored.
            path_parts = [
                part for part in urlparse(self.repo_url).path.split("/") if part
            ]
            if len(path_parts) < 2:
                raise ValueError(f"Invalid GitHub URL format: {self.repo_url}")

            repo_owner = path_parts[0]
            # Clone URLs commonly end in ".git"; that suffix is not part of
            # the repository name expected in "owner/repo" form.
            repo_name = path_parts[1].removesuffix(".git")
            if not repo_name:
                raise ValueError(f"Invalid GitHub URL format: {self.repo_url}")

            if self.include_issues:
                return GitHubIssuesLoader(
                    repo=f"{repo_owner}/{repo_name}",
                    access_token=github_token,
                    include_prs=self.include_pull_requests,
                )

            def _matches_filter(file_path: str) -> bool:
                # No patterns configured means every file is accepted.
                patterns = self.file_filter
                if not patterns:
                    return True
                return any(pattern in file_path for pattern in patterns)

            # Use GitLoader for repository content.
            # NOTE(review): the clone directory is a fixed path under /tmp.
            return GitLoader(
                clone_url=self.repo_url,
                repo_path=f"/tmp/git_repos/{repo_owner}_{repo_name}",
                file_filter=_matches_filter,
            )

        except ImportError:
            logger.warning(
                "GitHub loaders not available. Install with: pip install pygithub gitpython"
            )
            return None
        except Exception as e:
            logger.exception(f"Failed to create GitHub loader: {e}")
            return None
class ArXivSource(WebUrlSource):
    """ArXiv research paper source.

    Loads either one specific paper (``paper_id``) or the results of a
    search (``query``).
    """

    def __init__(
        self,
        query: str | None = None,
        paper_id: str | None = None,
        max_results: int = 10,
        **kwargs: Any,
    ) -> None:
        """Initialize the source from a paper id or a search query.

        The source path handed to the base class is ``arxiv:<paper_id>`` when
        a paper id is given, otherwise ``arxiv:search:<query>``.

        Args:
            query: Search query, used when ``paper_id`` is not given.
            paper_id: Identifier of a single paper; takes precedence over
                ``query``.
            max_results: Maximum number of documents loaded for a search
                query. Not used when ``paper_id`` is given.
            **kwargs: Passed through unchanged to the base class initializer.
        """
        source_path = f"arxiv:{paper_id}" if paper_id else f"arxiv:search:{query}"
        super().__init__(source_path=source_path, **kwargs)
        self.query = query
        self.paper_id = paper_id
        self.max_results = max_results

    def can_handle(self, path: str) -> bool:
        """Check if this is an ArXiv identifier or URL.

        The URL host must be exactly ``arxiv.org`` or one of its subdomains.
        A plain substring test would also accept look-alike hosts such as
        ``notarxiv.org`` or ``arxiv.org.evil.example``.

        Args:
            path: Candidate identifier (``arxiv:...``) or URL.

        Returns:
            True for ``arxiv:`` identifiers and for URLs hosted on
            ``arxiv.org`` or a subdomain of it; False otherwise, including
            when the value cannot be parsed.
        """
        try:
            if path.startswith("arxiv:"):
                return True
            # ``hostname`` is lower-cased and excludes userinfo and port.
            host = urlparse(path).hostname or ""
            return host == "arxiv.org" or host.endswith(".arxiv.org")
        except (AttributeError, TypeError, ValueError):
            return False

    def get_confidence_score(self, path: str) -> float:
        """Get confidence score for ArXiv sources.

        Returns:
            0.9 when :meth:`can_handle` accepts ``path``, otherwise 0.0.
        """
        if not self.can_handle(path):
            return 0.0
        return 0.9

    def create_loader(self) -> BaseLoader | None:
        """Create an ArXiv loader.

        Returns:
            An ``ArxivLoader`` limited to one document for a specific paper,
            or to ``max_results`` documents for a search query. ``None`` is
            returned (after logging) when the dependency is missing, neither
            ``paper_id`` nor ``query`` is set, or construction fails.
        """
        try:
            from langchain_community.document_loaders import ArxivLoader

            if self.paper_id:
                # Load specific paper
                return ArxivLoader(query=self.paper_id, load_max_docs=1)
            if self.query:
                # Search for papers
                return ArxivLoader(query=self.query, load_max_docs=self.max_results)
            raise ValueError("Either paper_id or query must be provided")

        except ImportError:
            logger.warning("ArxivLoader not available. Install with: pip install arxiv")
            return None
        except Exception as e:
            logger.exception(f"Failed to create ArXiv loader: {e}")
            return None
class WikipediaSource(WebUrlSource):
    """Wikipedia article source.

    Loads articles matching a page title or, when no title is given, a
    search query.
    """

    def __init__(
        self,
        query: str | None = None,
        page_title: str | None = None,
        lang: str = "en",
        load_max_docs: int = 1,
        **kwargs: Any,
    ) -> None:
        """Initialize the source from a page title or a search query.

        The source path handed to the base class is
        ``wikipedia:<lang>:<page_title or query>``.

        Args:
            query: Search query, used when ``page_title`` is not given.
            page_title: Title of the article; takes precedence over ``query``.
            lang: Language code forwarded to the loader.
            load_max_docs: Maximum number of documents forwarded to the
                loader.
            **kwargs: Passed through unchanged to the base class initializer.
        """
        source_path = f"wikipedia:{lang}:{page_title or query}"
        super().__init__(source_path=source_path, **kwargs)
        self.query = query
        self.page_title = page_title
        self.lang = lang
        self.load_max_docs = load_max_docs

    def can_handle(self, path: str) -> bool:
        """Check if this is a Wikipedia URL or identifier.

        The URL host must be exactly ``wikipedia.org`` or one of its
        subdomains (for example ``en.wikipedia.org``). A plain substring test
        would also accept look-alike hosts such as ``notwikipedia.org`` or
        ``wikipedia.org.evil.example``.

        Args:
            path: Candidate identifier (``wikipedia:...``) or URL.

        Returns:
            True for ``wikipedia:`` identifiers and for URLs hosted on
            ``wikipedia.org`` or a subdomain of it; False otherwise,
            including when the value cannot be parsed.
        """
        try:
            if path.startswith("wikipedia:"):
                return True
            # ``hostname`` is lower-cased and excludes userinfo and port.
            host = urlparse(path).hostname or ""
            return host == "wikipedia.org" or host.endswith(".wikipedia.org")
        except (AttributeError, TypeError, ValueError):
            return False

    def get_confidence_score(self, path: str) -> float:
        """Get confidence score for Wikipedia sources.

        Returns:
            0.9 when :meth:`can_handle` accepts ``path``, otherwise 0.0.
        """
        if not self.can_handle(path):
            return 0.0
        return 0.9

    def create_loader(self) -> BaseLoader | None:
        """Create a Wikipedia loader.

        Returns:
            A ``WikipediaLoader`` for ``page_title`` (or ``query`` when no
            title is set). ``None`` is returned (after logging) when the
            dependency is missing, neither value is set, or construction
            fails.
        """
        try:
            from langchain_community.document_loaders import WikipediaLoader

            search_query = self.page_title or self.query
            if not search_query:
                raise ValueError("Either page_title or query must be provided")

            return WikipediaLoader(
                query=search_query,
                lang=self.lang,
                load_max_docs=self.load_max_docs,
            )

        except ImportError:
            logger.warning(
                "WikipediaLoader not available. Install with: pip install wikipedia"
            )
            return None
        except Exception as e:
            logger.exception(f"Failed to create Wikipedia loader: {e}")
            return None
class PlaywrightWebSource(WebUrlSource):
    """Advanced web source using Playwright for JavaScript-heavy sites."""

    def __init__(
        self,
        urls: list[str],
        wait_until: str = "networkidle",
        headless: bool = True,
        **kwargs,
    ):
        """Set up the source for one or more page URLs.

        Args:
            urls: Pages to load. The first entry (or an empty string when
                the list is empty) becomes the base class source path.
            wait_until: Stored on the instance; ``create_loader`` does not
                currently forward it to the loader.
            headless: Forwarded to the loader as its ``headless`` option.
            **kwargs: Handed unchanged to the base class initializer.
        """
        primary_url = urls[0] if urls else ""
        super().__init__(source_path=primary_url, **kwargs)
        self.urls = urls
        self.wait_until = wait_until
        self.headless = headless

    def can_handle(self, path: str) -> bool:
        """Report whether ``path`` is an http(s) URL.

        Returns:
            True when the parsed scheme is ``http`` or ``https``; False
            otherwise, including when parsing raises.
        """
        try:
            return urlparse(path).scheme in ("http", "https")
        except Exception:
            return False

    def get_confidence_score(self, path: str) -> float:
        """Score how well this source fits ``path``.

        Returns:
            0.6 for URLs accepted by :meth:`can_handle`, otherwise 0.0.
        """
        # Lower than basic web loader for auto-selection
        return 0.6 if self.can_handle(path) else 0.0

    def create_loader(self) -> BaseLoader | None:
        """Build the Playwright-backed loader for the configured URLs.

        Returns:
            A ``PlaywrightURLLoader``, or ``None`` (after logging) when the
            dependency is missing or construction fails.
        """
        try:
            from langchain_community.document_loaders import PlaywrightURLLoader

            loader_options = {
                "urls": self.urls,
                "remove_selectors": ["header", "footer", "nav", ".sidebar"],
                "continue_on_failure": True,
                "headless": self.headless,
            }
            return PlaywrightURLLoader(**loader_options)
        except ImportError:
            logger.warning(
                "PlaywrightURLLoader not available. Install with: pip install playwright"
            )
        except Exception as e:
            logger.exception(f"Failed to create Playwright loader: {e}")
        return None
class BasicWebSource(WebUrlSource):
    """Basic web source for simple HTML pages."""

    def __init__(
        self,
        web_paths: list[str],
        requests_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ):
        """Set up the source for one or more page URLs.

        Args:
            web_paths: Pages to load. The first entry (or an empty string
                when the list is empty) becomes the base class source path.
            requests_kwargs: Options forwarded to the loader as its
                ``requests_kwargs``; ``None`` is stored as an empty dict.
            **kwargs: Handed unchanged to the base class initializer.
        """
        primary_path = web_paths[0] if web_paths else ""
        super().__init__(source_path=primary_path, **kwargs)
        self.web_paths = web_paths
        self.requests_kwargs = requests_kwargs or {}

    def can_handle(self, path: str) -> bool:
        """Report whether ``path`` is an http(s) URL.

        Returns:
            True when the parsed scheme is ``http`` or ``https``; False
            otherwise, including when parsing raises.
        """
        try:
            return urlparse(path).scheme in ("http", "https")
        except Exception:
            return False

    def get_confidence_score(self, path: str) -> float:
        """Score how well this source fits ``path``.

        Returns:
            0.7 for URLs accepted by :meth:`can_handle`, otherwise 0.0.
        """
        return 0.7 if self.can_handle(path) else 0.0

    def create_loader(self) -> BaseLoader | None:
        """Build the plain HTTP loader for the configured pages.

        Returns:
            A ``WebBaseLoader``, or ``None`` (after logging) when the
            dependency is missing or construction fails.
        """
        try:
            from langchain_community.document_loaders import WebBaseLoader

            loader_options = {
                "web_paths": self.web_paths,
                "requests_kwargs": self.requests_kwargs,
            }
            return WebBaseLoader(**loader_options)
        except ImportError:
            logger.warning("WebBaseLoader not available")
        except Exception as e:
            logger.exception(f"Failed to create web loader: {e}")
        return None
# Public API of this module: the web source classes defined above.
__all__ = [
    "ArXivSource",
    "BasicWebSource",
    "GitHubSource",
    "PlaywrightWebSource",
    "WikipediaSource",
]