Source code for pyfetcher.scrape.forms

"""HTML form extraction for :mod:`pyfetcher`.

Purpose:
    Parse ``<form>`` elements from HTML and extract their fields, making it
    easy to build form submission requests programmatically.

Examples:
    ::

        >>> html = '<form action="/login" method="post"><input name="user"/></form>'
        >>> forms = extract_forms(html, base_url="https://example.com")
        >>> forms[0].action
        'https://example.com/login'
"""

from __future__ import annotations

from dataclasses import dataclass, field
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag



[docs]
@dataclass(frozen=True, slots=True)
class FormField:
    """A single form input field.

    Args:
        name: The field's ``name`` attribute.
        type: The field's ``type`` attribute (e.g. ``'text'``, ``'hidden'``).
        value: The field's default ``value`` attribute.
        options: For ``<select>`` elements, the list of ``<option>`` values.

    Examples:
        ::

            >>> field = FormField(name="user", type="text", value="")
            >>> field.name
            'user'
    """

    name: str
    type: str
    value: str
    options: list[str] = field(default_factory=list)




[docs]
@dataclass(frozen=True, slots=True)
class FormData:
    """Parsed HTML form.

    Args:
        action: The resolved form action URL.
        method: The HTTP method (uppercased, e.g. ``'GET'``, ``'POST'``).
        fields: List of form fields.
        id: The form's ``id`` attribute, if present.
        name: The form's ``name`` attribute, if present.

    Examples:
        ::

            >>> form = FormData(action="https://example.com/login", method="POST", fields=[])
            >>> form.method
            'POST'
    """

    action: str
    method: str
    fields: list[FormField]
    id: str | None = None
    name: str | None = None


[docs]
    def to_dict(self) -> dict[str, str]:
        """Convert form fields to a submission dictionary.

        Returns a dictionary mapping field names to their default values,
        suitable for use as POST data or query parameters.

        Returns:
            A dictionary of field names to values.

        Examples:
            ::

                >>> form = FormData(
                ...     action="/submit", method="POST",
                ...     fields=[FormField(name="q", type="text", value="hello")],
                ... )
                >>> form.to_dict()
                {'q': 'hello'}
        """
        return {f.name: f.value for f in self.fields if f.name}





[docs]
def extract_forms(html: str, *, base_url: str | None = None) -> list[FormData]:
    """Extract all forms from HTML.

    Parses ``<form>`` elements and their input fields (``<input>``,
    ``<textarea>``, ``<select>``) to produce structured form data.

    Args:
        html: Raw HTML string to parse.
        base_url: Base URL for resolving relative form action URLs.

    Returns:
        A list of :class:`FormData` objects.

    Examples:
        ::

            >>> html = '<form action="/search"><input name="q" value=""/></form>'
            >>> forms = extract_forms(html, base_url="https://example.com")
            >>> forms[0].action
            'https://example.com/search'
    """
    soup = BeautifulSoup(html, "html.parser")
    results: list[FormData] = []

    for form in soup.find_all("form"):
        if not isinstance(form, Tag):
            continue

        action = form.get("action", "")
        if base_url and action:
            action = urljoin(base_url, str(action))
        elif base_url:
            action = base_url

        method = str(form.get("method", "GET")).upper()
        fields: list[FormField] = []

        for inp in form.find_all("input"):
            if not isinstance(inp, Tag):
                continue
            name = str(inp.get("name", ""))
            fields.append(
                FormField(
                    name=name,
                    type=str(inp.get("type", "text")),
                    value=str(inp.get("value", "")),
                )
            )

        for textarea in form.find_all("textarea"):
            if not isinstance(textarea, Tag):
                continue
            name = str(textarea.get("name", ""))
            fields.append(
                FormField(
                    name=name,
                    type="textarea",
                    value=textarea.get_text(),
                )
            )

        for sel in form.find_all("select"):
            if not isinstance(sel, Tag):
                continue
            name = str(sel.get("name", ""))
            options = []
            selected_value = ""
            for option in sel.find_all("option"):
                if isinstance(option, Tag):
                    val = str(option.get("value", option.get_text(strip=True)))
                    options.append(val)
                    if option.get("selected") is not None:
                        selected_value = val
            if not selected_value and options:
                selected_value = options[0]
            fields.append(
                FormField(
                    name=name,
                    type="select",
                    value=selected_value,
                    options=options,
                )
            )

        results.append(
            FormData(
                action=str(action),
                method=method,
                fields=fields,
                id=form.get("id"),
                name=form.get("name"),
            )
        )

    return results