Source code for pyfetcher.scrape.forms

"""HTML form extraction for :mod:`pyfetcher`.

Purpose:
    Parse ``<form>`` elements from HTML and extract their fields, making it
    easy to build form submission requests programmatically.

Examples:
    ::

        >>> html = '<form action="/login" method="post"><input name="user"/></form>'
        >>> forms = extract_forms(html, base_url="https://example.com")
        >>> forms[0].action
        'https://example.com/login'
"""

from __future__ import annotations

from dataclasses import dataclass, field
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag


[docs] @dataclass(frozen=True, slots=True) class FormField: """A single form input field. Args: name: The field's ``name`` attribute. type: The field's ``type`` attribute (e.g. ``'text'``, ``'hidden'``). value: The field's default ``value`` attribute. options: For ``<select>`` elements, the list of ``<option>`` values. Examples: :: >>> field = FormField(name="user", type="text", value="") >>> field.name 'user' """ name: str type: str value: str options: list[str] = field(default_factory=list)
[docs] @dataclass(frozen=True, slots=True) class FormData: """Parsed HTML form. Args: action: The resolved form action URL. method: The HTTP method (uppercased, e.g. ``'GET'``, ``'POST'``). fields: List of form fields. id: The form's ``id`` attribute, if present. name: The form's ``name`` attribute, if present. Examples: :: >>> form = FormData(action="https://example.com/login", method="POST", fields=[]) >>> form.method 'POST' """ action: str method: str fields: list[FormField] id: str | None = None name: str | None = None
[docs] def to_dict(self) -> dict[str, str]: """Convert form fields to a submission dictionary. Returns a dictionary mapping field names to their default values, suitable for use as POST data or query parameters. Returns: A dictionary of field names to values. Examples: :: >>> form = FormData( ... action="/submit", method="POST", ... fields=[FormField(name="q", type="text", value="hello")], ... ) >>> form.to_dict() {'q': 'hello'} """ return {f.name: f.value for f in self.fields if f.name}
def extract_forms(html: str, *, base_url: str | None = None) -> list[FormData]:
    """Extract all forms from HTML.

    Parses every ``<form>`` element with BeautifulSoup's ``html.parser`` and
    collects its ``<input>``, ``<textarea>`` and ``<select>`` descendants as
    :class:`FormField` objects. Within each form the fields are grouped by
    element kind: all inputs first, then textareas, then selects.

    Args:
        html: Raw HTML string to parse.
        base_url: Base URL for resolving relative form action URLs. When given
            and the form has no (or an empty) ``action``, the base URL itself
            is used as the action.

    Returns:
        A list of :class:`FormData` objects, one per ``<form>`` found, in
        document order.

    Notes:
        * A missing, empty or whitespace-only ``method`` attribute yields
          ``'GET'``.
        * For a ``<select>``, the value is that of the last option carrying a
          ``selected`` attribute (even if that value is the empty string);
          only when no option is selected does it fall back to the first
          option, or ``''`` if there are no options.

    Examples:
        ::

            >>> html = '<form action="/search"><input name="q" value=""/></form>'
            >>> forms = extract_forms(html, base_url="https://example.com")
            >>> forms[0].action
            'https://example.com/search'
    """
    soup = BeautifulSoup(html, "html.parser")
    results: list[FormData] = []

    for form in soup.find_all("form"):
        if not isinstance(form, Tag):
            continue

        action = form.get("action", "")
        if base_url and action:
            action = urljoin(base_url, str(action))
        elif base_url:
            # No action attribute: the form submits to the page's own URL.
            action = base_url

        # An absent *or blank* method attribute means GET; previously a blank
        # attribute produced an empty method string.
        method = str(form.get("method") or "GET").strip().upper() or "GET"

        fields: list[FormField] = [
            FormField(
                name=str(inp.get("name", "")),
                type=str(inp.get("type", "text")),
                value=str(inp.get("value", "")),
            )
            for inp in form.find_all("input")
            if isinstance(inp, Tag)
        ]

        fields.extend(
            FormField(
                name=str(textarea.get("name", "")),
                type="textarea",
                value=textarea.get_text(),
            )
            for textarea in form.find_all("textarea")
            if isinstance(textarea, Tag)
        )

        for sel in form.find_all("select"):
            if not isinstance(sel, Tag):
                continue

            options: list[str] = []
            # None means "no option was marked selected"; this must stay
            # distinct from a selected option whose value is "".
            selected_value: str | None = None
            for option in sel.find_all("option"):
                if not isinstance(option, Tag):
                    continue
                # Without a value attribute, the option's text is its value.
                val = str(option.get("value", option.get_text(strip=True)))
                options.append(val)
                if option.get("selected") is not None:
                    selected_value = val

            if selected_value is None:
                selected_value = options[0] if options else ""

            fields.append(
                FormField(
                    name=str(sel.get("name", "")),
                    type="select",
                    value=selected_value,
                    options=options,
                )
            )

        results.append(
            FormData(
                action=str(action),
                method=method,
                fields=fields,
                id=form.get("id"),
                name=form.get("name"),
            )
        )

    return results