Scraping¶
pyfetcher provides a comprehensive set of scraping utilities built on BeautifulSoup.
CSS Selector Extraction¶
from pyfetcher.scrape.selectors import extract_text, extract_attrs, extract_table
# Extract text from elements
titles = extract_text(html, "h1")
# Extract attributes
links = extract_attrs(html, "a", attrs=["href", "title"])
# Parse HTML tables
rows = extract_table(html, "table.data")
Link Harvesting¶
from pyfetcher.scrape.links import extract_links
links = extract_links(html, base_url="https://example.com")
internal = [l for l in links if not l.is_external]
Form Parsing¶
from pyfetcher.scrape.forms import extract_forms
forms = extract_forms(html, base_url="https://example.com")
login_form = forms[0]
print(login_form.action, login_form.method)
print(login_form.to_dict()) # Field names -> default values
Robots.txt¶
from pyfetcher.scrape.robots import parse_robots_txt, is_allowed
rules = parse_robots_txt(robots_txt_content)
if is_allowed(rules, "/admin", user_agent="MyBot"):
    print("Path is allowed")
Sitemap Parsing¶
from pyfetcher.scrape.sitemap import parse_sitemap
entries = parse_sitemap(sitemap_xml)
for entry in entries:
    print(entry.loc, entry.lastmod)
Content Extraction¶
from pyfetcher.scrape.content import extract_readable_text
# Strips scripts, styles, nav, footer
text = extract_readable_text(html)
# Target a specific element
text = extract_readable_text(html, selector="article")