Source code for scrawler.utils.validation_utils
"""Functions to make sure the specifications for a crawling/scraping are valid and work together correctly."""
from typing import List
import logging
from tld import get_tld
from scrawler.attributes import SearchAttributes, ExportAttributes, CrawlingAttributes
def validate_input_params(urls: List[str], search_attrs: SearchAttributes, export_attrs: ExportAttributes = None,
                          crawling_attrs: CrawlingAttributes = None, **kwargs):
    """Validate that all URLs work and the various attributes work together.

    Three consistency checks are run after the URLs themselves are validated:

    1. If ``export_attrs`` is given, the number of filenames in ``export_attrs.fn``
       must equal the number of URLs (a non-list ``fn`` counts as one filename).
    2. If ``export_attrs`` is given and ``export_attrs.header`` is truthy and not the
       string ``"first-row"``, its length must equal ``search_attrs.n_return_values``.
    3. For every extractor in ``search_attrs.attributes`` whose ``dynamic_parameters``
       is truthy, each instance attribute that is a ``list`` must have one entry per URL.

    :param urls: URLs to be processed.
    :param search_attrs: Search attributes; ``n_return_values`` and ``attributes`` are read.
    :param export_attrs: Optional export attributes; ``fn`` and ``header`` are read when given.
    :param crawling_attrs: Accepted for interface compatibility; not inspected here.
    :param kwargs: Accepted for interface compatibility; not inspected here.
    :raises ValueError: If any of the three checks above fails.
    """
    validate_urls(urls)

    no_urls = len(urls)

    if export_attrs is not None:
        # 1) Validate that n_filenames == n_URLs
        no_filenames = len(export_attrs.fn) if (type(export_attrs.fn) is list) else 1
        if no_urls != no_filenames:
            # Each count is reported next to the quantity it actually describes.
            raise ValueError(f"Number of filenames ({no_filenames}) provided is different than"
                             f" number of URLs to process ({no_urls}).")

        # 2) Validate that header has same amount of columns as generated from search attributes (if passed as a list)
        header = export_attrs.header
        if header and (header != "first-row"):  # skip None/False and the "first-row" marker
            if len(header) != search_attrs.n_return_values:
                raise ValueError(f"Length of the header ({len(header)}) for exporting data is not equal to the"
                                 f" number of columns generated by the search attributes ({search_attrs.n_return_values})."
                                 f"\n\tHeader: {header}.")

    # 3) If using dynamic parameters, passed lists of parameters should have the same length as len(urls)
    for extractor in search_attrs.attributes:
        if not extractor.dynamic_parameters:
            continue
        for param, value in extractor.__dict__.items():
            # Exact type check on purpose: only plain lists are treated as per-URL
            # parameters, tuples and other values are treated as constants.
            if type(value) is list and len(value) != no_urls:
                raise ValueError(f"You have passed a data extractor of class {extractor.__class__} using dynamic parameters."
                                 f" However, the number of parameters you passed for the attribute '{param}' does not equal the amount of URLs to be processed."
                                 f" You have to pass either a list of parameters of the same length as the number of URLs, or pass constants (not of type list).")
def validate_urls(urls: List[str]) -> None:
    """Run every URL through ``get_tld`` and warn if the list contains duplicates.

    ``get_tld`` is called with its default arguments and without any exception
    handling, so whatever it raises for a URL propagates to the caller.
    Duplicates do not raise; they only produce a warning on the root logger.

    :param urls: URLs to check.
    """
    for candidate in urls:
        get_tld(candidate)

    # A set keeps one copy of each URL, so the size difference is the duplicate count.
    n_duplicates = len(urls) - len(set(urls))
    if n_duplicates != 0:
        logging.warning(f"The list of urls to process contains {n_duplicates} duplicate(s).")