"""Specifies the attribute objects used by crawlers and scrapers."""
from typing import Tuple, Union, Callable
from inspect import signature
import os
import pandas as pd
from scrawler.defaults import (DEFAULT_CSV_ENCODING, DEFAULT_CSV_SEPARATOR, DEFAULT_CSV_QUOTING, DEFAULT_CSV_ESCAPECHAR,
DEFAULT_PAUSE_TIME)
from scrawler.website import Website
from scrawler.data_extractors import BaseExtractor
from scrawler.utils.web_utils import is_same_host, extract_same_host_pattern
class SearchAttributes:
    def __init__(self, *args: BaseExtractor, validate: bool = True):
        """Specify which data to collect/search for in the website.

        :param args: Data extractors specifying which data to extract in websites
            (see `built-in data extractors <built_in_data_extractors.html>`__ for possibilities
            or `define a custom data extractor <custom_data_extractors.html>`__).
        :param validate: Whether to make sure that input parameters are valid.
        :raises TypeError: If ``validate`` is true and one of ``args`` is not an instance of ``BaseExtractor``.
        """
        if validate:
            for extractor in args:
                if not isinstance(extractor, BaseExtractor):
                    raise TypeError(f"{extractor.__class__} does not inherit from BaseExtractor.")

        # ``*args`` is a tuple of arbitrary length, hence the ``...`` in the annotation.
        self.attributes: Tuple[BaseExtractor, ...] = args
        # Sum of the ``n_return_values`` attributes of all passed extractors (0 if none were passed).
        self.n_return_values: int = sum(extractor.n_return_values for extractor in self.attributes)
class ExportAttributes:
    def __init__(self, directory: str, fn: Union[str, list],
                 header: Union[list, str, bool] = None, encoding: str = DEFAULT_CSV_ENCODING,
                 separator: str = DEFAULT_CSV_SEPARATOR, quoting: int = DEFAULT_CSV_QUOTING,
                 escapechar: str = DEFAULT_CSV_ESCAPECHAR, validate: bool = True, **kwargs):
        """Specify how and where to export the collected data.

        :param directory: Folder where file(s) will be saved to.
        :param fn: Name(s) of the file(s) containing the crawled data. *Without* file extension.
        :param header: Have the final CSV file have a header. Possible parameters:
            If ``None`` or ``False``, no header will be written.
            If ``first-row`` or ``True``, uses first row of data as header.
            Else, pass list of strings of appropriate length.
        :param encoding: Encoding to use to create the CSV file.
        :param separator: Column separator or delimiter to use for creating the CSV file.
        :param quoting: Puts quotes around cells that contain the separator character.
        :param escapechar: Escapes the separator character.
        :param validate: Whether to make sure that input parameters are valid.
        :param kwargs: Any parameter supported by :meth:`pandas:pandas.DataFrame.to_csv` can be passed.
            Each keyword argument is stored as an attribute of the same name on this object.
        :raises NotADirectoryError: If ``validate`` is true and ``directory`` is not an existing directory.
        :raises ValueError: If ``validate`` is true and a keyword argument is not a parameter
            of :meth:`pandas:pandas.DataFrame.to_csv`.
        """
        if validate:
            # Check that directory exists
            if not os.path.isdir(directory):
                raise NotADirectoryError(f"Export directory does not exist on this system ({directory}).")

            # Check that keyword arguments are allowed for pandas.DataFrame.to_csv().
            # The signature is inspected once instead of once per keyword argument.
            allowed_kwargs = signature(pd.DataFrame.to_csv).parameters
            for key in kwargs:
                if key not in allowed_kwargs:
                    raise ValueError(f'Invalid keyword argument passed to ExportAttributes: "{key}"')

        self.directory = directory
        self.fn = fn    # Filename(s)
        self.header = header
        self.encoding = encoding
        self.separator = separator
        self.quoting = quoting
        self.escapechar = escapechar

        # Add keyword arguments as attributes
        for key, value in kwargs.items():
            setattr(self, key, value)
class CrawlingAttributes:
    def __init__(self,
                 filter_non_standard_schemes: bool = True,
                 filter_media_files: bool = True,
                 blocklist: tuple = (),
                 filter_foreign_urls: Union[str, Callable] = "auto",
                 strip_url_parameters: bool = False,
                 strip_url_fragments: bool = True,
                 max_no_urls: int = None,
                 max_distance_from_start_url: int = None,
                 max_subdirectory_depth: int = None,
                 pause_time: float = DEFAULT_PAUSE_TIME,
                 respect_robots_txt: bool = True,
                 validate: bool = True
                 ):
        """Specify how to conduct the crawling, including filtering irrelevant URLs or limiting the number of crawled URLs.

        :param filter_non_standard_schemes: Filter URLs starting with schemes other than ``http:`` or ``https:`` (e.g., ``mailto:`` or ``javascript:``).
        :param filter_media_files: Whether to filter media files. Recommended: ``True`` to avoid long runtimes caused by large file downloads.
            Stored on the object as the attribute ``filter_media``.
        :param blocklist: Filter URLs that contain one or more of the parts specified here. Annotated as a ``tuple``; empty by default.
        :param filter_foreign_urls: Filter URLs that do not belong to the same host (foreign URLs).
            Can either be a string that is passed to :func:`.is_same_host`, or a custom ``Callable`` that has to include two arguments, ``url1`` and ``url2``.
            In :func:`.is_same_host`, the following string values are permitted:

            1. ``auto``: Automatically extracts a matching pattern from the start URL (see :func:`.extract_same_host_pattern` for details).
            2. Any one of the attributes of the :class:`.ParsedUrl` class (e.g. ``domain``, ``hostname``, ``fld``).
            3. ``subdomainX`` with ``X`` representing an integer number up to which subdomain the URLs should be compared. E.g., comparing ``http://www.sub.example.com`` and ``http://blog.sub.example.com``, ``sub`` is the first level, while the second levels are ``www`` and ``blog``, respectively.
            4. ``directoryX`` with ``X`` representing an integer number up to which directory the URLs should be compared. E.g., for ``http://example.com/dir1/dir2/index.html``, ``directory2`` would include all files in ``dir2``.
        :param strip_url_parameters: Whether to strip URL query parameters (prefixed by ``?``) from the URL.
        :param strip_url_fragments: Whether to strip URL fragments (prefixed by ``#``) from the URL.
        :param max_no_urls: Maximum number of URLs to be crawled per domain (safety limit for very large crawls). Set to ``None`` if you want all URLs to be crawled.
        :param max_distance_from_start_url: Maximum number of links that have to be followed to arrive at a certain URL from the start URL.
        :param max_subdirectory_depth: Maximum sub-level of the host up to which to crawl. E.g., consider this schema: ``hostname/sub-directory1/sub-siteA``.
            If you would want to crawl all URLs of the same level as ``sub-directory1``, specify 1.
            ``sub-siteA`` will then not be found, but a site ``hostname/sub-directory2`` or ``hostname/sub-siteB`` will be.
        :param pause_time: Time to wait between the crawling of two URLs (in seconds).
        :param respect_robots_txt: Whether to respect the specifications made in the website's ``robots.txt`` file.
        :param validate: Whether to make sure that ``filter_foreign_urls`` is valid by testing it on an example URL.
        :raises ValueError: If ``validate`` is true and the ``filter_foreign_urls`` check fails.
        """
        if validate:
            # Check that a valid input is passed to parameter filter_foreign_urls by comparing a URL with itself.
            test_url = "https://www.example.com"
            try:
                # Explicit raises instead of ``assert``: assertions are removed when Python runs with ``-O``,
                # which would silently disable this validation.
                if callable(filter_foreign_urls):
                    if not filter_foreign_urls(test_url, test_url):
                        # Not every callable has ``__name__`` (e.g. ``functools.partial`` objects).
                        filter_name = getattr(filter_foreign_urls, "__name__", repr(filter_foreign_urls))
                        raise ValueError(f"Error when testing your custom foreign URL filter function ({filter_name}): Should be True if the same URL is used for both input arguments.")
                else:
                    test_mode = extract_same_host_pattern(test_url) if (filter_foreign_urls == "auto") else filter_foreign_urls
                    if not is_same_host(test_url, test_url, mode=test_mode):
                        raise ValueError("is_same_host() should be True if the same URL is used.")
            except (ValueError, TypeError, AssertionError) as e:
                # AssertionError is still caught in case the helpers or a custom callable raise one themselves.
                raise ValueError(f"Parameter filter_foreign_url is not correctly specified: {filter_foreign_urls}. The following error occurred during validation: {e}") from e

        self.filter_non_standard_schemes = filter_non_standard_schemes
        self.filter_media = filter_media_files
        self.blocklist = blocklist
        self.filter_foreign_urls = filter_foreign_urls
        self.strip_url_parameters = strip_url_parameters
        self.strip_url_fragments = strip_url_fragments

        # ``None`` means "no limit" and is stored as positive infinity.
        self.max_no_urls = max_no_urls if (max_no_urls is not None) else float("inf")
        self.max_distance_from_start_url = max_distance_from_start_url if (max_distance_from_start_url is not None) else float("inf")
        self.max_subdirectory_depth = max_subdirectory_depth if (max_subdirectory_depth is not None) else float("inf")

        self.pause_time = pause_time
        self.respect_robots_txt = respect_robots_txt