# Source code for scrawler.data_extractors

from datetime import datetime
from typing import Union, Tuple, List, Callable
import functools

import dateutil.parser
from bs4 import BeautifulSoup, Tag, NavigableString
import readability

from scrawler.utils.general_utils import sanitize_text
from scrawler.website import Website
from scrawler.utils.web_utils import get_directory_depth
from scrawler.defaults import DEFAULT_EMPTY_FIELD_STRING

# Public API of this module. NOTE(review): HtmlTextExtractor is defined below but not
# exported here — presumably internal; verify before relying on wildcard imports.
__all__ = ["GeneralHtmlTagExtractor", "GeneralHttpHeaderFieldExtractor", "AccessTimeExtractor", "CmsExtractor",
           "ContactNameExtractor", "CustomStringPutter", "DateExtractor", "DescriptionExtractor",
           "DirectoryDepthExtractor", "ExpiryDateExtractor", "HttpStatusCodeExtractor",
           "KeywordsExtractor", "LanguageExtractor", "LastModifiedDateExtractor", "LinkExtractor",
           "ServerProductExtractor", "StepsFromStartPageExtractor", "MobileOptimizedExtractor",
           "TermOccurrenceExtractor", "TermOccurrenceCountExtractor", "TitleExtractor", "UrlExtractor",
           "UrlBranchNameExtractor", "UrlCategoryExtractor", "WebsiteTextExtractor"]

# CONSTANTS: Default HTML attributes to collect certain data points
_DEFAULT_CMS_TAG_TYPE = "meta"
_DEFAULT_CMS_ATTRS = {"name": ["generator", "Generator", "formatter", "Powered-By", "application-name"]}
_DEFAULT_CMS_KEYWORDS = {"WordPress": ["wp-content", "wp-includes", "wp-uploads"],
                         "TYPO3 CMS": ["typo3"],
                         "Wix.com": ["/wix-bolt/", "wixcode-worker.js", "wixstatic.com"],
                         "Shopify": ["cdn.shopify.com", "shopify.js", "/shopify/"]}

# Fixed: ("div") is just the string "div" — a one-element tuple needs a trailing comma.
# BeautifulSoup accepts both a single tag name and an iterable of names, so behavior is preserved,
# but the value now matches the plural *_TAG_TYPES naming.
_DEFAULT_CONTACT_TAG_TYPES = ("div",)
_DEFAULT_CONTACT_TAG_ATTRS = {"class": "employee_name"}

_DEFAULT_DATE_TAG_TYPES = ("meta",)  # fixed: was ("meta"), i.e. a plain string
_DEFAULT_DATE_TAG_ATTRS = {"name": "pubdate"}

_DEFAULT_DESCRIPTION_TAG_TYPE = "meta"
_DESCRIPTION_TAG_ATTRS_1 = {"name": ["description", "Description"]}
_DESCRIPTION_TAG_ATTRS_2 = {"property": ["description", "Description", "og:description"]}

_DEFAULT_KEYWORDS_TAG_TYPE = "meta"
_DEFAULT_KEYWORDS_TAG_ATTRS = {"name": ["keywords", "Keywords"]}

_DEFAULT_TEXT_TAG_TYPES = ("div",)  # fixed: was ("div"), i.e. a plain string
_DEFAULT_TEXT_TAG_ATTRS = {"class": ["content"]}
_DEFAULT_TEXT_ALLOWED_STRING_TYPES = [NavigableString]

_DEFAULT_IS_MOBILE_OPTIMIZED_TAG_TYPE = "meta"
_DEFAULT_IS_MOBILE_OPTIMIZED_TAG_ATTRS = {"name": "viewport"}


def supports_dynamic_parameters(func) -> Callable:
    """Function decorator to select correct parameter based on index when using dynamic parameters.

    If the wrapped extractor has ``dynamic_parameters=True`` and an ``index`` is supplied,
    every attribute holding a list is replaced by its ``index``-th element on a *copy* of
    the extractor before running, so the original object is never mutated.
    """
    @functools.wraps(func)
    def run(self, website: Website, index: int = None) -> Callable:
        if index is not None and self.dynamic_parameters:
            # First, initialize new object to prevent changes to original.
            # Note that this has a high performance impact (though better than when using copy() or deepcopy())
            # NOTE(review): re-instantiation via __class__(**__dict__) assumes every attribute
            # name is an accepted __init__ keyword of the concrete subclass — verify for
            # subclasses that hard-code constructor arguments.
            self_copy = self.__class__(**self.__dict__)
            for param, value in self_copy.__dict__.items():
                if type(value) is list:  # update only lists
                    self_copy.__dict__[param] = value[index]
            return func(self_copy, website, index)
        else:
            # No dynamic parameters in play: run unchanged (index is intentionally dropped).
            return func(self, website)
    return run
class BaseExtractor:
    def __init__(self, *args, dynamic_parameters: bool = False, n_return_values: int = None, **kwargs) -> None:
        """Basic architecture shared by all data extractors; every extractor inherits from this class.

        :param args: Positional arguments consumed by subclasses.
        :param dynamic_parameters: Set to ``True`` to pass a :class:`list` to a parameter and
            have each URL/scraping target pick its own value from that list by index.
            See also `here <custom_data_extractors.html#dynamic-parameters>`__.
        :param n_return_values: Number of values the extractor returns. Almost always 1, but
            e.g. :class:`.DateExtractor` may return more.
            See also `here <custom_data_extractors.html#n-return-values>`__.
        :param kwargs: Keyword arguments consumed by subclasses.
        """
        self.dynamic_parameters = dynamic_parameters
        self.n_return_values = 1 if n_return_values is None else n_return_values

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None):
        """Run the extraction and return the extracted data. Subclasses override this.

        :param website: :class:`.Website` object that data is extracted from.
        :param index: Used when multiple domains/sites are processed and the extractor should
            behave differently for each (see dynamic parameters).
        """
        pass
class GeneralHtmlTagExtractor(BaseExtractor):
    def __init__(self, tag_types: tuple, tag_attrs: dict, attr_to_extract: str,
                 fill_empty_field: bool = True, **kwargs):
        """General-purpose extractor: find an HTML tag and pull a single attribute from it.

        :param tag_types: Tag types to search for, e.g. ``div``.
        :param tag_attrs: HTML attributes identifying the relevant tag, as a key-value dict,
            e.g. ``{"class": ["content", "main-content"]}``. See also
            `HTML tag attributes <https://www.w3schools.com/htmL/html_attributes.asp>`__.
        :param attr_to_extract: Attribute to extract from the found tag.
        :param fill_empty_field: If the attribute exists but is empty: when ``True``, return
            the value of ``DEFAULT_EMPTY_FIELD_STRING``; otherwise, return an empty string.
        :param kwargs:
        """
        self.tag_types = tag_types
        self.tag_attrs = tag_attrs
        self.attr_to_extract = attr_to_extract
        self.fill_empty_field = fill_empty_field
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        tag = website.find(self.tag_types, attrs=self.tag_attrs)
        try:
            value = sanitize_text(tag.attrs[self.attr_to_extract])
        except (AttributeError, KeyError):  # tag not found, or attribute absent on the tag
            value = DEFAULT_EMPTY_FIELD_STRING
        # Attribute exists but is empty ("")
        if self.fill_empty_field and value == "":
            value = DEFAULT_EMPTY_FIELD_STRING
        return value
class GeneralHttpHeaderFieldExtractor(BaseExtractor):
    def __init__(self, field_to_extract: str, fill_empty_field: bool = True, **kwargs):
        """General-purpose extractor for `HTTP header <https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers>`__ fields.

        :param field_to_extract: Name of the header field to extract.
        :param fill_empty_field: When the field exists but is empty, return
            ``DEFAULT_EMPTY_FIELD_STRING`` if ``True``, else the empty string.
        """
        self.field_to_extract = field_to_extract
        self.fill_empty_field = fill_empty_field
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        headers = website.http_response.headers
        try:
            value = headers[self.field_to_extract]
        except KeyError:  # header field not present in the response
            value = DEFAULT_EMPTY_FIELD_STRING
        if self.fill_empty_field and value == "":
            value = DEFAULT_EMPTY_FIELD_STRING
        return value
class AccessTimeExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Return the current time as the time of access (strictly: the time of processing)."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> datetime:
        """Return ``datetime.now()`` — naive local time at the moment this extractor runs."""
        return datetime.now()
class CmsExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Extract the Content Management System (CMS) used for building the website.

        Note: This method uses the HTML generator meta tag and some hard-coded search terms.
        Therefore, not all systems will be identified correctly.
        """
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        """Return the CMS name, or ``DEFAULT_EMPTY_FIELD_STRING`` if none could be identified."""
        cms_tag = website.find(_DEFAULT_CMS_TAG_TYPE, attrs=_DEFAULT_CMS_ATTRS, content=True)
        if cms_tag is not None:
            return cms_tag["content"]

        # Detect some CM systems by keywords used in the HTML source code.
        # Perf fix: lowercase the (potentially large) HTML once, instead of once per keyword.
        html_lower = website.html_text.lower()
        for cms, keywords in _DEFAULT_CMS_KEYWORDS.items():
            if any(word in html_lower for word in keywords):
                return cms
        return DEFAULT_EMPTY_FIELD_STRING
class ContactNameExtractor(BaseExtractor):
    def __init__(self, tag_types: tuple = _DEFAULT_CONTACT_TAG_TYPES, tag_attrs: dict = _DEFAULT_CONTACT_TAG_ATTRS,
                 separator: str = ";", **kwargs):
        """Find contact name(s) for a given website.

        :param tag_types: Which kind of tags to look at (e. g., ``div`` or ``span``).
        :param tag_attrs: Additional attributes in a dictionary, e. g. ``{"class": "contact"}``.
        :param separator: When more than one contact is found, they are separated by this string.
        """
        self.tag_types = tag_types
        self.tag_attrs = tag_attrs
        self.separator = separator
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        contact_tags = website.find_all(self.tag_types, attrs=self.tag_attrs)
        contacts = {sanitize_text(tag.text) for tag in contact_tags}  # set: deduplicate names
        if not contacts:
            return DEFAULT_EMPTY_FIELD_STRING
        # Fix: sets have arbitrary iteration order, which made the joined output
        # non-deterministic across runs. Sort for a reproducible result.
        return self.separator.join(sorted(contacts))
class CustomStringPutter(BaseExtractor):
    def __init__(self, string: Union[str, list], **kwargs):
        """Simply returns a given string, or an entry from a list of strings.

        Background: Sometimes, a column should be appended with a custom label for a given
        website (for example, an external ID).

        :param string: The string returned by the :meth:`~scrawler.data_extractors.CustomStringPutter.run`
            method. Optionally pass a list here to use a different value for each URL/domain
            being scraped; in that case, remember to also pass ``dynamic_parameters=True``.
        :raises IndexError: May be raised when ``string`` is a list and ``dynamic_parameters=True``
            but the list of custom strings is shorter than the list of URLs crawled.
        """
        self.string = string
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        """Return the stored string (or, with dynamic parameters, the entry for the current index)."""
        return self.string
class DateExtractor(BaseExtractor):
    def __init__(self, tag_types: tuple = _DEFAULT_DATE_TAG_TYPES, tag_attrs: dict = _DEFAULT_DATE_TAG_ATTRS,
                 return_year_month_day: bool = False, **kwargs):
        """Get dates by looking at the given tag; can optionally split dates into year, month and day.

        :param tag_types: Tag types to search for, e.g. ``meta``.
        :param tag_attrs: HTML attributes and their values as a key-value dict,
            e.g. ``{"name": "pubdate"}``.
        :param return_year_month_day: If True, return the date as 3 integers:
            year (``YYYY``), month (``MM``) and day (``dd``).
        """
        super().__init__(**kwargs)
        self.tag_types = tag_types
        self.tag_attrs = tag_attrs
        self.return_year_month_day = return_year_month_day
        # Splitting into (year, month, day) yields three output columns instead of one.
        self.n_return_values = 3 if self.return_year_month_day else 1

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> Union[datetime, Tuple[int, int, int]]:
        date_tag = website.find(self.tag_types, attrs=self.tag_attrs, content=True)
        try:
            parsed = dateutil.parser.parse(date_tag.attrs["content"])  # -> datetime
            ymd = (parsed.year, parsed.month, parsed.day)
        except (AttributeError, ValueError, OverflowError):
            # Tag missing (AttributeError) or the date string could not be parsed.
            parsed = DEFAULT_EMPTY_FIELD_STRING
            ymd = (DEFAULT_EMPTY_FIELD_STRING, DEFAULT_EMPTY_FIELD_STRING, DEFAULT_EMPTY_FIELD_STRING)
        return ymd if self.return_year_month_day else parsed
class DescriptionExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Get the website description (shown in search engine results) from two common description fields."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        description = None
        primary = website.find(_DEFAULT_DESCRIPTION_TAG_TYPE, attrs=_DESCRIPTION_TAG_ATTRS_1, content=True)
        if primary is not None:
            description = primary.attrs["content"]
        if not description:  # tag missing or content empty -> try the alternative attribute set
            secondary = website.find(_DEFAULT_DESCRIPTION_TAG_TYPE, attrs=_DESCRIPTION_TAG_ATTRS_2, content=True)
            description = secondary.attrs["content"] if secondary is not None else DEFAULT_EMPTY_FIELD_STRING
        return sanitize_text(description)
class DirectoryDepthExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Return the directory level of a document.

        For example, ``https://www.sub.example.com/dir1/dir2/file.html`` returns 3.
        """
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> int:
        """Delegate to :func:`get_directory_depth` on the website's URL."""
        return get_directory_depth(website.url)
class ExpiryDateExtractor(GeneralHttpHeaderFieldExtractor, DateExtractor, BaseExtractor):
    def __init__(self, return_year_month_day: bool = False, **kwargs):
        """Get the website expiry date from the HTTP ``Expires`` header or the HTML meta tag."""
        GeneralHttpHeaderFieldExtractor.__init__(self, field_to_extract="Expires", **kwargs)
        DateExtractor.__init__(self, tag_types="meta", tag_attrs={"name": ["expires", "Expires", "EXPIRES"]},
                               return_year_month_day=return_year_month_day)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> Union[datetime, Tuple[int, int, int]]:
        header_value = GeneralHttpHeaderFieldExtractor.run(self, website, index)
        try:
            parsed = dateutil.parser.parse(header_value)
        except (ValueError, OverflowError):
            # Header absent or unparseable -> fall back to the meta tag.
            # DateExtractor.run already honours return_year_month_day.
            return DateExtractor.run(self, website, index)
        if self.return_year_month_day:
            return parsed.year, parsed.month, parsed.day
        return parsed
class HtmlTextExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Get plain HTML text of website."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        """Return the raw HTML source of the website.

        Fix: the return annotation previously said ``int``, but ``website.html_text``
        is the HTML string.
        """
        return website.html_text
class HttpStatusCodeExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Get the status code of the HTTP request."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> int:
        response = website.http_response
        try:
            return response.status
        except AttributeError:
            # When the website is fetched using fetch(), the response object is a
            # requests.Response, which exposes the code as .status_code instead.
            return response.status_code
class KeywordsExtractor(GeneralHtmlTagExtractor, BaseExtractor):
    def __init__(self, **kwargs):
        """Get keywords from the HTML keywords meta tag (if present)."""
        super().__init__(tag_types=_DEFAULT_KEYWORDS_TAG_TYPE, tag_attrs=_DEFAULT_KEYWORDS_TAG_ATTRS,
                         attr_to_extract="content", **kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        """Return the keywords meta tag content via the generic tag extractor."""
        return super().run(website, index)
class LanguageExtractor(GeneralHtmlTagExtractor, BaseExtractor):
    def __init__(self, **kwargs):
        """Get the language of a website from the ``lang`` attribute of its ``<html>`` tag."""
        super().__init__(tag_types="html", tag_attrs={}, attr_to_extract="lang", **kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        """Return the language code, normalized to lowercase."""
        lang = super().run(website, index)
        return lang.lower()
class LastModifiedDateExtractor(GeneralHttpHeaderFieldExtractor, DateExtractor, BaseExtractor):
    def __init__(self, return_year_month_day: bool = False, **kwargs):
        """Get the website ``last-modified`` date from the HTTP header or the HTML meta tag."""
        GeneralHttpHeaderFieldExtractor.__init__(self, field_to_extract="Last-Modified", **kwargs)
        DateExtractor.__init__(self, tag_types="meta", tag_attrs={"http-equiv": "last-modified"},
                               return_year_month_day=return_year_month_day)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> Union[datetime, Tuple[int, int, int]]:
        header_value = GeneralHttpHeaderFieldExtractor.run(self, website, index)
        try:
            parsed = dateutil.parser.parse(header_value)
        except (ValueError, OverflowError):
            # Header absent or unparseable -> fall back to the meta tag.
            # DateExtractor.run already honours return_year_month_day.
            return DateExtractor.run(self, website, index)
        if self.return_year_month_day:
            return parsed.year, parsed.month, parsed.day
        return parsed
class LinkExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Find all links of a website, without duplicates."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> set:
        """Return the set of stripped ``href`` values of all ``<a>`` tags that carry one."""
        anchors = website.find_all("a", href=True)
        return {anchor["href"].strip() for anchor in anchors}
class ServerProductExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Get the ``Server`` field from the website's HTTP header."""
        super().__init__(field_to_extract="Server", **kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        """Return the ``Server`` header value via the generic header extractor."""
        return super().run(website, index)
class StepsFromStartPageExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Return how many links had to be followed from the start page to reach this website."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> int:
        """Return the step count recorded on the :class:`.Website` object."""
        return website.steps_from_start_page
class MobileOptimizedExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Check whether the website is optimized for mobile usage via the HTML ``viewport`` meta tag."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> int:
        """Return 1 if a viewport meta tag with content is present, else 0."""
        viewport_tag = website.find(_DEFAULT_IS_MOBILE_OPTIMIZED_TAG_TYPE,
                                    attrs=_DEFAULT_IS_MOBILE_OPTIMIZED_TAG_ATTRS, content=True)
        return int(viewport_tag is not None)
class TermOccurrenceExtractor(BaseExtractor):
    def __init__(self, terms: Union[List[str], str], ignore_case: bool = False, **kwargs):
        """Check whether the given terms occur in the website's HTML text.

        Returns 0 if no term occurs in the soup's text, 1 if at least one occurs.

        :param terms: term or list of terms to search for.
        :param ignore_case: Whether to ignore the text's casing (upper-/lowercase).
        """
        self.terms = [terms] if type(terms) is str else terms
        self.ignore_case = ignore_case
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> int:
        text = website.html_text
        terms = self.terms
        if self.ignore_case:
            # Bug fix: the terms must be lowercased alongside the text — previously a term
            # containing an uppercase letter could never match the lowercased text.
            text = text.lower()
            terms = [term.lower() for term in terms]
        return 1 if any(term in text for term in terms) else 0
class TermOccurrenceCountExtractor(BaseExtractor):
    def __init__(self, terms: Union[List[str], str], ignore_case: bool = False, **kwargs):
        """Count how often the given terms occur in the website's HTML text.

        :param terms: term or list of terms to search for.
        :param ignore_case: Whether to ignore the text's casing (upper-/lowercase).
        :returns: Total sum of all occurrences.
        """
        self.terms = [terms] if type(terms) is str else terms
        self.ignore_case = ignore_case
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> int:
        text = website.html_text
        terms = self.terms
        if self.ignore_case:
            # Bug fix: the terms must be lowercased alongside the text — previously a term
            # containing an uppercase letter was never counted in the lowercased text.
            text = text.lower()
            terms = [term.lower() for term in terms]
        return sum(text.count(term) for term in terms)
class TitleExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Get the title of a website (the one shown in the browser's tab tray)."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        try:
            return sanitize_text(website.title.string)
        except AttributeError:  # no <title> tag, or it has no string content
            return DEFAULT_EMPTY_FIELD_STRING
class UrlExtractor(BaseExtractor):
    def __init__(self, **kwargs):
        """Return the website's URL."""
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        """Return the URL recorded on the :class:`.Website` object."""
        return website.url
class UrlBranchNameExtractor(BaseExtractor):
    def __init__(self, branch_name_position: int = 1, **kwargs):
        """Extract sub-domain names from URLs like ``subdomain.example.com``, which often refer
        to an entity's sub-branches.

        :param branch_name_position: Where in the URL to look for the name. If ``0``, the
            domain is used. Otherwise indexes into the sub-domains: ``1`` retrieves the first
            sub-domain *from the right*, ``2`` the second, and so on.
        """
        self.branch_name_position = branch_name_position
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        if self.branch_name_position == 0:
            return website.parsed_url.domain
        # NOTE(review): for an empty subdomain, "".split(".") is [""] and position 1 yields
        # "" rather than the empty-field marker — confirm whether that is intended.
        subdomain_parts = website.parsed_url.subdomain.split(".")
        try:
            return subdomain_parts[-self.branch_name_position]
        except IndexError:  # fewer sub-domain levels than requested
            return DEFAULT_EMPTY_FIELD_STRING
class UrlCategoryExtractor(BaseExtractor):
    def __init__(self, category_position: int = 2, **kwargs):
        """Try to identify the category of a URL as the path directory at :attr:`category_position`.

        :param category_position: Position in the URL path where the category can be found.
        """
        self.category_position = category_position
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        try:
            segment = website.parsed_url.path.split("/")[self.category_position]
            return segment.split(".")[0]  # drop a file extension, if any
        except (AttributeError, IndexError):  # no path, or path shorter than requested
            return DEFAULT_EMPTY_FIELD_STRING
class WebsiteTextExtractor(BaseExtractor):
    def __init__(self, mode: str = "auto", min_length: int = 30,
                 tag_types: tuple = _DEFAULT_TEXT_TAG_TYPES, tag_attrs: dict = _DEFAULT_TEXT_TAG_ATTRS,
                 allowed_string_types: List[NavigableString] = _DEFAULT_TEXT_ALLOWED_STRING_TYPES,
                 separator: str = "[SEP]", **kwargs):
        """Get readable website text, excluding ``<script>``, ``<style>``, ``<template>`` and
        other non-readable text. Several modes are available to capture only relevant text.

        :param mode: ``auto`` (default) uses the ``readability`` algorithm to extract only the
            article text. ``all_strings`` retrieves all readable text (excluding script, style
            and similar tags as well as HTML comments); see the `BeautifulSoup documentation
            <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text>`__ for
            ``get_text()``. ``by_length`` keeps only HTML strings of at least
            :attr:`min_length` characters. ``search_in_tags`` uses the tag parameters below to
            locate the text-bearing tags.
        :param min_length: Minimum string length for mode ``by_length``; shorter strings are discarded.
        :param tag_types: Tag types to search for, e.g. ``div``.
        :param tag_attrs: HTML attributes and values as a key-value dict,
            e.g. ``{"class": ["content", "main-content"]}``.
        :param allowed_string_types: Types considered readable (must inherit from
            :class:`bs4.NavigableString`); excludes scripts and similar types.
        :param separator: Separator string used when concatenating the found strings.
        """
        self.mode = mode
        self.min_length = min_length
        self.tag_types = tag_types
        self.tag_attrs = tag_attrs
        self.allowed_string_types = allowed_string_types
        self.separator = separator
        super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
        def extract_readable(obj: Union[Website, BeautifulSoup, Tag]) -> str:
            # Unbound call so Website, BeautifulSoup and Tag objects are all accepted.
            return BeautifulSoup.get_text(obj, separator=self.separator, strip=True,
                                          types=self.allowed_string_types)

        if self.mode == "auto":
            try:
                text = readability.Document(website.html_text).summary(html_partial=True)
            except Exception:  # readability is best-effort; fall back to the empty marker
                text = DEFAULT_EMPTY_FIELD_STRING
        elif self.mode == "all_strings":
            text = extract_readable(website)
        elif self.mode == "by_length":
            candidates = website._all_strings(strip=True, types=self.allowed_string_types)
            text = self.separator.join(s for s in candidates if len(s) >= self.min_length)
        elif self.mode == "search_in_tags":
            matching_tags = website.find_all(self.tag_types, attrs=self.tag_attrs)
            if len(matching_tags) == 0:  # none found
                text = DEFAULT_EMPTY_FIELD_STRING
            elif len(matching_tags) == 1:
                text = extract_readable(matching_tags[0])
            else:
                # Several candidates: keep the tag containing the most text.
                text = max((extract_readable(tag) for tag in matching_tags), key=len)
        else:
            raise ValueError(f'Incorrect text search mode specified: "{self.mode}"')
        return sanitize_text(text)