"""Data extractors for the scrawler library."""

from datetime import datetime
from typing import Union, Tuple, List, Callable
import functools
import dateutil.parser
from bs4 import BeautifulSoup, Tag, NavigableString
import readability
from scrawler.utils.general_utils import sanitize_text
from scrawler.website import Website
from scrawler.utils.web_utils import get_directory_depth
from scrawler.defaults import DEFAULT_EMPTY_FIELD_STRING
__all__ = ["GeneralHtmlTagExtractor", "GeneralHttpHeaderFieldExtractor", "AccessTimeExtractor", "CmsExtractor",
"ContactNameExtractor", "CustomStringPutter", "DateExtractor", "DescriptionExtractor",
"DirectoryDepthExtractor", "ExpiryDateExtractor", "HttpStatusCodeExtractor",
"KeywordsExtractor", "LanguageExtractor", "LastModifiedDateExtractor", "LinkExtractor",
"ServerProductExtractor", "StepsFromStartPageExtractor", "MobileOptimizedExtractor",
"TermOccurrenceExtractor", "TermOccurrenceCountExtractor", "TitleExtractor", "UrlExtractor",
"UrlBranchNameExtractor", "UrlCategoryExtractor", "WebsiteTextExtractor"]

# CONSTANTS: Default HTML attributes used to collect certain data points
_DEFAULT_CMS_TAG_TYPE = "meta"
_DEFAULT_CMS_ATTRS = {"name": ["generator", "Generator", "formatter", "Powered-By", "application-name"]}
_DEFAULT_CMS_KEYWORDS = {"WordPress": ["wp-content", "wp-includes", "wp-uploads"],
"TYPO3 CMS": ["typo3"],
"Wix.com": ["/wix-bolt/", "wixcode-worker.js", "wixstatic.com"],
"Shopify": ["cdn.shopify.com", "shopify.js", "/shopify/"]}
_DEFAULT_CONTACT_TAG_TYPES = ("div",)  # note the trailing comma: ("div") would be a plain string, not a tuple
_DEFAULT_CONTACT_TAG_ATTRS = {"class": "employee_name"}
_DEFAULT_DATE_TAG_TYPES = ("meta",)
_DEFAULT_DATE_TAG_ATTRS = {"name": "pubdate"}
_DEFAULT_DESCRIPTION_TAG_TYPE = "meta"
_DESCRIPTION_TAG_ATTRS_1 = {"name": ["description", "Description"]}
_DESCRIPTION_TAG_ATTRS_2 = {"property": ["description", "Description", "og:description"]}
_DEFAULT_KEYWORDS_TAG_TYPE = "meta"
_DEFAULT_KEYWORDS_TAG_ATTRS = {"name": ["keywords", "Keywords"]}
_DEFAULT_TEXT_TAG_TYPES = ("div",)
_DEFAULT_TEXT_TAG_ATTRS = {"class": ["content"]}
_DEFAULT_TEXT_ALLOWED_STRING_TYPES = [NavigableString]
_DEFAULT_IS_MOBILE_OPTIMIZED_TAG_TYPE = "meta"
_DEFAULT_IS_MOBILE_OPTIMIZED_TAG_ATTRS = {"name": "viewport"}


def supports_dynamic_parameters(func) -> Callable:
    """Function decorator that selects the correct parameter based on the index when using dynamic parameters."""
    @functools.wraps(func)
    def run(self, website: Website, index: int = None):
if index is not None and self.dynamic_parameters:
# First, initialize new object to prevent changes to original.
            # Note that this has a high performance impact (though less than copy() or deepcopy() would).
self_copy = self.__class__(**self.__dict__)
for param, value in self_copy.__dict__.items():
if type(value) is list: # update only lists
self_copy.__dict__[param] = value[index]
return func(self_copy, website, index)
else:
return func(self, website)
return run
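
# A minimal sketch of what the decorator does (hypothetical subclass, not part of
# the library; assumes ``site`` is a fetched ``Website`` instance):
#
#     class SuffixPutter(BaseExtractor):
#         def __init__(self, suffix: Union[str, list], **kwargs):
#             self.suffix = suffix
#             super().__init__(**kwargs)
#
#         @supports_dynamic_parameters
#         def run(self, website: Website, index: int = None) -> str:
#             return website.url + self.suffix
#
#     putter = SuffixPutter(suffix=["#a", "#b"], dynamic_parameters=True)
#     putter.run(site, index=1)  # runs on a copy whose list-valued ``suffix`` was replaced by "#b"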


class BaseExtractor:
def __init__(self, *args, dynamic_parameters: bool = False, n_return_values: int = None, **kwargs) -> None:
"""Provides the basic architecture for each data extractor.
Every data extractor has to inherit from :class:`.BaseExtractor`.
:param args: Positional arguments to be used by children inheriting from :class:`.BaseExtractor`.
:param dynamic_parameters: Set this to ``True`` when you would like to pass a :class:`list` to a certain parameter,
and have each URL/scraping target use a different value from that list based on an index.
See also `here <custom_data_extractors.html#dynamic-parameters>`__.
:param n_return_values: Specifies the number of values that will be returned by the extractor.
This is almost always 1, but there are cases such as :class:`.DateExtractor` which may return more values.
See also `here <custom_data_extractors.html#n-return-values>`__.
:param kwargs: Keyword arguments to be used by children inheriting from :class:`.BaseExtractor`.
"""
self.dynamic_parameters = dynamic_parameters
self.n_return_values = n_return_values if (n_return_values is not None) else 1

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None):
"""Runs the extraction and returns the extracted data.
:param website: :class:`.Website` object that data is extracted from.
:param index: Used for extractors that should behave differently for each domain/site if multiple are processed.
Usually, the extractor will be passed a list of values and use only the value relevant
to the currently processed domain/site (for example, :class:`.CustomStringPutter` may put
a different string for each domain). See also `here <custom_data_extractors.html#dynamic-parameters>`__.
"""
pass


class GeneralHtmlTagExtractor(BaseExtractor):
def __init__(self, tag_types: tuple, tag_attrs: dict, attr_to_extract: str,
fill_empty_field: bool = True, **kwargs):
"""General purpose extractor for extracting HTML tags and then extracting a single attribute from the tag.
        :param tag_types: Describes the tag types to find, e.g. ``div``.
        :param tag_attrs: Specifies the HTML attributes used to find the relevant HTML tag in a key-value dict format.
Example: ``{"class": ["content", "main-content"]}``.
See also `this explanation of HTML tag attributes <https://www.w3schools.com/htmL/html_attributes.asp>`__.
:param attr_to_extract: The attribute that should be extracted from the found HTML tag.
:param fill_empty_field: Used in cases where the specified attribute in the HTML tag exists but is empty.
If ``True``, returns the value specified in ``DEFAULT_EMPTY_FIELD_STRING``.
Otherwise, returns an empty string.
        :param kwargs: Keyword arguments passed on to :class:`.BaseExtractor`.
"""
self.tag_types = tag_types
self.tag_attrs = tag_attrs
self.attr_to_extract = attr_to_extract
self.fill_empty_field = fill_empty_field
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
html_tag = website.find(self.tag_types, attrs=self.tag_attrs)
try:
content = sanitize_text(html_tag.attrs[self.attr_to_extract])
except (AttributeError, KeyError):
content = DEFAULT_EMPTY_FIELD_STRING
        # For cases where the attribute exists but is empty ("")
        if self.fill_empty_field and content == "":
            content = DEFAULT_EMPTY_FIELD_STRING
return content
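
# Usage sketch (hypothetical tag/attribute values; assumes ``site`` is a fetched
# ``Website`` instance):
#
#     extractor = GeneralHtmlTagExtractor(tag_types=("meta",),
#                                         tag_attrs={"name": "author"},
#                                         attr_to_extract="content")
#     author = extractor.run(site)  # sanitized attribute value, or DEFAULT_EMPTY_FIELD_STRING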


class GeneralHttpHeaderFieldExtractor(BaseExtractor):
    def __init__(self, field_to_extract: str, fill_empty_field: bool = True, **kwargs):
        """General purpose extractor for extracting `HTTP header <https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers>`__ fields.

        :param field_to_extract: Name of the HTTP header field to extract, e.g. ``Server``.
        :param fill_empty_field: If ``True``, empty header values are replaced by the value
            specified in ``DEFAULT_EMPTY_FIELD_STRING``; otherwise, an empty string is returned.
        """
self.field_to_extract = field_to_extract
self.fill_empty_field = fill_empty_field
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
headers = website.http_response.headers
try:
content = headers[self.field_to_extract]
except KeyError:
content = DEFAULT_EMPTY_FIELD_STRING
        if self.fill_empty_field and content == "":
            content = DEFAULT_EMPTY_FIELD_STRING
return content
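
# Usage sketch (assumes ``site`` is a fetched ``Website`` instance):
#
#     content_type = GeneralHttpHeaderFieldExtractor(field_to_extract="Content-Type").run(site)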


class AccessTimeExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Returns the current time as time of access. To be exact, the time of processing."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> datetime:
return datetime.now()


class CmsExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Extract the Content Management System (CMS) used for building the website.
Note: This method uses the HTML generator meta tag and some hard-coded search terms.
Therefore, not all systems will be identified correctly."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
cms_tag = website.find(_DEFAULT_CMS_TAG_TYPE, attrs=_DEFAULT_CMS_ATTRS, content=True)
if cms_tag is not None:
return cms_tag["content"]
        else:  # detect some CMSs by keywords used in the HTML source code
            html_text_lower = website.html_text.lower()  # lowercase once, not once per keyword
            for cms, keywords in _DEFAULT_CMS_KEYWORDS.items():
                for word in keywords:
                    if word in html_text_lower:
                        return cms
return DEFAULT_EMPTY_FIELD_STRING


class ContactNameExtractor(BaseExtractor):
def __init__(self, tag_types: tuple = _DEFAULT_CONTACT_TAG_TYPES,
tag_attrs: dict = _DEFAULT_CONTACT_TAG_ATTRS,
separator: str = ";", **kwargs):
"""Find contact name(s) for a given website.
        :param tag_types: Specifies which kinds of tags to look at (e.g. ``div`` or ``span``).
        :param tag_attrs: Provide additional attributes in a dictionary, e.g. ``{"class": "contact"}``.
:param separator: When more than one contact is found, they are separated by the string given here.
"""
self.tag_types = tag_types
self.tag_attrs = tag_attrs
self.separator = separator
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
contact_tags = website.find_all(self.tag_types, attrs=self.tag_attrs)
contacts = {sanitize_text(tag.text) for tag in contact_tags}
if len(contacts) == 0:
return DEFAULT_EMPTY_FIELD_STRING
else:
            return self.separator.join(contacts)  # join the set to return everything in one string
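
# Usage sketch (hypothetical markup: contacts rendered as ``<span class="contact">``;
# assumes ``site`` is a fetched ``Website`` instance):
#
#     extractor = ContactNameExtractor(tag_types=("span",), tag_attrs={"class": "contact"}, separator="; ")
#     contacts = extractor.run(site)  # names joined by "; " (a set is used, so order is not guaranteed)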


class CustomStringPutter(BaseExtractor):
def __init__(self, string: Union[str, list], **kwargs):
"""Simply returns a given string or entry from a list of strings. Background: Sometimes, a column should be appended with a custom label for a given website (for example, an external ID).
:param string: The string to be returned by the :meth:`~scrawler.data_extractors.CustomStringPutter.run` method.
            You can optionally pass a list here and use a different value for each URL/domain that is scraped.
            In that case, remember to also pass ``dynamic_parameters=True``.
        :raises IndexError: May raise an ``IndexError`` if the parameter ``string`` is passed a list and ``dynamic_parameters=True``.
            This may occur when you pass a list of custom strings shorter than the list of URLs crawled.
"""
self.string = string
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
return self.string
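
# Usage sketch for attaching a per-domain label (hypothetical external IDs, one per
# crawled URL; assumes ``site`` is a fetched ``Website`` instance):
#
#     putter = CustomStringPutter(string=["ext-id-01", "ext-id-02"], dynamic_parameters=True)
#     putter.run(site, index=0)  # -> "ext-id-01"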


class DateExtractor(BaseExtractor):
def __init__(self, tag_types: tuple = _DEFAULT_DATE_TAG_TYPES,
tag_attrs: dict = _DEFAULT_DATE_TAG_ATTRS,
return_year_month_day: bool = False, **kwargs):
"""
        Get dates by looking at the specified tag. Can optionally parse dates into year, month and day.

        :param tag_types: Describes the tag types to find, e.g. ``meta``.
        :param tag_attrs: Specifies HTML attributes and their values in a key-value dict format.
            Example: ``{"name": "pubdate"}``.
        :param return_year_month_day: If ``True``, returns the date as 3 integers: year (``YYYY``), month (``MM``) and day (``dd``).
"""
super().__init__(**kwargs)
self.tag_types = tag_types
self.tag_attrs = tag_attrs
self.return_year_month_day = return_year_month_day
self.n_return_values = 3 if self.return_year_month_day else 1

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> Union[datetime, Tuple[int, int, int]]:
date_tag = website.find(self.tag_types, attrs=self.tag_attrs, content=True)
try:
date_string = date_tag.attrs["content"]
parsed_date = dateutil.parser.parse(date_string) # returns a datetime object
year, month, day = parsed_date.year, parsed_date.month, parsed_date.day
        except (AttributeError, ValueError, OverflowError):  # if date_tag is None or the date string can't be parsed
parsed_date = DEFAULT_EMPTY_FIELD_STRING
year, month, day = DEFAULT_EMPTY_FIELD_STRING, DEFAULT_EMPTY_FIELD_STRING, DEFAULT_EMPTY_FIELD_STRING
if self.return_year_month_day:
return year, month, day
else:
return parsed_date
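
# Usage sketch (assumes ``site`` is a fetched ``Website`` whose HTML contains a tag
# like ``<meta name="pubdate" content="2021-05-03">``):
#
#     DateExtractor().run(site)                            # -> datetime(2021, 5, 3, 0, 0)
#     DateExtractor(return_year_month_day=True).run(site)  # -> (2021, 5, 3); n_return_values is 3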


class DescriptionExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Get website description (the one shown in search engine results) using two common description fields."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
standard_desc_tag = website.find(_DEFAULT_DESCRIPTION_TAG_TYPE, attrs=_DESCRIPTION_TAG_ATTRS_1, content=True)
if standard_desc_tag is not None:
description = standard_desc_tag.attrs["content"]
if (standard_desc_tag is None) or (description == ""):
other_desc_tag = website.find(_DEFAULT_DESCRIPTION_TAG_TYPE, attrs=_DESCRIPTION_TAG_ATTRS_2, content=True)
if other_desc_tag is not None:
description = other_desc_tag.attrs["content"]
else:
description = DEFAULT_EMPTY_FIELD_STRING
return sanitize_text(description)


class DirectoryDepthExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Returns the directory level that a given document is in.
For example, ``https://www.sub.example.com/dir1/dir2/file.html`` returns 3."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> int:
return get_directory_depth(website.url)


class ExpiryDateExtractor(GeneralHttpHeaderFieldExtractor, DateExtractor, BaseExtractor):
def __init__(self, return_year_month_day: bool = False, **kwargs):
"""Get website ``expiry`` date from HTTP header or HTML Meta tag."""
GeneralHttpHeaderFieldExtractor.__init__(self, field_to_extract="Expires", **kwargs)
        DateExtractor.__init__(self, tag_types=("meta",), tag_attrs={"name": ["expires", "Expires", "EXPIRES"]},
return_year_month_day=return_year_month_day)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> Union[datetime, Tuple[int, int, int]]:
header_date_string = GeneralHttpHeaderFieldExtractor.run(self, website, index)
try:
result = dateutil.parser.parse(header_date_string)
if self.return_year_month_day:
result = result.year, result.month, result.day
except (ValueError, OverflowError): # if date string can't be parsed
result = DateExtractor.run(self, website, index) # DateExtractor already respects return_year_month_day
return result


class HtmlTextExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Get plain HTML text of website."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> str:
return website.html_text


class HttpStatusCodeExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Get status code of HTTP request."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> int:
try:
return website.http_response.status
        except AttributeError:  # when the website was fetched using fetch(), the response object is a requests.Response
return website.http_response.status_code


class KeywordsExtractor(GeneralHtmlTagExtractor, BaseExtractor):
def __init__(self, **kwargs):
"""Get keywords from HTML keyword meta tag (if present)."""
super().__init__(tag_types=_DEFAULT_KEYWORDS_TAG_TYPE, tag_attrs=_DEFAULT_KEYWORDS_TAG_ATTRS,
attr_to_extract="content", **kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
return super().run(website, index)


class LanguageExtractor(GeneralHtmlTagExtractor, BaseExtractor):
def __init__(self, **kwargs):
"""Get language of a given website from its HTML tag ``lang`` attribute."""
super().__init__(tag_types="html", tag_attrs={}, attr_to_extract="lang", **kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
return super().run(website, index).lower()


class LastModifiedDateExtractor(GeneralHttpHeaderFieldExtractor, DateExtractor, BaseExtractor):
def __init__(self, return_year_month_day: bool = False, **kwargs):
"""Get website ``last-modified`` date from HTTP header or HTML Meta tag."""
GeneralHttpHeaderFieldExtractor.__init__(self, field_to_extract="Last-Modified", **kwargs)
        DateExtractor.__init__(self, tag_types=("meta",), tag_attrs={"http-equiv": "last-modified"},
return_year_month_day=return_year_month_day)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> Union[datetime, Tuple[int, int, int]]:
header_date_string = GeneralHttpHeaderFieldExtractor.run(self, website, index)
try:
result = dateutil.parser.parse(header_date_string)
if self.return_year_month_day:
result = result.year, result.month, result.day
except (ValueError, OverflowError): # if date string can't be parsed
result = DateExtractor.run(self, website, index) # DateExtractor already respects return_year_month_day
return result


class LinkExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Find all links from a website (without duplicates)."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> set:
link_tags = website.find_all("a", href=True) # find all link tags <a> that have the attribute href
links = {tag["href"].strip() for tag in link_tags} # get the URL (hyper-reference, href)
return links


class ServerProductExtractor(GeneralHttpHeaderFieldExtractor, BaseExtractor):
def __init__(self, **kwargs):
"""Get website ``Server`` info from HTTP header."""
super().__init__(field_to_extract="Server", **kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
return super().run(website, index)


class StepsFromStartPageExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Returns the number of links that have to be followed from the start page to arrive at this website."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> int:
return website.steps_from_start_page


class MobileOptimizedExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Checks whether website is optimized for mobile usage by looking up HTML ``viewport`` meta tag."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> int:
viewport_tag = website.find(_DEFAULT_IS_MOBILE_OPTIMIZED_TAG_TYPE, attrs=_DEFAULT_IS_MOBILE_OPTIMIZED_TAG_ATTRS,
content=True)
return 0 if (viewport_tag is None) else 1


class TermOccurrenceExtractor(BaseExtractor):
def __init__(self, terms: Union[List[str], str], ignore_case: bool = False, **kwargs):
"""Checks if the given terms occur in the website's HTML text.
Returns 0 if no term occurs in the soup's text, 1 if at least one occurs.
:param terms: term or list of terms to search for.
:param ignore_case: Whether to respect the text's casing (upper-/lowercase).
"""
self.terms = [terms] if type(terms) is str else terms
self.ignore_case = ignore_case
super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> int:
        text = website.html_text
        terms = self.terms
        if self.ignore_case:
            text = text.lower()
            terms = [term.lower() for term in terms]  # lowercase the terms as well, otherwise they could never match
        for term in terms:
            if term in text:
                return 1
        return 0
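
# Usage sketch (assumes ``site`` is a fetched ``Website`` instance):
#
#     TermOccurrenceExtractor(terms=["imprint", "Impressum"]).run(site)       # exact casing required
#     TermOccurrenceExtractor(terms="impressum", ignore_case=True).run(site)  # case-insensitive match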


class TermOccurrenceCountExtractor(BaseExtractor):
def __init__(self, terms: Union[List[str], str], ignore_case: bool = False, **kwargs):
"""Count the number of times the given terms occur in the website's HTML text.
:param terms: term or list of terms to search for.
:param ignore_case: Whether to respect the text's casing (upper-/lowercase).
:returns: Total sum of all occurrences.
"""
self.terms = [terms] if type(terms) is str else terms
self.ignore_case = ignore_case
super().__init__(**kwargs)

    @supports_dynamic_parameters
    def run(self, website: Website, index: int = None) -> int:
        text = website.html_text
        terms = self.terms
        if self.ignore_case:
            text = text.lower()
            terms = [term.lower() for term in terms]  # lowercase the terms as well, otherwise they could never match
        return sum(text.count(term) for term in terms)


class TitleExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Get title of a website (the same that is shown in a browser in the tabs tray)."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
try:
return sanitize_text(website.title.string)
except AttributeError:
return DEFAULT_EMPTY_FIELD_STRING


class UrlExtractor(BaseExtractor):
def __init__(self, **kwargs):
"""Returns the website's URL."""
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
return website.url


class UrlBranchNameExtractor(BaseExtractor):
def __init__(self, branch_name_position: int = 1, **kwargs):
"""Extract sub-domain names from URLs like ``subdomain.example.com``, which often refer to an entity's sub-branches.
:param branch_name_position: Where in the URL to look for the name. If ``0``, the domain will be used.
Otherwise, indexes into all available sub-domains:
``1`` would retrieve the first sub-domain *from the right*, ``2`` the second, and so on.
"""
self.branch_name_position = branch_name_position
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
if self.branch_name_position == 0:
branch_name = website.parsed_url.domain
else:
try:
branch_name = website.parsed_url.subdomain.split(".")[-self.branch_name_position]
except IndexError:
branch_name = DEFAULT_EMPTY_FIELD_STRING
return branch_name
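
# Usage sketch: for a page hosted at ``https://berlin.branches.example.com/``
# (hypothetical URL; ``site`` is the fetched ``Website`` instance):
#
#     UrlBranchNameExtractor(branch_name_position=1).run(site)  # -> "branches" (first sub-domain from the right)
#     UrlBranchNameExtractor(branch_name_position=2).run(site)  # -> "berlin"
#     UrlBranchNameExtractor(branch_name_position=0).run(site)  # -> the domain, e.g. "example"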


class UrlCategoryExtractor(BaseExtractor):
def __init__(self, category_position: int = 2, **kwargs):
"""
Try to identify the category of a given URL as the directory specified by :attr:`category_position`.
        :param category_position: Specify at which position in the path the category can be found.
            Since the path starts with ``/``, position ``1`` refers to the first path segment.
"""
self.category_position = category_position
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
try:
return website.parsed_url.path.split("/")[self.category_position].split(".")[0]
except (AttributeError, IndexError):
return DEFAULT_EMPTY_FIELD_STRING
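
# Usage sketch: for ``https://example.com/news/politics/article.html`` (hypothetical URL),
# ``path.split("/")`` yields ``["", "news", "politics", "article.html"]``, assuming the
# parsed path keeps its leading slash:
#
#     UrlCategoryExtractor(category_position=1).run(site)  # -> "news"
#     UrlCategoryExtractor().run(site)                     # -> "politics" (default position 2)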


class WebsiteTextExtractor(BaseExtractor):
def __init__(self, mode: str = "auto",
min_length: int = 30,
tag_types: tuple = _DEFAULT_TEXT_TAG_TYPES,
tag_attrs: dict = _DEFAULT_TEXT_TAG_ATTRS,
allowed_string_types: List[NavigableString] = _DEFAULT_TEXT_ALLOWED_STRING_TYPES,
separator: str = "[SEP]", **kwargs):
"""Get readable website text, excluding ``<script>``, ``<style>``, ``<template>`` and other non-readable text.
Several modes are available to make sure to only capture relevant text.
:param mode: Default mode is ``auto``, which uses the ``readability`` algorithm to only extract a website's article text.
If ``all_strings``, all readable website text (excluding script, style and other tags as well as HTML comments) will be retrieved.
See also the `BeautifulSoup documentation <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text>`__ for the ``get_text()`` method.
If ``by_length``, the :attr:`min_length` parameter will be used to determine the minimum length of HTML strings to be included in the text.
If ``search_in_tags``, the tags dictionary will be used to identify the tags that include text.
:param min_length: If using mode ``by_length``, this is the minimum length of a string to be considered.
Shorter strings will be discarded.
        :param tag_types: Describes the tag types to find, e.g. ``div``.
:param tag_attrs: Specifies HTML attributes and their values in a key-value dict format.
Example: ``{"class": ["content", "main-content"]}``.
:param allowed_string_types: List of types that are considered to be readable. This makes sure that scripts and similar types are excluded.
Note that the types passed here have to inherit from :class:`bs4.NavigableString`.
:param separator: String to be used as separator when concatenating all found strings.
"""
self.mode = mode
self.min_length = min_length
self.tag_types = tag_types
self.tag_attrs = tag_attrs
self.allowed_string_types = allowed_string_types
self.separator = separator
super().__init__(**kwargs)

    @supports_dynamic_parameters
def run(self, website: Website, index: int = None) -> str:
def get_txt(obj: Union[Website, BeautifulSoup, Tag]) -> str:
return BeautifulSoup.get_text(obj, separator=self.separator, strip=True, types=self.allowed_string_types)
if self.mode == "auto":
try:
text = readability.Document(website.html_text).summary(html_partial=True)
except Exception:
text = DEFAULT_EMPTY_FIELD_STRING
elif self.mode == "all_strings":
text = get_txt(website)
elif self.mode == "by_length":
strings = website._all_strings(strip=True, types=self.allowed_string_types)
strings = filter(lambda s: len(s) >= self.min_length, strings)
text = self.separator.join(strings)
elif self.mode == "search_in_tags":
content_tags = website.find_all(self.tag_types, attrs=self.tag_attrs)
if len(content_tags) == 0: # None found
text = DEFAULT_EMPTY_FIELD_STRING
elif len(content_tags) == 1:
text = get_txt(content_tags[0])
else:
texts = [get_txt(tag) for tag in content_tags]
text = max(texts, key=len) # choose tag with most text in it
else:
raise ValueError(f'Incorrect text search mode specified: "{self.mode}"')
return sanitize_text(text)
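
# Usage sketch of the four modes (assumes ``site`` is a fetched ``Website`` instance):
#
#     WebsiteTextExtractor().run(site)                                 # "auto": readability article text
#     WebsiteTextExtractor(mode="all_strings").run(site)               # all readable strings
#     WebsiteTextExtractor(mode="by_length", min_length=50).run(site)  # only strings with >= 50 characters
#     WebsiteTextExtractor(mode="search_in_tags",
#                          tag_types=("article",), tag_attrs={}).run(site)  # text of the longest matching tag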