from typing import Iterable, Union, Callable
import logging
import time
from scrawler.attributes import SearchAttributes, ExportAttributes
from scrawler.defaults import DEFAULT_PAUSE_TIME
from scrawler.website import Website
from scrawler.data_extractors import LinkExtractor
from scrawler.utils.general_utils import ProgressBar
from scrawler.utils.web_utils import (extract_same_host_pattern, get_directory_depth, strip_unnecessary_url_parts,
fix_relative_urls, filter_urls, get_redirected_url, get_robot_file_parser)
from scrawler.utils.file_io_utils import export_to_csv
def crawl_domain(start_url: str,
                 search_attributes: SearchAttributes,
                 export_attrs: ExportAttributes = None,
                 user_agent: str = None,
                 pause_time: float = DEFAULT_PAUSE_TIME,
                 respect_robots_txt: bool = True,
                 max_no_urls: Union[int, float] = float("inf"),
                 max_distance_from_start_url: Union[int, float] = float("inf"),
                 max_subdirectory_depth: Union[int, float] = float("inf"),
                 filter_non_standard_schemes: bool = True,
                 filter_media_files: bool = True,
                 blocklist: Iterable = (),
                 filter_foreign_urls: Union[str, Callable] = "auto",
                 strip_url_parameters: bool = False,
                 strip_url_fragments: bool = True,
                 return_type: str = "data",
                 progress_bar: ProgressBar = None,
                 current_index: int = None,
                 **kwargs):
    """
    Collect data from all sites of a given domain. The sites within the domain are found automatically by iteratively
    searching for all links inside all pages.

    URLs are visited in breadth-first order (first found, first crawled), so the distance recorded for a URL is the
    smallest number of links that have to be followed from ``start_url`` to reach it.

    :param start_url: The first URL to be accessed. From here, links will be extracted and iteratively processed to find all linked sites.
    :param search_attributes: Dictionary specifying what to search for and how to search it.
    :param export_attrs: Optional. If specified, the collected data is exported to CSV once the crawl of this domain
        has finished, before this function returns.
    :param user_agent: Optionally specify a user agent for making the HTTP request.
    :param pause_time: Time to wait between the crawling of two URLs (in seconds).
    :param respect_robots_txt: Whether to respect the specifications made in the website's ``robots.txt`` file.
    :param max_no_urls: Maximum number of URLs to be crawled (safety limit for very large crawls).
    :param max_distance_from_start_url: Maximum number of links that have to be followed to arrive at a certain URL from the start_url.
    :param max_subdirectory_depth: Maximum sub-level of the host up to which to crawl. E.g., consider this schema: ``hostname/sub-directory1/sub-siteA``.
        If you would want to crawl all URLs of the same level as ``sub-directory1``, specify 1.
        ``sub-siteA`` will then not be found, but a site ``hostname/sub-directory2`` or ``hostname/sub-siteB`` will be.
    :param filter_non_standard_schemes: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param filter_media_files: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param blocklist: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param filter_foreign_urls: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param strip_url_parameters: See `strip_unnecessary_url_parts() <reference.html#scrawler.utils.web_utils.strip_unnecessary_url_parts>`__.
    :param strip_url_fragments: See `strip_unnecessary_url_parts() <reference.html#scrawler.utils.web_utils.strip_unnecessary_url_parts>`__.
    :param return_type: Specify which values to return ("all", "none", "data").
    :param progress_bar: If a ``ProgressBar`` object is passed, prints a progress bar on the command line.
    :param current_index: Internal index needed to allow dynamic parameters (parameters where a list of values has been
        passed and only the values relevant to the currently processed URL should be used; for example, export_attrs may
        contain a list of filenames, and only the relevant filename for the currently processed URL should be used).
        See `this explanation <custom_data_extractors.html#dynamic-parameters>`__ for details.
    :param kwargs: Accepted and ignored.
    :return: For ``return_type="data"``: list of the data collected from all URLs that were found using ``start_url``
        as starting point. For ``return_type="all"``: the tuple
        ``(data, to_crawl, processed, discarded, url_and_distance)``. For any other value: ``None``.
        ``None`` is also returned when ``get_redirected_url()`` yields ``None`` for ``start_url``.
    """
    # Function-scope import so that this block does not depend on a change to the module's import section
    from collections import deque

    # Fetch and update start URL (solves redirects)
    start_url = get_redirected_url(start_url, user_agent=user_agent)
    if start_url is None:
        return None

    # Optionally get foreign URL matching pattern
    if filter_foreign_urls == "auto":
        filter_foreign_urls = extract_same_host_pattern(start_url)

    # Robots.txt parsing (parser stays None when robots.txt is to be ignored)
    robots_txt_parser = get_robot_file_parser(start_url, user_agent=user_agent) if respect_robots_txt else None
    robots_user_agent = "*" if (user_agent is None) else user_agent

    # Crawl state. ``to_crawl`` (set) is kept for fast membership tests and for the "all" return value;
    # ``crawl_queue`` holds the same URLs in first-in-first-out order so that the crawl is breadth-first.
    to_crawl = {start_url}
    crawl_queue = deque([start_url])
    processed = set()
    discarded = set()
    data = []
    url_and_distance = {start_url: 0}  # for parameter max_distance_from_start_url

    def _discard(url: str) -> None:
        """Mark ``url`` as discarded and advance the progress bar by one step."""
        discarded.add(url)
        if progress_bar is not None:
            progress_bar.update(iterations=1)

    # Start logging progress bar on console
    if progress_bar is not None:
        progress_bar.update(iterations=0, total_length_update=1)

    while crawl_queue and (len(data) < max_no_urls):
        next_url = crawl_queue.popleft()
        to_crawl.discard(next_url)

        # Check if URL access is disallowed by robots.txt
        if (robots_txt_parser is not None) and not robots_txt_parser.can_fetch(robots_user_agent, next_url):
            logging.info(f"URL access disallowed for crawler by robots.txt: {next_url}")
            _discard(next_url)
            continue

        # Crawl only up to the subdirectory depth specified in the parameter
        current_directory_depth = get_directory_depth(next_url)
        if current_directory_depth > max_subdirectory_depth:
            logging.warning(f"Subdirectory depth too deep ({current_directory_depth}): {next_url}")
            _discard(next_url)
            continue

        # Crawl only up to a certain distance (links that had to be followed) from the start_url
        current_steps_from_start_page = url_and_distance[next_url]
        if current_steps_from_start_page > max_distance_from_start_url:
            logging.warning(f"Too many steps from start page ({current_steps_from_start_page}): {next_url}")
            _discard(next_url)
            continue

        # Get Website object for further processing
        try:
            website = Website(next_url, steps_from_start_page=current_steps_from_start_page).fetch(
                user_agent=user_agent, check_http_content_type=filter_media_files)
        except Exception as e:
            # Deliberately broad: a single failing page must not abort the whole crawl
            logging.error(f"{e.__class__.__module__}.{e.__class__.__name__} while processing {next_url}. Details: {e}")
            _discard(next_url)
            continue

        # Collect the data from the website
        url_data = search_attributes.extract_all_attrs_from_website(website, index=current_index)
        data.append(url_data)

        # Collect all available hyperlinks from the website and pre-process + filter them.
        # Pages that already sit at the maximum distance are not searched for further links.
        if current_steps_from_start_page < max_distance_from_start_url:
            found_urls = LinkExtractor().run(website)
        else:
            found_urls = []
        found_urls = strip_unnecessary_url_parts(found_urls, parameters=strip_url_parameters,
                                                 fragments=strip_url_fragments)
        # NOTE(review): relative links are resolved against start_url, not against the page they were
        # found on (next_url) - confirm against fix_relative_urls() that this is intended.
        found_urls = fix_relative_urls(urls=found_urls, base_url=start_url)
        found_urls, filtered = filter_urls(found_urls, base_url=start_url,
                                           filter_foreign_urls=filter_foreign_urls,
                                           filter_non_standard_schemes=filter_non_standard_schemes,
                                           filter_media_files=filter_media_files,
                                           blocklist=blocklist,
                                           return_discarded=True)
        discarded.update(filtered)

        # All newly found URLs to working list (to_crawl) except those processed or discarded already
        processed.add(next_url)
        urls_to_add = found_urls.difference(processed, discarded, to_crawl)
        to_crawl.update(urls_to_add)
        crawl_queue.extend(urls_to_add)  # order within one batch is unspecified; batches stay FIFO

        # Add URL depth (distance from start) to each newly found URL;
        # do not overwrite URL depths that are already included
        for url in found_urls:
            if url not in url_and_distance:
                url_and_distance[url] = current_steps_from_start_page + 1

        logging.debug(f"Processed {next_url}")

        # Update progress bar
        if progress_bar is not None:
            progress_bar.update(iterations=1, total_length_update=len(urls_to_add))

        # pause to avoid being flagged as spammer
        time.sleep(pause_time)

    # Optionally export the collected data
    if (export_attrs is not None) and (len(data) > 0):
        export_to_csv(data, current_index=current_index, **export_attrs.__dict__)

    if return_type == "all":  # TODO better return type definition?
        return data, to_crawl, processed, discarded, url_and_distance
    elif return_type == "data":
        return data
    else:
        return None
def scrape_site(url: str, search_attrs: SearchAttributes, export_attrs: ExportAttributes = None,
                user_agent: str = None, current_index: int = None, progress_bar: ProgressBar = None) -> list:
    """Scrape the data specified in search_attrs from one website.

    Errors raised while fetching the website or extracting the data are logged and result in an empty list
    instead of being propagated.

    :param url: URL to be scraped.
    :param search_attrs: Specify which data to collect/search for in the website.
    :param export_attrs: Specify how and where to export the collected data (as CSV).
    :param user_agent: Optionally specify a user agent for making the HTTP request.
    :param current_index: Internal index needed to allow dynamic parameters (parameters where a list of values has been
        passed and only the values relevant to the currently processed URL should be used; for example, export_attrs may
        contain a list of filenames, and only the relevant filename for the currently processed URL should be used).
        See `this explanation <custom_data_extractors.html#dynamic-parameters>`__ for details.
    :param progress_bar: If a ``ProgressBar`` object is passed, prints a progress bar on the command line.
    :return: List of data collected from the website. If ``export_attrs`` is given and data was collected,
        the data is exported to CSV instead and ``None`` is returned.
    """
    show_progress = progress_bar is not None

    # Register this URL as one additional item of work
    if show_progress:
        progress_bar.update(iterations=0, total_length_update=1)

    try:
        page = Website(url).fetch(user_agent=user_agent)
        collected = search_attrs.extract_all_attrs_from_website(page, index=current_index)
    except Exception as error:
        # Deliberately broad: a failing URL is logged and treated as "no data"
        error_cls = error.__class__
        logging.error(f"{error_cls.__module__}.{error_cls.__name__} while processing {url}. Details: {error}")
        collected = []

    if show_progress:
        progress_bar.update(iterations=1)

    # Without export settings, or without any data, hand the result back to the caller
    if (export_attrs is None) or (len(collected) == 0):
        return collected  # TODO useful return values

    # Otherwise export the files immediately; nothing is returned in this case
    export_to_csv(collected, current_index=current_index, **export_attrs.__dict__)
    return None