Source code for scrawler.backends.multithreading_backend

from typing import Iterable, Union, Callable
import logging
import time

from scrawler.attributes import SearchAttributes, ExportAttributes
from scrawler.defaults import DEFAULT_PAUSE_TIME
from scrawler.website import Website
from scrawler.data_extractors import LinkExtractor
from scrawler.utils.general_utils import ProgressBar
from scrawler.utils.web_utils import (extract_same_host_pattern, get_directory_depth, strip_unnecessary_url_parts,
                                      fix_relative_urls, filter_urls, get_redirected_url, get_robot_file_parser)
from scrawler.utils.file_io_utils import export_to_csv


def crawl_domain(start_url: str, search_attributes: SearchAttributes, export_attrs: ExportAttributes = None,
                 user_agent: str = None, pause_time: float = DEFAULT_PAUSE_TIME, respect_robots_txt: bool = True,
                 max_no_urls: int = float("inf"), max_distance_from_start_url: int = float("inf"),
                 max_subdirectory_depth: int = float("inf"), filter_non_standard_schemes: bool = True,
                 filter_media_files: bool = True, blocklist: Iterable = (),
                 filter_foreign_urls: Union[str, Callable] = "auto", strip_url_parameters: bool = False,
                 strip_url_fragments: bool = True, return_type: str = "data", progress_bar: ProgressBar = None,
                 current_index: int = None, **kwargs):
    """
    Collect data from all sites of a given domain.

    The sites within the domain are found automatically by iteratively searching for all links inside all pages.

    :param start_url: The first URL to be accessed. From here, links will be extracted and iteratively processed
        to find all linked sites.
    :param search_attributes: ``SearchAttributes`` object specifying what data to search for and how to search for it.
    :param export_attrs: Optional. If specified, the crawled data is exported as soon as it's ready,
        not after the entire crawling has finished.
    :param user_agent: Optionally specify a user agent for making the HTTP request.
    :param pause_time: Time to wait between the crawling of two URLs (in seconds).
    :param respect_robots_txt: Whether to respect the specifications made in the website's ``robots.txt`` file.
    :param max_no_urls: Maximum number of URLs to be crawled (safety limit for very large crawls).
    :param max_distance_from_start_url: Maximum number of links that have to be followed to arrive at a certain URL
        from the ``start_url``.
    :param max_subdirectory_depth: Maximum sub-level of the host up to which to crawl. E.g., consider this schema:
        ``hostname/sub-directory1/sub-siteA``. If you want to crawl all URLs on the same level as ``sub-directory1``,
        specify 1. ``sub-siteA`` will then not be found, but a site ``hostname/sub-directory2`` or
        ``hostname/sub-siteB`` will be.
    :param filter_non_standard_schemes: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param filter_media_files: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param blocklist: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param filter_foreign_urls: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param strip_url_parameters: See `strip_unnecessary_url_parts() <reference.html#scrawler.utils.web_utils.strip_unnecessary_url_parts>`__.
    :param strip_url_fragments: See `strip_unnecessary_url_parts() <reference.html#scrawler.utils.web_utils.strip_unnecessary_url_parts>`__.
    :param return_type: Specify which values to return ("all", "none", "data").
    :param progress_bar: If a ``ProgressBar`` object is passed, prints a progress bar on the command line.
    :param current_index: Internal index needed to allow dynamic parameters (parameters where a list of values has
        been passed and only the values relevant to the currently processed URL should be used; for example,
        ``export_attrs`` may contain a list of filenames, and only the relevant filename for the currently processed
        URL should be used). See `this explanation <custom_data_extractors.html#dynamic-parameters>`__ for details.
    :return: List of the data collected from all URLs that were found using ``start_url`` as starting point.
    """
    # Fetch and update start URL (solves redirects)
    start_url = get_redirected_url(start_url, user_agent=user_agent)
    if start_url is None:
        return None

    # Optionally get foreign URL matching pattern
    filter_foreign_urls = extract_same_host_pattern(start_url) if (filter_foreign_urls == "auto") else filter_foreign_urls

    # Robots.txt parsing
    if respect_robots_txt:
        robots_txt_parser = get_robot_file_parser(start_url, user_agent=user_agent)

    # Initiate some objects
    to_crawl = {start_url}
    processed = set()
    discarded = set()
    data = []
    url_and_distance = {start_url: 0}  # for parameter max_distance_from_start_url

    # Start logging progress bar on console
    if progress_bar is not None:
        progress_bar.update(iterations=0, total_length_update=1)

    while (len(to_crawl) > 0) and (len(data) < max_no_urls):
        next_url = to_crawl.pop()

        # Check if URL access is disallowed by robots.txt
        if respect_robots_txt and (robots_txt_parser is not None):
            ua = "*" if (user_agent is None) else user_agent
            if not robots_txt_parser.can_fetch(ua, next_url):
                logging.info(f"URL access disallowed for crawler by robots.txt: {next_url}")
                discarded.add(next_url)
                if progress_bar is not None:
                    progress_bar.update(iterations=1)
                continue

        # Crawl only up to the subdirectory depth specified in the parameter
        current_directory_depth = get_directory_depth(next_url)
        if current_directory_depth > max_subdirectory_depth:
            logging.warning(f"Subdirectory depth too deep ({current_directory_depth}): {next_url}")
            discarded.add(next_url)
            if progress_bar is not None:
                progress_bar.update(iterations=1)
            continue

        # Crawl only up to a certain distance (links that had to be followed) from the start_url
        current_steps_from_start_page = url_and_distance[next_url]
        if current_steps_from_start_page > max_distance_from_start_url:
            logging.warning(f"Too many steps from start page ({current_steps_from_start_page}): {next_url}")
            discarded.add(next_url)
            if progress_bar is not None:
                progress_bar.update(iterations=1)
            continue

        # Get Website object for further processing
        try:
            website = Website(next_url, steps_from_start_page=current_steps_from_start_page).fetch(
                user_agent=user_agent, check_http_content_type=filter_media_files)
        except Exception as e:
            logging.error(f"{e.__class__.__module__}.{e.__class__.__name__} while processing {next_url}. Details: {e}")
            discarded.add(next_url)
            if progress_bar is not None:
                progress_bar.update(iterations=1)
            continue

        # Collect the data from the website
        url_data = search_attributes.extract_all_attrs_from_website(website, index=current_index)
        data.append(url_data)

        # Collect all available hyperlinks from the website and pre-process + filter them
        found_urls = LinkExtractor().run(website) if (current_steps_from_start_page < max_distance_from_start_url) else []
        found_urls = strip_unnecessary_url_parts(found_urls, parameters=strip_url_parameters, fragments=strip_url_fragments)
        found_urls = fix_relative_urls(urls=found_urls, base_url=start_url)
        found_urls, filtered = filter_urls(found_urls, base_url=start_url, filter_foreign_urls=filter_foreign_urls,
                                           filter_non_standard_schemes=filter_non_standard_schemes,
                                           filter_media_files=filter_media_files, blocklist=blocklist,
                                           return_discarded=True)
        discarded = discarded.union(filtered)

        # Add all newly found URLs to the working list (to_crawl) except those processed or discarded already
        processed.add(next_url)
        urls_to_add = found_urls.difference(processed, discarded, to_crawl)
        to_crawl.update(urls_to_add)

        # Add URL depth (distance from start) to each newly found URL
        for url in found_urls:
            if url in url_and_distance:  # do not overwrite URL depths that are already included
                continue
            else:
                url_and_distance[url] = current_steps_from_start_page + 1

        logging.debug(f"Processed {next_url}")

        # Update progress bar
        if progress_bar is not None:
            progress_bar.update(iterations=1, total_length_update=len(urls_to_add))

        # Pause to avoid being flagged as a spammer
        time.sleep(pause_time)

    # Optionally export files immediately
    if (export_attrs is not None) and (len(data) > 0):
        export_to_csv(data, current_index=current_index, **export_attrs.__dict__)

    if return_type == "all":  # TODO better return type definition?
        return data, to_crawl, processed, discarded, url_and_distance
    elif return_type == "data":
        return data
    else:
        return None


def scrape_site(url: str, search_attrs: SearchAttributes, export_attrs: ExportAttributes = None,
                user_agent: str = None, current_index: int = None, progress_bar: ProgressBar = None) -> list:
    """Scrape the data specified in ``search_attrs`` from one website.

    :param url: URL to be scraped.
    :param search_attrs: Specify which data to collect/search for in the website.
    :param export_attrs: Specify how and where to export the collected data (as CSV).
    :param user_agent: Optionally specify a user agent for making the HTTP request.
    :param current_index: Internal index needed to allow dynamic parameters (parameters where a list of values has
        been passed and only the values relevant to the currently processed URL should be used; for example,
        ``export_attrs`` may contain a list of filenames, and only the relevant filename for the currently processed
        URL should be used). See `this explanation <custom_data_extractors.html#dynamic-parameters>`__ for details.
    :param progress_bar: If a ``ProgressBar`` object is passed, prints a progress bar on the command line.
    :return: List of data collected from the website.
    """
    if progress_bar is not None:
        progress_bar.update(iterations=0, total_length_update=1)

    try:
        website = Website(url).fetch(user_agent=user_agent)
        website_data = search_attrs.extract_all_attrs_from_website(website, index=current_index)
    except Exception as e:
        logging.error(f"{e.__class__.__module__}.{e.__class__.__name__} while processing {url}. Details: {e}")
        website_data = []

    if progress_bar is not None:
        progress_bar.update(iterations=1)

    # Optionally export files immediately
    if (export_attrs is not None) and (len(website_data) > 0):
        export_to_csv(website_data, current_index=current_index, **export_attrs.__dict__)
    else:
        return website_data  # TODO useful return values
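
For comparison, a sketch of scraping a single page with ``scrape_site()``, under the same assumption about the ``SearchAttributes`` constructor. Note that when ``export_attrs`` is given, the current implementation writes a CSV file and returns ``None``; omit it to receive the extracted data as a list:

    from scrawler.attributes import SearchAttributes
    from scrawler.backends.multithreading_backend import scrape_site
    from scrawler.data_extractors import LinkExtractor

    # Assumption: SearchAttributes takes the extractors to run on the page.
    search = SearchAttributes(LinkExtractor())

    # Without export_attrs, scrape_site() returns the extracted data
    # (an empty list if fetching or extraction fails).
    page_data = scrape_site("https://example.com", search_attrs=search)
    print(page_data)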