def crawl_domain(start_url: str, search_attributes: SearchAttributes, export_attrs: ExportAttributes = None,
                 user_agent: str = None, pause_time: float = DEFAULT_PAUSE_TIME, respect_robots_txt: bool = True,
                 max_no_urls: int = float("inf"), max_distance_from_start_url: int = float("inf"),
                 max_subdirectory_depth: int = float("inf"), filter_non_standard_schemes: bool = True,
                 filter_media_files: bool = True, blocklist: Iterable = (),
                 filter_foreign_urls: Union[str, Callable] = "auto", strip_url_parameters: bool = False,
                 strip_url_fragments: bool = True, return_type: str = "data", progress_bar: ProgressBar = None,
                 current_index: int = None, **kwargs):
    """Collect data from all sites of a given domain.

    The sites within the domain are found automatically by iteratively searching for all links inside all pages.

    :param start_url: The first URL to be accessed. From here, links will be extracted and iteratively
        processed to find all linked sites.
    :param search_attributes: ``SearchAttributes`` object specifying what to search for and how to search it.
    :param export_attrs: Optional. If specified, the crawled data is exported as soon as it's ready,
        not after the entire crawling has finished.
    :param user_agent: Optionally specify a user agent for making the HTTP request.
    :param pause_time: Time to wait between the crawling of two URLs (in seconds).
    :param respect_robots_txt: Whether to respect the specifications made in the website's ``robots.txt`` file.
    :param max_no_urls: Maximum number of URLs to be crawled (safety limit for very large crawls).
    :param max_distance_from_start_url: Maximum number of links that have to be followed to arrive at a
        certain URL from the ``start_url``.
    :param max_subdirectory_depth: Maximum sub-level of the host up to which to crawl. E.g., consider this
        schema: ``hostname/sub-directory1/sub-siteA``. If you want to crawl all URLs on the same level as
        ``sub-directory1``, specify 1. ``sub-siteA`` will then not be found, but a site
        ``hostname/sub-directory2`` or ``hostname/sub-siteB`` will be.
    :param filter_non_standard_schemes: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param filter_media_files: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param blocklist: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param filter_foreign_urls: See `filter_urls() <reference.html#scrawler.utils.web_utils.filter_urls>`__.
    :param strip_url_parameters: See `strip_unnecessary_url_parts() <reference.html#scrawler.utils.web_utils.strip_unnecessary_url_parts>`__.
    :param strip_url_fragments: See `strip_unnecessary_url_parts() <reference.html#scrawler.utils.web_utils.strip_unnecessary_url_parts>`__.
    :param return_type: Specify which values to return ("all", "none", "data").
    :param progress_bar: If a ``ProgressBar`` object is passed, prints a progress bar on the command line.
    :param current_index: Internal index needed to allow dynamic parameters (parameters where a list of values
        has been passed and only the values relevant to the currently processed URL should be used; for example,
        ``export_attrs`` may contain a list of filenames, and only the relevant filename for the currently
        processed URL should be used). See `this explanation <custom_data_extractors.html#dynamic-parameters>`__
        for details.
    :return: List of the data collected from all URLs that were found using ``start_url`` as starting point.
"""# Fetch and update start URL (solves redirects)start_url=get_redirected_url(start_url,user_agent=user_agent)ifstart_urlisNone:returnNone# Optionally get foreign URL matching patternfilter_foreign_urls=extract_same_host_pattern(start_url)if(filter_foreign_urls=="auto")elsefilter_foreign_urls# Robots.txt parsingifrespect_robots_txt:robots_txt_parser=get_robot_file_parser(start_url,user_agent=user_agent)# Initiate some objectsto_crawl={start_url}processed=set()discarded=set()data=[]url_and_distance={start_url:0}# for parameter max_url_depth# Start logging progress bar on consoleifprogress_barisnotNone:progress_bar.update(iterations=0,total_length_update=1)while(len(to_crawl)>0)and(len(data)<max_no_urls):next_url=to_crawl.pop()# Check if URL access is disallowed by robots.txtifrespect_robots_txtand(robots_txt_parserisnotNone):ua="*"if(user_agentisNone)elseuser_agentifnotrobots_txt_parser.can_fetch(ua,next_url):logging.info(f"URL access disallowed for crawler by robots.txt: {next_url}")discarded.add(next_url)ifprogress_barisnotNone:progress_bar.update(iterations=1)continue# Crawl only up to the subdirectory depth specified in the parametercurrent_directory_depth=get_directory_depth(next_url)ifcurrent_directory_depth>max_subdirectory_depth:logging.warning(f"Subdirectory depth too deep ({current_directory_depth}): {next_url}")discarded.add(next_url)ifprogress_barisnotNone:progress_bar.update(iterations=1)continue# Crawl only up to a certain distance (links that had to be followed) from the start_urlcurrent_steps_from_start_page=url_and_distance[next_url]ifcurrent_steps_from_start_page>max_distance_from_start_url:logging.warning(f"Too many steps from start page ({current_steps_from_start_page}): {next_url}")discarded.add(next_url)ifprogress_barisnotNone:progress_bar.update(iterations=1)continue# Get Website object for further processingtry:website=Website(next_url,steps_from_start_page=current_steps_from_start_page).fetch(user_agent=user_agent,check_http_content_type=filter_media_files)exceptExceptionase:logging.error(f"{e.__class__.__module__}.{e.__class__.__name__} while processing {next_url}. 
Details: {e}")discarded.add(next_url)ifprogress_barisnotNone:progress_bar.update(iterations=1)continue# Collect the data from the websiteurl_data=search_attributes.extract_all_attrs_from_website(website,index=current_index)data.append(url_data)# Collect all available hyperlinks from the website and pre-process + filter themfound_urls=LinkExtractor().run(website)if(current_steps_from_start_page<max_distance_from_start_url)else[]found_urls=strip_unnecessary_url_parts(found_urls,parameters=strip_url_parameters,fragments=strip_url_fragments)found_urls=fix_relative_urls(urls=found_urls,base_url=start_url)found_urls,filtered=filter_urls(found_urls,base_url=start_url,filter_foreign_urls=filter_foreign_urls,filter_non_standard_schemes=filter_non_standard_schemes,filter_media_files=filter_media_files,blocklist=blocklist,return_discarded=True)discarded=discarded.union(filtered)# All newly found URLs to working list (to_crawl) except those processed or discarded alreadyprocessed.add(next_url)urls_to_add=found_urls.difference(processed,discarded,to_crawl)to_crawl.update(urls_to_add)# Add URL depth (distance from start) to each newly found URLforurlinfound_urls:ifurlinurl_and_distance:# do not overwrite URL depths that are already includedcontinueelse:url_and_distance[url]=current_steps_from_start_page+1logging.debug(f"Processed {next_url}")# Update progress barifprogress_barisnotNone:progress_bar.update(iterations=1,total_length_update=len(urls_to_add))# pause to avoid being flagged as spammertime.sleep(pause_time)# Optionally export files immediatelyif(export_attrsisnotNone)and(len(data)>0):export_to_csv(data,current_index=current_index,**export_attrs.__dict__)ifreturn_type=="all":# TODO better return type definition?returndata,to_crawl,processed,discarded,url_and_distanceelifreturn_type=="data":returndataelse:returnNone


def scrape_site(url: str, search_attrs: SearchAttributes, export_attrs: ExportAttributes = None,
                user_agent: str = None, current_index: int = None, progress_bar: ProgressBar = None) -> list:
    """Scrape the data specified in search_attrs from one website.

    :param url: URL to be scraped.
    :param search_attrs: Specify which data to collect/search for in the website.
    :param export_attrs: Specify how and where to export the collected data (as CSV).
    :param user_agent: Optionally specify a user agent for making the HTTP request.
    :param current_index: Internal index needed to allow dynamic parameters (parameters where a list of values
        has been passed and only the values relevant to the currently processed URL should be used; for example,
        ``export_attrs`` may contain a list of filenames, and only the relevant filename for the currently
        processed URL should be used). See `this explanation <custom_data_extractors.html#dynamic-parameters>`__
        for details.
    :param progress_bar: If a ``ProgressBar`` object is passed, prints a progress bar on the command line.
    :return: List of data collected from the website.
    """
    if progress_bar is not None:
        progress_bar.update(iterations=0, total_length_update=1)

    try:
        website = Website(url).fetch(user_agent=user_agent)
        website_data = search_attrs.extract_all_attrs_from_website(website, index=current_index)
    except Exception as e:
        logging.error(f"{e.__class__.__module__}.{e.__class__.__name__} while processing {url}. Details: {e}")
        website_data = []

    if progress_bar is not None:
        progress_bar.update(iterations=1)

    # Optionally export files immediately
    if (export_attrs is not None) and (len(website_data) > 0):
        export_to_csv(website_data, current_index=current_index, **export_attrs.__dict__)

    return website_data  # TODO useful return values
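

# A minimal usage sketch for scrape_site() (illustrative only, kept as a comment so it is not
# executed on import); as above, the SearchAttributes constructor arguments are placeholders.
#
#     search = SearchAttributes(...)   # what to extract from the single page
#     row = scrape_site("https://example.com/about", search, user_agent="my-bot/0.1")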