"""Specifies the attribute objects used by crawlers and scrapers."""fromtypingimportTuple,Union,Callablefrominspectimportsignatureimportosimportpandasaspdfromscrawler.defaultsimport(DEFAULT_CSV_ENCODING,DEFAULT_CSV_SEPARATOR,DEFAULT_CSV_QUOTING,DEFAULT_CSV_ESCAPECHAR,DEFAULT_PAUSE_TIME)fromscrawler.websiteimportWebsitefromscrawler.data_extractorsimportBaseExtractorfromscrawler.utils.web_utilsimportis_same_host,extract_same_host_pattern
class SearchAttributes:
    """Specifies which data points to collect from each website during crawling/scraping."""

    def __init__(self, *args: BaseExtractor, validate: bool = True):
        """Specify which data to collect/search for in the website.

        :param args: Data extractors specifying which data to extract in websites
            (see `built-in data extractors <built_in_data_extractors.html>`__ for possibilities
            or `define a custom data extractor <custom_data_extractors.html>`__).
        :param validate: Whether to make sure that input parameters are valid.
        :raises TypeError: If ``validate`` is True and any argument does not inherit
            from ``BaseExtractor``.
        """
        if validate:
            for extractor in args:
                if not isinstance(extractor, BaseExtractor):
                    raise TypeError(f"{extractor.__class__} does not inherit from BaseExtractor.")

        # Variadic tuple annotation: any number of extractors may be passed.
        self.attributes: Tuple[BaseExtractor, ...] = args
        # Total number of data points produced per website; a single extractor may
        # contribute several values.
        self.n_return_values: int = sum(extractor.n_return_values for extractor in self.attributes)

    def extract_all_attrs_from_website(self, website: Website, index: Optional[int] = None) -> list:
        """Extract data from a website using data extractors specified in ``SearchAttributes`` definition.

        :param website: Website object to collect the specified data points from.
        :param index: Optionally pass an index for data extractors that index into passed
            parameters. See `this explanation <custom_data_extractors.html#dynamic-parameters>`__
            for details.
        :return: Flat list of extracted values; multi-value extractors are expanded in place,
            so the list has ``self.n_return_values`` entries.
        """
        extracted_data = []
        for extractor in self.attributes:
            # Case handling for extractors using an index (dynamic parameters)
            if (index is not None) and extractor.dynamic_parameters:
                result = extractor.run(website, index)
            else:
                result = extractor.run(website)

            # Case handling for extractors with multiple return values: flatten them
            # into the result list instead of nesting.
            if extractor.n_return_values != 1:
                extracted_data.extend(result)
            else:
                extracted_data.append(result)
        return extracted_data
class ExportAttributes:
    """Specifies how and where the collected data is exported (CSV output settings)."""

    def __init__(self, directory: str, fn: Union[str, list],
                 header: Union[list, str, bool] = None,
                 encoding: str = DEFAULT_CSV_ENCODING,
                 separator: str = DEFAULT_CSV_SEPARATOR,
                 quoting: int = DEFAULT_CSV_QUOTING,
                 escapechar: str = DEFAULT_CSV_ESCAPECHAR,
                 validate: bool = True,
                 **kwargs):
        """Specify how and where to export the collected data.

        :param directory: Folder where file(s) will be saved to.
        :param fn: Name(s) of the file(s) containing the crawled data. *Without* file extension.
        :param header: Have the final CSV file have a header. Possible parameters:
            If ``None`` or ``False``, no header will be written.
            If ``first-row`` or ``True``, uses first row of data as header.
            Else, pass list of strings of appropriate length.
        :param encoding: Encoding to use to create the CSV file.
        :param separator: Column separator or delimiter to use for creating the CSV file.
        :param quoting: Puts quotes around cells that contain the separator character.
        :param escapechar: Escapes the separator character.
        :param validate: Whether to make sure that input parameters are valid.
        :param kwargs: Any parameter supported by :meth:`pandas:pandas.DataFrame.to_csv` can be passed.
        :raises NotADirectoryError: If ``validate`` is True and ``directory`` does not exist.
        :raises ValueError: If ``validate`` is True and a keyword argument is not accepted
            by ``pandas.DataFrame.to_csv``.
        """
        if validate:
            # Check that directory exists
            if not os.path.isdir(directory):
                raise NotADirectoryError(f"Export directory does not exist on this system ({directory}).")

            # Check that keyword arguments are allowed for pandas.DataFrame.to_csv().
            # The signature lookup is hoisted out of the loop, and only the keys are
            # needed for the membership test.
            allowed_params = signature(pd.DataFrame.to_csv).parameters
            for key in kwargs:
                if key not in allowed_params:
                    raise ValueError(f'Invalid keyword argument passed to ExportAttributes: "{key}"')

        self.directory = directory
        self.fn = fn  # Filename(s)
        self.header = header
        self.encoding = encoding
        self.separator = separator
        self.quoting = quoting
        self.escapechar = escapechar

        # Add remaining keyword arguments as attributes so they can later be
        # forwarded to pandas.DataFrame.to_csv().
        for key, value in kwargs.items():
            setattr(self, key, value)
class CrawlingAttributes:
    """Specifies how to conduct the crawling, including URL filtering and crawl limits."""

    def __init__(self,
                 filter_non_standard_schemes: bool = True,
                 filter_media_files: bool = True,
                 blocklist: tuple = (),
                 filter_foreign_urls: Union[str, Callable] = "auto",
                 strip_url_parameters: bool = False,
                 strip_url_fragments: bool = True,
                 max_no_urls: Optional[int] = None,
                 max_distance_from_start_url: Optional[int] = None,
                 max_subdirectory_depth: Optional[int] = None,
                 pause_time: float = DEFAULT_PAUSE_TIME,
                 respect_robots_txt: bool = True,
                 validate: bool = True):
        """Specify how to conduct the crawling, including filtering irrelevant URLs
        or limiting the number of crawled URLs.

        :param filter_non_standard_schemes: Filter URLs starting with schemes other than
            ``http:`` or ``https:`` (e.g., ``mailto:`` or ``javascript:``).
        :param filter_media_files: Whether to filter media files. Recommended: ``True`` to avoid
            long runtimes caused by large file downloads.
        :param blocklist: Filter URLs that contain one or more of the parts specified here.
        :param filter_foreign_urls: Filter URLs that do not belong to the same host (foreign URLs).
            Can either be a string that is passed to :func:`.is_same_host`, or a custom
            ``Callable`` that has to include two arguments, ``url1`` and ``url2``.
            In :func:`.is_same_host`, the following string values are permitted:

            1. ``auto``: Automatically extracts a matching pattern from the start URL
               (see :func:`.extract_same_host_pattern` for details).
            2. Any one of the attributes of the :class:`.ParsedUrl` class
               (e.g. ``domain``, ``hostname``, ``fld``).
            3. ``subdomainX`` with ``X`` representing an integer number up to which subdomain
               the URLs should be compared.
            4. ``directoryX`` with ``X`` representing an integer number up to which directory
               the URLs should be compared.
        :param strip_url_parameters: Whether to strip URL query parameters (prefixed by ``?``)
            from the URL.
        :param strip_url_fragments: Whether to strip URL fragments (prefixed by ``#``) from the URL.
        :param max_no_urls: Maximum number of URLs to be crawled per domain (safety limit for very
            large crawls). Set to ``None`` if you want all URLs to be crawled.
        :param max_distance_from_start_url: Maximum number of links that have to be followed to
            arrive at a certain URL from the start URL.
        :param max_subdirectory_depth: Maximum sub-level of the host up to which to crawl.
        :param pause_time: Time to wait between the crawling of two URLs (in seconds).
        :param respect_robots_txt: Whether to respect the specifications made in the website's
            ``robots.txt`` file.
        :param validate: Whether to make sure that input parameters are valid.
        :raises ValueError: If ``validate`` is True and ``filter_foreign_urls`` is not a valid
            mode string or a working two-argument callable.
        """
        if validate:
            # Smoke-test the foreign-URL filter: comparing a URL with itself must
            # always report "same host", whatever mode or callable was supplied.
            # NOTE: the checks use `assert` and therefore silently pass under `python -O`.
            TEST_URL = "https://www.example.com"
            try:
                if not callable(filter_foreign_urls):
                    test_mode = extract_same_host_pattern(TEST_URL) if (filter_foreign_urls == "auto") else filter_foreign_urls
                    assert is_same_host(TEST_URL, TEST_URL, mode=test_mode), \
                        "is_same_host() should be True if the same URL is used."
                else:
                    assert filter_foreign_urls(TEST_URL, TEST_URL), \
                        (f"Error when testing your custom foreign URL filter function "
                         f"({filter_foreign_urls.__name__}): Should be True if the same URL "
                         f"is used for both input arguments.")
            except (ValueError, TypeError, AssertionError) as e:
                # Chain the original exception so the root cause stays visible.
                raise ValueError(f"Parameter filter_foreign_url is not correctly specified: "
                                 f"{filter_foreign_urls}. "
                                 f"The following error occurred during validation: {e}") from e

        self.filter_non_standard_schemes = filter_non_standard_schemes
        # NOTE: attribute name intentionally differs from the parameter name
        # (kept as `filter_media` for backward compatibility with existing callers).
        self.filter_media = filter_media_files
        self.blocklist = blocklist
        self.filter_foreign_urls = filter_foreign_urls
        self.strip_url_parameters = strip_url_parameters
        self.strip_url_fragments = strip_url_fragments

        # "No limit" (None) is stored as infinity so numeric comparisons such as
        # `count < max_no_urls` need no special-casing.
        def _limit_or_inf(limit: Optional[int]) -> float:
            return limit if limit is not None else float("inf")

        self.max_no_urls = _limit_or_inf(max_no_urls)
        self.max_distance_from_start_url = _limit_or_inf(max_distance_from_start_url)
        self.max_subdirectory_depth = _limit_or_inf(max_subdirectory_depth)
        self.pause_time = pause_time
        self.respect_robots_txt = respect_robots_txt