import asyncio
from multiprocessing.pool import ThreadPool
from typing import Any, List, Union

import aiohttp

# Note: scrawler-internal names used below (SearchAttributes, ExportAttributes,
# validate_input_params, backends, the backend modules, ProgressBar,
# timing_decorator, the CSV export helpers and the DEFAULT_* constants) are
# imported elsewhere in this module.


class Scraper:
    def __init__(self,
                 urls: Union[list, str],
                 search_attributes: SearchAttributes,
                 export_attributes: ExportAttributes = None,
                 user_agent: str = None,
                 timeout: Union[int, aiohttp.ClientTimeout] = None,
                 backend: str = DEFAULT_BACKEND,
                 validate_input_parameters: bool = True):
        """Scrape a website or multiple websites in parallel.

        :param urls: Website URL or list of all URLs to scrape.
        :param search_attributes: Specify which data to collect/search for in websites.
        :param export_attributes: Specify how and where to export the collected data (as CSV).
        :param user_agent: Optionally specify a user agent for making the HTTP request.
        :param timeout: Timeout to be used when making HTTP requests. Note that the values
            specified here apply to each request individually, not to an entire session.
            When using the :mod:`.asyncio_backend`, you can pass an
            :class:`aiohttp:aiohttp.ClientTimeout` object to specify detailed timeout
            settings. Alternatively, you can pass an integer that will be interpreted as
            the total timeout for one request in seconds. If nothing is passed, a default
            timeout will be used.
        :param backend: "asyncio" to use the :mod:`.asyncio_backend` (faster when crawling
            many domains at once, but less stable and may hang). "multithreading" to use
            the :mod:`~scrawler.backends.multithreading_backend` (more stable, but most
            likely slower). See also
            `Why are there two backends? <getting_started.html#why-are-there-two-backends>`__
        :param validate_input_parameters: Whether to validate input parameters. This checks
            that all URLs work and that the various attributes work together. The attributes
            themselves are also validated independently, so you need to pass
            ``validate=False`` to each attribute as well to disable input validation
            completely.
        """
        self.urls = [urls] if type(urls) is str else urls  # cast to list because validation expects a list of URLs

        if validate_input_parameters:
            validate_input_params(urls=self.urls, search_attrs=search_attributes,
                                  export_attrs=export_attributes)

        self.search_attrs = search_attributes
        self.export_attrs = export_attributes
        self.backend = backend
        self.user_agent = user_agent

        # Timeout only works with the asyncio backend
        if self.backend == backends.ASYNCIO:
            if isinstance(timeout, aiohttp.ClientTimeout):
                self.timeout = timeout
            elif type(timeout) is int:  # interpret as total timeout in seconds
                self.timeout = aiohttp.ClientTimeout(total=timeout)
            else:
                self.timeout = aiohttp.ClientTimeout(total=DEFAULT_REQUEST_TIMEOUT)

        self._progress_bar = ProgressBar(custom_message="Sites scraped:")
        self.data = None
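    # Usage sketch (kept as a comment so the module's behavior is unchanged):
    # the SearchAttributes arguments are placeholders; see that class for the
    # parameters it actually accepts.
    #
    #     scraper = Scraper(
    #         urls=["https://example.com", "https://example.org"],
    #         search_attributes=SearchAttributes(...),
    #         backend="asyncio",   # or "multithreading" for more stability
    #         timeout=30,          # interpreted as 30 seconds total per request
    #     )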
    @timing_decorator
    def run(self, export_immediately: bool = False) -> List[List[Any]]:
        """Execute the scraping task and return the results.

        :param export_immediately: May be used when scraping many sites at once. In order to
            prevent a :class:`MemoryError`, data will be exported as soon as it is ready and
            then discarded to make room for the next sites.
        :return: The result is a list with one entry per processed URL
            (result = [url1, url2, ...]). Each URL entry is a list with one entry per
            extracted datapoint (url = [datapoint1, datapoint2, ...]).
        """
        export_attrs = self.export_attrs if export_immediately else None

        if self.backend == backends.MULTITHREADING:
            urls_and_index = list(enumerate(self.urls))

            # Define function with constant parameters pre-filled
            def scrape_site_params_prefilled(current_index: int, url: str):
                return multithreading_backend.scrape_site(url, export_attrs=export_attrs,
                                                          search_attrs=self.search_attrs,
                                                          current_index=current_index,
                                                          user_agent=self.user_agent,
                                                          progress_bar=self._progress_bar)

            # Map to ThreadPool
            pool = ThreadPool()
            self.data = pool.starmap(scrape_site_params_prefilled, urls_and_index)
            pool.close()
            pool.join()

        elif self.backend == backends.ASYNCIO:
            async def scrape_all_urls(urls):
                connector = aiohttp.TCPConnector(limit_per_host=DEFAULT_NO_CONCURRENT_REQUESTS_PER_HOST)
                async with aiohttp.ClientSession(timeout=self.timeout, connector=connector) as session:
                    tasks = [asyncio_backend.async_scrape_site(url, session=session,
                                                               search_attrs=self.search_attrs,
                                                               export_attrs=export_attrs,
                                                               current_index=i,
                                                               user_agent=self.user_agent,
                                                               progress_bar=self._progress_bar)
                             for i, url in enumerate(urls)]
                    return await asyncio.gather(*tasks)

            # Use run_until_complete() instead of asyncio.run(), which throws RuntimeErrors,
            # see https://github.com/aio-libs/aiohttp/issues/4324#issuecomment-676675779
            self.data = asyncio.get_event_loop().run_until_complete(scrape_all_urls(self.urls))

        else:
            raise ValueError(f'Backend "{self.backend}" not supported.')

        return self.data
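    # Sketch of consuming run()'s nested result (one inner list per URL, one
    # entry per extracted datapoint), assuming ``scraper`` was built as above:
    #
    #     data = scraper.run()
    #     for url, datapoints in zip(scraper.urls, data):
    #         print(url, datapoints)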
    def run_and_export(self, export_attrs: ExportAttributes = None) -> None:
        """Shorthand for ``Scraper.run(export_immediately=True)``.

        :param export_attrs: :class:`.ExportAttributes` object specifying export parameters.
        """
        if export_attrs is not None:
            self.export_attrs = export_attrs
        if self.export_attrs is None:
            raise ValueError("No export attributes have been passed.")

        self.run(export_immediately=True)
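    # Equivalent one-shot call; ``attrs`` is assumed to be a configured
    # ExportAttributes instance (constructor arguments not shown here):
    #
    #     scraper.run_and_export(export_attrs=attrs)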
    def export_data(self, export_attrs: ExportAttributes = None, export_as_one_file: bool = True) -> None:
        """Export data previously collected from a scraping task.

        :param export_attrs: :class:`.ExportAttributes` object specifying export parameters.
        :param export_as_one_file: If ``True``, the data will be exported in one CSV file,
            each line representing one scraped URL.
        """
        ea = export_attrs if export_attrs is not None else self.export_attrs
        if ea is None:
            raise ValueError("No export attributes have been passed.")
        if self.data is None:
            print("No data to be exported. Execute the run() method before exporting.")
            return

        if export_as_one_file:
            export_to_csv(self.data, **ea.__dict__)
        else:
            if len(self.data) == 1:
                export_to_csv(self.data[0], **ea.__dict__)
            else:
                multithreaded_csv_export(self.data, **ea.__dict__)
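# Export sketch: after run() has populated ``scraper.data``, the same results can
# be written either as one combined CSV or as one file per URL. ``attrs`` is again
# an assumed, pre-configured ExportAttributes instance:
#
#     scraper.run()
#     scraper.export_data(export_attrs=attrs, export_as_one_file=True)   # one CSV, one row per URL
#     scraper.export_data(export_attrs=attrs, export_as_one_file=False)  # one CSV per scraped URL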