"""Functions for local file import/export operations, e. g. CSV file reading and writing."""importosfrommultiprocessing.dummyimportPoolasThreadPoolfromtypingimportUnionimportloggingimportpandasaspdfromscrawler.defaultsimportDEFAULT_CSV_ENCODING,DEFAULT_CSV_SEPARATOR,DEFAULT_CSV_QUOTING,DEFAULT_CSV_ESCAPECHAR
def export_to_csv(data, directory: str, fn: str, header: Union[list, str, bool] = None,
                  encoding: str = DEFAULT_CSV_ENCODING, separator: str = DEFAULT_CSV_SEPARATOR,
                  quoting: int = DEFAULT_CSV_QUOTING, escapechar: str = DEFAULT_CSV_ESCAPECHAR,
                  current_index: int = None, **kwargs) -> None:
    """Export data to a CSV file.

    If ``data`` is ``None`` or empty, an error is logged and nothing is written.

    :param data: One- or two-dimensional data that will be parsed to a :class:`pandas:pandas.DataFrame`.
    :param directory: Path to directory where file will be saved.
    :param fn: Filename (*without* file extension). May also be a list of filenames, see ``current_index``.
    :param header: If ``None`` or ``False``, no header will be written.
        If ``first-row`` or ``True``, uses first row of data as header.
        Else, pass list of strings of appropriate length.
    :param encoding: Encoding to use to create the CSV file.
    :param separator: Column separator or delimiter to use for creating the CSV file.
    :param quoting: Quoting mode, passed on as ``quoting`` to :meth:`pandas:pandas.DataFrame.to_csv`.
    :param escapechar: Escape character, passed on as ``escapechar`` to :meth:`pandas:pandas.DataFrame.to_csv`.
    :param current_index: If ``fn`` is a list of filenames, use this to specify which filename to use.
    :param kwargs: Any parameter supported by :meth:`pandas:pandas.DataFrame.to_csv` can be passed.
    """
    # ``fn`` may be a sequence of filenames; pick the one belonging to this dataset.
    if not isinstance(fn, str) and current_index is not None:
        fn = fn[current_index]

    filepath = f"{directory}/{fn}.csv"

    if data is None or len(data) == 0:
        # TODO maybe raise ValueError instead?
        logging.error("Can't export empty dataset.")
        return

    if not isinstance(data[0], (list, tuple, set)):
        # If data consists of just one data point, wrap it into another list
        # so that pandas parses it into multiple columns instead of multiple rows.
        data = [data]

    # Both ``None`` and ``False`` mean "no header". The same flag decides how the
    # DataFrame is built AND whether column labels are written, so that
    # ``header=False`` does not emit pandas' auto-generated integer labels.
    no_header = (header is None) or (header is False)

    if no_header:
        container = pd.DataFrame(data)
    elif (header == "first-row") or (header is True):
        container = pd.DataFrame(data[1:], columns=data[0])
    else:
        container = pd.DataFrame(data, columns=header)

    container.to_csv(
        filepath,
        encoding=encoding,
        sep=separator,
        header=not no_header,
        index=False,  # the row index is never written
        quoting=quoting,
        escapechar=escapechar,
        **kwargs,
    )
    logging.info(f"Data exported to {filepath}.")
def multithreaded_csv_export(list_of_datasets: list, **kwargs) -> None:
    """Export a list of multi-column datasets to CSV files in parallel using a thread pool.

    Each dataset is exported by :func:`.export_to_csv`, which receives the dataset's
    position in ``list_of_datasets`` as ``current_index``.

    :param list_of_datasets: List of two-dimensional data objects that will be parsed
        to a :class:`pandas:pandas.DataFrame`.
    :param kwargs: Keyword arguments that are passed on to :func:`.export_to_csv`.
    """
    # Prepare argument list: one (index, dataset) tuple per export call.
    indexed_datasets = list(enumerate(list_of_datasets))
    if not indexed_datasets:
        # Nothing to export, so there is no need to start any worker threads.
        return

    # Define function with constant parameters pre-filled.
    def do_export(index, data):
        return export_to_csv(data, current_index=index, **kwargs)

    pool = ThreadPool()
    try:
        pool.starmap(do_export, indexed_datasets)
    finally:
        # Always shut the pool down, even if an export raised, so that
        # worker threads are not left behind.
        pool.close()
        pool.join()
def get_data_in_dir(directory: str, start_idx: int = 0, end_idx: int = None,
                    encoding: str = DEFAULT_CSV_ENCODING, separator: str = DEFAULT_CSV_SEPARATOR) -> list:
    """Read all CSV files within a directory.

    All files in the directory must be CSV files. Filenames are sorted before
    ``start_idx`` and ``end_idx`` are applied, so the selected interval is the same on every call.

    :param directory: Path to the directory.
    :param start_idx: Sometimes, not all CSV files in the directory should be read.
        Together with ``end_idx``, this parameter allows to specify an interval of files
        that should be read in, e. g. the first up to the 5th file.
    :param end_idx: See ``start_idx``. If ``None``, files are read up to the last one.
    :param encoding: The character encoding of the CSV files to be read.
    :param separator: The separator/delimiter of the CSV files to be read.
    :return: List with one :class:`pandas:pandas.DataFrame` per file that was read.
    """
    # ``os.listdir`` returns entries in arbitrary order; sort them so that the
    # index interval always refers to the same files.
    filenames = sorted(os.listdir(directory))

    # Slicing with ``end_idx=None`` runs to the end of the list.
    selected = filenames[start_idx:end_idx]

    return [
        pd.read_csv(os.path.join(directory, filename), encoding=encoding, sep=separator)
        for filename in selected
    ]