Source code for hpycc.save

"""
**TEMPORARILY DEPRICATED! Just use get and save teh result. Trust us, it's
cleaner**

Functions to get data out of an HPCC instance and save
them to disk.

This modules functions closely mirror those in `get`.
In fact all they really do is wrap `get`'s functions
around csv writing tasks. The first input to all
functions is an instance of `Connection`.

Functions
---------
- `save_output` -- Save the first output of an ECL script.
- `save_outputs` -- Save all outputs of an ECL script.
- `save_thor_file` -- Save the contents of a thor file.

"""

from hpycc import get_output, get_thor_file


[docs]def save_output(connection, script, path_or_buf=None, syntax_check=True, delete_workunit=True, stored=None, **kwargs): """ Save the first output of an ECL script as a csv. See save_outputs() for saving multiple outputs to file and get_output() for returning as a DataFrame. Parameters ---------- connection: `Connection` HPCC Connection instance, see also `Connection`. script: str Path of script to execute. path_or_buf : string or file handle, default None File path or object, if None is provided the result is returned as a string. syntax_check: bool, optional Should script be syntax checked before execution. True by default. delete_workunit: bool, optional Delete workunit once completed. True by default. stored : dict or None, optional Key value pairs to replace stored variables within the script. Values should be str, int or bool. None by default. kwargs Additional parameters to be provided to pandas.DataFrame.to_csv(). Returns ------- None or str if path_or_buf is not None, else a string representation of the output csv. """ result = get_output(connection=connection, script=script, syntax_check=syntax_check, delete_workunit=delete_workunit, stored=stored) return result.to_csv(path_or_buf=path_or_buf, **kwargs)
# def save_outputs(connection, script, directory=".", overwrite=True, # prefix='', syntax_check=True, delete_workunit=True, # stored=None, **kwargs): # """ # Save all outputs of an ECL script as csvs. See get_outputs() # for returning DataFrames and save_output() for writing a single # output to file. Names of CSVs are inhereted from the result # names of your OUTPUT statements. Use NAME() in ECL to specify. # A list of assigned names will be returned for reference. # # Parameters # ---------- # connection : hpycc.Connection # HPCC Connection instance, see also `Connection`. # script : str # Path of script to execute. # directory : str, optional # Directory to save output files in. "." by default. # overwrite : bool # Should files be overwritten if they already exist? True by # default. Because you should know better. # prefix : str, optional # Prefix to prepend to all file names. None by default. # syntax_check : bool, optional # Should the script be syntax checked before execution. True by # default. # delete_workunit : bool, optional # Delete workunit once completed. True by default. # stored : dict or None, optional # Key value pairs to replace stored variables within the # script. Values should be str, int or bool. None by default. # kwargs # Additional parameters to be provided to # pandas.DataFrame.to_csv(). # # # Returns # ------- # str # list of written CSVs # # Raises # ------ # IndexError # If `filenames` is of different length to the number of # outputs. # """ # results = hpycc.get_outputs( # connection, script, syntax_check, delete_workunit, stored) # # paths = [os.path.join(directory, prefix + res + ".csv") for res in results] # if not overwrite and any([os.path.isfile(f) for f in paths]): # raise FileExistsError("Target file already exists and overwrite is False. Aborting.") # # for path, result in zip(paths, results.items()): # result[1].to_csv(path, **kwargs) # # return paths
[docs]def save_thor_file(connection, thor_file, path_or_buf=None, max_workers=15, chunk_size='auto', max_attempts=3, max_sleep=60, dtype=None, **kwargs): """ Save a logical file to disk, see `get_thor_file()` for returning a DataFrame. Parameters ---------- connection: `Connection` HPCC Connection instance, see also `Connection`. thor_file: str Logical file to be downloaded path_or_buf : string or file handle, default None File path or object, if None is provided the result is returned as a string. max_workers: int, optional Number of concurrent threads to use when downloading. Warning: too many will likely cause either your machine or your cluster to crash! 15 by default. chunk_size: int, optional. Size of chunks to use when downloading file. 10000 by default. max_attempts: int, optional Maximum number of times a chunk should attempt to be downloaded in the case of an exception being raised. 3 by default. max_sleep: int, optional Maximum time, in seconds, to sleep between attempts. The true sleep time is a random int between `max_sleep` and `max_sleep` * 0.75. dtype: type name or dict of col -> type, optional Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}. If converters are specified, they will be applied INSTEAD of dtype conversion. If None, or columns are missing from the provided dict, they will be converted to one of bool, str or int based on the HPCC datatype. None by default. kwargs Additional parameters to be provided to pandas.DataFrame.to_csv(). Returns ------- None or str if path_or_buf is not None, else a string representation of the output csv. """ file = get_thor_file( connection, thor_file, max_workers=max_workers, chunk_size=chunk_size, max_attempts=max_attempts, max_sleep=max_sleep, dtype=dtype) return file.to_csv(path_or_buf, **kwargs)