# Source code for hpycc.save

"""
**TEMPORARILY DEPRECATED! Just use get and save the result. Trust us, it's
cleaner**

Functions to get data out of an HPCC instance and save
them to disk.

This module's functions closely mirror those in `get`.
In fact all they really do is wrap `get`'s functions
around csv writing tasks. The first input to all
functions is an instance of `Connection`.

Functions
---------
- `save_output` -- Save the first output of an ECL script.
- `save_outputs` -- Save all outputs of an ECL script.
- `save_thor_file` -- Save the contents of a thor file.

"""

from hpycc import get_output, get_thor_file


def save_output(connection, script, path_or_buf=None, syntax_check=True,
                delete_workunit=True, stored=None, **kwargs):
    """
    Run an ECL script and save its first output as a csv.

    This is a thin wrapper: the script is executed through
    `get_output()` and the object it returns is serialised with its
    `to_csv()` method. Use `get_output()` directly to obtain the
    result as a DataFrame instead.

    Parameters
    ----------
    connection: `Connection`
        HPCC Connection instance, see also `Connection`.
    script: str
        Path of script to execute.
    path_or_buf : string or file handle, default None
        File path or object to write the csv to. If None is provided
        the csv is returned as a string instead of being written.
    syntax_check: bool, optional
        Should script be syntax checked before execution. True by
        default. Forwarded unchanged to `get_output()`.
    delete_workunit: bool, optional
        Delete workunit once completed. True by default. Forwarded
        unchanged to `get_output()`.
    stored : dict or None, optional
        Key value pairs to replace stored variables within the
        script. Values should be str, int or bool. None by default.
    kwargs
        Additional parameters to be provided to
        pandas.DataFrame.to_csv(). Do not pass `path_or_buf` here; it
        is already supplied explicitly.

    Returns
    -------
    None or str
        None if `path_or_buf` is given (the csv is written there),
        otherwise a string representation of the output csv.
    """
    # Every execution option is handed to get_output by keyword so the
    # call does not depend on get_output's positional parameter order.
    frame = get_output(connection=connection, script=script,
                       syntax_check=syntax_check,
                       delete_workunit=delete_workunit,
                       stored=stored)

    # to_csv covers both modes: write to path_or_buf, or return the csv
    # text when path_or_buf is None.
    return frame.to_csv(path_or_buf=path_or_buf, **kwargs)


# def save_outputs(connection, script, directory=".", overwrite=True,
#                  prefix='', syntax_check=True, delete_workunit=True,
#                  stored=None, **kwargs):
#     """
#     Save all outputs of an ECL script as csvs. See get_outputs()
#     for returning DataFrames and save_output() for writing a single
#     output to file. Names of CSVs are inherited from the result
#     names of your OUTPUT statements. Use NAME() in ECL to specify.
#     A list of assigned names will be returned for reference.
#
#     Parameters
#     ----------
#     connection : hpycc.Connection
#         HPCC Connection instance, see also `Connection`.
#     script : str
#          Path of script to execute.
#     directory : str, optional
#         Directory to save output files in. "." by default.
#     overwrite : bool
#         Should files be overwritten if they already exist? True by
#         default. Because you should know better.
#     prefix : str, optional
#         Prefix to prepend to all file names. Empty string by default.
#     syntax_check : bool, optional
#         Should the script be syntax checked before execution. True by
#         default.
#     delete_workunit : bool, optional
#         Delete workunit once completed. True by default.
#     stored : dict or None, optional
#         Key value pairs to replace stored variables within the
#         script. Values should be str, int or bool. None by default.
#     kwargs
#         Additional parameters to be provided to
#         pandas.DataFrame.to_csv().
#
#
#     Returns
#     -------
#     list of str
#         Paths of the written CSVs.
#
#     Raises
#     ------
#     FileExistsError
#         If `overwrite` is False and any of the target files already
#         exists.
#     """
#     results = hpycc.get_outputs(
#         connection, script, syntax_check, delete_workunit, stored)
#
#     paths = [os.path.join(directory, prefix + res + ".csv") for res in results]
#     if not overwrite and any([os.path.isfile(f) for f in paths]):
#         raise FileExistsError("Target file already exists and overwrite is False. Aborting.")
#
#     for path, result in zip(paths, results.items()):
#         result[1].to_csv(path, **kwargs)
#
#     return paths


def save_thor_file(connection, thor_file, path_or_buf=None,
                   max_workers=15, chunk_size='auto', max_attempts=3,
                   max_sleep=60, dtype=None,
                   **kwargs):
    """
    Save a logical file to disk as a csv.

    This is a thin wrapper: the file is downloaded through
    `get_thor_file()` and the object it returns is serialised with its
    `to_csv()` method. Use `get_thor_file()` directly to obtain the
    contents as a DataFrame instead.

    Parameters
    ----------
    connection: `Connection`
        HPCC Connection instance, see also `Connection`.
    thor_file: str
        Logical file to be downloaded.
    path_or_buf : string or file handle, default None
        File path or object to write the csv to. If None is provided
        the csv is returned as a string instead of being written.
    max_workers: int, optional
        Number of concurrent threads to use when downloading.
        Warning: too many will likely cause either your machine or
        your cluster to crash! 15 by default.
    chunk_size: int or 'auto', optional
        Size of chunks to use when downloading file. 'auto' by
        default; the value is forwarded unchanged to
        `get_thor_file()`.
        NOTE(review): this docstring previously claimed a default of
        10000 -- confirm what 'auto' resolves to in `get_thor_file()`.
    max_attempts: int, optional
        Maximum number of times a chunk should attempt to be
        downloaded in the case of an exception being raised.
        3 by default.
    max_sleep: int, optional
        Maximum time, in seconds, to sleep between attempts.
        The true sleep time is a random int between `max_sleep` and
        `max_sleep` * 0.75. 60 by default.
    dtype: type name or dict of col -> type, optional
        Data type for data or columns. E.g. {'a': np.float64, 'b':
        np.int32}. If converters are specified, they will be applied
        INSTEAD of dtype conversion. If None, or columns are missing
        from the provided dict, they will be converted to one of
        bool, str or int based on the HPCC datatype. None by default.
    kwargs
        Additional parameters to be provided to
        pandas.DataFrame.to_csv(). Do not pass `path_or_buf` here; it
        is already supplied explicitly.

    Returns
    -------
    None or str
        None if `path_or_buf` is given (the csv is written there),
        otherwise a string representation of the output csv.

    """
    # Download options are passed through untouched; this function adds
    # nothing beyond the csv serialisation below.
    frame = get_thor_file(
        connection, thor_file, max_workers=max_workers, chunk_size=chunk_size,
        max_attempts=max_attempts, max_sleep=max_sleep, dtype=dtype)

    # Keyword form matches save_output and keeps the call independent of
    # to_csv's positional parameter order.
    return frame.to_csv(path_or_buf=path_or_buf, **kwargs)