# Source code for hpycc.utils.parsers

import re
from xml.etree import ElementTree
from collections import OrderedDict
import pandas as pd
import numpy as np


def parse_xml(xml):
    """
    Return a DataFrame from a nested XML.

    Every ``<Row>...</Row>`` fragment found in `xml` becomes one row of
    the result and every direct child tag of a row becomes a column, in
    order of first appearance. Values are matched to columns by tag
    name, so a row that lacks a tag (or lists its tags in a different
    order) gets a missing value in that column instead of shifting its
    other values sideways. If a tag repeats inside one row the last
    value is kept.

    Parameters
    ----------
    xml : str
        xml to be parsed.

    Returns
    -------
    df : pd.DataFrame
        Parsed xml. Empty or absent cells are NaN; columns holding only
        numeric strings or only "true"/"false" strings are converted by
        `_make_col_numeric` and `_make_col_bool`.
    """
    columns = []
    seen = set()
    rows = []

    # NOTE: "." does not match newlines here, so a row has to sit on a
    # single line of `xml` to be picked up.
    for content in re.findall("<Row>(?P<content>.+?)</Row>", xml):
        row = ElementTree.fromstring('<Row>' + content + '</Row>')
        values = {}
        for child in row:
            if child.tag not in seen:
                seen.add(child.tag)
                columns.append(child.tag)
            values[child.tag] = child.text
        rows.append(values)

    df = pd.DataFrame(rows, columns=columns)
    # Normalise both empty strings and None to NaN.
    df.replace("", np.nan, inplace=True)
    df.fillna(np.nan, inplace=True)

    df = _make_col_numeric(df)
    df = _make_col_bool(df)

    return df


def _make_col_numeric(df):
    """
    Convert string numeric columns to numerics.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to run conversion on.

    Returns
    -------
    df : pd.DataFrame
        Data frame with all string numeric columns converted to
        numeric.
    """

    for col in df.columns:
        try:
            nums = pd.to_numeric(df[col])
            df[col] = nums
        except ValueError:
            continue

    return df


def _make_col_bool(df):
    """
    Convert string boolean columns to booleans.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to run conversion on.

    Returns
    -------
    df : pd.DataFrame
        Data frame with all string boolean columns converted to
        boolean.
    """
    for col in df.columns:
        unique_vals = df[col].unique()
        if set(unique_vals).issubset(["true", "false"]):
            df[col] = (df[col] == "true").astype('bool')
        elif set(unique_vals).issubset(["true", "false", np.nan]):
            df[col] = df[col].map({
                "true": True,
                "false": False,
                "": np.nan,
                None: np.nan
            })

    return df


def parse_wuid_from_failed_response(result):
    """
    Extract a Workunit ID from the text of a failed response.

    Parameters
    ----------
    result : str
        Response text to search.

    Returns
    -------
    wuid : str or None
        The first substring made of "W", eight digits and any directly
        following non-whitespace characters, or None if there is none.
    """
    match = re.search(r"W[0-9]{8}(\S*)", result)
    return match.group(0) if match else None


def parse_wuid_from_xml(result):
    """
    Function retrieves a WUID for a script that has run. This retrieves it
    only in the cases where the request response was in XML format.

    Parameters
    ----------
    result : str
        The XML response for the script that has run.

    Returns
    -------
    wuid : str or None
        The Workunit ID from the XML, or None when the response has no
        "wuid: ...   state:" section or that section holds no Workunit
        ID.

    """
    # Line breaks may split the section of interest, so drop them first.
    flattened = result.replace("\r\n", "")

    section = re.search("wuid: (.+?)   state:", flattened)
    if section is None:
        return None

    # Strip the surrounding markers and every blank from the match.
    candidate = (
        section.group(0)
        .replace('wuid: ', '')
        .replace('   state:', '')
        .replace(' ', '')
    )

    wuid = re.search(r"W[0-9]{8}(\S*)", candidate)
    if wuid is None:
        return None

    return wuid.group(0)


def parse_schema_from_xml(xml):
    """
    Parse an ECL schema into python types.

    Parameters
    ----------
    xml : str
        xml string returned by ecl run. This is located in the json
        as ["WUResultResponse"]["Result"]["XmlSchema"]["xml"].

    Returns
    -------
    OrderedDict
        Column descriptions keyed by column name, in order of
        occurrence. Each value is a dict of the form
        {'type': type, 'is_a_set': bool}, where 'type' is the python
        type returned by `get_python_type_from_ecl_type` and
        'is_a_set' is True when the column node itself has no "type"
        attribute.

    """
    root = ElementTree.fromstring(xml.replace("\n", ""))
    # The column definitions are the children of the node found by
    # taking the first child six times starting from the root.
    columns = root[0][0][0][0][0][0]

    schema_out = OrderedDict()
    for child in columns:
        name = child.attrib["name"]
        schema_out[name] = {
            'type': get_python_type_from_ecl_type(child),
            'is_a_set': "type" not in child.attrib,
        }

    return schema_out


def apply_custom_dtypes(schema, dtypes):
    """
    Override the column types of a parsed schema.

    Parameters
    ----------
    schema : dict
        Mapping of column name to a dict with a 'type' entry, as
        produced by `parse_schema_from_xml`. It is modified in place.
    dtypes : dict, type or None
        If a dict, maps column names to the type to use for that
        column. Any other truthy value is used as the type of every
        column. A falsy value (e.g. None) leaves `schema` unchanged.

    Returns
    -------
    schema : dict
        The same `schema` object with the requested types applied.

    Raises
    ------
    KeyError
        If `dtypes` is a dict naming a column that is not in `schema`.
    """
    if isinstance(dtypes, dict):
        if set(dtypes).difference(schema):  # Check that all columns passed exist
            raise KeyError('Not all dtype columns exist in the logical file!\nFound: %s\nGiven: %s' %
                           (schema.keys(), dtypes))

        for name, dtype in dtypes.items():
            schema[name]['type'] = dtype

    elif dtypes:  # assuming it's a single type for everything
        for column in schema.values():
            column['type'] = dtypes

    return schema


def get_python_type_from_ecl_type(child):
    """
    Get the python type from an hpcc schema node

    Parameters
    ----------
    child : XML node
       Node of schema xml. See `parse_schema_from_xml`

    Returns
    -------
    type : type
        Pythonic type. If the HPCC type cannot be mapped, is str.

    """
    translated_type = {
        "boolean": bool,
        "decimal": float,
        "double": float,
        "integer": int,
        "udecimal": float,
        "nonnegativeinteger": int
    }

    # The "type" attribute may sit on the node itself or on any of its
    # descendants. Nodes without one contribute "", and taking the
    # maximum picks the lexicographically greatest type string, so any
    # real type string wins over "".
    declared = max(
        node.attrib.get("type", "") for node in child.iter()
    ).lower()
    # Drop digits, underscores and the "xs:" prefix to get the bare
    # type name.
    bare = re.sub(r"[0-9_]|(xs:)", "", declared)
    return translated_type.get(bare, str)  # Return type, default to string.