# Source code for easymoney.easy_pandas

# coding: utf-8

"""

    Supporting Pandas Tools
    ~~~~~~~~~~~~~~~~~~~~~~~

"""

# Modules #
import copy
import warnings
import datetime
import numpy as np
import pandas as pd


def pstr(s):
    """

    Convert any object to a string using pandas.

    The object is wrapped in a one-element ``Series`` which is then cast with
    ``astype(str)``; the text returned is whatever pandas produces for that cast.

    :param s: item to be converted to a string.
    :type s: ``any``
    :return: a string
    :rtype: ``str``
    """
    # ``str`` is used as the target dtype instead of the 'unicode' alias, which is a
    # Python 2 era spelling that newer NumPy/pandas releases may not recognise.
    # The outer ``str()`` is a no-op when the cast already yielded a string; it only
    # guarantees the documented return type if pandas hands back something else
    # (e.g. a missing-value marker).
    return str(pd.Series([s]).astype(str)[0])


def strlist_to_list(to_parse, convert_to_str_first=False):
    """

    Work around for using ``eval()`` for the following conversion: ``str(list)`` → ``list``.

    For example: ``"[1992, '221-21', 2102, 'apples']"`` → ``['1992', '221-21', '2102', 'apples']``.

    Every ``[`` and ``]`` is removed, the remainder is split on commas, and each piece
    is stripped of surrounding whitespace and of all single quotes. Consequently every
    item comes back as a string, an item that itself contains a comma is split in two,
    and ``"[]"`` yields ``['']`` rather than an empty list.

    :param to_parse: a string of a list.
    :type to_parse: ``str``
    :param convert_to_str_first: convert to a string first (as a precaution). Defaults to False.
    :type convert_to_str_first: ``bool``
    :return: string of a list to an actual list.
    :rtype: ``list``
    """
    str_list = pstr(to_parse) if convert_to_str_first else to_parse

    # Drop the enclosing brackets, then split into the individual items.
    without_brackets = str_list.replace("[", "").replace("]", "")
    return [item.strip().replace("'", "") for item in without_brackets.split(",")]


def _pandas_dictkey_to_key_unpack(pandas_series, unpack_dict, convert_values_to_str=False):
    """

    Used to unpack ISO Alpha2 --> Alpha2 ISO code Pandas Series.

    Each entry of the series is first replaced according to ``unpack_dict``; the
    result is then stringified and parsed back into a list with ``strlist_to_list()``.

    :param pandas_series: a Series to replace the alpha2 codes with another set.
    :type pandas_series: ``Pandas Series``
    :param unpack_dict: a dict with the values coerced into strings.
    :type unpack_dict: ``dict``
    :param convert_values_to_str: convert the values to string (precaution).
    :type convert_values_to_str: ``bool``
    :return: a pandas series with the alpha2 values replaced with unpack_dict.values().
             ``NaN`` is used if a match cannot be found.
    :rtype: ``Pandas Series``
    """
    if convert_values_to_str:
        unpack_dict = {k: pstr(v) for k, v in unpack_dict.items()}

    def _to_list_or_nan(value):
        # Stringify once and reuse the text for both the NaN test and the parse.
        text = pstr(value)
        # NOTE(review): this is a substring test, so any text containing 'nan'
        # (not only a bare NaN) maps to NaN -- confirm this is intended.
        return np.nan if 'nan' in text else strlist_to_list(text)

    # ``np.nan`` is used above because the ``np.NaN`` alias no longer exists in NumPy 2.
    return pandas_series.replace(unpack_dict).map(_to_list_or_nan)


def _standard_pd_nester(data_frame, nest_col_a, nest_col_b, nest_col_c, keys_to_str = True):
    """

    Method to produce a nested dict from a large pandas dataframe.
    Reliable technique (although slow with large DataFrames).

    :param data_frame: see ``twoD_nested_dict()``.
    :type data_frame: ``Pandas DataFrame``
    :param nest_col_a: see ``twoD_nested_dict()``.
    :type nest_col_a: ``str``
    :param nest_col_b: see ``twoD_nested_dict()``.
    :type nest_col_b: ``str``
    :param nest_col_c: see ``twoD_nested_dict()``.
    :type nest_col_c: ``str``
    :param keys_to_str: see ``twoD_nested_dict()``.
    :type keys_to_str: ``bool``
    :return: nested dict of the form: ``{nest_col_a: {nest_col_b: nest_col_c}}``.
    :rtype: dict
    """
    # Initialize
    df_slice = None

    # Make columns that are to become keys strings.
    if keys_to_str:
        data_frame[nest_col_a] = data_frame[nest_col_a].astype(str).str.upper()
        data_frame[nest_col_b] = data_frame[nest_col_b].astype(str)

    # Create a dict from keys()
    nested_dict = dict.fromkeys(data_frame[nest_col_a].unique())
    for k in nested_dict.keys():
        df_slice = data_frame[data_frame[nest_col_a] == k]
        nested_dict[k] = dict(zip(df_slice[nest_col_b], df_slice[nest_col_c]))

    return nested_dict


def _fast_pd_nester(data_frame, nest_col_a, nest_col_b, nest_col_c, keys_to_str = True):
    """

    | This is a fast way to produce a nested dict from a large Pandas DataFrames.
    | Can handle DataFrames with several nest_col_a entries that are the same,
    | e.g.,
    |         nest_col_a    nest_col_b  nest_col_c
    | 0       1999-01-04      AUD          0
    | 1       1999-01-04      CAD          1
    | 2       1999-01-05      CHF          2
    | 3       1999-01-05      CYP          3
    |
    | WARNING: Not well-tested. This procedure may produce inaccuracies.

    :param data_frame: see ``twoD_nested_dict()``.
    :type data_frame: Pandas DataFrame
    :param nest_col_a: see ``twoD_nested_dict()``.
    :type nest_col_a: str
    :param nest_col_b: see ``twoD_nested_dict()``.
    :type nest_col_b: str
    :param nest_col_c: see ``twoD_nested_dict()``.
    :type nest_col_c: str
    :param keys_to_str: see ``twoD_nested_dict()``.
    :type keys_to_str: ``bool``
    :return: nested dict of the form {nest_col_a: {nest_col_b: nest_col_c}}.
    :rtype: dict
    """
    # Group nest_col_b and nest_col_c by nest_col_a
    col_b_groupby = data_frame.groupby(nest_col_a)[nest_col_b].apply(lambda x: x.tolist()).reset_index()
    col_c_groupby = data_frame.groupby(nest_col_a)[nest_col_c].apply(lambda x: x.tolist()).reset_index()

    # Merge on nest_col_a
    grouped_data_frame = pd.merge(col_b_groupby, col_c_groupby, on = nest_col_a, how = 'outer')

    # Convert columns that will become keys to a string
    if keys_to_str:
        grouped_data_frame[nest_col_a] = grouped_data_frame[nest_col_a].astype(str)
        grouped_data_frame[nest_col_b] = grouped_data_frame[nest_col_b].map(lambda x: [pstr(i) for i in x])

    # Generate a dictionary by zipping the nest_col_b and nest_col_c columns
    grouped_data_frame['dict_zipped'] = grouped_data_frame[[nest_col_b, nest_col_c]].apply(
                                                                    lambda x: dict(zip(x[0], x[1])), axis = 1)

    # Return as a nested dict
    return dict(zip(grouped_data_frame[nest_col_a], grouped_data_frame['dict_zipped']))


def twoD_nested_dict(data_frame,
                     nest_col_a=None,
                     nest_col_b=None,
                     nest_col_c=None,
                     to_float=None,
                     to_int=None,
                     keys_to_str=True,
                     engine='standard'):
    """

    Generate a nested dictionary from the columns of a pandas dataframe.
    Defaults to using the first 3 columns.

    The passed DataFrame is not modified; conversions are applied to a copy.

    :param data_frame: a pandas dataframe.
    :type data_frame: ``Pandas DataFrame``
    :param nest_col_a: reference to column in the dataframe; to become the master key in dict.
                       Defaults to None (i.e., col 1).
    :type nest_col_a: ``str``
    :param nest_col_b: reference to column in the dataframe; to become the sub-key.
                       Defaults to None (i.e., col 2).
    :type nest_col_b: ``str``
    :param nest_col_c: reference to column in the dataframe; to become the value corresponding to the sub-key.
                       Defaults to None (i.e., col 3).
    :type nest_col_c: ``str``
    :param to_float: a list of columns to convert to float. Defaults to None.
    :type to_float: ``list``
    :param to_int: a list of columns to convert to int. Defaults to None.
    :type to_int: ``list``
    :param keys_to_str: Convert the columns that will become keys to strings. Default to True.
                        WARNING: will OVERRIDE *to_float* and *to_int* if they reference nest_col_a or nest_col_b.
    :type keys_to_str: ``bool``
    :param engine: 'standard' for a slower (but well-tested) method of generating a nested dict; 'fast' to employ a
                    speedy (but *not well-tested*) method for generating a nested dict. Default to 'standard'.
    :type engine: ``str``
    :return: nested dict of the form: ``{nest_col_a: {nest_col_b: nest_col_c}}``.
    :rtype: ``dict``
    :raises ValueError: if ``engine`` is not one of 'standard' or 'fast'.
    """
    # Reject an unknown engine up front instead of silently returning None.
    if engine not in ('standard', 'fast'):
        raise ValueError("engine must be either 'standard' or 'fast'.")

    # Each column reference left as None falls back to its positional default.
    if nest_col_a is None:
        nest_col_a = data_frame.columns[0]
    if nest_col_b is None:
        nest_col_b = data_frame.columns[1]
    if nest_col_c is None:
        nest_col_c = data_frame.columns[2]

    # Work on a copy so that the conversions below never leak into the caller's DataFrame.
    data_frame = data_frame.copy()

    # Convert selected columns to float
    if to_float is not None:
        for col in to_float:
            data_frame[col] = data_frame[col].astype(float)

    # Convert selected columns to int
    if to_int is not None:
        for col in to_int:
            data_frame[col] = data_frame[col].astype(int)

    # Use one of two engines to generate the nested dict
    if engine == 'fast':
        return _fast_pd_nester(data_frame, nest_col_a, nest_col_b, nest_col_c, keys_to_str)
    return _standard_pd_nester(data_frame, nest_col_a, nest_col_b, nest_col_c, keys_to_str)


def pandas_list_column_to_str(data_frame, columns, join_on=", ", bracket_wrap=False):
    """

    Tool for converting the columns in a Pandas DataFrame
    from pd.Series of lists into comma-separated strings.

    Cells whose string form is ``'nan'`` are passed through unchanged. List items
    are converted with ``str()`` before joining, so lists of non-string items
    (e.g. integers) are accepted.

    :param data_frame: a dataframe.
    :type data_frame: ``Pandas DataFrame``
    :param columns: a list of columns in the DataFrame
    :type columns: ``list``
    :param join_on: a string to join on. Defaults to ", ".
    :type join_on: ``str``
    :param bracket_wrap: if True, wrap each joined string in square brackets. Defaults to False.
    :type bracket_wrap: ``bool``
    :return: a copy of the dataframe with the columns altered in the manner described above.
    :rtype: ``Pandas DataFrame``
    """
    def _stringify(cell):
        # Leave missing cells exactly as they were.
        if pstr(cell) == 'nan':
            return cell
        joined = join_on.join(str(item) for item in cell)
        # A joined result that reads 'nan' is deliberately left unwrapped, matching
        # the NaN test applied to the cells themselves.
        if bracket_wrap and joined != 'nan':
            return "[" + joined + "]"
        return joined

    df = data_frame.copy()
    for col in columns:
        df[col] = df[col].map(_stringify)

    return df


def pandas_str_column_to_list(data_frame, columns):
    """

    Tool for converting the columns in a Pandas DataFrame
    from comma-separated strings into a pd.Series of lists.

    Each cell is cast to a string, split on commas, and every piece is stripped of
    surrounding whitespace. Because of the string cast, non-string cells are split
    on their string form as well.

    :param data_frame: a dataframe.
    :type data_frame: ``Pandas DataFrame``
    :param columns: a list of columns in the dataframe.
    :type columns: ``list``
    :return: a copy of the dataframe with the columns altered in the manner described above.
    :rtype: ``Pandas DataFrame``
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    converted = data_frame.copy()
    for col in columns:
        converted[col] = converted[col].astype(str).map(lambda x: [i.strip() for i in x.split(",")])
    return converted


def type_in_series(series):
    """

    Return the types of objects in a Pandas Series.

    An element whose stripped string form is ``'nan'`` or empty is reported as
    ``'nan'``; every other element is reported by the name of its type.

    :param series: a series.
    :type series: ``Pandas Series``
    :return: list of the distinct type names in a series (in no particular order).
    :rtype: ``list``
    """
    return list({'nan' if pstr(i).strip() in ('nan', '') else type(i).__name__ for i in series})


def prettify_all_pandas_list_cols(data_frame, join_on=", ", allow_nan=True, exclude=None, bracket_wrap=False):
    """

    Converts all columns with only lists to list-separated-strings.

    :param data_frame: a dataframe.
    :type data_frame: ``Pandas DataFrame``
    :param join_on: a string to join on. Defaults to ", ".
    :type join_on: ``str``
    :param allow_nan: also convert columns that hold NaNs alongside lists. Defaults to True.
    :type allow_nan: ``bool``
    :param exclude: columns to exclude. Defaults to None (i.e., exclude nothing).
    :type exclude: ``list``
    :param bracket_wrap: wrap in brackets.
    :type bracket_wrap: ``bool``
    :return: dataframe with columns lists converted to strings. If no column qualifies,
             the dataframe is returned as passed.
    :rtype: ``Pandas DataFrame``
    """
    # ``None`` stands in for "no exclusions" so that no mutable default is shared between calls.
    excluded = () if exclude is None else exclude
    allowed = [['list'], ['list', 'nan']] if allow_nan else [['list']]

    # Find allowed columns. The exclusion test comes first so that excluded columns
    # are never scanned for their element types.
    cols_to_prettify = [
        c for c in data_frame.columns
        if c not in excluded and sorted(type_in_series(data_frame[c])) in allowed
    ]

    if not cols_to_prettify:
        return data_frame
    return pandas_list_column_to_str(data_frame, cols_to_prettify, join_on, bracket_wrap)


def items_null(element):
    """

    Check if an object is a NaN, including all the elements in an iterable.

    Lists, tuples and numpy arrays are considered null when every one of their
    elements is null (an empty one therefore counts as null). Anything else is
    passed straight to ``pd.isnull()``.

    :param element: a python object.
    :type element: ``any``
    :return: assessment of whether or not `element` is a NaN.
    :rtype: ``bool``
    """
    # ``np.ndarray`` is the array class; ``type(np.array)`` would be the type of the
    # ``np.array`` factory function and never matches an actual array.
    if isinstance(element, (list, tuple, np.ndarray)):
        return all(pd.isnull(i) for i in element)
    return pd.isnull(element)


def pandas_null_drop(data_frame, subset=None):
    """

    Drop rows with NaNs of all, or a subset of, the dataframe's columns.
    (Can handle iterables which only contain NaNs).

    :param data_frame: a dataframe.
    :type data_frame: ``Pandas DataFrame``
    :param subset: a subset of columns. Defaults to None, which will apply to all columns.
    :type subset: ``iterable``
    :return: dataframe with NaN dropped and the index reset.
    :rtype: ``Pandas DataFrame``
    """
    # Fill Empty cells with nans (``np.nan``: the ``np.NaN`` alias no longer exists in NumPy 2).
    data_frame = data_frame.fillna(np.nan)

    if subset is None:
        data_frame = data_frame.dropna()
    else:
        for c in subset:
            # ``items_null`` also flags cells holding an iterable made up solely of NaNs.
            # The mask is forced to bool so that ``~`` always means logical negation.
            is_null = data_frame[c].map(items_null).astype(bool)
            data_frame = data_frame[~is_null]

    return data_frame.reset_index(drop=True)

# ----------------------------------------------------------------------------------------------------------
# Printing Suite
# ----------------------------------------------------------------------------------------------------------

def _padding(s, amount, justify):
    """

    Add padding to a string.

    :param s: a string.
    :type s: ``str``
    :param amount: the amount of white space to add.
    :type amount: ``float`` or ``int``
    :param justify: the justification of the resultant text. Must be one of: 'left' or 'center'.
    :type justify: ``str``
    :return: `s` justified, or as passed if `justify` is not one of: 'left' or 'center'.
    :rtype: ``str``
    """
    pad = ' ' * amount
    if justify == 'left':
        return "%s%s" % (pstr(s), pad)
    elif justify == 'center':
        return "%s%s%s" % (pad[:int(amount/2)], pstr(s), pad[int(amount/2):])
    else:
        return s

def _pandas_series_alignment(pandas_series, justify):
    """

    Align all items in a pandas series.

    :param pandas_series: a pandas series.
    :type pandas_series: ``Pandas Series``
    :param justify: the justification of the resultant text. Must be one of: 'left', 'right' or 'center'.
    :type justify: ``str``
    :return: aligned series
    :rtype: ``str``
    """
    if justify == 'right':
        return pandas_series
    longest_string = max([len(s) for s in pandas_series.astype('unicode')])
    return [_padding(s, longest_string - len(s), justify) if not items_null(s) else s for s in pandas_series]


def align_pandas(data_frame, to_align='right'):
    """

    Align the columns of a Pandas DataFrame by adding whitespace.

    The passed DataFrame is not modified; an aligned copy is returned.

    :param data_frame: a dataframe.
    :type data_frame: ``Pandas DataFrame``
    :param to_align: 'left', 'right', 'center' or a dictionary of the form: ``{'Column': 'Alignment'}``.
    :type to_align: ``str`` or ``dict``
    :return: dataframe with aligned columns.
    :rtype: ``Pandas DataFrame``
    :raises ValueError: if ``to_align`` is neither a dict nor one of the three alignment names.
    """
    if isinstance(to_align, dict):
        alignment_dict = to_align
    elif isinstance(to_align, str) and to_align.lower() in ('left', 'right', 'center'):
        # A single alignment name applies to every column.
        alignment_dict = dict.fromkeys(data_frame.columns, to_align.lower())
    else:
        raise ValueError("to_align must be either 'left', 'right', 'center', or a dict.")

    # Pad a copy so that the caller's data is never rewritten with whitespace.
    aligned = data_frame.copy()
    for col, justification in alignment_dict.items():
        aligned[col] = _pandas_series_alignment(aligned[col], justification)

    return aligned


def pandas_print_full(pd_df, full_rows=True, full_cols=True):
    """

    Print *all* of a Pandas DataFrame.

    The pandas display options are changed only for the duration of the print and
    are put back to whatever they were beforehand, even if printing raises.

    :param pd_df: DataFrame to be printed in its entirety.
    :type pd_df: ``Pandas DataFrame``
    :param full_rows: print all rows if True. Defaults to True.
    :type full_rows: ``bool``
    :param full_cols: print all columns side-by-side if True. Defaults to True.
    :type full_cols: ``bool``
    """
    # Collect the temporary overrides as a flat (option, value, option, value, ...) list.
    overrides = []
    if full_rows:
        overrides += ['display.max_rows', len(pd_df)]
    if full_cols:
        overrides += ['display.expand_frame_repr', False]

    if not overrides:
        # ``option_context`` needs at least one option/value pair.
        print(pd_df)
        return

    # Print the data frame
    with pd.option_context(*overrides):
        print(pd_df)


def pandas_pretty_print(data_frame, col_align='right', header_align='center', full_rows=True, full_cols=True):
    """

    Pretty Print a Pandas DataFrame.

    Printing leaves both the passed DataFrame and the pandas header-justification
    option as they were before the call.

    :param data_frame: a dataframe.
    :type data_frame: ``Pandas DataFrame``
    :param col_align: 'left', 'right', 'center' or a dictionary of the form: ``{'Column': 'Alignment'}``.
    :type col_align: ``str`` or ``dict``
    :param header_align: alignment of headers. Must be one of: 'left', 'right', 'center'.
    :type header_align: ``str``
    :param full_rows: print all rows.
    :type full_rows: ``bool``
    :param full_cols: print all columns.
    :type full_cols: ``bool``
    """
    # Align a copy: the whitespace padding is for display only and must not end up
    # in the caller's data.
    aligned_df = align_pandas(data_frame.copy(), col_align)

    # Header justification applies only while printing; the previous setting is
    # restored afterwards, even if printing raises.
    with pd.option_context('display.colheader_justify', header_align):
        pandas_print_full(aligned_df.fillna(""), full_rows, full_cols)