Source code for easymoney.easy_pandas

# coding: utf-8

"""

    Supporting Pandas Tools
    ~~~~~~~~~~~~~~~~~~~~~~~

"""

# Modules #
import copy
import warnings
import datetime
import numpy as np
import pandas as pd


[docs]def pstr(s): """ Convert to any obect to a string using pandas. :param s: item to be converted to a string. :type s: ``any`` :return: a string :rtype: ``str`` """ return pd.Series([s]).astype('unicode')[0]
[docs]def strlist_to_list(to_parse, convert_to_str_first=False): """ Work around for using ``eval()`` for the following conversion: ``str(list)`` → ``list``. For example: ``"[1992, '221-21', 2102, 'apples']"`` → ``['1992', '221-21', '2102', 'apples']``. :param to_parse: a string of a list. :type to_parse: ``str`` :param convert_to_str_first: convert to a string first (as a precaution). Defaults to False. :type convert_to_str_first: ``bool`` :return: string of a list to an actual list. :return: ``list`` """ str_list = pstr(to_parse) if convert_to_str_first else to_parse return [i.strip().replace("'", "") for i in [j.split(",") for j in [str_list.replace("[", "").replace("]", "")]][0]]
def _pandas_dictkey_to_key_unpack(pandas_series, unpack_dict, convert_values_to_str = False): """ Used to unpack ISO Alpha2 --> Alpha2 ISO code Pandas Series. :param pandas_series: a Series to replace the alpha2 codes with another set. :type pandas_series: ``Pandas Series`` :param convert_values_to_str: convert the values to string (precaution). :type convert_values_to_str: ``bool`` :param unpack_dict: a dict with the values coerced into strings. :type unpack_dict: ``dict`` :return: a pandas series with the alpha2 values replaced with unpack_dict.values(). ``NaN`` is used if a match cannot be found. :rtype: ``Pandas Series`` """ if convert_values_to_str: unpack_dict = {k: pstr(v) for k, v in unpack_dict.items()} return pandas_series.replace(unpack_dict).map(lambda x: np.NaN if 'nan' in pstr(x) else strlist_to_list(pstr(x))) def _standard_pd_nester(data_frame, nest_col_a, nest_col_b, nest_col_c, keys_to_str = True): """ Method to produce a nested dict from a large pandas dataframe. Reliable technique (although slow with large DataFrames). :param data_frame: see ``twoD_nested_dict()``. :type data_frame: ``Pandas DataFrame`` :param nest_col_a: see ``twoD_nested_dict()``. :type nest_col_a: ``str`` :param nest_col_b: see ``twoD_nested_dict()``. :type nest_col_b: ``str`` :param nest_col_c: see ``twoD_nested_dict()``. :type nest_col_c: ``str`` :param keys_to_str: see ``twoD_nested_dict()``. :type keys_to_str: ``bool`` :return: nested dict of the form: ``{nest_col_a: {nest_col_b: nest_col_c}}``. :rtype: dict """ # Initialize df_slice = None # Make columns that are to become keys strings. if keys_to_str: data_frame[nest_col_a] = data_frame[nest_col_a].astype(str).str.upper() data_frame[nest_col_b] = data_frame[nest_col_b].astype(str) # Create a dict from keys() nested_dict = dict.fromkeys(data_frame[nest_col_a].unique()) for k in nested_dict.keys(): df_slice = data_frame[data_frame[nest_col_a] == k] nested_dict[k] = dict(zip(df_slice[nest_col_b], df_slice[nest_col_c])) return nested_dict def _fast_pd_nester(data_frame, nest_col_a, nest_col_b, nest_col_c, keys_to_str = True): """ | This is a fast way to produce a nested dict from a large Pandas DataFrames. | Can handle DataFrames with several nest_col_a entries that are the same, | e.g., | nest_col_a nest_col_b nest_col_c | 0 1999-01-04 AUD 0 | 1 1999-01-04 CAD 1 | 2 1999-01-05 CHF 2 | 3 1999-01-05 CYP 3 | | WARNING: Not well-tested. This procedure may produce inaccuracies. :param data_frame: see ``twoD_nested_dict()``. :type data_frame: Pandas DataFrame :param nest_col_a: see ``twoD_nested_dict()``. :type nest_col_a: str :param nest_col_b: see ``twoD_nested_dict()``. :type nest_col_b: str :param nest_col_c: see ``twoD_nested_dict()``. :type nest_col_c: str :param keys_to_str: see ``twoD_nested_dict()``. :type keys_to_str: ``bool`` :return: nested dict of the form {nest_col_a: {nest_col_b: nest_col_c}}. :rtype: dict """ # Group nest_col_b and nest_col_c by nest_col_a col_b_groupby = data_frame.groupby(nest_col_a)[nest_col_b].apply(lambda x: x.tolist()).reset_index() col_c_groupby = data_frame.groupby(nest_col_a)[nest_col_c].apply(lambda x: x.tolist()).reset_index() # Merge on nest_col_a grouped_data_frame = pd.merge(col_b_groupby, col_c_groupby, on = nest_col_a, how = 'outer') # Convert columns that will become keys to a string if keys_to_str: grouped_data_frame[nest_col_a] = grouped_data_frame[nest_col_a].astype(str) grouped_data_frame[nest_col_b] = grouped_data_frame[nest_col_b].map(lambda x: [pstr(i) for i in x]) # Generate a dictionary by zipping the nest_col_b and nest_col_c columns grouped_data_frame['dict_zipped'] = grouped_data_frame[[nest_col_b, nest_col_c]].apply( lambda x: dict(zip(x[0], x[1])), axis = 1) # Return as a nested dict return dict(zip(grouped_data_frame[nest_col_a], grouped_data_frame['dict_zipped']))
[docs]def twoD_nested_dict(data_frame , nest_col_a = None , nest_col_b = None , nest_col_c = None , to_float = None , to_int = None , keys_to_str = True , engine = 'standard'): """ Generate a nested dictionary from the columns of a pandas dataframe. Defaults to using the first 3 columns. :param data_frame: a pandas dataframe. :type data_frame: ``Pandas DateFrame`` :param nest_col_a: reference to column in the dataframe; to become the master key in dict. Defaults to None (i.e., col 1). :type nest_col_a: ``str`` :param nest_col_b: reference to column in the dataframe; to become the sub-key. Defaults to None (i.e., col 2). :type nest_col_b: ``str`` :param nest_col_c: reference to column in the dataframe; to become the value to corresponding to the sub-key. Defaults to None (i.e., col 3). :type nest_col_c: ``str`` :param to_float: a list items to float. Defaults to None. :type to_float: ``str`` :param to_int: a list of the lists to convert to ints. Defaults to None. :type to_int: ``str`` :param keys_to_str: Convert the columns that will become keys to strings. Default to True. WARNING: will OVERRIDE *to_float* and *to_int* if they reference nest_col_a or nest_col_b. :type keys_to_str: ``bool`` :param engine: 'standard' for a slower (but well-tested) method of generating a nested dict; 'fast' to employ a speedy (but *not well-tested*) method for generating a nested dict. Default to 'standard'. :type engine: ``str`` :return: nested dict of the form: ``{nest_col_a: {nest_col_b: nest_col_c}``. :rtype: ``dict`` """ if all(v is None for v in [nest_col_a, nest_col_b, nest_col_c]): nest_col_a = data_frame.columns[0] nest_col_b = data_frame.columns[1] nest_col_c = data_frame.columns[2] # Convert selected columns to float if to_float != None: for i in to_float: data_frame[i] = data_frame[i].astype(float) # Convert selected columns to int if to_int != None: for j in to_int: data_frame[j] = data_frame[j].astype(int) # Use one of two engines to generate the nested dict if engine == 'standard': return _standard_pd_nester(data_frame, nest_col_a, nest_col_b, nest_col_c, keys_to_str) elif engine == 'fast': return _fast_pd_nester(data_frame, nest_col_a, nest_col_b, nest_col_c, keys_to_str)
[docs]def pandas_list_column_to_str(data_frame, columns, join_on = ", ", bracket_wrap = False): """ Tool for converting the columns in a Pandas DataFrame from pd.Series of lists into comma-seperated strings. :param data_frame: a dataframe. :type data_frame: ``Pandas DataFrame`` :param columns: a list of columns in the DataFrame :type columns: ``list`` :param join_on: a string to join on. Defaults to ", ". :type join_on: ``str`` :return: a dataframe with the columns altered in the manner described above. :rtype: ``Pandas DataFrame`` """ df = copy.deepcopy(data_frame) for col in columns: df[col] = df[col].map(lambda x: join_on.join(x) if pstr(x) != 'nan' else x) if bracket_wrap: df[col] = df[col].map(lambda x: "[" + pstr(x) + "]" if pstr(x) != 'nan' else x) return df
[docs]def pandas_str_column_to_list(data_frame, columns): """ Tool for converting the columns in a Pandas DataFrame from comma-seperated strings into a pd.Series of lists. :param data_frame: a dataframe. :type data_frame: ``Pandas DataFrame`` :param columns: a list of columns in the dataframe. :type columns: ``list`` :return: a dataframe with the columns altered in the manner described above. :rtype: ``Pandas DataFrame`` """ data_frame = copy.deepcopy(data_frame) for col in columns: data_frame[col] = data_frame[col].astype(str).map(lambda x: [i.strip() for i in x.split(",")]) return data_frame
[docs]def type_in_series(series): """ Return the types of objects in a Pandas Series. :param series: a series. :type series: ``Pandas Series`` :return: list of the types in a series. :rtype: ``list`` """ return list(set([type(i).__name__ if pstr(i).strip() not in ['nan', ''] else 'nan' for i in series]))
[docs]def prettify_all_pandas_list_cols(data_frame, join_on = ", ", allow_nan=True, exclude=[], bracket_wrap=False): """ Converts all columns with only lists to list-seperated-strings. :param data_frame: a dataframe. :type data_frame: ``Pandas DataFrame`` :param allow_nan: allow nans :type nan: ``bool`` :param join_on: a string to join on. Defaults to ", ". :type join_on: ``str`` :param exclude: columns to exclude. Defaults to an empty list (``[]``). :type exclude: ``list`` :param bracket_wrap: wrap in brackets. :type bracket_wrap: ``bool`` :return: dataframe with columns lists converted to strings. :rtype: ``Pandas DataFrame`` """ allowed = [['list'], ['list', 'nan']] if allow_nan else [['list']] # Find allowed columns cols_to_prettify = \ [c for c in data_frame.columns if sorted(type_in_series(data_frame[c])) in allowed and c not in exclude] if len(cols_to_prettify) == 0: return data_frame else: return pandas_list_column_to_str(data_frame, cols_to_prettify, join_on, bracket_wrap)
[docs]def items_null(element): """ Check if an object is a NaN, including all the elements in an iterable. :param element: a python object. :type element: ``any`` :return: assessment of whether or not `element` is a NaN. :rtype: ``bool`` """ if isinstance(element, (list, tuple, type(np.array))): return True if all(pd.isnull(i) for i in element) else False else: return pd.isnull(element)
[docs]def pandas_null_drop(data_frame, subset=None): """ Drop rows with NaNs of all, or a subset of, the dataframe's columns. (Can handle iterables which only contain NaNs). :param data_frame: a dataframe. :type data_frame: ``Pandas DataFrame`` :param subset: a subset of columns. Defaults to None, which will apply to all columns. :type subset: ``iterable`` :return: dataframe with NaN dropped. :rtype: ``Pandas DataFrame`` """ # Fill Empty cells with nans data_frame = data_frame.fillna(np.NaN) if subset is None: data_frame = data_frame.dropna() else: for c in subset: data_frame = data_frame[~data_frame[c].map(items_null)] return data_frame.reset_index(drop=True)
# ---------------------------------------------------------------------------------------------------------- # Printing Suit # ---------------------------------------------------------------------------------------------------------- def _padding(s, amount, justify): """ Add padding to a string. :param s: a string. :type s: ``str`` :param amount: the amount of white space to add. :type amount: ``float`` or ``int`` :param justify: the justification of the resultant text. Must be one of: 'left' or 'center'. :type justify: ``str`` :return: `s` justified, or as passed if `justify` is not one of: 'left' or 'center'. :rtype: ``str`` """ pad = ' ' * amount if justify == 'left': return "%s%s" % (pstr(s), pad) elif justify == 'center': return "%s%s%s" % (pad[:int(amount/2)], pstr(s), pad[int(amount/2):]) else: return s def _pandas_series_alignment(pandas_series, justify): """ Align all items in a pandas series. :param pandas_series: a pandas series. :type pandas_series: ``Pandas Series`` :param justify: the justification of the resultant text. Must be one of: 'left', 'right' or 'center'. :type justify: ``str`` :return: aligned series :rtype: ``str`` """ if justify == 'right': return pandas_series longest_string = max([len(s) for s in pandas_series.astype('unicode')]) return [_padding(s, longest_string - len(s), justify) if not items_null(s) else s for s in pandas_series]
[docs]def align_pandas(data_frame, to_align='right'): """ Align the columns of a Pandas DataFrame by adding whitespace. :param data_frame: a dataframe. :type data_frame: ``Pandas DataFrame`` :param to_align: 'left', 'right', 'center' or a dictionary of the form: ``{'Column': 'Alignment'}``. :type to_align: ``str`` :return: dataframe with aligned columns. :rtype: ``Pandas DataFrame`` """ if isinstance(to_align, dict): alignment_dict = to_align elif to_align.lower() in ['left', 'right', 'center']: alignment_dict = dict.fromkeys(data_frame.columns, to_align.lower()) else: raise ValueError("to_align must be either 'left', 'right', 'center', or a dict.") for col, justification in alignment_dict.items(): data_frame[col] = _pandas_series_alignment(data_frame[col], justification) return data_frame
[docs]def pandas_print_full(pd_df, full_rows = True, full_cols = True): """ Print *all* of a Pandas DataFrame. :param pd_df: DataFrame to printed in its entirety. :type pd_df: ``Pandas DataFrame`` :param full_rows: print all rows if True. Defaults to True. :type full_rows: ``bool`` :param full_cols: print all columns side-by-side if True. Defaults to True. :type full_cols: ``bool`` """ if full_rows: pd.set_option('display.max_rows', len(pd_df)) if full_cols: pd.set_option('expand_frame_repr', False) # Print the data frame print(pd_df) if full_rows: pd.reset_option('display.max_rows') if full_cols: pd.set_option('expand_frame_repr', True)
[docs]def pandas_pretty_print(data_frame, col_align='right', header_align='center', full_rows=True, full_cols=True): """ Pretty Print a Pandas DataFrame. :param data_frame: a dataframe. :type data_frame: ``Pandas DataFrame`` :param col_align: 'left', 'right', 'center'' or a dictionary of the form: ``{'Column': 'Alignment'}``. :type col_align: ``str`` or ``dict`` :param header_align: alignment of headers. Must be one of: 'left', 'right', 'center'. :type header_align: ``str`` or ``dict`` :param full_rows: print all rows. :type full_rows: ``bool`` :param full_cols: print all columns. :type full_cols: ``bool`` """ aligned_df = align_pandas(data_frame, col_align) pd.set_option('colheader_justify', header_align) pandas_print_full(aligned_df.fillna(""), full_rows, full_cols) pd.set_option('colheader_justify', 'right')