# Source code for xpandas.data_container.data_container

import numpy as np
import pandas as pd


def _check_all_elements_have_the_same_property(array, func):
    '''
    Helper function that checks if all elements have the same func(element) value.

    The reference value is func(array[0]); every element yielded by iterating
    over array is then compared against it.

    :param array: input values; must support len(), iteration and,
                  for a meaningful check, array[0]
    :param func: any callable object taking a single element
    :return: tuple. The first element indicates whether all elements have the same
             func(element) value, the second element is the value of func(array[0]).
             (True, None) is returned when array is empty or when the reference
             value can not be computed (array[0] or func(array[0]) raised).
    '''
    if len(array) == 0:
        return True, None
    try:
        first_element_property = func(array[0])
    except Exception:
        # Best effort: containers without a positional element 0 (e.g. a dict or
        # a label-indexed Series) can not be checked here, so the decision is
        # left to the caller. Only Exception is caught so that SystemExit and
        # KeyboardInterrupt still propagate.
        return True, None
    do_all_have_property = all(func(x) == first_element_property
                               for x in array)

    return do_all_have_property, first_element_property


def _is_class_a_primitive(cls):
    '''
    Check if class is a number, bool or string class, including numpy numbers.

    :param cls: any class
    :return: True if class is a primitive class, else False
    '''
    primitives = [
        np.float16, np.float32, np.float64,
        np.int8, np.int16, np.int32, np.int64,
        bool, str, np.uint8, np.uint16, np.uint32, np.uint64,
        int, float
    ]
    # np.float128 is not defined on every platform / numpy build, so it is
    # looked up defensively instead of being referenced unconditionally.
    extended_float = getattr(np, 'float128', None)
    if extended_float is not None:
        primitives.append(extended_float)
    return cls in primitives


class XSeries(pd.Series):
    '''
    XSeries is a homogeneous abstract 1d container that encapsulates any data type inside.
    It is an extension of pandas.Series class.
    XSeries has a property data_type that is the type of the objects stored inside XSeries.
    '''
    # Names listed here are handed over by pandas to objects derived from this one.
    _metadata = ['data_type']

    @property
    def _constructor(self):
        return XSeries

    @property
    def _constructor_expanddim(self):
        return XDataFrame

    def __init__(self, *args, **kwargs):
        '''
        The same arguments as for pandas.Series
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

        In order to create XSeries of any data_type, data argument must be a python list.
        For example, to create XSeries of pandas.Series, the data passed should be
        data = [s_1, s_2, ..., s_n] where s_i is an instance of pandas.Series.

        :raises ValueError: if the elements of data do not all have the same type
        '''
        super(XSeries, self).__init__(*args, **kwargs)

        # data may come as a keyword, as the first positional argument, or not at all
        data = kwargs.get('data')
        if data is None and args:
            data = args[0]

        data_type = None
        # Only sized inputs can be checked; scalars and missing data are skipped.
        if data is not None and hasattr(data, '__len__'):
            check_result, data_type = _check_all_elements_have_the_same_property(data, type)
            if not check_result:
                raise ValueError('Not all elements the same type')

        # The helper could not determine a type (empty input, dict, label-indexed
        # Series, ...): fall back to the first stored value, if there is one.
        if data_type is None and len(self) > 0:
            data_type = type(self.iloc[0])

        # object.__setattr__ bypasses the pandas attribute machinery, which may
        # otherwise interpret the attribute name as an index label.
        object.__setattr__(self, '_data_type', data_type)

    def apply(self, *args, **kwargs):
        '''
        Overwrite standard pandas.Series method.
        Apply transform function to all non-null elements in self
        (null values are dropped before mapping).
        *If transform function returns dict like object,
        transform XSeries to XDataFrame see XDataFrame constructor*

        :param func: function to apply (keyword or first positional argument)
        :param prefix: prefix for columns if needs to return XDataFrame object
        :return: XSeries or XDataFrame depending on transformation
        :raises TypeError: if no function is passed
        '''
        func = kwargs.get('func')
        if func is None:
            if not args:
                raise TypeError("apply() missing required argument: 'func'")
            func = args[0]

        # TODO
        # Possibly change to handle NaN
        mapped_series = self.dropna()
        mapped_series = mapped_series.map(func, na_action='ignore')
        mapped_data_type = mapped_series.data_type

        custom_prefix = kwargs.get('prefix')
        if custom_prefix is None:
            custom_prefix = self.name
        else:
            custom_prefix = '{}_{}'.format(self.name, custom_prefix)

        if mapped_series.__is_data_type_dict_like():
            # One record (dict) per element: expand the keys into columns.
            custom_df = XDataFrame.from_records(mapped_series.values)

            if custom_prefix is not None:
                custom_df.columns = custom_df.columns.map(lambda x: '{}_{}'.format(custom_prefix, x))
            return custom_df
        elif mapped_data_type == pd.DataFrame:
            # Every element is a DataFrame: stack them into a single frame.
            return pd.concat(mapped_series.values, ignore_index=True)
        else:
            mapped_series.name = custom_prefix

        return mapped_series

    def __is_data_type_dict_like(self):
        '''
        Check if data encapsulated by self is instance of dict.
        Only the first element is inspected; an empty series is not dict like.
        '''
        return len(self) > 0 and isinstance(self.iloc[0], dict)

    @property
    def data_type(self):
        '''
        Getter for a data_type property
        data_type is a data type that self encapsulates
        For example, if self contains images, then data_type would be Image

        The value is the type of the first element. For an empty series the last
        known data_type is returned, or None if there is none.
        '''
        if len(self) > 0:
            object.__setattr__(self, '_data_type', type(self.iloc[0]))
        # Read straight from the instance dict so that a missing value yields
        # None instead of going through the pandas attribute lookup.
        return self.__dict__.get('_data_type')

    @data_type.setter
    def data_type(self, data_type):
        '''
        Setter for a data_type property
        data_type is a data type that self encapsulates
        For example, if self contains images, then data_type would be Image

        Note: for a non-empty series the getter recomputes the value from the
        first element, so the value set here is only kept while self is empty.
        '''
        object.__setattr__(self, '_data_type', data_type)

    def to_pandas_series(self):
        '''
        Convert self to pandas.Series if data_type is a primitive type,
        i.e. a number or a string.
        The conversion happens in place: the class of self is changed.

        :return: self as pandas.Series
        :raises ValueError: if data_type is not a primitive type
        '''
        is_primitive = _is_class_a_primitive(self.data_type)
        if is_primitive:
            self.__class__ = pd.Series
        else:
            raise ValueError('Unable to cast to pd.Series. {} is not a primitive type.'.format(self.data_type))
        return self

    def __str__(self):
        s = super(XSeries, self).__str__()
        return '{}\ndata_type: {}'.format(s, self.data_type)

    def __getitem__(self, key):
        return super(XSeries, self).__getitem__(key)

    def __setitem__(self, key, value):
        '''
        Assign value under key after checking that the type of value
        is exactly data_type. The check is skipped while data_type is unknown
        (empty series without a known data_type).

        :raises ValueError: if type(value) differs from data_type
        '''
        value_type = type(value)
        expected_type = self.data_type
        if expected_type is not None and value_type != expected_type:
            raise ValueError('Can not assign key {} with {} wrong data_type {} correct is {}'.format(
                key, value, value_type, expected_type
            ))

        return super(XSeries, self).__setitem__(key, value)


class XDataFrame(pd.DataFrame):
    '''
    XDataFrame is 2d container that stores XSeries objects
    XDataFrame is an extension of pandas.DataFrame object
    '''

    @property
    def _constructor(self):
        return XDataFrame

    @property
    def _constructor_sliced(self):
        return XSeries

    def __init__(self, *args, **kwargs):
        '''
        The same arguments as for pandas.DataFrame
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

        data argument should be a list of XSeries objects or dict of XSeries objects.
        If a dict is passed, each key must be a string and indicates the appropriate column name.
        For example, to create XDataFrame data should look like
        data = {'col_1': s_1, 'col_2': s_2, ..., 'col_n': s_n} where s_i is a XSeries

        Any other kind of data (or no data at all) is handed to pandas.DataFrame unchecked.

        :raises ValueError: if data is a list or dict holding a non XSeries element
        '''
        # data may come as a keyword, as the first positional argument, or not at all
        data = kwargs.get('data')
        if data is None and args:
            data = args[0]

        if isinstance(data, list):
            data_to_check = data
        elif isinstance(data, dict):
            data_to_check = data.values()
        else:
            data_to_check = ()

        if not all(isinstance(d, XSeries) for d in data_to_check):
            raise ValueError('All data must be XSeries instances')
        super(XDataFrame, self).__init__(*args, **kwargs)

    def get_columns_of_type(self, column_type):
        '''
        Get all columns from XDataFrame with given column_type
        :param column_type: list (or tuple) of types or a single type
        :return: tuple. the first element is the sub frame holding the matching columns
                 and the second is a list of the names of those columns
        '''
        if not isinstance(column_type, (list, tuple)):
            column_type = [column_type]

        columns_to_select = [
            col_name
            for col_name in self
            if self[col_name].data_type in column_type
        ]

        return self[columns_to_select], columns_to_select

    def get_data_types(self):
        '''
        Get a list of data_types of each XSeries inside XDataFrame
        :return: list of data_type, in column order
        '''
        return [self[column].data_type for column in self]

    def to_pandas_dataframe(self):
        '''
        Convert self to pandas.DataFrame if all columns are primitive types.
        See more at XSeries.to_pandas_series
        The conversion happens in place: the class of self is changed.

        :return: self as pandas.DataFrame
        :raises ValueError: if at least one column is not of a primitive type
        '''
        data_types = self.get_data_types()
        is_all_columns_are_primitive = all(
            _is_class_a_primitive(dt)
            for dt in data_types
        )
        if is_all_columns_are_primitive:
            self.__class__ = pd.DataFrame
        else:
            # Report the collected column types; self has no data_types attribute.
            raise ValueError('Unable to cast to pd.DataFrame. {} is not all primitives.'.format(data_types))
        return self

    @classmethod
    def concat_dataframes(cls, data_frames):
        '''
        Concatenate XDataFrame using pandas.concat method
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html
        over columns
        :param data_frames: list of XDataFrame instances
        :return: the result of pandas.concat(data_frames, axis=1);
                 expected to be an XDataFrame (TODO confirm against pandas.concat)
        '''
        return pd.concat(data_frames, axis=1)