import numpy as np
import pandas as pd
def _check_all_elements_have_the_same_property(array, func):
    '''
    Check whether every element of ``array`` yields the same ``func(element)`` value.

    The value computed for ``array[0]`` is used as the reference and every
    element obtained by iterating ``array`` is compared against it.

    :param array: input values; must support ``len()``, ``array[0]`` and iteration
    :param func: any callable object taking a single element
    :return: tuple ``(all_same, value)``. ``all_same`` tells whether all elements
             have the same ``func(element)`` value and ``value`` is the reference
             value ``func(array[0])``. ``(True, None)`` is returned when ``array``
             is empty or when the reference value cannot be computed.
    '''
    if len(array) == 0:
        return True, None

    try:
        reference_value = func(array[0])
    except Exception:
        # Best effort: ``array[0]`` may be unavailable (e.g. a mapping or a
        # label-indexed container without key 0) or ``func`` may fail on it.
        # Only ordinary errors are swallowed, so KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        return True, None

    do_all_have_property = all(func(x) == reference_value for x in array)
    return do_all_have_property, reference_value
def _is_class_a_primitive(cls):
    '''
    Check if class is a number, boolean or string, including numpy scalar types.

    :param cls: any class
    :return: True if class is a primitive class, else False
    '''
    primitives = (
        # Built-in Python scalars.
        bool, int, float, str,
        # numpy boolean: this is the element type of a boolean pandas Series.
        np.bool_,
        np.int8, np.int16, np.int32, np.int64,
        np.uint8, np.uint16, np.uint32, np.uint64,
        np.float16, np.float32, np.float64,
        # np.longdouble is used instead of np.float128 because the latter name
        # is not defined on every platform; where float128 exists it is the
        # same class as longdouble.
        np.longdouble,
    )
    return cls in primitives
class XSeries(pd.Series):
    '''
    XSeries is a homogeneous abstract 1d container that encapsulates any data type inside.
    It is an extension of pandas.Series class.
    XSeries has a property data_type that is the type of the objects that are inside XSeries.
    '''
    # Names listed here are handled by pandas as subclass metadata.
    _metadata = ['data_type']

    @property
    def _constructor(self):
        # Makes pandas operations that build a new series return XSeries.
        return XSeries

    @property
    def _constructor_expanddim(self):
        # Makes pandas operations that expand to 2d return XDataFrame.
        return XDataFrame

    def __init__(self, *args, **kwargs):
        '''
        The same arguments as for pandas.Series
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

        In order to create XSeries of any data_type, data argument must be a python list.
        For example, to create XSeries of pandas.Series, data should be
        data = [s_1, s_2, ..., s_n] where s_i is an instance of pandas.Series.

        :raises ValueError: if the elements of data are not all of the same type
        '''
        super(XSeries, self).__init__(*args, **kwargs)

        # data may be given by keyword or as the first positional argument;
        # it may also be absent altogether (e.g. XSeries(index=...)).
        data = kwargs.get('data')
        if data is None and args:
            data = args[0]

        data_type = None
        if data is not None:
            check_result, data_type = _check_all_elements_have_the_same_property(data, type)
            if not check_result:
                raise ValueError('Not all elements the same type')

        if data_type is None:
            # The helper could not determine a type (empty input, or data[0]
            # is not available). Try the raw values of a pandas-like input
            # first, then the values actually stored in self.
            raw_values = getattr(data, '_values', None)
            if raw_values is not None and len(raw_values) > 0:
                data_type = type(raw_values[0])
            elif len(self) > 0:
                data_type = type(self.iloc[0])

        self._data_type = data_type

    def apply(self, *args, **kwargs):
        '''
        Overwrite standard pandas.Series method.
        Apply transform function to all non-NaN elements in self.
        *If transform function returns dict like objects,
        XSeries is transformed to XDataFrame, see XDataFrame constructor*

        :param func: function to apply (first positional argument or func=)
        :param prefix: prefix for columns if needs to return XDataFrame object;
                       it is appended to the name of self
        :return: XDataFrame if func returns dicts, the row-wise concatenation of
                 the results if func returns pandas.DataFrame objects,
                 otherwise XSeries of the mapped values
        :raises TypeError: if no function is given
        '''
        func = kwargs.get('func')
        if func is None:
            if not args:
                raise TypeError('apply() requires a function, given positionally or as func=')
            func = args[0]

        # TODO
        # Possibly change to handle NaN
        # NaN entries are dropped, so the result can be shorter than self.
        mapped_series = self.dropna()
        mapped_series = mapped_series.map(func, na_action='ignore')
        mapped_data_type = mapped_series.data_type

        custom_prefix = kwargs.get('prefix')
        if custom_prefix is None:
            custom_prefix = self.name
        else:
            custom_prefix = '{}_{}'.format(self.name, custom_prefix)

        if mapped_series.__is_data_type_dict_like():
            # Only the raw values are passed on, so the index of the mapped
            # series is not carried over to the resulting frame.
            custom_df = XDataFrame.from_records(mapped_series.values)
            if custom_prefix is not None:
                custom_df.columns = custom_df.columns.map(lambda x: '{}_{}'.format(custom_prefix, x))
            return custom_df
        elif mapped_data_type == pd.DataFrame:
            return pd.concat(mapped_series.values, ignore_index=True)
        else:
            mapped_series.name = custom_prefix
            return mapped_series

    def __is_data_type_dict_like(self):
        '''
        Check if data encapsulated by self is instance of dict.
        An empty series is never dict like.
        '''
        return len(self) > 0 and isinstance(self.iloc[0], dict)

    @property
    def data_type(self):
        '''
        Getter for a data_type property
        data_type is a data type that self encapsulates
        For example, if self contains images, then data_type would be Image

        The type is re-read from the first element on every access. For an empty
        series the last stored value is returned (None if nothing was stored).
        '''
        if len(self) == 0:
            # There is no first element to inspect. The instance dict is read
            # directly so the lookup never goes through pandas' label-based
            # attribute access.
            return self.__dict__.get('_data_type')
        first_element_data_type = type(self.iloc[0])
        self._data_type = first_element_data_type
        return self._data_type

    @data_type.setter
    def data_type(self, data_type):
        '''
        Setter for a data_type property
        data_type is a data type that self encapsulates
        For example, if self contains images, then data_type would be Image

        Note that for a non-empty series the getter overwrites the stored value
        with the type of the first element.
        '''
        self._data_type = data_type

    def to_pandas_series(self):
        '''
        Convert self to pandas.Series if data_type is a primitive type,
        i.e. a number or a string.
        The conversion is done in place: the class of self is switched to
        pandas.Series and self is returned.

        :return: pandas.Series
        :raises ValueError: if data_type is not a primitive type
        '''
        if not _is_class_a_primitive(self.data_type):
            raise ValueError('Unable to cast to pd.Series. {} is not a primitive type.'.format(self.data_type))
        self.__class__ = pd.Series
        return self

    def __str__(self):
        # Standard pandas rendering followed by a line with the data_type.
        s = super(XSeries, self).__str__()
        return '{}\ndata_type: {}'.format(s, self.data_type)

    def __getitem__(self, key):
        return super(XSeries, self).__getitem__(key)

    def __setitem__(self, key, value):
        '''
        Assign value under key, refusing values whose type differs from data_type.

        :raises ValueError: if type(value) is not the data_type of self
        '''
        expected_type = self.data_type
        value_type = type(value)
        # An empty series without a known type accepts any first value.
        if expected_type is not None and value_type != expected_type:
            raise ValueError('Can not assign key {} with {} wrong data_type {} correct is {}'.format(
                key, value, value_type, expected_type
            ))
        return super(XSeries, self).__setitem__(key, value)
class XDataFrame(pd.DataFrame):
    '''
    XDataFrame is 2d container that stores XSeries objects
    XDataFrame is an extension of pandas.DataFrame object
    '''

    @property
    def _constructor(self):
        # Makes pandas operations that build a new frame return XDataFrame.
        return XDataFrame

    @property
    def _constructor_sliced(self):
        # Makes column selection return XSeries.
        return XSeries

    def __init__(self, *args, **kwargs):
        '''
        The same arguments as for pandas.DataFrame
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

        data argument should be a list of XSeries objects or dict of XSeries objects.
        If dict is passed, key must be a string and it indicates the appropriate column name.
        For example, to create XDataFrame data should look like
        data = {'col_1': s_1, 'col_2': s_2, ..., 'col_n': s_n} where s_i is a XSeries

        :raises ValueError: if data is a list or dict containing anything but XSeries
        '''
        # data may be given by keyword or as the first positional argument;
        # it may also be absent altogether (e.g. XDataFrame(columns=...)).
        data = kwargs.get('data')
        if data is None and args:
            data = args[0]

        # Only list and dict inputs are validated; any other kind of data is
        # handed to pandas unchanged.
        data_to_check = []
        if isinstance(data, list):
            data_to_check = data
        elif isinstance(data, dict):
            data_to_check = data.values()

        if any(not isinstance(d, XSeries) for d in data_to_check):
            raise ValueError('All data must be XSeries instances')

        super(XDataFrame, self).__init__(*args, **kwargs)

    def get_columns_of_type(self, column_type):
        '''
        Get all columns from XDataFrame with given column_type

        :param column_type: a single type or a list/tuple/set of types
        :return: tuple. the first element is the sub frame with the selected columns
                 and the second is the list of names of columns of a given column_type
        '''
        if not isinstance(column_type, (list, tuple, set, frozenset)):
            column_type = [column_type]

        columns_to_select = [
            col_name
            for col_name in self
            if self[col_name].data_type in column_type
        ]
        return self[columns_to_select], columns_to_select

    def get_data_types(self):
        '''
        Get a list of data_types of each XSeries inside XDataFrame

        :return: list of data_type, in column order
        '''
        return [self[column].data_type for column in self]

    def to_pandas_dataframe(self):
        '''
        Convert self to pandas.DataFrame if all columns are primitive types.
        See more at XSeries.to_pandas_series
        The conversion is done in place: the class of self is switched to
        pandas.DataFrame and self is returned.

        :return: pandas.DataFrame
        :raises ValueError: if at least one column is not of a primitive type
        '''
        data_types = self.get_data_types()
        if not all(_is_class_a_primitive(dt) for dt in data_types):
            # Report the computed list; the frame has no data_types attribute.
            raise ValueError('Unable to cast to pd.DataFrame. {} is not all primitives.'.format(data_types))
        self.__class__ = pd.DataFrame
        return self

    @classmethod
    def concat_dataframes(cls, data_frames):
        '''
        Concatenate XDataFrame using pandas.concat method
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html
        over columns

        :param data_frames: list of XDataFrame instances
        :return: result of pandas.concat(data_frames, axis=1); presumably an
                 XDataFrame when the inputs are XDataFrame (depends on pandas
                 preserving the subclass)
        '''
        return pd.concat(data_frames, axis=1)