Source code for xpandas.transformers.bag_of_features_transformer.bag_of_features_transformer

from collections import Counter

import numpy as np
import pandas as pd

from ..transformer import XSeriesTransformer


class BagOfWordsTransformer(XSeriesTransformer):
    '''
    Performs bag-of-features transformation for strings or any
    categorical data.

    Each sample (an iterable of hashable tokens) is turned into a
    ``Counter`` of token occurrences. Every token of the dictionary
    is present in the result, with a count of 0 when the sample does
    not contain it.
    '''

    def __init__(self, dictionary=None, **kwargs):
        '''
        :param dictionary: custom dictionary (iterable of tokens) to count
            against. If None, the dictionary is calculated from the
            dataset passed to ``fit``.
        :param kwargs: accepted for call compatibility; not used here.
        '''
        self.dictionary = dictionary

        # np.ndarray is the array *type*; np.array is only a factory
        # function and can never match the type of a value.
        accepted_types = [
            pd.Series, list, np.ndarray, tuple
        ]

        def bag_of_words_transform_function(corpus):
            # self.dictionary is read at call time, not at construction
            # time, so a dictionary computed later by fit() is honoured.
            # If it is still None here, iterating it raises TypeError.
            counter = Counter(corpus)
            for token in self.dictionary:
                # Make absent dictionary tokens explicit with a zero count.
                counter.setdefault(token, 0)
            return counter

        super(BagOfWordsTransformer, self).__init__(
            data_types=accepted_types,
            columns=None,
            transform_function=bag_of_words_transform_function
        )

    def __calculate_dictionary(self, X):
        '''
        Collect the set of all distinct tokens over every sample of X.

        :param X: iterable of samples, each sample an iterable of
            hashable tokens
        :return: set with the union of the tokens of all samples
        '''
        dictionary = set()
        for sample in X:
            # In-place update avoids rebuilding the whole set per sample.
            dictionary.update(sample)
        return dictionary

    def fit(self, X=None, y=None, **kwargs):
        '''
        Fit the transformer. A dictionary supplied by the user is kept
        untouched; otherwise the dictionary is calculated from X.

        :param X: dataset to fit on; iterated sample by sample when the
            dictionary has to be calculated
        :param y: forwarded to the parent ``fit``
        :param kwargs: forwarded to the parent ``fit``
        :return: self
        '''
        super(BagOfWordsTransformer, self).fit(X, y, **kwargs)

        # Only derive the dictionary when none is set yet.
        if self.dictionary is None:
            self.dictionary = self.__calculate_dictionary(X)

        return self