# Source code for datafuzz.noise

# -*- coding: utf-8 -*-
"""
NoiseMaker adds noise strategies to datafuzz.

This will apply specified noise transformations to a series
of columns.
"""
import logging
import random
from datafuzz.settings import HAS_NUMPY
from datafuzz.strategy import Strategy
from datafuzz.utils.noise_helpers import messy_spaces, generate_random_int, \
    generate_random_float

if HAS_NUMPY:
    import numpy as np


class NoiseMaker(Strategy):
    """ NoiseMaker applies noisy data transformations
        to given dataset.

        see also `strategy.Strategy`

    """

    def __init__(self, dataset, **kwargs):
        """ See `strategy.Strategy`

            Additional kwargs:
                noise      (list of str): list of noise options to apply

                columns    (list of str): list of indexes or column names
                                          If no columns are given, a random
                                          set will be chosen.
                limits  (list of limits): range limits (list of ints)

            Available noise options:
                'add_nulls': add null values
                'string_permutation': apply string transformations
                'random': generate some random values based on col type
                'range': change values into given or column range
                'type_transform': apply type transformations

            Raises:
                Exception: if `noise` is missing, is not a list or is empty
        """
        super().__init__(dataset, **kwargs)
        self.columns = kwargs.get('columns')
        self.limits = kwargs.get('limits')

        # Validate explicitly rather than with `assert`: assertions are
        # removed when Python runs with -O, which would let a missing
        # `noise` option slip through unnoticed.
        noise = kwargs.get('noise')
        if not isinstance(noise, list) or not noise:
            raise Exception('You must specify what types of noise to apply.')
        self.noise = noise

        if not self.columns:
            self.columns = self.dataset.sample(self.percentage, columns=True)
        self.columns = self.get_numeric_columns(self.columns)

    def run_strategy(self):
        """ Run noise strategy on sample

            Performs transformations on self.dataset. Each option found
            in `self.noise` triggers its transformation; the order below
            is fixed and does not depend on the order within `self.noise`.
        """
        steps = (
            ('add_nulls', self.nullify),
            ('string_permutation', self.string_permutation),
            ('random', self.randomize),
            ('range', self.use_range),
            ('type_transform', self.type_transform),
        )
        for option, step in steps:
            if option in self.noise:
                step()

    @staticmethod
    def _random_generator_for(col_type):
        """ Return the random value helper for a numeric column type.

            Arguments:
                col_type (obj): column type; matched on its string form

            Returns:
                `generate_random_float` or `generate_random_int`, or None
                when the type name contains neither 'float' nor 'int'.
        """
        type_name = str(col_type)
        if 'float' in type_name:
            return generate_random_float
        if 'int' in type_name:
            return generate_random_int
        return None

    def _pick_row_positions(self, total):
        """ Pick `self.num_rows` row positions in `range(total)`.

            Positions are drawn with replacement, so duplicates are
            possible. Falls back to the standard library when numpy
            is not available.
        """
        if HAS_NUMPY:
            return np.random.choice(total, self.num_rows)
        return [random.randrange(total) for _ in range(self.num_rows)]

    def set_value(self, value, column=None):
        """ Set value for a series of columns or one column.

            Arguments:
                value         (obj): value to set

            Kwargs:
                column (str or int): name or index of column
                                     If omitted, the value is set on
                                     every column in `self.columns`.

            TODO:
                - should this be available on Strategy class?
        """
        if column is None:
            for col in self.columns:
                self.set_value(value, column=col)
            # All columns were handled by the calls above; stop here so
            # the data is never indexed with `column=None`.
            return

        records = self.dataset.records
        data_type = self.dataset.data_type
        if data_type == 'pandas':
            positions = self._pick_row_positions(records.shape[0])
            records.loc[positions, column] = value
        elif data_type == 'numpy':
            positions = self._pick_row_positions(records.shape[0])
            records[positions, column] = value
        else:
            # Plain sequence of rows: rebuild the selected rows with the
            # value swapped in at the column position.
            chosen = {int(pos)
                      for pos in self._pick_row_positions(len(records))}
            self.dataset.records = [
                row if idx not in chosen else
                [value if pos == column else cell
                 for pos, cell in enumerate(row)]
                for idx, row in enumerate(records)]

    def nullify(self):
        """ Set null values for sample in columns """
        # float('nan') is the same NaN value numpy exposes as `np.nan`,
        # usable when numpy is not installed.
        null_value = np.nan if HAS_NUMPY else float('nan')
        for column in self.columns:
            self.set_value(null_value, column=column)

    def randomize(self):
        """ Set random values for sample in columns

            NOTE: this will vary based on column type. Float and int
            columns get generated values based on the column min and
            max; object / str columns get a string permutation; other
            types are left untouched.
        """
        for column in self.columns:
            col_type = self.dataset.column_dtype(column)
            func = self._random_generator_for(col_type)

            if func:
                # Only aggregate numeric columns; min / max are not
                # needed (and may fail) for other column types.
                min_val = self.dataset.column_agg(column, min)
                max_val = self.dataset.column_agg(column, max)
                # NOTE(review): `use_range` calls these helpers as
                # func(x, low=..., high=...) while the bounds are passed
                # positionally here -- confirm against noise_helpers.
                self.apply_func_to_column(
                    lambda x: func(min_val, max_val), column)
            elif col_type in [object, str]:
                self.string_permutation(column=column)

    def string_permutation(self, column=None):
        """ Permute string values for sample in columns

            Kwargs:
                column (str or int): name or index of column
                                     If omitted, every column in
                                     `self.columns` is permuted.

            TODO:
                - add permutations for missing characters
                - flipped strings
                - typos
                - homonyms / autocorrect
        """
        if column is not None:
            self.apply_func_to_column(messy_spaces, column)
            return
        for col in self.columns:
            self.string_permutation(column=col)

    def use_range(self):
        """ Use values from a range to set values in columns

            If `limits` not passed during initialization, this
            method will attempt to determine good limits based
            on the column ranges and use those.

            NOTE: range is only available for numeric columns

            Raises:
                NotImplementedError: if a column has object or str type

            TODO:
                - should we calculate IQR and insert outliers?
                - if not, should add_outliers be a new option for noise?
        """
        for column in self.columns:
            col_type = self.dataset.column_dtype(column)
            func = self._random_generator_for(col_type)

            if func:
                # Limits are resolved only once the column is known to
                # be numeric, so string columns fail with the explicit
                # error below rather than inside the aggregation.
                if self.limits is None:
                    min_val = self.dataset.column_agg(column, min)
                    max_val = self.dataset.column_agg(column, max)
                else:
                    min_val = self.limits[0]
                    max_val = self.limits[1]
                self.apply_func_to_column(lambda x: func(x,
                                                         low=min_val,
                                                         high=max_val),
                                          column)
            elif col_type in [object, str]:
                raise NotImplementedError(
                    'You must use a numeric column when using `range`')

    def type_transform(self):
        """ Transform types for sample in columns.

            Int columns are cast to str or float, float columns to str
            or int, and object / str columns to float or int; the target
            is chosen at random per value. Columns of any other type
            are skipped. Failed conversions are logged, not raised.

            NOTE: if a string column is used and the values cannot
            be transformed into integer or float values, you
            may not see a useful transformation.

            TODO:
                - for strings, should numeric values be inserted as strings
                instead?

        """
        for column in self.columns:
            col_type = self.dataset.column_dtype(column)
            # Reset for every column so an unsupported type neither
            # raises UnboundLocalError nor reuses the previous column's
            # transformation.
            func = None
            if 'int' in str(col_type):
                func = lambda x: random.choice([str, float])(x)
            elif 'float' in str(col_type):
                func = lambda x: random.choice([str, int])(x)
            elif col_type in [object, str]:
                func = lambda x: random.choice([float, int])(x)
            if func:
                try:
                    self.apply_func_to_column(func, column)
                except (ValueError, TypeError):
                    logging.exception(
                        'Could not change type for column: %s', column)