Source code for datafuzz.duplicator

# -*- coding: utf-8 -*-
"""
Duplicator is used as a duplication strategy for datasets.

It will take a series of rows of the dataset, duplicate and append them.
You can also add random noise to the duplicated rows.
"""
from datafuzz.dataset import DataSet
from datafuzz.strategy import Strategy
from datafuzz.utils.noise_helpers import messy_spaces, generate_random_int, \
    generate_random_float, pertubate_str


class Duplicator(Strategy):
    """ Strategy that duplicates rows of a dataset.

        A sample of rows is drawn from the dataset, optionally distorted
        with noise, and appended back onto the same dataset.

        see also: `strategy.Strategy`
    """

    def __init__(self, dataset, **kwargs):
        """ see `strategy.Strategy` init for the shared arguments

            Additional Kwargs:
                add_noise (bool): add noise to duplicated rows
                    (stored as None when the keyword is not supplied)
        """
        super().__init__(dataset, **kwargs)
        self.add_noise = kwargs.get('add_noise')

    def run_strategy(self):
        """ Draw a sample of rows, add noise to it when `add_noise` is
            truthy, then append the result to the dataset.
        """
        duplicates = self.dataset.sample(self.percentage)
        if self.add_noise:
            duplicates = self.noise(duplicates)
        self.dataset.append(duplicates)

    def noise(self, sample):
        """ Adds noise to the duplicate rows

            A subset of the sample's columns is selected and each one is
            distorted according to its dtype: columns whose dtype name
            contains 'float' or 'int' get random values, object / str
            columns get messy spacing followed by string perturbation.
            Columns of any other dtype are left as they are.

            Parameters:
                sample (list or obj): rows as returned by
                    `dataset.DataSet.sample`

            Returns
                sample (list or obj): distorted rows

            TODO:
                - implement more noise options than just random
        """
        # Work on a copy wrapped in its own DataSet.
        noisy = DataSet(sample.copy())
        chosen_columns = noisy.sample(self.percentage, columns=True)
        if noisy.data_type == 'pandas':
            noisy.records = noisy.records.reset_index(drop=True)

        for name in chosen_columns:
            idx = noisy.column_idx(name)
            dtype = noisy.column_dtype(idx)
            dtype_name = str(dtype)

            # Pick the random generator from the dtype's name; the float
            # check comes first, exactly as in the original chain.
            if 'float' in dtype_name:
                generator = generate_random_float
            elif 'int' in dtype_name:
                generator = generate_random_int
            else:
                generator = None

            if generator:
                # Bounds are aggregated over the full dataset, not the sample.
                low = self.dataset.column_agg(idx, min)
                high = self.dataset.column_agg(idx, max)
                if low == high:
                    # Widen a zero-width range by one.
                    high += 1
                bounds = {'low': low, 'high': high}

                def randomize(value):
                    return generator(value, **bounds)

                # The return value is not needed: the noisy rows are read
                # back from `noisy.records` below (presumably the helper
                # updates the given dataset in place -- confirm in Strategy).
                self.apply_func_to_column(randomize, idx, dataset=noisy)
            elif dtype in [object, str]:
                # Spacing noise first, then string perturbation.
                for distort in (messy_spaces, pertubate_str):
                    self.apply_func_to_column(distort, idx, dataset=noisy)

        return noisy.records