If you want to test how machine learning algorithms perform with missing values, you may need a script that distributes a fixed percentage of missing values in a feature. This script randomly distributes missing values in a single column of a data set.
Enjoy…
# Randomly replace a fixed fraction of the values in one column of a CSV
# table with a missing-value marker and write the result to a new CSV file.

# __future__ imports must precede every other statement in the module.
from __future__ import division, print_function

import csv
import random

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

INPUT_FILE_NAME = 'bank-full-preprocessed-raw.csv'
OUTPUT_FILE_NAME = 'bank-full-preprocessed-raw-selected-attribute-duration-small-fraction.csv'

# Fixed seed so repeated runs of the script blank out the same rows.
SEED = 2009

# Marker written into the cells that are turned into missing values.
missing_value = "?"

# Table modified by distribute_selected() when no frame is passed explicitly;
# populated by main().
dframe = None


def distribute_selected(attribute, percentage, frame=None):
    """Overwrite a random share of one column's values with ``missing_value``.

    Example: ``distribute_selected("job", 0.03)`` replaces 3% of the values
    in the ``job`` column with the missing-value marker.

    The rows are drawn without replacement and by position, so exactly
    ``int(percentage * len(frame))`` distinct rows are changed, every row
    (including the first) can be chosen, and the frame's index labels do
    not matter. The frame is modified in place and the column is converted
    to ``object`` dtype so it can hold the marker string.

    Args:
        attribute: Name of the column to modify.
        percentage: Fraction of rows to change, between 0 and 1 inclusive.
        frame: DataFrame to modify. Defaults to the module-level ``dframe``.

    Raises:
        ValueError: If ``percentage`` is outside [0, 1], or if no frame was
            passed and the module-level ``dframe`` has not been loaded.
        KeyError: If ``attribute`` is not a column of the frame.
    """
    if frame is None:
        frame = dframe
    if frame is None:
        raise ValueError("no data loaded: pass frame= or load dframe first")
    if not 0 <= percentage <= 1:
        raise ValueError(
            "percentage must be between 0 and 1, got %r" % (percentage,))

    # Raises KeyError for an unknown column instead of silently creating one.
    column_position = frame.columns.get_loc(attribute)

    total_rows = len(frame)
    rows_to_change = int(percentage * total_rows)

    print("attribute name:", attribute)
    print("number of rows with missing values:", rows_to_change,
          " of total:", total_rows)

    if rows_to_change == 0:
        return

    # Sampling without replacement guarantees the requested number of
    # distinct rows; repeated draws of the same row would lower the
    # effective fraction.
    chosen_rows = random.sample(range(total_rows), rows_to_change)

    # A numeric column cannot store the marker string; convert explicitly
    # rather than relying on pandas' deprecated silent upcasting.
    if frame[attribute].dtype != object:
        frame[attribute] = frame[attribute].astype(object)

    # One positional assignment instead of scanning the index once per row.
    frame.iloc[chosen_rows, column_position] = missing_value


def main():
    """Load the input CSV, blank out 10% of "duration", and save the result."""
    global dframe
    random.seed(SEED)
    dframe = pd.read_csv(INPUT_FILE_NAME, sep=',')
    distribute_selected("duration", 0.10)
    dframe.to_csv(OUTPUT_FILE_NAME, sep=',', encoding='utf-8', index=False,
                  quoting=csv.QUOTE_NONNUMERIC)


if __name__ == "__main__":
    main()