Continuing from the previous post, this script distributes missing values across all features in a Pandas DataFrame. Just set the minimum and maximum percentage of missing values to distribute and you're ready to execute …
Enjoy…
'''
### MISSING VALUES SCRIPT (RANDOM ACROSS ATTRIBUTES) ###

Script to randomly distribute a random percentage of missing values.

Every attribute except the class attribute receives missing values (random
selection of the *number* of attributes is currently disabled). For each
attribute a percentage between min_percentage and max_percentage (both
inclusive) is drawn, and that share of the rows - chosen at random, without
repetition - has its value replaced by missing_value.
'''
import csv
import random

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# NOTE: the former `from __future__ import division` was placed after the
# other imports, which is a SyntaxError (future statements must come first).
# Row counts are now computed with integer arithmetic and the displayed
# fraction divides by 100.0, so the script no longer depends on it and runs
# unchanged on Python 2 and Python 3.

INPUT_FILE_NAME = 'bank-full-preprocessed-raw.csv'
OUTPUT_FILE_NAME = 'bank-full-preprocessed-raw-random-missing-values-across-attributes-big-fraction.csv'

random.seed(6787)  # seed

min_percentage = 30  # min percentage to distribute
max_percentage = 60  # max percentage to distribute
missing_value = "?"
class_attribute = 'y'  # class column; never receives missing values

# Working DataFrame; loaded from INPUT_FILE_NAME by main().
dframe = None


def distribute_randomly(frame=None, min_pct=None, max_pct=None):
    '''
    Replace a random share of the values of every attribute by missing_value.

    For each column except class_attribute an integer percentage is drawn
    from [min_pct, max_pct]; floor(rows * percentage / 100) distinct rows are
    then sampled and their value in that column is overwritten. The frame is
    modified in place.

    Parameters:
        frame   -- DataFrame to modify; defaults to the module-level dframe.
        min_pct -- lower bound of the percentage; defaults to min_percentage.
        max_pct -- upper bound of the percentage; defaults to max_percentage.

    Returns the modified DataFrame.

    Raises ValueError when no frame is passed and dframe has not been loaded.
    '''
    if frame is None:
        frame = dframe
    if frame is None:
        raise ValueError("no DataFrame given and module-level dframe is not loaded")
    if min_pct is None:
        min_pct = min_percentage
    if max_pct is None:
        max_pct = max_percentage

    total_rows = len(frame)
    print("number_of_attributes: %s" % len(frame.columns))

    for attribute_name in frame.columns:
        print("-----")
        print("attribute name: %s" % (attribute_name,))

        # skip the class
        if attribute_name == class_attribute:
            continue

        # missing values between min_pct and max_pct
        percentage = random.randint(min_pct, max_pct)
        print("percentage with missing values: %s" % round(percentage / 100.0, 2))

        # number of rows to change; integer arithmetic avoids float
        # truncation (e.g. 0.57 * 100 == 56.999...), clamped to a valid count
        rows_to_change = max(0, min(total_rows * percentage // 100, total_rows))
        print("number of rows with missing values: %s  of total: %s"
              % (rows_to_change, total_rows))
        if rows_to_change == 0:
            continue

        # Sample distinct row *positions* so that exactly rows_to_change rows
        # are hit, the first row is eligible, and any index labels work.
        positions = random.sample(range(total_rows), rows_to_change)

        # Work on an object-dtype copy so the string marker can be stored in
        # numeric columns, then write the whole column back in one step.
        column = frame[attribute_name].astype(object)
        column.iloc[positions] = missing_value
        frame[attribute_name] = column

    return frame


def main():
    '''Load the input CSV, distribute missing values and write the output CSV.'''
    global dframe
    dframe = pd.read_csv(INPUT_FILE_NAME, sep=',')
    distribute_randomly()
    dframe.to_csv(OUTPUT_FILE_NAME, sep=',', encoding='utf-8', index=False,
                  quoting=csv.QUOTE_NONNUMERIC)


if __name__ == '__main__':
    main()