Distribute Missing Values randomly across Columns in Python Pandas

Continuing from the previous post, this script distributes missing values across all features of a Pandas DataFrame. Just set the minimum and maximum percentage of missing values and you’re ready to run it …

Enjoy…


'''
### MISSING VALUES SCRIPT (RANDOM ACROSS ATTRIBUTES) ###

Script to randomly distribute a random percentage of missing values.

The number of attributes to be filled with missing values can be
generated randomly (between 1 and the number of attributes); as
written, every attribute except the class column "y" is processed.

The variables min_percentage and max_percentage define the inclusive
bounds from which the percentage of missing values is randomly drawn
for each processed attribute.
'''
from __future__ import division

import csv
import random

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# --- Files -----------------------------------------------------------------
# CSV that is read, and CSV that receives the copy with injected missing values.
INPUT_FILE_NAME = 'bank-full-preprocessed-raw.csv'
OUTPUT_FILE_NAME = (
    'bank-full-preprocessed-raw-random-missing-values'
    '-across-attributes-big-fraction.csv'
)

# --- Randomisation settings ------------------------------------------------
# Fixed seed so that repeated runs pick the same cells.
random.seed(6787)
# Inclusive bounds (in percent) for the share of missing values per attribute.
min_percentage, max_percentage = 30, 60
# Marker written into every cell that is turned into a missing value.
missing_value = "?"

# --- Data ------------------------------------------------------------------
dframe = pd.read_csv(INPUT_FILE_NAME, sep=',')

def distribute_randomly(frame=None, min_pct=None, max_pct=None, marker=None,
                        skip_columns=('y',)):
    """Overwrite a random share of every column's cells with a marker.

    For each column not listed in ``skip_columns`` a percentage is drawn
    uniformly from ``[min_pct, max_pct]`` (both bounds inclusive) and that
    share of the rows, chosen at random without repetition, is set to
    ``marker``.  The frame is modified in place, every modified column is
    converted to ``object`` dtype, and progress is printed to stdout.

    Args:
        frame: DataFrame to modify; defaults to the module-level ``dframe``.
        min_pct: Lower bound in percent; defaults to ``min_percentage``.
        max_pct: Upper bound in percent; defaults to ``max_percentage``.
        marker: Value written into the chosen cells; defaults to
            ``missing_value``.
        skip_columns: Column names left untouched (the class column).

    Raises:
        ValueError: If ``min_pct`` is greater than ``max_pct`` (raised by
            ``random.randint``).
    """
    if frame is None:
        frame = dframe
    if min_pct is None:
        min_pct = min_percentage
    if max_pct is None:
        max_pct = max_percentage
    if marker is None:
        marker = missing_value

    row_total = len(frame)
    print("number_of_attributes: %s" % len(frame.columns))

    for column in frame.columns:
        print("-----")
        print("attribute name: %s" % column)
        if column in skip_columns:
            continue

        # Divide by a float so the result is a fraction even without
        # ``from __future__ import division`` on Python 2.
        fraction = random.randint(min_pct, max_pct) / 100.0
        print("percentage with missing values: %s" % round(fraction, 2))

        # Clamp so that a percentage above 100 cannot request more rows
        # than the frame has.
        rows_to_blank = min(int(fraction * row_total), row_total)
        print("number of rows with missing values: %s  of total: %s"
              % (rows_to_blank, row_total))
        if rows_to_blank <= 0:
            continue

        # Distinct positions from 0 .. row_total - 1: every row is eligible
        # and exactly ``rows_to_blank`` cells end up changed.
        positions = random.sample(range(row_total), rows_to_blank)

        # Work on an object-dtype copy so the marker can sit next to numeric
        # values, then write the whole column back in one assignment.
        values = frame[column].astype(object)
        values.iloc[positions] = marker
        frame[column] = values

# Inject the missing values into the loaded frame (in place) ...
distribute_randomly()

# ... and write the result out, quoting every non-numeric field.
dframe.to_csv(
    OUTPUT_FILE_NAME,
    sep=',',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

Leave a comment