Distribute Missing Values randomly across columns in Python Pandas

Continuing from the previous post, this script distributes missing values across all features in a Pandas DataFrame. Just set the minimum and maximum missing-value percentages and you’re ready to execute …



Script to randomly distribute a random percentage of missing values.

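The number of attributes to be filled with missing values was meant to be
randomly generated, between 1 and the number of attributes; the code below
currently uses every attribute instead (the random choice is commented out).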

The variables min_percentage and max_percentage define the bounds
for randomly choosing the percentage of missing values that is going
to be distributed across the selected attributes.
import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import csv
import random
from __future__ import division

# --- File locations ---------------------------------------------------------
# CSV that is read into the working DataFrame, and CSV the result is written to.
INPUT_FILE_NAME = 'bank-full-preprocessed-raw.csv'
OUTPUT_FILE_NAME = 'bank-full-preprocessed-raw-random-missing-values-across-attributes-big-fraction.csv'

# --- Missing-value settings -------------------------------------------------
# Lower and upper bound, in whole percent, of the share of rows that receive
# the missing-value marker; a value in this range is drawn with random.randint.
min_percentage = 30
max_percentage = 60
# Marker written into the cells that are turned into missing values.
missing_value = "?"

# Fixed seed so repeated runs draw the same random numbers.
random.seed(6787)

# Working DataFrame shared by the rest of the script.
dframe = pd.read_csv(INPUT_FILE_NAME, sep=',')

def distribute_randomly(frame=None, low=None, high=None, marker=None,
                        class_column='y'):
    """Overwrite a random share of every feature column with a missing marker.

    For each column except ``class_column`` a whole-number percentage is drawn
    with ``random.randint(low, high)`` (both bounds inclusive). That share of
    the rows, rounded down, is then picked without repetition and the cells
    are replaced by ``marker``. The frame is modified in place and progress is
    printed to stdout.

    Fixes compared with the earlier version:

    * the class column is now really skipped (the old condition selected
      *only* the class column);
    * exactly the announced number of distinct rows is changed -- the old
      loop ran one iteration short, drew rows with repetition, could never
      hit the first row and could draw a label one past the last row;
    * rows are addressed by position, so any index works;
    * the row count is computed with integer arithmetic, avoiding float
      truncation such as ``int(0.29 * 100) == 28``.

    Because rows are now drawn with ``random.sample``, the random stream is
    consumed differently, so a given seed yields a different selection than
    the earlier version did.

    Args:
        frame: DataFrame to modify in place. Defaults to the module-level
            ``dframe``.
        low: Smallest percentage (0-100) that may be drawn. Defaults to the
            module-level ``min_percentage``.
        high: Largest percentage (0-100) that may be drawn. Defaults to the
            module-level ``max_percentage``.
        marker: Value written into the selected cells. Defaults to the
            module-level ``missing_value``.
        class_column: Name of the column that must stay untouched.

    Returns:
        None.

    Raises:
        ValueError: if ``low`` is greater than ``high`` (raised by
            ``random.randint``).
    """
    # Fall back to the script-level settings only for arguments not supplied.
    frame = dframe if frame is None else frame
    low = min_percentage if low is None else low
    high = max_percentage if high is None else high
    marker = missing_value if marker is None else marker

    row_count = len(frame)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("number_of_attributes: %d" % len(frame.columns))

    for position, column in enumerate(frame.columns):
        print("-----")
        print("attribute name: %s" % column)

        # The class label must stay complete.
        if column == class_column:
            continue

        percentage = random.randint(low, high)
        # Divide by a float so the result does not depend on
        # `from __future__ import division` being in effect.
        print("percentage with missing values: %s"
              % round(percentage / 100.0, 2))

        # Integer arithmetic keeps the count exact; never exceed the frame.
        rows_to_change = min((percentage * row_count) // 100, row_count)
        print("number of rows with missing values: %d  of total: %d"
              % (rows_to_change, row_count))
        if rows_to_change <= 0:
            continue

        # Distinct row positions, so exactly rows_to_change cells change.
        chosen_rows = random.sample(range(row_count), rows_to_change)

        # Make the column able to hold the marker next to its original
        # values instead of relying on implicit upcasting during assignment.
        frame[column] = frame[column].astype(object)
        frame.iloc[chosen_rows, position] = marker

# Inject the missing values before saving; without this call the function is
# never run and the output file is just a copy of the input data.
distribute_randomly()

# QUOTE_NONNUMERIC makes the csv writer quote every non-numeric field, so the
# missing-value marker is written as a quoted string.
dframe.to_csv(OUTPUT_FILE_NAME, sep=',', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: