Distribute Missing Values in Pandas DataFrame Column with Python

Jun 30, 2016

Data Mining, Data Science, Missing Values, Pandas, Python

If you want to test how machine learning algorithms perform with missing values, you may need a script to distribute a fixed percentage of missing values in a feature. This script randomly distributes missing values in a single column of a data set.

Enjoy…


from __future__ import division

import csv
import random

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# Source CSV and the destination for the copy with injected missing values.
INPUT_FILE_NAME = 'bank-full-preprocessed-raw.csv'
OUTPUT_FILE_NAME = 'bank-full-preprocessed-raw-selected-attribute-duration-small-fraction.csv'

random.seed(2009)  # fixed seed so the same rows are selected on every run
missing_value = "?"  # marker written into the cells chosen as missing

# The whole data set is loaded once and modified in place by the script.
dframe = pd.read_csv(INPUT_FILE_NAME, sep=',')

def distribute_selected(attribute, percentage, frame=None, marker=None):
    """Randomly replace a fraction of one column's values with a missing marker.

    Example: distribute_selected("job", 0.03) marks 3% of the rows of the
    ``job`` column as missing.

    Args:
        attribute: Name of the column to modify.
        percentage: Fraction of rows (0.0 to 1.0) to mark as missing. The
            resulting row count is truncated to an integer and clamped to
            the number of rows available.
        frame: DataFrame to modify in place. Defaults to the module-level
            ``dframe``.
        marker: Value written into the selected cells. Defaults to the
            module-level ``missing_value``.
    """
    if frame is None:
        frame = dframe
    if marker is None:
        marker = missing_value

    total = len(frame)
    # Clamp so out-of-range percentages cannot request more rows than exist
    # (or a negative number of rows).
    count = max(0, min(int(percentage * total), total))

    print("attribute name: %s" % attribute)
    print("number of rows with missing values: %d  of total: %d" % (count, total))
    if count == 0:
        return

    # Sample distinct row positions so exactly `count` cells are changed;
    # drawing with replacement could hit the same row more than once.
    rows = random.sample(range(total), count)

    # Cast to object first so a string marker can be stored in a numeric
    # column without relying on implicit dtype upcasting.
    frame[attribute] = frame[attribute].astype(object)

    # Positional indexing covers row 0 and works for any index labels.
    frame.iloc[rows, frame.columns.get_loc(attribute)] = marker

# Mark 10% of the "duration" column as missing, then write the modified
# data set to the output file without the index column.
distribute_selected("duration", 0.10)
dframe.to_csv(OUTPUT_FILE_NAME, sep=',', encoding='utf-8', index=False, quoting = csv.QUOTE_NONNUMERIC)

Distribute Missing Values in Pandas DataFrame Column with Python

Share this:

Leave a comment Cancel reply