Pandas DataFrame Subsampling in Python

Jun 30, 2016

Written a long time ago to feed some ML algorithms with data subsets, because the original data set was too large and the algorithms took too long to run on it.

Have fun with the script…


# do subsampling

import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import csv
import random

'''
Subsampling (3)

Subsampling procedure that creates a new dataset whose size is a given
percentage of the original dataset. Rows are picked at random.

'''
# Input and output locations of the subsampling run.
INPUT_FILE_NAME = 'bank-full-preprocessed-raw.csv'
OUTPUT_FILE_NAME = 'bank-full-preprocessed-raw-subsampled.csv'

# Fixed seed, just to ensure repeatability of the random row choice.
random.seed(2009)

# Original dataset, read in full from the input CSV.
dframe = pd.read_csv(INPUT_FILE_NAME, sep=',')

# New dataset: starts empty, with the same columns as the original.
subsampled = pd.DataFrame(columns=dframe.columns)
def subsample_dataset(size_of_new_dataset_in_percentage, source=None, target=None):
    """Fill a dataset with randomly chosen rows of the original dataset.

    The number of rows drawn is
    ``int(size_of_new_dataset_in_percentage * len(source))``. Example: if
    the original dataset has 100 rows and
    ``size_of_new_dataset_in_percentage`` is 0.10, the new dataset gets
    exactly 10 rows.

    Every row is drawn independently, so the same original row can appear
    more than once in the result (sampling with replacement).

    Args:
        size_of_new_dataset_in_percentage: Fraction of the original
            dataset to draw, e.g. 0.30 for 30%.
        source: DataFrame to draw rows from. Defaults to the module-level
            ``dframe``.
        target: DataFrame that receives the drawn rows under the labels
            0..n-1. Defaults to the module-level ``subsampled``.

    Returns:
        The target DataFrame (the same object that was filled in place).
    """
    if source is None:
        source = dframe
    if target is None:
        target = subsampled

    row_count = len(source)
    new_dataset_size = int(size_of_new_dataset_in_percentage * row_count)
    # %-formatting prints the same text under Python 2 and Python 3.
    print("new dataset size: %d" % new_dataset_size)

    # range(n) yields exactly n rows; the former range(1, n - 1) produced
    # two rows fewer than announced.
    for i in range(new_dataset_size):
        # randrange covers every position including 0, which the former
        # randint(1, len - 1) could never pick.
        random_position = random.randrange(row_count)
        # Positional lookup works no matter how the source is indexed.
        target.loc[i] = source.iloc[random_position]

    return target

# Resample the original dataset to 30%; the rows end up in `subsampled`.
subsample_dataset(0.30)

# Write the new dataset as UTF-8 CSV without the index column, quoting
# every non-numeric field.
subsampled.to_csv(
    OUTPUT_FILE_NAME,
    sep=',',
    encoding='utf-8',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

Pandas DataFrame Subsampling in Python

Share this:

Leave a comment Cancel reply