Written a long time ago to feed some ML algorithms with data subsets, because the original data set was too large and the algorithms took too long to run on it.
Have fun with the script…
# do subsampling
"""Subsampling (3).

Subsampling procedure that creates a new dataset with a given percentage-size
of the original sample. Sampling is performed randomly.
"""
import csv
import random

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

INPUT_FILE_NAME = 'bank-full-preprocessed-raw.csv'
OUTPUT_FILE_NAME = 'bank-full-preprocessed-raw-subsampled.csv'

# Fixed seed, just to ensure repeatability between script runs.
RANDOM_SEED = 2009


def subsample_dataset(size_of_new_dataset_in_percentage, dataset=None):
    """Return a new dataset made of randomly chosen rows of the original.

    Example: if the original dataset has 100 rows and
    ``size_of_new_dataset_in_percentage`` is 0.10, the returned dataset has
    10 rows.

    Rows are drawn independently with the ``random`` module, so the same row
    may be picked more than once and seeding ``random`` makes the result
    repeatable.

    Args:
        size_of_new_dataset_in_percentage: Fraction of the original dataset
            size (e.g. 0.30 for 30%). The resulting row count is truncated
            towards zero; a non-positive value yields an empty dataset.
        dataset: DataFrame to sample from. When omitted, the data is read
            from ``INPUT_FILE_NAME``.

    Returns:
        A new DataFrame with the same columns as the source and a fresh
        0..n-1 index.
    """
    if dataset is None:
        dataset = pd.read_csv(INPUT_FILE_NAME, sep=',')

    row_count = len(dataset)
    new_dataset_size = int(size_of_new_dataset_in_percentage * row_count)
    print("new dataset size: {}".format(new_dataset_size))

    # randrange(row_count) covers every row position 0..row_count-1, so the
    # first row can be selected too, and exactly new_dataset_size draws are
    # made.
    row_positions = [random.randrange(row_count)
                     for _ in range(new_dataset_size)]

    # One positional selection keeps the column dtypes and avoids growing a
    # DataFrame one row at a time.
    return dataset.iloc[row_positions].reset_index(drop=True)


def main():
    """Resample the original dataset to 30% and write the result as CSV."""
    random.seed(RANDOM_SEED)
    dframe = pd.read_csv(INPUT_FILE_NAME, sep=',')
    subsampled = subsample_dataset(0.30, dframe)
    subsampled.to_csv(OUTPUT_FILE_NAME, sep=',', encoding='utf-8',
                      index=False, quoting=csv.QUOTE_NONNUMERIC)


if __name__ == '__main__':
    main()