Here is a small script that I wrote a long time ago to scale some of the features of a data set in order to get better performance and better prediction results from some ML algorithms.
I used Python with Pandas to read in the CSV file and process the feature values. The formulas for feature scaling used in the script can be found here.
Have fun with the script…
"""Preprocess a CSV file: scale numeric attributes and bin ages.

Normalizing (min-max) and standardizing (z-score) of the attributes:
    balance, duration, campaign, pdays
Binning of the attribute:
    age

The input file is read from ``INPUT_FILE_NAME`` (``;``-separated) and the
table, extended with the derived columns, is written to
``OUTPUT_FILE_NAME`` (``,``-separated).
"""
import csv
import random

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Preprocessing (3)
INPUT_FILE_NAME = 'INPUT_FILE_PATH'
OUTPUT_FILE_NAME = 'OUTPUT_FILE_PATH'

dframe = pd.read_csv(INPUT_FILE_NAME, sep=';')


def _min_max_scale(values, column):
    """Return ``(values - min) / (max - min)`` using the stats of ``dframe[column]``.

    ``values`` may be a scalar or a whole Series; the arithmetic is the same.
    The minimum and maximum are taken with pandas' NaN-skipping reductions so
    that a single missing value does not turn every result into NaN (the
    z-score helper below already skips NaN via ``mean``/``std``).

    NOTE: no guard is applied for a constant column (max == min); the
    division by zero is left to NumPy/pandas.
    """
    lowest = dframe[column].min()
    highest = dframe[column].max()
    return (values - lowest) / (highest - lowest)


def _z_score(values, column):
    """Return ``(values - mean) / std`` using the stats of ``dframe[column]``.

    ``std`` is pandas' ``Series.std`` with its default arguments.
    """
    return (values - dframe[column].mean()) / dframe[column].std()


## 1
def normalize_balance(balance):
    """Min-max normalize ``balance`` against the ``balance`` column."""
    return _min_max_scale(balance, 'balance')


def standardize_balance(balance):
    """Standardize ``balance`` against the ``balance`` column."""
    return _z_score(balance, 'balance')


## 2
def normalize_duration(duration):
    """Min-max normalize a contact ``duration`` against the ``duration`` column."""
    return _min_max_scale(duration, 'duration')


def standardize_duration(duration):
    """Standardize a contact ``duration`` against the ``duration`` column."""
    return _z_score(duration, 'duration')


## 3
def normalize_campaign(campaign):
    """Min-max normalize ``campaign`` against the ``campaign`` column."""
    return _min_max_scale(campaign, 'campaign')


def standardize_campaign(campaign):
    """Standardize ``campaign`` against the ``campaign`` column."""
    return _z_score(campaign, 'campaign')


## 4
def normalize_pdays(pdays):
    """Min-max normalize ``pdays`` against the ``pdays`` column."""
    return _min_max_scale(pdays, 'pdays')


def standardize_pdays(pdays):
    """Standardize ``pdays`` against the ``pdays`` column."""
    return _z_score(pdays, 'pdays')


## 5
def bin_ages(age):
    """Map a single age to a bin label.

    Returns "A" for age <= 20, "B" for 20 < age <= 40, "C" for
    40 < age <= 65 and "D" otherwise (every comparison is false for NaN,
    so a missing age also yields "D").
    """
    if age <= 20:
        return "A"
    if age <= 40:
        return "B"
    if age <= 65:
        return "C"
    return "D"


# Each scaling function is applied once to the whole column instead of once
# per row: the column statistics are computed a single time per derived
# column rather than being recomputed for every row.

# balance
dframe['balancenormalized'] = normalize_balance(dframe['balance'])
dframe['balancestandardised'] = standardize_balance(dframe['balance'])

# duration
dframe['durationnormalized'] = normalize_duration(dframe['duration'])
dframe['durationstandardised'] = standardize_duration(dframe['duration'])

# campaign
dframe['campaignnormalized'] = normalize_campaign(dframe['campaign'])
dframe['campaignstandardised'] = standardize_campaign(dframe['campaign'])

# pdays
# NOTE(review): the column name below has a doubled "n"; it is kept as-is so
# that anything reading the output file by column name keeps working.
dframe['pdaysnnormalized'] = normalize_pdays(dframe['pdays'])
dframe['pdaysstandardised'] = standardize_pdays(dframe['pdays'])

# age
dframe['agebinned'] = dframe['age'].map(bin_ages)

dframe.to_csv(OUTPUT_FILE_NAME, sep=',', encoding='utf-8', index=False,
              quoting=csv.QUOTE_NONNUMERIC)