Feature Scaling in Python and Pandas DataFrame

Here is a small script that I wrote a long time ago to scale some of the features, in order to get better performance and better prediction results from some ML algorithms.

I used Python with Pandas to read in the CSV file and process the feature values. The formulas for feature scaling used in the script can be found here.

Have fun with the script…


'''
### PREPROCESSING ###

standardizing and normalizing of the following attributes:

balance
duration
campaign
pdays

binning of the following attributes:

age
'''

import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import csv
import random

'''
Preprocessing (3)

'''

# Paths of the CSV to read and the CSV to write.
# NOTE(review): the values look like placeholders to be replaced before running.
INPUT_FILE_NAME = 'INPUT_FILE_PATH'
OUTPUT_FILE_NAME = 'OUTPUT_FILE_PATH'

# The input is parsed as semicolon-separated. dframe is the module-level frame
# from which the scaling helpers below read their column statistics.
dframe = pd.read_csv(INPUT_FILE_NAME, sep=';')

## 1
# normalize attribute balance
def normalize_balance(balance):
    """Min-max scale a balance against the ``balance`` column of ``dframe``.

    Args:
        balance: The balance value to rescale. Only arithmetic is applied,
            so a whole column can be passed as well.

    Returns:
        ``(balance - min) / (max - min)``, with min and max taken over
        ``dframe.balance``.
    """
    column = dframe.balance.values
    lowest = column.min()
    # NOTE: a constant column gives a zero range; that case is not guarded.
    return (balance - lowest) / (column.max() - lowest)

# standardise attribute balance
def standardize_balance(balance):
    """Standardize a balance against the ``balance`` column of ``dframe``.

    Args:
        balance: The balance value to standardize. Only arithmetic is
            applied, so a whole column can be passed as well.

    Returns:
        ``(balance - mean) / std``, with mean and std taken over
        ``dframe.balance`` (pandas' ``std`` defaults to the sample
        standard deviation, ``ddof=1``).
    """
    column = dframe.balance
    return (balance - column.mean()) / column.std()
## 2
# normalize contact duration
def normalize_duration(duration):
    """Min-max scale a duration against the ``duration`` column of ``dframe``.

    Args:
        duration: The contact duration to rescale. Only arithmetic is
            applied, so a whole column can be passed as well.

    Returns:
        ``(duration - min) / (max - min)``, with min and max taken over
        ``dframe.duration``.
    """
    column = dframe.duration.values
    lowest = column.min()
    # NOTE: a constant column gives a zero range; that case is not guarded.
    return (duration - lowest) / (column.max() - lowest)

# standardize contact duration
def standardize_duration(duration):
    """Standardize a duration against the ``duration`` column of ``dframe``.

    Args:
        duration: The contact duration to standardize. Only arithmetic is
            applied, so a whole column can be passed as well.

    Returns:
        ``(duration - mean) / std``, with mean and std taken over
        ``dframe.duration`` (pandas' ``std`` defaults to the sample
        standard deviation, ``ddof=1``).
    """
    column = dframe.duration
    return (duration - column.mean()) / column.std()

## 3
# normalize campaign
def normalize_campaign(campaign):
    """Min-max scale a campaign value against the ``campaign`` column of ``dframe``.

    Args:
        campaign: The campaign value to rescale. Only arithmetic is
            applied, so a whole column can be passed as well.

    Returns:
        ``(campaign - min) / (max - min)``, with min and max taken over
        ``dframe.campaign``.
    """
    column = dframe.campaign.values
    lowest = column.min()
    # NOTE: a constant column gives a zero range; that case is not guarded.
    return (campaign - lowest) / (column.max() - lowest)

# standardize campaign
def standardize_campaign(campaign):
    """Standardize a campaign value against the ``campaign`` column of ``dframe``.

    Args:
        campaign: The campaign value to standardize. Only arithmetic is
            applied, so a whole column can be passed as well.

    Returns:
        ``(campaign - mean) / std``, with mean and std taken over
        ``dframe.campaign`` (pandas' ``std`` defaults to the sample
        standard deviation, ``ddof=1``).
    """
    column = dframe.campaign
    return (campaign - column.mean()) / column.std()

## 4
# normalize pdays
def normalize_pdays(pdays):
    """Min-max scale a pdays value against the ``pdays`` column of ``dframe``.

    Args:
        pdays: The pdays value to rescale. Only arithmetic is applied, so
            a whole column can be passed as well.

    Returns:
        ``(pdays - min) / (max - min)``, with min and max taken over
        ``dframe.pdays``.
    """
    column = dframe.pdays.values
    lowest = column.min()
    # NOTE: a constant column gives a zero range; that case is not guarded.
    return (pdays - lowest) / (column.max() - lowest)

# standardize pdays
def standardize_pdays(pdays):
    """Standardize a pdays value against the ``pdays`` column of ``dframe``.

    Args:
        pdays: The pdays value to standardize. Only arithmetic is applied,
            so a whole column can be passed as well.

    Returns:
        ``(pdays - mean) / std``, with mean and std taken over
        ``dframe.pdays`` (pandas' ``std`` defaults to the sample standard
        deviation, ``ddof=1``).
    """
    column = dframe.pdays
    return (pdays - column.mean()) / column.std()

## 5
# bin ages
def bin_ages(age):
    """Map an age to a coarse bracket label.

    Args:
        age: A single numeric age.

    Returns:
        ``"A"`` for ``age <= 20``, ``"B"`` for ``20 < age <= 40``,
        ``"C"`` for ``40 < age <= 65`` and ``"D"`` for anything else.
    """
    # Each check is only reached when the previous upper bound was exceeded,
    # so the lower bounds do not need to be tested again.
    if age <= 20:
        return "A"
    if age <= 40:
        return "B"
    if age <= 65:
        return "C"
    return "D"
# Derive the scaled columns. The scaling helpers consist of plain arithmetic,
# so each is applied to the whole column in one call; a row-wise
# apply(axis=1) would re-derive the column statistics once per row.

# balance
dframe['balancenormalized'] = normalize_balance(dframe['balance'])
dframe['balancestandardised'] = standardize_balance(dframe['balance'])

# duration
dframe['durationnormalized'] = normalize_duration(dframe['duration'])
dframe['durationstandardised'] = standardize_duration(dframe['duration'])

# campaign
dframe['campaignnormalized'] = normalize_campaign(dframe['campaign'])
dframe['campaignstandardised'] = standardize_campaign(dframe['campaign'])

# pdays
# NOTE(review): 'pdaysnnormalized' (double "n") looks like a typo, but it is the
# column name written to the output file, so it is kept unchanged.
dframe['pdaysnnormalized'] = normalize_pdays(dframe['pdays'])
dframe['pdaysstandardised'] = standardize_pdays(dframe['pdays'])

# age
# bin_ages branches on a single value, so it is mapped over the column
# element by element.
dframe['agebinned'] = dframe['age'].apply(bin_ages)

# Write the original columns plus the derived ones; non-numeric fields are quoted.
dframe.to_csv(OUTPUT_FILE_NAME, sep=',', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
#dframe.head()

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: