-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathDataProcessing.py
55 lines (42 loc) · 2.28 KB
/
DataProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def processData(input_path='./data.csv',
                visualization_path='visualization.csv',
                output_path='preprocessedR.csv'):
    """Clean the raw consumption table and write two preprocessed CSV files.

    The input CSV must contain a ``FLAG`` column, a ``CONS_NO`` column and
    otherwise only columns whose headers are parseable by
    ``pd.to_datetime`` (e.g. ``2014/1/1``). Each row is processed as one
    record whose date columns form its series.
    NOTE(review): FLAG / CONS_NO look like a class label and a consumer id;
    confirm against the dataset description.

    Steps, in order:
      1. Drop rows whose date values duplicate an earlier row.
      2. Drop rows whose date values sum to zero (all zero and/or NaN).
      3. Parse the column headers as dates and sort them chronologically.
      4. Fill gaps of missing values by linear interpolation along each
         row (at most 2 consecutive values from either side of a gap);
         anything still missing becomes 0.
      5. Cap every value above ``row mean + 3 * row std`` at that bound.
      6. Write the result of step 5 to ``visualization_path``.
      7. Min-max scale every row to [0, 1], put ``FLAG`` and ``CONS_NO``
         back in front, and write the result to ``output_path``.

    Args:
        input_path: CSV file to read.
        visualization_path: Destination of the cleaned, unscaled table.
        output_path: Destination of the cleaned, scaled table including
            the ``FLAG`` and ``CONS_NO`` columns.

    Returns:
        None. The results are the two CSV files.
    """
    raw = pd.read_csv(input_path)

    # Keep the non-series columns aside; they are re-attached at the end.
    label_columns = ['FLAG', 'CONS_NO']
    info = raw[label_columns].copy()
    data = raw.drop(columns=label_columns)

    # Rows to keep: not a repeat of an earlier row, and not entirely
    # zero/NaN (NaN is skipped by sum(), so an all-NaN row sums to 0).
    keep = ~data.duplicated() & (data.sum(axis=1) != 0)
    data = data[keep]
    info = info[keep]

    # Headers such as 2014/1/1 become timestamps so that sorting is
    # chronological rather than lexicographic.
    data.columns = pd.to_datetime(data.columns)
    data = data.sort_index(axis=1)

    # Rows were removed above; renumber both frames identically so they
    # stay aligned for the final concat.
    data = data.reset_index(drop=True)
    info = info.reset_index(drop=True)

    # axis=1: a gap is filled from the same row's adjacent dates. With
    # axis=0 the fill values would come from the neighbouring rows, i.e.
    # from unrelated records on the same date.
    data = data.interpolate(
        method='linear', limit=2, limit_direction='both', axis=1
    ).fillna(0)

    # Cap outliers per row at mean + 3 * std (sample std, pandas default).
    # A NaN bound (e.g. a single date column) leaves the row unchanged.
    upper_bound = data.mean(axis=1) + 3 * data.std(axis=1)
    data = data.clip(upper=upper_bound, axis=0)

    # Preprocessed data without scaling.
    data.to_csv(visualization_path, index=False, header=True)

    # MinMaxScaler scales per column, so transpose to scale each row over
    # its own min/max, then transpose back.
    scaled = MinMaxScaler().fit_transform(data.values.T).T
    scaled_frame = pd.DataFrame(data=scaled, columns=data.columns)

    # Back to the initial layout: FLAG, CONS_NO, then the date columns.
    preprocessed = pd.concat([info, scaled_frame], axis=1, sort=False)
    preprocessed.to_csv(output_path, index=False, header=True)