normalize.py
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, QuantileTransformer, Normalizer
# Read the dataset
file_path = 'dataset.csv'
data = pd.read_csv(file_path)
# Split the dataset into features and target
X = data.iloc[:, :-1] # Features
y = data.iloc[:, -1] # Target
# Initialize scalers
scalers = {
    # Scales each feature to a fixed range, usually 0 to 1.
    'MinMaxScaler': MinMaxScaler(),
    # Also known as standardization: centers each feature to mean 0 and standard deviation 1.
    'StandardScaler': StandardScaler(),
    # Robust to outliers: uses the median and the interquartile range for scaling.
    'RobustScaler': RobustScaler(),
    # Scales each feature by its maximum absolute value, preserving the sparsity of the data.
    'MaxAbsScaler': MaxAbsScaler(),
    # Transforms features to follow a uniform or normal distribution; useful when the data is not Gaussian.
    'QuantileTransformer': QuantileTransformer(output_distribution='normal'),
    # Scales individual samples (rows) to unit norm (i.e., the sum of squares is 1); useful for text classification and clustering.
    'Normalizer': Normalizer()
}
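
# Note (illustrative, not in the original script): with the default
# n_quantiles=1000, QuantileTransformer warns and clamps n_quantiles to the
# number of rows when the dataset is small. If dataset.csv has fewer than
# 1000 rows, a cap like the sketch below avoids that warning; len(X) is simply
# the number of samples read above.
# scalers['QuantileTransformer'] = QuantileTransformer(
#     n_quantiles=min(1000, len(X)), output_distribution='normal'
# )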
# Apply each scaler to the features and keep the target column unchanged
normalized_data = {}
for scaler_name, scaler in scalers.items():
    X_scaled = scaler.fit_transform(X)
    X_normalized = pd.DataFrame(X_scaled, columns=X.columns)
    normalized_data[scaler_name] = pd.concat([X_normalized, y], axis=1)

# Display the normalized data for each method
for scaler_name, normalized_df in normalized_data.items():
    print(f"Normalized Data using {scaler_name}:\n", normalized_df.head(), "\n")