 from autoviml.sulov_method import FE_remove_variables_using_SULOV_method, remove_highly_correlated_vars_fast
 from autoviml.classify_method import classify_columns
-from imbalanced_ensemble.ensemble import SelfPacedEnsembleClassifier
 from sklearn.metrics import mean_absolute_error, mean_squared_error
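With this import removed, nothing below should reference SelfPacedEnsembleClassifier directly any more (its call sites are commented out further down). If the dependency were instead kept optional, a guarded import is one way to sketch it; the HAS_SPE flag name here is illustrative, not part of the codebase:

    # Hypothetical optional-dependency guard; HAS_SPE is an illustrative name
    try:
        from imbalanced_ensemble.ensemble import SelfPacedEnsembleClassifier
        HAS_SPE = True
    except ImportError:
        HAS_SPE = False  # fall back to a plain sklearn estimator instead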
@@ -1401,24 +1400,19 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 #### no numeric variables are removed. But next time, we will remove them later!
 # Optionally, select top n variables based on their predictive power
 # This step is useful if you want to bin only the most informative variables
-entropy_binner = EntropyBinningTransformer(replace_vars=False, modeltype=modeltype, top_n_vars=None)
-
-# Fit the transformer to the training data
-entropy_binner.fit_transform(X_train, y_train)
-
 part_train, num_vars, important_features, part_cv = add_entropy_binning(part_train,
-                                    each_target, saved_num_vars,
-                                    saved_important_features, part_cv,
-                                    modeltype, entropy_binning=False,
-                                    verbose=verbose)
+                                    each_target, saved_num_vars,
+                                    saved_important_features, part_cv,
+                                    modeltype, entropy_binning=False,verbose=verbose)
 #### In saved_num_vars we send in all the continuous_vars but we bin only the top few vars.
 ### Those that are binned are removed from saved_num_vars and the remaining become num_vars
 ### Our job is to find the names of those original numeric variables which were binned.
 ### orig_num_vars contains original num vars. num_vars contains binned versions of those vars.
 ### Those binned variables have now become categorical vars and must be added to imp_cats.
 ### you get the name of the original vars which were binned here in this orig_num_vars variable!
+orig_num_vars = left_subtract(saved_num_vars, num_vars)
 #### you need to know the name of the binner variables. This is where you get it!
-binned_num_vars = left_subtract(num_vars, saved_num_vars)
+binned_num_vars = left_subtract(num_vars,saved_num_vars)
 imp_cats += binned_num_vars
 #### Also note that important_features does not contain orig_num_vars which have been erased.
 else:
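The two left_subtract calls above are order-preserving list differences: orig_num_vars is whatever was in saved_num_vars but is no longer in num_vars (the originals that got binned), and binned_num_vars is the reverse (the new bin columns). A minimal sketch of that behavior, assuming left_subtract is a plain ordered list difference as its usage here suggests; the column names are made up:

    def left_subtract(l1, l2):
        """Return items of l1 that are not in l2, preserving l1's order."""
        return [x for x in l1 if x not in l2]

    saved_num_vars = ['age', 'fare', 'sibsp']
    num_vars = ['age_bin', 'fare_bin', 'sibsp']   # 'age' and 'fare' were binned
    left_subtract(saved_num_vars, num_vars)       # ['age', 'fare']       -> orig_num_vars
    left_subtract(num_vars, saved_num_vars)       # ['age_bin', 'fare_bin'] -> binned_num_vars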
@@ -2361,8 +2355,8 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 from sklearn.calibration import CalibratedClassifierCV
 model = CalibratedClassifierCV(model, method=method, cv=n_splits)
 if not perform_scaling_flag:
-    X_ful = X_train.append(X_cv)
-    y_ful = y_train.append(y_cv)
+    X_ful = pd.concat([X_train, X_cv], ignore_index=True)
+    y_ful = pd.concat([y_train, y_cv], ignore_index=True)
 else:
     X_ful = np.r_[X_train, X_cv]
     y_ful = np.r_[y_train, y_cv]
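DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so pd.concat is the drop-in replacement here. One behavioral difference worth noting: ignore_index=True renumbers the combined rows 0..n-1, whereas append kept the original row labels by default, so nothing downstream should rely on the old index values. A quick sketch of the difference:

    import pandas as pd

    X_train = pd.DataFrame({'a': [1, 2]})
    X_cv = pd.DataFrame({'a': [3]})

    pd.concat([X_train, X_cv]).index.tolist()                     # [0, 1, 0] - duplicate labels
    pd.concat([X_train, X_cv], ignore_index=True).index.tolist()  # [0, 1, 2] - clean RangeIndex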
@@ -2646,7 +2640,7 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 print('Training model on complete Train data and Predicting using given Test Data...')
 ################ I M P O R T A N T: C O M B I N I N G  D A T A ######################
 #### This is Second time: we combine train and CV into Train and Test Sets #################
-train = part_train.append(part_cv)
+train = pd.concat([part_train, part_cv], ignore_index=True)
 important_features = [x for x in list(train) if x not in target]
 ############################################################################################
 if model_label == 'Single_Label':
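After the concat, important_features is rebuilt from the combined frame's columns; list(df) on a DataFrame yields the column labels, so this is simply "all columns except the target(s)". Note the membership test assumes target is list-like; with a bare string, `in` would do substring matching instead. For example:

    import pandas as pd

    df = pd.DataFrame({'x1': [1], 'x2': [2], 'y': [0]})
    target = ['y']                               # list-like, as the test expects
    [x for x in list(df) if x not in target]     # ['x1', 'x2']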
@@ -2665,6 +2659,7 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 except:
     pass
 ########################## BINNING SECOND TIME ###############################
+new_num_vars = train[important_features].select_dtypes(include=[np.float64, np.float32, np.float16]).columns.tolist()
 ## Now we re-use the saved_num_vars which contained a list of num_vars for binning now!
 ###### Once again we do Entropy Binning on the Full Train Data Set !!
 ########################## BINNING SECOND TIME ###############################
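The added new_num_vars line collects only the float-typed columns among important_features, which is what the second binning pass should operate on. select_dtypes filters columns by dtype, e.g.:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'f': np.array([1.5, 2.5], dtype=np.float32),
                       'i': np.array([1, 2], dtype=np.int64),
                       's': ['a', 'b']})
    df.select_dtypes(include=[np.float64, np.float32, np.float16]).columns.tolist()  # ['f']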
@@ -2677,9 +2672,8 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 #### When we Bin the second time, we set the entropy_binning flag to True so
 #### that all numeric variables that are binned are removed. This way, only bins remain.
 train, num_vars, important_features, test = add_entropy_binning(train, each_target,
-                                    saved_num_vars, important_features, test,
-                                    modeltype, entropy_binning=True,
-                                    verbose=verbose)
+                                    saved_num_vars, important_features, test,
+                                    modeltype, entropy_binning=True,verbose=verbose)
 #### In saved_num_vars we send in all the continuous_vars but we bin only the top few vars.
 ### Those that are binned are removed from saved_num_vars and the remaining become num_vars
 ### Our job is to find the names of those original numeric variables which were binned.
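Per the comments, the flag flips the binning from "add bins alongside the originals" (first pass, entropy_binning=False) to "erase the originals so only bins remain" (this pass). A schematic of that contract with made-up names, not the library function itself; pd.qcut stands in for the tree-based discretizer:

    import pandas as pd

    def bin_columns(df, cols_to_bin, entropy_binning):
        # Illustrative stand-in for the entropy_binning flag's effect
        for col in cols_to_bin:
            df[col + '_bin'] = pd.qcut(df[col], q=4, labels=False, duplicates='drop')
            if entropy_binning:
                df = df.drop(col, axis=1)  # second pass: originals erased, only bins remain
        return df

    df = pd.DataFrame({'fare': [1.0, 2.0, 5.0, 9.0, 20.0, 50.0]})
    bin_columns(df, ['fare'], entropy_binning=True).columns.tolist()  # ['fare_bin']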
@@ -4822,10 +4816,12 @@ def training_with_SMOTE(X_df, y_df, target, eval_set, model_input, Boosting_Flag
 ### For classification problems we are going to use SPE from now on
 if modeltype == 'Binary_Classification':
     ### For Binary class, SPE model is better ############
-    spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    #spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    spe = LogisticRegression()
 else:
     ## For multi-class OnevsRest model is better ###########
-    spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    #spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    spe = LogisticRegression()
     spe = OneVsRestClassifier(estimator=spe)
 print('Training Imbalanced model. This will take time...')
 spe.fit(X_df, y_df)
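With the SelfPacedEnsembleClassifier lines commented out, both branches now train a plain LogisticRegression, and the multi-class branch wraps it in OneVsRestClassifier (one binary classifier per class). This assumes LogisticRegression and OneVsRestClassifier are imported elsewhere in the module (e.g. from sklearn.linear_model and sklearn.multiclass). A minimal self-contained sketch of that wrapper on synthetic data:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier

    X, y = make_classification(n_samples=200, n_classes=3, n_informative=4, random_state=0)
    spe = OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000))
    spe.fit(X, y)          # fits one binary LogisticRegression per class
    print(spe.predict(X[:5]))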
@@ -5418,6 +5414,7 @@ def plot_dfplot(dfplot, plot_title=""):


 ###############################################################################
+import pdb
 def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_test,
                         modeltype, entropy_binning, verbose=0):
     """
@@ -5453,7 +5450,8 @@ def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_tes
 ### This is an Awesome Entropy Based Binning Method for Continuous Variables ###########
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 if modeltype == 'Regression':
-    clf = DecisionTreeRegressor(criterion='mse', min_samples_leaf=2,
+    ### default is 'mse' in old versions and 'squared_error' in new versions - hence leave criterion out
+    clf = DecisionTreeRegressor(min_samples_leaf=2,
                                 max_depth=max_depth,
                                 random_state=seed)
 else:
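Dropping criterion sidesteps a scikit-learn rename: 'mse' became 'squared_error' in 1.0 and was removed in 1.2, while the default has always been the squared-error criterion, so omitting the argument works on both old and new versions. For intuition, the tree-based binning described above amounts to fitting a shallow tree and reading bin edges off its split thresholds; a minimal sketch of that idea, with max_depth and random_state values chosen only for illustration:

    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(0)
    x = rng.uniform(0, 10, 500).reshape(-1, 1)
    y = np.sin(x).ravel() + rng.normal(0, 0.1, 500)

    clf = DecisionTreeRegressor(min_samples_leaf=2, max_depth=2, random_state=99)
    clf.fit(x, y)
    # Internal nodes carry the learned split thresholds; leaf nodes are marked -2
    edges = sorted(t for t in clf.tree_.threshold if t != -2)
    bins = np.digitize(x.ravel(), edges)   # bin ids derived from the tree's split points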