 from autoviml.sulov_method import FE_remove_variables_using_SULOV_method, remove_highly_correlated_vars_fast
 from autoviml.classify_method import classify_columns
-from imbalanced_ensemble.ensemble import SelfPacedEnsembleClassifier
 from sklearn.metrics import mean_absolute_error, mean_squared_error
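With this import removed, nothing below should reference SelfPacedEnsembleClassifier directly any more (its call sites are commented out further down). If the dependency were instead kept optional, a guarded import is one way to sketch it; the HAS_SPE flag name here is illustrative, not part of the codebase:

    # Hypothetical optional-dependency guard; HAS_SPE is an illustrative name
    try:
        from imbalanced_ensemble.ensemble import SelfPacedEnsembleClassifier
        HAS_SPE = True
    except ImportError:
        HAS_SPE = False  # fall back to a plain sklearn estimator instead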
@@ -1401,24 +1400,19 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 #### no numeric variables are removed. But next time, we will remove them later!
 # Optionally, select top n variables based on their predictive power
 # This step is useful if you want to bin only the most informative variables
-entropy_binner = EntropyBinningTransformer(replace_vars=False, modeltype=modeltype, top_n_vars=None)
-
-# Fit the transformer to the training data
-entropy_binner.fit_transform(X_train, y_train)
-
 part_train, num_vars, important_features, part_cv = add_entropy_binning(part_train,
-                                    each_target, saved_num_vars,
-                                    saved_important_features, part_cv,
-                                    modeltype, entropy_binning=False,
-                                    verbose=verbose)
+                                    each_target, saved_num_vars,
+                                    saved_important_features, part_cv,
+                                    modeltype, entropy_binning=False,verbose=verbose)
 #### In saved_num_vars we send in all the continuous_vars but we bin only the top few vars.
 ### Those that are binned are removed from saved_num_vars and the remaining become num_vars
 ### Our job is to find the names of those original numeric variables which were binned.
 ### orig_num_vars contains original num vars. num_vars contains binned versions of those vars.
 ### Those binned variables have now become categorical vars and must be added to imp_cats.
 ### you get the name of the original vars which were binned here in this orig_num_vars variable!
+orig_num_vars = left_subtract(saved_num_vars, num_vars)
 #### you need to know the name of the binner variables. This is where you get it!
-binned_num_vars = left_subtract(num_vars, saved_num_vars)
+binned_num_vars = left_subtract(num_vars,saved_num_vars)
 imp_cats += binned_num_vars
 #### Also note that important_features does not contain orig_num_vars which have been erased.
 else:
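The two left_subtract calls above are order-preserving list differences: orig_num_vars is whatever was in saved_num_vars but is no longer in num_vars (the originals that got binned), and binned_num_vars is the reverse (the new bin columns). A minimal sketch of that behavior, assuming left_subtract is a plain ordered list difference as its usage here suggests; the column names are made up:

    def left_subtract(l1, l2):
        """Return items of l1 that are not in l2, preserving l1's order."""
        return [x for x in l1 if x not in l2]

    saved_num_vars = ['age', 'fare', 'sibsp']
    num_vars = ['age_bin', 'fare_bin', 'sibsp']   # 'age' and 'fare' were binned
    left_subtract(saved_num_vars, num_vars)       # ['age', 'fare']       -> orig_num_vars
    left_subtract(num_vars, saved_num_vars)       # ['age_bin', 'fare_bin'] -> binned_num_vars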
@@ -2361,8 +2355,8 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 from sklearn.calibration import CalibratedClassifierCV
 model = CalibratedClassifierCV(model, method=method, cv=n_splits)
 if not perform_scaling_flag:
-    X_ful = X_train.append(X_cv)
-    y_ful = y_train.append(y_cv)
+    X_ful = pd.concat([X_train, X_cv], ignore_index=True)
+    y_ful = pd.concat([y_train, y_cv], ignore_index=True)
 else:
     X_ful = np.r_[X_train, X_cv]
     y_ful = np.r_[y_train, y_cv]
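DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so pd.concat is the drop-in replacement here. One behavioral difference worth noting: ignore_index=True renumbers the combined rows 0..n-1, whereas append kept the original row labels by default, so nothing downstream should rely on the old index values. A quick sketch of the difference:

    import pandas as pd

    X_train = pd.DataFrame({'a': [1, 2]})
    X_cv = pd.DataFrame({'a': [3]})

    pd.concat([X_train, X_cv]).index.tolist()                     # [0, 1, 0] - duplicate labels
    pd.concat([X_train, X_cv], ignore_index=True).index.tolist()  # [0, 1, 2] - clean RangeIndex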
@@ -2646,7 +2640,7 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 print('Training model on complete Train data and Predicting using given Test Data...')
 ################ I M P O R T A N T: C O M B I N I N G  D A T A ######################
 #### This is Second time: we combine train and CV into Train and Test Sets #################
-train = part_train.append(part_cv)
+train = pd.concat([part_train, part_cv], ignore_index=True)
 important_features = [x for x in list(train) if x not in target]
 ############################################################################################
 if model_label == 'Single_Label':
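After the concat, important_features is rebuilt from the combined frame's columns; list(df) on a DataFrame yields the column labels, so this is simply "all columns except the target(s)". Note the membership test assumes target is list-like; with a bare string, `in` would do substring matching instead. For example:

    import pandas as pd

    df = pd.DataFrame({'x1': [1], 'x2': [2], 'y': [0]})
    target = ['y']                               # list-like, as the test expects
    [x for x in list(df) if x not in target]     # ['x1', 'x2']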
@@ -2665,6 +2659,7 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 except:
     pass
 ########################## BINNING SECOND TIME ###############################
+new_num_vars = train[important_features].select_dtypes(include=[np.float64, np.float32, np.float16]).columns.tolist()
 ## Now we re-use the saved_num_vars which contained a list of num_vars for binning now!
 ###### Once again we do Entropy Binning on the Full Train Data Set !!
 ########################## BINNING SECOND TIME ###############################
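The added new_num_vars line collects only the float-typed columns among important_features, which is what the second binning pass should operate on. select_dtypes filters columns by dtype, e.g.:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'f': np.array([1.5, 2.5], dtype=np.float32),
                       'i': np.array([1, 2], dtype=np.int64),
                       's': ['a', 'b']})
    df.select_dtypes(include=[np.float64, np.float32, np.float16]).columns.tolist()  # ['f']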
@@ -2677,9 +2672,8 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 #### When we Bin the second time, we set the entropy_binning flag to True so
 #### that all numeric variables that are binned are removed. This way, only bins remain.
 train, num_vars, important_features, test = add_entropy_binning(train, each_target,
-                                    saved_num_vars, important_features, test,
-                                    modeltype, entropy_binning=True,
-                                    verbose=verbose)
+                                    saved_num_vars, important_features, test,
+                                    modeltype, entropy_binning=True,verbose=verbose)
 #### In saved_num_vars we send in all the continuous_vars but we bin only the top few vars.
 ### Those that are binned are removed from saved_num_vars and the remaining become num_vars
 ### Our job is to find the names of those original numeric variables which were binned.
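Per the comments, the flag flips the binning from "add bins alongside the originals" (first pass, entropy_binning=False) to "erase the originals so only bins remain" (this pass). A schematic of that contract with made-up names, not the library function itself; pd.qcut stands in for the tree-based discretizer:

    import pandas as pd

    def bin_columns(df, cols_to_bin, entropy_binning):
        # Illustrative stand-in for the entropy_binning flag's effect
        for col in cols_to_bin:
            df[col + '_bin'] = pd.qcut(df[col], q=4, labels=False, duplicates='drop')
            if entropy_binning:
                df = df.drop(col, axis=1)  # second pass: originals erased, only bins remain
        return df

    df = pd.DataFrame({'fare': [1.0, 2.0, 5.0, 9.0, 20.0, 50.0]})
    bin_columns(df, ['fare'], entropy_binning=True).columns.tolist()  # ['fare_bin']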
@@ -4822,10 +4816,12 @@ def training_with_SMOTE(X_df, y_df, target, eval_set, model_input, Boosting_Flag
 ### For classification problems we are going to use SPE from now on
 if modeltype == 'Binary_Classification':
     ### For Binary class, SPE model is better ############
-    spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    #spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    spe = LogisticRegression()
 else:
     ## For multi-class OnevsRest model is better ###########
-    spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    #spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    spe = LogisticRegression()
     spe = OneVsRestClassifier(estimator=spe)
 print('Training Imbalanced model. This will take time...')
 spe.fit(X_df, y_df)
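With the SelfPacedEnsembleClassifier lines commented out, both branches now train a plain LogisticRegression, and the multi-class branch wraps it in OneVsRestClassifier (one binary classifier per class). This assumes LogisticRegression and OneVsRestClassifier are imported elsewhere in the module (e.g. from sklearn.linear_model and sklearn.multiclass). A minimal self-contained sketch of that wrapper on synthetic data:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier

    X, y = make_classification(n_samples=200, n_classes=3, n_informative=4, random_state=0)
    spe = OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000))
    spe.fit(X, y)          # fits one binary LogisticRegression per class
    print(spe.predict(X[:5]))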
@@ -5418,6 +5414,7 @@ def plot_dfplot(dfplot, plot_title=""):


 ###############################################################################
+import pdb
 def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_test,
                         modeltype, entropy_binning, verbose=0):
     """
@@ -5453,7 +5450,8 @@ def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_tes
 ### This is an Awesome Entropy Based Binning Method for Continuous Variables ###########
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 if modeltype == 'Regression':
-    clf = DecisionTreeRegressor(criterion='mse', min_samples_leaf=2,
+    ### default is 'mse' in old versions and 'squared_error' in new versions - hence leave criterion out
+    clf = DecisionTreeRegressor(min_samples_leaf=2,
                                 max_depth=max_depth,
                                 random_state=seed)
 else:
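Dropping criterion sidesteps a scikit-learn rename: 'mse' became 'squared_error' in 1.0 and was removed in 1.2, while the default has always been the squared-error criterion, so omitting the argument works on both old and new versions. For intuition, the tree-based binning described above amounts to fitting a shallow tree and reading bin edges off its split thresholds; a minimal sketch of that idea, with max_depth and random_state values chosen only for illustration:

    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(0)
    x = rng.uniform(0, 10, 500).reshape(-1, 1)
    y = np.sin(x).ravel() + rng.normal(0, 0.1, 500)

    clf = DecisionTreeRegressor(min_samples_leaf=2, max_depth=2, random_state=99)
    clf.fit(x, y)
    # Internal nodes carry the learned split thresholds; leaf nodes are marked -2
    edges = sorted(t for t in clf.tree_.threshold if t != -2)
    bins = np.digitize(x.ravel(), edges)   # bin ids derived from the tree's split points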