
Commit aff7e4d

committed: New 0.2 version - works on Python 3.12 or higher
1 parent 9e1fdb0 commit aff7e4d

7 files changed: +43 −39 lines changed

README.md

Lines changed: 6 additions & 1 deletion
@@ -12,9 +12,14 @@ Automatically Build Various Interpretable ML models fast!<br>
 
 Auto_ViML is pronounced as "auto vimal" (autovimal logo created by Sanket Ghanmare).
 
+## Update (Jan 2025)
+<ol>
+<li><b>Auto_ViML is now upgraded to version 0.2</b>, which means it now runs on Python 3.12 or greater and on pandas 2.0 - a major upgrade for anyone working in Colab, Kaggle, and other up-to-date kernels. Please check the `requirements.txt` file for the recommended versions.</li>
+</ol>
+
 ## Update (March 2023)
 <ol>
-<li><b>Auto_ViML has a new flag to speed up processing using GPU's</b>. Just set the `GPU_flag`=`True` on Colab and other environments. But don't forget to set the runtime type to be "GPU" while running on Colab. Otherwise you will get an error.</li>
+<li>Auto_ViML has a new flag to speed up processing using GPUs. Just set `GPU_flag=True` on Colab and similar environments, but don't forget to set the runtime type to "GPU" when running on Colab; otherwise you will get an error.</li>
 </ol>
 
 ## Update (May 2022)
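A minimal sketch of a call that exercises the new flag, assuming the `Auto_ViML(train, target, test=..., sample_submission='', hyper_param='RS', ...)` signature visible in the diff below; the CSV file names and the `'target'` column are hypothetical:

```python
# Hypothetical usage sketch; file names and the target column are made up.
import pandas as pd
from autoviml.Auto_ViML import Auto_ViML

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The four return values are assumed from the project's README;
# verify against your installed version.
model, features, train_out, test_out = Auto_ViML(
    train, 'target', test,
    hyper_param='RS',  # default per the signature shown below
    GPU_flag=True,     # on Colab, also set the runtime type to "GPU"
    verbose=1,
)
```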

autoviml/Auto_ViML.py

Lines changed: 18 additions & 20 deletions
@@ -64,7 +64,6 @@
 from autoviml.sulov_method import FE_remove_variables_using_SULOV_method, remove_highly_correlated_vars_fast
 
 from autoviml.classify_method import classify_columns
-from imbalanced_ensemble.ensemble import SelfPacedEnsembleClassifier
 
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 
@@ -1401,24 +1400,19 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 #### no numeric variables are removed. But next time, we will remove them later!
 # Optionally, select top n variables based on their predictive power
 # This step is useful if you want to bin only the most informative variables
-entropy_binner = EntropyBinningTransformer(replace_vars=False, modeltype=modeltype, top_n_vars=None)
-
-# Fit the transformer to the training data
-entropy_binner.fit_transform(X_train, y_train)
-
 part_train, num_vars, important_features, part_cv = add_entropy_binning(part_train,
-                                each_target, saved_num_vars,
-                                saved_important_features, part_cv,
-                                modeltype, entropy_binning=False,
-                                verbose=verbose)
+                                each_target, saved_num_vars,
+                                saved_important_features, part_cv,
+                                modeltype, entropy_binning=False,verbose=verbose)
 #### In saved_num_vars we send in all the continuous_vars but we bin only the top few vars.
 ### Those that are binned are removed from saved_num_vars and the remaining become num_vars
 ### Our job is to find the names of those original numeric variables which were binned.
 ### orig_num_vars contains original num vars. num_vars contains binned versions of those vars.
 ### Those binned variables have now become categorical vars and must be added to imp_cats.
 ### you get the name of the original vars which were binned here in this orig_num_vars variable!
+orig_num_vars = left_subtract(saved_num_vars,num_vars)
 #### you need to know the name of the binner variables. This is where you get it!
-binned_num_vars = left_subtract(num_vars, saved_num_vars)
+binned_num_vars = left_subtract(num_vars,saved_num_vars)
 imp_cats += binned_num_vars
 #### Also note that important_features does not contain orig_num_vars which have been erased.
 else:
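For readers tracing the bookkeeping in the comments above: `left_subtract` is a helper defined elsewhere in this package. Judging from its use here, it returns the elements of the first list that are missing from the second; a minimal sketch of that presumed behavior:

```python
def left_subtract(l1, l2):
    """Sketch of the helper's presumed behavior: elements of l1
    that do not appear in l2, keeping l1's order."""
    return [x for x in l1 if x not in l2]

# With the variables above:
#   left_subtract(saved_num_vars, num_vars)  -> originals that were binned away
#   left_subtract(num_vars, saved_num_vars)  -> the new binned (categorical) columns
```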
@@ -2361,8 +2355,8 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 from sklearn.calibration import CalibratedClassifierCV
 model = CalibratedClassifierCV(model, method=method, cv=n_splits)
 if not perform_scaling_flag:
-    X_ful = X_train.append(X_cv)
-    y_ful = y_train.append(y_cv)
+    X_ful = pd.concat([X_train, X_cv], ignore_index=True)
+    y_ful = pd.concat([y_train, y_cv], ignore_index=True)
 else:
     X_ful = np.r_[X_train, X_cv]
     y_ful = np.r_[y_train, y_cv]
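The two `+` lines above are the classic pandas 2.0 migration: `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so concatenation must go through `pd.concat`. A self-contained before/after sketch:

```python
import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'a': [3, 4]})

# pandas < 2.0:  combined = df1.append(df2)           # removed in 2.0
# pandas >= 2.0: pass a list to pd.concat; ignore_index renumbers rows 0..n-1
combined = pd.concat([df1, df2], ignore_index=True)
print(combined)
```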
@@ -2646,7 +2640,7 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 print('Training model on complete Train data and Predicting using given Test Data...')
 ################ I M P O R T A N T: C O M B I N I N G D A T A ######################
 #### This is Second time: we combine train and CV into Train and Test Sets #################
-train = part_train.append(part_cv)
+train = pd.concat([part_train, part_cv], ignore_index=True)
 important_features = [x for x in list(train) if x not in target]
 ############################################################################################
 if model_label == 'Single_Label':
@@ -2665,6 +2659,7 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 except:
     pass
 ########################## BINNING SECOND TIME ###############################
+new_num_vars = train[important_features].select_dtypes(include=[np.float64,np.float32,np.float16]).columns.tolist()
 ## Now we re-use the saved_num_vars which contained a list of num_vars for binning now!
 ###### Once again we do Entropy Binning on the Full Train Data Set !!
 ########################## BINNING SECOND TIME ###############################
@@ -2677,9 +2672,8 @@ def Auto_ViML(train, target, test='', sample_submission='', hyper_param='RS', fe
 #### When we Bin the second time, we set the entropy_binning flag to True so
 #### that all numeric variables that are binned are removed. This way, only bins remain.
 train, num_vars, important_features, test = add_entropy_binning(train, each_target,
-                                saved_num_vars, important_features, test,
-                                modeltype, entropy_binning=True,
-                                verbose=verbose)
+                                saved_num_vars, important_features, test,
+                                modeltype, entropy_binning=True,verbose=verbose)
 #### In saved_num_vars we send in all the continuous_vars but we bin only the top few vars.
 ### Those that are binned are removed from saved_num_vars and the remaining become num_vars
 ### Our job is to find the names of those original numeric variables which were binned.
@@ -4822,10 +4816,12 @@ def training_with_SMOTE(X_df, y_df, target, eval_set, model_input, Boosting_Flag
 ### For classification problems we are going to use SPE from now on
 if modeltype == 'Binary_Classification':
     ### For Binary class, SPE model is better ############
-    spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    #spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    spe = LogisticRegression()
 else:
     ## For multi-class OnevsRest model is better ###########
-    spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    #spe = SelfPacedEnsembleClassifier(estimator=model_copy, n_jobs=-1, soft_resample_flag=False)
+    spe = LogisticRegression()
     spe = OneVsRestClassifier(estimator=spe)
 print('Training Imbalanced model. This will take time...')
 spe.fit(X_df, y_df)
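Since `imbalanced_ensemble` is dropped from the requirements (see requirements.txt below), the `SelfPacedEnsembleClassifier` calls are commented out and a plain `LogisticRegression` stands in, wrapped in `OneVsRestClassifier` on the multi-class branch. A self-contained sketch of the new code path with made-up data:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Made-up multi-class data standing in for X_df / y_df.
X_df, y_df = make_classification(n_samples=200, n_classes=3,
                                 n_informative=4, random_state=0)

# Multi-class branch from the diff above: one binary classifier per class.
spe = LogisticRegression()
spe = OneVsRestClassifier(estimator=spe)
spe.fit(X_df, y_df)
print(spe.predict(X_df[:5]))
```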
@@ -5418,6 +5414,7 @@ def plot_dfplot(dfplot, plot_title=""):
 
 
 ###############################################################################
+import pdb
 def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_test,
                         modeltype, entropy_binning,verbose=0):
     """
@@ -5453,7 +5450,8 @@ def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_test
 ### This is an Awesome Entropy Based Binning Method for Continuous Variables ###########
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 if modeltype == 'Regression':
-    clf = DecisionTreeRegressor(criterion='mse',min_samples_leaf=2,
+    ### default is 'mse' in old versions and 'squared_error' in new versions - hence leave criterion out
+    clf = DecisionTreeRegressor(min_samples_leaf=2,
                                 max_depth=max_depth,
                                 random_state=seed)
 else:
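The dropped `criterion='mse'` keyword is a scikit-learn compatibility fix: the regression criterion was renamed to `'squared_error'` and the `'mse'` alias was removed in scikit-learn 1.2, so omitting the argument lets each version use its own default. Illustrated with hypothetical values for `max_depth` and `seed`:

```python
from sklearn.tree import DecisionTreeRegressor

max_depth, seed = 5, 99  # hypothetical stand-ins for the surrounding variables

# criterion='mse' raises an error on scikit-learn >= 1.2;
# leaving it out uses the default on both old and new versions.
clf = DecisionTreeRegressor(min_samples_leaf=2,
                            max_depth=max_depth,
                            random_state=seed)
```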

autoviml/Transform_KM_Features.py

Lines changed: 4 additions & 4 deletions
@@ -134,11 +134,11 @@ def Transform_KM_Features(training_data, training_labels, test_data, km_max=0):
 except:
     target_range = 5.0
 kmf = KMeansFeaturizer(k=k_max, target_scale=target_range, random_state=seed)
-kmf_hint = kmf.fit(training_data, training_labels)
+kmf_hint = kmf.fit(training_data.values, training_labels.values)
 ### Just return it with the cluster column => no need to return the data frame ###
-training_cluster_features = kmf_hint.transform(training_data)
-test_cluster_features = kmf_hint.transform(test_data)
-npx = np.c_[training_data, training_labels.values]
+training_cluster_features = kmf_hint.transform(training_data.values)
+test_cluster_features = kmf_hint.transform(test_data.values)
+npx = np.c_[training_data.values, training_labels.values]
 training_with_cluster = np.c_[npx, training_cluster_features]
 test_with_cluster = np.c_[test_data, test_cluster_features]
 ### We are going to just return the cluster values ######
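The `.values` additions above feed plain NumPy arrays into the featurizer instead of DataFrames, which avoids the feature-name consistency checks newer scikit-learn-style estimators apply when `fit` and `transform` see differently labeled inputs. The same pattern in isolation, using scikit-learn's `KMeans` as a stand-in for this module's `KMeansFeaturizer`:

```python
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans  # stand-in for KMeansFeaturizer

training_data = pd.DataFrame({'x1': [0., 1., 8., 9.],
                              'x2': [0., 1., 8., 9.]})

km = KMeans(n_clusters=2, n_init=10, random_state=99)
km.fit(training_data.values)                  # fit on the raw array ...
clusters = km.predict(training_data.values)   # ... and predict the same way
npx = np.c_[training_data.values, clusters]   # cluster ids become a new column
```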

autoviml/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 __author__ = "Ram Seshadri"
 __description__ = "Automatically Build Multiple Interpretable ML Models in Single Line of Code"
 __url__ = "https://github.com/AutoViML/Auto_ViML.git"
-__version__ = "0.1.800"
+__version__ = "0.2.0"
 __nlp_version__ = "0.1.01"
 __license__ = "Apache License 2.0"
 __copyright__ = "2020-21 Google"

autoviml/sulov_method.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ def remove_highly_correlated_vars_fast(df, corr_limit=0.70):
 cor_matrix = df.corr().abs().astype(np.float16)
 # Selecting upper triangle of correlation matrix
 upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),
-                             k=1).astype(np.bool))
+                             k=1).astype(bool))
 # Finding index of feature columns with correlation greater than corr_limit
 to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > corr_limit)]
 print()
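`np.bool` was simply an alias for the built-in `bool`; NumPy deprecated it in 1.20 and removed it in 1.24, hence the one-character fix above. The surrounding logic, runnable end to end on toy data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1., 2., 3., 4.],
                   'b': [1., 2., 3., 4.1],   # nearly duplicates 'a'
                   'c': [4., 1., 3., 2.]})
corr_limit = 0.70

cor_matrix = df.corr().abs()
# Keep the strict upper triangle so each pair is considered once.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper_tri.columns if any(upper_tri[col] > corr_limit)]
print(to_drop)  # ['b'] - the later column of each highly correlated pair
```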

requirements.txt

Lines changed: 5 additions & 5 deletions
@@ -1,6 +1,7 @@
-numpy>=1.24.0
-pandas>=1.1.3, <2.0
+numpy<2
+pandas>=2.0
 ipython
+scipy<=1.11.4
 matplotlib>3.7.4
 beautifulsoup4
 emoji
@@ -11,10 +12,9 @@ catboost
 textblob
 nltk
 regex
-scikit-learn>=0.24,<=1.2.2
-xgboost>=0.82,<1.7
+scikit-learn>=0.24,<=1.5.2
+xgboost>=0.82,<=1.7.6
 vaderSentiment
 imbalanced-learn>=0.10.1
 shap>=0.36.0
-imbalanced_ensemble>=0.2.0
 lightgbm>=3.0.0

setup.py

Lines changed: 8 additions & 7 deletions
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="autoviml",
-    version="0.1.800",
+    version="0.2.0",
     author="Ram Seshadri",
     # author_email="[email protected]",
     description="Automatically Build Variant Interpretable ML models fast - now with CatBoost!",
@@ -15,26 +15,27 @@
     url="https://github.com/AutoViML/Auto_ViML",
     packages=setuptools.find_packages(exclude=("tests",)),
     install_requires=[
-        "numpy>=1.24",
-        "pandas>=1.1.3, <2.0",
+        "numpy<2",
+        "pandas>=2.0",
+        "scipy<=1.11.4",
         "xlrd",
         "matplotlib>3.7.4",
         "beautifulsoup4",
         "emoji",
         "ipython",
         "jupyter",
         "seaborn",
-        "catboost",
+        "catboost>=1.2.7",
         "textblob",
         "nltk",
         "regex",
-        "xgboost>=0.82,<1.7",
+        "xgboost>=0.82,<=1.7.6",
        "vaderSentiment",
         "imbalanced-learn>=0.10.1",
         "shap>=0.36.0",
-        "imbalanced_ensemble>=0.2.0",
-        "scikit-learn>=0.24,<=1.2.2",
+        "scikit-learn>=0.24,<=1.5.2",
         "lightgbm>=3.0.0",
+        "networkx",
     ],
     classifiers=[
         "Programming Language :: Python :: 3",
