Skip to content

Commit ed8669e

Browse files
committed
Merge branch 'main' into doc_build, fix library import conflicts
2 parents 2afe0c9 + 9ca7800 commit ed8669e

File tree

9 files changed

+151
-15
lines changed

9 files changed

+151
-15
lines changed

ImputerExperiments/data/r/.DS_Store

6 KB
Binary file not shown.

docs/requirements_docs.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ mkdocs-jupyter==0.25.0
66
mkdocs-material==9.5.35
77
mkdocstrings==0.26.1
88
mkdocstrings-python==1.11.1
9-
nbconvert==7.16.4
9+
nbconvert==7.16.5

setup.py

-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#TODO update this
44
from setuptools import setup, find_packages
55

6-
76
def calculate_version():
87
initpy = open('tpot/_version.py').read().split('\n')
98
version = list(filter(lambda x: '__version__' in x, initpy))[0].split('\'')[1]

tpot/_version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@
3232
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
3333
3434
"""
35-
__version__ = '1.0.0'
35+
__version__ = '0.1.9a0'

tpot/builtin_modules/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .feature_set_selector import FeatureSetSelector
22
from .zero_count import ZeroCount
3-
from .column_one_hot_encoder import ColumnOneHotEncoder
3+
from .column_one_hot_encoder import ColumnOneHotEncoder, ColumnOrdinalEncoder
44
from .arithmetictransformer import ArithmeticTransformer
55
from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
66
from .passthrough import Passthrough, SkipTransformer

tpot/builtin_modules/column_one_hot_encoder.py

+136-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737

3838
from sklearn.base import BaseEstimator, TransformerMixin
3939
from sklearn.utils import check_array
40-
from sklearn.preprocessing import OneHotEncoder
40+
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
4141
import sklearn
4242

4343
import pandas as pd
@@ -203,3 +203,138 @@ def transform(self, X):
203203
return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1)
204204
else:
205205
return np.hstack((X_not_sel, X_sel))
206+
207+
class ColumnOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal-encode a subset of columns and pass the remaining columns through.

    The selected columns are encoded with ``sklearn.preprocessing.OrdinalEncoder``.
    When only some columns are selected, the output places the non-selected
    columns first, followed by the encoded columns.
    """

    def __init__(self, columns='auto', handle_unknown='error', unknown_value = -1, encoded_missing_value = np.nan, min_frequency=None,max_categories=None):
        '''

        Parameters
        ----------

        columns : str, list, default='auto'
            Determines which columns to ordinal encode with sklearn.preprocessing.OrdinalEncoder.
            - 'auto' : Columns chosen by ``auto_select_categorical_features`` (defined elsewhere in this module)
            - 'categorical' : All non-numeric dtype columns (pandas DataFrame input only)
            - 'numeric' : All numeric dtype columns (pandas DataFrame input only)
            - 'all' : Select all features
            - list : A list of columns to select

        handle_unknown, encoded_missing_value, min_frequency, max_categories : see sklearn.preprocessing.OrdinalEncoder

        unknown_value : int or np.nan, default=-1
            Forwarded to sklearn.preprocessing.OrdinalEncoder only when
            ``handle_unknown == 'use_encoded_value'``; ignored otherwise,
            because sklearn rejects a non-None value in any other mode.

        '''

        self.columns = columns
        self.handle_unknown = handle_unknown
        self.unknown_value = unknown_value
        self.encoded_missing_value = encoded_missing_value
        self.min_frequency = min_frequency
        self.max_categories = max_categories

    def fit(self, X, y=None):
        """Resolve the columns to encode and fit an OrdinalEncoder on them.

        Parameters
        ----------
        X : pandas DataFrame or array-like, shape=(n_samples, n_features)
            Training data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``columns`` is 'categorical' or 'numeric' and X is not a
            DataFrame, or if ``columns`` is not a recognised value.
        """

        if (self.columns == "categorical" or self.columns == "numeric") and not isinstance(X, pd.DataFrame):
            raise ValueError(f"Invalid value for columns: {self.columns}. "
                             "Only 'all' or <list> is supported for np arrays")

        if self.columns == "categorical":
            self.columns_ = list(X.select_dtypes(exclude='number').columns)
        elif self.columns == "numeric":
            self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])]
        elif self.columns == "auto":
            self.columns_ = auto_select_categorical_features(X)
        elif self.columns == "all":
            if isinstance(X, pd.DataFrame):
                self.columns_ = X.columns
            else:
                self.columns_ = list(range(X.shape[1]))
        elif isinstance(self.columns, list):
            self.columns_ = self.columns
        else:
            raise ValueError(f"Invalid value for columns: {self.columns}")

        # Nothing to encode: transform() will return X untouched.
        if len(self.columns_) == 0:
            return self

        # sklearn's OrdinalEncoder raises a TypeError when unknown_value is set
        # while handle_unknown is anything other than 'use_encoded_value'.
        # This class defaults to handle_unknown='error' with unknown_value=-1,
        # so the value must only be forwarded in the mode that accepts it.
        if self.handle_unknown == 'use_encoded_value':
            unknown_value = self.unknown_value
        else:
            unknown_value = None

        self.enc = sklearn.preprocessing.OrdinalEncoder(categories='auto',
                                                        handle_unknown = self.handle_unknown,
                                                        unknown_value = unknown_value,
                                                        encoded_missing_value = self.encoded_missing_value,
                                                        min_frequency = self.min_frequency,
                                                        max_categories = self.max_categories)
        #TODO make this more consistent with sklearn baseimputer/baseencoder

        if len(self.columns_) == X.shape[1]:
            # Every column is selected, so fit on X directly.
            self.enc.fit(X)
        else:
            X_sel, _ = _X_selected(X, self.columns_)
            self.enc.fit(X_sel)

        return self

    def transform(self, X):
        """Ordinal-encode the columns selected during fit.

        Parameters
        ----------
        X : pandas DataFrame or array-like, shape=(n_samples, n_features)
            Data to transform.

        Returns
        -------
        X_out : pandas DataFrame or array
            - X unchanged if no columns were selected during fit.
            - The fitted encoder's output if every column was selected.
            - Otherwise the non-selected columns followed by the encoded
              columns: a DataFrame (with a reset index) when X is a
              DataFrame, else the result of ``np.hstack``.
        """

        if len(self.columns_) == 0:
            return X

        #TODO make this more consistent with sklearn baseimputer/baseencoder

        if len(self.columns_) == X.shape[1]:
            return self.enc.transform(X)
        else:

            X_sel, X_not_sel= _X_selected(X, self.columns_)
            X_sel = self.enc.transform(X_sel)

            #If X is dataframe
            if isinstance(X, pd.DataFrame):

                X_sel = pd.DataFrame(X_sel, columns=self.enc.get_feature_names_out())
                return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1)
            else:
                return np.hstack((X_not_sel, X_sel))

tpot/config/get_configspace.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
from tpot.builtin_modules import genetic_encoders, feature_encoding_frequency_selector
6161
from tpot.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
6262
from tpot.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder
63-
from tpot.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer
63+
from tpot.builtin_modules import ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, PassKBinsDiscretizer
6464
from tpot.builtin_modules import Passthrough, SkipTransformer
6565
from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression
6666
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
@@ -86,7 +86,7 @@
8686
import sklearn.calibration
8787

8888

89-
all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
89+
all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
9090
AdaBoostClassifier,MLPRegressor,
9191
GaussianProcessRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor,
9292
AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer,
@@ -155,19 +155,17 @@
155155
"selectors": ["SelectFwe", "SelectPercentile", "VarianceThreshold",],
156156
"selectors_classification": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_classification", "SelectFromModel_classification"],
157157
"selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"],
158+
158159
"classifiers" : ["LGBMClassifier", "BaggingClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', 'MLPClassifier', 'MultinomialNB', "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'],
159160
"regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'XGBRegressor'],
160-
161-
162-
"transformers": ["KBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
161+
"transformers": ["KBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer", "ColumnOneHotEncoder", "ColumnOrdinalEncoder"],
162+
163163
"scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ],
164164
"all_transformers" : ["transformers", "scalers"],
165-
166165
"arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"],
167166
"imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"],
168167
"skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"],
169168
"genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],
170-
171169
"classifiers_sklearnex" : ["RandomForestClassifier_sklearnex", "LogisticRegression_sklearnex", "KNeighborsClassifier_sklearnex", "SVC_sklearnex","NuSVC_sklearnex"],
172170
"regressors_sklearnex" : ["LinearRegression_sklearnex", "Ridge_sklearnex", "Lasso_sklearnex", "ElasticNet_sklearnex", "SVR_sklearnex", "NuSVR_sklearnex", "RandomForestRegressor_sklearnex", "KNeighborsRegressor_sklearnex"],
173171
"genetic encoders" : ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],
@@ -352,8 +350,6 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
352350
return transformers.get_QuantileTransformer_configspace(n_samples=n_samples, random_state=random_state)
353351
case "RobustScaler":
354352
return transformers.RobustScaler_configspace
355-
case "ColumnOneHotEncoder":
356-
return {}
357353
case "MaxAbsScaler":
358354
return {}
359355
case "PolynomialFeatures":
@@ -364,6 +360,10 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
364360
return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)
365361
case "KBinsDiscretizer":
366362
return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)
363+
case "ColumnOneHotEncoder":
364+
return {}
365+
case "ColumnOrdinalEncoder":
366+
return {}
367367

368368
#selectors.py
369369
case "SelectFwe":

tpot/config/imputers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
# and can cause errors. gk
5252
}
5353
)
54-
54+
#test
5555
def get_IterativeImputer_config_space(n_features, random_state):
5656
space = { 'initial_strategy' : Categorical('initial_strategy',
5757
['mean', 'median',

tpot/config/transformers.py

+2
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@
6262

6363
OneHotEncoder_configspace = {} #TODO include the parameter for max unique values
6464

65+
OrdinalEncoder_configspace = {} #TODO include the parameter for max unique values
66+
6567
def get_FastICA_configspace(n_features=100, random_state=None):
6668

6769
space = {

0 commit comments

Comments
 (0)