diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 35b8a01b1b..ed6c545a19 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -9,6 +9,7 @@ Release Notes * Changes * Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283` * Updated ``graph_prediction_vs_actual_over_time()`` to be compatible with multiseries time series :pr:`4284` + * Updated ``excluded_model_families`` to take in a list of both ``str`` and ``ModelFamily`` data types :pr:`4287` * Unpinned ipywidgets :pr:`4288` * Documentation Changes * Removed erroneous warnings from Data Checks User Guide page and removed ``tqdm`` warning in all notebooks :pr:`4274` diff --git a/docs/source/user_guide/automl.ipynb b/docs/source/user_guide/automl.ipynb index b314f0b002..22a811a888 100644 --- a/docs/source/user_guide/automl.ipynb +++ b/docs/source/user_guide/automl.ipynb @@ -903,8 +903,7 @@ "`DefaultAlgorithm` does this by creating the concept of two modes: `fast` and `long`, where `fast` is a subset of long. The algorithm runs as follows:\n", "\n", "1. Run naive pipelines:\n", - " a. a linear model with the default preprocessing pipeline\n", - " b. a random forest pipeline with the default preprocessing pipeline\n", + " a. a random forest pipeline with the default preprocessing pipeline\n", " \n", "2. Run the same pipelines, this time with feature selection. Subsequent pipelines will use the selected features with a SelectedColumns transformer.\n", "\n", @@ -924,7 +923,7 @@ " a. For each of the previous top 3 estimators, sample 10 parameters from the tuner. Run all 30 in one batch\n", " b. Run ensembling\n", " \n", - "To this end, it is recommended to use the top level `search()` method to run `DefaultAlgorithm`. 
This allows users to specify running search with just the `mode` parameter, where `fast` is recommended for users who want a fast scan at how EvalML pipelines will perform on their problem and where `long` is reserved for a deeper dive into high performing pipelines. If one needs finer control over AutoML parameters, one can also specify `automl_algorithm='default'` using `AutoMLSearch` and it will default to using `fast` mode. However, in this case ensembling will be defined by the `ensembling` flag (if `ensembling=False` the abovementioned ensembling batches will be skipped). Users are welcome to select `max_batches` according to the algorithm above (or other stopping criteria) but should be aware that results may not be optimal if the algorithm does not run for the full length of `fast` mode." + "To this end, it is recommended to use the top level `search()` method to run `DefaultAlgorithm`. This allows users to specify running search with just the `mode` parameter, where `fast` is recommended for users who want a fast scan at how EvalML pipelines will perform on their problem and where `long` is reserved for a deeper dive into high performing pipelines. If one needs finer control over AutoML parameters, one can also specify `automl_algorithm='default'` using `AutoMLSearch` and it will default to using `fast` mode. However, in this case ensembling will be defined by the `ensembling` flag (if `ensembling=False` the above-mentioned ensembling batches will be skipped). Users are welcome to select `max_batches` according to the algorithm above (or other stopping criteria) but should be aware that results may not be optimal if the algorithm does not run for the full length of `fast` mode. Note that the `allowed_model_families` and `excluded_model_families` parameters are only applied to the non-naive batches of the default algorithm. To apply them to all estimators, use the iterative algorithm by specifying `automl_algorithm='iterative'`."
] }, { diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py index bb6f7591b5..d6cb0b9aca 100644 --- a/evalml/automl/automl_algorithm/default_algorithm.py +++ b/evalml/automl/automl_algorithm/default_algorithm.py @@ -79,8 +79,8 @@ class DefaultAlgorithm(AutoMLAlgorithm): Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" allowed_model_families (list(str, ModelFamily)): The model families to search. The default of None searches over all model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary` - to `multiclass` or `regression` depending on the problem type. - excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches. + to `multiclass` or `regression` depending on the problem type. For default algorithm, this only applies to estimators in the non-naive batches. + excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches. """ def __init__( diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index cd09d679db..2cb9281fd6 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -43,7 +43,7 @@ class IterativeAlgorithm(AutoMLAlgorithm): model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary` to `multiclass` or `regression` depending on the problem type. Note that if allowed_pipelines is provided, this parameter will be ignored. 
- excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. + excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the estimators used when building pipelines. allowed_component_graphs (dict): A dictionary of lists or ComponentGraphs indicating the component graphs allowed in the search. The format should follow { "Name_0": [list_of_components], "Name_1": [ComponentGraph(...)] } diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index b3bdc0bbfe..c32e640222 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -359,7 +359,7 @@ class AutoMLSearch: allowed_model_families (list(str, ModelFamily)): The model families to search. The default of None searches over all model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary` to `multiclass` or `regression` depending on the problem type. Note that if allowed_pipelines is provided, - this parameter will be ignored. + this parameter will be ignored. For default algorithm, this only applies to estimators in the non-naive batches. features (list)[FeatureBase]: List of features to run DFS on AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the search input @@ -442,7 +442,7 @@ class AutoMLSearch: exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by search. Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" - excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches. 
+ excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches. holdout_set_size (float): The size of the holdout set that AutoML search will take for datasets larger than 500 rows. If set to 0, holdout set will not be taken regardless of number of rows. Must be between 0 and 1, exclusive. Defaults to 0.1. @@ -864,9 +864,12 @@ def _is_imbalanced(X, y, problem_type): raise ValueError( "`excluded_model_families` must be passed in the form of a list.", ) - if not all(isinstance(x, ModelFamily) for x in excluded_model_families): + if not all( + isinstance(x, ModelFamily) or isinstance(x, str) + for x in excluded_model_families + ): raise ValueError( - "All values in `excluded_model_families` must be of type `ModelFamily`.", + "All values in `excluded_model_families` must be of type `ModelFamily` or `str`.", ) self.excluded_model_families = excluded_model_families diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index b8dcb833c2..d8c016531f 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -67,8 +67,8 @@ def get_estimators( Args: problem_type (ProblemTypes or str): Problem type to filter for. - model_families (list[ModelFamily] or list[str]): Model families to filter for. - excluded_model_families (list[ModelFamily]): A list of model families to exclude from the results. + model_families (list(str, ModelFamily)): Model families to filter for. + excluded_model_families (list(str, ModelFamily)): A list of model families to exclude from the results. Returns: list[class]: A list of estimator subclasses. 
diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index e1ce9e7f26..d26ad9ba02 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -70,6 +70,7 @@ ARIMARegressor, DateTimeFeaturizer, EmailFeaturizer, + LinearRegressor, NaturalLanguageFeaturizer, RandomForestClassifier, SelectColumns, @@ -5356,7 +5357,7 @@ def test_excluded_model_families( y_train=y, problem_type=ProblemTypes.BINARY, automl_algorithm=automl_algorithm, - excluded_model_families=[ModelFamily.RANDOM_FOREST], + excluded_model_families=[ModelFamily.RANDOM_FOREST, "linear_model"], ) env = AutoMLTestEnv(ProblemTypes.BINARY) @@ -5377,6 +5378,7 @@ def test_excluded_model_families( assert SelectColumns.name not in pl.component_graph.compute_order else: assert RandomForestClassifier.name not in pl.component_graph.compute_order + assert LinearRegressor.name not in pl.component_graph.compute_order def test_excluded_model_families_error( @@ -5396,9 +5398,7 @@ def test_excluded_model_families_error( excluded_model_families=ModelFamily.RANDOM_FOREST, ) - match_text = ( - "All values in `excluded_model_families` must be of type `ModelFamily`." - ) + match_text = "All values in `excluded_model_families` must be of type `ModelFamily` or `str`." with pytest.raises( ValueError, match=match_text, @@ -5407,7 +5407,7 @@ def test_excluded_model_families_error( X_train=X, y_train=y, problem_type=ProblemTypes.BINARY, - excluded_model_families=[ModelFamily.RANDOM_FOREST, "XGBoost"], + excluded_model_families=[ModelFamily.RANDOM_FOREST, "XGBoost", 0], )