diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index abf16c90da..00b9a0b5b9 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -10,6 +10,7 @@ Changelog * Added access to parameters in Pipelines with `PipelineBase.parameters` (used to be return of `PipelineBase.describe`) :pr:`501` * Added `fill_value` parameter for SimpleImputer :pr:`509` * Added functionality to override component hyperparameters and made pipelines take hyperparameters from components :pr:`516` + * Allow numpy.random.RandomState for random_state parameters :pr:`556` * Fixes * Changes * Undo version cap in XGBoost placed in :pr:`402` and allowed all releases of XGBoost :pr:`407` diff --git a/docs/source/pipelines/custom_pipelines.ipynb b/docs/source/pipelines/custom_pipelines.ipynb index c5515de03b..20e1caeb96 100644 --- a/docs/source/pipelines/custom_pipelines.ipynb +++ b/docs/source/pipelines/custom_pipelines.ipynb @@ -46,11 +46,10 @@ " 'Logistic Regression Classifier':{\n", " 'penalty':'l2',\n", " 'C':5,\n", - " 'random_state':3\n", " }\n", "}\n", "\n", - "pipeline = CustomPipeline(parameters={}, objective=objective)" + "pipeline = CustomPipeline(parameters={}, objective=objective, random_state=3)" ] }, { diff --git a/docs/source/pipelines/overview.ipynb b/docs/source/pipelines/overview.ipynb index 6ef6758cd4..85d9f2c31c 100644 --- a/docs/source/pipelines/overview.ipynb +++ b/docs/source/pipelines/overview.ipynb @@ -50,11 +50,10 @@ " \"eta\": 0.5,\n", " \"min_child_weight\": 5,\n", " \"max_depth\": 10,\n", - " \"random_state\":5\n", " }\n", " }\n", "\n", - "xgp = XGBoostPipeline(objective='recall', parameters=parameters)\n", + "xgp = XGBoostPipeline(objective='recall', parameters=parameters, random_state=5)\n", "xgp.graph()" ] }, diff --git a/evalml/automl/auto_base.py b/evalml/automl/auto_base.py index 5757493fe2..f285c593b1 100644 --- a/evalml/automl/auto_base.py +++ b/evalml/automl/auto_base.py @@ -1,5 +1,4 @@ import inspect -import random import time from collections import OrderedDict from sys import stdout @@ -16,7 +15,7 @@ from evalml.pipelines.components import handle_component from evalml.problem_types import ProblemTypes from evalml.tuners import SKOptTuner -from evalml.utils import Logger, convert_to_seconds +from evalml.utils import Logger, convert_to_seconds, get_random_state logger = Logger() @@ -78,10 +77,7 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time, 'search_order': [] } self.trained_pipelines = {} - - self.random_state = random_state - random.seed(self.random_state) - np.random.seed(seed=self.random_state) + self.random_state = get_random_state(random_state) self.n_jobs = n_jobs self.possible_model_families = list(set([p.model_family for p in self.possible_pipelines])) @@ -90,7 +86,7 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time, self.search_spaces = {} for p in self.possible_pipelines: space = list(p.hyperparameters.items()) - self.tuners[p.name] = tuner([s[1] for s in space], random_state=random_state) + self.tuners[p.name] = tuner([s[1] for s in space], random_state=self.random_state) self.search_spaces[p.name] = [s[0] for s in space] self.additional_objectives = additional_objectives self._MAX_NAME_LEN = 40 @@ -110,8 +106,8 @@ def search(self, X, y, feature_types=None, raise_errors=False, show_iteration_pl y (pd.Series): the target training labels of length [n_samples] - feature_types (list, optional): list of feature types. either numeric of categorical.
- categorical features will automatically be encoded + feature_types (list, optional): list of feature types, either numerical or categorical. + Categorical features will automatically be encoded raise_errors (boolean): If true, raise errors and exit search if a pipeline errors during fitting @@ -245,8 +241,6 @@ def _transform_parameters(self, pipeline_class, parameters, number_features): component_class = component.__class__ # Inspects each component and adds the following parameters when needed - if 'random_state' in inspect.signature(component_class.__init__).parameters: - component_parameters['random_state'] = self.random_state if 'n_jobs' in inspect.signature(component_class.__init__).parameters: component_parameters['n_jobs'] = self.n_jobs if 'number_features' in inspect.signature(component_class.__init__).parameters: @@ -325,7 +319,7 @@ def _do_iteration(self, X, y, pbar, raise_errors): print('') def _select_pipeline(self): - return random.choice(self.possible_pipelines) + return self.random_state.choice(self.possible_pipelines) def _propose_parameters(self, pipeline_class): values = self.tuners[pipeline_class.name].propose() diff --git a/evalml/automl/auto_classification_search.py b/evalml/automl/auto_classification_search.py index 3f2375b394..3f1d617542 100644 --- a/evalml/automl/auto_classification_search.py +++ b/evalml/automl/auto_classification_search.py @@ -67,7 +67,7 @@ def __init__(self, additional_objectives (list): Custom set of objectives to score on. Will override default objectives for problem type if not empty. - random_state (int): the random_state + random_state (int, np.random.RandomState): The random seed/state. Defaults to 0. n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. diff --git a/evalml/automl/auto_regression_search.py b/evalml/automl/auto_regression_search.py index 039be1362f..05501b2044 100644 --- a/evalml/automl/auto_regression_search.py +++ b/evalml/automl/auto_regression_search.py @@ -64,7 +64,7 @@ def __init__(self, additional_objectives (list): Custom set of objectives to score on. Will override default objectives for problem type if not empty. - random_state (int): the random_state + random_state (int, np.random.RandomState): The random seed/state. Defaults to 0. n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. 
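The hunks above are the core of this PR: `AutoBase` now funnels `random_state` through `get_random_state`, so AutoML search, tuners, pipelines, and components all accept either an int seed or a `numpy.random.RandomState` instance. A minimal usage sketch (illustrative, not part of the diff; it assumes the top-level `AutoClassificationSearch` import used by the tests below):

import numpy as np
from evalml import AutoClassificationSearch

# Both forms are now valid; ints are normalized to a RandomState internally.
automl_from_seed = AutoClassificationSearch(max_pipelines=3, random_state=42)
automl_from_state = AutoClassificationSearch(max_pipelines=3, random_state=np.random.RandomState(42))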
diff --git a/evalml/pipelines/classification/catboost.py b/evalml/pipelines/classification/catboost.py index 5f8c3d9547..afa094cfd1 100644 --- a/evalml/pipelines/classification/catboost.py +++ b/evalml/pipelines/classification/catboost.py @@ -15,8 +15,9 @@ class CatBoostClassificationPipeline(PipelineBase): "impute_strategy": ["most_frequent"], } - def __init__(self, parameters, objective): + def __init__(self, parameters, objective, random_state=0): # note: impute_strategy must support both string and numeric data super().__init__(parameters=parameters, - objective=objective) + objective=objective, + random_state=random_state) diff --git a/evalml/pipelines/classification/xgboost.py b/evalml/pipelines/classification/xgboost.py index 0b64c7d7a1..34d08f94e8 100644 --- a/evalml/pipelines/classification/xgboost.py +++ b/evalml/pipelines/classification/xgboost.py @@ -8,6 +8,7 @@ class XGBoostPipeline(PipelineBase): component_graph = ['One Hot Encoder', 'Simple Imputer', 'RF Classifier Select From Model', 'XGBoost Classifier'] supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] - def __init__(self, parameters, objective): + def __init__(self, parameters, objective, random_state=0): super().__init__(parameters=parameters, - objective=objective) + objective=objective, + random_state=random_state) diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index b3e737f20b..d9c27f9a7d 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -1,14 +1,14 @@ from abc import ABC, abstractmethod from evalml.exceptions import MethodPropertyNotFoundError -from evalml.utils import Logger +from evalml.utils import Logger, get_random_state logger = Logger() class ComponentBase(ABC): def __init__(self, parameters, component_obj, random_state): - self.random_state = random_state + self.random_state = get_random_state(random_state) self._component_obj = component_obj self.parameters = parameters diff --git a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py index 0215df2cfb..82065005b1 100644 --- a/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/catboost_classifier.py @@ -6,7 +6,7 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes -from evalml.utils import import_or_raise +from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise class CatBoostClassifier(Estimator): @@ -26,6 +26,7 @@ class CatBoostClassifier(Estimator): supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0): + random_seed = get_random_seed(random_state, 0, SEED_BOUNDS.max_bound) parameters = {"n_estimators": n_estimators, "eta": eta, "max_depth": max_depth} @@ -36,6 +37,7 @@ def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None catboost = import_or_raise("catboost", error_msg=cb_error_msg) self._label_encoder = None cb_classifier = catboost.CatBoostClassifier(**parameters, + random_seed=random_seed, silent=True, allow_writing_files=False) super().__init__(parameters=parameters, diff --git a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py 
b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py index 776a43ffe1..0753715e0a 100644 --- a/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py @@ -3,7 +3,7 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes -from evalml.utils import import_or_raise +from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise class XGBoostClassifier(Estimator): @@ -19,13 +19,14 @@ class XGBoostClassifier(Estimator): supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS] def __init__(self, eta=0.1, max_depth=3, min_child_weight=1, n_estimators=100, random_state=0): + random_seed = get_random_seed(random_state, SEED_BOUNDS.min_bound, SEED_BOUNDS.max_bound) parameters = {"eta": eta, "max_depth": max_depth, "min_child_weight": min_child_weight, "n_estimators": n_estimators} xgb_error_msg = "XGBoost is not installed. Please install using `pip install xgboost.`" xgb = import_or_raise("xgboost", error_msg=xgb_error_msg) - xgb_classifier = xgb.XGBClassifier(random_state=random_state, + xgb_classifier = xgb.XGBClassifier(random_state=random_seed, eta=eta, max_depth=max_depth, n_estimators=n_estimators, diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py index eb03d83f19..0e96028ba9 100644 --- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py @@ -3,7 +3,7 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components.estimators import Estimator from evalml.problem_types import ProblemTypes -from evalml.utils import import_or_raise +from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise class CatBoostRegressor(Estimator): @@ -23,6 +23,7 @@ class CatBoostRegressor(Estimator): supported_problem_types = [ProblemTypes.REGRESSION] def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0): + random_seed = get_random_seed(random_state, 0, SEED_BOUNDS.max_bound) parameters = {"n_estimators": n_estimators, "eta": eta, "max_depth": max_depth} @@ -32,7 +33,7 @@ def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None cb_error_msg = "catboost is not installed. 
Please install using `pip install catboost.`" catboost = import_or_raise("catboost", error_msg=cb_error_msg) cb_regressor = catboost.CatBoostRegressor(**parameters, - random_state=random_state, + random_seed=random_seed, silent=True, allow_writing_files=False) super().__init__(parameters=parameters, diff --git a/evalml/pipelines/components/estimators/regressors/linear_regressor.py b/evalml/pipelines/components/estimators/regressors/linear_regressor.py index ae8b455e23..822555861f 100644 --- a/evalml/pipelines/components/estimators/regressors/linear_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/linear_regressor.py @@ -15,7 +15,7 @@ class LinearRegressor(Estimator): model_family = ModelFamily.LINEAR_MODEL supported_problem_types = [ProblemTypes.REGRESSION] - def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1): + def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1, random_state=0): parameters = { 'fit_intercept': fit_intercept, 'normalize': normalize @@ -25,7 +25,7 @@ def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1): n_jobs=n_jobs) super().__init__(parameters=parameters, component_obj=linear_regressor, - random_state=0) + random_state=random_state) @property def feature_importances(self): diff --git a/evalml/pipelines/components/transformers/imputers/simple_imputer.py b/evalml/pipelines/components/transformers/imputers/simple_imputer.py index 253121fe0e..1b0be200bf 100644 --- a/evalml/pipelines/components/transformers/imputers/simple_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/simple_imputer.py @@ -9,7 +9,7 @@ class SimpleImputer(Transformer): name = 'Simple Imputer' hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]} - def __init__(self, impute_strategy="most_frequent", fill_value=None): + def __init__(self, impute_strategy="most_frequent", fill_value=None, random_state=0): """Initializes a transformer that imputes missing data according to the specified imputation strategy."
Arguments: @@ -24,7 +24,7 @@ def __init__(self, impute_strategy="most_frequent", fill_value=None): fill_value=fill_value) super().__init__(parameters=parameters, component_obj=imputer, - random_state=0) + random_state=random_state) def transform(self, X, y=None): """Transforms data X by imputing missing values diff --git a/evalml/pipelines/components/transformers/scalers/standard_scaler.py b/evalml/pipelines/components/transformers/scalers/standard_scaler.py index 50d8247413..d5c1c5d534 100644 --- a/evalml/pipelines/components/transformers/scalers/standard_scaler.py +++ b/evalml/pipelines/components/transformers/scalers/standard_scaler.py @@ -8,9 +8,9 @@ class StandardScaler(Transformer): name = "Standard Scaler" hyperparameter_ranges = {} - def __init__(self): + def __init__(self, random_state=0): parameters = {} scaler = SkScaler() super().__init__(parameters=parameters, component_obj=scaler, - random_state=0) + random_state=random_state) diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index ee0f10f3fe..4adfde963f 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -42,8 +42,6 @@ def _components_dict(): if params.defaults: if len(params.args) - 1 == len(params.defaults): components[obj.name] = obj - elif len(params.args) == 1: - components[obj.name] = obj return components diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py index 3a81ffb610..3886d61862 100644 --- a/evalml/pipelines/pipeline_base.py +++ b/evalml/pipelines/pipeline_base.py @@ -13,7 +13,7 @@ from evalml.exceptions import IllFormattedClassNameError from evalml.objectives import get_objective from evalml.problem_types import handle_problem_types -from evalml.utils import Logger, classproperty +from evalml.utils import Logger, classproperty, get_random_state logger = Logger() @@ -45,7 +45,7 @@ def supported_problem_types(cls): custom_hyperparameters = None - def __init__(self, parameters, objective): + def __init__(self, parameters, objective, random_state=0): """Machine learning pipeline made out of transformers and an estimator. Required Class Variables: @@ -58,7 +58,9 @@ parameters (dict): dictionary with component names as keys and dictionary of that component's parameters as values. An empty dictionary {} implies using all default values for component parameters. + random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
""" + self.random_state = get_random_state(random_state) self.component_graph = [self._instantiate_component(c, parameters) for c in self.component_graph] self.supported_problem_types = [handle_problem_types(problem_type) for problem_type in self.supported_problem_types] self.objective = get_objective(objective) @@ -125,7 +127,7 @@ def _instantiate_component(self, component, parameters): component_name = component.name try: component_parameters = parameters.get(component_name, {}) - new_component = component_class(**component_parameters) + new_component = component_class(**component_parameters, random_state=self.random_state) except (ValueError, TypeError) as e: err = "Error received when instantiating component {} with the following arguments {}".format(component_name, component_parameters) raise ValueError(err) from e diff --git a/evalml/pipelines/regression/catboost.py b/evalml/pipelines/regression/catboost.py index af5419eb25..c8c72c2f89 100644 --- a/evalml/pipelines/regression/catboost.py +++ b/evalml/pipelines/regression/catboost.py @@ -16,6 +16,7 @@ class CatBoostRegressionPipeline(PipelineBase): "impute_strategy": ["most_frequent"], } - def __init__(self, parameters, objective): + def __init__(self, parameters, objective, random_state=0): super().__init__(parameters=parameters, - objective=objective) + objective=objective, + random_state=random_state) diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 5486fbf0d4..6bfbc7397b 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -40,12 +40,12 @@ def load_data(path, index, label, n_rows=None, drop=None, verbose=True, **kwargs def split_data(X, y, regression=False, test_size=.2, random_state=None): """Splits data into train and test sets. 
- Args: + Arguments: X (pd.DataFrame or np.array) : data of shape [n_samples, n_features] y (pd.Series) : labels of length [n_samples] regression (bool): if true, do not use stratified split test_size (float) : percent of train set to holdout for testing - random_state (int) : seed for the random number generator + random_state (int, np.random.RandomState) : seed for the random number generator Returns: pd.DataFrame, pd.DataFrame, pd.Series, pd.Series : features and labels each split into train and test sets diff --git a/evalml/tests/automl_tests/test_auto_classification_search.py b/evalml/tests/automl_tests/test_auto_classification_search.py index 4d329ef683..e4e30967e5 100644 --- a/evalml/tests/automl_tests/test_auto_classification_search.py +++ b/evalml/tests/automl_tests/test_auto_classification_search.py @@ -47,6 +47,14 @@ def test_init(X_y): automl.describe_pipeline(0) +def test_get_pipeline_none(X_y): + X, y = X_y + + automl = AutoClassificationSearch() + with pytest.raises(RuntimeError, match="Pipeline not found"): + automl.describe_pipeline(0) + + def test_cv(X_y): X, y = X_y cv_folds = 5 @@ -386,3 +394,11 @@ def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display, X_y): assert y.is_monotonic_increasing assert len(x) == 3 assert len(y) == 3 + + +def test_max_time(X_y): + X, y = X_y + clf = AutoClassificationSearch(max_time=1e-16) + clf.search(X, y) + # search will always run at least one pipeline + assert len(clf.results['pipeline_results']) == 1 diff --git a/evalml/tests/automl_tests/test_autobase.py b/evalml/tests/automl_tests/test_autobase.py index 6d5c0e1c53..e8fa04f09d 100644 --- a/evalml/tests/automl_tests/test_autobase.py +++ b/evalml/tests/automl_tests/test_autobase.py @@ -1,3 +1,6 @@ +from unittest.mock import patch + +import numpy as np import pytest from sklearn.model_selection import StratifiedKFold @@ -75,8 +78,30 @@ def test_transform_parameters(): parameters = [('penalty', 'l2'), ('C', 8.444214828324364), ('impute_strategy', 'most_frequent')] parameters_dict = { 'Simple Imputer': {'impute_strategy': 'most_frequent'}, - 'One Hot Encoder': {'random_state': 100}, + 'One Hot Encoder': {}, 'Standard Scaler': {}, - 'Logistic Regression Classifier': {'penalty': 'l2', 'C': 8.444214828324364, 'n_jobs': 6, 'random_state': 100} + 'Logistic Regression Classifier': {'penalty': 'l2', 'C': 8.444214828324364, 'n_jobs': 6} } assert automl._transform_parameters(LogisticRegressionPipeline, parameters, 0) == parameters_dict + + +@patch('evalml.pipelines.PipelineBase.fit') +def test_pipeline_fit_raises(mock_fit, X_y): + msg = 'all your model are belong to us' + mock_fit.side_effect = Exception(msg) + X, y = X_y + automl = AutoClassificationSearch(max_pipelines=1) + with pytest.raises(Exception, match=msg): + automl.search(X, y, raise_errors=True) + + automl = AutoClassificationSearch(max_pipelines=1) + automl.search(X, y, raise_errors=False) + pipeline_results = automl.results.get('pipeline_results', {}) + assert len(pipeline_results) == 1 + cv_scores_all = pipeline_results[0].get('cv_data', {}) + for cv_scores in cv_scores_all: + for name, score in cv_scores['all_objective_scores'].items(): + if name in ['# Training', '# Testing']: + assert score > 0 + else: + assert np.isnan(score) diff --git a/evalml/tests/automl_tests/test_pipeline_search_plots.py b/evalml/tests/automl_tests/test_pipeline_search_plots.py index 33954a124e..25d7910be9 100644 --- a/evalml/tests/automl_tests/test_pipeline_search_plots.py +++ b/evalml/tests/automl_tests/test_pipeline_search_plots.py @@ 
-34,10 +34,9 @@ def search(self): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 0.5, - 'random_state': 0 } } - pipeline = LogisticRegressionPipeline(objective=objective, parameters=parameters) + pipeline = LogisticRegressionPipeline(objective=objective, parameters=parameters, random_state=0) cv = StratifiedKFold(n_splits=5, random_state=0) cv_data = [] for train, test in cv.split(X, y): @@ -122,10 +121,9 @@ def search(self): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 0.5, - 'random_state': 0 } } - pipeline = LogisticRegressionPipeline(objective=objective, parameters=parameters) + pipeline = LogisticRegressionPipeline(objective=objective, parameters=parameters, random_state=0) cv = StratifiedKFold(n_splits=5, random_state=0) cv_data = [] for train, test in cv.split(X, y): diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 21dc7da3a1..511da1365d 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -3,6 +3,7 @@ import pytest from evalml.pipelines.components import OneHotEncoder +from evalml.utils import get_random_state def test_fit_first(): @@ -52,17 +53,19 @@ def test_more_top_n_unique_values(): X["col_3"] = ["a", "a", "a", "a", "a", "a", "b"] X["col_4"] = [2, 0, 1, 3, 0, 1, 2] - encoder = OneHotEncoder(random_state=2) + random_seed = 2 + encoder = OneHotEncoder(random_state=random_seed) + test_random_state = get_random_state(random_seed) encoder.parameters['top_n'] = 5 encoder.fit(X) X_t = encoder.transform(X) col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() - col_1_counts = col_1_counts.sample(frac=1, random_state=encoder.random_state) + col_1_counts = col_1_counts.sample(frac=1, random_state=test_random_state) col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort') col_1_samples = col_1_counts.head(encoder.parameters['top_n']).index.tolist() col_2_counts = X["col_2"].value_counts(dropna=False).to_frame() - col_2_counts = col_2_counts.sample(frac=1, random_state=encoder.random_state) + col_2_counts = col_2_counts.sample(frac=1, random_state=test_random_state) col_2_counts = col_2_counts.sort_values(["col_2"], ascending=False, kind='mergesort') col_2_samples = col_2_counts.head(encoder.parameters['top_n']).index.tolist() @@ -83,12 +86,14 @@ def test_more_top_n_unique_values_large(): X["col_3"] = ["a", "a", "a", "b", "b", "b", "c", "c", "d"] X["col_4"] = [2, 0, 1, 3, 0, 1, 2, 4, 1] - encoder = OneHotEncoder() + random_seed = 2 + encoder = OneHotEncoder(random_state=random_seed) + test_random_state = get_random_state(random_seed) encoder.parameters['top_n'] = 3 encoder.fit(X) X_t = encoder.transform(X) col_1_counts = X["col_1"].value_counts(dropna=False).to_frame() - col_1_counts = col_1_counts.sample(frac=1, random_state=encoder.random_state) + col_1_counts = col_1_counts.sample(frac=1, random_state=test_random_state) col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort') col_1_samples = col_1_counts.head(encoder.parameters['top_n']).index.tolist() expected_col_names = set(["col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c", "col_4"]) diff --git a/evalml/tests/objective_tests/test_objectives.py b/evalml/tests/objective_tests/test_objectives.py index 3a6caa43f6..6fdcf9dd95 100644 --- a/evalml/tests/objective_tests/test_objectives.py +++ b/evalml/tests/objective_tests/test_objectives.py @@ -31,11 +31,10 @@ def 
test_binary_average(X_y): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 1.0, - 'random_state': 0 } } - pipeline = LogisticRegressionPipeline(objective=objective, parameters=parameters) + pipeline = LogisticRegressionPipeline(objective=objective, parameters=parameters, random_state=0) pipeline.fit(X, y) y_pred = pipeline.predict(X) diff --git a/evalml/tests/pipeline_tests/test_catboost_classification.py b/evalml/tests/pipeline_tests/test_catboost_classification.py index f83d3aaaa2..445de0ea5c 100644 --- a/evalml/tests/pipeline_tests/test_catboost_classification.py +++ b/evalml/tests/pipeline_tests/test_catboost_classification.py @@ -6,6 +6,7 @@ from evalml.objectives import PrecisionMicro from evalml.pipelines import CatBoostClassificationPipeline +from evalml.utils import SEED_BOUNDS, get_random_seed, get_random_state importorskip('catboost', reason='Skipping test because catboost not installed') @@ -24,17 +25,20 @@ def test_catboost_init(): "max_depth": 3, } } - clf = CatBoostClassificationPipeline(objective=objective, parameters=parameters) + clf = CatBoostClassificationPipeline(objective=objective, parameters=parameters, random_state=2) assert clf.parameters == parameters + assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0]) def test_catboost_multi(X_y_multi): from catboost import CatBoostClassifier as CBClassifier X, y = X_y_multi + random_seed = 42 + catboost_random_seed = get_random_seed(get_random_state(random_seed), min_bound=0, max_bound=SEED_BOUNDS.max_bound) imputer = SimpleImputer(strategy='mean') - estimator = CBClassifier(n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type='Bayesian', allow_writing_files=False, random_state=0) + estimator = CBClassifier(n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type='Bayesian', allow_writing_files=False, random_seed=catboost_random_seed) sk_pipeline = Pipeline([("imputer", imputer), ("estimator", estimator)]) sk_pipeline.fit(X, y) @@ -53,7 +57,7 @@ def test_catboost_multi(X_y_multi): } } - clf = CatBoostClassificationPipeline(objective=objective, parameters=parameters) + clf = CatBoostClassificationPipeline(objective=objective, parameters=parameters, random_state=get_random_state(random_seed)) clf.fit(X, y) clf_score = clf.score(X, y) y_pred = clf.predict(X) diff --git a/evalml/tests/pipeline_tests/test_catboost_regression.py b/evalml/tests/pipeline_tests/test_catboost_regression.py index e148374665..2150c899ec 100644 --- a/evalml/tests/pipeline_tests/test_catboost_regression.py +++ b/evalml/tests/pipeline_tests/test_catboost_regression.py @@ -5,6 +5,7 @@ from evalml.objectives import R2 from evalml.pipelines import CatBoostRegressionPipeline +from evalml.utils import SEED_BOUNDS, get_random_seed, get_random_state importorskip('catboost', reason='Skipping test because catboost not installed') @@ -23,16 +24,19 @@ def test_catboost_init(): "max_depth": 6, } } - clf = CatBoostRegressionPipeline(objective=objective, parameters=parameters) + clf = CatBoostRegressionPipeline(objective=objective, parameters=parameters, random_state=2) assert clf.parameters == parameters + assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0]) def test_catboost_regression(X_y_reg): from catboost import CatBoostRegressor as CBRegressor X, y = X_y_reg + random_seed = 42 + catboost_random_seed = get_random_seed(get_random_state(random_seed), min_bound=0, max_bound=SEED_BOUNDS.max_bound) imputer = SimpleImputer(strategy='mean') - estimator = CBRegressor(n_estimators=1000, eta=0.03, 
max_depth=6, bootstrap_type='Bayesian', allow_writing_files=False, random_state=0) + estimator = CBRegressor(n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type='Bayesian', allow_writing_files=False, random_seed=catboost_random_seed) sk_pipeline = Pipeline([("imputer", imputer), ("estimator", estimator)]) sk_pipeline.fit(X, y) @@ -50,7 +54,7 @@ def test_catboost_regression(X_y_reg): "max_depth": 6, } } - clf = CatBoostRegressionPipeline(objective=objective, parameters=parameters) + clf = CatBoostRegressionPipeline(objective=objective, parameters=parameters, random_state=get_random_state(random_seed)) clf.fit(X, y) clf_score = clf.score(X, y) y_pred = clf.predict(X) diff --git a/evalml/tests/pipeline_tests/test_linear_regression.py b/evalml/tests/pipeline_tests/test_linear_regression.py index 6f014c3e05..66067f38e3 100644 --- a/evalml/tests/pipeline_tests/test_linear_regression.py +++ b/evalml/tests/pipeline_tests/test_linear_regression.py @@ -25,8 +25,9 @@ def test_lr_init(X_y_categorical_regression): 'normalize': True, }, } - clf = LinearRegressionPipeline(objective=objective, parameters=parameters) + clf = LinearRegressionPipeline(objective=objective, parameters=parameters, random_state=2) assert clf.parameters == parameters + assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0]) def test_linear_regression(X_y_categorical_regression): diff --git a/evalml/tests/pipeline_tests/test_logistic_regression.py b/evalml/tests/pipeline_tests/test_logistic_regression.py index d190ff414f..2da504b35c 100644 --- a/evalml/tests/pipeline_tests/test_logistic_regression.py +++ b/evalml/tests/pipeline_tests/test_logistic_regression.py @@ -25,8 +25,9 @@ def test_lor_init(X_y): 'C': 0.5, } } - clf = LogisticRegressionPipeline(objective=objective, parameters=parameters) + clf = LogisticRegressionPipeline(objective=objective, parameters=parameters, random_state=1) assert clf.parameters == parameters + assert (clf.random_state.get_state()[0] == np.random.RandomState(1).get_state()[0]) def test_lor_multi(X_y_multi): @@ -55,10 +56,9 @@ def test_lor_multi(X_y_multi): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 1.0, - 'random_state': 1 } } - clf = LogisticRegressionPipeline(objective=objective, parameters=parameters) + clf = LogisticRegressionPipeline(objective=objective, parameters=parameters, random_state=1) clf.fit(X, y) clf_score = clf.score(X, y) y_pred = clf.predict(X) @@ -83,11 +83,10 @@ def test_lor_input_feature_names(X_y): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 1.0, - 'random_state': 1 } } - clf = LogisticRegressionPipeline(objective=objective, parameters=parameters) + clf = LogisticRegressionPipeline(objective=objective, parameters=parameters, random_state=1) clf.fit(X, y) assert len(clf.feature_importances) == len(X.columns) diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py index a9b73ba8d7..f37e348c99 100644 --- a/evalml/tests/pipeline_tests/test_pipelines.py +++ b/evalml/tests/pipeline_tests/test_pipelines.py @@ -2,6 +2,7 @@ from importlib import import_module from unittest.mock import patch +import numpy as np import pytest from skopt.space import Integer, Real @@ -97,11 +98,10 @@ def lr_pipeline(): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 3.0, - 'random_state': 1 } } - return LogisticRegressionPipeline(objective=objective, parameters=parameters) + return LogisticRegressionPipeline(objective=objective, parameters=parameters, random_state=42) def 
test_required_fields(): @@ -165,7 +165,6 @@ def test_reproducibility(X_y): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 1.0, - 'random_state': 1 } } @@ -269,7 +268,6 @@ def test_estimator_not_last(X_y): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 1.0, - 'random_state': 1 } } @@ -278,10 +276,6 @@ class MockLogisticRegressionPipeline(PipelineBase): supported_problem_types = ['binary', 'multiclass'] component_graph = ['One Hot Encoder', 'Simple Imputer', 'Logistic Regression Classifier', 'Standard Scaler'] - def __init__(self, objective, parameters): - super().__init__(objective=objective, - parameters=parameters) - err_msg = "A pipeline must have an Estimator as the last component in component_graph." with pytest.raises(ValueError, match=err_msg): MockLogisticRegressionPipeline(objective='recall', parameters=parameters) @@ -300,10 +294,6 @@ class TestPipeline(PipelineBase): "impute_strategy": ["mean", "median", "most_frequent"], } - def __init__(self, objective, parameters): - super().__init__(objective=objective, - parameters=parameters) - parameters = { 'Simple Imputer': { 'impute_strategy': 'mean' @@ -311,7 +301,6 @@ def __init__(self, objective, parameters): 'Logistic Regression Classifier': { 'penalty': 'l2', 'C': 1.0, - 'random_state': 1 } } @@ -340,10 +329,6 @@ class TestPipeline(PipelineBase): "impute_strategy": ["mean", "median", "most_frequent"], } - def __init__(self, objective, parameters): - super().__init__(objective=objective, - parameters=parameters) - clf = TestPipeline(parameters={}, objective='precision') correct_components = [SimpleImputer, OneHotEncoder, RFClassifierSelectFromModel, StandardScaler, RFClassifierSelectFromModel, LogisticRegressionClassifier] for component, correct_components in zip(clf.component_graph, correct_components): @@ -361,10 +346,6 @@ class TestPipeline(PipelineBase): component_graph = ['Logistic Regression Classifier'] supported_problem_types = ['binary', 'regression'] - def __init__(self, objective, parameters): - super().__init__(objective=objective, - parameters=parameters) - with pytest.raises(ValueError, match="not valid for this component graph. 
Valid problem types include *."): TestPipeline(parameters={}, objective='precision') @@ -376,7 +357,7 @@ class MockComponent(Transformer): 'a': [0, 1, 2] } - def __init__(self, a, b=1, c='2',): + def __init__(self, a, b=1, c='2', random_state=0): self.a = a self.b = b self.c = c @@ -385,25 +366,37 @@ class TestPipeline(PipelineBase): component_graph = [MockComponent(a=0), 'Logistic Regression Classifier'] supported_problem_types = ['binary'] - def __init__(self, objective, parameters): - super().__init__(objective=objective, - parameters=parameters) - with pytest.raises(ValueError, match="Error received when instantiating component *."): TestPipeline(parameters={}, objective='precision') assert TestPipeline(parameters={'Mock Component': {'a': 42}}, objective='precision') +def test_no_random_state_argument_in_component(): + class MockComponent(Transformer): + name = "Mock Component" + hyperparameter_ranges = { + 'a': [0, 1, 2] + } + + def __init__(self, a, b=1, c='2'): + self.a = a + self.b = b + self.c = c + + class TestPipeline(PipelineBase): + component_graph = [MockComponent(a=0), 'Logistic Regression Classifier'] + supported_problem_types = ['binary'] + + with pytest.raises(ValueError, match="Error received when instantiating component *."): + TestPipeline(parameters={'Mock Component': {'a': 42}}, objective='precision', random_state=0) + + def test_init_components_invalid_parameters(): class TestPipeline(PipelineBase): component_graph = ['RF Classifier Select From Model', 'Logistic Regression Classifier'] supported_problem_types = ['binary'] - def __init__(self, objective, parameters): - super().__init__(objective=objective, - parameters=parameters) - parameters = { 'Logistic Regression Classifier': { "cool_parameter": "yes" @@ -417,7 +410,7 @@ def __init__(self, objective, parameters): def test_correct_parameters(lr_pipeline): lr_pipeline = lr_pipeline - assert lr_pipeline.estimator.random_state == 1 + assert lr_pipeline.estimator.random_state.get_state()[0] == np.random.RandomState(1).get_state()[0] assert lr_pipeline.estimator.parameters['C'] == 3.0 assert lr_pipeline['Simple Imputer'].parameters['impute_strategy'] == 'median' @@ -465,8 +458,8 @@ class MockEstimator(Estimator): name = "Mock Estimator" supported_problem_types = [ProblemTypes.BINARY] - def __init__(self): - super().__init__(parameters={}, component_obj={}, random_state=0) + def __init__(self, random_state=0): + super().__init__(parameters={}, component_obj={}, random_state=random_state) class MockPipelineNone(PipelineBase): component_graph = [MockEstimator()] diff --git a/evalml/tests/pipeline_tests/test_rf.py b/evalml/tests/pipeline_tests/test_rf.py index dff8ae3586..a39273ee91 100644 --- a/evalml/tests/pipeline_tests/test_rf.py +++ b/evalml/tests/pipeline_tests/test_rf.py @@ -32,7 +32,7 @@ def test_rf_init(X_y): } } - clf = RFClassificationPipeline(objective=objective, parameters=parameters) + clf = RFClassificationPipeline(objective=objective, parameters=parameters, random_state=2) expected_parameters = { 'Simple Imputer': { @@ -51,6 +51,7 @@ def test_rf_init(X_y): } assert clf.parameters == expected_parameters + assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0]) def test_rf_multi(X_y_multi): diff --git a/evalml/tests/pipeline_tests/test_rf_regression.py b/evalml/tests/pipeline_tests/test_rf_regression.py index 933117cfc8..4142348776 100644 --- a/evalml/tests/pipeline_tests/test_rf_regression.py +++ b/evalml/tests/pipeline_tests/test_rf_regression.py @@ -31,7 +31,7 @@ def 
test_rf_init(X_y_reg): "max_depth": 5, } } - clf = RFRegressionPipeline(objective=objective, parameters=parameters) + clf = RFRegressionPipeline(objective=objective, parameters=parameters, random_state=2) expected_parameters = { 'Simple Imputer': { @@ -50,6 +50,7 @@ def test_rf_init(X_y_reg): } assert clf.parameters == expected_parameters + assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0]) def test_rf_regression(X_y_categorical_regression): diff --git a/evalml/tests/pipeline_tests/test_xgboost.py b/evalml/tests/pipeline_tests/test_xgboost.py index b773052823..e2a20713d7 100644 --- a/evalml/tests/pipeline_tests/test_xgboost.py +++ b/evalml/tests/pipeline_tests/test_xgboost.py @@ -9,7 +9,12 @@ from evalml.objectives import PrecisionMicro from evalml.pipelines import XGBoostPipeline -from evalml.utils import import_or_raise +from evalml.utils import ( + SEED_BOUNDS, + get_random_seed, + get_random_state, + import_or_raise +) importorskip('xgboost', reason='Skipping test because xgboost not installed') @@ -40,7 +45,7 @@ def test_xg_init(X_y): } } - clf = XGBoostPipeline(objective=objective, parameters=parameters) + clf = XGBoostPipeline(objective=objective, parameters=parameters, random_state=1) expected_parameters = { 'Simple Imputer': { @@ -63,20 +68,23 @@ def test_xg_init(X_y): } assert clf.parameters == expected_parameters + assert (clf.random_state.get_state()[0] == np.random.RandomState(1).get_state()[0]) def test_xg_multi(X_y_multi): X, y = X_y_multi + random_seed = 42 + xgb_random_seed = get_random_seed(get_random_state(random_seed), min_bound=SEED_BOUNDS.min_bound, max_bound=SEED_BOUNDS.max_bound) xgb = import_or_raise("xgboost") imputer = SimpleImputer(strategy='mean') enc = ce.OneHotEncoder(use_cat_names=True, return_df=True) - estimator = xgb.XGBClassifier(random_state=0, + estimator = xgb.XGBClassifier(random_state=xgb_random_seed, eta=0.1, max_depth=3, min_child_weight=1, n_estimators=10) - rf_estimator = SKRandomForestClassifier(random_state=0, n_estimators=10, max_depth=3) + rf_estimator = SKRandomForestClassifier(random_state=get_random_state(random_seed), n_estimators=10, max_depth=3) feature_selection = SelectFromModel(estimator=rf_estimator, max_features=max(1, int(1 * len(X[0]))), threshold=-np.inf) diff --git a/evalml/tests/tuner_tests/test_random_search_tuner.py b/evalml/tests/tuner_tests/test_random_search_tuner.py index 6b3ab1d9d9..f9eb178402 100644 --- a/evalml/tests/tuner_tests/test_random_search_tuner.py +++ b/evalml/tests/tuner_tests/test_random_search_tuner.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import pytest from evalml import AutoRegressionSearch @@ -77,3 +79,12 @@ def test_random_search_tuner_invalid_space(): RandomSearchTuner(((0, 1))) with pytest.raises(ValueError): RandomSearchTuner([(0, 0)]) + + +@patch('evalml.tuners.RandomSearchTuner.is_search_space_exhausted') +def test_random_search_tuner_exhausted_space(mock_is_search_space_exhausted, X_y): + mock_is_search_space_exhausted.return_value = True + X, y = X_y + clf = AutoRegressionSearch(objective="R2", max_pipelines=5, tuner=RandomSearchTuner) + clf.search(X, y) + assert len(clf.results['pipeline_results']) == 0 diff --git a/evalml/tests/utils_tests/test_gen_utils.py b/evalml/tests/utils_tests/test_gen_utils.py index e016c03b2c..7ec07dd4b3 100644 --- a/evalml/tests/utils_tests/test_gen_utils.py +++ b/evalml/tests/utils_tests/test_gen_utils.py @@ -3,8 +3,11 @@ import pytest from evalml.utils.gen_utils import ( + SEED_BOUNDS, classproperty,
convert_to_seconds, + get_random_seed, + get_random_state, import_or_raise, normalize_confusion_matrix ) @@ -39,6 +42,35 @@ def test_convert_to_seconds(): assert convert_to_seconds("10 hours") == 36000 +def test_get_random_state(): + assert abs(get_random_state(None).rand() - get_random_state(None).rand()) > 1e-6 + assert get_random_state(42).rand() == np.random.RandomState(42).rand() + assert get_random_state(np.random.RandomState(42)).rand() == np.random.RandomState(42).rand() + + +def test_get_random_seed(): + assert get_random_seed(0) == 0 + assert get_random_seed(1) == 1 + assert get_random_seed(42) == 42 + assert get_random_seed(-42) == -42 + assert get_random_seed(42, min_bound=42) == 42 + assert get_random_seed(42, max_bound=43) == 42 + assert get_random_seed(42, min_bound=42, max_bound=43) == 42 + assert get_random_seed(-42, min_bound=-42, max_bound=0) == -42 + assert get_random_seed(420, min_bound=-500, max_bound=400) == 420 % 400 + assert get_random_seed(-420, min_bound=-400, max_bound=500) == -420 % 400 + + assert get_random_seed(SEED_BOUNDS.max_bound) == 0 + assert get_random_seed(SEED_BOUNDS.max_bound + 1) == 1 + assert get_random_seed(SEED_BOUNDS.min_bound) == SEED_BOUNDS.min_bound + assert get_random_seed(SEED_BOUNDS.min_bound - 1) == abs(SEED_BOUNDS.max_bound) - 1 + + with pytest.raises(ValueError): + get_random_seed(42, min_bound=42, max_bound=42) + with pytest.raises(ValueError): + get_random_seed(42, min_bound=420, max_bound=4) + + def test_normalize_confusion_matrix(): conf_mat = np.array([[2, 0, 0], [0, 0, 1], [1, 0, 2]]) conf_mat_normalized = normalize_confusion_matrix(conf_mat) diff --git a/evalml/tuners/__init__.py b/evalml/tuners/__init__.py index 82b4ecc68f..77eb65ceb8 100644 --- a/evalml/tuners/__init__.py +++ b/evalml/tuners/__init__.py @@ -1,6 +1,6 @@ # flake8:noqa from .skopt_tuner import SKOptTuner from .tuner import Tuner +from .tuner_exceptions import NoParamsException from .random_search_tuner import RandomSearchTuner from .grid_search_tuner import GridSearchTuner -from .tuner_exceptions import NoParamsException diff --git a/evalml/tuners/grid_search_tuner.py b/evalml/tuners/grid_search_tuner.py index 8c67c68b39..b9be580a44 100644 --- a/evalml/tuners/grid_search_tuner.py +++ b/evalml/tuners/grid_search_tuner.py @@ -19,7 +19,7 @@ class GridSearchTuner(Tuner): (3.25, 'A') """ - def __init__(self, space, n_points=10, random_state=None): + def __init__(self, space, n_points=10, random_state=0): """ Generate all of the possible points to search for in the grid Arguments: diff --git a/evalml/tuners/random_search_tuner.py b/evalml/tuners/random_search_tuner.py index b0cd8268f0..b847193c70 100644 --- a/evalml/tuners/random_search_tuner.py +++ b/evalml/tuners/random_search_tuner.py @@ -1,8 +1,7 @@ -from numpy.random import RandomState from skopt import Space -from .tuner import Tuner -from .tuner_exceptions import NoParamsException +from evalml.tuners import NoParamsException, Tuner +from evalml.utils import get_random_state class RandomSearchTuner(Tuner): @@ -18,7 +17,7 @@ class RandomSearchTuner(Tuner): (5, 'A') """ - def __init__(self, space, random_state=None, with_replacement=False, replacement_max_attempts=10): + def __init__(self, space, random_state=0, with_replacement=False, replacement_max_attempts=10): """ Sets up check for duplication if needed. 
Arguments: @@ -30,7 +29,7 @@ def __init__(self, space, random_state=None, with_replacement=False, replacement with_replacement=True """ self._space = Space(space) - self._random_state = RandomState(random_state) + self._random_state = get_random_state(random_state) self._with_replacement = with_replacement self._replacement_max_attempts = replacement_max_attempts self._used_parameters = set() diff --git a/evalml/tuners/skopt_tuner.py b/evalml/tuners/skopt_tuner.py index a366ab00cc..1f62a4a1e8 100644 --- a/evalml/tuners/skopt_tuner.py +++ b/evalml/tuners/skopt_tuner.py @@ -12,7 +12,7 @@ def __init__(self, space, random_state=0): Arguments: space (dict): search space for hyperparameters - random_state (int): random state + random_state (int, np.random.RandomState): The random state Returns: SKoptTuner: self diff --git a/evalml/tuners/tuner.py b/evalml/tuners/tuner.py index 94dd24c951..6b9043e2de 100644 --- a/evalml/tuners/tuner.py +++ b/evalml/tuners/tuner.py @@ -12,7 +12,7 @@ def __init__(self, space, random_state=0): Arguments: space (dict): search space for hyperparameters - random_state (int): random state + random_state (int, np.random.RandomState): The random state Returns: Tuner: self diff --git a/evalml/utils/__init__.py b/evalml/utils/__init__.py index 9c57b00f08..7c1c93a43c 100644 --- a/evalml/utils/__init__.py +++ b/evalml/utils/__init__.py @@ -1,3 +1,3 @@ # flake8:noqa from .logger import Logger -from .gen_utils import classproperty, import_or_raise, convert_to_seconds, normalize_confusion_matrix +from .gen_utils import classproperty, import_or_raise, convert_to_seconds, get_random_state, get_random_seed, normalize_confusion_matrix, SEED_BOUNDS diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index 927ab88ce5..d63be6b327 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -1,18 +1,19 @@ import importlib +from collections import namedtuple import numpy as np import pandas as pd +from sklearn.utils import check_random_state def import_or_raise(library, error_msg=None): - ''' - Attempts to import the requested library by name. + """Attempts to import the requested library by name. If the import fails, raises an ImportError. Arguments: library (str): the name of the library error_msg (str): error message to return if the import fails - ''' + """ try: return importlib.import_module(library) except ImportError: @@ -39,6 +40,41 @@ def convert_to_seconds(input_str): raise AssertionError(msg) +def get_random_state(seed): + """Generates a numpy.random.RandomState instance using seed + + Arguments: + seed (None, int, np.random.RandomState object): seed to generate numpy.random.RandomState with + """ + return check_random_state(seed) + + +# define safe numbers to use as a lower/upper bound for the seed on both 32-bit and 64-bit systems +SEED_BOUNDS = namedtuple('SEED_BOUNDS', ('min_bound', 'max_bound'))(-2**30, 2**30) + + +def get_random_seed(random_state, min_bound=SEED_BOUNDS.min_bound, max_bound=SEED_BOUNDS.max_bound): + """Given a numpy.random.RandomState object, generate an int representing a seed value for another random number generator. Or, if given an int, return that int modulo the magnitude of the smallest bound to avoid numerical issues. 
+ + Invariant: min_bound < max_bound + + Arguments: + random_state (int, numpy.random.RandomState): random state + min_bound (int): min bound for the generated seed (inclusive). Defaults to SEED_BOUNDS.min_bound + max_bound (int): max bound for the generated seed (exclusive). Defaults to SEED_BOUNDS.max_bound + + Returns: + int: seed for random number generator + """ + if not min_bound < max_bound: + raise ValueError("Provided min_bound {} is not less than max_bound {}".format(min_bound, max_bound)) + if isinstance(random_state, np.random.RandomState): + return random_state.randint(min_bound, max_bound) + if random_state < min_bound or random_state >= max_bound: + return random_state % min(abs(min_bound), abs(max_bound)) + return random_state + + def normalize_confusion_matrix(conf_mat, option='true'): """Normalizes a confusion matrix.
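To make the seed-handling contract concrete, here is a small sketch (illustrative, not part of the diff). The expected values mirror `test_get_random_seed` above, and the `min_bound=0` call mirrors how the CatBoost components derive `random_seed`, catboost being the motivating case for a non-negative bound:

import numpy as np
from evalml.utils import SEED_BOUNDS, get_random_seed, get_random_state

# get_random_state wraps sklearn.utils.check_random_state: ints become seeded
# RandomState instances; an existing RandomState passes through unchanged.
state = get_random_state(42)
assert isinstance(state, np.random.RandomState)

# In-bounds int seeds are returned as-is...
assert get_random_seed(42) == 42
# ...out-of-bounds ints wrap modulo the smaller bound magnitude...
assert get_random_seed(SEED_BOUNDS.max_bound + 1) == 1
# ...and a RandomState draws a fresh seed from [min_bound, max_bound).
seed = get_random_seed(np.random.RandomState(42), min_bound=0)
assert 0 <= seed < SEED_BOUNDS.max_bound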