
Commit 9bafdd2

Support numpy.random.RandomState objects (take 2) (#556)
* Support numpy.random.RandomState objects (#530)
* Squash random_state work from 347_random_state
* Lint
* Lint
* Changelog
* Lint
* Test update
* Always pass random_state to components
* Lint
* Fix bug: set random state first. Remove usages of random_state as dict param item in test_pipelines.py
* update test for clarity
* Fix catboost
* Update logreg test
* Lint catboost
* Update tuner impl to handle random_state
* Test changes
* Lint
* Docs changes
* Add unit test for get_random_state
* Update test
* Remove uncalled code after my changes
* Fix tests after rebase
* Add unit test coverage for RandomSearchTuner.is_search_space_exhausted
* Add unit test coverage for max_time
* Add test coverage of get_pipeline when invalid
* Lint
* Add unit test coverage of when fit/score throws in autobase
* Remove duplicate
* Lets try that again... got mysterious docs failure
* Get min/max int instead of using fixed number which is incorrect for 32bit systems
* Add limits to seed range for xgboost too
* Introduce a sustainable pattern for generating random seeds from RNGs for different classes
* Update changelog
* Use SEED_BOUNDS in unit tests
* Update comment
1 parent 2e0288e commit 9bafdd2


41 files changed, +253 -121 lines

docs/source/changelog.rst

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ Changelog
 * Added access to parameters in Pipelines with `PipelineBase.parameters` (used to be return of `PipelineBase.describe`) :pr:`501`
 * Added `fill_value` parameter for SimpleImputer :pr:`509`
 * Added functionality to override component hyperparemeters and made pipelines take hyperparemeters from components :pr:`516`
+* Allow numpy.random.RandomState for random_state parameters :pr:`556`
 * Fixes
 * Changes
 * Undo version cap in XGBoost placed in :pr:`402` and allowed all released of XGBoost :pr:`407`
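
The changelog entry above is the user-facing change: anywhere evalml previously took an integer random_state, a numpy.random.RandomState instance is now accepted as well. A minimal usage sketch (the top-level import path for AutoClassificationSearch is assumed from the tests further down):

import numpy as np
from evalml import AutoClassificationSearch  # import path assumed

# An integer seed still works as before...
automl_from_seed = AutoClassificationSearch(max_pipelines=3, random_state=0)

# ...and a RandomState object is now accepted too, so callers can share one RNG.
rng = np.random.RandomState(0)
automl_from_rng = AutoClassificationSearch(max_pipelines=3, random_state=rng)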

docs/source/pipelines/custom_pipelines.ipynb

Lines changed: 1 addition & 2 deletions
@@ -46,11 +46,10 @@
     " 'Logistic Regression Classifier':{\n",
     "     'penalty':'l2',\n",
     "     'C':5,\n",
-    "     'random_state':3\n",
     " }\n",
     "}\n",
     "\n",
-    "pipeline = CustomPipeline(parameters={}, objective=objective)"
+    "pipeline = CustomPipeline(parameters={}, objective=objective, random_state=3)"
    ]
   },
   {

docs/source/pipelines/overview.ipynb

Lines changed: 1 addition & 2 deletions
@@ -50,11 +50,10 @@
     "     \"eta\": 0.5,\n",
     "     \"min_child_weight\": 5,\n",
     "     \"max_depth\": 10,\n",
-    "     \"random_state\":5\n",
     "   }\n",
     " }\n",
     "\n",
-    "xgp = XGBoostPipeline(objective='recall', parameters=parameters)\n",
+    "xgp = XGBoostPipeline(objective='recall', parameters=parameters, random_state=5)\n",
     "xgp.graph()"
    ]
   },

evalml/automl/auto_base.py

Lines changed: 6 additions & 12 deletions
@@ -1,5 +1,4 @@
 import inspect
-import random
 import time
 from collections import OrderedDict
 from sys import stdout
@@ -16,7 +15,7 @@
 from evalml.pipelines.components import handle_component
 from evalml.problem_types import ProblemTypes
 from evalml.tuners import SKOptTuner
-from evalml.utils import Logger, convert_to_seconds
+from evalml.utils import Logger, convert_to_seconds, get_random_state

 logger = Logger()

@@ -78,10 +77,7 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
             'search_order': []
         }
         self.trained_pipelines = {}
-
-        self.random_state = random_state
-        random.seed(self.random_state)
-        np.random.seed(seed=self.random_state)
+        self.random_state = get_random_state(random_state)

         self.n_jobs = n_jobs
         self.possible_model_families = list(set([p.model_family for p in self.possible_pipelines]))
@@ -90,7 +86,7 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
         self.search_spaces = {}
         for p in self.possible_pipelines:
             space = list(p.hyperparameters.items())
-            self.tuners[p.name] = tuner([s[1] for s in space], random_state=random_state)
+            self.tuners[p.name] = tuner([s[1] for s in space], random_state=self.random_state)
             self.search_spaces[p.name] = [s[0] for s in space]
         self.additional_objectives = additional_objectives
         self._MAX_NAME_LEN = 40
@@ -110,8 +106,8 @@ def search(self, X, y, feature_types=None, raise_errors=False, show_iteration_pl

             y (pd.Series): the target training labels of length [n_samples]

-            feature_types (list, optional): list of feature types. either numeric of categorical.
-                categorical features will automatically be encoded
+            feature_types (list, optional): list of feature types, either numerical or categorical.
+                Categorical features will automatically be encoded

             raise_errors (boolean): If true, raise errors and exit search if a pipeline errors during fitting

@@ -245,8 +241,6 @@ def _transform_parameters(self, pipeline_class, parameters, number_features):
             component_class = component.__class__

             # Inspects each component and adds the following parameters when needed
-            if 'random_state' in inspect.signature(component_class.__init__).parameters:
-                component_parameters['random_state'] = self.random_state
             if 'n_jobs' in inspect.signature(component_class.__init__).parameters:
                 component_parameters['n_jobs'] = self.n_jobs
             if 'number_features' in inspect.signature(component_class.__init__).parameters:
@@ -325,7 +319,7 @@ def _do_iteration(self, X, y, pbar, raise_errors):
             print('')

     def _select_pipeline(self):
-        return random.choice(self.possible_pipelines)
+        return self.random_state.choice(self.possible_pipelines)

     def _propose_parameters(self, pipeline_class):
         values = self.tuners[pipeline_class.name].propose()
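
The hunks above lean on a new evalml.utils.get_random_state helper whose implementation is not part of the visible diff. A minimal sketch of what such a helper plausibly does, passing RandomState instances through and wrapping integer seeds; the real evalml implementation may handle more cases:

import numpy as np

def get_random_state(seed):
    """Normalize an int seed or a RandomState into a np.random.RandomState (sketch only)."""
    if isinstance(seed, np.random.RandomState):
        return seed
    # Ints (and None) are wrapped so callers can rely on RandomState methods
    # such as .choice(), as AutoBase._select_pipeline does above.
    return np.random.RandomState(seed)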

evalml/automl/auto_classification_search.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ def __init__(self,
             additional_objectives (list): Custom set of objectives to score on.
                 Will override default objectives for problem type if not empty.

-            random_state (int): the random_state
+            random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

             n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines.
                 None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.

evalml/automl/auto_regression_search.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def __init__(self,
             additional_objectives (list): Custom set of objectives to score on.
                 Will override default objectives for problem type if not empty.

-            random_state (int): the random_state
+            random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

             n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines.
                 None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.

evalml/pipelines/classification/catboost.py

Lines changed: 3 additions & 2 deletions
@@ -15,8 +15,9 @@ class CatBoostClassificationPipeline(PipelineBase):
         "impute_strategy": ["most_frequent"],
     }

-    def __init__(self, parameters, objective):
+    def __init__(self, parameters, objective, random_state=0):

         # note: impute_strategy must support both string and numeric data
         super().__init__(parameters=parameters,
-                         objective=objective)
+                         objective=objective,
+                         random_state=random_state)

evalml/pipelines/classification/xgboost.py

Lines changed: 3 additions & 2 deletions
@@ -8,6 +8,7 @@ class XGBoostPipeline(PipelineBase):
     component_graph = ['One Hot Encoder', 'Simple Imputer', 'RF Classifier Select From Model', 'XGBoost Classifier']
     supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

-    def __init__(self, parameters, objective):
+    def __init__(self, parameters, objective, random_state=0):
         super().__init__(parameters=parameters,
-                         objective=objective)
+                         objective=objective,
+                         random_state=random_state)

evalml/pipelines/components/component_base.py

Lines changed: 2 additions & 2 deletions
@@ -1,14 +1,14 @@
 from abc import ABC, abstractmethod

 from evalml.exceptions import MethodPropertyNotFoundError
-from evalml.utils import Logger
+from evalml.utils import Logger, get_random_state

 logger = Logger()


 class ComponentBase(ABC):
     def __init__(self, parameters, component_obj, random_state):
-        self.random_state = random_state
+        self.random_state = get_random_state(random_state)
         self._component_obj = component_obj
         self.parameters = parameters
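
Because ComponentBase now normalizes its input with get_random_state, subclasses can treat self.random_state as a np.random.RandomState regardless of whether the caller passed an int or an RNG object. A hypothetical custom component illustrating that contract (the class, its name, and its parameters are invented for illustration; the Transformer import path is assumed):

import numpy as np
from evalml.pipelines.components import Transformer  # import path assumed

class RandomColumnSampler(Transformer):
    """Hypothetical transformer that keeps a random subset of columns."""
    name = "Random Column Sampler"
    hyperparameter_ranges = {}

    def __init__(self, pct_columns=0.5, random_state=0):
        parameters = {"pct_columns": pct_columns}
        # Either an int or a np.random.RandomState is fine here; the base
        # class converts it with get_random_state.
        super().__init__(parameters=parameters,
                         component_obj=None,
                         random_state=random_state)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        n_keep = max(1, int(X.shape[1] * self.parameters["pct_columns"]))
        cols = self.random_state.choice(X.shape[1], size=n_keep, replace=False)
        return X.iloc[:, sorted(cols)]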

evalml/pipelines/components/estimators/classifiers/catboost_classifier.py

Lines changed: 3 additions & 1 deletion
@@ -6,7 +6,7 @@
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
-from evalml.utils import import_or_raise
+from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise


 class CatBoostClassifier(Estimator):
@@ -26,6 +26,7 @@ class CatBoostClassifier(Estimator):
     supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

     def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0):
+        random_seed = get_random_seed(random_state, 0, SEED_BOUNDS.max_bound)
         parameters = {"n_estimators": n_estimators,
                       "eta": eta,
                       "max_depth": max_depth}
@@ -36,6 +37,7 @@ def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None
         catboost = import_or_raise("catboost", error_msg=cb_error_msg)
         self._label_encoder = None
         cb_classifier = catboost.CatBoostClassifier(**parameters,
+                                                    random_seed=random_seed,
                                                     silent=True,
                                                     allow_writing_files=False)
         super().__init__(parameters=parameters,

evalml/pipelines/components/estimators/classifiers/xgboost_classifier.py

Lines changed: 3 additions & 2 deletions
@@ -3,7 +3,7 @@
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
-from evalml.utils import import_or_raise
+from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise


 class XGBoostClassifier(Estimator):
@@ -19,13 +19,14 @@ class XGBoostClassifier(Estimator):
     supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

     def __init__(self, eta=0.1, max_depth=3, min_child_weight=1, n_estimators=100, random_state=0):
+        random_seed = get_random_seed(random_state, SEED_BOUNDS.min_bound, SEED_BOUNDS.max_bound)
         parameters = {"eta": eta,
                       "max_depth": max_depth,
                       "min_child_weight": min_child_weight,
                       "n_estimators": n_estimators}
         xgb_error_msg = "XGBoost is not installed. Please install using `pip install xgboost.`"
         xgb = import_or_raise("xgboost", error_msg=xgb_error_msg)
-        xgb_classifier = xgb.XGBClassifier(random_state=random_state,
+        xgb_classifier = xgb.XGBClassifier(random_state=random_seed,
                                            eta=eta,
                                            max_depth=max_depth,
                                            n_estimators=n_estimators,
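
Both the CatBoost and XGBoost estimators need a plain bounded integer seed rather than a RandomState object, which is what the new get_random_seed(random_state, min_bound, max_bound) call supplies: XGBoost uses the full SEED_BOUNDS range, while CatBoost pins its lower bound at 0. Neither helper's implementation appears in the visible diff; a plausible sketch, under the assumption that SEED_BOUNDS holds the platform's integer limits (per the commit message about 32-bit systems):

from collections import namedtuple

import numpy as np

# Sketch of SEED_BOUNDS: derived from platform int limits rather than a fixed number.
SeedBounds = namedtuple('SeedBounds', ['min_bound', 'max_bound'])
SEED_BOUNDS = SeedBounds(min_bound=np.iinfo(np.int32).min, max_bound=np.iinfo(np.int32).max)

def get_random_seed(random_state, min_bound, max_bound):
    """Return an integer seed within [min_bound, max_bound); sketch only."""
    if isinstance(random_state, np.random.RandomState):
        # Draw a fresh seed from the provided RNG within the allowed range.
        return random_state.randint(min_bound, max_bound)
    # Fold an integer seed into the allowed range.
    return min_bound + (random_state - min_bound) % (max_bound - min_bound)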

evalml/pipelines/components/estimators/regressors/catboost_regressor.py

Lines changed: 3 additions & 2 deletions
@@ -3,7 +3,7 @@
 from evalml.model_family import ModelFamily
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
-from evalml.utils import import_or_raise
+from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise


 class CatBoostRegressor(Estimator):
@@ -23,6 +23,7 @@ class CatBoostRegressor(Estimator):
     supported_problem_types = [ProblemTypes.REGRESSION]

     def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0):
+        random_seed = get_random_seed(random_state, 0, SEED_BOUNDS.max_bound)
         parameters = {"n_estimators": n_estimators,
                       "eta": eta,
                       "max_depth": max_depth}
@@ -32,7 +33,7 @@ def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None
         cb_error_msg = "catboost is not installed. Please install using `pip install catboost.`"
         catboost = import_or_raise("catboost", error_msg=cb_error_msg)
         cb_regressor = catboost.CatBoostRegressor(**parameters,
-                                                  random_state=random_state,
+                                                  random_seed=random_seed,
                                                   silent=True,
                                                   allow_writing_files=False)
         super().__init__(parameters=parameters,

evalml/pipelines/components/estimators/regressors/linear_regressor.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ class LinearRegressor(Estimator):
     model_family = ModelFamily.LINEAR_MODEL
     supported_problem_types = [ProblemTypes.REGRESSION]

-    def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1):
+    def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1, random_state=0):
         parameters = {
             'fit_intercept': fit_intercept,
             'normalize': normalize
@@ -25,7 +25,7 @@ def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1):
                                            n_jobs=n_jobs)
         super().__init__(parameters=parameters,
                          component_obj=linear_regressor,
-                         random_state=0)
+                         random_state=random_state)

     @property
     def feature_importances(self):

evalml/pipelines/components/transformers/imputers/simple_imputer.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@ class SimpleImputer(Transformer):
     name = 'Simple Imputer'
     hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}

-    def __init__(self, impute_strategy="most_frequent", fill_value=None):
+    def __init__(self, impute_strategy="most_frequent", fill_value=None, random_state=0):
         """Initalizes an transformer that imputes missing data according to the specified imputation strategy."

         Arguments:
@@ -24,7 +24,7 @@ def __init__(self, impute_strategy="most_frequent", fill_value=None):
                                   fill_value=fill_value)
         super().__init__(parameters=parameters,
                          component_obj=imputer,
-                         random_state=0)
+                         random_state=random_state)

     def transform(self, X, y=None):
         """Transforms data X by imputing missing values

evalml/pipelines/components/transformers/scalers/standard_scaler.py

Lines changed: 2 additions & 2 deletions
@@ -8,9 +8,9 @@ class StandardScaler(Transformer):
     name = "Standard Scaler"
     hyperparameter_ranges = {}

-    def __init__(self):
+    def __init__(self, random_state=0):
         parameters = {}
         scaler = SkScaler()
         super().__init__(parameters=parameters,
                          component_obj=scaler,
-                         random_state=0)
+                         random_state=random_state)

evalml/pipelines/components/utils.py

Lines changed: 0 additions & 2 deletions
@@ -42,8 +42,6 @@ def _components_dict():
         if params.defaults:
             if len(params.args) - 1 == len(params.defaults):
                 components[obj.name] = obj
-            elif len(params.args) == 1:
-                components[obj.name] = obj
     return components


evalml/pipelines/pipeline_base.py

Lines changed: 5 additions & 3 deletions
@@ -13,7 +13,7 @@
 from evalml.exceptions import IllFormattedClassNameError
 from evalml.objectives import get_objective
 from evalml.problem_types import handle_problem_types
-from evalml.utils import Logger, classproperty
+from evalml.utils import Logger, classproperty, get_random_state

 logger = Logger()

@@ -45,7 +45,7 @@ def supported_problem_types(cls):

     custom_hyperparameters = None

-    def __init__(self, parameters, objective):
+    def __init__(self, parameters, objective, random_state=0):
         """Machine learning pipeline made out of transformers and a estimator.

         Required Class Variables:
@@ -58,7 +58,9 @@ def __init__(self, parameters, objective):

             parameters (dict): dictionary with component names as keys and dictionary of that component's parameters as values.
                 An empty dictionary {} implies using all default values for component parameters.
+            random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
         """
+        self.random_state = get_random_state(random_state)
         self.component_graph = [self._instantiate_component(c, parameters) for c in self.component_graph]
         self.supported_problem_types = [handle_problem_types(problem_type) for problem_type in self.supported_problem_types]
         self.objective = get_objective(objective)
@@ -125,7 +127,7 @@ def _instantiate_component(self, component, parameters):
         component_name = component.name
         try:
             component_parameters = parameters.get(component_name, {})
-            new_component = component_class(**component_parameters)
+            new_component = component_class(**component_parameters, random_state=self.random_state)
         except (ValueError, TypeError) as e:
             err = "Error received when instantiating component {} with the following arguments {}".format(component_name, component_parameters)
             raise ValueError(err) from e
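
With _instantiate_component now forwarding random_state=self.random_state explicitly (and AutoBase no longer injecting it through the parameters dict), every component in a pipeline receives the same normalized random state. A short usage sketch mirroring the overview notebook change above (the evalml.pipelines import path is assumed; the parameter values are taken from that notebook):

import numpy as np
from evalml.pipelines import XGBoostPipeline  # import path assumed

parameters = {
    'XGBoost Classifier': {
        "eta": 0.5,
        "min_child_weight": 5,
        "max_depth": 10,
    }
}

# One shared RandomState is normalized by PipelineBase and handed to every component.
rng = np.random.RandomState(5)
xgp = XGBoostPipeline(objective='recall', parameters=parameters, random_state=rng)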

evalml/pipelines/regression/catboost.py

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@ class CatBoostRegressionPipeline(PipelineBase):
         "impute_strategy": ["most_frequent"],
     }

-    def __init__(self, parameters, objective):
+    def __init__(self, parameters, objective, random_state=0):
         super().__init__(parameters=parameters,
-                         objective=objective)
+                         objective=objective,
+                         random_state=random_state)

evalml/preprocessing/utils.py

Lines changed: 2 additions & 2 deletions
@@ -40,12 +40,12 @@ def load_data(path, index, label, n_rows=None, drop=None, verbose=True, **kwargs
 def split_data(X, y, regression=False, test_size=.2, random_state=None):
     """Splits data into train and test sets.

-    Args:
+    Arguments:
         X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]
         y (pd.Series) : labels of length [n_samples]
         regression (bool): if true, do not use stratified split
         test_size (float) : percent of train set to holdout for testing
-        random_state (int) : seed for the random number generator
+        random_state (int, np.random.RandomState) : seed for the random number generator

     Returns:
         pd.DataFrame, pd.DataFrame, pd.Series, pd.Series : features and labels each split into train and test sets
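
Per the updated docstring, split_data likewise accepts either form of random_state. A small example (the evalml.preprocessing import path and the train/test return order are assumptions based on the docstring):

import numpy as np
import pandas as pd
from evalml.preprocessing import split_data  # import path assumed

X = pd.DataFrame({'feature': range(20)})
y = pd.Series([0, 1] * 10)

# Seeding with a RandomState object now works the same as passing an int seed.
X_train, X_test, y_train, y_test = split_data(
    X, y, test_size=.2, random_state=np.random.RandomState(42))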

evalml/tests/automl_tests/test_auto_classification_search.py

Lines changed: 16 additions & 0 deletions
@@ -47,6 +47,14 @@ def test_init(X_y):
     automl.describe_pipeline(0)


+def test_get_pipeline_none(X_y):
+    X, y = X_y
+
+    automl = AutoClassificationSearch()
+    with pytest.raises(RuntimeError, match="Pipeline not found"):
+        automl.describe_pipeline(0)
+
+
 def test_cv(X_y):
     X, y = X_y
     cv_folds = 5
@@ -386,3 +394,11 @@ def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display, X_y):
     assert y.is_monotonic_increasing
     assert len(x) == 3
     assert len(y) == 3
+
+
+def test_max_time(X_y):
+    X, y = X_y
+    clf = AutoClassificationSearch(max_time=1e-16)
+    clf.search(X, y)
+    # search will always run at least one pipeline
+    assert len(clf.results['pipeline_results']) == 1
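
The commit message also mentions a unit test for get_random_state itself, which is not among the visible hunks. A rough sketch of what such a test could assert, assuming the helper behaves as outlined earlier:

import numpy as np
from evalml.utils import get_random_state  # per the imports added above

def test_get_random_state():
    # Ints are wrapped into a RandomState object...
    assert isinstance(get_random_state(1), np.random.RandomState)
    # ...and an existing RandomState is passed through unchanged (assumed behavior).
    rng = np.random.RandomState(42)
    assert get_random_state(rng) is rng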
