Skip to content

Commit

Permalink
Support numpy.random.RandomState objects (take 2) (#556)
Browse files Browse the repository at this point in the history
* Support numpy.random.RandomState objects (#530)

* Squash random_state work from 347_random_state

* Lint

* Lint

* Changelog

* Lint

* Test update

* Always pass random_state to components

* Lint

* Fix bug: set random state first. Remove usages of random_state as dict param item in test_pipelines.py

* update test for clarity

* Fix catboost

* Update logreg test

* Lint catboost

* Update tuner impl to handle random_state

* Test changes

* Lint

* Docs changes

* Add unit test for get_random_state

* Update test

* Remove uncalled code after my changes

* Fix tests after rebase

* Add unit test coverage for RandomSearchTuner.is_search_space_exhausted

* Add unit test coverage for max_time

* Add test coverage of get_pipeline when invalid

* Lint

* Add unit test coverage of when fit/score throws in autobase

* Remove duplicate

* Let's try that again... got mysterious docs failure

* Get min/max int instead of using fixed number which is incorrect for 32bit systems

* Add limits to seed range for xgboost too

* Introduce a sustainable pattern for generating random seeds from RNGs for different classes

* Update changelog

* Use SEED_BOUNDS in unit tests

* Update comment
  • Loading branch information
dsherry authored Apr 1, 2020
1 parent 2e0288e commit 9bafdd2
Show file tree
Hide file tree
Showing 41 changed files with 253 additions and 121 deletions.
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Changelog
* Added access to parameters in Pipelines with `PipelineBase.parameters` (used to be return of `PipelineBase.describe`) :pr:`501`
* Added `fill_value` parameter for SimpleImputer :pr:`509`
* Added functionality to override component hyperparameters and made pipelines take hyperparameters from components :pr:`516`
* Allow numpy.random.RandomState for random_state parameters :pr:`556`
* Fixes
* Changes
* Undo version cap in XGBoost placed in :pr:`402` and allowed all releases of XGBoost :pr:`407`
Expand Down
3 changes: 1 addition & 2 deletions docs/source/pipelines/custom_pipelines.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,10 @@
" 'Logistic Regression Classifier':{\n",
" 'penalty':'l2',\n",
" 'C':5,\n",
" 'random_state':3\n",
" }\n",
"}\n",
"\n",
"pipeline = CustomPipeline(parameters={}, objective=objective)"
"pipeline = CustomPipeline(parameters={}, objective=objective, random_state=3)"
]
},
{
Expand Down
3 changes: 1 addition & 2 deletions docs/source/pipelines/overview.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,10 @@
" \"eta\": 0.5,\n",
" \"min_child_weight\": 5,\n",
" \"max_depth\": 10,\n",
" \"random_state\":5\n",
" }\n",
" }\n",
"\n",
"xgp = XGBoostPipeline(objective='recall', parameters=parameters)\n",
"xgp = XGBoostPipeline(objective='recall', parameters=parameters, random_state=5)\n",
"xgp.graph()"
]
},
Expand Down
18 changes: 6 additions & 12 deletions evalml/automl/auto_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import inspect
import random
import time
from collections import OrderedDict
from sys import stdout
Expand All @@ -16,7 +15,7 @@
from evalml.pipelines.components import handle_component
from evalml.problem_types import ProblemTypes
from evalml.tuners import SKOptTuner
from evalml.utils import Logger, convert_to_seconds
from evalml.utils import Logger, convert_to_seconds, get_random_state

logger = Logger()

Expand Down Expand Up @@ -78,10 +77,7 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
'search_order': []
}
self.trained_pipelines = {}

self.random_state = random_state
random.seed(self.random_state)
np.random.seed(seed=self.random_state)
self.random_state = get_random_state(random_state)

self.n_jobs = n_jobs
self.possible_model_families = list(set([p.model_family for p in self.possible_pipelines]))
Expand All @@ -90,7 +86,7 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
self.search_spaces = {}
for p in self.possible_pipelines:
space = list(p.hyperparameters.items())
self.tuners[p.name] = tuner([s[1] for s in space], random_state=random_state)
self.tuners[p.name] = tuner([s[1] for s in space], random_state=self.random_state)
self.search_spaces[p.name] = [s[0] for s in space]
self.additional_objectives = additional_objectives
self._MAX_NAME_LEN = 40
Expand All @@ -110,8 +106,8 @@ def search(self, X, y, feature_types=None, raise_errors=False, show_iteration_pl
y (pd.Series): the target training labels of length [n_samples]
feature_types (list, optional): list of feature types. either numeric of categorical.
categorical features will automatically be encoded
feature_types (list, optional): list of feature types, either numerical or categorical.
Categorical features will automatically be encoded
raise_errors (boolean): If true, raise errors and exit search if a pipeline errors during fitting
Expand Down Expand Up @@ -245,8 +241,6 @@ def _transform_parameters(self, pipeline_class, parameters, number_features):
component_class = component.__class__

# Inspects each component and adds the following parameters when needed
if 'random_state' in inspect.signature(component_class.__init__).parameters:
component_parameters['random_state'] = self.random_state
if 'n_jobs' in inspect.signature(component_class.__init__).parameters:
component_parameters['n_jobs'] = self.n_jobs
if 'number_features' in inspect.signature(component_class.__init__).parameters:
Expand Down Expand Up @@ -325,7 +319,7 @@ def _do_iteration(self, X, y, pbar, raise_errors):
print('')

def _select_pipeline(self):
return random.choice(self.possible_pipelines)
return self.random_state.choice(self.possible_pipelines)

def _propose_parameters(self, pipeline_class):
values = self.tuners[pipeline_class.name].propose()
Expand Down
2 changes: 1 addition & 1 deletion evalml/automl/auto_classification_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def __init__(self,
additional_objectives (list): Custom set of objectives to score on.
Will override default objectives for problem type if not empty.
random_state (int): the random_state
random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines.
None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
Expand Down
2 changes: 1 addition & 1 deletion evalml/automl/auto_regression_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(self,
additional_objectives (list): Custom set of objectives to score on.
Will override default objectives for problem type if not empty.
random_state (int): the random_state
random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines.
None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
Expand Down
5 changes: 3 additions & 2 deletions evalml/pipelines/classification/catboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ class CatBoostClassificationPipeline(PipelineBase):
"impute_strategy": ["most_frequent"],
}

def __init__(self, parameters, objective):
def __init__(self, parameters, objective, random_state=0):

# note: impute_strategy must support both string and numeric data
super().__init__(parameters=parameters,
objective=objective)
objective=objective,
random_state=random_state)
5 changes: 3 additions & 2 deletions evalml/pipelines/classification/xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class XGBoostPipeline(PipelineBase):
component_graph = ['One Hot Encoder', 'Simple Imputer', 'RF Classifier Select From Model', 'XGBoost Classifier']
supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

def __init__(self, parameters, objective):
def __init__(self, parameters, objective, random_state=0):
super().__init__(parameters=parameters,
objective=objective)
objective=objective,
random_state=random_state)
4 changes: 2 additions & 2 deletions evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from abc import ABC, abstractmethod

from evalml.exceptions import MethodPropertyNotFoundError
from evalml.utils import Logger
from evalml.utils import Logger, get_random_state

logger = Logger()


class ComponentBase(ABC):
def __init__(self, parameters, component_obj, random_state):
self.random_state = random_state
self.random_state = get_random_state(random_state)
self._component_obj = component_obj
self.parameters = parameters

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import import_or_raise
from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise


class CatBoostClassifier(Estimator):
Expand All @@ -26,6 +26,7 @@ class CatBoostClassifier(Estimator):
supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0):
random_seed = get_random_seed(random_state, 0, SEED_BOUNDS.max_bound)
parameters = {"n_estimators": n_estimators,
"eta": eta,
"max_depth": max_depth}
Expand All @@ -36,6 +37,7 @@ def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None
catboost = import_or_raise("catboost", error_msg=cb_error_msg)
self._label_encoder = None
cb_classifier = catboost.CatBoostClassifier(**parameters,
random_seed=random_seed,
silent=True,
allow_writing_files=False)
super().__init__(parameters=parameters,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import import_or_raise
from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise


class XGBoostClassifier(Estimator):
Expand All @@ -19,13 +19,14 @@ class XGBoostClassifier(Estimator):
supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

def __init__(self, eta=0.1, max_depth=3, min_child_weight=1, n_estimators=100, random_state=0):
random_seed = get_random_seed(random_state, SEED_BOUNDS.min_bound, SEED_BOUNDS.max_bound)
parameters = {"eta": eta,
"max_depth": max_depth,
"min_child_weight": min_child_weight,
"n_estimators": n_estimators}
xgb_error_msg = "XGBoost is not installed. Please install using `pip install xgboost.`"
xgb = import_or_raise("xgboost", error_msg=xgb_error_msg)
xgb_classifier = xgb.XGBClassifier(random_state=random_state,
xgb_classifier = xgb.XGBClassifier(random_state=random_seed,
eta=eta,
max_depth=max_depth,
n_estimators=n_estimators,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import import_or_raise
from evalml.utils import SEED_BOUNDS, get_random_seed, import_or_raise


class CatBoostRegressor(Estimator):
Expand All @@ -23,6 +23,7 @@ class CatBoostRegressor(Estimator):
supported_problem_types = [ProblemTypes.REGRESSION]

def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None, random_state=0):
random_seed = get_random_seed(random_state, 0, SEED_BOUNDS.max_bound)
parameters = {"n_estimators": n_estimators,
"eta": eta,
"max_depth": max_depth}
Expand All @@ -32,7 +33,7 @@ def __init__(self, n_estimators=1000, eta=0.03, max_depth=6, bootstrap_type=None
cb_error_msg = "catboost is not installed. Please install using `pip install catboost.`"
catboost = import_or_raise("catboost", error_msg=cb_error_msg)
cb_regressor = catboost.CatBoostRegressor(**parameters,
random_state=random_state,
random_seed=random_seed,
silent=True,
allow_writing_files=False)
super().__init__(parameters=parameters,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class LinearRegressor(Estimator):
model_family = ModelFamily.LINEAR_MODEL
supported_problem_types = [ProblemTypes.REGRESSION]

def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1):
def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1, random_state=0):
parameters = {
'fit_intercept': fit_intercept,
'normalize': normalize
Expand All @@ -25,7 +25,7 @@ def __init__(self, fit_intercept=True, normalize=False, n_jobs=-1):
n_jobs=n_jobs)
super().__init__(parameters=parameters,
component_obj=linear_regressor,
random_state=0)
random_state=random_state)

@property
def feature_importances(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class SimpleImputer(Transformer):
name = 'Simple Imputer'
hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}

def __init__(self, impute_strategy="most_frequent", fill_value=None):
def __init__(self, impute_strategy="most_frequent", fill_value=None, random_state=0):
"""Initalizes an transformer that imputes missing data according to the specified imputation strategy."
Arguments:
Expand All @@ -24,7 +24,7 @@ def __init__(self, impute_strategy="most_frequent", fill_value=None):
fill_value=fill_value)
super().__init__(parameters=parameters,
component_obj=imputer,
random_state=0)
random_state=random_state)

def transform(self, X, y=None):
"""Transforms data X by imputing missing values
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ class StandardScaler(Transformer):
name = "Standard Scaler"
hyperparameter_ranges = {}

def __init__(self):
def __init__(self, random_state=0):
parameters = {}
scaler = SkScaler()
super().__init__(parameters=parameters,
component_obj=scaler,
random_state=0)
random_state=random_state)
2 changes: 0 additions & 2 deletions evalml/pipelines/components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ def _components_dict():
if params.defaults:
if len(params.args) - 1 == len(params.defaults):
components[obj.name] = obj
elif len(params.args) == 1:
components[obj.name] = obj
return components


Expand Down
8 changes: 5 additions & 3 deletions evalml/pipelines/pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from evalml.exceptions import IllFormattedClassNameError
from evalml.objectives import get_objective
from evalml.problem_types import handle_problem_types
from evalml.utils import Logger, classproperty
from evalml.utils import Logger, classproperty, get_random_state

logger = Logger()

Expand Down Expand Up @@ -45,7 +45,7 @@ def supported_problem_types(cls):

custom_hyperparameters = None

def __init__(self, parameters, objective):
def __init__(self, parameters, objective, random_state=0):
"""Machine learning pipeline made out of transformers and a estimator.
Required Class Variables:
Expand All @@ -58,7 +58,9 @@ def __init__(self, parameters, objective):
parameters (dict): dictionary with component names as keys and dictionary of that component's parameters as values.
An empty dictionary {} implies using all default values for component parameters.
random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
"""
self.random_state = get_random_state(random_state)
self.component_graph = [self._instantiate_component(c, parameters) for c in self.component_graph]
self.supported_problem_types = [handle_problem_types(problem_type) for problem_type in self.supported_problem_types]
self.objective = get_objective(objective)
Expand Down Expand Up @@ -125,7 +127,7 @@ def _instantiate_component(self, component, parameters):
component_name = component.name
try:
component_parameters = parameters.get(component_name, {})
new_component = component_class(**component_parameters)
new_component = component_class(**component_parameters, random_state=self.random_state)
except (ValueError, TypeError) as e:
err = "Error received when instantiating component {} with the following arguments {}".format(component_name, component_parameters)
raise ValueError(err) from e
Expand Down
5 changes: 3 additions & 2 deletions evalml/pipelines/regression/catboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class CatBoostRegressionPipeline(PipelineBase):
"impute_strategy": ["most_frequent"],
}

def __init__(self, parameters, objective):
def __init__(self, parameters, objective, random_state=0):
super().__init__(parameters=parameters,
objective=objective)
objective=objective,
random_state=random_state)
4 changes: 2 additions & 2 deletions evalml/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ def load_data(path, index, label, n_rows=None, drop=None, verbose=True, **kwargs
def split_data(X, y, regression=False, test_size=.2, random_state=None):
"""Splits data into train and test sets.
Args:
Arguments:
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]
y (pd.Series) : labels of length [n_samples]
regression (bool): if true, do not use stratified split
test_size (float) : percent of train set to holdout for testing
random_state (int) : seed for the random number generator
random_state (int, np.random.RandomState) : seed for the random number generator
Returns:
pd.DataFrame, pd.DataFrame, pd.Series, pd.Series : features and labels each split into train and test sets
Expand Down
16 changes: 16 additions & 0 deletions evalml/tests/automl_tests/test_auto_classification_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ def test_init(X_y):
automl.describe_pipeline(0)


def test_get_pipeline_none(X_y):
    """Check that describing a pipeline on a fresh search object raises.

    A newly constructed ``AutoClassificationSearch`` on which ``search`` has
    not been called is expected to raise ``RuntimeError`` with a message
    matching "Pipeline not found" when asked to describe pipeline id 0.

    Arguments:
        X_y: pytest fixture; kept in the signature for compatibility, but the
            data is not needed because ``search`` is never invoked.
    """
    automl = AutoClassificationSearch()
    with pytest.raises(RuntimeError, match="Pipeline not found"):
        automl.describe_pipeline(0)


def test_cv(X_y):
X, y = X_y
cv_folds = 5
Expand Down Expand Up @@ -386,3 +394,11 @@ def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display, X_y):
assert y.is_monotonic_increasing
assert len(x) == 3
assert len(y) == 3


def test_max_time(X_y):
    """Check that a near-zero ``max_time`` still yields one pipeline result.

    Arguments:
        X_y: pytest fixture providing the ``(X, y)`` training data pair.
    """
    features, target = X_y
    automl = AutoClassificationSearch(max_time=1e-16)
    automl.search(features, target)
    # search will always run at least one pipeline
    pipeline_results = automl.results['pipeline_results']
    assert len(pipeline_results) == 1
Loading

0 comments on commit 9bafdd2

Please sign in to comment.