diff --git a/fedot/api/api_utils/api_params_repository.py b/fedot/api/api_utils/api_params_repository.py
index ed90d03d8b..68f8f9dc0f 100644
--- a/fedot/api/api_utils/api_params_repository.py
+++ b/fedot/api/api_utils/api_params_repository.py
@@ -6,7 +6,7 @@
 from golem.core.optimisers.genetic.operators.mutation import MutationTypesEnum
 
 from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation
-from fedot.core.constants import AUTO_PRESET_NAME
+from fedot.core.constants import AUTO_PRESET_NAME, DEFAULT_CV_FOLDS_BY_TASK
 from fedot.core.repository.tasks import TaskTypesEnum
 from fedot.core.utils import default_fedot_data_dir
 
@@ -32,12 +32,6 @@ def __init__(self, task_type: TaskTypesEnum):
     @staticmethod
     def default_params_for_task(task_type: TaskTypesEnum) -> dict:
         """ Returns a dict with default parameters"""
-        if task_type in [TaskTypesEnum.classification, TaskTypesEnum.regression]:
-            cv_folds = 5
-
-        elif task_type == TaskTypesEnum.ts_forecasting:
-            cv_folds = 3
-
         # Dict with allowed keyword attributes for Api and their default values. If None - default value set
         # in dataclasses ``PipelineComposerRequirements``, ``GPAlgorithmParameters``, ``GraphGenerationParams``
         # will be used.
@@ -51,7 +45,7 @@ def default_params_for_task(task_type: TaskTypesEnum) -> dict:
             keep_n_best=1,
             available_operations=None,
             metric=None,
-            cv_folds=cv_folds,
+            cv_folds=DEFAULT_CV_FOLDS_BY_TASK[task_type],
             genetic_scheme=None,
             early_stopping_iterations=None,
             early_stopping_timeout=10,
diff --git a/fedot/core/constants.py b/fedot/core/constants.py
index c827df30d4..030ec18f7b 100644
--- a/fedot/core/constants.py
+++ b/fedot/core/constants.py
@@ -15,8 +15,12 @@
 FRACTION_OF_UNIQUE_VALUES = 0.95
 
-default_data_split_ratio_by_task = {
+DEFAULT_DATA_SPLIT_RATIO_BY_TASK = {
     TaskTypesEnum.classification: 0.8,
     TaskTypesEnum.regression: 0.8,
     TaskTypesEnum.ts_forecasting: 0.5
 }
+
+DEFAULT_CV_FOLDS_BY_TASK = {TaskTypesEnum.classification: 5,
+                            TaskTypesEnum.regression: 5,
+                            TaskTypesEnum.ts_forecasting: 3}
 
diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py
index 756c2151d8..c8b3799fca 100644
--- a/fedot/core/optimisers/objective/data_source_splitter.py
+++ b/fedot/core/optimisers/objective/data_source_splitter.py
@@ -1,9 +1,10 @@
 from functools import partial
 from typing import Optional
+from copy import deepcopy
 
 from golem.core.log import default_log
 
-from fedot.core.constants import default_data_split_ratio_by_task
+from fedot.core.constants import DEFAULT_DATA_SPLIT_RATIO_BY_TASK, DEFAULT_CV_FOLDS_BY_TASK
 from fedot.core.data.data import InputData
 from fedot.core.data.data_split import train_test_data_setup
 from fedot.core.data.multi_modal import MultiModalData
@@ -31,7 +32,7 @@ def __init__(self,
                  cv_folds: Optional[int] = None,
                  validation_blocks: Optional[int] = None,
                  split_ratio: Optional[float] = None,
-                 shuffle: bool = False):
+                 shuffle: bool = False,):
         self.cv_folds = cv_folds
         self.validation_blocks = validation_blocks
         self.split_ratio = split_ratio
@@ -45,13 +46,21 @@ def build(self, data: InputData) -> DataSource:
             data.shuffle()
 
         # Check split_ratio
-        split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type]
+        split_ratio = self.split_ratio or DEFAULT_DATA_SPLIT_RATIO_BY_TASK[data.task.task_type]
         if not (0 < split_ratio < 1):
             raise ValueError(f'split_ratio is {split_ratio} but should be between 0 and 1')
 
-        # Calculate the number of validation blocks
-        if self.validation_blocks is None and data.task.task_type is TaskTypesEnum.ts_forecasting:
-            self._propose_cv_folds_and_validation_blocks(data, split_ratio)
+        # Calculate the number of validation blocks and the number of cv folds for ts forecasting
+        if data.task.task_type is TaskTypesEnum.ts_forecasting:
+            if self.validation_blocks is None:
+                self._propose_cv_folds_and_validation_blocks(data, split_ratio)
+            # When the forecast length is small and the data is long, there can be a huge number of
+            # validation blocks. Some models refit at each step of forecasting, which may be time-consuming.
+            # The solution is to raise the forecast length and reduce the number of validation blocks
+            # without reducing the validation data length, which equals forecast_length * validation_blocks.
+            max_validation_blocks = DEFAULT_CV_FOLDS_BY_TASK[data.task.task_type] if self.cv_folds is None else 1
+            if self.validation_blocks > max_validation_blocks:
+                data = self._propose_forecast_length(data, max_validation_blocks)
 
         # Split data
         if self.cv_folds is not None:
@@ -73,7 +82,7 @@ def _build_holdout_producer(self, data: InputData) -> DataSource:
         that always returns same data split. Equivalent to 1-fold validation.
         """
 
-        split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type]
+        split_ratio = self.split_ratio or DEFAULT_DATA_SPLIT_RATIO_BY_TASK[data.task.task_type]
         train_data, test_data = train_test_data_setup(data, split_ratio, validation_blocks=self.validation_blocks)
 
         if RemoteEvaluator().is_enabled:
@@ -129,3 +138,11 @@ def _propose_cv_folds_and_validation_blocks(self, data, split_ratio):
         else:
             test_share = 1 / (self.cv_folds + 1)
         self.validation_blocks = int(data_shape * test_share // forecast_length)
+
+    def _propose_forecast_length(self, data, max_validation_blocks):
+        horizon = self.validation_blocks * data.task.task_params.forecast_length
+        self.validation_blocks = max_validation_blocks
+        # TODO: make a copy without copying all the data; copy only the task
+        data = deepcopy(data)
+        data.task.task_params.forecast_length = int(horizon // self.validation_blocks)
+        return data
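Review note: the `data_source_splitter.py` change keeps the total validation horizon fixed at `forecast_length * validation_blocks` while capping the number of blocks (at the default cv-folds count for holdout, or one block per fold under CV). Below is a minimal standalone sketch of that arithmetic; the `TsParams` dataclass and `propose_forecast_length` helper are illustrative stand-ins for FEDOT's `TsForecastingParams` and the patched `_propose_forecast_length` method, not part of the codebase.

```python
from copy import deepcopy
from dataclasses import dataclass


@dataclass
class TsParams:
    """Illustrative stand-in for FEDOT's TsForecastingParams."""
    forecast_length: int


def propose_forecast_length(params: TsParams, validation_blocks: int,
                            max_validation_blocks: int):
    # The total validation horizon stays (roughly) fixed:
    # horizon = forecast_length * validation_blocks.
    horizon = validation_blocks * params.forecast_length
    validation_blocks = max_validation_blocks
    # Copy so the caller's task parameters are not mutated,
    # mirroring the deepcopy in the patch.
    new_params = deepcopy(params)
    new_params.forecast_length = horizon // validation_blocks
    return new_params, validation_blocks


# A short forecast on a long series: 50 blocks of length 2 (horizon = 100).
# Capping at 5 blocks raises forecast_length to 20; the horizon is still 100.
params, blocks = propose_forecast_length(TsParams(forecast_length=2),
                                         validation_blocks=50,
                                         max_validation_blocks=5)
assert (params.forecast_length, blocks) == (20, 5)
```

When the horizon is not divisible by the block cap, the integer division shrinks the effective horizon slightly, matching `int(horizon // self.validation_blocks)` in the patch.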