Skip to content

Commit

Permalink
Add forecasting length increasing for fit
Browse files Browse the repository at this point in the history
  • Loading branch information
kasyanovse committed Aug 2, 2023
1 parent c12c571 commit 5cc3aa5
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 16 deletions.
10 changes: 2 additions & 8 deletions fedot/api/api_utils/api_params_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from golem.core.optimisers.genetic.operators.mutation import MutationTypesEnum

from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation
from fedot.core.constants import AUTO_PRESET_NAME
from fedot.core.constants import AUTO_PRESET_NAME, DEFAULT_CV_FOLDS_BY_TASK
from fedot.core.repository.tasks import TaskTypesEnum
from fedot.core.utils import default_fedot_data_dir

Expand All @@ -32,12 +32,6 @@ def __init__(self, task_type: TaskTypesEnum):
@staticmethod
def default_params_for_task(task_type: TaskTypesEnum) -> dict:
""" Returns a dict with default parameters"""
if task_type in [TaskTypesEnum.classification, TaskTypesEnum.regression]:
cv_folds = 5

elif task_type == TaskTypesEnum.ts_forecasting:
cv_folds = 3

# Dict with allowed keyword attributes for Api and their default values. If None - default value set
# in dataclasses ``PipelineComposerRequirements``, ``GPAlgorithmParameters``, ``GraphGenerationParams``
# will be used.
Expand All @@ -51,7 +45,7 @@ def default_params_for_task(task_type: TaskTypesEnum) -> dict:
keep_n_best=1,
available_operations=None,
metric=None,
cv_folds=cv_folds,
cv_folds=DEFAULT_CV_FOLDS_BY_TASK[task_type],
genetic_scheme=None,
early_stopping_iterations=None,
early_stopping_timeout=10,
Expand Down
6 changes: 5 additions & 1 deletion fedot/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@

FRACTION_OF_UNIQUE_VALUES = 0.95

default_data_split_ratio_by_task = {
DEFAULT_DATA_SPLIT_RATIO_BY_TASK = {
TaskTypesEnum.classification: 0.8,
TaskTypesEnum.regression: 0.8,
TaskTypesEnum.ts_forecasting: 0.5
}

# Default number of cross-validation folds per task type; time-series
# forecasting uses fewer folds because each fold consumes a full horizon.
DEFAULT_CV_FOLDS_BY_TASK = {
    TaskTypesEnum.classification: 5,
    TaskTypesEnum.regression: 5,
    TaskTypesEnum.ts_forecasting: 3,
}
31 changes: 24 additions & 7 deletions fedot/core/optimisers/objective/data_source_splitter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from functools import partial
from typing import Optional
from copy import deepcopy

from golem.core.log import default_log

from fedot.core.constants import default_data_split_ratio_by_task
from fedot.core.constants import DEFAULT_DATA_SPLIT_RATIO_BY_TASK, DEFAULT_CV_FOLDS_BY_TASK
from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup
from fedot.core.data.multi_modal import MultiModalData
Expand Down Expand Up @@ -31,7 +32,7 @@ def __init__(self,
cv_folds: Optional[int] = None,
validation_blocks: Optional[int] = None,
split_ratio: Optional[float] = None,
shuffle: bool = False):
shuffle: bool = False,):
self.cv_folds = cv_folds
self.validation_blocks = validation_blocks
self.split_ratio = split_ratio
Expand All @@ -45,13 +46,21 @@ def build(self, data: InputData) -> DataSource:
data.shuffle()

# Check split_ratio
split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type]
split_ratio = self.split_ratio or DEFAULT_DATA_SPLIT_RATIO_BY_TASK[data.task.task_type]
if not (0 < split_ratio < 1):
raise ValueError(f'split_ratio is {split_ratio} but should be between 0 and 1')

# Calculate the number of validation blocks
if self.validation_blocks is None and data.task.task_type is TaskTypesEnum.ts_forecasting:
self._propose_cv_folds_and_validation_blocks(data, split_ratio)
# Calculate the number of validation blocks and number of cv folds for ts forecasting
if data.task.task_type is TaskTypesEnum.ts_forecasting:
if self.validation_blocks is None:
self._propose_cv_folds_and_validation_blocks(data, split_ratio)
# When the forecast length is small and the data length is large, the number of
# validation blocks becomes huge. Some models are refitted at every forecasting
# step, which may be very time-consuming. The solution is to increase the forecast
# length and reduce the validation block count without shrinking the total
# validation data length, which equals forecast_length * validation_blocks.
max_validation_blocks = DEFAULT_CV_FOLDS_BY_TASK[data.task.task_type] if self.cv_folds is None else 1
if self.validation_blocks > max_validation_blocks:
data = self._propose_forecast_length(data, max_validation_blocks)

# Split data
if self.cv_folds is not None:
Expand All @@ -73,7 +82,7 @@ def _build_holdout_producer(self, data: InputData) -> DataSource:
that always returns same data split. Equivalent to 1-fold validation.
"""

split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type]
split_ratio = self.split_ratio or DEFAULT_DATA_SPLIT_RATIO_BY_TASK[data.task.task_type]
train_data, test_data = train_test_data_setup(data, split_ratio, validation_blocks=self.validation_blocks)

if RemoteEvaluator().is_enabled:
Expand Down Expand Up @@ -129,3 +138,11 @@ def _propose_cv_folds_and_validation_blocks(self, data, split_ratio):
else:
test_share = 1 / (self.cv_folds + 1)
self.validation_blocks = int(data_shape * test_share // forecast_length)

def _propose_forecast_length(self, data, max_validation_blocks):
horizon = self.validation_blocks * data.task.task_params.forecast_length
self.validation_blocks = max_validation_blocks
# TODO: make copy without copy all data, only with task copy
data = deepcopy(data)
data.task.task_params.forecast_length = int(horizon // self.validation_blocks)
return data

0 comments on commit 5cc3aa5

Please sign in to comment.