Skip to content

Commit 5cc3aa5

Browse files
committed
Add forecasting length increasing for fit
1 parent c12c571 commit 5cc3aa5

File tree

3 files changed

+31
-16
lines changed

3 files changed

+31
-16
lines changed

fedot/api/api_utils/api_params_repository.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from golem.core.optimisers.genetic.operators.mutation import MutationTypesEnum
77

88
from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation
9-
from fedot.core.constants import AUTO_PRESET_NAME
9+
from fedot.core.constants import AUTO_PRESET_NAME, DEFAULT_CV_FOLDS_BY_TASK
1010
from fedot.core.repository.tasks import TaskTypesEnum
1111
from fedot.core.utils import default_fedot_data_dir
1212

@@ -32,12 +32,6 @@ def __init__(self, task_type: TaskTypesEnum):
3232
@staticmethod
3333
def default_params_for_task(task_type: TaskTypesEnum) -> dict:
3434
""" Returns a dict with default parameters"""
35-
if task_type in [TaskTypesEnum.classification, TaskTypesEnum.regression]:
36-
cv_folds = 5
37-
38-
elif task_type == TaskTypesEnum.ts_forecasting:
39-
cv_folds = 3
40-
4135
# Dict with allowed keyword attributes for Api and their default values. If None - default value set
4236
# in dataclasses ``PipelineComposerRequirements``, ``GPAlgorithmParameters``, ``GraphGenerationParams``
4337
# will be used.
@@ -51,7 +45,7 @@ def default_params_for_task(task_type: TaskTypesEnum) -> dict:
5145
keep_n_best=1,
5246
available_operations=None,
5347
metric=None,
54-
cv_folds=cv_folds,
48+
cv_folds=DEFAULT_CV_FOLDS_BY_TASK[task_type],
5549
genetic_scheme=None,
5650
early_stopping_iterations=None,
5751
early_stopping_timeout=10,

fedot/core/constants.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,12 @@
1515

1616
FRACTION_OF_UNIQUE_VALUES = 0.95
1717

18-
default_data_split_ratio_by_task = {
18+
DEFAULT_DATA_SPLIT_RATIO_BY_TASK = {
1919
TaskTypesEnum.classification: 0.8,
2020
TaskTypesEnum.regression: 0.8,
2121
TaskTypesEnum.ts_forecasting: 0.5
2222
}
23+
24+
DEFAULT_CV_FOLDS_BY_TASK = {TaskTypesEnum.classification: 5,
25+
TaskTypesEnum.regression: 5,
26+
TaskTypesEnum.ts_forecasting: 3}

fedot/core/optimisers/objective/data_source_splitter.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from functools import partial
22
from typing import Optional
3+
from copy import deepcopy
34

45
from golem.core.log import default_log
56

6-
from fedot.core.constants import default_data_split_ratio_by_task
7+
from fedot.core.constants import DEFAULT_DATA_SPLIT_RATIO_BY_TASK, DEFAULT_CV_FOLDS_BY_TASK
78
from fedot.core.data.data import InputData
89
from fedot.core.data.data_split import train_test_data_setup
910
from fedot.core.data.multi_modal import MultiModalData
@@ -31,7 +32,7 @@ def __init__(self,
3132
cv_folds: Optional[int] = None,
3233
validation_blocks: Optional[int] = None,
3334
split_ratio: Optional[float] = None,
34-
shuffle: bool = False):
35+
shuffle: bool = False,):
3536
self.cv_folds = cv_folds
3637
self.validation_blocks = validation_blocks
3738
self.split_ratio = split_ratio
@@ -45,13 +46,21 @@ def build(self, data: InputData) -> DataSource:
4546
data.shuffle()
4647

4748
# Check split_ratio
48-
split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type]
49+
split_ratio = self.split_ratio or DEFAULT_DATA_SPLIT_RATIO_BY_TASK[data.task.task_type]
4950
if not (0 < split_ratio < 1):
5051
raise ValueError(f'split_ratio is {split_ratio} but should be between 0 and 1')
5152

52-
# Calculate the number of validation blocks
53-
if self.validation_blocks is None and data.task.task_type is TaskTypesEnum.ts_forecasting:
54-
self._propose_cv_folds_and_validation_blocks(data, split_ratio)
53+
# Calculate the number of validation blocks and number of cv folds for ts forecasting
54+
if data.task.task_type is TaskTypesEnum.ts_forecasting:
55+
if self.validation_blocks is None:
56+
self._propose_cv_folds_and_validation_blocks(data, split_ratio)
57+
# when the forecasting length is low and the data length is high, there is a huge number of validation blocks
58+
# some models refit at each step of forecasting, which may be time-consuming
59+
# the solution is to set the forecasting length to a higher value and reduce the validation blocks count
60+
# without reducing the validation data length, which is equal to forecast_length * validation_blocks
61+
max_validation_blocks = DEFAULT_CV_FOLDS_BY_TASK[data.task.task_type] if self.cv_folds is None else 1
62+
if self.validation_blocks > max_validation_blocks:
63+
data = self._propose_forecast_length(data, max_validation_blocks)
5564

5665
# Split data
5766
if self.cv_folds is not None:
@@ -73,7 +82,7 @@ def _build_holdout_producer(self, data: InputData) -> DataSource:
7382
that always returns same data split. Equivalent to 1-fold validation.
7483
"""
7584

76-
split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type]
85+
split_ratio = self.split_ratio or DEFAULT_DATA_SPLIT_RATIO_BY_TASK[data.task.task_type]
7786
train_data, test_data = train_test_data_setup(data, split_ratio, validation_blocks=self.validation_blocks)
7887

7988
if RemoteEvaluator().is_enabled:
@@ -129,3 +138,11 @@ def _propose_cv_folds_and_validation_blocks(self, data, split_ratio):
129138
else:
130139
test_share = 1 / (self.cv_folds + 1)
131140
self.validation_blocks = int(data_shape * test_share // forecast_length)
141+
142+
def _propose_forecast_length(self, data, max_validation_blocks):
143+
horizon = self.validation_blocks * data.task.task_params.forecast_length
144+
self.validation_blocks = max_validation_blocks
145+
# TODO: make a copy without copying all the data, copying only the task
146+
data = deepcopy(data)
147+
data.task.task_params.forecast_length = int(horizon // self.validation_blocks)
148+
return data

0 commit comments

Comments
 (0)