fix pos_label #1131

Merged · 10 commits · Aug 14, 2023
5 changes: 4 additions & 1 deletion fedot/api/api_utils/api_params_repository.py
@@ -5,7 +5,8 @@
 from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum
 from golem.core.optimisers.genetic.operators.mutation import MutationTypesEnum

-from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation
+from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation, \
+    add_resample_mutation
 from fedot.core.constants import AUTO_PRESET_NAME
 from fedot.core.repository.tasks import TaskTypesEnum
 from fedot.core.utils import default_fedot_data_dir
@@ -131,5 +132,7 @@ def _get_default_mutations(task_type: TaskTypesEnum, params) -> Sequence[MutationTypesEnum]:
     # TODO remove workaround after boosting mutation fix
     if task_type == TaskTypesEnum.ts_forecasting:
         mutations.append(partial(boosting_mutation, params=params))
+    else:
+        mutations.append(add_resample_mutation)

     return mutations
18 changes: 18 additions & 0 deletions fedot/core/composer/gp_composer/specific_operators.py
@@ -90,6 +90,24 @@ def boosting_mutation(pipeline: Pipeline, requirements, graph_gen_params, **kwargs):
     return pipeline


+def add_resample_mutation(pipeline: Pipeline, **kwargs):
+    """
+    Add resample operation before all primary operations in pipeline
+
+    :param pipeline: pipeline to insert resample
+
+    :return: mutated pipeline
+    """
+    resample_node = PipelineNode('resample')
+
+    p_nodes = [p_node for p_node in pipeline.primary_nodes]
+    pipeline.add_node(resample_node)
+
+    for node in p_nodes:
+        pipeline.connect_nodes(resample_node, node)
+    return pipeline


def choose_new_model(boosting_model_candidates: List[str]) -> str:
    """ Since 'linear' and 'dtreg' operations are suitable for solving the problem
    and they are simpler than others, they are preferred """
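Reviewer note: for anyone trying the new mutation in isolation, a minimal usage sketch, assuming add_resample_mutation is importable from specific_operators as merged here and using FEDOT's public PipelineNode/Pipeline API (the two-node pipeline is illustrative, not from this PR):

    from fedot.core.composer.gp_composer.specific_operators import add_resample_mutation
    from fedot.core.pipelines.node import PipelineNode
    from fedot.core.pipelines.pipeline import Pipeline

    # A toy scaling -> logit pipeline; 'scaling' is its only primary node.
    scaling = PipelineNode('scaling')
    logit = PipelineNode('logit', nodes_from=[scaling])
    pipeline = Pipeline(logit)

    mutated = add_resample_mutation(pipeline)
    # 'resample' should now precede the former primary node:
    # resample -> scaling -> logit
    print([node.name for node in mutated.nodes])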
18 changes: 15 additions & 3 deletions fedot/core/composer/metrics.py
@@ -88,7 +88,7 @@ def get_value(cls, pipeline: 'Pipeline', reference_data: InputData,
                                   save_path=Path(save_path, 'forecast.png'))

         except Exception as ex:
-            pipeline.log.info(f'Metric can not be evaluated because of: {ex}')
+            pipeline.log.info(f'Metric can not be evaluated because of: {ex}', raise_if_test=True)

         return metric

@@ -216,7 +216,10 @@ def metric(reference: InputData, predicted: OutputData) -> float:
         if n_classes > 2:
             additional_params = {'average': F1.multiclass_averaging_mode}
         else:
-            additional_params = {'average': F1.binary_averaging_mode}
+            u, count = np.unique(np.ravel(reference.target), return_counts=True)
+            count_sort_ind = np.argsort(count)
+            pos_label = u[count_sort_ind[0]].item()
+            additional_params = {'average': F1.binary_averaging_mode, 'pos_label': pos_label}
         return f1_score(y_true=reference.target, y_pred=predicted.predict,
                         **additional_params)

@@ -271,7 +274,16 @@ class Precision(QualityMetric):
     @staticmethod
     @from_maximised_metric
     def metric(reference: InputData, predicted: OutputData) -> float:
-        return precision_score(y_true=reference.target, y_pred=predicted.predict)
+        n_classes = reference.num_classes
+        if n_classes > 2:
+            return precision_score(y_true=reference.target, y_pred=predicted.predict)
+        else:
+            u, count = np.unique(np.ravel(reference.target), return_counts=True)
+            count_sort_ind = np.argsort(count)
+            pos_label = u[count_sort_ind[0]].item()
+            additional_params = {'pos_label': pos_label}
+            return precision_score(y_true=reference.target, y_pred=predicted.predict,
+                                   **additional_params)


class Logloss(QualityMetric):
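Reviewer note: this is the core of the fix. scikit-learn's default pos_label=1 raises for string targets such as 'a'/'b', so the diff takes the minority class as the positive label. A standalone sketch of that selection, using only NumPy and scikit-learn (the toy labels are illustrative):

    import numpy as np
    from sklearn.metrics import f1_score

    y_true = np.array(['a', 'a', 'a', 'b', 'b'])  # 'b' is the minority class
    y_pred = np.array(['a', 'a', 'b', 'b', 'b'])

    # Same logic as in the diff: order classes by frequency,
    # take the rarest one as the positive label.
    u, count = np.unique(np.ravel(y_true), return_counts=True)
    pos_label = u[np.argsort(count)[0]].item()  # -> 'b'

    print(f1_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label))  # 0.8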
3 changes: 2 additions & 1 deletion fedot/core/operations/operation.py
@@ -119,6 +119,7 @@ def predict_for_fit(self, fitted_operation, data: InputData, params: Optional[OperationParameters] = None,
     def _predict(self, fitted_operation, data: InputData, params: Optional[OperationParameters] = None,
                  output_mode: str = 'default', is_fit_stage: bool = False):

+
         is_main_target = data.supplementary_data.is_main_target
         data_flow_length = data.supplementary_data.data_flow_length
         self._init(data.task, output_mode=output_mode, params=params, n_samples_data=data.features.shape[0])
@@ -133,11 +134,11 @@ def _predict(self, fitted_operation, data: InputData, params: Optional[OperationParameters] = None,
                              predict_data=data)
         prediction = self.assign_tabular_column_types(prediction, output_mode)

+        # any inplace operations here are dangerous!
         if is_main_target is False:
             prediction.supplementary_data.is_main_target = is_main_target

         prediction.supplementary_data.data_flow_length = data_flow_length
         prediction.supplementary_data.obligatorily_preprocessed = True
         return prediction

     @staticmethod
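Reviewer note: the new comment flags a real hazard: if prediction reuses the input's supplementary_data object instead of a copy, an in-place update leaks back into the input. A generic illustration of that failure mode (plain Python dataclass, not FEDOT's actual types):

    from dataclasses import dataclass

    @dataclass
    class SupplementaryData:
        is_main_target: bool = True

    input_meta = SupplementaryData()
    output_meta = input_meta  # shared object instead of a copy

    output_meta.is_main_target = False  # "in-place" update on the output...
    print(input_meta.is_main_target)    # False: the input was mutated too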
17 changes: 15 additions & 2 deletions fedot/core/pipelines/pipeline.py
@@ -1,7 +1,7 @@
 from copy import deepcopy
 from datetime import timedelta
 from os import PathLike
-from typing import Optional, Tuple, Union, Sequence, Dict
+from typing import Optional, Tuple, Union, Sequence, List, Dict

 import func_timeout
 from golem.core.dag.graph import Graph
@@ -307,7 +307,7 @@ def load(self, source: Union[str, dict], dict_fitted_operations: Optional[dict]
             dict_fitted_operations: dictionary of the fitted operations
         """

-        self.nodes = []
+        self.nodes: Optional[List[PipelineNode]] = []
         template = PipelineTemplate(self)
         template.import_pipeline(source, dict_fitted_operations)
         return self
@@ -327,6 +327,19 @@ def root_node(self) -> Optional[PipelineNode]:
             raise ValueError(f'{ERROR_PREFIX} More than 1 root_nodes in pipeline')
         return root[0]

+    @property
+    def primary_nodes(self) -> List[PipelineNode]:
+        """Finds pipeline's primary nodes
+
+        Returns:
+            list of primary nodes
+        """
+        if not self.nodes:
+            return []
+        primary_nodes = [node for node in self.nodes
+                         if node.is_primary]
+        return primary_nodes
+
     def pipeline_for_side_task(self, task_type: TaskTypesEnum) -> 'Pipeline':
         """Returns pipeline formed from the last node solving the given problem and all its parents
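Reviewer note: a quick sketch of what the new property returns, assuming FEDOT's standard node API (toy pipeline for illustration; note the guard makes an empty pipeline yield []):

    from fedot.core.pipelines.node import PipelineNode
    from fedot.core.pipelines.pipeline import Pipeline

    scaling = PipelineNode('scaling')
    logit = PipelineNode('logit', nodes_from=[scaling])
    pipeline = Pipeline(logit)

    # Only parentless nodes count as primary.
    print([node.name for node in pipeline.primary_nodes])  # ['scaling']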
9 changes: 6 additions & 3 deletions fedot/core/pipelines/random_pipeline_factory.py
@@ -1,3 +1,4 @@
+import random
 from copy import deepcopy
 from random import randint
 from typing import Optional
@@ -14,6 +15,7 @@

 class RandomPipelineFactory(RandomGraphFactory):
     """ Default realisation of random graph factory. Generates DAG graph using random growth. """
+    PROBABILITY_OF_GROWTH = 0.3

     def __init__(self,
                  verifier: GraphVerifier,
@@ -76,13 +78,14 @@ def graph_growth(graph: OptGraph,
     for offspring_node in range(offspring_size):
         height = distance_to_root_level(graph, node_parent)
         is_max_depth_exceeded = height >= max_depth - 2
-        is_primary_node_selected = height < max_depth - 1 and randint(0, 1)
-        if is_max_depth_exceeded or is_primary_node_selected:
+        is_primary_node_selected = height < max_depth - 1
+        is_growth_should_stopped = random.random() > RandomPipelineFactory.PROBABILITY_OF_GROWTH
+        if is_max_depth_exceeded or is_primary_node_selected or is_growth_should_stopped:
             primary_node = node_factory.get_node(is_primary=True)
             node_parent.nodes_from.append(primary_node)
             graph.add_node(primary_node)
         else:
             secondary_node = node_factory.get_node(is_primary=False)
             graph.add_node(secondary_node)
             node_parent.nodes_from.append(secondary_node)
-            graph_growth(graph, secondary_node, node_factory, requirements, max_depth)
+            graph_growth(graph, secondary_node, node_factory, requirements, max_depth)
5 changes: 3 additions & 2 deletions fedot/core/pipelines/verification.py
@@ -26,7 +26,7 @@
     has_no_data_flow_conflicts_in_ts_pipeline,
     has_primary_nodes,
     is_pipeline_contains_ts_operations,
-    only_non_lagged_operations_are_primary
+    only_non_lagged_operations_are_primary, has_correct_location_of_resample
 )
 from fedot.core.repository.tasks import TaskTypesEnum

@@ -40,7 +40,8 @@
              has_no_conflicts_with_data_flow,
              has_no_conflicts_in_decompose,
              has_correct_data_connections,
-             has_correct_data_sources]
+             has_correct_data_sources,
+             has_correct_location_of_resample]

ts_rules = [is_pipeline_contains_ts_operations,
            only_non_lagged_operations_are_primary,
26 changes: 25 additions & 1 deletion fedot/core/pipelines/verification_rules.py
@@ -12,7 +12,7 @@


 def has_correct_operations_for_task(pipeline: Pipeline, task_type: Optional[TaskTypesEnum] = None):
-    if task_type and not task_type in pipeline.root_node.operation.acceptable_task_types:
+    if task_type and task_type not in pipeline.root_node.operation.acceptable_task_types:
         raise ValueError(f'{ERROR_PREFIX} Pipeline has incorrect operations positions')
     return True

@@ -152,6 +152,30 @@ def has_no_data_flow_conflicts_in_ts_pipeline(pipeline: Pipeline):
     return True


+def has_correct_location_of_resample(pipeline: Pipeline):
+    """
+    Pipeline can have only one resample operation located in start of the pipeline
+
+    :param pipeline: pipeline for checking
+    """
+    is_resample_primary = False
+    is_not_resample_primary = False
+    for node in pipeline.nodes:
+        if node.is_primary:
+            if node.name == 'resample':
+                is_resample_primary = True
+            else:
+                is_not_resample_primary = True
+        else:
+            if node.name == 'resample':
+                raise ValueError(
+                    f'{ERROR_PREFIX} Pipeline can have only one resample operation located in start of the pipeline')
+    if is_resample_primary and is_not_resample_primary:
+        raise ValueError(
+            f'{ERROR_PREFIX} Pipeline can have only one resample operation located in start of the pipeline')
+    return True


def get_wrong_links(ts_to_table_operations: list, ts_data_operations: list, non_ts_data_operations: list,
                    ts_models: list, non_ts_models: list) -> dict:
    """
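Reviewer note: to see what the new rule permits and rejects, a minimal sketch assuming the node API used elsewhere in this PR (toy pipelines; not from the test suite):

    from fedot.core.pipelines.node import PipelineNode
    from fedot.core.pipelines.pipeline import Pipeline
    from fedot.core.pipelines.verification_rules import has_correct_location_of_resample

    # Accepted: resample is a primary node at the start of the data flow.
    ok = Pipeline(PipelineNode('logit', nodes_from=[PipelineNode('resample')]))
    assert has_correct_location_of_resample(ok)

    # Rejected: resample sits in the middle of the pipeline.
    inner = PipelineNode('resample', nodes_from=[PipelineNode('scaling')])
    bad = Pipeline(PipelineNode('logit', nodes_from=[inner]))
    try:
        has_correct_location_of_resample(bad)
    except ValueError as err:
        print(err)  # "... only one resample operation located in start ..."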
6 changes: 5 additions & 1 deletion fedot/core/repository/data/model_repository.json
@@ -165,7 +165,10 @@
       "meta": "sklearn_class",
       "presets": ["fast_train"],
       "tags": [
-        "bayesian", "non_multi", "linear"
+        "simple",
+        "bayesian",
+        "non_multi",
+        "linear"
       ]
     },
     "catboost": {
@@ -186,6 +189,7 @@
       "meta": "sklearn_class",
       "presets": ["fast_train", "*tree"],
       "tags": [
+        "simple",
         "tree",
         "interpretable",
         "non_linear"
14 changes: 14 additions & 0 deletions test/integration/models/test_model.py
@@ -98,6 +98,20 @@ def classification_dataset():
     threshold = 0.5
     classes = np.array([0.0 if val <= threshold else 1.0 for val in y])
     classes = np.expand_dims(classes, axis=1)
     data = InputData(features=MinMaxScaler().fit_transform(x), target=classes, idx=np.arange(0, len(x)),
                      task=Task(TaskTypesEnum.classification),
                      data_type=DataTypesEnum.table)
     return data
+
+
+def classification_dataset_with_str_labels():
+    samples = 1000
+    x = 10.0 * np.random.rand(samples, ) - 5.0
+    x = np.expand_dims(x, axis=1)
+    y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))
+    threshold = 0.5
+    classes = np.array(['a' if val <= threshold else 'b' for val in y])
+    classes = np.expand_dims(classes, axis=1)
+    data = InputData(features=MinMaxScaler().fit_transform(x), target=classes, idx=np.arange(0, len(x)),
+                     task=Task(TaskTypesEnum.classification),
+                     data_type=DataTypesEnum.table)
2 changes: 0 additions & 2 deletions test/unit/composer/test_quality_metrics.py
@@ -22,8 +22,6 @@
 @pytest.fixture()
 def data_setup():
     predictors, response = load_breast_cancer(return_X_y=True)
-    np.random.shuffle(predictors)
-    np.random.shuffle(response)
     response = response[:100]
     predictors = predictors[:100]

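Reviewer note on the deletion here (and in test_data.py below): shuffling predictors and response independently scrambles the X–y row correspondence, so the fixture was scoring metrics against mislabeled samples. A quick standalone illustration with synthetic arrays:

    import numpy as np

    X = np.arange(10).reshape(5, 2)
    y = np.arange(5)

    # Independent in-place shuffles permute rows of X and y differently,
    # so sample i in X no longer matches label i in y.
    np.random.shuffle(X)
    np.random.shuffle(y)

    # A correspondence-preserving alternative: one permutation for both.
    perm = np.random.permutation(len(y))
    X_ok, y_ok = X[perm], y[perm]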
2 changes: 0 additions & 2 deletions test/unit/data/test_data.py
@@ -18,8 +18,6 @@
 @pytest.fixture()
 def data_setup() -> InputData:
     predictors, response = load_iris(return_X_y=True)
-    np.random.shuffle(predictors)
-    np.random.shuffle(response)
     predictors = predictors[:100]
     response = response[:100]
     data = InputData(features=predictors, target=response, idx=np.arange(0, 100),
19 changes: 18 additions & 1 deletion test/unit/optimizer/test_pipeline_objective_eval.py
@@ -18,7 +18,7 @@
     RegressionMetricsEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
 from fedot.core.validation.split import tabular_cv_generator, OneFoldInputDataSplit
-from test.integration.models.test_model import classification_dataset
+from test.integration.models.test_model import classification_dataset, classification_dataset_with_str_labels
 from test.unit.tasks.test_forecasting import get_simple_ts_pipeline
 from test.unit.validation.test_table_cv import sample_pipeline
 from test.unit.validation.test_time_series_cv import configure_experiment
@@ -86,6 +86,23 @@ def test_pipeline_objective_evaluate_with_different_metrics(classification_dataset):
     assert np.isclose(fitness.value, act_fitness.value, atol=1e-8), metric.name


+@pytest.mark.parametrize(
+    'pipeline',
+    [pipeline_first_test(), pipeline_second_test(), pipeline_third_test()]
+)
+def test_pipeline_objective_evaluate_with_different_metrics_with_str_labes(pipeline):
+    for metric in ClassificationMetricsEnum:
+        one_fold_split = OneFoldInputDataSplit()
+        data_split = partial(one_fold_split.input_split, input_data=classification_dataset_with_str_labels())
+        check_pipeline = deepcopy(pipeline)
+        objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split)
+        fitness = objective_eval(pipeline)
+        act_fitness = actual_fitness(data_split, check_pipeline, metric)
+        assert fitness.valid
+        assert fitness.value is not None
+        assert np.isclose(fitness.value, act_fitness.value, atol=1e-8), metric.name
+
+
def test_pipeline_objective_evaluate_with_empty_pipeline(classification_dataset):
    pipeline = empty_pipeline()