diff --git a/fedot/api/api_utils/api_params_repository.py b/fedot/api/api_utils/api_params_repository.py
index ed90d03d8b..9a6e76f4ef 100644
--- a/fedot/api/api_utils/api_params_repository.py
+++ b/fedot/api/api_utils/api_params_repository.py
@@ -5,7 +5,8 @@
 from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum
 from golem.core.optimisers.genetic.operators.mutation import MutationTypesEnum
 
-from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation
+from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, boosting_mutation, \
+    add_resample_mutation
 from fedot.core.constants import AUTO_PRESET_NAME
 from fedot.core.repository.tasks import TaskTypesEnum
 from fedot.core.utils import default_fedot_data_dir
@@ -131,5 +132,7 @@ def _get_default_mutations(task_type: TaskTypesEnum, params) -> Sequence[Mutatio
         # TODO remove workaround after boosting mutation fix
         if task_type == TaskTypesEnum.ts_forecasting:
             mutations.append(partial(boosting_mutation, params=params))
+        else:
+            mutations.append(add_resample_mutation)
 
     return mutations
diff --git a/fedot/core/composer/gp_composer/specific_operators.py b/fedot/core/composer/gp_composer/specific_operators.py
index 439bd89fad..7799fe432f 100644
--- a/fedot/core/composer/gp_composer/specific_operators.py
+++ b/fedot/core/composer/gp_composer/specific_operators.py
@@ -90,6 +90,24 @@ def boosting_mutation(pipeline: Pipeline, requirements, graph_gen_params, **kwar
     return pipeline
 
 
+def add_resample_mutation(pipeline: Pipeline, **kwargs):
+    """
+    Adds a 'resample' operation before every primary operation of the pipeline
+
+    :param pipeline: pipeline to insert 'resample' into
+
+    :return: mutated pipeline
+    """
+    resample_node = PipelineNode('resample')
+
+    p_nodes = list(pipeline.primary_nodes)
+    pipeline.add_node(resample_node)
+
+    for node in p_nodes:
+        pipeline.connect_nodes(resample_node, node)
+    return pipeline
+
+
 def choose_new_model(boosting_model_candidates: List[str]) -> str:
     """ Since 'linear' and 'dtreg' operations are suitable for solving the problem
     and they are simpler than others, they are preferred """
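For context, the mutation attaches one shared resample node as the new common parent of all former primary nodes. A minimal sketch of the intended effect, assuming a simple two-primary pipeline (the 'scaling'/'pca'/'rf' shape is illustrative, not taken from the diff):

from fedot.core.composer.gp_composer.specific_operators import add_resample_mutation
from fedot.core.pipelines.node import PipelineNode
from fedot.core.pipelines.pipeline import Pipeline

# Two primary nodes feeding one root; operation names are illustrative.
scaling = PipelineNode('scaling')
pca = PipelineNode('pca')
rf = PipelineNode('rf', nodes_from=[scaling, pca])
pipeline = Pipeline(rf)

mutated = add_resample_mutation(pipeline)
# 'resample' becomes the only primary node; the former primaries now depend on it.
assert {node.name for node in mutated.primary_nodes} == {'resample'}
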
diff --git a/fedot/core/composer/metrics.py b/fedot/core/composer/metrics.py
index 505c356be9..d3d9e42c3e 100644
--- a/fedot/core/composer/metrics.py
+++ b/fedot/core/composer/metrics.py
@@ -88,7 +88,7 @@ def get_value(cls, pipeline: 'Pipeline', reference_data: InputData,
                                          save_path=Path(save_path, 'forecast.png'))
         except Exception as ex:
-            pipeline.log.info(f'Metric can not be evaluated because of: {ex}')
+            pipeline.log.info(f'Metric can not be evaluated because of: {ex}', raise_if_test=True)
 
         return metric
 
@@ -216,7 +216,10 @@ def metric(reference: InputData, predicted: OutputData) -> float:
         if n_classes > 2:
             additional_params = {'average': F1.multiclass_averaging_mode}
         else:
-            additional_params = {'average': F1.binary_averaging_mode}
+            u, count = np.unique(np.ravel(reference.target), return_counts=True)
+            count_sort_ind = np.argsort(count)
+            pos_label = u[count_sort_ind[0]].item()
+            additional_params = {'average': F1.binary_averaging_mode, 'pos_label': pos_label}
         return f1_score(y_true=reference.target, y_pred=predicted.predict,
                         **additional_params)
 
@@ -271,7 +274,16 @@ class Precision(QualityMetric):
     @staticmethod
     @from_maximised_metric
     def metric(reference: InputData, predicted: OutputData) -> float:
-        return precision_score(y_true=reference.target, y_pred=predicted.predict)
+        n_classes = reference.num_classes
+        if n_classes > 2:
+            return precision_score(y_true=reference.target, y_pred=predicted.predict)
+        else:
+            u, count = np.unique(np.ravel(reference.target), return_counts=True)
+            count_sort_ind = np.argsort(count)
+            pos_label = u[count_sort_ind[0]].item()
+            additional_params = {'pos_label': pos_label}
+            return precision_score(y_true=reference.target, y_pred=predicted.predict,
+                                   **additional_params)
 
 
 class Logloss(QualityMetric):
diff --git a/fedot/core/operations/operation.py b/fedot/core/operations/operation.py
index 23314407e1..1b34ebacde 100644
--- a/fedot/core/operations/operation.py
+++ b/fedot/core/operations/operation.py
@@ -119,6 +119,7 @@ def predict_for_fit(self, fitted_operation, data: InputData, params: Optional[Op
     def _predict(self, fitted_operation, data: InputData, params: Optional[OperationParameters] = None,
                  output_mode: str = 'default', is_fit_stage: bool = False):
 
+        is_main_target = data.supplementary_data.is_main_target
         data_flow_length = data.supplementary_data.data_flow_length
         self._init(data.task, output_mode=output_mode, params=params, n_samples_data=data.features.shape[0])
 
@@ -133,11 +134,11 @@ def _predict(self, fitted_operation, data: InputData, params: Optional[Operation
                                                  predict_data=data)
 
         prediction = self.assign_tabular_column_types(prediction, output_mode)
 
+        # any inplace operations here are dangerous!
         if is_main_target is False:
             prediction.supplementary_data.is_main_target = is_main_target
 
         prediction.supplementary_data.data_flow_length = data_flow_length
-        prediction.supplementary_data.obligatorily_preprocessed = True
         return prediction
 
     @staticmethod
diff --git a/fedot/core/pipelines/pipeline.py b/fedot/core/pipelines/pipeline.py
index 220984b206..a9a608c2c9 100644
--- a/fedot/core/pipelines/pipeline.py
+++ b/fedot/core/pipelines/pipeline.py
@@ -1,7 +1,7 @@
 from copy import deepcopy
 from datetime import timedelta
 from os import PathLike
-from typing import Optional, Tuple, Union, Sequence, Dict
+from typing import Optional, Tuple, Union, Sequence, List, Dict
 
 import func_timeout
 from golem.core.dag.graph import Graph
@@ -307,7 +307,7 @@ def load(self, source: Union[str, dict], dict_fitted_operations: Optional[dict]
             dict_fitted_operations: dictionary of the fitted operations
         """
 
-        self.nodes = []
+        self.nodes: Optional[List[PipelineNode]] = []
         template = PipelineTemplate(self)
         template.import_pipeline(source, dict_fitted_operations)
         return self
@@ -327,6 +327,19 @@ def root_node(self) -> Optional[PipelineNode]:
             raise ValueError(f'{ERROR_PREFIX} More than 1 root_nodes in pipeline')
         return root[0]
 
+    @property
+    def primary_nodes(self) -> List[PipelineNode]:
+        """Finds pipeline's primary nodes
+
+        Returns:
+            list of primary nodes
+        """
+        if not self.nodes:
+            return []
+        primary_nodes = [node for node in self.nodes
+                         if node.is_primary]
+        return primary_nodes
+
     def pipeline_for_side_task(self, task_type: TaskTypesEnum) -> 'Pipeline':
         """Returns pipeline formed from the last node solving the given problem and all its parents
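The pos_label blocks in the metrics.py hunks above choose the least frequent class as the positive label, which keeps binary F1 and precision well defined even for string targets. A standalone sketch of that selection (the toy arrays are illustrative):

import numpy as np
from sklearn.metrics import f1_score

# Illustrative binary targets; 'b' is the minority class.
target = np.array(['a', 'a', 'a', 'b'])
predict = np.array(['a', 'b', 'a', 'b'])

u, count = np.unique(np.ravel(target), return_counts=True)
count_sort_ind = np.argsort(count)
pos_label = u[count_sort_ind[0]].item()  # least frequent label -> 'b'

print(f1_score(y_true=target, y_pred=predict, average='binary', pos_label=pos_label))
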
""" + PROBABILITY_OF_GROWTH = 0.3 def __init__(self, verifier: GraphVerifier, @@ -76,8 +78,9 @@ def graph_growth(graph: OptGraph, for offspring_node in range(offspring_size): height = distance_to_root_level(graph, node_parent) is_max_depth_exceeded = height >= max_depth - 2 - is_primary_node_selected = height < max_depth - 1 and randint(0, 1) - if is_max_depth_exceeded or is_primary_node_selected: + is_primary_node_selected = height < max_depth - 1 + is_growth_should_stopped = random.random() > RandomPipelineFactory.PROBABILITY_OF_GROWTH + if is_max_depth_exceeded or is_primary_node_selected or is_growth_should_stopped: primary_node = node_factory.get_node(is_primary=True) node_parent.nodes_from.append(primary_node) graph.add_node(primary_node) @@ -85,4 +88,4 @@ def graph_growth(graph: OptGraph, secondary_node = node_factory.get_node(is_primary=False) graph.add_node(secondary_node) node_parent.nodes_from.append(secondary_node) - graph_growth(graph, secondary_node, node_factory, requirements, max_depth) \ No newline at end of file + graph_growth(graph, secondary_node, node_factory, requirements, max_depth) diff --git a/fedot/core/pipelines/verification.py b/fedot/core/pipelines/verification.py index 6e619ba38c..6a5ff7d5a7 100644 --- a/fedot/core/pipelines/verification.py +++ b/fedot/core/pipelines/verification.py @@ -26,7 +26,7 @@ has_no_data_flow_conflicts_in_ts_pipeline, has_primary_nodes, is_pipeline_contains_ts_operations, - only_non_lagged_operations_are_primary + only_non_lagged_operations_are_primary, has_correct_location_of_resample ) from fedot.core.repository.tasks import TaskTypesEnum @@ -40,7 +40,8 @@ has_no_conflicts_with_data_flow, has_no_conflicts_in_decompose, has_correct_data_connections, - has_correct_data_sources] + has_correct_data_sources, + has_correct_location_of_resample] ts_rules = [is_pipeline_contains_ts_operations, only_non_lagged_operations_are_primary, diff --git a/fedot/core/pipelines/verification_rules.py b/fedot/core/pipelines/verification_rules.py index cfb02f480f..7c40d53280 100644 --- a/fedot/core/pipelines/verification_rules.py +++ b/fedot/core/pipelines/verification_rules.py @@ -12,7 +12,7 @@ def has_correct_operations_for_task(pipeline: Pipeline, task_type: Optional[TaskTypesEnum] = None): - if task_type and not task_type in pipeline.root_node.operation.acceptable_task_types: + if task_type and task_type not in pipeline.root_node.operation.acceptable_task_types: raise ValueError(f'{ERROR_PREFIX} Pipeline has incorrect operations positions') return True @@ -152,6 +152,30 @@ def has_no_data_flow_conflicts_in_ts_pipeline(pipeline: Pipeline): return True +def has_correct_location_of_resample(pipeline: Pipeline): + """ + Pipeline can have only one resample operation located in start of the pipeline + + :param pipeline: pipeline for checking + """ + is_resample_primary = False + is_not_resample_primary = False + for node in pipeline.nodes: + if node.is_primary: + if node.name == 'resample': + is_resample_primary = True + else: + is_not_resample_primary = True + else: + if node.name == 'resample': + raise ValueError( + f'{ERROR_PREFIX} Pipeline can have only one resample operation located in start of the pipeline') + if is_resample_primary and is_not_resample_primary: + raise ValueError( + f'{ERROR_PREFIX} Pipeline can have only one resample operation located in start of the pipeline') + return True + + def get_wrong_links(ts_to_table_operations: list, ts_data_operations: list, non_ts_data_operations: list, ts_models: list, non_ts_models: list) -> dict: """ 
diff --git a/fedot/core/repository/data/model_repository.json b/fedot/core/repository/data/model_repository.json
index 47d1be30c6..0ef86535fa 100644
--- a/fedot/core/repository/data/model_repository.json
+++ b/fedot/core/repository/data/model_repository.json
@@ -165,7 +165,10 @@
       "meta": "sklearn_class",
       "presets": ["fast_train"],
       "tags": [
-        "bayesian", "non_multi", "linear"
+        "simple",
+        "bayesian",
+        "non_multi",
+        "linear"
       ]
     },
     "catboost": {
@@ -186,6 +189,7 @@
       "meta": "sklearn_class",
       "presets": ["fast_train", "*tree"],
       "tags": [
+        "simple",
         "tree",
         "interpretable",
         "non_linear"
diff --git a/test/integration/models/test_model.py b/test/integration/models/test_model.py
index 5fdc72b52f..171acb23ad 100644
--- a/test/integration/models/test_model.py
+++ b/test/integration/models/test_model.py
@@ -98,6 +98,20 @@ def classification_dataset():
     threshold = 0.5
     classes = np.array([0.0 if val <= threshold else 1.0 for val in y])
     classes = np.expand_dims(classes, axis=1)
+    data = InputData(features=MinMaxScaler().fit_transform(x), target=classes, idx=np.arange(0, len(x)),
+                     task=Task(TaskTypesEnum.classification),
+                     data_type=DataTypesEnum.table)
+    return data
+
+
+def classification_dataset_with_str_labels():
+    samples = 1000
+    x = 10.0 * np.random.rand(samples, ) - 5.0
+    x = np.expand_dims(x, axis=1)
+    y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))
+    threshold = 0.5
+    classes = np.array(['a' if val <= threshold else 'b' for val in y])
+    classes = np.expand_dims(classes, axis=1)
     data = InputData(features=MinMaxScaler().fit_transform(x), target=classes, idx=np.arange(0, len(x)),
                      task=Task(TaskTypesEnum.classification),
                      data_type=DataTypesEnum.table)
     return data
diff --git a/test/unit/composer/test_quality_metrics.py b/test/unit/composer/test_quality_metrics.py
index bb9e1786f6..426faef66d 100644
--- a/test/unit/composer/test_quality_metrics.py
+++ b/test/unit/composer/test_quality_metrics.py
@@ -22,8 +22,6 @@
 @pytest.fixture()
 def data_setup():
     predictors, response = load_breast_cancer(return_X_y=True)
-    np.random.shuffle(predictors)
-    np.random.shuffle(response)
     response = response[:100]
     predictors = predictors[:100]
diff --git a/test/unit/data/test_data.py b/test/unit/data/test_data.py
index 7d987dc3c9..a3eb55bfaa 100644
--- a/test/unit/data/test_data.py
+++ b/test/unit/data/test_data.py
@@ -18,8 +18,6 @@
 @pytest.fixture()
 def data_setup() -> InputData:
     predictors, response = load_iris(return_X_y=True)
-    np.random.shuffle(predictors)
-    np.random.shuffle(response)
     predictors = predictors[:100]
     response = response[:100]
     data = InputData(features=predictors, target=response, idx=np.arange(0, 100),
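Dropping the paired np.random.shuffle calls in the fixtures above is a correctness fix, not a cleanup: shuffling predictors and response independently destroys the row-wise feature-target correspondence. A toy illustration (the arrays are made up):

import numpy as np

x = np.arange(10)
y = x * 10  # y[i] is paired with x[i]

np.random.shuffle(x)
np.random.shuffle(y)  # independent permutations: the (x[i], y[i]) pairing is lost
print(np.all(y == x * 10))  # almost certainly False now
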
diff --git a/test/unit/optimizer/test_pipeline_objective_eval.py b/test/unit/optimizer/test_pipeline_objective_eval.py
index cfb3f86444..657b7a201e 100644
--- a/test/unit/optimizer/test_pipeline_objective_eval.py
+++ b/test/unit/optimizer/test_pipeline_objective_eval.py
@@ -18,7 +18,7 @@
     RegressionMetricsEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
 from fedot.core.validation.split import tabular_cv_generator, OneFoldInputDataSplit
-from test.integration.models.test_model import classification_dataset
+from test.integration.models.test_model import classification_dataset, classification_dataset_with_str_labels
 from test.unit.tasks.test_forecasting import get_simple_ts_pipeline
 from test.unit.validation.test_table_cv import sample_pipeline
 from test.unit.validation.test_time_series_cv import configure_experiment
@@ -86,6 +86,23 @@ def test_pipeline_objective_evaluate_with_different_metrics(classification_datas
     assert np.isclose(fitness.value, act_fitness.value, atol=1e-8), metric.name
 
 
+@pytest.mark.parametrize(
+    'pipeline',
+    [pipeline_first_test(), pipeline_second_test(), pipeline_third_test()]
+)
+def test_pipeline_objective_evaluate_with_different_metrics_with_str_labels(pipeline):
+    for metric in ClassificationMetricsEnum:
+        one_fold_split = OneFoldInputDataSplit()
+        data_split = partial(one_fold_split.input_split, input_data=classification_dataset_with_str_labels())
+        check_pipeline = deepcopy(pipeline)
+        objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split)
+        fitness = objective_eval(pipeline)
+        act_fitness = actual_fitness(data_split, check_pipeline, metric)
+        assert fitness.valid
+        assert fitness.value is not None
+        assert np.isclose(fitness.value, act_fitness.value, atol=1e-8), metric.name
+
+
 def test_pipeline_objective_evaluate_with_empty_pipeline(classification_dataset):
     pipeline = empty_pipeline()
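For reference, the failure mode the new string-label test covers: sklearn's binary metrics default to pos_label=1 and reject purely string targets unless a pos_label is supplied, which is what the pos_label plumbing in metrics.py works around. A minimal reproduction (the labels are illustrative):

import numpy as np
from sklearn.metrics import precision_score

y_true = np.array(['a', 'b', 'a', 'b'])
y_pred = np.array(['a', 'a', 'a', 'b'])

# Default pos_label=1 is absent from string targets -> ValueError.
try:
    precision_score(y_true=y_true, y_pred=y_pred)
except ValueError as ex:
    print(ex)

# An explicit pos_label makes the metric well defined.
print(precision_score(y_true=y_true, y_pred=y_pred, pos_label='b'))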