Improving preprocessing (#1320)

* Adding logs & the ability to specify categorical data

* Fixes categorical features

* Changing getsizeof to nbytes

* Delete _clean_extra_spaces

* Adding more logs, adding OptimisedFeature storage, refactoring BinaryCategoricalPreprocessor fitting, fixing bugs, adding reduce_memory_size, deleting clean_extra_spaces

* Address changes requested by @Lopa10ko

* Fix bug with nbytes

* Fix bug with cat_features_names when features_names do not exist

* Adding reduce_memory_size to pipeline._preprocess

* Return to Pandas for nan_matrix

* Change logic of _into_categorical_features_transformation_for_fit

* Adding convert to np.array

* Update ImputationImplementation

* Fix bug in BinaryCategorical

* Fix bug with test_data_from_csv_load_correctly

* Fix bug with test_api_fit_predict_with_pseudo_large_dataset_with_label_correct

* Fix bug with test_pipeline_preprocessing_through_api_correctly

* Fix bug with test_default_forecast (add new TODO for ts_forecasting)

* Fix bug with test_cv_multiple_metrics_evaluated_correct by adding copy method to OptimisedFeature

* Fix bug with test_regression_pipeline_with_data_operation_fit_predict_correct by adding check for target

* Fix bug in test_default_train_test_simple with nbytes

* Fix bugs with str* types in features

* Fix bug with test_inf_and_nan_absence_after_imputation_implementation_fit_transform by adding cat and num idx in get_dataset func

* Fix bug with test_pipeline_objective_evaluate_with_different_metrics by switching XGBoost to CatBoost (XGBoost raises "Experimental support for categorical data is not implemented for current tree method yet.") and by checking feature ids against size

* Fix bug with test_order_by_data_flow_len_correct

* Fix bug with test_pipeline_with_imputer (finally)

* Fix bug with test_correct_api_dataset_with_text_preprocessing by updating the col_type regex rule for str*

* Update for OneHotImplementation

* Update for subset_features and post_init

* Update data_has_categorical_features

* Adding bool to numerical

* Update for ImputationImplementation

* Fix data for tests

* Fix test with adding new types

* Update test with deleting extra spaces

* Update test with adding extra types_encountered

* Fixes different tests

* Update expected_values for test_metrics test

* pep8 fixes

* Adding preprocessing copying to predefined models

* Adding docstrings to reduce_memory_size and OptimisedFeatures

* Automated autopep8 fixes

* Fix bug with unhashable np

* Temp update

* Fix tests

* Fix test_regression_data_operations with inf data after poly_features

* Fix bug in tests with IndexError

* Adding take-by-indices method and to_numpy() in OptimisedFeatures

* Update train_test_split for OptimisedFeatures

* Transform target to numpy array during memory_reduce

* PR#1318 migration

* Fixing for test_metrics with py3.10

* Fix test_from_ ... with broadcast

* Hide preprocessing messages under debug logging (2)

* Fix TypeError with float16 by abandoning this type

* Refactoring OptimisedFeatures - _columns: np.ndarray -> _columns: pd.DataFrame

* Revert changes with features property

* Fixes various tests

* Global refactoring - abandoning the separate class

* Fix pep8, correct wrong code & tests

* Fixes bug with memory_usage & test

* Fixes bug with invalid slice

* pep8 fix

* test fixes

* pep8 fix

* fix bug with memory_usage

* reduce_memory_usage in utils, fix test with operations

* fix tests

* fix tests in main api

* fix: fix ambiguous value in integration test

* fix: fix typing error

* fix: fix "arrays used as indices must be of integer (or boolean) type" error

* fix: fix NoneType object isn't subscriptable error

* fix: copy input_data to prevent modification

* fix: fix fedot input_data transform to h2o_frame for regression

* fix: update the type of ids attributes to np.ndarray

* Automated autopep8 fixes

* chore: change the logging levels of new messages

* chore: fix pep8 style problems

* Automated autopep8 fixes

* fix: cannot concatenate ndarray

* fix: preserve single ndarray type for num_features

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: DRMPN <[email protected]>
3 people authored Nov 5, 2024
1 parent c09fe9a commit a2c6746
Showing 34 changed files with 2,012 additions and 318 deletions.
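Several items in the log above (adding reduce memory size, the float16 TypeError and the subsequent rejection of that type) boil down to one technique: downcasting numeric columns to the smallest dtype that still holds their values, stopping at float32. A minimal sketch of the idea, illustrative only and not FEDOT's actual implementation:

import numpy as np
import pandas as pd

def reduce_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
    # Downcast every numeric column to the smallest dtype that preserves
    # its values. pd.to_numeric never goes below float32, which sidesteps
    # the float16 TypeErrors mentioned in the log above.
    out = df.copy()
    for col in out.select_dtypes(include=np.number).columns:
        kind = 'integer' if np.issubdtype(out[col].dtype, np.integer) else 'float'
        out[col] = pd.to_numeric(out[col], downcast=kind)
    return out

frame = pd.DataFrame({'ids': np.arange(1000), 'values': np.random.rand(1000)})
print(frame.memory_usage(deep=True).sum())                        # int64 + float64
print(reduce_memory_usage(frame).memory_usage(deep=True).sum())   # int16 + float32
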
32 changes: 24 additions & 8 deletions fedot/api/api_utils/api_data.py
@@ -1,4 +1,3 @@
-import sys
 from datetime import datetime
 from typing import Dict, Union
 from typing import Optional
@@ -34,14 +33,19 @@ def __init__(self, task: Task, use_input_preprocessing: bool = True):
         self.task = task

+        self._recommendations = {}
+        self.preprocessor = DummyPreprocessor()
+
         if use_input_preprocessing:
             self.preprocessor = DataPreprocessor()

             # Dictionary with recommendations (e.g. 'cut' for cutting dataset, 'label_encoded'
             # to encode features using label encoder). Parameters for transformation provided also
-            self._recommendations = {'cut': self.preprocessor.cut_dataset,
-                                     'label_encoded': self.preprocessor.label_encoding_for_fit}
+            self._recommendations = {
+                'cut': self.preprocessor.cut_dataset,
+                'label_encoded': self.preprocessor.label_encoding_for_fit
+            }

-        else:
-            self.preprocessor = DummyPreprocessor()

         self.log = default_log(self)

@@ -133,18 +137,28 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod
     def fit_transform(self, train_data: InputData) -> InputData:
         start_time = datetime.now()
         self.log.message('Preprocessing data')
-        memory_usage = convert_memory_size(sys.getsizeof(train_data.features))
+        memory_usage = convert_memory_size(train_data.memory_usage)
         features_shape = train_data.features.shape
         target_shape = train_data.target.shape
         self.log.message(
             f'Train Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}')

+        self.log.debug('- Obligatory preprocessing started')
         train_data = self.preprocessor.obligatory_prepare_for_fit(data=train_data)
+
+        self.log.debug('- Optional preprocessing started')
         train_data = self.preprocessor.optional_prepare_for_fit(pipeline=Pipeline(), data=train_data)
+
+        self.log.debug('- Converting indexes for fitting started')
         train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data)
+
+        self.log.debug('- Reducing memory started')
+        train_data = self.preprocessor.reduce_memory_size(data=train_data)
+
         train_data.supplementary_data.is_auto_preprocessed = True

-        memory_usage = convert_memory_size(sys.getsizeof(train_data.features))
+        memory_usage = convert_memory_size(train_data.memory_usage)
+
         features_shape = train_data.features.shape
         target_shape = train_data.target.shape
         self.log.message(
@@ -156,7 +170,7 @@ def fit_transform(self, train_data: InputData) -> InputData:
     def transform(self, test_data: InputData, current_pipeline) -> InputData:
         start_time = datetime.now()
         self.log.message('Preprocessing data')
-        memory_usage = convert_memory_size(sys.getsizeof(test_data))
+        memory_usage = convert_memory_size(test_data.memory_usage)
         features_shape = test_data.features.shape
         target_shape = test_data.target.shape
         self.log.message(
@@ -168,7 +182,9 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData:
         test_data = self.preprocessor.update_indices_for_time_series(test_data)
         test_data.supplementary_data.is_auto_preprocessed = True

-        memory_usage = convert_memory_size(sys.getsizeof(test_data))
+        test_data = self.preprocessor.reduce_memory_size(data=test_data)
+
+        memory_usage = convert_memory_size(test_data.memory_usage)
         features_shape = test_data.features.shape
         target_shape = test_data.target.shape
         self.log.message(
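The recurring change in this file replaces sys.getsizeof with an nbytes-backed memory_usage property. The reason, in brief: getsizeof only counts an object's own header (plus a numpy buffer when the array happens to own it), so it badly under-reports views and container objects such as InputData. A quick illustration; the Container class is a stand-in, not FEDOT code:

import sys
import numpy as np

features = np.random.rand(100_000, 20)   # ~16 MB data buffer
view = features[:, :10]                  # a slice: shares the same buffer

print(sys.getsizeof(features))   # ~16_000_112: header + buffer, only because this array owns its data
print(sys.getsizeof(view))       # ~128: just the view's header, the buffer is not counted
print(view.nbytes)               # 8_000_000: the bytes the view actually spans

class Container:                 # stand-in for InputData
    def __init__(self, features):
        self.features = features

print(sys.getsizeof(Container(features)))   # ~48: ignores the referenced array entirely
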
16 changes: 13 additions & 3 deletions fedot/api/api_utils/predefined_model.py
@@ -8,26 +8,36 @@
 from fedot.core.pipelines.node import PipelineNode
 from fedot.core.pipelines.pipeline import Pipeline
 from fedot.core.pipelines.verification import verify_pipeline
+from fedot.preprocessing.base_preprocessing import BasePreprocessor


 class PredefinedModel:
     def __init__(self, predefined_model: Union[str, Pipeline], data: InputData, log: LoggerAdapter,
-                 use_input_preprocessing: bool = True):
+                 use_input_preprocessing: bool = True, api_preprocessor: BasePreprocessor = None):
         self.predefined_model = predefined_model
         self.data = data
         self.log = log
-        self.pipeline = self._get_pipeline(use_input_preprocessing)
+        self.pipeline = self._get_pipeline(use_input_preprocessing, api_preprocessor)

-    def _get_pipeline(self, use_input_preprocessing: bool = True) -> Pipeline:
+    def _get_pipeline(self, use_input_preprocessing: bool = True,
+                      api_preprocessor: BasePreprocessor = None) -> Pipeline:
         if isinstance(self.predefined_model, Pipeline):
             pipelines = self.predefined_model
         elif self.predefined_model == 'auto':
             # Generate initial assumption automatically
             pipelines = AssumptionsBuilder.get(self.data).from_operations().build(
                 use_input_preprocessing=use_input_preprocessing)[0]

+            if use_input_preprocessing and api_preprocessor is not None:
+                pipelines.preprocessor = api_preprocessor
+
         elif isinstance(self.predefined_model, str):
             model = PipelineNode(self.predefined_model)
             pipelines = Pipeline(model, use_input_preprocessing=use_input_preprocessing)

+            if use_input_preprocessing and api_preprocessor is not None:
+                pipelines.preprocessor = api_preprocessor
+
         else:
             raise ValueError(f'{type(self.predefined_model)} is not supported as Fedot model')
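The effect of the new api_preprocessor argument, condensed: the preprocessor that ApiDataProcessor has already fitted on the training data is handed to the predefined pipeline instead of letting the pipeline fit a fresh one. A rough sketch under that assumption; the helper name is hypothetical:

from fedot.core.pipelines.node import PipelineNode
from fedot.core.pipelines.pipeline import Pipeline

def build_predefined_pipeline(operation: str, api_preprocessor=None) -> Pipeline:
    # Hypothetical helper mirroring the branch added above: build the
    # one-node pipeline, then share the API-level preprocessor with it so
    # encoders and imputers fitted during ApiDataProcessor.fit_transform
    # are reused rather than refitted inside the pipeline.
    pipeline = Pipeline(PipelineNode(operation), use_input_preprocessing=True)
    if api_preprocessor is not None:
        pipeline.preprocessor = api_preprocessor
    return pipeline
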
8 changes: 5 additions & 3 deletions fedot/api/main.py
@@ -176,9 +176,11 @@ def fit(self,
         with fedot_composer_timer.launch_fitting():
             if predefined_model is not None:
                 # Fit predefined model and return it without composing
-                self.current_pipeline = PredefinedModel(predefined_model, self.train_data, self.log,
-                                                        use_input_preprocessing=self.params.get(
-                                                            'use_input_preprocessing')).fit()
+                self.current_pipeline = PredefinedModel(
+                    predefined_model, self.train_data, self.log,
+                    use_input_preprocessing=self.params.get('use_input_preprocessing'),
+                    api_preprocessor=self.data_processor.preprocessor,
+                ).fit()
             else:
                 self.current_pipeline, self.best_models, self.history = self.api_composer.obtain_model(self.train_data)
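For context, a minimal end-to-end call that exercises this code path; predefined_model='rf' and use_input_preprocessing are standard FEDOT API options, and the toy data is made up:

import numpy as np
from fedot.api.main import Fedot

features = np.random.rand(100, 5)
target = np.random.randint(0, 2, size=100)

model = Fedot(problem='classification', use_input_preprocessing=True)
# fit() now forwards self.data_processor.preprocessor into PredefinedModel,
# so predict-time data goes through the same fitted preprocessing chain.
model.fit(features=features, target=target, predefined_model='rf')
predictions = model.predict(features=features)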