Add unit tests for ALL InputData and Data classes methods #1200 #1251

Open · wants to merge 5 commits into base: master
7 changes: 5 additions & 2 deletions cases/spam_detection.py
@@ -7,6 +7,7 @@
 from fedot.core.data.data_split import train_test_data_setup
 from fedot.core.pipelines.node import PipelineNode
 from fedot.core.pipelines.pipeline import Pipeline
+from fedot.core.repository.dataset_types import DataTypesEnum


 def execute_pipeline_for_text_problem(train_data, test_data):
@@ -26,7 +27,8 @@ def execute_pipeline_for_text_problem(train_data, test_data):
 def run_text_problem_from_meta_file():
     data_file_abspath = os.path.abspath(os.path.join('data', 'spam', 'spamham.csv'))

-    data = InputData.from_text_meta_file(meta_file_path=data_file_abspath)
+    data = InputData.from_csv(file_path=data_file_abspath,
+                              data_type=DataTypesEnum.text)

    train_data, test_data = train_test_data_setup(data, split_ratio=0.7)

@@ -52,7 +54,8 @@ def run_text_problem_from_files():


 def run_text_problem_from_saved_meta_file(path):
-    data = InputData.from_text_meta_file(meta_file_path=path)
+    data = InputData.from_csv(file_path=path,
+                              data_type=DataTypesEnum.text)

    train_data, test_data = train_test_data_setup(data, split_ratio=0.7)
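For context, the call-site migration this file makes is, in isolation, a minimal sketch assuming a CSV with a text column and a label column like spamham.csv (the path is illustrative):

import os

from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup
from fedot.core.repository.dataset_types import DataTypesEnum

# Illustrative path; substitute the location of your own text dataset.
data_file_abspath = os.path.abspath(os.path.join('data', 'spam', 'spamham.csv'))

# Before this PR, text datasets went through a dedicated constructor:
#   data = InputData.from_text_meta_file(meta_file_path=data_file_abspath)
# After this PR, the generic CSV constructor covers the text case once
# data_type is set explicitly:
data = InputData.from_csv(file_path=data_file_abspath,
                          data_type=DataTypesEnum.text)

train_data, test_data = train_test_data_setup(data, split_ratio=0.7)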
2 changes: 1 addition & 1 deletion docs/source/advanced/cli_call.rst
@@ -69,4 +69,4 @@ problems decision are presented.

 The string below helps to run classification problem decision from the console:

-``python --problem classification --train ../../test/data/simple_classification.csv --test ../../test/data/simple_classification.csv --target Y --timeout 0.1``
+``python --problem classification --train ../../test/data/classification/simple_classification.csv --test ../../test/data/classification/simple_classification.csv --target Y --timeout 0.1``
@@ -1,3 +1,3 @@
 set python_path = "DEFAULT"
 cd ../../fedot/api
-%python_path% fedot_cli.py --problem classification --train ../../test/data/simple_classification.csv --test ../../test/data/simple_classification.csv --target Y --timeout 0.1
+%python_path% fedot_cli.py --problem classification --train ../../test/data/classification/simple_classification.csv --test ../../test/data/classification/simple_classification.csv --target Y --timeout 0.1
20 changes: 0 additions & 20 deletions fedot/core/data/data.py
@@ -356,26 +356,6 @@ def from_image(images: Union[str, np.ndarray] = None,

         return InputData(idx=idx, features=features, target=target, task=task, data_type=DataTypesEnum.image)

-    @staticmethod
-    def from_text_meta_file(meta_file_path: str = None,
-                            label: str = 'label',
-                            task: Task = Task(TaskTypesEnum.classification),
-                            data_type: DataTypesEnum = DataTypesEnum.text) -> InputData:
-
-        if os.path.isdir(meta_file_path):
-            raise ValueError("""CSV file expected but got directory""")
-
-        df_text = pd.read_csv(meta_file_path)
-        df_text = df_text.sample(frac=1).reset_index(drop=True)
-        messages = df_text['text'].astype('U').tolist()
-
-        features = np.array(messages)
-        target = np.array(df_text[label]).reshape(-1, 1)
-        idx = [index for index in range(len(target))]
-
-        return InputData(idx=idx, features=features,
-                         target=target, task=task, data_type=data_type)
-
     @staticmethod
     def from_text_files(files_path: str,
                         label: str = 'label',
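One behavioral nuance of this removal: from_text_meta_file shuffled the rows itself (df_text.sample(frac=1)), while from_csv presumably loads rows in file order. The updated tests compensate by shuffling at split time instead; a minimal sketch of the equivalent flow under that assumption ('spam.csv' is a hypothetical file with 'text' and 'label' columns):

from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup
from fedot.core.repository.dataset_types import DataTypesEnum

# Load the text table in file order; no implicit shuffle happens here.
data = InputData.from_csv(file_path='spam.csv', data_type=DataTypesEnum.text)

# Shuffling moves to split time, as in the tests touched by this PR.
train_data, test_data = train_test_data_setup(data, shuffle=True)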
Binary file not shown.
Binary file not shown.
62 changes: 62 additions & 0 deletions test/data/regression/simple_regression.csv
@@ -0,0 +1,62 @@
Unnamed: 0,Asphalt,N-par,Iso-par,AROM UV,P,V,S,target
41,0.21,71.09,18.5,2.13,839.3,13.44,0.09,0.223
8,1.2,63.47,20.44,7.53,891.8,42.49,0.61,0.168
43,2.37,61.27,20.41,9.0,906.8,62.9,1.01,0.11
21,4.16,59.83,19.72,9.79,916.5,81.55,1.55,0.07
27,1.26,68.27,18.77,4.04,858.6,19.59,0.47,0.99
51,0.98,68.44,18.9,3.96,857.5,19.19,0.39,1.75
20,4.24,59.62,19.75,9.93,917.9,85.16,1.58,0.05
42,2.3,59.63,21.0,10.17,918.2,87.75,1.04,0.03
24,3.29,62.36,19.44,8.07,899.1,50.44,1.23,0.34
53,1.59,65.0,19.66,4.11,881.2,32.51,0.67,1.21
59,2.93,60.93,20.16,4.03,909.1,66.68,1.18,0.242
10,1.05,64.52,20.18,6.78,884.5,35.43,0.53,0.196
45,2.51,64.44,19.24,6.66,884.9,35.24,0.94,1.03
22,4.09,60.05,19.7,9.64,915.0,78.14,1.53,0.102
60,3.45,60.61,19.93,2.43,911.2,70.47,1.33,0.252
14,0.22,70.36,18.74,2.66,844.3,14.71,0.11,0.239
29,0.31,71.04,18.45,2.16,839.6,13.53,0.12,0.35
55,3.28,60.71,20.0,2.96,910.5,69.17,1.28,0.223
32,2.07,61.45,20.55,8.88,905.5,60.93,0.92,0.03
6,1.31,62.75,20.61,8.03,896.7,48.26,0.66,0.072
57,1.86,61.6,20.65,7.35,904.7,59.6,0.85,0.087
37,1.76,64.89,19.59,6.43,881.9,33.0,0.72,0.618
54,1.63,66.53,19.11,2.03,870.5,25.36,0.63,2.06
33,2.97,60.06,20.43,9.77,915.0,79.26,1.21,0.123
38,1.3,66.73,19.26,5.15,869.2,24.7,0.53,0.874
12,0.69,67.1,19.54,4.97,866.8,23.53,0.35,0.465
58,2.37,61.27,20.41,5.76,906.8,62.86,1.01,0.178
28,0.62,70.13,18.55,2.78,845.8,15.21,0.23,0.68
31,2.21,60.53,20.77,9.53,911.9,72.93,0.99,0.02
46,2.59,64.39,19.21,6.68,885.2,35.5,0.97,1.726
25,2.59,64.39,19.21,6.68,885.2,35.5,0.97,1.726
7,1.28,62.93,20.57,7.91,895.5,46.73,0.65,0.134
48,0.72,62.53,24.37,5.1,877.2,28.25,0.37,0.28
9,1.18,63.64,20.39,7.4,890.6,41.19,0.6,0.169
39,0.85,68.51,18.95,3.92,859.0,19.0,0.35,0.59
52,1.51,63.44,20.24,7.51,891.9,42.51,0.7,0.341
49,0.93,65.39,19.96,6.17,878.5,30.7,0.47,0.452
11,0.93,65.39,19.96,6.17,878.5,30.7,0.47,0.452
18,4.39,59.19,19.8,10.23,920.9,93.0,1.64,0.03
35,2.47,62.04,20.09,8.4,901.5,54.18,1.01,0.322
47,0.85,58.08,28.54,5.7,890.6,37.14,0.43,0.19
4,1.36,62.39,20.7,8.29,899.2,51.54,0.69,0.019
15,0.11,71.15,18.54,2.1,838.9,13.36,0.06,0.108
36,2.23,63.0,19.92,7.73,894.8,45.5,0.91,0.65
1,1.97,58.1,21.76,11.32,928.7,122.8,0.99,0.02
56,3.17,59.1,20.63,5.42,921.8,97.04,1.3,0.02
17,4.76,58.09,19.92,10.97,928.4,117.2,1.78,0.02
23,4.02,60.26,19.68,9.5,913.5,74.9,1.5,0.119
0,2.24,56.15,22.24,12.7,942.2,193.1,1.14,0.02
3,1.43,61.85,20.84,8.67,902.9,57.04,0.73,0.01
34,2.72,61.06,20.26,9.08,908.2,65.18,1.11,0.123
50,0.93,66.95,19.43,5.04,867.8,24.0,0.42,0.66
30,2.35,59.6,21.0,10.19,918.3,88.19,1.05,0.02
44,2.44,62.88,19.82,7.8,895.7,46.43,0.98,0.62
13,0.45,68.75,19.14,3.8,855.4,18.47,0.23,0.39
26,1.92,66.36,18.98,5.34,871.7,25.95,0.72,2.657
40,0.42,70.24,18.65,2.72,845.1,15.0,0.17,0.51
19,4.31,59.4,19.77,10.08,919.4,89.0,1.61,0.04
2,1.7,60.0,21.29,9.98,915.6,81.93,0.86,0.015
16,5.53,55.86,20.18,12.5,943.7,194.9,2.07,0.02
5,1.33,62.57,20.66,8.16,897.9,49.87,0.67,0.047
Binary file added test/data/regression/simple_regression.npy
Binary file not shown.
2 changes: 1 addition & 1 deletion test/integration/api/test_api_cli_params.py
@@ -26,7 +26,7 @@ def test_cli_with_parameters():
         f'--cv_folds 2 --target sea_height --train {ts_train_path} '
         f'--test {ts_train_path} --for_len 10'
     ).split()
-    class_train_path = project_root_path.joinpath('test/data/simple_classification.csv')
+    class_train_path = project_root_path.joinpath('test/data/classification/simple_classification.csv')
     class_call = (
         f'--problem classification --train {class_train_path} --test {class_train_path} --target Y '
         '--preset fast_train --timeout 0.1 --depth 3 --arity 3 '
4 changes: 2 additions & 2 deletions test/integration/api/test_api_utils.py
@@ -14,7 +14,7 @@
 from fedot.preprocessing.preprocessing import DataPreprocessor
 from test.data.datasets import get_cholesterol_dataset
 from test.integration.api.test_main_api import get_dataset
-from test.unit.tasks.test_classification import get_binary_classification_data
+from test.unit.tasks.test_classification import get_binary_classification_data_from_csv


 def test_compose_fedot_model_without_tuning():
@@ -32,7 +32,7 @@ def test_output_binary_classification_correct():

     task_type = 'classification'

-    data = get_binary_classification_data()
+    data = get_binary_classification_data_from_csv()

     train_data, test_data = train_test_data_setup(data, shuffle=True)

2 changes: 1 addition & 1 deletion test/integration/composer/test_history.py
@@ -68,7 +68,7 @@ def _test_individuals_in_history(history: OptHistory):

 @pytest.mark.parametrize('n_jobs', [1, 2])
 def test_newly_generated_history(n_jobs: int):
-    file_path_train = fedot_project_root().joinpath('test/data/simple_classification.csv')
+    file_path_train = fedot_project_root().joinpath('test/data/classification/simple_classification.csv')

     num_of_gens = 2
     auto_model = Fedot(problem='classification', seed=42,
2 changes: 1 addition & 1 deletion test/integration/pipelines/tuning/test_pipeline_tuning.py
@@ -40,7 +40,7 @@ def regression_dataset():
 @pytest.fixture()
 def classification_dataset():
     test_file_path = str(os.path.dirname(__file__))
-    file = os.path.join(str(fedot_project_root()), 'test/data/simple_classification.csv')
+    file = os.path.join(str(fedot_project_root()), 'test/data/classification/simple_classification.csv')
     return InputData.from_csv(os.path.join(test_file_path, file), task=Task(TaskTypesEnum.classification))

3 changes: 2 additions & 1 deletion test/integration/real_applications/test_real_cases.py
@@ -13,7 +13,8 @@


 def test_credit_scoring_problem():
-    full_path_train = full_path_test = fedot_project_root().joinpath('test/data/simple_classification.csv')
+    full_path_train = full_path_test = \
+        fedot_project_root().joinpath('test/data/classification/simple_classification.csv')

     roc_auc_test = run_credit_scoring_problem(full_path_train, full_path_test, timeout=5, target='Y', n_jobs=1)
     assert roc_auc_test > 0.5
6 changes: 3 additions & 3 deletions test/unit/data/test_data.py
@@ -48,7 +48,7 @@ def test_data_subset_incorrect(data_setup):

 def test_data_from_csv():
     test_file_path = str(os.path.dirname(__file__))
-    file = '../../data/simple_classification.csv'
+    file = '../../data/classification/simple_classification.csv'
     task = Task(TaskTypesEnum.classification)
     df = pd.read_csv(os.path.join(test_file_path, file))
     data_array = np.array(df).T
@@ -71,7 +71,7 @@ def test_data_from_csv():

 def test_with_custom_target():
     test_file_path = str(os.path.dirname(__file__))
-    file = '../../data/simple_classification.csv'
+    file = '../../data/classification/simple_classification.csv'
     file_custom = '../../data/simple_classification_with_custom_target.csv'

     file_data = InputData.from_csv(
@@ -140,7 +140,7 @@ def test_target_data_from_csv_correct():

 def test_table_data_shuffle():
     test_file_path = str(os.path.dirname(__file__))
-    file = '../../data/simple_classification.csv'
+    file = '../../data/classification/simple_classification.csv'

     data = InputData.from_csv(os.path.join(test_file_path, file))
     shuffled_data = deepcopy(data)
2 changes: 1 addition & 1 deletion test/unit/data/test_multimodal_data.py
@@ -106,7 +106,7 @@ def test_text_data_only(data_type):
         data_source_name = 'data_source_text/description'
     elif data_type is DataTypesEnum.table:
         # Case when there is no text data in csv, but MultiModalData.from_csv() is used
-        file_path = 'test/data/simple_classification.csv'
+        file_path = 'test/data/classification/simple_classification.csv'
         data_source_name = 'data_source_table'

     path = Path(fedot_project_root(), file_path)
2 changes: 1 addition & 1 deletion test/unit/optimizer/gp_operators/test_mutation.py
@@ -33,7 +33,7 @@ def get_requirements_and_params_for_task(task: TaskTypesEnum):


 def file_data():
-    test_file_path = Path(__file__).parents[3].joinpath('data', 'simple_classification.csv')
+    test_file_path = Path(__file__).parents[3].joinpath('data', 'classification', 'simple_classification.csv')
     input_data = InputData.from_csv(test_file_path)
     input_data.idx = to_categorical_codes(categorical_ids=input_data.idx)
     return input_data
2 changes: 1 addition & 1 deletion test/unit/pipelines/test_pipeline.py
@@ -47,7 +47,7 @@ def classification_dataset():
 @pytest.fixture()
 def file_data_setup():
     test_file_path = str(os.path.dirname(__file__))
-    file = '../../data/simple_classification.csv'
+    file = '../../data/classification/simple_classification.csv'
     input_data = InputData.from_csv(
         os.path.join(test_file_path, file))
     input_data.idx = to_categorical_codes(categorical_ids=input_data.idx)
91 changes: 81 additions & 10 deletions test/unit/tasks/test_classification.py
@@ -1,8 +1,10 @@
 import os

 import numpy as np
+import pandas as pd
+import pytest
 from sklearn.datasets import load_iris, make_classification
-from sklearn.metrics import roc_auc_score as roc_auc
+from sklearn.metrics import roc_auc_score as roc_auc, f1_score as f1

 from examples.simple.classification.image_classification_problem import run_image_classification_problem
 from fedot.core.data.data import InputData
@@ -37,6 +39,13 @@ def pipeline_with_pca() -> Pipeline:
     return pipeline


+def simple_text_pipeline() -> Pipeline:
+    node_tfidf = PipelineNode('tfidf')
+    model_node = PipelineNode('logit', nodes_from=[node_tfidf])
+    pipeline = Pipeline(model_node)
+    return pipeline
+
+
 def get_synthetic_classification_data(n_samples=1000, n_features=10, random_state=None) -> InputData:
     synthetic_data = make_classification(n_samples=n_samples, n_features=n_features, random_state=random_state)
     input_data = InputData(idx=np.arange(0, len(synthetic_data[1])),
@@ -60,12 +69,26 @@ def get_iris_data() -> InputData:
     return input_data


-def get_binary_classification_data():
+def get_classification_data(source: str, problem: str) -> InputData:
     test_file_path = str(os.path.dirname(__file__))
-    file = '../../data/simple_classification.csv'
-    input_data = InputData.from_csv(
-        os.path.join(test_file_path, file))
-    return input_data
+    if source == 'numpy':
+        file = f'../../data/classification/{problem}_classification.npy'
+        numpy_data = np.load(os.path.join(test_file_path, file))
+        features_array = numpy_data[:, :-1]
+        target_array = numpy_data[:, -1]
+        return InputData.from_numpy(features_array=features_array,
+                                    target_array=target_array)
+    elif source == 'dataframe':
+        file = f'../../data/classification/{problem}_classification.csv'
+        df_data = pd.read_csv(os.path.join(test_file_path, file))
+        features_df = df_data.iloc[:, :-1]
+        target_df = df_data.iloc[:, -1]
+        return InputData.from_dataframe(features_df=features_df,
+                                        target_df=target_df)
+    elif source == 'csv':
+        file = f'../../data/classification/{problem}_classification.csv'
+        return InputData.from_csv(
+            os.path.join(test_file_path, file))


 def get_image_classification_data(composite_flag: bool = True):
@@ -96,8 +119,32 @@ def get_image_classification_data(composite_flag: bool = True):
     return roc_auc_on_valid, dataset_to_train, dataset_to_validate


-def test_multiclassification_pipeline_fit_correct():
-    data = get_iris_data()
+CLASSIFICATION_DATA_SOURCES = ['numpy',
+                               'dataframe',
+                               'csv',
+                               # 'from_text_files',
+                               # 'from_json_files',
+                               ]
+
+
+@pytest.mark.parametrize('source', CLASSIFICATION_DATA_SOURCES)
+def test_binary_classification_pipeline_fit_correct(source: str):
+    data = get_classification_data(source, 'simple')
+    pipeline = pipeline_simple()
+    train_data, test_data = train_test_data_setup(data, shuffle=True)
+
+    pipeline.fit(input_data=train_data)
+    results = pipeline.predict(input_data=test_data)
+
+    roc_auc_on_test = roc_auc(y_true=test_data.target,
+                              y_score=results.predict)
+
+    assert roc_auc_on_test > 0.8
+
+
+@pytest.mark.parametrize('source', CLASSIFICATION_DATA_SOURCES)
+def test_multiclassification_pipeline_fit_correct(source: str):
+    data = get_classification_data(source, 'multiclass')
     pipeline = pipeline_simple()
     train_data, test_data = train_test_data_setup(data, shuffle=True)
@@ -106,7 +153,7 @@

     roc_auc_on_test = roc_auc(y_true=test_data.target,
                               y_score=results.predict,
-                              multi_class='ovo',
+                              multi_class='ovr',  # TODO: strange bug when ovo is chosen
                               average='macro')

     assert roc_auc_on_test > 0.95
@@ -154,7 +201,7 @@ def test_output_mode_labels():


 def test_output_mode_full_probs():
-    data = get_binary_classification_data()
+    data = get_classification_data('csv', 'simple')
     pipeline = pipeline_simple()
     train_data, test_data = train_test_data_setup(data, shuffle=True)

@@ -167,3 +214,27 @@
     assert np.array_equal(results_probs.predict, results_default.predict)
     assert results.predict.shape == (len(test_data.target), 2)
     assert results_probs.predict.shape == (len(test_data.target), 1)
+
+
+def test_image_pipeline_fit_correct():
+    roc_auc_on_valid, _, _ = get_image_classification_data()
+
+    assert roc_auc_on_valid >= 0.5
+
+
+def test_text_classification_pipeline_fit_correct():
+    test_file_path = str(os.path.dirname(__file__))
+    file = '../../data/simple_multimodal_classification_text.csv'
+    data = InputData.from_csv(file_path=os.path.join(test_file_path, file),
+                              data_type=DataTypesEnum.text)
+    pipeline = simple_text_pipeline()
+    train_data, test_data = train_test_data_setup(data, shuffle=True)
+
+    pipeline.fit(input_data=train_data)
+    results = pipeline.predict(input_data=test_data, output_mode='labels')
+
+    f1_on_test = f1(y_true=test_data.target,
+                    y_pred=results.predict,
+                    average='micro')
+
+    assert f1_on_test >= 0.5
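A note for reviewers on the parametrization pattern used above: pytest.mark.parametrize expands each decorated test into one case per entry of CLASSIFICATION_DATA_SOURCES, so the two parametrized tests yield six collected cases (three active sources each), and the commented-out entries sketch future coverage for the text- and JSON-based constructors. A minimal self-contained illustration of the mechanism (the names below are illustrative, not part of this PR):

import pytest

SOURCES = ['numpy', 'dataframe', 'csv']


@pytest.mark.parametrize('source', SOURCES)
def test_source_is_known(source: str):
    # pytest runs this body once per entry, reporting ids such as
    # test_source_is_known[numpy], test_source_is_known[dataframe], ...
    assert source in SOURCES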