From 82b1a15aba0c27afeb5d871dd6522b438a06650c Mon Sep 17 00:00:00 2001 From: JureZmzrlikar Date: Wed, 2 Aug 2023 09:08:28 +0200 Subject: [PATCH 1/5] Fix fit methods to match sklearn signature --- src/rnanorm/methods/between_sample.py | 6 +++-- tests/test_sklearn_compat.py | 37 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 tests/test_sklearn_compat.py diff --git a/src/rnanorm/methods/between_sample.py b/src/rnanorm/methods/between_sample.py index ca00c25..abdde90 100644 --- a/src/rnanorm/methods/between_sample.py +++ b/src/rnanorm/methods/between_sample.py @@ -1,4 +1,6 @@ """Between sample normalizations.""" +from typing import Any, Optional + import numpy as np from scipy.stats import gmean, rankdata, scoreatpercentile from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin @@ -97,7 +99,7 @@ def _reset(self) -> None: if hasattr(self, "geometric_mean_"): del self.geometric_mean_ - def fit(self, X: Numeric2D) -> Self: + def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self: """Fit. :param X: Expression raw count matrix (n_samples, n_features) @@ -329,7 +331,7 @@ def _get_ref(self, X: Numeric2D) -> Numeric1D: ref_index = np.argmin(np.fabs(f75 - np.mean(f75))) return X[ref_index, :] - def fit(self, X: Numeric2D) -> Self: + def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self: """Fit. :param X: Expression raw count matrix (n_samples, n_features) diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py new file mode 100644 index 0000000..4cda4d5 --- /dev/null +++ b/tests/test_sklearn_compat.py @@ -0,0 +1,37 @@ +import pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ +from rnanorm.datasets import load_toy_data + + +def test_grid_search(): + """Test compatibility of all methods with sklearn machinery.""" + ds = load_toy_data() + X = ds.exp + y = pd.Series([0, 0, 1, 1], index=X.index) + pipeline = Pipeline( + steps=[ + ("normalization", CPM()), + ("scaler", StandardScaler()), + ("classifier", LogisticRegression()), + ] + ) + params = { + "normalization": [ + CPM(), + FPKM(gtf=ds.gtf_path), + TPM(gtf=ds.gtf_path), + UQ(), + CUF(), + TMM(), + CTF(), + ], + } + search = GridSearchCV(pipeline, params, cv=2, refit=False) + search.fit(X, y) + results = pd.DataFrame(search.cv_results_) + assert results.shape[0] == 7 From 07404cd02c88c6c68bcb5a5aed70996d11c6d3bc Mon Sep 17 00:00:00 2001 From: JureZmzrlikar Date: Fri, 4 Aug 2023 07:30:23 +0200 Subject: [PATCH 2/5] Fix LibrarySize to always return np.array in private functions If global set_output = pandas we are mixing numpy arrays with pandas and get errors. --- src/rnanorm/methods/between_sample.py | 17 ++++++++++++----- tests/test_ctf.py | 9 +++++++++ tests/test_cuf.py | 9 +++++++++ tests/test_tmm.py | 9 +++++++++ tests/test_uq.py | 9 +++++++++ 5 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/rnanorm/methods/between_sample.py b/src/rnanorm/methods/between_sample.py index abdde90..fd5cbbd 100644 --- a/src/rnanorm/methods/between_sample.py +++ b/src/rnanorm/methods/between_sample.py @@ -3,6 +3,7 @@ import numpy as np from scipy.stats import gmean, rankdata, scoreatpercentile +from sklearn import config_context from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin from sklearn.utils.validation import check_is_fitted @@ -72,7 +73,9 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D: :param X: Expression raw count matrix (n_samples, n_features) """ X = remove_zero_genes(X) - lib_size = LibrarySize().fit_transform(X) + # Make sure that global set_config(transform_output="pandas") + # does not affect this method - we need numpy output here. + lib_size = LibrarySize().set_output(transform="default").fit_transform(X) # Compute upper quartile count for each sample. # No numpy method can be used as drop-in replacement for R's quantile. @@ -124,7 +127,8 @@ def transform(self, X: Numeric2D) -> Numeric2D: # Compute effective library sizes factors = self.get_norm_factors(X) - effective_lib_size = LibrarySize().fit_transform(X) * factors + lib_size = LibrarySize().set_output(transform="default").fit_transform(X) + effective_lib_size = lib_size * factors # Make CPM, but with effective library size return X / effective_lib_size[:, np.newaxis] * 1e6 @@ -243,8 +247,10 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D: """ X = remove_zero_genes(X) - lib_size = LibrarySize().fit_transform(X) - lib_size_ref = LibrarySize().fit_transform(self.ref_[np.newaxis, :]) + # ensure that output of transform will be a np.array + with config_context(transform_output="default"): + lib_size = LibrarySize().fit_transform(X) + lib_size_ref = LibrarySize().fit_transform(self.ref_[np.newaxis, :]) # Values 0 cause a lot of troubles and warnings in log / division. # But computing with np.nan is OK, and is handled gracefully. @@ -356,7 +362,8 @@ def transform(self, X: Numeric2D) -> Numeric2D: """ # Compute effective library sizes factors = self.get_norm_factors(X) - effective_lib_size = LibrarySize().fit_transform(X) * factors + lib_size = LibrarySize().set_output(transform="default").fit_transform(X) + effective_lib_size = lib_size * factors # Method ``check_is_fitted`` is not called here, since it is # called in self.get_norm_factors diff --git a/tests/test_ctf.py b/tests/test_ctf.py index ee7efe8..2c27da3 100644 --- a/tests/test_ctf.py +++ b/tests/test_ctf.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest +from sklearn import config_context from rnanorm import CTF @@ -44,3 +45,11 @@ def test_ctf(exp, expected_factors, expected_ctf): expected_ctf.loc[["Sample_2"]], rtol=1e-3, ) + + +def test_global_set_output(exp): + """Ensure that global config does not break things.""" + with config_context(transform_output="pandas"): + CTF().fit_transform(exp) + + CTF().set_output(transform="pandas").fit_transform(exp) diff --git a/tests/test_cuf.py b/tests/test_cuf.py index 4c46f5d..e63bd34 100644 --- a/tests/test_cuf.py +++ b/tests/test_cuf.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest +from sklearn import config_context from rnanorm import CUF @@ -44,3 +45,11 @@ def test_cuf(exp, expected_factors, expected_cuf): expected_cuf.loc[["Sample_2"]], rtol=1e-3, ) + + +def test_global_set_output(exp): + """Ensure that global config does not break things.""" + with config_context(transform_output="pandas"): + CUF().fit_transform(exp) + + CUF().set_output(transform="pandas").fit_transform(exp) diff --git a/tests/test_tmm.py b/tests/test_tmm.py index 775a1f4..b9e2203 100644 --- a/tests/test_tmm.py +++ b/tests/test_tmm.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +from sklearn import config_context from rnanorm import TMM from rnanorm.datasets import load_gtex @@ -70,3 +71,11 @@ def test_tmm_rnanorm_edger(): rnanorm_factors, decimal=14, ) + + +def test_global_set_output(exp): + """Ensure that global config does not break things.""" + with config_context(transform_output="pandas"): + TMM().fit_transform(exp) + + TMM().set_output(transform="pandas").fit_transform(exp) diff --git a/tests/test_uq.py b/tests/test_uq.py index 836cefc..c891ab2 100644 --- a/tests/test_uq.py +++ b/tests/test_uq.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import pytest +from sklearn import config_context from rnanorm import UQ from rnanorm.datasets import load_gtex @@ -71,3 +72,11 @@ def test_uq_rnanorm_edger(): rnanorm_factors, decimal=14, ) + + +def test_global_set_output(exp): + """Ensure that global config does not break things.""" + with config_context(transform_output="pandas"): + UQ().fit_transform(exp) + + UQ().set_output(transform="pandas").fit_transform(exp) From fa049d8f033bf9248274d00b642f267fdf9bab76 Mon Sep 17 00:00:00 2001 From: JureZmzrlikar Date: Wed, 9 Aug 2023 12:39:59 +0200 Subject: [PATCH 3/5] Make get_norm_factors follow the set_output config --- src/rnanorm/methods/between_sample.py | 24 +++++++++++--- tests/test_ctf.py | 9 ------ tests/test_cuf.py | 9 ------ tests/test_sklearn_compat.py | 46 +++++++++++++++++++++++++++ tests/test_tmm.py | 9 ------ tests/test_uq.py | 9 ------ 6 files changed, 66 insertions(+), 40 deletions(-) diff --git a/src/rnanorm/methods/between_sample.py b/src/rnanorm/methods/between_sample.py index fd5cbbd..4df64dc 100644 --- a/src/rnanorm/methods/between_sample.py +++ b/src/rnanorm/methods/between_sample.py @@ -2,9 +2,11 @@ from typing import Any, Optional import numpy as np +import pandas as pd from scipy.stats import gmean, rankdata, scoreatpercentile from sklearn import config_context from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin +from sklearn.utils._set_output import _get_output_config from sklearn.utils.validation import check_is_fitted from ..typing import Numeric1D, Numeric2D, Self @@ -95,7 +97,12 @@ def get_norm_factors(self, X: Numeric2D) -> Numeric1D: """ check_is_fitted(self) factors = self._get_norm_factors(X) - return factors / self.geometric_mean_ + factors = factors / self.geometric_mean_ + + config = _get_output_config("transform", self) + if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame): + return pd.Series(factors, index=X.index) + return factors def _reset(self) -> None: """Reset internal data-dependent state.""" @@ -319,11 +326,15 @@ def get_norm_factors(self, X: Numeric2D) -> Numeric1D: :param X: Expression raw count matrix (n_samples, n_features)s """ check_is_fitted(self) - X = self._validate_data(X, force_all_finite=True, reset=False, dtype=float) + X2 = self._validate_data(X, force_all_finite=True, reset=False, dtype=float) - factors = self._get_norm_factors(X) + factors = self._get_norm_factors(X2) + factors = factors / self.geometric_mean_ - return factors / self.geometric_mean_ + config = _get_output_config("transform", self) + if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame): + return pd.Series(factors, index=X.index) + return factors def _reset(self) -> None: """Reset internal data-dependent state.""" @@ -362,6 +373,9 @@ def transform(self, X: Numeric2D) -> Numeric2D: """ # Compute effective library sizes factors = self.get_norm_factors(X) + if isinstance(factors, pd.Series): + factors = factors.to_numpy() + lib_size = LibrarySize().set_output(transform="default").fit_transform(X) effective_lib_size = lib_size * factors @@ -416,6 +430,8 @@ def transform(self, X: Numeric2D) -> Numeric2D: """ # Just divide raw counts with normalization factors factors = self.get_norm_factors(X) + if isinstance(factors, pd.Series): + factors = factors.to_numpy() # Method ``check_is_fitted`` is not called here, since it is # called in self.get_norm_factors diff --git a/tests/test_ctf.py b/tests/test_ctf.py index 2c27da3..ee7efe8 100644 --- a/tests/test_ctf.py +++ b/tests/test_ctf.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import pytest -from sklearn import config_context from rnanorm import CTF @@ -45,11 +44,3 @@ def test_ctf(exp, expected_factors, expected_ctf): expected_ctf.loc[["Sample_2"]], rtol=1e-3, ) - - -def test_global_set_output(exp): - """Ensure that global config does not break things.""" - with config_context(transform_output="pandas"): - CTF().fit_transform(exp) - - CTF().set_output(transform="pandas").fit_transform(exp) diff --git a/tests/test_cuf.py b/tests/test_cuf.py index e63bd34..4c46f5d 100644 --- a/tests/test_cuf.py +++ b/tests/test_cuf.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import pytest -from sklearn import config_context from rnanorm import CUF @@ -45,11 +44,3 @@ def test_cuf(exp, expected_factors, expected_cuf): expected_cuf.loc[["Sample_2"]], rtol=1e-3, ) - - -def test_global_set_output(exp): - """Ensure that global config does not break things.""" - with config_context(transform_output="pandas"): - CUF().fit_transform(exp) - - CUF().set_output(transform="pandas").fit_transform(exp) diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py index 4cda4d5..29fba04 100644 --- a/tests/test_sklearn_compat.py +++ b/tests/test_sklearn_compat.py @@ -1,4 +1,6 @@ +import numpy as np import pandas as pd +from sklearn import config_context from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline @@ -35,3 +37,47 @@ def test_grid_search(): search.fit(X, y) results = pd.DataFrame(search.cv_results_) assert results.shape[0] == 7 + + +def test_set_output(): + """Ensure set_output behaves as expected.""" + ds = load_toy_data() + + method__has_factors = ( + (CPM(), False), + (FPKM(gtf=ds.gtf_path), False), + (TPM(gtf=ds.gtf_path), False), + (UQ(), True), + (CUF(), True), + (TMM(), True), + (CTF(), True), + ) + + for method, has_factors in method__has_factors: + # No configuration returns np.arrays + tmm = TMM() + result = tmm.fit_transform(ds.exp) + factors = tmm.get_norm_factors(ds.exp) + assert isinstance(result, np.ndarray) + assert isinstance(factors, np.ndarray) + + # Explicit global config should return corresponding objects + with config_context(transform_output="default"): + result = tmm.fit_transform(ds.exp) + factors = tmm.get_norm_factors(ds.exp) + assert isinstance(result, np.ndarray) + assert isinstance(factors, np.ndarray) + + with config_context(transform_output="pandas"): + result = tmm.fit_transform(ds.exp) + factors = tmm.get_norm_factors(ds.exp) + assert isinstance(result, pd.DataFrame) + assert isinstance(factors, pd.Series) + + # Explicit pandas config should return pandas objects + method.set_output(transform="pandas") + result = method.fit_transform(ds.exp) + assert isinstance(result, pd.DataFrame) + if has_factors: + factors = method.get_norm_factors(ds.exp) + assert isinstance(factors, pd.Series) diff --git a/tests/test_tmm.py b/tests/test_tmm.py index b9e2203..775a1f4 100644 --- a/tests/test_tmm.py +++ b/tests/test_tmm.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd import pytest -from sklearn import config_context from rnanorm import TMM from rnanorm.datasets import load_gtex @@ -71,11 +70,3 @@ def test_tmm_rnanorm_edger(): rnanorm_factors, decimal=14, ) - - -def test_global_set_output(exp): - """Ensure that global config does not break things.""" - with config_context(transform_output="pandas"): - TMM().fit_transform(exp) - - TMM().set_output(transform="pandas").fit_transform(exp) diff --git a/tests/test_uq.py b/tests/test_uq.py index c891ab2..836cefc 100644 --- a/tests/test_uq.py +++ b/tests/test_uq.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd import pytest -from sklearn import config_context from rnanorm import UQ from rnanorm.datasets import load_gtex @@ -72,11 +71,3 @@ def test_uq_rnanorm_edger(): rnanorm_factors, decimal=14, ) - - -def test_global_set_output(exp): - """Ensure that global config does not break things.""" - with config_context(transform_output="pandas"): - UQ().fit_transform(exp) - - UQ().set_output(transform="pandas").fit_transform(exp) From b4eae42a256b407a442dea2cc7130e0debc853cf Mon Sep 17 00:00:00 2001 From: JureZmzrlikar Date: Wed, 9 Aug 2023 14:37:55 +0200 Subject: [PATCH 4/5] Remove leftovers of as_frame parameter --- src/rnanorm/datasets.py | 2 +- tests/test_tmm.py | 2 +- tests/test_uq.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rnanorm/datasets.py b/src/rnanorm/datasets.py index 717f346..6b24684 100644 --- a/src/rnanorm/datasets.py +++ b/src/rnanorm/datasets.py @@ -37,7 +37,7 @@ def load_toy_data() -> Bunch: return ds -def load_gtex(as_frame: bool = False) -> Bunch: +def load_gtex() -> Bunch: """ Load a real RNA-seq dataset from GTFx project. diff --git a/tests/test_tmm.py b/tests/test_tmm.py index 775a1f4..878c463 100644 --- a/tests/test_tmm.py +++ b/tests/test_tmm.py @@ -61,7 +61,7 @@ def test_tmm_rnanorm_edger(): ) # Compute scaling factors here - ds = load_gtex(as_frame=True) + ds = load_gtex() rnanorm_factors = TMM().fit(ds.exp).get_norm_factors(ds.exp) # Compare loaded and computed scaling factors diff --git a/tests/test_uq.py b/tests/test_uq.py index 836cefc..c4435ad 100644 --- a/tests/test_uq.py +++ b/tests/test_uq.py @@ -62,7 +62,7 @@ def test_uq_rnanorm_edger(): ) # Compute scaling factors here - ds = load_gtex(as_frame=True) + ds = load_gtex() rnanorm_factors = UQ().fit(ds.exp).get_norm_factors(ds.exp) # Compare loaded and computed scaling factors From 019bf5fa1cc93b13569e854394416da55a2ceb2a Mon Sep 17 00:00:00 2001 From: JureZmzrlikar Date: Wed, 9 Aug 2023 15:02:24 +0200 Subject: [PATCH 5/5] Update citation file --- CITATION.cff | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 70e22a1..a1d0303 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,13 +1,19 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." -title: "RNA-seq data normalization in Python" +title: "RNAnorm: RNA-seq data normalization in Python" version: 2.0.0 date-released: 2023-07-12 url: "https://github.com/genialis/RNAnorm" authors: -- family-names: "Štajdohar" - given-names: "Miha" - orcid: "https://orcid.org/0009-0008-2749-8298" - family-names: "Zmrzlikar" given-names: "Jure" orcid: "https://orcid.org/0009-0009-9749-541X" +- family-names: "Žganec" + given-names: "Matjaž" + orcid: "https://orcid.org/0000-0002-4159-9596" +- family-names: "Ausec" + given-names: "Luka" + orcid: "https://orcid.org/0000-0003-3155-0999" +- family-names: "Štajdohar" + given-names: "Miha" + orcid: "https://orcid.org/0009-0008-2749-8298" \ No newline at end of file