From 82b1a15aba0c27afeb5d871dd6522b438a06650c Mon Sep 17 00:00:00 2001
From: JureZmzrlikar <zmrzlikar.jure@gmail.com>
Date: Wed, 2 Aug 2023 09:08:28 +0200
Subject: [PATCH 1/5] Fix fit methods to match sklearn signature

---
 src/rnanorm/methods/between_sample.py |  6 +++--
 tests/test_sklearn_compat.py          | 37 +++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_sklearn_compat.py

diff --git a/src/rnanorm/methods/between_sample.py b/src/rnanorm/methods/between_sample.py
index ca00c25..abdde90 100644
--- a/src/rnanorm/methods/between_sample.py
+++ b/src/rnanorm/methods/between_sample.py
@@ -1,4 +1,6 @@
 """Between sample normalizations."""
+from typing import Any, Optional
+
 import numpy as np
 from scipy.stats import gmean, rankdata, scoreatpercentile
 from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
@@ -97,7 +99,7 @@ def _reset(self) -> None:
         if hasattr(self, "geometric_mean_"):
             del self.geometric_mean_
 
-    def fit(self, X: Numeric2D) -> Self:
+    def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self:
         """Fit.
 
         :param X: Expression raw count matrix (n_samples, n_features)
@@ -329,7 +331,7 @@ def _get_ref(self, X: Numeric2D) -> Numeric1D:
         ref_index = np.argmin(np.fabs(f75 - np.mean(f75)))
         return X[ref_index, :]
 
-    def fit(self, X: Numeric2D) -> Self:
+    def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self:
         """Fit.
 
         :param X: Expression raw count matrix (n_samples, n_features)
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
new file mode 100644
index 0000000..4cda4d5
--- /dev/null
+++ b/tests/test_sklearn_compat.py
@@ -0,0 +1,37 @@
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
+from rnanorm.datasets import load_toy_data
+
+
+def test_grid_search():
+    """Test compatibility of all methods with sklearn machinery."""
+    ds = load_toy_data()
+    X = ds.exp
+    y = pd.Series([0, 0, 1, 1], index=X.index)
+    pipeline = Pipeline(
+        steps=[
+            ("normalization", CPM()),
+            ("scaler", StandardScaler()),
+            ("classifier", LogisticRegression()),
+        ]
+    )
+    params = {
+        "normalization": [
+            CPM(),
+            FPKM(gtf=ds.gtf_path),
+            TPM(gtf=ds.gtf_path),
+            UQ(),
+            CUF(),
+            TMM(),
+            CTF(),
+        ],
+    }
+    search = GridSearchCV(pipeline, params, cv=2, refit=False)
+    search.fit(X, y)
+    results = pd.DataFrame(search.cv_results_)
+    assert results.shape[0] == 7

From 07404cd02c88c6c68bcb5a5aed70996d11c6d3bc Mon Sep 17 00:00:00 2001
From: JureZmzrlikar <zmrzlikar.jure@gmail.com>
Date: Fri, 4 Aug 2023 07:30:23 +0200
Subject: [PATCH 2/5] Fix LibrarySize to always return np.array in private
 functions

If global set_output = pandas we are mixing numpy arrays with
pandas and get errors.
---
 src/rnanorm/methods/between_sample.py | 17 ++++++++++++-----
 tests/test_ctf.py                     |  9 +++++++++
 tests/test_cuf.py                     |  9 +++++++++
 tests/test_tmm.py                     |  9 +++++++++
 tests/test_uq.py                      |  9 +++++++++
 5 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/src/rnanorm/methods/between_sample.py b/src/rnanorm/methods/between_sample.py
index abdde90..fd5cbbd 100644
--- a/src/rnanorm/methods/between_sample.py
+++ b/src/rnanorm/methods/between_sample.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from scipy.stats import gmean, rankdata, scoreatpercentile
+from sklearn import config_context
 from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
 
@@ -72,7 +73,9 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
         :param X: Expression raw count matrix (n_samples, n_features)
         """
         X = remove_zero_genes(X)
-        lib_size = LibrarySize().fit_transform(X)
+        # Make sure that global set_config(transform_output="pandas")
+        # does not affect this method - we need numpy output here.
+        lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
 
         # Compute upper quartile count for each sample.
         # No numpy method can be used as drop-in replacement for R's quantile.
@@ -124,7 +127,8 @@ def transform(self, X: Numeric2D) -> Numeric2D:
 
         # Compute effective library sizes
         factors = self.get_norm_factors(X)
-        effective_lib_size = LibrarySize().fit_transform(X) * factors
+        lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
+        effective_lib_size = lib_size * factors
 
         # Make CPM, but with effective library size
         return X / effective_lib_size[:, np.newaxis] * 1e6
@@ -243,8 +247,10 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
         """
         X = remove_zero_genes(X)
 
-        lib_size = LibrarySize().fit_transform(X)
-        lib_size_ref = LibrarySize().fit_transform(self.ref_[np.newaxis, :])
+        # ensure that output of transform will be a np.array
+        with config_context(transform_output="default"):
+            lib_size = LibrarySize().fit_transform(X)
+            lib_size_ref = LibrarySize().fit_transform(self.ref_[np.newaxis, :])
 
         # Values 0 cause a lot of troubles and warnings in log / division.
         # But computing with np.nan is OK, and is handled gracefully.
@@ -356,7 +362,8 @@ def transform(self, X: Numeric2D) -> Numeric2D:
         """
         # Compute effective library sizes
         factors = self.get_norm_factors(X)
-        effective_lib_size = LibrarySize().fit_transform(X) * factors
+        lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
+        effective_lib_size = lib_size * factors
 
         # Method ``check_is_fitted`` is not called here, since it is
         # called in self.get_norm_factors
diff --git a/tests/test_ctf.py b/tests/test_ctf.py
index ee7efe8..2c27da3 100644
--- a/tests/test_ctf.py
+++ b/tests/test_ctf.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from sklearn import config_context
 
 from rnanorm import CTF
 
@@ -44,3 +45,11 @@ def test_ctf(exp, expected_factors, expected_ctf):
         expected_ctf.loc[["Sample_2"]],
         rtol=1e-3,
     )
+
+
+def test_global_set_output(exp):
+    """Ensure that global config does not break things."""
+    with config_context(transform_output="pandas"):
+        CTF().fit_transform(exp)
+
+    CTF().set_output(transform="pandas").fit_transform(exp)
diff --git a/tests/test_cuf.py b/tests/test_cuf.py
index 4c46f5d..e63bd34 100644
--- a/tests/test_cuf.py
+++ b/tests/test_cuf.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from sklearn import config_context
 
 from rnanorm import CUF
 
@@ -44,3 +45,11 @@ def test_cuf(exp, expected_factors, expected_cuf):
         expected_cuf.loc[["Sample_2"]],
         rtol=1e-3,
     )
+
+
+def test_global_set_output(exp):
+    """Ensure that global config does not break things."""
+    with config_context(transform_output="pandas"):
+        CUF().fit_transform(exp)
+
+    CUF().set_output(transform="pandas").fit_transform(exp)
diff --git a/tests/test_tmm.py b/tests/test_tmm.py
index 775a1f4..b9e2203 100644
--- a/tests/test_tmm.py
+++ b/tests/test_tmm.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from sklearn import config_context
 
 from rnanorm import TMM
 from rnanorm.datasets import load_gtex
@@ -70,3 +71,11 @@ def test_tmm_rnanorm_edger():
         rnanorm_factors,
         decimal=14,
     )
+
+
+def test_global_set_output(exp):
+    """Ensure that global config does not break things."""
+    with config_context(transform_output="pandas"):
+        TMM().fit_transform(exp)
+
+    TMM().set_output(transform="pandas").fit_transform(exp)
diff --git a/tests/test_uq.py b/tests/test_uq.py
index 836cefc..c891ab2 100644
--- a/tests/test_uq.py
+++ b/tests/test_uq.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from sklearn import config_context
 
 from rnanorm import UQ
 from rnanorm.datasets import load_gtex
@@ -71,3 +72,11 @@ def test_uq_rnanorm_edger():
         rnanorm_factors,
         decimal=14,
     )
+
+
+def test_global_set_output(exp):
+    """Ensure that global config does not break things."""
+    with config_context(transform_output="pandas"):
+        UQ().fit_transform(exp)
+
+    UQ().set_output(transform="pandas").fit_transform(exp)

From fa049d8f033bf9248274d00b642f267fdf9bab76 Mon Sep 17 00:00:00 2001
From: JureZmzrlikar <zmrzlikar.jure@gmail.com>
Date: Wed, 9 Aug 2023 12:39:59 +0200
Subject: [PATCH 3/5] Make get_norm_factors follow the set_output config

---
 src/rnanorm/methods/between_sample.py | 24 +++++++++++---
 tests/test_ctf.py                     |  9 ------
 tests/test_cuf.py                     |  9 ------
 tests/test_sklearn_compat.py          | 46 +++++++++++++++++++++++++++
 tests/test_tmm.py                     |  9 ------
 tests/test_uq.py                      |  9 ------
 6 files changed, 66 insertions(+), 40 deletions(-)

diff --git a/src/rnanorm/methods/between_sample.py b/src/rnanorm/methods/between_sample.py
index fd5cbbd..4df64dc 100644
--- a/src/rnanorm/methods/between_sample.py
+++ b/src/rnanorm/methods/between_sample.py
@@ -2,9 +2,11 @@
 from typing import Any, Optional
 
 import numpy as np
+import pandas as pd
 from scipy.stats import gmean, rankdata, scoreatpercentile
 from sklearn import config_context
 from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
+from sklearn.utils._set_output import _get_output_config
 from sklearn.utils.validation import check_is_fitted
 
 from ..typing import Numeric1D, Numeric2D, Self
@@ -95,7 +97,12 @@ def get_norm_factors(self, X: Numeric2D) -> Numeric1D:
         """
         check_is_fitted(self)
         factors = self._get_norm_factors(X)
-        return factors / self.geometric_mean_
+        factors = factors / self.geometric_mean_
+
+        config = _get_output_config("transform", self)
+        if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame):
+            return pd.Series(factors, index=X.index)
+        return factors
 
     def _reset(self) -> None:
         """Reset internal data-dependent state."""
@@ -319,11 +326,15 @@ def get_norm_factors(self, X: Numeric2D) -> Numeric1D:
         :param X: Expression raw count matrix (n_samples, n_features)s
         """
         check_is_fitted(self)
-        X = self._validate_data(X, force_all_finite=True, reset=False, dtype=float)
+        X2 = self._validate_data(X, force_all_finite=True, reset=False, dtype=float)
 
-        factors = self._get_norm_factors(X)
+        factors = self._get_norm_factors(X2)
+        factors = factors / self.geometric_mean_
 
-        return factors / self.geometric_mean_
+        config = _get_output_config("transform", self)
+        if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame):
+            return pd.Series(factors, index=X.index)
+        return factors
 
     def _reset(self) -> None:
         """Reset internal data-dependent state."""
@@ -362,6 +373,9 @@ def transform(self, X: Numeric2D) -> Numeric2D:
         """
         # Compute effective library sizes
         factors = self.get_norm_factors(X)
+        if isinstance(factors, pd.Series):
+            factors = factors.to_numpy()
+
         lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
         effective_lib_size = lib_size * factors
 
@@ -416,6 +430,8 @@ def transform(self, X: Numeric2D) -> Numeric2D:
         """
         # Just divide raw counts with normalization factors
         factors = self.get_norm_factors(X)
+        if isinstance(factors, pd.Series):
+            factors = factors.to_numpy()
 
         # Method ``check_is_fitted`` is not called here, since it is
         # called in self.get_norm_factors
diff --git a/tests/test_ctf.py b/tests/test_ctf.py
index 2c27da3..ee7efe8 100644
--- a/tests/test_ctf.py
+++ b/tests/test_ctf.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pandas as pd
 import pytest
-from sklearn import config_context
 
 from rnanorm import CTF
 
@@ -45,11 +44,3 @@ def test_ctf(exp, expected_factors, expected_ctf):
         expected_ctf.loc[["Sample_2"]],
         rtol=1e-3,
     )
-
-
-def test_global_set_output(exp):
-    """Ensure that global config does not break things."""
-    with config_context(transform_output="pandas"):
-        CTF().fit_transform(exp)
-
-    CTF().set_output(transform="pandas").fit_transform(exp)
diff --git a/tests/test_cuf.py b/tests/test_cuf.py
index e63bd34..4c46f5d 100644
--- a/tests/test_cuf.py
+++ b/tests/test_cuf.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pandas as pd
 import pytest
-from sklearn import config_context
 
 from rnanorm import CUF
 
@@ -45,11 +44,3 @@ def test_cuf(exp, expected_factors, expected_cuf):
         expected_cuf.loc[["Sample_2"]],
         rtol=1e-3,
     )
-
-
-def test_global_set_output(exp):
-    """Ensure that global config does not break things."""
-    with config_context(transform_output="pandas"):
-        CUF().fit_transform(exp)
-
-    CUF().set_output(transform="pandas").fit_transform(exp)
diff --git a/tests/test_sklearn_compat.py b/tests/test_sklearn_compat.py
index 4cda4d5..29fba04 100644
--- a/tests/test_sklearn_compat.py
+++ b/tests/test_sklearn_compat.py
@@ -1,4 +1,6 @@
+import numpy as np
 import pandas as pd
+from sklearn import config_context
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline
@@ -35,3 +37,47 @@ def test_grid_search():
     search.fit(X, y)
     results = pd.DataFrame(search.cv_results_)
     assert results.shape[0] == 7
+
+
+def test_set_output():
+    """Ensure set_output behaves as expected."""
+    ds = load_toy_data()
+
+    method__has_factors = (
+        (CPM(), False),
+        (FPKM(gtf=ds.gtf_path), False),
+        (TPM(gtf=ds.gtf_path), False),
+        (UQ(), True),
+        (CUF(), True),
+        (TMM(), True),
+        (CTF(), True),
+    )
+
+    for method, has_factors in method__has_factors:
+        # No configuration returns np.arrays
+        tmm = TMM()
+        result = tmm.fit_transform(ds.exp)
+        factors = tmm.get_norm_factors(ds.exp)
+        assert isinstance(result, np.ndarray)
+        assert isinstance(factors, np.ndarray)
+
+        # Explicit global config should return corresponding objects
+        with config_context(transform_output="default"):
+            result = tmm.fit_transform(ds.exp)
+            factors = tmm.get_norm_factors(ds.exp)
+            assert isinstance(result, np.ndarray)
+            assert isinstance(factors, np.ndarray)
+
+        with config_context(transform_output="pandas"):
+            result = tmm.fit_transform(ds.exp)
+            factors = tmm.get_norm_factors(ds.exp)
+            assert isinstance(result, pd.DataFrame)
+            assert isinstance(factors, pd.Series)
+
+        # Explicit pandas config should return pandas objects
+        method.set_output(transform="pandas")
+        result = method.fit_transform(ds.exp)
+        assert isinstance(result, pd.DataFrame)
+        if has_factors:
+            factors = method.get_norm_factors(ds.exp)
+            assert isinstance(factors, pd.Series)
diff --git a/tests/test_tmm.py b/tests/test_tmm.py
index b9e2203..775a1f4 100644
--- a/tests/test_tmm.py
+++ b/tests/test_tmm.py
@@ -3,7 +3,6 @@
 import numpy as np
 import pandas as pd
 import pytest
-from sklearn import config_context
 
 from rnanorm import TMM
 from rnanorm.datasets import load_gtex
@@ -71,11 +70,3 @@ def test_tmm_rnanorm_edger():
         rnanorm_factors,
         decimal=14,
     )
-
-
-def test_global_set_output(exp):
-    """Ensure that global config does not break things."""
-    with config_context(transform_output="pandas"):
-        TMM().fit_transform(exp)
-
-    TMM().set_output(transform="pandas").fit_transform(exp)
diff --git a/tests/test_uq.py b/tests/test_uq.py
index c891ab2..836cefc 100644
--- a/tests/test_uq.py
+++ b/tests/test_uq.py
@@ -3,7 +3,6 @@
 import numpy as np
 import pandas as pd
 import pytest
-from sklearn import config_context
 
 from rnanorm import UQ
 from rnanorm.datasets import load_gtex
@@ -72,11 +71,3 @@ def test_uq_rnanorm_edger():
         rnanorm_factors,
         decimal=14,
     )
-
-
-def test_global_set_output(exp):
-    """Ensure that global config does not break things."""
-    with config_context(transform_output="pandas"):
-        UQ().fit_transform(exp)
-
-    UQ().set_output(transform="pandas").fit_transform(exp)

From b4eae42a256b407a442dea2cc7130e0debc853cf Mon Sep 17 00:00:00 2001
From: JureZmzrlikar <zmrzlikar.jure@gmail.com>
Date: Wed, 9 Aug 2023 14:37:55 +0200
Subject: [PATCH 4/5] Remove leftovers of as_frame parameter

---
 src/rnanorm/datasets.py | 2 +-
 tests/test_tmm.py       | 2 +-
 tests/test_uq.py        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/rnanorm/datasets.py b/src/rnanorm/datasets.py
index 717f346..6b24684 100644
--- a/src/rnanorm/datasets.py
+++ b/src/rnanorm/datasets.py
@@ -37,7 +37,7 @@ def load_toy_data() -> Bunch:
     return ds
 
 
-def load_gtex(as_frame: bool = False) -> Bunch:
+def load_gtex() -> Bunch:
     """
     Load a real RNA-seq dataset from GTFx project.
 
diff --git a/tests/test_tmm.py b/tests/test_tmm.py
index 775a1f4..878c463 100644
--- a/tests/test_tmm.py
+++ b/tests/test_tmm.py
@@ -61,7 +61,7 @@ def test_tmm_rnanorm_edger():
     )
 
     # Compute scaling factors here
-    ds = load_gtex(as_frame=True)
+    ds = load_gtex()
     rnanorm_factors = TMM().fit(ds.exp).get_norm_factors(ds.exp)
 
     # Compare loaded and computed scaling factors
diff --git a/tests/test_uq.py b/tests/test_uq.py
index 836cefc..c4435ad 100644
--- a/tests/test_uq.py
+++ b/tests/test_uq.py
@@ -62,7 +62,7 @@ def test_uq_rnanorm_edger():
     )
 
     # Compute scaling factors here
-    ds = load_gtex(as_frame=True)
+    ds = load_gtex()
     rnanorm_factors = UQ().fit(ds.exp).get_norm_factors(ds.exp)
 
     # Compare loaded and computed scaling factors

From 019bf5fa1cc93b13569e854394416da55a2ceb2a Mon Sep 17 00:00:00 2001
From: JureZmzrlikar <zmrzlikar.jure@gmail.com>
Date: Wed, 9 Aug 2023 15:02:24 +0200
Subject: [PATCH 5/5] Update citation file

---
 CITATION.cff | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/CITATION.cff b/CITATION.cff
index 70e22a1..a1d0303 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,13 +1,19 @@
 cff-version: 1.2.0
 message: "If you use this software, please cite it as below."
-title: "RNA-seq data normalization in Python"
+title: "RNAnorm: RNA-seq data normalization in Python"
 version: 2.0.0
 date-released: 2023-07-12
 url: "https://github.com/genialis/RNAnorm"
 authors:
-- family-names: "Štajdohar"
-  given-names: "Miha"
-  orcid: "https://orcid.org/0009-0008-2749-8298"
 - family-names: "Zmrzlikar"
   given-names: "Jure"
   orcid: "https://orcid.org/0009-0009-9749-541X"
+- family-names: "Žganec"
+  given-names: "Matjaž"
+  orcid: "https://orcid.org/0000-0002-4159-9596"
+- family-names: "Ausec"
+  given-names: "Luka"
+  orcid: "https://orcid.org/0000-0003-3155-0999"
+- family-names: "Štajdohar"
+  given-names: "Miha"
+  orcid: "https://orcid.org/0009-0008-2749-8298"
\ No newline at end of file