Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small fixes #25

Merged
merged 5 commits into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions src/rnanorm/methods/between_sample.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""Between sample normalizations."""
from typing import Any, Optional

import numpy as np
from scipy.stats import gmean, rankdata, scoreatpercentile
from sklearn import config_context
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
from sklearn.utils.validation import check_is_fitted

Expand Down Expand Up @@ -70,7 +73,9 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
:param X: Expression raw count matrix (n_samples, n_features)
"""
X = remove_zero_genes(X)
lib_size = LibrarySize().fit_transform(X)
# Make sure that global set_config(transform_output="pandas")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this really necessary? It's kind of ugly to break the intended sklearn logic.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this really necessary? It's kind of ugly to break the intended sklearn logic. @mzganec, have a look.

Copy link

@mzganec mzganec Aug 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you find this approach better?

lib_size = LibrarySize().fit_transform(X)
...
return upper_quartiles / np.array(lib_size).flatten()

@JureZmrzlikar If we go with this solution, we need to fix effective_lib_size in transform as well.

lib_size = LibrarySize().fit_transform(X)
effective_lib_size = np.array(lib_size).flatten() * factors

# does not affect this method - we need numpy output here.
lib_size = LibrarySize().set_output(transform="default").fit_transform(X)

# Compute upper quartile count for each sample.
# No numpy method can be used as drop-in replacement for R's quantile.
Expand All @@ -97,7 +102,7 @@ def _reset(self) -> None:
if hasattr(self, "geometric_mean_"):
del self.geometric_mean_

def fit(self, X: Numeric2D) -> Self:
def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self:
"""Fit.

:param X: Expression raw count matrix (n_samples, n_features)
Expand All @@ -122,7 +127,8 @@ def transform(self, X: Numeric2D) -> Numeric2D:

# Compute effective library sizes
factors = self.get_norm_factors(X)
effective_lib_size = LibrarySize().fit_transform(X) * factors
lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
effective_lib_size = lib_size * factors

# Make CPM, but with effective library size
return X / effective_lib_size[:, np.newaxis] * 1e6
Expand Down Expand Up @@ -241,8 +247,10 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
"""
X = remove_zero_genes(X)

lib_size = LibrarySize().fit_transform(X)
lib_size_ref = LibrarySize().fit_transform(self.ref_[np.newaxis, :])
# ensure that output of transform will be a np.array
with config_context(transform_output="default"):
lib_size = LibrarySize().fit_transform(X)
lib_size_ref = LibrarySize().fit_transform(self.ref_[np.newaxis, :])

# Values 0 cause a lot of troubles and warnings in log / division.
# But computing with np.nan is OK, and is handled gracefully.
Expand Down Expand Up @@ -329,7 +337,7 @@ def _get_ref(self, X: Numeric2D) -> Numeric1D:
ref_index = np.argmin(np.fabs(f75 - np.mean(f75)))
return X[ref_index, :]

def fit(self, X: Numeric2D) -> Self:
def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self:
"""Fit.

:param X: Expression raw count matrix (n_samples, n_features)
Expand All @@ -354,7 +362,8 @@ def transform(self, X: Numeric2D) -> Numeric2D:
"""
# Compute effective library sizes
factors = self.get_norm_factors(X)
effective_lib_size = LibrarySize().fit_transform(X) * factors
lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
effective_lib_size = lib_size * factors

# Method ``check_is_fitted`` is not called here, since it is
# called in self.get_norm_factors
Expand Down
9 changes: 9 additions & 0 deletions tests/test_ctf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from sklearn import config_context

from rnanorm import CTF

Expand Down Expand Up @@ -44,3 +45,11 @@ def test_ctf(exp, expected_factors, expected_ctf):
expected_ctf.loc[["Sample_2"]],
rtol=1e-3,
)


def test_global_set_output(exp):
    """Global ``transform_output`` config must not break CTF."""
    # Globally requesting pandas output should be tolerated.
    with config_context(transform_output="pandas"):
        CTF().fit_transform(exp)

    # Per-estimator pandas output should work as well.
    estimator = CTF()
    estimator.set_output(transform="pandas")
    estimator.fit_transform(exp)
9 changes: 9 additions & 0 deletions tests/test_cuf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from sklearn import config_context

from rnanorm import CUF

Expand Down Expand Up @@ -44,3 +45,11 @@ def test_cuf(exp, expected_factors, expected_cuf):
expected_cuf.loc[["Sample_2"]],
rtol=1e-3,
)


def test_global_set_output(exp):
    """Global ``transform_output`` config must not break CUF."""
    # Globally requesting pandas output should be tolerated.
    with config_context(transform_output="pandas"):
        CUF().fit_transform(exp)

    # Per-estimator pandas output should work as well.
    estimator = CUF()
    estimator.set_output(transform="pandas")
    estimator.fit_transform(exp)
37 changes: 37 additions & 0 deletions tests/test_sklearn_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
from rnanorm.datasets import load_toy_data


def test_grid_search():
    """Every normalization method should plug into the sklearn machinery."""
    dataset = load_toy_data()
    X = dataset.exp
    # Two samples per class so that 2-fold CV is stratifiable.
    y = pd.Series([0, 0, 1, 1], index=X.index)

    # All normalization methods to be swapped into the pipeline.
    normalizers = [
        CPM(),
        FPKM(gtf=dataset.gtf_path),
        TPM(gtf=dataset.gtf_path),
        UQ(),
        CUF(),
        TMM(),
        CTF(),
    ]
    pipeline = Pipeline(
        steps=[
            ("normalization", CPM()),
            ("scaler", StandardScaler()),
            ("classifier", LogisticRegression()),
        ]
    )
    search = GridSearchCV(pipeline, {"normalization": normalizers}, cv=2, refit=False)
    search.fit(X, y)
    # One result row per candidate normalization method.
    assert pd.DataFrame(search.cv_results_).shape[0] == 7
9 changes: 9 additions & 0 deletions tests/test_tmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pandas as pd
import pytest
from sklearn import config_context

from rnanorm import TMM
from rnanorm.datasets import load_gtex
Expand Down Expand Up @@ -70,3 +71,11 @@ def test_tmm_rnanorm_edger():
rnanorm_factors,
decimal=14,
)


def test_global_set_output(exp):
    """Global ``transform_output`` config must not break TMM."""
    # Globally requesting pandas output should be tolerated.
    with config_context(transform_output="pandas"):
        TMM().fit_transform(exp)

    # Per-estimator pandas output should work as well.
    estimator = TMM()
    estimator.set_output(transform="pandas")
    estimator.fit_transform(exp)
9 changes: 9 additions & 0 deletions tests/test_uq.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pandas as pd
import pytest
from sklearn import config_context

from rnanorm import UQ
from rnanorm.datasets import load_gtex
Expand Down Expand Up @@ -71,3 +72,11 @@ def test_uq_rnanorm_edger():
rnanorm_factors,
decimal=14,
)


def test_global_set_output(exp):
    """Global ``transform_output`` config must not break UQ."""
    # Globally requesting pandas output should be tolerated.
    with config_context(transform_output="pandas"):
        UQ().fit_transform(exp)

    # Per-estimator pandas output should work as well.
    estimator = UQ()
    estimator.set_output(transform="pandas")
    estimator.fit_transform(exp)