-
Notifications
You must be signed in to change notification settings - Fork 15
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Small fixes #25
Small fixes #25
Changes from 2 commits
82b1a15
07404cd
fa049d8
b4eae42
019bf5f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,9 @@ | ||
"""Between sample normalizations.""" | ||
from typing import Any, Optional | ||
|
||
import numpy as np | ||
from scipy.stats import gmean, rankdata, scoreatpercentile | ||
from sklearn import config_context | ||
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin | ||
from sklearn.utils.validation import check_is_fitted | ||
|
||
|
@@ -70,7 +73,9 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D: | |
:param X: Expression raw count matrix (n_samples, n_features) | ||
""" | ||
X = remove_zero_genes(X) | ||
lib_size = LibrarySize().fit_transform(X) | ||
# Make sure that global set_config(transform_output="pandas") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this really necessary? It's kind of ugly to break the intended sklearn logic. @mzganec, have a look. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you find this approach better?
@JureZmrzlikar If we go with this solution, we need to fix the remaining occurrences as well.
|
||
# does not affect this method - we need numpy output here. | ||
lib_size = LibrarySize().set_output(transform="default").fit_transform(X) | ||
|
||
# Compute upper quartile count for each sample. | ||
# No numpy method can be used as drop-in replacement for R's quantile. | ||
|
@@ -97,7 +102,7 @@ def _reset(self) -> None: | |
if hasattr(self, "geometric_mean_"): | ||
del self.geometric_mean_ | ||
|
||
def fit(self, X: Numeric2D) -> Self: | ||
def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self: | ||
"""Fit. | ||
|
||
:param X: Expression raw count matrix (n_samples, n_features) | ||
|
@@ -122,7 +127,8 @@ def transform(self, X: Numeric2D) -> Numeric2D: | |
|
||
# Compute effective library sizes | ||
factors = self.get_norm_factors(X) | ||
effective_lib_size = LibrarySize().fit_transform(X) * factors | ||
lib_size = LibrarySize().set_output(transform="default").fit_transform(X) | ||
effective_lib_size = lib_size * factors | ||
|
||
# Make CPM, but with effective library size | ||
return X / effective_lib_size[:, np.newaxis] * 1e6 | ||
|
@@ -241,8 +247,10 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D: | |
""" | ||
X = remove_zero_genes(X) | ||
|
||
lib_size = LibrarySize().fit_transform(X) | ||
lib_size_ref = LibrarySize().fit_transform(self.ref_[np.newaxis, :]) | ||
# ensure that output of transform will be a np.array | ||
with config_context(transform_output="default"): | ||
lib_size = LibrarySize().fit_transform(X) | ||
lib_size_ref = LibrarySize().fit_transform(self.ref_[np.newaxis, :]) | ||
|
||
# Values 0 cause a lot of troubles and warnings in log / division. | ||
# But computing with np.nan is OK, and is handled gracefully. | ||
|
@@ -329,7 +337,7 @@ def _get_ref(self, X: Numeric2D) -> Numeric1D: | |
ref_index = np.argmin(np.fabs(f75 - np.mean(f75))) | ||
return X[ref_index, :] | ||
|
||
def fit(self, X: Numeric2D) -> Self: | ||
def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self: | ||
"""Fit. | ||
|
||
:param X: Expression raw count matrix (n_samples, n_features) | ||
|
@@ -354,7 +362,8 @@ def transform(self, X: Numeric2D) -> Numeric2D: | |
""" | ||
# Compute effective library sizes | ||
factors = self.get_norm_factors(X) | ||
effective_lib_size = LibrarySize().fit_transform(X) * factors | ||
lib_size = LibrarySize().set_output(transform="default").fit_transform(X) | ||
effective_lib_size = lib_size * factors | ||
|
||
# Method ``check_is_fitted`` is not called here, since it is | ||
# called in self.get_norm_factors | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import pandas as pd | ||
from sklearn.linear_model import LogisticRegression | ||
from sklearn.model_selection import GridSearchCV | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.preprocessing import StandardScaler | ||
|
||
from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ | ||
from rnanorm.datasets import load_toy_data | ||
|
||
|
||
def test_grid_search(): | ||
"""Test compatibility of all methods with sklearn machinery.""" | ||
ds = load_toy_data() | ||
X = ds.exp | ||
y = pd.Series([0, 0, 1, 1], index=X.index) | ||
pipeline = Pipeline( | ||
steps=[ | ||
("normalization", CPM()), | ||
("scaler", StandardScaler()), | ||
("classifier", LogisticRegression()), | ||
] | ||
) | ||
params = { | ||
"normalization": [ | ||
CPM(), | ||
FPKM(gtf=ds.gtf_path), | ||
TPM(gtf=ds.gtf_path), | ||
UQ(), | ||
CUF(), | ||
TMM(), | ||
CTF(), | ||
], | ||
} | ||
search = GridSearchCV(pipeline, params, cv=2, refit=False) | ||
search.fit(X, y) | ||
results = pd.DataFrame(search.cv_results_) | ||
assert results.shape[0] == 7 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this really necessary? It's kind of ugly to break the intended sklearn logic.