-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #50 from michaelbornholdt/enrichment
Add enrichment operation
- Loading branch information
Showing
8 changed files
with
316 additions
and
1,090 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
"""Function to calculate the enrichment score for a given similarity matrix. | ||
""" | ||
import numpy as np | ||
import pandas as pd | ||
from typing import List | ||
import scipy | ||
|
||
from .util import assign_replicates, calculate_grit, check_grit_replicate_summary_method | ||
from cytominer_eval.transform.util import ( | ||
set_pair_ids, | ||
set_grit_column_info, | ||
assert_melt, | ||
) | ||
|
||
|
||
def enrichment(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    percentile: float = 0.9,
) -> dict:
    """Calculate the enrichment score of replicates among the top connections.

    The score is the odds ratio of a one-sided Fisher exact test. All pairwise
    connections are split by whether their similarity exceeds the value at the
    requested percentile and by whether the pair is a replicate pair; the
    resulting 2x2 contingency table is tested for an over-representation of
    replicate pairs among the strongest connections.

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.
    replicate_groups : List
        A list of metadata column names in the original profile dataframe to use
        as replicate columns.
    percentile : float, optional
        Quantile (between 0 and 1) of ``similarity_metric`` used as the cut-off
        for "top" connections. Defaults to 0.9.

    Returns
    -------
    dict
        A dictionary with the keys ``"percentile"``, ``"threshold"``,
        ``"ods_ratio"`` and ``"p-value"``.
    """
    # Imported locally so that the ``scipy.stats`` submodule is guaranteed to be
    # loaded; a bare ``import scipy`` does not expose it on every SciPy version.
    from scipy import stats

    # Similarity value that separates the top connections from the rest.
    threshold = similarity_melted_df.similarity_metric.quantile(percentile)

    replicate_truth_df = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    )

    # Boolean masks for the two margins of the contingency table. Both sides of
    # each split are compared explicitly so that rows with missing values end up
    # in no cell at all.
    is_replicate = replicate_truth_df.group_replicate == True  # noqa: E712
    is_non_replicate = replicate_truth_df.group_replicate == False  # noqa: E712
    above = replicate_truth_df.similarity_metric > threshold
    at_or_below = replicate_truth_df.similarity_metric <= threshold

    # Rows: above / not above the threshold. Columns: replicate / non-replicate.
    contingency = np.asarray(
        [
            [int((is_replicate & above).sum()), int((is_non_replicate & above).sum())],
            [
                int((is_replicate & at_or_below).sum()),
                int((is_non_replicate & at_or_below).sum()),
            ],
        ]
    )
    fisher = stats.fisher_exact(contingency, alternative="greater")

    # NOTE: the key "ods_ratio" is misspelt but kept as-is because callers and
    # tests rely on the exact key names.
    return {
        "percentile": percentile,
        "threshold": threshold,
        "ods_ratio": fisher[0],
        "p-value": fisher[1],
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import os | ||
import random | ||
import pytest | ||
import pathlib | ||
import tempfile | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from cytominer_eval.transform import metric_melt | ||
from cytominer_eval.operations.enrichment import enrichment | ||
from cytominer_eval import evaluate | ||
|
||
|
||
# Seed Python's ``random`` module and look up the system temp directory.
random.seed(3141)
tmpdir = tempfile.gettempdir()

# Load LINCS dataset
example_file = "SQ00015054_normalized_feature_select.csv.gz"
# Join the path components instead of formatting a string: when
# ``os.path.dirname(__file__)`` is empty, the formatted string would start with
# "/" and point at the filesystem root rather than relative to this file.
example_file = pathlib.Path(
    os.path.dirname(__file__), "..", "..", "example_data", "compound", example_file
)

df = pd.read_csv(example_file)

# Columns prefixed with "Metadata_" or "Image_" are treated as metadata; every
# remaining column is treated as a feature.
meta_features = [x for x in df.columns if x.startswith(("Metadata_", "Image_"))]
features = df.drop(meta_features, axis="columns").columns.tolist()

replicate_groups = ["Metadata_broad_sample"]

# Pairwise similarity table shared by the tests below.
similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
    eval_metric="enrichment",
)
|
||
|
||
def test_enrichment():
    """Run ``enrichment`` over a range of percentiles and check the output table."""
    # Use linspace with an explicit count: the length of a float-step
    # ``np.arange`` depends on rounding, while the shape assertion below needs
    # exactly seven percentiles from 1.0 down to 0.97.
    percentiles = np.linspace(1, 0.97, 7)
    result_df = pd.DataFrame(
        [
            enrichment(
                similarity_melted_df=similarity_melted_df,
                replicate_groups=replicate_groups,
                percentile=p,
            )
            for p in percentiles
        ]
    )

    # check for correct shape and starts with 1.0
    assert result_df.shape == (7, 4)
    assert result_df.percentile[0] == 1.0
    # check if the higher percentiles are larger than the small one
    assert result_df.percentile[1] > result_df.percentile.iloc[-1]
    # a lower percentile can never yield a higher similarity threshold
    assert result_df.threshold.is_monotonic_decreasing
|
||
|
||
def test_compare_functions():
    """Check that ``evaluate`` with the enrichment operation matches a direct call."""
    cutoff = 0.9

    # Result obtained through the generic ``evaluate`` entry point.
    via_evaluate = evaluate(
        profiles=df,
        features=features,
        meta_features=meta_features,
        replicate_groups=replicate_groups,
        operation="enrichment",
        similarity_metric="pearson",
        enrichment_percentile=cutoff,
    )

    # Result obtained by calling the operation directly on the melted table.
    direct = enrichment(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        percentile=cutoff,
    )

    assert direct == via_evaluate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.