Skip to content

Commit

Permalink
Merge pull request #50 from michaelbornholdt/enrichment
Browse files Browse the repository at this point in the history
Add enrichment operation
  • Loading branch information
gwaybio authored Apr 22, 2021
2 parents 59684cf + 4c4c99b commit 220b296
Show file tree
Hide file tree
Showing 8 changed files with 316 additions and 1,090 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,13 @@ evaluate(

## Metrics

Currently, five metric operations are supported:

1. Replicate reproducibility
2. Precision/recall
3. mp-value
4. Grit
5. Enrichment

## Demos

Expand Down
12 changes: 11 additions & 1 deletion cytominer_eval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
precision_recall,
grit,
mp_value,
enrichment,
)


def evaluate(
profiles: pd.DataFrame,
features: List[str],
Expand All @@ -30,6 +30,7 @@ def evaluate(
grit_control_perts: List[str] = ["None"],
grit_replicate_summary_method: str = "mean",
mp_value_params: dict = {},
enrichment_percentile: float = 0.5,
):
r"""Evaluate profile quality and strength.
Expand Down Expand Up @@ -99,6 +100,9 @@ def evaluate(
Only used when `operation='mp_value'`. A key, item pair of optional parameters
for calculating mp value. See also
:py:func:`cytominer_eval.operations.util.default_mp_value_parameters`
enrichment_percentile : float, optional
Only used when `operation='enrichment'`. Quantile (between 0 and 1) of the pairwise similarity
distribution used as the threshold; connections above it are treated as top connections. Defaults to 0.5.
"""
# Check replicate groups input
check_replicate_groups(eval_metric=operation, replicate_groups=replicate_groups)
Expand Down Expand Up @@ -143,5 +147,11 @@ def evaluate(
features=features,
params=mp_value_params,
)
elif operation == "enrichment":
metric_result = enrichment(
similarity_melted_df=similarity_melted_df,
replicate_groups=replicate_groups,
percentile=enrichment_percentile,
)

return metric_result
1 change: 1 addition & 0 deletions cytominer_eval/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
from .precision_recall import precision_recall
from .grit import grit
from .mp_value import mp_value
from .enrichment import enrichment
75 changes: 75 additions & 0 deletions cytominer_eval/operations/enrichment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Function to calculate the enrichment score for a given similarity matrix.
"""
import numpy as np
import pandas as pd
from typing import List
import scipy

from .util import assign_replicates, calculate_grit, check_grit_replicate_summary_method
from cytominer_eval.transform.util import (
set_pair_ids,
set_grit_column_info,
assert_melt,
)


def enrichment(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    percentile: float = 0.9,
) -> dict:
    """Calculate the enrichment score.

    The score is based on Fisher's exact test. Pairwise similarities are split at
    the `percentile` quantile, and the resulting 2x2 contingency table (replicate
    vs. non-replicate pairs, above vs. at-or-below the threshold) is tested with a
    one-sided ("greater") alternative. This measures how much better the
    distribution of correct connections is compared to random.

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`. It must contain
        a `similarity_metric` column.
    replicate_groups : List
        A list of metadata column names in the original profile dataframe to use as
        replicate columns.
    percentile : float
        Quantile (between 0 and 1) of the similarity distribution used as the
        threshold. Pairs with a similarity strictly greater than the threshold are
        counted as top connections.

    Returns
    -------
    dict
        A dictionary with the keys `percentile`, `threshold`, `ods_ratio` (the odds
        ratio; the key spelling is kept for backward compatibility) and `p-value`.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee that
    # `scipy.stats` is loaded and reachable as an attribute.
    from scipy.stats import fisher_exact

    # Threshold based on percentile of top connections
    threshold = similarity_melted_df.similarity_metric.quantile(percentile)

    # Adds the `group_replicate` column that the queries below filter on
    replicate_truth_df = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    )

    # Individual components of the contingency table. The queries stay at function
    # level so that `@threshold` resolves against this function's local variables.
    v11 = len(
        replicate_truth_df.query(
            "group_replicate==True and similarity_metric>@threshold"
        )
    )
    v12 = len(
        replicate_truth_df.query(
            "group_replicate==False and similarity_metric>@threshold"
        )
    )
    v21 = len(
        replicate_truth_df.query(
            "group_replicate==True and similarity_metric<=@threshold"
        )
    )
    v22 = len(
        replicate_truth_df.query(
            "group_replicate==False and similarity_metric<=@threshold"
        )
    )

    # Rows: above / at-or-below threshold; columns: replicate / non-replicate
    contingency_table = np.asarray([[v11, v12], [v21, v22]])
    odds_ratio, p_value = fisher_exact(contingency_table, alternative="greater")

    return {
        "percentile": percentile,
        "threshold": threshold,
        "ods_ratio": odds_ratio,
        "p-value": p_value,
    }
78 changes: 78 additions & 0 deletions cytominer_eval/tests/test_operations/test_enrichment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import random
import pytest
import pathlib
import tempfile
import numpy as np
import pandas as pd

from cytominer_eval.transform import metric_melt
from cytominer_eval.operations.enrichment import enrichment
from cytominer_eval import evaluate


random.seed(3141)
tmpdir = tempfile.gettempdir()


# Load LINCS dataset (path is resolved relative to this test module)
example_file = "SQ00015054_normalized_feature_select.csv.gz"
example_file = pathlib.Path(
    f"{os.path.dirname(__file__)}/../../example_data/compound/{example_file}"
)

df = pd.read_csv(example_file)

# Metadata columns are recognised by prefix; every remaining column is a feature
meta_features = [
    column for column in df.columns if column.startswith(("Metadata_", "Image_"))
]
features = df.drop(columns=meta_features).columns.tolist()

replicate_groups = ["Metadata_broad_sample"]

# Pairwise similarity table shared by the tests below
similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
    eval_metric="enrichment",
)


def test_enrichment():
    """Run enrichment over a range of percentiles and check the result layout."""
    # np.linspace takes an explicit number of points. np.arange with a float step
    # only produced the expected 7 values because of floating-point rounding.
    percentiles = np.linspace(1, 0.97, 7)
    result_df = pd.DataFrame(
        [
            enrichment(
                similarity_melted_df=similarity_melted_df,
                replicate_groups=replicate_groups,
                percentile=p,
            )
            for p in percentiles
        ]
    )

    # check for correct shape and starts with 1.0
    assert result_df.shape == (7, 4)
    assert result_df.percentile[0] == 1.0
    # check if the higher percentiles are larger than the small one
    assert result_df.percentile[1] > result_df.percentile.iloc[-1]


def test_compare_functions():
    """Check that `evaluate` and a direct `enrichment` call give the same result."""
    percentile = 0.9

    # Direct call on the pre-computed melted similarity table
    direct_result = enrichment(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        percentile=percentile,
    )

    # Same computation routed through the top-level evaluate entry point
    evaluate_kwargs = dict(
        profiles=df,
        features=features,
        meta_features=meta_features,
        replicate_groups=replicate_groups,
        operation="enrichment",
        similarity_metric="pearson",
        enrichment_percentile=percentile,
    )
    evaluate_result = evaluate(**evaluate_kwargs)

    assert direct_result == evaluate_result
1 change: 1 addition & 0 deletions cytominer_eval/tests/test_transform/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def test_get_available_eval_metrics():
"precision_recall",
"grit",
"mp_value",
"enrichment"
]
assert expected_result == get_available_eval_metrics()

Expand Down
2 changes: 1 addition & 1 deletion cytominer_eval/transform/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def get_available_eval_metrics():
    r"""Output the available eval metrics in the cytominer_eval library

    Returns
    -------
    list
        The names of the supported evaluation operations.
    """
    # Single return: the stale four-metric list used to come first, which hid
    # "enrichment" and left the updated list unreachable.
    return [
        "replicate_reproducibility",
        "precision_recall",
        "grit",
        "mp_value",
        "enrichment",
    ]


def get_available_similarity_metrics():
Expand Down
Loading

0 comments on commit 220b296

Please sign in to comment.