Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update error message and docs for features argument to clarify CellProfiler default expectations and how to handle non-CellProfiler data #448

Merged
merged 15 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pycytominer/consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def consensus(
features : list
A list of strings corresponding to feature measurement column names in the
`profiles` DataFrame. All features listed must be found in `profiles`.
Defaults to "infer". If "infer", then assume cell painting features are those
Defaults to "infer". If "infer", then assume features are from CellProfiler output and
prefixed with "Cells", "Nuclei", or "Cytoplasm".
output_file : str, optional
If provided, will write consensus profiles to file. If not specified, will
Expand Down
17 changes: 12 additions & 5 deletions pycytominer/cyto_utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def infer_cp_features(
metadata=False,
image_features=False,
):
"""Given a dataframe, output features that we expect to be Cell Painting features.
"""Given CellProfiler output data read as a DataFrame, output feature column names as a list.

Parameters
----------
Expand All @@ -90,6 +90,8 @@ def infer_cp_features(
Compartments from which Cell Painting features were extracted.
metadata : bool, default False
Whether or not to infer metadata features.
If metadata is set to True, find column names that begin with the `Metadata_` prefix.
This convention is expected by CellProfiler defaults.
image_features : bool, default False
Whether or not the profiles contain image features.

Expand All @@ -115,9 +117,12 @@ def infer_cp_features(
population_df.columns.str.startswith("Metadata_")
].tolist()

assert ( # noqa: S101
len(features) > 0
), "No CP features found. Are you sure this dataframe is from CellProfiler?"
if len(features) == 0:
raise ValueError(
"No features or metadata found. Pycytominer expects CellProfiler column names by default. "
"If you're using non-CellProfiler data, please do not 'infer' features. "
"Instead, check if the function has a `features` or `meta_features` parameter, and input column names manually."
)

return features

Expand Down Expand Up @@ -150,7 +155,9 @@ def drop_outlier_features(
population_df : pandas.core.frame.DataFrame
DataFrame that includes metadata and observation features.
features : list of str or str, default "infer"
Features present in the population dataframe. If "infer", then assume Cell Painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_"
Features present in the population dataframe. If "infer",
then assume CellProfiler feature naming conventions
(column names that start with "Cells_", "Nuclei_", or "Cytoplasm_").
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/cyto_utils/modz.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,10 @@ def modz(
a string or list of column(s) in the population dataframe that
indicate replicate level information
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
method : str, default "spearman"
indicating which correlation metric to use.
min_weight : float, default 0.01
Expand Down
2 changes: 1 addition & 1 deletion pycytominer/cyto_utils/write_gct.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def write_gct(
features : list
A list of strings corresponding to feature measurement column names in the
`profiles` DataFrame. All features listed must be found in `profiles`.
Defaults to "infer". If "infer", then assume cell painting features are those
Defaults to "infer". If "infer", then assume features are from CellProfiler output and
prefixed with "Cells", "Nuclei", or "Cytoplasm".
meta_features : list
A list of strings corresponding to metadata column names in the `profiles`
Expand Down
4 changes: 2 additions & 2 deletions pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ def feature_select(
----------
profiles : pandas.core.frame.DataFrame or file
DataFrame or file of profiles.
features : list
features : list, default "infer"
A list of strings corresponding to feature measurement column names in the
`profiles` DataFrame. All features listed must be found in `profiles`.
Defaults to "infer". If "infer", then assume cell painting features are those
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
image_features: bool, default False
Whether the profiles contain image features.
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,15 @@ def normalize(
features : list
A list of strings corresponding to feature measurement column names in the
`profiles` DataFrame. All features listed must be found in `profiles`.
Defaults to "infer". If "infer", then assume cell painting features are those
Defaults to "infer". If "infer", then assume features are from CellProfiler output and
prefixed with "Cells", "Nuclei", or "Cytoplasm".
image_features: bool, default False
Whether the profiles contain image features.
meta_features : list
A list of strings corresponding to metadata column names in the `profiles`
DataFrame. All features listed must be found in `profiles`. Defaults to "infer".
If "infer", then assume metadata features are those prefixed with "Metadata"
If "infer", then assume CellProfiler metadata features, identified by
column names that begin with the `Metadata_` prefix.
samples : str
The metadata column values to use as a normalization reference. We often use
control samples. The function uses a pd.query() function, so you should
Expand Down Expand Up @@ -114,7 +115,7 @@ def normalize(
normalized_df = normalize(
profiles=data_df,
features=["x", "y", "z", "zz"],
meta_features="infer",
meta_features=["Metadata_plate", "Metadata_treatment"],
samples="Metadata_treatment == 'control'",
method="standardize"
)
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/operations/correlation_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ def correlation_threshold(
population_df : pandas.core.frame.DataFrame
DataFrame that includes metadata and observation features.
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand Down
11 changes: 6 additions & 5 deletions pycytominer/operations/get_na_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
population_df : pandas.core.frame.DataFrame
DataFrame that includes metadata and observation features.
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand All @@ -36,8 +37,8 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):

if features == "infer":
features = infer_cp_features(population_df)
else:
population_df = population_df.loc[:, features]

population_df = population_df.loc[:, features]
axiomcura marked this conversation as resolved.
Show resolved Hide resolved

num_rows = population_df.shape[0]
na_prop_df = population_df.isna().sum() / num_rows
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/operations/noise_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ def noise_removal(
The list of unique perturbations corresponding to the rows in population_df. For example,
perturb1_well1 and perturb1_well2 would both be "perturb1".
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/operations/variance_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ def variance_threshold(
population_df : pandas.core.frame.DataFrame
DataFrame that includes metadata and observation features.
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand Down
4 changes: 2 additions & 2 deletions tests/test_cyto_utils/test_feature_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ def test_feature_infer():


def test_feature_infer_nocp():
with pytest.raises(AssertionError) as nocp:
with pytest.raises(ValueError) as nocp:
infer_cp_features(population_df=non_cp_data_df)

assert "No CP features found." in str(nocp.value)
assert "No features or metadata found." in str(nocp.value)


def test_metadata_feature_infer():
Expand Down
4 changes: 2 additions & 2 deletions tests/test_operations/test_correlation_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_correlation_threshold_samples():


def test_correlation_threshold_featureinfer():
with pytest.raises(AssertionError) as nocp:
with pytest.raises(ValueError) as nocp:
correlation_threshold_result = correlation_threshold(
population_df=data_df,
features="infer",
Expand All @@ -84,7 +84,7 @@ def test_correlation_threshold_featureinfer():
method="pearson",
)

assert "No CP features found." in str(nocp.value)
assert "No features found." in str(nocp.value)

data_cp_df = data_df.copy()
data_cp_df.columns = [f"Cells_{x}" for x in data_df.columns]
Expand Down
4 changes: 2 additions & 2 deletions tests/test_operations/test_get_na_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ def test_get_na_columns_sample():


def test_get_na_columns_featureinfer():
with pytest.raises(AssertionError) as nocp:
with pytest.raises(ValueError) as nocp:
get_na_columns(
population_df=data_df, samples="all", features="infer", cutoff=0.1
)

assert "No CP features found." in str(nocp.value)
assert "No features found." in str(nocp.value)
4 changes: 2 additions & 2 deletions tests/test_operations/test_variance_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,12 @@ def test_variance_threshold():

def test_variance_threshold_featureinfer():
unique_cut = 0.01
with pytest.raises(AssertionError) as nocp:
with pytest.raises(ValueError) as nocp:
excluded_features = variance_threshold(
population_df=data_unique_test_df, features="infer", unique_cut=unique_cut
)

assert "No CP features found." in str(nocp.value)
assert "No features found." in str(nocp.value)

data_cp_df = data_unique_test_df.copy()
data_cp_df.columns = [f"Cells_{x}" for x in data_unique_test_df.columns]
Expand Down
Loading