-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from jump-cellpainting/refactor_recipe
Refactor recipe
- Loading branch information
Showing
4 changed files
with
239 additions
and
293 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,155 +1,192 @@ | ||
""" | ||
Perform the image-based profiling pipeline to process data | ||
""" | ||
# copied from https://github.com/broadinstitute/lincs-cell-painting/blob/master/profiles/profile.py | ||
# copied from | ||
# https://github.com/broadinstitute/profiling-resistance-mechanisms/blob/master/0.generate-profiles/scripts/profile_util.py | ||
|
||
import os | ||
import pathlib | ||
from profile_utils import process_pipeline | ||
import pandas as pd | ||
from pycytominer.aggregate import AggregateProfiles | ||
from pycytominer import ( | ||
aggregate, | ||
annotate, | ||
normalize, | ||
feature_select, | ||
audit, | ||
cyto_utils, | ||
) | ||
|
||
from profile_utils import get_args | ||
|
||
# Load Command Line Arguments | ||
args = get_args() | ||
|
||
sql_file = args.sql_file | ||
batch = args.batch | ||
plate_name = args.plate_name | ||
platemap_file = args.platemap_file | ||
barcode_platemap_file = args.barcode_platemap_file | ||
moa_file = args.moa_file | ||
cell_count_dir = args.cell_count_dir | ||
output_dir = args.output_dir | ||
|
||
# Initialize profile processing | ||
os.makedirs(output_dir, exist_ok=True) | ||
os.makedirs(cell_count_dir, exist_ok=True) | ||
cell_id = "A549" | ||
aggregate_method = "median" | ||
norm_method = "mad_robustize" | ||
compression = "gzip" | ||
float_format = "%.5g" | ||
strata = ["Image_Metadata_Plate", "Image_Metadata_Well"] | ||
feature_select_ops = [ | ||
"drop_na_columns", | ||
"variance_threshold", | ||
"correlation_threshold", | ||
"blacklist", | ||
] | ||
|
||
# Define external metadata to add to annotation | ||
moa_df = pd.read_csv(moa_file, sep="\t") | ||
barcode_platemap_df = pd.read_csv(barcode_platemap_file).query( | ||
"Assay_Plate_Barcode == @plate_name" | ||
) | ||
|
||
# # Aggregate profiles | ||
out_file = pathlib.PurePath(output_dir, f"{plate_name}.csv.gz") | ||
# ap = AggregateProfiles(sql_file=sql_file, strata=strata, operation=aggregate_method) | ||
# ap.aggregate_profiles( | ||
# output_file=out_file, float_format=float_format, compression="gzip" | ||
# ) | ||
|
||
# # Count cells | ||
# count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.csv") | ||
# cell_count_df = ap.count_cells() | ||
# cell_count_df.to_csv(count_file, sep=",", index=False) | ||
|
||
# del ap | ||
|
||
# Annotate profiles - Level 3 Data | ||
anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz") | ||
anno_df = annotate( | ||
profiles=out_file, | ||
platemap=platemap_file, | ||
join_on=["Metadata_well_position", "Metadata_Well"], | ||
cell_id=cell_id, | ||
format_broad_cmap=True, | ||
perturbation_mode="chemical", | ||
external_metadata=moa_df, | ||
external_join_left=["Metadata_broad_sample"], | ||
external_join_right=["Metadata_broad_sample"], | ||
) | ||
|
||
# Rename columns | ||
anno_df = anno_df.rename( | ||
{"Image_Metadata_Plate": "Metadata_Plate", "Image_Metadata_Well": "Metadata_Well"}, | ||
axis="columns", | ||
) | ||
|
||
# Add barcode platemap info | ||
anno_df = anno_df.assign( | ||
Metadata_Assay_Plate_Barcode=barcode_platemap_df.Assay_Plate_Barcode.values[0], | ||
Metadata_Plate_Map_Name=barcode_platemap_df.Plate_Map_Name.values[0] | ||
) | ||
|
||
# Reoroder columns | ||
metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True) | ||
cp_cols = cyto_utils.infer_cp_features(anno_df) | ||
reindex_cols = metadata_cols + cp_cols | ||
anno_df = anno_df.reindex(reindex_cols, axis="columns") | ||
|
||
# Output annotated file | ||
cyto_utils.output( | ||
df=anno_df, | ||
output_filename=anno_file, | ||
float_format=float_format, | ||
compression=compression, | ||
) | ||
|
||
# Normalize Profiles (DMSO Control) - Level 4A Data | ||
norm_dmso_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized_dmso.csv.gz") | ||
normalize( | ||
profiles=anno_df, | ||
samples="Metadata_broad_sample == 'DMSO'", | ||
method=norm_method, | ||
output_file=norm_dmso_file, | ||
float_format=float_format, | ||
compression=compression, | ||
) | ||
|
||
# Normalize Profiles (Whole Plate) - Level 4A Data | ||
norm_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized.csv.gz") | ||
normalize( | ||
profiles=anno_df, | ||
samples="all", | ||
method=norm_method, | ||
output_file=norm_file, | ||
float_format=float_format, | ||
compression=compression, | ||
) | ||
|
||
# Feature Selection (DMSO Control) - Level 4B Data | ||
feat_dmso_file = pathlib.PurePath( | ||
output_dir, f"{plate_name}_normalized_feature_select_dmso.csv.gz" | ||
) | ||
feature_select( | ||
profiles=norm_dmso_file, | ||
features="infer", | ||
operation=feature_select_ops, | ||
output_file=feat_dmso_file, | ||
float_format=float_format, | ||
compression=compression, | ||
) | ||
|
||
# Feature Selection (Whole Plate) - Level 4B Data | ||
feat_file = pathlib.PurePath( | ||
output_dir, f"{plate_name}_normalized_feature_select.csv.gz" | ||
) | ||
feature_select( | ||
profiles=norm_file, | ||
features="infer", | ||
operation=feature_select_ops, | ||
output_file=feat_file, | ||
float_format=float_format, | ||
compression=compression, | ||
) | ||
def process_profile(batch, plate, cell, pipeline):
    """Run the image-based profiling pipeline for one plate.

    Executes up to four sequential, individually gated steps — aggregate,
    annotate, normalize, feature select — each controlled by the ``perform``
    flag of the corresponding section in *pipeline*. Results are written as
    csv.gz files under ``<pipeline["output_dir"]>/<batch>/<plate>/``.

    Parameters
    ----------
    batch : str
        Batch name; used to locate the sqlite backend and platemap metadata.
    plate : str
        Plate barcode; used in every output filename.
    cell : str
        Cell line identifier forwarded to ``pycytominer.annotate``.
    pipeline : dict
        Parsed pipeline configuration with "options", "aggregate",
        "annotate", "normalize", "feature_select", "output_dir", and
        "platemap_well_column" entries.

    Returns
    -------
    None
        All results are written to disk as side effects.
    """
    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)

    # Set output file information.
    # FIX: the original defined both `aggregate_out_file` and
    # `aggregate_output_file` with the identical path and used them
    # inconsistently; a single name is kept here.
    aggregate_output_file = pathlib.PurePath(output_dir, f"{plate}.csv.gz")
    annotate_output_file = pathlib.PurePath(output_dir, f"{plate}_augmented.csv.gz")
    normalize_output_file = pathlib.PurePath(output_dir, f"{plate}_normalized.csv.gz")
    normalize_output_negcon_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_negcon.csv.gz"
    )
    feature_output_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_feature_select.csv.gz"
    )
    feature_output_negcon_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_feature_select_negcon.csv.gz"
    )

    # Load pipeline options shared by every step
    compression = process_pipeline(pipeline["options"], option="compression")
    float_format = process_pipeline(pipeline["options"], option="float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Aggregate Profiles: collapse single-cell measurements to per-well rows
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]

        # SQLAlchemy-style URI pointing at the single-cell sqlite backend
        # (four slashes: "sqlite:///" plus a leading "/" of the abspath)
        sql_file = f'sqlite:////{os.path.abspath(os.path.join("../../backend", batch, plate, f"{plate}.sqlite"))}'

        strata = [aggregate_plate_column, aggregate_well_column]

        # Optionally aggregate at the site level as well
        if "site_column" in aggregate_steps:
            strata += [aggregate_steps["site_column"]]

        # FIX: the original re-checked `aggregate_steps["perform"]` here even
        # though this branch is already inside that guard; the redundant
        # nested check is removed.
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(
            output_file=aggregate_output_file, compression=compression
        )

    # Annotate Profiles: join platemap (and optional external) metadata
    annotate_steps = pipeline["annotate"]
    if annotate_steps["perform"]:
        # FIX: `annotate_well_column` was previously also assigned outside
        # this guard; it is only used here, so the dead outer assignment
        # is removed.
        annotate_well_column = annotate_steps["well_column"]

        # Load and setup platemap info
        metadata_dir = pathlib.PurePath(".", "metadata", "platemaps", batch)
        barcode_plate_map_file = pathlib.PurePath(metadata_dir, "barcode_platemap.csv")
        barcode_plate_map_df = pd.read_csv(
            barcode_plate_map_file, dtype={"Assay_Plate_Barcode": str}
        )
        plate_map_name = barcode_plate_map_df.query(
            "Assay_Plate_Barcode == @plate"
        ).Plate_Map_Name.values[0]
        plate_map_file = pathlib.PurePath(metadata_dir, "platemap", f"{plate_map_name}.txt")
        plate_map_df = pd.read_csv(plate_map_file, sep="\t")
        # Ensure every platemap column carries the Metadata_ prefix
        plate_map_df.columns = [
            f"Metadata_{x}" if not x.startswith("Metadata_") else x
            for x in plate_map_df.columns
        ]
        platemap_well_column = pipeline["platemap_well_column"]

        if annotate_steps["external"]:
            # External (e.g. MOA) metadata joined on Metadata_broad_sample
            external_df = pd.read_csv(
                pathlib.PurePath(".", "metadata", "moa", annotate_steps["external"]),
                sep="\t",
            )
            anno_df = annotate(
                profiles=aggregate_output_file,
                platemap=plate_map_df,
                join_on=[platemap_well_column, annotate_well_column],
                cell_id=cell,
                external_metadata=external_df,
                external_join_left=["Metadata_broad_sample"],
                external_join_right=["Metadata_broad_sample"],
            )
        else:
            anno_df = annotate(
                profiles=aggregate_output_file,
                platemap=plate_map_df,
                join_on=[platemap_well_column, annotate_well_column],
                cell_id=cell,
            )

        # Rename to the Metadata_ convention and attach plate-level metadata
        anno_df = anno_df.rename(
            {
                "Image_Metadata_Plate": "Metadata_Plate",
                "Image_Metadata_Well": "Metadata_Well",
            },
            axis="columns",
        ).assign(
            Metadata_Assay_Plate_Barcode=plate,
            Metadata_Plate_Map_Name=barcode_plate_map_df.loc[
                barcode_plate_map_df.Assay_Plate_Barcode == plate, "Plate_Map_Name"
            ].values[0],
        )

        # Reorder columns: metadata columns first, then CellProfiler features
        metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True)
        cp_cols = cyto_utils.infer_cp_features(anno_df)
        reindex_cols = metadata_cols + cp_cols
        anno_df = anno_df.reindex(reindex_cols, axis="columns")

        # Output annotated file
        cyto_utils.output(
            df=anno_df,
            output_filename=annotate_output_file,
            float_format=float_format,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        normalization_features = normalize_steps["features"]
        normalization_method = normalize_steps["method"]
        normalize(
            profiles=annotate_output_file,
            features=normalization_features,
            samples=samples,
            method=normalization_method,
            output_file=normalize_output_file,
            float_format=float_format,
            compression=compression,
        )
        # Optionally also normalize against negative controls only
        if normalize_steps["negcon"]:
            normalize(
                profiles=annotate_output_file,
                features=normalization_features,
                samples="Metadata_control_type == 'negcon'",
                method=normalization_method,
                output_file=normalize_output_negcon_file,
                float_format=float_format,
                compression=compression,
            )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        feature_select(
            profiles=normalize_output_file,
            features=feature_select_features,
            operation=feature_select_operations,
            output_file=feature_output_file,
            float_format=float_format,
            compression=compression,
        )
        # Optionally also feature-select the negcon-normalized profiles
        if feature_select_steps["negcon"]:
            feature_select(
                profiles=normalize_output_negcon_file,
                features=feature_select_features,
                operation=feature_select_operations,
                output_file=feature_output_negcon_file,
                float_format=float_format,
                compression=compression,
            )
Oops, something went wrong.