Skip to content

Commit

Permalink
Merge pull request #14 from jump-cellpainting/refactor_recipe
Browse files Browse the repository at this point in the history
Refactor recipe
  • Loading branch information
niranjchandrasekaran authored Jan 28, 2021
2 parents f712db5 + c294765 commit ef77653
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 293 deletions.
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ dependencies:
- conda-forge::python=3.7.1
- conda-forge::pandas=0.24.2
- conda-forge::pip=19.2.2
- conda-forge::pyyaml=5.3.1
- pip
- pip:
- git+https://github.com/cytomining/pycytominer@820a8369ab7d49be118d73e9074f998739eab325
- git+https://github.com/cytomining/pycytominer@c1aa34b641b4e07eb5cbd424166f31355abdbd4d
315 changes: 176 additions & 139 deletions profiles/profile.py
Original file line number Diff line number Diff line change
@@ -1,155 +1,192 @@
"""
Perform the image-based profiling pipeline to process data
"""
# copied from https://github.com/broadinstitute/lincs-cell-painting/blob/master/profiles/profile.py
# copied from
# https://github.com/broadinstitute/profiling-resistance-mechanisms/blob/master/0.generate-profiles/scripts/profile_util.py

import os
import pathlib
from profile_utils import process_pipeline
import pandas as pd
from pycytominer.aggregate import AggregateProfiles
from pycytominer import (
aggregate,
annotate,
normalize,
feature_select,
audit,
cyto_utils,
)

from profile_utils import get_args

# Load command-line arguments (argument definitions live in profile_utils.get_args)
args = get_args()

# Input/output paths supplied on the command line
sql_file = args.sql_file
batch = args.batch
plate_name = args.plate_name
platemap_file = args.platemap_file
barcode_platemap_file = args.barcode_platemap_file
moa_file = args.moa_file
cell_count_dir = args.cell_count_dir
output_dir = args.output_dir

# Initialize profile processing
os.makedirs(output_dir, exist_ok=True)
os.makedirs(cell_count_dir, exist_ok=True)
cell_id = "A549"  # cell line identifier passed to pycytominer annotate()
aggregate_method = "median"  # per-well aggregation statistic
norm_method = "mad_robustize"  # method name passed to pycytominer normalize()
compression = "gzip"
float_format = "%.5g"  # float precision for all written csv files
# Grouping columns: one aggregated profile per plate/well pair
strata = ["Image_Metadata_Plate", "Image_Metadata_Well"]
# Operations applied, in this order, by pycytominer feature_select()
feature_select_ops = [
    "drop_na_columns",
    "variance_threshold",
    "correlation_threshold",
    "blacklist",
]

# Define external metadata to add to annotation
moa_df = pd.read_csv(moa_file, sep="\t")
# Restrict the barcode platemap to the single row describing this plate
barcode_platemap_df = pd.read_csv(barcode_platemap_file).query(
    "Assay_Plate_Barcode == @plate_name"
)

# # Aggregate profiles
# NOTE(review): the aggregation step below is commented out, so `out_file`
# is presumably expected to already exist on disk when annotate() reads it
# — confirm before running this script on a fresh plate.
out_file = pathlib.PurePath(output_dir, f"{plate_name}.csv.gz")
# ap = AggregateProfiles(sql_file=sql_file, strata=strata, operation=aggregate_method)
# ap.aggregate_profiles(
#     output_file=out_file, float_format=float_format, compression="gzip"
# )

# # Count cells
# count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.csv")
# cell_count_df = ap.count_cells()
# cell_count_df.to_csv(count_file, sep=",", index=False)

# del ap

# Annotate profiles - Level 3 Data
# Joins platemap metadata (and MOA metadata) onto the aggregated profiles.
anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz")
anno_df = annotate(
    profiles=out_file,
    platemap=platemap_file,
    join_on=["Metadata_well_position", "Metadata_Well"],
    cell_id=cell_id,
    format_broad_cmap=True,
    perturbation_mode="chemical",
    external_metadata=moa_df,
    external_join_left=["Metadata_broad_sample"],
    external_join_right=["Metadata_broad_sample"],
)

# Rename columns to the canonical Metadata_ prefix
anno_df = anno_df.rename(
    {"Image_Metadata_Plate": "Metadata_Plate", "Image_Metadata_Well": "Metadata_Well"},
    axis="columns",
)

# Add barcode platemap info (single row selected above for this plate)
anno_df = anno_df.assign(
    Metadata_Assay_Plate_Barcode=barcode_platemap_df.Assay_Plate_Barcode.values[0],
    Metadata_Plate_Map_Name=barcode_platemap_df.Plate_Map_Name.values[0]
)

# Reorder columns: metadata columns first, then CellProfiler feature columns
metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True)
cp_cols = cyto_utils.infer_cp_features(anno_df)
reindex_cols = metadata_cols + cp_cols
anno_df = anno_df.reindex(reindex_cols, axis="columns")

# Output annotated file
cyto_utils.output(
    df=anno_df,
    output_filename=anno_file,
    float_format=float_format,
    compression=compression,
)

# Normalize Profiles (DMSO Control) - Level 4A Data
# Normalization statistics are computed from DMSO control wells only.
norm_dmso_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized_dmso.csv.gz")
normalize(
    profiles=anno_df,
    samples="Metadata_broad_sample == 'DMSO'",
    method=norm_method,
    output_file=norm_dmso_file,
    float_format=float_format,
    compression=compression,
)

# Normalize Profiles (Whole Plate) - Level 4A Data
# Same normalization, but statistics computed over every well on the plate.
norm_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized.csv.gz")
normalize(
    profiles=anno_df,
    samples="all",
    method=norm_method,
    output_file=norm_file,
    float_format=float_format,
    compression=compression,
)

# Feature Selection (DMSO Control) - Level 4B Data
feat_dmso_file = pathlib.PurePath(
    output_dir, f"{plate_name}_normalized_feature_select_dmso.csv.gz"
)
feature_select(
    profiles=norm_dmso_file,
    features="infer",
    operation=feature_select_ops,
    output_file=feat_dmso_file,
    float_format=float_format,
    compression=compression,
)

# Feature Selection (Whole Plate) - Level 4B Data
feat_file = pathlib.PurePath(
    output_dir, f"{plate_name}_normalized_feature_select.csv.gz"
)
feature_select(
    profiles=norm_file,
    features="infer",
    operation=feature_select_ops,
    output_file=feat_file,
    float_format=float_format,
    compression=compression,
)
def process_profile(batch, plate, cell, pipeline):
    """
    Run the image-based profiling pipeline for a single plate.

    Each stage — aggregate, annotate, normalize, feature select — is switched
    on/off by its ``perform`` flag in the ``pipeline`` configuration and
    writes its output as a gzip-compressed csv file under
    ``pipeline["output_dir"]/<batch>/<plate>``.

    Parameters
    ----------
    batch : str
        Batch name; locates the sqlite backend and the platemap metadata.
    plate : str
        Plate barcode; used in output file names and platemap lookup.
    cell : str
        Cell line identifier forwarded to pycytominer ``annotate()``.
    pipeline : dict
        Parsed pipeline configuration with ``options``, ``aggregate``,
        ``annotate``, ``normalize`` and ``feature_select`` sections.

    Returns
    -------
    None
        All results are written to disk.
    """
    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)

    # Set output file information (one file per processing level)
    aggregate_output_file = pathlib.PurePath(output_dir, f"{plate}.csv.gz")
    annotate_output_file = pathlib.PurePath(output_dir, f"{plate}_augmented.csv.gz")
    normalize_output_file = pathlib.PurePath(output_dir, f"{plate}_normalized.csv.gz")
    normalize_output_negcon_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_negcon.csv.gz"
    )
    feature_output_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_feature_select.csv.gz"
    )
    feature_output_negcon_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_feature_select_negcon.csv.gz"
    )

    # Load pipeline options shared by every stage
    compression = process_pipeline(pipeline["options"], option="compression")
    float_format = process_pipeline(pipeline["options"], option="float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Aggregate single-cell profiles into per-well (or per-site) profiles
    aggregate_steps = pipeline["aggregate"]

    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]

        # SQLAlchemy-style URI pointing at this plate's single-cell backend
        sql_file = f'sqlite:////{os.path.abspath(os.path.join("../../backend", batch, plate, f"{plate}.sqlite"))}'

        strata = [aggregate_plate_column, aggregate_well_column]

        # Optional finer grouping: aggregate per site instead of per well
        if "site_column" in aggregate_steps:
            aggregate_site_column = aggregate_steps["site_column"]
            strata += [aggregate_site_column]

        # The redundant inner `perform` check and the duplicate
        # aggregate_out_file variable from the previous revision are removed;
        # the single aggregate_output_file path is used throughout.
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        ap.aggregate_profiles(
            output_file=aggregate_output_file, compression=compression
        )

    # Annotate profiles with platemap (and optional external) metadata
    annotate_steps = pipeline["annotate"]

    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]

        # Load and setup platemap info
        metadata_dir = pathlib.PurePath(".", "metadata", "platemaps", batch)
        barcode_plate_map_file = pathlib.PurePath(metadata_dir, "barcode_platemap.csv")
        barcode_plate_map_df = pd.read_csv(
            barcode_plate_map_file, dtype={"Assay_Plate_Barcode": str}
        )
        plate_map_name = barcode_plate_map_df.query(
            "Assay_Plate_Barcode == @plate"
        ).Plate_Map_Name.values[0]
        plate_map_file = pathlib.PurePath(metadata_dir, "platemap", f"{plate_map_name}.txt")
        plate_map_df = pd.read_csv(plate_map_file, sep="\t")
        # Ensure every platemap column carries the Metadata_ prefix
        plate_map_df.columns = [
            f"Metadata_{x}" if not x.startswith("Metadata_") else x
            for x in plate_map_df.columns
        ]
        platemap_well_column = pipeline["platemap_well_column"]

        # Optionally join external (e.g. MOA) metadata onto the profiles
        if annotate_steps["external"]:
            external_df = pd.read_csv(
                pathlib.PurePath(".", "metadata", "moa", annotate_steps["external"]),
                sep="\t",
            )
            anno_df = annotate(
                profiles=aggregate_output_file,
                platemap=plate_map_df,
                join_on=[platemap_well_column, annotate_well_column],
                cell_id=cell,
                external_metadata=external_df,
                external_join_left=["Metadata_broad_sample"],
                external_join_right=["Metadata_broad_sample"],
            )
        else:
            anno_df = annotate(
                profiles=aggregate_output_file,
                platemap=plate_map_df,
                join_on=[platemap_well_column, annotate_well_column],
                cell_id=cell,
            )

        # Rename to canonical Metadata_ columns and record plate provenance
        anno_df = anno_df.rename(
            {
                "Image_Metadata_Plate": "Metadata_Plate",
                "Image_Metadata_Well": "Metadata_Well",
            },
            axis="columns",
        ).assign(
            Metadata_Assay_Plate_Barcode=plate,
            Metadata_Plate_Map_Name=barcode_plate_map_df.loc[
                barcode_plate_map_df.Assay_Plate_Barcode == plate, "Plate_Map_Name"
            ].values[0],
        )

        # Reorder columns: metadata first, then CellProfiler features
        metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True)
        cp_cols = cyto_utils.infer_cp_features(anno_df)
        reindex_cols = metadata_cols + cp_cols
        anno_df = anno_df.reindex(reindex_cols, axis="columns")

        # Output annotated file
        cyto_utils.output(
            df=anno_df,
            output_filename=annotate_output_file,
            float_format=float_format,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        normalization_features = normalize_steps["features"]
        normalization_method = normalize_steps["method"]
        normalize(
            profiles=annotate_output_file,
            features=normalization_features,
            samples=samples,
            method=normalization_method,
            output_file=normalize_output_file,
            float_format=float_format,
            compression=compression,
        )
        # Optionally normalize against negative-control wells only
        if normalize_steps["negcon"]:
            normalize(
                profiles=annotate_output_file,
                features=normalization_features,
                samples="Metadata_control_type == 'negcon'",
                method=normalization_method,
                output_file=normalize_output_negcon_file,
                float_format=float_format,
                compression=compression,
            )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        feature_select(
            profiles=normalize_output_file,
            features=feature_select_features,
            operation=feature_select_operations,
            output_file=feature_output_file,
            float_format=float_format,
            compression=compression,
        )
        # Optionally feature-select the negative-control-normalized profiles
        if feature_select_steps["negcon"]:
            feature_select(
                profiles=normalize_output_negcon_file,
                features=feature_select_features,
                operation=feature_select_operations,
                output_file=feature_output_negcon_file,
                float_format=float_format,
                compression=compression,
            )
Loading

0 comments on commit ef77653

Please sign in to comment.