Commit ef77653

Merge pull request #14 from jump-cellpainting/refactor_recipe
Refactor recipe
2 parents: f712db5 + c294765

4 files changed (+239, -293 lines)

4 files changed

+239
-293
lines changed

environment.yml

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ dependencies:
   - conda-forge::python=3.7.1
   - conda-forge::pandas=0.24.2
   - conda-forge::pip=19.2.2
+  - conda-forge::pyyaml=5.3.1
   - pip
   - pip:
-    - git+https://github.com/cytomining/pycytominer@820a8369ab7d49be118d73e9074f998739eab325
+    - git+https://github.com/cytomining/pycytominer@c1aa34b641b4e07eb5cbd424166f31355abdbd4d
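The new conda-forge::pyyaml pin suggests the refactored recipe reads its pipeline configuration from YAML. A minimal sketch of that loading step, assuming a hypothetical config filename (the recipe's actual config file and loader are not part of this diff):

import yaml  # provided by the new conda-forge::pyyaml=5.3.1 dependency

# Hypothetical path; the real recipe defines where its pipeline config lives.
with open("pipeline_config.yaml") as stream:
    pipeline = yaml.safe_load(stream)  # plain dict handed to process_profile() below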

profiles/profile.py

Lines changed: 176 additions & 139 deletions
@@ -1,155 +1,192 @@
 """
 Perform the image-based profiling pipeline to process data
 """
-# copied from https://github.com/broadinstitute/lincs-cell-painting/blob/master/profiles/profile.py
+# copied from
+# https://github.com/broadinstitute/profiling-resistance-mechanisms/blob/master/0.generate-profiles/scripts/profile_util.py
 
 import os
 import pathlib
+from profile_utils import process_pipeline
 import pandas as pd
 from pycytominer.aggregate import AggregateProfiles
 from pycytominer import (
-    aggregate,
     annotate,
     normalize,
     feature_select,
-    audit,
     cyto_utils,
 )
 
-from profile_utils import get_args
-
-# Load Command Line Arguments
-args = get_args()
-
-sql_file = args.sql_file
-batch = args.batch
-plate_name = args.plate_name
-platemap_file = args.platemap_file
-barcode_platemap_file = args.barcode_platemap_file
-moa_file = args.moa_file
-cell_count_dir = args.cell_count_dir
-output_dir = args.output_dir
-
-# Initialize profile processing
-os.makedirs(output_dir, exist_ok=True)
-os.makedirs(cell_count_dir, exist_ok=True)
-cell_id = "A549"
-aggregate_method = "median"
-norm_method = "mad_robustize"
-compression = "gzip"
-float_format = "%.5g"
-strata = ["Image_Metadata_Plate", "Image_Metadata_Well"]
-feature_select_ops = [
-    "drop_na_columns",
-    "variance_threshold",
-    "correlation_threshold",
-    "blacklist",
-]
-
-# Define external metadata to add to annotation
-moa_df = pd.read_csv(moa_file, sep="\t")
-barcode_platemap_df = pd.read_csv(barcode_platemap_file).query(
-    "Assay_Plate_Barcode == @plate_name"
-)
-
-# # Aggregate profiles
-out_file = pathlib.PurePath(output_dir, f"{plate_name}.csv.gz")
-# ap = AggregateProfiles(sql_file=sql_file, strata=strata, operation=aggregate_method)
-# ap.aggregate_profiles(
-#     output_file=out_file, float_format=float_format, compression="gzip"
-# )
-
-# # Count cells
-# count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.csv")
-# cell_count_df = ap.count_cells()
-# cell_count_df.to_csv(count_file, sep=",", index=False)
-
-# del ap
-
-# Annotate profiles - Level 3 Data
-anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz")
-anno_df = annotate(
-    profiles=out_file,
-    platemap=platemap_file,
-    join_on=["Metadata_well_position", "Metadata_Well"],
-    cell_id=cell_id,
-    format_broad_cmap=True,
-    perturbation_mode="chemical",
-    external_metadata=moa_df,
-    external_join_left=["Metadata_broad_sample"],
-    external_join_right=["Metadata_broad_sample"],
-)
-
-# Rename columns
-anno_df = anno_df.rename(
-    {"Image_Metadata_Plate": "Metadata_Plate", "Image_Metadata_Well": "Metadata_Well"},
-    axis="columns",
-)
-
-# Add barcode platemap info
-anno_df = anno_df.assign(
-    Metadata_Assay_Plate_Barcode=barcode_platemap_df.Assay_Plate_Barcode.values[0],
-    Metadata_Plate_Map_Name=barcode_platemap_df.Plate_Map_Name.values[0]
-)
-
-# Reorder columns
-metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True)
-cp_cols = cyto_utils.infer_cp_features(anno_df)
-reindex_cols = metadata_cols + cp_cols
-anno_df = anno_df.reindex(reindex_cols, axis="columns")
-
-# Output annotated file
-cyto_utils.output(
-    df=anno_df,
-    output_filename=anno_file,
-    float_format=float_format,
-    compression=compression,
-)
 
-# Normalize Profiles (DMSO Control) - Level 4A Data
-norm_dmso_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized_dmso.csv.gz")
-normalize(
-    profiles=anno_df,
-    samples="Metadata_broad_sample == 'DMSO'",
-    method=norm_method,
-    output_file=norm_dmso_file,
-    float_format=float_format,
-    compression=compression,
-)
-
-# Normalize Profiles (Whole Plate) - Level 4A Data
-norm_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized.csv.gz")
-normalize(
-    profiles=anno_df,
-    samples="all",
-    method=norm_method,
-    output_file=norm_file,
-    float_format=float_format,
-    compression=compression,
-)
-
-# Feature Selection (DMSO Control) - Level 4B Data
-feat_dmso_file = pathlib.PurePath(
-    output_dir, f"{plate_name}_normalized_feature_select_dmso.csv.gz"
-)
-feature_select(
-    profiles=norm_dmso_file,
-    features="infer",
-    operation=feature_select_ops,
-    output_file=feat_dmso_file,
-    float_format=float_format,
-    compression=compression,
-)
-
-# Feature Selection (Whole Plate) - Level 4B Data
-feat_file = pathlib.PurePath(
-    output_dir, f"{plate_name}_normalized_feature_select.csv.gz"
-)
-feature_select(
-    profiles=norm_file,
-    features="infer",
-    operation=feature_select_ops,
-    output_file=feat_file,
-    float_format=float_format,
-    compression=compression,
-)
+def process_profile(batch, plate, cell, pipeline):
+    # Set output directory information
+    pipeline_output = pipeline["output_dir"]
+    output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
+
+    # Set output file information
+    aggregate_out_file = pathlib.PurePath(output_dir, f"{plate}.csv.gz")
+    aggregate_output_file = pathlib.PurePath(output_dir, f"{plate}.csv.gz")
+    annotate_output_file = pathlib.PurePath(output_dir, f"{plate}_augmented.csv.gz")
+    normalize_output_file = pathlib.PurePath(output_dir, f"{plate}_normalized.csv.gz")
+    normalize_output_negcon_file = pathlib.PurePath(
+        output_dir, f"{plate}_normalized_negcon.csv.gz"
+    )
+    feature_output_file = pathlib.PurePath(
+        output_dir, f"{plate}_normalized_feature_select.csv.gz"
+    )
+    feature_output_negcon_file = pathlib.PurePath(
+        output_dir, f"{plate}_normalized_feature_select_negcon.csv.gz"
+    )
+
+    # Load pipeline options
+    compression = process_pipeline(pipeline["options"], option="compression")
+    float_format = process_pipeline(pipeline["options"], option="float_format")
+    samples = process_pipeline(pipeline["options"], option="samples")
+
+    # Aggregate Profiles
+
+    aggregate_steps = pipeline["aggregate"]
+
+    if aggregate_steps["perform"]:
+        aggregate_features = aggregate_steps["features"]
+        aggregate_operation = aggregate_steps["method"]
+        aggregate_plate_column = aggregate_steps["plate_column"]
+        aggregate_well_column = aggregate_steps["well_column"]
+
+        sql_file = f'sqlite:////{os.path.abspath(os.path.join("../../backend", batch, plate, f"{plate}.sqlite"))}'
+
+        strata = [aggregate_plate_column, aggregate_well_column]
+
+        if "site_column" in aggregate_steps:
+            aggregate_site_column = aggregate_steps["site_column"]
+            strata += [aggregate_site_column]
+
+    if aggregate_steps["perform"]:
+        ap = AggregateProfiles(
+            sql_file,
+            strata=strata,
+            features=aggregate_features,
+            operation=aggregate_operation,
+        )
+
+        ap.aggregate_profiles(output_file=aggregate_out_file, compression=compression)
+
+    # Annotate Profiles
+    annotate_steps = pipeline["annotate"]
+    annotate_well_column = annotate_steps["well_column"]
+
+    if annotate_steps["perform"]:
+        annotate_well_column = annotate_steps["well_column"]
+
+        # Load and setup platemap info
+        metadata_dir = pathlib.PurePath(".", "metadata", "platemaps", batch)
+        barcode_plate_map_file = pathlib.PurePath(metadata_dir, "barcode_platemap.csv")
+        barcode_plate_map_df = pd.read_csv(
+            barcode_plate_map_file, dtype={"Assay_Plate_Barcode": str}
+        )
+        plate_map_name = barcode_plate_map_df.query(
+            "Assay_Plate_Barcode == @plate"
+        ).Plate_Map_Name.values[0]
+        plate_map_file = pathlib.PurePath(metadata_dir, "platemap", f"{plate_map_name}.txt")
+        plate_map_df = pd.read_csv(plate_map_file, sep="\t")
+        plate_map_df.columns = [
+            f"Metadata_{x}" if not x.startswith("Metadata_") else x
+            for x in plate_map_df.columns
+        ]
+        platemap_well_column = pipeline["platemap_well_column"]
+
+        if annotate_steps["external"]:
+            external_df = pd.read_csv(
+                pathlib.PurePath(".", "metadata", "moa", annotate_steps["external"]),
+                sep="\t",
+            )
+            anno_df = annotate(
+                profiles=aggregate_output_file,
+                platemap=plate_map_df,
+                join_on=[platemap_well_column, annotate_well_column],
+                cell_id=cell,
+                external_metadata=external_df,
+                external_join_left=["Metadata_broad_sample"],
+                external_join_right=["Metadata_broad_sample"],
+            )
+        else:
+            anno_df = annotate(
+                profiles=aggregate_output_file,
+                platemap=plate_map_df,
+                join_on=[platemap_well_column, annotate_well_column],
+                cell_id=cell,
+            )
+
+        anno_df = anno_df.rename(
+            {
+                "Image_Metadata_Plate": "Metadata_Plate",
+                "Image_Metadata_Well": "Metadata_Well",
+            },
+            axis="columns",
+        ).assign(
+            Metadata_Assay_Plate_Barcode=plate,
+            Metadata_Plate_Map_Name=barcode_plate_map_df.loc[
+                barcode_plate_map_df.Assay_Plate_Barcode == plate, "Plate_Map_Name"
+            ].values[0],
+        )
+
+        # Reorder columns
+        metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True)
+        cp_cols = cyto_utils.infer_cp_features(anno_df)
+        reindex_cols = metadata_cols + cp_cols
+        anno_df = anno_df.reindex(reindex_cols, axis="columns")
+
+        # Output annotated file
+        cyto_utils.output(
+            df=anno_df,
+            output_filename=annotate_output_file,
+            float_format=float_format,
+            compression=compression,
+        )
+
+    # Normalize Profiles
+    normalize_steps = pipeline["normalize"]
+    if normalize_steps["perform"]:
+        normalization_features = normalize_steps["features"]
+        normalization_method = normalize_steps["method"]
+        normalize(
+            profiles=annotate_output_file,
+            features=normalization_features,
+            samples=samples,
+            method=normalization_method,
+            output_file=normalize_output_file,
+            float_format=float_format,
+            compression=compression,
+        )
+        if normalize_steps["negcon"]:
+            normalize(
+                profiles=annotate_output_file,
+                features=normalization_features,
+                samples="Metadata_control_type == 'negcon'",
+                method=normalization_method,
+                output_file=normalize_output_negcon_file,
+                float_format=float_format,
+                compression=compression,
+            )
+
+    # Apply feature selection
+    feature_select_steps = pipeline["feature_select"]
+    if feature_select_steps["perform"]:
+        feature_select_operations = feature_select_steps["operations"]
+        feature_select_features = feature_select_steps["features"]
+        feature_select(
+            profiles=normalize_output_file,
+            features=feature_select_features,
+            operation=feature_select_operations,
+            output_file=feature_output_file,
+            float_format=float_format,
+            compression=compression,
+        )
+        if feature_select_steps["negcon"]:
+            feature_select(
+                profiles=normalize_output_negcon_file,
+                features=feature_select_features,
+                operation=feature_select_operations,
+                output_file=feature_output_negcon_file,
+                float_format=float_format,
+                compression=compression,
+            )
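For orientation, here is a sketch of the pipeline mapping that process_profile consumes. The keys mirror the lookups in the function above; the values, the placeholder batch/plate names, and the assumption that process_pipeline (imported from profile_utils, not shown in this diff) simply returns the named option are illustrative, not part of the commit.

# Illustrative configuration only: keys match process_profile's lookups, values are guesses.
pipeline = {
    "output_dir": "profiles",
    "platemap_well_column": "Metadata_well_position",
    "options": {"compression": "gzip", "float_format": "%.5g", "samples": "all"},
    "aggregate": {
        "perform": True,
        "features": "infer",
        "method": "median",
        "plate_column": "Image_Metadata_Plate",
        "well_column": "Image_Metadata_Well",
        # optional: a "site_column" entry adds that column to the aggregation strata
    },
    "annotate": {
        "perform": True,
        "well_column": "Metadata_Well",
        "external": False,  # or a TSV filename under ./metadata/moa/
    },
    "normalize": {
        "perform": True,
        "features": "infer",
        "method": "mad_robustize",
        "negcon": True,
    },
    "feature_select": {
        "perform": True,
        "features": "infer",
        "operations": [
            "drop_na_columns",
            "variance_threshold",
            "correlation_threshold",
            "blacklist",
        ],
        "negcon": True,
    },
}

# Running the pipeline also requires the backend SQLite file under ../../backend/<batch>/<plate>/
# and the metadata/platemaps/<batch>/ files referenced above.
# process_profile(batch="BATCH1", plate="PLATE1", cell="A549", pipeline=pipeline)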

0 commit comments
