1
1
"""
2
2
Perform the image-based profiling pipeline to process data
3
3
"""
4
- # copied from https://github.com/broadinstitute/lincs-cell-painting/blob/master/profiles/profile.py
4
+ # copied from
5
+ # https://github.com/broadinstitute/profiling-resistance-mechanisms/blob/master/0.generate-profiles/scripts/profile_util.py
5
6
6
7
import os
7
8
import pathlib
9
+ from profile_utils import process_pipeline
8
10
import pandas as pd
9
11
from pycytominer .aggregate import AggregateProfiles
10
12
from pycytominer import (
11
- aggregate ,
12
13
annotate ,
13
14
normalize ,
14
15
feature_select ,
15
- audit ,
16
16
cyto_utils ,
17
17
)
18
18
19
- from profile_utils import get_args
20
-
21
- # Load Command Line Arguments
22
- args = get_args ()
23
-
24
- sql_file = args .sql_file
25
- batch = args .batch
26
- plate_name = args .plate_name
27
- platemap_file = args .platemap_file
28
- barcode_platemap_file = args .barcode_platemap_file
29
- moa_file = args .moa_file
30
- cell_count_dir = args .cell_count_dir
31
- output_dir = args .output_dir
32
-
33
- # Initialize profile processing
34
- os .makedirs (output_dir , exist_ok = True )
35
- os .makedirs (cell_count_dir , exist_ok = True )
36
- cell_id = "A549"
37
- aggregate_method = "median"
38
- norm_method = "mad_robustize"
39
- compression = "gzip"
40
- float_format = "%.5g"
41
- strata = ["Image_Metadata_Plate" , "Image_Metadata_Well" ]
42
- feature_select_ops = [
43
- "drop_na_columns" ,
44
- "variance_threshold" ,
45
- "correlation_threshold" ,
46
- "blacklist" ,
47
- ]
48
-
49
- # Define external metadata to add to annotation
50
- moa_df = pd .read_csv (moa_file , sep = "\t " )
51
- barcode_platemap_df = pd .read_csv (barcode_platemap_file ).query (
52
- "Assay_Plate_Barcode == @plate_name"
53
- )
54
-
55
- # # Aggregate profiles
56
- out_file = pathlib .PurePath (output_dir , f"{ plate_name } .csv.gz" )
57
- # ap = AggregateProfiles(sql_file=sql_file, strata=strata, operation=aggregate_method)
58
- # ap.aggregate_profiles(
59
- # output_file=out_file, float_format=float_format, compression="gzip"
60
- # )
61
-
62
- # # Count cells
63
- # count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.csv")
64
- # cell_count_df = ap.count_cells()
65
- # cell_count_df.to_csv(count_file, sep=",", index=False)
66
-
67
- # del ap
68
-
69
- # Annotate profiles - Level 3 Data
70
- anno_file = pathlib .PurePath (output_dir , f"{ plate_name } _augmented.csv.gz" )
71
- anno_df = annotate (
72
- profiles = out_file ,
73
- platemap = platemap_file ,
74
- join_on = ["Metadata_well_position" , "Metadata_Well" ],
75
- cell_id = cell_id ,
76
- format_broad_cmap = True ,
77
- perturbation_mode = "chemical" ,
78
- external_metadata = moa_df ,
79
- external_join_left = ["Metadata_broad_sample" ],
80
- external_join_right = ["Metadata_broad_sample" ],
81
- )
82
-
83
- # Rename columns
84
- anno_df = anno_df .rename (
85
- {"Image_Metadata_Plate" : "Metadata_Plate" , "Image_Metadata_Well" : "Metadata_Well" },
86
- axis = "columns" ,
87
- )
88
-
89
- # Add barcode platemap info
90
- anno_df = anno_df .assign (
91
- Metadata_Assay_Plate_Barcode = barcode_platemap_df .Assay_Plate_Barcode .values [0 ],
92
- Metadata_Plate_Map_Name = barcode_platemap_df .Plate_Map_Name .values [0 ]
93
- )
94
-
95
- # Reoroder columns
96
- metadata_cols = cyto_utils .infer_cp_features (anno_df , metadata = True )
97
- cp_cols = cyto_utils .infer_cp_features (anno_df )
98
- reindex_cols = metadata_cols + cp_cols
99
- anno_df = anno_df .reindex (reindex_cols , axis = "columns" )
100
-
101
- # Output annotated file
102
- cyto_utils .output (
103
- df = anno_df ,
104
- output_filename = anno_file ,
105
- float_format = float_format ,
106
- compression = compression ,
107
- )
108
19
109
- # Normalize Profiles (DMSO Control) - Level 4A Data
110
- norm_dmso_file = pathlib .PurePath (output_dir , f"{ plate_name } _normalized_dmso.csv.gz" )
111
- normalize (
112
- profiles = anno_df ,
113
- samples = "Metadata_broad_sample == 'DMSO'" ,
114
- method = norm_method ,
115
- output_file = norm_dmso_file ,
116
- float_format = float_format ,
117
- compression = compression ,
118
- )
119
-
120
- # Normalize Profiles (Whole Plate) - Level 4A Data
121
- norm_file = pathlib .PurePath (output_dir , f"{ plate_name } _normalized.csv.gz" )
122
- normalize (
123
- profiles = anno_df ,
124
- samples = "all" ,
125
- method = norm_method ,
126
- output_file = norm_file ,
127
- float_format = float_format ,
128
- compression = compression ,
129
- )
130
-
131
- # Feature Selection (DMSO Control) - Level 4B Data
132
- feat_dmso_file = pathlib .PurePath (
133
- output_dir , f"{ plate_name } _normalized_feature_select_dmso.csv.gz"
134
- )
135
- feature_select (
136
- profiles = norm_dmso_file ,
137
- features = "infer" ,
138
- operation = feature_select_ops ,
139
- output_file = feat_dmso_file ,
140
- float_format = float_format ,
141
- compression = compression ,
142
- )
143
-
144
- # Feature Selection (Whole Plate) - Level 4B Data
145
- feat_file = pathlib .PurePath (
146
- output_dir , f"{ plate_name } _normalized_feature_select.csv.gz"
147
- )
148
- feature_select (
149
- profiles = norm_file ,
150
- features = "infer" ,
151
- operation = feature_select_ops ,
152
- output_file = feat_file ,
153
- float_format = float_format ,
154
- compression = compression ,
155
- )
20
def process_profile(batch, plate, cell, pipeline):
    """Run the image-based profiling pipeline for a single plate.

    Executes up to four configurable steps -- aggregate, annotate, normalize,
    and feature_select -- writing each step's output as a compressed csv under
    ``<pipeline["output_dir"]>/<batch>/<plate>/``.

    Parameters
    ----------
    batch : str
        Batch name; used to locate the sqlite backend and platemap metadata.
    plate : str
        Plate barcode; used in output file names and platemap lookup.
    cell : str
        Cell line identifier passed through to ``pycytominer`` annotate.
    pipeline : dict
        Pipeline configuration. Expected keys: "output_dir",
        "platemap_well_column", "options", and the per-step dicts
        "aggregate", "annotate", "normalize", "feature_select", each with a
        boolean "perform" flag plus step-specific settings.
        (Assumed schema inferred from usage below -- confirm against the
        yaml config that feeds this function.)
    """
    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)

    # Set output file information.
    # FIX: the original bound both `aggregate_out_file` and
    # `aggregate_output_file` to the identical path; one name suffices.
    aggregate_output_file = pathlib.PurePath(output_dir, f"{plate}.csv.gz")
    annotate_output_file = pathlib.PurePath(output_dir, f"{plate}_augmented.csv.gz")
    normalize_output_file = pathlib.PurePath(output_dir, f"{plate}_normalized.csv.gz")
    normalize_output_negcon_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_negcon.csv.gz"
    )
    feature_output_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_feature_select.csv.gz"
    )
    feature_output_negcon_file = pathlib.PurePath(
        output_dir, f"{plate}_normalized_feature_select_negcon.csv.gz"
    )

    # Load pipeline options shared by every step
    compression = process_pipeline(pipeline["options"], option="compression")
    float_format = process_pipeline(pipeline["options"], option="float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Aggregate single-cell profiles into per-well profiles (Level 3 data)
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]

        # Backend sqlite file produced by CellProfiler/cytominer-database
        sql_file = f'sqlite:////{os.path.abspath(os.path.join("../../backend", batch, plate, f"{plate}.sqlite"))}'

        strata = [aggregate_plate_column, aggregate_well_column]

        # Optionally aggregate at site resolution rather than well resolution
        if "site_column" in aggregate_steps:
            strata += [aggregate_steps["site_column"]]

        # FIX: the original repeated `if aggregate_steps["perform"]:` here,
        # nested inside the block already guarded by the same condition;
        # the redundant check is removed.
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        # NOTE(review): float_format is not applied at this step, matching
        # the original behavior -- confirm whether Level 3 output should
        # also be rounded.
        ap.aggregate_profiles(
            output_file=aggregate_output_file, compression=compression
        )

    # Annotate profiles with platemap (and optional external) metadata
    annotate_steps = pipeline["annotate"]
    # FIX: the original also read annotate_steps["well_column"] BEFORE this
    # perform-check, which could KeyError even when the step is disabled;
    # the read now happens only when the step runs.
    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]

        # Load and set up platemap info
        metadata_dir = pathlib.PurePath(".", "metadata", "platemaps", batch)
        barcode_plate_map_file = pathlib.PurePath(metadata_dir, "barcode_platemap.csv")
        barcode_plate_map_df = pd.read_csv(
            barcode_plate_map_file, dtype={"Assay_Plate_Barcode": str}
        )
        plate_map_name = barcode_plate_map_df.query(
            "Assay_Plate_Barcode == @plate"
        ).Plate_Map_Name.values[0]
        plate_map_file = pathlib.PurePath(
            metadata_dir, "platemap", f"{plate_map_name}.txt"
        )
        plate_map_df = pd.read_csv(plate_map_file, sep="\t")
        # Ensure every platemap column carries the Metadata_ prefix
        plate_map_df.columns = [
            f"Metadata_{x}" if not x.startswith("Metadata_") else x
            for x in plate_map_df.columns
        ]
        platemap_well_column = pipeline["platemap_well_column"]

        # FIX: build the annotate() call once -- the two original branches
        # duplicated four keyword arguments and differed only in the
        # external-metadata kwargs.
        annotate_kwargs = {
            "profiles": aggregate_output_file,
            "platemap": plate_map_df,
            "join_on": [platemap_well_column, annotate_well_column],
            "cell_id": cell,
        }
        if annotate_steps["external"]:
            external_df = pd.read_csv(
                pathlib.PurePath(".", "metadata", "moa", annotate_steps["external"]),
                sep="\t",
            )
            annotate_kwargs.update(
                external_metadata=external_df,
                external_join_left=["Metadata_broad_sample"],
                external_join_right=["Metadata_broad_sample"],
            )
        anno_df = annotate(**annotate_kwargs)

        # Standardize metadata column names and record plate provenance
        anno_df = anno_df.rename(
            {
                "Image_Metadata_Plate": "Metadata_Plate",
                "Image_Metadata_Well": "Metadata_Well",
            },
            axis="columns",
        ).assign(
            Metadata_Assay_Plate_Barcode=plate,
            Metadata_Plate_Map_Name=barcode_plate_map_df.loc[
                barcode_plate_map_df.Assay_Plate_Barcode == plate, "Plate_Map_Name"
            ].values[0],
        )

        # Reorder columns: metadata first, then CellProfiler features
        metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True)
        cp_cols = cyto_utils.infer_cp_features(anno_df)
        anno_df = anno_df.reindex(metadata_cols + cp_cols, axis="columns")

        # Output annotated file
        cyto_utils.output(
            df=anno_df,
            output_filename=annotate_output_file,
            float_format=float_format,
            compression=compression,
        )

    # Normalize profiles (Level 4a data)
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        normalization_features = normalize_steps["features"]
        normalization_method = normalize_steps["method"]
        normalize(
            profiles=annotate_output_file,
            features=normalization_features,
            samples=samples,
            method=normalization_method,
            output_file=normalize_output_file,
            float_format=float_format,
            compression=compression,
        )
        # Additionally normalize against negative controls only, if requested
        if normalize_steps["negcon"]:
            normalize(
                profiles=annotate_output_file,
                features=normalization_features,
                samples="Metadata_control_type == 'negcon'",
                method=normalization_method,
                output_file=normalize_output_negcon_file,
                float_format=float_format,
                compression=compression,
            )

    # Apply feature selection (Level 4b data)
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        feature_select(
            profiles=normalize_output_file,
            features=feature_select_features,
            operation=feature_select_operations,
            output_file=feature_output_file,
            float_format=float_format,
            compression=compression,
        )
        # Feature select the negative-control-normalized profiles too
        if feature_select_steps["negcon"]:
            feature_select(
                profiles=normalize_output_negcon_file,
                features=feature_select_features,
                operation=feature_select_operations,
                output_file=feature_output_negcon_file,
                float_format=float_format,
                compression=compression,
            )
0 commit comments