import os
import pathlib
import shlex
import subprocess
import sys


def run_check_errors(cmd):
    """Run a system command, and exit if an error occurred, otherwise continue"""
    if isinstance(cmd, str):
        # shlex keeps quoted arguments (e.g. the quoted S3 patterns used below) as
        # single tokens with the quotes removed, unlike a plain str.split()
        cmd = shlex.split(cmd)
    output = subprocess.run(cmd, capture_output=True, text=True)
    if output.stderr != "":
        print_cmd = " ".join(map(str, cmd))
        sys.exit(
            f"The error {output.stderr} was generated when running {print_cmd}. Exiting."
        )
    return


def collate(
    batch,
    config,
    plate,
    base_directory="../..",
    column=None,
    munge=False,
    csv_dir="analysis",
    aws_remote=None,
    aggregate_only=False,
    tmp_dir="/tmp",
    overwrite=False,
    add_image_features=True,
    image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"],
    printtoscreen=True,
):
    """Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database

    Parameters
    ----------
    batch : str
        Batch name to process
    config : str
        Config file to pass to cytominer-database
    plate : str
        Plate name to process
    base_directory : str, default "../.."
        Base directory for subdirectories containing CSVs, backends, etc.; in our preferred structure, this is the "workspace" directory
    column : str, optional, default None
        An existing column to be explicitly copied to a new column called Metadata_Plate, if no Metadata_Plate column already explicitly exists
    munge : bool, default False
        Whether munge should be passed to cytominer-database; if True, cytominer-database will expect a single all-object CSV and will split each object into its own table
    csv_dir : str, default "analysis"
        The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"
    aws_remote : str, optional, default None
        A remote AWS prefix; if set, CSV files will be synced down from it at the beginning of the run and SQLite files will be synced up to it at the end of the run
    aggregate_only : bool, default False
        Whether to perform only the aggregation of existing SQLite files and bypass the previous collation steps
    tmp_dir : str, default "/tmp"
        The temporary directory to be used by cytominer-database for output
    overwrite : bool, optional, default False
        Whether or not to overwrite an SQLite file that already exists in the temporary directory
    add_image_features : bool, optional, default True
        Whether or not to add the image features to the profiles
    image_feature_categories : list, optional, default ["Granularity", "Texture", "ImageQuality", "Threshold"]
        The list of image feature groups to be used by add_image_features during aggregation
    printtoscreen : bool, optional, default True
        Whether or not to print output to the terminal
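
    Examples
    --------
    A minimal sketch of a typical call; the batch, plate, config path, and
    workspace path below are hypothetical placeholders:

    >>> collate(
    ...     batch="2021_06_01_Batch1",
    ...     config="ingest_config.ini",
    ...     plate="Plate1",
    ...     base_directory="/home/user/workspace",
    ... )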
    """

    from pycytominer.cyto_utils.cells import SingleCells

    # Set up directories (these need to be abspaths to keep from confusing makedirs later)
    input_dir = pathlib.Path(f"{base_directory}/analysis/{batch}/{plate}/{csv_dir}")
    backend_dir = pathlib.Path(f"{base_directory}/backend/{batch}/{plate}")
    cache_backend_dir = pathlib.Path(f"{tmp_dir}/backend/{batch}/{plate}")

    aggregated_file = pathlib.Path(f"{backend_dir}/{plate}.csv")
    backend_file = pathlib.Path(f"{backend_dir}/{plate}.sqlite")
    cache_backend_file = pathlib.Path(f"{cache_backend_dir}/{plate}.sqlite")

    if not aggregate_only:
        if os.path.exists(cache_backend_file):
            if not overwrite:
                sys.exit(
                    f"An SQLite file for {plate} already exists at {cache_backend_file} and overwrite is set to False. Terminating."
                )
            else:
                os.remove(cache_backend_file)

        for eachdir in [input_dir, backend_dir, cache_backend_dir]:
            if not os.path.exists(eachdir):
                os.makedirs(eachdir, exist_ok=True)

        if aws_remote:

            remote_input_dir = f"{aws_remote}/analysis/{batch}/{plate}/{csv_dir}"

            remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

            remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

            sync_cmd = f'aws s3 sync --exclude "*" --include "*/Cells.csv" --include "*/Nuclei.csv" --include "*/Cytoplasm.csv" --include "*/Image.csv" {remote_input_dir} {input_dir}'
            if printtoscreen:
                print(f"Downloading CSVs from {remote_input_dir} to {input_dir}")
            run_check_errors(sync_cmd)

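        # Ingest the per-compartment CSVs into a single SQLite database via cytominer-database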
        ingest_cmd = [
            "cytominer-database",
            "ingest",
            input_dir,
            f"sqlite:///{cache_backend_file}",
            "-c",
            config,
        ]
        if not munge:
            # munge is by default True in cytominer-database
            ingest_cmd.append("--no-munge")
        if printtoscreen:
            print(f"Ingesting {input_dir}")
        run_check_errors(ingest_cmd)

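        # Optionally create a Metadata_Plate column by copying the values of an existing column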
        if column:
            if printtoscreen:
                print(f"Adding a Metadata_Plate column based on column {column}")
            alter_cmd = [
                "sqlite3",
                cache_backend_file,
                "ALTER TABLE Image ADD COLUMN Metadata_Plate TEXT;",
            ]
            run_check_errors(alter_cmd)
            update_cmd = [
                "sqlite3",
                cache_backend_file,
                f"UPDATE Image SET Metadata_Plate = {column};",
            ]
            run_check_errors(update_cmd)

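        # Index the image and per-compartment tables to speed up the merges during aggregation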
        if printtoscreen:
            print(f"Indexing database {cache_backend_file}")
        index_cmd_img = [
            "sqlite3",
            cache_backend_file,
            "CREATE INDEX IF NOT EXISTS table_image_idx ON Image(TableNumber, ImageNumber);",
        ]
        run_check_errors(index_cmd_img)
        for eachcompartment in ["Cells", "Cytoplasm", "Nuclei"]:
            index_cmd_compartment = [
                "sqlite3",
                cache_backend_file,
                f"CREATE INDEX IF NOT EXISTS table_image_object_{eachcompartment.lower()}_idx ON {eachcompartment}(TableNumber, ImageNumber, ObjectNumber);",
            ]
            run_check_errors(index_cmd_compartment)
        index_cmd_metadata = [
            "sqlite3",
            cache_backend_file,
            "CREATE INDEX IF NOT EXISTS plate_well_image_idx ON Image(Metadata_Plate, Metadata_Well);",
        ]
        run_check_errors(index_cmd_metadata)

        if aws_remote:

            if printtoscreen:
                print(f"Uploading {cache_backend_file} to {remote_backend_file}")
            cp_cmd = ["aws", "s3", "cp", cache_backend_file, remote_backend_file]
            run_check_errors(cp_cmd)

            if printtoscreen:
                print(
                    f"Removing analysis files from {input_dir} and {cache_backend_dir}"
                )
            import shutil

            shutil.rmtree(input_dir)

        if printtoscreen:
            print(f"Renaming {cache_backend_file} to {backend_file}")
        os.rename(cache_backend_file, backend_file)

    if printtoscreen:
        print(f"Aggregating sqlite:///{backend_file}")

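    # In aggregate-only mode, pull the previously created SQLite backend down from the remote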
    if aggregate_only and aws_remote:
        remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

        remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

        cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file]
        if printtoscreen:
            print(
                f"Downloading SQLite files from {remote_backend_file} to {backend_file}"
            )
        run_check_errors(cp_cmd)

    if not os.path.exists(backend_file):
        sys.exit(f"{backend_file} does not exist. Exiting.")

    if not add_image_features:
        # Defensive: make sure no image feature categories are passed downstream
        # when image features are not requested
        image_feature_categories = None

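    # Aggregate single-cell profiles into per-well profiles (mean) with pycytominer's SingleCells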
    database = SingleCells(
        f"sqlite:///{backend_file}",
        aggregation_operation="mean",
        add_image_features=add_image_features,
        image_feature_categories=image_feature_categories,
    )
    database.aggregate_profiles(output_file=aggregated_file)

    if aws_remote:
        if printtoscreen:
            print(f"Uploading {aggregated_file} to {remote_aggregated_file}")
        csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file]
        run_check_errors(csv_cp_cmd)

        if printtoscreen:
            print(f"Removing backend files from {backend_dir}")
        import shutil

        shutil.rmtree(backend_dir)