Skip to content

Commit f8ce3b4

Browse files
authored
Merge pull request #160 from cytomining/jump
Add collate.py
2 parents 9bb63c2 + 1e32392 commit f8ce3b4

File tree

31 files changed

+1993
-1
lines changed

31 files changed

+1993
-1
lines changed

.github/workflows/codecov.yml

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ jobs:
2020
pip install pytest
2121
pip install pytest-cov
2222
pip install -r requirements.txt
23+
pip install .[collate]
2324
pytest --cov=./ --cov-report=xml
2425
- name: Upload coverage to Codecov
2526
uses: codecov/codecov-action@v1

.github/workflows/python-app.yml

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ jobs:
3131
python -m pip install --upgrade pip
3232
pip install pytest
3333
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
34+
pip install .[collate]
3435
- name: Test with pytest
3536
run: |
3637
pytest

.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ build
66
.coverage*
77
.idea/*
88
/dist
9+
*.sqlite
10+
pycytominer/tests/test_data/collate/backend/**/*.csv
11+
!pycytominer/tests/test_data/collate/backend/**/*master.csv
12+

README.md

+16
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,22 @@ Since the project is actively being developed, with new features added regularly
4545
# Example:
4646
pip install git+git://github.com/cytomining/pycytominer@2aa8638d7e505ab510f1d5282098dd59bb2cb470
4747
```
48+
### CSV collation
49+
50+
If you run your images on a cluster and do not have a MySQL or similar large database set up, you will likely end up with many separate folders from the different cluster runs (often one per well or one per site), each one containing an `Image.csv`, `Nuclei.csv`, etc.
51+
In order to look at full plates, therefore, we first need to collate all of these CSVs into a single file (currently SQLite) per plate.
52+
We currently do this with a library called [cytominer-database](https://github.com/cytomining/cytominer-database).
53+
54+
If you want to perform this data collation inside pycytominer using the `cyto_utils` function `collate` (and/or you want to be able to run the tests and have them all pass!), you will need `cytominer-database==0.3.4`; this will change your installation commands slightly:
55+
56+
```bash
57+
# Example for general case commit:
58+
pip install "pycytominer[collate] @ git+git://github.com/cytomining/pycytominer"
59+
# Example for specific commit:
60+
pip install "pycytominer[collate] @ git+git://github.com/cytomining/pycytominer@2aa8638d7e505ab510f1d5282098dd59bb2cb470"
61+
```
62+
63+
If using `pycytominer` in a conda environment, in order to run `collate.py`, you will also want to make sure to add `cytominer-database=0.3.4` to your list of dependencies.
4864

4965
## Usage
5066

docs/pycytominer.cyto_utils.rst

+8
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ pycytominer.cyto\_utils.cells module
1212
:undoc-members:
1313
:show-inheritance:
1414

15+
pycytominer.cyto\_utils.collate module
16+
--------------------------------------
17+
18+
.. automodule:: pycytominer.cyto_utils.collate
19+
:members:
20+
:undoc-members:
21+
:show-inheritance:
22+
1523
pycytominer.cyto\_utils.features module
1624
---------------------------------------
1725

docs/pycytominer.tests.test_cyto_utils.rst

+8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ pycytominer.tests.test\_cyto\_utils.test\_cells module
2020
:undoc-members:
2121
:show-inheritance:
2222

23+
pycytominer.tests.test\_cyto\_utils.test\_collate module
24+
--------------------------------------------------------
25+
26+
.. automodule:: pycytominer.tests.test_cyto_utils.test_collate
27+
:members:
28+
:undoc-members:
29+
:show-inheritance:
30+
2331
pycytominer.tests.test\_cyto\_utils.test\_feature\_blocklist module
2432
-------------------------------------------------------------------
2533

pycytominer/cyto_utils/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,4 @@
3434
aggregate_fields_count,
3535
aggregate_image_features,
3636
)
37+
from .collate import collate

pycytominer/cyto_utils/collate.py

+220
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
import os
2+
import pathlib
3+
import subprocess
4+
import sys
5+
6+
7+
def run_check_errors(cmd):
    """Run a system command, and exit if an error occurred, otherwise continue

    Parameters
    ----------
    cmd : str or list
        The command to run. A string is tokenized with shell-like quoting rules,
        so a quoted argument such as ``--exclude "*"`` reaches the program
        without the literal quote characters. A list is passed through as-is.

    Returns
    -------
    None

    Raises
    ------
    SystemExit
        If the command writes anything to stderr or finishes with a non-zero
        exit status.
    """
    # Local import so this function does not depend on the module's import block.
    import shlex

    if isinstance(cmd, str):
        # str.split() would keep quote characters as part of the arguments;
        # shlex.split() strips them the way a shell would.
        cmd = shlex.split(cmd)
    output = subprocess.run(cmd, capture_output=True, text=True)
    # Any stderr output is treated as a failure (original behavior); a non-zero
    # exit status with empty stderr is a failure as well.
    if output.stderr != "" or output.returncode != 0:
        print_cmd = " ".join(map(str, cmd))
        error = (
            output.stderr
            if output.stderr != ""
            else f"exit status {output.returncode}"
        )
        sys.exit(
            f"The error {error} was generated when running {print_cmd}. Exiting."
        )
    return
18+
19+
20+
def collate(
    batch,
    config,
    plate,
    base_directory="../..",
    column=None,
    munge=False,
    csv_dir="analysis",
    aws_remote=None,
    aggregate_only=False,
    tmp_dir="/tmp",
    overwrite=False,
    add_image_features=True,
    image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"],
    printtoscreen=True,
):
    """Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database

    Unless ``aggregate_only`` is set, the CSVs are ingested into an SQLite file
    in ``tmp_dir``, the file is indexed, and it is then moved into the backend
    directory. In either mode the backend SQLite file is afterwards aggregated
    (mean) into ``<plate>.csv`` next to it.

    Parameters
    ----------
    batch : str
        Batch name to process
    config : str
        Config file to pass to cytominer-database
    plate : str
        Plate name to process
    base_directory : str, default "../.."
        Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the "workspace" directory
    column : str, optional, default None
        An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists
    munge : bool, default False
        Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table
    csv_dir : str, default 'analysis'
        The directory under ``<base_directory>/analysis/<batch>/<plate>`` where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"
    aws_remote : str, optional, default None
        A remote AWS prefix; if set, CSV files will be synced down from it at the beginning and SQLite/CSV files will be synced up to it at the end of the run
    aggregate_only : bool, default False
        Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps
    tmp_dir : str, default '/tmp'
        The temporary directory to be used by cytominer-database for output
    overwrite : bool, optional, default False
        Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists
    add_image_features : bool, optional, default True
        Whether or not to add the image features to the profiles
    image_feature_categories : list, optional, default ['Granularity','Texture','ImageQuality','Threshold']
        The list of image feature groups to be used by add_image_features during aggregation; ignored (replaced by None) when add_image_features is False
    printtoscreen : bool, optional, default True
        Whether or not to print output to the terminal

    Raises
    ------
    SystemExit
        If an SQLite file already exists in the temporary directory and
        ``overwrite`` is False, if the backend SQLite file is missing when
        aggregation starts, or if any external command reports an error.
    """
    import shutil

    from pycytominer.cyto_utils.cells import SingleCells

    # Set up directories and files
    input_dir = pathlib.Path(f"{base_directory}/analysis/{batch}/{plate}/{csv_dir}")
    backend_dir = pathlib.Path(f"{base_directory}/backend/{batch}/{plate}")
    cache_backend_dir = pathlib.Path(f"{tmp_dir}/backend/{batch}/{plate}")

    aggregated_file = pathlib.Path(f"{backend_dir}/{plate}.csv")
    backend_file = pathlib.Path(f"{backend_dir}/{plate}.sqlite")
    cache_backend_file = pathlib.Path(f"{cache_backend_dir}/{plate}.sqlite")

    if aws_remote:
        # Remote locations mirror the local analysis/backend layout.
        remote_input_dir = f"{aws_remote}/analysis/{batch}/{plate}/{csv_dir}"
        remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"
        remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

    if not aggregate_only:
        if os.path.exists(cache_backend_file):
            if not overwrite:
                sys.exit(
                    f"An SQLite file for {plate} already exists at {cache_backend_file} and overwrite is set to False. Terminating."
                )
            os.remove(cache_backend_file)

        for eachdir in (input_dir, backend_dir, cache_backend_dir):
            os.makedirs(eachdir, exist_ok=True)

        if aws_remote:
            # Built as an argv list so the glob patterns reach aws verbatim
            # (no shell is involved, so no quoting is needed or wanted).
            sync_cmd = [
                "aws",
                "s3",
                "sync",
                "--exclude",
                "*",
                "--include",
                "*/Cells.csv",
                "--include",
                "*/Nuclei.csv",
                "--include",
                "*/Cytoplasm.csv",
                "--include",
                "*/Image.csv",
                remote_input_dir,
                str(input_dir),
            ]
            if printtoscreen:
                print(f"Downloading CSVs from {remote_input_dir} to {input_dir}")
            run_check_errors(sync_cmd)

        ingest_cmd = [
            "cytominer-database",
            "ingest",
            str(input_dir),
            f"sqlite:///{cache_backend_file}",
            "-c",
            config,
        ]
        if not munge:
            # munge is by default True in cytominer-database
            ingest_cmd.append("--no-munge")
        if printtoscreen:
            print(f"Ingesting {input_dir}")
        run_check_errors(ingest_cmd)

        if column:
            if printtoscreen:
                print(f"Adding a Metadata_Plate column based on column {column}")
            alter_cmd = [
                "sqlite3",
                str(cache_backend_file),
                "ALTER TABLE Image ADD COLUMN Metadata_Plate TEXT;",
            ]
            run_check_errors(alter_cmd)
            # NOTE(review): `column` is interpolated into SQL as an identifier;
            # it must be a trusted column name, not arbitrary user input.
            update_cmd = [
                "sqlite3",
                str(cache_backend_file),
                f"UPDATE image SET Metadata_Plate ={column};",
            ]
            run_check_errors(update_cmd)

        if printtoscreen:
            print(f"Indexing database {cache_backend_file}")
        index_cmd_img = [
            "sqlite3",
            str(cache_backend_file),
            "CREATE INDEX IF NOT EXISTS table_image_idx ON Image(TableNumber, ImageNumber);",
        ]
        run_check_errors(index_cmd_img)
        for eachcompartment in ["Cells", "Cytoplasm", "Nuclei"]:
            index_cmd_compartment = [
                "sqlite3",
                str(cache_backend_file),
                f"CREATE INDEX IF NOT EXISTS table_image_object_{eachcompartment.lower()}_idx ON {eachcompartment}(TableNumber, ImageNumber, ObjectNumber);",
            ]
            run_check_errors(index_cmd_compartment)
        index_cmd_metadata = [
            "sqlite3",
            str(cache_backend_file),
            "CREATE INDEX IF NOT EXISTS plate_well_image_idx ON Image(Metadata_Plate, Metadata_Well);",
        ]
        run_check_errors(index_cmd_metadata)

        if aws_remote:
            if printtoscreen:
                print(f"Uploading {cache_backend_file} to {remote_backend_file}")
            cp_cmd = ["aws", "s3", "cp", str(cache_backend_file), remote_backend_file]
            run_check_errors(cp_cmd)

            # Only the downloaded CSVs are removed here; the cached SQLite file
            # is still needed for the move below.
            if printtoscreen:
                print(f"Removing analysis files from {input_dir}")
            shutil.rmtree(input_dir)

        if printtoscreen:
            print(f"Renaming {cache_backend_file} to {backend_file}")
        # shutil.move falls back to copy+delete when tmp_dir and the backend
        # directory live on different filesystems, where os.rename would fail.
        shutil.move(str(cache_backend_file), str(backend_file))

    if printtoscreen:
        print(f"Aggregating sqlite:///{backend_file}")

    if aggregate_only and aws_remote:
        cp_cmd = ["aws", "s3", "cp", remote_backend_file, str(backend_file)]
        if printtoscreen:
            print(
                f"Downloading SQLite files from {remote_backend_file} to {backend_file}"
            )
        run_check_errors(cp_cmd)

    if not os.path.exists(backend_file):
        sys.exit(f"{backend_file} does not exist. Exiting.")

    if not add_image_features:
        # defensive but not sure what will happen if we give a list but set to False
        image_feature_categories = None

    database = SingleCells(
        f"sqlite:///{backend_file}",
        aggregation_operation="mean",
        add_image_features=add_image_features,
        image_feature_categories=image_feature_categories,
    )
    database.aggregate_profiles(output_file=aggregated_file)

    if aws_remote:
        if printtoscreen:
            print(f"Uploading {aggregated_file} to {remote_aggregated_file}")
        csv_cp_cmd = ["aws", "s3", "cp", str(aggregated_file), remote_aggregated_file]
        run_check_errors(csv_cp_cmd)

        if printtoscreen:
            print(f"Removing backend files from {backend_dir}")
        shutil.rmtree(backend_dir)

0 commit comments

Comments
 (0)