Skip to content

Commit 1a5593e

Browse files
Enable Image_PathName_ columns to be used for building image file references (#61)
* add test cases
* gather image path data
* add data_image_paths to link image files
* update example
* [pre-commit.ci lite] apply automatic fixes

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
1 parent 7dcfc27 commit 1a5593e

File tree

6 files changed

+376
-47
lines changed

6 files changed

+376
-47
lines changed

Diff for: docs/src/examples/cytodataframe_at_a_glance.ipynb

+122-16
Large diffs are not rendered by default.

Diff for: docs/src/examples/cytodataframe_at_a_glance.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# extension: .py
66
# format_name: light
77
# format_version: '1.5'
8-
# jupytext_version: 1.16.4
8+
# jupytext_version: 1.16.6
99
# kernelspec:
1010
# display_name: Python 3 (ipykernel)
1111
# language: python
@@ -22,6 +22,10 @@
2222
# visual information which can be viewed directly in a Jupyter notebook.
2323

2424
# +
25+
import pathlib
26+
27+
import pandas as pd
28+
2529
from cytodataframe.frame import CytoDataFrame
2630

2731
# create paths for use with CytoDataFrames below
@@ -95,6 +99,35 @@
9599
]
96100
][:3]
97101

102+
# +
103+
# %%time
104+
# add active paths on the local system to show how CytoDataFrame
105+
# may be used without specifying a context directory for images.
106+
# Note: normally these paths are local to the system where the
107+
# profile data was generated, which often is not the same as the
108+
# system which will be used to analyze the data.
109+
parquet_path = f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet"
110+
nf1_dataset_with_modified_image_paths = pd.read_parquet(path=parquet_path)
111+
nf1_dataset_with_modified_image_paths.loc[
112+
:, ["Image_PathName_DAPI", "Image_PathName_GFP", "Image_PathName_RFP"]
113+
] = f"{pathlib.Path(parquet_path).parent}/Plate_2_images"
114+
115+
# view NF1 Cell Painting data with images and overlaid outlines from masks
116+
CytoDataFrame(
117+
# note: we can read directly from an existing Pandas DataFrame
118+
data=nf1_dataset_with_modified_image_paths,
119+
data_mask_context_dir=f"{nf1_cellpainting_path}/Plate_2_masks",
120+
)[
121+
[
122+
"Metadata_ImageNumber",
123+
"Metadata_Cells_Number_Object_Number",
124+
"Image_FileName_GFP",
125+
"Image_FileName_RFP",
126+
"Image_FileName_DAPI",
127+
]
128+
][:3]
129+
# -
130+
98131
# %%time
99132
# view nuclear speckles data with images and overlaid outlines from masks
100133
CytoDataFrame(

Diff for: media/coverage-badge.svg

+1-1
Loading

Diff for: src/cytodataframe/frame.py

+157-24
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ def __init__( # noqa: PLR0913
6767
self: CytoDataFrame_type,
6868
data: Union[CytoDataFrame_type, pd.DataFrame, str, pathlib.Path],
6969
data_context_dir: Optional[str] = None,
70+
data_image_paths: Optional[pd.DataFrame] = None,
7071
data_bounding_box: Optional[pd.DataFrame] = None,
7172
data_mask_context_dir: Optional[str] = None,
7273
data_outline_context_dir: Optional[str] = None,
@@ -82,6 +83,8 @@ def __init__( # noqa: PLR0913
8283
The data source, either a pandas DataFrame or a file path.
8384
data_context_dir (Optional[str]):
8485
Directory context for the image data within the DataFrame.
86+
data_image_paths (Optional[pd.DataFrame]):
87+
Image path data (the directories storing the image files); gathered from the data's Image_PathName_X columns when None.
8588
data_bounding_box (Optional[pd.DataFrame]):
8689
Bounding box data for the DataFrame images.
8790
data_mask_context_dir: Optional[str]:
@@ -108,6 +111,7 @@ def __init__( # noqa: PLR0913
108111
"data_context_dir": (
109112
data_context_dir if data_context_dir is not None else None
110113
),
114+
"data_image_paths": None,
111115
"data_bounding_box": None,
112116
"data_mask_context_dir": (
113117
data_mask_context_dir if data_mask_context_dir is not None else None
@@ -168,11 +172,17 @@ def __init__( # noqa: PLR0913
168172
else:
169173
super().__init__(data)
170174

171-
if data_bounding_box is None:
172-
self._custom_attrs["data_bounding_box"] = self.get_bounding_box_from_data()
175+
self._custom_attrs["data_bounding_box"] = (
176+
self.get_bounding_box_from_data()
177+
if data_bounding_box is None
178+
else data_bounding_box
179+
)
173180

174-
else:
175-
self._custom_attrs["data_bounding_box"] = data_bounding_box
181+
self._custom_attrs["data_image_paths"] = (
182+
self.get_image_paths_from_data(image_cols=self.find_image_columns())
183+
if data_image_paths is None
184+
else data_image_paths
185+
)
176186

177187
def __getitem__(self: CytoDataFrame_type, key: Union[int, str]) -> Any: # noqa: ANN401
178188
"""
@@ -196,6 +206,7 @@ def __getitem__(self: CytoDataFrame_type, key: Union[int, str]) -> Any: # noqa:
196206
return CytoDataFrame(
197207
super().__getitem__(key),
198208
data_context_dir=self._custom_attrs["data_context_dir"],
209+
data_image_paths=self._custom_attrs["data_image_paths"],
199210
data_bounding_box=self._custom_attrs["data_bounding_box"],
200211
data_mask_context_dir=self._custom_attrs["data_mask_context_dir"],
201212
data_outline_context_dir=self._custom_attrs["data_outline_context_dir"],
@@ -233,6 +244,7 @@ def _wrap_method(
233244
result = CytoDataFrame(
234245
result,
235246
data_context_dir=self._custom_attrs["data_context_dir"],
247+
data_image_paths=self._custom_attrs["data_image_paths"],
236248
data_bounding_box=self._custom_attrs["data_bounding_box"],
237249
data_mask_context_dir=self._custom_attrs["data_mask_context_dir"],
238250
data_outline_context_dir=self._custom_attrs["data_outline_context_dir"],
@@ -381,8 +393,25 @@ def is_notebook_or_lab() -> bool:
381393
except NameError:
382394
return False
383395

384-
def find_image_columns(self: CytoDataFrame_type) -> bool:
396+
def find_image_columns(self: CytoDataFrame_type) -> List[str]:
397+
"""
398+
Find columns containing image file names.
399+
400+
This method searches for columns in the DataFrame
401+
that contain image file names with extensions .tif
402+
or .tiff (case insensitive).
403+
404+
Returns:
405+
List[str]:
406+
A list of column names that contain
407+
image file names.
408+
409+
"""
410+
# build a pattern to match image file names
385411
pattern = r".*\.(tif|tiff)$"
412+
413+
# search for columns containing image file names
414+
# based on pattern above.
386415
return [
387416
column
388417
for column in self.columns
@@ -394,6 +423,64 @@ def find_image_columns(self: CytoDataFrame_type) -> bool:
394423
.any()
395424
]
396425

426+
def get_image_paths_from_data(
427+
self: CytoDataFrame_type, image_cols: List[str]
428+
) -> Dict[str, str]:
429+
"""
430+
Gather data containing image path names
431+
(the directory storing the images but not the file
432+
names). We do this by seeking the pattern:
433+
Image_FileName_X --> Image_PathName_X.
434+
435+
Args:
436+
image_cols: List[str]:
437+
A list of column names that contain
438+
image file names.
439+
440+
Returns:
441+
Optional[pd.DataFrame]:
442+
The Image_PathName_X columns found in the data,
443+
or None when no such columns exist.
444+
445+
"""
446+
447+
image_path_columns = [
448+
col.replace("FileName", "PathName")
449+
for col in image_cols
450+
if col.replace("FileName", "PathName") in self.columns
451+
]
452+
453+
return self.filter(items=image_path_columns) if image_path_columns else None
454+
455+
def find_image_path_columns(
456+
self: CytoDataFrame_type, image_cols: List[str], all_cols: List[str]
457+
) -> Dict[str, str]:
458+
"""
459+
Find columns containing image path names
460+
(the directory storing the images but not the file
461+
names). We do this by seeking the pattern:
462+
Image_FileName_X --> Image_PathName_X.
463+
464+
Args:
465+
image_cols: List[str]:
466+
A list of column names that contain
467+
image file names.
468+
all_cols: List[str]:
469+
A list of all column names.
470+
471+
Returns:
472+
Dict[str, str]:
473+
A mapping from each image file name column to the
474+
matching image path name column found in all_cols.
475+
476+
"""
477+
478+
return {
479+
col: col.replace("FileName", "PathName")
480+
for col in image_cols
481+
if col.replace("FileName", "PathName") in all_cols
482+
}
483+
397484
def search_for_mask_or_outline( # noqa: PLR0913, PLR0911
398485
self: CytoDataFrame_type,
399486
data_value: str,
@@ -471,6 +558,7 @@ def process_image_data_as_html_display(
471558
self: CytoDataFrame_type,
472559
data_value: Any, # noqa: ANN401
473560
bounding_box: Tuple[int, int, int, int],
561+
image_path: Optional[str] = None,
474562
) -> str:
475563
"""
476564
Process the image data based on the provided data value
@@ -489,38 +577,55 @@ def process_image_data_as_html_display(
489577
The HTML image display string, or the unmodified data
490578
value if the image cannot be processed.
491579
"""
580+
492581
candidate_path = None
493582
# Get the pattern map for segmentation file regex
494583
pattern_map = self._custom_attrs.get("segmentation_file_regex")
495584

496585
# Step 1: Find the candidate file if the data value is not already a file
497586
if not pathlib.Path(data_value).is_file():
587+
# determine if we have a file from the path (dir) + filename
588+
if (
589+
self._custom_attrs["data_context_dir"] is None
590+
and image_path is not None
591+
and (
592+
existing_image_from_path := pathlib.Path(
593+
f"{image_path}/{data_value}"
594+
)
595+
).is_file()
596+
):
597+
candidate_path = existing_image_from_path
598+
498599
# Search for the data value in the data context directory
499-
if candidate_paths := list(
500-
pathlib.Path(self._custom_attrs["data_context_dir"]).rglob(data_value)
600+
elif self._custom_attrs["data_context_dir"] is not None and (
601+
candidate_paths := list(
602+
pathlib.Path(self._custom_attrs["data_context_dir"]).rglob(
603+
data_value
604+
)
605+
)
501606
):
502607
# If a candidate file is found, use the first one
503608
candidate_path = candidate_paths[0]
504-
orig_image_array = skimage.io.imread(candidate_path)
505-
506-
# Adjust the image with image adjustment callable
507-
# or adaptive histogram equalization
508-
if self._custom_attrs["image_adjustment"] is not None:
509-
orig_image_array = self._custom_attrs["image_adjustment"](
510-
orig_image_array
511-
)
512-
else:
513-
orig_image_array = adjust_with_adaptive_histogram_equalization(
514-
orig_image_array
515-
)
516-
517-
# Normalize to 0-255 for image saving
518-
orig_image_array = img_as_ubyte(orig_image_array)
519609

520610
else:
521611
# If no candidate file is found, return the original data value
522612
return data_value
523613

614+
# read the image as an array
615+
orig_image_array = skimage.io.imread(candidate_path)
616+
617+
# Adjust the image with image adjustment callable
618+
# or adaptive histogram equalization
619+
if self._custom_attrs["image_adjustment"] is not None:
620+
orig_image_array = self._custom_attrs["image_adjustment"](orig_image_array)
621+
else:
622+
orig_image_array = adjust_with_adaptive_histogram_equalization(
623+
orig_image_array
624+
)
625+
626+
# Normalize to 0-255 for image saving
627+
orig_image_array = img_as_ubyte(orig_image_array)
628+
524629
prepared_image = None
525630
# Step 2: Search for a mask
526631
prepared_image = self.search_for_mask_or_outline(
@@ -632,8 +737,6 @@ def _repr_html_(
632737
max_cols = get_option("display.max_columns")
633738
show_dimensions = get_option("display.show_dimensions")
634739

635-
# determine if we have image_cols to display
636-
if image_cols := self.find_image_columns():
637740
# re-add bounding box cols if they are no longer available as in cases
638741
# of masking or accessing various pandas attributes
639742
bounding_box_externally_joined = False
@@ -647,6 +750,25 @@ def _repr_html_(
647750
else:
648751
data = self.copy()
649752

753+
# re-add image path (dirs for images) cols if they are no
754+
# longer available as in cases of masking or accessing
755+
# various pandas attributes
756+
image_paths_externally_joined = False
757+
758+
if self._custom_attrs["data_image_paths"] is not None and not all(
759+
col in self.columns.tolist()
760+
for col in self._custom_attrs["data_image_paths"].columns.tolist()
761+
):
762+
data = data.join(other=self._custom_attrs["data_image_paths"])
763+
image_paths_externally_joined = True
764+
765+
# determine if we have image_cols to display
766+
if image_cols := self.find_image_columns():
767+
# attempt to find the image path columns
768+
image_path_cols = self.find_image_path_columns(
769+
image_cols=image_cols, all_cols=data.columns
770+
)
771+
650772
# gather indices which will be displayed based on pandas configuration
651773
display_indices = self.get_displayed_rows()
652774

@@ -691,6 +813,12 @@ def _repr_html_(
691813
)
692814
],
693815
),
816+
# set the image path based on the image_path cols.
817+
image_path=(
818+
row[image_path_cols[image_col]]
819+
if image_path_cols is not None and image_path_cols != {}
820+
else None
821+
),
694822
),
695823
axis=1,
696824
)
@@ -700,6 +828,11 @@ def _repr_html_(
700828
self._custom_attrs["data_bounding_box"].columns.tolist(), axis=1
701829
)
702830

831+
if image_paths_externally_joined:
832+
data = data.drop(
833+
self._custom_attrs["data_image_paths"].columns.tolist(), axis=1
834+
)
835+
703836
formatter = fmt.DataFrameFormatter(
704837
data,
705838
columns=None,

Diff for: tests/conftest.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def fixture_cytotable_nuclear_speckle_data_parquet():
4242
return "tests/data/cytotable/nuclear_speckles/test_slide1_converted.parquet"
4343

4444

45-
@pytest.fixture(name="cytotable_pediatric_cancer_atlas_parquet_parquet")
45+
@pytest.fixture(name="cytotable_pediatric_cancer_atlas_parquet")
4646
def fixture_pediatric_cancer_atlas_data_parquet():
4747
"""
4848
Return df to test CytoTable pediatric cancer atlas data through

0 commit comments

Comments
 (0)