Skip to content

Commit aacdbe8

Browse files
committed
Feat: Add auto-detection for annotation file format
1 parent 4bd1922 commit aacdbe8

File tree

2 files changed

+370
-29
lines changed

2 files changed

+370
-29
lines changed

ethology/annotations/io/load_bboxes.py

Lines changed: 172 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Module for reading and writing manually labelled annotations."""
22

33
import json
4+
import warnings
45
from pathlib import Path
56
from typing import Literal
67

@@ -25,19 +26,104 @@
2526
] # superset of columns in the standard dataframe
2627

2728

29+
# --- NEW FUNCTION (for issue #43) ---
30+
def _detect_format(file_path: Path) -> Literal["VIA", "COCO"]:
    """Detect the format (VIA or COCO) of a JSON annotation file.

    Detection is based on the presence of characteristic top-level keys.
    A file is considered COCO if it contains *all* of "images",
    "annotations" and "categories". It is considered VIA if it contains
    *any* of "_via_img_metadata", "_via_attributes" or "_via_settings".

    Parameters
    ----------
    file_path : Path
        Path to the input annotation file. A string path is also accepted
        and converted to a ``Path``.

    Returns
    -------
    Literal["VIA", "COCO"]
        The detected format.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` is not an existing file.
    ValueError
        If the file cannot be read or decoded as JSON, if the JSON root is
        not a dictionary, or if the format cannot be reliably determined
        from the top-level keys (no characteristic keys found, or keys of
        both formats found).

    Notes
    -----
    The whole file is parsed in order to inspect its top-level keys.

    """
    file_path = Path(file_path)
    if not file_path.is_file():
        raise FileNotFoundError(f"Annotation file not found: {file_path}")

    try:
        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding.
        with open(file_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Error decoding JSON data from file {file_path}: {e}"
        ) from e
    except (OSError, UnicodeDecodeError) as e:
        # Only genuine read/decoding failures are reported as unreadable
        # files; anything else is left to propagate unchanged.
        raise ValueError(f"Could not read file {file_path}: {e}") from e

    if not isinstance(data, dict):
        raise ValueError(
            f"Expected JSON root to be a dictionary, but got {type(data)} "
            f"in file {file_path}"
        )

    top_level_keys = set(data)

    # Any single VIA key is treated as evidence of a VIA file
    via_keys = {"_via_img_metadata", "_via_attributes", "_via_settings"}
    # All core COCO keys are required ('info' and 'licenses' are optional)
    coco_keys = {"images", "annotations", "categories"}

    has_via_keys = not via_keys.isdisjoint(top_level_keys)
    has_coco_keys = coco_keys <= top_level_keys

    if has_coco_keys and has_via_keys:
        # Ambiguous file: refuse to guess rather than silently pick one
        raise ValueError(
            f"File {file_path} contains keys characteristic of *both* VIA "
            "and COCO formats. "
            "Cannot reliably determine format."
        )
    if has_coco_keys:
        return "COCO"
    if has_via_keys:
        return "VIA"
    raise ValueError(
        f"Could not automatically determine format for file {file_path}. "
        "File does not contain characteristic top-level keys for VIA or "
        "COCO."
    )
107+
108+
109+
# --- UPDATED FUNCTION (for issue #43) ---
28110
def from_files(
29111
file_paths: Path | str | list[Path | str],
30-
format: Literal["VIA", "COCO"],
31-
images_dirs: Path | str | list[Path | str] | None = None,
112+
format: Literal["VIA", "COCO", "auto"] = "auto", # Changed default and
113+
# added "auto"
114+
images_dirs: Path | str | list[Path | str] | None = None, # Use Union
32115
) -> pd.DataFrame:
33116
"""Read input annotation files as a bboxes dataframe.
34117
35118
Parameters
36119
----------
37120
file_paths : Path | str | list[Path | str]
38121
Path or list of paths to the input annotation files.
39-
format : Literal["VIA", "COCO"]
40-
Format of the input annotation files.
122+
format : Literal["VIA", "COCO", "auto"], optional
123+
Format of the input annotation files. If set to "auto" (default),
124+
the format will be detected based on the content of the first file
125+
provided. Detection relies on characteristic top-level keys in the
126+
JSON structure.
41127
images_dirs : Path | str | list[Path | str], optional
42128
Path or list of paths to the directories containing the images the
43129
annotations refer to.
@@ -46,54 +132,111 @@ def from_files(
46132
-------
47133
pd.DataFrame
48134
Bounding boxes annotations dataframe. The dataframe is indexed
49-
by "annotation_id" and has the following columns: "image_filename",
50-
"image_id", "image_width", "image_height", "x_min", "y_min",
51-
"width", "height", "supercategory", "category". It also has the
52-
following attributes: "annotation_files", "annotation_format",
53-
"images_directories". The "image_id" is assigned based
54-
on the alphabetically sorted list of unique image filenames across all
55-
input files. The "category_id" column is always a 0-based integer,
56-
except for VIA files where the values specified in the input file
57-
are retained.
135+
by "annotation_id" and has the standard columns (see Notes).
136+
It also has the following attributes: "annotation_files",
137+
"annotation_format", "images_directories".
138+
139+
Raises
140+
------
141+
ValueError
142+
If format="auto" and the format cannot be detected, or if an
143+
invalid format string is provided.
144+
FileNotFoundError
145+
If format="auto" and the first file path does not exist.
146+
json.JSONDecodeError
147+
If format="auto" and the first file cannot be parsed as JSON.
148+
(Wrapped in ValueError by _detect_format).
58149
59150
Notes
60151
-----
61-
We use image filenames' to assign IDs to images, so if two images have the
62-
same name but are in different input annotation files, they will be
63-
assigned the same image ID and their annotations will be merged.
64-
65-
If this behaviour is not desired, and you would like to assign different
66-
image IDs to images that have the same name but appear in different input
67-
annotation files, you can either make the image filenames distinct before
68-
loading the data, or you can load the data from each file
69-
as a separate dataframe, and then concatenate them as desired.
152+
The standard dataframe has the following columns: "image_filename",
153+
"image_id", "image_width", "image_height", "x_min", "y_min",
154+
"width", "height", "supercategory", "category", "category_id".
155+
156+
The "image_id" is assigned based on the alphabetically sorted list of
157+
unique image filenames across all input files. The "category_id" column
158+
is always a 0-based integer derived from the category names.
159+
160+
When loading multiple files:
161+
- If format="auto", the format is detected from the *first* file in the
162+
list and assumed to be the same for all subsequent files.
163+
- Image filenames are used to assign unique image IDs. If the same
164+
filename appears in multiple annotation files, annotations will be
165+
merged under the same image_id.
166+
- Duplicate annotations across files are dropped.
70167
71168
See Also
72169
--------
73170
pandas.concat : Concatenate pandas objects along a particular axis.
74-
75171
pandas.DataFrame.drop_duplicates : Return DataFrame with duplicate rows
76172
removed.
77173
78174
"""
175+
# Ensure file_paths is a list internally, even if single path is given
176+
if isinstance(file_paths, str | Path):
177+
input_file_list = [Path(file_paths)]
178+
is_single_file = True
179+
elif isinstance(file_paths, list):
180+
if not file_paths:
181+
raise ValueError("Input 'file_paths' list cannot be empty.")
182+
input_file_list = [Path(p) for p in file_paths]
183+
is_single_file = False
184+
else:
185+
raise TypeError(
186+
f"Unsupported type for 'file_paths': {type(file_paths)}"
187+
)
188+
189+
# --- Format Detection Logic ---
190+
determined_format: Literal["VIA", "COCO"]
191+
if format == "auto":
192+
try:
193+
# Detect format based on the first file
194+
determined_format = _detect_format(input_file_list[0])
195+
# Optionally warn if multiple files were provided
196+
if not is_single_file:
197+
warnings.warn(
198+
f"Format automatically detected as '{determined_format}' "
199+
f"based on the first file '{input_file_list[0].name}'. "
200+
"Assuming all files in the list have the same format.",
201+
UserWarning,
202+
stacklevel=2,
203+
)
204+
except (FileNotFoundError, ValueError) as e:
205+
# Re-raise errors related to detection more informatively
206+
raise ValueError(f"Automatic format detection failed: {e}") from e
207+
elif format in ["VIA", "COCO"]:
208+
determined_format = format
209+
else:
210+
raise ValueError(
211+
f"Invalid format specified: '{format}'. Must be 'VIA', "
212+
f"'COCO', or 'auto'."
213+
)
214+
# --- End Format Detection ---
215+
79216
# Delegate to reader of either a single file or multiple files
80-
if isinstance(file_paths, list):
81-
df_all = _from_multiple_files(file_paths, format=format)
217+
if is_single_file:
218+
df_all = _from_single_file(
219+
input_file_list[0], format=determined_format
220+
)
82221
else:
83-
df_all = _from_single_file(file_paths, format=format)
222+
# Pass the list of Path objects
223+
df_all = _from_multiple_files(
224+
input_file_list, format=determined_format
225+
)
84226

85227
# Add metadata
86228
df_all.attrs = {
87-
"annotation_files": file_paths,
88-
"annotation_format": format,
229+
"annotation_files": file_paths, # Store original input representation
230+
"annotation_format": determined_format,
231+
# Store detected/validated format
89232
"images_directories": images_dirs,
90233
}
91234

92235
return df_all
93236

94237

95238
def _from_multiple_files(
96-
list_filepaths: list[Path | str], format: Literal["VIA", "COCO"]
239+
list_filepaths: list[Path], format: Literal["VIA", "COCO"]
97240
):
98241
"""Read bounding boxes annotations from multiple files.
99242

0 commit comments

Comments
 (0)