11"""Module for reading and writing manually labelled annotations."""
22
33import json
4+ import warnings
45from pathlib import Path
56from typing import Literal
67
2526] # superset of columns in the standard dataframe
2627
2728
29+ # --- NEW FUNCTION (for issue #43) ---
def _detect_format(file_path: Path) -> Literal["VIA", "COCO"]:
    """Detect the format (VIA or COCO) of a JSON annotation file.

    Detection is based on the presence of characteristic top-level keys
    in the JSON document. The whole file is parsed to inspect them.

    Parameters
    ----------
    file_path : Path
        Path to the input annotation file.

    Returns
    -------
    Literal["VIA", "COCO"]
        The detected format. "COCO" requires all of the keys "images",
        "annotations" and "categories"; "VIA" requires at least one of
        "_via_img_metadata", "_via_attributes" or "_via_settings".

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not point to an existing file.
    ValueError
        If the file cannot be read or decoded as JSON, if the JSON root
        is not an object, or if the format cannot be reliably determined
        from the top-level keys (none match, or both formats match).

    """
    if not file_path.is_file():
        raise FileNotFoundError(f"Annotation file not found: {file_path}")

    try:
        # JSON text is UTF-8; do not depend on the platform's locale
        # encoding, which would break non-ASCII content on some systems.
        with open(file_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Error decoding JSON data from file {file_path}: {e}"
        ) from e
    except (OSError, UnicodeDecodeError, RecursionError) as e:
        # I/O failures, invalid byte sequences and pathologically nested
        # documents are all reported as "unreadable" to the caller.
        raise ValueError(f"Could not read file {file_path}: {e}") from e

    if not isinstance(data, dict):
        raise ValueError(
            f"Expected JSON root to be a dictionary, but got {type(data)} "
            f"in file {file_path}"
        )

    top_level_keys = set(data)

    # Any single VIA key is considered characteristic of a VIA file.
    via_keys = {"_via_img_metadata", "_via_attributes", "_via_settings"}
    # Core COCO keys; 'info' and 'licenses' are optional and not checked.
    coco_keys = {"images", "annotations", "categories"}

    has_via_keys = not via_keys.isdisjoint(top_level_keys)
    has_coco_keys = coco_keys <= top_level_keys

    if has_coco_keys and has_via_keys:
        # Ambiguous input: refuse to guess rather than pick one silently.
        raise ValueError(
            f"File {file_path} contains keys characteristic of *both* VIA "
            "and COCO formats. "
            "Cannot reliably determine format."
        )
    if has_coco_keys:
        return "COCO"
    if has_via_keys:
        return "VIA"
    raise ValueError(
        f"Could not automatically determine format for file {file_path}. "
        "File does not contain characteristic top-level keys for VIA or "
        "COCO."
    )
107+
108+
109+ # --- UPDATED FUNCTION (for issue #43) ---
28110def from_files (
29111 file_paths : Path | str | list [Path | str ],
30- format : Literal ["VIA" , "COCO" ],
31- images_dirs : Path | str | list [Path | str ] | None = None ,
112+ format : Literal ["VIA" , "COCO" , "auto" ] = "auto" , # Changed default and
113+ # added "auto"
114+ images_dirs : Path | str | list [Path | str ] | None = None , # Use Union
32115) -> pd .DataFrame :
33116 """Read input annotation files as a bboxes dataframe.
34117
35118 Parameters
36119 ----------
37120 file_paths : Path | str | list[Path | str]
38121 Path or list of paths to the input annotation files.
39- format : Literal["VIA", "COCO"]
40- Format of the input annotation files.
122+ format : Literal["VIA", "COCO", "auto"], optional
123+ Format of the input annotation files. If set to "auto" (default),
124+ the format will be detected based on the content of the first file
125+ provided. Detection relies on characteristic top-level keys in the
126+ JSON structure.
41127 images_dirs : Path | str | list[Path | str], optional
42128 Path or list of paths to the directories containing the images the
43129 annotations refer to.
@@ -46,54 +132,111 @@ def from_files(
46132 -------
47133 pd.DataFrame
48134 Bounding boxes annotations dataframe. The dataframe is indexed
49- by "annotation_id" and has the following columns: "image_filename",
50- "image_id", "image_width", "image_height", "x_min", "y_min",
51- "width", "height", "supercategory", "category". It also has the
52- following attributes: "annotation_files", "annotation_format",
53- "images_directories". The "image_id" is assigned based
54- on the alphabetically sorted list of unique image filenames across all
55- input files. The "category_id" column is always a 0-based integer,
56- except for VIA files where the values specified in the input file
57- are retained.
135+ by "annotation_id" and has the standard columns (see Notes).
136+ It also has the following attributes: "annotation_files",
137+ "annotation_format", "images_directories".
138+
139+ Raises
140+ ------
141+ ValueError
142+ If format="auto" and the format cannot be detected, or if an
143+ invalid format string is provided.
144+ FileNotFoundError
145+ If format="auto" and the first file path does not exist.
146+ json.JSONDecodeError
147+ If format="auto" and the first file cannot be parsed as JSON.
148+ (Wrapped in ValueError by _detect_format).
58149
59150 Notes
60151 -----
61- We use image filenames' to assign IDs to images, so if two images have the
62- same name but are in different input annotation files, they will be
63- assigned the same image ID and their annotations will be merged.
64-
65- If this behaviour is not desired, and you would like to assign different
66- image IDs to images that have the same name but appear in different input
67- annotation files, you can either make the image filenames distinct before
68- loading the data, or you can load the data from each file
69- as a separate dataframe, and then concatenate them as desired.
152+ The standard dataframe has the following columns: "image_filename",
153+ "image_id", "image_width", "image_height", "x_min", "y_min",
154+ "width", "height", "supercategory", "category", "category_id".
155+
156+ The "image_id" is assigned based on the alphabetically sorted list of
157+ unique image filenames across all input files. The "category_id" column
158+ is always a 0-based integer derived from the category names.
159+
160+ When loading multiple files:
161+ - If format="auto", the format is detected from the *first* file in the
162+ list and assumed to be the same for all subsequent files.
163+ - Image filenames are used to assign unique image IDs. If the same
164+ filename appears in multiple annotation files, annotations will be
165+ merged under the same image_id.
166+ - Duplicate annotations across files are dropped.
70167
71168 See Also
72169 --------
73170 pandas.concat : Concatenate pandas objects along a particular axis.
74-
75171 pandas.DataFrame.drop_duplicates : Return DataFrame with duplicate rows
76172 removed.
77173
78174 """
175+ # Ensure file_paths is a list internally, even if single path is given
176+ if isinstance (file_paths , str | Path ):
177+ input_file_list = [Path (file_paths )]
178+ is_single_file = True
179+ elif isinstance (file_paths , list ):
180+ if not file_paths :
181+ raise ValueError ("Input 'file_paths' list cannot be empty." )
182+ input_file_list = [Path (p ) for p in file_paths ]
183+ is_single_file = False
184+ else :
185+ raise TypeError (
186+ f"Unsupported type for 'file_paths': { type (file_paths )} "
187+ )
188+
189+ # --- Format Detection Logic ---
190+ determined_format : Literal ["VIA" , "COCO" ]
191+ if format == "auto" :
192+ try :
193+ # Detect format based on the first file
194+ determined_format = _detect_format (input_file_list [0 ])
195+ # Optionally warn if multiple files were provided
196+ if not is_single_file :
197+ warnings .warn (
198+ f"Format automatically detected as '{ determined_format } ' "
199+ f"based on the first file '{ input_file_list [0 ].name } '. "
200+ "Assuming all files in the list have the same format." ,
201+ UserWarning ,
202+ stacklevel = 2 ,
203+ )
204+ except (FileNotFoundError , ValueError ) as e :
205+ # Re-raise errors related to detection more informatively
206+ raise ValueError (f"Automatic format detection failed: { e } " ) from e
207+ elif format in ["VIA" , "COCO" ]:
208+ determined_format = format
209+ else :
210+ raise ValueError (
211+ f"Invalid format specified: '{ format } '. Must be 'VIA', "
212+ f"'COCO', or 'auto'."
213+ )
214+ # --- End Format Detection ---
215+
79216 # Delegate to reader of either a single file or multiple files
80- if isinstance (file_paths , list ):
81- df_all = _from_multiple_files (file_paths , format = format )
217+ if is_single_file :
218+ df_all = _from_single_file (
219+ input_file_list [0 ], format = determined_format
220+ )
82221 else :
83- df_all = _from_single_file (file_paths , format = format )
222+ # Pass the list of Path objects
223+ df_all = _from_multiple_files (
224+ input_file_list , format = determined_format
225+ )
84226
85227 # Add metadata
86228 df_all .attrs = {
87- "annotation_files" : file_paths ,
88- "annotation_format" : format ,
229+ "annotation_files" : file_paths , # Store original input representation
230+ "annotation_format" : determined_format ,
231+ # Store detected/validated format
89232 "images_directories" : images_dirs ,
90233 }
91234
92235 return df_all
93236
94237
95238def _from_multiple_files (
96- list_filepaths : list [Path | str ], format : Literal ["VIA" , "COCO" ]
239+ list_filepaths : list [Path ], format : Literal ["VIA" , "COCO" ]
97240):
98241 """Read bounding boxes annotations from multiple files.
99242
0 commit comments