diff --git a/Makefile b/Makefile
index 16bc9c67..98f2db92 100644
--- a/Makefile
+++ b/Makefile
@@ -119,6 +119,9 @@ endif
 
 LOG_FILE = $(OUTPUT_DIR)/training.log
 
+# Folder whose first-level subfolders contain the images for gt generation (subfolder name = ground truth text). Default: $(INPUT_DIR)
+INPUT_DIR = images_in_subfolder_has_content_matching_folder_name
+
 # BEGIN-EVAL makefile-parser --make-help Makefile
 
 help:
@@ -138,6 +141,7 @@ help:
 	@echo " clean-lstmf Clean generated .lstmf files"
 	@echo " clean-output Clean generated output files"
 	@echo " clean Clean all generated files"
+	@echo " generate-gt-from-folder-name Generate .gt.txt files for images based on their parent folder names and organize them in the Tesseract training directory structure or the specified output directory"
 	@echo ""
 	@echo " Variables"
 	@echo ""
@@ -177,7 +181,7 @@ endif
 
 .PRECIOUS: $(LAST_CHECKPOINT)
 
-.PHONY: clean help lists proto-model tesseract-langdata training unicharset charfreq
+.PHONY: clean help lists proto-model tesseract-langdata training unicharset charfreq generate-gt-from-folder-name
 
 ALL_FILES = $(and $(wildcard $(GROUND_TRUTH_DIR)),$(shell find -L $(GROUND_TRUTH_DIR) -name '*.gt.txt'))
 unexport ALL_FILES # prevent adding this to envp in recipes (which can cause E2BIG if too long; cf. make #44853)
@@ -194,6 +198,38 @@ charfreq: $(ALL_GT)
 # Create lists of lstmf filenames for training and eval
 lists: $(OUTPUT_DIR)/list.train $(OUTPUT_DIR)/list.eval
 
+# Supported image extensions as find(1) tests; keep in sync with IMAGE_EXTENSIONS in generate_gt_from_folder.py
+IMAGE_EXTENSIONS = -iname "*.jpg" -o -iname "*.jpeg" -o -iname "*.png" -o \
+    -iname "*.tif" -o -iname "*.tiff" -o -iname "*.bmp" -o -iname "*.gif"
+
+# Generate .gt.txt files for images based on their parent folder names and organize them in the Tesseract training directory structure or the specified output directory
+# Expected layout: $(INPUT_DIR)/<ground truth text>/<image file>
+generate-gt-from-folder-name:
+# Check if INPUT_DIR exists (gives a readable message instead of a Python traceback)
+	@if [ ! -d "$(INPUT_DIR)" ]; then \
+		echo "Error: Input directory '$(INPUT_DIR)' does not exist!" >&2; \
+		exit 1; \
+	fi
+# Check that at least one subdirectory exists
+	@if [ $$(find "$(INPUT_DIR)" -mindepth 1 -maxdepth 1 -type d | wc -l) -eq 0 ]; then \
+		echo "Error: '$(INPUT_DIR)' must contain at least one subdirectory!" >&2; \
+		exit 1; \
+	fi
+# Check for valid image files; only files directly inside a first-level subdirectory count
+	@if [ $$(find "$(INPUT_DIR)" -mindepth 2 -maxdepth 2 -type f \( $(IMAGE_EXTENSIONS) \) | wc -l) -eq 0 ]; then \
+		echo "Error: No valid images found in any subdirectory of '$(INPUT_DIR)'!" >&2; \
+		exit 1; \
+	fi
+# Create the output directory if it does not exist
+	@mkdir -p "$(GROUND_TRUTH_DIR)"
+# Execute the ground truth generation script (not silenced, so a failing invocation is visible)
+	$(PY_CMD) generate_gt_from_folder.py "$(INPUT_DIR)" "$(GROUND_TRUTH_DIR)"
+# Verify that ground truth files exist; unrelated files in the directory must not satisfy the check
+	@if [ $$(find "$(GROUND_TRUTH_DIR)" -maxdepth 1 -type f -name '*.gt.txt' | wc -l) -eq 0 ]; then \
+		echo "Error: No ground truth files were generated in '$(GROUND_TRUTH_DIR)'!" >&2; \
+		exit 1; \
+	fi
+
 $(OUTPUT_DIR):
 	@mkdir -p $@
 
diff --git a/generate_gt_from_folder.py b/generate_gt_from_folder.py
new file mode 100644
index 00000000..6a5b77dc
--- /dev/null
+++ b/generate_gt_from_folder.py
@@ -0,0 +1,131 @@
+"""Generate Tesseract ground truth files from a folder-per-text image tree.
+
+Every image found directly inside a first-level subfolder of the input
+directory is copied into the output directory together with a ``.gt.txt``
+file that contains the name of the subfolder the image came from.
+"""
+import shutil
+import argparse
+from pathlib import Path
+import sys
+from typing import List, Optional, Set
+
+# Lower-case suffixes treated as images; mirrors IMAGE_EXTENSIONS in the Makefile.
+IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp', '.gif'}
+GT_SUFFIX = '.gt.txt'
+
+
+def get_image_files(input_dir: Path, extensions: Set[str]) -> List[Path]:
+    """Return the images located directly inside first-level subfolders.
+
+    Only regular files whose lower-cased suffix is in ``extensions`` are kept.
+    The result is sorted so that repeated runs visit the files, and therefore
+    hand out ``_N`` collision suffixes, in the same order.
+    """
+    return sorted(
+        file
+        for folder in input_dir.iterdir() if folder.is_dir()
+        for file in folder.iterdir()
+        if file.is_file() and file.suffix.lower() in extensions
+    )
+
+
+def _taken_stems(output_dir: Path) -> Set[str]:
+    """Return the base names already used in ``output_dir``.
+
+    ``Path.stem`` only strips the last suffix, so ``a.gt.txt`` would yield
+    ``a.gt``; the ground truth suffix is therefore stripped as a whole.
+    """
+    return {
+        f.name[:-len(GT_SUFFIX)] if f.name.endswith(GT_SUFFIX) else f.stem
+        for f in output_dir.glob("*")
+    }
+
+
+def get_unique_base(output_dir: Path, base: str, ext: str,
+                    existing: Optional[Set[str]] = None) -> str:
+    """Return ``base``, or ``base_N`` for the smallest free N, as a unique name.
+
+    ``ext`` is unused and kept for backward compatibility: the image and its
+    ``.gt.txt`` file share one base name, so uniqueness is decided on that
+    name alone. ``existing`` is the set of names already taken; when omitted
+    it is read from ``output_dir``. Callers placing many files should pass
+    (and update) one set instead of rescanning the directory per file.
+    """
+    if existing is None:
+        existing = _taken_stems(output_dir)
+
+    candidate = base
+    counter = 1
+    while candidate in existing:
+        candidate = f"{base}_{counter}"
+        counter += 1
+
+    return candidate
+
+
+def generate_gt_from_folders(input_dir: Path, output_dir: Path) -> int:
+    """Copy every image into ``output_dir`` and write its ``.gt.txt`` file.
+
+    The text of each ground truth file is the name of the image's parent
+    folder. Files that cannot be processed are reported on stderr and skipped.
+    Returns the number of image/ground-truth pairs written.
+    """
+    print("Ground-Truth generation beginning...")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    image_files = get_image_files(input_dir, IMAGE_EXTENSIONS)
+    # Scan the output directory once; the set is kept current in the loop.
+    taken = _taken_stems(output_dir)
+
+    total_files = 0
+
+    for file_path in image_files:
+        folder_name = file_path.parent.name
+        ext = file_path.suffix.lower()
+
+        unique_base = get_unique_base(output_dir, file_path.stem, ext, taken)
+        dest_image = output_dir / f"{unique_base}{ext}"
+        txt_filepath = output_dir / f"{unique_base}{GT_SUFFIX}"
+
+        try:
+            # Copy first: a failed copy must not leave a .gt.txt without image.
+            shutil.copy(file_path, dest_image)
+            txt_filepath.write_text(folder_name, encoding='utf-8')
+        except OSError as e:
+            print(f"Error processing {file_path}: {e}", file=sys.stderr)
+            # Both names were unused when they were picked, so anything found
+            # under them now is a partial result of this iteration.
+            for partial in (dest_image, txt_filepath):
+                try:
+                    partial.unlink()
+                except OSError:
+                    pass  # never created, or not removable either
+            continue
+
+        taken.add(unique_base)
+        total_files += 1
+        print(f"Processed: {file_path.name} -> {folder_name}")
+
+    print(f"Done! Processed {total_files} image files.")
+    return total_files
+
+
+def main() -> None:
+    """Parse the command line and exit with status 1 if nothing was generated."""
+    parser = argparse.ArgumentParser(description="Generate .gt.txt files from folder names for Tesseract training")
+    parser.add_argument('input_dir', type=Path, help='Input directory containing subfolders with images')
+    parser.add_argument('output_dir', type=Path, help='Output directory')
+    args = parser.parse_args()
+
+    input_dir = args.input_dir.resolve()
+    if not input_dir.is_dir():
+        parser.error(f"input directory '{input_dir}' does not exist")
+
+    count = generate_gt_from_folders(input_dir, args.output_dir.resolve())
+    if count == 0:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()