Skip to content

Add a function that generates .gt.txt files from the folder name for all images inside that folder #420

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ endif

LOG_FILE = $(OUTPUT_DIR)/training.log

# Folder containing multiple subfolders for gt generation. Default: $(INPUT_DIR)
INPUT_DIR = images_in_subfolder_has_content_matching_folder_name

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
Expand All @@ -138,6 +141,7 @@ help:
@echo " clean-lstmf Clean generated .lstmf files"
@echo " clean-output Clean generated output files"
@echo " clean Clean all generated files"
@echo " generate-gt-from-folder-name Generate .gt.txt files for images based on their parent folder names and organize them in the Tesseract training directory structure or the specified output directory"
@echo ""
@echo " Variables"
@echo ""
Expand Down Expand Up @@ -177,7 +181,7 @@ endif

.PRECIOUS: $(LAST_CHECKPOINT)

.PHONY: clean help lists proto-model tesseract-langdata training unicharset charfreq
.PHONY: clean help lists proto-model tesseract-langdata training unicharset charfreq generate-gt-from-folder-name

ALL_FILES = $(and $(wildcard $(GROUND_TRUTH_DIR)),$(shell find -L $(GROUND_TRUTH_DIR) -name '*.gt.txt'))
unexport ALL_FILES # prevent adding this to envp in recipes (which can cause E2BIG if too long; cf. make #44853)
Expand All @@ -194,6 +198,37 @@ charfreq: $(ALL_GT)
# Create lists of lstmf filenames for training and eval
lists: $(OUTPUT_DIR)/list.train $(OUTPUT_DIR)/list.eval

# Supported image extensions, written as a chain of case-insensitive `find`
# name tests joined with -o. The expansion is meant to be wrapped in \( ... \)
# (as the generate-gt-from-folder-name recipe does) so that the alternatives
# group together when combined with other find predicates such as -type f.
IMAGE_EXTENSIONS=-iname "*.jpg" -o -iname "*.jpeg" -o -iname "*.png" -o \
-iname "*.tif" -o -iname "*.tiff" -o -iname "*.bmp" -o -iname "*.gif"

# Generate .gt.txt files for images based on their parent folder names and organize them in the Tesseract training directory structure or the specified output directory
generate-gt-from-folder-name:
	@# Reads:  INPUT_DIR (one subfolder per label), GROUND_TRUTH_DIR (output).
	@# Each pre-flight check below aborts the recipe with exit status 1.
	@# Check if INPUT_DIR exists
	@if [ ! -d "$(INPUT_DIR)" ]; then \
		echo "Error: Input directory '$(INPUT_DIR)' does not exist!" >&2; \
		exit 1; \
	fi
	@# Check that at least one subdirectory exists
	@if [ $$(find "$(INPUT_DIR)" -mindepth 1 -maxdepth 1 -type d | wc -l) -eq 0 ]; then \
		echo "Error: '$(INPUT_DIR)' must contain at least one subdirectory!" >&2; \
		exit 1; \
	fi
	@# Check for valid image files (exactly one level below INPUT_DIR)
	@if [ $$(find "$(INPUT_DIR)" -mindepth 2 -maxdepth 2 -type f \( $(IMAGE_EXTENSIONS) \) | wc -l) -eq 0 ]; then \
		echo "Error: No valid images found in any subdirectory of '$(INPUT_DIR)'!" >&2; \
		exit 1; \
	fi
	@# Create the output directory if it does not exist
	@mkdir -p "$(GROUND_TRUTH_DIR)"
	@# Execute the ground truth generation script
	@$(PY_CMD) generate_gt_from_folder.py "$(INPUT_DIR)" "$(GROUND_TRUTH_DIR)"
	@# Verify that ground truth files were generated. Count *.gt.txt files
	@# specifically: a plain `ls` would be satisfied by any unrelated file that
	@# already lives in GROUND_TRUTH_DIR.
	@if [ $$(find "$(GROUND_TRUTH_DIR)" -maxdepth 1 -type f -name '*.gt.txt' | wc -l) -eq 0 ]; then \
		echo "Error: No ground truth files were generated in '$(GROUND_TRUTH_DIR)'!" >&2; \
		exit 1; \
	fi

$(OUTPUT_DIR):
@mkdir -p $@

Expand Down
73 changes: 73 additions & 0 deletions generate_gt_from_folder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import shutil
import argparse
from pathlib import Path
import sys
from typing import List, Set


def get_image_files(input_dir: Path, extensions: Set[str]) -> List[Path]:
    """Return the image files found in the first-level subdirectories of *input_dir*.

    Args:
        input_dir: Directory whose immediate subdirectories are scanned.
            Files placed directly in *input_dir* are ignored.
        extensions: Lower-case suffixes (including the dot, e.g. ``".png"``)
            that count as images; matching is case-insensitive.

    Returns:
        Matching paths, sorted by folder and then by file, so repeated runs
        process (and de-duplicate) files in the same order.
    """
    return [
        file
        for folder in sorted(input_dir.iterdir()) if folder.is_dir()
        for file in sorted(folder.iterdir())
        # is_file() keeps out directories that merely carry an image suffix.
        if file.is_file() and file.suffix.lower() in extensions
    ]


def get_unique_base(output_dir: Path, base: str, ext: str) -> str:
    """Return a base name that collides with nothing already in *output_dir*.

    A name counts as taken when any existing entry shares it once its suffix
    is removed. For ground-truth files the whole ``.gt.txt`` suffix is
    removed, so an existing ``a.gt.txt`` reserves ``a`` even when no image
    named ``a.*`` is present.

    Args:
        output_dir: Directory that will receive the image and its ``.gt.txt``.
        base: Preferred base name (file name without suffix).
        ext: Image suffix. Unused; kept so existing callers keep working.

    Returns:
        *base* if it is free, otherwise ``base_1``, ``base_2``, ... using the
        first free counter.
    """
    gt_suffix = ".gt.txt"
    taken = set()
    for entry in output_dir.glob("*"):
        name = entry.name
        if name.endswith(gt_suffix):
            # Path.stem would give "a.gt" here and miss the collision on "a".
            taken.add(name[:-len(gt_suffix)])
        else:
            taken.add(entry.stem)

    candidate = base
    counter = 1
    while candidate in taken:
        candidate = f"{base}_{counter}"
        counter += 1

    return candidate


def generate_gt_from_folders(input_dir: Path, output_dir: Path) -> int:
    """Copy every image into *output_dir* and write a matching ``.gt.txt``.

    Each image found one level below *input_dir* is copied to *output_dir*
    under a collision-free base name, next to a UTF-8 ``<base>.gt.txt`` whose
    content is the name of the image's parent folder.

    Args:
        input_dir: Directory containing one subfolder per label.
        output_dir: Destination directory; created if missing.

    Returns:
        Number of images for which both the copy and the ground-truth file
        were written. Files that fail with an ``OSError`` are reported on
        stderr and skipped.
    """
    print("Ground-Truth generation beginning...")
    output_dir.mkdir(parents=True, exist_ok=True)

    image_extensions = {'.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp', '.gif'}
    image_files = get_image_files(input_dir, image_extensions)

    total_files = 0

    for file_path in image_files:
        folder_name = file_path.parent.name
        ext = file_path.suffix.lower()

        unique_base = get_unique_base(output_dir, file_path.stem, ext)
        dest_image = output_dir / f"{unique_base}{ext}"
        txt_filepath = output_dir / f"{unique_base}.gt.txt"

        # Only outputs that do not exist yet may be removed on failure, so
        # cleanup never deletes a file that was there before this run.
        created = [p for p in (dest_image, txt_filepath) if not p.exists()]

        try:
            # Copy first: the image is the step most likely to fail, and a
            # .gt.txt without its image would be an orphan in the training set.
            shutil.copy(file_path, dest_image)
            txt_filepath.write_text(folder_name, encoding='utf-8')
        except OSError as e:
            print(f"Error processing {file_path}: {e}", file=sys.stderr)
            for partial in created:
                try:
                    partial.unlink()
                except FileNotFoundError:
                    pass  # This output was never created.
                except OSError as cleanup_err:
                    print(f"Could not remove partial output {partial}: {cleanup_err}", file=sys.stderr)
            continue

        total_files += 1
        print(f"Processed: {file_path.name} -> {folder_name}")

    print(f"Done! Processed {total_files} image files.")
    return total_files


def main():
    """Parse the command line and run ground-truth generation.

    Exits with status 2 (argparse usage error) when the input directory is
    missing or not a directory, and with status 1 when no image was processed.
    """
    parser = argparse.ArgumentParser(description="Generate .gt.txt files from folder names for Tesseract training")
    parser.add_argument('input_dir', type=Path, help='Input directory containing subfolders with images')
    parser.add_argument('output_dir', type=Path, help='Output directory')
    args = parser.parse_args()

    input_dir = args.input_dir.resolve()
    if not input_dir.is_dir():
        # Fail with a readable message instead of a traceback from iterdir().
        parser.error(f"input directory '{input_dir}' does not exist or is not a directory")

    count = generate_gt_from_folders(input_dir, args.output_dir.resolve())
    if count == 0:
        sys.exit(1)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()