Update Docling version and improve OCR options handling with new docling version #574

Open · wants to merge 5 commits into base: main
requirements.txt (5 changes: 2 additions & 3 deletions)
@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
-docling[tesserocr]>=2.4.2,<=2.8.3; sys_platform != 'darwin'
-docling>=2.4.2,<=2.8.3; sys_platform == 'darwin'
-docling-parse>=2.0.0,<3.0.0
+docling[tesserocr]>=2.18.0; sys_platform != 'darwin'
+docling>=2.18.0; sys_platform == 'darwin'
 GitPython>=3.1.42,<4.0.0
 gguf>=0.6.0
 httpx>=0.25.0,<1.0.0
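Reviewer note: the markers above keep the `tesserocr` extra off macOS, where tesserocr is hard to build, and the `<=2.8.3` ceiling plus the separate `docling-parse` pin are dropped now that docling >= 2.18.0 bundles its own parser. A quick sketch of how pip evaluates these PEP 508 markers; this is not part of the PR, and it assumes the third-party `packaging` library (which pip vendors internally):

```python
# Evaluate the PEP 508 environment markers used in requirements.txt.
from packaging.markers import Marker

non_mac = Marker("sys_platform != 'darwin'")
print(non_mac.evaluate({"sys_platform": "linux"}))   # True  -> docling[tesserocr]
print(non_mac.evaluate({"sys_platform": "darwin"}))  # False -> plain docling
```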
src/instructlab/sdg/utils/chunkers.py (46 changes: 38 additions & 8 deletions)
@@ -4,13 +4,17 @@
 from typing import Dict, Iterable, List, Optional
 import json
 import logging
+import os
 import re
+import sys

 # Third Party
 from datasets import Dataset
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
@@ -35,29 +39,50 @@ def _num_chars_from_tokens(num_tokens) -> int:
     return int(num_tokens * 4)  # 1 token ~ 4 English character


-def resolve_ocr_options() -> OcrOptions:
+def resolve_ocr_options(
+    docling_model_path: Optional[Path] = None,
+) -> Optional[OcrOptions]:
+    # Declare ocr_options explicitly as Optional[OcrOptions]
+    ocr_options: Optional[OcrOptions] = None
+
     # First, attempt to use tesserocr
     try:
         ocr_options = TesseractOcrOptions()
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.models.tesseract_ocr_model import TesseractOcrModel

-        _ = TesseractOcrModel(True, ocr_options)
+        _ = TesseractOcrModel(
+            enabled=True,
+            artifacts_path=docling_model_path,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # No tesserocr, so try something else
-        pass
+        logger.warning("Tesseract not found, falling back to EasyOCR.")
+
     try:
-        ocr_options = EasyOcrOptions()
-        # Keep easyocr models on the CPU instead of GPU
-        ocr_options.use_gpu = False
+        ocr_options = EasyOcrOptions(
+            lang=["en"],
+            use_gpu=None,
+            confidence_threshold=0.5,
+            model_storage_directory=str(docling_model_path),
+            recog_network="standard",
+            download_enabled=True,
+        )
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.models.easyocr_model import EasyOcrModel

-        _ = EasyOcrModel(True, ocr_options)
+        _ = EasyOcrModel(
+            enabled=True,
+            artifacts_path=None,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # no easyocr either, so don't use any OCR
@@ -127,7 +152,12 @@ def _init_docling_converter(self):
             do_ocr=False,
         )

-        ocr_options = resolve_ocr_options()
+        # deactivate MPS acceleration on Github CI
+        if os.getenv("CI") and sys.platform == "darwin":
+            pipeline_options.accelerator_options = AcceleratorOptions(
+                device=AcceleratorDevice.CPU
+            )
+        ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path)
         if ocr_options is not None:
             pipeline_options.do_ocr = True
             pipeline_options.ocr_options = ocr_options
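Reviewer note: a minimal sketch of how the reworked `resolve_ocr_options` is consumed downstream, following the docling 2.x converter API as far as I know it; the model path here is hypothetical and the block is illustrative, not part of the diff:

```python
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

from instructlab.sdg.utils.chunkers import resolve_ocr_options

pipeline_options = PdfPipelineOptions(do_ocr=False)

# Tesseract is tried first; an ImportError falls through to EasyOCR, and if
# neither backend is importable the function returns None and OCR stays off.
ocr_options = resolve_ocr_options(
    docling_model_path=Path.home() / ".cache" / "instructlab" / "models"  # hypothetical path
)
if ocr_options is not None:
    pipeline_options.do_ocr = True
    pipeline_options.ocr_options = ocr_options

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```

Forcing `AcceleratorDevice.CPU` when probing the backends keeps the availability check cheap; the actual conversion can still pick a GPU device later.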
src/instructlab/sdg/utils/taxonomy.py (94 changes: 15 additions & 79 deletions)
@@ -3,17 +3,14 @@
 # Standard
 from pathlib import Path
 from tempfile import mkdtemp
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Union
 import glob
 import logging
 import os
 import re

 # Third Party
 from datasets import Dataset
-
-# pylint: disable=no-name-in-module
-from docling_parse.docling_parse import pdf_parser_v1
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -27,9 +24,6 @@
 # Local
 from .chunkers import DocumentChunker

-# Initialize the pdf parser
-PDFParser = pdf_parser_v1()
-
 logger = logging.getLogger(__name__)


@@ -126,9 +120,9 @@ def _get_documents(
     source: Dict[str, Union[str, List[str]]],
     skip_checkout: bool = False,
     document_output_dir: Path = None,
-) -> Tuple[List[str], List[Path]]:
+) -> List[Path]:
     """
-    Retrieve the content of files (Markdown and PDF) from a Git repository.
+    Retrieve the file paths of files (Markdown and PDF) from a Git repository.

     Args:
         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
@@ -147,14 +141,13 @@
     repo_url = source.get("repo")
     commit_hash = source.get("commit")
     file_patterns = source.get("patterns", [])
-
-    try:  # pylint: disable=too-many-nested-blocks
+    # pylint: disable=too-many-nested-blocks
+    try:
         repo = git.Repo.clone_from(repo_url, document_output_dir)

         if not skip_checkout and commit_hash:
             repo.git.checkout(commit_hash)

-        file_contents = []
         filepaths = []

         logger.info("Processing files...")
@@ -170,7 +163,6 @@
logger.info(f"Processing file: {file_path}")
try:
if file_path.lower().endswith(".md"):
# Process Markdown files
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
if _string_contains_html(content):
@@ -179,75 +171,19 @@
"NOTE: Continuing this might affect your data generation quality."
"To get best results please format your markdown documents without the use of HTML or use a different document filetype."
)
file_contents.append(content)
filepaths.append(Path(file_path))
logger.info(
f"Appended Markdown content from {file_path}"
)

elif file_path.lower().endswith(".pdf"):
# Process PDF files using docling_parse's pdf_parser_v1
doc_key = f"key_{os.path.basename(file_path)}" # Unique document key
logger.info(f"Loading PDF document from {file_path}")

success = PDFParser.load_document(doc_key, file_path)
if not success:
logger.warning(
f"Failed to load PDF document: {file_path}"
)
continue

num_pages = PDFParser.number_of_pages(doc_key)
logger.info(f"PDF '{file_path}' has {num_pages} pages.")

pdf_text = ""

for page in range(num_pages):
try:
json_doc = PDFParser.parse_pdf_from_key_on_page(
doc_key, page
)
if "pages" not in json_doc or not json_doc["pages"]:
logger.warning(
f"Page {page + 1} could not be parsed in '{file_path}'"
)
continue

json_page = json_doc["pages"][0]

# Extract text from cells
for cell in json_page.get("cells", []):
text = cell.get("content", {}).get(
"rnormalized", ""
)
if text.strip(): # Only append non-empty text
pdf_text += text.strip() + "\n"
except Exception as page_error: # pylint: disable=broad-exception-caught
logger.warning(
f"Error parsing page {page + 1} of '{file_path}': {page_error}"
)
continue

if pdf_text:
file_contents.append(pdf_text)
filepaths.append(Path(file_path))

# Unload the document to free memory
PDFParser.unload_document(doc_key)
logger.info(f"Unloaded PDF document: {file_path}")

else:
logger.info(f"Skipping unsupported file type: {file_path}")
except Exception as file_error: # pylint: disable=broad-exception-caught
filepaths.append(Path(file_path))
logger.info(f"Collected filepath: {file_path}")
# pylint: disable=broad-exception-caught
except Exception as file_error:
logger.error(
f"Error processing file '{file_path}': {file_error}"
)
continue
else:
logger.info(f"Skipping non-file path: {file_path}")

if file_contents:
return file_contents, filepaths
if filepaths:
return filepaths
raise SystemExit("Couldn't find knowledge documents")

except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
@@ -281,13 +217,13 @@ def _read_taxonomy_file(
     task_description = contents.get("task_description", None)
     domain = contents.get("domain")
     documents = contents.get("document")
-    document_contents, doc_filepaths = None, None
+    doc_filepaths = None
     if documents:
         os.makedirs(document_output_dir, exist_ok=True)
         unique_output_dir = mkdtemp(
             prefix=f"{leaf_node_path}_", dir=document_output_dir
         )
-        document_contents, doc_filepaths = _get_documents(
+        doc_filepaths = _get_documents(
             source=documents,
             document_output_dir=unique_output_dir,
         )
@@ -302,7 +238,6 @@
"questions_and_answers": question_answer_list,
"context": context,
"taxonomy_path": tax_path,
"documents": document_contents,
"filepaths": doc_filepaths,
"domain": domain,
"document_outline": contents.get("document_outline"),
@@ -493,7 +428,8 @@ def leaf_node_to_samples(
     docling_model_path=None,
 ):
     samples = []
-    if leaf_node and leaf_node[0].get("documents"):
+    # check if the leaf node has document filepaths, if so, it's a knowledge leaf node
+    if leaf_node and (leaf_node[0].get("filepaths")):
         samples = _knowledge_leaf_node_to_samples(
             leaf_node,
             server_ctx_size,
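Reviewer note: with the docling_parse-based extraction gone, `_get_documents` now only collects paths and leaves reading and parsing to the `DocumentChunker`. A hypothetical call illustrating the narrower contract, as if written inside taxonomy.py; the repo URL, commit, and output directory are made up:

```python
from pathlib import Path

# Hypothetical source dict in the shape the taxonomy yaml provides.
source = {
    "repo": "https://github.com/example/knowledge-docs.git",
    "commit": "0123abc",
    "patterns": ["*.md", "*.pdf"],
}

filepaths = _get_documents(
    source=source,
    document_output_dir=Path("/tmp/knowledge_docs"),
)
# Previously this returned (file_contents, filepaths); now it is just
# List[Path], and the leaf node stores it under "filepaths" rather than
# carrying pre-extracted text under "documents".
assert all(isinstance(p, Path) for p in filepaths)
```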
tests/functional/test_chunkers.py (5 changes: 4 additions & 1 deletion)
@@ -55,6 +55,9 @@ def test_chunk_documents(
     chunks = chunker.chunk_documents()
     assert len(chunks) > expected_chunks
     if contains_text:
-        assert contains_text in chunks[0]
+        # Normalize spaces and remove newlines for more flexible text comparison
+        normalized_chunk = " ".join(chunks[0].replace("\n", " ").split())
+        normalized_text = " ".join(contains_text.split())
+        assert normalized_text in normalized_chunk
     for chunk in chunks:
         assert len(chunk) < 2500