
Commit d97fb71

Committed Feb 12, 2025
Update PDF extraction and OCR options for hybrid chunking
Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
1 parent: e2296fb · commit: d97fb71

File tree

2 files changed: +63, -264 lines

 

src/instructlab/sdg/utils/chunkers.py (+11, -215)
@@ -8,9 +8,11 @@
 
 # Third Party
 from datasets import Dataset
+from docling.chunking import HybridChunker
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
@@ -57,7 +59,10 @@ def resolve_ocr_options() -> OcrOptions:
         # Third Party
         from docling.models.easyocr_model import EasyOcrModel
 
-        _ = EasyOcrModel(True, ocr_options)
+        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
+        _ = EasyOcrModel(
+            True, ocr_options, None, accelerator_options=accelerator_options
+        )
         return ocr_options
     except ImportError:
         # no easyocr either, so don't use any OCR
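For context, the snippet below is a minimal, hypothetical sketch (not part of the commit) of how the new AcceleratorOptions value feeds the EasyOCR availability probe that resolve_ocr_options() performs. The num_threads and device values are placeholders, and exact constructor arguments may differ between docling releases.

# Hypothetical sketch mirroring the change above; values are placeholders.
from docling.datamodel.pipeline_options import AcceleratorOptions, EasyOcrOptions

ocr_options = EasyOcrOptions()
accelerator_options = AcceleratorOptions(num_threads=4, device="auto")  # assumed values

try:
    # Third Party
    from docling.models.easyocr_model import EasyOcrModel

    # Instantiating the model up front surfaces missing EasyOCR dependencies early,
    # which is the purpose of the probe in resolve_ocr_options() above.
    _ = EasyOcrModel(True, ocr_options, None, accelerator_options=accelerator_options)
except ImportError:
    ocr_options = None  # fall back to no OCR, as the surrounding function does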
@@ -185,11 +190,11 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
         with open(json_fp, "r", encoding="utf-8") as f:
             data = json.load(f)
 
-        chunks = self.build_chunks_from_docling_json(
-            data,
-            max_token_per_chunk=500,
-            tokenizer=self.tokenizer,
-        )
+        chunker = HybridChunker(tokenizer=self.tokenizer, max_token_per_chunk=500)
+        chunk_iter = chunker.chunk(
+            dl_doc=data
+        )  # Use hybrid chunker to chunk the document
+        chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter]
         fused_texts = self.fuse_texts(chunks, 200)
 
         num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
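For reference, here is a minimal sketch (not part of the commit) of driving docling's HybridChunker end to end. The input file, tokenizer id, and the serialize call are assumptions; parameter names (e.g. max_tokens versus the max_token_per_chunk used above) and method names vary across docling versions.

# Hypothetical sketch of HybridChunker usage on a freshly converted document.
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("sample.pdf").document  # hypothetical input file
chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5", max_tokens=500)  # assumed args
for chunk in chunker.chunk(dl_doc=doc):
    # serialize() renders a chunk back to plain text; some docling-core versions
    # expose this as contextualize() instead.
    print(chunker.serialize(chunk=chunk)[:80])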
@@ -288,215 +293,6 @@ def get_token_count(self, text, tokenizer):
         """
         return len(tokenizer.tokenize(text))
 
-    def add_heading_formatting(self, text):
-        """
-        Add heading formatting to the text if the first part is short.
-        Args:
-            text (str): The input text to format.
-        Returns:
-            str: Formatted text with headings applied.
-        """
-        text = text.split(".")
-
-        # Change this from hardcoded to something more flexible
-        if len(text) > 1 and len(text[0].split(" ")) < 3:
-            text = f"**{text[0]}**" + ".".join(text[1:])
-        else:
-            text = ".".join(text)
-        return text
-
-    def generate_table_from_parsed_rep(self, item):
-        """
-        Generate the table from the parsed representation and return as a string.
-        Args:
-            item (dict): Parsed representation of a table.
-        Returns:
-            str: Formatted table as a string.
-        """
-        caption = ""
-        if "text" in item:
-            caption = item["text"]
-
-        data = item["data"]
-
-        if len(data) <= 1 or len(data[0]) <= 1:
-            return ""
-
-        table = []
-        for _, row in enumerate(data):
-            trow = []
-            for _, cell in enumerate(row):
-                trow.append(cell["text"])
-            table.append(trow)
-
-        table_text = tabulate(table, tablefmt="github")
-        if caption:
-            table_text += f"\nCaption: {caption}\n"
-        return table_text
-
-    def get_table(self, json_book, table_ref):
-        """
-        Retrieve a table from a document based on a reference string.
-        Args:
-            json_book (dict): JSON representation of the document.
-            table_ref (str): Reference path to the table within the document.
-        Returns:
-            str: Formatted table string.
-        """
-        parts = table_ref.split("/")
-        table_text = self.generate_table_from_parsed_rep(
-            json_book[parts[1]][int(parts[2])]
-        )
-        return table_text
-
-    def get_table_page_number(self, json_book, idx):
-        """
-        Get the page number of a table or other document element.
-        Args:
-            json_book (dict): JSON representation of the document.
-            idx (int): Index of the element in the document.
-        Returns:
-            int: Page number of the element.
-        """
-        prev_page_num, next_page_num = None, None
-        for book_element in json_book["main-text"][idx - 1 :: -1]:
-            if "prov" in book_element:
-                prev_page_num = book_element["prov"][0]["page"]
-                break
-        for book_element in json_book["main-text"][idx:]:
-            if "prov" in book_element and book_element["prov"]:
-                next_page_num = book_element["prov"][0]["page"]
-                break
-        if prev_page_num is not None and next_page_num is not None:
-            if prev_page_num == next_page_num:
-                return prev_page_num
-            return next_page_num
-        if prev_page_num is not None:
-            return prev_page_num
-        if next_page_num is not None:
-            return next_page_num
-
-    def build_chunks_from_docling_json(
-        self,
-        json_book,
-        max_token_per_chunk,
-        tokenizer,
-        keep_same_page_thing_together=False,
-        chunking_criteria=None,
-    ):
-        """
-        Build document chunks from a docling JSON representation.
-        Args:
-            json_book (dict): JSON document to process.
-            max_token_per_chunk (int): Maximum token count per chunk.
-            tokenizer (AutoTokenizer): Tokenizer instance to use.
-            keep_same_page_thing_together (bool): Whether to keep content on the same page together.
-            chunking_criteria (callable): Custom function for determining chunk breaks.
-        Returns:
-            list: List of document chunks.
-        """
-        current_buffer = []
-        document_chunks = []
-        prev_page_number = None
-        book_title = None
-
-        for idx, book_element in enumerate(json_book["main-text"]):
-            if book_element["type"] in [
-                "page-footer",
-                "picture",
-                "reference",
-                "meta-data",
-                "figure",
-                "page-header",
-            ]:
-                continue
-            if book_element["type"] == "footnote":
-                current_book_page_number = book_element["prov"][0]["page"]
-            elif book_element["type"] in [
-                "subtitle-level-1",
-                "paragraph",
-                "table",
-                "title",
-                "equation",
-            ]:  # 'page-header',
-                if book_element["type"] == "table":
-                    current_book_page_number = self.get_table_page_number(
-                        json_book, idx
-                    )
-                    book_text = self.get_table(json_book, book_element["$ref"])
-                elif book_element["prov"]:
-                    current_book_page_number = book_element["prov"][0][
-                        "page"
-                    ]  # TODO export to function to handle empty ["prov"]
-                    book_text = book_element["text"]
-                else:
-                    current_book_page_number = None
-                    book_text = book_element["text"]
-
-                if book_element["type"] == "subtitle-level-1":
-                    if book_title is None:
-                        book_title = book_text
-                        book_text = f"# Title: **{book_text}**"
-                    else:
-                        book_text = f"## **{book_text}**"
-
-                if book_element["type"] == "title":
-                    book_text = f"# **{book_text}**"
-                if book_element["type"] == "page-header":
-                    book_text = f"Page Header: **{book_text}**\n\n"
-
-                if chunking_criteria is not None:
-                    # custom break function that can be used to chunk document
-                    if chunking_criteria(book_text):
-                        document_chunks.append("\n\n".join(current_buffer))
-                        current_buffer = []
-                elif (
-                    prev_page_number is not None
-                    and prev_page_number != current_book_page_number
-                ) and keep_same_page_thing_together:
-                    document_chunks.append("\n\n".join(current_buffer))
-                    current_buffer = []
-                else:
-                    if (
-                        self.get_token_count("\n\n".join(current_buffer), tokenizer)
-                        >= max_token_per_chunk
-                        and len(current_buffer) > 1
-                    ):
-                        chunk_text = "\n\n".join(current_buffer[:-1])
-                        logger.debug(
-                            f"Current chunk size {self.get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}"
-                        )
-
-                        document_chunks.append("\n\n".join(current_buffer[:-1]))
-
-                        if (
-                            self.get_token_count(current_buffer[-1], tokenizer)
-                            >= max_token_per_chunk
-                        ):
-                            logger.debug(
-                                f"The following text was dropped from the document because it was too long to fit into a single context for synthetic data generation: {current_buffer[-1]}"
-                            )
-                            document_chunks.append(current_buffer[-1])
-                            current_buffer = []
-                        else:
-                            current_buffer = current_buffer[-1:]
-
-                if book_element["type"] == "paragraph":
-                    book_text = self.add_heading_formatting(book_text)
-                if "## References" in book_text or "## Acknowledgements" in book_text:
-                    # For research papers we ignore everything after this sections
-                    break
-                current_buffer.append(book_text)
-
-            try:
-                prev_page_number = current_book_page_number
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                logger.error(f"Error processing book element: {book_element}, {str(e)}")
-
-        if "\n\n".join(current_buffer) not in document_chunks:
-            document_chunks.append("\n\n".join(current_buffer))
-        return document_chunks
-
     def export_documents(self, converted_docs: Iterable[ConversionResult]):
         """Write converted documents to json files

src/instructlab/sdg/utils/taxonomy.py (+52, -49)
@@ -13,7 +13,7 @@
 from datasets import Dataset
 
 # pylint: disable=no-name-in-module
-from docling_parse.docling_parse import pdf_parser_v1
+from docling_parse.pdf_parsers import pdf_parser_v2
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -28,7 +28,7 @@
 from .chunkers import DocumentChunker
 
 # Initialize the pdf parser
-PDFParser = pdf_parser_v1()
+PDFParser = pdf_parser_v2("error")
 
 logger = logging.getLogger(__name__)

@@ -122,13 +122,55 @@ def _string_contains_html(s: str) -> bool:
     return bool(html_tag_pattern.search(s))
 
 
+def extract_text_from_pdf(file_path: str) -> str:
+    """
+    Extracts text from a PDF file using the docling PdfParser API.
+
+    Args:
+        file_path (str): Path to the PDF file.
+
+    Returns:
+        str: The extracted text from all pages.
+    """
+    pdf_text = ""
+    # Load the PDF document in lazy mode. (Do not pass a doc_key here.)
+    pdf_doc = PDFParser.load(path_or_stream=file_path, lazy=True)
+    if pdf_doc is None:
+        logger.error(f"Failed to load PDF: {file_path}")
+        return ""
+
+    num_pages = pdf_doc.number_of_pages()
+    logger.info(f"PDF '{file_path}' has {num_pages} pages.")
+
+    # Note: The high-level API expects page numbers to be 1-indexed.
+    for page_no in range(1, num_pages + 1):
+        try:
+            pdf_page = pdf_doc.get_page(page_no=page_no)
+            text_lines = pdf_page.sanitized.export_to_textlines(
+                add_fontkey=True, add_fontname=False
+            )
+            page_text = "\n".join(text_lines)
+            pdf_text += page_text + "\n"
+        except Exception as e:
+            logger.warning(
+                f"Error extracting text from page {page_no} of '{file_path}': {e}"
+            )
+            continue
+
+    # Unload the document to free memory
+    PDFParser.unload_document(file_path)
+    logger.info(f"Unloaded PDF document: {file_path}")
+
+    return pdf_text
+
+
 def _get_documents(
     source: Dict[str, Union[str, List[str]]],
     skip_checkout: bool = False,
     document_output_dir: Path = None,
 ) -> Tuple[List[str], List[Path]]:
     """
-    Retrieve the content of files (Markdown and PDF) from a Git repository.
+    Retrieve the content of files (Markdown and PDFs) from a Git repository.
 
     Args:
         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
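As a usage illustration (not part of the commit), the new module-level helper can be called directly once taxonomy.py is imported; the PDF path below is hypothetical.

# Hypothetical usage of the extract_text_from_pdf() helper added above.
from instructlab.sdg.utils.taxonomy import extract_text_from_pdf

text = extract_text_from_pdf("docs/sample.pdf")  # hypothetical path
if text:
    print(f"Extracted {len(text.splitlines())} lines of text")
else:
    print("No text could be extracted")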
@@ -186,55 +228,16 @@ def _get_documents(
                                 )
 
                         elif file_path.lower().endswith(".pdf"):
-                            # Process PDF files using docling_parse's pdf_parser_v1
-                            doc_key = f"key_{os.path.basename(file_path)}"  # Unique document key
-                            logger.info(f"Loading PDF document from {file_path}")
-
-                            success = PDFParser.load_document(doc_key, file_path)
-                            if not success:
-                                logger.warning(
-                                    f"Failed to load PDF document: {file_path}"
-                                )
-                                continue
-
-                            num_pages = PDFParser.number_of_pages(doc_key)
-                            logger.info(f"PDF '{file_path}' has {num_pages} pages.")
-
-                            pdf_text = ""
-
-                            for page in range(num_pages):
-                                try:
-                                    json_doc = PDFParser.parse_pdf_from_key_on_page(
-                                        doc_key, page
-                                    )
-                                    if "pages" not in json_doc or not json_doc["pages"]:
-                                        logger.warning(
-                                            f"Page {page + 1} could not be parsed in '{file_path}'"
-                                        )
-                                        continue
-
-                                    json_page = json_doc["pages"][0]
-
-                                    # Extract text from cells
-                                    for cell in json_page.get("cells", []):
-                                        text = cell.get("content", {}).get(
-                                            "rnormalized", ""
-                                        )
-                                        if text.strip():  # Only append non-empty text
-                                            pdf_text += text.strip() + "\n"
-                                except Exception as page_error:  # pylint: disable=broad-exception-caught
-                                    logger.warning(
-                                        f"Error parsing page {page + 1} of '{file_path}': {page_error}"
-                                    )
-                                    continue
-
+                            # Process PDF files using docling_parse's pdf_parser_v2
+                            logger.info(f"Extracting text from PDF file: {file_path}")
+                            pdf_text = extract_text_from_pdf(file_path)
                             if pdf_text:
                                 file_contents.append(pdf_text)
                                 filepaths.append(Path(file_path))
-
-                            # Unload the document to free memory
-                            PDFParser.unload_document(doc_key)
-                            logger.info(f"Unloaded PDF document: {file_path}")
+                            else:
+                                logger.warning(
+                                    f"PDF file {file_path} could not be processed"
+                                )
 
                         else:
                             logger.info(f"Skipping unsupported file type: {file_path}")
