
Commit d97fb71

Committed Feb 12, 2025
Update PDF extraction and OCR options for hybrid chunking
Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
1 parent: e2296fb · commit: d97fb71

File tree

2 files changed: +63, -264 lines

 

src/instructlab/sdg/utils/chunkers.py (+11, -215)
@@ -8,9 +8,11 @@
 
 # Third Party
 from datasets import Dataset
+from docling.chunking import HybridChunker
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
@@ -57,7 +59,10 @@ def resolve_ocr_options() -> OcrOptions:
         # Third Party
         from docling.models.easyocr_model import EasyOcrModel
 
-        _ = EasyOcrModel(True, ocr_options)
+        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
+        _ = EasyOcrModel(
+            True, ocr_options, None, accelerator_options=accelerator_options
+        )
         return ocr_options
     except ImportError:
         # no easyocr either, so don't use any OCR
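For context, the snippet below is a minimal, hypothetical sketch (not part of the commit) of how the new AcceleratorOptions value feeds the EasyOCR availability probe that resolve_ocr_options() performs. The num_threads and device values are placeholders, and exact constructor arguments may differ between docling releases.

# Hypothetical sketch mirroring the change above; values are placeholders.
from docling.datamodel.pipeline_options import AcceleratorOptions, EasyOcrOptions

ocr_options = EasyOcrOptions()
accelerator_options = AcceleratorOptions(num_threads=4, device="auto")  # assumed values

try:
    # Third Party
    from docling.models.easyocr_model import EasyOcrModel

    # Instantiating the model up front surfaces missing EasyOCR dependencies early,
    # which is the purpose of the probe in resolve_ocr_options() above.
    _ = EasyOcrModel(True, ocr_options, None, accelerator_options=accelerator_options)
except ImportError:
    ocr_options = None  # fall back to no OCR, as the surrounding function does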
@@ -185,11 +190,11 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
         with open(json_fp, "r", encoding="utf-8") as f:
             data = json.load(f)
 
-        chunks = self.build_chunks_from_docling_json(
-            data,
-            max_token_per_chunk=500,
-            tokenizer=self.tokenizer,
-        )
+        chunker = HybridChunker(tokenizer=self.tokenizer, max_token_per_chunk=500)
+        chunk_iter = chunker.chunk(
+            dl_doc=data
+        )  # Use hybrid chunker to chunk the document
+        chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter]
         fused_texts = self.fuse_texts(chunks, 200)
 
         num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
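For reference, here is a minimal sketch (not part of the commit) of driving docling's HybridChunker end to end. The input file, tokenizer id, and the serialize call are assumptions; parameter names (e.g. max_tokens versus the max_token_per_chunk used above) and method names vary across docling versions.

# Hypothetical sketch of HybridChunker usage on a freshly converted document.
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("sample.pdf").document  # hypothetical input file
chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5", max_tokens=500)  # assumed args
for chunk in chunker.chunk(dl_doc=doc):
    # serialize() renders a chunk back to plain text; some docling-core versions
    # expose this as contextualize() instead.
    print(chunker.serialize(chunk=chunk)[:80])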
@@ -288,215 +293,6 @@ def get_token_count(self, text, tokenizer):
         """
         return len(tokenizer.tokenize(text))
 
-    def add_heading_formatting(self, text):
-        """
-        Add heading formatting to the text if the first part is short.
-        Args:
-            text (str): The input text to format.
-        Returns:
-            str: Formatted text with headings applied.
-        """
-        text = text.split(".")
-
-        # Change this from hardcoded to something more flexible
-        if len(text) > 1 and len(text[0].split(" ")) < 3:
-            text = f"**{text[0]}**" + ".".join(text[1:])
-        else:
-            text = ".".join(text)
-        return text
-
-    def generate_table_from_parsed_rep(self, item):
-        """
-        Generate the table from the parsed representation and return as a string.
-        Args:
-            item (dict): Parsed representation of a table.
-        Returns:
-            str: Formatted table as a string.
-        """
-        caption = ""
-        if "text" in item:
-            caption = item["text"]
-
-        data = item["data"]
-
-        if len(data) <= 1 or len(data[0]) <= 1:
-            return ""
-
-        table = []
-        for _, row in enumerate(data):
-            trow = []
-            for _, cell in enumerate(row):
-                trow.append(cell["text"])
-            table.append(trow)
-
-        table_text = tabulate(table, tablefmt="github")
-        if caption:
-            table_text += f"\nCaption: {caption}\n"
-        return table_text
-
-    def get_table(self, json_book, table_ref):
-        """
-        Retrieve a table from a document based on a reference string.
-        Args:
-            json_book (dict): JSON representation of the document.
-            table_ref (str): Reference path to the table within the document.
-        Returns:
-            str: Formatted table string.
-        """
-        parts = table_ref.split("/")
-        table_text = self.generate_table_from_parsed_rep(
-            json_book[parts[1]][int(parts[2])]
-        )
-        return table_text
-
-    def get_table_page_number(self, json_book, idx):
-        """
-        Get the page number of a table or other document element.
-        Args:
-            json_book (dict): JSON representation of the document.
-            idx (int): Index of the element in the document.
-        Returns:
-            int: Page number of the element.
-        """
-        prev_page_num, next_page_num = None, None
-        for book_element in json_book["main-text"][idx - 1 :: -1]:
-            if "prov" in book_element:
-                prev_page_num = book_element["prov"][0]["page"]
-                break
-        for book_element in json_book["main-text"][idx:]:
-            if "prov" in book_element and book_element["prov"]:
-                next_page_num = book_element["prov"][0]["page"]
-                break
-        if prev_page_num is not None and next_page_num is not None:
-            if prev_page_num == next_page_num:
-                return prev_page_num
-            return next_page_num
-        if prev_page_num is not None:
-            return prev_page_num
-        if next_page_num is not None:
-            return next_page_num
-
-    def build_chunks_from_docling_json(
-        self,
-        json_book,
-        max_token_per_chunk,
-        tokenizer,
-        keep_same_page_thing_together=False,
-        chunking_criteria=None,
-    ):
-        """
-        Build document chunks from a docling JSON representation.
-        Args:
-            json_book (dict): JSON document to process.
-            max_token_per_chunk (int): Maximum token count per chunk.
-            tokenizer (AutoTokenizer): Tokenizer instance to use.
-            keep_same_page_thing_together (bool): Whether to keep content on the same page together.
-            chunking_criteria (callable): Custom function for determining chunk breaks.
-        Returns:
-            list: List of document chunks.
-        """
-        current_buffer = []
-        document_chunks = []
-        prev_page_number = None
-        book_title = None
-
-        for idx, book_element in enumerate(json_book["main-text"]):
-            if book_element["type"] in [
-                "page-footer",
-                "picture",
-                "reference",
-                "meta-data",
-                "figure",
-                "page-header",
-            ]:
-                continue
-            if book_element["type"] == "footnote":
-                current_book_page_number = book_element["prov"][0]["page"]
-            elif book_element["type"] in [
-                "subtitle-level-1",
-                "paragraph",
-                "table",
-                "title",
-                "equation",
-            ]:  # 'page-header',
-                if book_element["type"] == "table":
-                    current_book_page_number = self.get_table_page_number(
-                        json_book, idx
-                    )
-                    book_text = self.get_table(json_book, book_element["$ref"])
-                elif book_element["prov"]:
-                    current_book_page_number = book_element["prov"][0][
-                        "page"
-                    ]  # TODO export to function to handle empty ["prov"]
-                    book_text = book_element["text"]
-                else:
-                    current_book_page_number = None
-                    book_text = book_element["text"]
-
-                if book_element["type"] == "subtitle-level-1":
-                    if book_title is None:
-                        book_title = book_text
-                        book_text = f"# Title: **{book_text}**"
-                    else:
-                        book_text = f"## **{book_text}**"
-
-                if book_element["type"] == "title":
-                    book_text = f"# **{book_text}**"
-                if book_element["type"] == "page-header":
-                    book_text = f"Page Header: **{book_text}**\n\n"
-
-                if chunking_criteria is not None:
-                    # custom break function that can be used to chunk document
-                    if chunking_criteria(book_text):
-                        document_chunks.append("\n\n".join(current_buffer))
-                        current_buffer = []
-                elif (
-                    prev_page_number is not None
-                    and prev_page_number != current_book_page_number
-                ) and keep_same_page_thing_together:
-                    document_chunks.append("\n\n".join(current_buffer))
-                    current_buffer = []
-                else:
-                    if (
-                        self.get_token_count("\n\n".join(current_buffer), tokenizer)
-                        >= max_token_per_chunk
-                        and len(current_buffer) > 1
-                    ):
-                        chunk_text = "\n\n".join(current_buffer[:-1])
-                        logger.debug(
-                            f"Current chunk size {self.get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}"
-                        )
-
-                        document_chunks.append("\n\n".join(current_buffer[:-1]))
-
-                        if (
-                            self.get_token_count(current_buffer[-1], tokenizer)
-                            >= max_token_per_chunk
-                        ):
-                            logger.debug(
-                                f"The following text was dropped from the document because it was too long to fit into a single context for synthetic data generation: {current_buffer[-1]}"
-                            )
-                            document_chunks.append(current_buffer[-1])
-                            current_buffer = []
-                        else:
-                            current_buffer = current_buffer[-1:]
-
-                if book_element["type"] == "paragraph":
-                    book_text = self.add_heading_formatting(book_text)
-                if "## References" in book_text or "## Acknowledgements" in book_text:
-                    # For research papers we ignore everything after this sections
-                    break
-                current_buffer.append(book_text)
-
-            try:
-                prev_page_number = current_book_page_number
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                logger.error(f"Error processing book element: {book_element}, {str(e)}")
-
-        if "\n\n".join(current_buffer) not in document_chunks:
-            document_chunks.append("\n\n".join(current_buffer))
-        return document_chunks
-
     def export_documents(self, converted_docs: Iterable[ConversionResult]):
         """Write converted documents to json files

src/instructlab/sdg/utils/taxonomy.py (+52, -49)
@@ -13,7 +13,7 @@
 from datasets import Dataset
 
 # pylint: disable=no-name-in-module
-from docling_parse.docling_parse import pdf_parser_v1
+from docling_parse.pdf_parsers import pdf_parser_v2
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -28,7 +28,7 @@
 from .chunkers import DocumentChunker
 
 # Initialize the pdf parser
-PDFParser = pdf_parser_v1()
+PDFParser = pdf_parser_v2("error")
 
 logger = logging.getLogger(__name__)

@@ -122,13 +122,55 @@ def _string_contains_html(s: str) -> bool:
     return bool(html_tag_pattern.search(s))
 
 
+def extract_text_from_pdf(file_path: str) -> str:
+    """
+    Extracts text from a PDF file using the docling PdfParser API.
+
+    Args:
+        file_path (str): Path to the PDF file.
+
+    Returns:
+        str: The extracted text from all pages.
+    """
+    pdf_text = ""
+    # Load the PDF document in lazy mode. (Do not pass a doc_key here.)
+    pdf_doc = PDFParser.load(path_or_stream=file_path, lazy=True)
+    if pdf_doc is None:
+        logger.error(f"Failed to load PDF: {file_path}")
+        return ""
+
+    num_pages = pdf_doc.number_of_pages()
+    logger.info(f"PDF '{file_path}' has {num_pages} pages.")
+
+    # Note: The high-level API expects page numbers to be 1-indexed.
+    for page_no in range(1, num_pages + 1):
+        try:
+            pdf_page = pdf_doc.get_page(page_no=page_no)
+            text_lines = pdf_page.sanitized.export_to_textlines(
+                add_fontkey=True, add_fontname=False
+            )
+            page_text = "\n".join(text_lines)
+            pdf_text += page_text + "\n"
+        except Exception as e:
+            logger.warning(
+                f"Error extracting text from page {page_no} of '{file_path}': {e}"
+            )
+            continue
+
+    # Unload the document to free memory
+    PDFParser.unload_document(file_path)
+    logger.info(f"Unloaded PDF document: {file_path}")
+
+    return pdf_text
+
+
 def _get_documents(
     source: Dict[str, Union[str, List[str]]],
     skip_checkout: bool = False,
     document_output_dir: Path = None,
 ) -> Tuple[List[str], List[Path]]:
     """
-    Retrieve the content of files (Markdown and PDF) from a Git repository.
+    Retrieve the content of files (Markdown and PDFs) from a Git repository.
 
     Args:
         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
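As a usage illustration (not part of the commit), the new module-level helper can be called directly once taxonomy.py is imported; the PDF path below is hypothetical.

# Hypothetical usage of the extract_text_from_pdf() helper added above.
from instructlab.sdg.utils.taxonomy import extract_text_from_pdf

text = extract_text_from_pdf("docs/sample.pdf")  # hypothetical path
if text:
    print(f"Extracted {len(text.splitlines())} lines of text")
else:
    print("No text could be extracted")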
@@ -186,55 +228,16 @@ def _get_documents(
                                 )
 
                         elif file_path.lower().endswith(".pdf"):
-                            # Process PDF files using docling_parse's pdf_parser_v1
-                            doc_key = f"key_{os.path.basename(file_path)}"  # Unique document key
-                            logger.info(f"Loading PDF document from {file_path}")
-
-                            success = PDFParser.load_document(doc_key, file_path)
-                            if not success:
-                                logger.warning(
-                                    f"Failed to load PDF document: {file_path}"
-                                )
-                                continue
-
-                            num_pages = PDFParser.number_of_pages(doc_key)
-                            logger.info(f"PDF '{file_path}' has {num_pages} pages.")
-
-                            pdf_text = ""
-
-                            for page in range(num_pages):
-                                try:
-                                    json_doc = PDFParser.parse_pdf_from_key_on_page(
-                                        doc_key, page
-                                    )
-                                    if "pages" not in json_doc or not json_doc["pages"]:
-                                        logger.warning(
-                                            f"Page {page + 1} could not be parsed in '{file_path}'"
-                                        )
-                                        continue
-
-                                    json_page = json_doc["pages"][0]
-
-                                    # Extract text from cells
-                                    for cell in json_page.get("cells", []):
-                                        text = cell.get("content", {}).get(
-                                            "rnormalized", ""
-                                        )
-                                        if text.strip():  # Only append non-empty text
-                                            pdf_text += text.strip() + "\n"
-                                except Exception as page_error:  # pylint: disable=broad-exception-caught
-                                    logger.warning(
-                                        f"Error parsing page {page + 1} of '{file_path}': {page_error}"
-                                    )
-                                    continue
-
+                            # Process PDF files using docling_parse's pdf_parser_v2
+                            logger.info(f"Extracting text from PDF file: {file_path}")
+                            pdf_text = extract_text_from_pdf(file_path)
                             if pdf_text:
                                 file_contents.append(pdf_text)
                                 filepaths.append(Path(file_path))
-
-                            # Unload the document to free memory
-                            PDFParser.unload_document(doc_key)
-                            logger.info(f"Unloaded PDF document: {file_path}")
+                            else:
+                                logger.warning(
+                                    f"PDF file {file_path} could not be processed"
+                                )
 
                         else:
                             logger.info(f"Skipping unsupported file type: {file_path}")
