8 | 8 |
9 | 9 | # Third Party
10 | 10 | from datasets import Dataset
| 11 | +from docling.chunking import HybridChunker
11 | 12 | from docling.datamodel.base_models import InputFormat
12 | 13 | from docling.datamodel.document import ConversionResult
13 | 14 | from docling.datamodel.pipeline_options import (
| 15 | +    AcceleratorOptions,
14 | 16 |     EasyOcrOptions,
15 | 17 |     OcrOptions,
16 | 18 |     PdfPipelineOptions,
@@ -57,7 +59,10 @@ def resolve_ocr_options() -> OcrOptions:
57 | 59 |         # Third Party
58 | 60 |         from docling.models.easyocr_model import EasyOcrModel
59 | 61 |
60 | | -        _ = EasyOcrModel(True, ocr_options)
| 62 | +        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
| 63 | +        _ = EasyOcrModel(
| 64 | +            True, ocr_options, None, accelerator_options=accelerator_options
| 65 | +        )
61 | 66 |         return ocr_options
62 | 67 |     except ImportError:
63 | 68 |         # no easyocr either, so don't use any OCR
@@ -185,11 +190,11 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
185 | 190 |         with open(json_fp, "r", encoding="utf-8") as f:
186 | 191 |             data = json.load(f)
187 | 192 |
188 | | -        chunks = self.build_chunks_from_docling_json(
189 | | -            data,
190 | | -            max_token_per_chunk=500,
191 | | -            tokenizer=self.tokenizer,
192 | | -        )
| 193 | +        chunker = HybridChunker(tokenizer=self.tokenizer, max_token_per_chunk=500)
| 194 | +        chunk_iter = chunker.chunk(
| 195 | +            dl_doc=data
| 196 | +        )  # Use hybrid chunker to chunk the document
| 197 | +        chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter]
193 | 198 |         fused_texts = self.fuse_texts(chunks, 200)
194 | 199 |
195 | 200 |         num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
@@ -288,215 +293,6 @@ def get_token_count(self, text, tokenizer):
288 | 293 |         """
289 | 294 |         return len(tokenizer.tokenize(text))
290 | 295 |
291 | | -    def add_heading_formatting(self, text):
292 | | -        """
293 | | -        Add heading formatting to the text if the first part is short.
294 | | -        Args:
295 | | -            text (str): The input text to format.
296 | | -        Returns:
297 | | -            str: Formatted text with headings applied.
298 | | -        """
299 | | -        text = text.split(".")
300 | | -
301 | | -        # Change this from hardcoded to something more flexible
302 | | -        if len(text) > 1 and len(text[0].split(" ")) < 3:
303 | | -            text = f"**{text[0]}**" + ".".join(text[1:])
304 | | -        else:
305 | | -            text = ".".join(text)
306 | | -        return text
307 | | -
308 | | -    def generate_table_from_parsed_rep(self, item):
309 | | -        """
310 | | -        Generate the table from the parsed representation and return as a string.
311 | | -        Args:
312 | | -            item (dict): Parsed representation of a table.
313 | | -        Returns:
314 | | -            str: Formatted table as a string.
315 | | -        """
316 | | -        caption = ""
317 | | -        if "text" in item:
318 | | -            caption = item["text"]
319 | | -
320 | | -        data = item["data"]
321 | | -
322 | | -        if len(data) <= 1 or len(data[0]) <= 1:
323 | | -            return ""
324 | | -
325 | | -        table = []
326 | | -        for _, row in enumerate(data):
327 | | -            trow = []
328 | | -            for _, cell in enumerate(row):
329 | | -                trow.append(cell["text"])
330 | | -            table.append(trow)
331 | | -
332 | | -        table_text = tabulate(table, tablefmt="github")
333 | | -        if caption:
334 | | -            table_text += f"\nCaption: {caption}\n"
335 | | -        return table_text
336 | | -
337 | | -    def get_table(self, json_book, table_ref):
338 | | -        """
339 | | -        Retrieve a table from a document based on a reference string.
340 | | -        Args:
341 | | -            json_book (dict): JSON representation of the document.
342 | | -            table_ref (str): Reference path to the table within the document.
343 | | -        Returns:
344 | | -            str: Formatted table string.
345 | | -        """
346 | | -        parts = table_ref.split("/")
347 | | -        table_text = self.generate_table_from_parsed_rep(
348 | | -            json_book[parts[1]][int(parts[2])]
349 | | -        )
350 | | -        return table_text
351 | | -
352 | | -    def get_table_page_number(self, json_book, idx):
353 | | -        """
354 | | -        Get the page number of a table or other document element.
355 | | -        Args:
356 | | -            json_book (dict): JSON representation of the document.
357 | | -            idx (int): Index of the element in the document.
358 | | -        Returns:
359 | | -            int: Page number of the element.
360 | | -        """
361 | | -        prev_page_num, next_page_num = None, None
362 | | -        for book_element in json_book["main-text"][idx - 1 :: -1]:
363 | | -            if "prov" in book_element:
364 | | -                prev_page_num = book_element["prov"][0]["page"]
365 | | -                break
366 | | -        for book_element in json_book["main-text"][idx:]:
367 | | -            if "prov" in book_element and book_element["prov"]:
368 | | -                next_page_num = book_element["prov"][0]["page"]
369 | | -                break
370 | | -        if prev_page_num is not None and next_page_num is not None:
371 | | -            if prev_page_num == next_page_num:
372 | | -                return prev_page_num
373 | | -            return next_page_num
374 | | -        if prev_page_num is not None:
375 | | -            return prev_page_num
376 | | -        if next_page_num is not None:
377 | | -            return next_page_num
378 | | -
379 | | -    def build_chunks_from_docling_json(
380 | | -        self,
381 | | -        json_book,
382 | | -        max_token_per_chunk,
383 | | -        tokenizer,
384 | | -        keep_same_page_thing_together=False,
385 | | -        chunking_criteria=None,
386 | | -    ):
387 | | -        """
388 | | -        Build document chunks from a docling JSON representation.
389 | | -        Args:
390 | | -            json_book (dict): JSON document to process.
391 | | -            max_token_per_chunk (int): Maximum token count per chunk.
392 | | -            tokenizer (AutoTokenizer): Tokenizer instance to use.
393 | | -            keep_same_page_thing_together (bool): Whether to keep content on the same page together.
394 | | -            chunking_criteria (callable): Custom function for determining chunk breaks.
395 | | -        Returns:
396 | | -            list: List of document chunks.
397 | | -        """
398 | | -        current_buffer = []
399 | | -        document_chunks = []
400 | | -        prev_page_number = None
401 | | -        book_title = None
402 | | -
403 | | -        for idx, book_element in enumerate(json_book["main-text"]):
404 | | -            if book_element["type"] in [
405 | | -                "page-footer",
406 | | -                "picture",
407 | | -                "reference",
408 | | -                "meta-data",
409 | | -                "figure",
410 | | -                "page-header",
411 | | -            ]:
412 | | -                continue
413 | | -            if book_element["type"] == "footnote":
414 | | -                current_book_page_number = book_element["prov"][0]["page"]
415 | | -            elif book_element["type"] in [
416 | | -                "subtitle-level-1",
417 | | -                "paragraph",
418 | | -                "table",
419 | | -                "title",
420 | | -                "equation",
421 | | -            ]:  # 'page-header',
422 | | -                if book_element["type"] == "table":
423 | | -                    current_book_page_number = self.get_table_page_number(
424 | | -                        json_book, idx
425 | | -                    )
426 | | -                    book_text = self.get_table(json_book, book_element["$ref"])
427 | | -                elif book_element["prov"]:
428 | | -                    current_book_page_number = book_element["prov"][0][
429 | | -                        "page"
430 | | -                    ]  # TODO export to function to handle empty ["prov"]
431 | | -                    book_text = book_element["text"]
432 | | -                else:
433 | | -                    current_book_page_number = None
434 | | -                    book_text = book_element["text"]
435 | | -
436 | | -                if book_element["type"] == "subtitle-level-1":
437 | | -                    if book_title is None:
438 | | -                        book_title = book_text
439 | | -                        book_text = f"# Title: **{book_text}**"
440 | | -                    else:
441 | | -                        book_text = f"## **{book_text}**"
442 | | -
443 | | -                if book_element["type"] == "title":
444 | | -                    book_text = f"# **{book_text}**"
445 | | -                if book_element["type"] == "page-header":
446 | | -                    book_text = f"Page Header: **{book_text}**\n\n"
447 | | -
448 | | -                if chunking_criteria is not None:
449 | | -                    # custom break function that can be used to chunk document
450 | | -                    if chunking_criteria(book_text):
451 | | -                        document_chunks.append("\n\n".join(current_buffer))
452 | | -                        current_buffer = []
453 | | -                elif (
454 | | -                    prev_page_number is not None
455 | | -                    and prev_page_number != current_book_page_number
456 | | -                ) and keep_same_page_thing_together:
457 | | -                    document_chunks.append("\n\n".join(current_buffer))
458 | | -                    current_buffer = []
459 | | -                else:
460 | | -                    if (
461 | | -                        self.get_token_count("\n\n".join(current_buffer), tokenizer)
462 | | -                        >= max_token_per_chunk
463 | | -                        and len(current_buffer) > 1
464 | | -                    ):
465 | | -                        chunk_text = "\n\n".join(current_buffer[:-1])
466 | | -                        logger.debug(
467 | | -                            f"Current chunk size {self.get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}"
468 | | -                        )
469 | | -
470 | | -                        document_chunks.append("\n\n".join(current_buffer[:-1]))
471 | | -
472 | | -                        if (
473 | | -                            self.get_token_count(current_buffer[-1], tokenizer)
474 | | -                            >= max_token_per_chunk
475 | | -                        ):
476 | | -                            logger.debug(
477 | | -                                f"The following text was dropped from the document because it was too long to fit into a single context for synthetic data generation: {current_buffer[-1]}"
478 | | -                            )
479 | | -                            document_chunks.append(current_buffer[-1])
480 | | -                            current_buffer = []
481 | | -                        else:
482 | | -                            current_buffer = current_buffer[-1:]
483 | | -
484 | | -                if book_element["type"] == "paragraph":
485 | | -                    book_text = self.add_heading_formatting(book_text)
486 | | -                if "## References" in book_text or "## Acknowledgements" in book_text:
487 | | -                    # For research papers we ignore everything after this sections
488 | | -                    break
489 | | -                current_buffer.append(book_text)
490 | | -
491 | | -            try:
492 | | -                prev_page_number = current_book_page_number
493 | | -            except Exception as e:  # pylint: disable=broad-exception-caught
494 | | -                logger.error(f"Error processing book element: {book_element}, {str(e)}")
495 | | -
496 | | -        if "\n\n".join(current_buffer) not in document_chunks:
497 | | -            document_chunks.append("\n\n".join(current_buffer))
498 | | -        return document_chunks
499 | | -
500 | 296 |     def export_documents(self, converted_docs: Iterable[ConversionResult]):
501 | 297 |         """Write converted documents to json files
502 | 298 |
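
Note on the first hunk: it wires docling's AcceleratorOptions into the EasyOCR availability check, so the warm-up instantiation of EasyOcrModel honors the configured thread count and device. For context, the lines below are a minimal sketch of how the same options are normally attached to a docling PDF pipeline rather than passed to EasyOcrModel directly; it assumes docling's documented pipeline-options API, and the thread count, device, and file name are illustrative values, not taken from this commit.

# Minimal sketch: configuring docling OCR acceleration via pipeline options.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    EasyOcrOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

# Illustrative settings; AcceleratorDevice.AUTO lets docling pick CPU/GPU.
accelerator_options = AcceleratorOptions(num_threads=4, device=AcceleratorDevice.AUTO)

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = EasyOcrOptions()
pipeline_options.accelerator_options = accelerator_options

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("sample.pdf")  # hypothetical input file
doc = result.document  # DoclingDocument used by the chunking sketch below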
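Note on the second and third hunks: they drop the hand-rolled build_chunks_from_docling_json path in favor of docling's HybridChunker. The lines below are a minimal standalone sketch of that chunking flow; they assume the DoclingDocument produced in the sketch above, use an illustrative Hugging Face tokenizer, and read chunk.text directly, whereas the commit serializes each chunk and also passes a 500-token cap (max_token_per_chunk) to the chunker.

# Minimal sketch of the HybridChunker flow that replaces the removed custom chunker.
from transformers import AutoTokenizer

from docling.chunking import HybridChunker

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative model
chunker = HybridChunker(tokenizer=tokenizer)  # commit also sets a per-chunk token cap

chunk_iter = chunker.chunk(dl_doc=doc)  # doc: DoclingDocument from the converter above
chunks = [chunk.text for chunk in chunk_iter]  # plain text; the commit serializes chunks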