From b546c0a50d11152f0ad65a1bc59e33478bc11052 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 13 Dec 2024 09:05:56 +0100 Subject: [PATCH] fix: set origin when merging chunks (#109) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- .../transforms/chunker/hybrid_chunker.py | 2 ++ test/data/chunker/2a_out_chunks.json | 23 +++++++++++++++---- test/data/chunker/2c_out_chunks.json | 14 +++++++++-- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/docling_core/transforms/chunker/hybrid_chunker.py b/docling_core/transforms/chunker/hybrid_chunker.py index fcf9872..5f81d2e 100644 --- a/docling_core/transforms/chunker/hybrid_chunker.py +++ b/docling_core/transforms/chunker/hybrid_chunker.py @@ -98,6 +98,7 @@ def _make_chunk_from_doc_items( doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1], headings=doc_chunk.meta.headings, captions=doc_chunk.meta.captions, + origin=doc_chunk.meta.origin, ) new_chunk = DocChunk(text=window_text, meta=meta) return new_chunk @@ -244,6 +245,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): doc_items=window_items, headings=current_headings_and_captions[0], captions=current_headings_and_captions[1], + origin=chunk.meta.origin, ) new_chunk = DocChunk( text=window_text, diff --git a/test/data/chunker/2a_out_chunks.json b/test/data/chunker/2a_out_chunks.json index c70e6c3..a0c7099 100644 --- a/test/data/chunker/2a_out_chunks.json +++ b/test/data/chunker/2a_out_chunks.json @@ -95,7 +95,12 @@ ], "headings": [ "IBM" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 15535403176419637685, + "filename": "wiki.pdf" + } } }, { @@ -1000,7 +1005,12 @@ "IBM", "Corporate affairs", "Business trends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 15535403176419637685, + "filename": "wiki.pdf" + } } }, { @@ -1062,8 +1072,13 @@ "IBM", "Corporate affairs", "Business trends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 15535403176419637685, + "filename": "wiki.pdf" + } } } ] -} +} \ No newline at end of file diff --git a/test/data/chunker/2c_out_chunks.json b/test/data/chunker/2c_out_chunks.json index d197e36..4788e40 100644 --- a/test/data/chunker/2c_out_chunks.json +++ b/test/data/chunker/2c_out_chunks.json @@ -191,7 +191,12 @@ ], "headings": [ "IBM" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 15535403176419637685, + "filename": "wiki.pdf" + } } }, { @@ -448,7 +453,12 @@ "IBM", "Corporate affairs", "Business trends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 15535403176419637685, + "filename": "wiki.pdf" + } } } ]