Skip to content

Commit

Permalink
fix: set origin when merging chunks (#109)
Browse files (browse the repository at this point in the history)
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas authored Dec 13, 2024
1 parent 2591c70 commit b546c0a
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 6 deletions.
2 changes: 2 additions & 0 deletions docling_core/transforms/chunker/hybrid_chunker.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -98,6 +98,7 @@ def _make_chunk_from_doc_items(
doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
headings=doc_chunk.meta.headings,
captions=doc_chunk.meta.captions,
origin=doc_chunk.meta.origin,
)
new_chunk = DocChunk(text=window_text, meta=meta)
return new_chunk
Expand Down Expand Up @@ -244,6 +245,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
doc_items=window_items,
headings=current_headings_and_captions[0],
captions=current_headings_and_captions[1],
origin=chunk.meta.origin,
)
new_chunk = DocChunk(
text=window_text,
Expand Down
23 changes: 19 additions & 4 deletions test/data/chunker/2a_out_chunks.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -95,7 +95,12 @@
],
"headings": [
"IBM"
]
],
"origin": {
"mimetype": "application/pdf",
"binary_hash": 15535403176419637685,
"filename": "wiki.pdf"
}
}
},
{
Expand Down Expand Up @@ -1000,7 +1005,12 @@
"IBM",
"Corporate affairs",
"Business trends"
]
],
"origin": {
"mimetype": "application/pdf",
"binary_hash": 15535403176419637685,
"filename": "wiki.pdf"
}
}
},
{
Expand Down Expand Up @@ -1062,8 +1072,13 @@
"IBM",
"Corporate affairs",
"Business trends"
]
],
"origin": {
"mimetype": "application/pdf",
"binary_hash": 15535403176419637685,
"filename": "wiki.pdf"
}
}
}
]
}
}
14 changes: 12 additions & 2 deletions test/data/chunker/2c_out_chunks.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -191,7 +191,12 @@
],
"headings": [
"IBM"
]
],
"origin": {
"mimetype": "application/pdf",
"binary_hash": 15535403176419637685,
"filename": "wiki.pdf"
}
}
},
{
Expand Down Expand Up @@ -448,7 +453,12 @@
"IBM",
"Corporate affairs",
"Business trends"
]
],
"origin": {
"mimetype": "application/pdf",
"binary_hash": 15535403176419637685,
"filename": "wiki.pdf"
}
}
}
]
Expand Down

0 comments on commit b546c0a

Please sign in to comment.