Skip to content

Commit f1daa17

Browse files
committed
fix: set origin when merging chunks
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 2591c70 commit f1daa17

File tree

3 files changed

+33
-6
lines changed

3 files changed

+33
-6
lines changed

docling_core/transforms/chunker/hybrid_chunker.py

+2
Original file line number | Diff line number | Diff line change
@@ -98,6 +98,7 @@ def _make_chunk_from_doc_items(
9898
doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
9999
headings=doc_chunk.meta.headings,
100100
captions=doc_chunk.meta.captions,
101+
origin=doc_chunk.meta.origin,
101102
)
102103
new_chunk = DocChunk(text=window_text, meta=meta)
103104
return new_chunk
@@ -244,6 +245,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
244245
doc_items=window_items,
245246
headings=current_headings_and_captions[0],
246247
captions=current_headings_and_captions[1],
248+
origin=chunk.meta.origin,
247249
)
248250
new_chunk = DocChunk(
249251
text=window_text,

test/data/chunker/2a_out_chunks.json

+19-4
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,12 @@
9595
],
9696
"headings": [
9797
"IBM"
98-
]
98+
],
99+
"origin": {
100+
"mimetype": "application/pdf",
101+
"binary_hash": 15535403176419637685,
102+
"filename": "wiki.pdf"
103+
}
99104
}
100105
},
101106
{
@@ -1000,7 +1005,12 @@
10001005
"IBM",
10011006
"Corporate affairs",
10021007
"Business trends"
1003-
]
1008+
],
1009+
"origin": {
1010+
"mimetype": "application/pdf",
1011+
"binary_hash": 15535403176419637685,
1012+
"filename": "wiki.pdf"
1013+
}
10041014
}
10051015
},
10061016
{
@@ -1062,8 +1072,13 @@
10621072
"IBM",
10631073
"Corporate affairs",
10641074
"Business trends"
1065-
]
1075+
],
1076+
"origin": {
1077+
"mimetype": "application/pdf",
1078+
"binary_hash": 15535403176419637685,
1079+
"filename": "wiki.pdf"
1080+
}
10661081
}
10671082
}
10681083
]
1069-
}
1084+
}

test/data/chunker/2c_out_chunks.json

+12-2
Original file line number | Diff line number | Diff line change
@@ -191,7 +191,12 @@
191191
],
192192
"headings": [
193193
"IBM"
194-
]
194+
],
195+
"origin": {
196+
"mimetype": "application/pdf",
197+
"binary_hash": 15535403176419637685,
198+
"filename": "wiki.pdf"
199+
}
195200
}
196201
},
197202
{
@@ -448,7 +453,12 @@
448453
"IBM",
449454
"Corporate affairs",
450455
"Business trends"
451-
]
456+
],
457+
"origin": {
458+
"mimetype": "application/pdf",
459+
"binary_hash": 15535403176419637685,
460+
"filename": "wiki.pdf"
461+
}
452462
}
453463
}
454464
]

0 commit comments

Comments (0)