Skip to content

Commit e571e31

Browse files
committed
Merge branch 'main' into feat-support-google-ocr
2 parents aae5167 + ead396a commit e571e31

File tree

126 files changed

+88667
-2245
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

126 files changed

+88667
-2245
lines changed

CHANGELOG.md

+19
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,22 @@
1+
## [v2.14.0](https://github.com/DS4SD/docling/releases/tag/v2.14.0) - 2024-12-18
2+
3+
### Feature
4+
5+
* Create a backend to transform PubMed XML files to DoclingDocument ([#557](https://github.com/DS4SD/docling/issues/557)) ([`fd03480`](https://github.com/DS4SD/docling/commit/fd034802b65a0e567531b8ecc9a283aaf030e050))
6+
7+
## [v2.13.0](https://github.com/DS4SD/docling/releases/tag/v2.13.0) - 2024-12-17
8+
9+
### Feature
10+
11+
* Updated Layout processing with forms and key-value areas ([#530](https://github.com/DS4SD/docling/issues/530)) ([`60dc852`](https://github.com/DS4SD/docling/commit/60dc852f16dc1adbb5e9284c81a146043a301ec1))
12+
* Create a backend to parse USPTO patents into DoclingDocument ([#606](https://github.com/DS4SD/docling/issues/606)) ([`4e08750`](https://github.com/DS4SD/docling/commit/4e087504cc4b04210574e69f616badcddfa1f8e5))
13+
* Add EasyOCR parameter `recog_network` ([#613](https://github.com/DS4SD/docling/issues/613)) ([`3b53bd3`](https://github.com/DS4SD/docling/commit/3b53bd38c8efcc5ba54421fbfa90d047f1a61f82))
14+
15+
### Documentation
16+
17+
* Add Haystack RAG example ([#615](https://github.com/DS4SD/docling/issues/615)) ([`3e599c7`](https://github.com/DS4SD/docling/commit/3e599c7bbeef211dc346e9bc1d3a249113fcc4e4))
18+
* Fix the path to the run_with_accelerator.py example ([#608](https://github.com/DS4SD/docling/issues/608)) ([`3bb3bf5`](https://github.com/DS4SD/docling/commit/3bb3bf57150c9705a055982e6fb0cc8d1408f161))
19+
120
## [v2.12.0](https://github.com/DS4SD/docling/releases/tag/v2.12.0) - 2024-12-13
221

322
### Feature

docling/backend/html_backend.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
3737

3838
try:
3939
if isinstance(self.path_or_stream, BytesIO):
40-
text_stream = self.path_or_stream.getvalue().decode("utf-8")
40+
text_stream = self.path_or_stream.getvalue()
4141
self.soup = BeautifulSoup(text_stream, "html.parser")
4242
if isinstance(self.path_or_stream, Path):
43-
with open(self.path_or_stream, "r", encoding="utf-8") as f:
43+
with open(self.path_or_stream, "rb") as f:
4444
html_content = f.read()
4545
self.soup = BeautifulSoup(html_content, "html.parser")
4646
except Exception as e:

docling/backend/mspowerpoint_backend.py

+15-11
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
TableCell,
1717
TableData,
1818
)
19-
from PIL import Image
19+
from PIL import Image, UnidentifiedImageError
2020
from pptx import Presentation
2121
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
2222

@@ -120,6 +120,7 @@ def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
120120
bullet_type = "None"
121121
list_text = ""
122122
list_label = GroupLabel.LIST
123+
doc_label = DocItemLabel.LIST_ITEM
123124
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
124125

125126
# Identify if shape contains lists
@@ -276,16 +277,19 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc):
276277
im_dpi, _ = image.dpi
277278

278279
# Open it with PIL
279-
pil_image = Image.open(BytesIO(image_bytes))
280-
281-
# shape has picture
282-
prov = self.generate_prov(shape, slide_ind, "")
283-
doc.add_picture(
284-
parent=parent_slide,
285-
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
286-
caption=None,
287-
prov=prov,
288-
)
280+
try:
281+
pil_image = Image.open(BytesIO(image_bytes))
282+
283+
# shape has picture
284+
prov = self.generate_prov(shape, slide_ind, "")
285+
doc.add_picture(
286+
parent=parent_slide,
287+
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
288+
caption=None,
289+
prov=prov,
290+
)
291+
except (UnidentifiedImageError, OSError) as e:
292+
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
289293
return
290294

291295
def handle_tables(self, shape, parent_slide, slide_ind, doc):

docling/backend/xml/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)