Skip to content

Commit cca2acd

Browse files
committed
add fsspec support
1 parent e54524d commit cca2acd

File tree

6 files changed

+107
-76
lines changed

6 files changed

+107
-76
lines changed

docling_core/cli/view.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,7 @@ def view(
6969
image_mode=ImageRefMode.EMBEDDED,
7070
split_page_view=split_view,
7171
)
72-
with open(target_path, "w", encoding="utf-8") as f:
73-
f.write(html_output)
72+
target_path.write_text(html_output, encoding="utf-8")
7473
webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
7574

7675

docling_core/types/doc/document.py

Lines changed: 58 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,12 @@
66
import json
77
import logging
88
import mimetypes
9-
import os
109
import re
1110
import sys
1211
import typing
1312
import warnings
1413
from enum import Enum
15-
from io import BytesIO
14+
from io import BytesIO, StringIO
1615
from pathlib import Path
1716
from typing import (
1817
Any,
@@ -65,7 +64,11 @@
6564
PictureClassificationLabel,
6665
)
6766
from docling_core.types.doc.tokens import DocumentToken, TableToken
68-
from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
67+
from docling_core.types.doc.utils import (
68+
is_remote_path,
69+
parse_otsl_table_content,
70+
relative_path,
71+
)
6972

7073
_logger = logging.getLogger(__name__)
7174

@@ -4762,38 +4765,48 @@ def _with_pictures_refs(
47624765
img_count = 0
47634766
image_dir.mkdir(parents=True, exist_ok=True)
47644767

4765-
if image_dir.is_dir():
4766-
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
4767-
if isinstance(item, PictureItem):
4768-
img = item.get_image(doc=self)
4769-
if img is not None:
4770-
4771-
hexhash = PictureItem._image_to_hexhash(img)
4772-
4773-
# loc_path = image_dir / f"image_{img_count:06}.png"
4774-
if hexhash is not None:
4775-
loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
4776-
4777-
img.save(loc_path)
4778-
if reference_path is not None:
4779-
obj_path = relative_path(
4780-
reference_path.resolve(),
4781-
loc_path.resolve(),
4782-
)
4783-
else:
4784-
obj_path = loc_path
4768+
# Note: Skip is_dir() check for remote paths since S3/cloud storage
4769+
# doesn't have real directories - mkdir() is a no-op for remote paths
4770+
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
4771+
if isinstance(item, PictureItem):
4772+
img = item.get_image(doc=self)
4773+
if img is not None:
4774+
4775+
hexhash = PictureItem._image_to_hexhash(img)
4776+
4777+
# loc_path = image_dir / f"image_{img_count:06}.png"
4778+
if hexhash is not None:
4779+
loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
4780+
4781+
# Use BytesIO + write_bytes for UPath compatibility
4782+
buf = BytesIO()
4783+
img.save(buf, format="PNG")
4784+
loc_path.write_bytes(buf.getvalue())
4785+
4786+
# For remote paths, use absolute URI string; for local, compute relative
4787+
if is_remote_path(loc_path) or is_remote_path(reference_path):
4788+
# Convert to string URI for remote paths (Pydantic can't serialize UPath)
4789+
obj_path = str(loc_path)
4790+
elif reference_path is not None:
4791+
obj_path = relative_path(
4792+
reference_path.resolve(),
4793+
loc_path.resolve(),
4794+
)
4795+
else:
4796+
obj_path = loc_path
47854797

4786-
if item.image is None:
4787-
scale = img.size[0] / item.prov[0].bbox.width
4788-
item.image = ImageRef.from_pil(
4789-
image=img, dpi=round(72 * scale)
4790-
)
4791-
item.image.uri = Path(obj_path)
4798+
if item.image is None:
4799+
scale = img.size[0] / item.prov[0].bbox.width
4800+
item.image = ImageRef.from_pil(
4801+
image=img, dpi=round(72 * scale)
4802+
)
4803+
# For remote paths, store as string URI; for local, store as Path
4804+
item.image.uri = obj_path
47924805

4793-
# if item.image._pil is not None:
4794-
# item.image._pil.close()
4806+
# if item.image._pil is not None:
4807+
# item.image._pil.close()
47954808

4796-
img_count += 1
4809+
img_count += 1
47974810

47984811
return result
47994812

@@ -4859,7 +4872,7 @@ def save_as_json(
48594872
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
48604873

48614874
if image_mode == ImageRefMode.REFERENCED:
4862-
os.makedirs(artifacts_dir, exist_ok=True)
4875+
artifacts_dir.mkdir(parents=True, exist_ok=True)
48634876

48644877
new_doc = self._make_copy_with_refmode(
48654878
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
@@ -4868,8 +4881,7 @@ def save_as_json(
48684881
out = new_doc.export_to_dict(
48694882
coord_precision=coord_precision, confid_precision=confid_precision
48704883
)
4871-
with open(filename, "w", encoding="utf-8") as fw:
4872-
json.dump(out, fw, indent=indent)
4884+
filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
48734885

48744886
@classmethod
48754887
def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
@@ -4884,8 +4896,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
48844896
"""
48854897
if isinstance(filename, str):
48864898
filename = Path(filename)
4887-
with open(filename, "r", encoding="utf-8") as f:
4888-
return cls.model_validate_json(f.read())
4899+
return cls.model_validate_json(filename.read_text(encoding="utf-8"))
48894900

48904901
def save_as_yaml(
48914902
self,
@@ -4902,7 +4913,7 @@ def save_as_yaml(
49024913
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
49034914

49044915
if image_mode == ImageRefMode.REFERENCED:
4905-
os.makedirs(artifacts_dir, exist_ok=True)
4916+
artifacts_dir.mkdir(parents=True, exist_ok=True)
49064917

49074918
new_doc = self._make_copy_with_refmode(
49084919
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
@@ -4911,8 +4922,9 @@ def save_as_yaml(
49114922
out = new_doc.export_to_dict(
49124923
coord_precision=coord_precision, confid_precision=confid_precision
49134924
)
4914-
with open(filename, "w", encoding="utf-8") as fw:
4915-
yaml.dump(out, fw, default_flow_style=default_flow_style)
4925+
stream = StringIO()
4926+
yaml.dump(out, stream, default_flow_style=default_flow_style)
4927+
filename.write_text(stream.getvalue(), encoding="utf-8")
49164928

49174929
@classmethod
49184930
def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
@@ -4926,8 +4938,7 @@ def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
49264938
"""
49274939
if isinstance(filename, str):
49284940
filename = Path(filename)
4929-
with open(filename, encoding="utf-8") as f:
4930-
data = yaml.load(f, Loader=yaml.SafeLoader)
4941+
data = yaml.load(filename.read_text(encoding="utf-8"), Loader=yaml.SafeLoader)
49314942
return DoclingDocument.model_validate(data)
49324943

49334944
def export_to_dict(
@@ -4979,7 +4990,7 @@ def save_as_markdown(
49794990
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
49804991

49814992
if image_mode == ImageRefMode.REFERENCED:
4982-
os.makedirs(artifacts_dir, exist_ok=True)
4993+
artifacts_dir.mkdir(parents=True, exist_ok=True)
49834994

49844995
new_doc = self._make_copy_with_refmode(
49854996
artifacts_dir, image_mode, page_no, reference_path=reference_path
@@ -5005,8 +5016,7 @@ def save_as_markdown(
50055016
mark_meta=mark_meta,
50065017
)
50075018

5008-
with open(filename, "w", encoding="utf-8") as fw:
5009-
fw.write(md_out)
5019+
filename.write_text(md_out, encoding="utf-8")
50105020

50115021
def export_to_markdown( # noqa: C901
50125022
self,
@@ -5185,7 +5195,7 @@ def save_as_html(
51855195
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
51865196

51875197
if image_mode == ImageRefMode.REFERENCED:
5188-
os.makedirs(artifacts_dir, exist_ok=True)
5198+
artifacts_dir.mkdir(parents=True, exist_ok=True)
51895199

51905200
new_doc = self._make_copy_with_refmode(
51915201
artifacts_dir, image_mode, page_no, reference_path=reference_path
@@ -5205,8 +5215,7 @@ def save_as_html(
52055215
include_annotations=include_annotations,
52065216
)
52075217

5208-
with open(filename, "w", encoding="utf-8") as fw:
5209-
fw.write(html_out)
5218+
filename.write_text(html_out, encoding="utf-8")
52105219

52115220
def _get_output_paths(
52125221
self, filename: Union[str, Path], artifacts_dir: Optional[Path] = None
@@ -5850,8 +5859,7 @@ def save_as_doctags(
58505859
minified=minified,
58515860
)
58525861

5853-
with open(filename, "w", encoding="utf-8") as fw:
5854-
fw.write(out)
5862+
filename.write_text(out, encoding="utf-8")
58555863

58565864
@deprecated("Use export_to_doctags() instead.")
58575865
def export_to_document_tokens(self, *args, **kwargs):

docling_core/types/doc/page.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -601,8 +601,7 @@ def save_as_json(
601601
if isinstance(filename, str):
602602
filename = Path(filename)
603603
out = self.export_to_dict()
604-
with open(filename, "w", encoding="utf-8") as fw:
605-
json.dump(out, fw, indent=indent)
604+
filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
606605

607606
@classmethod
608607
def load_from_json(cls, filename: Union[str, Path]) -> "SegmentedPdfPage":
@@ -616,8 +615,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "SegmentedPdfPage":
616615
"""
617616
if isinstance(filename, str):
618617
filename = Path(filename)
619-
with open(filename, "r", encoding="utf-8") as f:
620-
return cls.model_validate_json(f.read())
618+
return cls.model_validate_json(filename.read_text(encoding="utf-8"))
621619

622620
def crop_text(
623621
self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
@@ -1218,8 +1216,7 @@ def save_as_json(self, filename: Union[str, Path], indent: int = 2):
12181216
if isinstance(filename, str):
12191217
filename = Path(filename)
12201218
out = self.export_to_dict()
1221-
with open(filename, "w", encoding="utf-8") as fw:
1222-
json.dump(out, fw, indent=indent)
1219+
filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
12231220

12241221
@classmethod
12251222
def load_from_json(cls, filename: Union[str, Path]) -> "PdfTableOfContents":
@@ -1233,8 +1230,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "PdfTableOfContents":
12331230
"""
12341231
if isinstance(filename, str):
12351232
filename = Path(filename)
1236-
with open(filename, "r", encoding="utf-8") as f:
1237-
return cls.model_validate_json(f.read())
1233+
return cls.model_validate_json(filename.read_text(encoding="utf-8"))
12381234

12391235

12401236
class ParsedPdfDocument(BaseModel):
@@ -1280,8 +1276,7 @@ def save_as_json(self, filename: Union[str, Path], indent: int = 2):
12801276
if isinstance(filename, str):
12811277
filename = Path(filename)
12821278
out = self.export_to_dict()
1283-
with open(filename, "w", encoding="utf-8") as fw:
1284-
json.dump(out, fw, indent=indent)
1279+
filename.write_text(json.dumps(out, indent=indent), encoding="utf-8")
12851280

12861281
@classmethod
12871282
def load_from_json(cls, filename: Union[str, Path]) -> "ParsedPdfDocument":
@@ -1295,5 +1290,4 @@ def load_from_json(cls, filename: Union[str, Path]) -> "ParsedPdfDocument":
12951290
"""
12961291
if isinstance(filename, str):
12971292
filename = Path(filename)
1298-
with open(filename, "r", encoding="utf-8") as f:
1299-
return cls.model_validate_json(f.read())
1293+
return cls.model_validate_json(filename.read_text(encoding="utf-8"))

docling_core/types/doc/utils.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,24 @@
1313
from docling_core.types.doc.document import TableCell, TableData
1414

1515

16+
def is_remote_path(p) -> bool:
    """Check if a path is a remote/cloud path (e.g., S3, GCS, Azure).

    The check is duck-typed: it only looks at a ``protocol`` attribute, which
    UPath objects from universal-pathlib expose. Objects without that
    attribute (``pathlib.Path``, ``str``, ``None``) are treated as local.

    Args:
        p: A path object (Path, UPath, or similar). ``None`` is accepted and
            reported as not remote.

    Returns:
        bool: True if the path is a remote/cloud path, False otherwise.
    """
    # Protocol values that still refer to the local filesystem. fsspec
    # registers its local filesystem under both "file" and "local"; an empty
    # string is what plain (scheme-less) paths report.
    local_protocols = ("", "file", "local")

    # UPath objects have a 'protocol' attribute; anything else yields None.
    protocol = getattr(p, "protocol", None)
    return protocol is not None and protocol not in local_protocols
32+
33+
1634
def relative_path(src: Path, target: Path) -> Path:
1735
"""Compute the relative path from `src` to `target`.
1836
@@ -25,9 +43,19 @@ def relative_path(src: Path, target: Path) -> Path:
2543
2644
Raises:
2745
ValueError: If either `src` or `target` is not an absolute path.
46+
47+
Note:
48+
For remote paths (UPath with non-file protocols), this function cannot
49+
compute relative paths. Use is_remote_path() to check before calling.
2850
"""
29-
src = Path(src).resolve()
30-
target = Path(target).resolve()
51+
# Convert to Path only if string, otherwise keep original type
52+
if isinstance(src, str):
53+
src = Path(src)
54+
if isinstance(target, str):
55+
target = Path(target)
56+
57+
src = src.resolve()
58+
target = target.resolve()
3159

3260
# Ensure both paths are absolute
3361
if not src.is_absolute():

docling_core/utils/generate_docs.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
import argparse
88
import json
9-
import os
109
from argparse import BooleanOptionalAction
1110
from pathlib import Path
1211
from shutil import rmtree
@@ -24,15 +23,16 @@ def _prepare_directory(folder: str, clean: bool = False) -> None:
2423
folder: The name of the directory.
2524
clean: Whether any existing content in the directory should be removed.
2625
"""
27-
if os.path.isdir(folder):
26+
folder_path = Path(folder)
27+
if folder_path.is_dir():
2828
if clean:
29-
for path in Path(folder).glob("**/*"):
29+
for path in folder_path.glob("**/*"):
3030
if path.is_file():
3131
path.unlink()
3232
elif path.is_dir():
3333
rmtree(path)
3434
else:
35-
os.makedirs(folder, exist_ok=True)
35+
folder_path.mkdir(parents=True, exist_ok=True)
3636

3737

3838
def generate_collection_jsonschema(folder: str):
@@ -41,12 +41,13 @@ def generate_collection_jsonschema(folder: str):
4141
Args:
4242
folder: The name of the directory.
4343
"""
44+
folder_path = Path(folder)
4445
for item in MODELS:
4546
json_schema = generate_json_schema(item)
46-
with open(
47-
os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8"
48-
) as json_file:
49-
json.dump(json_schema, json_file, ensure_ascii=False, indent=2)
47+
output_file = folder_path / f"{item}.json"
48+
output_file.write_text(
49+
json.dumps(json_schema, ensure_ascii=False, indent=2), encoding="utf-8"
50+
)
5051

5152

5253
def main() -> None:

docling_core/utils/validate.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,11 @@ def parse_arguments():
3232

3333
def run():
3434
"""Run the validation of a file containing a Document."""
35+
from pathlib import Path
36+
3537
file_format, input_file = parse_arguments()
3638

37-
with open(input_file, "r", encoding="utf-8") as fd:
38-
file_ = json.load(fd)
39+
file_ = json.loads(Path(input_file).read_text(encoding="utf-8"))
3940

4041
result = (False, "Empty result")
4142

0 commit comments

Comments
 (0)