66import json
77import logging
88import mimetypes
9- import os
109import re
1110import sys
1211import typing
1312import warnings
1413from enum import Enum
15- from io import BytesIO
14+ from io import BytesIO , StringIO
1615from pathlib import Path
1716from typing import (
1817 Any ,
6564 PictureClassificationLabel ,
6665)
6766from docling_core .types .doc .tokens import DocumentToken , TableToken
68- from docling_core .types .doc .utils import parse_otsl_table_content , relative_path
67+ from docling_core .types .doc .utils import (
68+ is_remote_path ,
69+ parse_otsl_table_content ,
70+ relative_path ,
71+ )
6972
7073_logger = logging .getLogger (__name__ )
7174
@@ -4762,38 +4765,48 @@ def _with_pictures_refs(
47624765 img_count = 0
47634766 image_dir .mkdir (parents = True , exist_ok = True )
47644767
4765- if image_dir .is_dir ():
4766- for item , level in result .iterate_items (page_no = page_no , with_groups = False ):
4767- if isinstance (item , PictureItem ):
4768- img = item .get_image (doc = self )
4769- if img is not None :
4770-
4771- hexhash = PictureItem ._image_to_hexhash (img )
4772-
4773- # loc_path = image_dir / f"image_{img_count:06}.png"
4774- if hexhash is not None :
4775- loc_path = image_dir / f"image_{ img_count :06} _{ hexhash } .png"
4776-
4777- img .save (loc_path )
4778- if reference_path is not None :
4779- obj_path = relative_path (
4780- reference_path .resolve (),
4781- loc_path .resolve (),
4782- )
4783- else :
4784- obj_path = loc_path
4768+ # Note: the is_dir() guard is intentionally omitted for all paths: S3/cloud
4769+ # storage has no real directories, and mkdir() is a no-op for remote paths
4770+ for item , level in result .iterate_items (page_no = page_no , with_groups = False ):
4771+ if isinstance (item , PictureItem ):
4772+ img = item .get_image (doc = self )
4773+ if img is not None :
4774+
4775+ hexhash = PictureItem ._image_to_hexhash (img )
4776+
4777+ # loc_path = image_dir / f"image_{img_count:06}.png"
4778+ if hexhash is not None :
4779+ loc_path = image_dir / f"image_{ img_count :06} _{ hexhash } .png"
4780+
4781+ # Use BytesIO + write_bytes for UPath compatibility
4782+ buf = BytesIO ()
4783+ img .save (buf , format = "PNG" )
4784+ loc_path .write_bytes (buf .getvalue ())
4785+
4786+ # For remote paths, use absolute URI string; for local, compute relative
4787+ if is_remote_path (loc_path ) or is_remote_path (reference_path ):
4788+ # Convert to string URI for remote paths (Pydantic can't serialize UPath)
4789+ obj_path = str (loc_path )
4790+ elif reference_path is not None :
4791+ obj_path = relative_path (
4792+ reference_path .resolve (),
4793+ loc_path .resolve (),
4794+ )
4795+ else :
4796+ obj_path = loc_path
47854797
4786- if item .image is None :
4787- scale = img .size [0 ] / item .prov [0 ].bbox .width
4788- item .image = ImageRef .from_pil (
4789- image = img , dpi = round (72 * scale )
4790- )
4791- item .image .uri = Path (obj_path )
4798+ if item .image is None :
4799+ scale = img .size [0 ] / item .prov [0 ].bbox .width
4800+ item .image = ImageRef .from_pil (
4801+ image = img , dpi = round (72 * scale )
4802+ )
4803+ # For remote paths, store as string URI; for local, store as Path
4804+ item .image .uri = obj_path
47924805
4793- # if item.image._pil is not None:
4794- # item.image._pil.close()
4806+ # if item.image._pil is not None:
4807+ # item.image._pil.close()
47954808
4796- img_count += 1
4809+ img_count += 1
47974810
47984811 return result
47994812
@@ -4859,7 +4872,7 @@ def save_as_json(
48594872 artifacts_dir , reference_path = self ._get_output_paths (filename , artifacts_dir )
48604873
48614874 if image_mode == ImageRefMode .REFERENCED :
4862- os . makedirs ( artifacts_dir , exist_ok = True )
4875+ artifacts_dir . mkdir ( parents = True , exist_ok = True )
48634876
48644877 new_doc = self ._make_copy_with_refmode (
48654878 artifacts_dir , image_mode , page_no = None , reference_path = reference_path
@@ -4868,8 +4881,7 @@ def save_as_json(
48684881 out = new_doc .export_to_dict (
48694882 coord_precision = coord_precision , confid_precision = confid_precision
48704883 )
4871- with open (filename , "w" , encoding = "utf-8" ) as fw :
4872- json .dump (out , fw , indent = indent )
4884+ filename .write_text (json .dumps (out , indent = indent ), encoding = "utf-8" )
48734885
48744886 @classmethod
48754887 def load_from_json (cls , filename : Union [str , Path ]) -> "DoclingDocument" :
@@ -4884,8 +4896,7 @@ def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
48844896 """
48854897 if isinstance (filename , str ):
48864898 filename = Path (filename )
4887- with open (filename , "r" , encoding = "utf-8" ) as f :
4888- return cls .model_validate_json (f .read ())
4899+ return cls .model_validate_json (filename .read_text (encoding = "utf-8" ))
48894900
48904901 def save_as_yaml (
48914902 self ,
@@ -4902,7 +4913,7 @@ def save_as_yaml(
49024913 artifacts_dir , reference_path = self ._get_output_paths (filename , artifacts_dir )
49034914
49044915 if image_mode == ImageRefMode .REFERENCED :
4905- os . makedirs ( artifacts_dir , exist_ok = True )
4916+ artifacts_dir . mkdir ( parents = True , exist_ok = True )
49064917
49074918 new_doc = self ._make_copy_with_refmode (
49084919 artifacts_dir , image_mode , page_no = None , reference_path = reference_path
@@ -4911,8 +4922,9 @@ def save_as_yaml(
49114922 out = new_doc .export_to_dict (
49124923 coord_precision = coord_precision , confid_precision = confid_precision
49134924 )
4914- with open (filename , "w" , encoding = "utf-8" ) as fw :
4915- yaml .dump (out , fw , default_flow_style = default_flow_style )
4925+ stream = StringIO ()
4926+ yaml .dump (out , stream , default_flow_style = default_flow_style )
4927+ filename .write_text (stream .getvalue (), encoding = "utf-8" )
49164928
49174929 @classmethod
49184930 def load_from_yaml (cls , filename : Union [str , Path ]) -> "DoclingDocument" :
@@ -4926,8 +4938,7 @@ def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
49264938 """
49274939 if isinstance (filename , str ):
49284940 filename = Path (filename )
4929- with open (filename , encoding = "utf-8" ) as f :
4930- data = yaml .load (f , Loader = yaml .SafeLoader )
4941+ data = yaml .load (filename .read_text (encoding = "utf-8" ), Loader = yaml .SafeLoader )
49314942 return DoclingDocument .model_validate (data )
49324943
49334944 def export_to_dict (
@@ -4979,7 +4990,7 @@ def save_as_markdown(
49794990 artifacts_dir , reference_path = self ._get_output_paths (filename , artifacts_dir )
49804991
49814992 if image_mode == ImageRefMode .REFERENCED :
4982- os . makedirs ( artifacts_dir , exist_ok = True )
4993+ artifacts_dir . mkdir ( parents = True , exist_ok = True )
49834994
49844995 new_doc = self ._make_copy_with_refmode (
49854996 artifacts_dir , image_mode , page_no , reference_path = reference_path
@@ -5005,8 +5016,7 @@ def save_as_markdown(
50055016 mark_meta = mark_meta ,
50065017 )
50075018
5008- with open (filename , "w" , encoding = "utf-8" ) as fw :
5009- fw .write (md_out )
5019+ filename .write_text (md_out , encoding = "utf-8" )
50105020
50115021 def export_to_markdown ( # noqa: C901
50125022 self ,
@@ -5185,7 +5195,7 @@ def save_as_html(
51855195 artifacts_dir , reference_path = self ._get_output_paths (filename , artifacts_dir )
51865196
51875197 if image_mode == ImageRefMode .REFERENCED :
5188- os . makedirs ( artifacts_dir , exist_ok = True )
5198+ artifacts_dir . mkdir ( parents = True , exist_ok = True )
51895199
51905200 new_doc = self ._make_copy_with_refmode (
51915201 artifacts_dir , image_mode , page_no , reference_path = reference_path
@@ -5205,8 +5215,7 @@ def save_as_html(
52055215 include_annotations = include_annotations ,
52065216 )
52075217
5208- with open (filename , "w" , encoding = "utf-8" ) as fw :
5209- fw .write (html_out )
5218+ filename .write_text (html_out , encoding = "utf-8" )
52105219
52115220 def _get_output_paths (
52125221 self , filename : Union [str , Path ], artifacts_dir : Optional [Path ] = None
@@ -5850,8 +5859,7 @@ def save_as_doctags(
58505859 minified = minified ,
58515860 )
58525861
5853- with open (filename , "w" , encoding = "utf-8" ) as fw :
5854- fw .write (out )
5862+ filename .write_text (out , encoding = "utf-8" )
58555863
58565864 @deprecated ("Use export_to_doctags() instead." )
58575865 def export_to_document_tokens (self , * args , ** kwargs ):
0 commit comments