fix: always write with utf8 encoding (#111)

Signed-off-by: Michele Dolfi <[email protected]>
DS4SD · Dec 17, 2024 · 268c294
1 parent 819b1a6
commit 268c294
Show file tree

Hide file tree

Showing 16 changed files with 106 additions and 78 deletions.
diff --git a/docling_core/cli/view.py b/docling_core/cli/view.py
@@ -57,7 +57,7 @@ def view(
     doc = DoclingDocument.load_from_json(filename=path)
     target_path = Path(tempfile.mkdtemp()) / "out.html"
     html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
-    with open(target_path, "w") as f:
+    with open(target_path, "w", encoding="utf-8") as f:
         f.write(html_output)
     webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
 

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -1884,7 +1884,7 @@ def save_as_json(
         )
 
         out = new_doc.export_to_dict()
-        with open(filename, "w") as fw:
+        with open(filename, "w", encoding="utf-8") as fw:
             json.dump(out, fw, indent=indent)
 
     @classmethod
@@ -1898,7 +1898,7 @@ def load_from_json(cls, filename: Path) -> "DoclingDocument":
         :rtype: DoclingDocument
 
         """
-        with open(filename, "r") as f:
+        with open(filename, "r", encoding="utf-8") as f:
             return cls.model_validate_json(f.read())
 
     def save_as_yaml(
@@ -1919,7 +1919,7 @@ def save_as_yaml(
         )
 
         out = new_doc.export_to_dict()
-        with open(filename, "w") as fw:
+        with open(filename, "w", encoding="utf-8") as fw:
             yaml.dump(out, fw, default_flow_style=default_flow_style)
 
     def export_to_dict(
@@ -1971,7 +1971,7 @@ def save_as_markdown(
             page_no=page_no,
         )
 
-        with open(filename, "w") as fw:
+        with open(filename, "w", encoding="utf-8") as fw:
             fw.write(md_out)
 
     def export_to_markdown(  # noqa: C901
@@ -2224,7 +2224,7 @@ def save_as_html(
             html_head=html_head,
         )
 
-        with open(filename, "w") as fw:
+        with open(filename, "w", encoding="utf-8") as fw:
             fw.write(html_out)
 
     def _get_output_paths(
@@ -2462,7 +2462,7 @@ def save_as_document_tokens(
             with_groups=with_groups,
         )
 
-        with open(filename, "w") as fw:
+        with open(filename, "w", encoding="utf-8") as fw:
             fw.write(out)
 
     def export_to_document_tokens(

diff --git a/docling_core/utils/validate.py b/docling_core/utils/validate.py
@@ -38,7 +38,7 @@ def run():
     """Run the validation of a file containing a Document."""
     file_format, input_file = parse_arguments()
 
-    with open(input_file, "r") as fd:
+    with open(input_file, "r", encoding="utf-8") as fd:
         file_ = json.load(fd)
 
     result = (False, "Empty result")

diff --git a/test/test_base.py b/test/test_base.py
@@ -36,7 +36,7 @@ def test_identifier():
     )
 
     # schema_json(): no need to set by_alias since it is True by the default
-    tf = open("test/data/json_schemas/base_identifier.json")
+    tf = open("test/data/json_schemas/base_identifier.json", encoding="utf-8")
     gold_json = json.load(tf)
 
     assert Identifier.model_json_schema() == gold_json
@@ -104,7 +104,7 @@ def test_log():
         == gold_dict
     )
 
-    with open("test/data/json_schemas/base_log.json") as tf:
+    with open("test/data/json_schemas/base_log.json", encoding="utf-8") as tf:
         gold_json_schema = json.load(tf)
     assert Log.model_json_schema() == gold_json_schema
 

diff --git a/test/test_collection.py b/test/test_collection.py
@@ -45,7 +45,7 @@ def test_generic():
 def test_document():
     """Test the Document model."""
     for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
-        with open(filename) as file_obj:
+        with open(filename, encoding="utf-8") as file_obj:
             file_json = file_obj.read()
         Document.model_validate_json(file_json)
 
@@ -54,7 +54,7 @@ def test_table_export_to_tokens():
     """Test the Table Tokens export."""
 
     for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
-        with open(filename) as file_obj:
+        with open(filename, encoding="utf-8") as file_obj:
             file_json = file_obj.read()
 
         doc = Document.model_validate_json(file_json)
@@ -73,10 +73,10 @@ def test_table_export_to_tokens():
                     fname = f"{filename}_table_{i}.doctags.txt"
                     if GENERATE:
                         print(f"writing {fname}")
-                        with open(fname, "w") as gold_obj:
+                        with open(fname, "w", encoding="utf-8") as gold_obj:
                             gold_obj.write(out)
 
-                    with open(fname, "r") as gold_obj:
+                    with open(fname, "r", encoding="utf-8") as gold_obj:
                         gold_data = gold_obj.read()
 
                     assert out == gold_data
@@ -96,10 +96,10 @@ def test_table_export_to_tokens():
                     fname = f"{filename}_table_{i}.doctags.txt"
                     if GENERATE:
                         print(f"writing {fname}")
-                        with open(fname, "w") as gold_obj:
+                        with open(fname, "w", encoding="utf-8") as gold_obj:
                             gold_obj.write(out)
 
-                    with open(fname, "r") as gold_obj:
+                    with open(fname, "r", encoding="utf-8") as gold_obj:
                         gold_data = gold_obj.read()
 
                     assert out == gold_data
@@ -110,35 +110,41 @@ def test_table_export_to_tokens():
 
 def test_document_export_to_md():
     """Test the Document Markdown export."""
-    with open("test/data/legacy_doc/doc-export.json") as src_obj:
+    with open("test/data/legacy_doc/doc-export.json", encoding="utf-8") as src_obj:
         src_data = src_obj.read()
     doc = Document.model_validate_json(src_data)
 
     md = doc.export_to_markdown()
 
     if GENERATE:
-        with open("test/data/legacy_doc/doc-export.md", "w") as gold_obj:
+        with open(
+            "test/data/legacy_doc/doc-export.md", "w", encoding="utf-8"
+        ) as gold_obj:
             gold_obj.write(md)
 
-    with open("test/data/legacy_doc/doc-export.md") as gold_obj:
+    with open("test/data/legacy_doc/doc-export.md", encoding="utf-8") as gold_obj:
         gold_data = gold_obj.read().strip()
 
     assert md == gold_data
 
 
 def test_document_export_to_tokens():
     """Test the Document Tokens export."""
-    with open("test/data/legacy_doc/doc-export.json") as src_obj:
+    with open("test/data/legacy_doc/doc-export.json", encoding="utf-8") as src_obj:
         src_data = src_obj.read()
 
     doc = Document.model_validate_json(src_data)
     xml = doc.export_to_document_tokens(delim=True)
 
     if GENERATE:
-        with open("test/data/legacy_doc/doc-export.doctags.txt", "w") as gold_obj:
+        with open(
+            "test/data/legacy_doc/doc-export.doctags.txt", "w", encoding="utf-8"
+        ) as gold_obj:
             gold_obj.write(xml)
 
-    with open("test/data/legacy_doc/doc-export.doctags.txt", "r") as gold_obj:
+    with open(
+        "test/data/legacy_doc/doc-export.doctags.txt", "r", encoding="utf-8"
+    ) as gold_obj:
         gold_data = gold_obj.read().strip()
 
     assert xml == gold_data
@@ -147,6 +153,6 @@ def test_document_export_to_tokens():
 def test_record():
     """Test the Document model."""
     for filename in glob.glob("test/data/rec/record-*.json"):
-        with open(filename) as file_obj:
+        with open(filename, encoding="utf-8") as file_obj:
             file_json = file_obj.read()
         Record.model_validate_json(file_json)
diff --git a/test/test_doc_legacy_convert.py b/test/test_doc_legacy_convert.py
@@ -15,7 +15,7 @@
 def test_new_to_old():
     filename = "test/data/doc/2206.01062.yaml"
 
-    with open(filename, "r") as fp:
+    with open(filename, "r", encoding="utf-8") as fp:
         dict_from_yaml = yaml.safe_load(fp)
 
     doc = DoclingDocument.model_validate(dict_from_yaml)

diff --git a/test/test_doc_schema.py b/test/test_doc_schema.py
@@ -27,7 +27,7 @@
 def test_ccs_document():
     """Validate data with CCSDocument schema."""
     for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
-        with open(filename) as file_obj:
+        with open(filename, encoding="utf-8") as file_obj:
             file_json = file_obj.read()
         try:
             # do not pass strict=True, since date input values are not an instance of datetime.
@@ -41,7 +41,7 @@ def test_ccs_document():
 
     # check doc-error-1 is invalid in logs
     try:
-        with open("test/data/legacy_doc/error-1.json") as file_obj:
+        with open("test/data/legacy_doc/error-1.json", encoding="utf-8") as file_obj:
             file_json = file_obj.read()
         CCSDocument.model_validate_json(file_json)
         assert False, f"Data in file {filename} should be invalid for CCSDocument model"
@@ -55,15 +55,15 @@ def test_ccs_document():
     # check doc-error-2 is invalid for missing page-hashes
     with (
         pytest.raises(ValidationError, match="page-hashes"),
-        open("test/data/legacy_doc/error-2.json") as file_obj,
+        open("test/data/legacy_doc/error-2.json", encoding="utf-8") as file_obj,
     ):
         file_json = file_obj.read()
         CCSDocument.model_validate_json(file_json)
 
     # check doc-error-3 is invalid for wrong types in citation_count and reference_count
     with (
         pytest.raises(ValidationError, match="count"),
-        open("test/data/legacy_doc/error-3.json") as file_obj,
+        open("test/data/legacy_doc/error-3.json", encoding="utf-8") as file_obj,
     ):
         file_json = file_obj.read()
         CCSDocument.model_validate_json(file_json)
@@ -72,7 +72,7 @@ def test_ccs_document():
 def test_publication_journal():
     """ "Validate data with Publication model."""
     for filename in glob.glob("test/data/legacy_doc/intermediates/publication_*.json"):
-        with open(filename) as file_obj:
+        with open(filename, encoding="utf-8") as file_obj:
             file_json = file_obj.read()
             file_dict = json.loads(file_json)
         try:
@@ -85,7 +85,7 @@ def test_publication_journal():
 def test_description_advanced_t():
     """Validate data with different DescriptionAdvancedT instances."""
     # without description.advanced
-    with open("test/data/legacy_doc/doc-5.json") as file_obj:
+    with open("test/data/legacy_doc/doc-5.json", encoding="utf-8") as file_obj:
         desc = json.load(file_obj)["description"]
 
     # without advanced

diff --git a/test/test_doc_schema_extractor.py b/test/test_doc_schema_extractor.py
@@ -15,7 +15,7 @@ def test_ccs_document_update():
     """Validate data with CCSDocument extract."""
     filename = "test/data/legacy_doc/ext-1.json"
     try:
-        with open(filename) as f:
+        with open(filename, encoding="utf-8") as f:
             raw_doc = json.load(f)
             for item in raw_doc["main-text"]:
                 if "$ref" in item:

diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
@@ -64,11 +64,15 @@ def serialise(obj):
         return yaml.safe_dump(obj.model_dump(mode="json", by_alias=True))
 
     def write(name: str, serialisation: str):
-        with open(f"./test/data/docling_document/unit/{name}.yaml", "w") as fw:
+        with open(
+            f"./test/data/docling_document/unit/{name}.yaml", "w", encoding="utf-8"
+        ) as fw:
             fw.write(serialisation)
 
     def read(name: str):
-        with open(f"./test/data/docling_document/unit/{name}.yaml", "r") as fr:
+        with open(
+            f"./test/data/docling_document/unit/{name}.yaml", "r", encoding="utf-8"
+        ) as fr:
             gold = fr.read()
         return gold
 
@@ -146,7 +150,7 @@ def test_reference_doc():
     filename = "test/data/doc/dummy_doc.yaml"
 
     # Read YAML file of manual reference doc
-    with open(filename, "r") as fp:
+    with open(filename, "r", encoding="utf-8") as fp:
         dict_from_yaml = yaml.safe_load(fp)
 
     doc = DoclingDocument.model_validate(dict_from_yaml)
@@ -186,7 +190,7 @@ def test_parse_doc():
 
     filename = "test/data/doc/2206.01062.yaml"
 
-    with open(filename, "r") as fp:
+    with open(filename, "r", encoding="utf-8") as fp:
         dict_from_yaml = yaml.safe_load(fp)
 
     doc = DoclingDocument.model_validate(dict_from_yaml)
@@ -244,12 +248,12 @@ def _test_serialize_and_reload(doc):
 def _verify_regression_test(pred: str, filename: str, ext: str):
 
     if os.path.exists(filename + f".{ext}") and not GENERATE:
-        with open(filename + f".{ext}", "r") as fr:
+        with open(filename + f".{ext}", "r", encoding="utf-8") as fr:
             gt_true = fr.read()
 
         assert gt_true == pred, f"Does not pass regression-test for {filename}.{ext}"
     else:
-        with open(filename + f".{ext}", "w") as fw:
+        with open(filename + f".{ext}", "w", encoding="utf-8") as fw:
             fw.write(pred)
 
 
@@ -499,7 +503,7 @@ def test_version_doc():
     doc = DoclingDocument(name="Untitled 1")
     assert doc.version == CURRENT_VERSION
 
-    with open("test/data/doc/dummy_doc.yaml") as fp:
+    with open("test/data/doc/dummy_doc.yaml", encoding="utf-8") as fp:
         dict_from_yaml = yaml.safe_load(fp)
     doc = DoclingDocument.model_validate(dict_from_yaml)
     assert doc.version == CURRENT_VERSION
@@ -674,17 +678,17 @@ def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
 def _verify_saved_output(filename: str, paths: List[Path]):
 
     pred = ""
-    with open(filename, "r") as fr:
+    with open(filename, "r", encoding="utf-8") as fr:
         pred = fr.read()
 
     pred = _normalise_string_wrt_filepaths(pred, paths=paths)
 
     if GENERATE:
-        with open(str(filename) + ".gt", "w") as fw:
+        with open(str(filename) + ".gt", "w", encoding="utf-8") as fw:
             fw.write(pred)
     else:
         gt = ""
-        with open(str(filename) + ".gt", "r") as fr:
+        with open(str(filename) + ".gt", "r", encoding="utf-8") as fr:
             gt = fr.read()
 
         assert pred == gt, f"pred!=gt for {filename}"

diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py
@@ -11,7 +11,7 @@
 
 
 def test_chunk_merge_list_items():
-    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+    with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
     chunker = HierarchicalChunker(
@@ -21,13 +21,13 @@ def test_chunk_merge_list_items():
     act_data = dict(
         root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
     )
-    with open("test/data/chunker/0_out_chunks.json") as f:
+    with open("test/data/chunker/0_out_chunks.json", encoding="utf-8") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
 def test_chunk_no_merge_list_items():
-    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+    with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
     chunker = HierarchicalChunker(
@@ -37,6 +37,6 @@ def test_chunk_no_merge_list_items():
     act_data = dict(
         root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
     )
-    with open("test/data/chunker/1_out_chunks.json") as f:
+    with open("test/data/chunker/1_out_chunks.json", encoding="utf-8") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data