Skip to content

Commit

Permalink
fix: always write with utf8 encoding (#111)
Browse files (browse the repository at this point in the history)
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Dec 17, 2024
1 parent 819b1a6 commit 268c294
Show file tree
Hide file tree
Showing 16 changed files with 106 additions and 78 deletions.
2 changes: 1 addition & 1 deletion docling_core/cli/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def view(
doc = DoclingDocument.load_from_json(filename=path)
target_path = Path(tempfile.mkdtemp()) / "out.html"
html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
with open(target_path, "w") as f:
with open(target_path, "w", encoding="utf-8") as f:
f.write(html_output)
webbrowser.open(url=f"file://{target_path.absolute().resolve()}")

Expand Down
12 changes: 6 additions & 6 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -1884,7 +1884,7 @@ def save_as_json(
)

out = new_doc.export_to_dict()
with open(filename, "w") as fw:
with open(filename, "w", encoding="utf-8") as fw:
json.dump(out, fw, indent=indent)

@classmethod
Expand All @@ -1898,7 +1898,7 @@ def load_from_json(cls, filename: Path) -> "DoclingDocument":
:rtype: DoclingDocument
"""
with open(filename, "r") as f:
with open(filename, "r", encoding="utf-8") as f:
return cls.model_validate_json(f.read())

def save_as_yaml(
Expand All @@ -1919,7 +1919,7 @@ def save_as_yaml(
)

out = new_doc.export_to_dict()
with open(filename, "w") as fw:
with open(filename, "w", encoding="utf-8") as fw:
yaml.dump(out, fw, default_flow_style=default_flow_style)

def export_to_dict(
Expand Down Expand Up @@ -1971,7 +1971,7 @@ def save_as_markdown(
page_no=page_no,
)

with open(filename, "w") as fw:
with open(filename, "w", encoding="utf-8") as fw:
fw.write(md_out)

def export_to_markdown( # noqa: C901
Expand Down Expand Up @@ -2224,7 +2224,7 @@ def save_as_html(
html_head=html_head,
)

with open(filename, "w") as fw:
with open(filename, "w", encoding="utf-8") as fw:
fw.write(html_out)

def _get_output_paths(
Expand Down Expand Up @@ -2462,7 +2462,7 @@ def save_as_document_tokens(
with_groups=with_groups,
)

with open(filename, "w") as fw:
with open(filename, "w", encoding="utf-8") as fw:
fw.write(out)

def export_to_document_tokens(
Expand Down
2 changes: 1 addition & 1 deletion docling_core/utils/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def run():
"""Run the validation of a file containing a Document."""
file_format, input_file = parse_arguments()

with open(input_file, "r") as fd:
with open(input_file, "r", encoding="utf-8") as fd:
file_ = json.load(fd)

result = (False, "Empty result")
Expand Down
4 changes: 2 additions & 2 deletions test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_identifier():
)

# schema_json(): no need to set by_alias since it is True by the default
tf = open("test/data/json_schemas/base_identifier.json")
tf = open("test/data/json_schemas/base_identifier.json", encoding="utf-8")
gold_json = json.load(tf)

assert Identifier.model_json_schema() == gold_json
Expand Down Expand Up @@ -104,7 +104,7 @@ def test_log():
== gold_dict
)

with open("test/data/json_schemas/base_log.json") as tf:
with open("test/data/json_schemas/base_log.json", encoding="utf-8") as tf:
gold_json_schema = json.load(tf)
assert Log.model_json_schema() == gold_json_schema

Expand Down
32 changes: 19 additions & 13 deletions test/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def test_generic():
def test_document():
"""Test the Document model."""
for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
with open(filename) as file_obj:
with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()
Document.model_validate_json(file_json)

Expand All @@ -54,7 +54,7 @@ def test_table_export_to_tokens():
"""Test the Table Tokens export."""

for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
with open(filename) as file_obj:
with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()

doc = Document.model_validate_json(file_json)
Expand All @@ -73,10 +73,10 @@ def test_table_export_to_tokens():
fname = f"{filename}_table_{i}.doctags.txt"
if GENERATE:
print(f"writing {fname}")
with open(fname, "w") as gold_obj:
with open(fname, "w", encoding="utf-8") as gold_obj:
gold_obj.write(out)

with open(fname, "r") as gold_obj:
with open(fname, "r", encoding="utf-8") as gold_obj:
gold_data = gold_obj.read()

assert out == gold_data
Expand All @@ -96,10 +96,10 @@ def test_table_export_to_tokens():
fname = f"{filename}_table_{i}.doctags.txt"
if GENERATE:
print(f"writing {fname}")
with open(fname, "w") as gold_obj:
with open(fname, "w", encoding="utf-8") as gold_obj:
gold_obj.write(out)

with open(fname, "r") as gold_obj:
with open(fname, "r", encoding="utf-8") as gold_obj:
gold_data = gold_obj.read()

assert out == gold_data
Expand All @@ -110,35 +110,41 @@ def test_table_export_to_tokens():

def test_document_export_to_md():
"""Test the Document Markdown export."""
with open("test/data/legacy_doc/doc-export.json") as src_obj:
with open("test/data/legacy_doc/doc-export.json", encoding="utf-8") as src_obj:
src_data = src_obj.read()
doc = Document.model_validate_json(src_data)

md = doc.export_to_markdown()

if GENERATE:
with open("test/data/legacy_doc/doc-export.md", "w") as gold_obj:
with open(
"test/data/legacy_doc/doc-export.md", "w", encoding="utf-8"
) as gold_obj:
gold_obj.write(md)

with open("test/data/legacy_doc/doc-export.md") as gold_obj:
with open("test/data/legacy_doc/doc-export.md", encoding="utf-8") as gold_obj:
gold_data = gold_obj.read().strip()

assert md == gold_data


def test_document_export_to_tokens():
"""Test the Document Tokens export."""
with open("test/data/legacy_doc/doc-export.json") as src_obj:
with open("test/data/legacy_doc/doc-export.json", encoding="utf-8") as src_obj:
src_data = src_obj.read()

doc = Document.model_validate_json(src_data)
xml = doc.export_to_document_tokens(delim=True)

if GENERATE:
with open("test/data/legacy_doc/doc-export.doctags.txt", "w") as gold_obj:
with open(
"test/data/legacy_doc/doc-export.doctags.txt", "w", encoding="utf-8"
) as gold_obj:
gold_obj.write(xml)

with open("test/data/legacy_doc/doc-export.doctags.txt", "r") as gold_obj:
with open(
"test/data/legacy_doc/doc-export.doctags.txt", "r", encoding="utf-8"
) as gold_obj:
gold_data = gold_obj.read().strip()

assert xml == gold_data
Expand All @@ -147,6 +153,6 @@ def test_document_export_to_tokens():
def test_record():
"""Test the Document model."""
for filename in glob.glob("test/data/rec/record-*.json"):
with open(filename) as file_obj:
with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()
Record.model_validate_json(file_json)
2 changes: 1 addition & 1 deletion test/test_doc_legacy_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
def test_new_to_old():
filename = "test/data/doc/2206.01062.yaml"

with open(filename, "r") as fp:
with open(filename, "r", encoding="utf-8") as fp:
dict_from_yaml = yaml.safe_load(fp)

doc = DoclingDocument.model_validate(dict_from_yaml)
Expand Down
12 changes: 6 additions & 6 deletions test/test_doc_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
def test_ccs_document():
"""Validate data with CCSDocument schema."""
for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
with open(filename) as file_obj:
with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()
try:
# do not pass strict=True, since date input values are not an instance of datetime.
Expand All @@ -41,7 +41,7 @@ def test_ccs_document():

# check doc-error-1 is invalid in logs
try:
with open("test/data/legacy_doc/error-1.json") as file_obj:
with open("test/data/legacy_doc/error-1.json", encoding="utf-8") as file_obj:
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)
assert False, f"Data in file {filename} should be invalid for CCSDocument model"
Expand All @@ -55,15 +55,15 @@ def test_ccs_document():
# check doc-error-2 is invalid for missing page-hashes
with (
pytest.raises(ValidationError, match="page-hashes"),
open("test/data/legacy_doc/error-2.json") as file_obj,
open("test/data/legacy_doc/error-2.json", encoding="utf-8") as file_obj,
):
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)

# check doc-error-3 is invalid for wrong types in citation_count and reference_count
with (
pytest.raises(ValidationError, match="count"),
open("test/data/legacy_doc/error-3.json") as file_obj,
open("test/data/legacy_doc/error-3.json", encoding="utf-8") as file_obj,
):
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)
Expand All @@ -72,7 +72,7 @@ def test_ccs_document():
def test_publication_journal():
""" "Validate data with Publication model."""
for filename in glob.glob("test/data/legacy_doc/intermediates/publication_*.json"):
with open(filename) as file_obj:
with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()
file_dict = json.loads(file_json)
try:
Expand All @@ -85,7 +85,7 @@ def test_publication_journal():
def test_description_advanced_t():
"""Validate data with different DescriptionAdvancedT instances."""
# without description.advanced
with open("test/data/legacy_doc/doc-5.json") as file_obj:
with open("test/data/legacy_doc/doc-5.json", encoding="utf-8") as file_obj:
desc = json.load(file_obj)["description"]

# without advanced
Expand Down
2 changes: 1 addition & 1 deletion test/test_doc_schema_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_ccs_document_update():
"""Validate data with CCSDocument extract."""
filename = "test/data/legacy_doc/ext-1.json"
try:
with open(filename) as f:
with open(filename, encoding="utf-8") as f:
raw_doc = json.load(f)
for item in raw_doc["main-text"]:
if "$ref" in item:
Expand Down
24 changes: 14 additions & 10 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,15 @@ def serialise(obj):
return yaml.safe_dump(obj.model_dump(mode="json", by_alias=True))

def write(name: str, serialisation: str):
with open(f"./test/data/docling_document/unit/{name}.yaml", "w") as fw:
with open(
f"./test/data/docling_document/unit/{name}.yaml", "w", encoding="utf-8"
) as fw:
fw.write(serialisation)

def read(name: str):
with open(f"./test/data/docling_document/unit/{name}.yaml", "r") as fr:
with open(
f"./test/data/docling_document/unit/{name}.yaml", "r", encoding="utf-8"
) as fr:
gold = fr.read()
return gold

Expand Down Expand Up @@ -146,7 +150,7 @@ def test_reference_doc():
filename = "test/data/doc/dummy_doc.yaml"

# Read YAML file of manual reference doc
with open(filename, "r") as fp:
with open(filename, "r", encoding="utf-8") as fp:
dict_from_yaml = yaml.safe_load(fp)

doc = DoclingDocument.model_validate(dict_from_yaml)
Expand Down Expand Up @@ -186,7 +190,7 @@ def test_parse_doc():

filename = "test/data/doc/2206.01062.yaml"

with open(filename, "r") as fp:
with open(filename, "r", encoding="utf-8") as fp:
dict_from_yaml = yaml.safe_load(fp)

doc = DoclingDocument.model_validate(dict_from_yaml)
Expand Down Expand Up @@ -244,12 +248,12 @@ def _test_serialize_and_reload(doc):
def _verify_regression_test(pred: str, filename: str, ext: str):

if os.path.exists(filename + f".{ext}") and not GENERATE:
with open(filename + f".{ext}", "r") as fr:
with open(filename + f".{ext}", "r", encoding="utf-8") as fr:
gt_true = fr.read()

assert gt_true == pred, f"Does not pass regression-test for {filename}.{ext}"
else:
with open(filename + f".{ext}", "w") as fw:
with open(filename + f".{ext}", "w", encoding="utf-8") as fw:
fw.write(pred)


Expand Down Expand Up @@ -499,7 +503,7 @@ def test_version_doc():
doc = DoclingDocument(name="Untitled 1")
assert doc.version == CURRENT_VERSION

with open("test/data/doc/dummy_doc.yaml") as fp:
with open("test/data/doc/dummy_doc.yaml", encoding="utf-8") as fp:
dict_from_yaml = yaml.safe_load(fp)
doc = DoclingDocument.model_validate(dict_from_yaml)
assert doc.version == CURRENT_VERSION
Expand Down Expand Up @@ -674,17 +678,17 @@ def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
def _verify_saved_output(filename: str, paths: List[Path]):

pred = ""
with open(filename, "r") as fr:
with open(filename, "r", encoding="utf-8") as fr:
pred = fr.read()

pred = _normalise_string_wrt_filepaths(pred, paths=paths)

if GENERATE:
with open(str(filename) + ".gt", "w") as fw:
with open(str(filename) + ".gt", "w", encoding="utf-8") as fw:
fw.write(pred)
else:
gt = ""
with open(str(filename) + ".gt", "r") as fr:
with open(str(filename) + ".gt", "r", encoding="utf-8") as fr:
gt = fr.read()

assert pred == gt, f"pred!=gt for {filename}"
Expand Down
8 changes: 4 additions & 4 deletions test/test_hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def test_chunk_merge_list_items():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(
Expand All @@ -21,13 +21,13 @@ def test_chunk_merge_list_items():
act_data = dict(
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
)
with open("test/data/chunker/0_out_chunks.json") as f:
with open("test/data/chunker/0_out_chunks.json", encoding="utf-8") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data


def test_chunk_no_merge_list_items():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(
Expand All @@ -37,6 +37,6 @@ def test_chunk_no_merge_list_items():
act_data = dict(
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
)
with open("test/data/chunker/1_out_chunks.json") as f:
with open("test/data/chunker/1_out_chunks.json", encoding="utf-8") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data
Loading

0 comments on commit 268c294

Please sign in to comment.