Skip to content

Commit 42856fd

Browse files
authored
fix: Let BeautifulSoup detect the HTML encoding (#695)
Signed-off-by: Christoph Auer <[email protected]>
1 parent 2d24fae commit 42856fd

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

docling/backend/html_backend.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -37,10 +37,10 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
3737

3838
try:
3939
if isinstance(self.path_or_stream, BytesIO):
40-
text_stream = self.path_or_stream.getvalue().decode("utf-8")
40+
text_stream = self.path_or_stream.getvalue()
4141
self.soup = BeautifulSoup(text_stream, "html.parser")
4242
if isinstance(self.path_or_stream, Path):
43-
with open(self.path_or_stream, "r", encoding="utf-8") as f:
43+
with open(self.path_or_stream, "rb") as f:
4444
html_content = f.read()
4545
self.soup = BeautifulSoup(html_content, "html.parser")
4646
except Exception as e:

0 commit comments

Comments
 (0)