From dba0e4773a8704ec9598ef785e6061b09bc29db2 Mon Sep 17 00:00:00 2001 From: higuhigu-lb Date: Tue, 3 Dec 2024 11:44:16 +0900 Subject: [PATCH] fix: Skip NavigableString in HTML parsing --- docling/backend/html_backend.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 9cd1e29b..06f19452 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Set, Union -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString from docling_core.types.doc import ( DocItemLabel, DoclingDocument, @@ -92,6 +92,8 @@ def walk(self, element, doc): try: # Iterate over elements in the body of the document for idx, element in enumerate(element.children): + if isinstance(element, NavigableString): + continue # Skip over navigable strings try: self.analyse_element(element, idx, doc) except Exception as exc_child: