dgtlmoon · Constantin1489 · May 2, 2024 · May 2, 2024 · May 2, 2024 · May 2, 2024
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
@@ -110,6 +110,38 @@ def elementpath_tostring(obj):
 
     return str(obj)
 
+def forest_transplanting(root):
+    """
+    Re-home multiple top-level element nodes under one synthetic root.
+
+    libxml2 violates DOM rules: it may yield more than one root element
+    node, see https://gitlab.gnome.org/GNOME/libxml2/-/issues/716
+    Transplanting them under a new root emulates libxml2's xpath1 handling
+    of html, e.g. '/html[2]/*'. For this to work, elementpath.select must
+    be called with 'fragment=True'.
+
+    Returns (new_root, True) when transplanting happened, else (root, False).
+    """
+    from lxml import etree
+
+    # The preceding siblings get flipped so the list reads in the opposite
+    # order from how itersiblings(preceding=True) produced it.
+    before = list(root.itersiblings(preceding=True))[::-1]
+    after = list(root.itersiblings())
+
+    # A sibling whose tag has no '__name__' is taken to be an element node,
+    # i.e. a second root element next to 'root': a DOM violation.
+    has_extra_root = any(not hasattr(n.tag, '__name__') for n in before + after)
+    if not has_extra_root:
+        return root, False
+
+    new_root = etree.Element("new_root")
+    # Gather everything, 'root' included, beneath the synthetic parent.
+    for node in before + [root] + after:
+        new_root.append(node)
+    return new_root, True
+
+
 # Return str Utf-8 of matched rules
 def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
     from lxml import etree, html
@@ -123,9 +155,10 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
         parser = etree.XMLParser(strip_cdata=False)
 
     tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
+    tree, is_fragment = forest_transplanting(tree)
     html_block = ""
 
-    r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
+    r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=is_fragment)
     #@note: //title/text() wont work where <title>CDATA..
 
     if type(r) != list:

diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py
@@ -201,3 +201,61 @@ def test_trips(html_content, xpath, answer):
     html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
     assert type(html_content) == str
     assert answer in html_content
+
+DOM_violation_two_html_root_element = """<!DOCTYPE html>
+<html>
+  <body>
+    <h1>Hello world</h1>
+    <p>First paragraph.</p>
+  </body>
+</html>
+<html>
+  <body>
+    <h1>Hello world</h1>
+    <p>Browsers parse this part by fixing it but lxml doesn't and returns two root element node</p>
+    <p>Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.</p>
+  </body>
+</html>"""
+@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
+@pytest.mark.parametrize("xpath, answer", [
+    ("/html/body/p[1]", "First paragraph."),
+    ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+    ("//html/body/p[1]", "First paragraph."),
+    ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+    ("//body/p[1]", "First paragraph."),
+    ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+    ("/html[1]/body/p[1]", "First paragraph."),
+    ("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+    ("//html[1]/body/p[1]", "First paragraph."),
+    ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+                          ])
+def test_dom_violation_two_roots_selected(html_content, xpath, answer):
+    """Content under both <html> roots is reachable through xpath_filter."""
+    from lxml import etree, html
+    import elementpath
+    from elementpath.xpath3 import XPath3Parser
+
+    # This test needs its own name: a second 'test_trips' in the module would
+    # rebind the earlier definition and pytest would silently never run it.
+    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=etree.HTMLParser())
+    # In normal situation, DOM's root element node is only one. So when DOM
+    # violation happens, selecting on the raw tree is expected to raise.
+    with pytest.raises(Exception):
+        elementpath.select(tree, xpath.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
+
+    html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
+    assert isinstance(html_content, str)
+    assert answer in html_content
+
+@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
+@pytest.mark.parametrize("xpath, answer", [
+    ("/html[2]/body/p[1]", "First paragraph."),
+    ("//html[2]/body/p[1]", "First paragraph."),
+                          ])
+def test_dom_violation_second_root_excludes_first(html_content, xpath, answer):
+    # Selecting under html[2] must not return text that only the first <html> holds.
+
+    html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
+    assert isinstance(html_content, str)
+    # check the answer is not in the html_content
+    assert answer not in html_content
diff --git a/requirements.txt b/requirements.txt
@@ -55,7 +55,7 @@ beautifulsoup4
 lxml >=4.8.0,<6
 
 # XPath 2.0-3.1 support - 4.2.0 broke something?
-elementpath==4.1.5
+elementpath==4.4.0
 
 selenium~=4.14.0