Avoid trailing whitespace in kobo spans (#174)

# Implement long overdue TODO ## Description of change This seems like a small change but has huge implications for the way text is rendered with optimizeLegibilty and geometricPrecision. Really all it does is move the trailing whitespace to the front of the next group, just like the TODO note suggested ought to be done. I'm guessing at the exact mechanism, but for *unclear* reasons whatever justification program Kobo method uses it apparently does not expand or contract the last space in a kobo span. (unless using optimizeSpeed, maybe?) This can causes weird looking text, and in the worst case may even expand the space between letters just to stretch the text far enough. If this is correct it would fit the behaviour [observed by jackie_w](https://www.mobileread.com/forums/showpost.php?p=3812607&postcount=60) on the mobileRead forums This change may create a few pure-whitespace kobo spans when a sentence has some trailing whitespace, which I think could be ignored. ## Test results This seems to fix most issues with optimizeLegibilty. I'm still testing but the improvements I'm seeing are substantial enough that I think it might be best if other people try this out as well. Even if we can't fix everything it improves things *a lot*. Only niggle is that the regexp only works for proper sentences, there are still a couple of issues with things like *emphasis* where a span might end up with trailing whitespace. These occur when the text ends in whitespace mid sentence.
jgoguen · Oct 13, 2024 · 296783a · 296783a
2 parents 327f809 + d63e6b4
commit 296783a
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 57 deletions.
diff --git a/container.py b/container.py
@@ -87,7 +87,7 @@
 MS_CRUFT_RE_1 = re.compile(r"<o:p>\s*</o:p>", re.UNICODE | re.MULTILINE)
 MS_CRUFT_RE_2 = re.compile(r"(?i)</?st1:\w+>", re.UNICODE | re.MULTILINE)
 TEXT_SPLIT_RE = re.compile(
-    r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?\s*)',
+    r'(\S.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]*(?=\s)|(?=\s*$)))',
     re.UNICODE | re.MULTILINE,
 )
 
@@ -577,10 +577,7 @@ def _add_kobo_spans_to_node(
 
         # the node text is converted to spans
         if node_text is not None:
-            if not self._append_kobo_spans_from_text(node, node_text, name):
-                # didn't add spans, restore text
-                node.text = node_text
-            else:
+            if self._append_kobo_spans_from_text(node, node_text, name):
                 self.paragraph_counter[name] += 1
 
         # re-add the node children
@@ -591,10 +588,7 @@ def _add_kobo_spans_to_node(
             node.append(self._add_kobo_spans_to_node(child, name))
             # the child tail is converted to spans
             if child_tail is not None:
-                if not self._append_kobo_spans_from_text(node, child_tail, name):
-                    # didn't add spans, restore tail on last child
-                    node[-1].tail = child_tail
-                else:
+                if self._append_kobo_spans_from_text(node, child_tail, name):
                     self.paragraph_counter[name] += 1
 
         return node
@@ -606,22 +600,18 @@ def _append_kobo_spans_from_text(
             self.log.error(f"[{name}] No text passed, can't add spans")
             return False
 
-        # if text is only whitespace, don't add spans
-        if text.strip() == "":
-            self.log.warning(f"[{name}] Found only whitespace, not adding spans")
-            return False
-
         # split text in sentences
         groups = TEXT_SPLIT_RE.split(text)
-        # remove empty strings resulting from split()
-        groups = [g for g in groups if g != ""]
 
-        # TODO: To match Kobo KePubs, the trailing whitespace needs to
-        # be prepended to the next group. Probably equivalent to make
-        # sure the space stays in the span at the end.
-        # add each sentence in its own span
+        # append first group (whitespace) as text
+        if len(node) == 0:
+            node.text     = groups[0]
+        else:
+            node[-1].tail = groups[0]
+
+        # append each sentence in its own span
         segment_counter = 1
-        for g in groups:
+        for g, ws in zip(groups[1::2], groups[2::2]):
             span = etree.Element(
                 f"{{{XHTML_NAMESPACE}}}span",
                 attrib={
@@ -630,7 +620,8 @@ def _append_kobo_spans_from_text(
                 },
             )
             span.text = g
+            span.tail = ws
             node.append(span)
             segment_counter += 1
 
-        return True
+        return len(groups) > 1  # Return true if any spans were added.
diff --git a/tests/test_container.py b/tests/test_container.py
@@ -89,7 +89,9 @@ def test_add_html_file(self):
         self.assertIn(self.container.mime_map[container_name], container.HTML_MIMETYPES)
         self.assertIn("content.opf", self.container.dirtied)
 
-    def __run_added_test(self, expect_changed, added_func):  # type: (bool, Callable) -> None
+    def __run_added_test(
+        self, expect_changed, added_func
+    ):  # type: (bool, Callable) -> None
         if expect_changed:
             source_file = self.files["test_without_spans_with_comments"]
         else:
@@ -259,38 +261,53 @@ def test_add_js(self):
             )
         )
 
+    def __run_single_node_test(self, text, text_only=False, number_of_sentences=None):
+        self.container.paragraph_counter = defaultdict(lambda: 1)
+        node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p")
+
+        if text_only:
+            self.assertTrue(
+                self.container._append_kobo_spans_from_text(node, text, "test")
+            )
+        else:
+            node.text = text
+            node = self.container._add_kobo_spans_to_node(node, "test")
+
+        # check number of sentences
+        if number_of_sentences is not None:
+            self.assertEqual(len(node.getchildren()), number_of_sentences)
+
+        for span in node.getchildren():
+            # spans should not end in whitespace (PR#191), and be nonempty
+            self.assertFalse(re.match(r'\s', span.text[-1]))
+            # tail of span should *only* be whitespace
+            self.assertTrue(re.match(r'\s*', span.tail or ''))
+
+            # attrib is technically of type lxml.etree._Attrib, but functionally
+            # it's a dict. Cast it here to make assertDictEqual() happy.
+            self.assertDictEqual(
+                dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"}
+            )
+
+        # remaining text should only contain whitespace
+        self.assertTrue(re.match(r'\s*', node.text or ''))
+
+        # complete text should remain the same
+        self.assertEqual(''.join(node.itertext()), text)
+
     def test_add_spans_to_text(self):
         text_samples = [
             "Hello, World!",
             "    Hello, World!",
             "Hello, World!    ",
             "    Hello, World!    ",
             "\n\n    GIF is pronounced as it's spelled.\n   ",
+            " \"Yes, but I asked 'Why?'\" "
         ]
 
         for text in text_samples:
             for text_only in {True, False}:
-                self.container.paragraph_counter = defaultdict(lambda: 1)
-                node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p")
-
-                if text_only:
-                    self.assertTrue(
-                        self.container._append_kobo_spans_from_text(node, text, "test")
-                    )
-                else:
-                    node.text = text
-                    node = self.container._add_kobo_spans_to_node(node, "test")
-
-                self.assertEqual(len(node.getchildren()), 1)
-
-                span = node.getchildren()[0]
-                self.assertIsNone(span.tail)
-                # attrib is technically of type lxml.etree._Attrib, but functionally
-                # it's a dict. Cast it here to make assertDictEqual() happy.
-                self.assertDictEqual(
-                    dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"}
-                )
-                self.assertEqual(span.text, text)
+                self.__run_single_node_test(text, text_only=text_only, number_of_sentences=1)
 
     def __run_multiple_node_test(self, text_nodes):  # type: (List[str]) -> None
         html = "<div>"
@@ -305,19 +322,21 @@ def __run_multiple_node_test(self, text_nodes):  # type: (List[str]) -> None
 
         node = self.container._add_kobo_spans_to_node(node, "test")
         children = node.getchildren()
-        self.assertEqual(len(children), len(text_nodes))
 
-        for node_idx, node in enumerate(children):
+        # check each paragraph
+        self.assertEqual(len(text_nodes), len(children))
+        for text, node in zip(text_nodes, children):
             spans = node.getchildren()
-            text_chunks = [
-                g
-                for g in container.TEXT_SPLIT_RE.split(text_nodes[node_idx])
-                if g.strip() != ""
-            ]
-            self.assertEqual(len(spans), len(text_chunks))
+            # note: this regexp isn't being tested (it's known to be fallible, but good enough)
+            sentences = container.TEXT_SPLIT_RE.findall(text)
+
+            # check spans are added correctly for phrase individually
+            self.__run_single_node_test(text, text_only=False, number_of_sentences=len(sentences))
 
-            for text_idx, text_chunk in enumerate(text_chunks):
-                self.assertEqual(spans[text_idx].text, text_chunk)
+            # assert span is correctly split into sentences
+            self.assertEqual(len(spans), len(sentences))
+            for span, sentence in zip(spans, sentences):
+                self.assertEqual(span.text, sentence)
 
     def test_add_spans_to_multiple_sentences(self):
         self.__run_multiple_node_test(
@@ -341,7 +360,7 @@ def test_gitub_pr_106(self):
 
         pre_span = self.container.parsed(container_name)
         text_chunks = [
-            g.lstrip("\n\t")
+            g
             for g in pre_span.xpath(
                 "//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE}
             )
@@ -351,12 +370,12 @@ def test_gitub_pr_106(self):
 
         post_span = self.container.parsed(container_name)
         post_text_chunks = [
-            g.lstrip("\n\t")
+            g
             for g in post_span.xpath(
                 "//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE}
             )
         ]
-        self.assertListEqual(text_chunks, post_text_chunks)
+        self.assertEqual(''.join(text_chunks), ''.join(post_text_chunks))
 
     def test_github_issue_136(self):
         source_file = os.path.join(self.testfile_basedir, "page_github_136.html")