From 4c88c17342ec60b3c6f460153ce9f7dbe043439e Mon Sep 17 00:00:00 2001 From: Shiandow Date: Sat, 5 Oct 2024 18:38:35 +0200 Subject: [PATCH 1/7] Avoid trailing whitespace in kobo spans This seems like a small change but has huge implications for the way text is rendered with optimizeLegibility and geometricPrecision. Signed-off-by: Shiandow --- container.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/container.py b/container.py index d376a7d..affa3a9 100644 --- a/container.py +++ b/container.py @@ -87,7 +87,7 @@ MS_CRUFT_RE_1 = re.compile(r"\s*", re.UNICODE | re.MULTILINE) MS_CRUFT_RE_2 = re.compile(r"(?i)", re.UNICODE | re.MULTILINE) TEXT_SPLIT_RE = re.compile( - r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?\s*)', + r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?)(?=\s)', re.UNICODE | re.MULTILINE, ) @@ -616,9 +616,6 @@ def _append_kobo_spans_from_text( # remove empty strings resulting from split() groups = [g for g in groups if g != ""] - # TODO: To match Kobo KePubs, the trailing whitespace needs to - # be prepended to the next group. Probably equivalent to make - # sure the space stays in the span at the end. # add each sentence in its own span segment_counter = 1 for g in groups: From 4e1a432b55520a7cc6c7bbe63ec266fc70891d56 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Sat, 5 Oct 2024 21:17:43 +0200 Subject: [PATCH 2/7] Also stop sentence on last non-whitespace character before end of line. 
Signed-off-by: Shiandow --- container.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/container.py b/container.py index affa3a9..6e4ae5e 100644 --- a/container.py +++ b/container.py @@ -87,7 +87,7 @@ MS_CRUFT_RE_1 = re.compile(r"\s*", re.UNICODE | re.MULTILINE) MS_CRUFT_RE_2 = re.compile(r"(?i)", re.UNICODE | re.MULTILINE) TEXT_SPLIT_RE = re.compile( - r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?)(?=\s)', + r'(\s*.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?|\S(?=\s*$)))(?=\s)', re.UNICODE | re.MULTILINE, ) From 452236efbfce0cac336ee438531700bc11c5da18 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Tue, 8 Oct 2024 11:33:55 +0200 Subject: [PATCH 3/7] Stop linter nagging --- tests/test_container.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_container.py b/tests/test_container.py index 322c4c8..ce867d4 100644 --- a/tests/test_container.py +++ b/tests/test_container.py @@ -89,7 +89,9 @@ def test_add_html_file(self): self.assertIn(self.container.mime_map[container_name], container.HTML_MIMETYPES) self.assertIn("content.opf", self.container.dirtied) - def __run_added_test(self, expect_changed, added_func): # type: (bool, Callable) -> None + def __run_added_test( + self, expect_changed, added_func + ): # type: (bool, Callable) -> None if expect_changed: source_file = self.files["test_without_spans_with_comments"] else: From 78cd242fbea31b82915547cfcc4cd0c7f3abb86f Mon Sep 17 00:00:00 2001 From: Shiandow Date: Tue, 8 Oct 2024 11:34:27 +0200 Subject: [PATCH 4/7] Put trailing/leading whitespace outside of spans --- container.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/container.py b/container.py index 6e4ae5e..d52d644 100644 --- a/container.py +++ b/container.py @@ -87,7 +87,7 @@ MS_CRUFT_RE_1 = re.compile(r"\s*", re.UNICODE | re.MULTILINE) MS_CRUFT_RE_2 = re.compile(r"(?i)", re.UNICODE | re.MULTILINE) TEXT_SPLIT_RE = re.compile( - 
r'(\s*.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?|\S(?=\s*$)))(?=\s)', + r'(\S.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]*(?=\s)|(?=\s*$)))', re.UNICODE | re.MULTILINE, ) @@ -577,10 +577,7 @@ def _add_kobo_spans_to_node( # the node text is converted to spans if node_text is not None: - if not self._append_kobo_spans_from_text(node, node_text, name): - # didn't add spans, restore text - node.text = node_text - else: + if self._append_kobo_spans_from_text(node, node_text, name): self.paragraph_counter[name] += 1 # re-add the node children @@ -591,10 +588,7 @@ def _add_kobo_spans_to_node( node.append(self._add_kobo_spans_to_node(child, name)) # the child tail is converted to spans if child_tail is not None: - if not self._append_kobo_spans_from_text(node, child_tail, name): - # didn't add spans, restore tail on last child - node[-1].tail = child_tail - else: + if self._append_kobo_spans_from_text(node, child_tail, name): self.paragraph_counter[name] += 1 return node @@ -606,19 +600,18 @@ def _append_kobo_spans_from_text( self.log.error(f"[{name}] No text passed, can't add spans") return False - # if text is only whitespace, don't add spans - if text.strip() == "": - self.log.warning(f"[{name}] Found only whitespace, not adding spans") - return False - # split text in sentences groups = TEXT_SPLIT_RE.split(text) - # remove empty strings resulting from split() - groups = [g for g in groups if g != ""] - # add each sentence in its own span - segment_counter = 1 - for g in groups: + # append first group (whitespace) as text + if len(node) == 0: + node.text = groups[0] + else: + node[-1].tail = groups[0] + + # append each sentence in its own span + segment_counter = 1 + for g, ws in zip(groups[1::2], groups[2::2]): span = etree.Element( f"{{{XHTML_NAMESPACE}}}span", attrib={ @@ -627,7 +620,8 @@ def _append_kobo_spans_from_text( }, ) span.text = g + span.tail = ws node.append(span) segment_counter += 1 - return True + return len(groups) > 1 # Return true if 
any spans were added. From 5147596a763de60afbb27ef3f9fa596ed93cf026 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Tue, 8 Oct 2024 19:06:46 +0200 Subject: [PATCH 5/7] Attempt to fix tests Some of these were just replicating the implementation, which seemed less than useful. I've tried to test the important properties. The PR#106 test needs to be weakened a bit but the weakened test should still catch the bug. --- container.py | 4 +- tests/test_container.py | 81 +++++++++++++++++++++++++---------------- 2 files changed, 51 insertions(+), 34 deletions(-) diff --git a/container.py b/container.py index d52d644..fd916b7 100644 --- a/container.py +++ b/container.py @@ -610,7 +610,7 @@ def _append_kobo_spans_from_text( node[-1].tail = groups[0] # append each sentence in its own span - segment_counter = 1 + segment_counter = 1 for g, ws in zip(groups[1::2], groups[2::2]): span = etree.Element( f"{{{XHTML_NAMESPACE}}}span", @@ -624,4 +624,4 @@ def _append_kobo_spans_from_text( node.append(span) segment_counter += 1 - return len(groups) > 1 # Return true if any spans were added. + return len(groups) > 1 # Return true if any spans were added. 
diff --git a/tests/test_container.py b/tests/test_container.py index ce867d4..9eea576 100644 --- a/tests/test_container.py +++ b/tests/test_container.py @@ -261,6 +261,40 @@ def test_add_js(self): ) ) + def __run_single_node_test(self, sentence, text_only=False, number_of_sentences=None): + self.container.paragraph_counter = defaultdict(lambda: 1) + node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p") + + if text_only: + self.assertTrue( + self.container._append_kobo_spans_from_text(node, sentence, "test") + ) + else: + node.text = sentence + node = self.container._add_kobo_spans_to_node(node, "test") + + # check number of sentences + if number_of_sentences is not None: + self.assertEqual(len(node.getchildren()), number_of_sentences) + + for span in node.getchildren(): + # spans should not end in whitespace (PR#191) + self.assertFalse(re.match(r'\s', span.tail[-1])) + # tail of span should *only* be whitespace + self.assertTrue(re.match(r'\s*', span.tail[-1] or '')) + + # attrib is technically of type lxml.etree._Attrib, but functionally + # it's a dict. Cast it here to make assertDictEqual() happy. + self.assertDictEqual( + dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"} + ) + + # remaining text should only contain whitespace + self.assertTrue(re.match(r'\s*', node.text or '')) + + # complete text should remain the same + self.assertEqual(''.join(node.itertext()), text) + def test_add_spans_to_text(self): text_samples = [ "Hello, World!", @@ -268,31 +302,12 @@ def test_add_spans_to_text(self): "Hello, World! ", " Hello, World! 
", "\n\n GIF is pronounced as it's spelled.\n ", + " \"Yes, but I asked 'Why?'\" " ] for text in text_samples: for text_only in {True, False}: - self.container.paragraph_counter = defaultdict(lambda: 1) - node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p") - - if text_only: - self.assertTrue( - self.container._append_kobo_spans_from_text(node, text, "test") - ) - else: - node.text = text - node = self.container._add_kobo_spans_to_node(node, "test") - - self.assertEqual(len(node.getchildren()), 1) - - span = node.getchildren()[0] - self.assertIsNone(span.tail) - # attrib is technically of type lxml.etree._Attrib, but functionally - # it's a dict. Cast it here to make assertDictEqual() happy. - self.assertDictEqual( - dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"} - ) - self.assertEqual(span.text, text) + self.__run_single_node_test(text, text_only=text_only, number_of_sentences=1) def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None html = "
" @@ -307,19 +322,21 @@ def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None node = self.container._add_kobo_spans_to_node(node, "test") children = node.getchildren() - self.assertEqual(len(children), len(text_nodes)) - for node_idx, node in enumerate(children): + # check each paragraph + self.assertEqual(len(text_nodes), len(children)) + for text, node in zip(text_nodes, children): spans = node.getchildren() - text_chunks = [ - g - for g in container.TEXT_SPLIT_RE.split(text_nodes[node_idx]) - if g.strip() != "" - ] - self.assertEqual(len(spans), len(text_chunks)) + # note: this regexp isn't being tested (it's known to be fallible, but good enough) + sentences = container.TEXT_SPLIT_RE.findall(text) + + # check spans are added correctly for phrase individually + self.__run_single_node_test(text, text_only=False, number_of_sentences=len(sentences)) - for text_idx, text_chunk in enumerate(text_chunks): - self.assertEqual(spans[text_idx].text, text_chunk) + # assert span is correctly split into sentences + self.assertEqual(len(spans), len(sentences)) + for span, sentence in zip(spans, sentences): + self.assertEqual(span.text, sentence) def test_add_spans_to_multiple_sentences(self): self.__run_multiple_node_test( @@ -358,7 +375,7 @@ def test_gitub_pr_106(self): "//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE} ) ] - self.assertListEqual(text_chunks, post_text_chunks) + self.assertEqual(''.join(text_chunks), ''.join(post_text_chunks)) def test_github_issue_136(self): source_file = os.path.join(self.testfile_basedir, "page_github_136.html") From d46894f76159ade3be8d0acc167a7ee8baa86006 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Tue, 8 Oct 2024 20:39:21 +0200 Subject: [PATCH 6/7] Fix name --- tests/test_container.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_container.py b/tests/test_container.py index 9eea576..eef464c 100644 --- a/tests/test_container.py +++ b/tests/test_container.py 
@@ -261,16 +261,16 @@ def test_add_js(self): ) ) - def __run_single_node_test(self, sentence, text_only=False, number_of_sentences=None): + def __run_single_node_test(self, text, text_only=False, number_of_sentences=None): self.container.paragraph_counter = defaultdict(lambda: 1) node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p") if text_only: self.assertTrue( - self.container._append_kobo_spans_from_text(node, sentence, "test") + self.container._append_kobo_spans_from_text(node, text, "test") ) else: - node.text = sentence + node.text = text node = self.container._add_kobo_spans_to_node(node, "test") # check number of sentences From d63e6b49f606f1bffe512faa441b0ad5eb691c69 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Wed, 9 Oct 2024 02:28:49 +0200 Subject: [PATCH 7/7] Attempt to pass tests The githup_106 test is a bit odd, because I'm not sure what changed. Whitespace should be identical though, even without the lstrip. Which is easy enough to check. --- tests/test_container.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_container.py b/tests/test_container.py index eef464c..d09411e 100644 --- a/tests/test_container.py +++ b/tests/test_container.py @@ -278,10 +278,10 @@ def __run_single_node_test(self, text, text_only=False, number_of_sentences=None self.assertEqual(len(node.getchildren()), number_of_sentences) for span in node.getchildren(): - # spans should not end in whitespace (PR#191) - self.assertFalse(re.match(r'\s', span.tail[-1])) + # spans should not end in whitespace (PR#191), and be nonempty + self.assertFalse(re.match(r'\s', span.text[-1])) # tail of span should *only* be whitespace - self.assertTrue(re.match(r'\s*', span.tail[-1] or '')) + self.assertTrue(re.match(r'\s*', span.tail or '')) # attrib is technically of type lxml.etree._Attrib, but functionally # it's a dict. Cast it here to make assertDictEqual() happy. 
@@ -360,7 +360,7 @@ def test_gitub_pr_106(self): pre_span = self.container.parsed(container_name) text_chunks = [ - g.lstrip("\n\t") + g for g in pre_span.xpath( "//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE} ) @@ -370,7 +370,7 @@ def test_gitub_pr_106(self): post_span = self.container.parsed(container_name) post_text_chunks = [ - g.lstrip("\n\t") + g for g in post_span.xpath( "//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE} )