From 4c88c17342ec60b3c6f460153ce9f7dbe043439e Mon Sep 17 00:00:00 2001 From: Shiandow Date: Sat, 5 Oct 2024 18:38:35 +0200 Subject: [PATCH 1/7] Avoid trailing whitespace in kobo spans This seems like a small change but has huge implications for the way text is rendered with optimizeLegibility and geometricPrecision. Signed-off-by: Shiandow --- container.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/container.py b/container.py index d376a7d..affa3a9 100644 --- a/container.py +++ b/container.py @@ -87,7 +87,7 @@ MS_CRUFT_RE_1 = re.compile(r"\s*", re.UNICODE | re.MULTILINE) MS_CRUFT_RE_2 = re.compile(r"(?i)", re.UNICODE | re.MULTILINE) TEXT_SPLIT_RE = re.compile( - r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?\s*)', + r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?)(?=\s)', re.UNICODE | re.MULTILINE, ) @@ -616,9 +616,6 @@ def _append_kobo_spans_from_text( # remove empty strings resulting from split() groups = [g for g in groups if g != ""] - # TODO: To match Kobo KePubs, the trailing whitespace needs to - # be prepended to the next group. Probably equivalent to make - # sure the space stays in the span at the end. # add each sentence in its own span segment_counter = 1 for g in groups: From 4e1a432b55520a7cc6c7bbe63ec266fc70891d56 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Sat, 5 Oct 2024 21:17:43 +0200 Subject: [PATCH 2/7] Also stop sentence on last non-whitespace character before end of line. 
Signed-off-by: Shiandow --- container.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/container.py b/container.py index affa3a9..6e4ae5e 100644 --- a/container.py +++ b/container.py @@ -87,7 +87,7 @@ MS_CRUFT_RE_1 = re.compile(r"\s*", re.UNICODE | re.MULTILINE) MS_CRUFT_RE_2 = re.compile(r"(?i)", re.UNICODE | re.MULTILINE) TEXT_SPLIT_RE = re.compile( - r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?)(?=\s)', + r'(\s*.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?|\S(?=\s*$)))(?=\s)', re.UNICODE | re.MULTILINE, ) From 452236efbfce0cac336ee438531700bc11c5da18 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Tue, 8 Oct 2024 11:33:55 +0200 Subject: [PATCH 3/7] Stop linter nagging --- tests/test_container.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_container.py b/tests/test_container.py index 322c4c8..ce867d4 100644 --- a/tests/test_container.py +++ b/tests/test_container.py @@ -89,7 +89,9 @@ def test_add_html_file(self): self.assertIn(self.container.mime_map[container_name], container.HTML_MIMETYPES) self.assertIn("content.opf", self.container.dirtied) - def __run_added_test(self, expect_changed, added_func): # type: (bool, Callable) -> None + def __run_added_test( + self, expect_changed, added_func + ): # type: (bool, Callable) -> None if expect_changed: source_file = self.files["test_without_spans_with_comments"] else: From 78cd242fbea31b82915547cfcc4cd0c7f3abb86f Mon Sep 17 00:00:00 2001 From: Shiandow Date: Tue, 8 Oct 2024 11:34:27 +0200 Subject: [PATCH 4/7] Put trailing/leading whitespace outside of spans --- container.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/container.py b/container.py index 6e4ae5e..d52d644 100644 --- a/container.py +++ b/container.py @@ -87,7 +87,7 @@ MS_CRUFT_RE_1 = re.compile(r"\s*", re.UNICODE | re.MULTILINE) MS_CRUFT_RE_2 = re.compile(r"(?i)", re.UNICODE | re.MULTILINE) TEXT_SPLIT_RE = re.compile( - 
r'(\s*.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?|\S(?=\s*$)))(?=\s)', + r'(\S.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]*(?=\s)|(?=\s*$)))', re.UNICODE | re.MULTILINE, ) @@ -577,10 +577,7 @@ def _add_kobo_spans_to_node( # the node text is converted to spans if node_text is not None: - if not self._append_kobo_spans_from_text(node, node_text, name): - # didn't add spans, restore text - node.text = node_text - else: + if self._append_kobo_spans_from_text(node, node_text, name): self.paragraph_counter[name] += 1 # re-add the node children @@ -591,10 +588,7 @@ def _add_kobo_spans_to_node( node.append(self._add_kobo_spans_to_node(child, name)) # the child tail is converted to spans if child_tail is not None: - if not self._append_kobo_spans_from_text(node, child_tail, name): - # didn't add spans, restore tail on last child - node[-1].tail = child_tail - else: + if self._append_kobo_spans_from_text(node, child_tail, name): self.paragraph_counter[name] += 1 return node @@ -606,19 +600,18 @@ def _append_kobo_spans_from_text( self.log.error(f"[{name}] No text passed, can't add spans") return False - # if text is only whitespace, don't add spans - if text.strip() == "": - self.log.warning(f"[{name}] Found only whitespace, not adding spans") - return False - # split text in sentences groups = TEXT_SPLIT_RE.split(text) - # remove empty strings resulting from split() - groups = [g for g in groups if g != ""] - # add each sentence in its own span - segment_counter = 1 - for g in groups: + # append first group (whitespace) as text + if len(node) == 0: + node.text = groups[0] + else: + node[-1].tail = groups[0] + + # append each sentence in its own span + segment_counter = 1 + for g, ws in zip(groups[1::2], groups[2::2]): span = etree.Element( f"{{{XHTML_NAMESPACE}}}span", attrib={ @@ -627,7 +620,8 @@ def _append_kobo_spans_from_text( }, ) span.text = g + span.tail = ws node.append(span) segment_counter += 1 - return True + return len(groups) > 1 # Return true if 
any spans were added. From 5147596a763de60afbb27ef3f9fa596ed93cf026 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Tue, 8 Oct 2024 19:06:46 +0200 Subject: [PATCH 5/7] Attempt to fix tests Some of these were just replicating the implementation, which seemed less than useful. I've tried to test the important properties. The PR#106 test needs to be weakened a bit but the weakened test should still catch the bug. --- container.py | 4 +- tests/test_container.py | 81 +++++++++++++++++++++++++---------------- 2 files changed, 51 insertions(+), 34 deletions(-) diff --git a/container.py b/container.py index d52d644..fd916b7 100644 --- a/container.py +++ b/container.py @@ -610,7 +610,7 @@ def _append_kobo_spans_from_text( node[-1].tail = groups[0] # append each sentence in its own span - segment_counter = 1 + segment_counter = 1 for g, ws in zip(groups[1::2], groups[2::2]): span = etree.Element( f"{{{XHTML_NAMESPACE}}}span", @@ -624,4 +624,4 @@ def _append_kobo_spans_from_text( node.append(span) segment_counter += 1 - return len(groups) > 1 # Return true if any spans were added. + return len(groups) > 1 # Return true if any spans were added. 
diff --git a/tests/test_container.py b/tests/test_container.py index ce867d4..9eea576 100644 --- a/tests/test_container.py +++ b/tests/test_container.py @@ -261,6 +261,40 @@ def test_add_js(self): ) ) + def __run_single_node_test(self, sentence, text_only=False, number_of_sentences=None): + self.container.paragraph_counter = defaultdict(lambda: 1) + node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p") + + if text_only: + self.assertTrue( + self.container._append_kobo_spans_from_text(node, sentence, "test") + ) + else: + node.text = sentence + node = self.container._add_kobo_spans_to_node(node, "test") + + # check number of sentences + if number_of_sentences is not None: + self.assertEqual(len(node.getchildren()), number_of_sentences) + + for span in node.getchildren(): + # spans should not end in whitespace (PR#191) + self.assertFalse(re.match(r'\s', span.tail[-1])) + # tail of span should *only* be whitespace + self.assertTrue(re.match(r'\s*', span.tail[-1] or '')) + + # attrib is technically of type lxml.etree._Attrib, but functionally + # it's a dict. Cast it here to make assertDictEqual() happy. + self.assertDictEqual( + dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"} + ) + + # remaining text should only contain whitespace + self.assertTrue(re.match(r'\s*', node.text or '')) + + # complete text should remain the same + self.assertEqual(''.join(node.itertext()), text) + def test_add_spans_to_text(self): text_samples = [ "Hello, World!", @@ -268,31 +302,12 @@ def test_add_spans_to_text(self): "Hello, World! ", " Hello, World! 
", "\n\n GIF is pronounced as it's spelled.\n ", + " \"Yes, but I asked 'Why?'\" " ] for text in text_samples: for text_only in {True, False}: - self.container.paragraph_counter = defaultdict(lambda: 1) - node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p") - - if text_only: - self.assertTrue( - self.container._append_kobo_spans_from_text(node, text, "test") - ) - else: - node.text = text - node = self.container._add_kobo_spans_to_node(node, "test") - - self.assertEqual(len(node.getchildren()), 1) - - span = node.getchildren()[0] - self.assertIsNone(span.tail) - # attrib is technically of type lxml.etree._Attrib, but functionally - # it's a dict. Cast it here to make assertDictEqual() happy. - self.assertDictEqual( - dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"} - ) - self.assertEqual(span.text, text) + self.__run_single_node_test(text, text_only=text_only, number_of_sentences=1) def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None html = "
" @@ -307,19 +322,21 @@ def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None node = self.container._add_kobo_spans_to_node(node, "test") children = node.getchildren() - self.assertEqual(len(children), len(text_nodes)) - for node_idx, node in enumerate(children): + # check each paragraph + self.assertEqual(len(text_nodes), len(children)) + for text, node in zip(text_nodes, children): spans = node.getchildren() - text_chunks = [ - g - for g in container.TEXT_SPLIT_RE.split(text_nodes[node_idx]) - if g.strip() != "" - ] - self.assertEqual(len(spans), len(text_chunks)) + # note: this regexp isn't being tested (it's known to be fallible, but good enough) + sentences = container.TEXT_SPLIT_RE.findall(text) + + # check spans are added correctly for phrase individually + self.__run_single_node_test(text, text_only=False, number_of_sentences=len(sentences)) - for text_idx, text_chunk in enumerate(text_chunks): - self.assertEqual(spans[text_idx].text, text_chunk) + # assert span is correctly split into sentences + self.assertEqual(len(spans), len(sentences)) + for span, sentence in zip(spans, sentences): + self.assertEqual(span.text, sentence) def test_add_spans_to_multiple_sentences(self): self.__run_multiple_node_test( @@ -358,7 +375,7 @@ def test_gitub_pr_106(self): "//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE} ) ] - self.assertListEqual(text_chunks, post_text_chunks) + self.assertEqual(''.join(text_chunks), ''.join(post_text_chunks)) def test_github_issue_136(self): source_file = os.path.join(self.testfile_basedir, "page_github_136.html") From d46894f76159ade3be8d0acc167a7ee8baa86006 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Tue, 8 Oct 2024 20:39:21 +0200 Subject: [PATCH 6/7] Fix name --- tests/test_container.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_container.py b/tests/test_container.py index 9eea576..eef464c 100644 --- a/tests/test_container.py +++ b/tests/test_container.py 
@@ -261,16 +261,16 @@ def test_add_js(self): ) ) - def __run_single_node_test(self, sentence, text_only=False, number_of_sentences=None): + def __run_single_node_test(self, text, text_only=False, number_of_sentences=None): self.container.paragraph_counter = defaultdict(lambda: 1) node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p") if text_only: self.assertTrue( - self.container._append_kobo_spans_from_text(node, sentence, "test") + self.container._append_kobo_spans_from_text(node, text, "test") ) else: - node.text = sentence + node.text = text node = self.container._add_kobo_spans_to_node(node, "test") # check number of sentences From d63e6b49f606f1bffe512faa441b0ad5eb691c69 Mon Sep 17 00:00:00 2001 From: Shiandow Date: Wed, 9 Oct 2024 02:28:49 +0200 Subject: [PATCH 7/7] Attempt to pass tests The githup_106 test is a bit odd, because I'm not sure what changed. Whitespace should be identical though, even without the lstrip. Which is easy enough to check. --- tests/test_container.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_container.py b/tests/test_container.py index eef464c..d09411e 100644 --- a/tests/test_container.py +++ b/tests/test_container.py @@ -278,10 +278,10 @@ def __run_single_node_test(self, text, text_only=False, number_of_sentences=None self.assertEqual(len(node.getchildren()), number_of_sentences) for span in node.getchildren(): - # spans should not end in whitespace (PR#191) - self.assertFalse(re.match(r'\s', span.tail[-1])) + # spans should not end in whitespace (PR#191), and be nonempty + self.assertFalse(re.match(r'\s', span.text[-1])) # tail of span should *only* be whitespace - self.assertTrue(re.match(r'\s*', span.tail[-1] or '')) + self.assertTrue(re.match(r'\s*', span.tail or '')) # attrib is technically of type lxml.etree._Attrib, but functionally # it's a dict. Cast it here to make assertDictEqual() happy. 
@@ -360,7 +360,7 @@ def test_gitub_pr_106(self): pre_span = self.container.parsed(container_name) text_chunks = [ - g.lstrip("\n\t") + g for g in pre_span.xpath( "//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE} ) @@ -370,7 +370,7 @@ def test_gitub_pr_106(self): post_span = self.container.parsed(container_name) post_text_chunks = [ - g.lstrip("\n\t") + g for g in post_span.xpath( "//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE} )