Skip to content

Commit

Permalink
Avoid trailing whitespace in kobo spans (#174)
Browse files Browse the repository at this point in the history
# Implement long overdue TODO

## Description of change

This seems like a small change but has huge implications for the way
text is rendered with optimizeLegibilty and geometricPrecision. Really
all it does is move the trailing whitespace to the front of the next
group, just like the TODO note suggested ought to be done.

I'm guessing at the exact mechanism, but for *unclear* reasons whatever
justification program Kobo method uses it apparently does not expand or
contract the last space in a kobo span. (unless using optimizeSpeed,
maybe?) This can causes weird looking text, and in the worst case may
even expand the space between letters just to stretch the text far
enough. If this is correct it would fit the behaviour [observed by
jackie_w](https://www.mobileread.com/forums/showpost.php?p=3812607&postcount=60)
on the mobileRead forums

This change may create a few pure-whitespace kobo spans when a sentence
has some trailing whitespace, which I think could be ignored.

## Test results

This seems to fix most issues with optimizeLegibilty. I'm still testing
but the improvements I'm seeing are substantial enough that I think it
might be best if other people try this out as well. Even if we can't fix
everything it improves things *a lot*.

Only niggle is that the regexp only works for proper sentences, there
are still a couple of issues with things like *emphasis* where a span
might end up with trailing whitespace. These occur when the text ends in
whitespace mid sentence.
  • Loading branch information
jgoguen authored Oct 13, 2024
2 parents 327f809 + d63e6b4 commit 296783a
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 57 deletions.
35 changes: 13 additions & 22 deletions container.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
MS_CRUFT_RE_1 = re.compile(r"<o:p>\s*</o:p>", re.UNICODE | re.MULTILINE)
MS_CRUFT_RE_2 = re.compile(r"(?i)</?st1:\w+>", re.UNICODE | re.MULTILINE)
TEXT_SPLIT_RE = re.compile(
r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?\s*)',
r'(\S.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]*(?=\s)|(?=\s*$)))',
re.UNICODE | re.MULTILINE,
)

Expand Down Expand Up @@ -577,10 +577,7 @@ def _add_kobo_spans_to_node(

# the node text is converted to spans
if node_text is not None:
if not self._append_kobo_spans_from_text(node, node_text, name):
# didn't add spans, restore text
node.text = node_text
else:
if self._append_kobo_spans_from_text(node, node_text, name):
self.paragraph_counter[name] += 1

# re-add the node children
Expand All @@ -591,10 +588,7 @@ def _add_kobo_spans_to_node(
node.append(self._add_kobo_spans_to_node(child, name))
# the child tail is converted to spans
if child_tail is not None:
if not self._append_kobo_spans_from_text(node, child_tail, name):
# didn't add spans, restore tail on last child
node[-1].tail = child_tail
else:
if self._append_kobo_spans_from_text(node, child_tail, name):
self.paragraph_counter[name] += 1

return node
Expand All @@ -606,22 +600,18 @@ def _append_kobo_spans_from_text(
self.log.error(f"[{name}] No text passed, can't add spans")
return False

# if text is only whitespace, don't add spans
if text.strip() == "":
self.log.warning(f"[{name}] Found only whitespace, not adding spans")
return False

# split text in sentences
groups = TEXT_SPLIT_RE.split(text)
# remove empty strings resulting from split()
groups = [g for g in groups if g != ""]

# TODO: To match Kobo KePubs, the trailing whitespace needs to
# be prepended to the next group. Probably equivalent to make
# sure the space stays in the span at the end.
# add each sentence in its own span
# append first group (whitespace) as text
if len(node) == 0:
node.text = groups[0]
else:
node[-1].tail = groups[0]

# append each sentence in its own span
segment_counter = 1
for g in groups:
for g, ws in zip(groups[1::2], groups[2::2]):
span = etree.Element(
f"{{{XHTML_NAMESPACE}}}span",
attrib={
Expand All @@ -630,7 +620,8 @@ def _append_kobo_spans_from_text(
},
)
span.text = g
span.tail = ws
node.append(span)
segment_counter += 1

return True
return len(groups) > 1 # Return true if any spans were added.
89 changes: 54 additions & 35 deletions tests/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,9 @@ def test_add_html_file(self):
self.assertIn(self.container.mime_map[container_name], container.HTML_MIMETYPES)
self.assertIn("content.opf", self.container.dirtied)

def __run_added_test(self, expect_changed, added_func): # type: (bool, Callable) -> None
def __run_added_test(
self, expect_changed, added_func
): # type: (bool, Callable) -> None
if expect_changed:
source_file = self.files["test_without_spans_with_comments"]
else:
Expand Down Expand Up @@ -259,38 +261,53 @@ def test_add_js(self):
)
)

def __run_single_node_test(self, text, text_only=False, number_of_sentences=None):
self.container.paragraph_counter = defaultdict(lambda: 1)
node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p")

if text_only:
self.assertTrue(
self.container._append_kobo_spans_from_text(node, text, "test")
)
else:
node.text = text
node = self.container._add_kobo_spans_to_node(node, "test")

# check number of sentences
if number_of_sentences is not None:
self.assertEqual(len(node.getchildren()), number_of_sentences)

for span in node.getchildren():
# spans should not end in whitespace (PR#191), and be nonempty
self.assertFalse(re.match(r'\s', span.text[-1]))
# tail of span should *only* be whitespace
self.assertTrue(re.match(r'\s*', span.tail or ''))

# attrib is technically of type lxml.etree._Attrib, but functionally
# it's a dict. Cast it here to make assertDictEqual() happy.
self.assertDictEqual(
dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"}
)

# remaining text should only contain whitespace
self.assertTrue(re.match(r'\s*', node.text or ''))

# complete text should remain the same
self.assertEqual(''.join(node.itertext()), text)

def test_add_spans_to_text(self):
text_samples = [
"Hello, World!",
" Hello, World!",
"Hello, World! ",
" Hello, World! ",
"\n\n GIF is pronounced as it's spelled.\n ",
" \"Yes, but I asked 'Why?'\" "
]

for text in text_samples:
for text_only in {True, False}:
self.container.paragraph_counter = defaultdict(lambda: 1)
node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p")

if text_only:
self.assertTrue(
self.container._append_kobo_spans_from_text(node, text, "test")
)
else:
node.text = text
node = self.container._add_kobo_spans_to_node(node, "test")

self.assertEqual(len(node.getchildren()), 1)

span = node.getchildren()[0]
self.assertIsNone(span.tail)
# attrib is technically of type lxml.etree._Attrib, but functionally
# it's a dict. Cast it here to make assertDictEqual() happy.
self.assertDictEqual(
dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"}
)
self.assertEqual(span.text, text)
self.__run_single_node_test(text, text_only=text_only, number_of_sentences=1)

def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None
html = "<div>"
Expand All @@ -305,19 +322,21 @@ def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None

node = self.container._add_kobo_spans_to_node(node, "test")
children = node.getchildren()
self.assertEqual(len(children), len(text_nodes))

for node_idx, node in enumerate(children):
# check each paragraph
self.assertEqual(len(text_nodes), len(children))
for text, node in zip(text_nodes, children):
spans = node.getchildren()
text_chunks = [
g
for g in container.TEXT_SPLIT_RE.split(text_nodes[node_idx])
if g.strip() != ""
]
self.assertEqual(len(spans), len(text_chunks))
# note: this regexp isn't being tested (it's known to be fallible, but good enough)
sentences = container.TEXT_SPLIT_RE.findall(text)

# check spans are added correctly for phrase individually
self.__run_single_node_test(text, text_only=False, number_of_sentences=len(sentences))

for text_idx, text_chunk in enumerate(text_chunks):
self.assertEqual(spans[text_idx].text, text_chunk)
# assert span is correctly split into sentences
self.assertEqual(len(spans), len(sentences))
for span, sentence in zip(spans, sentences):
self.assertEqual(span.text, sentence)

def test_add_spans_to_multiple_sentences(self):
self.__run_multiple_node_test(
Expand All @@ -341,7 +360,7 @@ def test_gitub_pr_106(self):

pre_span = self.container.parsed(container_name)
text_chunks = [
g.lstrip("\n\t")
g
for g in pre_span.xpath(
"//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE}
)
Expand All @@ -351,12 +370,12 @@ def test_gitub_pr_106(self):

post_span = self.container.parsed(container_name)
post_text_chunks = [
g.lstrip("\n\t")
g
for g in post_span.xpath(
"//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE}
)
]
self.assertListEqual(text_chunks, post_text_chunks)
self.assertEqual(''.join(text_chunks), ''.join(post_text_chunks))

def test_github_issue_136(self):
source_file = os.path.join(self.testfile_basedir, "page_github_136.html")
Expand Down

0 comments on commit 296783a

Please sign in to comment.