Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid trailing whitespace in kobo spans #174

Merged
merged 7 commits into from
Oct 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 13 additions & 22 deletions container.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
MS_CRUFT_RE_1 = re.compile(r"<o:p>\s*</o:p>", re.UNICODE | re.MULTILINE)
MS_CRUFT_RE_2 = re.compile(r"(?i)</?st1:\w+>", re.UNICODE | re.MULTILINE)
TEXT_SPLIT_RE = re.compile(
r'(\s*.*?[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]?\s*)',
r'(\S.*?(?:[\.\!\?\:][\'"\u201c\u201d\u2018\u2019\u2026]*(?=\s)|(?=\s*$)))',
re.UNICODE | re.MULTILINE,
)

Expand Down Expand Up @@ -577,10 +577,7 @@ def _add_kobo_spans_to_node(

# the node text is converted to spans
if node_text is not None:
if not self._append_kobo_spans_from_text(node, node_text, name):
# didn't add spans, restore text
node.text = node_text
else:
if self._append_kobo_spans_from_text(node, node_text, name):
self.paragraph_counter[name] += 1

# re-add the node children
Expand All @@ -591,10 +588,7 @@ def _add_kobo_spans_to_node(
node.append(self._add_kobo_spans_to_node(child, name))
# the child tail is converted to spans
if child_tail is not None:
if not self._append_kobo_spans_from_text(node, child_tail, name):
# didn't add spans, restore tail on last child
node[-1].tail = child_tail
else:
if self._append_kobo_spans_from_text(node, child_tail, name):
self.paragraph_counter[name] += 1

return node
Expand All @@ -606,22 +600,18 @@ def _append_kobo_spans_from_text(
self.log.error(f"[{name}] No text passed, can't add spans")
return False

# if text is only whitespace, don't add spans
if text.strip() == "":
self.log.warning(f"[{name}] Found only whitespace, not adding spans")
return False

# split text in sentences
groups = TEXT_SPLIT_RE.split(text)
# remove empty strings resulting from split()
groups = [g for g in groups if g != ""]

# TODO: To match Kobo KePubs, the trailing whitespace needs to
# be prepended to the next group. Probably equivalent to make
# sure the space stays in the span at the end.
# add each sentence in its own span
# append first group (whitespace) as text
if len(node) == 0:
node.text = groups[0]
else:
node[-1].tail = groups[0]

# append each sentence in its own span
segment_counter = 1
for g in groups:
for g, ws in zip(groups[1::2], groups[2::2]):
span = etree.Element(
f"{{{XHTML_NAMESPACE}}}span",
attrib={
Expand All @@ -630,7 +620,8 @@ def _append_kobo_spans_from_text(
},
)
span.text = g
span.tail = ws
node.append(span)
segment_counter += 1

return True
return len(groups) > 1 # Return true if any spans were added.
89 changes: 54 additions & 35 deletions tests/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,9 @@ def test_add_html_file(self):
self.assertIn(self.container.mime_map[container_name], container.HTML_MIMETYPES)
self.assertIn("content.opf", self.container.dirtied)

def __run_added_test(self, expect_changed, added_func): # type: (bool, Callable) -> None
def __run_added_test(
self, expect_changed, added_func
): # type: (bool, Callable) -> None
if expect_changed:
source_file = self.files["test_without_spans_with_comments"]
else:
Expand Down Expand Up @@ -259,38 +261,53 @@ def test_add_js(self):
)
)

def __run_single_node_test(self, text, text_only=False, number_of_sentences=None):
self.container.paragraph_counter = defaultdict(lambda: 1)
node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p")

if text_only:
self.assertTrue(
self.container._append_kobo_spans_from_text(node, text, "test")
)
else:
node.text = text
node = self.container._add_kobo_spans_to_node(node, "test")

# check number of sentences
if number_of_sentences is not None:
self.assertEqual(len(node.getchildren()), number_of_sentences)

for span in node.getchildren():
# spans should not end in whitespace (PR#191), and be nonempty
self.assertFalse(re.match(r'\s', span.text[-1]))
# tail of span should *only* be whitespace
self.assertTrue(re.match(r'\s*', span.tail or ''))

# attrib is technically of type lxml.etree._Attrib, but functionally
# it's a dict. Cast it here to make assertDictEqual() happy.
self.assertDictEqual(
dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"}
)

# remaining text should only contain whitespace
self.assertTrue(re.match(r'\s*', node.text or ''))

# complete text should remain the same
self.assertEqual(''.join(node.itertext()), text)

def test_add_spans_to_text(self):
text_samples = [
"Hello, World!",
" Hello, World!",
"Hello, World! ",
" Hello, World! ",
"\n\n GIF is pronounced as it's spelled.\n ",
" \"Yes, but I asked 'Why?'\" "
]

for text in text_samples:
for text_only in {True, False}:
self.container.paragraph_counter = defaultdict(lambda: 1)
node = etree.Element(f"{{{container.XHTML_NAMESPACE}}}p")

if text_only:
self.assertTrue(
self.container._append_kobo_spans_from_text(node, text, "test")
)
else:
node.text = text
node = self.container._add_kobo_spans_to_node(node, "test")

self.assertEqual(len(node.getchildren()), 1)

span = node.getchildren()[0]
self.assertIsNone(span.tail)
# attrib is technically of type lxml.etree._Attrib, but functionally
# it's a dict. Cast it here to make assertDictEqual() happy.
self.assertDictEqual(
dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"}
)
self.assertEqual(span.text, text)
self.__run_single_node_test(text, text_only=text_only, number_of_sentences=1)

def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None
html = "<div>"
Expand All @@ -305,19 +322,21 @@ def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None

node = self.container._add_kobo_spans_to_node(node, "test")
children = node.getchildren()
self.assertEqual(len(children), len(text_nodes))

for node_idx, node in enumerate(children):
# check each paragraph
self.assertEqual(len(text_nodes), len(children))
for text, node in zip(text_nodes, children):
spans = node.getchildren()
text_chunks = [
g
for g in container.TEXT_SPLIT_RE.split(text_nodes[node_idx])
if g.strip() != ""
]
self.assertEqual(len(spans), len(text_chunks))
# note: this regexp isn't being tested (it's known to be fallible, but good enough)
sentences = container.TEXT_SPLIT_RE.findall(text)

# check spans are added correctly for phrase individually
self.__run_single_node_test(text, text_only=False, number_of_sentences=len(sentences))

for text_idx, text_chunk in enumerate(text_chunks):
self.assertEqual(spans[text_idx].text, text_chunk)
# assert span is correctly split into sentences
self.assertEqual(len(spans), len(sentences))
for span, sentence in zip(spans, sentences):
self.assertEqual(span.text, sentence)

def test_add_spans_to_multiple_sentences(self):
self.__run_multiple_node_test(
Expand All @@ -341,7 +360,7 @@ def test_gitub_pr_106(self):

pre_span = self.container.parsed(container_name)
text_chunks = [
g.lstrip("\n\t")
g
for g in pre_span.xpath(
"//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE}
)
Expand All @@ -351,12 +370,12 @@ def test_gitub_pr_106(self):

post_span = self.container.parsed(container_name)
post_text_chunks = [
g.lstrip("\n\t")
g
for g in post_span.xpath(
"//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE}
)
]
self.assertListEqual(text_chunks, post_text_chunks)
self.assertEqual(''.join(text_chunks), ''.join(post_text_chunks))

def test_github_issue_136(self):
source_file = os.path.join(self.testfile_basedir, "page_github_136.html")
Expand Down