Skip to content

Commit

Permalink
Fix tests with new chunking
Browse files: browse the repository at this point in the history
Text is now split differently, requiring a different approach to validating the generated kobo spans.
  • Loading branch information
jgoguen committed Oct 13, 2024
1 parent d63e6b4 commit b60e7b7
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions tests/test_container.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -277,7 +277,12 @@ def __run_single_node_test(self, text, text_only=False, number_of_sentences=None
if number_of_sentences is not None:
self.assertEqual(len(node.getchildren()), number_of_sentences)

for span in node.getchildren():
para_count = 1
text_chunks = [
chunk.strip() for chunk in text.split("\n") if chunk.strip() != ""
]
for span, text_chunk in zip(node.getchildren(), text_chunks):
self.assertEqual(span.text, text_chunk)
# spans should not end in whitespace (PR#191), and be nonempty
self.assertFalse(re.match(r'\s', span.text[-1]))
# tail of span should *only* be whitespace
Expand All @@ -286,8 +291,9 @@ def __run_single_node_test(self, text, text_only=False, number_of_sentences=None
# attrib is technically of type lxml.etree._Attrib, but functionally
# it's a dict. Cast it here to make assertDictEqual() happy.
self.assertDictEqual(
dict(span.attrib), {"id": "kobo.1.1", "class": "koboSpan"}
dict(span.attrib), {"id": f"kobo.1.{para_count}", "class": "koboSpan"}
)
para_count += 1

# remaining text should only contain whitespace
self.assertTrue(re.match(r'\s*', node.text or ''))
Expand Down

0 comments on commit b60e7b7

Please sign in to comment.