Support spaCy sentence boundary detection (SBD)

PJ-Finlay · PJ-Finlay · commit 69186a4132e3 · 2024-07-06T13:41:54.000-04:00
https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606
diff --git a/argostranslate/sbd.py b/argostranslate/sbd.py
@@ -6,6 +6,36 @@
 from argostranslate.package import Package
 from argostranslate.utils import info
 
+from typing import List
+from typing import Optional
+from difflib import SequenceMatcher
+
+import spacy
+
+class ISentenceBoundaryDetectionModel:
+    # Base interface; subclasses override split_sentences. See https://github.com/argosopentech/sbd/blob/main/main.py
+    def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
+        raise NotImplementedError
+
+# spaCy-based sentence boundary detection (xx_sent_ud_sm model plus the "sentencizer" pipe)
+# https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606/3
+
+# The model must be downloaded before this class can be instantiated:
+# python -m spacy download xx_sent_ud_sm
+class SpacySentencizerSmall(ISentenceBoundaryDetectionModel):
+    def __init__(self):
+        self.nlp = spacy.load("xx_sent_ud_sm")  # loads the model on every instantiation
+        self.nlp.add_pipe("sentencizer")  # NOTE(review): the loaded pipeline may already set sentence boundaries - confirm this pipe is needed
+
+    def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
+        doc = self.nlp(text)  # lang_code is accepted for interface compatibility but not used here
+        return [sent.text for sent in doc.sents]  # one string per sentence span found by the pipeline
+
+    def __str__(self):
+        return "Spacy xx_sent_ud_sm"  # human-readable name of this splitter
+
+# Few Shot Sentence Boundary Detection
+
 fewshot_prompt = """<detect-sentence-boundaries> I walked down to the river. Then I went to the
 I walked down to the river. <sentence-boundary>
 ----------
@@ -90,4 +120,4 @@ def detect_sentence(
     sbd_translated_guess = sbd_translation.translate(
         DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess
     )
-    return process_seq2seq_sbd(input_text, sbd_translated_guess)
+    return process_seq2seq_sbd(input_text, sbd_translated_guess)
diff --git a/argostranslate/translate.py b/argostranslate/translate.py
@@ -11,7 +11,7 @@
 from argostranslate.models import ILanguageModel
 from argostranslate.package import Package
 from argostranslate.utils import info
-
+from argostranslate.sbd import SpacySentencizerSmall
 
 class Hypothesis:
     """Represents a translation hypothesis
@@ -412,9 +412,12 @@ def apply_packaged_translation(
     info("apply_packaged_translation", input_text)
 
     # Sentence boundary detection
+    """
+    # Argos Translate 1.9 Sentence Boundary Detection
     if pkg.type == "sbd":
         sentences = [input_text]
     elif settings.stanza_available:
+        # PJDEBUG
         stanza_pipeline = stanza.Pipeline(
             lang=pkg.from_code,
             dir=str(pkg.package_path / "stanza"),
@@ -448,6 +451,10 @@ def apply_packaged_translation(
             info("sbd_index", sbd_index)
             info(input_text[start_index:sbd_index])
             start_index = sbd_index
+    """
+    sentencizer = SpacySentencizerSmall()
+    sentences = sentencizer.split_sentences(input_text)
+
     info("sentences", sentences)
 
     # Tokenization
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,6 @@
 ctranslate2>=4.0,<5
 sentencepiece==0.2.0
 stanza==1.1.1
+spacy
 packaging
 sacremoses==0.0.53