Skip to content

Commit

Permalink
Support Spacy SBD
Browse files Browse the repository at this point in the history
  • Loading branch information
PJ-Finlay committed Jul 6, 2024
1 parent f8cadf0 commit 69186a4
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 2 deletions.
32 changes: 31 additions & 1 deletion argostranslate/sbd.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,36 @@
from argostranslate.package import Package
from argostranslate.utils import info

from typing import List
from typing import Optional
from difflib import SequenceMatcher

import spacy

class ISentenceBoundaryDetectionModel:
    """Interface for sentence boundary detection (SBD) backends.

    Subclasses override :meth:`split_sentences` to cut a block of text into
    individual sentences.

    Interface origin:
    https://github.com/argosopentech/sbd/blob/main/main.py
    """

    def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
        """Split ``text`` into a list of sentence strings.

        Args:
            text: The text to segment.
            lang_code: Optional language code hint; implementations may
                ignore it.

        Returns:
            The sentences found in ``text``.

        Raises:
            NotImplementedError: Always, in this base class; subclasses must
                provide the implementation.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement split_sentences()"
        )

# Sentence boundary detection using spaCy's Sentencizer
# https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606/3

# Download model:
# python -m spacy download xx_sent_ud_sm
class SpacySentencizerSmall(ISentenceBoundaryDetectionModel):
    """Sentence splitter backed by the spaCy ``xx_sent_ud_sm`` pipeline.

    The model has to be installed beforehand::

        python -m spacy download xx_sent_ud_sm

    The loaded pipeline is cached on the class and shared by every instance,
    so constructing this class repeatedly (for example once per translation
    request) does not reload the model each time.
    """

    # Pipeline shared by all instances; populated on first construction.
    _shared_nlp = None

    def __init__(self):
        """Attach the shared pipeline, loading and configuring it on first use."""
        if SpacySentencizerSmall._shared_nlp is None:
            nlp = spacy.load("xx_sent_ud_sm")
            # NOTE(review): xx_sent_ud_sm presumably already ships its own
            # sentence segmentation component - confirm that the extra
            # rule-based "sentencizer" is still wanted here.
            nlp.add_pipe("sentencizer")
            # Publish only after the pipeline is fully configured, so a failed
            # load or add_pipe never leaves a half-built pipeline cached.
            SpacySentencizerSmall._shared_nlp = nlp
        self.nlp = SpacySentencizerSmall._shared_nlp

    def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
        """Split ``text`` into sentences using the spaCy pipeline.

        Args:
            text: The text to segment.
            lang_code: Accepted for interface compatibility; not used by
                this implementation.

        Returns:
            The text of each sentence span produced by the pipeline, in
            document order.
        """
        doc = self.nlp(text)
        return [sent.text for sent in doc.sents]

    def __str__(self):
        """Return a short human-readable name for this backend."""
        return "Spacy xx_sent_ud_sm"

# Few Shot Sentence Boundary Detection

fewshot_prompt = """<detect-sentence-boundaries> I walked down to the river. Then I went to the
I walked down to the river. <sentence-boundary>
----------
Expand Down Expand Up @@ -90,4 +120,4 @@ def detect_sentence(
sbd_translated_guess = sbd_translation.translate(
DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess
)
return process_seq2seq_sbd(input_text, sbd_translated_guess)
return process_seq2seq_sbd(input_text, sbd_translated_guess)
9 changes: 8 additions & 1 deletion argostranslate/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from argostranslate.models import ILanguageModel
from argostranslate.package import Package
from argostranslate.utils import info

from argostranslate.sbd import SpacySentencizerSmall

class Hypothesis:
"""Represents a translation hypothesis
Expand Down Expand Up @@ -412,9 +412,12 @@ def apply_packaged_translation(
info("apply_packaged_translation", input_text)

# Sentence boundary detection
"""
# Argos Translate 1.9 Sentence Boundary Detection
if pkg.type == "sbd":
sentences = [input_text]
elif settings.stanza_available:
# PJDEBUG
stanza_pipeline = stanza.Pipeline(
lang=pkg.from_code,
dir=str(pkg.package_path / "stanza"),
Expand Down Expand Up @@ -448,6 +451,10 @@ def apply_packaged_translation(
info("sbd_index", sbd_index)
info(input_text[start_index:sbd_index])
start_index = sbd_index
"""
sentencizer = SpacySentencizerSmall()
sentences = sentencizer.split_sentences(input_text)

info("sentences", sentences)

# Tokenization
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
ctranslate2>=4.0,<5
sentencepiece==0.2.0
stanza==1.1.1
spacy
packaging
sacremoses==0.0.53

0 comments on commit 69186a4

Please sign in to comment.