Skip to content

Commit 69186a4

Browse files
committed

File tree

3 files changed

+40
-2
lines changed

3 files changed

+40
-2
lines changed

argostranslate/sbd.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,36 @@
66
from argostranslate.package import Package
77
from argostranslate.utils import info
88

9+
from typing import List
10+
from typing import Optional
11+
from difflib import SequenceMatcher
12+
13+
import spacy
14+
15+
class ISentenceBoundaryDetectionModel:
    """Interface for sentence boundary detection (SBD) models.

    Subclasses override ``split_sentences``; the implementation here
    always raises ``NotImplementedError``.

    Reference: https://github.com/argosopentech/sbd/blob/main/main.py
    """

    def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
        """Split ``text`` into a list of sentence strings.

        Args:
            text: The text to split.
            lang_code: Optional language code for ``text``; implementations
                may ignore it.

        Returns:
            The sentences found in ``text``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
19+
20+
# spaCy sentence boundary detection (Sentencizer)
21+
# https://community.libretranslate.com/t/sentence-boundary-detection-for-machine-translation/606/3
22+
23+
# Download model:
24+
# python -m spacy download xx_sent_ud_sm
25+
class SpacySentencizerSmall(ISentenceBoundaryDetectionModel):
    """Sentence splitter backed by the spaCy ``xx_sent_ud_sm`` pipeline.

    The model has to be installed before this class is instantiated, e.g.
    with ``python -m spacy download xx_sent_ud_sm``.
    """

    def __init__(self):
        """Load ``xx_sent_ud_sm`` and append a ``sentencizer`` pipe to it."""
        # Loading happens on every instantiation; callers that split text
        # repeatedly should keep a single instance around.
        self.nlp = spacy.load("xx_sent_ud_sm")
        self.nlp.add_pipe("sentencizer")

    def split_sentences(self, text: str, lang_code: Optional[str] = None) -> List[str]:
        """Split ``text`` into sentences using the loaded spaCy pipeline.

        Args:
            text: The text to split.
            lang_code: Accepted for compatibility with the base-class
                signature; not used by this implementation.

        Returns:
            The text of each sentence span the pipeline produced, in order.
        """
        doc = self.nlp(text)
        return [sent.text for sent in doc.sents]

    def __str__(self):
        """Return a short human-readable name for this model."""
        return "Spacy xx_sent_ud_sm"
36+
37+
# Few Shot Sentence Boundary Detection
38+
939
fewshot_prompt = """<detect-sentence-boundaries> I walked down to the river. Then I went to the
1040
I walked down to the river. <sentence-boundary>
1141
----------
@@ -90,4 +120,4 @@ def detect_sentence(
90120
sbd_translated_guess = sbd_translation.translate(
91121
DETECT_SENTENCE_BOUNDARIES_TOKEN + sentence_guess
92122
)
93-
return process_seq2seq_sbd(input_text, sbd_translated_guess)
123+
return process_seq2seq_sbd(input_text, sbd_translated_guess)

argostranslate/translate.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from argostranslate.models import ILanguageModel
1212
from argostranslate.package import Package
1313
from argostranslate.utils import info
14-
14+
from argostranslate.sbd import SpacySentencizerSmall
1515

1616
class Hypothesis:
1717
"""Represents a translation hypothesis
@@ -412,9 +412,12 @@ def apply_packaged_translation(
412412
info("apply_packaged_translation", input_text)
413413

414414
# Sentence boundary detection
415+
"""
416+
# Argos Translate 1.9 Sentence Boundary Detection
415417
if pkg.type == "sbd":
416418
sentences = [input_text]
417419
elif settings.stanza_available:
420+
# PJDEBUG
418421
stanza_pipeline = stanza.Pipeline(
419422
lang=pkg.from_code,
420423
dir=str(pkg.package_path / "stanza"),
@@ -448,6 +451,10 @@ def apply_packaged_translation(
448451
info("sbd_index", sbd_index)
449452
info(input_text[start_index:sbd_index])
450453
start_index = sbd_index
454+
"""
455+
sentencizer = SpacySentencizerSmall()
456+
sentences = sentencizer.split_sentences(input_text)
457+
451458
info("sentences", sentences)
452459

453460
# Tokenization

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
ctranslate2>=4.0,<5
22
sentencepiece==0.2.0
33
stanza==1.1.1
4+
spacy
45
packaging
56
sacremoses==0.0.53

0 commit comments

Comments
 (0)