Skip to content

Commit c82be6a

Browse files
authored
feat: detect language using lingua, instead of trusting LLM (#518)
* feat: detect language using lingua, instead of trusting LLM Signed-off-by: Frost Ming <[email protected]> * fix: cache the detector Signed-off-by: Frost Ming <[email protected]> * fix comment Signed-off-by: Frost Ming <[email protected]> --------- Signed-off-by: Frost Ming <[email protected]>
1 parent 82c57cc commit c82be6a

File tree

5 files changed

+106
-22
lines changed

5 files changed

+106
-22
lines changed

pdm.lock

+75-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ dependencies = [
2828
"groq>=0.5.0",
2929
"pyyaml>=6.0.1",
3030
"langchain-community>=0.0.38",
31+
# lingua ships neither wheels for Python 3.13 nor an sdist, so skip it there
32+
"lingua-language-detector>=2.0.2; python_version < \"3.13\"",
3133
]
3234
license = {text = "MIT"}
3335
dynamic = ["version", "optional-dependencies"]

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ langchain-community==0.2.0
4545
langchain-core==0.2.0
4646
langchain-text-splitters==0.2.0
4747
langsmith==0.1.45
48+
lingua-language-detector==2.0.2; python_version < "3.13"
4849
markdown-it-py==3.0.0
4950
marshmallow==3.20.1
5051
mdurl==0.1.2

xiaogpt/utils.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@
55
import re
66
import socket
77
from http.cookies import SimpleCookie
8-
from typing import AsyncIterator
8+
from typing import TYPE_CHECKING, AsyncIterator
99
from urllib.parse import urlparse
1010

1111
from requests.utils import cookiejar_from_dict
1212

13+
if TYPE_CHECKING:
14+
from lingua import LanguageDetector
15+
1316

1417
### HELP FUNCTION ###
1518
def parse_cookie_string(cookie_string):
@@ -69,3 +72,21 @@ def get_hostname() -> str:
6972
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
7073
s.connect(("8.8.8.8", 80))
7174
return s.getsockname()[0]
75+
76+
77+
def _get_detector() -> LanguageDetector | None:
78+
try:
79+
from lingua import LanguageDetectorBuilder
80+
except ImportError:
81+
return None
82+
return LanguageDetectorBuilder.from_all_spoken_languages().build()
83+
84+
85+
# Build the detector once at import time so every detect_language() call
# reuses the same instance; this is None when lingua cannot be imported.
_detector = _get_detector()
86+
87+
88+
def detect_language(text: str, default: str = "zh") -> str:
    """Return the lowercase ISO 639-1 code of the language *text* is written in.

    Args:
        text: The text whose language should be detected.
        default: Code returned when detection is not possible. Defaults to
            ``"zh"`` (Chinese), the previous hard-coded fallback.

    Returns:
        The lowercased name of the detected language's ``iso_code_639_1``
        (e.g. ``"en"``), or *default* when the lingua detector is not
        available, *text* is blank, or the detector returns no language.
    """
    # lingua is an optional dependency; without it (or without any actual
    # text to inspect) there is nothing to detect, so use the fallback.
    if _detector is None or not text.strip():
        return default
    lang = _detector.detect_language_of(text)
    if lang is None:
        # The detector could not settle on a language for this text.
        return default
    return lang.iso_code_639_1.name.lower()

xiaogpt/xiaogpt.py

+6-20
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@
2424
Config,
2525
)
2626
from xiaogpt.tts import TTS, MiTTS, TetosTTS
27-
from xiaogpt.utils import (
28-
parse_cookie_string,
29-
)
27+
from xiaogpt.utils import detect_language, parse_cookie_string
3028

3129
EOF = object()
3230

@@ -390,11 +388,6 @@ async def run_forever(self):
390388
query = f"{query}{self.config.prompt}"
391389
# some model can not detect the language code, so we need to add it
392390

393-
if self.config.tts != "mi": # mi only say Chinese
394-
query += (
395-
",并用本段话的language code作为开头,用|分隔,如:en-US|你好……"
396-
)
397-
398391
if self.config.mute_xiaoai:
399392
await self.stop_if_xiaoai_is_playing()
400393
else:
@@ -420,18 +413,11 @@ async def run_forever(self):
420413
await self.wakeup_xiaoai()
421414

422415
async def speak(self, text_stream: AsyncIterator[str]) -> None:
423-
text = await text_stream.__anext__()
424-
# See if the first part contains language code(e.g. en-US|Hello world)
425-
lang, _, first_chunk = text.rpartition("|")
426-
if len(lang) > 7:
427-
# It is not a legal language code, discard it
428-
lang, first_chunk = "", text
429-
430-
lang = (
431-
matches[0]
432-
if (matches := re.findall(r"([a-z]{2}-[A-Z]{2})", lang))
433-
else "zh-CN"
434-
)
416+
first_chunk = await text_stream.__anext__()
417+
# Detect the language from the first chunk
418+
# Add suffix '-' because tetos expects it to exist when selecting voices
419+
# however, the nation code is never used.
420+
lang = detect_language(first_chunk) + "-"
435421

436422
async def gen(): # reconstruct the generator
437423
yield first_chunk

0 commit comments

Comments
 (0)