Skip to content

Commit 6856d2c

Browse files
committed
Support extra languages in multi-lang kokoro tts
1 parent 59d118c commit 6856d2c

26 files changed

+166
-37
lines changed

flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct {
201201
external double lengthScale;
202202
external Pointer<Utf8> dictDir;
203203
external Pointer<Utf8> lexicon;
204+
external Pointer<Utf8> lang;
204205
}
205206

206207
final class SherpaOnnxOfflineTtsModelConfig extends Struct {

flutter/sherpa_onnx/lib/src/tts.dart

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ class OfflineTtsKokoroModelConfig {
117117
this.lengthScale = 1.0,
118118
this.dictDir = '',
119119
this.lexicon = '',
120+
this.lang = '',
120121
});
121122

122123
factory OfflineTtsKokoroModelConfig.fromJson(Map<String, dynamic> json) {
@@ -128,12 +129,13 @@ class OfflineTtsKokoroModelConfig {
128129
lengthScale: (json['lengthScale'] as num?)?.toDouble() ?? 1.0,
129130
dictDir: json['dictDir'] as String? ?? '',
130131
lexicon: json['lexicon'] as String? ?? '',
132+
lang: json['lang'] as String? ?? '',
131133
);
132134
}
133135

134136
@override
135137
String toString() {
136-
return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon)';
138+
return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon, lang: $lang)';
137139
}
138140

139141
Map<String, dynamic> toJson() => {
@@ -144,6 +146,7 @@ class OfflineTtsKokoroModelConfig {
144146
'lengthScale': lengthScale,
145147
'dictDir': dictDir,
146148
'lexicon': lexicon,
149+
'lang': lang,
147150
};
148151

149152
final String model;
@@ -153,6 +156,7 @@ class OfflineTtsKokoroModelConfig {
153156
final double lengthScale;
154157
final String dictDir;
155158
final String lexicon;
159+
final String lang;
156160
}
157161

158162
class OfflineTtsModelConfig {
@@ -286,6 +290,7 @@ class OfflineTts {
286290
c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale;
287291
c.ref.model.kokoro.dictDir = config.model.kokoro.dictDir.toNativeUtf8();
288292
c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8();
293+
c.ref.model.kokoro.lang = config.model.kokoro.lang.toNativeUtf8();
289294

290295
c.ref.model.numThreads = config.model.numThreads;
291296
c.ref.model.debug = config.model.debug ? 1 : 0;
@@ -302,6 +307,7 @@ class OfflineTts {
302307
calloc.free(c.ref.ruleFsts);
303308
calloc.free(c.ref.model.provider);
304309

310+
calloc.free(c.ref.model.kokoro.lang);
305311
calloc.free(c.ref.model.kokoro.lexicon);
306312
calloc.free(c.ref.model.kokoro.dictDir);
307313
calloc.free(c.ref.model.kokoro.dataDir);

harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-tts.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig(
7070
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
7171
SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);
7272
SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
73+
SHERPA_ONNX_ASSIGN_ATTR_STR(lang, lang);
7374

7475
return c;
7576
}
@@ -177,6 +178,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
177178
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.data_dir);
178179
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.dict_dir);
179180
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lexicon);
181+
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lang);
180182

181183
SHERPA_ONNX_DELETE_C_STR(c.model.provider);
182184

harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ export class OfflineTtsKokoroModelConfig {
3636
public lengthScale: number = 1.0;
3737
public dictDir: string = '';
3838
public lexicon: string = '';
39+
public lang: string = '';
3940
}
4041

4142
export class OfflineTtsModelConfig {

scripts/dotnet/OfflineTtsKokoroModelConfig.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ public OfflineTtsKokoroModelConfig()
1818

1919
DictDir = "";
2020
Lexicon = "";
21+
Lang = "";
2122
}
2223
[MarshalAs(UnmanagedType.LPStr)]
2324
public string Model;
@@ -38,5 +39,8 @@ public OfflineTtsKokoroModelConfig()
3839

3940
[MarshalAs(UnmanagedType.LPStr)]
4041
public string Lexicon;
42+
43+
[MarshalAs(UnmanagedType.LPStr)]
44+
public string Lang;
4145
}
4246
}

scripts/go/sherpa_onnx.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,7 @@ type OfflineTtsKokoroModelConfig struct {
857857
DataDir string // Path to espeak-ng-data directory
858858
DictDir string // Path to dict directory
859859
Lexicon string // Path to lexicon files
860+
Lang string // Example: es for Spanish, fr-fr for French. Can be empty
860861
LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
861862
}
862863

@@ -1006,6 +1007,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
10061007
c.model.kokoro.lexicon = C.CString(config.Model.Kokoro.Lexicon)
10071008
defer C.free(unsafe.Pointer(c.model.kokoro.lexicon))
10081009

1010+
c.model.kokoro.lang = C.CString(config.Model.Kokoro.Lang)
1011+
defer C.free(unsafe.Pointer(c.model.kokoro.lang))
1012+
10091013
c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale)
10101014

10111015
c.model.num_threads = C.int(config.Model.NumThreads)

sherpa-onnx/c-api/c-api.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,6 +1164,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
11641164
SHERPA_ONNX_OR(config->model.kokoro.dict_dir, "");
11651165
tts_config.model.kokoro.lexicon =
11661166
SHERPA_ONNX_OR(config->model.kokoro.lexicon, "");
1167+
tts_config.model.kokoro.lang = SHERPA_ONNX_OR(config->model.kokoro.lang, "");
11671168

11681169
tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
11691170
tts_config.model.debug = config->model.debug;

sherpa-onnx/c-api/c-api.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -958,6 +958,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsKokoroModelConfig {
958958
float length_scale; // < 1, faster in speech speed; > 1, slower in speed
959959
const char *dict_dir;
960960
const char *lexicon;
961+
const char *lang;
961962
} SherpaOnnxOfflineTtsKokoroModelConfig;
962963

963964
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {

sherpa-onnx/c-api/cxx-api.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
366366
c.model.kokoro.length_scale = config.model.kokoro.length_scale;
367367
c.model.kokoro.dict_dir = config.model.kokoro.dict_dir.c_str();
368368
c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str();
369+
c.model.kokoro.lang = config.model.kokoro.lang.c_str();
369370

370371
c.model.num_threads = config.model.num_threads;
371372
c.model.debug = config.model.debug;

sherpa-onnx/c-api/cxx-api.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ struct OfflineTtsKokoroModelConfig {
367367
std::string data_dir;
368368
std::string dict_dir;
369369
std::string lexicon;
370+
std::string lang;
370371

371372
float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed
372373
};

sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ class KokoroMultiLangLexicon::Impl {
6767
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
6868
}
6969

70-
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
70+
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
71+
const std::string &voice) const {
7172
std::string text = ToLowerCase(_text);
7273
if (debug_) {
7374
SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
@@ -124,7 +125,7 @@ class KokoroMultiLangLexicon::Impl {
124125
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
125126
}
126127

127-
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
128+
ids_vec = ConvertNonChineseToTokenIDs(ms, voice);
128129
}
129130

130131
for (const auto &ids : ids_vec) {
@@ -255,8 +256,30 @@ class KokoroMultiLangLexicon::Impl {
255256
return ans;
256257
}
257258

258-
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
259+
std::vector<std::vector<int32_t>> ConvertTextToTokenIDsWithEspeak(
259260
const std::string &text, const std::string &voice) const {
261+
auto temp = ConvertTextToTokenIdsKokoro(
262+
phoneme2id_, meta_data_.max_token_len, text, voice);
263+
std::vector<std::vector<int32_t>> ans;
264+
ans.reserve(temp.size());
265+
266+
for (const auto &i : temp) {
267+
ans.emplace_back(i.tokens.begin(), i.tokens.end());
268+
}
269+
270+
return ans;
271+
}
272+
273+
std::vector<std::vector<int32_t>> ConvertNonChineseToTokenIDs(
274+
const std::string &text, const std::string &voice) const {
275+
if (!voice.empty()) {
276+
return ConvertTextToTokenIDsWithEspeak(text, voice);
277+
}
278+
279+
// If voice is empty, we split the text into words and use the lexicon
280+
// to look up the pronunciation of each word, falling back to espeak if
281+
// a word is not in the lexicon.
282+
260283
std::vector<std::string> words = SplitUtf8(text);
261284
if (debug_) {
262285
std::ostringstream os;
@@ -317,7 +340,7 @@ class KokoroMultiLangLexicon::Impl {
317340

318341
piper::eSpeakPhonemeConfig config;
319342

320-
config.voice = voice;
343+
config.voice = meta_data_.voice;
321344

322345
std::vector<std::vector<piper::Phoneme>> phonemes;
323346

@@ -391,9 +414,28 @@ class KokoroMultiLangLexicon::Impl {
391414

392415
void InitTokens(std::istream &is) {
393416
token2id_ = ReadTokens(is); // defined in ./symbol-table.cc
417+
418+
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
419+
std::u32string s;
420+
for (const auto &p : token2id_) {
421+
s = conv.from_bytes(p.first);
422+
423+
if (s.size() != 1) {
424+
SHERPA_ONNX_LOGE("Error for token %s with id %d", p.first.c_str(),
425+
p.second);
426+
SHERPA_ONNX_EXIT(-1);
427+
}
428+
429+
char32_t c = s[0];
430+
phoneme2id_.insert({c, p.second});
431+
}
394432
}
395433

396434
void InitLexicon(const std::string &lexicon) {
435+
if (lexicon.empty()) {
436+
return;
437+
}
438+
397439
std::vector<std::string> files;
398440
SplitStringToVector(lexicon, ",", false, &files);
399441
for (const auto &f : files) {
@@ -404,6 +446,10 @@ class KokoroMultiLangLexicon::Impl {
404446

405447
template <typename Manager>
406448
void InitLexicon(Manager *mgr, const std::string &lexicon) {
449+
if (lexicon.empty()) {
450+
return;
451+
}
452+
407453
std::vector<std::string> files;
408454
SplitStringToVector(lexicon, ",", false, &files);
409455
for (const auto &f : files) {
@@ -445,7 +491,7 @@ class KokoroMultiLangLexicon::Impl {
445491

446492
std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
447493

448-
if (ids.empty()) {
494+
if (ids.empty() && word != "") {
449495
SHERPA_ONNX_LOGE(
450496
"Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
451497
word.c_str(), line_num, line.c_str());
@@ -465,6 +511,8 @@ class KokoroMultiLangLexicon::Impl {
465511
// tokens.txt is saved in token2id_
466512
std::unordered_map<std::string, int32_t> token2id_;
467513

514+
std::unordered_map<char32_t, int32_t> phoneme2id_;
515+
468516
std::unique_ptr<cppjieba::Jieba> jieba_;
469517
bool debug_ = false;
470518
};
@@ -487,8 +535,8 @@ KokoroMultiLangLexicon::KokoroMultiLangLexicon(
487535
meta_data, debug)) {}
488536

489537
std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
490-
const std::string &text, const std::string & /*unused_voice = ""*/) const {
491-
return impl_->ConvertTextToTokenIds(text);
538+
const std::string &text, const std::string &voice /*= ""*/) const {
539+
return impl_->ConvertTextToTokenIds(text, voice);
492540
}
493541

494542
#if __ANDROID_API__ >= 9

sherpa-onnx/csrc/offline-speech-denoiser-model-config.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ struct OfflineSpeechDenoiserModelConfig {
2020

2121
OfflineSpeechDenoiserModelConfig() = default;
2222

23-
OfflineSpeechDenoiserModelConfig(OfflineSpeechDenoiserGtcrnModelConfig gtcrn,
24-
int32_t num_threads, bool debug,
25-
const std::string &provider)
23+
OfflineSpeechDenoiserModelConfig(
24+
const OfflineSpeechDenoiserGtcrnModelConfig &gtcrn, int32_t num_threads,
25+
bool debug, const std::string &provider)
2626
: gtcrn(gtcrn),
2727
num_threads(num_threads),
2828
debug(debug),

sherpa-onnx/csrc/offline-tts-frontend.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
77
#include <cstdint>
88
#include <string>
9+
#include <unordered_map>
910
#include <utility>
1011
#include <vector>
1112

@@ -57,6 +58,12 @@ class OfflineTtsFrontend {
5758
// implementation is in ./piper-phonemize-lexicon.cc
5859
void InitEspeak(const std::string &data_dir);
5960

61+
// implementation is in ./piper-phonemize-lexicon.cc
62+
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
63+
const std::unordered_map<char32_t, int32_t> &token2id,
64+
int32_t max_token_len, const std::string &text,
65+
const std::string &voice = "");
66+
6067
} // namespace sherpa_onnx
6168

6269
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_

sherpa-onnx/csrc/offline-tts-kokoro-impl.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,9 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
220220
}
221221
}
222222

223-
std::vector<TokenIDs> token_ids =
224-
frontend_->ConvertTextToTokenIds(text, meta_data.voice);
223+
std::vector<TokenIDs> token_ids = frontend_->ConvertTextToTokenIds(
224+
text, config_.model.kokoro.lang.empty() ? meta_data.voice
225+
: config_.model.kokoro.lang);
225226

226227
if (token_ids.empty() ||
227228
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
@@ -335,12 +336,14 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
335336
if (meta_data.version >= 2) {
336337
// this is a multi-lingual model, we require that you pass dict_dir
337338
// and either lexicon or lang
338-
if (config_.model.kokoro.lexicon.empty() ||
339+
if ((config_.model.kokoro.lexicon.empty() &&
340+
config_.model.kokoro.lang.empty()) ||
339341
config_.model.kokoro.dict_dir.empty()) {
340342
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
341343
SHERPA_ONNX_LOGE(
342344
"You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
343-
"v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
345+
"v1.0). Please pass --kokoro-lexicon and --kokoro-dict-dir or "
346+
"provide --kokoro-lang and --kokoro-dict-dir");
344347
SHERPA_ONNX_EXIT(-1);
345348
}
346349

@@ -362,7 +365,8 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
362365
if (meta_data.version >= 2) {
363366
// this is a multi-lingual model, we require that you pass dict_dir
364367
// and either lexicon or lang
365-
if (config_.model.kokoro.lexicon.empty() ||
368+
if ((config_.model.kokoro.lexicon.empty() &&
369+
config_.model.kokoro.lang.empty()) ||
366370
config_.model.kokoro.dict_dir.empty()) {
367371
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
368372
SHERPA_ONNX_LOGE(

sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
1818
"Path to voices.bin for Kokoro models");
1919
po->Register("kokoro-tokens", &tokens,
2020
"Path to tokens.txt for Kokoro models");
21+
po->Register("kokoro-lang", &lang,
22+
"Used only by kokoro >= 1.0. Example values: "
23+
"en (English), "
24+
"es (Spanish), fr (French), hi (Hindi), it (Italian), "
25+
"pt-br (Brazilian Portuguese). "
26+
"You can leave it empty, in which case you need to provide "
27+
"--kokoro-lexicon.");
2128
po->Register(
2229
"kokoro-lexicon", &lexicon,
2330
"Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0"
@@ -127,7 +134,8 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
127134
os << "lexicon=\"" << lexicon << "\", ";
128135
os << "data_dir=\"" << data_dir << "\", ";
129136
os << "dict_dir=\"" << dict_dir << "\", ";
130-
os << "length_scale=" << length_scale << ")";
137+
os << "length_scale=" << length_scale << ", ";
138+
os << "lang=\"" << lang << "\")";
131139

132140
return os.str();
133141
}

0 commit comments

Comments
 (0)