Skip to content

Commit 4915a77

Browse files
committed
Add thaig2p_v2
1 parent fedcd90 commit 4915a77

File tree

4 files changed

+41
-1
lines changed

4 files changed

+41
-1
lines changed

docs/api/transliterate.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ This section includes multiple transliteration engines designed to suit various
5555

5656
- **icu**: Utilizes the ICU transliteration system for phonetic conversion.
5757
- **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text.
58-
- **thaig2p**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.
58+
- **thaig2p**: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.
59+
- **thaig2p_v2**: Transliterates Thai text into International Phonetic Alphabet (IPA) using the Thai Grapheme-to-Phoneme (G2P) v2.0 model, available at https://huggingface.co/pythainlp/thaig2p-v2.0.
5960
- **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration.
6061
- **iso_11940**: Focuses on the ISO 11940 transliteration standard.
6162

pythainlp/transliterate/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ def transliterate(
109109
`TLTK <https://pypi.org/project/tltk/>`_.,
110110
* *iso_11940* - Thai text into Latin characters with ISO 11940.
111111
* *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA)
112+
* *thaig2p_v2* - Thai Grapheme-to-Phoneme,
113+
output is IPA. https://huggingface.co/pythainlp/thaig2p-v2.0
112114
113115
:Example:
114116
::
@@ -159,6 +161,8 @@ def transliterate(
159161
from pythainlp.transliterate.tltk import tltk_ipa as transliterate
160162
elif engine == "iso_11940":
161163
from pythainlp.transliterate.iso_11940 import transliterate
164+
elif engine == "thaig2p_v2":
165+
from pythainlp.transliterate.thaig2p_v2 import transliterate
162166
else: # use default engine: "thaig2p"
163167
from pythainlp.transliterate.thaig2p import transliterate
164168

pythainlp/transliterate/thaig2p_v2.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Thai Grapheme-to-Phoneme (Thai G2P)
6+
7+
huggingface: https://huggingface.co/pythainlp/thaig2p-v2.0
8+
"""
9+
10+
# Use a pipeline as a high-level helper
11+
from transformers import pipeline
12+
13+
14+
class ThaiG2P:
    """
    Latin transliteration of Thai words, using International Phonetic Alphabet.

    Wraps a ``transformers`` text2text-generation pipeline loaded with the
    ``pythainlp/thaig2p-v2.0`` model.
    """

    def __init__(self, device: str = "cpu"):
        """
        Build the generation pipeline.

        :param str device: device handed to the ``transformers`` pipeline
            (defaults to ``"cpu"``)
        """
        self.pipe = pipeline(
            "text2text-generation",
            model="pythainlp/thaig2p-v2.0",
            device=device,
        )

    def g2p(self, text: str) -> str:
        """
        Convert Thai text to its phoneme representation.

        :param str text: Thai text to convert
        :return: the ``generated_text`` field of the first pipeline result,
            or an empty string when *text* is empty
        :rtype: str
        """
        # Nothing to transliterate: skip the model call instead of
        # returning whatever the model generates for empty input.
        if not text:
            return ""
        return self.pipe(text)[0]["generated_text"]
24+
25+
26+
# Lazily created, module-wide ThaiG2P instance, so the model is only
# loaded on the first call to transliterate().
_THAI_G2P = None


def transliterate(text: str, device="cpu") -> str:
    """
    Transliterate Thai text with the shared :class:`ThaiG2P` instance.

    The instance is created on the first call and reused afterwards, so
    *device* only takes effect on the call that creates it.

    :param str text: Thai text to transliterate
    :param device: device passed to :class:`ThaiG2P` when the shared
        instance is first created (defaults to ``"cpu"``)
    :return: the result of :meth:`ThaiG2P.g2p` for *text*
    :rtype: str
    """
    global _THAI_G2P
    # Identity check: None is a singleton, so ``is`` is the correct test.
    if _THAI_G2P is None:
        _THAI_G2P = ThaiG2P(device=device)
    return _THAI_G2P.g2p(text)

tests/test_transliterate.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,8 @@ def test_transliterate(self):
216216
self.assertEqual(transliterate("คน", engine="ipa"), "kʰon")
217217
self.assertIsNotNone(transliterate("คน", engine="thaig2p"))
218218
self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
219+
self.assertIsNotNone(transliterate("คน", engine="thaig2p_v2"))
220+
self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v2"))
219221
self.assertIsNotNone(transliterate("คน", engine="tltk_g2p"))
220222
self.assertIsNotNone(transliterate("แมว", engine="tltk_g2p"))
221223
self.assertIsNotNone(transliterate("คน", engine="tltk_ipa"))

0 commit comments

Comments
 (0)