Commit
* [conf,examples] change original conf to conf/{asr,sse}
* [aps,examples] init librimix recipe
* [aps,examples,tests] support online tokenizer
* [aps] re-impl aps.tokenizer (also update in dataloader)
* [aps,examples,tests] add streaming transformer encoder to streaming_asr
* [aps,cmd] update ngram_rescore.py to lm_rescore.py
* [aps,cmd] optimize nbest reader
* [conf,examples] check librimix recipe
* [aps,examples] update librimix results & fix streaming cfg error for transducer/ctc
* update codecov.yml
Showing 117 changed files with 1,265 additions and 704 deletions.
@@ -1,84 +1,74 @@
 # Copyright 2019 Jian Wu
 # License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 
+from aps.tokenizer import Tokenizer
 from aps.conf import load_dict
-from aps.const import UNK_TOKEN
 
 from typing import List
 
 
-class TextPreProcessor(object):
+class TextProcess(object):
     """
-    The class for pre processing of the transcriptions, i.e.,
-    mapping them to the sequence ids for CTC alignment
+    Base class for pre-/post-processing of the audio transcriptions, i.e.,
+    mapping them to the sequence ids for CTC alignment or mapping decoding
+    id sequences to word sequences
     """
 
     def __init__(self, dict_str: str, space: str = "", spm: str = "") -> None:
-        self.vocab = None
-        self.space = space
-        self.sp_mdl = None
-        if dict_str:
-            # str to int
-            self.vocab = load_dict(dict_str)
+        tokenizer_kwargs = {}
         if spm:
-            import sentencepiece as sp
-            self.sp_mdl = sp.SentencePieceProcessor(model_file=spm)
+            tokenizer = "subword"
+            tokenizer_kwargs["spm"] = spm
+        else:
+            if space:
+                tokenizer = "char"
+                tokenizer_kwargs["space"] = space
+            else:
+                tokenizer = "word"
+        # str to int
+        if dict_str:
+            vocab_dict = load_dict(dict_str)
+            self.tokenizer = Tokenizer(vocab_dict,
+                                       tokenizer=tokenizer,
+                                       tokenizer_kwargs=tokenizer_kwargs)
+        else:
+            self.tokenizer = None
+
+
+class TextPreProcessor(TextProcess):
+    """
+    Text pre-processing class
+    """
+
+    def __init__(self, dict_str: str, space: str = "", spm: str = "") -> None:
+        super(TextPreProcessor, self).__init__(dict_str, space=space, spm=spm)
 
     def run(self, str_seq: List[str]) -> List[int]:
-        if self.vocab:
-            if self.sp_mdl:
-                # subword str sequence
-                str_seq = self.sp_mdl.encode(" ".join(str_seq), out_type=str)
-            if self.space:
-                # insert space
-                str_seq = f" {self.space} ".join(str_seq).split(" ")
-            int_seq = [(self.vocab[idx]
-                        if idx in self.vocab else self.vocab[UNK_TOKEN])
-                       for idx in str_seq]
+        if self.tokenizer:
+            int_seq = self.tokenizer.encode(str_seq)
         else:
+            # no tokenizer available
            int_seq = [int(idx) for idx in str_seq]
         return int_seq
 
 
-class TextPostProcessor(object):
+class TextPostProcessor(TextProcess):
     """
-    The class for post processing of decoding sequence, i.e.,
-    mapping id sequence to token sequence
+    Text post-processing class
     """
 
     def __init__(self,
                  dict_str: str,
                  space: str = "",
                  show_unk: str = "<unk>",
                  spm: str = "") -> None:
+        super(TextPostProcessor, self).__init__(dict_str, space=space, spm=spm)
         self.unk = show_unk
-        self.space = space
-        self.vocab = None
-        self.sp_mdl = None
-        if dict_str:
-            # int to str
-            self.vocab = load_dict(dict_str, reverse=True)
-        if spm:
-            import sentencepiece as sp
-            self.sp_mdl = sp.SentencePieceProcessor(model_file=spm)
 
     def run(self, int_seq: List[int]) -> str:
-        if self.vocab:
-            trans = [self.vocab[idx] for idx in int_seq]
-        else:
-            trans = [str(idx) for idx in int_seq]
-        # char sequence
-        if self.vocab:
-            if self.sp_mdl:
-                trans = self.sp_mdl.decode(trans)
-            else:
-                if self.space:
-                    trans = "".join(trans).replace(self.space, " ")
-                else:
-                    trans = " ".join(trans)
-            if self.unk != UNK_TOKEN:
-                trans = trans.replace(UNK_TOKEN, self.unk)
-        else:
-            trans = " ".join(trans)
-        return trans
+        # ID sequence
+        if self.tokenizer:
+            str_seq = self.tokenizer.decode(int_seq, unk_sym=self.unk)
+        else:
+            # no tokenizer available
+            str_seq = [str(idx) for idx in int_seq]
+        return " ".join(str_seq)