Add LibriMix recipe (#52)

* [conf,examples] change original conf to conf/{asr,sse} * [aps,examples] init librimix recipe * [aps,examples,tests] support online tokenizer * [aps] re-impl aps.tokenizer (also update in dataloader) * [aps,examples,tests] add streaming transformer encoder to streaming_asr * [aps,cmd] update ngram_rescore.py to lm_rescore.py * [aps,cmd] optimize nbest reader * [conf,examples] check librimix recipe * [aps,examples] update librimix results & fix streaming cfg error for transducer/ctc * update codecov.yml
funcwj · Nov 20, 2021 · 571c845 · 571c845
1 parent a31c55a
commit 571c845
Show file tree

Hide file tree

Showing 117 changed files with 1,265 additions and 704 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,9 +3,13 @@
 ### 2021/11
 
 1. Update to PyTorch 1.10
-2. Update subampling layer for Transformer based encoders
+2. Refactor subsampling layer for Transformer based encoders
 3. Adaptive SpecAug
 4. Update LibriSpeech & GigaSpeech results
+5. Add AED decoder rescoring for CTC beam search
+6. Add LibriMix recipe
+7. Refactor tokenizer for online tokenizing
+8. Update ngram_rescore.py to lm_rescore.py
 
 ### 2021/10
 

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://opensource.org/licenses/Apache-2.0)
 [![Python-Version](https://img.shields.io/badge/Python-3.7%7C3.8-brightgreen)](https://github.com/funcwj/aps)
-[![codecov](https://codecov.io/gh/funcwj/aps/branch/jwu/master/graph/badge.svg)](https://codecov.io/gh/funcwj/aps)
+[![codecov](https://codecov.io/gh/funcwj/aps/branch/master/graph/badge.svg)](https://codecov.io/gh/funcwj/aps)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 [![CI](https://github.com/funcwj/aps/actions/workflows/test_python.yml/badge.svg)](https://github.com/funcwj/aps/actions/workflows/test_python.yml/badge.svg)
 
@@ -40,4 +40,4 @@ cmake .. && make -j
 
 ## Acknowledge
 
-The project was started at early 2019 when the author was a master student of the Audio, Speech and Language Processing Group (ASLP) in Northwestern Polytechnical University (NWPU), Xi'an, China. Originally it was used to collect the source code of the experiments that the author did in the past.
+The project was started in early 2019 when the author was a master's student in the [Audio, Speech and Language Processing Group](http://www.npu-aslp.org) (ASLP) at [Northwestern Polytechnical University](https://www.nwpu.edu.cn) (NWPU), Xi'an, China. Originally it was used to collect the source code of the experiments that the author did in the past.
diff --git a/aps/asr/lm/ngram.py b/aps/asr/lm/ngram.py
@@ -10,6 +10,7 @@
     kenlm_available = False
 
 from aps.conf import load_dict
+from aps.const import EOS_TOKEN, SOS_TOKEN
 
 
 class NgramLM(object):
@@ -29,12 +30,21 @@ def __init__(self, lm: str, vocab_dict: str) -> None:
         vocab = load_dict(vocab_dict, reverse=True)
         self.token = [None] * len(vocab)
         for i, tok in vocab.items():
-            if tok == "<eos>":
+            if tok == EOS_TOKEN:
                 tok = "</s>"
-            if tok == "<sos>":
+            if tok == SOS_TOKEN:
                 tok = "<s>"
             self.token[i] = tok
 
+    def score(self,
+              utterance: str,
+              sos: bool = True,
+              eos: bool = True) -> float:
+        """
+        Score a given utterance
+        """
+        return self.ngram_lm.score(utterance, bos=sos, eos=eos)
+
     def _step(self, prev_state):
         """
         Args:

diff --git a/aps/asr/lm/rnn.py b/aps/asr/lm/rnn.py
@@ -4,7 +4,7 @@
 import torch as th
 import torch.nn as nn
 import torch.nn.init as init
-from typing import NoReturn, Union, Tuple, Optional
+from typing import NoReturn, Union, Tuple, List, Optional
 from aps.asr.base.component import OneHotEmbedding, PyTorchRNN
 from aps.asr.base.decoder import LayerNormRNN
 from aps.libs import ApsRegisters
@@ -76,10 +76,32 @@ def __init__(self,
             self.dist.weight = self.embed.weight
 
     def init_weights(self, initrange: float = 0.1) -> NoReturn:
+        """
+        Initialize model weights
+        """
         init.zeros_(self.dist.bias)
         init.uniform_(self.dist.weight, -initrange, initrange)
         init.uniform_(self.embed.weight, -initrange, initrange)
 
+    def score(self,
+              hypos: List[int],
+              sos: int = -1,
+              eos: int = -1,
+              device: int = -1) -> float:
+        """
+        Score the given hypothesis
+        """
+        hyp_tensor = th.as_tensor(
+            [sos] + hypos, device="cpu" if device < 0 else f"cuda:{device:d}")
+        # 1 x T+1 => 1 x T+1 x V
+        prob, _ = self(hyp_tensor[None, ...])
+        # T+1 x V
+        prob = th.log_softmax(prob[0], -1)
+        score = 0
+        for n, w in enumerate(hypos + [eos]):
+            score += prob[n, w].item()
+        return score
+
     def forward(
             self,
             token: th.Tensor,

diff --git a/aps/asr/lm/transformer.py b/aps/asr/lm/transformer.py
@@ -4,7 +4,7 @@
 import torch as th
 import torch.nn as nn
 
-from typing import Optional, Tuple, Dict
+from typing import Optional, Tuple, Dict, List
 from aps.asr.transformer.pose import get_xfmr_pose
 from aps.asr.transformer.impl import get_xfmr_encoder
 from aps.asr.transformer.utils import prep_sub_mask
@@ -32,6 +32,25 @@ def __init__(self,
         self.dist = nn.Linear(att_dim, vocab_size)
         self.vocab_size = vocab_size
 
+    def score(self,
+              hypos: List[int],
+              sos: int = -1,
+              eos: int = -1,
+              device: int = -1) -> float:
+        """
+        Score the given hypothesis
+        """
+        hyp_tensor = th.as_tensor(
+            [sos] + hypos, device="cpu" if device < 0 else f"cuda:{device:d}")
+        # 1 x T+1 => 1 x T+1 x V
+        prob, _ = self(hyp_tensor[None, ...])
+        # T+1 x V
+        prob = th.log_softmax(prob[0], -1)
+        score = 0
+        for n, w in enumerate(hypos + [eos]):
+            score += prob[n, w].item()
+        return score
+
     def forward(
             self,
             token: th.Tensor,

diff --git a/aps/conf.py b/aps/conf.py
@@ -7,6 +7,7 @@
 import codecs
 
 from typing import Dict, List, Tuple
+from aps.const import EOS_TOKEN, SOS_TOKEN
 
 required_keys = [
     "nnet", "nnet_conf", "task", "task_conf", "data_conf", "trainer_conf"
@@ -16,11 +17,14 @@
     "enh_transform", "asr_transform", "cmd_args"
 ]
 all_lm_conf_keys = required_keys + ["cmd_args"]
+transducer_or_ctc_tasks = [
+    "asr@transducer", "asr@ctc", "streaming_asr@transducer", "streaming_asr@ctc"
+]
 
 
 def load_dict(dict_path: str,
               reverse: bool = False,
-              required: List[str] = ["<sos>", "<eos>"]) -> Dict:
+              required: List[str] = [EOS_TOKEN, SOS_TOKEN]) -> Dict:
     """
     Load the dictionary object
     Args:
@@ -110,18 +114,19 @@ def load_am_conf(yaml_conf: str, dict_path: str) -> Tuple[Dict, Dict]:
 
     # add dict info
     nnet_conf = conf["nnet_conf"]
-    vocab = load_dict(
-        dict_path,
-        required=[] if conf["task"] == "asr@ctc" else ["<eos>", "<sos>"])
+    is_transducer_or_ctc = conf["task"] in transducer_or_ctc_tasks
+    # transducer/CTC dictionaries need no sos/eos; other tasks must have both
+    required_units = [] if is_transducer_or_ctc else [EOS_TOKEN, SOS_TOKEN]
+    vocab = load_dict(dict_path, required=required_units)
     nnet_conf["vocab_size"] = len(vocab)
 
+    # sos/eos ids are only needed when the task is not transducer/CTC
+    if not is_transducer_or_ctc:
+        nnet_conf["sos"] = vocab[SOS_TOKEN]
+        nnet_conf["eos"] = vocab[EOS_TOKEN]
+    # add the blank symbol for transducer/CTC
     task_conf = conf["task_conf"]
     use_ctc = "ctc_weight" in task_conf and task_conf["ctc_weight"] > 0
-    is_transducer_or_ctc = conf["task"] in ["asr@transducer", "asr@ctc"]
-    if not is_transducer_or_ctc:
-        nnet_conf["sos"] = vocab["<sos>"]
-        nnet_conf["eos"] = vocab["<eos>"]
-    # for CTC/RNNT
     if use_ctc or is_transducer_or_ctc:
         conf["task_conf"]["blank"] = len(vocab)
         # add blank

diff --git a/aps/const.py b/aps/const.py
@@ -17,5 +17,7 @@
 MAX_INT16 = np.iinfo(np.int16).max
 UNK_TOKEN = "<unk>"
 BLK_TOKEN = "<b>"
+EOS_TOKEN = "<eos>"
+SOS_TOKEN = "<sos>"
 OOM_STRING = "out of memory"
 TORCH_VERSION = LooseVersion(th.__version__)
diff --git a/aps/eval/asr.py b/aps/eval/asr.py
@@ -1,84 +1,74 @@
 # Copyright 2019 Jian Wu
 # License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 
+from aps.tokenizer import Tokenizer
 from aps.conf import load_dict
-from aps.const import UNK_TOKEN
 
 from typing import List
 
 
-class TextPreProcessor(object):
+class TextProcess(object):
     """
-    The class for pre processing of the transcriptions, i.e.,
-    mapping them to the sequence ids for CTC alignment
+    Base class for pre-/post-processing of the audio transcriptions, i.e.,
+    mapping them to the sequence ids for CTC alignment or mapping decoded
+    id sequences to word sequences
     """
 
     def __init__(self, dict_str: str, space: str = "", spm: str = "") -> None:
-        self.vocab = None
-        self.space = space
-        self.sp_mdl = None
-        if dict_str:
-            # str to int
-            self.vocab = load_dict(dict_str)
+        tokenizer_kwargs = {}
         if spm:
-            import sentencepiece as sp
-            self.sp_mdl = sp.SentencePieceProcessor(model_file=spm)
+            tokenizer = "subword"
+            tokenizer_kwargs["spm"] = spm
+        else:
+            if space:
+                tokenizer = "char"
+                tokenizer_kwargs["space"] = space
+            else:
+                tokenizer = "word"
+        # str to int
+        if dict_str:
+            vocab_dict = load_dict(dict_str)
+            self.tokenizer = Tokenizer(vocab_dict,
+                                       tokenizer=tokenizer,
+                                       tokenizer_kwargs=tokenizer_kwargs)
+        else:
+            self.tokenizer = None
+
+
+class TextPreProcessor(TextProcess):
+    """
+    Text pre-processing class
+    """
+
+    def __init__(self, dict_str: str, space: str = "", spm: str = "") -> None:
+        super(TextPreProcessor, self).__init__(dict_str, space=space, spm=spm)
 
     def run(self, str_seq: List[str]) -> List[int]:
-        if self.vocab:
-            if self.sp_mdl:
-                # subword str sequence
-                str_seq = self.sp_mdl.encode(" ".join(str_seq), out_type=str)
-            if self.space:
-                # insert space
-                str_seq = f" {self.space} ".join(str_seq).split(" ")
-            int_seq = [(self.vocab[idx]
-                        if idx in self.vocab else self.vocab[UNK_TOKEN])
-                       for idx in str_seq]
+        if self.tokenizer:
+            int_seq = self.tokenizer.encode(str_seq)
         else:
+            # no tokenizer available
             int_seq = [int(idx) for idx in str_seq]
         return int_seq
 
 
-class TextPostProcessor(object):
+class TextPostProcessor(TextProcess):
     """
-    The class for post processing of decoding sequence, i.e.,
-    mapping id sequence to token sequence
+    Text post-processing class
     """
 
     def __init__(self,
                  dict_str: str,
                  space: str = "",
                  show_unk: str = "<unk>",
                  spm: str = "") -> None:
+        super(TextPostProcessor, self).__init__(dict_str, space=space, spm=spm)
         self.unk = show_unk
-        self.space = space
-        self.vocab = None
-        self.sp_mdl = None
-        if dict_str:
-            # int to str
-            self.vocab = load_dict(dict_str, reverse=True)
-        if spm:
-            import sentencepiece as sp
-            self.sp_mdl = sp.SentencePieceProcessor(model_file=spm)
 
     def run(self, int_seq: List[int]) -> str:
-        if self.vocab:
-            trans = [self.vocab[idx] for idx in int_seq]
-        else:
-            trans = [str(idx) for idx in int_seq]
-        # char sequence
-        if self.vocab:
-            if self.sp_mdl:
-                trans = self.sp_mdl.decode(trans)
-            else:
-                if self.space:
-                    trans = "".join(trans).replace(self.space, " ")
-                else:
-                    trans = " ".join(trans)
-            if self.unk != UNK_TOKEN:
-                trans = trans.replace(UNK_TOKEN, self.unk)
-        # ID sequence
+        if self.tokenizer:
+            str_seq = self.tokenizer.decode(int_seq, unk_sym=self.unk)
         else:
-            trans = " ".join(trans)
-        return trans
+            # no tokenizer available
+            str_seq = [str(idx) for idx in int_seq]
+        return " ".join(str_seq)
diff --git a/aps/io/text.py b/aps/io/text.py
@@ -40,27 +40,29 @@ class NbestReader(object):
     def __init__(self, nbest: str):
         self.nbest, self.hypos = self._load_nbest(nbest)
 
+    def __len__(self) -> int:
+        return len(self.hypos)
+
+    def __iter__(self):
+        return iter(self.hypos.items())
+
     def _load_nbest(self, nbest: str):
         hypos = {}
-        nbest = 1
-        with codecs.open(nbest, "r", encoding="utf-8") as f:
-            nbest = int(f.readline())
-            while True:
-                key = f.readline().strip()
-                if not key:
-                    break
-                topk = []
-                n = 0
-                while n < self.nbest:
-                    items = f.readline().strip().split()
-                    score = float(items[0])
-                    num_tokens = int(items[1])
-                    trans = " ".join(items[2:])
-                    topk.append((score, num_tokens, trans))
-                    n += 1
-                hypos[key] = topk
+        with codecs.open(nbest, "r", encoding="utf-8") as fd:
+            all_lines = fd.readlines()
+        nbest = int(all_lines[0].strip())
+        if (len(all_lines) - 1) % (nbest + 1) != 0:
+            raise RuntimeError("Seems that nbest format is wrong")
+        n = 1
+        while n < len(all_lines):
+            key = all_lines[n].strip()
+            topk = []
+            for i in range(nbest):
+                items = all_lines[n + 1 + i].strip().split()
+                score = float(items[0])
+                num_tokens = int(items[1])
+                trans = " ".join(items[2:])
+                topk.append((score, num_tokens, trans))
+            n += nbest + 1
+            hypos[key] = topk
         return nbest, hypos
-
-    def __iter__(self):
-        for key in self.hypos:
-            yield key, self.hypos[key]
diff --git a/aps/libs.py b/aps/libs.py
@@ -79,7 +79,7 @@ class ApsModules(object):
         "bss.dense_unet", "bss.sepformer"
     ]
     loader_submodules = [
-        "am.kaldi", "am.raw", "am.command", "se.chunk", "se.command",
+        "am.kaldi", "am.raw", "am.simu_cmd", "se.chunk", "se.simu_cmd",
         "se.config", "lm.utt", "lm.bptt"
     ]
     asr = Module("aps.asr", asr_submodules)
@@ -265,10 +265,10 @@ def start_trainer(trainer: str,
                                 max_batch_size=args.batch_size // num_process,
                                 **loader_conf,
                                 **data_conf["train"])
+    dev_batch_size = int(args.batch_size // args.dev_batch_factor)
     dev_loader = aps_dataloader(train=False,
                                 distributed=False,
-                                max_batch_size=args.batch_size //
-                                args.dev_batch_factor,
+                                max_batch_size=dev_batch_size,
                                 **loader_conf,
                                 **data_conf["valid"])
     trainer.run(trn_loader,