Skip to content

Commit

Permalink
Add LibriMix recipe (#52)
Browse files Browse the repository at this point in the history
* [conf,examples] change original conf to conf/{asr,sse}

* [aps,examples] init librimix recipe

* [aps,examples,tests] support online tokenizer

* [aps] re-impl aps.tokenizer (also update in dataloader)

* [aps,examples,tests] add streaming transformer encoder to streaming_asr

* [aps,cmd] update ngram_rescore.py to lm_rescore.py

* [aps,cmd] optimize nbest reader

* [conf,examples] check librimix recipe

* [aps,examples] update librimix results & fix streaming cfg error for transducer/ctc

* update codecov.yml
  • Loading branch information
funcwj authored Nov 20, 2021
1 parent a31c55a commit 571c845
Show file tree
Hide file tree
Showing 117 changed files with 1,265 additions and 704 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
### 2021/11

1. Update to PyTorch 1.10
2. Update subampling layer for Transformer based encoders
2. Refactor subsampling layer for Transformer based encoders
3. Adaptive SpecAug
4. Update LibriSpeech & GigaSpeech results
5. Add AED decoder rescoring for CTC beam search
6. Add LibriMix recipe
7. Refactor tokenizer for online tokenizing
8. Update ngram_rescore.py to lm_rescore.py

### 2021/10

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[![License](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://opensource.org/licenses/Apache-2.0)
[![Python-Version](https://img.shields.io/badge/Python-3.7%7C3.8-brightgreen)](https://github.com/funcwj/aps)
[![codecov](https://codecov.io/gh/funcwj/aps/branch/jwu/master/graph/badge.svg)](https://codecov.io/gh/funcwj/aps)
[![codecov](https://codecov.io/gh/funcwj/aps/branch/master/graph/badge.svg)](https://codecov.io/gh/funcwj/aps)
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
[![CI](https://github.com/funcwj/aps/actions/workflows/test_python.yml/badge.svg)](https://github.com/funcwj/aps/actions/workflows/test_python.yml/badge.svg)

Expand Down Expand Up @@ -40,4 +40,4 @@ cmake .. && make -j

## Acknowledge

The project was started at early 2019 when the author was a master student of the Audio, Speech and Language Processing Group (ASLP) in Northwestern Polytechnical University (NWPU), Xi'an, China. Originally it was used to collect the source code of the experiments that the author did in the past.
The project was started in early 2019 when the author was a master's student in the [Audio, Speech and Language Processing Group](http://www.npu-aslp.org) (ASLP) at [Northwestern Polytechnical University](https://www.nwpu.edu.cn) (NWPU), Xi'an, China. Originally, it was used to collect the source code of the experiments that the author had done in the past.
14 changes: 12 additions & 2 deletions aps/asr/lm/ngram.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
kenlm_available = False

from aps.conf import load_dict
from aps.const import EOS_TOKEN, SOS_TOKEN


class NgramLM(object):
Expand All @@ -29,12 +30,21 @@ def __init__(self, lm: str, vocab_dict: str) -> None:
vocab = load_dict(vocab_dict, reverse=True)
self.token = [None] * len(vocab)
for i, tok in vocab.items():
if tok == "<eos>":
if tok == EOS_TOKEN:
tok = "</s>"
if tok == "<sos>":
if tok == SOS_TOKEN:
tok = "<s>"
self.token[i] = tok

def score(self,
          utterance: str,
          sos: bool = True,
          eos: bool = True) -> float:
    """
    Return the score that the wrapped n-gram LM assigns to an utterance
    Args:
        utterance: transcription passed to the LM as a single string
        sos: forwarded to the LM as its ``bos`` flag
        eos: forwarded to the LM as its ``eos`` flag
    Return:
        score: the value produced by ``self.ngram_lm.score``
    """
    lm_score = self.ngram_lm.score(utterance, bos=sos, eos=eos)
    return lm_score

def _step(self, prev_state):
"""
Args:
Expand Down
24 changes: 23 additions & 1 deletion aps/asr/lm/rnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import torch as th
import torch.nn as nn
import torch.nn.init as init
from typing import NoReturn, Union, Tuple, Optional
from typing import NoReturn, Union, Tuple, List, Optional
from aps.asr.base.component import OneHotEmbedding, PyTorchRNN
from aps.asr.base.decoder import LayerNormRNN
from aps.libs import ApsRegisters
Expand Down Expand Up @@ -76,10 +76,32 @@ def __init__(self,
self.dist.weight = self.embed.weight

def init_weights(self, initrange: float = 0.1) -> None:
    """
    Initialize model weights: the output bias is zeroed, the output
    projection and the embedding weights are drawn uniformly from
    [-initrange, initrange]
    Args:
        initrange: bound of the uniform distribution
    """
    # NOTE: the function returns normally, so it is annotated with None
    # (typing.NoReturn is reserved for functions that never return)
    init.zeros_(self.dist.bias)
    init.uniform_(self.dist.weight, -initrange, initrange)
    init.uniform_(self.embed.weight, -initrange, initrange)

def score(self,
          hypos: List[int],
          sos: int = -1,
          eos: int = -1,
          device: int = -1) -> float:
    """
    Score the given hypothesis: feed [sos] + hypos to the model and sum
    the log-probabilities it assigns to hypos + [eos]
    Args:
        hypos: token ids of the hypothesis (without sos/eos)
        sos: id of the sos token, prepended to the model input
        eos: id of the eos token, appended to the prediction targets
        device: GPU index, a negative value means running on CPU
    Return:
        score: sum of the token log-probabilities
    Raises:
        ValueError: if sos or eos is negative (the defaults are only
            placeholders, a negative eos would silently pick the last
            vocabulary entry)
    """
    if sos < 0 or eos < 0:
        raise ValueError(
            f"sos/eos must be valid token ids, but got sos={sos}, eos={eos}")
    hypos = list(hypos)
    run_device = "cpu" if device < 0 else f"cuda:{device:d}"
    # scoring is inference only: do not build the autograd graph
    with th.no_grad():
        hyp_tensor = th.as_tensor([sos] + hypos, device=run_device)
        # 1 x T+1 => 1 x T+1 x V
        prob, _ = self(hyp_tensor[None, ...])
        # T+1 x V
        prob = th.log_softmax(prob[0], -1)
        # T+1, prediction targets
        target = th.as_tensor(hypos + [eos], device=prob.device)
        # pick the log-probability of each target with one gather instead
        # of one .item() call (host/device sync) per token
        token_logp = prob.gather(-1, target[:, None]).squeeze(-1)
        # accumulate in double, as the python float accumulation did
        return token_logp.double().sum().item()

def forward(
self,
token: th.Tensor,
Expand Down
21 changes: 20 additions & 1 deletion aps/asr/lm/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import torch as th
import torch.nn as nn

from typing import Optional, Tuple, Dict
from typing import Optional, Tuple, Dict, List
from aps.asr.transformer.pose import get_xfmr_pose
from aps.asr.transformer.impl import get_xfmr_encoder
from aps.asr.transformer.utils import prep_sub_mask
Expand Down Expand Up @@ -32,6 +32,25 @@ def __init__(self,
self.dist = nn.Linear(att_dim, vocab_size)
self.vocab_size = vocab_size

def score(self,
          hypos: List[int],
          sos: int = -1,
          eos: int = -1,
          device: int = -1) -> float:
    """
    Score the given hypothesis: feed [sos] + hypos to the model and sum
    the log-probabilities it assigns to hypos + [eos]
    Args:
        hypos: token ids of the hypothesis (without sos/eos)
        sos: id of the sos token, prepended to the model input
        eos: id of the eos token, appended to the prediction targets
        device: GPU index, a negative value means running on CPU
    Return:
        score: sum of the token log-probabilities
    Raises:
        ValueError: if sos or eos is negative (the defaults are only
            placeholders, a negative eos would silently pick the last
            vocabulary entry)
    """
    if sos < 0 or eos < 0:
        raise ValueError(
            f"sos/eos must be valid token ids, but got sos={sos}, eos={eos}")
    hypos = list(hypos)
    run_device = "cpu" if device < 0 else f"cuda:{device:d}"
    # scoring is inference only: do not build the autograd graph
    with th.no_grad():
        hyp_tensor = th.as_tensor([sos] + hypos, device=run_device)
        # 1 x T+1 => 1 x T+1 x V
        prob, _ = self(hyp_tensor[None, ...])
        # T+1 x V
        prob = th.log_softmax(prob[0], -1)
        # T+1, prediction targets
        target = th.as_tensor(hypos + [eos], device=prob.device)
        # pick the log-probability of each target with one gather instead
        # of one .item() call (host/device sync) per token
        token_logp = prob.gather(-1, target[:, None]).squeeze(-1)
        # accumulate in double, as the python float accumulation did
        return token_logp.double().sum().item()

def forward(
self,
token: th.Tensor,
Expand Down
23 changes: 14 additions & 9 deletions aps/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import codecs

from typing import Dict, List, Tuple
from aps.const import EOS_TOKEN, SOS_TOKEN

required_keys = [
"nnet", "nnet_conf", "task", "task_conf", "data_conf", "trainer_conf"
Expand All @@ -16,11 +17,14 @@
"enh_transform", "asr_transform", "cmd_args"
]
all_lm_conf_keys = required_keys + ["cmd_args"]
transducer_or_ctc_tasks = [
"asr@transducer", "asr@ctc", "streaming_asr@transducer", "streaming_asr@ctc"
]


def load_dict(dict_path: str,
reverse: bool = False,
required: List[str] = ["<sos>", "<eos>"]) -> Dict:
required: List[str] = [EOS_TOKEN, SOS_TOKEN]) -> Dict:
"""
Load the dictionary object
Args:
Expand Down Expand Up @@ -110,18 +114,19 @@ def load_am_conf(yaml_conf: str, dict_path: str) -> Tuple[Dict, Dict]:

# add dict info
nnet_conf = conf["nnet_conf"]
vocab = load_dict(
dict_path,
required=[] if conf["task"] == "asr@ctc" else ["<eos>", "<sos>"])
is_transducer_or_ctc = conf["task"] in transducer_or_ctc_tasks

# attention based models read both vocab[SOS_TOKEN] and vocab[EOS_TOKEN],
# so both units must be checked when loading the dictionary
required_units = [] if is_transducer_or_ctc else [SOS_TOKEN, EOS_TOKEN]
vocab = load_dict(dict_path, required=required_units)
nnet_conf["vocab_size"] = len(vocab)

# Generally we don't use eos/sos in
if not is_transducer_or_ctc:
nnet_conf["sos"] = vocab[SOS_TOKEN]
nnet_conf["eos"] = vocab[EOS_TOKEN]
# for transducer/CTC
task_conf = conf["task_conf"]
use_ctc = "ctc_weight" in task_conf and task_conf["ctc_weight"] > 0
is_transducer_or_ctc = conf["task"] in ["asr@transducer", "asr@ctc"]
if not is_transducer_or_ctc:
nnet_conf["sos"] = vocab["<sos>"]
nnet_conf["eos"] = vocab["<eos>"]
# for CTC/RNNT
if use_ctc or is_transducer_or_ctc:
conf["task_conf"]["blank"] = len(vocab)
# add blank
Expand Down
2 changes: 2 additions & 0 deletions aps/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@
MAX_INT16 = np.iinfo(np.int16).max
UNK_TOKEN = "<unk>"
BLK_TOKEN = "<b>"
EOS_TOKEN = "<eos>"
SOS_TOKEN = "<sos>"
OOM_STRING = "out of memory"
TORCH_VERSION = LooseVersion(th.__version__)
94 changes: 42 additions & 52 deletions aps/eval/asr.py
Original file line number Diff line number Diff line change
@@ -1,84 +1,74 @@
# Copyright 2019 Jian Wu
# License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

from aps.tokenizer import Tokenizer
from aps.conf import load_dict
from aps.const import UNK_TOKEN

from typing import List


class TextPreProcessor(object):
class TextProcess(object):
"""
The class for pre processing of the transcriptions, i.e.,
mapping them to the sequence ids for CTC alignment
Base class for pre-/post-processing of the audio transcriptions, i.e.,
mapping them to the sequence ids for CTC alignment or mapping decoding
id sequences to word sequences
"""

def __init__(self, dict_str: str, space: str = "", spm: str = "") -> None:
self.vocab = None
self.space = space
self.sp_mdl = None
if dict_str:
# str to int
self.vocab = load_dict(dict_str)
tokenizer_kwargs = {}
if spm:
import sentencepiece as sp
self.sp_mdl = sp.SentencePieceProcessor(model_file=spm)
tokenizer = "subword"
tokenizer_kwargs["spm"] = spm
else:
if space:
tokenizer = "char"
tokenizer_kwargs["space"] = space
else:
tokenizer = "word"
# str to int
if dict_str:
vocab_dict = load_dict(dict_str)
self.tokenizer = Tokenizer(vocab_dict,
tokenizer=tokenizer,
tokenizer_kwargs=tokenizer_kwargs)
else:
self.tokenizer = None


class TextPreProcessor(TextProcess):
"""
Text pre-processing class
"""

def __init__(self, dict_str: str, space: str = "", spm: str = "") -> None:
super(TextPreProcessor, self).__init__(dict_str, space=space, spm=spm)

def run(self, str_seq: List[str]) -> List[int]:
if self.vocab:
if self.sp_mdl:
# subword str sequence
str_seq = self.sp_mdl.encode(" ".join(str_seq), out_type=str)
if self.space:
# insert space
str_seq = f" {self.space} ".join(str_seq).split(" ")
int_seq = [(self.vocab[idx]
if idx in self.vocab else self.vocab[UNK_TOKEN])
for idx in str_seq]
if self.tokenizer:
int_seq = self.tokenizer.encode(str_seq)
else:
# no tokenizer available
int_seq = [int(idx) for idx in str_seq]
return int_seq


class TextPostProcessor(object):
class TextPostProcessor(TextProcess):
"""
The class for post processing of decoding sequence, i.e.,
mapping id sequence to token sequence
Text post-processing class
"""

def __init__(self,
dict_str: str,
space: str = "",
show_unk: str = "<unk>",
spm: str = "") -> None:
super(TextPostProcessor, self).__init__(dict_str, space=space, spm=spm)
self.unk = show_unk
self.space = space
self.vocab = None
self.sp_mdl = None
if dict_str:
# int to str
self.vocab = load_dict(dict_str, reverse=True)
if spm:
import sentencepiece as sp
self.sp_mdl = sp.SentencePieceProcessor(model_file=spm)

def run(self, int_seq: List[int]) -> str:
if self.vocab:
trans = [self.vocab[idx] for idx in int_seq]
else:
trans = [str(idx) for idx in int_seq]
# char sequence
if self.vocab:
if self.sp_mdl:
trans = self.sp_mdl.decode(trans)
else:
if self.space:
trans = "".join(trans).replace(self.space, " ")
else:
trans = " ".join(trans)
if self.unk != UNK_TOKEN:
trans = trans.replace(UNK_TOKEN, self.unk)
# ID sequence
if self.tokenizer:
str_seq = self.tokenizer.decode(int_seq, unk_sym=self.unk)
else:
trans = " ".join(trans)
return trans
# no tokenizer available
str_seq = [str(idx) for idx in int_seq]
return " ".join(str_seq)
44 changes: 23 additions & 21 deletions aps/io/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,27 +40,29 @@ class NbestReader(object):
def __init__(self, nbest: str):
self.nbest, self.hypos = self._load_nbest(nbest)

def __len__(self) -> int:
return len(self.hypos)

def __iter__(self):
return iter(self.hypos.items())

def _load_nbest(self, nbest: str):
hypos = {}
nbest = 1
with codecs.open(nbest, "r", encoding="utf-8") as f:
nbest = int(f.readline())
while True:
key = f.readline().strip()
if not key:
break
topk = []
n = 0
while n < self.nbest:
items = f.readline().strip().split()
score = float(items[0])
num_tokens = int(items[1])
trans = " ".join(items[2:])
topk.append((score, num_tokens, trans))
n += 1
hypos[key] = topk
with codecs.open(nbest, "r", encoding="utf-8") as fd:
all_lines = fd.readlines()
nbest = int(all_lines[0].strip())
if (len(all_lines) - 1) % (nbest + 1) != 0:
raise RuntimeError("Seems that nbest format is wrong")
n = 1
while n < len(all_lines):
key = all_lines[n].strip()
topk = []
for i in range(nbest):
items = all_lines[n + 1 + i].strip().split()
score = float(items[0])
num_tokens = int(items[1])
trans = " ".join(items[2:])
topk.append((score, num_tokens, trans))
n += nbest + 1
hypos[key] = topk
return nbest, hypos

def __iter__(self):
for key in self.hypos:
yield key, self.hypos[key]
6 changes: 3 additions & 3 deletions aps/libs.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class ApsModules(object):
"bss.dense_unet", "bss.sepformer"
]
loader_submodules = [
"am.kaldi", "am.raw", "am.command", "se.chunk", "se.command",
"am.kaldi", "am.raw", "am.simu_cmd", "se.chunk", "se.simu_cmd",
"se.config", "lm.utt", "lm.bptt"
]
asr = Module("aps.asr", asr_submodules)
Expand Down Expand Up @@ -265,10 +265,10 @@ def start_trainer(trainer: str,
max_batch_size=args.batch_size // num_process,
**loader_conf,
**data_conf["train"])
dev_batch_size = int(args.batch_size // args.dev_batch_factor)
dev_loader = aps_dataloader(train=False,
distributed=False,
max_batch_size=args.batch_size //
args.dev_batch_factor,
max_batch_size=dev_batch_size,
**loader_conf,
**data_conf["valid"])
trainer.run(trn_loader,
Expand Down
Loading

0 comments on commit 571c845

Please sign in to comment.