[doc] update aishell2 u2++ transformer results (#477)
* [doc] update aishell2 u2++ transformer results
* Update train_u2++_transformer.yaml

Co-authored-by: Binbin Zhang <[email protected]>
1 parent c66e860, commit 7f00996, 2 changed files with 115 additions and 0 deletions.
train_u2++_transformer.yaml
@@ -0,0 +1,100 @@
# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false
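# Note: use_dynamic_chunk trains the encoder on randomly sampled chunk sizes,
# so the same model can decode both in streaming (chunk-by-chunk) and
# non-streaming (full-context) mode, following the U2/U2++ training recipe.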
# decoder related
decoder: bitransformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 3
    r_num_blocks: 3
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
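# The bitransformer decoder is the U2++ bidirectional decoder: a left-to-right
# decoder with num_blocks layers plus a right-to-left decoder with r_num_blocks
# layers, weighted at training time by reverse_weight in model_conf below.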
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
    reverse_weight: 0.3

# use raw_wav or kaldi feature
raw_wav: true
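# The training objective combines both branches, roughly:
#   loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att
# where, with the bidirectional decoder, the attention loss mixes directions:
#   loss_att = (1 - reverse_weight) * loss_left2right + reverse_weight * loss_right2left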
# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 0.0
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: false
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    # spec_swap: false
    feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] on the fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80
    spec_sub: true
    spec_sub_conf:
        num_t_sub: 3
        max_t: 20
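# spec_aug applies SpecAugment on the fbank features: num_t_mask time masks of
# at most max_t frames and num_f_mask frequency masks of at most max_f bins
# (time warping is disabled here via warp_for_time: False; max_w would bound
# the warp). spec_sub is the U2++ spec substitution: num_t_sub time spans of
# at most max_t frames are each replaced with frames copied from an earlier
# position in the same utterance.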
# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # batch_size should be set according to your GPU memory size; here we used
    # a Titan Xp GPU with 12 GB of memory
    batch_size: 22
    sort: true
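# With batch_type 'static', each batch holds exactly batch_size utterances;
# 'dynamic' instead packs utterances until a frame budget is reached, which
# reduces padding waste when utterance lengths vary widely.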
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
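# accum_grad accumulates gradients over that many batches before each optimizer
# step, so the effective batch size is roughly batch_size * accum_grad * #GPUs;
# grad_clip caps the global gradient norm before each update.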
optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
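For reference, warmuplr is the inverse-square-root ("Noam") schedule: the learning rate grows linearly to its peak value lr at warmup_steps and then decays as step^-0.5. Below is a minimal sketch of that schedule under the formula used by ESPnet-style WarmupLR schedulers, which WeNet follows; the function name warmup_lr and the printed steps are ours for illustration:

# Noam-style warmup: linear ramp up to the peak learning rate, which is
# reached exactly at warmup_steps, followed by inverse-square-root decay.
def warmup_lr(step: int, base_lr: float = 0.002, warmup_steps: int = 25000) -> float:
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

if __name__ == "__main__":
    # e.g. with the values above, the rate peaks at 0.002 when step == 25000
    for step in (100, 12500, 25000, 100000):
        print(f"step {step:>6d}: lr = {warmup_lr(step):.6f}")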