diff --git a/egs/aishell/s10b/ctc/transform.py b/egs/aishell/s10b/ctc/transform.py index 480377136bd..04d63605ae7 100644 --- a/egs/aishell/s10b/ctc/transform.py +++ b/egs/aishell/s10b/ctc/transform.py @@ -4,19 +4,22 @@ # Apache 2.0 import torch +import torch.nn as nn import torch.nn.functional as F -def compute_delta_feat(x, weight): +def compute_delta_feat(x, weight, enable_padding): ''' Args: x: input feat of shape [batch_size, seq_len, feat_dim] weight: coefficients for computing delta features; - it has a shape of [feat_dim, 1, kernel_size]. + it has shape [feat_dim, 1, kernel_size]. + + enable_padding: True to add padding. Returns: - a tensor fo shape [batch_size, seq_len, feat_dim] + a tensor of shape [batch_size, seq_len, feat_dim] ''' assert x.ndim == 3 @@ -27,51 +30,61 @@ def compute_delta_feat(x, weight): feat_dim = x.size(2) - pad_size = weight.size(2) // 2 + if enable_padding: + pad_size = weight.size(2) // 2 - # F.pad requires a 4-D tensor in our case - x = x.unsqueeze(0) + # F.pad requires a 4-D tensor in our case + x = x.unsqueeze(0) - # (0, 0, pad_size, pad_size) == (left, right, top, bottom) - padded_x = F.pad(x, (0, 0, pad_size, pad_size), mode='replicate') + # (0, 0, pad_size, pad_size) == (left, right, top, bottom) + x = F.pad(x, (0, 0, pad_size, pad_size), mode='replicate') - # after padding, we have to convert it back to 3-D - # since conv1d requires 3-D input - padded_x = padded_x.squeeze(0) + # after padding, we have to convert it back to 3-D + # since conv1d requires 3-D input + x = x.squeeze(0) # conv1d requires a shape of [batch_size, feat_dim, seq_len] - padded_x = padded_x.permute(0, 2, 1) + x = x.permute(0, 2, 1) # NOTE(fangjun): we perform a depthwise convolution here by # setting groups == number of channels - y = F.conv1d(input=padded_x, weight=weight, groups=feat_dim) + y = F.conv1d(input=x, weight=weight, groups=feat_dim) - # now convert y back to be of shape [batch_size, seq_len, feat_dim] + # now convert y back to shape [batch_size, seq_len, feat_dim] y = y.permute(0, 2, 1) return y -class AddDeltasTransform: +class AddDeltasTransform(nn.Module): ''' This class implements `add-deltas` in kaldi with order == 2 and window == 2. - It generates the identical output as kaldi's `add-deltas` with default - parameters given the same input. + It can generate the identical output as kaldi's `add-deltas`. + + See transform_test.py ''' - def __init__(self): - # yapf: disable - self.first_order_coef = torch.tensor([-0.2, -0.1, 0, 0.1, 0.2]) - self.second_order_coef = torch.tensor([0.04, 0.04, 0.01, -0.04, -0.1, -0.04, 0.01, 0.04, 0.04]) - # yapf: enable + def __init__(self, + first_order_coef=[-1, 0, 1], + second_order_coef=[1, 0, -2, 0, 1], + enable_padding=False): + ''' + Note that this class has no trainable `nn.Parameters`. + + Args: + first_order_coef: coefficient to compute the first order delta feature + + second_order_coef: coefficient to compute the second order delta feature + ''' + super().__init__() - # TODO(fangjun): change the coefficients to the following as suggested by Dan - # [-1, 0, 1] - # [1, 0, -2, 0, 1] + self.first_order_coef = torch.tensor(first_order_coef) + self.second_order_coef = torch.tensor(second_order_coef) + self.enable_padding = enable_padding - def __call__(self, x): + def forward(self, x): ''' Args: x: a tensor of shape [batch_size, seq_len, feat_dim] @@ -94,9 +107,22 @@ def __call__(self, x): self.first_order_coef = self.first_order_coef.to(device) self.second_order_coef = self.second_order_coef.to(device) - first_order = compute_delta_feat(x, self.first_order_coef) - second_order = compute_delta_feat(x, self.second_order_coef) - - y = torch.cat([x, first_order, second_order], dim=2) + first_order = compute_delta_feat(x, self.first_order_coef, + self.enable_padding) + second_order = compute_delta_feat(x, self.second_order_coef, + self.enable_padding) + + if self.enable_padding: + y = torch.cat([x, first_order, second_order], dim=2) + else: + zeroth = (x.size(1) - second_order.size(1)) // 2 + first = (first_order.size(1) - second_order.size(1)) // 2 + + y = torch.cat([ + x[:, zeroth:-zeroth, :], + first_order[:, first:-first, :], + second_order, + ], + dim=2) return y diff --git a/egs/aishell/s10b/ctc/transform_test.py b/egs/aishell/s10b/ctc/transform_test.py index a413abe49da..6c4e3550d01 100755 --- a/egs/aishell/s10b/ctc/transform_test.py +++ b/egs/aishell/s10b/ctc/transform_test.py @@ -30,11 +30,18 @@ def test_add_deltas_transform(self): [3, 2], [5, 1], [10, -2], + [10, 20], + [100, 200], ]).float() x = x.unsqueeze(0) - transform = AddDeltasTransform() + transform = AddDeltasTransform( + first_order_coef=[-0.2, -0.1, 0, 0.1, 0.2], + second_order_coef=[ + 0.04, 0.04, 0.01, -0.04, -0.1, -0.04, 0.01, 0.04, 0.04 + ], + enable_padding=True) y = transform(x) # now use kaldi's add-deltas to compute the ground truth @@ -60,7 +67,17 @@ def test_add_deltas_transform(self): y = y.squeeze(0) - np.testing.assert_array_almost_equal(y.numpy(), expected.numpy()) + np.testing.assert_array_almost_equal(y.numpy(), + expected.numpy(), + decimal=5) + + # now for padding == False + transform.enable_padding = False + y = transform(x).squeeze(0) + + np.testing.assert_array_almost_equal(y.numpy(), + expected.numpy()[4:-4, :], + decimal=5) reader.Close()