Skip to content

Commit 4b2bce7

Browse files
authored
Merge pull request #4919 from a2d8a4v/master
Fix: consecutive identical phonemes were aggregated into one when computing GOP features via compute-gop
2 parents 01aadd7 + 6c74b9a commit 4b2bce7

File tree

3 files changed

+43
-22
lines changed

3 files changed

+43
-22
lines changed

egs/gop_speechocean762/README.md

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
There is a copy of this document on Google Docs, which renders the equations better:
2-
[link](https://docs.google.com/document/d/1pie-PU6u2NZZC_FzocBGGm6mpfBJMiCft9UoG0uA1kA/edit?usp=sharing)
3-
4-
* * *
5-
61
# GOP on Kaldi
72

83
The Goodness of Pronunciation (GOP) is a variation of the posterior probability, used for phone-level pronunciation scoring.

egs/gop_speechocean762/s5/run.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
# Copyright 2019 Junbo Zhang
44
# 2020-2021 Xiaomi Corporation (Author: Junbo Zhang, Yongqing Wang)
5+
# 2024 Jiun-Ting Li (National Taiwan Normal University)
56
# Apache 2.0
67

78
# This script shows how to calculate Goodness of Pronunciation (GOP) and
@@ -175,6 +176,7 @@ if [ $stage -le 12 ]; then
175176
compute-gop --phone-map=data/lang_nosp/phone-to-pure-phone.int \
176177
--skip-phones-string=0:1:2 \
177178
$model/final.mdl \
179+
"ark,t:gunzip -c exp/ali_$part/ali.JOB.gz|" \
178180
"ark,t:gunzip -c exp/ali_$part/ali-phone.JOB.gz|" \
179181
"ark:exp/probs_$part/output.JOB.ark" \
180182
"ark,scp:exp/gop_$part/gop.JOB.ark,exp/gop_$part/gop.JOB.scp" \

src/bin/compute-gop.cc

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// bin/compute-gop.cc
22

33
// Copyright 2019 Junbo Zhang
4+
// 2024 Jiun-Ting Li (National Taiwan Normal University)
45

56
// See ../../COPYING for clarification regarding multiple authors
67
//
@@ -107,11 +108,14 @@ int main(int argc, char *argv[]) {
107108
const char *usage =
108109
"Compute Goodness Of Pronunciation (GOP) from a matrix of "
109110
"probabilities (e.g. from nnet3-compute).\n"
110-
"Usage: compute-gop [options] <model> <alignments-rspecifier> "
111+
"Usage: compute-gop [options] <model> "
112+
"<transition-alignments-rspecifier> "
113+
"<phoneme-alignments-rspecifier> "
111114
"<prob-matrix-rspecifier> <gop-wspecifier> "
112-
"[<phone-feature-wspecifier>]\n"
115+
"<phone-feature-wspecifier>\n"
113116
"e.g.:\n"
114-
" nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-"
117+
" nnet3-compute [args] | compute-gop 1.mdl ark:ali.1 ark:ali-phone.1 "
118+
" ark:output.1.ark "
115119
" ark:gop.1 ark:phone-feat.1\n";
116120

117121
ParseOptions po(usage);
@@ -130,16 +134,17 @@ int main(int argc, char *argv[]) {
130134

131135
po.Read(argc, argv);
132136

133-
if (po.NumArgs() != 4 && po.NumArgs() != 5) {
137+
if (po.NumArgs() != 6) {
134138
po.PrintUsage();
135139
exit(1);
136140
}
137141

138142
std::string model_filename = po.GetArg(1),
139-
alignments_rspecifier = po.GetArg(2),
140-
prob_rspecifier = po.GetArg(3),
141-
gop_wspecifier = po.GetArg(4),
142-
feat_wspecifier = po.GetArg(5);
143+
transition_alignments_rspecifier = po.GetArg(2),
144+
phoneme_alignments_rspecifier = po.GetArg(3),
145+
prob_rspecifier = po.GetArg(4),
146+
gop_wspecifier = po.GetArg(5),
147+
feat_wspecifier = po.GetArg(6);
143148

144149
TransitionModel trans_model;
145150
{
@@ -174,33 +179,50 @@ int main(int argc, char *argv[]) {
174179
}
175180
}
176181

177-
RandomAccessInt32VectorReader alignment_reader(alignments_rspecifier);
182+
RandomAccessInt32VectorReader phoneme_alignments_reader(phoneme_alignments_rspecifier);
183+
RandomAccessInt32VectorReader transition_alignments_reader(transition_alignments_rspecifier);
178184
SequentialBaseFloatMatrixReader prob_reader(prob_rspecifier);
179185
PosteriorWriter gop_writer(gop_wspecifier);
180186
BaseFloatVectorWriter feat_writer(feat_wspecifier);
181187

182188
int32 num_done = 0;
183189
for (; !prob_reader.Done(); prob_reader.Next()) {
184190
std::string key = prob_reader.Key();
185-
if (!alignment_reader.HasKey(key)) {
186-
KALDI_WARN << "No alignment for utterance " << key;
191+
if (!phoneme_alignments_reader.HasKey(key)) {
192+
KALDI_WARN << "No phoneme alignment for utterance " << key;
187193
continue;
188194
}
189-
auto alignment = alignment_reader.Value(key);
195+
if (!transition_alignments_reader.HasKey(key)) {
196+
KALDI_WARN << "No transition alignment for utterance " << key;
197+
continue;
198+
}
199+
auto phoneme_alignment = phoneme_alignments_reader.Value(key);
200+
auto transition_alignment = transition_alignments_reader.Value(key);
190201
Matrix<BaseFloat> &probs = prob_reader.Value();
191202
if (log_applied) probs.ApplyExp();
192203

204+
std::vector<std::vector<int32> > split;
205+
SplitToPhones(trans_model, transition_alignment, &split);
206+
207+
std::vector<int32> phone_boundary;
208+
for (int32 i = 0; i < split.size(); i++) {
209+
for (int32 j = 0; j < split[i].size(); j++) {
210+
phone_boundary.push_back(i);
211+
}
212+
}
213+
193214
Matrix<BaseFloat> lpps;
194215
ComputeLpps(probs, pdf2phones, &lpps);
195216

196-
int32 frame_num = alignment.size();
197-
if (alignment.size() != probs.NumRows()) {
217+
int32 frame_num = phoneme_alignment.size();
218+
if (phoneme_alignment.size() != probs.NumRows()) {
198219
KALDI_WARN << "The frame numbers of alignment and prob are not equal.";
199220
if (frame_num > probs.NumRows()) frame_num = probs.NumRows();
200221
}
201222

202223
KALDI_ASSERT(frame_num > 0);
203-
int32 cur_phone_id = alignment[0];
224+
int32 cur_phone_id = phoneme_alignment[0];
225+
int32 cur_phone_pos = phone_boundary[0];
204226
int32 duration = 0;
205227
Vector<BaseFloat> phone_level_feat(1 + phone_num * 2); // [phone LPPs LPRs]
206228
SubVector<BaseFloat> lpp_part(phone_level_feat, 1, phone_num);
@@ -220,8 +242,9 @@ int main(int argc, char *argv[]) {
220242
lpp_part.AddVec(1, frame_level_lpp);
221243
duration++;
222244

223-
int32 next_phone_id = (i < frame_num - 1) ? alignment[i + 1]: -1;
224-
if (next_phone_id != cur_phone_id) {
245+
int32 next_phone_id = (i < frame_num - 1) ? phoneme_alignment[i + 1]: -1;
246+
int32 next_phone_pos = (i < frame_num - 1) ? phone_boundary[i + 1]: -1;
247+
if (next_phone_pos != cur_phone_pos) {
225248
int32 phone_id = phone_map.empty() ? cur_phone_id : phone_map[cur_phone_id];
226249

227250
// The current phone's features are now complete
@@ -248,6 +271,7 @@ int main(int argc, char *argv[]) {
248271
duration = 0;
249272
}
250273
cur_phone_id = next_phone_id;
274+
cur_phone_pos = next_phone_pos;
251275
}
252276

253277
// Write GOPs and the GOP-based features

0 commit comments

Comments
 (0)