1
1
// bin/compute-gop.cc
2
2
3
3
// Copyright 2019 Junbo Zhang
4
+ // 2024 Jiun-Ting Li (National Taiwan Normal University)
4
5
5
6
// See ../../COPYING for clarification regarding multiple authors
6
7
//
@@ -107,11 +108,14 @@ int main(int argc, char *argv[]) {
107
108
const char *usage =
108
109
" Compute Goodness Of Pronunciation (GOP) from a matrix of "
109
110
" probabilities (e.g. from nnet3-compute).\n "
110
- " Usage: compute-gop [options] <model> <alignments-rspecifier> "
111
+ " Usage: compute-gop [options] <model> "
112
+ " <transition-alignments-respecifier> "
113
+ " <phoneme-alignments-rspecifier> "
111
114
" <prob-matrix-rspecifier> <gop-wspecifier> "
112
- " [ <phone-feature-wspecifier>] \n "
115
+ " <phone-feature-wspecifier>\n "
113
116
" e.g.:\n "
114
- " nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-"
117
+ " nnet3-compute [args] | compute-gop 1.mdl ark:ali.1 ark:ali-phone.1 "
118
+ " ark:output.1.ark "
115
119
" ark:gop.1 ark:phone-feat.1\n " ;
116
120
117
121
ParseOptions po (usage);
@@ -130,16 +134,17 @@ int main(int argc, char *argv[]) {
130
134
131
135
po.Read (argc, argv);
132
136
133
- if (po.NumArgs () != 4 && po. NumArgs () != 5 ) {
137
+ if (po.NumArgs () != 6 ) {
134
138
po.PrintUsage ();
135
139
exit (1 );
136
140
}
137
141
138
142
std::string model_filename = po.GetArg (1 ),
139
- alignments_rspecifier = po.GetArg (2 ),
140
- prob_rspecifier = po.GetArg (3 ),
141
- gop_wspecifier = po.GetArg (4 ),
142
- feat_wspecifier = po.GetArg (5 );
143
+ transition_alignments_rspecifier = po.GetArg (2 ),
144
+ phoneme_alignments_rspecifier = po.GetArg (3 ),
145
+ prob_rspecifier = po.GetArg (4 ),
146
+ gop_wspecifier = po.GetArg (5 ),
147
+ feat_wspecifier = po.GetArg (6 );
143
148
144
149
TransitionModel trans_model;
145
150
{
@@ -174,33 +179,50 @@ int main(int argc, char *argv[]) {
174
179
}
175
180
}
176
181
177
- RandomAccessInt32VectorReader alignment_reader (alignments_rspecifier);
182
+ RandomAccessInt32VectorReader phoneme_alignments_reader (phoneme_alignments_rspecifier);
183
+ RandomAccessInt32VectorReader transition_alignments_reader (transition_alignments_rspecifier);
178
184
SequentialBaseFloatMatrixReader prob_reader (prob_rspecifier);
179
185
PosteriorWriter gop_writer (gop_wspecifier);
180
186
BaseFloatVectorWriter feat_writer (feat_wspecifier);
181
187
182
188
int32 num_done = 0 ;
183
189
for (; !prob_reader.Done (); prob_reader.Next ()) {
184
190
std::string key = prob_reader.Key ();
185
- if (!alignment_reader .HasKey (key)) {
186
- KALDI_WARN << " No alignment for utterance " << key;
191
+ if (!phoneme_alignments_reader .HasKey (key)) {
192
+ KALDI_WARN << " No phoneme alignment for utterance " << key;
187
193
continue ;
188
194
}
189
- auto alignment = alignment_reader.Value (key);
195
+ if (!transition_alignments_reader.HasKey (key)) {
196
+ KALDI_WARN << " No transition alignment for utterance " << key;
197
+ continue ;
198
+ }
199
+ auto phoneme_alignment = phoneme_alignments_reader.Value (key);
200
+ auto transition_alignment = transition_alignments_reader.Value (key);
190
201
Matrix<BaseFloat> &probs = prob_reader.Value ();
191
202
if (log_applied) probs.ApplyExp ();
192
203
204
+ std::vector<std::vector<int32> > split;
205
+ SplitToPhones (trans_model, transition_alignment, &split);
206
+
207
+ std::vector<int32> phone_boundary;
208
+ for (int32 i = 0 ; i < split.size (); i++) {
209
+ for (int32 j = 0 ; j < split[i].size (); j++) {
210
+ phone_boundary.push_back (i);
211
+ }
212
+ }
213
+
193
214
Matrix<BaseFloat> lpps;
194
215
ComputeLpps (probs, pdf2phones, &lpps);
195
216
196
- int32 frame_num = alignment .size ();
197
- if (alignment .size () != probs.NumRows ()) {
217
+ int32 frame_num = phoneme_alignment .size ();
218
+ if (phoneme_alignment .size () != probs.NumRows ()) {
198
219
KALDI_WARN << " The frame numbers of alignment and prob are not equal." ;
199
220
if (frame_num > probs.NumRows ()) frame_num = probs.NumRows ();
200
221
}
201
222
202
223
KALDI_ASSERT (frame_num > 0 );
203
- int32 cur_phone_id = alignment[0 ];
224
+ int32 cur_phone_id = phoneme_alignment[0 ];
225
+ int32 cur_phone_pos = phone_boundary[0 ];
204
226
int32 duration = 0 ;
205
227
Vector<BaseFloat> phone_level_feat (1 + phone_num * 2 ); // [phone LPPs LPRs]
206
228
SubVector<BaseFloat> lpp_part (phone_level_feat, 1 , phone_num);
@@ -220,8 +242,9 @@ int main(int argc, char *argv[]) {
220
242
lpp_part.AddVec (1 , frame_level_lpp);
221
243
duration++;
222
244
223
- int32 next_phone_id = (i < frame_num - 1 ) ? alignment[i + 1 ]: -1 ;
224
- if (next_phone_id != cur_phone_id) {
245
+ int32 next_phone_id = (i < frame_num - 1 ) ? phoneme_alignment[i + 1 ]: -1 ;
246
+ int32 next_phone_pos = (i < frame_num - 1 ) ? phone_boundary[i + 1 ]: -1 ;
247
+ if (next_phone_pos != cur_phone_pos) {
225
248
int32 phone_id = phone_map.empty () ? cur_phone_id : phone_map[cur_phone_id];
226
249
227
250
// The current phone's features are now ready
@@ -248,6 +271,7 @@ int main(int argc, char *argv[]) {
248
271
duration = 0 ;
249
272
}
250
273
cur_phone_id = next_phone_id;
274
+ cur_phone_pos = next_phone_pos;
251
275
}
252
276
253
277
// Write GOPs and the GOP-based features
0 commit comments