Commit a5561c3

freewym authored and danpovey committed
[src,scripts] Simplify model combination: do simple average over last n models (kaldi-asr#2067)
1 parent 48656c3 commit a5561c3

17 files changed: +246 -1757 lines

egs/wsj/s5/steps/info/chain_dir_info.pl

+3
@@ -137,6 +137,9 @@ sub get_combine_info {
       if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) {
         close(F);
         return sprintf(" combine=%.3f->%.3f", $1, $2);
+      } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) {
+        close(F);
+        return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1);
       }
     }
   }

egs/wsj/s5/steps/info/nnet3_dir_info.pl

+3
@@ -137,6 +137,9 @@ sub get_combine_info {
       if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) {
         close(F);
         return sprintf(" combine=%.2f->%.2f", $1, $2);
+      } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) {
+        close(F);
+        return sprintf(" combine=%.2f->%.2f (over %d)", $2, $3, $1);
       }
     }
   }
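Both info scripts gain the same elsif branch so that steps/info/*_dir_info.pl can summarize the log line written by the simplified combination binary. As a quick illustration of what the new regex captures, here is a minimal Python sketch; the sample log text is hypothetical but follows the KALDI_LOG format added in nnet3-chain-combine.cc later in this commit:

import re

# Illustrative only: mirrors the new Perl branch above.
line = "Combining 18 nnets, objective function changed from -0.112 to -0.108"

m = re.search(r"Combining (\S+) nnets, objective function changed from (\S+) to (\S+)", line)
if m:
    num_models, objf_from, objf_to = m.groups()
    # Same ordering as the Perl sprintf: objective before/after, then model count.
    print(" combine=%.3f->%.3f (over %d)" % (float(objf_from), float(objf_to), int(num_models)))
    # -> " combine=-0.112->-0.108 (over 18)"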

egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py

+4 -13
@@ -492,7 +492,7 @@ def compute_progress(dir, iter, run_opts):
 def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str,
                    egs_dir, leaky_hmm_coefficient, l2_regularize,
                    xent_regularize, run_opts,
-                   sum_to_one_penalty=0.0):
+                   max_objective_evaluations=30):
     """ Function to do model combination

     In the nnet3 setup, the logic
@@ -505,9 +505,6 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str,

     models_to_combine.add(num_iters)

-    # TODO: if it turns out the sum-to-one-penalty code is not useful,
-    # remove support for it.
-
     for iter in sorted(models_to_combine):
         model_file = '{0}/{1}.mdl'.format(dir, iter)
         if os.path.exists(model_file):
@@ -528,12 +525,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str,

     common_lib.execute_command(
         """{command} {combine_queue_opt} {dir}/log/combine.log \
-                nnet3-chain-combine --num-iters={opt_iters} \
+                nnet3-chain-combine \
+                --max-objective-evaluations={max_objective_evaluations} \
                 --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \
-                --separate-weights-per-component={separate_weights} \
-                --enforce-sum-to-one={hard_enforce} \
-                --sum-to-one-penalty={penalty} \
-                --enforce-positive-weights=true \
                 --verbose=3 {dir}/den.fst {raw_models} \
                 "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/combine.cegs ark:- | \
                 nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \
@@ -542,12 +536,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str,
                 {dir}/final.mdl""".format(
                     command=run_opts.command,
                     combine_queue_opt=run_opts.combine_queue_opt,
-                    opt_iters=(20 if sum_to_one_penalty <= 0 else 80),
-                    separate_weights=(sum_to_one_penalty > 0),
+                    max_objective_evaluations=max_objective_evaluations,
                     l2=l2_regularize, leaky=leaky_hmm_coefficient,
                     dir=dir, raw_models=" ".join(raw_model_strings),
-                    hard_enforce=(sum_to_one_penalty <= 0),
-                    penalty=sum_to_one_penalty,
                     num_chunk_per_mb=num_chunk_per_minibatch_str,
                     num_iters=num_iters,
                     egs_dir=egs_dir))

egs/wsj/s5/steps/libs/nnet3/train/common.py

+11 -3
@@ -852,6 +852,16 @@ def __init__(self,
                                  the final model combination stage. These
                                  models will themselves be averages of
                                  iteration-number ranges""")
+        self.parser.add_argument("--trainer.optimization.max-objective-evaluations",
+                                 "--trainer.max-objective-evaluations",
+                                 type=int, dest='max_objective_evaluations',
+                                 default=30,
+                                 help="""The maximum number of objective
+                                 evaluations in order to figure out the
+                                 best number of models to combine. It helps to
+                                 speedup if the number of models provided to the
+                                 model combination binary is quite large (e.g.
+                                 several hundred).""")
         self.parser.add_argument("--trainer.optimization.do-final-combination",
                                  dest='do_final_combination', type=str,
                                  action=common_lib.StrToBoolAction,
@@ -861,9 +871,7 @@ def __init__(self,
                                  last-numbered model as the final.mdl).""")
         self.parser.add_argument("--trainer.optimization.combine-sum-to-one-penalty",
                                  type=float, dest='combine_sum_to_one_penalty', default=0.0,
-                                 help="""If > 0, activates 'soft' enforcement of the
-                                 sum-to-one penalty in combination (may be helpful
-                                 if using dropout). E.g. 1.0e-03.""")
+                                 help="""This option is deprecated and does nothing.""")
         self.parser.add_argument("--trainer.optimization.momentum", type=float,
                                  dest='momentum', default=0.0,
                                  help="""Momentum used in update computation.

egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py

+4 -7
@@ -452,7 +452,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir,
                    minibatch_size_str,
                    run_opts,
                    chunk_width=None, get_raw_nnet_from_am=True,
-                   sum_to_one_penalty=0.0,
+                   max_objective_evaluations=30,
                    use_multitask_egs=False,
                    compute_per_dim_accuracy=False):
     """ Function to do model combination
@@ -501,10 +501,8 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir,
                             use_multitask_egs=use_multitask_egs)
     common_lib.execute_command(
         """{command} {combine_queue_opt} {dir}/log/combine.log \
-                nnet3-combine --num-iters=80 \
-                --enforce-sum-to-one={hard_enforce} \
-                --sum-to-one-penalty={penalty} \
-                --enforce-positive-weights=true \
+                nnet3-combine \
+                --max-objective-evaluations={max_objective_evaluations} \
                 --verbose=3 {raw_models} \
                 "ark,bg:nnet3-copy-egs {multitask_egs_opts} \
                 {egs_rspecifier} ark:- | \
@@ -513,9 +511,8 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir,
         """.format(command=run_opts.command,
                    combine_queue_opt=run_opts.combine_queue_opt,
                    dir=dir, raw_models=" ".join(raw_model_strings),
+                   max_objective_evaluations=max_objective_evaluations,
                    egs_rspecifier=egs_rspecifier,
-                   hard_enforce=(sum_to_one_penalty <= 0),
-                   penalty=sum_to_one_penalty,
                    mbsize=minibatch_size_str,
                    out_model=out_model,
                    multitask_egs_opts=multitask_egs_opts))

egs/wsj/s5/steps/nnet3/chain/train.py

+1 -1
@@ -554,7 +554,7 @@ def train(args, run_opts):
                 l2_regularize=args.l2_regularize,
                 xent_regularize=args.xent_regularize,
                 run_opts=run_opts,
-                sum_to_one_penalty=args.combine_sum_to_one_penalty)
+                max_objective_evaluations=args.max_objective_evaluations)
     else:
         logger.info("Copying the last-numbered model to final.mdl")
         common_lib.force_symlink("{0}.mdl".format(num_iters),

egs/wsj/s5/steps/nnet3/train_dnn.py

+1 -1
@@ -364,7 +364,7 @@ def train(args, run_opts):
                 models_to_combine=models_to_combine,
                 egs_dir=egs_dir,
                 minibatch_size_str=args.minibatch_size, run_opts=run_opts,
-                sum_to_one_penalty=args.combine_sum_to_one_penalty)
+                max_objective_evaluations=args.max_objective_evaluations)

     if args.stage <= num_iters + 1:
         logger.info("Getting average posterior for purposes of "

egs/wsj/s5/steps/nnet3/train_raw_dnn.py

+1 -1
@@ -398,7 +398,7 @@ def train(args, run_opts):
                 models_to_combine=models_to_combine, egs_dir=egs_dir,
                 minibatch_size_str=args.minibatch_size, run_opts=run_opts,
                 get_raw_nnet_from_am=False,
-                sum_to_one_penalty=args.combine_sum_to_one_penalty,
+                max_objective_evaluations=args.max_objective_evaluations,
                 use_multitask_egs=use_multitask_egs)
     else:
         common_lib.force_symlink("{0}.raw".format(num_iters),

egs/wsj/s5/steps/nnet3/train_raw_rnn.py

+1 -1
@@ -475,7 +475,7 @@ def train(args, run_opts):
                 run_opts=run_opts, chunk_width=args.chunk_width,
                 get_raw_nnet_from_am=False,
                 compute_per_dim_accuracy=args.compute_per_dim_accuracy,
-                sum_to_one_penalty=args.combine_sum_to_one_penalty)
+                max_objective_evaluations=args.max_objective_evaluations)
     else:
         common_lib.force_symlink("{0}.raw".format(num_iters),
                                  "{0}/final.raw".format(args.dir))

egs/wsj/s5/steps/nnet3/train_rnn.py

+1 -1
@@ -451,7 +451,7 @@ def train(args, run_opts):
                 run_opts=run_opts,
                 minibatch_size_str=args.num_chunk_per_minibatch,
                 chunk_width=args.chunk_width,
-                sum_to_one_penalty=args.combine_sum_to_one_penalty,
+                max_objective_evaluations=args.max_objective_evaluations,
                 compute_per_dim_accuracy=args.compute_per_dim_accuracy)

     if args.stage <= num_iters + 1:

src/chainbin/nnet3-chain-combine.cc

+109 -23
@@ -1,6 +1,7 @@
 // chainbin/nnet3-chain-combine.cc

 // Copyright 2012-2015  Johns Hopkins University (author: Daniel Povey)
+//                2017  Yiming Wang

 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -19,7 +20,65 @@

 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
-#include "nnet3/nnet-chain-combine.h"
+#include "nnet3/nnet-utils.h"
+#include "nnet3/nnet-compute.h"
+#include "nnet3/nnet-chain-diagnostics.h"
+
+
+namespace kaldi {
+namespace nnet3 {
+
+// Computes and returns the objective function for the examples in 'egs' given
+// the model in 'nnet'. If either of batchnorm/dropout test modes is true, we
+// make a copy of 'nnet', set test modes on that and evaluate its objective.
+// Note: the object that prob_computer->nnet_ refers to should be 'nnet'.
+double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode,
+                   const std::vector<NnetChainExample> &egs, const Nnet &nnet,
+                   const chain::ChainTrainingOptions &chain_config,
+                   const fst::StdVectorFst &den_fst,
+                   NnetChainComputeProb *prob_computer) {
+  if (batchnorm_test_mode || dropout_test_mode) {
+    Nnet nnet_copy(nnet);
+    if (batchnorm_test_mode)
+      SetBatchnormTestMode(true, &nnet_copy);
+    if (dropout_test_mode)
+      SetDropoutTestMode(true, &nnet_copy);
+    NnetComputeProbOptions compute_prob_opts;
+    NnetChainComputeProb prob_computer_test(compute_prob_opts, chain_config,
+                                            den_fst, nnet_copy);
+    return ComputeObjf(false, false, egs, nnet_copy,
+                       chain_config, den_fst, &prob_computer_test);
+  } else {
+    prob_computer->Reset();
+    std::vector<NnetChainExample>::const_iterator iter = egs.begin(),
+                                                  end = egs.end();
+    for (; iter != end; ++iter)
+      prob_computer->Compute(*iter);
+    const ChainObjectiveInfo *objf_info =
+        prob_computer->GetObjective("output");
+    if (objf_info == NULL)
+      KALDI_ERR << "Error getting objective info (unsuitable egs?)";
+    KALDI_ASSERT(objf_info->tot_weight > 0.0);
+    // inf/nan tot_objf->return -inf objective.
+    double tot_objf = objf_info->tot_like + objf_info->tot_l2_term;
+    if (!(tot_objf == tot_objf && tot_objf - tot_objf == 0))
+      return -std::numeric_limits<double>::infinity();
+    // we prefer to deal with normalized objective functions.
+    return tot_objf / objf_info->tot_weight;
+  }
+}
+
+// Updates moving average over num_models nnets, given the average over
+// previous (num_models - 1) nnets, and the new nnet.
+void UpdateNnetMovingAverage(int32 num_models,
+                             const Nnet &nnet, Nnet *moving_average_nnet) {
+  KALDI_ASSERT(NumParameters(nnet) == NumParameters(*moving_average_nnet));
+  ScaleNnet((num_models - 1.0) / num_models, moving_average_nnet);
+  AddNnet(nnet, 1.0 / num_models, moving_average_nnet);
+}
+
+}
+}


 int main(int argc, char *argv[]) {
@@ -30,9 +89,11 @@ int main(int argc, char *argv[]) {
     typedef kaldi::int64 int64;

     const char *usage =
-        "Using a subset of training or held-out nnet3+chain examples, compute an\n"
-        "optimal combination of a number of nnet3 neural nets by maximizing the\n"
-        "'chain' objective function. See documentation of options for more details.\n"
+        "Using a subset of training or held-out nnet3+chain examples, compute\n"
+        "the average over the first n nnet models where we maximize the\n"
+        "'chain' objective function for n. Note that the order of models has\n"
+        "been reversed before feeding into this binary. So we are actually\n"
+        "combining last n models.\n"
         "Inputs and outputs are nnet3 raw nnets.\n"
         "\n"
         "Usage: nnet3-chain-combine [options] <den-fst> <raw-nnet-in1> <raw-nnet-in2> ... <raw-nnet-inN> <chain-examples-in> <raw-nnet-out>\n"
@@ -41,23 +102,28 @@ int main(int argc, char *argv[]) {
         " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n";

     bool binary_write = true;
+    int32 max_objective_evaluations = 30;
     bool batchnorm_test_mode = false,
         dropout_test_mode = true;
     std::string use_gpu = "yes";
-    NnetCombineConfig combine_config;
     chain::ChainTrainingOptions chain_config;

     ParseOptions po(usage);
     po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("max-objective-evaluations", &max_objective_evaluations, "The "
+                "maximum number of objective evaluations in order to figure "
+                "out the best number of models to combine. It helps to speedup "
+                "if the number of models provided to this binary is quite "
+                "large (e.g. several hundred).");
     po.Register("use-gpu", &use_gpu,
                 "yes|no|optional|wait, only has effect if compiled with CUDA");
     po.Register("batchnorm-test-mode", &batchnorm_test_mode,
-                "If true, set test-mode to true on any BatchNormComponents.");
+                "If true, set test-mode to true on any BatchNormComponents "
+                "while evaluating objectives.");
     po.Register("dropout-test-mode", &dropout_test_mode,
                 "If true, set test-mode to true on any DropoutComponents and "
-                "DropoutMaskComponents.");
+                "DropoutMaskComponents while evaluating objectives.");

-    combine_config.Register(&po);
     chain_config.Register(&po);

     po.Read(argc, argv);
@@ -83,11 +149,10 @@ int main(int argc, char *argv[]) {

     Nnet nnet;
     ReadKaldiObject(raw_nnet_rxfilename, &nnet);
-
-    if (batchnorm_test_mode)
-      SetBatchnormTestMode(true, &nnet);
-    if (dropout_test_mode)
-      SetDropoutTestMode(true, &nnet);
+    Nnet moving_average_nnet(nnet), best_nnet(nnet);
+    NnetComputeProbOptions compute_prob_opts;
+    NnetChainComputeProb prob_computer(compute_prob_opts, chain_config,
+                                       den_fst, moving_average_nnet);

     std::vector<NnetChainExample> egs;
     egs.reserve(10000);  // reserve a lot of space to minimize the chance of
@@ -102,29 +167,50 @@ int main(int argc, char *argv[]) {
       KALDI_ASSERT(!egs.empty());
     }

+    // first evaluates the objective using the last model.
+    int32 best_num_to_combine = 1;
+    double
+        init_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode,
+            egs, moving_average_nnet, chain_config, den_fst, &prob_computer),
+        best_objf = init_objf;
+    KALDI_LOG << "objective function using the last model is " << init_objf;

     int32 num_nnets = po.NumArgs() - 3;
-    NnetChainCombiner combiner(combine_config, chain_config,
-                               num_nnets, egs, den_fst, nnet);
-
+    // then each time before we re-evaluate the objective function, we will add
+    // num_to_add models to the moving average.
+    int32 num_to_add = (num_nnets + max_objective_evaluations - 1) /
+                       max_objective_evaluations;
     for (int32 n = 1; n < num_nnets; n++) {
       std::string this_nnet_rxfilename = po.GetArg(n + 2);
       ReadKaldiObject(this_nnet_rxfilename, &nnet);
-      combiner.AcceptNnet(nnet);
+      // updates the moving average
+      UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet);
+      // evaluates the objective everytime after adding num_to_add model or
+      // all the models to the moving average.
+      if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) {
+        double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode,
+            egs, moving_average_nnet, chain_config, den_fst, &prob_computer);
+        KALDI_LOG << "Combining last " << n + 1
+                  << " models, objective function is " << objf;
+        if (objf > best_objf) {
+          best_objf = objf;
+          best_nnet = moving_average_nnet;
+          best_num_to_combine = n + 1;
+        }
+      }
     }
+    KALDI_LOG << "Combining " << best_num_to_combine
+              << " nnets, objective function changed from " << init_objf
+              << " to " << best_objf;

-    combiner.Combine();
-
-    nnet = combiner.GetNnet();
     if (HasBatchnorm(nnet))
-      RecomputeStats(egs, chain_config, den_fst, &nnet);
+      RecomputeStats(egs, chain_config, den_fst, &best_nnet);

 #if HAVE_CUDA==1
     CuDevice::Instantiate().PrintProfile();
 #endif

-    WriteKaldiObject(nnet, nnet_wxfilename, binary_write);
-
+    WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write);
     KALDI_LOG << "Finished combining neural nets, wrote model to "
               << nnet_wxfilename;
   } catch(const std::exception &e) {
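To summarize the new logic in one place: instead of optimizing per-model weights with NnetChainCombiner, the binary now keeps a running average over the models (which the scripts feed in last-to-first) and keeps whichever prefix average scores best on the combine examples. The following Python sketch mirrors that loop, with plain lists standing in for Nnets and a caller-supplied compute_objf; it is illustrative only and not part of the commit:

def update_moving_average(num_models, params, moving_average):
    # Mirrors UpdateNnetMovingAverage():
    # avg_n = ((n-1)/n) * avg_{n-1} + (1/n) * new_model
    assert len(params) == len(moving_average)
    for i in range(len(moving_average)):
        moving_average[i] = ((num_models - 1.0) / num_models) * moving_average[i] \
                            + (1.0 / num_models) * params[i]

def combine_models(models, compute_objf, max_objective_evaluations=30):
    # 'models' is ordered newest-to-oldest, as the scripts pass them in.
    moving_average = list(models[0])
    best_nnet = list(moving_average)
    best_objf = compute_objf(moving_average)
    best_n = 1
    num_nnets = len(models)
    # Ceil division: how many models to fold in between objective evaluations.
    num_to_add = (num_nnets + max_objective_evaluations - 1) // max_objective_evaluations
    for n in range(1, num_nnets):
        update_moving_average(n + 1, models[n], moving_average)
        # Evaluate after every num_to_add models, and always after the last one.
        if (n - 1) % num_to_add == num_to_add - 1 or n == num_nnets - 1:
            objf = compute_objf(moving_average)
            if objf > best_objf:
                best_objf, best_n = objf, n + 1
                best_nnet = list(moving_average)
    return best_nnet, best_n, best_objf

# Example: 5 scalar "models", newest first; the objective prefers values near 0.3.
models = [[0.5], [0.4], [0.35], [0.2], [0.1]]
best, n, objf = combine_models(models, lambda p: -(p[0] - 0.3) ** 2)
print(n)  # -> 5 (averaging all five models scores best here)

Because only uniform 1/n averaging weights are considered, the search is one-dimensional in n, which is what allows --max-objective-evaluations to cap the cost even when several hundred models are passed in.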

src/nnet3/Makefile

+2 -2
@@ -22,9 +22,9 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
            nnet-example.o nnet-nnet.o nnet-compile-utils.o \
            nnet-utils.o nnet-compute.o nnet-test-utils.o nnet-analyze.o \
            nnet-example-utils.o nnet-training.o \
-           nnet-diagnostics.o nnet-combine.o nnet-am-decodable-simple.o \
+           nnet-diagnostics.o nnet-am-decodable-simple.o \
            nnet-optimize-utils.o nnet-chain-example.o \
-           nnet-chain-training.o nnet-chain-diagnostics.o nnet-chain-combine.o \
+           nnet-chain-training.o nnet-chain-diagnostics.o \
            discriminative-supervision.o nnet-discriminative-example.o \
            nnet-discriminative-diagnostics.o \
            discriminative-training.o nnet-discriminative-training.o \
