 
 namespace nntrainer {
 
-AdamW::AdamW() : adam_props(PropsB1(), PropsB2(), PropsEpsilon(), TorchRef()) {
+AdamW::AdamW() :
+  adam_props(PropsB1(), PropsB2(), PropsEpsilon(), TorchRef(),
+             PropsWeightDecayW()) {
   /** default properties */
-  auto &[b1, b2, eps, torch_ref] = adam_props;
+  auto &[b1, b2, eps, torch_ref, weight_decay] = adam_props;
   b1.set(0.9f);
   b2.set(0.999f);
-  eps.set(1.0e-7f);
+  eps.set(1.0e-8f);
   torch_ref.set(false);
+  weight_decay.set(0.0f);
 }
 
 AdamW::~AdamW() {}
 
 enum AdamParams { wm, wv };
 
 std::vector<TensorDim> AdamW::getOptimizerVariableDim(const TensorDim &dim) {
-  return {dim, dim};
+  /**
+   * @note We assume the optimizer parameters should be full precision to
+   * maintain the accuracy even in mixed precision training.
+   */
+  TensorDim wm_dim(dim);
+  TensorDim wv_dim(dim);
+  wm_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+  wv_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+  return {wm_dim, wv_dim};
 }
 
 void AdamW::exportTo(Exporter &exporter,
@@ -51,6 +62,14 @@ void AdamW::setProperty(const std::vector<std::string> &values) {
   Optimizer::setProperty(left);
 }
 
+double AdamW::getUpdatedLearningRate(unsigned int iteration, double lr) const {
+  auto &beta1 = std::get<PropsB1>(adam_props).get();
+  auto &beta2 = std::get<PropsB2>(adam_props).get();
+  auto biasCorrection = [&](double f) { return 1.0 - pow(f, iteration + 1); };
+  lr *= sqrt(biasCorrection(beta2)) / biasCorrection(beta1);
+  return lr;
+}
+
 void AdamW::applyGradient(RunOptimizerContext &context) {
   Tensor empty_tensor;
 
@@ -68,13 +87,8 @@ void AdamW::applyGradient(RunOptimizerContext &context) {
   auto &beta1 = std::get<PropsB1>(adam_props).get();
   auto &beta2 = std::get<PropsB2>(adam_props).get();
   auto &epsilon = std::get<PropsEpsilon>(adam_props).get();
-  auto &torch_ref = std::get<TorchRef>(adam_props).get();
+  auto &weight_decay = std::get<PropsWeightDecayW>(adam_props).get();
 
-  // This is implementation of adam from original paper.
-  // This is not deleted intentionally.
-  unsigned int iteration = context.getIteration();
-  float biasCorrection1 = 1 - pow(beta1, iteration + 1);
-  float biasCorrection2 = 1 - pow(beta2, iteration + 1);
   Tensor &wm = context.getOptimizerVariable(AdamParams::wm);
   Tensor &wv = context.getOptimizerVariable(AdamParams::wv);
 
@@ -84,16 +98,23 @@ void AdamW::applyGradient(RunOptimizerContext &context) {
   wv.multiply_i(beta2);
   wv.add_i(x_grad.multiply(x_grad), 1.0f - beta2);
 
-  wv.divide_i(biasCorrection2);
+  // Decoupled weight decay: w = w - lr * wd * w
+  if (weight_decay > 0.0) {
+    Tensor &w = context.isMixedPrecision() ? context.getWeightFP32()
+                                           : context.getWeight();
+    w.multiply_i(1.0f - (context.getLearningRate() * weight_decay));
+  }
+
+  // Adam update with bias-corrected lr
+  double lr_t =
+    getUpdatedLearningRate(context.getIteration(), context.getLearningRate());
+
   std::function<double(double)> sqrtEps = [epsilon](double f) {
     return 1 / (sqrtDouble(f) + epsilon);
   };
   x_grad = wv.apply<float>(sqrtEps, x_grad);
-  x_grad.divide_i(biasCorrection1);
   x_grad.multiply_i(wm);
-  context.calcWeightDecayGradient();
-
-  context.applyGradient(context.getLearningRate(), x_grad);
+  context.applyGradient(lr_t, x_grad);
 }
 
 } // namespace nntrainer
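
For reference, a minimal self-contained sketch of the update rule this diff implements: decoupled weight decay applied directly to the weight, then an Adam step with a bias-corrected learning rate. The scalar names (w, grad, m, v) and the hyperparameter values are illustrative only, not part of the nntrainer API; the defaults mirror the constructor above.

// Sketch of one AdamW step for a single scalar weight.
// Not nntrainer code; shows the math behind applyGradient()/getUpdatedLearningRate().
#include <cmath>
#include <cstdio>

int main() {
  double w = 1.0, grad = 0.5;                       // weight and its gradient
  double m = 0.0, v = 0.0;                          // first/second moment estimates
  const double lr = 1e-3, wd = 1e-2;                // base lr and weight decay (assumed values)
  const double beta1 = 0.9, beta2 = 0.999, eps = 1.0e-8;

  for (unsigned int iter = 0; iter < 3; ++iter) {
    // decoupled weight decay: shrink the weight itself, not the gradient
    w *= 1.0 - lr * wd;

    // moment updates, identical to plain Adam
    m = beta1 * m + (1.0 - beta1) * grad;
    v = beta2 * v + (1.0 - beta2) * grad * grad;

    // bias-corrected learning rate, as computed by getUpdatedLearningRate()
    double lr_t = lr * std::sqrt(1.0 - std::pow(beta2, iter + 1)) /
                  (1.0 - std::pow(beta1, iter + 1));

    // parameter update
    w -= lr_t * m / (std::sqrt(v) + eps);
    std::printf("iter %u: w = %f\n", iter, w);
  }
  return 0;
}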