diff --git a/example/models.py b/example/models.py index cfd3eff..dab93a3 100644 --- a/example/models.py +++ b/example/models.py @@ -64,9 +64,10 @@ def universal_transformer_gpt_model( # Penalty for confidence of the output distribution, as described in # "Regularizing Neural Networks by Penalizing Confident # Output Distributions" (https://arxiv.org/abs/1701.06548) + eps = K.epsilon() confidence_penalty = K.mean( confidence_penalty_weight * - K.sum(word_predictions * K.log(word_predictions), axis=-1)) + K.sum(word_predictions * K.log(word_predictions + eps), axis=-1)) model.add_loss(confidence_penalty) return model