Commit 2d8d834

fix masking
1 parent 72262d4 commit 2d8d834

2 files changed (+12 −11)

doctr/models/recognition/master/tensorflow.py

Lines changed: 11 additions & 10 deletions
@@ -91,20 +91,21 @@ def make_source_and_target_mask(
         source: tf.Tensor,
         target: tf.Tensor
     ) -> Tuple[tf.Tensor, tf.Tensor]:
-        # NOTE: inverse from PyTorch implementation
-        # [0, 0, 0, ..., 1, 1, 1] -> 1 is masked
+        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
         # (N, 1, 1, max_length)
-        target_pad_mask = tf.cast(tf.math.equal(target, self.vocab_size + 2), dtype=tf.uint8)
+        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
         target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
         target_length = target.shape[1]
-        # sub mask filled diagonal with 0 = see, 1 = masked (max_length, max_length)
-        target_sub_mask = 1 - tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
-        # source mask filled with zeros (max_length, positional_encoded_seq_len)
-        source_mask = tf.zeros((target_length, source.shape[1]))
+        # sub mask filled diagonal with 1 = see, 0 = masked (max_length, max_length)
+        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
+        # source mask filled with ones (max_length, positional_encoded_seq_len)
+        source_mask = tf.ones((target_length, source.shape[1]))
         # combine the two masks into one (N, 1, max_length, max_length)
-        target_mask = tf.math.logical_and(
-            tf.cast(target_sub_mask, dtype=tf.bool),
-            tf.cast(target_pad_mask, dtype=tf.bool)
+        target_mask = tf.cast(
+            tf.math.logical_and(
+                tf.cast(target_sub_mask, dtype=tf.bool),
+                tf.cast(target_pad_mask, dtype=tf.bool)
+            ), dtype=tf.uint8
         )
         return source_mask, target_mask

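For readers following the change: the commit flips the mask convention from "1 = masked" to "1 = visible, 0 = masked". Below is a minimal standalone sketch of the new convention, not part of the commit itself, assuming a toy vocab_size of 3 (so the pad token id is vocab_size + 2 = 5):

import tensorflow as tf

# Assumed toy values: vocab_size = 3, so the padding token id is 3 + 2 = 5.
vocab_size = 3
target = tf.constant([[1, 2, 0, 5, 5]])  # (N=1, max_length=5); 5 = <pad>

# Padding mask: 1 where the token is real, 0 at padding positions.
target_pad_mask = tf.cast(tf.math.not_equal(target, vocab_size + 2), dtype=tf.uint8)
target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]  # (N, 1, 1, max_length)

# Causal sub mask: lower triangle of ones, i.e. position i may see positions <= i.
length = target.shape[1]
target_sub_mask = tf.linalg.band_part(tf.ones((length, length)), -1, 0)

# A position is visible only if it is both causally visible and not padding.
target_mask = tf.cast(
    tf.math.logical_and(
        tf.cast(target_sub_mask, dtype=tf.bool),
        tf.cast(target_pad_mask, dtype=tf.bool),
    ),
    dtype=tf.uint8,
)
print(target_mask.shape)   # (1, 1, 5, 5)
print(target_mask[0, 0])   # last two columns are zeroed out by the padding mask

The broadcast of the (max_length, max_length) sub mask against the (N, 1, 1, max_length) padding mask yields the combined (N, 1, max_length, max_length) mask the comment in the diff describes.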
doctr/models/recognition/transformer/tensorflow.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def scaled_dot_product_attention(

     scores = tf.matmul(query, key, transpose_b=True) / math.sqrt(query.shape[-1])
     if mask is not None:
-        scores += (tf.cast(mask, dtype=query.dtype) * -1e9)
+        scores = tf.where(mask == 0, -1e9, scores)
     p_attn = tf.nn.softmax(scores, axis=-1)
     return tf.matmul(p_attn, value), p_attn

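This hunk mirrors the flipped convention in the attention itself: instead of adding -1e9 where the old mask was 1, the scores are overwritten with -1e9 wherever the new mask is 0. A short illustrative sketch with assumed toy values (one query over four keys, last key masked; not taken from the commit):

import tensorflow as tf

scores = tf.constant([[2.0, 1.0, 0.5, 3.0]])
mask = tf.constant([[1, 1, 1, 0]], dtype=tf.uint8)  # new convention: 0 = masked

# Forcing masked scores to a large negative value drives their softmax
# weight to (near) zero.
masked_scores = tf.where(mask == 0, -1e9, scores)
p_attn = tf.nn.softmax(masked_scores, axis=-1)
print(p_attn.numpy())  # last attention weight is ~0

Using tf.where rather than additive masking also keeps the unmasked scores untouched, which avoids any dtype-dependent rounding from the `scores += ...` form it replaces.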