
Commit b452eb1

tokenizers respect padding: true with non-null max_length
This commit changes tokenizer behavior to match the documentation and the Python library. Previously, passing { padding: true, max_length: 512 } or { padding: 'max_length', max_length: 512 } would both always pad all outputs to 512 tokens. After this change, { padding: true, max_length: 512 } pads the outputs to the longest encoding in the batch or to max_length, whichever is shorter. This commit also adds a test to prevent regressions.
Parent: 6d47745
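
For illustration, a minimal sketch of the two padding modes after this change, written against the public transformers.js API (the package name and model ID are assumptions for the example; any pretrained tokenizer behaves the same way):

import { AutoTokenizer } from "@huggingface/transformers";

// Model ID chosen for illustration only.
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased");
const inputs = ["a", "b c"]; // encoded lengths: 1 and 2 tokens (no special tokens)

// After this commit: pads to the longest encoding in the batch (2 columns),
// because that is shorter than max_length.
const dynamic = tokenizer(inputs, {
    padding: true,
    truncation: true,
    max_length: 512,
    add_special_tokens: false,
});

// Unchanged: pads every sequence out to max_length (512 columns).
const fixed = tokenizer(inputs, {
    padding: 'max_length',
    truncation: true,
    max_length: 512,
    add_special_tokens: false,
});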

File tree: 2 files changed, +57 −11 lines


src/tokenizers.js (+10, −11)
@@ -2790,17 +2790,16 @@ export class PreTrainedTokenizer extends Callable {
         // At this point, tokens is batched: [batch_size, tokens]
         // However, array may be jagged. So, we pad to max_length
 
-        if (max_length === null) {
-            if (padding === 'max_length') {
-                max_length = this.model_max_length;
-            } else {
-                // Calculate max length from sequences
-                max_length = max(encodedTokens.map(x => x.input_ids.length))[0];
-            }
-        } else {
-            if (!truncation) {
-                console.warn(`Truncation was not explicitly activated but \`max_length\` is provided a specific value, please use \`truncation=true\` to explicitly truncate examples to max length.`)
-            }
+        if (truncation && max_length === null) {
+            max_length = this.model_max_length;
+        } else if (max_length && truncation === null) {
+            console.warn(`Truncation was not explicitly activated but \`max_length\` is provided a specific value, please use \`truncation=true\` to explicitly truncate examples to max length.`)
+        }
+
+        // padding: 'max_length' doesn't require any additional calculation
+        // but padding: true has to calculate max_length from the sequences
+        if (padding === true) {
+            max_length = Math.min(max(encodedTokens.map(x => x.input_ids.length))[0], max_length ?? Infinity);
         }
 
         // Ensure it is less than model max length
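
Read as a whole, the new code resolves the padding width in two steps: settle max_length against truncation first, then cap it by the longest encoding when padding: true. A standalone sketch of that logic (a hypothetical helper for illustration, not a function in the library; `lengths` stands in for encodedTokens.map(x => x.input_ids.length)):

// Hypothetical helper mirroring the new resolution logic (illustration only).
function resolveMaxLength({ padding, truncation, max_length, model_max_length, lengths }) {
    if (truncation && max_length === null) {
        max_length = model_max_length;
    } else if (max_length && truncation === null) {
        console.warn('Truncation was not explicitly activated but `max_length` is provided...');
    }
    if (padding === true) {
        // padding: true pads to the longest encoding, capped at max_length when one is set.
        max_length = Math.min(Math.max(...lengths), max_length ?? Infinity);
    }
    return max_length;
}

// With the commit-message example: { padding: true, max_length: 512 } on a batch
// whose longest encoding is 2 tokens now yields a width of 2, not 512:
// resolveMaxLength({ padding: true, truncation: true, max_length: 512, model_max_length: 512, lengths: [1, 2] }) === 2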

tests/tokenizers.test.js (+47)
@@ -180,6 +180,53 @@ describe("Tokenizer padding/truncation", () => {
           [0n, 0n],
         ]);
       }
+
+      {
+        // padding: true should pad encodings to match the longest encoding in the batch,
+        // regardless of what is set in max_length
+        const { input_ids, attention_mask, token_type_ids } = tokenizer(inputs, {
+          padding: true,
+          truncation: true,
+          add_special_tokens: false,
+          max_length: 3,
+        });
+
+        expect(input_ids.tolist()).toEqual([
+          [1037n, 0n],
+          [1038n, 1039n],
+        ]);
+        expect(attention_mask.tolist()).toEqual([
+          [1n, 0n],
+          [1n, 1n],
+        ]);
+        expect(token_type_ids.tolist()).toEqual([
+          [0n, 0n],
+          [0n, 0n],
+        ]);
+      }
+
+      {
+        // padding: 'max_length' should pad encodings to match max_length
+        const { input_ids, attention_mask, token_type_ids } = tokenizer(inputs, {
+          padding: 'max_length',
+          truncation: true,
+          add_special_tokens: false,
+          max_length: 3,
+        });
+
+        expect(input_ids.tolist()).toEqual([
+          [1037n, 0n, 0n],
+          [1038n, 1039n, 0n],
+        ]);
+        expect(attention_mask.tolist()).toEqual([
+          [1n, 0n, 0n],
+          [1n, 1n, 0n],
+        ]);
+        expect(token_type_ids.tolist()).toEqual([
+          [0n, 0n, 0n],
+          [0n, 0n, 0n],
+        ]);
+      }
     },
     MAX_TEST_EXECUTION_TIME,
   );
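
The expected widths in the new tests follow directly from the new rule. A quick check of the arithmetic (the inputs are assumed to encode to 1 and 2 tokens, matching the ids 1037n–1039n above):

const lengths = [1, 2]; // ["a", "b c"] without special tokens
const max_length = 3;

// padding: true → width = min(longest, max_length) = min(2, 3) = 2
const dynamicWidth = Math.min(Math.max(...lengths), max_length);

// padding: 'max_length' → width = max_length = 3
const fixedWidth = max_length;

console.log(dynamicWidth, fixedWidth); // 2 3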
