
Commit e584042

Further optimize added tokens splitter (#1265)
1 parent 75b352c commit e584042

File tree

3 files changed (+43 -72):

- src/tokenizers.js (+40 -37)
- src/utils/data-structures.js (+1 -17)
- tests/utils/data_structures.test.js (+2 -18)

src/tokenizers.js (+40 -37)

```diff
@@ -2598,21 +2598,13 @@ export class PreTrainedTokenizer extends Callable {
             this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
         }
 
-        // Divide added tokens into those that left/right strip, and those that don't
-        const added_tokens_with_strip = this.added_tokens.filter(x => x.rstrip || x.lstrip);
-        const added_tokens_without_strip = this.added_tokens.filter(x => !x.rstrip && !x.lstrip);
-        const split_regex = added_tokens_with_strip.length > 0 ? new RegExp(
-            added_tokens_with_strip.slice()
-                // Sort by length (desc) to avoid early partial matches
-                .sort((a, b) => b.content.length - a.content.length)
-                .map(x => `${x.lstrip ? '\\s*' : ''}(${escapeRegExp(x.content)})${x.rstrip ? '\\s*' : ''}`)
-                .join('|')
-        ) : null;
         this.added_tokens_splitter = new DictionarySplitter(
-            added_tokens_without_strip.map(x => x.content),
-            split_regex,
+            this.added_tokens.map(x => x.content),
         );
 
+        /** @type {Map<string, AddedToken>} */
+        this.added_tokens_map = new Map(this.added_tokens.map(x => [x.content, x]))
+
         // Set mask token if present (otherwise will be undefined, which is fine)
         this.mask_token = this.getToken('mask_token');
         this.mask_token_id = this.model.tokens_to_ids.get(this.mask_token);
@@ -2907,38 +2899,49 @@ export class PreTrainedTokenizer extends Callable {
         // First, we take care of special tokens. Needed to avoid issues arising from
         // normalization and/or pretokenization (which may not preserve special tokens)
         const sections = this.added_tokens_splitter.split(text);
-        const tokens = sections.map((x, section_index) => {
-            const addedToken = this.added_tokens.find(t => t.content === x);
-            if (addedToken !== undefined) {
-                // Ignore added tokens
-                return x
-            } else {
-                if (this.remove_space === true) {
-                    x = x.trim().split(/\s+/).join(' ');
-                }
-                if (this.do_lowercase_and_remove_accent) {
-                    x = lowercase_and_remove_accent(x);
-                }
 
-                if (this.normalizer !== null) {
-                    x = this.normalizer(x);
+        // Process left/right stripping of added tokens
+        for (let i = 0; i < sections.length; ++i) {
+            const addedToken = this.added_tokens_map.get(sections[i]);
+            if (addedToken) {
+                if (addedToken.lstrip && i > 0) {
+                    sections[i - 1] = sections[i - 1].trimEnd();
                 }
-
-                // If, after normalization, this section is empty (e.g., trimming whitespace),
-                // we return an empty array
-                if (x.length === 0) {
-                    return [];
+                if (addedToken.rstrip && i < sections.length - 1) {
+                    sections[i + 1] = sections[i + 1].trimStart();
                 }
+            }
+        }
 
-                const sectionTokens = (this.pre_tokenizer !== null) ? this.pre_tokenizer(x, {
-                    section_index,
-                }) : [x];
+        const tokens = sections.flatMap((x, section_index) => {
+            if (x.length === 0) return [];
+            if (this.added_tokens_map.has(x)) return [x]; // Return added tokens unchanged
 
-                const tokens = this.model(sectionTokens);
+            if (this.remove_space === true) {
+                x = x.trim().split(/\s+/).join(' ');
+            }
+            if (this.do_lowercase_and_remove_accent) {
+                x = lowercase_and_remove_accent(x);
+            }
+
+            if (this.normalizer !== null) {
+                x = this.normalizer(x);
+            }
 
-                return tokens;
+            // If, after normalization, this section is empty (e.g., trimming whitespace),
+            // we return an empty array
+            if (x.length === 0) {
+                return [];
            }
-        }).flat();
+
+            const sectionTokens = (this.pre_tokenizer !== null) ? this.pre_tokenizer(x, {
+                section_index,
+            }) : [x];
+
+            const tokens = this.model(sectionTokens);
+
+            return tokens;
+        });
 
         return tokens;
     }
```
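The rewritten `_encode_text` path replaces the per-section linear scan (`this.added_tokens.find(...)`) with an O(1) lookup in the new `added_tokens_map`, and moves lstrip/rstrip handling out of the regex into a plain pass that trims whitespace on the sections adjacent to a stripping token. A minimal standalone sketch of that strip pass (the `<tok>` token and its flags are hypothetical):

```js
// Hypothetical added token configured to strip whitespace on both sides.
const added_tokens_map = new Map([
    ["<tok>", { content: "<tok>", lstrip: true, rstrip: true }],
]);

// Sections as DictionarySplitter.split() might produce them.
const sections = ["hello ", "<tok>", " world"];

for (let i = 0; i < sections.length; ++i) {
    const addedToken = added_tokens_map.get(sections[i]);
    if (addedToken) {
        // lstrip consumes whitespace to the left of the token...
        if (addedToken.lstrip && i > 0) {
            sections[i - 1] = sections[i - 1].trimEnd();
        }
        // ...and rstrip consumes whitespace to the right.
        if (addedToken.rstrip && i < sections.length - 1) {
            sections[i + 1] = sections[i + 1].trimStart();
        }
    }
}

console.log(sections); // ["hello", "<tok>", "world"]
```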

src/utils/data-structures.js (+1 -17)

```diff
@@ -455,11 +455,9 @@ class TokenLatticeNode {
 export class DictionarySplitter {
     /**
      * @param {string[]} dictionary The dictionary of words to use for splitting.
-     * @param {RegExp} [splitRegex] Optional split regex for preprocessing the input text.
      */
-    constructor(dictionary, splitRegex = null) {
+    constructor(dictionary) {
         this.trie = this._buildTrie(dictionary);
-        this.splitRegex = splitRegex;
     }
 
     /**
@@ -486,20 +484,6 @@ export class DictionarySplitter {
      * @returns {string[]} An array of tokens.
      */
     split(text) {
-        return this.splitRegex ?
-            text.split(this.splitRegex)
-                .filter(x => x)
-                .flatMap(x => this._splitSingle(x))
-            : this._splitSingle(text)
-    }
-
-    /**
-     * Helper function to split a single text string into tokens.
-     * @param {string} text The input text to split.
-     * @returns {string[]} An array of tokens.
-     * @private
-     */
-    _splitSingle(text) {
         const result = [];
         const n = text.length;
         let start = 0;
```
tests/utils/data_structures.test.js (+2 -18)

```diff
@@ -34,32 +34,16 @@ describe("Priority queue", () => {
 
 describe("Dictionary splitter", () => {
   it("should split on a defined dictionary", () => {
-    const splitter = new DictionarySplitter(
-      ["a", "b", "c", "abc"],
-      null, // no split regex
-    );
+    const splitter = new DictionarySplitter(["a", "b", "c", "abc"]);
     const text = ".a.b.cc.abcdef.";
     const expected = [".", "a", ".", "b", ".", "c", "c", ".", "abc", "def."];
     const result = splitter.split(text);
     expect(result).toEqual(expected);
   });
-  it("should split on a defined dictionary w/ split regex", () => {
-    const splitter = new DictionarySplitter(
-      ["a", "b", "c", "abc"],
-      /\s+/, // split on whitespace
-    );
-    const text = "a b c";
-    const expected = ["a", "b", "c"];
-    const result = splitter.split(text);
-    expect(result).toEqual(expected);
-  });
 
   it("should handle multi-byte characters", () => {
     const text = "before🤗after\ud83etest";
-    const splitter = new DictionarySplitter(
-      ["🤗" /* '\ud83e\udd17' */, "\ud83e"],
-      null, // no split regex
-    );
+    const splitter = new DictionarySplitter(["🤗" /* '\ud83e\udd17' */, "\ud83e"]);
     const expected = ["before", "🤗", "after", "\ud83e", "test"];
     const result = splitter.split(text);
     expect(result).toEqual(expected);
```
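A note on the multi-byte test: JavaScript strings are sequences of UTF-16 code units, so the lone high surrogate `"\ud83e"` is a valid one-unit dictionary entry distinct from the two-unit pair that forms `"🤗"`. Per the expected output, the splitter matches the full pair where it can and falls back to the lone surrogate elsewhere:

```js
// "🤗" is the surrogate pair \ud83e\udd17 (two UTF-16 code units).
console.log("🤗".length);          // 2
console.log("🤗"[0] === "\ud83e"); // true
```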
