
Commit 73c57aa
Merge pull request #3 from symanto-research/feature/token-to-char-hf
HF spans can be passed to greedy coverage
2 parents: e15406f + 857a0ea

9 files changed, +171 -94 lines changed

README.md (+1, -1)

@@ -83,7 +83,7 @@ or the following equation if `word_ids` are not passed:

$$\textrm{match}(x_i) = \underset{i-k\leq j\leq i+k}{\textrm{min}}\ \textrm{dist}(x_i, y_j)$$

It is recommended to use a large radius `k` (e.g., 30) to avoid introducing matching errors at the end of the sequence if the "speed" of the tokenizations varies a lot.

-**Greedy-coverage**: aligns the tokens from two different tokenizers, using a greedy matching algorithm based on text coverage. This algorithm remove whitespaces from the text, and finds the positions (start, end) that each token covers in the text without whitespaces. Once we have the lists of (start, end) for each token and for each tokenization, we merge the tokens of the second tokenization that are spanned by the tokens of the first tokenization. For instance, having computed $spans_a$ = [(0, 5), (5, 13), (13, 23)] and $spans_b$ = [(0, 4), (5, 8), (8, 11), (11, 14), (15, 19), (19, 21), (21, 23)], the alignment will be [(0, [0]), (1, [1, 2, 3]), (2, [4, 5, 6])]. `merge-tokenizers` provides a C and a Python implementation of this algorithm.
+**Greedy-coverage**: aligns the tokens from two different tokenizers, using a greedy matching algorithm based on text coverage. This algorithm first removes whitespace from the text and finds the character positions (start, end) that each token covers in the whitespace-free text. This step can be skipped if you pass the character spans that each token covers, for instance obtained with `token_to_chars` from HuggingFace tokenizers. Once we have the lists of (start, end) for each token and for each tokenization, we merge the tokens of the second tokenization that are spanned by the tokens of the first tokenization. For instance, having computed $spans_a$ = [(0, 5), (5, 13), (13, 23)] and $spans_b$ = [(0, 4), (5, 8), (8, 11), (11, 14), (15, 19), (19, 21), (21, 23)], the alignment will be [(0, [0]), (1, [1, 2, 3]), (2, [4, 5, 6])]. `merge-tokenizers` provides a C and a Python implementation of this algorithm.

# 🔎 What algorithm should I use?
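For reference, here is a minimal, self-contained sketch of the greedy coverage step described in the updated paragraph, applied to the example spans above. It illustrates the idea only and is not the library's own `merge_spans` routine.

```python
from typing import List, Tuple

def greedy_coverage(
    spans_a: List[Tuple[int, int]], spans_b: List[Tuple[int, int]]
) -> List[Tuple[int, List[int]]]:
    """Greedily attach each token of tokenization b to the token of
    tokenization a whose (start, end) span covers it."""
    alignment, j = [], 0
    for i, (_, end_a) in enumerate(spans_a):
        covered = []
        # consume b-tokens that start before the current a-token ends
        while j < len(spans_b) and spans_b[j][0] < end_a:
            covered.append(j)
            j += 1
        alignment.append((i, covered))
    return alignment

spans_a = [(0, 5), (5, 13), (13, 23)]
spans_b = [(0, 4), (5, 8), (8, 11), (11, 14), (15, 19), (19, 21), (21, 23)]
print(greedy_coverage(spans_a, spans_b))
# [(0, [0]), (1, [1, 2, 3]), (2, [4, 5, 6])]
```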

assets/benchmark.md (+64, -64)

@@ -1,66 +1,66 @@
| Tokens | Algorithm | Mean | Std |
|:---------|:----------------------------|------------:|------------:|
-| 64-32 | DTWAligner | 0.000810194 | 0.00012322 |
-| 64-32 | PythonDTWAligner | 0.004805 | 0.0382287 |
-| 64-32 | GreedyDistanceAligner | 0.000986452 | 0.00013143 |
-| 64-32 | PythonGreedyCoverageAligner | 0.000352281 | 3.61758e-05 |
-| 64-32 | GreedyCoverageAligner | 0.000657192 | 0.00500252 |
-| 64-32 | FastDTWAligner | 0.0017894 | 0.000227047 |
-| 64-32 | TamuheyAligner | 0.000297001 | 3.36874e-05 |
-| 64-32 | WordIdsAligner | 0.000164852 | 1.55548e-05 |
-| 64-64 | DTWAligner | 0.00134678 | 0.000129305 |
-| 64-64 | PythonDTWAligner | 0.00585662 | 0.000402595 |
-| 64-64 | GreedyDistanceAligner | 0.00167739 | 0.000128032 |
-| 64-64 | PythonGreedyCoverageAligner | 0.000391849 | 3.21681e-05 |
-| 64-64 | GreedyCoverageAligner | 0.000489788 | 3.76893e-05 |
-| 64-64 | FastDTWAligner | 0.0021804 | 0.000175052 |
-| 64-64 | TamuheyAligner | 0.000346222 | 3.42165e-05 |
-| 64-64 | WordIdsAligner | 0.000283829 | 2.9102e-05 |
-| 128-64 | DTWAligner | 0.00264733 | 0.00027827 |
-| 128-64 | PythonDTWAligner | 0.0115429 | 0.000933241 |
-| 128-64 | GreedyDistanceAligner | 0.00237439 | 0.004617 |
-| 128-64 | PythonGreedyCoverageAligner | 0.000677936 | 5.42784e-05 |
-| 128-64 | GreedyCoverageAligner | 0.00105866 | 0.00445993 |
-| 128-64 | FastDTWAligner | 0.00313382 | 0.00021815 |
-| 128-64 | TamuheyAligner | 0.000589425 | 4.43911e-05 |
-| 128-64 | WordIdsAligner | 0.000301838 | 2.67991e-05 |
-| 128-128 | DTWAligner | 0.00478876 | 0.000480806 |
-| 128-128 | PythonDTWAligner | 0.0224406 | 0.00123022 |
-| 128-128 | GreedyDistanceAligner | 0.00401765 | 0.00395362 |
-| 128-128 | PythonGreedyCoverageAligner | 0.00093948 | 0.00425627 |
-| 128-128 | GreedyCoverageAligner | 0.00116883 | 0.00507499 |
-| 128-128 | FastDTWAligner | 0.00401919 | 0.00448164 |
-| 128-128 | TamuheyAligner | 0.000688969 | 4.54692e-05 |
-| 128-128 | WordIdsAligner | 0.000563423 | 3.19658e-05 |
-| 256-128 | DTWAligner | 0.00928647 | 0.00389654 |
-| 256-128 | PythonDTWAligner | 0.0441353 | 0.00618959 |
-| 256-128 | GreedyDistanceAligner | 0.00512323 | 0.00615063 |
-| 256-128 | PythonGreedyCoverageAligner | 0.00159762 | 0.00507555 |
-| 256-128 | GreedyCoverageAligner | 0.00177792 | 0.00383042 |
-| 256-128 | FastDTWAligner | 0.00586248 | 0.00397453 |
-| 256-128 | TamuheyAligner | 0.00151315 | 0.00547704 |
-| 256-128 | WordIdsAligner | 0.000931732 | 0.00502417 |
-| 256-256 | DTWAligner | 0.0170545 | 0.00156439 |
-| 256-256 | PythonDTWAligner | 0.0849948 | 0.0103587 |
-| 256-256 | GreedyDistanceAligner | 0.00850646 | 0.000559411 |
-| 256-256 | PythonGreedyCoverageAligner | 0.00142612 | 0.000129661 |
-| 256-256 | GreedyCoverageAligner | 0.00181911 | 0.000135627 |
-| 256-256 | FastDTWAligner | 0.00791399 | 0.0084514 |
-| 256-256 | TamuheyAligner | 0.00135891 | 0.000101657 |
-| 256-256 | WordIdsAligner | 0.00251443 | 0.0110405 |
-| 512-256 | DTWAligner | 0.0278183 | 0.0090862 |
-| 512-256 | PythonDTWAligner | 0.138419 | 0.0380331 |
-| 512-256 | GreedyDistanceAligner | 0.0109791 | 0.0111162 |
-| 512-256 | PythonGreedyCoverageAligner | 0.0030857 | 0.00920703 |
-| 512-256 | GreedyCoverageAligner | 0.0030004 | 0.00516345 |
-| 512-256 | FastDTWAligner | 0.0108567 | 0.0104887 |
-| 512-256 | TamuheyAligner | 0.00249206 | 0.00669024 |
-| 512-256 | WordIdsAligner | 0.00149341 | 0.0050352 |
-| 512-512 | DTWAligner | 0.0473118 | 0.0210875 |
-| 512-512 | PythonDTWAligner | 0.238191 | 0.100658 |
-| 512-512 | GreedyDistanceAligner | 0.0154189 | 0.0086549 |
-| 512-512 | PythonGreedyCoverageAligner | 0.003124 | 0.00870312 |
-| 512-512 | GreedyCoverageAligner | 0.0039112 | 0.0088561 |
-| 512-512 | FastDTWAligner | 0.0126372 | 0.0108067 |
-| 512-512 | TamuheyAligner | 0.0027519 | 0.0064969 |
-| 512-512 | WordIdsAligner | 0.00279659 | 0.0089556 |
+| 64-32 | DTWAligner | 0.000861357 | 0.000186175 |
+| 64-32 | PythonDTWAligner | 0.00488161 | 0.0384466 |
+| 64-32 | GreedyDistanceAligner | 0.00101318 | 8.03171e-05 |
+| 64-32 | PythonGreedyCoverageAligner | 0.000289684 | 2.87963e-05 |
+| 64-32 | GreedyCoverageAligner | 0.000599975 | 0.00527034 |
+| 64-32 | FastDTWAligner | 0.00185941 | 0.00019357 |
+| 64-32 | TamuheyAligner | 0.000320376 | 3.99697e-05 |
+| 64-32 | WordIdsAligner | 0.000179664 | 1.95877e-05 |
+| 64-64 | DTWAligner | 0.00136892 | 0.000118518 |
+| 64-64 | PythonDTWAligner | 0.00584478 | 0.000417107 |
+| 64-64 | GreedyDistanceAligner | 0.00168752 | 0.000148426 |
+| 64-64 | PythonGreedyCoverageAligner | 0.000310445 | 3.35894e-05 |
+| 64-64 | GreedyCoverageAligner | 0.000395369 | 2.96585e-05 |
+| 64-64 | FastDTWAligner | 0.00219548 | 0.000157088 |
+| 64-64 | TamuheyAligner | 0.00036602 | 3.45161e-05 |
+| 64-64 | WordIdsAligner | 0.000297422 | 2.06211e-05 |
+| 128-64 | DTWAligner | 0.00271824 | 0.000200187 |
+| 128-64 | PythonDTWAligner | 0.0115742 | 0.000744274 |
+| 128-64 | GreedyDistanceAligner | 0.00239153 | 0.00425989 |
+| 128-64 | PythonGreedyCoverageAligner | 0.000585796 | 0.000134413 |
+| 128-64 | GreedyCoverageAligner | 0.000711155 | 7.19061e-05 |
+| 128-64 | FastDTWAligner | 0.00339628 | 0.00433525 |
+| 128-64 | TamuheyAligner | 0.000844504 | 0.00459188 |
+| 128-64 | WordIdsAligner | 0.00032799 | 3.18738e-05 |
+| 128-128 | DTWAligner | 0.00509727 | 0.00440907 |
+| 128-128 | PythonDTWAligner | 0.0226934 | 0.004201 |
+| 128-128 | GreedyDistanceAligner | 0.00391005 | 0.000248378 |
+| 128-128 | PythonGreedyCoverageAligner | 0.000805967 | 0.00402241 |
+| 128-128 | GreedyCoverageAligner | 0.00079272 | 5.0157e-05 |
+| 128-128 | FastDTWAligner | 0.00392676 | 0.000197439 |
+| 128-128 | TamuheyAligner | 0.000951038 | 0.00426009 |
+| 128-128 | WordIdsAligner | 0.000611856 | 3.1337e-05 |
+| 256-128 | DTWAligner | 0.00987252 | 0.00110288 |
+| 256-128 | PythonDTWAligner | 0.0465292 | 0.00897979 |
+| 256-128 | GreedyDistanceAligner | 0.00516035 | 0.000530812 |
+| 256-128 | PythonGreedyCoverageAligner | 0.00173597 | 0.00690275 |
+| 256-128 | GreedyCoverageAligner | 0.00183678 | 0.00602486 |
+| 256-128 | FastDTWAligner | 0.00713865 | 0.00992345 |
+| 256-128 | TamuheyAligner | 0.00192115 | 0.00740615 |
+| 256-128 | WordIdsAligner | 0.000898538 | 0.00400415 |
+| 256-256 | DTWAligner | 0.0191239 | 0.00894593 |
+| 256-256 | PythonDTWAligner | 0.0894118 | 0.0111358 |
+| 256-256 | GreedyDistanceAligner | 0.00993978 | 0.00830519 |
+| 256-256 | PythonGreedyCoverageAligner | 0.00216607 | 0.0092726 |
+| 256-256 | GreedyCoverageAligner | 0.00206758 | 0.00684659 |
+| 256-256 | FastDTWAligner | 0.00829801 | 0.00766549 |
+| 256-256 | TamuheyAligner | 0.00181568 | 0.00433226 |
+| 256-256 | WordIdsAligner | 0.00127796 | 0.000218844 |
+| 512-256 | DTWAligner | 0.0302889 | 0.0120761 |
+| 512-256 | PythonDTWAligner | 0.143709 | 0.0406983 |
+| 512-256 | GreedyDistanceAligner | 0.0108034 | 0.00822557 |
+| 512-256 | PythonGreedyCoverageAligner | 0.00318358 | 0.0107403 |
+| 512-256 | GreedyCoverageAligner | 0.00435857 | 0.0134328 |
+| 512-256 | FastDTWAligner | 0.0112162 | 0.0100254 |
+| 512-256 | TamuheyAligner | 0.0028252 | 0.00688869 |
+| 512-256 | WordIdsAligner | 0.0019157 | 0.00707873 |
+| 512-512 | DTWAligner | 0.0483401 | 0.0218033 |
+| 512-512 | PythonDTWAligner | 0.241758 | 0.103833 |
+| 512-512 | GreedyDistanceAligner | 0.0151268 | 0.00600527 |
+| 512-512 | PythonGreedyCoverageAligner | 0.0022175 | 0.00379141 |
+| 512-512 | GreedyCoverageAligner | 0.00377235 | 0.0106682 |
+| 512-512 | FastDTWAligner | 0.0116152 | 0.00300244 |
+| 512-512 | TamuheyAligner | 0.00515227 | 0.0154369 |
+| 512-512 | WordIdsAligner | 0.00371507 | 0.0122044 |

assets/benchmark.png (425 Bytes, binary file changed)

merge_tokenizers/aligners/base.py (+24, -3)

@@ -44,6 +44,7 @@ def align_pair(
        tokenized_pair.preprocessed_tokens_b = preprocess_tokens(
            tokenized_pair.tokens_b
        )
+
        # If both tokenizations are the same, return 1-1 alignment
        if (
            tokenized_pair.preprocessed_tokens_a

@@ -69,20 +70,30 @@ def align(self, tokenized_set: TokenizedSet) -> List[Alignment]:
            if tokenized_set.word_ids
            else [[] for _ in range(len(tokenized_set.tokens))]
        )
+        spans = (
+            tokenized_set.spans
+            if tokenized_set.spans
+            else [[] for _ in range(len(tokenized_set.tokens))]
+        )
+
        tokens_a = tokenized_set.tokens[0]
        word_ids_a = word_ids[0]
+        spans_a = spans[0]
+
        return [
            self.align_pair(
                TokenizedPair(
                    tokens_a=tokens_a,
                    tokens_b=tokens_b,
                    word_ids_a=word_ids_a,
                    word_ids_b=word_ids_b,
+                    spans_a=spans_a,
+                    spans_b=spans_b,
                    text=tokenized_set.text,
                )
            )
-            for tokens_b, word_ids_b in zip(
-                tokenized_set.tokens[1:], word_ids[1:]
+            for tokens_b, word_ids_b, spans_b in zip(
+                tokenized_set.tokens[1:], word_ids[1:], spans[1:]
            )
        ]

@@ -164,16 +175,24 @@ def aggregate_features(
            else [[] for _ in range(len(tokenized_set.tokens))]
        )

+        spans = (
+            tokenized_set.spans
+            if tokenized_set.spans
+            else [[] for _ in range(len(tokenized_set.tokens))]
+        )
+
        tokens_a = tokenized_set.tokens[0]
        word_ids_a = word_ids[0]
+        spans_a = spans[0]
        features_a = tokenized_set.features[0]
        merged_features = []

-        for idx, (tokens_b, features_b, word_ids_b) in enumerate(
+        for idx, (tokens_b, features_b, word_ids_b, spans_b) in enumerate(
            zip(
                tokenized_set.tokens[1:],
                tokenized_set.features[1:],
                word_ids[1:],
+                spans[1:],
            )
        ):
            merged_features.append(

@@ -183,6 +202,8 @@ def aggregate_features(
                    tokens_b=tokens_b,
                    word_ids_a=word_ids_a,
                    word_ids_b=word_ids_b,
+                    spans_a=spans_a,
+                    spans_b=spans_b,
                    features_a=features_a,
                    features_b=features_b,
                    text=tokenized_set.text,
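A hedged usage sketch of the new `spans` field threaded through `align`. The `TokenizedSet` fields match the diff in `merge_tokenizers/types.py`; the aligner class name comes from the benchmark table, and its import path and constructor arguments are assumptions, so adjust them to your installed version.

```python
from merge_tokenizers.types import TokenizedSet
# Import path assumed; the class name appears in assets/benchmark.md.
from merge_tokenizers import PythonGreedyCoverageAligner

text = "Hello, beautiful world"
tokens_a = ["Hello", ",", "beautiful", "world"]
tokens_b = ["Hel", "lo", ",", "beauti", "ful", "world"]
# Character spans of each token over `text`, written by hand here; in practice
# they can come from a HuggingFace fast tokenizer (see the sketch further down).
spans_a = [(0, 5), (5, 6), (7, 16), (17, 22)]
spans_b = [(0, 3), (3, 5), (5, 6), (7, 13), (13, 16), (17, 22)]

aligner = PythonGreedyCoverageAligner()  # constructor arguments, if any, omitted
alignments = aligner.align(
    TokenizedSet(tokens=[tokens_a, tokens_b], spans=[spans_a, spans_b], text=text)
)
```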

merge_tokenizers/aligners/greedy_coverage.py (+24, -18)

@@ -73,27 +73,33 @@ def _align_pair(

        will result in [(0, [0]), (1, [1, 2, 3]), (2, [4, 5, 6])]
        """
-        text = tokenized_pair.text.lower().replace(" ", "").encode("utf-8")

        # Get the span covered by each token
        spans = {}
-        for tokenization, preprocessed_tokens in {
-            "a": tokenized_pair.preprocessed_tokens_a,
-            "b": tokenized_pair.preprocessed_tokens_b,
-        }.items():
-            ptr = (ctypes.c_char_p * len(preprocessed_tokens))(
-                *[token.encode("utf-8") for token in preprocessed_tokens]
-            )
-            c_spans = self.c_get_spans(
-                ptr,
-                text,
-                len(preprocessed_tokens),
-            )
-            spans[tokenization] = [
-                (c_spans[i].start, c_spans[i].end)
-                for i in range(len(preprocessed_tokens))
-            ]
-            self.c_free_spans(c_spans)
+        # If the spans covering the text are not passed, compute them.
+        if not tokenized_pair.spans_a and not tokenized_pair.spans_b:
+            text = tokenized_pair.text.lower().replace(" ", "").encode("utf-8")
+            for tokenization, preprocessed_tokens in {
+                "a": tokenized_pair.preprocessed_tokens_a,
+                "b": tokenized_pair.preprocessed_tokens_b,
+            }.items():
+                ptr = (ctypes.c_char_p * len(preprocessed_tokens))(
+                    *[token.encode("utf-8") for token in preprocessed_tokens]
+                )
+                c_spans = self.c_get_spans(
+                    ptr,
+                    text,
+                    len(preprocessed_tokens),
+                )
+                spans[tokenization] = [
+                    (c_spans[i].start, c_spans[i].end)
+                    for i in range(len(preprocessed_tokens))
+                ]
+                self.c_free_spans(c_spans)
+        # Otherwise, use them
+        else:
+            spans["a"] = tokenized_pair.spans_a
+            spans["b"] = tokenized_pair.spans_b

        # Merge the spans
        c_spans_a = (Tuple * len(spans["a"]))(*spans["a"])  # type: ignore
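The spans that short-circuit the C span computation above can come from a HuggingFace fast tokenizer via `token_to_chars`, as the README change mentions. A sketch (model names are placeholders); since both span lists refer to the same original text, the relative coverage used by the merge step is preserved.

```python
from transformers import AutoTokenizer

def hf_token_spans(tokenizer, text):
    # add_special_tokens=False avoids boundary tokens without a character span
    enc = tokenizer(text, add_special_tokens=False)
    spans = []
    for i in range(len(enc.tokens())):
        cs = enc.token_to_chars(i)  # CharSpan(start, end) over the original text
        spans.append((cs.start, cs.end))
    return enc.tokens(), spans

text = "Hello, beautiful world"
tokens_a, spans_a = hf_token_spans(AutoTokenizer.from_pretrained("bert-base-uncased"), text)
tokens_b, spans_b = hf_token_spans(AutoTokenizer.from_pretrained("roberta-base"), text)
# tokens_*/spans_* can now be passed as the tokens/spans of a TokenizedSet, or as
# tokens_a/tokens_b and spans_a/spans_b of a TokenizedPair, instead of `text`.
```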

merge_tokenizers/aligners/greedy_coverage_py.py (+15, -4)

@@ -117,11 +117,22 @@ def _align_pair(

        will result in [(0, [0]), (1, [1, 2, 3]), (2, [4, 5, 6])]
        """
-        text = tokenized_pair.text.lower().replace(" ", "")

-        # Get spans and align
-        spans_a = get_spans(tokenized_pair.preprocessed_tokens_a, text)
-        spans_b = get_spans(tokenized_pair.preprocessed_tokens_b, text)
+        # Get spans
+        # If the spans covering the text are not passed, compute them.
+        if not tokenized_pair.spans_a and not tokenized_pair.spans_b:
+            assert (
+                tokenized_pair.text
+            ), "`text` must be passed as argument when not passing `span_a` and `span_b`"
+            text = tokenized_pair.text.lower().replace(" ", "")
+            spans_a = get_spans(tokenized_pair.preprocessed_tokens_a, text)
+            spans_b = get_spans(tokenized_pair.preprocessed_tokens_b, text)
+        # Otherwise, use them.
+        else:
+            spans_a = tokenized_pair.spans_a
+            spans_b = tokenized_pair.spans_b
+
+        # Align spans
        alignments = merge_spans(spans_a, spans_b)

        # Merge alignments
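When no spans are passed, `get_spans` recovers them from the whitespace-free text. A rough sketch of what that step computes (not the library's `get_spans`; it assumes each preprocessed token appears verbatim in the whitespace-free text):

```python
from typing import List, Tuple

def char_spans(tokens: List[str], text_no_ws: str) -> List[Tuple[int, int]]:
    """(start, end) covered by each token in the whitespace-free text."""
    spans, pos = [], 0
    for token in tokens:
        start = text_no_ws.find(token, pos)
        if start == -1:  # token not found verbatim (e.g. normalization); fall back
            start = pos
        end = start + len(token)
        spans.append((start, end))
        pos = end
    return spans

print(char_spans(["hello", ",", "beautiful", "world"], "hello,beautifulworld"))
# [(0, 5), (5, 6), (6, 15), (15, 20)]
```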

merge_tokenizers/types.py (+33, -3)

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Tuple

import numpy as np
from pydantic import BaseModel, field_validator, model_validator

@@ -13,6 +13,8 @@ class TokenizedPair(BaseModel):
    tokens_b: List[str]
    word_ids_a: List[int] = []
    word_ids_b: List[int] = []
+    spans_a: List[Tuple[int, int]] = []
+    spans_b: List[Tuple[int, int]] = []
    preprocessed_tokens_a: List[str] = []
    preprocessed_tokens_b: List[str] = []
    text: str = ""

@@ -21,7 +23,7 @@ class TokenizedPair(BaseModel):

    @field_validator("word_ids_a", "word_ids_b", mode="before")
    @classmethod
-    def prepare_word_ids_a(cls, word_ids):
+    def prepare_word_ids(cls, word_ids):
        if word_ids:
            if word_ids[0] is None:
                word_ids[0] = -1

@@ -31,6 +33,18 @@ def prepare_word_ids_a(cls, word_ids):
        else:
            return []

+    @field_validator("spans_a", "spans_b", mode="before")
+    @classmethod
+    def prepare_spans(cls, spans):
+        if spans:
+            if spans[0] is None:
+                spans[0] = (-1, -1)
+            if spans[-1] is None:
+                spans[-1] = (-1, -1)
+            return spans
+        else:
+            return []
+
    class Config:
        arbitrary_types_allowed = True

@@ -42,6 +56,7 @@ class TokenizedSet(BaseModel):

    tokens: List[List[str]]
    word_ids: List[List[int]] = []
+    spans: List[List[Tuple[int, int]]] = []
    features: List[np.ndarray] = []
    text: str = ""

@@ -58,7 +73,7 @@ def check_len_word_ids(self) -> "TokenizedSet":

    @field_validator("word_ids", mode="before")
    @classmethod
-    def prepare_word_ids_a(cls, _word_ids):
+    def prepare_word_ids(cls, _word_ids):
        new_word_ids = []
        if _word_ids:
            for word_ids in _word_ids:

@@ -71,6 +86,21 @@ def prepare_word_ids_a(cls, _word_ids):
        else:
            return []

+    @field_validator("spans", mode="before")
+    @classmethod
+    def prepare_spans(cls, _spans):
+        new_spans = []
+        if _spans:
+            for spans in _spans:
+                if spans[0] is None:
+                    spans[0] = (-1, -1)
+                if spans[-1] is None:
+                    spans[-1] = (-1, -1)
+                new_spans.append(spans)
+            return new_spans
+        else:
+            return []
+

class PositionAlignment(BaseModel):
    """

merge_tokenizers/version.py (+1, -1)

@@ -1,6 +1,6 @@
_MAJOR = "0"
_MINOR = "0"
-_REVISION = "5"
+_REVISION = "6"

VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION)
