| 
16 | 16 | _prefixes_elision = "m n l y t k w"  | 
17 | 17 | _prefixes_elision += " " + _prefixes_elision.upper()  | 
18 | 18 | 
 
  | 
19 |  | -TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [  | 
20 |  | -    r"(?:({pe})[{el}])(?=[{a}])".format(  | 
21 |  | -        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)  | 
22 |  | -    )  | 
23 |  | -]  | 
 | 19 | +TOKENIZER_PREFIXES = (  | 
 | 20 | +    LIST_PUNCT  | 
 | 21 | +    + LIST_QUOTES  | 
 | 22 | +    + [  | 
 | 23 | +        r"(?:({pe})[{el}])(?=[{a}])".format(  | 
 | 24 | +            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)  | 
 | 25 | +        )  | 
 | 26 | +    ]  | 
 | 27 | +)  | 
24 | 28 | 
 
  | 
25 |  | -TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [  | 
26 |  | -    r"(?<=[0-9])%",  # numbers like 10%  | 
27 |  | -    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers  | 
28 |  | -    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters  | 
29 |  | -    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions  | 
30 |  | -    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number  | 
31 |  | -    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string  | 
32 |  | -    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis  | 
33 |  | -]  | 
 | 29 | +TOKENIZER_SUFFIXES = (  | 
 | 30 | +    LIST_PUNCT  | 
 | 31 | +    + LIST_QUOTES  | 
 | 32 | +    + LIST_ELLIPSES  | 
 | 33 | +    + [  | 
 | 34 | +        r"(?<=[0-9])%",  # numbers like 10%  | 
 | 35 | +        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers  | 
 | 36 | +        r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters  | 
 | 37 | +        r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions  | 
 | 38 | +        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number  | 
 | 39 | +        r"(?<=[{a}])\.(?=\s|$)".format(  | 
 | 40 | +            a=ALPHA  | 
 | 41 | +        ),  # period after letter if space or end of string  | 
 | 42 | +        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis  | 
 | 43 | +    ]  | 
 | 44 | +)  | 
34 | 45 | 
 
  | 
35 |  | -TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [  | 
36 |  | -    r"(?<=[0-9])[+\-\*^](?=[0-9-])",  | 
37 |  | -    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(  | 
38 |  | -        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES  | 
39 |  | -    ),  | 
40 |  | -    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),  | 
41 |  | -    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),  | 
42 |  | -    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),  | 
43 |  | -]  | 
 | 46 | +TOKENIZER_INFIXES = (  | 
 | 47 | +    LIST_ELLIPSES  | 
 | 48 | +    + LIST_ICONS  | 
 | 49 | +    + [  | 
 | 50 | +        r"(?<=[0-9])[+\-\*^](?=[0-9-])",  | 
 | 51 | +        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(  | 
 | 52 | +            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES  | 
 | 53 | +        ),  | 
 | 54 | +        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),  | 
 | 55 | +        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),  | 
 | 56 | +        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),  | 
 | 57 | +    ]  | 
 | 58 | +)  | 
0 commit comments