🔥 Remove deprecated tokenizer config args

hezarai · Nov 14, 2024 · 07ab703
1 parent aa12363
commit 07ab703
Show file tree

Hide file tree

Showing 6 changed files with 0 additions and 18 deletions.
diff --git a/hezar/models/speech_recognition/whisper/whisper_tokenizer.py b/hezar/models/speech_recognition/whisper/whisper_tokenizer.py
@@ -254,11 +254,8 @@
 @dataclass
 class WhisperBPEConfig(BPEConfig):
     name = "whisper_bpe_tokenizer"
-    max_length: int = 448
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     pad_to_multiple_of: int = 0
     pad_token: str = "<|endoftext|>"

diff --git a/hezar/preprocessors/tokenizers/bpe.py b/hezar/preprocessors/tokenizers/bpe.py
@@ -19,11 +19,8 @@
 @dataclass
 class BPEConfig(TokenizerConfig):
     name = "bpe_tokenizer"
-    max_length: int = 512
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     pad_to_multiple_of: int = 0
     bos_token: str = "<s>"

diff --git a/hezar/preprocessors/tokenizers/sentencepiece_bpe.py b/hezar/preprocessors/tokenizers/sentencepiece_bpe.py
@@ -19,11 +19,8 @@
 @dataclass
 class SentencePieceBPEConfig(TokenizerConfig):
     name = "sentencepiece_bpe_tokenizer"
-    max_length: int = 512
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     bos_token: str = "<s>"
     eos_token: str = "</s>"

diff --git a/hezar/preprocessors/tokenizers/sentencepiece_unigram.py b/hezar/preprocessors/tokenizers/sentencepiece_unigram.py
@@ -19,11 +19,8 @@
 @dataclass
 class SentencePieceUnigramConfig(TokenizerConfig):
     name = "sentencepiece_unigram_tokenizer"
-    max_length: int = 512
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     bos_token: str = "<s>"
     eos_token: str = "</s>"

diff --git a/hezar/preprocessors/tokenizers/tokenizer.py b/hezar/preprocessors/tokenizers/tokenizer.py
@@ -42,11 +42,8 @@ class TokenizerConfig(PreprocessorConfig):
     Configuration for the Tokenizer.
 
     Args:
-        max_length (int): Maximum length of the tokenized sequences.
-        truncation (str): Truncation strategy for tokenization.
         truncation_side (str): Truncation direction for tokenization.
         stride (int): Stride for tokenization.
-        padding (str): Padding type for tokenization e.g, max_length, longest, no_padding.
         padding_side (str): Padding direction for tokenization.
         pad_to_multiple_of (int): Pad to a multiple of this value.
         pad_token_type_id (int): ID of the padding token type.

diff --git a/hezar/preprocessors/tokenizers/wordpiece.py b/hezar/preprocessors/tokenizers/wordpiece.py
@@ -19,11 +19,8 @@
 @dataclass
 class WordPieceConfig(TokenizerConfig):
     name = "wordpiece_tokenizer"
-    max_length: int = 512
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     pad_to_multiple_of: int = 0
     pad_token: str = "[PAD]"