From 07ab70341347cf6542203175245d106b7cc9e89c Mon Sep 17 00:00:00 2001
From: arxyzan <arxyzan@gmail.com>
Date: Thu, 14 Nov 2024 11:55:03 +0330
Subject: [PATCH] :fire: Remove deprecated tokenizer config args

---
 hezar/models/speech_recognition/whisper/whisper_tokenizer.py | 3 ---
 hezar/preprocessors/tokenizers/bpe.py                        | 3 ---
 hezar/preprocessors/tokenizers/sentencepiece_bpe.py          | 3 ---
 hezar/preprocessors/tokenizers/sentencepiece_unigram.py      | 3 ---
 hezar/preprocessors/tokenizers/tokenizer.py                  | 3 ---
 hezar/preprocessors/tokenizers/wordpiece.py                  | 3 ---
 6 files changed, 18 deletions(-)
diff --git a/hezar/models/speech_recognition/whisper/whisper_tokenizer.py b/hezar/models/speech_recognition/whisper/whisper_tokenizer.py
index c980e78a..9ff23e9c 100644
--- a/hezar/models/speech_recognition/whisper/whisper_tokenizer.py
+++ b/hezar/models/speech_recognition/whisper/whisper_tokenizer.py
@@ -254,11 +254,8 @@
 @dataclass
 class WhisperBPEConfig(BPEConfig):
     name = "whisper_bpe_tokenizer"
-    max_length: int = 448
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     pad_to_multiple_of: int = 0
     pad_token: str = "<|endoftext|>"
diff --git a/hezar/preprocessors/tokenizers/bpe.py b/hezar/preprocessors/tokenizers/bpe.py
index 51802eed..3da3a7dc 100644
--- a/hezar/preprocessors/tokenizers/bpe.py
+++ b/hezar/preprocessors/tokenizers/bpe.py
@@ -19,11 +19,8 @@
 @dataclass
 class BPEConfig(TokenizerConfig):
     name = "bpe_tokenizer"
-    max_length: int = 512
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     pad_to_multiple_of: int = 0
     bos_token: str = "<s>"
diff --git a/hezar/preprocessors/tokenizers/sentencepiece_bpe.py b/hezar/preprocessors/tokenizers/sentencepiece_bpe.py
index ecb0df1e..36cda5b8 100644
--- a/hezar/preprocessors/tokenizers/sentencepiece_bpe.py
+++ b/hezar/preprocessors/tokenizers/sentencepiece_bpe.py
@@ -19,11 +19,8 @@
 @dataclass
 class SentencePieceBPEConfig(TokenizerConfig):
     name = "sentencepiece_bpe_tokenizer"
-    max_length: int = 512
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     bos_token: str = "<s>"
     eos_token: str = "</s>"
diff --git a/hezar/preprocessors/tokenizers/sentencepiece_unigram.py b/hezar/preprocessors/tokenizers/sentencepiece_unigram.py
index c38d352d..834b6dc0 100644
--- a/hezar/preprocessors/tokenizers/sentencepiece_unigram.py
+++ b/hezar/preprocessors/tokenizers/sentencepiece_unigram.py
@@ -19,11 +19,8 @@
 @dataclass
 class SentencePieceUnigramConfig(TokenizerConfig):
     name = "sentencepiece_unigram_tokenizer"
-    max_length: int = 512
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     bos_token: str = "<s>"
     eos_token: str = "</s>"
diff --git a/hezar/preprocessors/tokenizers/tokenizer.py b/hezar/preprocessors/tokenizers/tokenizer.py
index 15be87ff..33a83e35 100644
--- a/hezar/preprocessors/tokenizers/tokenizer.py
+++ b/hezar/preprocessors/tokenizers/tokenizer.py
@@ -42,11 +42,8 @@ class TokenizerConfig(PreprocessorConfig):
     Configuration for the Tokenizer.
 
     Args:
-        max_length (int): Maximum length of the tokenized sequences.
-        truncation (str): Truncation strategy for tokenization.
         truncation_side (str): Truncation direction for tokenization.
         stride (int): Stride for tokenization.
-        padding (str): Padding type for tokenization e.g, max_length, longest, no_padding.
         padding_side (str): Padding direction for tokenization.
         pad_to_multiple_of (int): Pad to a multiple of this value.
         pad_token_type_id (int): ID of the padding token type.
diff --git a/hezar/preprocessors/tokenizers/wordpiece.py b/hezar/preprocessors/tokenizers/wordpiece.py
index b97d30ef..60644fff 100644
--- a/hezar/preprocessors/tokenizers/wordpiece.py
+++ b/hezar/preprocessors/tokenizers/wordpiece.py
@@ -19,11 +19,8 @@
 @dataclass
 class WordPieceConfig(TokenizerConfig):
     name = "wordpiece_tokenizer"
-    max_length: int = 512
-    truncation: str = "longest_first"
     truncation_side: str = "right"
     stride: int = 0
-    padding: str = "longest"
     padding_side: str = "right"
     pad_to_multiple_of: int = 0
     pad_token: str = "[PAD]"