From 07ab70341347cf6542203175245d106b7cc9e89c Mon Sep 17 00:00:00 2001 From: arxyzan Date: Thu, 14 Nov 2024 11:55:03 +0330 Subject: [PATCH] :fire: Remove deprecated tokenizer config args --- hezar/models/speech_recognition/whisper/whisper_tokenizer.py | 3 --- hezar/preprocessors/tokenizers/bpe.py | 3 --- hezar/preprocessors/tokenizers/sentencepiece_bpe.py | 3 --- hezar/preprocessors/tokenizers/sentencepiece_unigram.py | 3 --- hezar/preprocessors/tokenizers/tokenizer.py | 3 --- hezar/preprocessors/tokenizers/wordpiece.py | 3 --- 6 files changed, 18 deletions(-) diff --git a/hezar/models/speech_recognition/whisper/whisper_tokenizer.py b/hezar/models/speech_recognition/whisper/whisper_tokenizer.py index c980e78a..9ff23e9c 100644 --- a/hezar/models/speech_recognition/whisper/whisper_tokenizer.py +++ b/hezar/models/speech_recognition/whisper/whisper_tokenizer.py @@ -254,11 +254,8 @@ @dataclass class WhisperBPEConfig(BPEConfig): name = "whisper_bpe_tokenizer" - max_length: int = 448 - truncation: str = "longest_first" truncation_side: str = "right" stride: int = 0 - padding: str = "longest" padding_side: str = "right" pad_to_multiple_of: int = 0 pad_token: str = "<|endoftext|>" diff --git a/hezar/preprocessors/tokenizers/bpe.py b/hezar/preprocessors/tokenizers/bpe.py index 51802eed..3da3a7dc 100644 --- a/hezar/preprocessors/tokenizers/bpe.py +++ b/hezar/preprocessors/tokenizers/bpe.py @@ -19,11 +19,8 @@ @dataclass class BPEConfig(TokenizerConfig): name = "bpe_tokenizer" - max_length: int = 512 - truncation: str = "longest_first" truncation_side: str = "right" stride: int = 0 - padding: str = "longest" padding_side: str = "right" pad_to_multiple_of: int = 0 bos_token: str = "" diff --git a/hezar/preprocessors/tokenizers/sentencepiece_bpe.py b/hezar/preprocessors/tokenizers/sentencepiece_bpe.py index ecb0df1e..36cda5b8 100644 --- a/hezar/preprocessors/tokenizers/sentencepiece_bpe.py +++ b/hezar/preprocessors/tokenizers/sentencepiece_bpe.py @@ -19,11 +19,8 @@ @dataclass class SentencePieceBPEConfig(TokenizerConfig): name = "sentencepiece_bpe_tokenizer" - max_length: int = 512 - truncation: str = "longest_first" truncation_side: str = "right" stride: int = 0 - padding: str = "longest" padding_side: str = "right" bos_token: str = "" eos_token: str = "" diff --git a/hezar/preprocessors/tokenizers/sentencepiece_unigram.py b/hezar/preprocessors/tokenizers/sentencepiece_unigram.py index c38d352d..834b6dc0 100644 --- a/hezar/preprocessors/tokenizers/sentencepiece_unigram.py +++ b/hezar/preprocessors/tokenizers/sentencepiece_unigram.py @@ -19,11 +19,8 @@ @dataclass class SentencePieceUnigramConfig(TokenizerConfig): name = "sentencepiece_unigram_tokenizer" - max_length: int = 512 - truncation: str = "longest_first" truncation_side: str = "right" stride: int = 0 - padding: str = "longest" padding_side: str = "right" bos_token: str = "" eos_token: str = "" diff --git a/hezar/preprocessors/tokenizers/tokenizer.py b/hezar/preprocessors/tokenizers/tokenizer.py index 15be87ff..33a83e35 100644 --- a/hezar/preprocessors/tokenizers/tokenizer.py +++ b/hezar/preprocessors/tokenizers/tokenizer.py @@ -42,11 +42,8 @@ class TokenizerConfig(PreprocessorConfig): Configuration for the Tokenizer. Args: - max_length (int): Maximum length of the tokenized sequences. - truncation (str): Truncation strategy for tokenization. truncation_side (str): Truncation direction for tokenization. stride (int): Stride for tokenization. - padding (str): Padding type for tokenization e.g, max_length, longest, no_padding. padding_side (str): Padding direction for tokenization. pad_to_multiple_of (int): Pad to a multiple of this value. pad_token_type_id (int): ID of the padding token type. diff --git a/hezar/preprocessors/tokenizers/wordpiece.py b/hezar/preprocessors/tokenizers/wordpiece.py index b97d30ef..60644fff 100644 --- a/hezar/preprocessors/tokenizers/wordpiece.py +++ b/hezar/preprocessors/tokenizers/wordpiece.py @@ -19,11 +19,8 @@ @dataclass class WordPieceConfig(TokenizerConfig): name = "wordpiece_tokenizer" - max_length: int = 512 - truncation: str = "longest_first" truncation_side: str = "right" stride: int = 0 - padding: str = "longest" padding_side: str = "right" pad_to_multiple_of: int = 0 pad_token: str = "[PAD]"