Skip to content

Commit

Permalink
🔥 Remove deprecated tokenizer config args
Browse files Browse the repository at this point in the history
  • Loading branch information
arxyzan committed Nov 14, 2024
1 parent aa12363 commit 07ab703
Show file tree
Hide file tree
Showing 6 changed files with 0 additions and 18 deletions.
3 changes: 0 additions & 3 deletions hezar/models/speech_recognition/whisper/whisper_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,8 @@
@dataclass
class WhisperBPEConfig(BPEConfig):
name = "whisper_bpe_tokenizer"
max_length: int = 448
truncation: str = "longest_first"
truncation_side: str = "right"
stride: int = 0
padding: str = "longest"
padding_side: str = "right"
pad_to_multiple_of: int = 0
pad_token: str = "<|endoftext|>"
Expand Down
3 changes: 0 additions & 3 deletions hezar/preprocessors/tokenizers/bpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,8 @@
@dataclass
class BPEConfig(TokenizerConfig):
name = "bpe_tokenizer"
max_length: int = 512
truncation: str = "longest_first"
truncation_side: str = "right"
stride: int = 0
padding: str = "longest"
padding_side: str = "right"
pad_to_multiple_of: int = 0
bos_token: str = "<s>"
Expand Down
3 changes: 0 additions & 3 deletions hezar/preprocessors/tokenizers/sentencepiece_bpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,8 @@
@dataclass
class SentencePieceBPEConfig(TokenizerConfig):
name = "sentencepiece_bpe_tokenizer"
max_length: int = 512
truncation: str = "longest_first"
truncation_side: str = "right"
stride: int = 0
padding: str = "longest"
padding_side: str = "right"
bos_token: str = "<s>"
eos_token: str = "</s>"
Expand Down
3 changes: 0 additions & 3 deletions hezar/preprocessors/tokenizers/sentencepiece_unigram.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,8 @@
@dataclass
class SentencePieceUnigramConfig(TokenizerConfig):
name = "sentencepiece_unigram_tokenizer"
max_length: int = 512
truncation: str = "longest_first"
truncation_side: str = "right"
stride: int = 0
padding: str = "longest"
padding_side: str = "right"
bos_token: str = "<s>"
eos_token: str = "</s>"
Expand Down
3 changes: 0 additions & 3 deletions hezar/preprocessors/tokenizers/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,8 @@ class TokenizerConfig(PreprocessorConfig):
Configuration for the Tokenizer.
Args:
max_length (int): Maximum length of the tokenized sequences.
truncation (str): Truncation strategy for tokenization.
truncation_side (str): Truncation direction for tokenization.
stride (int): Stride for tokenization.
padding (str): Padding type for tokenization, e.g., max_length, longest, no_padding.
padding_side (str): Padding direction for tokenization.
pad_to_multiple_of (int): Pad to a multiple of this value.
pad_token_type_id (int): ID of the padding token type.
Expand Down
3 changes: 0 additions & 3 deletions hezar/preprocessors/tokenizers/wordpiece.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,8 @@
@dataclass
class WordPieceConfig(TokenizerConfig):
name = "wordpiece_tokenizer"
max_length: int = 512
truncation: str = "longest_first"
truncation_side: str = "right"
stride: int = 0
padding: str = "longest"
padding_side: str = "right"
pad_to_multiple_of: int = 0
pad_token: str = "[PAD]"
Expand Down

0 comments on commit 07ab703

Please sign in to comment.