Merge pull request #11 from ss-sebastian/cantonese-try-3

Jemoka · web-flow · commit 6ed1845181b4 · 2024-11-05T09:56:40.000-08:00
Cantonese try 3
diff --git a/batchalign/constants.py b/batchalign/constants.py
@@ -1,5 +1,5 @@
 # CHAT punctuation specifications
-ENDING_PUNCT = [".", "?", "!", "+//.", "+/.", "+...", "+\"/.", "+..?", "+\".", "+//?", "+.", "+!?", "+/?", "..."]
+ENDING_PUNCT = [".", "?", "？", "！", "!", "+//.", "+/.", "+...", "+\"/.", "+..?","+..？", "+\".", "+//?", "+//？","+.", "+!?", "+！？", "+/?", "+/？",  "...", "？","！"]
 MOR_PUNCT = ["‡", "„", ","]
 CHAT_IGNORE = ["xxx", "yyy", "www"]
 
diff --git a/batchalign/models/resolve.py b/batchalign/models/resolve.py
@@ -12,7 +12,7 @@
     },
     "whisper": {
         'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),
-        # 'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),
+        'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),
     }
 }
 
diff --git a/batchalign/models/utils.py b/batchalign/models/utils.py
@@ -15,13 +15,13 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec
         tensor containing the timestamps in seconds for each predicted token
     """
     # Create a list with `decoder_layers` elements, each a tensor of shape
-    # (batch size, attention_heads, output length, input length).
+    # (batch_size, attention_heads, output_length, input_length).
     cross_attentions = []
     for i in range(self.config.decoder_layers):
         cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2))
 
-    # Select specific cross-attention layers and heads. This is a tensor
-    # of shape (batch size, num selected, output length, input length).
+    # Select specific cross-attention layers and heads. This results in a tensor
+    # of shape (batch_size, num_selected_heads, output_length, input_length).
     weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads])
     weights = weights.permute([1, 0, 2, 3])
     if num_frames is not None:
@@ -32,21 +32,39 @@ def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_prec
     weights = (weights - mean) / std
     weights = _median_filter(weights, self.config.median_filter_width)
 
-    # Average the different cross-attention heads.
+    # Average the different cross-attention heads to get a matrix of shape
+    # (batch_size, output_length, input_length).
     matrix = weights.mean(dim=1)
 
-    timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)
+    # Run DTW on each batch element first, collecting its `jump_times` and
+    # tracking the longest one so the timestamps tensor can be sized to fit.
+    batch_size = generate_outputs.sequences.size(0)
+    max_jump_length = 0
+    batch_jump_times = []
 
-    # Perform dynamic time warping on each element of the batch.
-    for batch_idx in range(timestamps.shape[0]):
+    # First pass: Compute `jump_times` and find the maximum length.
+    for batch_idx in range(batch_size):
         text_indices, time_indices = _dynamic_time_warping(-matrix[batch_idx].float().cpu().numpy())
         jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
         jump_times = time_indices[jumps] * time_precision
-        timestamps[batch_idx, 1:] = torch.tensor(jump_times)
+        batch_jump_times.append(jump_times)
+        if len(jump_times) > max_jump_length:
+            max_jump_length = len(jump_times)
+
+    # Initialize timestamps tensor with appropriate size.
+    # Adding 1 to account for the initial zero (timestamps[:, 0]).
+    timestamps = torch.zeros((batch_size, max_jump_length + 1), dtype=torch.float32)
+
+    # Second pass: Assign `jump_times` to the timestamps tensor.
+    for batch_idx, jump_times in enumerate(batch_jump_times):
+        length = len(jump_times)
+        # Assign `jump_times` to the appropriate slice in `timestamps`.
+        timestamps[batch_idx, 1:1+length] = torch.tensor(jump_times, dtype=torch.float32)
 
     return timestamps
 
 
+
 @dataclass
 class ASRAudioFile:
     file : str
diff --git a/batchalign/models/utterance/infer.py b/batchalign/models/utterance/infer.py
@@ -31,6 +31,8 @@ def __init__(self, model):
         self.tokenizer = AutoTokenizer.from_pretrained(model)
         self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
 
+        self.max_length = self.model.config.max_position_embeddings
+
         # eval mode
         self.model.eval()
 
@@ -43,15 +45,27 @@ def __call__(self, passage):
         passage = passage.replace('.','')
 
         # "tokenize" the result by just splitting by space
-        input_tokenized = passage.split(' ')
+        input_tokenized = passage.split(' ') if passage.strip() else []
+        if not input_tokenized:
+            raise ValueError("Tokenized input is empty after preprocessing")
+
+        if len(input_tokenized) > self.max_length:
+            input_tokenized = input_tokenized[:self.max_length]
+
+        print(f"Input tokenized length: {len(input_tokenized)}, tokens: {input_tokenized}")
+
 
         # pass it through the tokenizer and model
         tokd = self.tokenizer([input_tokenized],
                               return_tensors='pt',
-                              is_split_into_words=True).to(DEVICE)
+                              is_split_into_words=True,
+                              truncation=True,
+                              max_length=self.max_length
+                              ).to(DEVICE)
 
         # pass it through the model
-        res = self.model(**tokd).logits
+        with torch.no_grad():
+            res = self.model(**tokd).logits
 
         # argmax
         classified_targets = torch.argmax(res, dim=2).cpu()
diff --git a/batchalign/models/whisper/infer_asr.py b/batchalign/models/whisper/infer_asr.py
diff --git a/batchalign/pipelines/asr/utils.py b/batchalign/pipelines/asr/utils.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -12,7 +12,7 @@` |
| 12 | 12 | `},` |
| 13 | 13 | `"whisper": {` |
| 14 | 14 | `'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),` |
| 15 | | `- # 'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),` |
| | 15 | `+ 'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),` |
| 16 | 16 | `}` |
| 17 | 17 | `}` |
| 18 | 18 | |
`18`	`18`