 # limitations under the License.

 # Imports
+from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import openwakeword
 from openwakeword.utils import AudioFeatures, re_arg
@@ -42,6 +43,7 @@ def __init__(
             enable_speex_noise_suppression: bool = False,
             vad_threshold: float = 0,
             self_confirm: bool = False,
+            self_confirm_ncpus: int = 1,
             custom_verifier_models: dict = {},
             custom_verifier_threshold: float = 0.1,
             inference_framework: str = "tflite",
@@ -71,6 +73,7 @@ def __init__(
             augmentation that can significantly reduce false detections, but also significantly increases
             the computational cost of running the model when used. See the `self_confirm` method for more
             details on how to leverage this functionality.
+            self_confirm_ncpus (int): The number of CPU cores to use when running the self-confirmation model.
             custom_verifier_models (dict): A dictionary of paths to custom verifier models, where
                 the keys are the model names (corresponding to the openwakeword.MODELS
                 attribute) and the values are the filepaths of the
@@ -222,8 +225,19 @@ def onnx_predict(onnx_model, x):
                 class_mapping_dicts=class_mapping_dicts,
                 self_confirm=False,
                 inference_framework=inference_framework,
-                **kwargs
+                ncpu=self_confirm_ncpus
             )
+            self.confirmation_results = None
+            self.self_confirm_ncpus = self_confirm_ncpus
+
+            # Create a dedicated thread pool for background self_confirm calls
+            self.confirmation_executor = ThreadPoolExecutor(max_workers=1)
+
+            # Force thread pool initialization by submitting a dummy task,
+            # paying the first-call overhead here rather than during a
+            # latency-sensitive confirmation later
+            def _noop():
+                pass
+            self.confirmation_executor.submit(_noop).result()

         # Create AudioFeatures object
         self.preprocessor = AudioFeatures(inference_framework=inference_framework, **kwargs)
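For context, a minimal sketch of how the new `self_confirm_ncpus` argument might be used; the model name `"hey_jarvis"` and the core count are illustrative assumptions, not part of this diff:

```python
# Hypothetical usage of the self-confirmation options added in this PR
from openwakeword.model import Model

oww = Model(
    wakeword_models=["hey_jarvis"],  # illustrative model name
    self_confirm=True,               # builds the secondary confirmation model
    self_confirm_ncpus=2             # new argument: cores for that model
)
```

Warming the executor with a no-op `submit(...).result()` at construction time pays the one-time thread-creation cost up front, rather than on the first latency-sensitive `self_confirm` call.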
@@ -244,6 +258,7 @@ def reset(self):
         when called too frequently."""
         self.prediction_buffer = defaultdict(partial(deque, maxlen=30))
         self.preprocessor.reset()
+        self.confirmation_results = None

     def predict(self, x: np.ndarray, patience: dict = {},
                 threshold: dict = {}, debounce_time: float = 0.0, timing: bool = False):
@@ -401,7 +416,7 @@ def predict(self, x: np.ndarray, patience: dict = {},
         else:
             return predictions

-    def self_confirm(self, last_n_seconds: float = 1.5):
+    def self_confirm(self, last_n_seconds: float = 1.5, background: bool = False):
         """
         Use the confirmation model to confirm the predictions from the main model. This is a form of
         test-time augmentation that can significantly reduce false detections, but significantly increases
@@ -420,42 +435,60 @@ def self_confirm(self, last_n_seconds: float = 1.5):
             last_n_seconds (float): The number of seconds of audio to use for confirmation.
                 The default (1.5) should be sufficient for most use cases, but increase if your
                 target wake-word/phrase is long, or decrease if short.
+            background (bool): Whether to run the confirmation model in a background thread. If True, the
+                results of the function will be returned asynchronously and stored in the
+                `self.confirmation_results` attribute. Until the results are available, this attribute
+                will be None.
         Returns:
             dict: A dictionary of scores between 0 and 1 for each model, representing the maximum
                 score from the confirmation model over the last `last_n_seconds` seconds of audio.
+                If background=True, returns None and stores results in `self.confirmation_results` when ready.
         """
         # Check for self-confirm functionality
         if self.self_confirm_enabled is False:
             raise ValueError("The self-confirm functionality is not enabled for this model instance!")

         # Check for at least two cores
         cpu_count = os.cpu_count()
-        if cpu_count is None or cpu_count < 2:
+        if (cpu_count is None or cpu_count < 2) and background:
             raise ValueError("The self-confirm functionality requires at least two CPU cores, as it uses threading.")
-        # Get the last n seconds of audio from the audio buffer of the main model, and get the features
-        # with the self-confirmation model preprocessor
-        n_samples = int(last_n_seconds*16000)
-        if len(self.preprocessor.raw_data_buffer) < n_samples:
-            raise ValueError("Not enough audio data has been processed to use the self-confirm functionality!")
-        audio_data = np.array(self.preprocessor.raw_data_buffer)[-n_samples:]
-
-        # Reset the self-confirmation model, if it has been used before
-        if self.confirmation_model.preprocessor.accumulated_samples == 0:
-            self.confirmation_model.reset()
-
-        # Run model to get predictions
-        step_size = 1280
-        predictions = []
-        for i in range(0, audio_data.shape[0]-step_size, step_size):
-            predictions.append(self.confirmation_model.predict(audio_data[i:i+step_size]))
+        # Define the function that runs the confirmation predictions
+        def _run_confirmation_predictions():
+            # Get the last n seconds of audio from the audio buffer of the main model, and get the features
+            # with the self-confirmation model preprocessor
+            n_samples = int(last_n_seconds*16000)
+            if len(self.preprocessor.raw_data_buffer) < n_samples:
+                raise ValueError("Not enough audio data has been processed to use the self-confirm functionality!")
+            audio_data = np.fromiter(self.preprocessor.raw_data_buffer, dtype=np.int16)[-n_samples:]

-        predictions_dict = {}
-        for mdl in predictions[0].keys():
-            predictions_per_model = [p[mdl] for p in predictions]
-            predictions_dict[mdl] = np.max(predictions_per_model)
+            # Reset the self-confirmation model, if it has been used before
+            if self.confirmation_model.preprocessor.accumulated_samples == 0:
+                self.confirmation_model.reset()

-        return predictions_dict
+            # Run model to get predictions
+            step_size = 1280
+            predictions = []
+            for i in range(0, audio_data.shape[0]-step_size, step_size):
+                predictions.append(self.confirmation_model.predict(audio_data[i:i+step_size]))
+
+            predictions_dict = {}
+            for mdl in predictions[0].keys():
+                predictions_per_model = [p[mdl] for p in predictions]
+                predictions_dict[mdl] = np.max(predictions_per_model)
+
+            # Store the results so they are visible to the calling thread
+            self.confirmation_results = predictions_dict
+
+        # Run in a background thread if requested
+        if background:
+            self.confirmation_results = None
+            self.confirmation_executor.submit(_run_confirmation_predictions)
+            return None
+        else:
+            # Run synchronously
+            _run_confirmation_predictions()
+            return self.confirmation_results

     def predict_clip(self, clip: Union[str, np.ndarray], padding: int = 1, chunk_size=1280, **kwargs):
         """Predict on a full audio clip, simulating streaming prediction.