23 changes: 23 additions & 0 deletions .gitignore
@@ -0,0 +1,23 @@
# Python artifacts
__pycache__/
*.py[cod]
*.pyo

# Virtual environments
.venv/
venv/

# Editor configs
.idea/
.vscode/

# OS files
.DS_Store

# Audio/model outputs
audio_files/
test_environment/

# Data/cache
data/bst_data.pkl
*.log
86 changes: 84 additions & 2 deletions README.md
@@ -29,8 +29,9 @@ The Speaker Recognition Engine supports several commands for managing speaker au

1. **Enroll a Speaker**: Enroll a new speaker using an audio file.
2. **Recognize a Speaker**: Identify a speaker from a given audio file.
3. **List Enrolled Speakers**: Display a list of all enrolled speakers.
4. **Delete a Speaker**: Remove a speaker's data from the system.
3. **Recognize a Stream**: Feed audio chunks in near real-time and observe interim matches.
4. **List Enrolled Speakers**: Display a list of all enrolled speakers.
5. **Delete a Speaker**: Remove a speaker's data from the system.
Each command can be executed from the command line with the appropriate arguments.
The general syntax for using the tool is:
@@ -60,3 +61,84 @@ python cli.py enroll <speaker_name> <audio_file_path> [optional parameters]
```bash
python cli.py enroll gena /home/gena/audio_files/gena.wav --sample_rate 16000 --num_filters 40 --num_ceps 13 --n_fft 512 --frame_size 0.025 --frame_step 0.01 --n_mixtures 8
```

## Recognize a Speaker

Run the `recognize` command on a WAV file. The CLI prints the best match and the log-likelihood scores reported by the shared `VoiceRecognitionService`.

```bash
python cli.py recognize /home/gena/audio_files/gena.wav --sample_rate 16000
```

## Recognize a Stream (Real-Time Simulation)

The `recognize_stream` command reuses the same service façade but feeds the audio file in chunks (default 0.5 s). This mimics real-time capture and prints interim matches as soon as the likelihoods are high enough.

```bash
python cli.py recognize_stream /home/gena/audio_files/gena.wav --chunk_duration 0.25
```
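
If you need the same chunked behaviour from Python instead of the CLI, the sketch below mirrors what `recognize_stream` does: load the file, slice it into fixed-size chunks, and push each chunk through a session obtained from `start_session`. The `start_session`/`consume`/`close` calls and the result fields follow `src/live_recognition.py` from this change; the loading and slicing details are illustrative assumptions, not the command's exact implementation.

```python
# Sketch only: chunked recognition driven from Python. Result fields
# (speaker_id, score, rejected) follow src/live_recognition.py.
import librosa

from file_management.bst import BinarySearchTree
from service.api import RecognitionConfig, VoiceRecognitionService

bst = BinarySearchTree()
service = VoiceRecognitionService(bst=bst, base_directory="test_environment")
session = service.start_session(config=RecognitionConfig(sample_rate=16000))

# librosa resamples to 16 kHz if the file was recorded at another rate.
audio, sr = librosa.load("audio_files/gena.wav", sr=16000)
chunk_samples = int(0.25 * sr)  # mirrors --chunk_duration 0.25

for start in range(0, len(audio), chunk_samples):
    result = session.consume(audio[start:start + chunk_samples])
    if result and result.speaker_id and not result.rejected:
        print(f"Interim match: {result.speaker_id} (score {result.score:.2f})")

session.close()
```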

## Live Microphone Demo

Use `src/live_recognition.py` to capture audio from the default input device and route it directly through the streaming API. Ensure `sounddevice` sees your microphone, then run:

```bash
python src/live_recognition.py
```

Speak into the microphone—interim matches will appear as the engine accumulates enough audio. Press `Ctrl+C` to stop.

## Embedding the Service API

For tighter integration with other applications (e.g., the upcoming voice engine), import `VoiceRecognitionService` and the request/response models:

```python
from file_management.bst import BinarySearchTree
from service.api import VoiceRecognitionService, EnrollmentRequest, EnrollmentConfig
from service.audio_sources import BufferAudioSource

bst = BinarySearchTree()
service = VoiceRecognitionService(bst=bst, base_directory="test_environment")

# Enroll using in-memory buffers; pcm_chunk_1/pcm_chunk_2 stand in for raw
# PCM arrays you have already captured elsewhere.
req = EnrollmentRequest(
    speaker_id="alice",
    audio_source=BufferAudioSource(buffers=[pcm_chunk_1, pcm_chunk_2]),
    config=EnrollmentConfig(sample_rate=16000),
)
service.enroll(req)
```

The same façade exposes `recognize`, `start_session`, `list_speakers`, and `delete_speaker`, allowing other repositories to depend on this module without invoking the CLI.
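
As a hedged sketch of those housekeeping calls (the method names match the CLI wiring in `src/cli.py`, but the signatures and return shapes noted in comments are assumptions):

```python
# Sketch: list and delete speakers through the façade rather than the CLI.
from file_management.bst import BinarySearchTree
from service.api import VoiceRecognitionService

bst = BinarySearchTree()
service = VoiceRecognitionService(bst=bst, base_directory="test_environment")

print(service.list_speakers())   # assumed to return the enrolled speaker IDs
service.delete_speaker("alice")  # assumed to take the speaker name, as in the CLI

# Persist the BST index afterwards, as cli.py does after each command.
bst.serialize_bst()
```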

## Recording a Test WAV on Raspberry Pi with Jabra Speak 410

Use this workflow to capture a 16 kHz mono WAV file on the Raspberry Pi 5 connected to the Jabra speaker/mic. All commands assume the repository lives under `/home/gena/PROJECTS`.

1. Set the Jabra device as the default PipeWire sink/source:
   ```bash
   ./roomba_stack/audio_jabra_default.sh
   ```
2. Confirm the capture device name (needed in the next step):
   ```bash
   pactl list short sources | grep -i jabra
   ```
   You should see something like `alsa_input.usb-0b0e_Jabra_SPEAK_410_USB_...-mono-fallback` running at 16 kHz.
3. Make sure there is a place to store recordings:
   ```bash
   mkdir -p voice-recognition-engine/audio_files
   ```
4. Record a short sample (5–10 seconds) using the PipeWire/ALSA device discovered in step 2:
   ```bash
   parecord \
     --device=alsa_input.usb-0b0e_Jabra_SPEAK_410_USB_50C2ED166881x011200-00.mono-fallback \
     --rate=16000 --channels=1 --format=s16le \
     voice-recognition-engine/audio_files/gmm_test.wav
   ```
   Speak while the command runs and press `Ctrl+C` when finished.
5. Validate the recording before using it with the GMM engine:
   ```bash
   aplay voice-recognition-engine/audio_files/gmm_test.wav
   ```

The resulting `gmm_test.wav` resides in `voice-recognition-engine/audio_files/` and can be supplied to the CLI commands (e.g., `python src/cli.py recognize voice-recognition-engine/audio_files/gmm_test.wav --sample_rate 16000`).
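
Before feeding the file to the engine, you can optionally sanity-check the format from Python. This is a small, assumed convenience check (not part of the tooling) that relies only on `librosa`, already pinned in `requirements.txt`; the path assumes you run it from the directory above `voice-recognition-engine`:

```python
# Optional sanity check: confirm the recording is 16 kHz mono.
import librosa

path = "voice-recognition-engine/audio_files/gmm_test.wav"
# sr=None keeps the native sample rate; mono=False preserves the channel layout.
audio, sr = librosa.load(path, sr=None, mono=False)

channels = 1 if audio.ndim == 1 else audio.shape[0]
print(f"sample rate: {sr} Hz (expect 16000)")
print(f"channels:    {channels} (expect 1)")
print(f"duration:    {audio.shape[-1] / sr:.1f} s")
```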
1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ librosa==0.10.2.post1
numpy==2.0.2
scikit-learn==1.5.2
matplotlib==3.9.2
sounddevice==0.4.7
111 changes: 65 additions & 46 deletions src/cli.py
@@ -1,15 +1,16 @@
import argparse
import os

from file_management.bst import BinarySearchTree
from service.api import EnrollmentConfig, RecognitionConfig, VoiceRecognitionService
from service.commands import (
    EnrollSpeakerCommand,
    RecognizeSpeakerCommand,
    RecognizeStreamCommand,
    ListSpeakersCommand,
    DeleteSpeakerCommand,
    CommandHandler
)
from file_management.bst import BinarySearchTree
from file_management.file_management import FileManagementInterface

def setup_environment(base_directory):
    # Ensure the base directory for models, audio files, and metadata exists
@@ -52,6 +53,19 @@ def main(command_line_args=None):
    recognize_parser.add_argument('--fft_size', type=int, default=512, help='FFT size for audio processing')
    recognize_parser.add_argument('--num_filters', type=int, default=26, help='Number of Mel filters')
    recognize_parser.add_argument('--num_ceps', type=int, default=13, help='Number of MFCC coefficients')
    recognize_parser.add_argument('--score_threshold', type=float, default=None, help='Minimum log-likelihood to accept a speaker match')

    # Streaming recognition command
    recognize_stream_parser = subparsers.add_parser('recognize_stream', help='Stream audio chunks to recognize a speaker in near real-time')
    recognize_stream_parser.add_argument('audio_file', type=str, help='Path to the audio file')
    recognize_stream_parser.add_argument('--sample_rate', type=int, default=16000, help='Sample rate of the audio stream')
    recognize_stream_parser.add_argument('--frame_size', type=float, default=0.025, help='Frame size in seconds')
    recognize_stream_parser.add_argument('--frame_step', type=float, default=0.01, help='Frame step (overlap) in seconds')
    recognize_stream_parser.add_argument('--fft_size', type=int, default=512, help='FFT size for audio processing')
    recognize_stream_parser.add_argument('--num_filters', type=int, default=26, help='Number of Mel filters')
    recognize_stream_parser.add_argument('--num_ceps', type=int, default=13, help='Number of MFCC coefficients')
    recognize_stream_parser.add_argument('--score_threshold', type=float, default=None, help='Minimum log-likelihood to accept a speaker match')
    recognize_stream_parser.add_argument('--chunk_duration', type=float, default=0.5, help='Duration (seconds) of each streamed chunk')

    # List Speakers Command
    subparsers.add_parser('list_speakers', help='List all enrolled speakers')
@@ -72,79 +86,84 @@ def main(command_line_args=None):
    # Ensure environment setup
    setup_environment(base_directory)

    # Initialize Binary Search Tree
    bst = BinarySearchTree() # Placeholder for actual binary search tree implementation
    # Initialize Binary Search Tree and shared service
    bst = BinarySearchTree()
    service = VoiceRecognitionService(bst=bst, base_directory=base_directory)

    # Process the command based on the parsed arguments
    if args.command == 'enroll':
        command = EnrollSpeakerCommand(
            speaker_name=args.speaker_name,
            audio_file=args.audio_file,
            bst=bst,
            base_directory=base_directory,
        enroll_config = EnrollmentConfig(
            sample_rate=args.sample_rate,
            num_filters=args.num_filters,
            num_ceps=args.num_ceps,
            n_fft=args.n_fft,
            fft_size=args.n_fft,
            frame_size=args.frame_size,
            frame_step=args.frame_step,
            n_mixtures=args.n_mixtures
            mixtures=args.n_mixtures,
        )
        command = EnrollSpeakerCommand(
            service=service,
            speaker_name=args.speaker_name,
            audio_file=args.audio_file,
            config=enroll_config,
        )
        handler.run(command)

        # Serialize the BST before exiting the program
        bst.serialize_bst()

    elif args.command == 'recognize':
        recognize_config = RecognitionConfig(
            sample_rate=args.sample_rate,
            frame_size=args.frame_size,
            frame_step=args.frame_step,
            fft_size=args.fft_size,
            num_filters=args.num_filters,
            num_ceps=args.num_ceps,
        )
        command = RecognizeSpeakerCommand(
            bst=bst,
            service=service,
            audio_file=args.audio_file,
            base_directory=base_directory,
            config=recognize_config,
            score_threshold=args.score_threshold
        )
        handler.run(command)

    elif args.command == 'recognize_stream':
        recognize_config = RecognitionConfig(
            sample_rate=args.sample_rate,
            frame_size=args.frame_size,
            frame_step=args.frame_step,
            fft_size=args.fft_size,
            num_filters=args.num_filters,
            num_ceps=args.num_ceps
            num_ceps=args.num_ceps,
        )
        command = RecognizeStreamCommand(
            service=service,
            audio_file=args.audio_file,
            config=recognize_config,
            score_threshold=args.score_threshold,
            chunk_duration=args.chunk_duration,
        )
        handler.run(command)

    elif args.command == 'list_speakers':
        file_management = FileManagementInterface(bst=bst, base_directory=base_directory)
        command = ListSpeakersCommand(file_management)
        command = ListSpeakersCommand(service=service)
        handler.run(command)

    elif args.command == 'delete_speaker':
        file_management = FileManagementInterface(bst=bst, base_directory=base_directory)
        command = DeleteSpeakerCommand(args.speaker_name, file_management)
        command = DeleteSpeakerCommand(service=service, speaker_name=args.speaker_name)
        handler.run(command)

    else:
        parser.print_help()

    # Persist BST state after command execution
    bst.serialize_bst()

if __name__ == "__main__":
    #debug_args = [
    # 'enroll',
    # 'maria',
    # '/home/gena/PROJECTS/voice-recognition-engine/audio_files/maria.wav',
    # '--sample_rate', '16000',
    # '--num_filters', '40',
    # '--num_ceps', '13',
    # '--n_fft', '512',
    # '--frame_size', '0.025',
    # '--frame_step', '0.01',
    # '--n_mixtures', '8'
    #]

    debug_args = [
        'recognize',
        '/home/gena/PROJECTS/voice-recognition-engine/audio_files/leah_recognize.wav',
        '--sample_rate', '16000',
        '--frame_size', '0.025',
        '--frame_step', '0.01',
        '--fft_size', '512',
        '--num_filters', '40',
        '--num_ceps', '13',
    ]

    main(debug_args)
    # To run with ad-hoc arguments during development, pass them explicitly, e.g.:
    # debug_args = [
    #     'recognize',
    #     '/path/to/audio.wav',
    #     '--sample_rate', '16000',
    # ]
    # main(debug_args)
    main()
63 changes: 63 additions & 0 deletions src/live_recognition.py
@@ -0,0 +1,63 @@
"""Simple live recognition runner using the shared service façade."""
from __future__ import annotations

import queue
import sys
from typing import Optional

import numpy as np
import sounddevice as sd

from file_management.bst import BinarySearchTree
from service.api import RecognitionConfig, VoiceRecognitionService


def run_live_recognition(
    base_directory: str = "test_environment",
    sample_rate: int = 16000,
    chunk_duration: float = 0.25,
    threshold: Optional[float] = None,
):
    """Capture microphone audio and stream it to the recognition service."""

    bst = BinarySearchTree()
    service = VoiceRecognitionService(bst=bst, base_directory=base_directory)
    config = RecognitionConfig(sample_rate=sample_rate)
    session = service.start_session(config=config, threshold=threshold)

    chunk_samples = max(1, int(chunk_duration * sample_rate))
    audio_queue: "queue.Queue[np.ndarray]" = queue.Queue()

    def _callback(indata, frames, time, status):  # pylint: disable=unused-argument
        if status:
            print(status, file=sys.stderr)
        audio_queue.put(indata.copy().reshape(-1))

    print("Listening... Press Ctrl+C to stop.")
    try:
        with sd.InputStream(
            samplerate=sample_rate,
            channels=1,
            blocksize=chunk_samples,
            dtype="float32",
            callback=_callback,
        ):
            latest = None
            while True:
                chunk = audio_queue.get()
                result = session.consume(chunk)
                if not result:
                    continue
                latest = result
                if result.speaker_id and not result.rejected:
                    print(f"Recognized {result.speaker_id} (score {result.score:.2f})")

    except KeyboardInterrupt:
        print("Stopping live recognition...")
    finally:
        session.close()
        bst.serialize_bst()


if __name__ == "__main__":
    run_live_recognition()