Hey folks,
I have a very basic question here. I've looked through the tutorials and the scripts and haven't found the answer, so apologies if it's already covered elsewhere.
I'm working on marine mammal classifiers using BirdNET embeddings, and I've been able to train a custom model in the GUI that works pretty well. I've also written custom scripts to use the model to predict on audio files so that I can build precision-recall curves and confusion matrices.
Unfortunately, I've not been able to figure out how to do that using the pre-written package functions. I'm reasonably OK at Python, I can write and read objects, but I don't see where I can point to a `.tflite` model and species list, as well as the input parameters I used to train, e.g. limited to 8 kHz. Probably a stupid question, but how do I point to my pre-trained model using `model.load_model()` when it doesn't take an input location? Below is what I've written to process my data with my custom model, but it still doesn't take into account the spectral parameters.
```python
import os

import librosa
import numpy as np
import pandas as pd
import scipy.special
from tensorflow import lite as tflite  # or: import tflite_runtime.interpreter as tflite


class BirdNetPredictor:
    def __init__(self, model_path, label_path, audio_folder,
                 sample_rate=48000, audio_duration=3.0, confidence_thresh=0.5):
        """Processor class for running tflite (e.g. BirdNET) models on audio files in a folder.

        Parameters:
            model_path (str): Path to the TensorFlow Lite model.
            label_path (str): Path to the label file (text file with one label per line).
            audio_folder (str): Path to the folder containing audio files.
            sample_rate (int): Sample rate for the model (default 48000).
            audio_duration (float): Duration in seconds of audio to classify (default 3.0).
            confidence_thresh (float): Minimum confidence to keep a detection (default 0.5).

        Example usage:
            model_path = "C:\\Users\\kaity\\Documents\\GitHub\\Ecotype\\Experiments\\BirdNET\\Bckrnd_mn_srkw_tkw_offshore_TKW_balanced_4k\\CustomClassifier_100_calls_Balanced_calltypes.tflite"
            label_path = "C:\\Users\\kaity\\Documents\\GitHub\\Ecotype\\Experiments\\BirdNET\\Bckrnd_mn_srkw_tkw_offshore_TKW_balanced_4k\\CustomClassifier_100_calls_Balanced_calltypes_Labels.txt"
            audio_folder = 'C:\\TempData\\DCLDE_EVAL\\SMRU\\Audio\\SMRU_test\\'
            processor = BirdNetPredictor(model_path, label_path, audio_folder)

            # Batch process audio files in folder and export to CSV
            output_csv = "predictions_output.csv"
            df_SMRU = processor.batch_process_audio_folder(output_csv)
            processor.export_to_raven(df_SMRU, raven_file="raven_output.txt")
        """
        self.model_path = model_path
        self.label_path = label_path
        self.audio_folder = audio_folder
        self.sample_rate = sample_rate
        self.audio_duration = audio_duration

        # Load model and labels
        self.interpreter = tflite.Interpreter(model_path=model_path)
        self.interpreter.allocate_tensors()
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()

        # Confidence threshold
        self.confidence_thresh = confidence_thresh

        # Override the requested duration with what the model input actually expects
        input_shape = self.input_details[0]['shape']
        self.audio_duration = input_shape[1] / self.sample_rate
        print(f"Model expects {self.audio_duration} seconds of audio at {self.sample_rate} Hz")

        # Load labels
        self.labels = self.load_labels(label_path)

    def load_labels(self, label_path):
        """Load class labels from a text file (one label per line)."""
        with open(label_path, "r") as f:
            return [line.strip() for line in f]

    def preprocess_audio(self, audio, sr, target_sr=48000, duration=3.0):
        """Resample, trim/pad, and format audio to match the model input."""
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

        # Required length in samples (144,000 for 3 s at 48 kHz); computed
        # rather than hard-coded so it tracks target_sr and duration
        required_length = int(target_sr * duration)

        if len(audio) < required_length:
            # Zero padding if the segment is shorter than required
            padding = required_length - len(audio)
            audio = np.pad(audio, (0, padding), mode='constant', constant_values=0)
        else:
            # Trim the audio to the exact required length if it's longer
            audio = audio[:required_length]

        return np.expand_dims(audio.astype(np.float32), axis=0)

    def predict_segment(self, audio_segment):
        """Run inference on a single preprocessed audio segment."""
        self.interpreter.set_tensor(self.input_details[0]['index'], audio_segment)
        self.interpreter.invoke()
        predictions = self.interpreter.get_tensor(self.output_details[0]['index'])[0]
        return predictions

    def predict_long_audio(self, audio_path):
        """Split a long audio file into fixed-length chunks and classify each segment."""
        y, sr = librosa.load(audio_path, sr=None)  # Load with native sample rate
        segment_length = int(sr * self.audio_duration)  # Samples per segment
        num_segments = int(np.ceil(len(y) / segment_length))  # Number of chunks
        results = []

        for i in range(num_segments):
            start_sample = i * segment_length
            end_sample = min((i + 1) * segment_length, len(y))
            segment = y[start_sample:end_sample]

            # Preprocess segment to the model's sample rate and duration
            processed_segment = self.preprocess_audio(segment, sr=sr,
                                                      target_sr=self.sample_rate,
                                                      duration=self.audio_duration)

            # Predict (logit output)
            predictions = self.predict_segment(processed_segment)

            # Convert logits to BirdNET confidence scores using sigmoid
            confidence_scores = scipy.special.expit(predictions)
            top_idx = np.argmax(predictions)
            top_label = (self.labels[top_idx] if top_idx < len(self.labels)
                         else f"Unknown Class {top_idx}")
            confidence = confidence_scores[top_idx]

            if confidence >= self.confidence_thresh:
                results.append({
                    "Begin Time (S)": round(start_sample / sr, 2),
                    "End Time (S)": round(end_sample / sr, 2),
                    "Class": top_label,
                    "Common name": top_label,  # Replace with common name if needed
                    "Score": round(confidence, 4),
                    "File": os.path.basename(audio_path),
                    "FilePath": audio_path,
                })

            print(f"Segment {i + 1}: {top_label} (Confidence: {confidence:.2f})")

        return pd.DataFrame(results)

    def batch_process_audio_folder(self, output_csv="predictions.csv"):
        """Recursively process all audio files in a folder and save results to CSV."""
        all_results = []

        # Recursively walk through the directory and subdirectories
        for root, _, files in os.walk(self.audio_folder):
            for filename in files:
                if filename.endswith(('.wav', '.mp3', '.flac', '.ogg')):  # Audio files only
                    audio_path = os.path.join(root, filename)
                    print(f"Processing {audio_path}...")
                    all_results.append(self.predict_long_audio(audio_path))

        if not all_results:  # pd.concat raises on an empty list
            print("No audio files found.")
            return pd.DataFrame()

        # Concatenate all DataFrames and save to CSV
        final_df = pd.concat(all_results, ignore_index=True)
        final_df.to_csv(output_csv, index=False)
        print(f"Batch processing complete! Results saved to {output_csv}")
        return final_df

    def export_to_raven(self, df, raven_file="raven_output.txt"):
        """Export prediction results to a Raven selection table format."""
        # N + 1 so every row gets an ID (range(1, N) was one short)
        df['Selection'] = range(1, df.shape[0] + 1)
        df['Channel'] = 1
        df['View'] = 'Spectrogram 1'

        with open(raven_file, 'w') as f:
            # Write header, then one tab-separated row per detection
            f.write("Selection\tView\tChannel\tBegin Time (S)\tEnd Time (S)\tCommon name\tScore\n")
            for _, row in df.iterrows():
                f.write(f"{row['Selection']}\t{row['View']}\t{row['Channel']}\t"
                        f"{row['Begin Time (S)']}\t{row['End Time (S)']}\t"
                        f"{row['Common name']}\t{row['Score']}\n")

        print(f"Raven selection table exported to {raven_file}")
```