"""
Module needed for pre-processing of uploaded audio
Uses silero_vad for silence removal and librosa for image generation

Author: Jakub Polnis
Copyright: Copyright 2025, Jakub Polnis
License: Apache 2.0
Email: jakubpolnis@gmail.com
"""

import io
import torch
import librosa
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from silero_vad import (load_silero_vad,
                          read_audio,
                          get_speech_timestamps,
                          save_audio,
                          VADIterator,
                          collect_chunks)
# Passed as the ``onnx`` flag to load_silero_vad; False requests the
# non-ONNX variant of the model.
USE_ONNX = False
# The VAD model is loaded once, at import time, and reused by every call to
# silero_vad_remove_silence.
model = load_silero_vad(onnx=USE_ONNX)

# Sample rate (Hz) used both when reading audio and when computing
# spectrograms in this module.
SAMPLING_RATE = 16000

def silero_vad_remove_silence(audio_file_path):
    """
    Load an audio file and keep only the parts detected as speech.

    The file is read at ``SAMPLING_RATE`` and scanned with the module-level
    Silero VAD ``model``. ``torch.set_num_threads(1)`` is applied on every
    call before the audio is read.

    Args:
        audio_file_path: Path of the audio file to read.

    Returns:
        The detected speech chunks merged by ``collect_chunks``, or the audio
        exactly as loaded when no speech was detected (a notice is printed in
        that case).
    """
    torch.set_num_threads(1)
    waveform = read_audio(audio_file_path, sampling_rate=SAMPLING_RATE)

    # Run speech detection over the whole recording at once
    speech_segments = get_speech_timestamps(waveform, model, sampling_rate=SAMPLING_RATE)

    if speech_segments:
        # Stitch every speech segment together into one signal
        return collect_chunks(speech_segments, waveform)

    # Nothing recognised as speech: hand the input back untouched
    print(f"No speech detected in {audio_file_path}. Returning original audio.")
    return waveform

def create_mel_spectrograms(file_path, segment_duration, start_offset):
    """
    Turn an audio file into a list of mel-spectrogram images.

    Silence is first removed with ``silero_vad_remove_silence``. The remaining
    signal is trimmed or zero-padded to a whole number of seconds (rounded to
    the nearest second), and a window of ``segment_duration`` seconds is taken
    at every one-second step. Only windows that fit completely inside the
    signal are rendered.

    Args:
        file_path: Path of the audio file to process.
        segment_duration: Length of each window in seconds. A value that
            yields no samples (zero or negative) produces an empty list.
        start_offset: Currently unused; kept in the signature so existing
            callers that pass it keep working.

    Returns:
        list: One PIL image per window, in chronological order. The list is
        empty when no complete window fits in the audio.
    """
    processed_audio = silero_vad_remove_silence(file_path)
    y = processed_audio.numpy()
    sr = SAMPLING_RATE

    # Snap the signal length to the nearest whole second so that every
    # one-second step below starts on a sample that exists.
    audio_duration = librosa.get_duration(y=y, sr=sr)
    target_len = int(np.round(audio_duration)) * sr
    if len(y) > target_len:
        y = y[:target_len]
    elif len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)), mode='constant')

    segment_len = int(segment_duration * sr)
    if segment_len <= 0:
        # A non-positive window would turn into an empty or negative slice
        # end and render unrelated audio; there is nothing valid to draw.
        return []

    # Window starts advance by one second; the upper bound keeps only the
    # windows whose end still lies inside the signal.
    return [
        _render_mel_spectrogram(y[start:start + segment_len], sr)
        for start in range(0, len(y) - segment_len + 1, sr)
    ]


def _render_mel_spectrogram(segment, sr):
    """Render one audio window as a mel-spectrogram and return it as a PIL image."""
    # Explicit submodule import: older librosa releases do not expose
    # ``librosa.display`` after a bare ``import librosa``.
    import librosa.display

    mel = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=128, fmax=8000, center=True)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # 2.24 in at 100 dpi gives a nominal 224x224 pixel image.
    fig, ax = plt.subplots(figsize=(224 / 100, 224 / 100))
    try:
        librosa.display.specshow(mel_db, sr=sr, fmax=8000, ax=ax)
        ax.set_xlim(0, mel.shape[-1])
        ax.set_ylim(0, mel.shape[0])
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xticklabels([])
        ax.set_yticklabels([])

        with io.BytesIO() as buffer:
            # Save this specific figure rather than pyplot's "current" one.
            fig.savefig(buffer, format='PNG', bbox_inches=None, pad_inches=0,
                        dpi=100, transparent=True)
            buffer.seek(0)
            # copy() forces the pixel data to load before the buffer closes.
            return Image.open(buffer).copy()
    finally:
        # Always release the figure, even if rendering or saving failed.
        plt.close(fig)