import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel
import soundfile as sf
import numpy as np
import requests
import os

# Speech recognition: the "openai/whisper-medium.en" checkpoint wrapped in a
# transformers ASR pipeline.
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device="cuda" if torch.cuda.is_available() else "cpu",  # GPU when one is visible, else CPU
    chunk_length_s=30,
)

# Task-token ids, picked by position from the end of the tokenizer's
# special-id list.
# NOTE(review): this relies on the tokenizer's special-token ordering; confirm
# that these two slots really are the transcribe/translate tokens for this
# checkpoint.
all_special_ids = asr_pipe.tokenizer.all_special_ids
translate_token_id = all_special_ids[-6]
transcribe_token_id = all_special_ids[-5]

def _preload_and_load_models():
    """Load the causal language model and its tokenizer into module globals.

    Assigns ``vicuna_tokenizer`` and ``vicuna_model``. Despite the variable
    names, the checkpoint loaded here is ``EleutherAI/gpt-neo-2.7B``; swap the
    checkpoint string to use a different causal LM.
    """
    global vicuna_tokenizer, vicuna_model

    checkpoint = "EleutherAI/gpt-neo-2.7B"
    # Half-precision weights; device placement is delegated to the library.
    load_options = {
        "torch_dtype": torch.float16,
        "device_map": "auto",
    }

    vicuna_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    vicuna_model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

# Populate the module-level vicuna_tokenizer / vicuna_model once at import
# time; process_audio reads both globals on every call.
_preload_and_load_models()

@spaces.GPU(required=True)
def process_audio(microphone, state, task="transcribe"):
    """Transcribe a recording, generate a reply to it, and synthesize the reply.

    Pipeline: ASR (``asr_pipe``) -> text generation (``vicuna_model``) ->
    remote text-to-speech over HTTP.

    Args:
        microphone: Audio input handed to the ASR pipeline (the UI passes a
            file path). ``None`` means there is nothing to process.
        state: Text accumulated from earlier calls; the new reply is appended
            to it on a fresh line.
        task: ``"transcribe"`` (default) forces the transcribe token; any
            other value forces the translate token.

    Returns:
        A 3-tuple ``(text, state, audio)``. ``text`` and ``state`` are the same
        updated string (one for display, one for the next call). ``audio`` is
        ``(sample_rate, samples)`` with 1-D float32 samples, or ``None`` when
        there was no input or speech synthesis failed.
    """
    import io  # local import: only needed to wrap the TTS response bytes

    if microphone is None:
        return state, state, None

    # Force the task token into the decoder prompt before running ASR.
    asr_pipe.model.config.forced_decoder_ids = [
        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
    ]
    text = asr_pipe(microphone)["text"]
    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
        You answer questions clearly and simply, using age-appropriate language.
        You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"

    with torch.no_grad():
        # Send the inputs to wherever the model actually lives instead of
        # hard-coding CUDA (the model is loaded with device_map="auto").
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(vicuna_model.device)
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)

    # generate() returns a (batch, sequence) tensor that starts with the prompt
    # tokens. Decode only the newly generated tokens of the single sequence;
    # this avoids passing a 2-D tensor to decode() and does not depend on the
    # decoded text reproducing the prompt string exactly.
    prompt_length = vicuna_input["input_ids"].shape[-1]
    vicuna_response = vicuna_tokenizer.decode(
        vicuna_output[0][prompt_length:], skip_special_tokens=True
    ).strip()
    updated_state = state + "\n" + vicuna_response

    try:
        API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
        headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
        payloads = {'inputs': vicuna_response}  # speak the model's reply
        # A timeout keeps a stalled request from blocking the handler forever;
        # requests raises a RequestException subclass when it expires.
        response = requests.post(API_URL, headers=headers, json=payloads, timeout=60)
        response.raise_for_status()  # turn 4xx/5xx into an exception

        # The response body is an encoded audio file (container format such as
        # FLAC/WAV), not raw PCM, so decode it properly and take the sample
        # rate from the file instead of assuming one.
        audio_arr, sample_rate = sf.read(io.BytesIO(response.content), dtype="float32")
        if audio_arr.ndim > 1:
            # Down-mix multi-channel audio so the output stays 1-D.
            audio_arr = audio_arr.mean(axis=1)
        audio_output = (sample_rate, audio_arr)
    except requests.exceptions.RequestException as e:
        print(f"Error in Hugging Face API request: {e}")
        audio_output = None
    except Exception as e:
        # Best-effort: a missing token or undecodable audio must not break the
        # text part of the response.
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output

# Gradio UI: one tab that routes microphone recordings through process_audio.
with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")

    with gr.Tab("Transcribe & Synthesize"):
        # Recording is handed to the callback as a file path.
        mic_input = gr.Audio(label="Speak Here", type="filepath", sources="microphone")
        transcription_output = gr.Textbox(label="Transcription and Vicuna Response", lines=5)
        # type="numpy" matches the (sample_rate, samples) tuple process_audio returns.
        audio_output = gr.Audio(type="numpy", label="Synthesized Speech")
        # Accumulated text, passed into and returned from every callback run.
        transcription_state = gr.State(value="")

        # Run the full pipeline whenever the recording changes; `task` is not
        # wired to any input, so process_audio uses its default.
        mic_input.change(
            fn=process_audio,
            inputs=[mic_input, transcription_state],
            outputs=[transcription_output, transcription_state, audio_output],
        )

demo.launch(share=False)