import logging

import torch
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import soundfile as sf
import numpy as np

# --- Whisper (ASR) Setup ---
ASR_MODEL_NAME = "openai/whisper-large-v2"
asr_device = "cuda" if torch.cuda.is_available() else "cpu"

# Speech-recognition pipeline; audio is processed in 30-second chunks.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device=asr_device,
)

# Kept as a module-level name for any code that still reads it.
all_special_ids = asr_pipe.tokenizer.all_special_ids

# Resolve the task tokens by name rather than by their position in
# `all_special_ids`: positional indexing silently picks a different token if
# the tokenizer's special-token list changes order or length.
transcribe_token_id = asr_pipe.tokenizer.convert_tokens_to_ids("<|transcribe|>")
translate_token_id = asr_pipe.tokenizer.convert_tokens_to_ids("<|translate|>")

# --- FastSpeech2 (TTS) Setup ---
TTS_MODEL_NAME = "facebook/fastspeech2-en-ljspeech"

# Load the model configuration first so it can be passed explicitly to the
# model loader below.
tts_config = AutoConfig.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True)

# Load the text processor and the speech model for the same checkpoint.
# trust_remote_code=True permits code shipped with the checkpoint to run
# while loading.
# NOTE(review): neither `AutoProcessor` nor `AutoModelForTextToSpeech` appears
# in the `from transformers import ...` line at the top of this file, so these
# two statements look like they raise NameError at start-up. Confirm which
# classes are intended (and that this checkpoint can be loaded through them)
# before importing anything.
tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True)
tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_NAME, config=tts_config, trust_remote_code=True)


# Place the speech model on the GPU when one is available, otherwise the CPU.
tts_device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = tts_model.to(tts_device)

# --- Vicuna (LLM) Setup ---
VICUNA_MODEL_NAME = "lmsys/vicuna-33b-v1.3"  # Or a smaller Vicuna model

# Preferred device string: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    vicuna_device = "cuda"
else:
    vicuna_device = "cpu"

vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)

# Load the language model with 8-bit weights and float16 as the torch dtype;
# device_map="auto" leaves weight placement to the loader.
vicuna_model = AutoModelForCausalLM.from_pretrained(
    VICUNA_MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=True,
)

# --- ASR Function ---
def transcribe_audio(microphone, state, task="transcribe"):
    """Transcribe a recording, ask Vicuna to answer it, and extend the history.

    Args:
        microphone: Audio input handed straight to the ASR pipeline, or
            ``None`` when nothing was recorded.
        state: Text accumulated from earlier responses.
        task: ``"transcribe"`` selects the transcribe task token; any other
            value selects the translate task token.

    Returns:
        A ``(history, history)`` pair holding the same string twice, so one
        copy can feed a display component and the other the stored state.
        The history is returned unchanged when there is no recording or the
        transcription is empty.
    """
    if microphone is None:
        return state, state

    # Pin the task token at decoder position 2 before running recognition.
    task_token_id = transcribe_token_id if task == "transcribe" else translate_token_id
    asr_pipe.model.config.forced_decoder_ids = [[2, task_token_id]]
    text = asr_pipe(microphone)["text"]

    # Nothing was recognised: skip the LLM instead of prompting it with an
    # empty question.
    if not text.strip():
        return state, state

    # --- VICUNA INTEGRATION ---
    # NOTE(review): the prompt ends with the user's text and carries no
    # assistant-turn cue; Vicuna's published chat format uses
    # "USER: ... ASSISTANT:" -- confirm whether that template should be used.
    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
      You answer questions clearly and simply, using age-appropriate language.
      You are also a little bit silly and like to make jokes."""

    prompt = f"{system_prompt}\nUser: {text}"

    with torch.no_grad():
        # The model is placed by device_map="auto", so send the inputs to
        # wherever the model actually lives rather than a fixed device string.
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(vicuna_model.device)
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=128)

    # generate() returns prompt tokens followed by the new tokens. Slice the
    # prompt off by length: removing it with str.replace() on the decoded text
    # fails whenever decoding does not reproduce the prompt byte-for-byte.
    prompt_length = vicuna_input["input_ids"].shape[-1]
    response_ids = vicuna_output[0][prompt_length:]
    vicuna_response = vicuna_tokenizer.decode(response_ids, skip_special_tokens=True).strip()

    # Avoid a leading blank line on the very first turn.
    updated_state = f"{state}\n{vicuna_response}" if state else vicuna_response
    return updated_state, updated_state

# --- TTS Function ---
def synthesize_speech(text):
    """Turn text into audio for the Gradio audio output.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input means
            there is nothing to say.

    Returns:
        A ``(sampling_rate, waveform)`` tuple with ``waveform`` as a NumPy
        array, or ``None`` when there is nothing to synthesize or synthesis
        fails. ``None`` is used instead of ``(None, None)`` because an audio
        output expects either a real ``(rate, data)`` pair or no value at all.
    """
    if not text or not text.strip():
        return None

    try:
        inputs = tts_processor(text=text, return_tensors="pt")
        inputs = {key: value.to(tts_device) for key, value in inputs.items()}
        with torch.no_grad():
            # The model is called directly and its output exposes `.waveform`.
            waveform = tts_model(**inputs).waveform
        audio = waveform.cpu().squeeze().numpy()
        return (tts_processor.feature_extractor.sampling_rate, audio)
    except Exception:
        # UI callback boundary: keep the app responsive, but record the full
        # traceback instead of only the exception message.
        logging.getLogger(__name__).exception("Speech synthesis failed")
        return None

# --- Gradio Interface ---
# Build the single-tab UI: microphone in, text and synthesized audio out.
with gr.Blocks(title="Whisper, Vicuna, & FastSpeech2 Demo") as demo:
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")

    with gr.Tab("Transcribe & Synthesize"):
        mic_input = gr.Audio(
            source="microphone",
            type="filepath",
            optional=True,
            label="Speak Here",
        )
        transcription_output = gr.Textbox(
            lines=5,
            label="Transcription and Vicuna Response",
        )
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
        transcription_state = gr.State(value="")

        # Step 1: a new recording updates the textbox and the stored history.
        transcribe_event = mic_input.change(
            fn=transcribe_audio,
            inputs=[mic_input, transcription_state],
            outputs=[transcription_output, transcription_state],
        )
        # Step 2: once step 1 has finished, speak the textbox contents.
        transcribe_event.then(
            fn=synthesize_speech,
            inputs=transcription_output,
            outputs=audio_output,
        )

# share=False is usually better for local development
demo.launch(enable_queue=True, share=False)