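# Speech-to-speech demo: Whisper transcribes the microphone input, a causal LM
# (with a Vicuna-style tutor persona) writes a reply, and Bark synthesizes it as audio.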
import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import soundfile as sf
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
import torch.multiprocessing as mp  # used for the 'spawn' start method on Spaces
import os

# Load the Whisper ASR pipeline (the chat and Bark models are loaded in _preload_and_load_models)
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda',
)

# Whisper's task tokens (<|transcribe|>, <|translate|>) sit near the end of the
# tokenizer's special-token list; grab their IDs so the task can be forced later.
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]

def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    # Load the chat model; GPT-Neo stands in for Vicuna here, but any causal LM works
    VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = AutoModelForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",  # accelerate places the weights on the GPU; no extra .to('cuda') needed
    )

    # Download and cache Bark's text, coarse, fine and codec models
    preload_models()


if __name__ == "__main__":
    if "HF_SPACE_ID" in os.environ:
        # Use the 'spawn' start method on Spaces to avoid CUDA initialisation issues in forked workers
        mp.set_start_method('spawn', force=True)
    # Load the models in the main process; loading them in a child process would leave
    # the vicuna_model / vicuna_tokenizer globals undefined where process_audio runs.
    _preload_and_load_models()
    
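    # process_audio runs the full pipeline: Whisper ASR -> chat reply -> Bark TTS.
    # On ZeroGPU Spaces, the spaces.GPU decorator allocates a GPU for the duration of each call.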
    @spaces.GPU(required=True)
    def process_audio(microphone, state, task="transcribe"):
        if microphone is None:
            return state, state, None

        # Force the decoding task (transcribe vs. translate) via Whisper's decoder prompt
        asr_pipe.model.config.forced_decoder_ids = [
            [2, transcribe_token_id if task == "transcribe" else translate_token_id]
        ]
        text = asr_pipe(microphone)["text"]
        system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
            You answer questions clearly and simply, using age-appropriate language.
            You are also a little bit silly and like to make jokes."""
        prompt = f"{system_prompt}\nUser: {text}"

        with torch.no_grad():
            vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
            vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
            # generate() returns a batch of sequences; decode the first (and only) one
            vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
            vicuna_response = vicuna_response.replace(prompt, "").strip()
        updated_state = state + "\n" + vicuna_response

        try:
            # Synthesize speech from the response with Bark
            audio_arr = generate_audio(vicuna_response)  # pass history_prompt= to pick a preset voice

            # Bark returns float audio in [-1, 1]; convert to 16-bit PCM
            audio_arr = (audio_arr * 32767).astype(np.int16)

            # Save a copy to disk for debugging
            sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)

            # Gradio's "numpy" audio format is a (sample_rate, samples) tuple
            audio_output = (SAMPLE_RATE, audio_arr)

        except Exception as e:
            print(f"Error in speech synthesis: {e}")
            audio_output = None

        return updated_state, updated_state, audio_output

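    # Build the Gradio UI: a microphone input, a shared transcript box, and an audio player.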
    with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
        gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
        gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
        with gr.Tab("Transcribe & Synthesize"):
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
            transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
            audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
            transcription_state = gr.State(value="")
            mic_input.change(
                fn=process_audio,  # runs ASR -> chat model -> TTS on each new recording
                inputs=[mic_input, transcription_state],
                outputs=[transcription_output, transcription_state, audio_output]
            )

    demo.launch(share=False)