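# Speech-to-speech demo: Whisper transcribes the microphone input, a causal LM
# (with a Vicuna-style tutor persona) writes a reply, and Bark synthesizes it as audio.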
import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import soundfile as sf
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
import torch.multiprocessing as mp  # used for the 'spawn' start method on Spaces
import os

# Load the Whisper ASR pipeline (the chat and Bark models are loaded in _preload_and_load_models)
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda',
)

# Whisper's task tokens (<|transcribe|>, <|translate|>) sit near the end of the
# tokenizer's special-token list; grab their IDs so the task can be forced later.
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]

def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    # Load the chat model; GPT-Neo stands in for Vicuna here, but any causal LM works
    VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = AutoModelForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",  # accelerate places the weights on the GPU; no extra .to('cuda') needed
    )

    # Download and cache Bark's text, coarse, fine and codec models
    preload_models()


if __name__ == "__main__":
    if "HF_SPACE_ID" in os.environ:
        # Use the 'spawn' start method on Spaces to avoid CUDA initialisation issues in forked workers
        mp.set_start_method('spawn', force=True)
    # Load the models in the main process; loading them in a child process would leave
    # the vicuna_model / vicuna_tokenizer globals undefined where process_audio runs.
    _preload_and_load_models()
    
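    # process_audio runs the full pipeline: Whisper ASR -> chat reply -> Bark TTS.
    # On ZeroGPU Spaces, the spaces.GPU decorator allocates a GPU for the duration of each call.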
    @spaces.GPU(required=True)
    def process_audio(microphone, state, task="transcribe"):
        if microphone is None:
            return state, state, None

        # Force the decoding task (transcribe vs. translate) via Whisper's decoder prompt
        asr_pipe.model.config.forced_decoder_ids = [
            [2, transcribe_token_id if task == "transcribe" else translate_token_id]
        ]
        text = asr_pipe(microphone)["text"]
        system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
            You answer questions clearly and simply, using age-appropriate language.
            You are also a little bit silly and like to make jokes."""
        prompt = f"{system_prompt}\nUser: {text}"

        with torch.no_grad():
            vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
            vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
            # generate() returns a batch of sequences; decode the first (and only) one
            vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
            vicuna_response = vicuna_response.replace(prompt, "").strip()
        updated_state = state + "\n" + vicuna_response

        try:
            # Synthesize speech from the response with Bark
            audio_arr = generate_audio(vicuna_response)  # pass history_prompt= to pick a preset voice

            # Bark returns float audio in [-1, 1]; convert to 16-bit PCM
            audio_arr = (audio_arr * 32767).astype(np.int16)

            # Save a copy to disk for debugging
            sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)

            # Gradio's "numpy" audio format is a (sample_rate, samples) tuple
            audio_output = (SAMPLE_RATE, audio_arr)

        except Exception as e:
            print(f"Error in speech synthesis: {e}")
            audio_output = None

        return updated_state, updated_state, audio_output

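    # Build the Gradio UI: a microphone input, a shared transcript box, and an audio player.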
    with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
        gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
        gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
        with gr.Tab("Transcribe & Synthesize"):
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
            transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
            audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
            transcription_state = gr.State(value="")
            mic_input.change(
                fn=process_audio,  # runs ASR -> chat model -> TTS on each new recording
                inputs=[mic_input, transcription_state],
                outputs=[transcription_output, transcription_state, audio_output]
            )

    demo.launch(share=False)