import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import soundfile as sf
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models

# Load the Whisper ASR pipeline at import time; the chat LLM and Bark are
# loaded in _preload_and_load_models() below.
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,  # chunked long-form decoding in 30 s windows
    device="cuda",
)

# Whisper's task tokens sit at fixed offsets from the end of the special-token
# list; indexing from the end follows the original Whisper demo code.
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]
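# A sturdier alternative (a sketch, assuming a multilingual Whisper checkpoint
# whose tokenizer defines the named task tokens; English-only ".en" models do
# not use them) would look the ids up by name:
#
#   transcribe_token_id = asr_pipe.tokenizer.convert_tokens_to_ids("<|transcribe|>")
#   translate_token_id = asr_pipe.tokenizer.convert_tokens_to_ids("<|translate|>")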


def _preload_and_load_models():
    """Load the chat LLM and the Bark TTS weights into module-level globals."""
    global vicuna_tokenizer, vicuna_model
    # Despite the variable names, this Space currently loads GPT-Neo, not Vicuna.
    VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # or another causal LM
    vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = AutoModelForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",  # accelerate places the weights; don't also call .to('cuda')
    )
    # Bark exposes a single preload_models() entry point that downloads and
    # caches all of its sub-models (text, coarse, fine, codec).
    preload_models()
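
# If the 2.7B model does not fit on the assigned GPU in fp16, a smaller
# checkpoint (e.g. "EleutherAI/gpt-neo-1.3B") can be substituted above with no
# other changes to the pipeline.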

if __name__ == "__main__":
    # Note: loading the models in a spawned subprocess would not work here --
    # globals assigned in the child vanish when it exits, so the parent would
    # never see vicuna_model. Load everything in the main process.
    _preload_and_load_models()
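
# Note: on a Hugging Face ZeroGPU Space, CUDA may generally only be touched
# inside functions decorated with @spaces.GPU; the module-level device="cuda"
# above assumes a Space (or machine) with a dedicated GPU.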


@spaces.GPU(required=True)
def process_audio(microphone, state, task="transcribe"):
    if microphone is None:
        return state, state, None
    # forced_decoder_ids maps position -> token id; slot 2 is Whisper's task slot.
    asr_pipe.model.config.forced_decoder_ids = [
        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
    ]
    text = asr_pipe(microphone)["text"]
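    # On recent transformers versions the task can also be passed per call
    # instead of mutating the model config (a sketch, multilingual
    # checkpoints only):
    #   text = asr_pipe(microphone, generate_kwargs={"task": task})["text"]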
    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"
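    # If a real Vicuna checkpoint is swapped in, it expects its own chat
    # format rather than the plain "User:" prefix (a sketch of the v1.1
    # convention):
    #   prompt = f"{system_prompt}\n\nUSER: {text}\nASSISTANT:"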
    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to("cuda")
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
    # generate() returns a batch of sequences; decode the first (and only) one.
    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
    # The decoded text echoes the prompt, so strip it to keep only the reply.
    vicuna_response = vicuna_response.replace(prompt, "").strip()
    updated_state = state + "\n" + vicuna_response
    try:
        # Bark renders the reply to a float waveform in [-1, 1] at SAMPLE_RATE.
        audio_arr = generate_audio(vicuna_response)
        # Convert to 16-bit PCM for soundfile and the Gradio audio component.
        audio_arr = (audio_arr * 32767).astype(np.int16)
        # Keep a copy on disk for debugging.
        sf.write("generated_audio.wav", audio_arr, SAMPLE_RATE)
        audio_output = (SAMPLE_RATE, audio_arr)  # Gradio numpy audio: (rate, samples)
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output
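
# Bark ships named speaker presets; to pin the voice instead of letting Bark
# pick one per call, pass a preset name from the Bark repo as history_prompt
# (a sketch):
#   audio_arr = generate_audio(vicuna_response, history_prompt="v2/en_speaker_6")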


with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
    with gr.Tab("Transcribe & Synthesize"):
        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,
            inputs=[mic_input, transcription_state],
            outputs=[transcription_output, transcription_state, audio_output],
        )

demo.launch(share=False)