vicuna-clip

Running on Zero

App Files Files Community

vicuna-clip / app.py

ford442

Update app.py

1fb301f verified 3 months ago

raw

history blame

4.19 kB

	import spaces
	import torch
	import gradio as gr
	from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel
	import soundfile as sf
	import numpy as np
	import requests
	import os

	# Speech recognition: one shared Whisper pipeline, created at import time and
	# reused by every request. Audio is processed in 30-second chunks.
	ASR_MODEL_NAME = "openai/whisper-medium.en"
	asr_pipe = pipeline(
	    "automatic-speech-recognition",
	    model=ASR_MODEL_NAME,
	    device="cuda",
	    chunk_length_s=30,
	)

	# Task tokens are looked up by their position from the end of the tokenizer's
	# special-id list.
	# NOTE(review): this relies on the ordering of `all_special_ids` for this
	# tokenizer (-5 = transcribe, -6 = translate) -- confirm against the
	# installed transformers version.
	all_special_ids = asr_pipe.tokenizer.all_special_ids
	translate_token_id, transcribe_token_id = all_special_ids[-6], all_special_ids[-5]

	def _preload_and_load_models():
	    """Load the causal language model and its tokenizer into module globals.

	    Binds ``vicuna_tokenizer`` and ``vicuna_model`` at module level so that
	    request handlers can use them without reloading the weights. The weights
	    are loaded in float16 and placed with ``device_map="auto"``.
	    """
	    global vicuna_tokenizer, vicuna_model
	    # The globals keep the historical "vicuna" name; the checkpoint actually
	    # loaded is whatever this constant names.
	    VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
	    vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
	    vicuna_model = AutoModelForCausalLM.from_pretrained(
	        VICUNA_MODEL_NAME,
	        torch_dtype=torch.float16,
	        device_map="auto",
	    )


	# Load eagerly at import time. Without this call the globals above are never
	# bound and the first request fails with NameError when it tries to use
	# `vicuna_tokenizer` / `vicuna_model`.
	_preload_and_load_models()

	# Inference endpoint used for text-to-speech, and how long to wait for it.
	_TTS_API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
	_TTS_TIMEOUT_S = 60

	_SYSTEM_PROMPT = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
	You answer questions clearly and simply, using age-appropriate language.
	You are also a little bit silly and like to make jokes."""


	def _generate_reply(prompt):
	    """Return only the text the language model generates after *prompt*."""
	    # Follow the model's own placement instead of hard-coding 'cuda': the
	    # model is loaded with device_map="auto".
	    model_inputs = vicuna_tokenizer(prompt, return_tensors="pt").to(vicuna_model.device)
	    with torch.no_grad():
	        generated = vicuna_model.generate(**model_inputs, max_new_tokens=192)
	    # generate() returns a batch of sequences, each starting with the prompt
	    # tokens. Take the single sequence and drop the prompt by token count;
	    # this is more reliable than string-replacing the decoded prompt, which
	    # fails whenever the decode round-trip alters whitespace.
	    prompt_len = model_inputs["input_ids"].shape[1]
	    reply_tokens = generated[0][prompt_len:]
	    return vicuna_tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()


	def _synthesize_speech(text):
	    """Return ``(sample_rate, samples)`` for *text*, or ``None`` on any failure."""
	    import io  # local import: only needed to wrap the response bytes for soundfile

	    if not text:
	        return None
	    try:
	        headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
	        response = requests.post(
	            _TTS_API_URL,
	            headers=headers,
	            json={"inputs": text},
	            timeout=_TTS_TIMEOUT_S,  # never hang the request handler forever
	        )
	        response.raise_for_status()
	        # The endpoint is expected to answer with an encoded audio file rather
	        # than raw PCM, so decode it properly and use the sample rate stored
	        # in the file instead of assuming 16-bit samples at a fixed rate.
	        samples, sample_rate = sf.read(io.BytesIO(response.content), dtype="float32")
	        return sample_rate, samples
	    except requests.exceptions.RequestException as e:
	        print(f"Error in Hugging Face API request: {e}")
	    except Exception as e:
	        # Best effort: a missing token or undecodable audio must not break
	        # the text part of the response.
	        print(f"Error in speech synthesis: {e}")
	    return None


	@spaces.GPU(required=True)
	def process_audio(microphone, state, task="transcribe"):
	    """Transcribe a recording, generate a reply, and synthesize it as speech.

	    Args:
	        microphone: Recording to transcribe, passed straight to the ASR
	            pipeline; ``None`` means there is nothing to process.
	        state: Text accumulated from previous replies.
	        task: ``"transcribe"`` selects the transcribe token; any other value
	            selects the translate token.

	    Returns:
	        A 3-tuple ``(updated_state, updated_state, audio)``. ``audio`` is
	        ``(sample_rate, samples)`` or ``None`` when speech synthesis failed.
	        When ``microphone`` is ``None`` the state is returned unchanged.
	    """
	    if microphone is None:
	        return state, state, None

	    # NOTE(review): this mutates the shared pipeline config on every call and
	    # forces the task token at decoder position 2 -- confirm this is still
	    # appropriate for the configured Whisper checkpoint.
	    task_token_id = transcribe_token_id if task == "transcribe" else translate_token_id
	    asr_pipe.model.config.forced_decoder_ids = [[2, task_token_id]]
	    text = asr_pipe(microphone)["text"]

	    prompt = f"{_SYSTEM_PROMPT}\nUser: {text}"
	    vicuna_response = _generate_reply(prompt)

	    # Avoid a leading blank line when this is the first reply.
	    updated_state = f"{state}\n{vicuna_response}" if state else vicuna_response
	    return updated_state, updated_state, _synthesize_speech(vicuna_response)

	# Single-tab UI: a microphone recorder feeds process_audio, whose three
	# results go to the text box, the running state, and the audio player.
	with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:
	    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
	    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
	    with gr.Tab("Transcribe & Synthesize"):
	        # type="filepath": the handler receives a path to the recorded file.
	        mic_input = gr.Audio(label="Speak Here", sources="microphone", type="filepath")
	        transcription_output = gr.Textbox(label="Transcription and Vicuna Response", lines=5)
	        # type="numpy": the handler returns a (sample_rate, samples) tuple.
	        audio_output = gr.Audio(type="numpy", label="Synthesized Speech")
	        # Text accumulated across interactions; starts empty.
	        transcription_state = gr.State("")

	        # Run the whole pipeline whenever the recording changes.
	        mic_input.change(
	            process_audio,
	            [mic_input, transcription_state],
	            [transcription_output, transcription_state, audio_output],
	        )

	demo.launch(share=False)