# DeepAudio-V1 / app.py
import spaces
import logging
from datetime import datetime
from pathlib import Path
import gradio as gr
import torch
import torchaudio
import os
import tempfile
log = logging.getLogger()

# Install the bundled F5-TTS package in editable mode at startup.
os.system("cd ./F5-TTS; pip install -e .")
#@spaces.GPU(duration=120)
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Run the video-to-audio pipeline by delegating to the v2a.sh script."""
    os.system("bash v2a.sh")
    return "v2a"
video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    description="""
Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
NOTE: Processing high-resolution videos (>384 px on the shorter side) takes longer and does not improve results.
The model was trained on 8-second videos, so much longer or shorter inputs will degrade performance; around 5-12 seconds works well.
""",
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value='music'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
    title='MMAudio – Video-to-Audio Synthesis',
    examples=[
        [
            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
            'waves, seagulls',
            '',
            0,
            25,
            4.5,
            10,
        ],
    ])
if __name__ == "__main__":
    gr.TabbedInterface([video_to_audio_tab],
                       ['Video-to-Audio']).launch()
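
# Hedged note: for long-running generation jobs on Spaces, the request queue is
# commonly enabled before launching, e.g.:
#     demo = gr.TabbedInterface([video_to_audio_tab], ['Video-to-Audio'])
#     demo.queue().launch()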