# DeepAudio-V1 / app.py
import spaces
import logging
from datetime import datetime
from pathlib import Path
import gradio as gr
import torch
import torchaudio
import os
import tempfile
log = logging.getLogger()

# Install the bundled F5-TTS package in editable mode at startup.
os.system("cd ./F5-TTS; pip install -e .")
#@spaces.GPU(duration=120)
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Run the video-to-audio pipeline by delegating to the v2a.sh script."""
    os.system("bash v2a.sh")
    return "v2a"
video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    description="""
Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
NOTE: Processing high-resolution videos (>384 px on the shorter side) takes longer and does not improve results.
The model was trained on 8-second videos, so much longer or shorter inputs will degrade performance; around 5-12 seconds works well.
""",
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value='music'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
    title='MMAudio – Video-to-Audio Synthesis',
    examples=[
        [
            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
            'waves, seagulls',
            '',
            0,
            25,
            4.5,
            10,
        ],
    ])
if __name__ == "__main__":
    gr.TabbedInterface([video_to_audio_tab],
                       ['Video-to-Audio']).launch()
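
# Hedged note: for long-running generation jobs on Spaces, the request queue is
# commonly enabled before launching, e.g.:
#     demo = gr.TabbedInterface([video_to_audio_tab], ['Video-to-Audio'])
#     demo.queue().launch()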