import spaces  # imported so the optional @spaces.GPU decorator below can be re-enabled
import logging
import os

import gradio as gr
import torch

log = logging.getLogger()


#@spaces.GPU(duration=120)
@torch.inference_mode()
def video_to_audio(video: str, prompt: str, negative_prompt: str, seed: int,
                   num_steps: int, cfg_strength: float, duration: float):
    # `video` is the filepath of the uploaded clip, as passed by gr.Video.
    # Generation is delegated to an external script; the UI parameters above
    # are currently ignored by this stub. The returned path is what Gradio
    # serves as the output video, so v2a.sh is expected to write its result
    # there.
    os.system("bash v2a.sh")
    return "v2a"


video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    description="""
Project page: https://hkchengrex.com/MMAudio/
Code: https://github.com/hkchengrex/MMAudio

NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side). Doing so does not improve results.

The model has been trained on 8-second videos. Using much longer or shorter videos will degrade performance. Around 5s~12s should be fine.
""",
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value='music'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
    title='MMAudio — Video-to-Audio Synthesis',
    examples=[
        [
            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
            'waves, seagulls',
            '',
            0,
            25,
            4.5,
            10,
        ],
    ],
)

if __name__ == "__main__":
    gr.TabbedInterface([video_to_audio_tab], ['Video-to-Audio']).launch()
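

# A minimal sketch (unused above) of forwarding the UI parameters to the
# external script instead of discarding them, assuming v2a.sh reads them from
# environment variables. The V2A_* names, like v2a.sh's interface itself, are
# assumptions for illustration, not part of MMAudio.
def _video_to_audio_with_params(video: str, prompt: str, negative_prompt: str,
                                seed: int, num_steps: int, cfg_strength: float,
                                duration: float) -> str:
    import subprocess
    # Copy the current environment and add the (hypothetical) V2A_* variables;
    # environment values must be strings, so the numeric parameters are cast.
    env = dict(os.environ,
               V2A_VIDEO=video,
               V2A_PROMPT=prompt,
               V2A_NEG_PROMPT=negative_prompt,
               V2A_SEED=str(seed),
               V2A_STEPS=str(num_steps),
               V2A_CFG=str(cfg_strength),
               V2A_DURATION=str(duration))
    # check=True surfaces script failures as exceptions instead of silently
    # returning a missing output path.
    subprocess.run(['bash', 'v2a.sh'], check=True, env=env)
    return 'v2a'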