import logging
import os

import gradio as gr
import torch

import spaces  # kept for the @spaces.GPU decorator below

log = logging.getLogger()
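
# The decorator below requests a timed GPU slice on Hugging Face ZeroGPU Spaces;
# uncomment it when deploying there.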
#@spaces.GPU(duration=120)
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int,
                   num_steps: int, cfg_strength: float, duration: float):
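    # Delegate generation to the v2a.sh helper script, which is expected to write
    # the output video with its synthesized soundtrack. Note that the UI parameters
    # are not currently forwarded to the script.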
    os.system("bash v2a.sh")
    return "v2a"
video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    description="""
    Project page: https://hkchengrex.com/MMAudio/
    Code: https://github.com/hkchengrex/MMAudio

    NOTE: High-resolution videos (>384 px on the shorter side) take longer to process
    and do not improve results.

    The model was trained on 8-second videos; much longer or shorter clips will degrade
    performance. Clips of roughly 5-12 seconds work well.
    """,
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value='music'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
    title='MMAudio — Video-to-Audio Synthesis',
    examples=[
        [
            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
            'waves, seagulls',
            '',
            0,
            25,
            4.5,
            10,
        ],
    ],
)
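
# Serve the app as a TabbedInterface with a single 'Video-to-Audio' tab.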
if __name__ == "__main__":
    gr.TabbedInterface([video_to_audio_tab],
                       ['Video-to-Audio']).launch()