Spaces:
Running
Running
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spaces
|
2 |
+
import logging
|
3 |
+
from datetime import datetime
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
import torch
|
8 |
+
import torchaudio
|
9 |
+
import os
|
10 |
+
|
11 |
+
import tempfile
|
12 |
+
|
13 |
+
# Module-wide logger; getLogger() called without a name returns the root logger.
log = logging.getLogger()
|
14 |
+
|
15 |
+
|
16 |
+
#@spaces.GPU(duration=120)
|
17 |
+
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Run the external ``v2a.sh`` script and return a fixed marker string.

    The parameters mirror the Gradio input widgets wired to this callback;
    in this version none of them is forwarded to the script.

    Args:
        video: Value of the video input component (unused here).
        prompt: Text prompt (unused here).
        negative_prompt: Negative text prompt (unused here).
        seed: Random seed (unused here).
        num_steps: Number of sampling steps (unused here).
        cfg_strength: Guidance strength (unused here).
        duration: Requested duration in seconds (unused here).

    Returns:
        The literal string ``"v2a"``.
    """
    # Function-scope import so the block does not depend on a file-level change.
    import subprocess

    # An argument list avoids building a shell command string. Failures are
    # logged instead of being silently discarded, but they do not raise, so
    # the callback keeps its best-effort behaviour.
    try:
        completed = subprocess.run(["bash", "v2a.sh"], check=False)
    except OSError:
        log.exception("Could not launch bash to run v2a.sh")
    else:
        if completed.returncode != 0:
            log.error("v2a.sh exited with status %d", completed.returncode)

    # NOTE(review): the interface declares a 'playable_video' output, but this
    # returns a constant string rather than a video path — confirm intent.
    return "v2a"
|
24 |
+
|
25 |
+
|
26 |
+
# Gradio interface for the video-to-audio callback: seven inputs (video, two
# text prompts, four numeric settings), one playable-video output, one example.
video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    title='MMAudio — Video-to-Audio Synthesis',
    description="""
Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
Doing so does not improve results.
The model has been trained on 8-second videos. Using much longer or shorter videos will degrade performance. Around 5s~12s should be fine.
""",
    # Order matches the positional parameters of ``video_to_audio``.
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value='music'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    # One example row; values are listed in the same order as ``inputs``.
    examples=[
        [
            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
            'waves, seagulls',
            '',
            0,
            25,
            4.5,
            10,
        ],
    ],
    cache_examples=False,
)
|
58 |
+
|
59 |
+
|
60 |
+
if __name__ == "__main__":
    # Wrap the single interface in a tabbed layout and start the Gradio server.
    tabs = [video_to_audio_tab]
    tab_names = ['Video-to-Audio']
    demo = gr.TabbedInterface(tabs, tab_names)
    demo.launch()
|
63 |
+
|