File size: 2,024 Bytes
fe77cbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea8c66f
 
 
fe77cbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# `spaces` stays the first import, ahead of torch.
# NOTE(review): presumably required by the Spaces GPU runtime - confirm before reordering.
import spaces

import logging
import os
import subprocess
import sys
import tempfile
from datetime import datetime
from pathlib import Path

import gradio as gr
import torch
import torchaudio

log = logging.getLogger()


def _install_editable(package_dir: str) -> bool:
    """Install the project found in *package_dir* in editable mode.

    Runs ``<python> -m pip install -e .`` with *package_dir* as the working
    directory, using the interpreter that is executing this module. The
    install is best-effort: failures are logged and reported through the
    return value instead of raising, so the rest of the module still loads.

    Args:
        package_dir: Directory containing the project to install.

    Returns:
        ``True`` if pip exited with status 0, ``False`` otherwise (including
        when *package_dir* cannot be entered or pip cannot be started).
    """
    command = [sys.executable, "-m", "pip", "install", "-e", "."]
    try:
        # ``cwd=`` replaces a shell ``cd <dir>; pip ...`` prefix: with ``;`` pip
        # would still run - in the current directory - when ``cd`` fails.
        # Here a missing directory raises OSError before pip is started.
        completed = subprocess.run(command, cwd=package_dir, check=False)
    except OSError as err:
        log.warning("Could not run pip in %s: %s", package_dir, err)
        return False
    if completed.returncode != 0:
        log.warning("pip install -e . in %s exited with status %d",
                    package_dir, completed.returncode)
        return False
    return True


# Install the project in ./F5-TTS at import time, before the UI below is built.
_install_editable("./F5-TTS")


# NOTE: the spaces.GPU decorator below is currently disabled.
# @spaces.GPU(duration=120)
@torch.inference_mode()
def video_to_audio(video: str, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float) -> str:
    """Run the ``v2a.sh`` script and return a fixed placeholder string.

    None of the arguments are forwarded to the script at the moment; they
    only mirror the inputs declared on the Gradio interface.

    Args:
        video: Value of the ``gr.Video`` input. Gradio documents this as a
            file path; NOTE(review): may be ``None`` when no video is
            supplied - confirm.
        prompt: Text prompt (currently unused).
        negative_prompt: Negative text prompt (currently unused).
        seed: Random seed (currently unused).
        num_steps: Number of steps (currently unused).
        cfg_strength: Guidance strength (currently unused).
        duration: Duration in seconds (currently unused).

    Returns:
        The literal string ``"v2a"``.
        NOTE(review): the interface declares a ``'playable_video'`` output,
        so this presumably should be the path of a file written by
        ``v2a.sh`` - confirm.

    Raises:
        subprocess.CalledProcessError: If ``bash v2a.sh`` exits with a
            non-zero status.
        OSError: If ``bash`` cannot be started.
    """
    # check=True surfaces a failed script instead of returning as if it worked.
    subprocess.run(["bash", "v2a.sh"], check=True)

    return "v2a"


# HTML blurb displayed with the Video-to-Audio interface.
_V2A_DESCRIPTION = """

    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>

    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>

    NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side). 

    Doing so does not improve results.

    The model has been trained on 8-second videos. Using much longer or shorter videos will degrade performance. Around 5s~12s should be fine.

    """

# Input widgets, listed in the same order as the parameters of ``video_to_audio``.
_V2A_INPUTS = [
    gr.Video(),
    gr.Text(label='Prompt'),
    gr.Text(label='Negative prompt', value='music'),
    gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
    gr.Number(label='Num steps', value=25, precision=0, minimum=1),
    gr.Number(label='Guidance Strength', value=4.5, minimum=1),
    gr.Number(label='Duration (sec)', value=8, minimum=1),
]

# A single example row; values line up positionally with ``_V2A_INPUTS``.
_V2A_EXAMPLES = [
    [
        'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
        'waves, seagulls',
        '',
        0,
        25,
        4.5,
        10,
    ],
]

# Gradio interface wiring ``video_to_audio`` to the widgets above.
video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    title='MMAudio — Video-to-Audio Synthesis',
    description=_V2A_DESCRIPTION,
    inputs=_V2A_INPUTS,
    outputs='playable_video',
    examples=_V2A_EXAMPLES,
    cache_examples=False,
)


if __name__ == "__main__":
    # Wrap the single interface in a tabbed layout, then start the Gradio app.
    tabs = [video_to_audio_tab]
    tab_titles = ['Video-to-Audio']
    demo = gr.TabbedInterface(tabs, tab_titles)
    demo.launch()