Commit: hello world

Files changed:
- .gitignore +10 -0
- app.py +22 -11
- mdx_models/model_data_v2.json +50 -0
- mdxnet_model.py +20 -21
- pyproject.toml +35 -0
- utils.py +25 -144
- uvr_processing.py +236 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
+__pycache__
+.DS_Store
+*.wav
+*.m4a
+*.mp3
+*.mp4
+*.avi
+*.mkv
+.env
+.vscode
app.py
CHANGED
@@ -4,9 +4,9 @@ import shutil
 import numpy as np
 from pathlib import Path
 import os
-from utils import get_hash
 import time
 import torch
+from uvr_processing import process_uvr_task


 def get_device_info():
@@ -17,23 +17,34 @@ def get_device_info():
     return f"当前运行环境: {device}"


-def inference(audio_file):
+def inference(audio_file: str,
+              stem: str = "vocal",) -> list[str]:
     # audio_file = '/private/var/folders/02/_9ymjkz12xq8m_xh5592pl840000gn/T/gradio/74c3de047a439ea3cfb8e2d1ee6e5a85ea999d3eb30537b88d386aac177902d0/Spare Zeit und Aufwand mit den Servicevertragen von Mercedes-Benz Trucks..m4a'

+    if not audio_file:
+        raise ValueError("The audio path is missing.")

+    if not stem:
+        raise ValueError("Please select 'vocal' or 'background' stem.")

+    audio_file = Path(audio_file)
+    mdxnet_models_dir = Path("./mdx_models")
+    output_dir = Path("./output")

+    outputs = []
     start_time = time.time()
+    background_path, vocals_path = process_uvr_task(
+        mdxnet_models_dir=mdxnet_models_dir,
+        input_file_path=audio_file,
+        output_dir=output_dir,
+    )
+    end_time = time.time()
+    execution_time = end_time - start_time
-    shutil.copy(audio_file, output_path2)
-    return [output_path1, output_path2]
+
+    outputs.append(str(background_path))
+    outputs.append(str(vocals_path))

+    return outputs


 def get_gui(theme, title, description):
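With this change, inference is a plain function and can be smoke-tested without the Gradio UI. A minimal sketch, assuming the MDX-Net ONNX models already sit in ./mdx_models and "song.m4a" is a hypothetical stand-in for whatever file Gradio passes in:

    from app import inference

    # "song.m4a" is a hypothetical input path.
    background_file, vocals_file = inference("song.m4a", stem="vocal")
    print(background_file)  # e.g. an ..._Instrumental.wav under ./output
    print(vocals_file)      # e.g. an ..._Vocals.wav under ./output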
mdx_models/model_data_v2.json
ADDED
@@ -0,0 +1,50 @@
+{
+    "77d07b2667ddf05b9e3175941b4454a0": {
+        "compensate": 1.021,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals",
+        "name": "UVR-MDX-NET-Voc_FT.onnx"
+    },
+    "1d64a6d2c30f709b8c9b4ce1366d96ee": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "Instrumental",
+        "name": "UVR_MDXNET_KARA_2.onnx"
+    },
+    "cd5b2989ad863f116c855db1dfe24e39": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Other",
+        "name": "Reverb_HQ_By_FoxJoy.onnx"
+    },
+    "55657dd70583b0fedfba5f67df11d711": {
+        "compensate": 1.022,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental",
+        "name": "UVR-MDX-NET-Inst_HQ_3.onnx"
+    },
+    "cc63408db3d80b4d85b0287d1d7c9632": {
+        "compensate": 1.033,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental",
+        "name": "UVR-MDX-NET-Inst_HQ_2.onnx"
+    },
+    "0f2a6bc5b49d87d64728ee40e23bceb1": {
+        "compensate": 1.022,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental",
+        "name": "UVR-MDX-NET-Inst_HQ_4.onnx"
+    }
+}
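Each top-level key is the hash of a model file as computed by MDX.get_hash; run_mdx uses it to pull that model's STFT settings. A minimal sketch of the lookup, assuming the ONNX file has already been downloaded into mdx_models/ (the models themselves are not part of this commit):

    import json
    from pathlib import Path

    from mdxnet_model import MDX

    models_dir = Path("./mdx_models")
    with open(models_dir / "model_data_v2.json") as infile:
        model_params = json.load(infile)

    model_path = models_dir / "UVR-MDX-NET-Voc_FT.onnx"  # assumed to exist locally
    mp = model_params[MDX.get_hash(model_path)]
    print(mp["primary_stem"], mp["mdx_n_fft_scale_set"])  # -> Vocals 7680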
mdxnet_model.py
CHANGED
@@ -5,20 +5,20 @@ import onnxruntime as ort
 import hashlib
 import queue
 import threading
+from pathlib import Path
 from tqdm import tqdm
+from typing import Tuple


 class MDXModel:
-    def __init__(
-        ...
-        compensation=1.000,
-    ):
+    def __init__(self,
+                 device: torch.device,
+                 dim_f: int,
+                 dim_t: int,
+                 n_fft: int,
+                 hop: int = 1024,
+                 stem_name: str = "Vocals",
+                 compensation: float = 1.000,):
         self.dim_f = dim_f  # frequency bins
         self.dim_t = dim_t
         self.dim_c = 4
@@ -92,7 +92,7 @@ class MDX:
     DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
     DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR

-    def __init__(self, model_path: ...
+    def __init__(self, model_path: Path, params: MDXModel, processor: int = 0):
         # Set the device and the provider (CPU or CUDA)
         self.device = (
             torch.device(f"cuda:{processor}")
@@ -121,7 +121,7 @@ class MDX:
         self.prog = None

     @staticmethod
-    def get_hash(model_path: ...
+    def get_hash(model_path: Path) -> str:
         try:
             with open(model_path, "rb") as f:
                 f.seek(-10000 * 1024, 2)
@@ -132,12 +132,11 @@ class MDX:
         return model_hash

     @staticmethod
-    def segment(
-        ...
-    ):
+    def segment(wave: np.array,
+                combine: bool = True,
+                chunk_size: int = DEFAULT_CHUNK_SIZE,
+                margin_size: int = DEFAULT_MARGIN_SIZE,
+                ) -> np.array:
         """
         Segment or join segmented wave array
@@ -192,7 +191,7 @@ class MDX:

         return processed_wave

-    def pad_wave(self, wave):
+    def pad_wave(self, wave: np.array) -> Tuple[np.array, int, int]:
         """
         Pad the wave array to match the required chunk size
@@ -232,7 +231,7 @@ class MDX:

         return mix_waves, pad, trim

-    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
+    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int) -> np.array:
         """
         Process each wave segment in a multi-threaded environment
@@ -268,7 +267,7 @@ class MDX:
         q.put({_id: processed_signal})
         return processed_signal

-    def process_wave(self, wave: np.array, mt_threads=1):
+    def process_wave(self, wave: np.array, mt_threads=1) -> np.array:
         """
         Process the wave array in a multi-threaded environment
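get_hash fingerprints a model by hashing only the file's tail (the seek to -10000 * 1024 bytes from the end), which keeps lookups in model_data_v2.json fast even for multi-hundred-megabyte ONNX files. A standalone sketch of the same scheme; the MD5 digest and the small-file fallback are assumptions based on the upstream UVR code, since this diff view truncates that part of the body:

    import hashlib
    from pathlib import Path


    def model_file_hash(model_path: Path) -> str:
        # Hash only the last ~10 MB; digesting full ONNX weights would be slow.
        with open(model_path, "rb") as f:
            try:
                f.seek(-10000 * 1024, 2)  # assumption: mirrors MDX.get_hash
            except OSError:
                f.seek(0)  # assumption: files under ~10 MB are hashed whole
            return hashlib.md5(f.read()).hexdigest()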
pyproject.toml
ADDED
@@ -0,0 +1,35 @@
+[project]
+name = "vocal-bgm-separator"
+version = "0.1.0"
+description = ""
+authors = [
+    {name = "Zhiliang Zhou", email = "[email protected]"}
+]
+license = {text = "MIT"}
+readme = "README.md"
+requires-python = ">=3.12,<4.0"
+dependencies = [
+    "gradio (>=5.23.0,<6.0.0)",
+    "demucs (>=4.0.1,<5.0.0)",
+    "torch (>=2.6.0,<3.0.0)",
+    "torchaudio (>=2.6.0,<3.0.0)",
+    "llvmlite (>=0.44.0,<0.45.0)",  # must be installed before librosa on Mac
+    "librosa (>=0.11.0,<0.12.0)",  # a Python package for music and audio analysis
+    "soundfile (>=0.13.1,<0.14.0)",
+    "pedalboard (>=0.9.16,<0.10.0)"  # Spotify's Python library for adding effects to audio
+]
+
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.group.dev.dependencies]
+spaces = "^0.34.0"
+onnxruntime = "^1.21.0"
+gradio-client = "^1.8.0"
+jupyter = "^1.1.1"
+qtconsole = "^5.6.1"
+pyqt5 = "^5.15.11"
+dotenv = "^0.9.9"
+
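Since the project builds with poetry-core, `poetry install` sets up the environment, including the dev group. A quick Python smoke test (illustrative only, not part of the repo) confirms the pinned audio stack resolved:

    import torch
    import librosa
    import onnxruntime

    print(torch.__version__, torch.cuda.is_available())
    print(librosa.__version__)
    print(onnxruntime.get_available_providers())  # expect CUDAExecutionProvider on GPU hardware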
utils.py
CHANGED
@@ -1,152 +1,33 @@
 # reference: https://huggingface.co/spaces/r3gm/Audio_separator
-import ...
+import subprocess, shlex, sys  # noqa
 from urllib.parse import urlparse
-import ...
-import ...
-import ...
+import librosa
+import numpy as np
+from pathlib import Path


-def load_file_from_url(
-    url: str,
-    model_dir: str,
-    file_name: str | None = None,
-    overwrite: bool = False,
-    progress: bool = True,
-) -> str:
-    """Download a file from `url` into `model_dir`,
-    using the file present if possible.
-    """
-
-    if not file_name:
-        parts = urlparse(url)
-        file_name = os.path.basename(parts.path)
-    cached_file = os.path.abspath(os.path.join(model_dir, file_name))
-
-    if overwrite or os.path.getsize(cached_file) == 0:
-        remove_files(cached_file)
-
-    # Download
-    if not os.path.exists(cached_file):
-        logger.info(f'Downloading: "{url}" to {cached_file}\n')
-        from torch.hub import download_url_to_file
-
-        download_url_to_file(url, cached_file, progress=progress)
-    else:
-        logger.debug(cached_file)
-
-    return cached_file
-
-
-def friendly_name(file: str):
-    if file.startswith("http"):
-        file = urlparse(file).path
-
-    file = os.path.basename(file)
-    model_name, extension = os.path.splitext(file)
-    return model_name, extension
-
-
-def download_manager(
-    url: str,
-    path: str,
-    extension: str = "",
-    overwrite: bool = False,
-    progress: bool = True,
-):
-    url = url.strip()
-
-    name, ext = friendly_name(url)
-    name += ext if not extension else f".{extension}"
-
-    if url.startswith("http"):
-        filename = load_file_from_url(
-            url=url,
-            model_dir=path,
-            file_name=name,
-            overwrite=overwrite,
-            progress=progress,
-        )
-    else:
-        ...
-    return filename
-
-
-def remove_files(file_list):
-    if isinstance(file_list, str):
-        file_list = [file_list]
-
-    for file in file_list:
-        if os.path.exists(file):
-            os.remove(file)
-
-
-def remove_directory_contents(directory_path):
-    """
-    Removes all files and subdirectories within a directory.
-
-    Parameters:
-    directory_path (str): Path to the directory whose
-    contents need to be removed.
-    """
-    if os.path.exists(directory_path):
-        for filename in os.listdir(directory_path):
-            file_path = os.path.join(directory_path, filename)
-            try:
-                if os.path.isfile(file_path):
-                    os.remove(file_path)
-                elif os.path.isdir(file_path):
-                    shutil.rmtree(file_path)
-            except Exception as e:
-                logger.error(f"Failed to delete {file_path}. Reason: {e}")
-        logger.info(f"Content in '{directory_path}' removed.")
-    else:
-        logger.error(f"Directory '{directory_path}' does not exist.")
-
-
-# Create directory if not exists
-def create_directories(directory_path):
-    if isinstance(directory_path, str):
-        directory_path = [directory_path]
-    for one_dir_path in directory_path:
-        if not os.path.exists(one_dir_path):
-            os.makedirs(one_dir_path)
-            logger.debug(f"Directory '{one_dir_path}' created.")
-
-
-def setup_logger(name_log):
-    logger = logging.getLogger(name_log)
-    logger.setLevel(logging.INFO)
-
-    _default_handler = logging.StreamHandler()  # Set sys.stderr as stream.
-    _default_handler.flush = sys.stderr.flush
-    logger.addHandler(_default_handler)
-
-    logger.propagate = False
-
-    handlers = logger.handlers
-
-    for handler in handlers:
-        formatter = logging.Formatter("[%(levelname)s] >> %(message)s")
-        handler.setFormatter(formatter)
-
-    # logger.handlers
-
-    return logger
-
-
-logger = setup_logger("ss")
-logger.setLevel(logging.INFO)
-
-
-def get_hash(filepath):
-    with open(filepath, 'rb') as f:
-        file_hash = hashlib.blake2b()
-        while chunk := f.read(8192):
-            file_hash.update(chunk)
-
-    return file_hash.hexdigest()[:18]
+
+
+def convert_to_stereo_and_wav(audio_path: Path) -> Path:
+    wave, sr = librosa.load(str(audio_path), mono=False, sr=44100)
+
+    # check if mono
+    if type(wave[0]) != np.ndarray or audio_path.suffix != ".wav":  # noqa
+        stereo_path = audio_path.with_name(audio_path.stem + "_stereo.wav")
+
+        command = shlex.split(
+            f'ffmpeg -y -loglevel error -i "{str(audio_path)}" -ac 2 -f wav "{str(stereo_path)}"'
+        )
+        sub_params = {
+            "stdout": subprocess.PIPE,
+            "stderr": subprocess.PIPE,
+            "creationflags": subprocess.CREATE_NO_WINDOW
+            if sys.platform == "win32"
+            else 0,
+        }
+        process_wav = subprocess.Popen(command, **sub_params)
+        output, errors = process_wav.communicate()
+        if process_wav.returncode != 0 or not stereo_path.exists():
+            raise Exception("Error processing audio to stereo wav")
+
+        return stereo_path
+    else:
+        return Path(audio_path)
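A short usage sketch for the new helper; "song.m4a" is a hypothetical input, and ffmpeg must be available on PATH:

    from pathlib import Path

    from utils import convert_to_stereo_and_wav

    src = Path("song.m4a")                # hypothetical input file
    wav = convert_to_stereo_and_wav(src)  # runs ffmpeg, writes song_stereo.wav next to it
    print(wav)  # song_stereo.wav, or the original path if it was already a stereo .wav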
uvr_processing.py
ADDED
@@ -0,0 +1,236 @@
+import torch
+import json
+import gc
+import spaces
+import librosa
+import soundfile as sf
+import numpy as np
+from pathlib import Path
+from typing import Dict, Tuple
+from utils import convert_to_stereo_and_wav
+from mdxnet_model import MDX, MDXModel
+import time
+
+
+STEM_NAMING = {
+    "Vocals": "Instrumental",
+    "Other": "Instruments",
+    "Instrumental": "Vocals",
+    "Drums": "Drumless",
+    "Bass": "Bassless",
+}
+
+
+@spaces.GPU()
+def run_mdx(model_params: Dict,
+            input_filename: Path,
+            output_dir: Path,
+            model_path: Path,
+            denoise: bool = False,
+            m_threads: int = 2,
+            device_base: str = "cuda",
+            ) -> Tuple[Path, Path]:
+    """
+    Separate one stem from the mix with an MDX model.
+    """
+    if device_base == "cuda":
+        device = torch.device("cuda:0")
+        processor_num = 0
+        device_properties = torch.cuda.get_device_properties(device)
+        vram_gb = device_properties.total_memory / 1024**3
+        m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
+    else:
+        device = torch.device("cpu")
+        processor_num = -1
+        m_threads = 1
+
+    model_hash = MDX.get_hash(model_path)  # type: str
+    mp = model_params.get(model_hash)
+    model = MDXModel(
+        device,
+        dim_f=mp["mdx_dim_f_set"],
+        dim_t=2 ** mp["mdx_dim_t_set"],
+        n_fft=mp["mdx_n_fft_scale_set"],
+        stem_name=mp["primary_stem"],
+        compensation=mp["compensate"],
+    )
+
+    mdx_sess = MDX(model_path, model, processor=processor_num)
+    wave, sr = librosa.load(input_filename, mono=False, sr=44100)
+    # normalizing input wave gives better output
+    peak = max(np.max(wave), abs(np.min(wave)))
+    wave /= peak
+    if denoise:
+        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (mdx_sess.process_wave(wave, m_threads))  # type: np.array
+        wave_processed *= 0.5
+    else:
+        wave_processed = mdx_sess.process_wave(wave, m_threads)
+    # return to previous peak
+    wave_processed *= peak
+    stem_name = model.stem_name
+
+    # output main track
+    main_filepath = output_dir / f"{input_filename.stem}_{stem_name}.wav"
+    sf.write(main_filepath, wave_processed.T, sr)
+
+    # output reverse track (the complement of the main stem)
+    invert_filepath = output_dir / f"{input_filename.stem}_{stem_name}_reverse.wav"
+    sf.write(invert_filepath, (-wave_processed.T * model.compensation) + wave.T, sr)
+
+    del mdx_sess, wave_processed, wave
+    gc.collect()
+    torch.cuda.empty_cache()
+    return main_filepath, invert_filepath
+
+
+def run_mdx_cpu(model_params: Dict,
+                input_filename: Path,
+                output_dir: Path,
+                model_path: Path,
+                denoise: bool = False,
+                m_threads: int = 2,
+                device_base: str = ""):
+    m_threads = 1
+    duration = librosa.get_duration(path=input_filename)  # librosa >= 0.10 renamed "filename" to "path"
+    if 60 <= duration <= 120:
+        m_threads = 8
+    elif duration > 120:
+        m_threads = 16
+
+    model_hash = MDX.get_hash(model_path)
+    device = torch.device("cpu")
+    processor_num = -1
+    mp = model_params.get(model_hash)
+    model = MDXModel(
+        device,
+        dim_f=mp["mdx_dim_f_set"],
+        dim_t=2 ** mp["mdx_dim_t_set"],
+        n_fft=mp["mdx_n_fft_scale_set"],
+        stem_name=mp["primary_stem"],
+        compensation=mp["compensate"],
+    )
+
+    mdx_sess = MDX(model_path, model, processor=processor_num)
+    wave, sr = librosa.load(input_filename, mono=False, sr=44100)
+    # normalizing input wave gives better output
+    peak = max(np.max(wave), abs(np.min(wave)))
+    wave /= peak
+    if denoise:
+        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
+            mdx_sess.process_wave(wave, m_threads)
+        )
+        wave_processed *= 0.5
+    else:
+        wave_processed = mdx_sess.process_wave(wave, m_threads)
+    # return to previous peak
+    wave_processed *= peak
+    stem_name = model.stem_name
+
+    # output main track
+    main_filepath = output_dir / f"{input_filename.stem}_{stem_name}.wav"
+    sf.write(main_filepath, wave_processed.T, sr)
+
+    # output reverse track (the complement of the main stem)
+    invert_filepath = output_dir / f"{input_filename.stem}_{stem_name}_reverse.wav"
+    sf.write(invert_filepath, (-wave_processed.T * model.compensation) + wave.T, sr)
+
+    del mdx_sess, wave_processed, wave
+    gc.collect()
+    torch.cuda.empty_cache()
+    return main_filepath, invert_filepath
+
+
+def extract_bgm(mdx_model_params: Dict,
+                input_filename: Path,
+                mdxnet_models_dir: Path,
+                output_dir: Path,
+                device_base: str = "cuda") -> Path:
+    """
+    Extract the instrumental background, removing the vocal part.
+    """
+    background_path, _ = run_mdx(model_params=mdx_model_params,
+                                 input_filename=input_filename,
+                                 output_dir=output_dir,
+                                 model_path=mdxnet_models_dir/"UVR-MDX-NET-Inst_HQ_3.onnx",
+                                 denoise=False,
+                                 device_base=device_base,
+                                 )
+    return background_path
+
+
+def extract_vocal(mdx_model_params: Dict,
+                  input_filename: Path,
+                  mdxnet_models_dir: Path,
+                  output_dir: Path,
+                  main_vocals_flag: bool = False,
+                  dereverb_flag: bool = False,
+                  device_base: str = "cuda") -> Path:
+    """
+    Extract the vocals.
+    """
+    # First run the basic vocal-separation model, UVR-MDX-NET-Voc_FT.onnx
+    vocals_path, _ = run_mdx(mdx_model_params,
+                             input_filename,
+                             output_dir,
+                             mdxnet_models_dir/"UVR-MDX-NET-Voc_FT.onnx",
+                             denoise=True,
+                             device_base=device_base,
+                             )
+    # If "main_vocals_flag" is set, use UVR_MDXNET_KARA_2.onnx to further split
+    # the vocals into main vocals (Main) and harmony/backing vocals (Backup)
+    if main_vocals_flag:
+        time.sleep(2)
+        backup_vocals_path, main_vocals_path = run_mdx(mdx_model_params,
+                                                       vocals_path,
+                                                       output_dir,
+                                                       mdxnet_models_dir/"UVR_MDXNET_KARA_2.onnx",
+                                                       denoise=True,
+                                                       device_base=device_base,
+                                                       )
+        vocals_path = main_vocals_path
+    # If "dereverb_flag" is set, use Reverb_HQ_By_FoxJoy.onnx to remove reverb
+    if dereverb_flag:
+        time.sleep(2)
+        _, vocals_dereverb_path = run_mdx(mdx_model_params,
+                                          vocals_path,
+                                          output_dir,
+                                          mdxnet_models_dir/"Reverb_HQ_By_FoxJoy.onnx",
+                                          denoise=True,
+                                          device_base=device_base,
+                                          )
+        vocals_path = vocals_dereverb_path
+    return vocals_path
+
+
+def process_uvr_task(mdxnet_models_dir: Path,
+                     input_file_path: Path,
+                     output_dir: Path,
+                     main_vocals_flag: bool = False,  # if "Main" is enabled, UVR_MDXNET_KARA_2.onnx further splits main and backing vocals
+                     dereverb_flag: bool = False,  # if "DeReverb" is enabled, Reverb_HQ_By_FoxJoy.onnx removes reverb
+                     ) -> Tuple[Path, Path]:
+
+    device_base = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # load mdx model definition
+    with open(mdxnet_models_dir/"model_data_v2.json") as infile:
+        mdx_model_params = json.load(infile)  # type: Dict
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    input_file_path = convert_to_stereo_and_wav(input_file_path)  # type: Path
+
+    # 1. Extract the instrumental background, removing the vocal part
+    background_path = extract_bgm(mdx_model_params,
+                                  input_file_path,
+                                  mdxnet_models_dir,
+                                  output_dir,
+                                  device_base=device_base)
+
+    # 2. Separate the vocals
+    # First run the basic vocal-separation model, UVR-MDX-NET-Voc_FT.onnx
+    vocals_path = extract_vocal(mdx_model_params,
+                                input_file_path,
+                                mdxnet_models_dir,
+                                output_dir,
+                                main_vocals_flag=main_vocals_flag,
+                                dereverb_flag=dereverb_flag,
+                                device_base=device_base)
+
+    return background_path, vocals_path
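Putting it together, the whole pipeline can be driven from Python in a few lines. A minimal sketch; the paths are hypothetical, and the ONNX models plus model_data_v2.json must already be in mdx_models/:

    from pathlib import Path

    from uvr_processing import process_uvr_task

    background, vocals = process_uvr_task(
        mdxnet_models_dir=Path("./mdx_models"),
        input_file_path=Path("song.m4a"),  # hypothetical input file
        output_dir=Path("./output"),
        main_vocals_flag=False,
        dereverb_flag=False,
    )
    print(background, vocals)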