Update stablecuda12build1.py
Browse files
16-bit and addition in audio have different maths for each float; fixed up errors
- stablecuda12build1.py +50 -38
stablecuda12build1.py
CHANGED
@@ -19,6 +19,7 @@ from pathlib import Path
|
|
19 |
import mmap
|
20 |
import subprocess
|
21 |
import re
|
|
|
22 |
|
23 |
# Suppress warnings for cleaner output
|
24 |
warnings.filterwarnings("ignore")
|
@@ -149,13 +150,16 @@ def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
|
|
149 |
stereo_samples = stereo_samples * mask
|
150 |
left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
|
151 |
right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
|
152 |
-
left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0
|
153 |
right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
|
154 |
if left_rms > 0 and right_rms > 0:
|
155 |
avg_rms = (left_rms + right_rms) / 2
|
156 |
stereo_samples[:, 0] = stereo_samples[:, 0] * (avg_rms / left_rms)
|
157 |
stereo_samples[:, 1] = stereo_samples[:, 1] * (avg_rms / right_rms)
|
158 |
balanced_samples = stereo_samples.flatten().astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
|
|
|
|
|
|
159 |
balanced_segment = AudioSegment(
|
160 |
balanced_samples.tobytes(),
|
161 |
frame_rate=sample_rate,
|
@@ -206,6 +210,9 @@ def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
|
|
206 |
limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
|
207 |
samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
|
208 |
samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
|
|
|
|
|
|
209 |
limited_segment = AudioSegment(
|
210 |
samples.tobytes(),
|
211 |
frame_rate=sample_rate,
|
@@ -248,12 +255,15 @@ def apply_fade(segment, fade_in_duration=500, fade_out_duration=500):
|
|
248 |
# Genre prompt functions
|
249 |
def set_red_hot_chili_peppers_prompt(bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style):
|
250 |
try:
|
251 |
-
rhythm = f" with {rhythmic_steps}" if rhythmic_steps != "none" else ("
|
252 |
-
drum = f", {drum_beat} drums" if drum_beat != "none" else ""
|
253 |
synth = f", {synthesizer} accents" if synthesizer != "none" else ""
|
254 |
-
bass = f", {bass_style}" if bass_style != "none" else ",
|
255 |
-
guitar = f", {guitar_style} guitar riffs" if guitar_style != "none" else ", syncopated guitar riffs"
|
256 |
-
prompt =
|
|
|
|
|
|
|
257 |
logger.debug(f"Generated RHCP prompt: {prompt}")
|
258 |
return prompt
|
259 |
except Exception as e:
|
@@ -468,7 +478,8 @@ PRESETS = {
|
|
468 |
"rock": {"cfg_scale": 2.0, "top_k": 110, "top_p": 0.9, "temperature": 0.9},
|
469 |
"techno": {"cfg_scale": 1.5, "top_k": 130, "top_p": 0.85, "temperature": 0.7},
|
470 |
"grunge": {"cfg_scale": 1.8, "top_k": 120, "top_p": 0.9, "temperature": 0.85},
|
471 |
-
"indie": {"cfg_scale": 1.9, "top_k": 115, "top_p": 0.9, "temperature": 0.8}
|
|
|
472 |
}
|
473 |
|
474 |
# Function to get the latest log file
|
@@ -523,7 +534,7 @@ def set_bit_depth_24():
|
|
523 |
return "24"
|
524 |
|
525 |
# Optimized generation function
|
526 |
-
def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, bpm: int, drum_beat: str, synthesizer: str, rhythmic_steps: str, bass_style: str, guitar_style: str, target_volume: float, preset: str, max_steps: str, vram_status: str, bitrate: str, output_sample_rate: str, bit_depth: str):
|
527 |
global musicgen_model
|
528 |
if not instrumental_prompt.strip():
|
529 |
logger.warning("Empty instrumental prompt provided")
|
@@ -550,6 +561,10 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
550 |
except ValueError:
|
551 |
logger.error(f"Invalid bit_depth value: {bit_depth}")
|
552 |
return None, "❌ Invalid bit depth; must be 16 or 24", vram_status
|
|
|
|
|
|
|
|
|
553 |
max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
|
554 |
total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
|
555 |
processing_sample_rate = 16000 # Fixed for processing
|
@@ -570,8 +585,6 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
570 |
logger.error("Insufficient disk space")
|
571 |
return None, "⚠️ Insufficient disk space. Free up at least 1 GB.", vram_status
|
572 |
|
573 |
-
# Set random seed for this generation run
|
574 |
-
seed = random.randint(0, 10000)
|
575 |
logger.info(f"Generating audio for {total_duration}s with seed={seed}, max_steps={max_steps_int}, output_sample_rate={output_sample_rate_int} Hz, bit_depth={bit_depth_int}-bit")
|
576 |
base_prompt = instrumental_prompt
|
577 |
clean_memory()
|
@@ -731,10 +744,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
731 |
logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
|
732 |
prev_overlap = final_segment[-overlap_ms:]
|
733 |
curr_overlap = current_segment[:overlap_ms]
|
734 |
-
#
|
735 |
-
prev_overlap = ensure_stereo(prev_overlap, processing_sample_rate, sample_width)
|
736 |
-
curr_overlap = ensure_stereo(curr_overlap, processing_sample_rate, sample_width)
|
737 |
-
# Calculate samples using torchaudio for precision
|
738 |
prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
|
739 |
curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
|
740 |
num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
|
@@ -744,27 +754,21 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
744 |
logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
|
745 |
final_segment += current_segment
|
746 |
continue
|
747 |
-
blended_samples =
|
748 |
-
prev_samples = prev_audio[:, :num_samples]
|
749 |
-
curr_samples = curr_audio[:, :num_samples]
|
750 |
-
hann_window =
|
751 |
-
fade_out = hann_window
|
752 |
fade_in = hann_window
|
753 |
-
blended_samples = (prev_samples * fade_out
|
754 |
-
#
|
755 |
-
blended_samples = blended_samples.
|
756 |
-
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
blended_segment = AudioSegment(
|
763 |
-
byte_data,
|
764 |
-
frame_rate=processing_sample_rate,
|
765 |
-
sample_width=sample_width,
|
766 |
-
channels=2
|
767 |
-
)
|
768 |
blended_segment = rms_normalize(blended_segment, target_rms_db=target_volume, peak_limit_db=-3.0, sample_rate=processing_sample_rate)
|
769 |
final_segment = final_segment[:-overlap_ms] + blended_segment + current_segment[overlap_ms:]
|
770 |
else:
|
@@ -822,7 +826,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
822 |
# Clear inputs function
|
823 |
def clear_inputs():
|
824 |
logger.info("Clearing input fields")
|
825 |
-
return "", 1.8, 120, 0.9, 0.8, 30, 120, "none", "none", "none", "none", "none", -23.0, "default", 1300, "96k", "32000", "16"
|
826 |
|
827 |
# Custom CSS
|
828 |
css = """
|
@@ -1024,7 +1028,7 @@ with gr.Blocks(css=css) as demo:
|
|
1024 |
)
|
1025 |
preset = gr.Dropdown(
|
1026 |
label="Preset Configuration 🎛️",
|
1027 |
-
choices=["default", "rock", "techno", "grunge", "indie"],
|
1028 |
value="default",
|
1029 |
info="Select a preset optimized for specific genres."
|
1030 |
)
|
@@ -1034,6 +1038,14 @@ with gr.Blocks(css=css) as demo:
|
|
1034 |
value=1300,
|
1035 |
info="Number of generation steps per chunk (1000=~20s, 1500=~30s)."
|
1036 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1037 |
bitrate_state = gr.State(value="96k") # Default bitrate
|
1038 |
sample_rate_state = gr.State(value="32000") # Default output sampling rate
|
1039 |
bit_depth_state = gr.State(value="16") # Default bit depth
|
@@ -1088,13 +1100,13 @@ with gr.Blocks(css=css) as demo:
|
|
1088 |
bit_depth_24_btn.click(set_bit_depth_24, inputs=None, outputs=bit_depth_state)
|
1089 |
gen_btn.click(
|
1090 |
generate_music,
|
1091 |
-
inputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, vram_status, bitrate_state, sample_rate_state, bit_depth_state],
|
1092 |
outputs=[out_audio, status, vram_status]
|
1093 |
)
|
1094 |
clr_btn.click(
|
1095 |
clear_inputs,
|
1096 |
inputs=None,
|
1097 |
-
outputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, bitrate_state, sample_rate_state, bit_depth_state]
|
1098 |
)
|
1099 |
log_btn.click(
|
1100 |
get_latest_log,
|
|
|
19 |
import mmap
|
20 |
import subprocess
|
21 |
import re
|
22 |
+
import io
|
23 |
|
24 |
# Suppress warnings for cleaner output
|
25 |
warnings.filterwarnings("ignore")
|
|
|
150 |
stereo_samples = stereo_samples * mask
|
151 |
left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
|
152 |
right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
|
153 |
+
left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
|
154 |
right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
|
155 |
if left_rms > 0 and right_rms > 0:
|
156 |
avg_rms = (left_rms + right_rms) / 2
|
157 |
stereo_samples[:, 0] = stereo_samples[:, 0] * (avg_rms / left_rms)
|
158 |
stereo_samples[:, 1] = stereo_samples[:, 1] * (avg_rms / right_rms)
|
159 |
balanced_samples = stereo_samples.flatten().astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
160 |
+
# Ensure sample length is even for stereo
|
161 |
+
if len(balanced_samples) % 2 != 0:
|
162 |
+
balanced_samples = balanced_samples[:-1]
|
163 |
balanced_segment = AudioSegment(
|
164 |
balanced_samples.tobytes(),
|
165 |
frame_rate=sample_rate,
|
|
|
210 |
limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
|
211 |
samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
|
212 |
samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
213 |
+
# Ensure sample length is even for stereo
|
214 |
+
if len(samples) % 2 != 0:
|
215 |
+
samples = samples[:-1]
|
216 |
limited_segment = AudioSegment(
|
217 |
samples.tobytes(),
|
218 |
frame_rate=sample_rate,
|
|
|
255 |
# Genre prompt functions
|
256 |
def set_red_hot_chili_peppers_prompt(bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style):
|
257 |
try:
|
258 |
+
rhythm = f" with {rhythmic_steps}" if rhythmic_steps != "none" else ("syncopated funk rhythms" if bpm > 120 else "groovy funk flow")
|
259 |
+
drum = f", {drum_beat} drums" if drum_beat != "none" else ", tight funk drums with punchy snares"
|
260 |
synth = f", {synthesizer} accents" if synthesizer != "none" else ""
|
261 |
+
bass = f", {bass_style}" if bass_style != "none" else ", prominent slap bass with funky grooves"
|
262 |
+
guitar = f", {guitar_style} guitar riffs" if guitar_style != "none" else ", syncopated funk guitar riffs with clean and distorted tones"
|
263 |
+
prompt = (
|
264 |
+
f"Instrumental funk rock{bass}{guitar}{drum}{synth}, Red Hot Chili Peppers-inspired vibe with high-energy slap bass, "
|
265 |
+
f"syncopated guitar riffs, dynamic breakdowns, and a raw, funky edge, {rhythm} at {bpm} BPM."
|
266 |
+
)
|
267 |
logger.debug(f"Generated RHCP prompt: {prompt}")
|
268 |
return prompt
|
269 |
except Exception as e:
|
|
|
478 |
"rock": {"cfg_scale": 2.0, "top_k": 110, "top_p": 0.9, "temperature": 0.9},
|
479 |
"techno": {"cfg_scale": 1.5, "top_k": 130, "top_p": 0.85, "temperature": 0.7},
|
480 |
"grunge": {"cfg_scale": 1.8, "top_k": 120, "top_p": 0.9, "temperature": 0.85},
|
481 |
+
"indie": {"cfg_scale": 1.9, "top_k": 115, "top_p": 0.9, "temperature": 0.8},
|
482 |
+
"funk_rock": {"cfg_scale": 2.2, "top_k": 150, "top_p": 0.95, "temperature": 1.0} # Enhanced for RHCP
|
483 |
}
|
484 |
|
485 |
# Function to get the latest log file
|
|
|
534 |
return "24"
|
535 |
|
536 |
# Optimized generation function
|
537 |
+
def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, bpm: int, drum_beat: str, synthesizer: str, rhythmic_steps: str, bass_style: str, guitar_style: str, target_volume: float, preset: str, max_steps: str, vram_status: str, bitrate: str, output_sample_rate: str, bit_depth: str, seed: int):
|
538 |
global musicgen_model
|
539 |
if not instrumental_prompt.strip():
|
540 |
logger.warning("Empty instrumental prompt provided")
|
|
|
561 |
except ValueError:
|
562 |
logger.error(f"Invalid bit_depth value: {bit_depth}")
|
563 |
return None, "❌ Invalid bit depth; must be 16 or 24", vram_status
|
564 |
+
# Validate seed
|
565 |
+
if not (0 <= seed <= 10000):
|
566 |
+
logger.error(f"Invalid seed value: {seed}. Must be between 0 and 10000.")
|
567 |
+
return None, "❌ Invalid seed value; must be between 0 and 10000", vram_status
|
568 |
max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
|
569 |
total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
|
570 |
processing_sample_rate = 16000 # Fixed for processing
|
|
|
585 |
logger.error("Insufficient disk space")
|
586 |
return None, "⚠️ Insufficient disk space. Free up at least 1 GB.", vram_status
|
587 |
|
|
|
|
|
588 |
logger.info(f"Generating audio for {total_duration}s with seed={seed}, max_steps={max_steps_int}, output_sample_rate={output_sample_rate_int} Hz, bit_depth={bit_depth_int}-bit")
|
589 |
base_prompt = instrumental_prompt
|
590 |
clean_memory()
|
|
|
744 |
logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
|
745 |
prev_overlap = final_segment[-overlap_ms:]
|
746 |
curr_overlap = current_segment[:overlap_ms]
|
747 |
+
# Use torchaudio for precise crossfading
|
|
|
|
|
|
|
748 |
prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
|
749 |
curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
|
750 |
num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
|
|
|
754 |
logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
|
755 |
final_segment += current_segment
|
756 |
continue
|
757 |
+
blended_samples = torch.zeros(2, num_samples, dtype=torch.float32)
|
758 |
+
prev_samples = prev_audio[:, :num_samples]
|
759 |
+
curr_samples = curr_audio[:, :num_samples]
|
760 |
+
hann_window = torch.hann_window(num_samples, periodic=False)
|
761 |
+
fade_out = hann_window.flip(0)
|
762 |
fade_in = hann_window
|
763 |
+
blended_samples = (prev_samples * fade_out + curr_samples * fade_in)
|
764 |
+
# Convert to appropriate dtype for bit depth
|
765 |
+
blended_samples = (blended_samples * (2**23 if sample_width == 3 else 32767)).to(torch.int32 if sample_width == 3 else torch.int16)
|
766 |
+
# Save to temporary WAV to create AudioSegment
|
767 |
+
temp_crossfade_path = f"temp_crossfade_{int(time.time()*1000)}.wav"
|
768 |
+
torchaudio.save(temp_crossfade_path, blended_samples, processing_sample_rate, bits_per_sample=bit_depth_int)
|
769 |
+
blended_segment = AudioSegment.from_wav(temp_crossfade_path)
|
770 |
+
os.remove(temp_crossfade_path)
|
771 |
+
blended_segment = ensure_stereo(blended_segment, processing_sample_rate, sample_width)
|
|
|
|
|
|
|
|
|
|
|
|
|
772 |
blended_segment = rms_normalize(blended_segment, target_rms_db=target_volume, peak_limit_db=-3.0, sample_rate=processing_sample_rate)
|
773 |
final_segment = final_segment[:-overlap_ms] + blended_segment + current_segment[overlap_ms:]
|
774 |
else:
|
|
|
826 |
# Clear inputs function
|
827 |
def clear_inputs():
|
828 |
logger.info("Clearing input fields")
|
829 |
+
return "", 1.8, 120, 0.9, 0.8, 30, 120, "none", "none", "none", "none", "none", -23.0, "default", 1300, "96k", "32000", "16", 0
|
830 |
|
831 |
# Custom CSS
|
832 |
css = """
|
|
|
1028 |
)
|
1029 |
preset = gr.Dropdown(
|
1030 |
label="Preset Configuration 🎛️",
|
1031 |
+
choices=["default", "rock", "techno", "grunge", "indie", "funk_rock"],
|
1032 |
value="default",
|
1033 |
info="Select a preset optimized for specific genres."
|
1034 |
)
|
|
|
1038 |
value=1300,
|
1039 |
info="Number of generation steps per chunk (1000=~20s, 1500=~30s)."
|
1040 |
)
|
1041 |
+
seed = gr.Slider(
|
1042 |
+
label="Random Seed 🌱",
|
1043 |
+
minimum=0,
|
1044 |
+
maximum=10000,
|
1045 |
+
value=0,
|
1046 |
+
step=1,
|
1047 |
+
info="Set a seed for reproducibility (0-10000). Change for different variations."
|
1048 |
+
)
|
1049 |
bitrate_state = gr.State(value="96k") # Default bitrate
|
1050 |
sample_rate_state = gr.State(value="32000") # Default output sampling rate
|
1051 |
bit_depth_state = gr.State(value="16") # Default bit depth
|
|
|
1100 |
bit_depth_24_btn.click(set_bit_depth_24, inputs=None, outputs=bit_depth_state)
|
1101 |
gen_btn.click(
|
1102 |
generate_music,
|
1103 |
+
inputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, vram_status, bitrate_state, sample_rate_state, bit_depth_state, seed],
|
1104 |
outputs=[out_audio, status, vram_status]
|
1105 |
)
|
1106 |
clr_btn.click(
|
1107 |
clear_inputs,
|
1108 |
inputs=None,
|
1109 |
+
outputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, bitrate_state, sample_rate_state, bit_depth_state, seed]
|
1110 |
)
|
1111 |
log_btn.click(
|
1112 |
get_latest_log,
|