ghostai1
/

GHOSTSONAFB

English

python

Model card Files Files and versions

xet

Community

ghostai1 committed 7 days ago

Commit

5a32854

verified ·

1 Parent(s): 7a43119

Update stablecuda12build1.py

Browse files

16 bit and addition in audio have diff maths for each float fixed up errors

Files changed (1) hide show

stablecuda12build1.py +50 -38

stablecuda12build1.py CHANGED Viewed

@@ -19,6 +19,7 @@ from pathlib import Path
 import mmap
 import subprocess
 import re
 # Suppress warnings for cleaner output
 warnings.filterwarnings("ignore")
@@ -149,13 +150,16 @@ def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
             stereo_samples = stereo_samples * mask
             left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
             right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
-            left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
             right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
             if left_rms > 0 and right_rms > 0:
                 avg_rms = (left_rms + right_rms) / 2
                 stereo_samples[:, 0] = stereo_samples[:, 0] * (avg_rms / left_rms)
                 stereo_samples[:, 1] = stereo_samples[:, 1] * (avg_rms / right_rms)
             balanced_samples = stereo_samples.flatten().astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
             balanced_segment = AudioSegment(
                 balanced_samples.tobytes(),
                 frame_rate=sample_rate,
@@ -206,6 +210,9 @@ def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
         limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
         samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
         samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
         limited_segment = AudioSegment(
             samples.tobytes(),
             frame_rate=sample_rate,
@@ -248,12 +255,15 @@ def apply_fade(segment, fade_in_duration=500, fade_out_duration=500):
 # Genre prompt functions
 def set_red_hot_chili_peppers_prompt(bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style):
     try:
-        rhythm = f" with {rhythmic_steps}" if rhythmic_steps != "none" else ("strong rhythmic steps" if bpm > 120 else "groovy rhythmic flow")
-        drum = f", {drum_beat} drums" if drum_beat != "none" else ""
         synth = f", {synthesizer} accents" if synthesizer != "none" else ""
-        bass = f", {bass_style}" if bass_style != "none" else ", groovy basslines"
-        guitar = f", {guitar_style} guitar riffs" if guitar_style != "none" else ", syncopated guitar riffs"
-        prompt = f"Instrumental funk rock{bass}{guitar}{drum}{synth}, Red Hot Chili Peppers-inspired vibe with dynamic energy and funky breakdowns, {rhythm} at {bpm} BPM."
         logger.debug(f"Generated RHCP prompt: {prompt}")
         return prompt
     except Exception as e:
@@ -468,7 +478,8 @@ PRESETS = {
     "rock": {"cfg_scale": 2.0, "top_k": 110, "top_p": 0.9, "temperature": 0.9},
     "techno": {"cfg_scale": 1.5, "top_k": 130, "top_p": 0.85, "temperature": 0.7},
     "grunge": {"cfg_scale": 1.8, "top_k": 120, "top_p": 0.9, "temperature": 0.85},
-    "indie": {"cfg_scale": 1.9, "top_k": 115, "top_p": 0.9, "temperature": 0.8}
 }
 # Function to get the latest log file
@@ -523,7 +534,7 @@ def set_bit_depth_24():
     return "24"
 # Optimized generation function
-def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, bpm: int, drum_beat: str, synthesizer: str, rhythmic_steps: str, bass_style: str, guitar_style: str, target_volume: float, preset: str, max_steps: str, vram_status: str, bitrate: str, output_sample_rate: str, bit_depth: str):
     global musicgen_model
     if not instrumental_prompt.strip():
         logger.warning("Empty instrumental prompt provided")
@@ -550,6 +561,10 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
         except ValueError:
             logger.error(f"Invalid bit_depth value: {bit_depth}")
             return None, "❌ Invalid bit depth; must be 16 or 24", vram_status
         max_duration = min(max_steps_int / 50, 30)  # Convert steps to seconds, cap at 30s
         total_duration = min(max(total_duration, 30), 120)  # Clamp between 30s and 120s
         processing_sample_rate = 16000  # Fixed for processing
@@ -570,8 +585,6 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
             logger.error("Insufficient disk space")
             return None, "⚠️ Insufficient disk space. Free up at least 1 GB.", vram_status
-        # Set random seed for this generation run
-        seed = random.randint(0, 10000)
         logger.info(f"Generating audio for {total_duration}s with seed={seed}, max_steps={max_steps_int}, output_sample_rate={output_sample_rate_int} Hz, bit_depth={bit_depth_int}-bit")
         base_prompt = instrumental_prompt
         clean_memory()
@@ -731,10 +744,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
                     logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
                     prev_overlap = final_segment[-overlap_ms:]
                     curr_overlap = current_segment[:overlap_ms]
-                    # Ensure stereo and consistent sample length
-                    prev_overlap = ensure_stereo(prev_overlap, processing_sample_rate, sample_width)
-                    curr_overlap = ensure_stereo(curr_overlap, processing_sample_rate, sample_width)
-                    # Calculate samples using torchaudio for precision
                     prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
                     curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
                     num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
@@ -744,27 +754,21 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
                         logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
                         final_segment += current_segment
                         continue
-                    blended_samples = np.zeros((num_samples, 2), dtype=np.float32)
-                    prev_samples = prev_audio[:, :num_samples].numpy().T
-                    curr_samples = curr_audio[:, :num_samples].numpy().T
-                    hann_window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(num_samples) / num_samples))
-                    fade_out = hann_window[::-1]
                     fade_in = hann_window
-                    blended_samples = (prev_samples * fade_out[:, None] + curr_samples * fade_in[:, None])
-                    # Ensure byte length is multiple of sample_width * channels
-                    blended_samples = blended_samples.astype(np.int32 if sample_width == 3 else np.int16)
-                    byte_data = blended_samples.tobytes()
-                    byte_length = len(byte_data)
-                    expected_length = byte_length - (byte_length % (sample_width * channels))
-                    if byte_length != expected_length:
-                        logger.debug(f"Truncating blended samples from {byte_length} to {expected_length} bytes")
-                        byte_data = byte_data[:expected_length]
-                    blended_segment = AudioSegment(
-                        byte_data,
-                        frame_rate=processing_sample_rate,
-                        sample_width=sample_width,
-                        channels=2
-                    )
                     blended_segment = rms_normalize(blended_segment, target_rms_db=target_volume, peak_limit_db=-3.0, sample_rate=processing_sample_rate)
                     final_segment = final_segment[:-overlap_ms] + blended_segment + current_segment[overlap_ms:]
                 else:
@@ -822,7 +826,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
 # Clear inputs function
 def clear_inputs():
     logger.info("Clearing input fields")
-    return "", 1.8, 120, 0.9, 0.8, 30, 120, "none", "none", "none", "none", "none", -23.0, "default", 1300, "96k", "32000", "16"
 # Custom CSS
 css = """
@@ -1024,7 +1028,7 @@ with gr.Blocks(css=css) as demo:
             )
             preset = gr.Dropdown(
                 label="Preset Configuration 🎛️",
-                choices=["default", "rock", "techno", "grunge", "indie"],
                 value="default",
                 info="Select a preset optimized for specific genres."
             )
@@ -1034,6 +1038,14 @@ with gr.Blocks(css=css) as demo:
                 value=1300,
                 info="Number of generation steps per chunk (1000=~20s, 1500=~30s)."
             )
             bitrate_state = gr.State(value="96k")  # Default bitrate
             sample_rate_state = gr.State(value="32000")  # Default output sampling rate
             bit_depth_state = gr.State(value="16")  # Default bit depth
@@ -1088,13 +1100,13 @@ with gr.Blocks(css=css) as demo:
     bit_depth_24_btn.click(set_bit_depth_24, inputs=None, outputs=bit_depth_state)
     gen_btn.click(
         generate_music,
-        inputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, vram_status, bitrate_state, sample_rate_state, bit_depth_state],
         outputs=[out_audio, status, vram_status]
     )
     clr_btn.click(
         clear_inputs,
         inputs=None,
-        outputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, bitrate_state, sample_rate_state, bit_depth_state]
     )
     log_btn.click(
         get_latest_log,

 import mmap
 import subprocess
 import re
+import io
 # Suppress warnings for cleaner output
 warnings.filterwarnings("ignore")
             stereo_samples = stereo_samples * mask
             left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
             right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
+            left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
             right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
             if left_rms > 0 and right_rms > 0:
                 avg_rms = (left_rms + right_rms) / 2
                 stereo_samples[:, 0] = stereo_samples[:, 0] * (avg_rms / left_rms)
                 stereo_samples[:, 1] = stereo_samples[:, 1] * (avg_rms / right_rms)
             balanced_samples = stereo_samples.flatten().astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
+            # Ensure sample length is even for stereo
+            if len(balanced_samples) % 2 != 0:
+                balanced_samples = balanced_samples[:-1]
             balanced_segment = AudioSegment(
                 balanced_samples.tobytes(),
                 frame_rate=sample_rate,
         limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
         samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
         samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
+        # Ensure sample length is even for stereo
+        if len(samples) % 2 != 0:
+            samples = samples[:-1]
         limited_segment = AudioSegment(
             samples.tobytes(),
             frame_rate=sample_rate,
 # Genre prompt functions
 def set_red_hot_chili_peppers_prompt(bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style):
     try:
+        rhythm = f" with {rhythmic_steps}" if rhythmic_steps != "none" else ("syncopated funk rhythms" if bpm > 120 else "groovy funk flow")
+        drum = f", {drum_beat} drums" if drum_beat != "none" else ", tight funk drums with punchy snares"
         synth = f", {synthesizer} accents" if synthesizer != "none" else ""
+        bass = f", {bass_style}" if bass_style != "none" else ", prominent slap bass with funky grooves"
+        guitar = f", {guitar_style} guitar riffs" if guitar_style != "none" else ", syncopated funk guitar riffs with clean and distorted tones"
+        prompt = (
+            f"Instrumental funk rock{bass}{guitar}{drum}{synth}, Red Hot Chili Peppers-inspired vibe with high-energy slap bass, "
+            f"syncopated guitar riffs, dynamic breakdowns, and a raw, funky edge, {rhythm} at {bpm} BPM."
+        )
         logger.debug(f"Generated RHCP prompt: {prompt}")
         return prompt
     except Exception as e:
     "rock": {"cfg_scale": 2.0, "top_k": 110, "top_p": 0.9, "temperature": 0.9},
     "techno": {"cfg_scale": 1.5, "top_k": 130, "top_p": 0.85, "temperature": 0.7},
     "grunge": {"cfg_scale": 1.8, "top_k": 120, "top_p": 0.9, "temperature": 0.85},
+    "indie": {"cfg_scale": 1.9, "top_k": 115, "top_p": 0.9, "temperature": 0.8},
+    "funk_rock": {"cfg_scale": 2.2, "top_k": 150, "top_p": 0.95, "temperature": 1.0}  # Enhanced for RHCP
 }
 # Function to get the latest log file
     return "24"
 # Optimized generation function
+def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, bpm: int, drum_beat: str, synthesizer: str, rhythmic_steps: str, bass_style: str, guitar_style: str, target_volume: float, preset: str, max_steps: str, vram_status: str, bitrate: str, output_sample_rate: str, bit_depth: str, seed: int):
     global musicgen_model
     if not instrumental_prompt.strip():
         logger.warning("Empty instrumental prompt provided")
         except ValueError:
             logger.error(f"Invalid bit_depth value: {bit_depth}")
             return None, "❌ Invalid bit depth; must be 16 or 24", vram_status
+        # Validate seed
+        if not (0 <= seed <= 10000):
+            logger.error(f"Invalid seed value: {seed}. Must be between 0 and 10000.")
+            return None, "❌ Invalid seed value; must be between 0 and 10000", vram_status
         max_duration = min(max_steps_int / 50, 30)  # Convert steps to seconds, cap at 30s
         total_duration = min(max(total_duration, 30), 120)  # Clamp between 30s and 120s
         processing_sample_rate = 16000  # Fixed for processing
             logger.error("Insufficient disk space")
             return None, "⚠️ Insufficient disk space. Free up at least 1 GB.", vram_status
         logger.info(f"Generating audio for {total_duration}s with seed={seed}, max_steps={max_steps_int}, output_sample_rate={output_sample_rate_int} Hz, bit_depth={bit_depth_int}-bit")
         base_prompt = instrumental_prompt
         clean_memory()
                     logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
                     prev_overlap = final_segment[-overlap_ms:]
                     curr_overlap = current_segment[:overlap_ms]
+                    # Use torchaudio for precise crossfading
                     prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
                     curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
                     num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
                         logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
                         final_segment += current_segment
                         continue
+                    blended_samples = torch.zeros(2, num_samples, dtype=torch.float32)
+                    prev_samples = prev_audio[:, :num_samples]
+                    curr_samples = curr_audio[:, :num_samples]
+                    hann_window = torch.hann_window(num_samples, periodic=False)
+                    fade_out = hann_window.flip(0)
                     fade_in = hann_window
+                    blended_samples = (prev_samples * fade_out + curr_samples * fade_in)
+                    # Convert to appropriate dtype for bit depth
+                    blended_samples = (blended_samples * (2**23 if sample_width == 3 else 32767)).to(torch.int32 if sample_width == 3 else torch.int16)
+                    # Save to temporary WAV to create AudioSegment
+                    temp_crossfade_path = f"temp_crossfade_{int(time.time()*1000)}.wav"
+                    torchaudio.save(temp_crossfade_path, blended_samples, processing_sample_rate, bits_per_sample=bit_depth_int)
+                    blended_segment = AudioSegment.from_wav(temp_crossfade_path)
+                    os.remove(temp_crossfade_path)
+                    blended_segment = ensure_stereo(blended_segment, processing_sample_rate, sample_width)
                     blended_segment = rms_normalize(blended_segment, target_rms_db=target_volume, peak_limit_db=-3.0, sample_rate=processing_sample_rate)
                     final_segment = final_segment[:-overlap_ms] + blended_segment + current_segment[overlap_ms:]
                 else:
 # Clear inputs function
 def clear_inputs():
     logger.info("Clearing input fields")
+    return "", 1.8, 120, 0.9, 0.8, 30, 120, "none", "none", "none", "none", "none", -23.0, "default", 1300, "96k", "32000", "16", 0
 # Custom CSS
 css = """
             )
             preset = gr.Dropdown(
                 label="Preset Configuration 🎛️",
+                choices=["default", "rock", "techno", "grunge", "indie", "funk_rock"],
                 value="default",
                 info="Select a preset optimized for specific genres."
             )
                 value=1300,
                 info="Number of generation steps per chunk (1000=~20s, 1500=~30s)."
             )
+            seed = gr.Slider(
+                label="Random Seed 🌱",
+                minimum=0,
+                maximum=10000,
+                value=0,
+                step=1,
+                info="Set a seed for reproducibility (0-10000). Change for different variations."
+            )
             bitrate_state = gr.State(value="96k")  # Default bitrate
             sample_rate_state = gr.State(value="32000")  # Default output sampling rate
             bit_depth_state = gr.State(value="16")  # Default bit depth
     bit_depth_24_btn.click(set_bit_depth_24, inputs=None, outputs=bit_depth_state)
     gen_btn.click(
         generate_music,
+        inputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, vram_status, bitrate_state, sample_rate_state, bit_depth_state, seed],
         outputs=[out_audio, status, vram_status]
     )
     clr_btn.click(
         clear_inputs,
         inputs=None,
+        outputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, bitrate_state, sample_rate_state, bit_depth_state, seed]
     )
     log_btn.click(
         get_latest_log,