ghostai1 commited on
Commit
5a32854
·
verified ·
1 Parent(s): 7a43119

Update stablecuda12build1.py

Browse files

16-bit audio and crossfade addition use different maths for each float; fixed up errors

Files changed (1) hide show
  1. stablecuda12build1.py +50 -38
stablecuda12build1.py CHANGED
@@ -19,6 +19,7 @@ from pathlib import Path
19
  import mmap
20
  import subprocess
21
  import re
 
22
 
23
  # Suppress warnings for cleaner output
24
  warnings.filterwarnings("ignore")
@@ -149,13 +150,16 @@ def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
149
  stereo_samples = stereo_samples * mask
150
  left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
151
  right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
152
- left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
153
  right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
154
  if left_rms > 0 and right_rms > 0:
155
  avg_rms = (left_rms + right_rms) / 2
156
  stereo_samples[:, 0] = stereo_samples[:, 0] * (avg_rms / left_rms)
157
  stereo_samples[:, 1] = stereo_samples[:, 1] * (avg_rms / right_rms)
158
  balanced_samples = stereo_samples.flatten().astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
 
 
 
159
  balanced_segment = AudioSegment(
160
  balanced_samples.tobytes(),
161
  frame_rate=sample_rate,
@@ -206,6 +210,9 @@ def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
206
  limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
207
  samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
208
  samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
 
 
 
209
  limited_segment = AudioSegment(
210
  samples.tobytes(),
211
  frame_rate=sample_rate,
@@ -248,12 +255,15 @@ def apply_fade(segment, fade_in_duration=500, fade_out_duration=500):
248
  # Genre prompt functions
249
  def set_red_hot_chili_peppers_prompt(bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style):
250
  try:
251
- rhythm = f" with {rhythmic_steps}" if rhythmic_steps != "none" else ("strong rhythmic steps" if bpm > 120 else "groovy rhythmic flow")
252
- drum = f", {drum_beat} drums" if drum_beat != "none" else ""
253
  synth = f", {synthesizer} accents" if synthesizer != "none" else ""
254
- bass = f", {bass_style}" if bass_style != "none" else ", groovy basslines"
255
- guitar = f", {guitar_style} guitar riffs" if guitar_style != "none" else ", syncopated guitar riffs"
256
- prompt = f"Instrumental funk rock{bass}{guitar}{drum}{synth}, Red Hot Chili Peppers-inspired vibe with dynamic energy and funky breakdowns, {rhythm} at {bpm} BPM."
 
 
 
257
  logger.debug(f"Generated RHCP prompt: {prompt}")
258
  return prompt
259
  except Exception as e:
@@ -468,7 +478,8 @@ PRESETS = {
468
  "rock": {"cfg_scale": 2.0, "top_k": 110, "top_p": 0.9, "temperature": 0.9},
469
  "techno": {"cfg_scale": 1.5, "top_k": 130, "top_p": 0.85, "temperature": 0.7},
470
  "grunge": {"cfg_scale": 1.8, "top_k": 120, "top_p": 0.9, "temperature": 0.85},
471
- "indie": {"cfg_scale": 1.9, "top_k": 115, "top_p": 0.9, "temperature": 0.8}
 
472
  }
473
 
474
  # Function to get the latest log file
@@ -523,7 +534,7 @@ def set_bit_depth_24():
523
  return "24"
524
 
525
  # Optimized generation function
526
- def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, bpm: int, drum_beat: str, synthesizer: str, rhythmic_steps: str, bass_style: str, guitar_style: str, target_volume: float, preset: str, max_steps: str, vram_status: str, bitrate: str, output_sample_rate: str, bit_depth: str):
527
  global musicgen_model
528
  if not instrumental_prompt.strip():
529
  logger.warning("Empty instrumental prompt provided")
@@ -550,6 +561,10 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
550
  except ValueError:
551
  logger.error(f"Invalid bit_depth value: {bit_depth}")
552
  return None, "❌ Invalid bit depth; must be 16 or 24", vram_status
 
 
 
 
553
  max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
554
  total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
555
  processing_sample_rate = 16000 # Fixed for processing
@@ -570,8 +585,6 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
570
  logger.error("Insufficient disk space")
571
  return None, "⚠️ Insufficient disk space. Free up at least 1 GB.", vram_status
572
 
573
- # Set random seed for this generation run
574
- seed = random.randint(0, 10000)
575
  logger.info(f"Generating audio for {total_duration}s with seed={seed}, max_steps={max_steps_int}, output_sample_rate={output_sample_rate_int} Hz, bit_depth={bit_depth_int}-bit")
576
  base_prompt = instrumental_prompt
577
  clean_memory()
@@ -731,10 +744,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
731
  logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
732
  prev_overlap = final_segment[-overlap_ms:]
733
  curr_overlap = current_segment[:overlap_ms]
734
- # Ensure stereo and consistent sample length
735
- prev_overlap = ensure_stereo(prev_overlap, processing_sample_rate, sample_width)
736
- curr_overlap = ensure_stereo(curr_overlap, processing_sample_rate, sample_width)
737
- # Calculate samples using torchaudio for precision
738
  prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
739
  curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
740
  num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
@@ -744,27 +754,21 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
744
  logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
745
  final_segment += current_segment
746
  continue
747
- blended_samples = np.zeros((num_samples, 2), dtype=np.float32)
748
- prev_samples = prev_audio[:, :num_samples].numpy().T
749
- curr_samples = curr_audio[:, :num_samples].numpy().T
750
- hann_window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(num_samples) / num_samples))
751
- fade_out = hann_window[::-1]
752
  fade_in = hann_window
753
- blended_samples = (prev_samples * fade_out[:, None] + curr_samples * fade_in[:, None])
754
- # Ensure byte length is multiple of sample_width * channels
755
- blended_samples = blended_samples.astype(np.int32 if sample_width == 3 else np.int16)
756
- byte_data = blended_samples.tobytes()
757
- byte_length = len(byte_data)
758
- expected_length = byte_length - (byte_length % (sample_width * channels))
759
- if byte_length != expected_length:
760
- logger.debug(f"Truncating blended samples from {byte_length} to {expected_length} bytes")
761
- byte_data = byte_data[:expected_length]
762
- blended_segment = AudioSegment(
763
- byte_data,
764
- frame_rate=processing_sample_rate,
765
- sample_width=sample_width,
766
- channels=2
767
- )
768
  blended_segment = rms_normalize(blended_segment, target_rms_db=target_volume, peak_limit_db=-3.0, sample_rate=processing_sample_rate)
769
  final_segment = final_segment[:-overlap_ms] + blended_segment + current_segment[overlap_ms:]
770
  else:
@@ -822,7 +826,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
822
  # Clear inputs function
823
  def clear_inputs():
824
  logger.info("Clearing input fields")
825
- return "", 1.8, 120, 0.9, 0.8, 30, 120, "none", "none", "none", "none", "none", -23.0, "default", 1300, "96k", "32000", "16"
826
 
827
  # Custom CSS
828
  css = """
@@ -1024,7 +1028,7 @@ with gr.Blocks(css=css) as demo:
1024
  )
1025
  preset = gr.Dropdown(
1026
  label="Preset Configuration 🎛️",
1027
- choices=["default", "rock", "techno", "grunge", "indie"],
1028
  value="default",
1029
  info="Select a preset optimized for specific genres."
1030
  )
@@ -1034,6 +1038,14 @@ with gr.Blocks(css=css) as demo:
1034
  value=1300,
1035
  info="Number of generation steps per chunk (1000=~20s, 1500=~30s)."
1036
  )
 
 
 
 
 
 
 
 
1037
  bitrate_state = gr.State(value="96k") # Default bitrate
1038
  sample_rate_state = gr.State(value="32000") # Default output sampling rate
1039
  bit_depth_state = gr.State(value="16") # Default bit depth
@@ -1088,13 +1100,13 @@ with gr.Blocks(css=css) as demo:
1088
  bit_depth_24_btn.click(set_bit_depth_24, inputs=None, outputs=bit_depth_state)
1089
  gen_btn.click(
1090
  generate_music,
1091
- inputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, vram_status, bitrate_state, sample_rate_state, bit_depth_state],
1092
  outputs=[out_audio, status, vram_status]
1093
  )
1094
  clr_btn.click(
1095
  clear_inputs,
1096
  inputs=None,
1097
- outputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, bitrate_state, sample_rate_state, bit_depth_state]
1098
  )
1099
  log_btn.click(
1100
  get_latest_log,
 
19
  import mmap
20
  import subprocess
21
  import re
22
+ import io
23
 
24
  # Suppress warnings for cleaner output
25
  warnings.filterwarnings("ignore")
 
150
  stereo_samples = stereo_samples * mask
151
  left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
152
  right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
153
+ left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
154
  right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
155
  if left_rms > 0 and right_rms > 0:
156
  avg_rms = (left_rms + right_rms) / 2
157
  stereo_samples[:, 0] = stereo_samples[:, 0] * (avg_rms / left_rms)
158
  stereo_samples[:, 1] = stereo_samples[:, 1] * (avg_rms / right_rms)
159
  balanced_samples = stereo_samples.flatten().astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
160
+ # Ensure sample length is even for stereo
161
+ if len(balanced_samples) % 2 != 0:
162
+ balanced_samples = balanced_samples[:-1]
163
  balanced_segment = AudioSegment(
164
  balanced_samples.tobytes(),
165
  frame_rate=sample_rate,
 
210
  limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
211
  samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
212
  samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
213
+ # Ensure sample length is even for stereo
214
+ if len(samples) % 2 != 0:
215
+ samples = samples[:-1]
216
  limited_segment = AudioSegment(
217
  samples.tobytes(),
218
  frame_rate=sample_rate,
 
255
  # Genre prompt functions
256
  def set_red_hot_chili_peppers_prompt(bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style):
257
  try:
258
+ rhythm = f" with {rhythmic_steps}" if rhythmic_steps != "none" else ("syncopated funk rhythms" if bpm > 120 else "groovy funk flow")
259
+ drum = f", {drum_beat} drums" if drum_beat != "none" else ", tight funk drums with punchy snares"
260
  synth = f", {synthesizer} accents" if synthesizer != "none" else ""
261
+ bass = f", {bass_style}" if bass_style != "none" else ", prominent slap bass with funky grooves"
262
+ guitar = f", {guitar_style} guitar riffs" if guitar_style != "none" else ", syncopated funk guitar riffs with clean and distorted tones"
263
+ prompt = (
264
+ f"Instrumental funk rock{bass}{guitar}{drum}{synth}, Red Hot Chili Peppers-inspired vibe with high-energy slap bass, "
265
+ f"syncopated guitar riffs, dynamic breakdowns, and a raw, funky edge, {rhythm} at {bpm} BPM."
266
+ )
267
  logger.debug(f"Generated RHCP prompt: {prompt}")
268
  return prompt
269
  except Exception as e:
 
478
  "rock": {"cfg_scale": 2.0, "top_k": 110, "top_p": 0.9, "temperature": 0.9},
479
  "techno": {"cfg_scale": 1.5, "top_k": 130, "top_p": 0.85, "temperature": 0.7},
480
  "grunge": {"cfg_scale": 1.8, "top_k": 120, "top_p": 0.9, "temperature": 0.85},
481
+ "indie": {"cfg_scale": 1.9, "top_k": 115, "top_p": 0.9, "temperature": 0.8},
482
+ "funk_rock": {"cfg_scale": 2.2, "top_k": 150, "top_p": 0.95, "temperature": 1.0} # Enhanced for RHCP
483
  }
484
 
485
  # Function to get the latest log file
 
534
  return "24"
535
 
536
  # Optimized generation function
537
+ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, bpm: int, drum_beat: str, synthesizer: str, rhythmic_steps: str, bass_style: str, guitar_style: str, target_volume: float, preset: str, max_steps: str, vram_status: str, bitrate: str, output_sample_rate: str, bit_depth: str, seed: int):
538
  global musicgen_model
539
  if not instrumental_prompt.strip():
540
  logger.warning("Empty instrumental prompt provided")
 
561
  except ValueError:
562
  logger.error(f"Invalid bit_depth value: {bit_depth}")
563
  return None, "❌ Invalid bit depth; must be 16 or 24", vram_status
564
+ # Validate seed
565
+ if not (0 <= seed <= 10000):
566
+ logger.error(f"Invalid seed value: {seed}. Must be between 0 and 10000.")
567
+ return None, "❌ Invalid seed value; must be between 0 and 10000", vram_status
568
  max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
569
  total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
570
  processing_sample_rate = 16000 # Fixed for processing
 
585
  logger.error("Insufficient disk space")
586
  return None, "⚠️ Insufficient disk space. Free up at least 1 GB.", vram_status
587
 
 
 
588
  logger.info(f"Generating audio for {total_duration}s with seed={seed}, max_steps={max_steps_int}, output_sample_rate={output_sample_rate_int} Hz, bit_depth={bit_depth_int}-bit")
589
  base_prompt = instrumental_prompt
590
  clean_memory()
 
744
  logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
745
  prev_overlap = final_segment[-overlap_ms:]
746
  curr_overlap = current_segment[:overlap_ms]
747
+ # Use torchaudio for precise crossfading
 
 
 
748
  prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
749
  curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
750
  num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
 
754
  logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
755
  final_segment += current_segment
756
  continue
757
+ blended_samples = torch.zeros(2, num_samples, dtype=torch.float32)
758
+ prev_samples = prev_audio[:, :num_samples]
759
+ curr_samples = curr_audio[:, :num_samples]
760
+ hann_window = torch.hann_window(num_samples, periodic=False)
761
+ fade_out = hann_window.flip(0)
762
  fade_in = hann_window
763
+ blended_samples = (prev_samples * fade_out + curr_samples * fade_in)
764
+ # Convert to appropriate dtype for bit depth
765
+ blended_samples = (blended_samples * (2**23 if sample_width == 3 else 32767)).to(torch.int32 if sample_width == 3 else torch.int16)
766
+ # Save to temporary WAV to create AudioSegment
767
+ temp_crossfade_path = f"temp_crossfade_{int(time.time()*1000)}.wav"
768
+ torchaudio.save(temp_crossfade_path, blended_samples, processing_sample_rate, bits_per_sample=bit_depth_int)
769
+ blended_segment = AudioSegment.from_wav(temp_crossfade_path)
770
+ os.remove(temp_crossfade_path)
771
+ blended_segment = ensure_stereo(blended_segment, processing_sample_rate, sample_width)
 
 
 
 
 
 
772
  blended_segment = rms_normalize(blended_segment, target_rms_db=target_volume, peak_limit_db=-3.0, sample_rate=processing_sample_rate)
773
  final_segment = final_segment[:-overlap_ms] + blended_segment + current_segment[overlap_ms:]
774
  else:
 
826
  # Clear inputs function
827
  def clear_inputs():
828
  logger.info("Clearing input fields")
829
+ return "", 1.8, 120, 0.9, 0.8, 30, 120, "none", "none", "none", "none", "none", -23.0, "default", 1300, "96k", "32000", "16", 0
830
 
831
  # Custom CSS
832
  css = """
 
1028
  )
1029
  preset = gr.Dropdown(
1030
  label="Preset Configuration 🎛️",
1031
+ choices=["default", "rock", "techno", "grunge", "indie", "funk_rock"],
1032
  value="default",
1033
  info="Select a preset optimized for specific genres."
1034
  )
 
1038
  value=1300,
1039
  info="Number of generation steps per chunk (1000=~20s, 1500=~30s)."
1040
  )
1041
+ seed = gr.Slider(
1042
+ label="Random Seed 🌱",
1043
+ minimum=0,
1044
+ maximum=10000,
1045
+ value=0,
1046
+ step=1,
1047
+ info="Set a seed for reproducibility (0-10000). Change for different variations."
1048
+ )
1049
  bitrate_state = gr.State(value="96k") # Default bitrate
1050
  sample_rate_state = gr.State(value="32000") # Default output sampling rate
1051
  bit_depth_state = gr.State(value="16") # Default bit depth
 
1100
  bit_depth_24_btn.click(set_bit_depth_24, inputs=None, outputs=bit_depth_state)
1101
  gen_btn.click(
1102
  generate_music,
1103
+ inputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, vram_status, bitrate_state, sample_rate_state, bit_depth_state, seed],
1104
  outputs=[out_audio, status, vram_status]
1105
  )
1106
  clr_btn.click(
1107
  clear_inputs,
1108
  inputs=None,
1109
+ outputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, bitrate_state, sample_rate_state, bit_depth_state, seed]
1110
  )
1111
  log_btn.click(
1112
  get_latest_log,