roman committed · Commit 186a3b7 · Parent(s): d242d3a
try new approach
app.py CHANGED
@@ -1,21 +1,25 @@
 import streamlit as st
-from transformers import …
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import tempfile
 from pydub import AudioSegment
 import numpy as np
+import torch

-…
+
+# Load the processor and model
 @st.cache_resource
-def …
-…
-…
+def load_model_and_processor():
+    processor = AutoProcessor.from_pretrained("arampacha/whisper-large-uk-2")
+    model = AutoModelForSpeechSeq2Seq.from_pretrained("arampacha/whisper-large-uk-2")
+    return processor, model
+

 st.title("Voice Recognition App using Whisper")

 st.write("Upload an audio file and the Whisper model will transcribe it to text.")

-# Load the …
-…
+# Load the processor and model
+processor, model = load_model_and_processor()
 st.write("Model loaded successfully.")

 # File uploader for audio file
@@ -40,9 +44,18 @@ if uploaded_file is not None:
     audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
     audio_input = np.array(audio_input.get_array_of_samples(), dtype=np.float32)

-    # …
-    …
+    # Normalize audio
+    audio_input = (audio_input - np.mean(audio_input)) / np.std(audio_input)
+    audio_input = torch.tensor(audio_input).unsqueeze(0)
+
+    # Process the audio
+    input_features = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
+
+    # Generate transcription
+    with torch.no_grad():
+        predicted_ids = model.generate(input_features)
+
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

-    # Display transcription
     st.write("Transcription:")
-    st.write(…
+    st.write(transcription)
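For reference, below is a minimal standalone sketch of the same transcription flow without the Streamlit UI. The checkpoint name comes from the diff above; the file path "example.wav" and the int16-to-[-1, 1] scaling are illustrative assumptions, not part of the commit.

import numpy as np
import torch
from pydub import AudioSegment
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Load the same checkpoint as the app (name taken from the diff above).
processor = AutoProcessor.from_pretrained("arampacha/whisper-large-uk-2")
model = AutoModelForSpeechSeq2Seq.from_pretrained("arampacha/whisper-large-uk-2")

# Decode to 16 kHz mono as the app does, then convert to a 1-D float array.
# "example.wav" is a placeholder path; the [-1, 1] scaling assumes 16-bit samples.
audio = AudioSegment.from_file("example.wav").set_frame_rate(16000).set_channels(1)
samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0

# Log-mel features -> autoregressive decoding -> text.
inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    predicted_ids = model.generate(inputs.input_features)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

In the app itself, @st.cache_resource keeps the loaded processor and model in memory across Streamlit reruns instead of reloading them on every interaction.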