roman committed · Commit 186a3b7 · Parent(s): d242d3a
try new approach
app.py CHANGED
@@ -1,21 +1,25 @@
 import streamlit as st
-from transformers import …
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import tempfile
 from pydub import AudioSegment
 import numpy as np
+import torch

-…
+
+# Load the processor and model
 @st.cache_resource
-def …
-…
-…
+def load_model_and_processor():
+    processor = AutoProcessor.from_pretrained("arampacha/whisper-large-uk-2")
+    model = AutoModelForSpeechSeq2Seq.from_pretrained("arampacha/whisper-large-uk-2")
+    return processor, model
+

 st.title("Voice Recognition App using Whisper")

 st.write("Upload an audio file and the Whisper model will transcribe it to text.")

-# Load the …
-…
+# Load the processor and model
+processor, model = load_model_and_processor()
 st.write("Model loaded successfully.")

 # File uploader for audio file
@@ -40,9 +44,18 @@ if uploaded_file is not None:
     audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
     audio_input = np.array(audio_input.get_array_of_samples(), dtype=np.float32)

-    # …
-    …
+    # Normalize audio
+    audio_input = (audio_input - np.mean(audio_input)) / np.std(audio_input)
+    audio_input = torch.tensor(audio_input).unsqueeze(0)
+
+    # Process the audio
+    input_features = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
+
+    # Generate transcription
+    with torch.no_grad():
+        predicted_ids = model.generate(input_features)
+
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

-    # Display transcription
     st.write("Transcription:")
-    st.write(…
+    st.write(transcription)
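For reference, below is a minimal standalone sketch of the same transcription flow without the Streamlit UI. The checkpoint name comes from the diff above; the file path "example.wav" and the int16-to-[-1, 1] scaling are illustrative assumptions, not part of the commit.

import numpy as np
import torch
from pydub import AudioSegment
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Load the same checkpoint as the app (name taken from the diff above).
processor = AutoProcessor.from_pretrained("arampacha/whisper-large-uk-2")
model = AutoModelForSpeechSeq2Seq.from_pretrained("arampacha/whisper-large-uk-2")

# Decode to 16 kHz mono as the app does, then convert to a 1-D float array.
# "example.wav" is a placeholder path; the [-1, 1] scaling assumes 16-bit samples.
audio = AudioSegment.from_file("example.wav").set_frame_rate(16000).set_channels(1)
samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0

# Log-mel features -> autoregressive decoding -> text.
inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    predicted_ids = model.generate(inputs.input_features)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

In the app itself, @st.cache_resource keeps the loaded processor and model in memory across Streamlit reruns instead of reloading them on every interaction.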