roman committed on
Commit
186a3b7
·
1 Parent(s): d242d3a

try new approach

Browse files
Files changed (1) hide show
  1. app.py +24 -11
app.py CHANGED
@@ -1,21 +1,25 @@
1
  import streamlit as st
2
- from transformers import pipeline
3
  import tempfile
4
  from pydub import AudioSegment
5
  import numpy as np
 
6
 
7
- # Load the ASR pipeline
 
8
  @st.cache_resource
9
- def load_asr_pipeline():
10
- asr_pipeline = pipeline("automatic-speech-recognition", model="Yehor/whisper-small-ukrainian")
11
- return asr_pipeline
 
 
12
 
13
  st.title("Voice Recognition App using Whisper")
14
 
15
  st.write("Upload an audio file and the Whisper model will transcribe it to text.")
16
 
17
- # Load the ASR pipeline
18
- asr_pipeline = load_asr_pipeline()
19
  st.write("Model loaded successfully.")
20
 
21
  # File uploader for audio file
@@ -40,9 +44,18 @@ if uploaded_file is not None:
40
  audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
41
  audio_input = np.array(audio_input.get_array_of_samples(), dtype=np.float32)
42
 
43
- # Perform transcription
44
- result = asr_pipeline(audio_input)
 
 
 
 
 
 
 
 
 
 
45
 
46
- # Display transcription
47
  st.write("Transcription:")
48
- st.write(result['text'])
 
1
  import streamlit as st
2
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
3
  import tempfile
4
  from pydub import AudioSegment
5
  import numpy as np
6
+ import torch
7
 
8
+
9
+ # Load the processor and model
10
  @st.cache_resource
11
+ def load_model_and_processor():
12
+ processor = AutoProcessor.from_pretrained("arampacha/whisper-large-uk-2")
13
+ model = AutoModelForSpeechSeq2Seq.from_pretrained("arampacha/whisper-large-uk-2")
14
+ return processor, model
15
+
16
 
17
  st.title("Voice Recognition App using Whisper")
18
 
19
  st.write("Upload an audio file and the Whisper model will transcribe it to text.")
20
 
21
+ # Load the processor and model
22
+ processor, model = load_model_and_processor()
23
  st.write("Model loaded successfully.")
24
 
25
  # File uploader for audio file
 
44
  audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
45
  audio_input = np.array(audio_input.get_array_of_samples(), dtype=np.float32)
46
 
47
+ # Normalize audio
48
+ audio_input = (audio_input - np.mean(audio_input)) / np.std(audio_input)
49
+ audio_input = torch.tensor(audio_input).unsqueeze(0)
50
+
51
+ # Process the audio
52
+ input_features = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
53
+
54
+ # Generate transcription
55
+ with torch.no_grad():
56
+ predicted_ids = model.generate(input_features)
57
+
58
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
59
 
 
60
  st.write("Transcription:")
61
+ st.write(transcription)