import faster_whisper
import requests
import tempfile
import os

# Load the faster-whisper model that supports Hebrew
model = faster_whisper.WhisperModel("ivrit-ai/faster-whisper-v2-d4")

# URL of the audio file (replace this with the actual URL of your audio).
# Note: a GitHub "blob" URL returns an HTML page; appending "?raw=true"
# redirects to the raw audio file.
audio_url = "https://github.com/AshDavid12/runpod-serverless-forked/blob/main/me-hebrew.wav?raw=true"

# Download the audio file from the URL
response = requests.get(audio_url)
if response.status_code != 200:
    raise Exception("Failed to download audio file")

# Create a temporary file to store the audio
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_file:
    tmp_audio_file.write(response.content)
    tmp_audio_file_path = tmp_audio_file.name

# Perform the transcription
segments, info = model.transcribe(tmp_audio_file_path, language="he")

# Print transcription results
for segment in segments:
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")

# Clean up the temporary file
os.remove(tmp_audio_file_path)


# import torch
# from transformers import WhisperProcessor, WhisperForConditionalGeneration
# import requests
# import soundfile as sf
# import io
#
# # Load the Whisper model and processor from the Hugging Face Model Hub
# model_name = "openai/whisper-base"
# processor = WhisperProcessor.from_pretrained(model_name)
# model = WhisperForConditionalGeneration.from_pretrained(model_name)
#
# # Use GPU if available, otherwise use CPU
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)
#
# # URL of the audio file
# audio_url = "https://www.signalogic.com/melp/EngSamples/Orig/male.wav"
#
# # Download the audio file
# response = requests.get(audio_url)
# audio_data = io.BytesIO(response.content)
#
# # Read the audio using soundfile
# audio_input, sampling_rate = sf.read(audio_data)
#
# # Whisper expects mono 16 kHz audio: downmix stereo, and bail out if the
# # file is at a different rate (resample it beforehand in that case)
# if audio_input.ndim > 1:
#     audio_input = audio_input.mean(axis=1)
# if sampling_rate != 16000:
#     raise ValueError("Whisper expects 16 kHz audio; resample before preprocessing")
#
# # Preprocess the audio for Whisper
# inputs = processor(audio_input, return_tensors="pt", sampling_rate=16000)
# input_features = inputs["input_features"].to(device)
#
# # Generate the transcription (Whisper's log-mel features are padded to a
# # fixed length, so no attention mask is needed here)
# with torch.no_grad():
#     predicted_ids = model.generate(input_features)
#
# # Decode the transcription
# transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
#
# # Print the transcription result
# print("Transcription:", transcription)
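
# Possible simplification (untested sketch): faster-whisper's transcribe() also
# accepts a file-like object in addition to a path, so the temporary file in the
# active script above could be replaced with an in-memory buffer:
#
# import io
# segments, info = model.transcribe(io.BytesIO(response.content), language="he")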