Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,17 +1,7 @@
-# Loading the ST Model (Whisper)
 import os
 import torch
 from transformers import pipeline

-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-pipe = pipeline("automatic-speech-recognition", model="okezieowen/whisper-small-multilingual-naija-11-03-2024", device=device)
-
-# Take audio and return translated text
-def transcribe(audio):
-    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
-    return outputs["text"]
-
-
 # The LLM Model
 from huggingface_hub import HfFolder
 from openai import OpenAI
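The lines elided between these two hunks construct the client = OpenAI( object referenced in the next hunk header. For context, a minimal sketch of how such a client is typically pointed at a TGI-backed endpoint; the base URL below is a placeholder, and the use of HfFolder.get_token() is an assumption based on the imports above, not something shown in this diff:

from huggingface_hub import HfFolder
from openai import OpenAI

# Sketch only -- the Space's real endpoint URL lives in the elided lines and is not shown here.
client = OpenAI(
    base_url="https://<your-tgi-endpoint>/v1/",  # placeholder URL for a TGI server's OpenAI-compatible API
    api_key=HfFolder.get_token(),                # assumes the locally saved Hugging Face token is used
)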
@@ -29,35 +19,77 @@ client = OpenAI(
 )


+# def generate_llm_response(text, model_id="ccibeekeoc42/Llama3.1-8b-base-SFT-2024-11-09"):
+#     """Generates LLM response for given text with streaming support"""
+#     full_response = []
+
+#     # Create streaming response
+#     chat_completion = client.chat.completions.create(
+#         model="tgi",
+#         messages=[
+#             {"role": "system", "content": "You are a BRIEF AND DIRECT assistant. A part of a speech pipeline so keep your responces short, fluent, and straight to the point. Avoid markdown in responses"},
+#             {"role": "user", "content": text}
+#         ],
+#         top_p=None,
+#         temperature=None,
+#         max_tokens=75,
+#         stream=True,
+#         seed=None,
+#         stop=None,
+#         frequency_penalty=None,
+#         presence_penalty=None
+#     )
+#     # Collect streamed response chunks
+#     for chunk in chat_completion:
+#         if chunk.choices[0].delta.content:
+#             full_response.append(chunk.choices[0].delta.content)
+
+#     return "".join(full_response)
+
+import openai
+
 def generate_llm_response(text, model_id="ccibeekeoc42/Llama3.1-8b-base-SFT-2024-11-09"):
-    """Generates LLM response for given text with streaming support"""
+    """Generates LLM response for given text with streaming support, handling GPU cold-start errors."""
     full_response = []
+    try:
+        chat_completion = client.chat.completions.create(
+            model="tgi",
+            messages=[
+                {"role": "system", "content": "You are a BRIEF AND DIRECT assistant. A part of a speech pipeline so keep your responses short, fluent, and straight to the point. Avoid markdown in responses."},
+                {"role": "user", "content": text}
+            ],
+            top_p=None,
+            temperature=None,
+            max_tokens=75,
+            stream=True,
+            seed=None,
+            stop=None,
+            frequency_penalty=None,
+            presence_penalty=None
+        )
+        # Collect streamed response chunks
+        for chunk in chat_completion:
+            if chunk.choices[0].delta.content:
+                full_response.append(chunk.choices[0].delta.content)
+        return "".join(full_response)
+    except openai.InternalServerError:
+        # In openai>=1.0 this exception lives at the top level; openai.error no longer exists.
+        # If the error is due to the GPU scaling down, inform the user accordingly.
+        return "The GPU is currently booting up. Please wait about 10 minutes and try again."

-    # Create streaming response
-    chat_completion = client.chat.completions.create(
-        model="tgi",
-        messages=[
-            {"role": "system", "content": "You are a BRIEF AND DIRECT assistant. A part of a speech pipeline so keep your responces short, fluent, and straight to the point. Avoid markdown in responses"},
-            {"role": "user", "content": text}
-        ],
-        top_p=None,
-        temperature=None,
-        max_tokens=75,
-        stream=True,
-        seed=None,
-        stop=None,
-        frequency_penalty=None,
-        presence_penalty=None
-    )
-    # Collect streamed response chunks
-    for chunk in chat_completion:
-        if chunk.choices[0].delta.content:
-            full_response.append(chunk.choices[0].delta.content)
-
-    return "".join(full_response)

 generate_llm_response("Explain Deep Learning in Igbo")

+
+# Loading the ST Model (Whisper)
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+pipe = pipeline("automatic-speech-recognition", model="okezieowen/whisper-small-multilingual-naija-11-03-2024", device=device)
+
+# Take audio and return translated text
+def transcribe(audio):
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
+    return outputs["text"]
+
+
 # Helper Functions to Cleanup LLM Texts
 # Replacement rules
 import re
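With this change, app.py defines the LLM helper first and loads the Whisper pipeline afterwards, so a minimal end-to-end check of the speech pipeline could look like the sketch below. It assumes the transcribe, generate_llm_response, and client objects defined above; the audio path is hypothetical.

# Hypothetical smoke test of the two stages (sample.wav is a placeholder file).
text = transcribe("sample.wav")        # Whisper ASR: audio -> transcribed text
reply = generate_llm_response(text)    # TGI endpoint: text -> short LLM answer
print(text)
print(reply)

Since the except branch returns a fixed message while the endpoint's GPU scales up from zero, a caller could also choose to wait and retry the request instead of surfacing that message directly.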