# app.py
# Version: 1.07 (08.24.24), ALPHA
#---------------------------------------------------------------------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#---------------------------------------------------------------------------------------------------------------------------------------------
import time
import os
import re
import warnings
from pydub import AudioSegment
import pandas as pd
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as transforms
from transformers import pipeline
import spacy
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
from fpdf import FPDF
from PIL import Image
title = """# Welcome to 🌟Tonic's✨StarCoder
✨StarCoder StarCoder2-15B model is a 15B parameter model trained on 600+ programming languages from The Stack v2, with opt-out requests excluded. The model uses Grouped Query Attention, a context window of 16,384 tokens with a sliding window attention of 4,096 tokens, and was trained using the Fill-in-the-Middle objective on 4+ trillion tokens. The model was trained with NVIDIA NeMo™ Framework using the NVIDIA Eos Supercomputer built with NVIDIA DGX H100 systems. You can build with this endpoint using✨StarCoder available here : [bigcode/starcoder2-15b](https://huggingface.co/bigcode/starcoder2-15b). You can also use ✨StarCoder by cloning this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/starcoder2?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [SciTonic](https://github.com/Tonic-AI/scitonic)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""
warnings.filterwarnings("ignore")
def convert_to_wav(audio_file):
    # Convert an uploaded .m4a file to a temporary .wav file that the ASR pipeline can read
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large-semantic")
# Decoding options forwarded to the Whisper model at generation time ("no" = Norwegian)
kwargs = {
    "num_beams": 5,
    "language": "no",
    "forced_decoder_ids": None
}
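# Note (not in the original code): on a GPU Space, the ASR pipeline above could be moved to the GPU
# by passing a device to pipeline(), e.g. pipeline(..., device=0). It is left on the default CPU
# device here to match the original call.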
def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    start_time = time.time()
    # Generation settings (num_beams, language, ...) are passed through generate_kwargs
    outputs = asr(audio_file, batch_size=16, return_timestamps=False, generate_kwargs={"task": "transcribe", **kwargs})  # chunk_length_s=30,
    text = outputs["text"]
    end_time = time.time()
    output_time = end_time - start_time
    word_count = len(text.split())
    result = f"Transcription: {text.strip()}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
    return text.strip(), result
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Clean and preprocess text
def clean_text(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
nlp = spacy.blank("nb")  # 'nb' = Norwegian Bokmål
nlp.add_pipe("sentencizer")  # a blank pipeline has no parser, so add sentence boundaries for doc.sents
spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
def preprocess_text(text):
    # Tokenize the text with spaCy
    doc = nlp(text)
    # spaCy's built-in Norwegian stop words
    stop_words = spacy_stop_words
    # Filter out stop words
    words = [token.text for token in doc if token.text.lower() not in stop_words]
    return ' '.join(words)
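# The summarizer below uses `device`, `summarization_tokenizer`, and `summarization_model`, none of
# which are defined elsewhere in this file. The following is a minimal sketch of the missing setup;
# the "t5-base" checkpoint is only a placeholder assumption, and a Norwegian-capable seq2seq
# checkpoint from the Hugging Face Hub would be a better fit for this app.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to(device)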
# Summarize with the T5 model
def summarize_text(text):
    preprocessed_text = preprocess_text(text)
    inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
    return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Build a sentence-similarity graph; edge weights are the number of words shared by each sentence pair
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = nx.Graph()
    for i, tokens_a in enumerate(sentences):
        for j, tokens_b in enumerate(sentences):
            if i != j:
                common_words = set(tokens_a) & set(tokens_b)
                similarity_matrix.add_edge(i, j, weight=len(common_words))
    return similarity_matrix
# "Graph-based summarization" =====> | |
def graph_based_summary(text, num_paragraphs=3): | |
doc = nlp(text) | |
sentences = [sent.text for sent in doc.sents] | |
if len(sentences) < num_paragraphs: | |
return sentences | |
sentence_tokens = [nlp(sent) for sent in sentences] | |
stop_words = spacy_stop_words | |
filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens] | |
similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words) | |
scores = nx.pagerank(similarity_matrix) | |
ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True) | |
return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]]) | |
# LexRank
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    stop_words = spacy_stop_words
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    # Zero out similarities below the threshold before building the graph
    similarity_matrix[similarity_matrix < threshold] = 0
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
# TextRank
def text_rank_summary(text, num_paragraphs=3):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    stop_words = spacy_stop_words
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
# Save the text and/or summary to a PDF
def save_to_pdf(text, summary):
    pdf = FPDF()
    pdf.add_page()
    # Arial is a Latin-1 core font; characters outside Latin-1 would need a Unicode font via add_font
    pdf.set_font("Arial", size=12)
    if text:
        pdf.multi_cell(0, 10, "Text:\n" + text)
        pdf.ln(10)  # paragraph spacing
    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
iface = gr.Blocks()
with iface:
    # gr.Markdown("**Switch Work web app for transcribing audio files to Norwegian text. Language model: NbAiLab/nb-whisper-large. Extras: summarization, PDF download**")
    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/resolve/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto;">')
    gr.Markdown(title)
    with gr.Tabs():
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            text_output = gr.Textbox(label="Transcription")
            result_output = gr.Textbox(label="Details")
            transcribe_button = gr.Button("Transcribe")
            transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
        with gr.TabItem("Summary | Graph-based"):
            summary_output = gr.Textbox(label="Summary | Graph-based")
            summarize_button = gr.Button("Summarize")
            summarize_button.click(fn=lambda text: graph_based_summary(text), inputs=[text_output], outputs=[summary_output])
        with gr.TabItem("Summary | LexRank"):
            summary_output = gr.Textbox(label="Summary | LexRank")
            summarize_button = gr.Button("Summarize")
            summarize_button.click(fn=lambda text: lex_rank_summary(text), inputs=[text_output], outputs=[summary_output])
        with gr.TabItem("Summary | TextRank"):
            summary_output = gr.Textbox(label="Summary | TextRank")
            summarize_button = gr.Button("Summarize")
            summarize_button.click(fn=lambda text: text_rank_summary(text), inputs=[text_output], outputs=[summary_output])
        with gr.TabItem("Download PDF"):
            pdf_text_only = gr.Button("Download PDF with Text Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")
            pdf_output = gr.File(label="Download PDF")
            pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
            # Note: summary_output here refers to the last Textbox defined above (the TextRank tab)
            pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output], outputs=[pdf_output])
            pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output], outputs=[pdf_output])
iface.launch(share=True, debug=True)