"""Gradio app: read a PDF, optionally translate it, and narrate it with Parler-TTS.

Pipeline: PDF -> plain text -> (optional MarianMT translation) -> sentence
split -> one Parler-TTS generation per sentence -> a single audio player that
grows as sentences are synthesised.
"""

# NOTE(review): `spaces` is kept as the very first import, as in the original
# file. The ZeroGPU runtime presumably needs to be imported before anything
# touches CUDA -- confirm before reordering.
import spaces

import re
import textwrap

import gradio as gr
import numpy as np
import soundfile as SF
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from PyPDF2 import PdfReader
from transformers import AutoFeatureExtractor, AutoTokenizer, set_seed
from transformers import MarianMTModel, MarianTokenizer

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models and tokenizers
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
    "parler-tts/parler-tts-large-v1"
).to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
# NOTE(review): the feature extractor comes from the *mini* checkpoint while the
# model is *large*. It is only used for its sampling rate here; confirm both
# checkpoints share the same rate (or load it from the large checkpoint).
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")

SAMPLE_RATE = feature_extractor.sampling_rate
SEED = 42

# Compiled once: characters treated as sentence terminators.
_SENTENCE_ENDINGS = re.compile(r'[.!?]')


def pdf_to_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Pages are joined with a newline so the last word of one page is not glued
    to the first word of the next. Pages for which the reader yields no text
    contribute an empty string instead of raising.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        # `extract_text()` may yield nothing for image-only pages; `or ""`
        # keeps the join safe in that case.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)


def split_text_into_sentences(text):
    """Split ``text`` on ``.``, ``!`` and ``?`` and return the non-empty pieces.

    The terminators themselves are dropped and each piece is stripped of
    surrounding whitespace.
    """
    pieces = (piece.strip() for piece in _SENTENCE_ENDINGS.split(text))
    return [piece for piece in pieces if piece]


@spaces.GPU(duration=120)
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate ``source_text`` with the matching Helsinki-NLP opus-mt model.

    The text is wrapped into chunks of at most 512 characters, translated in
    batches of ``batch_size`` chunks, and returned as one string in which every
    translated chunk is followed by a single space.
    """
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    text_chunks = textwrap.wrap(source_text, 512)
    translated_chunks = []
    for start in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[start:start + batch_size]
        encoded = tokenizer(
            text_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # Pass the whole encoding (input_ids AND attention_mask): the batch is
        # padded, and without the mask the model would attend to pad tokens.
        output_ids = model.generate(**encoded, max_new_tokens=512)
        translated_chunks.extend(
            tokenizer.decode(output, skip_special_tokens=True) for output in output_ids
        )

    # Same output format as before: every chunk followed by one space.
    return "".join(chunk + " " for chunk in translated_chunks)


def preprocess(text):
    """Normalise a sentence for TTS: hyphens become spaces, and a final ``.``
    is appended unless the text already ends in ``.``, ``!`` or ``?``.

    An empty string is returned unchanged.
    """
    text = text.replace("-", " ")
    if not text:
        # Guard: indexing text[-1] on an empty string would raise IndexError.
        return text
    if text[-1] not in ".!?":
        text += "."
    return text


@spaces.GPU(duration=120)
def generate_single_wav_from_text(sentence, description):
    """Synthesise ``sentence`` in the voice described by ``description``.

    Returns ``(SAMPLE_RATE, audio_arr)`` where ``audio_arr`` is the squeezed
    NumPy array produced by the TTS model.
    """
    # Re-seed on every call so sampling is reproducible per sentence.
    set_seed(SEED)

    inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt = tts_tokenizer(preprocess(sentence), return_tensors="pt").to(device)

    generation = tts_model.generate(
        input_ids=inputs.input_ids,
        prompt_input_ids=prompt.input_ids,
        attention_mask=inputs.attention_mask,
        prompt_attention_mask=prompt.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    audio_arr = generation.cpu().numpy().squeeze()
    return SAMPLE_RATE, audio_arr


def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
    """Run the PDF -> text -> (translation) -> speech pipeline as a generator.

    After each synthesised sentence this yields ``(all_audio_data, all_text)``:
    the cumulative list of ``(sample_rate, audio_arr)`` tuples produced so far
    and the cumulative markdown listing of the sentences.

    Raises ``gr.Error`` when no file was uploaded or the PDF yields no text.
    """
    if pdf_input is None:
        raise gr.Error("Please upload a PDF file.")

    # The upload may arrive as an object exposing `.name` or as a plain path
    # string depending on the Gradio version; accept both.
    pdf_path = getattr(pdf_input, "name", pdf_input)
    text = pdf_to_text(pdf_path)

    # Translating a language into itself has no matching opus-mt checkpoint,
    # so skip the (GPU-allocating) call in that case.
    if translate_checkbox and source_lang != target_lang:
        text = translate(text, source_lang, target_lang)

    sentences = split_text_into_sentences(text)
    if not sentences:
        raise gr.Error("No text could be extracted from the PDF.")

    all_audio_data = []
    all_text = ""
    for sentence in sentences:
        sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
        all_audio_data.append((sample_rate, audio_arr))
        all_text += f"**Sentence**: {sentence}\n\n"
        yield all_audio_data, all_text


def run_pipeline(pdf_input, translate_checkbox, source_lang, target_lang, description):
    """Gradio click handler: stream the growing audio and the sentence list.

    Each step yields ``((sample_rate, waveform), markdown_text)`` where
    ``waveform`` is every clip generated so far, concatenated in order.
    """
    progress = handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description)
    for audio_data_list, markdown_text in progress:
        # `audio_data_list` is cumulative, so rebuild the waveform from it
        # rather than appending again (which would duplicate earlier clips).
        waveform = np.concatenate([
            np.atleast_1d(np.asarray(audio_arr, dtype=np.float32))
            for _, audio_arr in audio_data_list
        ])
        sample_rate = audio_data_list[-1][0]
        # gr.Audio expects (sample_rate, data) -- in that order.
        yield (sample_rate, waveform), markdown_text


# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # File-extension filters need the leading dot.
            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'])
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            source_lang = gr.Dropdown(
                choices=["en", "tr", "de", "fr"],
                label="Source Language",
                value="en",
                interactive=True,
            )
            target_lang = gr.Dropdown(
                choices=["tr"],
                label="Target Language",
                value="tr",
                interactive=True,
            )
            description = gr.Textbox(
                label="Voice Description",
                lines=2,
                value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.",
            )
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            # A single player fed with the concatenated clips replaces the
            # former attempt to add components to a Column at runtime.
            audio_output = gr.Audio(label="Generated Audio", interactive=False)
            markdown_output = gr.Markdown()

    run_button.click(
        run_pipeline,
        inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description],
        outputs=[audio_output, markdown_output],
    )

demo.queue()
demo.launch(share=True)