"""Gradio front end for the Gemini 2.5 Flash Preview TTS API ("Kore" voice).

The app sends the user's text to the Gemini ``generateContent`` endpoint,
wraps the returned audio in a WAV container and hands the file to a Gradio
audio player.
"""

import base64
import json
import os
import wave

import gradio as gr
import requests

# Load API key from Hugging Face secret
API_KEY = os.getenv("GEMINI_API_KEY")

API_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    "gemini-2.5-flash-preview-tts:generateContent"
)

headers = {
    "Content-Type": "application/json",
    "x-goog-api-key": API_KEY,
}

# File the synthesized speech is written to (overwritten on every request).
OUTPUT_PATH = "output.wav"

# Prefix shared by every error string returned from generate_audio().
ERROR_PREFIX = "API Error:"

# (connect, read) timeouts in seconds so a stalled request cannot hang the app.
REQUEST_TIMEOUT = (10, 120)

# Format of the raw PCM the TTS endpoint is documented to return
# (mimeType "audio/L16;codec=pcm;rate=24000"): 16-bit mono at 24 kHz.
PCM_CHANNELS = 1
PCM_SAMPLE_WIDTH_BYTES = 2
DEFAULT_SAMPLE_RATE = 24000


def _sample_rate_from_mime(mime_type):
    """Return the ``rate=`` parameter of *mime_type*, or the default rate."""
    for param in (mime_type or "").split(";")[1:]:
        key, _, value = param.strip().partition("=")
        if key.lower() == "rate" and value.isdigit():
            return int(value)
    return DEFAULT_SAMPLE_RATE


def _write_wav(audio_bytes, sample_rate, file_path):
    """Write *audio_bytes* to *file_path* as a playable WAV file."""
    if audio_bytes[:4] == b"RIFF":
        # Already a complete WAV container: store it untouched.
        with open(file_path, "wb") as f:
            f.write(audio_bytes)
        return
    # Raw PCM has no header; the wave module adds the RIFF/WAVE header that
    # audio players need.
    with wave.open(file_path, "wb") as wav_file:
        wav_file.setnchannels(PCM_CHANNELS)
        wav_file.setsampwidth(PCM_SAMPLE_WIDTH_BYTES)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(audio_bytes)


def generate_audio(text):
    """Synthesize *text* with Gemini TTS and save it as a WAV file.

    Args:
        text: The sentence(s) to speak.

    Returns:
        The path of the written WAV file on success, otherwise a
        human-readable message starting with ``"API Error:"``.
    """
    if not API_KEY:
        return f"{ERROR_PREFIX} GEMINI_API_KEY is not set"
    if not text or not text.strip():
        return f"{ERROR_PREFIX} no text provided"

    payload = {
        "contents": [{
            "parts": [{
                "text": f"Say cheerfully: {text}"
            }]
        }],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {
                        "voiceName": "Kore"
                    }
                }
            }
        }
    }

    try:
        response = requests.post(
            API_URL,
            headers=headers,
            data=json.dumps(payload),
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        return f"{ERROR_PREFIX} {exc}"

    if response.status_code != 200:
        return f"API Error: {response.text}"

    try:
        res_json = response.json()
        inline_data = res_json["candidates"][0]["content"]["parts"][0]["inlineData"]
        audio_bytes = base64.b64decode(inline_data["data"])
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        # ValueError also covers invalid JSON and invalid base64 payloads.
        return f"{ERROR_PREFIX} unexpected response format ({exc!r})"

    file_path = OUTPUT_PATH
    _write_wav(
        audio_bytes,
        _sample_rate_from_mime(inline_data.get("mimeType")),
        file_path,
    )
    return file_path


# Gradio Interface
def speak_gradio(text):
    """Gradio callback: return the WAV path for *text* or show the error.

    Raises:
        gr.Error: If synthesis failed, so the message is displayed in the UI
            instead of the output silently staying empty.
    """
    result = generate_audio(text)
    if result.startswith(ERROR_PREFIX):
        raise gr.Error(result)
    return result


iface = gr.Interface(
    fn=speak_gradio,
    inputs=gr.Textbox(label="Enter text to speak", placeholder="Say something cheerful..."),
    outputs=gr.Audio(label="Gemini TTS Output", type="filepath"),
    title="Gemini TTS (Kore Voice)",
    description="Powered by Gemini 2.5 Flash Preview TTS API. Cheerfully speaks your input!"
)

if __name__ == "__main__":
    iface.launch()