import os
import torch
import gradio as gr
from datetime import datetime
from vinorm import TTSnorm
from underthesea import sent_tokenize
from unidecode import unidecode
import soundfile as sf
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from huggingface_hub import snapshot_download
import os

# Download the model snapshot unless every file that load_model() reads is
# already on disk. Checking model.pth alone would skip the download after an
# interrupted fetch that left config.json or vocab.json missing.
_REQUIRED_MODEL_FILES = ("model.pth", "config.json", "vocab.json")
if not all(
    os.path.isfile(os.path.join("model", required_name))
    for required_name in _REQUIRED_MODEL_FILES
):
    snapshot_download(repo_id="epchannel/EpXTTS", repo_type="model", local_dir="model")
    
# Load the XTTS model (built once per process, then reused)
_loaded_model = None


def load_model():
    """Return the XTTS model, building it on the first call only.

    The model is created from ``model/config.json``, its checkpoint is read
    from ``model/model.pth`` with ``model/vocab.json`` as the vocabulary, and
    it is moved to CUDA when ``torch.cuda.is_available()`` is true.

    The instance is cached at module level so that repeated calls (one per
    synthesis request) do not reload the checkpoint every time.

    Returns:
        The loaded ``Xtts`` model instance; the same object on every call.
    """
    global _loaded_model
    if _loaded_model is None:
        config = XttsConfig()
        config.load_json("model/config.json")
        model = Xtts.init_from_config(config)
        model.load_checkpoint(
            config,
            checkpoint_path="model/model.pth",
            vocab_path="model/vocab.json",
        )
        if torch.cuda.is_available():
            model.cuda()
        # Publish only after a fully successful load so that a failed attempt
        # is retried on the next call instead of caching a broken model.
        _loaded_model = model
    return _loaded_model

# Normalize Vietnamese text
def normalize_vietnamese_text(text):
    """Normalize ``text`` with ``TTSnorm`` and clean it up for synthesis.

    After ``TTSnorm`` runs, the result is post-processed:

    * ``..``, ``!.`` and ``?.`` are reduced to ``.``, ``!`` and ``?``;
    * a space before ``.`` or ``,`` is removed;
    * single and double quotes are dropped;
    * the standalone tokens ``AI`` and ``A.I`` become ``Ây Ai``;
    * ``anh/chị`` becomes ``anh chị``.

    Args:
        text: Input text to normalize.

    Returns:
        The normalized string.
    """
    import re  # function-scope import: the module header does not import re

    cleaned = (
        TTSnorm(text, unknown=False, lower=False, rule=True)
        .replace("..", ".").replace("!.", "!").replace("?.", "?")
        .replace(" .", ".").replace(" ,", ",").replace('"', "")
        .replace("'", "")
    )
    # Word boundaries keep the rewrite away from upper-case words that merely
    # contain these letters (e.g. "HAI", "MAI", "KHAI"); a plain substring
    # replace would corrupt them.
    cleaned = re.sub(r"\b(?:AI|A\.I)\b", "Ây Ai", cleaned)
    return cleaned.replace("anh/chị", "anh chị")

# Build the output file name
def get_file_name(text, max_char=50):
    """Build a filesystem-safe base name ``<timestamp>_<slug>`` from ``text``.

    The slug comes from the first ``max_char`` characters of ``text``,
    transliterated with ``unidecode`` and lower-cased. Every run of
    characters outside ``[a-z0-9_-]`` (spaces, punctuation, path separators,
    newlines, ...) collapses to a single underscore, so user text can never
    inject a directory component or a character the filesystem rejects.

    Args:
        text: Text the name is derived from.
        max_char: Number of leading characters of ``text`` to use.

    Returns:
        The base name without directory or extension. The timestamp prefix
        uses the local time formatted as ``%m%d%H%M%S``.
    """
    import re  # function-scope import: the module header does not import re

    # Lower-case after transliteration so the character class below sees
    # lower-case ASCII whatever unidecode emits.
    ascii_text = unidecode(text[:max_char]).lower()
    slug = re.sub(r"[^a-z0-9_-]+", "_", ascii_text).strip("_")
    timestamp = datetime.now().strftime("%m%d%H%M%S")
    return f"{timestamp}_{slug}"

# Synthesize speech
def synthesize(text, voice_choice):
    """Synthesize speech for ``text`` and write it to an MP3 file.

    The text is normalized (best effort), split into sentences, and each
    non-blank sentence is synthesized with language ``"vi"`` using speaker
    conditioning computed from ``model/samples/<voice_choice>.wav``. The
    per-sentence waveforms are concatenated and written at 24000 Hz.

    Args:
        text: Text to speak.
        voice_choice: Base name (without ``.wav``) of the reference clip
            under ``model/samples``.

    Returns:
        Path of the written file, ``./output/<name>.mp3``.

    Raises:
        gr.Error: If ``text`` is empty or contains no non-blank sentence.
    """
    import logging  # function-scope import: the module header does not import logging

    # Reject blank input before any expensive model work; otherwise
    # torch.cat() on an empty list fails with an opaque RuntimeError.
    if not text or not text.strip():
        raise gr.Error("Vui lòng nhập văn bản cần chuyển thành giọng nói.")

    try:
        text = normalize_vietnamese_text(text)
    except Exception:
        # Normalization is best effort: fall back to the raw text, but leave
        # a trace instead of silently swallowing the failure.
        logging.getLogger(__name__).warning(
            "Text normalization failed; using raw text", exc_info=True
        )

    sentences = [sent for sent in sent_tokenize(text) if sent.strip()]
    if not sentences:
        raise gr.Error("Vui lòng nhập văn bản cần chuyển thành giọng nói.")

    model = load_model()
    ref_audio = f"model/samples/{voice_choice}.wav"

    # Prepare speaker conditioning from the reference clip
    gpt_latent, speaker_embed = model.get_conditioning_latents(
        audio_path=ref_audio,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    wav_chunks = []
    for sent in sentences:
        result = model.inference(
            text=sent,
            language="vi",
            gpt_cond_latent=gpt_latent,
            speaker_embedding=speaker_embed,
            temperature=0.5,
            top_k=20,
            top_p=0.85,
            repetition_penalty=5.0,
        )
        wav_chunks.append(torch.tensor(result["wav"]))

    final_wav = torch.cat(wav_chunks, dim=0)
    os.makedirs("output", exist_ok=True)
    filename = f"./output/{get_file_name(text)}.mp3"
    sf.write(filename, final_wav.numpy(), 24000, format='MP3')
    return filename

# Gradio interface
# Maps the label shown in the UI to the value handed to synthesize() as the
# voice name.
voices = dict(
    [
        ("Bống Xinh", "bongxinh"),
        ("Nam Calm", "nam-calm"),
        ("Nam Cham", "nam-cham"),
        ("Nam Truyền cảm", "nam-truyen-cam"),
        ("Nữ Lưu Loát", "nu-luu-loat"),
        ("Nữ Nhẹ Nhàng", "nu-nhe-nhang"),
        # Add any further voices you have...
    ]
)


with gr.Blocks() as demo:
    gr.Markdown("## 🇻🇳 Text to Speech tiếng Việt (XTTS)")
    with gr.Row():
        text_input = gr.Textbox(label="Nhập văn bản", lines=5, placeholder="Nhập văn bản tiếng Việt...")
    # Take the default from the mapping itself so it is always one of the
    # offered choices; a hand-typed label can drift from the dict key and
    # then fail the voices[...] lookup in process().
    voice_choice = gr.Radio(
        choices=list(voices),
        label="Chọn giọng đọc",
        value=next(iter(voices), None),
    )
    btn = gr.Button("🎙️ Chuyển thành giọng nói")
    audio_output = gr.Audio(label="🔊 Kết quả")

    def process(text, voice_label):
        """Translate the selected label to its voice name and synthesize.

        Returns the path of the audio file produced by ``synthesize``.
        """
        return synthesize(text, voices[voice_label])

    btn.click(fn=process, inputs=[text_input, voice_choice], outputs=audio_output)

demo.launch()