from functools import lru_cache
from typing import Literal

from phonikud import lexicon
import unicodedata
import regex as re
import phonikud.syllables
from phonikud.variants import Letter
import phonikud


def sort_diacritics(match):
    letter = match.group(1)
    diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
    return letter + diacritics


NORMALIZE_PATTERNS = {
    # Sort diacritics
    r"(\p{L})(\p{M}+)": sort_diacritics,
    "״": '"',  # Hebrew gershayim to ASCII double quote
    "׳": "'",  # Hebrew geresh to ASCII apostrophe
}


def remove_nikud(text: str, to_keep=""):
    pattern = lexicon.HE_NIKUD_PATTERN
    pattern = "".join(i for i in pattern if i not in to_keep)
    return re.sub(
        pattern,
        "",
        text,
    )


@lru_cache(maxsize=10000)
def normalize(text: str) -> str:
    """
    Normalize unicode (decompose)
    Keep only Hebrew characters / punctuation / IPA
    Sort diacritics
    """

    # Decompose text
    text = unicodedata.normalize("NFD", text)
    for k, v in NORMALIZE_PATTERNS.items():
        text = re.sub(k, v, text)
    for k, v in lexicon.DEDUPLICATE.items():
        text = re.sub(k, v, text)
    return text


def post_normalize(phonemes: str):
    new_phonemes = []
    for word in phonemes.split(" "):
        # remove glottal stop from end
        word = re.sub(r"ʔ$", "", word)
        # remove h from end
        word = re.sub(r"h$", "", word)
        word = re.sub(r"ˈh$", "", word)
        # remove word-final j after i (ij -> i)
        word = re.sub(r"ij$", "i", word)
        new_phonemes.append(word)
    phonemes = " ".join(new_phonemes)
    return phonemes


def post_clean(phonemes: str):
    clean = []
    for i in phonemes:
        if i == "-":
            clean.append(" ")
        elif (
            i in lexicon.SET_PHONEMES
            or i in lexicon.ADDITIONAL_PHONEMES
            or i == " "
            or i in lexicon.PUNCTUATION
        ):
            clean.append(i)
    return "".join(clean)


letters_pattern = re.compile(r"(\p{L})([\p{M}'|]*)")


# @lru_cache(maxsize=10000) TODO?
def get_letters(word: str):
    raw_letters: list[tuple[str, str]] = letters_pattern.findall(word)  # with en_geresh
    letters: list[Letter] = [Letter(char, diac) for char, diac in raw_letters]
    return letters


def get_unicode_names(text: str):
    return [unicodedata.name(c, "?") for c in text]


def has_vowel(s: str) -> bool:
    return any(i in s for i in "aeiou")


def has_constant(s: str) -> bool:
    return any(i not in "aeiou" for i in s)


def get_phoneme_syllables(phonemes: list[str]) -> list[str]:
    syllables = []
    cur_syllable = ""

    i = 0
    while i < len(phonemes):
        # Add current phoneme to the syllable
        cur_syllable += phonemes[i]

        # If we have a vowel in the current syllable
        if has_vowel(cur_syllable):
            # If the next phoneme is a consonant followed by a vowel-containing phoneme
            if (
                i + 2 < len(phonemes)
                and not has_vowel(phonemes[i + 1])
                and has_vowel(phonemes[i + 2])
            ):
                # End the current syllable and start a new one
                syllables.append(cur_syllable)
                cur_syllable = ""
            # If we're at the end or the next phoneme has a vowel
            elif i + 1 >= len(phonemes) or has_vowel(phonemes[i + 1]):
                # End the current syllable
                syllables.append(cur_syllable)
                cur_syllable = ""
        i += 1

    # Add any remaining syllable
    if cur_syllable:
        syllables.append(cur_syllable)

    # Move a stress marker stranded at the end of a syllable to the start of the next one
    for i in range(len(syllables) - 1):  # ensure we're not at the last syllable
        if syllables[i].endswith(lexicon.STRESS_PHONEME):
            syllables[i + 1] = (
                lexicon.STRESS_PHONEME + syllables[i + 1]
            )  # Move stress to next syllable
            syllables[i] = syllables[i][
                : -len(lexicon.STRESS_PHONEME)
            ]  # Remove stress from current syllable

    return syllables

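# Worked example (illustrative; the phonemes below are made up, not library output):
# get_phoneme_syllables(["ʃ", "a", "l", "o", "m"]) builds "ʃa", sees the consonant
# "l" followed by the vowel-bearing "o", so it closes the first syllable and
# continues with "lom", yielding ["ʃa", "lom"]. A trailing lexicon.STRESS_PHONEME
# on a syllable is then moved to the start of the following syllable.
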
def sort_stress(
    phonemes: list[str], placement: Literal["syllable", "vowel"] = "vowel"
) -> list[str]:
    """
    TTS systems expect the stress marker to appear right BEFORE the vowel,
    while the linguistic convention puts it at the START of the syllable.
    Use placement="vowel" (default) for the former and placement="syllable"
    for the latter.
    """
    if "ˈ" not in "".join(phonemes):
        # ^ Does not contain stress
        return phonemes
    if not any(i in "".join(phonemes) for i in "aeiou"):
        # ^ Does not contain a vowel
        return phonemes

    # Remove stress marker
    phonemes = [p for p in phonemes if p != "ˈ"]

    if placement == "syllable":
        return ["ˈ"] + phonemes

    # Define vowels
    vowels = "aeiou"

    # Find the first phoneme that contains a vowel, and inject the stress before the vowel
    for i, phoneme in enumerate(phonemes):
        for j, char in enumerate(phoneme):
            if char in vowels:
                # Insert stress before the vowel
                phonemes[i] = phoneme[:j] + "ˈ" + phoneme[j:]
                return phonemes

    # If no vowels found, return unchanged
    return phonemes


def mark_shva_na(word: str):
    """
    Shva Na is context-independent and can be predicted with just the word or a dictionary.
    See https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע

    Note: we predict only when the Shva is on the first letter of the word
    Note: we assume that the word comes with | to mark prefix letters ('Txiliyot')
    Note: Shva Na rules mid-word are unreliable, so we don't code them.

    Meteg (\u05bd) will be added to the letter carrying the Shva Na

    What we don't predict:
    (1) some Shva at the beginning of future-tense forms (we don't know)
    (2) Shva in the middle of the word
    """
    letters = get_letters(word)

    if not letters:
        return word

    if letters[0].char in "למנרי":
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
    elif len(letters) > 1 and letters[1].char in "אעה":
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
    elif letters[0].char in "וכלב" and lexicon.PREFIX_DIACRITIC in letters[0].all_diac:
        # ^ The nakdan should add |
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC

    # Ensure that the prefix character will be last
    for letter in letters:
        if "|" in letter.all_diac:
            letter.all_diac = letter.all_diac.replace("|", "") + "|"

    return "".join(str(i) for i in letters)


def sort_hatama(letters: list[Letter]) -> list[Letter]:
    for i in range(len(letters) - 1):
        diacs = list(letters[i].all_diac)
        if lexicon.HATAMA_DIACRITIC in diacs and lexicon.NIKUD_HASER_DIACRITIC in diacs:
            diacs.remove(lexicon.HATAMA_DIACRITIC)
            letters[i].all_diac = "".join(diacs)  # Reassign the updated diacritics
            letters[i + 1].all_diac += lexicon.HATAMA_DIACRITIC
    return letters


def add_milra_hatama(word: str):
    syllables = phonikud.syllables.get_syllables(word)
    stress_index = -1

    if not syllables:
        return word
    if len(syllables) == 1:
        stress_index = 0

    # Get the last syllable
    milra = syllables[stress_index]
    # Get letters
    letters = get_letters(milra)
    # Add Hatama
    letters[0].all_diac += lexicon.HATAMA_DIACRITIC
    # Replace the last syllable
    syllables[stress_index] = "".join(str(i) for i in letters)
    return "".join(syllables)

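# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library API): the sample phoneme lists
# below are made up for illustration; real inputs come from phonikud's
# phonemization pipeline. The commented values are the expected results of the
# helpers defined above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Group a flat phoneme list into CV-style syllables
    print(get_phoneme_syllables(["ʃ", "a", "l", "o", "m"]))  # expected: ['ʃa', 'lom']

    # Place the stress marker right before the first vowel (TTS convention) ...
    print(sort_stress(["ˈ", "ʃa", "lom"], placement="vowel"))  # expected: ['ʃˈa', 'lom']
    # ... or at the very start (linguistic convention)
    print(sort_stress(["ˈ", "ʃa", "lom"], placement="syllable"))  # expected: ['ˈ', 'ʃa', 'lom']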