"""
Hebrew Phonemizer

Rules implemented:
1. Consonant handling (including special cases)
2. Nikud (vowel) processing
3. Dagesh handling
4. Geresh handling
5. Shva Na prediction
6. Special letter combinations

Reference:
- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
- https://he.wikipedia.org/wiki/הברה
- https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
- https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט
- https://hebrew-academy.org.il/2022/03/03/מלעיל-ומלרע-על-ההטעמה-בעברית
"""

from typing import Literal
from phonikud.variants import Letter
from phonikud import lexicon
import re
from phonikud.utils import sort_stress

# Hebrew points (nikud) and marks used by the phonemizer, as single code points.
SHVA = "\u05b0"  # U+05B0 HEBREW POINT SHEVA
SIN = "\u05c2"  # U+05C2 HEBREW POINT SIN DOT
PATAH = "\u05b7"  # U+05B7 HEBREW POINT PATAH
KAMATZ = "\u05b8"  # U+05B8 HEBREW POINT QAMATS
HATAF_KAMATZ = "\u05b3"  # U+05B3 HEBREW POINT HATAF QAMATS
DAGESH = "\u05bc"  # U+05BC HEBREW POINT DAGESH OR MAPIQ
HOLAM = "\u05b9"  # U+05B9 HEBREW POINT HOLAM
HIRIK = "\u05b4"  # U+05B4 HEBREW POINT HIRIQ
# Regex character class matching either PATAH or KAMATZ (adjacent code points).
PATAH_LIKE_PATTERN = f"[{PATAH}-{KAMATZ}]"
KUBUTS = "\u05bb"  # U+05BB HEBREW POINT QUBUTS
TSERE = "\u05b5"  # U+05B5 HEBREW POINT TSERE
HATAMA = "\u05ab"  # U+05AB HEBREW ACCENT OLE
VAV_HOLAM = "\u05ba"  # U+05BA HEBREW POINT HOLAM HASER FOR VAV
SEGOL = "\u05b6"  # U+05B6 HEBREW POINT SEGOL


def phonemize_hebrew(
    letters: list[Letter],
    stress_placement: Literal["syllable", "vowel"],
) -> list[str]:
    """Convert a sequence of letters into a flat list of phoneme strings.

    Walks ``letters`` from first to last, handing each letter and its
    immediate neighbours (``None`` at either edge) to ``letter_to_phonemes``.
    That call returns the phonemes for the current position together with the
    number of following letters it already accounted for; those letters are
    skipped before the walk continues.

    Args:
        letters: The letters to phonemize.
        stress_placement: Forwarded unchanged to ``letter_to_phonemes``.

    Returns:
        Every phoneme produced, in the order it was generated.
    """
    result = []
    total = len(letters)
    position = 0

    while position < total:
        before = letters[position - 1] if position > 0 else None
        after = letters[position + 1] if position + 1 < total else None
        produced, consumed_extra = letter_to_phonemes(
            letters[position], before, after, stress_placement=stress_placement
        )
        # TODO: split into syllables (would need the consumed letters alongside
        # the phonemes they produced)
        result.extend(produced)
        # Advance past the current letter plus any letters it consumed.
        position += consumed_extra + 1

    return result


def letter_to_phonemes(
    cur: Letter,
    prev: Letter | None,
    next: Letter | None,
    stress_placement: Literal["syllable", "vowel"],
) -> tuple[list[str], int]:
    """Return the phonemes for one letter and how many following letters it used.

    The function runs two independent rule chains and then a common tail:

    1. Contextual special cases (unpronounced Alef/Yud, Sin, furtive Patah on
       a final guttural).
    2. Geresh, Dagesh and Vav handling.
    3. Fallback consonant lookup, Kamatz before Hataf-Kamatz, nikud lookup,
       stress placement and filtering against ``lexicon.SET_PHONEMES``.

    Two flags steer the tail: ``skip_consonants`` suppresses the default
    consonant lookup and ``skip_diacritics`` suppresses the default nikud
    lookup (a stress mark is still emitted when present).

    Args:
        cur: The letter being phonemized.
        prev: The letter before ``cur``, or ``None`` at the start.
        next: The letter after ``cur``, or ``None`` at the end. (The name
            shadows the builtin; it is kept because it is part of the
            signature.)
        stress_placement: Passed through to ``sort_stress``.

    Returns:
        A ``(phonemes, skip_offset)`` tuple. ``phonemes`` is the list of
        phoneme strings for this position; ``skip_offset`` is the number of
        letters after ``cur`` that were already consumed and must be skipped
        by the caller.
    """
    cur_phonemes = []
    skip_diacritics = False
    skip_consonants = False
    skip_offset = 0

    # ---- Chain 1: contextual special cases -------------------------------
    if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
        # Letter is marked as carrying no sound of its own.
        skip_consonants = True
        skip_diacritics = True

    elif cur.char == "א" and not cur.diac and prev:
        # Bare Alef that is not word-initial: silent unless followed by Vav
        # (or it is the last letter).
        if next and next.char != "ו":
            skip_consonants = True

    elif (
        cur.char == "י"
        and next
        # Yud without diacritics
        and not cur.diac
        # In middle
        and prev
        # Previous letter is not the Alef+nikud pair below
        and prev.char + prev.diac != "אֵ"
        # Next Vav has meaning
        and not (next.char == "ו" and next.diac and SHVA not in next.diac)
    ):
        skip_consonants = True

    elif cur.char == "ש" and SIN in cur.diac:
        if (
            next
            and next.char == "ש"
            and not next.diac
            and re.search(PATAH_LIKE_PATTERN, cur.diac)
        ):
            # ^ יששכר: Sin with Patah/Kamatz followed by a bare Shin is read
            # as a single "sa"; the second Shin is consumed here.
            cur_phonemes.append("sa")
            skip_consonants = True
            skip_diacritics = True
            skip_offset += 1
        else:
            cur_phonemes.append("s")
            skip_consonants = True

    # shin without nikud after sin = sin
    elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
        cur_phonemes.append("s")
        skip_consonants = True

    elif not next and cur.char == "ח" and PATAH in cur.diac:
        # Final Het gnuva
        cur_phonemes.append("ax")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ה" and PATAH in cur.diac:
        # Final He gnuva
        cur_phonemes.append("ah")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ע" and PATAH in cur.diac:
        # Final Ayin gnuva
        cur_phonemes.append("a")
        skip_diacritics = True
        skip_consonants = True

    # ---- Chain 2: Geresh, Dagesh, Vav ------------------------------------
    # This is deliberately a fresh ``if``: it is evaluated regardless of which
    # branch (if any) fired in chain 1.
    if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
        # Geresh
        cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
        skip_consonants = True
        if cur.char == "ת":
            # Tav with Geresh additionally suppresses the nikud lookup.
            skip_diacritics = True

    elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES:  # dagesh
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
        skip_consonants = True

    elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
        skip_consonants = True

        if prev and SHVA in prev.diac and re.search("[\u05b9-\u05ba]", cur.diac):
            # ^ לִגְוֹעַ: Vav with Holam after a Shva is a consonantal "vo".
            cur_phonemes.append("vo")
            skip_diacritics = True

        elif next and next.char == "ו":
            # Double Vav: count how many Holam marks the pair carries.
            holams = re.findall("[\u05b9-\u05ba]", cur.diac + next.diac)
            if len(holams) == 2:
                cur_phonemes.append("wo")
                skip_diacritics = True
                skip_offset += 1
            # Must be ``elif``: a two-Holam match may not fall through to the
            # branches below, or a second phoneme would be appended and
            # ``skip_offset`` incremented twice.
            elif len(holams) == 1:
                cur_phonemes.append("vo")
                skip_diacritics = True
                skip_offset += 1
            elif cur.diac == next.diac:
                # Both Vavs carry identical diacritics (possibly none).
                cur_phonemes.append("vu")
                skip_diacritics = True
                skip_offset += 1
            # The branches below do not consume the second Vav; it is
            # processed on its own by the next call.
            elif HIRIK in cur.diac:
                cur_phonemes.append("vi")
                skip_diacritics = True
            elif SHVA in cur.diac and not next.diac:
                cur_phonemes.append("v")
                skip_diacritics = True
            elif KAMATZ in cur.diac or PATAH in cur.diac:
                cur_phonemes.append("va")
                skip_diacritics = True
            elif SEGOL in cur.diac:
                cur_phonemes.append("ve")
                skip_diacritics = True
            else:
                # TODO ?
                # No rule matched: emit nothing for the consonant and let the
                # default nikud lookup run.
                skip_diacritics = False
        else:
            # Single vav

            # Vav with Patah / Kamatz
            if re.search(PATAH_LIKE_PATTERN, cur.diac):
                cur_phonemes.append("va")
            # Tsere
            elif TSERE in cur.diac:
                cur_phonemes.append("ve")
            elif SEGOL in cur.diac:
                cur_phonemes.append("ve")
            # Holam haser
            elif HOLAM in cur.diac:
                cur_phonemes.append("o")
            # Shuruk / Kubutz
            elif KUBUTS in cur.diac or DAGESH in cur.diac:
                cur_phonemes.append("u")
            # Vav with Shva in start
            elif SHVA in cur.diac and not prev:
                cur_phonemes.append("ve")
            # Hirik
            elif HIRIK in cur.diac:
                cur_phonemes.append("vi")
            elif next and not cur.diac:
                # Bare, non-final Vav emits nothing.
                # It is fine for now since we use Dicta
                pass
            else:
                cur_phonemes.append("v")

            # Every single-Vav outcome already encodes its vowel.
            skip_diacritics = True

    # ---- Common tail -----------------------------------------------------
    if not skip_consonants:
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))

    # Kamatz directly before a letter with Hataf-Kamatz is read as "o".
    if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
        cur_phonemes.append("o")
        skip_diacritics = True

    nikud_phonemes = []
    if not skip_diacritics:
        nikud_phonemes = [
            lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
        ]
    elif lexicon.HATAMA_DIACRITIC in cur.all_diac:
        # Nikud is suppressed, but the stress mark must still be emitted.
        nikud_phonemes = [lexicon.STRESS_PHONEME]
    cur_phonemes.extend(nikud_phonemes)

    # Let sort_stress position the stress marker according to stress_placement.
    cur_phonemes = sort_stress(cur_phonemes, stress_placement)

    # Keep only non-empty phonemes whose every character is a known phoneme
    # symbol.
    cur_phonemes = [
        p
        for p in cur_phonemes
        if p and all(symbol in lexicon.SET_PHONEMES for symbol in p)
    ]
    return cur_phonemes, skip_offset