thewh1teagle
latest
1866014
"""
Hebrew Phonemizer
Rules implemented:
1. Consonant handling (including special cases)
2. Nikud (vowel) processing
3. Dagesh handling
4. Geresh handling
5. Shva Na prediction
6. Special letter combinations
Reference:
- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
- https://he.wikipedia.org/wiki/הברה
- https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
- https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט
- https://hebrew-academy.org.il/2022/03/03/מלעיל-ומלרע-על-ההטעמה-בעברית
"""
from typing import Literal
from phonikud.variants import Letter
from phonikud import lexicon
import re
from phonikud.utils import sort_stress
SHVA = "\u05b0"
SIN = "\u05c2"
PATAH = "\u05b7"
KAMATZ = "\u05b8"
HATAF_KAMATZ = "\u05b3"
DAGESH = "\u05bc"
HOLAM = "\u05b9"
HIRIK = "\u05b4"
PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
KUBUTS = "\u05bb"
TSERE = "\u05b5"
HATAMA = "\u05ab"
VAV_HOLAM = "\u05ba"
DAGESH = "\u05bc"
SEGOL = "\u05b6"
def phonemize_hebrew(
letters: list[Letter],
stress_placement: Literal["syllable", "vowel"],
) -> list[str]:
phonemes = []
i = 0
while i < len(letters):
cur = letters[i]
prev = letters[i - 1] if i > 0 else None
next = letters[i + 1] if i < len(letters) - 1 else None
next_phonemes, skip_offset = letter_to_phonemes(
cur, prev, next, stress_placement=stress_placement
)
# TODO: split into syllables
# next_letters = next_phonemes, letters[i:i+skip_offset+1]
phonemes.extend(next_phonemes)
i += skip_offset + 1
return phonemes
def letter_to_phonemes(
cur: Letter,
prev: Letter | None,
next: Letter | None,
stress_placement: Literal["syllable", "vowel"],
) -> tuple[str, int]:
cur_phonemes = []
skip_diacritics = False
skip_consonants = False
skip_offset = 0
if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
skip_consonants = True
skip_diacritics = True
elif cur.char == "א" and not cur.diac and prev:
if next and next.char != "ו":
skip_consonants = True
elif (
cur.char == "י"
and next
# Yud without diacritics
and not cur.diac
# In middle
and prev
# Prev Hirik
and prev.char + prev.diac != "אֵ"
# Next Vav has meaning
and not (next.char == "ו" and next.diac and "\u05b0" not in next.diac)
):
skip_consonants = True
elif cur.char == "ש" and SIN in cur.diac:
if (
next
and next.char == "ש"
and not next.diac
and re.search("[\u05b7\u05b8]", cur.diac)
):
# ^ יששכר
cur_phonemes.append("sa")
skip_consonants = True
skip_diacritics = True
skip_offset += 1
else:
cur_phonemes.append("s")
skip_consonants = True
# shin without nikud after sin = sin
elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
cur_phonemes.append("s")
skip_consonants = True
elif not next and cur.char == "ח" and PATAH in cur.diac:
# Final Het gnuva
cur_phonemes.append("ax")
skip_diacritics = True
skip_consonants = True
elif not next and cur.char == "ה" and PATAH in cur.diac:
# Final He gnuva
cur_phonemes.append("ah")
skip_diacritics = True
skip_consonants = True
elif not next and cur.char == "ע" and PATAH in cur.diac:
# Final Ayin gnuva
cur_phonemes.append("a")
skip_diacritics = True
skip_consonants = True
if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
if cur.char == "ת":
cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
skip_diacritics = True
skip_consonants = True
else:
# Geresh
cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
skip_consonants = True
elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES: # dagesh
cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
skip_consonants = True
elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
skip_consonants = True
if prev and "\u05b0" in prev.diac and re.findall("[\u05b9-\u05ba]", cur.diac):
# ^ לִגְוֹעַ
cur_phonemes.append("vo")
skip_diacritics = True
skip_consonants = True
elif next and next.char == "ו":
# One of them has holam
holams = re.findall("[\u05b9-\u05ba]", cur.diac + next.diac)
if len(holams) == 2:
cur_phonemes.append("wo")
skip_diacritics = True
skip_offset += 1
if len(holams) == 1:
cur_phonemes.append("vo")
skip_diacritics = True
skip_offset += 1
# patah and next.diac empty
elif cur.diac == next.diac:
# double Vav
cur_phonemes.append("vu")
skip_diacritics = True
skip_offset += 1
elif HIRIK in cur.diac:
cur_phonemes.append("vi")
skip_diacritics = True
elif SHVA in cur.diac and not next.diac:
cur_phonemes.append("v")
skip_diacritics = True
elif KAMATZ in cur.diac or PATAH in cur.diac:
cur_phonemes.append("va")
skip_diacritics = True
elif SEGOL in cur.diac:
cur_phonemes.append("ve")
skip_diacritics = True
else:
# TODO ?
# skip_consonants = False
skip_diacritics = False
else:
# Single vav
# Vav with Patah
if re.search(PATAH_LIKE_PATTERN, cur.diac):
cur_phonemes.append("va")
# Tsere
elif TSERE in cur.diac:
cur_phonemes.append("ve")
elif SEGOL in cur.diac:
cur_phonemes.append("ve")
# Holam haser
elif HOLAM in cur.diac:
cur_phonemes.append("o")
# Shuruk / Kubutz
elif KUBUTS in cur.diac or DAGESH in cur.diac:
cur_phonemes.append("u")
# Vav with Shva in start
elif SHVA in cur.diac and not prev:
cur_phonemes.append("ve")
# Hirik
elif HIRIK in cur.diac:
cur_phonemes.append("vi")
elif next and not cur.diac:
# It is fine for now since we use Dicta
skip_consonants = True
skip_diacritics = True
else:
cur_phonemes.append("v")
skip_diacritics = True
if not skip_consonants:
cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))
if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
cur_phonemes.append("o")
skip_diacritics = True
nikud_phonemes = []
if not skip_diacritics:
nikud_phonemes = [
lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
]
elif skip_diacritics and lexicon.HATAMA_DIACRITIC in cur.all_diac:
nikud_phonemes = [lexicon.STRESS_PHONEME]
cur_phonemes.extend(nikud_phonemes)
# Ensure the stress is at the beginning of the syllable
cur_phonemes = sort_stress(cur_phonemes, stress_placement)
cur_phonemes = [
p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)
]
# Remove empty phonemes
cur_phonemes = [p for p in cur_phonemes if p]
return cur_phonemes, skip_offset