"""
Hebrew Phonemizer

Rules implemented:
1. Consonant handling (including special cases)
2. Nikud (vowel) processing
3. Dagesh handling
4. Geresh handling
5. Shva Na prediction
6. Special letter combinations

References:
- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
- https://he.wikipedia.org/wiki/הברה (syllables in Hebrew)
- https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע (how to pronounce the Shva Na)
- https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט (Kamatz before Hataf)
- https://hebrew-academy.org.il/2022/03/03/מלעיל-ומלרע-על-ההטעמה-בעברית (Mil'el and Milra: stress in Hebrew)
"""

import re
from typing import Literal

from phonikud import lexicon
from phonikud.utils import sort_stress
from phonikud.variants import Letter

# Nikud code points (Unicode Hebrew block, U+05Bx)
SHVA = "\u05b0"
HATAF_KAMATZ = "\u05b3"
HIRIK = "\u05b4"
TSERE = "\u05b5"
SEGOL = "\u05b6"
PATAH = "\u05b7"
KAMATZ = "\u05b8"
HOLAM = "\u05b9"
VAV_HOLAM = "\u05ba"
KUBUTS = "\u05bb"
DAGESH = "\u05bc"
SIN = "\u05c2"
HATAMA = "\u05ab"  # phonikud's stress (hat'ama) mark
# Patah or Kamatz (U+05B7-U+05B8), both realized as /a/
PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
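

# Illustrative sketch (not used by the phonemizer): the constants above are
# Unicode combining marks, so appending one to a base letter yields a pointed
# letter. Kept as doctests so the examples stay checkable.
def _nikud_demo() -> None:
    """
    >>> "ב" + DAGESH  # Bet + Dagesh composes to a pointed Bet
    'בּ'
    >>> bool(re.search(PATAH_LIKE_PATTERN, KAMATZ))  # Kamatz is Patah-like
    True
    >>> import unicodedata
    >>> unicodedata.category(SHVA)  # 'Mn' == nonspacing (combining) mark
    'Mn'
    """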


def phonemize_hebrew(
    letters: list[Letter],
    stress_placement: Literal["syllable", "vowel"],
) -> list[str]:
    """Convert a list of letters into a list of phoneme strings."""
    phonemes = []
    i = 0

    while i < len(letters):
        cur = letters[i]
        prev = letters[i - 1] if i > 0 else None
        next = letters[i + 1] if i < len(letters) - 1 else None
        next_phonemes, skip_offset = letter_to_phonemes(
            cur, prev, next, stress_placement=stress_placement
        )

        phonemes.extend(next_phonemes)
        # A single rule may consume the following letter as well (e.g. a
        # double Vav), so advance the cursor past everything it handled.
        i += skip_offset + 1

    return phonemes
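

# Worked example of the cursor protocol above (hedged): for two consecutive
# Vavs carrying identical nikud, letter_to_phonemes emits a single "vu" and
# returns skip_offset=1, so the loop above skips the second Vav entirely.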


def letter_to_phonemes(
    cur: Letter,
    prev: Letter | None,
    next: Letter | None,
    stress_placement: Literal["syllable", "vowel"],
) -> tuple[list[str], int]:
    """Phonemize one letter; returns (phonemes, following letters consumed)."""
    cur_phonemes = []
    skip_diacritics = False
    skip_consonants = False
    skip_offset = 0

    if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
        # Marked as nikud haser (deliberately unvocalized): emit nothing
        skip_consonants = True
        skip_diacritics = True

    elif cur.char == "א" and not cur.diac and prev:
        # Silent Alef mid-word (mater lectionis), unless a Vav follows
        if next and next.char != "ו":
            skip_consonants = True

    elif (
        cur.char == "י"
        and next
        and not cur.diac
        and prev
        and prev.char + prev.diac != "אֵ"
        # A Yud before a consonantal Vav keeps its own consonant
        and not (next.char == "ו" and next.diac and SHVA not in next.diac)
    ):
        # Silent Yud mid-word (mater lectionis)
        skip_consonants = True

    elif cur.char == "ש" and SIN in cur.diac:
        if (
            next
            and next.char == "ש"
            and not next.diac
            and re.search(PATAH_LIKE_PATTERN, cur.diac)
        ):
            # Sin followed by a bare Shin (as in יִשָּׂשכָר): merge into "sa"
            cur_phonemes.append("sa")
            skip_consonants = True
            skip_diacritics = True
            skip_offset += 1
        else:
            # Shin with a Sin dot is pronounced /s/
            cur_phonemes.append("s")
            skip_consonants = True

    elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
        # Bare Shin right after a Sin is likewise treated as /s/
        cur_phonemes.append("s")
        skip_consonants = True

    # Patah Gnuva (furtive Patah): a word-final ח/ה/ע with Patah is
    # pronounced with the /a/ before the consonant
    elif not next and cur.char == "ח" and PATAH in cur.diac:
        cur_phonemes.append("ax")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ה" and PATAH in cur.diac:
        cur_phonemes.append("ah")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ע" and PATAH in cur.diac:
        cur_phonemes.append("a")
        skip_diacritics = True
        skip_consonants = True
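
    # Example of the furtive Patah rules above, hedged on the lexicon's
    # consonant mapping (assuming ר maps to /ʁ/): in רוּחַ the final Het
    # carries a Patah, so the branch emits "ax" and the word comes out as
    # /ʁuax/ rather than /ʁuxa/.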

    if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
        if cur.char == "ת":
            # Taf with Geresh is fully handled by its Geresh phoneme
            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
            skip_diacritics = True
            skip_consonants = True
        else:
            # Geresh (ג'/ז'/צ' etc.) swaps the consonant; vowels still apply
            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
            skip_consonants = True

    elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES:
        # Letters whose Dagesh changes the consonant (e.g. ב vs. בּ)
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
        skip_consonants = True

    elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
        skip_consonants = True

        # Holam (U+05B9) or Holam-on-Vav (U+05BA) after a Shva: /vo/
        if prev and SHVA in prev.diac and re.findall("[\u05b9-\u05ba]", cur.diac):
            cur_phonemes.append("vo")
            skip_diacritics = True
            skip_consonants = True

        elif next and next.char == "ו":
            # Two consecutive Vavs
            holams = re.findall("[\u05b9-\u05ba]", cur.diac + next.diac)
            if len(holams) == 2:
                # Both carry Holam: consonantal /w/ plus the vowel /o/
                cur_phonemes.append("wo")
                skip_diacritics = True
                skip_offset += 1
            elif len(holams) == 1:
                cur_phonemes.append("vo")
                skip_diacritics = True
                skip_offset += 1
            elif cur.diac == next.diac:
                # Identical nikud on both Vavs: collapse to /vu/
                cur_phonemes.append("vu")
                skip_diacritics = True
                skip_offset += 1
            elif HIRIK in cur.diac:
                cur_phonemes.append("vi")
                skip_diacritics = True
            elif SHVA in cur.diac and not next.diac:
                cur_phonemes.append("v")
                skip_diacritics = True
            elif KAMATZ in cur.diac or PATAH in cur.diac:
                cur_phonemes.append("va")
                skip_diacritics = True
            elif SEGOL in cur.diac:
                cur_phonemes.append("ve")
                skip_diacritics = True
            else:
                # No double-Vav rule matched: let the nikud decide below
                skip_diacritics = False
        else:
            # Single Vav: consonantal /v/ plus its vowel, or a bare vowel
            # when the Vav is a mater lectionis
            if re.search(PATAH_LIKE_PATTERN, cur.diac):
                cur_phonemes.append("va")
            elif TSERE in cur.diac:
                cur_phonemes.append("ve")
            elif SEGOL in cur.diac:
                cur_phonemes.append("ve")
            elif HOLAM in cur.diac:
                # Holam male: the Vav itself is the vowel /o/
                cur_phonemes.append("o")
            elif KUBUTS in cur.diac or DAGESH in cur.diac:
                # Kubuts, or Shuruk (Vav + Dagesh): the vowel /u/
                cur_phonemes.append("u")
            elif SHVA in cur.diac and not prev:
                # Word-initial Vav with Shva ("and"): /ve/
                cur_phonemes.append("ve")
            elif HIRIK in cur.diac:
                cur_phonemes.append("vi")
            elif next and not cur.diac:
                # Bare Vav mid-word: silent
                skip_consonants = True
                skip_diacritics = True
            else:
                cur_phonemes.append("v")

            skip_diacritics = True
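
    # Example of the single-Vav rules above, hedged likewise: in שׁוּק the
    # Vav carries a Dagesh (Shuruk), so the branch emits the bare vowel "u"
    # and the word is phonemized along the lines of /ʃuk/.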

    if not skip_consonants:
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))

    if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
        # Kamatz Katan before Hataf Kamatz is /o/ (e.g. צָהֳרַיִם)
        cur_phonemes.append("o")
        skip_diacritics = True

    nikud_phonemes = []
    if not skip_diacritics:
        nikud_phonemes = [
            lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
        ]
    elif lexicon.HATAMA_DIACRITIC in cur.all_diac:
        # Even when the vowels were handled above, keep the stress mark
        nikud_phonemes = [lexicon.STRESS_PHONEME]
    cur_phonemes.extend(nikud_phonemes)

    cur_phonemes = sort_stress(cur_phonemes, stress_placement)
    # Keep only strings made entirely of known phonemes, then drop empties
    cur_phonemes = [
        p for p in cur_phonemes if all(ch in lexicon.SET_PHONEMES for ch in p)
    ]
    cur_phonemes = [p for p in cur_phonemes if p]
    return cur_phonemes, skip_offset
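

if __name__ == "__main__":  # pragma: no cover
    # Convenience for this sketch only: check the illustrative doctests
    # above (e.g. _nikud_demo) rather than shipping separate tests.
    import doctest

    doctest.testmod()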