Spaces:

thewh1teagle
/

phonemize-in-hebrew

Running

phonemize-in-hebrew / phonikud /hebrew.py

thewh1teagle

latest

22c7a44 3 days ago

8.14 kB

	"""
	Hebrew Phonemizer

	Fast rule-based FST that converts Hebrew text to phonemes.
	See https://en.wikipedia.org/wiki/Finite-state_transducer

	Rules implemented:
	1. Consonant handling (including special cases)
	2. Nikud (vowel) processing
	3. Dagesh handling
	4. Geresh handling
	5. Vocal Shva prediction
	6. Special letter combinations

	Reference:
	- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
	- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
	- https://he.wikipedia.org/wiki/הברה
	- https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
	- https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט
	- https://hebrew-academy.org.il/2022/03/03/מלעיל-ומלרע-על-ההטעמה-בעברית
	"""

	from typing import Literal
	from phonikud.variants import Letter
	from phonikud import lexicon
	import re
	from phonikud.utils import sort_stress

	SHVA = "\u05b0"
	SIN = "\u05c2"
	PATAH = "\u05b7"
	KAMATZ = "\u05b8"
	HATAF_KAMATZ = "\u05b3"
	DAGESH = "\u05bc"
	HOLAM = "\u05b9"
	HIRIK = "\u05b4"
	PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
	KUBUTS = "\u05bb"
	TSERE = "\u05b5"
	HATAMA = "\u05ab"
	VAV_HOLAM = "\u05ba"
	DAGESH = "\u05bc"
	SEGOL = "\u05b6"


	def phonemize_hebrew(
	letters: list[Letter],
	stress_placement: Literal["syllable", "vowel"],
	) -> list[str]:
	phonemes = []
	i = 0

	while i < len(letters):
	cur = letters[i]
	prev = letters[i - 1] if i > 0 else None
	next = letters[i + 1] if i < len(letters) - 1 else None
	next_phonemes, skip_offset = letter_to_phonemes(
	cur, prev, next, stress_placement=stress_placement
	)
	# TODO: split into syllables
	# next_letters = next_phonemes, letters[i:i+skip_offset+1]
	phonemes.extend(next_phonemes)
	i += skip_offset + 1

	return phonemes


	def letter_to_phonemes(
	cur: Letter,
	prev: Letter \| None,
	next: Letter \| None,
	stress_placement: Literal["syllable", "vowel"],
	) -> tuple[str, int]:
	cur_phonemes = []
	skip_diacritics = False
	skip_consonants = False
	skip_offset = 0

	if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
	skip_consonants = True
	skip_diacritics = True

	elif cur.char == "א" and not cur.diac and prev:
	if next and next.char != "ו":
	skip_consonants = True

	elif (
	cur.char == "י"
	and next
	# Yud without diacritics
	and not cur.diac
	# In middle
	and prev
	# Prev Hirik
	and prev.char + prev.diac != "אֵ"
	# Next Vav has meaning
	and not (next.char == "ו" and next.diac and "\u05b0" not in next.diac)
	):
	skip_consonants = True

	elif cur.char == "ש" and SIN in cur.diac:
	if (
	next
	and next.char == "ש"
	and not next.diac
	and re.search("[\u05b7\u05b8]", cur.diac)
	):
	# ^ יששכר
	cur_phonemes.append("sa")
	skip_consonants = True
	skip_diacritics = True
	skip_offset += 1
	else:
	cur_phonemes.append("s")
	skip_consonants = True

	# shin without nikud after sin = sin
	elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
	cur_phonemes.append("s")
	skip_consonants = True

	elif not next and cur.char == "ח" and PATAH in cur.diac:
	# Final Het gnuva
	cur_phonemes.append("ax")
	skip_diacritics = True
	skip_consonants = True

	elif not next and cur.char == "ה" and PATAH in cur.diac:
	# Final He gnuva
	cur_phonemes.append("ah")
	skip_diacritics = True
	skip_consonants = True

	elif not next and cur.char == "ע" and PATAH in cur.diac:
	# Final Ayin gnuva
	cur_phonemes.append("a")
	skip_diacritics = True
	skip_consonants = True

	if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
	if cur.char == "ת":
	cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
	skip_diacritics = True
	skip_consonants = True
	else:
	# Geresh
	cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
	skip_consonants = True

	elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES: # dagesh
	cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
	skip_consonants = True
	elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
	skip_consonants = True

	if prev and "\u05b0" in prev.diac and re.findall("[\u05b9-\u05ba]", cur.diac):
	# ^ לִגְוֹעַ
	cur_phonemes.append("vo")
	skip_diacritics = True
	skip_consonants = True

	elif next and next.char == "ו":
	# One of them has holam

	holams = re.findall("[\u05b9-\u05ba]", cur.diac + next.diac)
	if len(holams) == 2:
	cur_phonemes.append("wo")
	skip_diacritics = True
	skip_offset += 1
	if len(holams) == 1:
	cur_phonemes.append("vo")
	skip_diacritics = True
	skip_offset += 1
	# patah and next.diac empty
	elif cur.diac == next.diac:
	# double Vav
	cur_phonemes.append("vu")
	skip_diacritics = True
	skip_offset += 1
	elif HIRIK in cur.diac:
	cur_phonemes.append("vi")
	skip_diacritics = True
	elif SHVA in cur.diac and not next.diac:
	cur_phonemes.append("v")
	skip_diacritics = True
	elif KAMATZ in cur.diac or PATAH in cur.diac:
	cur_phonemes.append("va")
	skip_diacritics = True
	elif SEGOL in cur.diac:
	cur_phonemes.append("ve")
	skip_diacritics = True
	else:
	# TODO ?
	# skip_consonants = False
	skip_diacritics = False
	else:
	# Single vav

	# Vav with Patah
	if re.search(PATAH_LIKE_PATTERN, cur.diac):
	cur_phonemes.append("va")

	# Tsere
	elif TSERE in cur.diac:
	cur_phonemes.append("ve")
	elif SEGOL in cur.diac:
	cur_phonemes.append("ve")
	# Holam haser
	elif HOLAM in cur.diac:
	cur_phonemes.append("o")
	# Shuruk / Kubutz
	elif KUBUTS in cur.diac or DAGESH in cur.diac:
	cur_phonemes.append("u")
	# Vav with Shva in start
	elif SHVA in cur.diac and not prev:
	cur_phonemes.append("ve")
	# Hirik
	elif HIRIK in cur.diac:
	cur_phonemes.append("vi")
	elif next and not cur.diac:
	# It is fine for now since we use Dicta
	skip_consonants = True
	skip_diacritics = True
	else:
	cur_phonemes.append("v")

	skip_diacritics = True

	if not skip_consonants:
	cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))

	if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
	cur_phonemes.append("o")
	skip_diacritics = True

	nikud_phonemes = []
	if not skip_diacritics:
	nikud_phonemes = [
	lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
	]
	elif skip_diacritics and lexicon.HATAMA_DIACRITIC in cur.all_diac:
	nikud_phonemes = [lexicon.STRESS_PHONEME]
	cur_phonemes.extend(nikud_phonemes)
	# Ensure the stress is at the beginning of the syllable
	cur_phonemes = sort_stress(cur_phonemes, stress_placement)
	cur_phonemes = [
	p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)
	]
	# Remove empty phonemes
	cur_phonemes = [p for p in cur_phonemes if p]
	return cur_phonemes, skip_offset