Spaces:

thewh1teagle
/

phonemize-in-hebrew

Running

phonemize-in-hebrew / phonikud /phonemize.py

thewh1teagle

latest

1866014 11 days ago

4.84 kB

	from phonikud import lexicon
	from phonikud.variants import Letter
	from .expander import Expander
	from phonikud.utils import (
	get_letters,
	normalize,
	post_normalize,
	post_clean,
	add_milra_hatama,
	mark_shva_na,
	sort_hatama,
	)
	from typing import Callable, Literal
	import regex as re
	from phonikud.hebrew import phonemize_hebrew


	class Phonemizer:
	    """Turn text into a phoneme string.

	    Hebrew words (matches of ``lexicon.HE_PATTERN``) are phonemized with
	    ``phonemize_hebrew``; runs matching ``fallback_pattern`` can be delegated
	    to a user-supplied callable; and hyper-phoneme markup of the form
	    ``[word](/phonemes/)`` is replaced by the phonemes it carries.
	    """

	    # TODO: is that enough? what if there's punctuation around? other chars?
	    fallback_pattern = r"[a-zA-Z]+"

	    # Hyper-phoneme markup, e.g. [hello](/hɛˈloʊ/). Group 2 is the phonemes.
	    hyper_phoneme_pattern = r"\[(.+?)\]\(\/(.+?)\/\)"

	    def __init__(self):
	        self.expander = Expander()

	    def phonemize(
	        self,
	        text: str,
	        preserve_punctuation: bool,
	        preserve_stress: bool,
	        use_expander: bool,
	        use_post_normalize: bool,  # For TTS
	        predict_stress: bool,
	        predict_shva_nah: bool,
	        stress_placement: Literal["syllable", "vowel"],
	        schema: Literal["plain", "modern"],
	        fallback: Callable[[str], str] | None = None,
	    ) -> str | list[str]:
	        """Phonemize ``text``.

	        The stages run in this order: ``normalize``; optional ``fallback``
	        on ``fallback_pattern`` matches; optional expander; per-word Hebrew
	        phonemization; hyper-phoneme expansion; optional punctuation and
	        stress stripping; optional ``post_clean``.

	        Args:
	            text: Input text.
	            preserve_punctuation: When False, every character found in
	                ``lexicon.PUNCTUATION`` except the space is removed.
	            preserve_stress: When False, every ``lexicon.STRESS_PHONEME``
	                character is removed.
	            use_expander: When True, ``self.expander.expand_text`` is applied
	                before Hebrew phonemization.
	            use_post_normalize: When True, ``post_normalize`` is applied to
	                each Hebrew word's phonemes and ``post_clean`` to the final
	                text.
	            predict_stress: When True, words without
	                ``lexicon.HATAMA_DIACRITIC`` go through ``add_milra_hatama``.
	            predict_shva_nah: When True, each Hebrew word goes through
	                ``mark_shva_na``.
	            stress_placement: Forwarded to ``phonemize_hebrew``.
	            schema: ``"modern"`` additionally applies the substitutions in
	                ``lexicon.MODERN_SCHEMA``.
	            fallback: Optional callable that receives each
	                ``fallback_pattern`` match and returns its phonemes. Words
	                with a truthy entry in the expander dictionary are left for
	                the expander instead.

	        Returns:
	            The phonemized text.
	        """
	        text = normalize(text)

	        def fallback_replace_callback(match: re.Match):
	            # Hyper-phoneme markup is matched first and returned untouched.
	            # Otherwise the Latin letters inside "[word](/phonemes/)" would
	            # be rewritten here, before the markup is expanded further down.
	            if match.group("hyper") is not None:
	                return match.group(0)

	            word = match.group(0)
	            if self.expander.dictionary.dict.get(word):
	                # skip
	                # TODO: better API
	                return word

	            phonemes = fallback(word).strip()
	            # TODO: check that it has only IPA?!
	            for c in phonemes:
	                lexicon.ADDITIONAL_PHONEMES.add(c)
	            return phonemes

	        if fallback is not None:
	            guarded_pattern = (
	                rf"(?P<hyper>{self.hyper_phoneme_pattern})|{self.fallback_pattern}"
	            )
	            text = re.sub(guarded_pattern, fallback_replace_callback, text)

	        if use_expander:
	            text = self.expander.expand_text(text)

	        def heb_replace_callback(match: re.Match):
	            word = match.group(0)
	            start_offset = match.start()
	            # match.string is the text handed to re.sub.
	            if start_offset > 0 and match.string[start_offset - 1] == "[":
	                # Skip if it starts with [ as it's used for hyper phonemes
	                return word

	            if predict_shva_nah:
	                # str is immutable, so the marked word can only arrive via
	                # the return value. Discarding it made this flag a no-op.
	                # NOTE(review): assumes mark_shva_na returns the marked word;
	                # a non-str result leaves the word unchanged.
	                marked = mark_shva_na(word)
	                if isinstance(marked, str):
	                    word = marked
	            if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
	                word = add_milra_hatama(word)

	            letters: list[Letter] = sort_hatama(get_letters(word))
	            phoneme_parts: list[str] = phonemize_hebrew(
	                letters,
	                stress_placement=stress_placement,
	            )
	            phonemes = "".join(phoneme_parts)

	            if use_post_normalize:
	                phonemes = post_normalize(phonemes)

	            if schema == "modern":
	                # We'll keep this feature simple for now
	                # (MODERN_SCHEMA keys are applied as regex patterns.)
	                for k, v in lexicon.MODERN_SCHEMA.items():
	                    phonemes = re.sub(k, v, phonemes)
	            return phonemes

	        text = re.sub(lexicon.HE_PATTERN, heb_replace_callback, text)

	        def hyper_phonemes_callback(match: re.Match):
	            """
	            Expand hyper phonemes into normal phonemes
	            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
	            """
	            matched_phonemes = match.group(2)  # The phoneme is in the second group
	            for c in matched_phonemes:
	                lexicon.ADDITIONAL_PHONEMES.add(c)
	            return matched_phonemes

	        text = re.sub(self.hyper_phoneme_pattern, hyper_phonemes_callback, text)

	        if not preserve_punctuation:
	            # Spaces are always kept, even if listed as punctuation.
	            text = "".join(
	                ch for ch in text if ch == " " or ch not in lexicon.PUNCTUATION
	            )
	        if not preserve_stress:
	            text = "".join(ch for ch in text if ch != lexicon.STRESS_PHONEME)
	        if use_post_normalize:
	            # We don't keep hyphens in the output, but we should replace them with spaces
	            text = post_clean(text)
	        return text