thewh1teagle
latest
1866014
from phonikud import lexicon
from phonikud.variants import Letter
from .expander import Expander
from phonikud.utils import (
get_letters,
normalize,
post_normalize,
post_clean,
add_milra_hatama,
mark_shva_na,
sort_hatama,
)
from typing import Callable, Literal
import regex as re
from phonikud.hebrew import phonemize_hebrew
class Phonemizer:
# TODO: is that enough? what if there's punctuation around? other chars?
fallback_pattern = r"[a-zA-Z]+"
def __init__(self):
self.expander = Expander()
def phonemize(
self,
text: str,
preserve_punctuation: bool,
preserve_stress: bool,
use_expander: bool,
use_post_normalize: bool, # For TTS
predict_stress: bool,
predict_shva_nah: bool,
stress_placement: Literal["syllable", "vowel"],
schema: Literal["plain", "modern"],
fallback: Callable[[str], str] = None,
) -> str | list[str]:
# normalize
text = normalize(text)
def fallback_replace_callback(match: re.Match):
word = match.group(0)
if self.expander.dictionary.dict.get(word):
# skip
# TODO: better API
return word
phonemes = fallback(word).strip()
# TODO: check that it has only IPA?!
for c in phonemes:
lexicon.ADDITIONAL_PHONEMES.add(c)
return phonemes
if fallback is not None:
text = re.sub(self.fallback_pattern, fallback_replace_callback, text)
if use_expander:
text = self.expander.expand_text(text)
def heb_replace_callback(match: re.Match, original_text: str):
word = match.group(0)
start_offset = match.start()
if start_offset > 0 and original_text[start_offset - 1] == "[":
# Skip if it starts with [ as it's used for hyper phonemes
return word
if predict_shva_nah:
mark_shva_na(word)
if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
word = add_milra_hatama(word)
letters: list[Letter] = get_letters(word)
letters = sort_hatama(letters)
phonemes: list[str] = phonemize_hebrew(
letters,
stress_placement=stress_placement,
)
phonemes = "".join(phonemes)
# syllables = get_syllables(phonemes)
# phonemes_text = "".join(phonemes)
# # if predict_stress and lexicon.STRESS not in phonemes_text and syllables:
# # if len(syllables) == 1:
# # syllables[-1] = lexicon.STRESS + syllables[-1]
# # syllables[-1] = "".join(sort_stress(syllables[-1]))
# # elif any(
# # remove_nikud(word).endswith(i) for i in lexicon.MILHEL_PATTERNS
# # ) or phonemes_text.endswith("ax"):
# # # insert lexicon.STRESS in the first character of syllables[-2]
# # syllables[-2] = lexicon.STRESS + syllables[-2]
# # syllables[-2] = "".join(sort_stress(syllables[-2]))
# # else:
# # # insert in syllables[-1]
# # syllables[-1] = lexicon.STRESS + syllables[-1]
# # syllables[-1] = "".join(sort_stress(syllables[-1]))
# phonemes = "".join(syllables)
if use_post_normalize:
phonemes = post_normalize(phonemes)
if schema == "modern":
# We'll keep this feature simple for now
for k, v in lexicon.MODERN_SCHEMA.items():
phonemes = re.sub(k, v, phonemes)
return phonemes
text = re.sub(
lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
)
def hyper_phonemes_callback(match: re.Match):
"""
Expand hyper phonemes into normal phonemes
eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
"""
matched_phonemes = match.group(2)
for c in matched_phonemes:
lexicon.ADDITIONAL_PHONEMES.add(c)
return matched_phonemes # The phoneme is in the second group
text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)
if not preserve_punctuation:
text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
if not preserve_stress:
text = "".join(i for i in text if i not in [lexicon.STRESS_PHONEME])
if use_post_normalize:
# We don't keep hypens in the output, but we should replace it with space
text = post_clean(text)
return text