|
from phonikud import lexicon |
|
from phonikud.variants import Letter |
|
from .expander import Expander |
|
from phonikud.utils import ( |
|
get_letters, |
|
normalize, |
|
post_normalize, |
|
post_clean, |
|
add_milra_hatama, |
|
mark_shva_na, |
|
sort_hatama, |
|
) |
|
from typing import Callable, Literal |
|
import regex as re |
|
from phonikud.hebrew import phonemize_hebrew |
|
|
|
|
|
class Phonemizer:
    """Convert normalized Hebrew text (with optional Latin fallback words and
    inline "hyper-phoneme" annotations) into a phoneme string."""

    # Runs of Latin letters are treated as foreign words and routed to the
    # caller-supplied ``fallback`` phonemizer.
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        # Expander handles dictionary-based word expansion (numbers, dates, ...).
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation: bool,
        preserve_stress: bool,
        use_expander: bool,
        use_post_normalize: bool,
        predict_stress: bool,
        predict_shva_nah: bool,
        stress_placement: Literal["syllable", "vowel"],
        schema: Literal["plain", "modern"],
        fallback: Callable[[str], str] | None = None,
    ) -> str | list[str]:
        """Phonemize *text* and return the resulting phoneme string.

        Args:
            text: Input text (Hebrew with diacritics; may contain Latin words
                and ``[word](/phonemes/)`` hyper-phoneme annotations).
            preserve_punctuation: Keep punctuation characters in the output
                (spaces are always kept).
            preserve_stress: Keep the stress phoneme marker in the output.
            use_expander: Expand dictionary words (numbers, abbreviations, ...)
                before phonemization.
            use_post_normalize: Apply ``post_normalize`` per word and
                ``post_clean`` on the final result.
            predict_stress: Add milra hatama to words lacking an explicit
                hatama diacritic.
            predict_shva_nah: Predict and mark shva na in each word.
            stress_placement: Where the stress marker attaches.
            schema: ``"modern"`` additionally applies the regex substitutions
                in ``lexicon.MODERN_SCHEMA``.
            fallback: Optional callable that phonemizes Latin-letter words;
                when ``None``, Latin words pass through unchanged.

        Returns:
            The phonemized text as a single string.
        """
        text = normalize(text)

        def fallback_replace_callback(match: re.Match) -> str:
            """Phonemize a Latin-letter word via the user-supplied fallback."""
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # Known dictionary word: leave it for the expander step below.
                return word
            phonemes = fallback(word).strip()
            # Register every produced character so later cleanup steps
            # (post_clean / punctuation filtering) keep these phonemes.
            for c in phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)

        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match, original_text: str) -> str:
            """Phonemize a single Hebrew word matched by ``lexicon.HE_PATTERN``."""
            word = match.group(0)
            start_offset = match.start()
            if start_offset > 0 and original_text[start_offset - 1] == "[":
                # Word is inside a "[word](/phonemes/)" hyper-phoneme
                # annotation; leave it for hyper_phonemes_callback.
                return word

            if predict_shva_nah:
                # Bug fix: mark_shva_na returns a new string (str is
                # immutable); the result was previously discarded, so shva-na
                # prediction silently never took effect.
                word = mark_shva_na(word)
            if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
                word = add_milra_hatama(word)
            letters: list[Letter] = get_letters(word)
            letters = sort_hatama(letters)

            phonemes: list[str] = phonemize_hebrew(
                letters,
                stress_placement=stress_placement,
            )
            phonemes = "".join(phonemes)

            if use_post_normalize:
                phonemes = post_normalize(phonemes)

            if schema == "modern":
                # Apply modern-Hebrew phoneme substitutions.
                for k, v in lexicon.MODERN_SCHEMA.items():
                    phonemes = re.sub(k, v, phonemes)
            return phonemes

        text = re.sub(
            lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
        )

        def hyper_phonemes_callback(match: re.Match) -> str:
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)
            # Register the explicit phonemes so cleanup steps keep them.
            for c in matched_phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return matched_phonemes  # The phonemes between the slashes.

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            # Drop punctuation but always keep spaces.
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(i for i in text if i not in [lexicon.STRESS_PHONEME])
        if use_post_normalize:
            text = post_clean(text)
        return text
|
|