File size: 4,843 Bytes
1866014 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
from phonikud import lexicon
from phonikud.variants import Letter
from .expander import Expander
from phonikud.utils import (
get_letters,
normalize,
post_normalize,
post_clean,
add_milra_hatama,
mark_shva_na,
sort_hatama,
)
from typing import Callable, Literal
import regex as re
from phonikud.hebrew import phonemize_hebrew
class Phonemizer:
    """Convert Hebrew text (with diacritics) into IPA phoneme strings.

    Latin-script words are handled by an optional ``fallback`` callable;
    bracketed "hyper phonemes" of the form ``[word](/phonemes/)`` are
    passed through verbatim.
    """

    # TODO: is that enough? what if there's punctuation around? other chars?
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        # Expander resolves dictionary words / abbreviations before phonemization.
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation: bool,
        preserve_stress: bool,
        use_expander: bool,
        use_post_normalize: bool,  # For TTS
        predict_stress: bool,
        predict_shva_nah: bool,
        stress_placement: Literal["syllable", "vowel"],
        schema: Literal["plain", "modern"],
        fallback: Callable[[str], str] | None = None,
    ) -> str:
        """Phonemize ``text`` and return the resulting phoneme string.

        Args:
            text: Input text (Hebrew, possibly mixed with Latin words and
                ``[word](/ipa/)`` hyper-phoneme spans).
            preserve_punctuation: Keep punctuation characters in the output.
            preserve_stress: Keep the stress phoneme marker in the output.
            use_expander: Run the dictionary/abbreviation expander first.
            use_post_normalize: Apply TTS-oriented phoneme normalization.
            predict_stress: Add milra hatama when no hatama diacritic exists.
            predict_shva_nah: Mark shva na in words before phonemization.
            stress_placement: Where stress marks attach ("syllable" or "vowel").
            schema: "plain" or "modern" (applies MODERN_SCHEMA rewrites).
            fallback: Optional callable mapping a Latin word to phonemes.

        Returns:
            The phonemized text as a single string.
        """
        # normalize
        text = normalize(text)

        def fallback_replace_callback(match: re.Match) -> str:
            word = match.group(0)
            if self.expander.dictionary.dict.get(word):
                # Word is known to the expander dictionary; leave it for
                # the expander instead of the fallback.
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            for c in phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)
        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match, original_text: str) -> str:
            word = match.group(0)
            start_offset = match.start()
            if start_offset > 0 and original_text[start_offset - 1] == "[":
                # Skip if it starts with [ as it's used for hyper phonemes
                return word
            if predict_shva_nah:
                # Fix: capture the result — str is immutable, so the original
                # bare call `mark_shva_na(word)` had no effect.
                word = mark_shva_na(word)
            if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
                word = add_milra_hatama(word)
            letters: list[Letter] = get_letters(word)
            letters = sort_hatama(letters)
            phonemes = "".join(
                phonemize_hebrew(letters, stress_placement=stress_placement)
            )
            if use_post_normalize:
                phonemes = post_normalize(phonemes)
            if schema == "modern":
                # We'll keep this feature simple for now
                for k, v in lexicon.MODERN_SCHEMA.items():
                    phonemes = re.sub(k, v, phonemes)
            return phonemes

        text = re.sub(
            lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
        )

        def hyper_phonemes_callback(match: re.Match) -> str:
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)
            for c in matched_phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return matched_phonemes  # The phoneme is in the second group

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(i for i in text if i != lexicon.STRESS_PHONEME)
        if use_post_normalize:
            # We don't keep hyphens in the output, but we should replace them
            # with a space.
            text = post_clean(text)
        return text
|