File size: 4,843 Bytes
1866014
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from phonikud import lexicon
from phonikud.variants import Letter
from .expander import Expander
from phonikud.utils import (
    get_letters,
    normalize,
    post_normalize,
    post_clean,
    add_milra_hatama,
    mark_shva_na,
    sort_hatama,
)
from typing import Callable, Literal
import regex as re
from phonikud.hebrew import phonemize_hebrew


class Phonemizer:
    """Convert Hebrew text (with diacritics) into IPA-style phoneme strings.

    Pipeline: normalize -> (optional) Latin fallback phonemization ->
    (optional) dictionary expansion -> Hebrew word phonemization ->
    hyper-phoneme expansion -> punctuation/stress filtering -> cleanup.
    """

    # Matches runs of ASCII letters to hand to the fallback phonemizer.
    # TODO: is that enough? what if there's punctuation around? other chars?
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation: bool,
        preserve_stress: bool,
        use_expander: bool,
        use_post_normalize: bool,  # For TTS
        predict_stress: bool,
        predict_shva_nah: bool,
        stress_placement: Literal["syllable", "vowel"],
        schema: Literal["plain", "modern"],
        fallback: Callable[[str], str] | None = None,
    ) -> str | list[str]:
        """Phonemize *text* and return the phoneme string.

        Args:
            text: Input text (Hebrew with nikud; may contain Latin words
                and ``[word](/phonemes/)`` hyper-phoneme annotations).
            preserve_punctuation: Keep punctuation characters in the output.
            preserve_stress: Keep the stress phoneme marker in the output.
            use_expander: Expand dictionary entries before phonemization.
            use_post_normalize: Apply TTS-oriented post-normalization.
            predict_stress: Add milra hatama when no hatama diacritic exists.
            predict_shva_nah: Predict shva na marking per word.
            stress_placement: Where stress marks attach ("syllable"/"vowel").
            schema: "modern" applies lexicon.MODERN_SCHEMA substitutions.
            fallback: Optional callable that phonemizes Latin-letter words;
                its output characters are registered as additional phonemes.
        """
        # Normalize the input before any pattern matching.
        text = normalize(text)

        def fallback_replace_callback(match: re.Match) -> str:
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # Word is known to the dictionary; let the expander handle it.
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # Register every produced character so downstream filters keep it.
            # TODO: check that it has only IPA?!
            for c in phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)

        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match, original_text: str) -> str:
            word = match.group(0)
            start_offset = match.start()
            if start_offset > 0 and original_text[start_offset - 1] == "[":
                # Skip if it starts with [ as it's used for hyper phonemes
                return word

            if predict_shva_nah:
                # Strings are immutable: mark_shva_na returns the annotated
                # word, so the result must be reassigned (was discarded before,
                # making the flag a no-op).
                word = mark_shva_na(word)
            if predict_stress and lexicon.HATAMA_DIACRITIC not in word:
                word = add_milra_hatama(word)
            letters: list[Letter] = get_letters(word)
            letters = sort_hatama(letters)

            phoneme_parts: list[str] = phonemize_hebrew(
                letters,
                stress_placement=stress_placement,
            )
            phonemes = "".join(phoneme_parts)

            if use_post_normalize:
                phonemes = post_normalize(phonemes)

            if schema == "modern":
                # We'll keep this feature simple for now
                for k, v in lexicon.MODERN_SCHEMA.items():
                    phonemes = re.sub(k, v, phonemes)
            return phonemes

        text = re.sub(
            lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
        )

        def hyper_phonemes_callback(match: re.Match) -> str:
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)
            for c in matched_phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return matched_phonemes  # The phoneme is in the second group

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(i for i in text if i != lexicon.STRESS_PHONEME)
        if use_post_normalize:
            # We don't keep hyphens in the output, but we should replace them with space
            text = post_clean(text)
        return text