File size: 4,843 Bytes
1866014 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
from phonikud import lexicon
from phonikud.variants import Letter
from .expander import Expander
from phonikud.utils import (
get_letters,
normalize,
post_normalize,
post_clean,
add_milra_hatama,
mark_shva_na,
sort_hatama,
)
from typing import Callable, Literal
import regex as re
from phonikud.hebrew import phonemize_hebrew
class Phonemizer:
    """Convert Hebrew text (with diacritics) into IPA phoneme strings.

    Latin-script words are handled by an optional ``fallback`` callable;
    bracketed "hyper phonemes" of the form ``[word](/phonemes/)`` are
    passed through verbatim.
    """

    # TODO: is that enough? what if there's punctuation around? other chars?
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        # Expander resolves dictionary words / abbreviations before phonemization.
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation: bool,
        preserve_stress: bool,
        use_expander: bool,
        use_post_normalize: bool,  # For TTS
        predict_stress: bool,
        predict_shva_nah: bool,
        stress_placement: Literal["syllable", "vowel"],
        schema: Literal["plain", "modern"],
        fallback: Callable[[str], str] | None = None,
    ) -> str:
        """Phonemize ``text`` and return the resulting phoneme string.

        Args:
            text: Input text (Hebrew, possibly mixed with Latin words and
                ``[word](/ipa/)`` hyper-phoneme spans).
            preserve_punctuation: Keep punctuation characters in the output.
            preserve_stress: Keep the stress phoneme marker in the output.
            use_expander: Run the dictionary/abbreviation expander first.
            use_post_normalize: Apply TTS-oriented phoneme normalization.
            predict_stress: Add milra hatama when no hatama diacritic exists.
            predict_shva_nah: Mark shva na in words before phonemization.
            stress_placement: Where stress marks attach ("syllable" or "vowel").
            schema: "plain" or "modern" (applies MODERN_SCHEMA rewrites).
            fallback: Optional callable mapping a Latin word to phonemes.

        Returns:
            The phonemized text as a single string.
        """
        # normalize
        text = normalize(text)

        def fallback_replace_callback(match: re.Match) -> str:
            word = match.group(0)
            if self.expander.dictionary.dict.get(word):
                # Word is known to the expander dictionary; leave it for
                # the expander instead of the fallback.
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            for c in phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)
        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match, original_text: str) -> str:
            word = match.group(0)
            start_offset = match.start()
            if start_offset > 0 and original_text[start_offset - 1] == "[":
                # Skip if it starts with [ as it's used for hyper phonemes
                return word
            if predict_shva_nah:
                # Fix: capture the result — str is immutable, so the original
                # bare call `mark_shva_na(word)` had no effect.
                word = mark_shva_na(word)
            if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
                word = add_milra_hatama(word)
            letters: list[Letter] = get_letters(word)
            letters = sort_hatama(letters)
            phonemes = "".join(
                phonemize_hebrew(letters, stress_placement=stress_placement)
            )
            if use_post_normalize:
                phonemes = post_normalize(phonemes)
            if schema == "modern":
                # We'll keep this feature simple for now
                for k, v in lexicon.MODERN_SCHEMA.items():
                    phonemes = re.sub(k, v, phonemes)
            return phonemes

        text = re.sub(
            lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
        )

        def hyper_phonemes_callback(match: re.Match) -> str:
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)
            for c in matched_phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return matched_phonemes  # The phoneme is in the second group

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(i for i in text if i != lexicon.STRESS_PHONEME)
        if use_post_normalize:
            # We don't keep hyphens in the output, but we should replace them
            # with a space.
            text = post_clean(text)
        return text
|