thewh1teagle committed
Commit · 1866014
Parent(s): latest
Browse files
- .gitattributes +1 -0
- README.md +10 -0
- app.py +109 -0
- phonikud-1.0.int8.onnx +3 -0
- phonikud/__init__.py +39 -0
- phonikud/data/rashej_tevot.json +3 -0
- phonikud/data/special.json +9 -0
- phonikud/data/symbols.json +5 -0
- phonikud/expander/__init__.py +33 -0
- phonikud/expander/dates.py +60 -0
- phonikud/expander/dictionary.py +79 -0
- phonikud/expander/number_names.py +193 -0
- phonikud/expander/numbers.py +39 -0
- phonikud/expander/time_to_word.py +104 -0
- phonikud/hebrew.py +249 -0
- phonikud/lexicon.py +115 -0
- phonikud/log.py +35 -0
- phonikud/phonemize.py +130 -0
- phonikud/syllables.py +103 -0
- phonikud/utils.py +247 -0
- phonikud/variants.py +20 -0
- phonikud_onnx/__init__.py +41 -0
- phonikud_onnx/model.py +197 -0
- phonikud_onnx/py.typed +0 -0
- requirements.txt +43 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
*.onnx filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,10 @@
---
title: Phonemize in Hebrew
emoji: 🐢
colorFrom: red
colorTo: green
sdk: gradio
sdk_version: "4.44.0"
app_file: app.py
pinned: false
---
app.py
ADDED
@@ -0,0 +1,109 @@
"""
uv pip install gradio
uv run gradio examples/editor.py
"""

from phonikud import phonemize, lexicon
from phonikud.utils import remove_nikud
import gradio as gr
from phonikud_onnx import Phonikud
from pathlib import Path


default_text = """
הַדַּיָּיג נִצְמָד לְדֹופֶן הַסִּירָה בִּזְמַן הַסְּעָרָה.
הִסְבַּרְתִּי לָהּ אֶת הַכֹּל, וְאָמַרְתִּי בְּדִיּוּק מָה קָרָה.
הַיְּלָדִים אָהֲבוּ בִּמְיֻוחָד אֶת הַסִּיפּוּרִים הַלָּלוּ שֶׁהַמּוֹרָה הִקְרִיאָה.
""".strip()


def on_phonikud_toggle(use_phonikud):
    if not use_phonikud:
        return default_text
    return remove_nikud(default_text)


css = """
.input textarea {
    font-size: 22px;
    padding: 15px;
    height: 200px;
}

.phonemes {
    background: var(--input-background-fill);
}
.phonemes {
    padding: 5px;
    min-height: 50px;
}
"""

theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Noto Sans Hebrew")])

phonikud = None
model_path = Path("./phonikud-1.0.int8.onnx")
if model_path.exists():
    phonikud = Phonikud(str(model_path))


def on_submit(text: str, schema: str, use_phonikud: bool) -> str:
    diacritized = (
        phonikud.add_diacritics(
            text, mark_matres_lectionis=lexicon.NIKUD_HASER_DIACRITIC
        )
        if phonikud and use_phonikud
        else text
    )
    phonemes = phonemize(
        diacritized, predict_stress=True, schema=schema, predict_shva_nah=False
    )
    if use_phonikud:
        return f"<div dir='rtl' style='font-size: 22px;'>{diacritized.strip()}</div><br><div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"
    else:
        return f"<div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"


with gr.Blocks(theme=theme, css=css) as demo:
    text_input = gr.Textbox(
        value=remove_nikud(default_text),
        label="Text",
        rtl=True,
        elem_classes=["input"],
        lines=7,
    )

    with gr.Row():
        schema_dropdown = gr.Dropdown(
            choices=["modern", "plain"], value="plain", label="Phoneme Schema"
        )
        use_phonikud_checkbox = gr.Checkbox(
            value=True, label="Use Phonikud (add diacritics)"
        )

    submit_button = gr.Button("Create")
    output_box = gr.Markdown(label="Phonemes + Diacritics", elem_classes=["phonemes"])
    use_phonikud_checkbox.change(
        fn=lambda use_phonikud: (
            on_phonikud_toggle(use_phonikud),  # Update text_input
            on_submit(
                on_phonikud_toggle(use_phonikud), schema_dropdown.value, use_phonikud
            ),  # Update output_box
        ),
        inputs=use_phonikud_checkbox,
        outputs=[text_input, output_box],  # Update both text input and output box
    )

    submit_button.click(
        fn=on_submit,
        inputs=[text_input, schema_dropdown, use_phonikud_checkbox],
        outputs=output_box,
    )

    gr.Markdown("""
<p style='text-align: center;'><a href='https://github.com/thewh1teagle/phonikud' target='_blank'>Phonikud on Github</a></p>
""")

if __name__ == "__main__":
    demo.launch()
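For reference, the same pipeline can be exercised outside the Gradio UI. The sketch below is a minimal, hypothetical standalone check (not part of this commit), assuming the ONNX model file added above sits in the working directory; the input sentence is an arbitrary example.

# Hypothetical standalone use of the same flow as on_submit above.
from pathlib import Path

from phonikud import phonemize, lexicon
from phonikud_onnx import Phonikud

model = Phonikud(str(Path("./phonikud-1.0.int8.onnx")))
text = "שלום עולם"  # plain, undiacritized input
diacritized = model.add_diacritics(
    text, mark_matres_lectionis=lexicon.NIKUD_HASER_DIACRITIC
)
print(phonemize(diacritized, predict_stress=True, schema="plain", predict_shva_nah=False))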
phonikud-1.0.int8.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c4e7b0dbb263315ca124865da1ef3da3e91f64fb8acec6c437312a6bc0a8d51
size 307683244
phonikud/__init__.py
ADDED
@@ -0,0 +1,39 @@
"""
High level phonemize functions
"""

from .phonemize import Phonemizer
from .utils import normalize  # noqa: F401
from typing import Callable, Literal

phonemizer = Phonemizer()


def phonemize(
    text: str,
    preserve_punctuation=True,
    preserve_stress=True,
    use_expander=True,
    use_post_normalize=True,  # For TTS
    predict_stress=True,
    predict_shva_nah=True,
    stress_placement: Literal["syllable", "vowel"] = "vowel",
    schema: Literal["plain", "modern"] = "modern",
    fallback: Callable[[str], str] = None,
) -> str:
    """
    Set stress_placement="syllable" to place the stress at the syllable start.
    """
    phonemes = phonemizer.phonemize(
        text,
        preserve_punctuation=preserve_punctuation,
        preserve_stress=preserve_stress,
        fallback=fallback,
        use_expander=use_expander,
        use_post_normalize=use_post_normalize,
        predict_stress=predict_stress,
        schema=schema,
        predict_shva_nah=predict_shva_nah,
        stress_placement=stress_placement,
    )
    return phonemes
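A small usage sketch of this entry point (the Hebrew input is an arbitrary diacritized example, not taken from the repository):

from phonikud import phonemize

# Defaults: modern schema, stress predicted, output normalized for TTS
print(phonemize("שָׁלוֹם"))
# Plain schema, stress mark placed at the start of the syllable instead of before the vowel
print(phonemize("שָׁלוֹם", schema="plain", stress_placement="syllable"))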
phonikud/data/rashej_tevot.json
ADDED
@@ -0,0 +1,3 @@
{
  "צה״ל": "tsˈahal"
}
phonikud/data/special.json
ADDED
@@ -0,0 +1,9 @@
{
  "וַאלְלָה": "wˈala",
  "וַסַבִּי": "wasˈabi",
  "פינגוין": "pinguwˈin",
  "וואצאפ": "wˈatsʔap",
  "וואטסאפ": "wˈatsʔap",
  "יאללה": "jˈala",
  "וולטר": "wˈolter"
}
phonikud/data/symbols.json
ADDED
@@ -0,0 +1,5 @@
{
  "₪": "ʃˈekel",
  "$": "dˈolar",
  "%": "axˈuz"
}
phonikud/expander/__init__.py
ADDED
@@ -0,0 +1,33 @@
"""
Expand dates and numbers into words with nikud
This happens before phonemization
"""

from .numbers import num_to_word
from .dates import date_to_word
from .time_to_word import time_to_word
from .dictionary import Dictionary
from phonikud.log import log


class Expander:
    def __init__(self):
        self.dictionary = Dictionary()

    def expand_text(self, text: str):
        words = []
        for source_word in text.split():
            try:
                word = date_to_word(source_word)
                if word == source_word:
                    word = time_to_word(word)
                if word == source_word:
                    word = num_to_word(word)
                words.append(word)
            except Exception as e:
                log.error(f"Failed to expand {source_word} with error: {e}")
                words.append(source_word)
        text = " ".join(words)
        text = self.dictionary.expand_text(text)

        return text
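A quick sketch of what the expander does to raw input before phonemization (the sentence is an illustrative example):

from phonikud.expander import Expander

expander = Expander()
# Dates, clock times and digits are rewritten as diacritized Hebrew words,
# then the bundled JSON dictionaries are applied to the whole text
print(expander.expand_text("הפגישה ב 2024/05/01 בשעה 14:30 עם 3 אנשים"))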
phonikud/expander/dates.py
ADDED
@@ -0,0 +1,60 @@
from datetime import datetime
from .numbers import num_to_word

# Mapping of month names in Hebrew with diacritics (Gregorian months)
MONTHS = {
    1: "יָ֫נוּאָר",
    2: "פֶ֫בְרוּאָר",
    3: "מֵ֫רְץ",
    4: "אֵפְרִיל",
    5: "מַאי",
    6: "י֫וּנִי",
    7: "י֫וּלִי",
    8: "א֫וֹגֻסְט",
    9: "סֶפְּטֶ֫מְבֶּר",
    10: "אוֹקְט֫וֹבֶּר",
    11: "נוֹבֶ֫מְבֶּר",
    12: "דֶּצֶ֫מְבֶּר",
}

# Mapping of day names in Hebrew with diacritics
DAYS = {
    0: "יוֹם רִאשׁוֹן",
    1: "יוֹם שֵׁנִי",
    2: "יוֹם שְׁלִישִׁי",
    3: "יוֹם רֵבִיעִי",
    4: "יוֹם חֲמִישִׁי",
    5: "יוֹם שִׁישִׁי",
    6: "יוֹם שַׁבָּת",
}


def date_to_word(word: str, include_day_name=False) -> str:
    """
    Converts a date string (YYYY-MM-DD or DD-MM-YYYY, with "-", "." or "/" as separator)
    to a Hebrew date phrase with diacritics.
    Returns the original word if it's not a valid date.
    """
    separators = ["-", ".", "/"]
    orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
    date_formats = [sep.join(order) for order in orders for sep in separators]

    for date_format in date_formats:
        try:
            # Try parsing the word with each date format
            date_obj = datetime.strptime(word, date_format)

            # Get the Hebrew day name with diacritics
            day_name = DAYS[date_obj.weekday()]

            # Convert month to Hebrew name with diacritics
            month_name = MONTHS[date_obj.month]
            day = num_to_word(str(date_obj.day))
            year = num_to_word(str(date_obj.year))

            text = f"{day} בֵּ{month_name} {year}"
            if include_day_name:
                text = f"{day_name}, {text}"
            return text
        except ValueError:
            continue
    return word
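Usage sketch for date_to_word, matching the formats listed in its docstring (the dates are arbitrary examples):

from phonikud.expander.dates import date_to_word

print(date_to_word("2025-03-18"))                         # day, month and year as diacritized words
print(date_to_word("18/03/2025", include_day_name=True))  # prefixed with the Hebrew weekday name
print(date_to_word("not-a-date"))                         # non-dates are returned unchanged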
phonikud/expander/dictionary.py
ADDED
@@ -0,0 +1,79 @@
"""
Dictionaries are JSON files mapping source words to their replacements
"""

from pathlib import Path
import json
import re
from phonikud.utils import remove_nikud
from phonikud.utils import normalize
from phonikud import lexicon
import unicodedata

files = Path(__file__).parent.joinpath("../data").glob("*.json")
# Sort in reverse order to prioritize the most recent and best
order = {"bronze": 1, "silver": 2, "gold": 3}
files = sorted(
    files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
)


class Dictionary:
    def __init__(self):
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        for file in files:
            with open(file, "r", encoding="utf-8") as f:
                dictionary: dict = json.load(f)
                normalized_dictionary = {}

                # normalize nikud keys
                for k, v in dictionary.items():
                    k = normalize(k)
                    # Ensure not empty
                    if k and v:
                        normalized_dictionary[k] = v
                self.dict.update(normalized_dictionary)

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        source: str = match.group(0)
        # decompose
        source = unicodedata.normalize("NFD", source)
        raw_lookup = self.dict.get(source)

        without_nikud_lookup = self.dict.get(remove_nikud(source))
        with_nikud_lookup = self.dict.get(normalize(source))
        # Compare without nikud ONLY if source has no nikud
        if raw_lookup:
            return raw_lookup
        if without_nikud_lookup:
            return without_nikud_lookup
        elif with_nikud_lookup:
            return with_nikud_lookup
        return source

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        raw_source: str = match.group(0)
        if raw_source.isnumeric():
            return raw_source

        raw_lookup = self.dict.get(raw_source)

        # Compare without nikud ONLY if source has no nikud
        if raw_lookup:
            return raw_lookup
        # search by only ', space, regular nikud, alphabet
        raw_source = re.sub(
            lexicon.HE_PATTERN, self.replace_hebrew_only_callback, raw_source
        )
        return raw_source

    def expand_text(self, text: str) -> str:
        """
        TODO: if key doesn't have diacritics expand even diacritized words
        """
        text = re.sub(r"\S+", self.replace_non_whitespace_callback, text)

        return text
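A minimal sketch of the lookup behavior, assuming the data files from this commit are installed alongside the package (the input phrase is illustrative):

from phonikud.expander.dictionary import Dictionary

d = Dictionary()
# Words present in the bundled JSON dictionaries (e.g. special.json) are replaced
# by their stored transcription; everything else passes through untouched
print(d.expand_text("יאללה נלך"))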
phonikud/expander/number_names.py
ADDED
@@ -0,0 +1,193 @@
"""
See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
"""

# TODO: add nikud hints

ZERO = {"אפס": "אֶ֫פֶס"}


ONES = {
    "אחת": "אַחַת",
    "אחד": "אֶחָד",
    "ראשונה": "רִאשׁוֹנָה",
    "ראשון": "רִאשׁוֹן",
    "ראשונות": "רִאשׁוֹנוֹת",
    "ראשונים": "רִאשׁוֹנִים",
    "שתיים": "שְׁתַּ֫יִם",
    "שניים": "שְׁנַ֫יִם",
    "שתי": "שְׁתֵּי",
    "שני": "שְׁנֵי",
    "שנייה": "שְׁנִיָּה",
    "שניות": "שְׁנִיּוֹת",
    "שלוש": "שָׁלוֹשׁ",
    "שלושה": "שְׁלוֹשָׁה",
    "שלושת": "שְׁל֫וֹשֶׁת",
    "שלישית": "שְׁלִישִׁית",
    "שלישי": "שְׁלִישִׁי",
    "שלישיות": "שְׁלִישִׁיּוֹת",
    "שלישיים": "שְׁלִישִׁיִּים",
    "ארבע": "אַ֫רְבַּע",
    "ארבעה": "אַרְבַּעָה",
    "ארבעת": "אַרְבַּ֫עַת",
    "רביעית": "רֵבִיעִית",
    "רביעי": "רֵבִיעִי",
    "רביעיות": "רֵבִיעִיוֹת",
    "רביעיים": "רֵבִיעִיִּים",
    "חמש": "חָמֵשׁ",
    "חמישה": "חֲמִשָּׁה",
    "חמשת": "חֲמֵ֫שֶׁת",
    "חמישית": "חֲמִישִּׁית",
    "חמישי": "חֲמִישִּׁי",
    "חמישיות": "חֲמִישִּׁיוֹת",
    "חמישיים": "חֲמִישִּׁיִּים",
    "שש": "שֵׁשׁ",
    "שישה": "שִׁשָּׁה",
    "ששת": "שֵׁ֫שֶׁת",
    "שישית": "שִׁשִּׁית",
    "שישי": "שִׁשִּׁי",
    "שישיות": "שִׁשִּׁיוֹת",
    "שישיים": "שִׁשִּׁיִּים",
    "שבע": "שֶׁ֫בַע",
    "שבעה": "שִׁבְעָה",
    "שבעת": "שִׁבְעַת",
    "שביעית": "שְׁבִיעִית",
    "שביעי": "שְׁבִיעִי",
    "שביעיות": "שְׁבִיעִיוֹת",
    "שביעיים": "שְׁבִיעִיִּים",
    "שמונה": "שְׁמ֫וֹנֶה",
    "שמונת": "שְׁמוֹנַת",
    "שמינית": "שְׁמִינִית",
    "שמיני": "שְׁמִינִי",
    "שמיניות": "שְׁמִינִיוֹת",
    "שמיניים": "שְׁמִינִיִּים",
    "תשע": "תֵּשַׁע",
    "תשעה": "תִּשְׁעָה",
    "תשעת": "תִּשְׁעַת",
    "תשיעית": "תְּשִׁיעִית",
    "תשיעי": "תְּשִׁיעִי",
    "תשיעיות": "תְּשִׁיעִיּוֹת",
    "תשיעיים": "תְּשִׁיעִיִּים",
}


TENS = {
    "עשר": "עֶ֫שֶׂר",
    "עשרה": "עֶשְׂרֵה",
    "עשרת": "עֲשֶׂ֫רֶת",
    "עשירית": "עֲשִׂירִית",
    "עשירי": "עֲשִׂירִי",
    "עשיריות": "עֲשִׂירִיּוֹת",
    "עשיריים": "עֲשִׂירִיִּים",
    "שתים עשרה": "שְׁתֵּ֫ים עֶשְׂרֵה",
    "שנים עשר": "שְׁנֵים עָשָׂר",
}


TWENTIES = {
    "עשרים": "עֶשְׂרִ֫ים",
    "שלושים": "שְׁלוֹשִׁים",
    "ארבעים": "אַרְבָּעִים",
    "חמישים": "חֲמִשִּׁים",
    "שישים": "שִׁשִּׁים",
    "שבעים": "שִׁבְעִים",
    "שמונים": "שְׁמוֹנִים",
    "תשעים": "תִּשְׁעִים",
}


HUNDREDS = {
    "מאה": "מֵ֫אָה",
    "מאת": "מֵאַת",
    "מאתיים": "מָאתַ֫יִם",
    "מאות": "מֵאוֹת",
}

THOUSANDS = {
    "אלף": "אֶ֫לֶף",
    "אלפיים": "אַלְפַּ֫יִם",
    "אלפים": "אֲלָפִים",
    "אלפי": "אַלְפִּי",
}


LARGE = {
    "מיליון": "מִילְיוֹן",
    "מיליוני": "מִילְיוֹנִי",
    "מיליארד": "מִילְיַארְד",
    "מיליארדי": "מִילְיַ֫ארְדִּי",
    "טריליון": "טְרִילְיוֹן",
    "טריליוני": "טְרִילְיוֹנִי",
    "קוודריליון": "קוֹוַדְרִילְיוֹן",
    "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
    "קווינטיליון": "קוִוִּנְטִילְיוֹן",
    "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
    "סקסטיליון": "סְקֶסְטִילְיוֹן",
    "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
    "ספטיליון": "סְפֶּטִילְיוֹן",
    "ספטיליוני": "סְפֶּטִילְיוֹנִי",
    "אוקטיליון": "אוֹקְטִילְיוֹן",
    "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
    "נוניליון": "נוּנִילְיוֹן",
    "נוניליוני": "נוּנִילְיוֹנִי",
    "דסיליון": "דֶּסִילְיוֹן",
    "דסיליוני": "דֶּסִילְיוֹנִי",
    "אונדסיליון": "אוּנְדְסִילְיוֹן",
    "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
    "דואודסיליון": "דוּאודְסִילְיוֹן",
    "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
    "טרדסיליון": "טֶרְדְסִילְיוֹן",
    "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
    "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
    "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
    "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
    "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
    "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
    "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
    "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
    "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
    "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
    "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
    "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
    "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
    "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
    "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
}


LETTERS = {
    "ו": "וֵ",
    "ה": "הַ",
}


CURRENCY = {
    "שקל": "שֵׁ֫קֶל",
    "שקלים": "שְׁקָלִים",
    "אגורה": "אֲגוֹרָה",
    "אגורות": "אֲגוֹרוֹת",
    "אירו": "אֵ֫ירוֹ",
    "סנט": "סֵנְט",
    "סנטים": "סֵ֫נְטִים",
    "דולר": "ד֫וֹלָר",
    "דולרים": "דוֹלָ֫רִים",
}


POINTS = {
    "מינוס": "מִ֫ינּוּס",
    "נקודה": "נְֽקֻדָּה",
}

NUMBER_NAMES = {
    **CURRENCY,
    **HUNDREDS,
    **LARGE,
    **LETTERS,
    **ONES,
    **POINTS,
    **TENS,
    **THOUSANDS,
    **TWENTIES,
    **ZERO,
}
phonikud/expander/numbers.py
ADDED
@@ -0,0 +1,39 @@
import num2words
from .number_names import NUMBER_NAMES
import re


def add_diacritics(words: str):
    new_words = []
    for word in words.split():
        if NUMBER_NAMES.get(word):
            new_words.append(NUMBER_NAMES[word])
        elif NUMBER_NAMES.get(word[1:]):
            # With Vav or Bet
            new_words.append(NUMBER_NAMES[word[0]] + NUMBER_NAMES[word[1:]])
        else:
            new_words.append(word)
    return " ".join(new_words)


def num_to_word(maybe_number: str) -> str:
    def replace_number(match):
        num: str = match.group()
        suffix, prefix = "", ""
        # prefix
        if not num.startswith("-") and not num[0].isdigit():
            prefix = num[0]
            num = num[1:]
        if not num[-1].isdigit():
            suffix = num[-1]
            num = num[:-1]
        words = num2words.num2words(num, lang="he", ordinal=False)
        words_with_diacritics = add_diacritics(words)
        return (
            f"{prefix.strip()} {words_with_diacritics.strip()} {suffix.strip()}".strip()
        )

    # Replace all whole numbers in the string
    result = re.sub(r"[^\d\-]?-?\d+(?:[\.,]\d+)?[^\d]?", replace_number, maybe_number)

    return result
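Usage sketch for num_to_word; in the pipeline it is called one whitespace-separated token at a time, so the examples below use single tokens:

from phonikud.expander.numbers import num_to_word

print(num_to_word("3"))   # a bare digit becomes a diacritized Hebrew number word
print(num_to_word("5%"))  # a non-digit suffix is kept after the expanded number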
phonikud/expander/time_to_word.py
ADDED
@@ -0,0 +1,104 @@
"""
Convert time to words
TODO: fix zeros eg. 22:00
"""

import re

PATTERNS = [
    r"(\d{1,2})([apm]{2})",  # AM/PM format
    r"(\d{1,2}):(\d{2})",  # HH:MM format
]


def extract_time(match):
    """
    Extract hour and minute from a string in HH:MM or AM/PM format
    and return the spoken Hebrew form.
    """
    time_str = match.group(0).lower().strip()

    # Check for HH:MM format
    match = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if match:
        h = int(match.group(1))
        m = int(match.group(2))
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format
    match = re.match(r"(\d{1,2})([apm]{2})", time_str)
    if match:
        h = int(match.group(1))
        period = match.group(2)

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        return f"{convert_to_word(h, 0)}"  # Defaulting to 0 minutes when only hour is provided

    return match.group(0)  # Return original text if the format is not recognized


def convert_to_word(h, m):
    hours = [
        "אֶפֶס",
        "אַחַת",
        "שְׁנַיִם",  # Will be replaced with "שֵׁנִי" when needed
        "שָׁלוֹשׁ",
        "אַ֫רְבַּע",
        "חָמֵשׁ",
        "שֵׁשׁ",
        "שֶׁ֫בַע",
        "שְׁמ֫וֹנֵה",
        "תֵּ֫שַׁע",
        "עֵ֫שֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
    ]

    tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]

    ten_to_twenty = [
        "עֵ֫שֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
        "שְׁלוֹשׁ עֶשְׂרֵה",
        "אַרְבַּע עֶשְׂרֵה",
        "חֲמֵשׁ עֶשְׂרֵה",
        "שֵׁשׁ עֶשְׂרֵה",
        "שְׁבַע עֶשְׂרֵה",
        "שְׁמוֹנֶה עֶשְׂרֵה",
        "תְּשַׁע עֶשְׂרֵה",
    ]

    vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}

    # Convert 0 hours to 12 (midnight)
    if h == 0:
        h = 12

    elif h > 12:
        h -= 12

    if m == 0:
        return f"{hours[h]}"

    elif 1 <= m <= 9:
        minute_word = (
            vocab["shtey"] if m == 2 else hours[m]
        )  # Replace "שניים" with "שני"
        return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"

    elif 10 <= m <= 19:
        return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"

    else:
        tens_part = f"{vocab['and']}{tens[m // 10]}"
        units_part = f"{vocab['and']}{hours[m % 10]}" if m % 10 != 0 else ""
        return f"{hours[h]} {tens_part} {units_part} {vocab['minutes']}".strip()


def time_to_word(text: str):
    return re.sub("|".join(PATTERNS), extract_time, text)
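Usage sketch for time_to_word with the two supported formats (example sentences are illustrative):

from phonikud.expander.time_to_word import time_to_word

print(time_to_word("ניפגש ב 14:30"))  # HH:MM is replaced with the spoken form
print(time_to_word("9am פגישה"))      # AM/PM times are folded into 12-hour Hebrew words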
phonikud/hebrew.py
ADDED
@@ -0,0 +1,249 @@
"""
Hebrew Phonemizer

Rules implemented:
1. Consonant handling (including special cases)
2. Nikud (vowel) processing
3. Dagesh handling
4. Geresh handling
5. Shva Na prediction
6. Special letter combinations

Reference:
- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
- https://he.wikipedia.org/wiki/הברה
- https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
- https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט
- https://hebrew-academy.org.il/2022/03/03/מלעיל-ומלרע-על-ההטעמה-בעברית
"""

from typing import Literal
from phonikud.variants import Letter
from phonikud import lexicon
import re
from phonikud.utils import sort_stress

SHVA = "\u05b0"
SIN = "\u05c2"
PATAH = "\u05b7"
KAMATZ = "\u05b8"
HATAF_KAMATZ = "\u05b3"
DAGESH = "\u05bc"
HOLAM = "\u05b9"
HIRIK = "\u05b4"
PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
KUBUTS = "\u05bb"
TSERE = "\u05b5"
HATAMA = "\u05ab"
VAV_HOLAM = "\u05ba"
SEGOL = "\u05b6"


def phonemize_hebrew(
    letters: list[Letter],
    stress_placement: Literal["syllable", "vowel"],
) -> list[str]:
    phonemes = []
    i = 0

    while i < len(letters):
        cur = letters[i]
        prev = letters[i - 1] if i > 0 else None
        next = letters[i + 1] if i < len(letters) - 1 else None
        next_phonemes, skip_offset = letter_to_phonemes(
            cur, prev, next, stress_placement=stress_placement
        )
        # TODO: split into syllables
        # next_letters = next_phonemes, letters[i:i+skip_offset+1]
        phonemes.extend(next_phonemes)
        i += skip_offset + 1

    return phonemes


def letter_to_phonemes(
    cur: Letter,
    prev: Letter | None,
    next: Letter | None,
    stress_placement: Literal["syllable", "vowel"],
) -> tuple[str, int]:
    cur_phonemes = []
    skip_diacritics = False
    skip_consonants = False
    skip_offset = 0

    if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
        skip_consonants = True
        skip_diacritics = True

    elif cur.char == "א" and not cur.diac and prev:
        if next and next.char != "ו":
            skip_consonants = True

    elif (
        cur.char == "י"
        and next
        # Yud without diacritics
        and not cur.diac
        # In middle
        and prev
        # Prev Hirik
        and prev.char + prev.diac != "אֵ"
        # Next Vav has meaning
        and not (next.char == "ו" and next.diac and "\u05b0" not in next.diac)
    ):
        skip_consonants = True

    elif cur.char == "ש" and SIN in cur.diac:
        if (
            next
            and next.char == "ש"
            and not next.diac
            and re.search("[\u05b7\u05b8]", cur.diac)
        ):
            # ^ יששכר
            cur_phonemes.append("sa")
            skip_consonants = True
            skip_diacritics = True
            skip_offset += 1
        else:
            cur_phonemes.append("s")
            skip_consonants = True

    # shin without nikud after sin = sin
    elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
        cur_phonemes.append("s")
        skip_consonants = True

    elif not next and cur.char == "ח" and PATAH in cur.diac:
        # Final Het gnuva
        cur_phonemes.append("ax")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ה" and PATAH in cur.diac:
        # Final He gnuva
        cur_phonemes.append("ah")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ע" and PATAH in cur.diac:
        # Final Ayin gnuva
        cur_phonemes.append("a")
        skip_diacritics = True
        skip_consonants = True

    if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
        if cur.char == "ת":
            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
            skip_diacritics = True
            skip_consonants = True
        else:
            # Geresh
            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
            skip_consonants = True

    elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES:  # dagesh
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
        skip_consonants = True
    elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
        skip_consonants = True

        if prev and "\u05b0" in prev.diac and re.findall("[\u05b9-\u05ba]", cur.diac):
            # ^ לִגְוֹעַ
            cur_phonemes.append("vo")
            skip_diacritics = True
            skip_consonants = True

        elif next and next.char == "ו":
            # One of them has holam

            holams = re.findall("[\u05b9-\u05ba]", cur.diac + next.diac)
            if len(holams) == 2:
                cur_phonemes.append("wo")
                skip_diacritics = True
                skip_offset += 1
            if len(holams) == 1:
                cur_phonemes.append("vo")
                skip_diacritics = True
                skip_offset += 1
            # patah and next.diac empty
            elif cur.diac == next.diac:
                # double Vav
                cur_phonemes.append("vu")
                skip_diacritics = True
                skip_offset += 1
            elif HIRIK in cur.diac:
                cur_phonemes.append("vi")
                skip_diacritics = True
            elif SHVA in cur.diac and not next.diac:
                cur_phonemes.append("v")
                skip_diacritics = True
            elif KAMATZ in cur.diac or PATAH in cur.diac:
                cur_phonemes.append("va")
                skip_diacritics = True
            elif SEGOL in cur.diac:
                cur_phonemes.append("ve")
                skip_diacritics = True
            else:
                # TODO ?
                # skip_consonants = False
                skip_diacritics = False
        else:
            # Single vav

            # Vav with Patah
            if re.search(PATAH_LIKE_PATTERN, cur.diac):
                cur_phonemes.append("va")

            # Tsere
            elif TSERE in cur.diac:
                cur_phonemes.append("ve")
            elif SEGOL in cur.diac:
                cur_phonemes.append("ve")
            # Holam haser
            elif HOLAM in cur.diac:
                cur_phonemes.append("o")
            # Shuruk / Kubutz
            elif KUBUTS in cur.diac or DAGESH in cur.diac:
                cur_phonemes.append("u")
            # Vav with Shva in start
            elif SHVA in cur.diac and not prev:
                cur_phonemes.append("ve")
            # Hirik
            elif HIRIK in cur.diac:
                cur_phonemes.append("vi")
            elif next and not cur.diac:
                # It is fine for now since we use Dicta
                skip_consonants = True
                skip_diacritics = True
            else:
                cur_phonemes.append("v")

            skip_diacritics = True

    if not skip_consonants:
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))

    if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
        cur_phonemes.append("o")
        skip_diacritics = True

    nikud_phonemes = []
    if not skip_diacritics:
        nikud_phonemes = [
            lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
        ]
    elif skip_diacritics and lexicon.HATAMA_DIACRITIC in cur.all_diac:
        nikud_phonemes = [lexicon.STRESS_PHONEME]
    cur_phonemes.extend(nikud_phonemes)
    # Ensure the stress is at the beginning of the syllable
    cur_phonemes = sort_stress(cur_phonemes, stress_placement)
    cur_phonemes = [
        p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)
    ]
    # Remove empty phonemes
    cur_phonemes = [p for p in cur_phonemes if p]
    return cur_phonemes, skip_offset
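This module is normally driven by Phonemizer, but it can be poked at directly. A small hypothetical sketch, assuming a diacritized word (output shown only via print, since the exact phoneme string depends on the rules above):

from phonikud.utils import get_letters
from phonikud.hebrew import phonemize_hebrew

letters = get_letters("שָׁלוֹם")
print("".join(phonemize_hebrew(letters, stress_placement="vowel")))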
phonikud/lexicon.py
ADDED
@@ -0,0 +1,115 @@
"""
ASCII IPA transcription of Hebrew consonants and vowels.
"""

# https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table

# Non standard diacritics
SHVA_NA_DIACRITIC = "\u05bd"  # Meteg
HATAMA_DIACRITIC = "\u05ab"  # Ole
PREFIX_DIACRITIC = "|"  # Vertical bar
NIKUD_HASER_DIACRITIC = "\u05af"  # Masora, not in use
EN_GERESH = "'"
NON_STANDARD_DIAC = "".join(
    [
        SHVA_NA_DIACRITIC,
        HATAMA_DIACRITIC,
        PREFIX_DIACRITIC,
        NIKUD_HASER_DIACRITIC,
        EN_GERESH,
    ]
)

HE_PATTERN = rf'[\u05b0-\u05ea{NON_STANDARD_DIAC}"]+'
# ^ Standard nikud and letters, ole, meteg, masora, vertical bar, en geresh
HE_NIKUD_PATTERN = rf"[\u05b0-\u05c7{NON_STANDARD_DIAC}]"
# ^ Nikud diacritics, ole, meteg, masora, vertical bar, en geresh
PUNCTUATION = set(r".,!? ")

STRESS_PHONEME = "ˈ"  # \u02c8, visually looks like a single quote
SPECIAL_PHONEMES = ["w"]
MODERN_SCHEMA = {
    "x": "χ",  # Het
    "r": "ʁ",  # Resh
    "g": "ɡ",  # Gimel
}

# Geresh
GERESH_PHONEMES = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}

# Consonants
LETTERS_PHONEMES = {
    "א": "ʔ",  # Alef
    "ב": "v",  # Bet
    "ג": "g",  # Gimel
    "ד": "d",  # Dalet
    "ה": "h",  # He
    "ו": "v",  # Vav
    "ז": "z",  # Zayin
    "ח": "x",  # Het
    "ט": "t",  # Tet
    "י": "j",  # Yod
    "ך": "x",  # Haf sofit
    "כ": "x",  # Haf
    "ל": "l",  # Lamed
    "ם": "m",  # Mem Sofit
    "מ": "m",  # Mem
    "ן": "n",  # Nun Sofit
    "נ": "n",  # Nun
    "ס": "s",  # Samekh
    "ע": "ʔ",  # Ayin, only voweled
    "פ": "f",  # Fey
    "ף": "f",  # Fey Sofit
    "ץ": "ts",  # Tsadik sofit
    "צ": "ts",  # Tsadik
    "ק": "k",  # Kuf
    "ר": "r",  # Resh
    "ש": "ʃ",  # Shin
    "ת": "t",  # Taf
    # Beged Kefet
    "בּ": "b",
    "כּ": "k",
    "פּ": "p",
    # Shin Sin
    "שׁ": "ʃ",
    "שׂ": "s",
    "'": "",
}

NIKUD_PHONEMES = {
    "\u05b4": "i",  # Hiriq
    "\u05b1": "e",  # Hataf segol
    "\u05b5": "e",  # Tsere
    "\u05b6": "e",  # Segol
    "\u05b2": "a",  # Hataf Patah
    "\u05b7": "a",  # Patah
    "\u05c7": "o",  # Kamatz katan
    "\u05b9": "o",  # Holam
    "\u05ba": "o",  # Holam haser for vav
    "\u05bb": "u",  # Qubuts
    "\u05b3": "o",  # Hataf qamats
    "\u05b8": "a",  # Kamatz
    HATAMA_DIACRITIC: STRESS_PHONEME,  # Stress (Hat'ama)
    SHVA_NA_DIACRITIC: "e",  # Shva na
}

DEDUPLICATE = {
    "\u05f3": "'",  # Hebrew geresh to regular geresh
    "־": "-",  # Hebrew Makaf to hyphen
}

# Sets
SET_PHONETIC_DIACRITICS = set([HATAMA_DIACRITIC, PREFIX_DIACRITIC, SHVA_NA_DIACRITIC])

ADDITIONAL_PHONEMES = set()  # When using fallback
SET_PHONEMES = set(
    sorted(
        {
            *NIKUD_PHONEMES.values(),
            *LETTERS_PHONEMES.values(),
            *GERESH_PHONEMES.values(),
            *MODERN_SCHEMA.values(),
            *SPECIAL_PHONEMES,
        }
    )
)
phonikud/log.py
ADDED
@@ -0,0 +1,35 @@
import logging
import os
import colorlog


def _create_logger():
    """
    Create a logger with colorized output
    Usage: LOG_LEVEL=DEBUG python <script.py>
    """

    handler = colorlog.StreamHandler()
    fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
    handler.setFormatter(
        colorlog.ColoredFormatter(
            fmt=fmt,
            log_colors={
                "DEBUG": "blue",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red",
            },
        )
    )
    # Get log level from LOG_LEVEL environment variable
    log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
    logger = colorlog.getLogger(__package__)
    logger.setLevel(level=getattr(logging, log_level, logging.WARNING))
    # Setup logging to stdout
    logger.addHandler(handler)
    return logger


log = _create_logger()
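Quick usage sketch of the package logger; the default level is WARNING, so debug output only appears when the environment variable is set (e.g. LOG_LEVEL=DEBUG python app.py):

from phonikud.log import log

log.debug("visible only when LOG_LEVEL=DEBUG")
log.warning("visible with the default level")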
phonikud/phonemize.py
ADDED
@@ -0,0 +1,130 @@
from phonikud import lexicon
from phonikud.variants import Letter
from .expander import Expander
from phonikud.utils import (
    get_letters,
    normalize,
    post_normalize,
    post_clean,
    add_milra_hatama,
    mark_shva_na,
    sort_hatama,
)
from typing import Callable, Literal
import regex as re
from phonikud.hebrew import phonemize_hebrew


class Phonemizer:
    # TODO: is that enough? what if there's punctuation around? other chars?
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation: bool,
        preserve_stress: bool,
        use_expander: bool,
        use_post_normalize: bool,  # For TTS
        predict_stress: bool,
        predict_shva_nah: bool,
        stress_placement: Literal["syllable", "vowel"],
        schema: Literal["plain", "modern"],
        fallback: Callable[[str], str] = None,
    ) -> str | list[str]:
        # normalize
        text = normalize(text)

        def fallback_replace_callback(match: re.Match):
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # skip
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            for c in phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)

        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match, original_text: str):
            word = match.group(0)
            start_offset = match.start()
            if start_offset > 0 and original_text[start_offset - 1] == "[":
                # Skip if it starts with [ as it's used for hyper phonemes
                return word

            if predict_shva_nah:
                word = mark_shva_na(word)
            if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
                word = add_milra_hatama(word)
            letters: list[Letter] = get_letters(word)
            letters = sort_hatama(letters)

            phonemes: list[str] = phonemize_hebrew(
                letters,
                stress_placement=stress_placement,
            )
            phonemes = "".join(phonemes)
            # syllables = get_syllables(phonemes)

            # phonemes_text = "".join(phonemes)
            # # if predict_stress and lexicon.STRESS not in phonemes_text and syllables:
            # #     if len(syllables) == 1:
            # #         syllables[-1] = lexicon.STRESS + syllables[-1]
            # #         syllables[-1] = "".join(sort_stress(syllables[-1]))
            # #     elif any(
            # #         remove_nikud(word).endswith(i) for i in lexicon.MILHEL_PATTERNS
            # #     ) or phonemes_text.endswith("ax"):
            # #         # insert lexicon.STRESS in the first character of syllables[-2]
            # #         syllables[-2] = lexicon.STRESS + syllables[-2]
            # #         syllables[-2] = "".join(sort_stress(syllables[-2]))
            # #     else:
            # #         # insert in syllables[-1]
            # #         syllables[-1] = lexicon.STRESS + syllables[-1]
            # #         syllables[-1] = "".join(sort_stress(syllables[-1]))

            # phonemes = "".join(syllables)
            if use_post_normalize:
                phonemes = post_normalize(phonemes)

            if schema == "modern":
                # We'll keep this feature simple for now
                for k, v in lexicon.MODERN_SCHEMA.items():
                    phonemes = re.sub(k, v, phonemes)
            return phonemes

        text = re.sub(
            lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
        )

        def hyper_phonemes_callback(match: re.Match):
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)
            for c in matched_phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return matched_phonemes  # The phoneme is in the second group

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(i for i in text if i not in [lexicon.STRESS_PHONEME])
        if use_post_normalize:
            # We don't keep hyphens in the output; replace them with a space instead
            text = post_clean(text)
        return text
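The hyper-phoneme syntax documented in hyper_phonemes_callback can be used from the top-level API; a short sketch (the mixed sentence is an illustrative example):

from phonikud import phonemize

# The bracketed word is replaced verbatim by the IPA given between /…/,
# while the Hebrew part goes through the normal pipeline
print(phonemize("[hello](/hɛˈloʊ/) עוֹלָם"))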
phonikud/syllables.py
ADDED
@@ -0,0 +1,103 @@
"""
https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table

TODO: add to phonikud?
"""

import regex as re
import phonikud

VOWEL_DIACS = [chr(i) for i in range(0x05B1, 0x05BC)] + [chr(0x05C7)] + [chr(0x5BD)]

STRESS = "\u05ab"
SHVA = "\u05b0"
DAGESH = "\u05bc"


def sort_diacritics(word: str):
    def sort_diacritics_callback(match):
        letter = match.group(1)
        diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
        return letter + diacritics

    return re.sub(r"(\p{L})(\p{M}+)", sort_diacritics_callback, word)


def has_vowel_diacs(s: str):
    if s == "וּ":
        return True
    return any(i in s for i in VOWEL_DIACS)


def get_syllables(word: str) -> list[str]:
    letters = phonikud.utils.get_letters(word)
    syllables, cur = [], ""
    vowel_state = False

    i = 0
    while i < len(letters):
        letter = letters[i]
        has_vowel = has_vowel_diacs(str(letter)) or (i == 0 and SHVA in letter.all_diac)
        # Look ahead
        vav1 = i + 2 < len(letters) and letters[i + 2].char == "ו"
        vav2 = i + 3 < len(letters) and letters[i + 3].char == "ו"

        if has_vowel:
            if vowel_state:
                syllables.append(cur)
                cur = str(letter)
            else:
                cur += str(letter)
            vowel_state = True
        else:
            cur += str(letter)

        i += 1

        # If two Vavs are coming: force the current syllable to end, and join both Vavs as the next syllable
        if vav1 and vav2:
            if cur:
                # Finish current syllable
                syllables.append(cur + str(letters[i]))
                cur = ""
            cur = str(letters[i + 1]) + str(letters[i + 2])
            i += 3  # skip past the double-vav
            vowel_state = True

        # If one Vav is coming, end the syllable now
        elif vav1 and letters[i + 1].diac:
            if cur:
                syllables.append(cur)
                cur = ""
            vowel_state = False

    if cur:
        syllables.append(cur)
    return syllables


def add_stress_to_syllable(s: str):
    letters = phonikud.utils.get_letters(s)
    letters[0].all_diac = STRESS + letters[0].all_diac
    return "".join(letter.char + letter.all_diac for letter in letters)


def add_stress(word: str, syllable_position: int):
    syllables: list[str] = get_syllables(word)

    if not syllables:
        return word  # no syllables, return original word

    # Normalize negative indices
    if syllable_position < 0:
        syllable_position += len(syllables)

    # Clamp to valid range
    syllable_position = max(0, min(syllable_position, len(syllables) - 1))

    stressed_syllable = syllables[syllable_position]
    stressed_syllable = add_stress_to_syllable(stressed_syllable)
    syllables[syllable_position] = stressed_syllable

    return "".join(syllables)
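A small sketch of the syllable helpers on a diacritized word (the word is an arbitrary example; the exact split depends on the rules above):

from phonikud.syllables import get_syllables, add_stress

word = "מַחְשֵׁב"
print(get_syllables(word))   # the word split into diacritized syllables
print(add_stress(word, -1))  # Ole (U+05AB) added to the first letter of the last syllable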
phonikud/utils.py
ADDED
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import lru_cache
|
2 |
+
from typing import Literal
|
3 |
+
from phonikud import lexicon
|
4 |
+
import unicodedata
|
5 |
+
import regex as re
|
6 |
+
import phonikud.syllables
|
7 |
+
from phonikud.variants import Letter
|
8 |
+
import phonikud
|
9 |
+
|
10 |
+
|
11 |
+
def sort_diacritics(match):
|
12 |
+
letter = match.group(1)
|
13 |
+
diacritics = "".join(sorted(match.group(2))) # Sort diacritics
|
14 |
+
return letter + diacritics
|
15 |
+
|
16 |
+
|
17 |
+
NORMALIZE_PATTERNS = {
|
18 |
+
# Sort diacritics
|
19 |
+
r"(\p{L})(\p{M}+)": sort_diacritics,
|
20 |
+
"״": '"', # Hebrew geresh to normal geresh
|
21 |
+
"׳": "'", # Same
|
22 |
+
}
|
23 |
+
|
24 |
+
|
25 |
+
def remove_nikud(text: str, to_keep=""):
|
26 |
+
pattern = lexicon.HE_NIKUD_PATTERN
|
27 |
+
pattern = "".join(i for i in pattern if i not in to_keep)
|
28 |
+
return re.sub(
|
29 |
+
pattern,
|
30 |
+
"",
|
31 |
+
text,
|
32 |
+
)
|
33 |
+
|
34 |
+
|
35 |
+
@lru_cache(maxsize=10000)
|
36 |
+
def normalize(text: str) -> str:
|
37 |
+
"""
|
38 |
+
Normalize unicode (decomposite)
|
39 |
+
Keep only Hebrew characters / punctuation / IPA
|
40 |
+
Sort diacritics
|
41 |
+
"""
|
42 |
+
|
43 |
+
# Decompose text
|
44 |
+
text = unicodedata.normalize("NFD", text)
|
45 |
+
for k, v in NORMALIZE_PATTERNS.items():
|
46 |
+
text = re.sub(k, v, text)
|
47 |
+
for k, v in lexicon.DEDUPLICATE.items():
|
48 |
+
text = re.sub(k, v, text)
|
49 |
+
return text
|
50 |
+
|
51 |
+
|
52 |
+
def post_normalize(phonemes: str):
|
53 |
+
new_phonemes = []
|
54 |
+
for word in phonemes.split(" "):
|
55 |
+
# remove glottal stop from end
|
56 |
+
word = re.sub(r"ʔ$", "", word)
|
57 |
+
# remove h from end
|
58 |
+
word = re.sub(r"h$", "", word)
|
59 |
+
word = re.sub(r"ˈh$", "", word)
|
60 |
+
# remove j followed by a i
|
61 |
+
word = re.sub(r"ij$", "i", word)
|
62 |
+
new_phonemes.append(word)
|
63 |
+
phonemes = " ".join(new_phonemes)
|
64 |
+
return phonemes
|
65 |
+
|
66 |
+
|
67 |
+
def post_clean(phonemes: str):
|
68 |
+
clean = []
|
69 |
+
for i in phonemes:
|
70 |
+
if i == "-":
|
71 |
+
clean.append(" ")
|
72 |
+
elif (
|
73 |
+
i in lexicon.SET_PHONEMES
|
74 |
+
or i in lexicon.ADDITIONAL_PHONEMES
|
75 |
+
or i == " "
|
76 |
+
or i in lexicon.PUNCTUATION
|
77 |
+
):
|
78 |
+
clean.append(i)
|
79 |
+
return "".join(clean)
|
80 |
+
|
81 |
+
|
82 |
+
letters_pattern = re.compile(r"(\p{L})([\p{M}'|]*)")
|
83 |
# @lru_cache(maxsize=10000) TODO?
def get_letters(word: str):
    letters: list[tuple[str, str]] = letters_pattern.findall(word)  # with en_geresh
    letters: list[Letter] = [Letter(i[0], i[1]) for i in letters]
    return letters


def get_unicode_names(text: str):
    return [unicodedata.name(c, "?") for c in text]


def has_vowel(s: str):
    return any(i in s for i in "aeiou")


def has_constant(s: str):
    return any(i not in "aeiou" for i in s)


def get_phoneme_syllables(phonemes: list[str]) -> list[str]:
    syllables = []
    cur_syllable = ""

    i = 0
    while i < len(phonemes):
        # Add current phoneme to the syllable
        cur_syllable += phonemes[i]

        # If we have a vowel in the current syllable
        if has_vowel(cur_syllable):
            # If there's a next phoneme that's a consonant followed by a vowel-containing phoneme
            if (
                i + 2 < len(phonemes)
                and not has_vowel(phonemes[i + 1])
                and has_vowel(phonemes[i + 2])
            ):
                # End the current syllable and start a new one
                syllables.append(cur_syllable)
                cur_syllable = ""
            # If we're at the end or the next phoneme has a vowel
            elif i + 1 >= len(phonemes) or has_vowel(phonemes[i + 1]):
                # End the current syllable
                syllables.append(cur_syllable)
                cur_syllable = ""

        i += 1

    # Add any remaining syllable
    if cur_syllable:
        syllables.append(cur_syllable)

    # Iterate over syllables and move any syllable ending with lexicon.STRESS to the next one
    for i in range(len(syllables) - 1):  # Ensure we're not at the last syllable
        if syllables[i].endswith(lexicon.STRESS_PHONEME):
            syllables[i + 1] = (
                lexicon.STRESS_PHONEME + syllables[i + 1]
            )  # Move stress to next syllable
            syllables[i] = syllables[i][
                : -len(lexicon.STRESS_PHONEME)
            ]  # Remove stress from current syllable

    return syllables


def sort_stress(
    phonemes: list[str], placement: Literal["syllable", "vowel"] = "vowel"
) -> list[str]:
    """
    TTS systems expect the stress mark BEFORE the vowel,
    while linguists place it at the START of the syllable.
    Use placement="syllable" to place it at the beginning.
    """
    if "ˈ" not in "".join(phonemes):
        # ^ Does not contain stress
        return phonemes
    if not any(i in "".join(phonemes) for i in "aeiou"):
        # ^ Does not contain a vowel
        return phonemes

    # Remove stress marker
    phonemes = [p for p in phonemes if p != "ˈ"]

    if placement == "syllable":
        return ["ˈ"] + phonemes

    # Define vowels
    vowels = "aeiou"

    # Find the first phoneme that contains a vowel, and inject the stress before the vowel
    for i, phoneme in enumerate(phonemes):
        for j, char in enumerate(phoneme):
            if char in vowels:
                # Insert stress before the vowel
                phonemes[i] = phoneme[:j] + "ˈ" + phoneme[j:]
                return phonemes

    # If no vowels found, return unchanged
    return phonemes


def mark_shva_na(word: str):
    """
    Shva Na is context-independent and can be predicted with just the word or a dictionary.
    See https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
    Note: we predict only if the Shva is on the first letter of the word
    Note: we assume that the word comes with | to mark prefix letters ('Txiliyot')
    Note: Shva Na rules mid-word are unreliable, so we don't code them.

    Meteg (\u05bd) will be added to the letter with Shva Na

    What we don't predict:
    (1) some shva at the beginning of future-tense forms (we don't know)
    (2) shva in the middle of the word
    """
    letters = get_letters(word)
    if not letters:
        return word
    if letters[0].char in "למנרי":
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
    elif len(letters) > 1 and letters[1].char in "אעה":
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
    elif letters[0].char in "וכלב" and lexicon.PREFIX_DIACRITIC in letters[0].all_diac:
        # ^ The nakdan should add |
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
    # Ensure that the prefix character will be last
    for letter in letters:
        if "|" in letter.all_diac:
            letter.all_diac = letter.all_diac.replace("|", "") + "|"
    return "".join(str(i) for i in letters)


def sort_hatama(letters: list[Letter]) -> list[Letter]:
    for i in range(len(letters) - 1):
        diacs = list(letters[i].all_diac)
        if lexicon.HATAMA_DIACRITIC in diacs and lexicon.NIKUD_HASER_DIACRITIC in diacs:
            diacs.remove(lexicon.HATAMA_DIACRITIC)
            letters[i].all_diac = "".join(diacs)  # Reassign the updated diacritics
            letters[i + 1].all_diac += lexicon.HATAMA_DIACRITIC
    return letters


def add_milra_hatama(word: str):
    syllables = phonikud.syllables.get_syllables(word)
    stress_index = -1

    if not syllables:
        return word

    if len(syllables) == 1:
        stress_index = 0

    # Get the last syllable
    milra = syllables[stress_index]
    # Get letters
    letters = get_letters(milra)
    # Add Hatama
    letters[0].all_diac += lexicon.HATAMA_DIACRITIC

    # Replace the last syllable
    syllables[stress_index] = "".join(str(i) for i in letters)
    return "".join(syllables)
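
Usage sketch for the syllable and stress helpers above (assumptions: this file is phonikud/utils.py, lexicon.STRESS_PHONEME is "ˈ", and the sample outputs are approximate):

    from phonikud.utils import get_phoneme_syllables, sort_stress

    phonemes = ["m", "a", "ˈ", "j", "i", "m"]   # hypothetical phoneme sequence
    print(get_phoneme_syllables(phonemes))      # roughly ["ma", "ˈjim"]
    print(sort_stress(["ˈ", "ma", "jim"]))      # stress re-inserted before the first vowel: ["mˈa", "jim"]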
phonikud/variants.py
ADDED
@@ -0,0 +1,20 @@
import phonikud
from phonikud import lexicon


class Letter:
    def __init__(self, char: str, diac: list[str]):
        self.char = phonikud.normalize(char)
        self.all_diac = phonikud.normalize(diac)
        self.diac = "".join(
            i for i in self.all_diac if i not in lexicon.SET_PHONETIC_DIACRITICS
        )

    def __repr__(self):
        return f"[Letter] {self.char}{''.join(self.all_diac)}"

    def __eq__(self, value: "Letter"):
        return value.all_diac == self.all_diac and value.char == self.char

    def __str__(self):
        return self.char + self.all_diac
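
A small illustration of the Letter wrapper above (assuming phonikud.normalize, imported from the package root, leaves this simple input unchanged):

    from phonikud.variants import Letter

    lamed = Letter("ל", "\u05b0")    # lamed with a shva
    print(str(lamed))                # base char followed by its diacritics: "לְ"
    print(repr(lamed))               # "[Letter] לְ"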
phonikud_onnx/__init__.py
ADDED
@@ -0,0 +1,41 @@
from .model import OnnxModel
import re


class Phonikud:
    def __init__(self, model_path: str):
        self.model = OnnxModel(model_path)

    def add_diacritics(
        self, sentences: list | str, mark_matres_lectionis: str | None = None
    ) -> str:
        """
        Adds nikud (Hebrew diacritics) to the given text.

        Parameters:
        - sentences (list | str): A string or a list of strings to be processed. Each string should not exceed 2048 characters.
        - mark_matres_lectionis (str | None, optional): A string used to mark nikud male. For example, if set to '|',
          "לִימּוּדָיו" will be returned as "לִי|מּוּדָיו". Default is None (no marking).

        Returns:
        - str: The text with added diacritics.
        """

        if isinstance(sentences, str):
            sentences = [sentences]
        result = self.model.predict(
            sentences, mark_matres_lectionis=mark_matres_lectionis
        )
        return result[0]

    def get_nikud_male(self, text: str, mark_matres_lectionis: str):
        """
        Remove the given mark character, keeping the marked letters as nikud male
        """
        return text.replace(mark_matres_lectionis, "")

    def get_nikud_haser(self, text: str):
        """
        Remove each letter marked as nikud male together with its mark character, keeping nikud haser
        """
        return re.sub(r".\|", "", text)  # Remove {char}{matres_lectionis}
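
Usage sketch for the Phonikud wrapper above, mirroring how app.py calls it; '|' as the matres-lectionis mark follows the docstring example:

    from phonikud_onnx import Phonikud

    phonikud = Phonikud("./phonikud-1.0.int8.onnx")
    marked = phonikud.add_diacritics("למדתי עברית", mark_matres_lectionis="|")
    nikud_male = phonikud.get_nikud_male(marked, mark_matres_lectionis="|")
    nikud_haser = phonikud.get_nikud_haser(marked)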
phonikud_onnx/model.py
ADDED
@@ -0,0 +1,197 @@
import onnxruntime as ort
import numpy as np
from tokenizers import Tokenizer
import re

# Constants
NIKUD_CLASSES = [
    "",
    "<MAT_LECT>",
    "\u05bc",
    "\u05b0",
    "\u05b1",
    "\u05b2",
    "\u05b3",
    "\u05b4",
    "\u05b5",
    "\u05b6",
    "\u05b7",
    "\u05b8",
    "\u05b9",
    "\u05ba",
    "\u05bb",
    "\u05bc\u05b0",
    "\u05bc\u05b1",
    "\u05bc\u05b2",
    "\u05bc\u05b3",
    "\u05bc\u05b4",
    "\u05bc\u05b5",
    "\u05bc\u05b6",
    "\u05bc\u05b7",
    "\u05bc\u05b8",
    "\u05bc\u05b9",
    "\u05bc\u05ba",
    "\u05bc\u05bb",
    "\u05c7",
    "\u05bc\u05c7",
]
SHIN_CLASSES = ["\u05c1", "\u05c2"]  # shin, sin
MAT_LECT_TOKEN = "<MAT_LECT>"
MATRES_LETTERS = list("אוי")
ALEF_ORD = ord("א")
TAF_ORD = ord("ת")
STRESS_CHAR = "\u05ab"  # "ole" symbol marks stress
MOBILE_SHVA_CHAR = "\u05bd"  # "meteg" symbol marks shva na (mobile shva)
PREFIX_CHAR = "|"


def is_hebrew_letter(char):
    return ALEF_ORD <= ord(char) <= TAF_ORD


def is_matres_letter(char):
    return char in MATRES_LETTERS


nikud_pattern = re.compile(r"[\u05B0-\u05BD\u05C1\u05C2\u05C7]")


def remove_nikkud(text):
    return nikud_pattern.sub("", text)


class OnnxModel:
    def __init__(
        self, model_path, tokenizer_name="dicta-il/dictabert-large-char-menaked"
    ):
        # Load the tokenizer
        self.tokenizer = Tokenizer.from_pretrained(tokenizer_name)

        # Create ONNX Runtime session
        self.session = ort.InferenceSession(model_path)
        self.input_names = [input.name for input in self.session.get_inputs()]
        self.output_names = [output.name for output in self.session.get_outputs()]

    def _create_inputs(self, sentences: list[str], padding: str):
        # Tokenize inputs using tokenizers library
        encodings = []
        for sentence in sentences:
            encoding = self.tokenizer.encode(sentence)
            encodings.append(encoding)

        # Get the max length for padding
        max_len = max(len(enc.ids) for enc in encodings) if padding == "longest" else 0

        # Prepare batch inputs
        input_ids = []
        attention_mask = []
        offset_mapping = []

        for encoding in encodings:
            ids = encoding.ids
            masks = [1] * len(ids)
            offsets = encoding.offsets

            # Pad if needed
            if padding == "longest" and len(ids) < max_len:
                padding_length = max_len - len(ids)
                ids = ids + [self.tokenizer.token_to_id("[PAD]")] * padding_length
                masks = masks + [0] * padding_length
                offsets = offsets + [(0, 0)] * padding_length

            input_ids.append(ids)
            attention_mask.append(masks)
            offset_mapping.append(offsets)

        # Convert to numpy arrays for ONNX Runtime
        return {
            "input_ids": np.array(input_ids, dtype=np.int64),
            "attention_mask": np.array(attention_mask, dtype=np.int64),
            # Token type IDs might be needed depending on your model
            "token_type_ids": np.zeros_like(np.array(input_ids, dtype=np.int64)),
        }, offset_mapping

    def predict(self, sentences, mark_matres_lectionis=None, padding="longest"):
        sentences = [remove_nikkud(sentence) for sentence in sentences]
        inputs, offset_mapping = self._create_inputs(sentences, padding)

        # Run inference
        outputs = self.session.run(self.output_names, inputs)

        # Process outputs based on output names
        nikud_idx = self.output_names.index("nikud_logits")
        shin_idx = self.output_names.index("shin_logits")
        nikud_logits = outputs[nikud_idx]
        shin_logits = outputs[shin_idx]

        additional_idx = self.output_names.index("additional_logits")
        additional_logits = outputs[additional_idx]

        # Get predictions
        nikud_predictions = np.argmax(nikud_logits, axis=-1)
        shin_predictions = np.argmax(shin_logits, axis=-1)
        stress_predictions = (additional_logits[..., 1] > 1).astype(np.int32)
        mobile_shva_predictions = (additional_logits[..., 2] > 1).astype(np.int32)
        prefix_predictions = (additional_logits[..., 3] > 1).astype(np.int32)

        ret = []
        for sent_idx, (sentence, sent_offsets) in enumerate(
            zip(sentences, offset_mapping)
        ):
            # Assign the nikud to each letter
            output = []
            prev_index = 0
            for idx, offsets in enumerate(sent_offsets):
                # Add anything we missed
                if offsets[0] > prev_index:
                    output.append(sentence[prev_index : offsets[0]])
                if offsets[1] - offsets[0] != 1:
                    continue

                # Get next char
                char = sentence[offsets[0] : offsets[1]]
                prev_index = offsets[1]
                if not is_hebrew_letter(char):
                    output.append(char)
                    continue

                nikud = NIKUD_CLASSES[nikud_predictions[sent_idx][idx]]
                shin = (
                    "" if char != "ש" else SHIN_CLASSES[shin_predictions[sent_idx][idx]]
                )

                # Check for matres lectionis
                if nikud == MAT_LECT_TOKEN:
                    if not is_matres_letter(char):
                        nikud = ""  # Don't allow matres on irrelevant letters
                    elif mark_matres_lectionis is not None:
                        nikud = mark_matres_lectionis
                    else:
                        output.append(char)
                        continue

                stress = (
                    STRESS_CHAR
                    if stress_predictions is not None
                    and stress_predictions[sent_idx][idx] == 1
                    else ""
                )
                mobile_shva = (
                    MOBILE_SHVA_CHAR
                    if mobile_shva_predictions is not None
                    and mobile_shva_predictions[sent_idx][idx] == 1
                    else ""
                )

                prefix = (
                    PREFIX_CHAR
                    if prefix_predictions is not None
                    and prefix_predictions[sent_idx][idx] == 1
                    else ""
                )

                output.append(char + shin + nikud + stress + mobile_shva + prefix)
            output.append(sentence[prev_index:])
            ret.append("".join(output))

        return ret
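
Sketch of driving OnnxModel directly (Phonikud above is a thin wrapper around it); the tokenizer is fetched from the dicta-il/dictabert-large-char-menaked repo named in the constructor:

    from phonikud_onnx.model import OnnxModel

    model = OnnxModel("./phonikud-1.0.int8.onnx")
    results = model.predict(["שלום עולם"], mark_matres_lectionis="|")  # one diacritized string per input sentence
    print(results[0])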
phonikud_onnx/py.typed
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,43 @@
# This file was autogenerated by uv via the following command:
#    uv export --no-hashes --no-emit-project
colorama==0.4.6 ; sys_platform == 'win32'
    # via
    #   colorlog
    #   pytest
    #   tqdm
colorlog==6.9.0
    # via phonikud
docopt==0.6.2
    # via num2words
exceptiongroup==1.3.0 ; python_full_version < '3.11'
    # via pytest
iniconfig==2.1.0
    # via pytest
num2words==0.5.14
    # via phonikud
numpy==2.2.6
    # via pandas
packaging==25.0
    # via pytest
pandas==2.2.3
pluggy==1.6.0
    # via pytest
pytest==8.3.5
python-dateutil==2.9.0.post0
    # via pandas
pytz==2025.2
    # via pandas
regex==2024.11.6
    # via phonikud
ruff==0.11.11
six==1.17.0
    # via python-dateutil
tomli==2.2.1 ; python_full_version < '3.11'
    # via pytest
tqdm==4.67.1
typing-extensions==4.13.2 ; python_full_version < '3.11'
    # via exceptiongroup
tzdata==2025.2
    # via pandas
gradio>=5.25.2
phonikud_onnx