thewh1teagle committed
Commit 1866014 · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.onnx filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Phonemize in Hebrew
+ emoji: 🐢
+ colorFrom: red
+ colorTo: green
+ sdk: gradio
+ sdk_version: "4.44.0"
+ app_file: app.py
+ pinned: false
+ ---
app.py ADDED
@@ -0,0 +1,109 @@
+ """
+ uv pip install gradio
+ uv run gradio examples/editor.py
+ """
+
+ from phonikud import phonemize, lexicon
+ from phonikud.utils import remove_nikud
+ import gradio as gr
+ from phonikud_onnx import Phonikud
+ from pathlib import Path
+
+
+ default_text = """
+ הַדַּיָּיג נִצְמָד לְדֹופֶן הַסִּירָה בִּזְמַן הַסְּעָרָה.
+ הִסְבַּרְתִּי לָהּ אֶת הַכֹּל, וְאָמַרְתִּי בְּדִיּוּק מָה קָרָה.
+ הַיְּלָדִים אָהֲבוּ בִּמְיֻוחָד אֶת הַסִּיפּוּרִים הַלָּלוּ שֶׁהַמּוֹרָה הִקְרִיאָה.
+ """.strip()
+
+
+ def on_phonikud_toggle(use_phonikud):
+     if not use_phonikud:
+         return default_text
+     return remove_nikud(default_text)
+
+
+ css = """
+ .input textarea {
+     font-size: 22px;
+     padding: 15px;
+     height: 200px;
+ }
+
+ .phonemes {
+     background: var(--input-background-fill);
+     padding: 5px;
+     min-height: 50px;
+ }
+ """
+
+ theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Noto Sans Hebrew")])
+
+ phonikud = None
+ model_path = Path("./phonikud-1.0.int8.onnx")
+ if model_path.exists():
+     phonikud = Phonikud(str(model_path))
+
+
+ def on_submit(text: str, schema: str, use_phonikud: bool) -> str:
+     diacritized = (
+         phonikud.add_diacritics(
+             text, mark_matres_lectionis=lexicon.NIKUD_HASER_DIACRITIC
+         )
+         if phonikud and use_phonikud
+         else text
+     )
+     phonemes = phonemize(
+         diacritized, predict_stress=True, schema=schema, predict_shva_nah=False
+     )
+     if use_phonikud:
+         return f"<div dir='rtl' style='font-size: 22px;'>{diacritized.strip()}</div><br><div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"
+     else:
+         return f"<div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"
+
+
+ with gr.Blocks(theme=theme, css=css) as demo:
+     text_input = gr.Textbox(
+         value=remove_nikud(default_text),
+         label="Text",
+         rtl=True,
+         elem_classes=["input"],
+         lines=7,
+     )
+
+     with gr.Row():
+         schema_dropdown = gr.Dropdown(
+             choices=["modern", "plain"], value="plain", label="Phoneme Schema"
+         )
+         use_phonikud_checkbox = gr.Checkbox(
+             value=True, label="Use Phonikud (add diacritics)"
+         )
+
+     submit_button = gr.Button("Create")
+     output_box = gr.Markdown(label="Phonemes + Diacritics", elem_classes=["phonemes"])
+     use_phonikud_checkbox.change(
+         fn=lambda use_phonikud: (
+             on_phonikud_toggle(use_phonikud),  # Update text_input
+             on_submit(
+                 # Note: reads the dropdown's initial value, not its live state
+                 on_phonikud_toggle(use_phonikud), schema_dropdown.value, use_phonikud
+             ),  # Update output_box
+         ),
+         inputs=use_phonikud_checkbox,
+         outputs=[text_input, output_box],  # Update both text input and output box
+     )
+
+     submit_button.click(
+         fn=on_submit,
+         inputs=[text_input, schema_dropdown, use_phonikud_checkbox],
+         outputs=output_box,
+     )
+
+     gr.Markdown("""
+ <p style='text-align: center;'><a href='https://github.com/thewh1teagle/phonikud' target='_blank'>Phonikud on Github</a></p>
+     """)
+
+ if __name__ == "__main__":
+     demo.launch()
phonikud-1.0.int8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c4e7b0dbb263315ca124865da1ef3da3e91f64fb8acec6c437312a6bc0a8d51
+ size 307683244
phonikud/__init__.py ADDED
@@ -0,0 +1,39 @@
+ """
+ High level phonemize functions
+ """
+
+ from .phonemize import Phonemizer
+ from .utils import normalize  # noqa: F401
+ from typing import Callable, Literal
+
+ phonemizer = Phonemizer()
+
+
+ def phonemize(
+     text: str,
+     preserve_punctuation=True,
+     preserve_stress=True,
+     use_expander=True,
+     use_post_normalize=True,  # For TTS
+     predict_stress=True,
+     predict_shva_nah=True,
+     stress_placement: Literal["syllable", "vowel"] = "vowel",
+     schema: Literal["plain", "modern"] = "modern",
+     fallback: Callable[[str], str] = None,
+ ) -> str:
+     """
+     Set stress_placement="syllable" to place stress at the syllable start.
+     """
+     phonemes = phonemizer.phonemize(
+         text,
+         preserve_punctuation=preserve_punctuation,
+         preserve_stress=preserve_stress,
+         fallback=fallback,
+         use_expander=use_expander,
+         use_post_normalize=use_post_normalize,
+         predict_stress=predict_stress,
+         schema=schema,
+         predict_shva_nah=predict_shva_nah,
+         stress_placement=stress_placement,
+     )
+     return phonemes
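A minimal usage sketch of the high-level API added above (illustrative, not part of the commit; the sample word and printed output are assumptions):

    from phonikud import phonemize

    print(phonemize("שָׁלוֹם"))  # e.g. ʃalˈom with the default "modern" schema
    print(phonemize("שָׁלוֹם", schema="plain", preserve_stress=False))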
phonikud/data/rashej_tevot.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "צה״ל": "tsˈahal"
+ }
phonikud/data/special.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "וַאלְלָה": "wˈala",
+   "וַסַבִּי": "wasˈabi",
+   "פינגוין": "pinguwˈin",
+   "וואצאפ": "wˈatsʔap",
+   "וואטסאפ": "wˈatsʔap",
+   "יאללה": "jˈala",
+   "וולטר": "wˈolter"
+ }
phonikud/data/symbols.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "₪": "ʃˈekel",
+   "$": "dˈolar",
+   "%": "axˈuz"
+ }
phonikud/expander/__init__.py ADDED
@@ -0,0 +1,33 @@
+ """
+ Expand dates and numbers into words with nikud
+ This happens before phonemization
+ """
+
+ from .numbers import num_to_word
+ from .dates import date_to_word
+ from .time_to_word import time_to_word
+ from .dictionary import Dictionary
+ from phonikud.log import log
+
+
+ class Expander:
+     def __init__(self):
+         self.dictionary = Dictionary()
+
+     def expand_text(self, text: str):
+         words = []
+         for source_word in text.split():
+             try:
+                 word = date_to_word(source_word)
+                 if word == source_word:
+                     word = time_to_word(word)
+                 if word == source_word:
+                     word = num_to_word(word)
+                 words.append(word)
+             except Exception as e:
+                 # Log source_word: `word` may be unbound if date_to_word raised
+                 log.error(f"Failed to expand {source_word} with error: {e}")
+                 words.append(source_word)
+         text = " ".join(words)
+         text = self.dictionary.expand_text(text)
+
+         return text
phonikud/expander/dates.py ADDED
@@ -0,0 +1,60 @@
+ from datetime import datetime
+ from .numbers import num_to_word
+
+ # Mapping of month names in Hebrew with diacritics (Gregorian months)
+ MONTHS = {
+     1: "יָ֫נוּאָר",
+     2: "פֶ֫בְרוּאָר",
+     3: "מֵ֫רְץ",
+     4: "אֵפְרִיל",
+     5: "מַאי",
+     6: "י֫וּנִי",
+     7: "י֫וּלִי",
+     8: "א֫וֹגֻסְט",
+     9: "סֶפְּטֶ֫מְבֶּר",
+     10: "אוֹקְט֫וֹבֶּר",
+     11: "נוֹבֶ֫מְבֶּר",
+     12: "דֶּצֶ֫מְבֶּר",
+ }
+
+ # Mapping of day names in Hebrew with diacritics
+ DAYS = {
+     0: "יוֹם רִאשׁוֹן",
+     1: "יוֹם שֵׁנִי",
+     2: "יוֹם שְׁלִישִׁי",
+     3: "יוֹם רֵבִיעִי",
+     4: "יוֹם חֲמִישִׁי",
+     5: "יוֹם שִׁישִׁי",
+     6: "יוֹם שַׁבָּת",
+ }
+
+
+ def date_to_word(word: str, include_day_name=False) -> str:
+     """
+     Converts a date string (YYYY-MM-DD or DD-MM-YYYY, with "-", "." or "/" separators)
+     to a Hebrew date phrase with diacritics.
+     Returns the original word if it's not a valid date.
+     """
+     separators = ["-", ".", "/"]
+     orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
+     date_formats = [sep.join(order) for order in orders for sep in separators]
+
+     for date_format in date_formats:
+         try:
+             # Try parsing the word with each date format
+             date_obj = datetime.strptime(word, date_format)
+
+             # Get the Hebrew day name with diacritics
+             day_name = DAYS[date_obj.weekday()]
+
+             # Convert month to Hebrew name with diacritics
+             month_name = MONTHS[date_obj.month]
+             day = num_to_word(str(date_obj.day))
+             year = num_to_word(str(date_obj.year))
+
+             text = f"{day} בֵּ{month_name} {year}"
+             if include_day_name:
+                 text = f"{day_name}, {text}"
+             return text
+         except ValueError:
+             continue
+     return word
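A short usage sketch (illustrative; the sample dates are assumptions, outputs abbreviated):

    from phonikud.expander.dates import date_to_word

    date_to_word("2024-03-05")                         # day, Hebrew month name, year, with nikud
    date_to_word("05/03/2024", include_day_name=True)  # prefixed with the Hebrew weekday
    date_to_word("hello")                              # not a date: returned unchanged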
phonikud/expander/dictionary.py ADDED
@@ -0,0 +1,79 @@
+ """
+ Dictionaries are JSON files mapping source words to replacement words
+ """
+
+ from pathlib import Path
+ import json
+ import re
+ from phonikud.utils import remove_nikud
+ from phonikud.utils import normalize
+ from phonikud import lexicon
+ import unicodedata
+
+ files = Path(__file__).parent.joinpath("../data").glob("*.json")
+ # Sort so higher-quality dictionaries load last and override earlier entries
+ order = {"bronze": 1, "silver": 2, "gold": 3}
+ files = sorted(
+     files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
+ )
+
+
+ class Dictionary:
+     def __init__(self):
+         self.dict = {}
+         self.load_dictionaries()
+
+     def load_dictionaries(self):
+         for file in files:
+             with open(file, "r", encoding="utf-8") as f:
+                 dictionary: dict = json.load(f)
+                 normalized_dictionary = {}
+
+                 # normalize nikud keys
+                 for k, v in dictionary.items():
+                     k = normalize(k)
+                     # Ensure not empty
+                     if k and v:
+                         normalized_dictionary[k] = v
+                 self.dict.update(normalized_dictionary)
+
+     def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
+         source: str = match.group(0)
+         # decompose (NFD)
+         source = unicodedata.normalize("NFD", source)
+         raw_lookup = self.dict.get(source)
+
+         without_nikud_lookup = self.dict.get(remove_nikud(source))
+         with_nikud_lookup = self.dict.get(normalize(source))
+         # Compare without nikud ONLY if source has no nikud
+         if raw_lookup:
+             return raw_lookup
+         if without_nikud_lookup:
+             return without_nikud_lookup
+         elif with_nikud_lookup:
+             return with_nikud_lookup
+         return source
+
+     def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
+         raw_source: str = match.group(0)
+         if raw_source.isnumeric():
+             return raw_source
+
+         raw_lookup = self.dict.get(raw_source)
+
+         # Compare without nikud ONLY if source has no nikud
+         if raw_lookup:
+             return raw_lookup
+         # search by only ', space, regular nikud, alphabet
+         raw_source = re.sub(
+             lexicon.HE_PATTERN, self.replace_hebrew_only_callback, raw_source
+         )
+         return raw_source
+
+     def expand_text(self, text: str) -> str:
+         """
+         TODO: if key doesn't have diacritics expand even diacritized words
+         """
+         text = re.sub(r"\S+", self.replace_non_whitespace_callback, text)
+
+         return text
phonikud/expander/number_names.py ADDED
@@ -0,0 +1,193 @@
+ """
+ See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
+ """
+
+ # TODO: add nikud hints
+
+ ZERO = {"אפס": "אֶ֫פֶס"}
+
+
+ ONES = {
+     "אחת": "אַחַת",
+     "אחד": "אֶחָד",
+     "ראשונה": "רִאשׁוֹנָה",
+     "ראשון": "רִאשׁוֹן",
+     "ראשונות": "רִאשׁוֹנוֹת",
+     "ראשונים": "רִאשׁוֹנִים",
+     "שתיים": "שְׁתַּ֫יִם",
+     "שניים": "שְׁנַ֫יִם",
+     "שתי": "שְׁתֵּי",
+     "שני": "שְׁנֵי",
+     "שנייה": "שְׁנִיָּה",
+     "שניות": "שְׁנִיּוֹת",
+     "שלוש": "שָׁלוֹשׁ",
+     "שלושה": "שְׁלוֹשָׁה",
+     "שלושת": "שְׁל֫וֹשֶׁת",
+     "שלישית": "שְׁלִישִׁית",
+     "שלישי": "שְׁלִישִׁי",
+     "שלישיות": "שְׁלִישִׁיּוֹת",
+     "שלישיים": "שְׁלִישִׁיִּים",
+     "ארבע": "אַ֫רְבַּע",
+     "ארבעה": "אַרְבַּעָה",
+     "ארבעת": "אַרְבַּ֫עַת",
+     "רביעית": "רֵבִיעִית",
+     "רביעי": "רֵבִיעִי",
+     "רביעיות": "רֵבִיעִיוֹת",
+     "רביעיים": "רֵבִיעִיִּים",
+     "חמש": "חָמֵשׁ",
+     "חמישה": "חֲמִשָּׁה",
+     "חמשת": "חֲמֵ֫שֶׁת",
+     "חמישית": "חֲמִישִּׁית",
+     "חמישי": "חֲמִישִּׁי",
+     "חמישיות": "חֲמִישִּׁיוֹת",
+     "חמישיים": "חֲמִישִּׁיִּים",
+     "שש": "שֵׁשׁ",
+     "שישה": "שִׁשָּׁה",
+     "ששת": "שֵׁ֫שֶׁת",
+     "שישית": "שִׁשִּׁית",
+     "שישי": "שִׁשִּׁי",
+     "שישיות": "שִׁשִּׁיוֹת",
+     "שישיים": "שִׁשִּׁיִּים",
+     "שבע": "שֶׁ֫בַע",
+     "שבעה": "שִׁבְעָה",
+     "שבעת": "שִׁבְעַת",
+     "שביעית": "שְׁבִיעִית",
+     "שביעי": "שְׁבִיעִי",
+     "שביעיות": "שְׁבִיעִיוֹת",
+     "שביעיים": "שְׁבִיעִיִּים",
+     "שמונה": "שְׁמ֫וֹנֶה",
+     "שמונת": "שְׁמוֹנַת",
+     "שמינית": "שְׁמִינִית",
+     "שמיני": "שְׁמִינִי",
+     "שמיניות": "שְׁמִינִיוֹת",
+     "שמיניים": "שְׁמִינִיִּים",
+     "תשע": "תֵּשַׁע",
+     "תשעה": "תִּשְׁעָה",
+     "תשעת": "תִּשְׁעַת",
+     "תשיעית": "תְּשִׁיעִית",
+     "תשיעי": "תְּשִׁיעִי",
+     "תשיעיות": "תְּשִׁיעִיּוֹת",
+     "תשיעיים": "תְּשִׁיעִיִּים",
+ }
+
+
+ TENS = {
+     "עשר": "עֶ֫שֶׂר",
+     "עשרה": "עֶשְׂרֵה",
+     "עשרת": "עֲשֶׂ֫רֶת",
+     "עשירית": "עֲשִׂירִית",
+     "עשירי": "עֲשִׂירִי",
+     "עשיריות": "עֲשִׂירִיּוֹת",
+     "עשיריים": "עֲשִׂירִיִּים",
+     "שתים עשרה": "שְׁתֵּ֫ים עֶשְׂרֵה",
+     "שנים עשר": "שְׁנֵים עָשָׂר",
+ }
+
+
+ TWENTIES = {
+     "עשרים": "עֶשְׂרִ֫ים",
+     "שלושים": "שְׁלוֹשִׁים",
+     "ארבעים": "אַרְבָּעִים",
+     "חמישים": "חֲמִשִּׁים",
+     "שישים": "שִׁשִּׁים",
+     "שבעים": "שִׁבְעִים",
+     "שמונים": "שְׁמוֹנִים",
+     "תשעים": "תִּשְׁעִים",
+ }
+
+
+ HUNDREDS = {
+     "מאה": "מֵ֫אָה",
+     "מאת": "מֵאַת",
+     "מאתיים": "מָאתַ֫יִם",
+     "מאות": "מֵאוֹת",
+ }
+
+ THOUSANDS = {
+     "אלף": "אֶ֫לֶף",
+     "אלפיים": "אַלְפַּ֫יִם",
+     "אלפים": "אֲלָפִים",
+     "אלפי": "אַלְפִּי",
+ }
+
+
+ LARGE = {
+     "מיליון": "מִילְיוֹן",
+     "מיליוני": "מִילְיוֹנִי",
+     "מיליארד": "מִילְיַארְד",
+     "מיליארדי": "מִילְיַ֫ארְדִּי",
+     "טריליון": "טְרִילְיוֹן",
+     "טריליוני": "טְרִילְיוֹנִי",
+     "קוודריליון": "קוֹוַדְרִילְיוֹן",
+     "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
+     "קווינטיליון": "קוִוִּנְטִילְיוֹן",
+     "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
+     "סקסטיליון": "סְקֶסְטִילְיוֹן",
+     "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
+     "ספטיליון": "סְפֶּטִילְיוֹן",
+     "ספטיליוני": "סְפֶּטִילְיוֹנִי",
+     "אוקטיליון": "אוֹקְטִילְיוֹן",
+     "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
+     "נוניליון": "נוּנִילְיוֹן",
+     "נוניליוני": "נוּנִילְיוֹנִי",
+     "דסיליון": "דֶּסִילְיוֹן",
+     "דסיליוני": "דֶּסִילְיוֹנִי",
+     "אונדסיליון": "אוּנְדְסִילְיוֹן",
+     "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
+     "דואודסיליון": "דוּאודְסִילְיוֹן",
+     "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
+     "טרדסיליון": "טֶרְדְסִילְיוֹן",
+     "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
+     "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
+     "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
+     "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
+     "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
+     "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
+     "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
+     "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
+     "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
+     "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
+     "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
+     "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
+     "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
+     "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
+     "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
+ }
+
+
+ LETTERS = {
+     "ו": "וֵ",
+     "ה": "הַ",
+ }
+
+
+ CURRENCY = {
+     "שקל": "שֵׁ֫קֶל",
+     "שקלים": "שְׁקָלִים",
+     "אגורה": "אֲגוֹרָה",
+     "אגורות": "אֲגוֹרוֹת",
+     "אירו": "אֵ֫ירוֹ",
+     "סנט": "סֵנְט",
+     "סנטים": "סֵ֫נְטִים",
+     "דולר": "ד֫וֹלָר",
+     "דולרים": "דוֹלָ֫רִים",
+ }
+
+
+ POINTS = {
+     "מינוס": "מִ֫ינּוּס",
+     "נקודה": "נְֽקֻדָּה",
+ }
+
+ NUMBER_NAMES = {
+     **CURRENCY,
+     **HUNDREDS,
+     **LARGE,
+     **LETTERS,
+     **ONES,
+     **POINTS,
+     **TENS,
+     **THOUSANDS,
+     **TWENTIES,
+     **ZERO,
+ }
phonikud/expander/numbers.py ADDED
@@ -0,0 +1,39 @@
+ import num2words
+ from .number_names import NUMBER_NAMES
+ import re
+
+
+ def add_diacritics(words: str):
+     new_words = []
+     for word in words.split():
+         if NUMBER_NAMES.get(word):
+             new_words.append(NUMBER_NAMES[word])
+         elif NUMBER_NAMES.get(word[1:]):
+             # With Vav or Bet
+             new_words.append(NUMBER_NAMES[word[0]] + NUMBER_NAMES[word[1:]])
+         else:
+             new_words.append(word)
+     return " ".join(new_words)
+
+
+ def num_to_word(maybe_number: str) -> str:
+     def replace_number(match):
+         num: str = match.group()
+         suffix, prefix = "", ""
+         # prefix
+         if not num.startswith("-") and not num[0].isdigit():
+             prefix = num[0]
+             num = num[1:]
+         if not num[-1].isdigit():
+             suffix = num[-1]
+             num = num[:-1]
+         words = num2words.num2words(num, lang="he", ordinal=False)
+         words_with_diacritics = add_diacritics(words)
+         return (
+             f"{prefix.strip()} {words_with_diacritics.strip()} {suffix.strip()}".strip()
+         )
+
+     # Replace all whole numbers in the string
+     result = re.sub(r"[^\d\-]?-?\d+(?:[\.,]\d+)?[^\d]?", replace_number, maybe_number)
+
+     return result
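A usage sketch (the sample strings are assumptions): numbers anywhere in the text, optionally wrapped by a single prefix/suffix character, become diacritized Hebrew number words via num2words:

    from phonikud.expander.numbers import num_to_word

    num_to_word("יש 3 ספרים")  # "3" becomes the diacritized Hebrew word for three
    num_to_word("5%")           # the "%" suffix is kept; the dictionary step expands it later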
phonikud/expander/time_to_word.py ADDED
@@ -0,0 +1,104 @@
+ """
+ Convert time to words
+ TODO: fix zeros eg. 22:00
+ """
+
+ import re
+
+ PATTERNS = [
+     r"(\d{1,2})([apm]{2})",  # AM/PM format
+     r"(\d{1,2}):(\d{2})",  # HH:MM format
+ ]
+
+
+ def extract_time(match):
+     """
+     Extract hour and minute from a string in HH:MM or AM/PM format
+     and return as integers.
+     """
+     time_str = match.group(0).lower().strip()
+
+     # Check for HH:MM format (don't shadow the outer match)
+     m = re.match(r"(\d{1,2}):(\d{2})", time_str)
+     if m:
+         h = int(m.group(1))
+         minutes = int(m.group(2))
+         return f"{convert_to_word(h, minutes)}"
+
+     # Check for AM/PM format
+     m = re.match(r"(\d{1,2})([apm]{2})", time_str)
+     if m:
+         h = int(m.group(1))
+         period = m.group(2)
+
+         # Normalize to 24-hour format
+         if period == "am" and h == 12:
+             h = 0
+         elif period == "pm" and h != 12:
+             h += 12
+         return f"{convert_to_word(h, 0)}"  # Default to 0 minutes when only the hour is given
+
+     return time_str  # Return original text if the format is not recognized
+
+
+ def convert_to_word(h, m):
+     hours = [
+         "אֶפֶס",
+         "אַחַת",
+         "שְׁנַיִם",  # Will be replaced with "שֵׁנִי" when needed
+         "שָׁלוֹשׁ",
+         "אַ֫רְבַּע",
+         "חָמֵשׁ",
+         "שֵׁשׁ",
+         "שֶׁ֫בַע",
+         "שְׁמ֫וֹנֵה",
+         "תֵּ֫שַׁע",
+         "עֵ֫שֵׂר",
+         "אַחַת עֶשְׂרֵה",
+         "שְׁתֵּים עֶשְׂרֵה",
+     ]
+
+     tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]
+
+     ten_to_twenty = [
+         "עֵ֫שֵׂר",
+         "אַחַת עֶשְׂרֵה",
+         "שְׁתֵּים עֶשְׂרֵה",
+         "שְׁלוֹשׁ עֶשְׂרֵה",
+         "אַרְבַּע עֶשְׂרֵה",
+         "חֲמֵשׁ עֶשְׂרֵה",
+         "שֵׁשׁ עֶשְׂרֵה",
+         "שְׁבַע עֶשְׂרֵה",
+         "שְׁמוֹנֶה עֶשְׂרֵה",
+         "תְּשַׁע עֶשְׂרֵה",
+     ]
+
+     vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}
+
+     # Convert 0 hours to 12 (midnight)
+     if h == 0:
+         h = 12
+
+     elif h > 12:
+         h -= 12
+
+     if m == 0:
+         return f"{hours[h]}"
+
+     elif 1 <= m <= 9:
+         minute_word = (
+             vocab["shtey"] if m == 2 else hours[m]
+         )  # Replace "שניים" with "שני"
+         return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"
+
+     elif 10 <= m <= 19:
+         return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"
+
+     else:
+         tens_part = f"{vocab['and']}{tens[m // 10]}"
+         units_part = f"{vocab['and']}{hours[m % 10]}" if m % 10 != 0 else ""
+         return f"{hours[h]} {tens_part} {units_part} {vocab['minutes']}".strip()
+
+
+ def time_to_word(text: str):
+     return re.sub("|".join(PATTERNS), extract_time, text)
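A usage sketch (the sample strings are assumptions):

    from phonikud.expander.time_to_word import time_to_word

    time_to_word("18:30")  # HH:MM is spelled out as hour + minute words with nikud
    time_to_word("9am")    # AM/PM times are normalized to 12-hour Hebrew words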
phonikud/hebrew.py ADDED
@@ -0,0 +1,249 @@
+ """
+ Hebrew Phonemizer
+
+ Rules implemented:
+ 1. Consonant handling (including special cases)
+ 2. Nikud (vowel) processing
+ 3. Dagesh handling
+ 4. Geresh handling
+ 5. Shva Na prediction
+ 6. Special letter combinations
+
+ Reference:
+ - https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
+ - https://en.wikipedia.org/wiki/Help:IPA/Hebrew
+ - https://he.wikipedia.org/wiki/הברה
+ - https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
+ - https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט
+ - https://hebrew-academy.org.il/2022/03/03/מלעיל-ומלרע-על-ההטעמה-בעברית
+ """
+
+ from typing import Literal
+ from phonikud.variants import Letter
+ from phonikud import lexicon
+ import re
+ from phonikud.utils import sort_stress
+
+ SHVA = "\u05b0"
+ SIN = "\u05c2"
+ PATAH = "\u05b7"
+ KAMATZ = "\u05b8"
+ HATAF_KAMATZ = "\u05b3"
+ DAGESH = "\u05bc"
+ HOLAM = "\u05b9"
+ HIRIK = "\u05b4"
+ PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
+ KUBUTS = "\u05bb"
+ TSERE = "\u05b5"
+ HATAMA = "\u05ab"
+ VAV_HOLAM = "\u05ba"
+ SEGOL = "\u05b6"
+
+
+ def phonemize_hebrew(
+     letters: list[Letter],
+     stress_placement: Literal["syllable", "vowel"],
+ ) -> list[str]:
+     phonemes = []
+     i = 0
+
+     while i < len(letters):
+         cur = letters[i]
+         prev = letters[i - 1] if i > 0 else None
+         next = letters[i + 1] if i < len(letters) - 1 else None
+         next_phonemes, skip_offset = letter_to_phonemes(
+             cur, prev, next, stress_placement=stress_placement
+         )
+         # TODO: split into syllables
+         # next_letters = next_phonemes, letters[i:i+skip_offset+1]
+         phonemes.extend(next_phonemes)
+         i += skip_offset + 1
+
+     return phonemes
+
+
+ def letter_to_phonemes(
+     cur: Letter,
+     prev: Letter | None,
+     next: Letter | None,
+     stress_placement: Literal["syllable", "vowel"],
+ ) -> tuple[list[str], int]:
+     cur_phonemes = []
+     skip_diacritics = False
+     skip_consonants = False
+     skip_offset = 0
+
+     if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
+         skip_consonants = True
+         skip_diacritics = True
+
+     elif cur.char == "א" and not cur.diac and prev:
+         if next and next.char != "ו":
+             skip_consonants = True
+
+     elif (
+         cur.char == "י"
+         and next
+         # Yud without diacritics
+         and not cur.diac
+         # In middle
+         and prev
+         # Prev Hirik
+         and prev.char + prev.diac != "אֵ"
+         # Next Vav has meaning
+         and not (next.char == "ו" and next.diac and "\u05b0" not in next.diac)
+     ):
+         skip_consonants = True
+
+     elif cur.char == "ש" and SIN in cur.diac:
+         if (
+             next
+             and next.char == "ש"
+             and not next.diac
+             and re.search("[\u05b7\u05b8]", cur.diac)
+         ):
+             # ^ יששכר
+             cur_phonemes.append("sa")
+             skip_consonants = True
+             skip_diacritics = True
+             skip_offset += 1
+         else:
+             cur_phonemes.append("s")
+             skip_consonants = True
+
+     # shin without nikud after sin = sin
+     elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
+         cur_phonemes.append("s")
+         skip_consonants = True
+
+     elif not next and cur.char == "ח" and PATAH in cur.diac:
+         # Final Het gnuva
+         cur_phonemes.append("ax")
+         skip_diacritics = True
+         skip_consonants = True
+
+     elif not next and cur.char == "ה" and PATAH in cur.diac:
+         # Final He gnuva
+         cur_phonemes.append("ah")
+         skip_diacritics = True
+         skip_consonants = True
+
+     elif not next and cur.char == "ע" and PATAH in cur.diac:
+         # Final Ayin gnuva
+         cur_phonemes.append("a")
+         skip_diacritics = True
+         skip_consonants = True
+
+     if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
+         if cur.char == "ת":
+             cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
+             skip_diacritics = True
+             skip_consonants = True
+         else:
+             # Geresh
+             cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
+             skip_consonants = True
+
+     elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES:  # dagesh
+         cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
+         skip_consonants = True
+     elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
+         skip_consonants = True
+
+         if prev and "\u05b0" in prev.diac and re.findall("[\u05b9-\u05ba]", cur.diac):
+             # ^ לִגְוֹעַ
+             cur_phonemes.append("vo")
+             skip_diacritics = True
+             skip_consonants = True
+
+         elif next and next.char == "ו":
+             # One of them has holam
+
+             holams = re.findall("[\u05b9-\u05ba]", cur.diac + next.diac)
+             if len(holams) == 2:
+                 cur_phonemes.append("wo")
+                 skip_diacritics = True
+                 skip_offset += 1
+             if len(holams) == 1:
+                 cur_phonemes.append("vo")
+                 skip_diacritics = True
+                 skip_offset += 1
+             # patah and next.diac empty
+             elif cur.diac == next.diac:
+                 # double Vav
+                 cur_phonemes.append("vu")
+                 skip_diacritics = True
+                 skip_offset += 1
+             elif HIRIK in cur.diac:
+                 cur_phonemes.append("vi")
+                 skip_diacritics = True
+             elif SHVA in cur.diac and not next.diac:
+                 cur_phonemes.append("v")
+                 skip_diacritics = True
+             elif KAMATZ in cur.diac or PATAH in cur.diac:
+                 cur_phonemes.append("va")
+                 skip_diacritics = True
+             elif SEGOL in cur.diac:
+                 cur_phonemes.append("ve")
+                 skip_diacritics = True
+             else:
+                 # TODO ?
+                 # skip_consonants = False
+                 skip_diacritics = False
+         else:
+             # Single vav
+
+             # Vav with Patah
+             if re.search(PATAH_LIKE_PATTERN, cur.diac):
+                 cur_phonemes.append("va")
+
+             # Tsere
+             elif TSERE in cur.diac:
+                 cur_phonemes.append("ve")
+             elif SEGOL in cur.diac:
+                 cur_phonemes.append("ve")
+             # Holam haser
+             elif HOLAM in cur.diac:
+                 cur_phonemes.append("o")
+             # Shuruk / Kubutz
+             elif KUBUTS in cur.diac or DAGESH in cur.diac:
+                 cur_phonemes.append("u")
+             # Vav with Shva in start
+             elif SHVA in cur.diac and not prev:
+                 cur_phonemes.append("ve")
+             # Hirik
+             elif HIRIK in cur.diac:
+                 cur_phonemes.append("vi")
+             elif next and not cur.diac:
+                 # It is fine for now since we use Dicta
+                 skip_consonants = True
+                 skip_diacritics = True
+             else:
+                 cur_phonemes.append("v")
+
+             skip_diacritics = True
+
+     if not skip_consonants:
+         cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))
+
+     if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
+         cur_phonemes.append("o")
+         skip_diacritics = True
+
+     nikud_phonemes = []
+     if not skip_diacritics:
+         nikud_phonemes = [
+             lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
+         ]
+     elif skip_diacritics and lexicon.HATAMA_DIACRITIC in cur.all_diac:
+         nikud_phonemes = [lexicon.STRESS_PHONEME]
+     cur_phonemes.extend(nikud_phonemes)
+     # Ensure the stress is at the beginning of the syllable
+     cur_phonemes = sort_stress(cur_phonemes, stress_placement)
+     cur_phonemes = [
+         p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)
+     ]
+     # Remove empty phonemes
+     cur_phonemes = [p for p in cur_phonemes if p]
+     return cur_phonemes, skip_offset
phonikud/lexicon.py ADDED
@@ -0,0 +1,115 @@
+ """
+ ASCII IPA transcription of Hebrew consonants and vowels.
+ """
+
+ # https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
+
+ # Non standard diacritics
+ SHVA_NA_DIACRITIC = "\u05bd"  # Meteg
+ HATAMA_DIACRITIC = "\u05ab"  # Ole
+ PREFIX_DIACRITIC = "|"  # Vertical bar
+ NIKUD_HASER_DIACRITIC = "\u05af"  # Masora, not in use
+ EN_GERESH = "'"
+ NON_STANDARD_DIAC = "".join(
+     [
+         SHVA_NA_DIACRITIC,
+         HATAMA_DIACRITIC,
+         PREFIX_DIACRITIC,
+         NIKUD_HASER_DIACRITIC,
+         EN_GERESH,
+     ]
+ )
+
+ HE_PATTERN = rf'[\u05b0-\u05ea{NON_STANDARD_DIAC}"]+'
+ # ^ Standard nikud and letters, ole, meteg, masora, vertical bar, en geresh
+ HE_NIKUD_PATTERN = rf"[\u05b0-\u05c7{NON_STANDARD_DIAC}]"
+ # ^ Letters, diacritics, en geresh
+ PUNCTUATION = set(r".,!? ")
+
+ STRESS_PHONEME = "ˈ"  # \u02c8, visually looks like a single quote
+ SPECIAL_PHONEMES = ["w"]
+ MODERN_SCHEMA = {
+     "x": "χ",  # Het
+     "r": "ʁ",  # Resh
+     "g": "ɡ",  # Gimel
+ }
+
+ # Geresh
+ GERESH_PHONEMES = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}
+
+ # Consonants
+ LETTERS_PHONEMES = {
+     "א": "ʔ",  # Alef
+     "ב": "v",  # Bet
+     "ג": "g",  # Gimel
+     "ד": "d",  # Dalet
+     "ה": "h",  # He
+     "ו": "v",  # Vav
+     "ז": "z",  # Zayin
+     "ח": "x",  # Het
+     "ט": "t",  # Tet
+     "י": "j",  # Yod
+     "ך": "x",  # Haf sofit
+     "כ": "x",  # Haf
+     "ל": "l",  # Lamed
+     "ם": "m",  # Mem Sofit
+     "מ": "m",  # Mem
+     "ן": "n",  # Nun Sofit
+     "נ": "n",  # Nun
+     "ס": "s",  # Samekh
+     "ע": "ʔ",  # Ayin, only voweled
+     "פ": "f",  # Fey
+     "ף": "f",  # Fey Sofit
+     "ץ": "ts",  # Tsadik sofit
+     "צ": "ts",  # Tsadik
+     "ק": "k",  # Kuf
+     "ר": "r",  # Resh
+     "ש": "ʃ",  # Shin
+     "ת": "t",  # Taf
+     # Beged Kefet
+     "בּ": "b",
+     "כּ": "k",
+     "פּ": "p",
+     # Shin Sin
+     "שׁ": "ʃ",
+     "שׂ": "s",
+     "'": "",
+ }
+
+ NIKUD_PHONEMES = {
+     "\u05b4": "i",  # Hiriq
+     "\u05b1": "e",  # Hataf segol
+     "\u05b5": "e",  # Tsere
+     "\u05b6": "e",  # Segol
+     "\u05b2": "a",  # Hataf Patah
+     "\u05b7": "a",  # Patah
+     "\u05c7": "o",  # Kamatz katan
+     "\u05b9": "o",  # Holam
+     "\u05ba": "o",  # Holam haser for vav
+     "\u05bb": "u",  # Qubuts
+     "\u05b3": "o",  # Hataf qamats
+     "\u05b8": "a",  # Kamatz
+     HATAMA_DIACRITIC: STRESS_PHONEME,  # Stress (Hat'ama)
+     SHVA_NA_DIACRITIC: "e",  # Shva na
+ }
+
+ DEDUPLICATE = {
+     "\u05f3": "'",  # Hebrew geresh to regular geresh
+     "־": "-",  # Hebrew Makaf to hyphen
+ }
+
+ # Sets
+ SET_PHONETIC_DIACRITICS = set([HATAMA_DIACRITIC, PREFIX_DIACRITIC, SHVA_NA_DIACRITIC])
+
+ ADDITIONAL_PHONEMES = set()  # When using fallback
+ SET_PHONEMES = set(
+     sorted(
+         {
+             *NIKUD_PHONEMES.values(),
+             *LETTERS_PHONEMES.values(),
+             *GERESH_PHONEMES.values(),
+             *MODERN_SCHEMA.values(),
+             *SPECIAL_PHONEMES,
+         }
+     )
+ )
phonikud/log.py ADDED
@@ -0,0 +1,35 @@
+ import logging
+ import os
+ import colorlog
+
+
+ def _create_logger():
+     """
+     Create a logger with colorized output
+     Usage: LOG_LEVEL=DEBUG python <script.py>
+     """
+
+     handler = colorlog.StreamHandler()
+     fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
+     handler.setFormatter(
+         colorlog.ColoredFormatter(
+             fmt=fmt,
+             log_colors={
+                 "DEBUG": "blue",
+                 "INFO": "green",
+                 "WARNING": "yellow",
+                 "ERROR": "red",
+                 "CRITICAL": "red",
+             },
+         )
+     )
+     # Get log level from LOG_LEVEL environment variable
+     log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
+     logger = colorlog.getLogger(__package__)
+     logger.setLevel(level=getattr(logging, log_level, logging.WARNING))
+     # Setup logging to stdout
+     logger.addHandler(handler)
+     return logger
+
+
+ log = _create_logger()
phonikud/phonemize.py ADDED
@@ -0,0 +1,130 @@
+ from phonikud import lexicon
+ from phonikud.variants import Letter
+ from .expander import Expander
+ from phonikud.utils import (
+     get_letters,
+     normalize,
+     post_normalize,
+     post_clean,
+     add_milra_hatama,
+     mark_shva_na,
+     sort_hatama,
+ )
+ from typing import Callable, Literal
+ import regex as re
+ from phonikud.hebrew import phonemize_hebrew
+
+
+ class Phonemizer:
+     # TODO: is that enough? what if there's punctuation around? other chars?
+     fallback_pattern = r"[a-zA-Z]+"
+
+     def __init__(self):
+         self.expander = Expander()
+
+     def phonemize(
+         self,
+         text: str,
+         preserve_punctuation: bool,
+         preserve_stress: bool,
+         use_expander: bool,
+         use_post_normalize: bool,  # For TTS
+         predict_stress: bool,
+         predict_shva_nah: bool,
+         stress_placement: Literal["syllable", "vowel"],
+         schema: Literal["plain", "modern"],
+         fallback: Callable[[str], str] = None,
+     ) -> str | list[str]:
+         # normalize
+         text = normalize(text)
+
+         def fallback_replace_callback(match: re.Match):
+             word = match.group(0)
+
+             if self.expander.dictionary.dict.get(word):
+                 # skip
+                 # TODO: better API
+                 return word
+             phonemes = fallback(word).strip()
+             # TODO: check that it has only IPA?!
+             for c in phonemes:
+                 lexicon.ADDITIONAL_PHONEMES.add(c)
+             return phonemes
+
+         if fallback is not None:
+             text = re.sub(self.fallback_pattern, fallback_replace_callback, text)
+
+         if use_expander:
+             text = self.expander.expand_text(text)
+
+         def heb_replace_callback(match: re.Match, original_text: str):
+             word = match.group(0)
+             start_offset = match.start()
+             if start_offset > 0 and original_text[start_offset - 1] == "[":
+                 # Skip if it starts with [ as it's used for hyper phonemes
+                 return word
+
+             if predict_shva_nah:
+                 # mark_shva_na returns the marked word; keep it
+                 word = mark_shva_na(word)
+             if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
+                 word = add_milra_hatama(word)
+             letters: list[Letter] = get_letters(word)
+             letters = sort_hatama(letters)
+
+             phonemes: list[str] = phonemize_hebrew(
+                 letters,
+                 stress_placement=stress_placement,
+             )
+             phonemes = "".join(phonemes)
+             # syllables = get_syllables(phonemes)
+
+             # phonemes_text = "".join(phonemes)
+             # # if predict_stress and lexicon.STRESS not in phonemes_text and syllables:
+             # #     if len(syllables) == 1:
+             # #         syllables[-1] = lexicon.STRESS + syllables[-1]
+             # #         syllables[-1] = "".join(sort_stress(syllables[-1]))
+             # #     elif any(
+             # #         remove_nikud(word).endswith(i) for i in lexicon.MILHEL_PATTERNS
+             # #     ) or phonemes_text.endswith("ax"):
+             # #         # insert lexicon.STRESS in the first character of syllables[-2]
+             # #         syllables[-2] = lexicon.STRESS + syllables[-2]
+             # #         syllables[-2] = "".join(sort_stress(syllables[-2]))
+             # #     else:
+             # #         # insert in syllables[-1]
+             # #         syllables[-1] = lexicon.STRESS + syllables[-1]
+             # #         syllables[-1] = "".join(sort_stress(syllables[-1]))
+
+             # phonemes = "".join(syllables)
+             if use_post_normalize:
+                 phonemes = post_normalize(phonemes)
+
+             if schema == "modern":
+                 # We'll keep this feature simple for now
+                 for k, v in lexicon.MODERN_SCHEMA.items():
+                     phonemes = re.sub(k, v, phonemes)
+             return phonemes
+
+         text = re.sub(
+             lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
+         )
+
+         def hyper_phonemes_callback(match: re.Match):
+             """
+             Expand hyper phonemes into normal phonemes
+             eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
+             """
+             matched_phonemes = match.group(2)
+             for c in matched_phonemes:
+                 lexicon.ADDITIONAL_PHONEMES.add(c)
+             return matched_phonemes  # The phoneme is in the second group
+
+         text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)
+
+         if not preserve_punctuation:
+             text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
+         if not preserve_stress:
+             text = "".join(i for i in text if i not in [lexicon.STRESS_PHONEME])
+         if use_post_normalize:
+             # We don't keep hyphens in the output; replace them with a space
+             text = post_clean(text)
+         return text
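The hyper-phoneme syntax lets callers pin exact IPA for a word, as the callback's docstring shows (a sketch; the English word and IPA are the docstring's own example):

    from phonikud import phonemize

    phonemize("שָׁלוֹם [hello](/hɛˈloʊ/)")  # "hello" passes through verbatim as hɛˈloʊ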
phonikud/syllables.py ADDED
@@ -0,0 +1,103 @@
+ """
+ https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
+
+ TODO: add to phonikud?
+ """
+
+ import regex as re
+ import phonikud
+
+ VOWEL_DIACS = [chr(i) for i in range(0x05B1, 0x05BC)] + [chr(0x05C7)] + [chr(0x5BD)]
+
+ STRESS = "\u05ab"
+ SHVA = "\u05b0"
+ DAGESH = "\u05bc"
+
+
+ def sort_diacritics(word: str):
+     def sort_diacritics_callback(match):
+         letter = match.group(1)
+         diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
+         return letter + diacritics
+
+     return re.sub(r"(\p{L})(\p{M}+)", sort_diacritics_callback, word)
+
+
+ def has_vowel_diacs(s: str):
+     if s == "וּ":
+         return True
+     return any(i in s for i in VOWEL_DIACS)
+
+
+ def get_syllables(word: str) -> list[str]:
+     letters = phonikud.utils.get_letters(word)
+     syllables, cur = [], ""
+     vowel_state = False
+
+     i = 0
+     while i < len(letters):
+         letter = letters[i]
+         has_vowel = has_vowel_diacs(str(letter)) or (i == 0 and SHVA in letter.all_diac)
+         # Look ahead
+         vav1 = i + 2 < len(letters) and letters[i + 2].char == "ו"
+         vav2 = i + 3 < len(letters) and letters[i + 3].char == "ו"
+
+         if has_vowel:
+             if vowel_state:
+                 syllables.append(cur)
+                 cur = str(letter)
+             else:
+                 cur += str(letter)
+             vowel_state = True
+         else:
+             cur += str(letter)
+
+         i += 1
+
+         # If two וs are coming: force current syllable to end, and join both וs as next syllable
+         if vav1 and vav2:
+             if cur:
+                 # Finish current syllable
+                 syllables.append(cur + str(letters[i]))
+                 cur = ""
+             cur = str(letters[i + 1]) + str(letters[i + 2])
+             i += 3  # skip past the double-vav
+             vowel_state = True
+
+         # If one ו is coming, end the syllable now
+         elif vav1 and letters[i + 1].diac:
+             if cur:
+                 syllables.append(cur)
+                 cur = ""
+             vowel_state = False
+
+     if cur:
+         syllables.append(cur)
+     # print(syllables)
+     return syllables
+
+
+ def add_stress_to_syllable(s: str):
+     letters = phonikud.utils.get_letters(s)
+     letters[0].all_diac = STRESS + letters[0].all_diac
+     return "".join(letter.char + letter.all_diac for letter in letters)
+
+
+ def add_stress(word: str, syllable_position: int):
+     syllables: list[str] = get_syllables(word)
+
+     if not syllables:
+         return word  # no syllables, return original word
+
+     # Normalize negative indices
+     if syllable_position < 0:
+         syllable_position += len(syllables)
+
+     # Clamp to valid range
+     syllable_position = max(0, min(syllable_position, len(syllables) - 1))
+
+     stressed_syllable = syllables[syllable_position]
+     stressed_syllable = add_stress_to_syllable(stressed_syllable)
+     syllables[syllable_position] = stressed_syllable
+
+     return "".join(syllables)
phonikud/utils.py ADDED
@@ -0,0 +1,247 @@
+ from functools import lru_cache
+ from typing import Literal
+ from phonikud import lexicon
+ import unicodedata
+ import regex as re
+ import phonikud.syllables
+ from phonikud.variants import Letter
+ import phonikud
+
+
+ def sort_diacritics(match):
+     letter = match.group(1)
+     diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
+     return letter + diacritics
+
+
+ NORMALIZE_PATTERNS = {
+     # Sort diacritics
+     r"(\p{L})(\p{M}+)": sort_diacritics,
+     "״": '"',  # Hebrew gershayim to ASCII quote
+     "׳": "'",  # Hebrew geresh to ASCII apostrophe
+ }
+
+
+ def remove_nikud(text: str, to_keep=""):
+     pattern = lexicon.HE_NIKUD_PATTERN
+     pattern = "".join(i for i in pattern if i not in to_keep)
+     return re.sub(
+         pattern,
+         "",
+         text,
+     )
+
+
+ @lru_cache(maxsize=10000)
+ def normalize(text: str) -> str:
+     """
+     Normalize unicode (decompose)
+     Keep only Hebrew characters / punctuation / IPA
+     Sort diacritics
+     """
+
+     # Decompose text
+     text = unicodedata.normalize("NFD", text)
+     for k, v in NORMALIZE_PATTERNS.items():
+         text = re.sub(k, v, text)
+     for k, v in lexicon.DEDUPLICATE.items():
+         text = re.sub(k, v, text)
+     return text
+
+
+ def post_normalize(phonemes: str):
+     new_phonemes = []
+     for word in phonemes.split(" "):
+         # remove glottal stop from end
+         word = re.sub(r"ʔ$", "", word)
+         # remove h from end
+         word = re.sub(r"h$", "", word)
+         word = re.sub(r"ˈh$", "", word)
+         # collapse trailing ij into i
+         word = re.sub(r"ij$", "i", word)
+         new_phonemes.append(word)
+     phonemes = " ".join(new_phonemes)
+     return phonemes
+
+
+ def post_clean(phonemes: str):
+     clean = []
+     for i in phonemes:
+         if i == "-":
+             clean.append(" ")
+         elif (
+             i in lexicon.SET_PHONEMES
+             or i in lexicon.ADDITIONAL_PHONEMES
+             or i == " "
+             or i in lexicon.PUNCTUATION
+         ):
+             clean.append(i)
+     return "".join(clean)
+
+
+ letters_pattern = re.compile(r"(\p{L})([\p{M}'|]*)")
+
+
+ # @lru_cache(maxsize=10000) TODO?
+ def get_letters(word: str):
+     pairs: list[tuple[str, str]] = letters_pattern.findall(word)  # with en_geresh
+     letters: list[Letter] = [Letter(i[0], i[1]) for i in pairs]
+     return letters
+
+
+ def get_unicode_names(text: str):
+     return [unicodedata.name(c, "?") for c in text]
+
+
+ def has_vowel(s: str):
+     return any(i in s for i in "aeiou")
+
+
+ def has_constant(s: str):
+     # i.e. contains a consonant (any non-vowel character)
+     return any(i not in "aeiou" for i in s)
+
+
+ def get_phoneme_syllables(phonemes: list[str]) -> list[str]:
+     syllables = []
+     cur_syllable = ""
+
+     i = 0
+     while i < len(phonemes):
+         # Add current phoneme to the syllable
+         cur_syllable += phonemes[i]
+
+         # If we have a vowel in the current syllable
+         if has_vowel(cur_syllable):
+             # If there's a next phoneme that's a consonant followed by a vowel-containing phoneme
+             if (
+                 i + 2 < len(phonemes)
+                 and not has_vowel(phonemes[i + 1])
+                 and has_vowel(phonemes[i + 2])
+             ):
+                 # End the current syllable and start a new one
+                 syllables.append(cur_syllable)
+                 cur_syllable = ""
+             # If we're at the end or the next phoneme has a vowel
+             elif i + 1 >= len(phonemes) or has_vowel(phonemes[i + 1]):
+                 # End the current syllable
+                 syllables.append(cur_syllable)
+                 cur_syllable = ""
+
+         i += 1
+
+     # Add any remaining syllable
+     if cur_syllable:
+         syllables.append(cur_syllable)
+
+     # Move any syllable-final STRESS_PHONEME to the start of the next syllable
+     for i in range(len(syllables) - 1):  # Ensure we're not at the last syllable
+         if syllables[i].endswith(lexicon.STRESS_PHONEME):
+             syllables[i + 1] = (
+                 lexicon.STRESS_PHONEME + syllables[i + 1]
+             )  # Move stress to next syllable
+             syllables[i] = syllables[i][
+                 : -len(lexicon.STRESS_PHONEME)
+             ]  # Remove stress from current syllable
+
+     return syllables
+
+
+ def sort_stress(
+     phonemes: list[str], placement: Literal["syllable", "vowel"] = "vowel"
+ ) -> list[str]:
+     """
+     TTS systems expect the stress mark BEFORE the vowel;
+     linguists expect it at the START of the syllable.
+     Use placement="syllable" to place it at the beginning.
+     """
+     if "ˈ" not in "".join(phonemes):
+         # ^ Does not contain stress
+         return phonemes
+     if not any(i in "".join(phonemes) for i in "aeiou"):
+         # ^ Does not contain a vowel
+         return phonemes
+
+     # Remove stress marker
+     phonemes = [p for p in phonemes if p != "ˈ"]
+
+     if placement == "syllable":
+         return ["ˈ"] + phonemes
+
+     # Define vowels
+     vowels = "aeiou"
+
+     # Find the first phoneme that contains a vowel, and inject the stress before the vowel
+     for i, phoneme in enumerate(phonemes):
+         for j, char in enumerate(phoneme):
+             if char in vowels:
+                 # Insert stress before the vowel
+                 phonemes[i] = phoneme[:j] + "ˈ" + phoneme[j:]
+                 return phonemes
+
+     # If no vowels found, return unchanged
+     return phonemes
+
+
+ def mark_shva_na(word: str):
+     """
+     Shva Na is context-independent and can be predicted with just the word or a dictionary.
+     See https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
+     Note: we predict only if the Shva is on the first letter of the word
+     Note: we assume that the word comes with | to mark 'Txiliyot' (prefixes)
+     Note: Shva Na rules mid-word are unreliable, so we don't code them.
+
+     Meteg (\u05bd) will be added to the letter with Shva Na
+
+     What we don't predict:
+     (1) some shva at the beginning of future-tense forms (we don't know)
+     (2) shva in the middle of the word
+     """
+     letters = get_letters(word)
+     if not letters:
+         return word
+     if "\u05b0" not in letters[0].all_diac:
+         # Per the note above: only predict when the first letter carries a Shva
+         return word
+     if letters[0].char in "למנרי":
+         letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
+     elif len(letters) > 1 and letters[1].char in "אעה":
+         letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
+     elif letters[0].char in "וכלב" and lexicon.PREFIX_DIACRITIC in letters[0].all_diac:
+         # ^ The nakdan should add |
+         letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
+     # Ensure that the prefix character comes last
+     for letter in letters:
+         if "|" in letter.all_diac:
+             letter.all_diac = letter.all_diac.replace("|", "") + "|"
+     return "".join(str(i) for i in letters)
+
+
+ def sort_hatama(letters: list[Letter]) -> list[Letter]:
+     for i in range(len(letters) - 1):
+         diacs = list(letters[i].all_diac)
+         if lexicon.HATAMA_DIACRITIC in diacs and lexicon.NIKUD_HASER_DIACRITIC in diacs:
+             diacs.remove(lexicon.HATAMA_DIACRITIC)
+             letters[i].all_diac = "".join(diacs)  # Reassign the updated diacritics
+             letters[i + 1].all_diac += lexicon.HATAMA_DIACRITIC
+     return letters
+
+
+ def add_milra_hatama(word: str):
+     syllables = phonikud.syllables.get_syllables(word)
+     stress_index = -1
+
+     if not syllables:
+         return word
+
+     if len(syllables) == 1:
+         stress_index = 0
+
+     # Get the last syllable
+     milra = syllables[stress_index]
+     # Get letters
+     letters = get_letters(milra)
+     # Add Hatama
+     letters[0].all_diac += lexicon.HATAMA_DIACRITIC
+
+     # Replace the last syllable
+     syllables[stress_index] = "".join(str(i) for i in letters)
+     return "".join(syllables)
phonikud/variants.py ADDED
@@ -0,0 +1,20 @@
+ import phonikud
+ from phonikud import lexicon
+
+
+ class Letter:
+     def __init__(self, char: str, diac: str):
+         self.char = phonikud.normalize(char)
+         self.all_diac = phonikud.normalize(diac)
+         self.diac = "".join(
+             i for i in self.all_diac if i not in lexicon.SET_PHONETIC_DIACRITICS
+         )
+
+     def __repr__(self):
+         return f"[Letter] {self.char}{''.join(self.all_diac)}"
+
+     def __eq__(self, value: "Letter"):
+         return value.all_diac == self.all_diac and value.char == self.char
+
+     def __str__(self):
+         return self.char + self.all_diac
phonikud_onnx/__init__.py ADDED
@@ -0,0 +1,41 @@
+ from .model import OnnxModel
+ import re
+
+
+ class Phonikud:
+     def __init__(self, model_path: str):
+         self.model = OnnxModel(model_path)
+
+     def add_diacritics(
+         self, sentences: list | str, mark_matres_lectionis: str | None = None
+     ) -> str:
+         """
+         Adds nikud (Hebrew diacritics) to the given text.
+
+         Parameters:
+         - sentences (list | str): A string or a list of strings to be processed. Each string should not exceed 2048 characters.
+         - mark_matres_lectionis (str | None, optional): A string used to mark nikud male. For example, if set to '|',
+           "לִימּוּדָיו" will be returned as "לִי|מּוּדָיו". Default is None (no marking).
+
+         Returns:
+         - str: The text with added diacritics.
+         """
+
+         if isinstance(sentences, str):
+             sentences = [sentences]
+         result = self.model.predict(
+             sentences, mark_matres_lectionis=mark_matres_lectionis
+         )
+         return result[0]
+
+     def get_nikud_male(self, text: str, mark_matres_lectionis: str):
+         """
+         Remove the given mark character, keeping the matres-lectionis letters (nikud male)
+         """
+         return text.replace(mark_matres_lectionis, "")
+
+     def get_nikud_haser(self, text: str):
+         """
+         Remove each marked matres-lectionis letter along with its mark character (nikud haser)
+         """
+         return re.sub(r".\|", "", text)  # Remove {char}{matres_lectionis}
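A minimal inference sketch with the ONNX wrapper (assumes the model file shipped in this commit sits in the working directory):

    from phonikud_onnx import Phonikud

    model = Phonikud("./phonikud-1.0.int8.onnx")
    marked = model.add_diacritics("שלום עולם", mark_matres_lectionis="|")
    male = model.get_nikud_male(marked, "|")  # keep matres letters, drop the marks
    haser = model.get_nikud_haser(marked)     # drop marked matres letters entirely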
phonikud_onnx/model.py ADDED
@@ -0,0 +1,197 @@
+ import onnxruntime as ort
+ import numpy as np
+ from tokenizers import Tokenizer
+ import re
+
+ # Constants
+ NIKUD_CLASSES = [
+     "",
+     "<MAT_LECT>",
+     "\u05bc",
+     "\u05b0",
+     "\u05b1",
+     "\u05b2",
+     "\u05b3",
+     "\u05b4",
+     "\u05b5",
+     "\u05b6",
+     "\u05b7",
+     "\u05b8",
+     "\u05b9",
+     "\u05ba",
+     "\u05bb",
+     "\u05bc\u05b0",
+     "\u05bc\u05b1",
+     "\u05bc\u05b2",
+     "\u05bc\u05b3",
+     "\u05bc\u05b4",
+     "\u05bc\u05b5",
+     "\u05bc\u05b6",
+     "\u05bc\u05b7",
+     "\u05bc\u05b8",
+     "\u05bc\u05b9",
+     "\u05bc\u05ba",
+     "\u05bc\u05bb",
+     "\u05c7",
+     "\u05bc\u05c7",
+ ]
+ SHIN_CLASSES = ["\u05c1", "\u05c2"]  # shin, sin
+ MAT_LECT_TOKEN = "<MAT_LECT>"
+ MATRES_LETTERS = list("אוי")
+ ALEF_ORD = ord("א")
+ TAF_ORD = ord("ת")
+ STRESS_CHAR = "\u05ab"  # "ole" symbol marks stress
+ MOBILE_SHVA_CHAR = "\u05bd"  # "meteg" symbol marks shva na (mobile shva)
+ PREFIX_CHAR = "|"
+
+
+ def is_hebrew_letter(char):
+     return ALEF_ORD <= ord(char) <= TAF_ORD
+
+
+ def is_matres_letter(char):
+     return char in MATRES_LETTERS
+
+
+ nikud_pattern = re.compile(r"[\u05B0-\u05BD\u05C1\u05C2\u05C7]")
+
+
+ def remove_nikkud(text):
+     return nikud_pattern.sub("", text)
+
+
+ class OnnxModel:
+     def __init__(
+         self, model_path, tokenizer_name="dicta-il/dictabert-large-char-menaked"
+     ):
+         # Load the tokenizer
+         self.tokenizer = Tokenizer.from_pretrained(tokenizer_name)
+
+         # Create ONNX Runtime session
+         self.session = ort.InferenceSession(model_path)
+         self.input_names = [inp.name for inp in self.session.get_inputs()]
+         self.output_names = [out.name for out in self.session.get_outputs()]
+
+     def _create_inputs(self, sentences: list[str], padding: str):
+         # Tokenize inputs using the tokenizers library
+         encodings = []
+         for sentence in sentences:
+             encoding = self.tokenizer.encode(sentence)
+             encodings.append(encoding)
+
+         # Get the max length for padding
+         max_len = max(len(enc.ids) for enc in encodings) if padding == "longest" else 0
+
+         # Prepare batch inputs
+         input_ids = []
+         attention_mask = []
+         offset_mapping = []
+
+         for encoding in encodings:
+             ids = encoding.ids
+             masks = [1] * len(ids)
+             offsets = encoding.offsets
+
+             # Pad if needed
+             if padding == "longest" and len(ids) < max_len:
+                 padding_length = max_len - len(ids)
+                 ids = ids + [self.tokenizer.token_to_id("[PAD]")] * padding_length
+                 masks = masks + [0] * padding_length
+                 offsets = offsets + [(0, 0)] * padding_length
+
+             input_ids.append(ids)
+             attention_mask.append(masks)
+             offset_mapping.append(offsets)
+
+         # Convert to numpy arrays for ONNX Runtime
+         return {
+             "input_ids": np.array(input_ids, dtype=np.int64),
+             "attention_mask": np.array(attention_mask, dtype=np.int64),
+             # Token type IDs might be needed depending on your model
+             "token_type_ids": np.zeros_like(np.array(input_ids, dtype=np.int64)),
+         }, offset_mapping
+
+     def predict(self, sentences, mark_matres_lectionis=None, padding="longest"):
+         sentences = [remove_nikkud(sentence) for sentence in sentences]
+         inputs, offset_mapping = self._create_inputs(sentences, padding)
+
+         # Run inference
+         outputs = self.session.run(self.output_names, inputs)
+
+         # Process outputs based on output names
+         nikud_idx = self.output_names.index("nikud_logits")
+         shin_idx = self.output_names.index("shin_logits")
+         nikud_logits = outputs[nikud_idx]
+         shin_logits = outputs[shin_idx]
+
+         additional_idx = self.output_names.index("additional_logits")
+         additional_logits = outputs[additional_idx]
+
+         # Get predictions
+         nikud_predictions = np.argmax(nikud_logits, axis=-1)
+         shin_predictions = np.argmax(shin_logits, axis=-1)
+         stress_predictions = (additional_logits[..., 1] > 1).astype(np.int32)
+         mobile_shva_predictions = (additional_logits[..., 2] > 1).astype(np.int32)
+         prefix_predictions = (additional_logits[..., 3] > 1).astype(np.int32)
+
+         ret = []
+         for sent_idx, (sentence, sent_offsets) in enumerate(
+             zip(sentences, offset_mapping)
+         ):
+             # Assign the nikud to each letter
+             output = []
+             prev_index = 0
+             for idx, offsets in enumerate(sent_offsets):
+                 # Add anything we missed
+                 if offsets[0] > prev_index:
+                     output.append(sentence[prev_index : offsets[0]])
+                 if offsets[1] - offsets[0] != 1:
+                     continue
+
+                 # Get next char
+                 char = sentence[offsets[0] : offsets[1]]
+                 prev_index = offsets[1]
+                 if not is_hebrew_letter(char):
+                     output.append(char)
+                     continue
+
+                 nikud = NIKUD_CLASSES[nikud_predictions[sent_idx][idx]]
+                 shin = (
+                     "" if char != "ש" else SHIN_CLASSES[shin_predictions[sent_idx][idx]]
+                 )
+
+                 # Check for matres lectionis
+                 if nikud == MAT_LECT_TOKEN:
+                     if not is_matres_letter(char):
+                         nikud = ""  # Don't allow matres on irrelevant letters
+                     elif mark_matres_lectionis is not None:
+                         nikud = mark_matres_lectionis
+                     else:
+                         output.append(char)
+                         continue
+
+                 stress = (
+                     STRESS_CHAR
+                     if stress_predictions is not None
+                     and stress_predictions[sent_idx][idx] == 1
+                     else ""
+                 )
+                 mobile_shva = (
+                     MOBILE_SHVA_CHAR
+                     if mobile_shva_predictions is not None
+                     and mobile_shva_predictions[sent_idx][idx] == 1
+                     else ""
+                 )
+
+                 prefix = (
+                     PREFIX_CHAR
+                     if prefix_predictions is not None
+                     and prefix_predictions[sent_idx][idx] == 1
+                     else ""
+                 )
+
+                 output.append(char + shin + nikud + stress + mobile_shva + prefix)
+             output.append(sentence[prev_index:])
+             ret.append("".join(output))
+
+         return ret
phonikud_onnx/py.typed ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,43 @@
+ # This file was autogenerated by uv via the following command:
+ #     uv export --no-hashes --no-emit-project
+ colorama==0.4.6 ; sys_platform == 'win32'
+     # via
+     #   colorlog
+     #   pytest
+     #   tqdm
+ colorlog==6.9.0
+     # via phonikud
+ docopt==0.6.2
+     # via num2words
+ exceptiongroup==1.3.0 ; python_full_version < '3.11'
+     # via pytest
+ iniconfig==2.1.0
+     # via pytest
+ num2words==0.5.14
+     # via phonikud
+ numpy==2.2.6
+     # via pandas
+ packaging==25.0
+     # via pytest
+ pandas==2.2.3
+ pluggy==1.6.0
+     # via pytest
+ pytest==8.3.5
+ python-dateutil==2.9.0.post0
+     # via pandas
+ pytz==2025.2
+     # via pandas
+ regex==2024.11.6
+     # via phonikud
+ ruff==0.11.11
+ six==1.17.0
+     # via python-dateutil
+ tomli==2.2.1 ; python_full_version < '3.11'
+     # via pytest
+ tqdm==4.67.1
+ typing-extensions==4.13.2 ; python_full_version < '3.11'
+     # via exceptiongroup
+ tzdata==2025.2
+     # via pandas
+ gradio>=5.25.2
+ phonikud_onnx