thewh1teagle committed
Commit · 1866014
Parent(s): latest
Browse files
- .gitattributes +1 -0
- README.md +10 -0
- app.py +109 -0
- phonikud-1.0.int8.onnx +3 -0
- phonikud/__init__.py +39 -0
- phonikud/data/rashej_tevot.json +3 -0
- phonikud/data/special.json +9 -0
- phonikud/data/symbols.json +5 -0
- phonikud/expander/__init__.py +33 -0
- phonikud/expander/dates.py +60 -0
- phonikud/expander/dictionary.py +79 -0
- phonikud/expander/number_names.py +193 -0
- phonikud/expander/numbers.py +39 -0
- phonikud/expander/time_to_word.py +104 -0
- phonikud/hebrew.py +249 -0
- phonikud/lexicon.py +115 -0
- phonikud/log.py +35 -0
- phonikud/phonemize.py +130 -0
- phonikud/syllables.py +103 -0
- phonikud/utils.py +247 -0
- phonikud/variants.py +20 -0
- phonikud_onnx/__init__.py +41 -0
- phonikud_onnx/model.py +197 -0
- phonikud_onnx/py.typed +0 -0
- requirements.txt +43 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
*.onnx filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,10 @@
---
title: Phonemize in Hebrew
emoji: 🐢
colorFrom: red
colorTo: green
sdk: gradio
sdk_version: "4.44.0"
app_file: app.py
pinned: false
---
app.py
ADDED
@@ -0,0 +1,109 @@
"""
uv pip install gradio
uv run gradio examples/editor.py
"""

from phonikud import phonemize, lexicon
from phonikud.utils import remove_nikud
import gradio as gr
from phonikud_onnx import Phonikud
from pathlib import Path


default_text = """
הַדַּיָּיג נִצְמָד לְדֹופֶן הַסִּירָה בִּזְמַן הַסְּעָרָה.
הִסְבַּרְתִּי לָהּ אֶת הַכֹּל, וְאָמַרְתִּי בְּדִיּוּק מָה קָרָה.
הַיְּלָדִים אָהֲבוּ בִּמְיֻוחָד אֶת הַסִּיפּוּרִים הַלָּלוּ שֶׁהַמּוֹרָה הִקְרִיאָה.
""".strip()


def on_phonikud_toggle(use_phonikud):
    if not use_phonikud:
        return default_text
    return remove_nikud(default_text)


css = """
.input textarea {
    font-size: 22px;
    padding: 15px;
    height: 200px;
}

.phonemes {
    background: var(--input-background-fill);
}
.phonemes {
    padding: 5px;
    min-height: 50px;
}
"""

theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Noto Sans Hebrew")])

phonikud = None
model_path = Path("./phonikud-1.0.int8.onnx")
if model_path.exists():
    phonikud = Phonikud(str(model_path))


def on_submit(text: str, schema: str, use_phonikud: bool) -> str:
    diacritized = (
        phonikud.add_diacritics(
            text, mark_matres_lectionis=lexicon.NIKUD_HASER_DIACRITIC
        )
        if phonikud and use_phonikud
        else text
    )
    phonemes = phonemize(
        diacritized, predict_stress=True, schema=schema, predict_shva_nah=False
    )
    if use_phonikud:
        return f"<div dir='rtl' style='font-size: 22px;'>{diacritized.strip()}</div><br><div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"
    else:
        return f"<div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"


with gr.Blocks(theme=theme, css=css) as demo:
    text_input = gr.Textbox(
        value=remove_nikud(default_text),
        label="Text",
        rtl=True,
        elem_classes=["input"],
        lines=7,
    )

    with gr.Row():
        schema_dropdown = gr.Dropdown(
            choices=["modern", "plain"], value="plain", label="Phoneme Schema"
        )
        use_phonikud_checkbox = gr.Checkbox(
            value=True, label="Use Phonikud (add diacritics)"
        )

    submit_button = gr.Button("Create")
    output_box = gr.Markdown(label="Phonemes + Diacritics", elem_classes=["phonemes"])
    use_phonikud_checkbox.change(
        fn=lambda use_phonikud: (
            on_phonikud_toggle(use_phonikud),  # Update text_input
            on_submit(
                on_phonikud_toggle(use_phonikud), schema_dropdown.value, use_phonikud
            ),  # Update output_box
        ),
        inputs=use_phonikud_checkbox,
        outputs=[text_input, output_box],  # Update both text input and output box
    )

    submit_button.click(
        fn=on_submit,
        inputs=[text_input, schema_dropdown, use_phonikud_checkbox],
        outputs=output_box,
    )

    gr.Markdown("""
<p style='text-align: center;'><a href='https://github.com/thewh1teagle/phonikud' target='_blank'>Phonikud on Github</a></p>
""")

if __name__ == "__main__":
    demo.launch()
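For reference, the same pipeline can be exercised outside the Gradio UI. The sketch below is a minimal, hypothetical standalone check (not part of this commit), assuming the ONNX model file added above sits in the working directory; the input sentence is an arbitrary example.

# Hypothetical standalone use of the same flow as on_submit above.
from pathlib import Path

from phonikud import phonemize, lexicon
from phonikud_onnx import Phonikud

model = Phonikud(str(Path("./phonikud-1.0.int8.onnx")))
text = "שלום עולם"  # plain, undiacritized input
diacritized = model.add_diacritics(
    text, mark_matres_lectionis=lexicon.NIKUD_HASER_DIACRITIC
)
print(phonemize(diacritized, predict_stress=True, schema="plain", predict_shva_nah=False))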
phonikud-1.0.int8.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c4e7b0dbb263315ca124865da1ef3da3e91f64fb8acec6c437312a6bc0a8d51
size 307683244
phonikud/__init__.py
ADDED
@@ -0,0 +1,39 @@
"""
High level phonemize functions
"""

from .phonemize import Phonemizer
from .utils import normalize  # noqa: F401
from typing import Callable, Literal

phonemizer = Phonemizer()


def phonemize(
    text: str,
    preserve_punctuation=True,
    preserve_stress=True,
    use_expander=True,
    use_post_normalize=True,  # For TTS
    predict_stress=True,
    predict_shva_nah=True,
    stress_placement: Literal["syllable", "vowel"] = "vowel",
    schema: Literal["plain", "modern"] = "modern",
    fallback: Callable[[str], str] = None,
) -> str:
    """
    Set stress_placement="syllable" to place the stress at the syllable start.
    """
    phonemes = phonemizer.phonemize(
        text,
        preserve_punctuation=preserve_punctuation,
        preserve_stress=preserve_stress,
        fallback=fallback,
        use_expander=use_expander,
        use_post_normalize=use_post_normalize,
        predict_stress=predict_stress,
        schema=schema,
        predict_shva_nah=predict_shva_nah,
        stress_placement=stress_placement,
    )
    return phonemes
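A small usage sketch of this entry point (the Hebrew input is an arbitrary diacritized example, not taken from the repository):

from phonikud import phonemize

# Defaults: modern schema, stress predicted, output normalized for TTS
print(phonemize("שָׁלוֹם"))
# Plain schema, stress mark placed at the start of the syllable instead of before the vowel
print(phonemize("שָׁלוֹם", schema="plain", stress_placement="syllable"))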
phonikud/data/rashej_tevot.json
ADDED
@@ -0,0 +1,3 @@
{
  "צה״ל": "tsˈahal"
}
phonikud/data/special.json
ADDED
@@ -0,0 +1,9 @@
{
  "וַאלְלָה": "wˈala",
  "וַסַבִּי": "wasˈabi",
  "פינגוין": "pinguwˈin",
  "וואצאפ": "wˈatsʔap",
  "וואטסאפ": "wˈatsʔap",
  "יאללה": "jˈala",
  "וולטר": "wˈolter"
}
phonikud/data/symbols.json
ADDED
@@ -0,0 +1,5 @@
{
  "₪": "ʃˈekel",
  "$": "dˈolar",
  "%": "axˈuz"
}
phonikud/expander/__init__.py
ADDED
@@ -0,0 +1,33 @@
"""
Expand dates and numbers into words with nikud
This happens before phonemization
"""

from .numbers import num_to_word
from .dates import date_to_word
from .time_to_word import time_to_word
from .dictionary import Dictionary
from phonikud.log import log


class Expander:
    def __init__(self):
        self.dictionary = Dictionary()

    def expand_text(self, text: str):
        words = []
        for source_word in text.split():
            try:
                word = date_to_word(source_word)
                if word == source_word:
                    word = time_to_word(word)
                if word == source_word:
                    word = num_to_word(word)
                words.append(word)
            except Exception as e:
                log.error(f"Failed to expand {source_word} with error: {e}")
                words.append(source_word)
        text = " ".join(words)
        text = self.dictionary.expand_text(text)

        return text
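A quick sketch of what the expander does to raw input before phonemization (the sentence is an illustrative example):

from phonikud.expander import Expander

expander = Expander()
# Dates, clock times and digits are rewritten as diacritized Hebrew words,
# then the bundled JSON dictionaries are applied to the whole text
print(expander.expand_text("הפגישה ב 2024/05/01 בשעה 14:30 עם 3 אנשים"))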
phonikud/expander/dates.py
ADDED
@@ -0,0 +1,60 @@
from datetime import datetime
from .numbers import num_to_word

# Mapping of month names in Hebrew with diacritics (Gregorian months)
MONTHS = {
    1: "יָ֫נוּאָר",
    2: "פֶ֫בְרוּאָר",
    3: "מֵ֫רְץ",
    4: "אֵפְרִיל",
    5: "מַאי",
    6: "י֫וּנִי",
    7: "י֫וּלִי",
    8: "א֫וֹגֻסְט",
    9: "סֶפְּטֶ֫מְבֶּר",
    10: "אוֹקְט֫וֹבֶּר",
    11: "נוֹבֶ֫מְבֶּר",
    12: "דֶּצֶ֫מְבֶּר",
}

# Mapping of day names in Hebrew with diacritics
DAYS = {
    0: "יוֹם רִאשׁוֹן",
    1: "יוֹם שֵׁנִי",
    2: "יוֹם שְׁלִישִׁי",
    3: "יוֹם רֵבִיעִי",
    4: "יוֹם חֲמִישִׁי",
    5: "יוֹם שִׁישִׁי",
    6: "יוֹם שַׁבָּת",
}


def date_to_word(word: str, include_day_name=False) -> str:
    """
    Converts a date string (YYYY-MM-DD or DD-MM-YYYY, with "-", "." or "/" as separator)
    to a Hebrew date phrase with diacritics.
    Returns the original word if it's not a valid date.
    """
    separators = ["-", ".", "/"]
    orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
    date_formats = [sep.join(order) for order in orders for sep in separators]

    for date_format in date_formats:
        try:
            # Try parsing the word with each date format
            date_obj = datetime.strptime(word, date_format)

            # Get the Hebrew day name with diacritics
            day_name = DAYS[date_obj.weekday()]

            # Convert month to Hebrew name with diacritics
            month_name = MONTHS[date_obj.month]
            day = num_to_word(str(date_obj.day))
            year = num_to_word(str(date_obj.year))

            text = f"{day} בֵּ{month_name} {year}"
            if include_day_name:
                text = f"{day_name}, {text}"
            return text
        except ValueError:
            continue
    return word
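Usage sketch for date_to_word, matching the formats listed in its docstring (the dates are arbitrary examples):

from phonikud.expander.dates import date_to_word

print(date_to_word("2025-03-18"))                         # day, month and year as diacritized words
print(date_to_word("18/03/2025", include_day_name=True))  # prefixed with the Hebrew weekday name
print(date_to_word("not-a-date"))                         # non-dates are returned unchanged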
phonikud/expander/dictionary.py
ADDED
@@ -0,0 +1,79 @@
"""
Dictionaries are JSON files mapping source words to their replacements
"""

from pathlib import Path
import json
import re
from phonikud.utils import remove_nikud
from phonikud.utils import normalize
from phonikud import lexicon
import unicodedata

files = Path(__file__).parent.joinpath("../data").glob("*.json")
# Sort in reverse order to prioritize the most recent and best
order = {"bronze": 1, "silver": 2, "gold": 3}
files = sorted(
    files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
)


class Dictionary:
    def __init__(self):
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        for file in files:
            with open(file, "r", encoding="utf-8") as f:
                dictionary: dict = json.load(f)
                normalized_dictionary = {}

                # normalize nikud keys
                for k, v in dictionary.items():
                    k = normalize(k)
                    # Ensure not empty
                    if k and v:
                        normalized_dictionary[k] = v
                self.dict.update(normalized_dictionary)

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        source: str = match.group(0)
        # decompose
        source = unicodedata.normalize("NFD", source)
        raw_lookup = self.dict.get(source)

        without_nikud_lookup = self.dict.get(remove_nikud(source))
        with_nikud_lookup = self.dict.get(normalize(source))
        # Compare without nikud ONLY if source has no nikud
        if raw_lookup:
            return raw_lookup
        if without_nikud_lookup:
            return without_nikud_lookup
        elif with_nikud_lookup:
            return with_nikud_lookup
        return source

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        raw_source: str = match.group(0)
        if raw_source.isnumeric():
            return raw_source

        raw_lookup = self.dict.get(raw_source)

        # Compare without nikud ONLY if source has no nikud
        if raw_lookup:
            return raw_lookup
        # search by only ', space, regular nikud, alphabet
        raw_source = re.sub(
            lexicon.HE_PATTERN, self.replace_hebrew_only_callback, raw_source
        )
        return raw_source

    def expand_text(self, text: str) -> str:
        """
        TODO: if key doesn't have diacritics expand even diacritized words
        """
        text = re.sub(r"\S+", self.replace_non_whitespace_callback, text)

        return text
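A minimal sketch of the lookup behavior, assuming the data files from this commit are installed alongside the package (the input phrase is illustrative):

from phonikud.expander.dictionary import Dictionary

d = Dictionary()
# Words present in the bundled JSON dictionaries (e.g. special.json) are replaced
# by their stored transcription; everything else passes through untouched
print(d.expand_text("יאללה נלך"))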
phonikud/expander/number_names.py
ADDED
@@ -0,0 +1,193 @@
"""
See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
"""

# TODO: add nikud hints

ZERO = {"אפס": "אֶ֫פֶס"}


ONES = {
    "אחת": "אַחַת",
    "אחד": "אֶחָד",
    "ראשונה": "רִאשׁוֹנָה",
    "ראשון": "רִאשׁוֹן",
    "ראשונות": "רִאשׁוֹנוֹת",
    "ראשונים": "רִאשׁוֹנִים",
    "שתיים": "שְׁתַּ֫יִם",
    "שניים": "שְׁנַ֫יִם",
    "שתי": "שְׁתֵּי",
    "שני": "שְׁנֵי",
    "שנייה": "שְׁנִיָּה",
    "שניות": "שְׁנִיּוֹת",
    "שלוש": "שָׁלוֹשׁ",
    "שלושה": "שְׁלוֹשָׁה",
    "שלושת": "שְׁל֫וֹשֶׁת",
    "שלישית": "שְׁלִישִׁית",
    "שלישי": "שְׁלִישִׁי",
    "שלישיות": "שְׁלִישִׁיּוֹת",
    "שלישיים": "שְׁלִישִׁיִּים",
    "ארבע": "אַ֫רְבַּע",
    "ארבעה": "אַרְבַּעָה",
    "ארבעת": "אַרְבַּ֫עַת",
    "רביעית": "רֵבִיעִית",
    "רביעי": "רֵבִיעִי",
    "רביעיות": "רֵבִיעִיוֹת",
    "רביעיים": "רֵבִיעִיִּים",
    "חמש": "חָמֵשׁ",
    "חמישה": "חֲמִשָּׁה",
    "חמשת": "חֲמֵ֫שֶׁת",
    "חמישית": "חֲמִישִּׁית",
    "חמישי": "חֲמִישִּׁי",
    "חמישיות": "חֲמִישִּׁיוֹת",
    "חמישיים": "חֲמִישִּׁיִּים",
    "שש": "שֵׁשׁ",
    "שישה": "שִׁשָּׁה",
    "ששת": "שֵׁ֫שֶׁת",
    "שישית": "שִׁשִּׁית",
    "שישי": "שִׁשִּׁי",
    "שישיות": "שִׁשִּׁיוֹת",
    "שישיים": "שִׁשִּׁיִּים",
    "שבע": "שֶׁ֫בַע",
    "שבעה": "שִׁבְעָה",
    "שבעת": "שִׁבְעַת",
    "שביעית": "שְׁבִיעִית",
    "שביעי": "שְׁבִיעִי",
    "שביעיות": "שְׁבִיעִיוֹת",
    "שביעיים": "שְׁבִיעִיִּים",
    "שמונה": "שְׁמ֫וֹנֶה",
    "שמונת": "שְׁמוֹנַת",
    "שמינית": "שְׁמִינִית",
    "שמיני": "שְׁמִינִי",
    "שמיניות": "שְׁמִינִיוֹת",
    "שמיניים": "שְׁמִינִיִּים",
    "תשע": "תֵּשַׁע",
    "תשעה": "תִּשְׁעָה",
    "תשעת": "תִּשְׁעַת",
    "תשיעית": "תְּשִׁיעִית",
    "תשיעי": "תְּשִׁיעִי",
    "תשיעיות": "תְּשִׁיעִיּוֹת",
    "תשיעיים": "תְּשִׁיעִיִּים",
}


TENS = {
    "עשר": "עֶ֫שֶׂר",
    "עשרה": "עֶשְׂרֵה",
    "עשרת": "עֲשֶׂ֫רֶת",
    "עשירית": "עֲשִׂירִית",
    "עשירי": "עֲשִׂירִי",
    "עשיריות": "עֲשִׂירִיּוֹת",
    "עשיריים": "עֲשִׂירִיִּים",
    "שתים עשרה": "שְׁתֵּ֫ים עֶשְׂרֵה",
    "שנים עשר": "שְׁנֵים עָשָׂר",
}


TWENTIES = {
    "עשרים": "עֶשְׂרִ֫ים",
    "שלושים": "שְׁלוֹשִׁים",
    "ארבעים": "אַרְבָּעִים",
    "חמישים": "חֲמִשִּׁים",
    "שישים": "שִׁשִּׁים",
    "שבעים": "שִׁבְעִים",
    "שמונים": "שְׁמוֹנִים",
    "תשעים": "תִּשְׁעִים",
}


HUNDREDS = {
    "מאה": "מֵ֫אָה",
    "מאת": "מֵאַת",
    "מאתיים": "מָאתַ֫יִם",
    "מאות": "מֵאוֹת",
}

THOUSANDS = {
    "אלף": "אֶ֫לֶף",
    "אלפיים": "אַלְפַּ֫יִם",
    "אלפים": "אֲלָפִים",
    "אלפי": "אַלְפִּי",
}


LARGE = {
    "מיליון": "מִילְיוֹן",
    "מיליוני": "מִילְיוֹנִי",
    "מיליארד": "מִילְיַארְד",
    "מיליארדי": "מִילְיַ֫ארְדִּי",
    "טריליון": "טְרִילְיוֹן",
    "טריליוני": "טְרִילְיוֹנִי",
    "קוודריליון": "קוֹוַדְרִילְיוֹן",
    "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
    "קווינטיליון": "קוִוִּנְטִילְיוֹן",
    "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
    "סקסטיליון": "סְקֶסְטִילְיוֹן",
    "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
    "ספטיליון": "סְפֶּטִילְיוֹן",
    "ספטיליוני": "סְפֶּטִילְיוֹנִי",
    "אוקטיליון": "אוֹקְטִילְיוֹן",
    "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
    "נוניליון": "נוּנִילְיוֹן",
    "נוניליוני": "נוּנִילְיוֹנִי",
    "דסיליון": "דֶּסִילְיוֹן",
    "דסיליוני": "דֶּסִילְיוֹנִי",
    "אונדסיליון": "אוּנְדְסִילְיוֹן",
    "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
    "דואודסיליון": "דוּאודְסִילְיוֹן",
    "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
    "טרדסיליון": "טֶרְדְסִילְיוֹן",
    "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
    "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
    "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
    "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
    "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
    "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
    "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
    "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
    "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
    "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
    "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
    "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
    "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
    "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
    "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
}


LETTERS = {
    "ו": "וֵ",
    "ה": "הַ",
}


CURRENCY = {
    "שקל": "שֵׁ֫קֶל",
    "שקלים": "שְׁקָלִים",
    "אגורה": "אֲגוֹרָה",
    "אגורות": "אֲגוֹרוֹת",
    "אירו": "אֵ֫ירוֹ",
    "סנט": "סֵנְט",
    "סנטים": "סֵ֫נְטִים",
    "דולר": "ד֫וֹלָר",
    "דולרים": "דוֹלָ֫רִים",
}


POINTS = {
    "מינוס": "מִ֫ינּוּס",
    "נקודה": "נְֽקֻדָּה",
}

NUMBER_NAMES = {
    **CURRENCY,
    **HUNDREDS,
    **LARGE,
    **LETTERS,
    **ONES,
    **POINTS,
    **TENS,
    **THOUSANDS,
    **TWENTIES,
    **ZERO,
}
phonikud/expander/numbers.py
ADDED
@@ -0,0 +1,39 @@
import num2words
from .number_names import NUMBER_NAMES
import re


def add_diacritics(words: str):
    new_words = []
    for word in words.split():
        if NUMBER_NAMES.get(word):
            new_words.append(NUMBER_NAMES[word])
        elif NUMBER_NAMES.get(word[1:]):
            # With Vav or Bet
            new_words.append(NUMBER_NAMES[word[0]] + NUMBER_NAMES[word[1:]])
        else:
            new_words.append(word)
    return " ".join(new_words)


def num_to_word(maybe_number: str) -> str:
    def replace_number(match):
        num: str = match.group()
        suffix, prefix = "", ""
        # prefix
        if not num.startswith("-") and not num[0].isdigit():
            prefix = num[0]
            num = num[1:]
        if not num[-1].isdigit():
            suffix = num[-1]
            num = num[:-1]
        words = num2words.num2words(num, lang="he", ordinal=False)
        words_with_diacritics = add_diacritics(words)
        return (
            f"{prefix.strip()} {words_with_diacritics.strip()} {suffix.strip()}".strip()
        )

    # Replace all whole numbers in the string
    result = re.sub(r"[^\d\-]?-?\d+(?:[\.,]\d+)?[^\d]?", replace_number, maybe_number)

    return result
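Usage sketch for num_to_word; in the pipeline it is called one whitespace-separated token at a time, so the examples below use single tokens:

from phonikud.expander.numbers import num_to_word

print(num_to_word("3"))   # a bare digit becomes a diacritized Hebrew number word
print(num_to_word("5%"))  # a non-digit suffix is kept after the expanded number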
phonikud/expander/time_to_word.py
ADDED
@@ -0,0 +1,104 @@
"""
Convert time to words
TODO: fix zeros eg. 22:00
"""

import re

PATTERNS = [
    r"(\d{1,2})([apm]{2})",  # AM/PM format
    r"(\d{1,2}):(\d{2})",  # HH:MM format
]


def extract_time(match):
    """
    Extract hour and minute from a string in HH:MM or AM/PM format
    and return the spoken Hebrew form.
    """
    time_str = match.group(0).lower().strip()

    # Check for HH:MM format
    match = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if match:
        h = int(match.group(1))
        m = int(match.group(2))
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format
    match = re.match(r"(\d{1,2})([apm]{2})", time_str)
    if match:
        h = int(match.group(1))
        period = match.group(2)

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        return f"{convert_to_word(h, 0)}"  # Defaulting to 0 minutes when only hour is provided

    return match.group(0)  # Return original text if the format is not recognized


def convert_to_word(h, m):
    hours = [
        "אֶפֶס",
        "אַחַת",
        "שְׁנַיִם",  # Will be replaced with "שֵׁנִי" when needed
        "שָׁלוֹשׁ",
        "אַ֫רְבַּע",
        "חָמֵשׁ",
        "שֵׁשׁ",
        "שֶׁ֫בַע",
        "שְׁמ֫וֹנֵה",
        "תֵּ֫שַׁע",
        "עֵ֫שֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
    ]

    tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]

    ten_to_twenty = [
        "עֵ֫שֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
        "שְׁלוֹשׁ עֶשְׂרֵה",
        "אַרְבַּע עֶשְׂרֵה",
        "חֲמֵשׁ עֶשְׂרֵה",
        "שֵׁשׁ עֶשְׂרֵה",
        "שְׁבַע עֶשְׂרֵה",
        "שְׁמוֹנֶה עֶשְׂרֵה",
        "תְּשַׁע עֶשְׂרֵה",
    ]

    vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}

    # Convert 0 hours to 12 (midnight)
    if h == 0:
        h = 12

    elif h > 12:
        h -= 12

    if m == 0:
        return f"{hours[h]}"

    elif 1 <= m <= 9:
        minute_word = (
            vocab["shtey"] if m == 2 else hours[m]
        )  # Replace "שניים" with "שני"
        return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"

    elif 10 <= m <= 19:
        return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"

    else:
        tens_part = f"{vocab['and']}{tens[m // 10]}"
        units_part = f"{vocab['and']}{hours[m % 10]}" if m % 10 != 0 else ""
        return f"{hours[h]} {tens_part} {units_part} {vocab['minutes']}".strip()


def time_to_word(text: str):
    return re.sub("|".join(PATTERNS), extract_time, text)
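Usage sketch for time_to_word with the two supported formats (example sentences are illustrative):

from phonikud.expander.time_to_word import time_to_word

print(time_to_word("ניפגש ב 14:30"))  # HH:MM is replaced with the spoken form
print(time_to_word("9am פגישה"))      # AM/PM times are folded into 12-hour Hebrew words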
phonikud/hebrew.py
ADDED
@@ -0,0 +1,249 @@
"""
Hebrew Phonemizer

Rules implemented:
1. Consonant handling (including special cases)
2. Nikud (vowel) processing
3. Dagesh handling
4. Geresh handling
5. Shva Na prediction
6. Special letter combinations

Reference:
- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
- https://he.wikipedia.org/wiki/הברה
- https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
- https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט
- https://hebrew-academy.org.il/2022/03/03/מלעיל-ומלרע-על-ההטעמה-בעברית
"""

from typing import Literal
from phonikud.variants import Letter
from phonikud import lexicon
import re
from phonikud.utils import sort_stress

SHVA = "\u05b0"
SIN = "\u05c2"
PATAH = "\u05b7"
KAMATZ = "\u05b8"
HATAF_KAMATZ = "\u05b3"
DAGESH = "\u05bc"
HOLAM = "\u05b9"
HIRIK = "\u05b4"
PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
KUBUTS = "\u05bb"
TSERE = "\u05b5"
HATAMA = "\u05ab"
VAV_HOLAM = "\u05ba"
SEGOL = "\u05b6"


def phonemize_hebrew(
    letters: list[Letter],
    stress_placement: Literal["syllable", "vowel"],
) -> list[str]:
    phonemes = []
    i = 0

    while i < len(letters):
        cur = letters[i]
        prev = letters[i - 1] if i > 0 else None
        next = letters[i + 1] if i < len(letters) - 1 else None
        next_phonemes, skip_offset = letter_to_phonemes(
            cur, prev, next, stress_placement=stress_placement
        )
        # TODO: split into syllables
        # next_letters = next_phonemes, letters[i:i+skip_offset+1]
        phonemes.extend(next_phonemes)
        i += skip_offset + 1

    return phonemes


def letter_to_phonemes(
    cur: Letter,
    prev: Letter | None,
    next: Letter | None,
    stress_placement: Literal["syllable", "vowel"],
) -> tuple[str, int]:
    cur_phonemes = []
    skip_diacritics = False
    skip_consonants = False
    skip_offset = 0

    if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
        skip_consonants = True
        skip_diacritics = True

    elif cur.char == "א" and not cur.diac and prev:
        if next and next.char != "ו":
            skip_consonants = True

    elif (
        cur.char == "י"
        and next
        # Yud without diacritics
        and not cur.diac
        # In middle
        and prev
        # Prev Hirik
        and prev.char + prev.diac != "אֵ"
        # Next Vav has meaning
        and not (next.char == "ו" and next.diac and "\u05b0" not in next.diac)
    ):
        skip_consonants = True

    elif cur.char == "ש" and SIN in cur.diac:
        if (
            next
            and next.char == "ש"
            and not next.diac
            and re.search("[\u05b7\u05b8]", cur.diac)
        ):
            # ^ יששכר
            cur_phonemes.append("sa")
            skip_consonants = True
            skip_diacritics = True
            skip_offset += 1
        else:
            cur_phonemes.append("s")
            skip_consonants = True

    # shin without nikud after sin = sin
    elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
        cur_phonemes.append("s")
        skip_consonants = True

    elif not next and cur.char == "ח" and PATAH in cur.diac:
        # Final Het gnuva
        cur_phonemes.append("ax")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ה" and PATAH in cur.diac:
        # Final He gnuva
        cur_phonemes.append("ah")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ע" and PATAH in cur.diac:
        # Final Ayin gnuva
        cur_phonemes.append("a")
        skip_diacritics = True
        skip_consonants = True

    if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
        if cur.char == "ת":
            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
            skip_diacritics = True
            skip_consonants = True
        else:
            # Geresh
            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
            skip_consonants = True

    elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES:  # dagesh
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
        skip_consonants = True
    elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
        skip_consonants = True

        if prev and "\u05b0" in prev.diac and re.findall("[\u05b9-\u05ba]", cur.diac):
            # ^ לִגְוֹעַ
            cur_phonemes.append("vo")
            skip_diacritics = True
            skip_consonants = True

        elif next and next.char == "ו":
            # One of them has holam

            holams = re.findall("[\u05b9-\u05ba]", cur.diac + next.diac)
            if len(holams) == 2:
                cur_phonemes.append("wo")
                skip_diacritics = True
                skip_offset += 1
            if len(holams) == 1:
                cur_phonemes.append("vo")
                skip_diacritics = True
                skip_offset += 1
            # patah and next.diac empty
            elif cur.diac == next.diac:
                # double Vav
                cur_phonemes.append("vu")
                skip_diacritics = True
                skip_offset += 1
            elif HIRIK in cur.diac:
                cur_phonemes.append("vi")
                skip_diacritics = True
            elif SHVA in cur.diac and not next.diac:
                cur_phonemes.append("v")
                skip_diacritics = True
            elif KAMATZ in cur.diac or PATAH in cur.diac:
                cur_phonemes.append("va")
                skip_diacritics = True
            elif SEGOL in cur.diac:
                cur_phonemes.append("ve")
                skip_diacritics = True
            else:
                # TODO ?
                # skip_consonants = False
                skip_diacritics = False
        else:
            # Single vav

            # Vav with Patah
            if re.search(PATAH_LIKE_PATTERN, cur.diac):
                cur_phonemes.append("va")

            # Tsere
            elif TSERE in cur.diac:
                cur_phonemes.append("ve")
            elif SEGOL in cur.diac:
                cur_phonemes.append("ve")
            # Holam haser
            elif HOLAM in cur.diac:
                cur_phonemes.append("o")
            # Shuruk / Kubutz
            elif KUBUTS in cur.diac or DAGESH in cur.diac:
                cur_phonemes.append("u")
            # Vav with Shva in start
            elif SHVA in cur.diac and not prev:
                cur_phonemes.append("ve")
            # Hirik
            elif HIRIK in cur.diac:
                cur_phonemes.append("vi")
            elif next and not cur.diac:
                # It is fine for now since we use Dicta
                skip_consonants = True
                skip_diacritics = True
            else:
                cur_phonemes.append("v")

            skip_diacritics = True

    if not skip_consonants:
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))

    if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
        cur_phonemes.append("o")
        skip_diacritics = True

    nikud_phonemes = []
    if not skip_diacritics:
        nikud_phonemes = [
            lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
        ]
    elif skip_diacritics and lexicon.HATAMA_DIACRITIC in cur.all_diac:
        nikud_phonemes = [lexicon.STRESS_PHONEME]
    cur_phonemes.extend(nikud_phonemes)
    # Ensure the stress is at the beginning of the syllable
    cur_phonemes = sort_stress(cur_phonemes, stress_placement)
    cur_phonemes = [
        p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)
    ]
    # Remove empty phonemes
    cur_phonemes = [p for p in cur_phonemes if p]
    return cur_phonemes, skip_offset
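This module is normally driven by Phonemizer, but it can be poked at directly. A small hypothetical sketch, assuming a diacritized word (output shown only via print, since the exact phoneme string depends on the rules above):

from phonikud.utils import get_letters
from phonikud.hebrew import phonemize_hebrew

letters = get_letters("שָׁלוֹם")
print("".join(phonemize_hebrew(letters, stress_placement="vowel")))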
phonikud/lexicon.py
ADDED
@@ -0,0 +1,115 @@
"""
ASCII IPA transcription of Hebrew consonants and vowels.
"""

# https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table

# Non standard diacritics
SHVA_NA_DIACRITIC = "\u05bd"  # Meteg
HATAMA_DIACRITIC = "\u05ab"  # Ole
PREFIX_DIACRITIC = "|"  # Vertical bar
NIKUD_HASER_DIACRITIC = "\u05af"  # Masora, not in use
EN_GERESH = "'"
NON_STANDARD_DIAC = "".join(
    [
        SHVA_NA_DIACRITIC,
        HATAMA_DIACRITIC,
        PREFIX_DIACRITIC,
        NIKUD_HASER_DIACRITIC,
        EN_GERESH,
    ]
)

HE_PATTERN = rf'[\u05b0-\u05ea{NON_STANDARD_DIAC}"]+'
# ^ Standard nikud and letters, ole, meteg, masora, vertical bar, en geresh
HE_NIKUD_PATTERN = rf"[\u05b0-\u05c7{NON_STANDARD_DIAC}]"
# ^ Nikud diacritics, ole, meteg, masora, vertical bar, en geresh
PUNCTUATION = set(r".,!? ")

STRESS_PHONEME = "ˈ"  # \u02c8, visually looks like a single quote
SPECIAL_PHONEMES = ["w"]
MODERN_SCHEMA = {
    "x": "χ",  # Het
    "r": "ʁ",  # Resh
    "g": "ɡ",  # Gimel
}

# Geresh
GERESH_PHONEMES = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}

# Consonants
LETTERS_PHONEMES = {
    "א": "ʔ",  # Alef
    "ב": "v",  # Bet
    "ג": "g",  # Gimel
    "ד": "d",  # Dalet
    "ה": "h",  # He
    "ו": "v",  # Vav
    "ז": "z",  # Zayin
    "ח": "x",  # Het
    "ט": "t",  # Tet
    "י": "j",  # Yod
    "ך": "x",  # Haf sofit
    "כ": "x",  # Haf
    "ל": "l",  # Lamed
    "ם": "m",  # Mem Sofit
    "מ": "m",  # Mem
    "ן": "n",  # Nun Sofit
    "נ": "n",  # Nun
    "ס": "s",  # Samekh
    "ע": "ʔ",  # Ayin, only voweled
    "פ": "f",  # Fey
    "ף": "f",  # Fey Sofit
    "ץ": "ts",  # Tsadik sofit
    "צ": "ts",  # Tsadik
    "ק": "k",  # Kuf
    "ר": "r",  # Resh
    "ש": "ʃ",  # Shin
    "ת": "t",  # Taf
    # Beged Kefet
    "בּ": "b",
    "כּ": "k",
    "פּ": "p",
    # Shin Sin
    "שׁ": "ʃ",
    "שׂ": "s",
    "'": "",
}

NIKUD_PHONEMES = {
    "\u05b4": "i",  # Hiriq
    "\u05b1": "e",  # Hataf segol
    "\u05b5": "e",  # Tsere
    "\u05b6": "e",  # Segol
    "\u05b2": "a",  # Hataf Patah
    "\u05b7": "a",  # Patah
    "\u05c7": "o",  # Kamatz katan
    "\u05b9": "o",  # Holam
    "\u05ba": "o",  # Holam haser for vav
    "\u05bb": "u",  # Qubuts
    "\u05b3": "o",  # Hataf qamats
    "\u05b8": "a",  # Kamatz
    HATAMA_DIACRITIC: STRESS_PHONEME,  # Stress (Hat'ama)
    SHVA_NA_DIACRITIC: "e",  # Shva na
}

DEDUPLICATE = {
    "\u05f3": "'",  # Hebrew geresh to regular geresh
    "־": "-",  # Hebrew Makaf to hyphen
}

# Sets
SET_PHONETIC_DIACRITICS = set([HATAMA_DIACRITIC, PREFIX_DIACRITIC, SHVA_NA_DIACRITIC])

ADDITIONAL_PHONEMES = set()  # When using fallback
SET_PHONEMES = set(
    sorted(
        {
            *NIKUD_PHONEMES.values(),
            *LETTERS_PHONEMES.values(),
            *GERESH_PHONEMES.values(),
            *MODERN_SCHEMA.values(),
            *SPECIAL_PHONEMES,
        }
    )
)
phonikud/log.py
ADDED
@@ -0,0 +1,35 @@
import logging
import os
import colorlog


def _create_logger():
    """
    Create a logger with colorized output
    Usage: LOG_LEVEL=DEBUG python <script.py>
    """

    handler = colorlog.StreamHandler()
    fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
    handler.setFormatter(
        colorlog.ColoredFormatter(
            fmt=fmt,
            log_colors={
                "DEBUG": "blue",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red",
            },
        )
    )
    # Get log level from LOG_LEVEL environment variable
    log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
    logger = colorlog.getLogger(__package__)
    logger.setLevel(level=getattr(logging, log_level, logging.WARNING))
    # Setup logging to stdout
    logger.addHandler(handler)
    return logger


log = _create_logger()
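Quick usage sketch of the package logger; the default level is WARNING, so debug output only appears when the environment variable is set (e.g. LOG_LEVEL=DEBUG python app.py):

from phonikud.log import log

log.debug("visible only when LOG_LEVEL=DEBUG")
log.warning("visible with the default level")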
phonikud/phonemize.py
ADDED
@@ -0,0 +1,130 @@
from phonikud import lexicon
from phonikud.variants import Letter
from .expander import Expander
from phonikud.utils import (
    get_letters,
    normalize,
    post_normalize,
    post_clean,
    add_milra_hatama,
    mark_shva_na,
    sort_hatama,
)
from typing import Callable, Literal
import regex as re
from phonikud.hebrew import phonemize_hebrew


class Phonemizer:
    # TODO: is that enough? what if there's punctuation around? other chars?
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation: bool,
        preserve_stress: bool,
        use_expander: bool,
        use_post_normalize: bool,  # For TTS
        predict_stress: bool,
        predict_shva_nah: bool,
        stress_placement: Literal["syllable", "vowel"],
        schema: Literal["plain", "modern"],
        fallback: Callable[[str], str] = None,
    ) -> str | list[str]:
        # normalize
        text = normalize(text)

        def fallback_replace_callback(match: re.Match):
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # skip
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            for c in phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)

        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match, original_text: str):
            word = match.group(0)
            start_offset = match.start()
            if start_offset > 0 and original_text[start_offset - 1] == "[":
                # Skip if it starts with [ as it's used for hyper phonemes
                return word

            if predict_shva_nah:
                word = mark_shva_na(word)
            if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
                word = add_milra_hatama(word)
            letters: list[Letter] = get_letters(word)
            letters = sort_hatama(letters)

            phonemes: list[str] = phonemize_hebrew(
                letters,
                stress_placement=stress_placement,
            )
            phonemes = "".join(phonemes)
            # syllables = get_syllables(phonemes)

            # phonemes_text = "".join(phonemes)
            # # if predict_stress and lexicon.STRESS not in phonemes_text and syllables:
            # #     if len(syllables) == 1:
            # #         syllables[-1] = lexicon.STRESS + syllables[-1]
            # #         syllables[-1] = "".join(sort_stress(syllables[-1]))
            # #     elif any(
            # #         remove_nikud(word).endswith(i) for i in lexicon.MILHEL_PATTERNS
            # #     ) or phonemes_text.endswith("ax"):
            # #         # insert lexicon.STRESS in the first character of syllables[-2]
            # #         syllables[-2] = lexicon.STRESS + syllables[-2]
            # #         syllables[-2] = "".join(sort_stress(syllables[-2]))
            # #     else:
            # #         # insert in syllables[-1]
            # #         syllables[-1] = lexicon.STRESS + syllables[-1]
            # #         syllables[-1] = "".join(sort_stress(syllables[-1]))

            # phonemes = "".join(syllables)
            if use_post_normalize:
                phonemes = post_normalize(phonemes)

            if schema == "modern":
                # We'll keep this feature simple for now
                for k, v in lexicon.MODERN_SCHEMA.items():
                    phonemes = re.sub(k, v, phonemes)
            return phonemes

        text = re.sub(
            lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
        )

        def hyper_phonemes_callback(match: re.Match):
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)
            for c in matched_phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return matched_phonemes  # The phoneme is in the second group

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(i for i in text if i not in [lexicon.STRESS_PHONEME])
        if use_post_normalize:
            # We don't keep hyphens in the output; replace them with a space instead
            text = post_clean(text)
        return text
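The hyper-phoneme syntax documented in hyper_phonemes_callback can be used from the top-level API; a short sketch (the mixed sentence is an illustrative example):

from phonikud import phonemize

# The bracketed word is replaced verbatim by the IPA given between /…/,
# while the Hebrew part goes through the normal pipeline
print(phonemize("[hello](/hɛˈloʊ/) עוֹלָם"))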
phonikud/syllables.py
ADDED
@@ -0,0 +1,103 @@
"""
https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table

TODO: add to phonikud?
"""

import regex as re
import phonikud

VOWEL_DIACS = [chr(i) for i in range(0x05B1, 0x05BC)] + [chr(0x05C7)] + [chr(0x5BD)]

STRESS = "\u05ab"
SHVA = "\u05b0"
DAGESH = "\u05bc"


def sort_diacritics(word: str):
    def sort_diacritics_callback(match):
        letter = match.group(1)
        diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
        return letter + diacritics

    return re.sub(r"(\p{L})(\p{M}+)", sort_diacritics_callback, word)


def has_vowel_diacs(s: str):
    if s == "וּ":
        return True
    return any(i in s for i in VOWEL_DIACS)


def get_syllables(word: str) -> list[str]:
    letters = phonikud.utils.get_letters(word)
    syllables, cur = [], ""
    vowel_state = False

    i = 0
    while i < len(letters):
        letter = letters[i]
        has_vowel = has_vowel_diacs(str(letter)) or (i == 0 and SHVA in letter.all_diac)
        # Look ahead
        vav1 = i + 2 < len(letters) and letters[i + 2].char == "ו"
        vav2 = i + 3 < len(letters) and letters[i + 3].char == "ו"

        if has_vowel:
            if vowel_state:
                syllables.append(cur)
                cur = str(letter)
            else:
                cur += str(letter)
            vowel_state = True
        else:
            cur += str(letter)

        i += 1

        # If two Vavs are coming: force the current syllable to end, and join both Vavs as the next syllable
        if vav1 and vav2:
            if cur:
                # Finish current syllable
                syllables.append(cur + str(letters[i]))
                cur = ""
            cur = str(letters[i + 1]) + str(letters[i + 2])
            i += 3  # skip past the double-vav
            vowel_state = True

        # If one Vav is coming, end the syllable now
        elif vav1 and letters[i + 1].diac:
            if cur:
                syllables.append(cur)
                cur = ""
            vowel_state = False

    if cur:
        syllables.append(cur)
    return syllables


def add_stress_to_syllable(s: str):
    letters = phonikud.utils.get_letters(s)
    letters[0].all_diac = STRESS + letters[0].all_diac
    return "".join(letter.char + letter.all_diac for letter in letters)


def add_stress(word: str, syllable_position: int):
    syllables: list[str] = get_syllables(word)

    if not syllables:
        return word  # no syllables, return original word

    # Normalize negative indices
    if syllable_position < 0:
        syllable_position += len(syllables)

    # Clamp to valid range
    syllable_position = max(0, min(syllable_position, len(syllables) - 1))

    stressed_syllable = syllables[syllable_position]
    stressed_syllable = add_stress_to_syllable(stressed_syllable)
    syllables[syllable_position] = stressed_syllable

    return "".join(syllables)
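A small sketch of the syllable helpers on a diacritized word (the word is an arbitrary example; the exact split depends on the rules above):

from phonikud.syllables import get_syllables, add_stress

word = "מַחְשֵׁב"
print(get_syllables(word))   # the word split into diacritized syllables
print(add_stress(word, -1))  # Ole (U+05AB) added to the first letter of the last syllable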
phonikud/utils.py
ADDED
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import lru_cache
|
2 |
+
from typing import Literal
|
3 |
+
from phonikud import lexicon
|
4 |
+
import unicodedata
|
5 |
+
import regex as re
|
6 |
+
import phonikud.syllables
|
7 |
+
from phonikud.variants import Letter
|
8 |
+
import phonikud
|
9 |
+
|
10 |
+
|
11 |
+
def sort_diacritics(match):
|
12 |
+
letter = match.group(1)
|
13 |
+
diacritics = "".join(sorted(match.group(2))) # Sort diacritics
|
14 |
+
return letter + diacritics
|
15 |
+
|
16 |
+
|
17 |
+
NORMALIZE_PATTERNS = {
|
18 |
+
# Sort diacritics
|
19 |
+
r"(\p{L})(\p{M}+)": sort_diacritics,
|
20 |
+
"״": '"', # Hebrew geresh to normal geresh
|
21 |
+
"׳": "'", # Same
|
22 |
+
}
|
23 |
+
|
24 |
+
|
25 |
+
def remove_nikud(text: str, to_keep=""):
|
26 |
+
pattern = lexicon.HE_NIKUD_PATTERN
|
27 |
+
pattern = "".join(i for i in pattern if i not in to_keep)
|
28 |
+
return re.sub(
|
29 |
+
pattern,
|
30 |
+
"",
|
31 |
+
text,
|
32 |
+
)
|
33 |
+
|
34 |
+
|
35 |
+
@lru_cache(maxsize=10000)
|
36 |
+
def normalize(text: str) -> str:
|
37 |
+
"""
|
38 |
+
Normalize unicode (decomposite)
|
39 |
+
Keep only Hebrew characters / punctuation / IPA
|
40 |
+
Sort diacritics
|
41 |
+
"""
|
42 |
+
|
43 |
+
# Decompose text
|
44 |
+
text = unicodedata.normalize("NFD", text)
|
45 |
+
for k, v in NORMALIZE_PATTERNS.items():
|
46 |
+
text = re.sub(k, v, text)
|
47 |
+
for k, v in lexicon.DEDUPLICATE.items():
|
48 |
+
text = re.sub(k, v, text)
|
49 |
+
return text
|
50 |
+
|
51 |
+
|
52 |
+
def post_normalize(phonemes: str):
|
53 |
+
new_phonemes = []
|
54 |
+
for word in phonemes.split(" "):
|
55 |
+
# remove glottal stop from end
|
56 |
+
word = re.sub(r"ʔ$", "", word)
|
57 |
+
# remove h from end
|
58 |
+
word = re.sub(r"h$", "", word)
|
59 |
+
word = re.sub(r"ˈh$", "", word)
|
60 |
+
# remove j followed by a i
|
61 |
+
word = re.sub(r"ij$", "i", word)
|
62 |
+
new_phonemes.append(word)
|
63 |
+
phonemes = " ".join(new_phonemes)
|
64 |
+
return phonemes
|
65 |
+
|
66 |
+
|
67 |
+
def post_clean(phonemes: str):
|
68 |
+
clean = []
|
69 |
+
for i in phonemes:
|
70 |
+
if i == "-":
|
71 |
+
clean.append(" ")
|
72 |
+
elif (
|
73 |
+
i in lexicon.SET_PHONEMES
|
74 |
+
or i in lexicon.ADDITIONAL_PHONEMES
|
75 |
+
or i == " "
|
76 |
+
or i in lexicon.PUNCTUATION
|
77 |
+
):
|
78 |
+
clean.append(i)
|
79 |
+
return "".join(clean)
|
80 |
+
|
81 |
+
|
82 |
+
letters_pattern = re.compile(r"(\p{L})([\p{M}'|]*)")
|
83 |
# @lru_cache(maxsize=10000) TODO?
def get_letters(word: str):
    letters: list[tuple[str, str]] = letters_pattern.findall(word)  # with en_geresh
    letters: list[Letter] = [Letter(i[0], i[1]) for i in letters]
    return letters


def get_unicode_names(text: str):
    return [unicodedata.name(c, "?") for c in text]


def has_vowel(s: str):
    return any(i in s for i in "aeiou")


def has_constant(s: str):
    return any(i not in "aeiou" for i in s)


def get_phoneme_syllables(phonemes: list[str]) -> list[str]:
    syllables = []
    cur_syllable = ""

    i = 0
    while i < len(phonemes):
        # Add current phoneme to the syllable
        cur_syllable += phonemes[i]

        # If we have a vowel in the current syllable
        if has_vowel(cur_syllable):
            # If there's a next phoneme that's a consonant followed by a vowel-containing phoneme
            if (
                i + 2 < len(phonemes)
                and not has_vowel(phonemes[i + 1])
                and has_vowel(phonemes[i + 2])
            ):
                # End the current syllable and start a new one
                syllables.append(cur_syllable)
                cur_syllable = ""
            # If we're at the end or the next phoneme has a vowel
            elif i + 1 >= len(phonemes) or has_vowel(phonemes[i + 1]):
                # End the current syllable
                syllables.append(cur_syllable)
                cur_syllable = ""

        i += 1

    # Add any remaining syllable
    if cur_syllable:
        syllables.append(cur_syllable)

    # Iterate over syllables and move any syllable ending with lexicon.STRESS to the next one
    for i in range(len(syllables) - 1):  # Ensure we're not at the last syllable
        if syllables[i].endswith(lexicon.STRESS_PHONEME):
            syllables[i + 1] = (
                lexicon.STRESS_PHONEME + syllables[i + 1]
            )  # Move stress to next syllable
            syllables[i] = syllables[i][
                : -len(lexicon.STRESS_PHONEME)
            ]  # Remove stress from current syllable

    return syllables


def sort_stress(
    phonemes: list[str], placement: Literal["syllable", "vowel"] = "vowel"
) -> list[str]:
    """
    TTS systems expect the stress mark BEFORE the vowel,
    while linguists place it at the START of the syllable.
    Use placement="syllable" to place it at the beginning.
    """
    if "ˈ" not in "".join(phonemes):
        # ^ Does not contain stress
        return phonemes
    if not any(i in "".join(phonemes) for i in "aeiou"):
        # ^ Does not contain a vowel
        return phonemes

    # Remove stress marker
    phonemes = [p for p in phonemes if p != "ˈ"]

    if placement == "syllable":
        return ["ˈ"] + phonemes

    # Define vowels
    vowels = "aeiou"

    # Find the first phoneme that contains a vowel, and inject the stress before the vowel
    for i, phoneme in enumerate(phonemes):
        for j, char in enumerate(phoneme):
            if char in vowels:
                # Insert stress before the vowel
                phonemes[i] = phoneme[:j] + "ˈ" + phoneme[j:]
                return phonemes

    # If no vowels found, return unchanged
    return phonemes


def mark_shva_na(word: str):
    """
    Shva Na is context-independent and can be predicted with just the word or a dictionary.
    See https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
    Note: we predict only if the Shva is on the first letter of the word
    Note: we assume that the word comes with | to mark prefix letters ('Txiliyot')
    Note: Shva Na rules mid-word are unreliable, so we don't code them.

    Meteg (\u05bd) will be added to the letter with Shva Na

    What we don't predict:
    (1) some shva at the beginning of future-tense forms (we don't know)
    (2) shva in the middle of the word
    """
    letters = get_letters(word)
    if not letters:
        return word
    if letters[0].char in "למנרי":
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
    elif len(letters) > 1 and letters[1].char in "אעה":
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
    elif letters[0].char in "וכלב" and lexicon.PREFIX_DIACRITIC in letters[0].all_diac:
        # ^ The nakdan should add |
        letters[0].all_diac += lexicon.SHVA_NA_DIACRITIC
    # Ensure that the prefix character will be last
    for letter in letters:
        if "|" in letter.all_diac:
            letter.all_diac = letter.all_diac.replace("|", "") + "|"
    return "".join(str(i) for i in letters)


def sort_hatama(letters: list[Letter]) -> list[Letter]:
    for i in range(len(letters) - 1):
        diacs = list(letters[i].all_diac)
        if lexicon.HATAMA_DIACRITIC in diacs and lexicon.NIKUD_HASER_DIACRITIC in diacs:
            diacs.remove(lexicon.HATAMA_DIACRITIC)
            letters[i].all_diac = "".join(diacs)  # Reassign the updated diacritics
            letters[i + 1].all_diac += lexicon.HATAMA_DIACRITIC
    return letters


def add_milra_hatama(word: str):
    syllables = phonikud.syllables.get_syllables(word)
    stress_index = -1

    if not syllables:
        return word

    if len(syllables) == 1:
        stress_index = 0

    # Get the last syllable
    milra = syllables[stress_index]
    # Get letters
    letters = get_letters(milra)
    # Add Hatama
    letters[0].all_diac += lexicon.HATAMA_DIACRITIC

    # Replace the last syllable
    syllables[stress_index] = "".join(str(i) for i in letters)
    return "".join(syllables)
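
Usage sketch for the syllable and stress helpers above (assumptions: this file is phonikud/utils.py, lexicon.STRESS_PHONEME is "ˈ", and the sample outputs are approximate):

    from phonikud.utils import get_phoneme_syllables, sort_stress

    phonemes = ["m", "a", "ˈ", "j", "i", "m"]   # hypothetical phoneme sequence
    print(get_phoneme_syllables(phonemes))      # roughly ["ma", "ˈjim"]
    print(sort_stress(["ˈ", "ma", "jim"]))      # stress re-inserted before the first vowel: ["mˈa", "jim"]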
phonikud/variants.py
ADDED
@@ -0,0 +1,20 @@
import phonikud
from phonikud import lexicon


class Letter:
    def __init__(self, char: str, diac: list[str]):
        self.char = phonikud.normalize(char)
        self.all_diac = phonikud.normalize(diac)
        self.diac = "".join(
            i for i in self.all_diac if i not in lexicon.SET_PHONETIC_DIACRITICS
        )

    def __repr__(self):
        return f"[Letter] {self.char}{''.join(self.all_diac)}"

    def __eq__(self, value: "Letter"):
        return value.all_diac == self.all_diac and value.char == self.char

    def __str__(self):
        return self.char + self.all_diac
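
A small illustration of the Letter wrapper above (assuming phonikud.normalize, imported from the package root, leaves this simple input unchanged):

    from phonikud.variants import Letter

    lamed = Letter("ל", "\u05b0")    # lamed with a shva
    print(str(lamed))                # base char followed by its diacritics: "לְ"
    print(repr(lamed))               # "[Letter] לְ"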
phonikud_onnx/__init__.py
ADDED
@@ -0,0 +1,41 @@
from .model import OnnxModel
import re


class Phonikud:
    def __init__(self, model_path: str):
        self.model = OnnxModel(model_path)

    def add_diacritics(
        self, sentences: list | str, mark_matres_lectionis: str | None = None
    ) -> str:
        """
        Adds nikud (Hebrew diacritics) to the given text.

        Parameters:
        - sentences (list | str): A string or a list of strings to be processed. Each string should not exceed 2048 characters.
        - mark_matres_lectionis (str | None, optional): A string used to mark nikud male. For example, if set to '|',
          "לִימּוּדָיו" will be returned as "לִי|מּוּדָיו". Default is None (no marking).

        Returns:
        - str: The text with added diacritics.
        """

        if isinstance(sentences, str):
            sentences = [sentences]
        result = self.model.predict(
            sentences, mark_matres_lectionis=mark_matres_lectionis
        )
        return result[0]

    def get_nikud_male(self, text: str, mark_matres_lectionis: str):
        """
        Remove the given mark character, keeping the marked letters as nikud male
        """
        return text.replace(mark_matres_lectionis, "")

    def get_nikud_haser(self, text: str):
        """
        Remove each letter marked as nikud male together with its mark character, keeping nikud haser
        """
        return re.sub(r".\|", "", text)  # Remove {char}{matres_lectionis}
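
Usage sketch for the Phonikud wrapper above, mirroring how app.py calls it; '|' as the matres-lectionis mark follows the docstring example:

    from phonikud_onnx import Phonikud

    phonikud = Phonikud("./phonikud-1.0.int8.onnx")
    marked = phonikud.add_diacritics("למדתי עברית", mark_matres_lectionis="|")
    nikud_male = phonikud.get_nikud_male(marked, mark_matres_lectionis="|")
    nikud_haser = phonikud.get_nikud_haser(marked)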
phonikud_onnx/model.py
ADDED
@@ -0,0 +1,197 @@
import onnxruntime as ort
import numpy as np
from tokenizers import Tokenizer
import re

# Constants
NIKUD_CLASSES = [
    "",
    "<MAT_LECT>",
    "\u05bc",
    "\u05b0",
    "\u05b1",
    "\u05b2",
    "\u05b3",
    "\u05b4",
    "\u05b5",
    "\u05b6",
    "\u05b7",
    "\u05b8",
    "\u05b9",
    "\u05ba",
    "\u05bb",
    "\u05bc\u05b0",
    "\u05bc\u05b1",
    "\u05bc\u05b2",
    "\u05bc\u05b3",
    "\u05bc\u05b4",
    "\u05bc\u05b5",
    "\u05bc\u05b6",
    "\u05bc\u05b7",
    "\u05bc\u05b8",
    "\u05bc\u05b9",
    "\u05bc\u05ba",
    "\u05bc\u05bb",
    "\u05c7",
    "\u05bc\u05c7",
]
SHIN_CLASSES = ["\u05c1", "\u05c2"]  # shin, sin
MAT_LECT_TOKEN = "<MAT_LECT>"
MATRES_LETTERS = list("אוי")
ALEF_ORD = ord("א")
TAF_ORD = ord("ת")
STRESS_CHAR = "\u05ab"  # "ole" symbol marks stress
MOBILE_SHVA_CHAR = "\u05bd"  # "meteg" symbol marks shva na (mobile shva)
PREFIX_CHAR = "|"


def is_hebrew_letter(char):
    return ALEF_ORD <= ord(char) <= TAF_ORD


def is_matres_letter(char):
    return char in MATRES_LETTERS


nikud_pattern = re.compile(r"[\u05B0-\u05BD\u05C1\u05C2\u05C7]")


def remove_nikkud(text):
    return nikud_pattern.sub("", text)


class OnnxModel:
    def __init__(
        self, model_path, tokenizer_name="dicta-il/dictabert-large-char-menaked"
    ):
        # Load the tokenizer
        self.tokenizer = Tokenizer.from_pretrained(tokenizer_name)

        # Create ONNX Runtime session
        self.session = ort.InferenceSession(model_path)
        self.input_names = [input.name for input in self.session.get_inputs()]
        self.output_names = [output.name for output in self.session.get_outputs()]

    def _create_inputs(self, sentences: list[str], padding: str):
        # Tokenize inputs using tokenizers library
        encodings = []
        for sentence in sentences:
            encoding = self.tokenizer.encode(sentence)
            encodings.append(encoding)

        # Get the max length for padding
        max_len = max(len(enc.ids) for enc in encodings) if padding == "longest" else 0

        # Prepare batch inputs
        input_ids = []
        attention_mask = []
        offset_mapping = []

        for encoding in encodings:
            ids = encoding.ids
            masks = [1] * len(ids)
            offsets = encoding.offsets

            # Pad if needed
            if padding == "longest" and len(ids) < max_len:
                padding_length = max_len - len(ids)
                ids = ids + [self.tokenizer.token_to_id("[PAD]")] * padding_length
                masks = masks + [0] * padding_length
                offsets = offsets + [(0, 0)] * padding_length

            input_ids.append(ids)
            attention_mask.append(masks)
            offset_mapping.append(offsets)

        # Convert to numpy arrays for ONNX Runtime
        return {
            "input_ids": np.array(input_ids, dtype=np.int64),
            "attention_mask": np.array(attention_mask, dtype=np.int64),
            # Token type IDs might be needed depending on your model
            "token_type_ids": np.zeros_like(np.array(input_ids, dtype=np.int64)),
        }, offset_mapping

    def predict(self, sentences, mark_matres_lectionis=None, padding="longest"):
        sentences = [remove_nikkud(sentence) for sentence in sentences]
        inputs, offset_mapping = self._create_inputs(sentences, padding)

        # Run inference
        outputs = self.session.run(self.output_names, inputs)

        # Process outputs based on output names
        nikud_idx = self.output_names.index("nikud_logits")
        shin_idx = self.output_names.index("shin_logits")
        nikud_logits = outputs[nikud_idx]
        shin_logits = outputs[shin_idx]

        additional_idx = self.output_names.index("additional_logits")
        additional_logits = outputs[additional_idx]

        # Get predictions
        nikud_predictions = np.argmax(nikud_logits, axis=-1)
        shin_predictions = np.argmax(shin_logits, axis=-1)
        stress_predictions = (additional_logits[..., 1] > 1).astype(np.int32)
        mobile_shva_predictions = (additional_logits[..., 2] > 1).astype(np.int32)
        prefix_predictions = (additional_logits[..., 3] > 1).astype(np.int32)

        ret = []
        for sent_idx, (sentence, sent_offsets) in enumerate(
            zip(sentences, offset_mapping)
        ):
            # Assign the nikud to each letter
            output = []
            prev_index = 0
            for idx, offsets in enumerate(sent_offsets):
                # Add anything we missed
                if offsets[0] > prev_index:
                    output.append(sentence[prev_index : offsets[0]])
                if offsets[1] - offsets[0] != 1:
                    continue

                # Get next char
                char = sentence[offsets[0] : offsets[1]]
                prev_index = offsets[1]
                if not is_hebrew_letter(char):
                    output.append(char)
                    continue

                nikud = NIKUD_CLASSES[nikud_predictions[sent_idx][idx]]
                shin = (
                    "" if char != "ש" else SHIN_CLASSES[shin_predictions[sent_idx][idx]]
                )

                # Check for matres lectionis
                if nikud == MAT_LECT_TOKEN:
                    if not is_matres_letter(char):
                        nikud = ""  # Don't allow matres on irrelevant letters
                    elif mark_matres_lectionis is not None:
                        nikud = mark_matres_lectionis
                    else:
                        output.append(char)
                        continue

                stress = (
                    STRESS_CHAR
                    if stress_predictions is not None
                    and stress_predictions[sent_idx][idx] == 1
                    else ""
                )
                mobile_shva = (
                    MOBILE_SHVA_CHAR
                    if mobile_shva_predictions is not None
                    and mobile_shva_predictions[sent_idx][idx] == 1
                    else ""
                )

                prefix = (
                    PREFIX_CHAR
                    if prefix_predictions is not None
                    and prefix_predictions[sent_idx][idx] == 1
                    else ""
                )

                output.append(char + shin + nikud + stress + mobile_shva + prefix)
            output.append(sentence[prev_index:])
            ret.append("".join(output))

        return ret
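
Sketch of driving OnnxModel directly (Phonikud above is a thin wrapper around it); the tokenizer is fetched from the dicta-il/dictabert-large-char-menaked repo named in the constructor:

    from phonikud_onnx.model import OnnxModel

    model = OnnxModel("./phonikud-1.0.int8.onnx")
    results = model.predict(["שלום עולם"], mark_matres_lectionis="|")  # one diacritized string per input sentence
    print(results[0])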
phonikud_onnx/py.typed
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,43 @@
# This file was autogenerated by uv via the following command:
#    uv export --no-hashes --no-emit-project
colorama==0.4.6 ; sys_platform == 'win32'
    # via
    #   colorlog
    #   pytest
    #   tqdm
colorlog==6.9.0
    # via phonikud
docopt==0.6.2
    # via num2words
exceptiongroup==1.3.0 ; python_full_version < '3.11'
    # via pytest
iniconfig==2.1.0
    # via pytest
num2words==0.5.14
    # via phonikud
numpy==2.2.6
    # via pandas
packaging==25.0
    # via pytest
pandas==2.2.3
pluggy==1.6.0
    # via pytest
pytest==8.3.5
python-dateutil==2.9.0.post0
    # via pandas
pytz==2025.2
    # via pandas
regex==2024.11.6
    # via phonikud
ruff==0.11.11
six==1.17.0
    # via python-dateutil
tomli==2.2.1 ; python_full_version < '3.11'
    # via pytest
tqdm==4.67.1
typing-extensions==4.13.2 ; python_full_version < '3.11'
    # via exceptiongroup
tzdata==2025.2
    # via pandas
gradio>=5.25.2
phonikud_onnx