File size: 1,494 Bytes
22c7a44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from .model import OnnxModel
import re


class Phonikud:
    def __init__(self, model_path: str):
        self.model = OnnxModel(model_path)

    def add_diacritics(
        self, sentences: list | str, mark_matres_lectionis: str | None = None
    ) -> str:
        """
        Adds nikud (Hebrew diacritics) to the given text.

        Parameters:
        - sentences (list | str): A string or a list of strings to be processed. Each string should not exceed 2048 characters.
        - mark_matres_lectionis (str | None, optional): A string used to mark nikud male. For example, if set to '|',
            "ืœึดื™ืžึผื•ึผื“ึธื™ื•" will be returned as "ืœึดื™|ืžึผื•ึผื“ึธื™ื•". Default is None (no marking).

        Returns:
        - str: The text with added diacritics.
        """

        if isinstance(sentences, str):
            sentences = [sentences]
        result = self.model.predict(
            sentences, mark_matres_lectionis=mark_matres_lectionis
        )
        return result[0]

    def get_nikud_male(self, text: str, mark_matres_lectionis: str):
        """
        Based on given mark character remove the mark character to keep it as nikud male
        """
        return text.replace(mark_matres_lectionis, "")

    def get_nikud_haser(self, text: str):
        """
        Based on given mark_matres_lectionis remove the nikud nikud male character along with the mark character
        """
        return re.sub(r".\|", "", text)  # Remove {char}{matres_lectionis}