Spaces:

thewh1teagle
/

phonemize-in-hebrew

Running

File size: 1,494 Bytes

22c7a44

from .model import OnnxModel
import re


class Phonikud:
    def __init__(self, model_path: str):
        self.model = OnnxModel(model_path)

    def add_diacritics(
        self, sentences: list | str, mark_matres_lectionis: str | None = None
    ) -> str:
        """
        Adds nikud (Hebrew diacritics) to the given text.

        Parameters:
        - sentences (list | str): A string or a list of strings to be processed. Each string should not exceed 2048 characters.
        - mark_matres_lectionis (str | None, optional): A string used to mark nikud male. For example, if set to '|',
            "לִימּוּדָיו" will be returned as "לִי|מּוּדָיו". Default is None (no marking).

        Returns:
        - str: The text with added diacritics.
        """

        if isinstance(sentences, str):
            sentences = [sentences]
        result = self.model.predict(
            sentences, mark_matres_lectionis=mark_matres_lectionis
        )
        return result[0]

    def get_nikud_male(self, text: str, mark_matres_lectionis: str):
        """
        Based on given mark character remove the mark character to keep it as nikud male
        """
        return text.replace(mark_matres_lectionis, "")

    def get_nikud_haser(self, text: str):
        """
        Based on given mark_matres_lectionis remove the nikud nikud male character along with the mark character
        """
        return re.sub(r".\|", "", text)  # Remove {char}{matres_lectionis}