Saranath07
model to huggingface
1484d4b
raw
history blame
8.18 kB
import re
# Display name for every supported language code.
# Each value reads "<English name> - <name written in the language's own script>";
# English is the only entry without a native-script suffix.
# Entries are grouped by language family, then by the script used for the native name.
LANG_CODE_TO_DISPLAY_NAME = {
# Indo-Aryan
## Indic-scripts
'as' : "Assamese - অসমীয়া",
'bn' : "Bangla - বাংলা",
'doi': "Dogri - डोगरी",
'gom': "Goan Konkani - कोंकणी",
'gu' : "Gujarati - ગુજરાતી",
'hi' : "Hindi - हिंदी",
'mai': "Maithili - मैथिली",
'mr' : "Marathi - मराठी",
'ne' : "Nepali - नेपाली",
'or' : "Oriya - ଓଡ଼ିଆ",
'pa' : "Panjabi - ਪੰਜਾਬੀ",
'sa' : "Sanskrit - संस्कृतम्",
'si' : "Sinhala - සිංහල",
## Perso-Arabic scripts
'ks' : "Kashmiri - كٲشُر",
'pnb': "Panjabi (Western) - پن٘جابی",
'sd' : "Sindhi - سنڌي",
'skr': "Saraiki - سرائیکی",
'ur' : "Urdu - اُردُو",
## Misc
'dv' : "Dhivehi - ދިވެހި",
# Dravidian
'kn' : "Kannada - ಕನ್ನಡ",
'ml' : "Malayalam - മലയാളം",
'ta' : "Tamil - தமிழ்",
'te' : "Telugu - తెలుగు",
# Tibeto-Burman
'brx': "Boro - बड़ो",
'mni': "Manipuri - ꯃꯤꯇꯩꯂꯣꯟ",
# Munda
'sat': "Santali - ᱥᱟᱱᱛᱟᱲᱤ",
# Misc
'en' : "English",
}
# Language codes written in a Perso-Arabic script
PERSOARABIC_LANG_CODES = {'ks', 'pnb', 'sd', 'skr', 'ur'}

# Right-to-left languages: every Perso-Arabic language plus 'dv'.
# The union builds a new set, so PERSOARABIC_LANG_CODES itself is left untouched.
RTL_LANG_CODES = PERSOARABIC_LANG_CODES | {'dv'}
# Default/Official language to script mapping
# Keys are language codes; values are four-letter script codes.
# Each language gets exactly one script here, even if it can be written in several.
# Entries are grouped by language family.
LANG_CODE_TO_SCRIPT_CODE = {
# Indo-Aryan
"as" : "Beng",
"bn" : "Beng",
"doi" : "Deva",
"dv" : "Thaa",
"gom" : "Deva",
"gu" : "Gujr",
"hi" : "Deva",
"ks" : "Aran",
"mai" : "Deva",
"mr" : "Deva",
"ne" : "Deva",
"or" : "Orya",
"pa" : "Guru",
"pnb" : "Aran",
"sa" : "Deva",
"sd" : "Arab",
"si" : "Sinh",
"skr" : "Aran",
"ur" : "Aran",
# Dravidian
"kn" : "Knda",
"ml" : "Mlym",
"ta" : "Taml",
"te" : "Telu",
# Tibeto-Burman
"brx" : "Deva",
"mni" : "Mtei",
# Munda
"sat" : "Olck",
# Misc
"en" : "Latn",
}
# Unicode code-point ranges covered by each script, keyed by script code.
# Each value is one or more "start-end" ranges concatenated into a single string.
# NOTE(review): the "a-b" layout looks intended for embedding inside a regex
# character class ("[" + value + "]") — confirm against the callers.
SCRIPT_CODE_TO_UNICODE_CHARS_RANGE_STR = {
# ISO 15924 codes for script names
# North Indic
"Beng": "\u0980-\u09FF",
"Deva": "\u0900-\u097F",
"Gujr": "\u0A80-\u0AFF",
"Guru": "\u0A00-\u0A7F",
"Orya": "\u0B00-\u0B7F",
# South Indic
"Knda": "\u0C80-\u0CFF",
"Mlym": "\u0D00-\u0D7F",
"Sinh": "\u0D80-\u0DFF",
"Taml": "\u0B80-\u0BFF",
"Telu": "\u0C00-\u0C7F",
# Tibetic
"Mtei": "\uABC0-\uABFF",
# Misc
# "Arab" and "Aran" deliberately share the same four ranges
"Arab": "\u0600-\u06FF\u0750-\u077F\u0870-\u089F\u08A0-\u08FF", # Perso-Arabic
"Aran": "\u0600-\u06FF\u0750-\u077F\u0870-\u089F\u08A0-\u08FF", # Perso-Arabic (Nastaliq code)
"Latn": "\u0041-\u005A\u0061-\u007A", # includes only basic/unaccented Roman
"Olck": "\u1C50-\u1C7F",
"Thaa": "\u0780-\u07BF",
}
# Preferred font family name per language code.
# Only some languages are listed; the rest have no entry here.
# NOTE(review): presumably these are Google Fonts family names used when
# rendering text in these languages — confirm against the caller.
GOOGLE_FONTS = {
"gom": "Tiro Devanagari Marathi",
"ks" : "Noto Nastaliq Urdu",
"mni": "Noto Sans Meetei Mayek",
"mr" : "Tiro Devanagari Marathi",
"sa" : "Tiro Devanagari Sanskrit",
"sat": "Noto Sans Ol Chiki",
"sd" : "Lateef",
"ur" : "Noto Nastaliq Urdu",
}
# Generic font family ("serif" / "sans-serif") per language code.
# Covers exactly the same language codes as GOOGLE_FONTS.
FALLBACK_FONTS = {
"gom": "serif",
"ks" : "serif",
"mni": "sans-serif",
"mr" : "serif",
"sa" : "serif",
"sat": "sans-serif",
"sd" : "serif",
"ur" : "serif",
}
# Native punctuation mark -> Latin punctuation replacement.
# Keys are single characters; a value may be longer than one character
# (the "double" marks below are replaced by two periods).
INDIC_TO_LATIN_PUNCT = {
## List of all punctuations across languages
# Brahmic
'।': '.', # Nagari
## Archaic Indic
'॥': "..", # Sanskrit
'෴': '.', # Sinhala
## Meetei (influenced from Burmese)
'꫰': ',',
'꯫': '.',
# Ol Chiki
'᱾': '.',
'᱿': '..',
# Arabic
'۔': '.',
'؟': '?',
'،': ',',
'؛': ';',
'۝': "..",
}
# Translation table built from the mapping above; pass it to `str.translate`
# to replace every listed mark in one pass.
INDIC_TO_LATIN_PUNCT_TRANSLATOR = str.maketrans(INDIC_TO_LATIN_PUNCT)
# Languages whose sentence-final full stop is not the Latin period, mapped to
# the native full-stop character that should be used instead.
NON_LATIN_FULLSTOP_LANGS = {
    # Brahmic
    'as' : '।',
    'bn' : '।',
    'brx': '।',
    'doi': '।',
    'hi' : '।',
    'mai': '।',
    'mni': '꯫',
    'ne' : '।',
    'or' : '।',
    'pa' : '।',
    'sa' : '।',
    'sat': '᱾',
    # Nastaliq
    'ks' : '۔',
    'pnb': '۔',
    # 'sd' : '۔', # Sindhi uses Naskh, hence use latin
    'skr': '۔',
    'ur' : '۔',
}

# Matches (with `.match`) text whose last character is a single Latin full
# stop, i.e. a "." that is not preceded by another "." (so "..", "..." are
# left alone).
#  - raw string: "\." in a normal string literal is an invalid escape sequence
#  - re.DOTALL: lets `.*` cross newlines so multi-line text is recognised
#  - \Z instead of $: `$` also matches just before a trailing newline, which
#    would make the caller chop off the newline instead of the full stop
ENDS_WITH_LATIN_FULLSTOP_REGEX = re.compile(r"(^|.*[^.])\.\Z", re.DOTALL)


def nativize_latin_fullstop(text, lang_code):
    """Replace a trailing Latin full stop with the language's native one.

    Args:
        text: The text to post-process.
        lang_code: Language code; only codes present in
            ``NON_LATIN_FULLSTOP_LANGS`` trigger a replacement.

    Returns:
        ``text`` with its final "." swapped for the native full stop when the
        language has one and ``text`` ends in exactly one "." as its very last
        character. Otherwise ``text`` is returned unchanged (this includes
        text ending in "..", text followed by a trailing newline, and the
        empty string).
    """
    native_fullstop = NON_LATIN_FULLSTOP_LANGS.get(lang_code)
    if native_fullstop is None:
        return text
    if not ENDS_WITH_LATIN_FULLSTOP_REGEX.match(text):
        return text
    # The regex guarantees the last character is the full stop being replaced.
    return text[:-1] + native_fullstop
# Latin punctuation mark -> its Perso-Arabic counterpart.
LATIN_TO_PERSOARABIC_PUNCTUATIONS = {
# Except full-stop (since period-mark is ambiguous in usage, like fullforms)
'?': '؟',
',': '،',
';': '؛',
}
# Translation table built from the mapping above, for use with `str.translate`.
LATIN_TO_PERSOARABIC_PUNC_TRANSLATOR = str.maketrans(LATIN_TO_PERSOARABIC_PUNCTUATIONS)
# The ten digit characters of each script, ordered from zero to nine.
# Keys are ISO 15924 script codes.
SCRIPT_CODE_TO_NUMERALS = {
    # North Indic
    "Beng": "০১২৩৪৫৬৭৮৯",
    "Deva": "०१२३४५६७८९",
    "Gujr": "૦૧૨૩૪૫૬૭૮૯",
    "Guru": "੦੧੨੩੪੫੬੭੮੯",
    "Orya": "୦୧୨୩୪୫୬୭୮୯",
    # South Indic
    "Knda": "೦೧೨೩೪೫೬೭೮೯",
    "Mlym": "൦൧൨൩൪൫൬൭൮൯",
    "Sinh": "෦෧෨෩෪෫෬෭෮෯",
    "Taml": "௦௧௨௩௪௫௬௭௮௯",
    "Telu": "౦౧౨౩౪౫౬౭౮౯",
    # Tibetic
    "Mtei": "꯰꯱꯲꯳꯴꯵꯶꯷꯸꯹",
    # Misc
    "Arab": "۰۱۲۳۴۵۶۷۸۹", # Perso-Arabic numerals
    "Aran": "۰۱۲۳۴۵۶۷۸۹", # Perso-Arabic numerals
    "Latn": "0123456789",
    "Olck": "᱐᱑᱒᱓᱔᱕᱖᱗᱘᱙",
    "Thaa": "٠١٢٣٤٥٦٧٨٩", # East-Arabic numerals. (Dhivehi does code-mixing with Arabic)
}

# Digit string of every language, looked up through the language's script code
LANG_CODE_TO_NUMERALS = {
    lang_code: SCRIPT_CODE_TO_NUMERALS[script_code]
    for lang_code, script_code in LANG_CODE_TO_SCRIPT_CODE.items()
}

# One combined map from every known native digit to its Latin digit
# (the "en" pass simply maps the Latin digits onto themselves).
INDIC_TO_STANDARD_NUMERALS_GLOBAL_MAP = {}
for lang_code, lang_numerals in LANG_CODE_TO_NUMERALS.items():
    # Digits are paired positionally: i-th native digit <-> i-th Latin digit
    map_dict = dict(zip(lang_numerals, LANG_CODE_TO_NUMERALS["en"]))
    INDIC_TO_STANDARD_NUMERALS_GLOBAL_MAP.update(map_dict)
INDIC_TO_STANDARD_NUMERALS_TRANSLATOR = str.maketrans(INDIC_TO_STANDARD_NUMERALS_GLOBAL_MAP)

# Per-language `str.translate` tables: native digits -> Latin digits
NATIVE_TO_LATIN_NUMERALS_TRANSLATORS = {
    code: str.maketrans(dict(zip(native_digits, LANG_CODE_TO_NUMERALS["en"])))
    for code, native_digits in LANG_CODE_TO_NUMERALS.items()
    if code != "en"
}

# Per-language `str.translate` tables: Latin digits -> native digits
LATIN_TO_NATIVE_NUMERALS_TRANSLATORS = {
    code: str.maketrans(dict(zip(LANG_CODE_TO_NUMERALS["en"], native_digits)))
    for code, native_digits in LANG_CODE_TO_NUMERALS.items()
    if code != "en"
}
# A virama (halanta) character sitting at the very end of the string
WORDFINAL_INDIC_VIRAMA_REGEX = re.compile("(\u09cd|\u094d|\u0acd|\u0a4d|\u0b4d|\u0ccd|\u0d4d|\u0dca|\u0bcd|\u0c4d|\uaaf6)$")


def hardfix_wordfinal_virama(word):
    """Append a ZWNJ (U+200C) after a word-final virama.

    Only the virama characters listed in ``WORDFINAL_INDIC_VIRAMA_REGEX`` are
    affected, so this does nothing for non-Brahmic scripts (like Arabic &
    Ol-Chiki). A word that does not end in one of them is returned unchanged.
    """
    virama_then_zwnj = "\\1\u200c"
    return WORDFINAL_INDIC_VIRAMA_REGEX.sub(virama_then_zwnj, word)
# Odia virama immediately followed by one of five specific Odia letters
ODIA_CONFUSING_YUKTAKSHARA_REGEX = re.compile("(\u0b4d)(ବ|ଵ|ୱ|ଯ|ୟ)")


def fix_odia_confusing_ambiguous_yuktakshara(word):
    """Insert a ZWNJ (U+200C) between the virama and the following letter.

    Applies to every occurrence matched by ``ODIA_CONFUSING_YUKTAKSHARA_REGEX``;
    the ZWNJ forces the virama to be rendered instead of forming a conjunct.
    Words without such a sequence are returned unchanged.
    """
    virama_zwnj_letter = "\\1\u200c\\2"
    return ODIA_CONFUSING_YUKTAKSHARA_REGEX.sub(virama_zwnj_letter, word)
# Roman word whose last letter is a lowercase Latin consonant
LATIN_WORDFINAL_CONSONANTS_CHECKER_REGEX = re.compile(".*([bcdfghjklmnpqrstvwxyz])$")
# Devanagari consonant letter at the very end of the string
DEVANAGARI_WORDFINAL_CONSONANTS_REGEX = re.compile("([\u0915-\u0939\u0958-\u095f\u0979-\u097c\u097e-\u097f])$")


def explicit_devanagari_wordfinal_schwa_delete(roman_word, indic_word):
    """Append a virama (U+094D) to a Devanagari word ending in a bare consonant.

    The virama is only added when ``roman_word`` ends in a lowercase Latin
    consonant and ``indic_word`` ends in a Devanagari consonant letter; in
    every other case ``indic_word`` is returned as it was passed in.
    """
    if not LATIN_WORDFINAL_CONSONANTS_CHECKER_REGEX.match(roman_word):
        return indic_word
    consonant_plus_virama = "\\1\u094d"
    return DEVANAGARI_WORDFINAL_CONSONANTS_REGEX.sub(consonant_plus_virama, indic_word)
# Src: https://stackoverflow.com/questions/2556108/
def rreplace(text, find_pattern, replace_pattern, match_count=1):
    """Replace the last ``match_count`` occurrences of a substring.

    Args:
        text: String to operate on.
        find_pattern: Literal substring to look for (not a regex).
        replace_pattern: String substituted for each replaced occurrence.
        match_count: Maximum number of occurrences to replace, counted from
            the right-hand end of ``text``.

    Returns:
        The resulting string; ``text`` unchanged if ``find_pattern`` is absent.
    """
    # Split from the right at most `match_count` times, then stitch the pieces
    # back together with the replacement in place of the removed separators.
    return replace_pattern.join(text.rsplit(find_pattern, match_count))