import re


# Maps each supported language code to its user-facing label, written as
# "<English name> - <name in the language's own script>".
LANG_CODE_TO_DISPLAY_NAME = {
    'as': "Assamese - অসমীয়া",
    'bn': "Bangla - বাংলা",
    'doi': "Dogri - डोगरी",
    'gom': "Goan Konkani - कोंकणी",
    'gu': "Gujarati - ગુજરાતી",
    'hi': "Hindi - हिंदी",
    'mai': "Maithili - मैथिली",
    'mr': "Marathi - मराठी",
    'ne': "Nepali - नेपाली",
    'or': "Oriya - ଓଡ଼ିଆ",
    'pa': "Panjabi - ਪੰਜਾਬੀ",
    'sa': "Sanskrit - संस्कृतम्",
    'si': "Sinhala - සිංහල",

    # Labels whose native name is written in Arabic script.
    'ks': "Kashmiri - كٲشُر",
    'pnb': "Panjabi (Western) - پن٘جابی",
    'sd': "Sindhi - سنڌي",
    'skr': "Saraiki - سرائیکی",
    'ur': "Urdu - اُردُو",

    'dv': "Dhivehi - ދިވެހި",

    'kn': "Kannada - ಕನ್ನಡ",
    'ml': "Malayalam - മലയാളം",
    'ta': "Tamil - தமிழ்",
    'te': "Telugu - తెలుగు",

    'brx': "Boro - बड़ो",
    'mni': "Manipuri - ꯃꯤꯇꯩꯂꯣꯟ",

    'sat': "Santali - ᱥᱟᱱᱛᱟᱲᱤ",

    'en': "English",
}


# Language codes that are written in a Perso-Arabic script.
PERSOARABIC_LANG_CODES = {
    'ks',
    'pnb',
    'sd',
    'skr',
    'ur',
}

# Right-to-left language codes: every Perso-Arabic language plus 'dv'.
# The union builds a new set, so PERSOARABIC_LANG_CODES is left untouched.
RTL_LANG_CODES = PERSOARABIC_LANG_CODES | {'dv'}


# Maps each supported language code to a four-letter script code.
# The script codes are the keys used by the script-level tables in this
# module (Unicode ranges, numerals).
LANG_CODE_TO_SCRIPT_CODE = {
    "as": "Beng",
    "bn": "Beng",
    "doi": "Deva",
    "dv": "Thaa",
    "gom": "Deva",
    "gu": "Gujr",
    "hi": "Deva",
    "ks": "Aran",
    "mai": "Deva",
    "mr": "Deva",
    "ne": "Deva",
    "or": "Orya",
    "pa": "Guru",
    "pnb": "Aran",
    "sa": "Deva",
    "sd": "Arab",
    "si": "Sinh",
    "skr": "Aran",
    "ur": "Aran",

    "kn": "Knda",
    "ml": "Mlym",
    "ta": "Taml",
    "te": "Telu",

    "brx": "Deva",
    "mni": "Mtei",

    "sat": "Olck",

    "en": "Latn",
}


# Maps each script code to the code-point range(s) of that script, written
# so the value can be dropped straight inside a regex character class,
# e.g. "[" + value + "]".  The "\u" escapes are resolved by Python, so each
# value holds the actual boundary characters separated by "-".
SCRIPT_CODE_TO_UNICODE_CHARS_RANGE_STR = {
    "Beng": "\u0980-\u09FF",
    "Deva": "\u0900-\u097F",
    "Gujr": "\u0A80-\u0AFF",
    "Guru": "\u0A00-\u0A7F",
    "Orya": "\u0B00-\u0B7F",

    "Knda": "\u0C80-\u0CFF",
    "Mlym": "\u0D00-\u0D7F",
    "Sinh": "\u0D80-\u0DFF",
    "Taml": "\u0B80-\u0BFF",
    "Telu": "\u0C00-\u0C7F",

    "Mtei": "\uABC0-\uABFF",

    # "Arab" and "Aran" share the same set of Arabic-script ranges.
    "Arab": "\u0600-\u06FF\u0750-\u077F\u0870-\u089F\u08A0-\u08FF",
    "Aran": "\u0600-\u06FF\u0750-\u077F\u0870-\u089F\u08A0-\u08FF",
    "Latn": "\u0041-\u005A\u0061-\u007A",  # A-Z and a-z only
    "Olck": "\u1C50-\u1C7F",
    "Thaa": "\u0780-\u07BF",
}


# Font family name per language code, for the languages that have an
# explicit font assigned here.
GOOGLE_FONTS = {
    "gom": "Tiro Devanagari Marathi",
    "ks": "Noto Nastaliq Urdu",
    "mni": "Noto Sans Meetei Mayek",
    "mr": "Tiro Devanagari Marathi",
    "sa": "Tiro Devanagari Sanskrit",
    "sat": "Noto Sans Ol Chiki",
    "sd": "Lateef",
    "ur": "Noto Nastaliq Urdu",
}


# Generic font family ("serif" / "sans-serif") per language code, keyed by
# the same language codes that have an explicit font assigned.
FALLBACK_FONTS = {
    "gom": "serif",
    "ks": "serif",
    "mni": "sans-serif",
    "mr": "serif",
    "sa": "serif",
    "sat": "sans-serif",
    "sd": "serif",
    "ur": "serif",
}


# Maps native punctuation marks to their Latin equivalents.
# Every key must be exactly one character, because the dict is passed to
# str.maketrans() below (which rejects keys of any other length).
INDIC_TO_LATIN_PUNCT = {
    '।': '.',   # U+0964 DEVANAGARI DANDA
    '॥': "..",  # U+0965 DEVANAGARI DOUBLE DANDA
    '෴': '.',   # U+0DF4 SINHALA PUNCTUATION KUNDDALIYA

    '꫰': ',',   # U+AAF0 MEETEI MAYEK CHEIKHAN
    '꯫': '.',   # U+ABEB MEETEI MAYEK CHEIKHEI

    '᱾': '.',   # U+1C7E OL CHIKI PUNCTUATION MUCAAD
    '᱿': '..',  # U+1C7F OL CHIKI PUNCTUATION DOUBLE MUCAAD

    '۔': '.',   # U+06D4 ARABIC FULL STOP
    '؟': '?',   # U+061F ARABIC QUESTION MARK
    '،': ',',   # U+060C ARABIC COMMA
    '؛': ';',   # U+061B ARABIC SEMICOLON
    # NOTE(review): this key had been reduced to an empty string, which makes
    # str.maketrans() raise ValueError at import time.  It is restored here as
    # U+06DD ARABIC END OF AYAH (an invisible format character that is easily
    # lost in copy/paste), matching the ".." used for the other verse-end
    # marks above -- confirm against the upstream source.
    '\u06dd': "..",
}

# Translation table for str.translate(): native punctuation -> Latin.
INDIC_TO_LATIN_PUNCT_TRANSLATOR = str.maketrans(INDIC_TO_LATIN_PUNCT)


# Native sentence-ending mark for each language code that does not use the
# Latin full stop.  Language codes absent from this dict keep ".".
NON_LATIN_FULLSTOP_LANGS = {
    'as': '।',
    'bn': '।',
    'brx': '।',
    'doi': '।',
    'hi': '।',
    'mai': '।',
    'mni': '꯫',
    'ne': '।',
    'or': '।',
    'pa': '।',
    'sa': '।',
    'sat': '᱾',

    'ks': '۔',
    'pnb': '۔',

    'skr': '۔',
    'ur': '۔',
}

# Matches (with .match()) text whose very last character is a "." that is
# not itself preceded by another ".", so "..", "..." etc. never match.
#  - raw string: "\." is an invalid escape sequence in a normal string literal
#  - re.DOTALL:  lets ".*" span newlines, so multi-line text is handled
#  - "\Z":       unlike "$", does not match before a trailing newline
ENDS_WITH_LATIN_FULLSTOP_REGEX = re.compile(r"(^|.*[^.])\.\Z", re.DOTALL)


def nativize_latin_fullstop(text: str, lang_code: str) -> str:
    """Swap a single trailing Latin full stop for the language's native one.

    Args:
        text: The text to post-process.
        lang_code: Language code of ``text``.  Only codes listed in
            ``NON_LATIN_FULLSTOP_LANGS`` are affected.

    Returns:
        ``text`` with its final ``.`` replaced by the native full stop when
        ``text`` ends in exactly one ``.``; otherwise ``text`` unchanged
        (including when it ends in ``..``/``...`` or in a newline).
    """
    if lang_code in NON_LATIN_FULLSTOP_LANGS and ENDS_WITH_LATIN_FULLSTOP_REGEX.match(text):
        return text[:-1] + NON_LATIN_FULLSTOP_LANGS[lang_code]
    return text


# Maps Latin punctuation marks to their Arabic-script counterparts.
LATIN_TO_PERSOARABIC_PUNCTUATIONS = {
    '?': '؟',  # U+061F ARABIC QUESTION MARK
    ',': '،',  # U+060C ARABIC COMMA
    ';': '؛',  # U+061B ARABIC SEMICOLON
}

# Translation table for str.translate(): Latin -> Arabic-script punctuation.
LATIN_TO_PERSOARABIC_PUNC_TRANSLATOR = str.maketrans(LATIN_TO_PERSOARABIC_PUNCTUATIONS)


# Maps each script code to its ten digit characters, ordered 0 through 9, so
# that the character at index i is the digit with value i.
SCRIPT_CODE_TO_NUMERALS = {
    "Beng": "০১২৩৪৫৬৭৮৯",
    "Deva": "०१२३४५६७८९",
    "Gujr": "૦૧૨૩૪૫૬૭૮૯",
    "Guru": "੦੧੨੩੪੫੬੭੮੯",
    "Orya": "୦୧୨୩୪୫୬୭୮୯",

    "Knda": "೦೧೨೩೪೫೬೭೮೯",
    "Mlym": "൦൧൨൩൪൫൬൭൮൯",
    "Sinh": "෦෧෨෩෪෫෬෭෮෯",
    "Taml": "௦௧௨௩௪௫௬௭௮௯",
    "Telu": "౦౧౨౩౪౫౬౭౮౯",

    "Mtei": "꯰꯱꯲꯳꯴꯵꯶꯷꯸꯹",

    "Arab": "۰۱۲۳۴۵۶۷۸۹",
    "Aran": "۰۱۲۳۴۵۶۷۸۹",
    "Latn": "0123456789",
    "Olck": "᱐᱑᱒᱓᱔᱕᱖᱗᱘᱙",
    "Thaa": "٠١٢٣٤٥٦٧٨٩",
}


# Digit string (0 through 9) for each language code, resolved via its script.
LANG_CODE_TO_NUMERALS = {
    lang_code: SCRIPT_CODE_TO_NUMERALS[script_code]
    for lang_code, script_code in LANG_CODE_TO_SCRIPT_CODE.items()
}

# Latin digits, looked up once and reused by every table below.
_LATIN_NUMERALS = LANG_CODE_TO_NUMERALS["en"]

# One combined map from every known native digit to its Latin digit.
# Built with a comprehension so no loop variables are left behind in the
# module namespace.  Latin digits are included and map to themselves.
INDIC_TO_STANDARD_NUMERALS_GLOBAL_MAP = {
    native_numeral: latin_numeral
    for native_numerals in LANG_CODE_TO_NUMERALS.values()
    for native_numeral, latin_numeral in zip(native_numerals, _LATIN_NUMERALS)
}

# Translation table for str.translate(): any native digit -> Latin digit.
INDIC_TO_STANDARD_NUMERALS_TRANSLATOR = str.maketrans(INDIC_TO_STANDARD_NUMERALS_GLOBAL_MAP)

# Per-language translation tables: that language's digits -> Latin digits.
# "en" is skipped because its table would be the identity mapping.
NATIVE_TO_LATIN_NUMERALS_TRANSLATORS = {
    lang_code: str.maketrans(dict(zip(native_numerals, _LATIN_NUMERALS)))
    for lang_code, native_numerals in LANG_CODE_TO_NUMERALS.items()
    if lang_code != "en"
}

# Per-language translation tables: Latin digits -> that language's digits.
LATIN_TO_NATIVE_NUMERALS_TRANSLATORS = {
    lang_code: str.maketrans(dict(zip(_LATIN_NUMERALS, native_numerals)))
    for lang_code, native_numerals in LANG_CODE_TO_NUMERALS.items()
    if lang_code != "en"
}


# Matches a virama at the end of the string.  Code points, in order:
# Bengali, Devanagari, Gujarati, Gurmukhi, Oriya, Kannada, Malayalam,
# Sinhala, Tamil, Telugu, Meetei Mayek (U+AAF6).
WORDFINAL_INDIC_VIRAMA_REGEX = re.compile(
    "([\u09cd\u094d\u0acd\u0a4d\u0b4d\u0ccd\u0d4d\u0dca\u0bcd\u0c4d\uaaf6])$"
)


def hardfix_wordfinal_virama(word: str) -> str:
    """Append a zero-width non-joiner after a word-final virama.

    Args:
        word: A single word.

    Returns:
        ``word`` with U+200C (ZERO WIDTH NON-JOINER) added after its last
        character when that character is one of the viramas matched by
        ``WORDFINAL_INDIC_VIRAMA_REGEX``; otherwise ``word`` unchanged.
    """
    return WORDFINAL_INDIC_VIRAMA_REGEX.sub("\\1\u200c", word)


# Matches the Oriya virama (U+0B4D) immediately followed by one of the
# letters listed in the second group.
ODIA_CONFUSING_YUKTAKSHARA_REGEX = re.compile("(\u0b4d)(ବ|ଵ|ୱ|ଯ|ୟ)")


def fix_odia_confusing_ambiguous_yuktakshara(word: str) -> str:
    """Separate the Oriya virama from certain following letters with a ZWNJ.

    Args:
        word: A single word.

    Returns:
        ``word`` with U+200C (ZERO WIDTH NON-JOINER) inserted between every
        virama and the letter after it, wherever
        ``ODIA_CONFUSING_YUKTAKSHARA_REGEX`` matches; other text is untouched.
    """
    return ODIA_CONFUSING_YUKTAKSHARA_REGEX.sub("\\1\u200c\\2", word)


# Matches (with .match()) a string whose last character is a lowercase Latin
# consonant letter.
LATIN_WORDFINAL_CONSONANTS_CHECKER_REGEX = re.compile(".*([bcdfghjklmnpqrstvwxyz])$")

# Matches a Devanagari consonant letter at the end of the string.
DEVANAGARI_WORDFINAL_CONSONANTS_REGEX = re.compile("([\u0915-\u0939\u0958-\u095f\u0979-\u097c\u097e-\u097f])$")


def explicit_devanagari_wordfinal_schwa_delete(roman_word: str, indic_word: str) -> str:
    """Add a virama to a word-final Devanagari consonant when the roman ends in one.

    Args:
        roman_word: The Latin-script form of the word; only its ending is
            inspected.
        indic_word: The Devanagari form of the same word.

    Returns:
        ``indic_word`` with U+094D (DEVANAGARI SIGN VIRAMA) appended when
        ``roman_word`` ends in a lowercase Latin consonant and ``indic_word``
        ends in a bare consonant letter; otherwise ``indic_word`` unchanged.
    """
    if LATIN_WORDFINAL_CONSONANTS_CHECKER_REGEX.match(roman_word):
        indic_word = DEVANAGARI_WORDFINAL_CONSONANTS_REGEX.sub("\\1\u094d", indic_word)
    return indic_word


def rreplace(text: str, find_pattern: str, replace_pattern: str, match_count: int = 1) -> str:
    """Replace occurrences of a substring, counting from the right.

    Args:
        text: The string to edit.
        find_pattern: Literal substring to look for (not a regular
            expression).  Must be non-empty; ``str.rsplit`` raises
            ``ValueError`` for an empty separator.
        replace_pattern: Text inserted in place of each replaced occurrence.
        match_count: Maximum number of occurrences to replace, taken from the
            end of ``text``.  ``-1`` replaces all of them; ``0`` replaces none.

    Returns:
        The edited string.
    """
    # Splitting from the right at most ``match_count`` times isolates the
    # right-most occurrences; joining the pieces back substitutes them.
    splits = text.rsplit(find_pattern, match_count)
    return replace_pattern.join(splits)