Spaces:
Running
Running
File size: 2,738 Bytes
349b2ad 77f184e 30f2cde 349b2ad c7de0f6 c0a9b4d 305bf1a c0a9b4d 305bf1a c0a9b4d 86e7d18 c7de0f6 77f184e 349b2ad e5a3778 77f184e e5a3778 349b2ad 77f184e 349b2ad c010ef4 30f2cde 5fe2eb9 30f2cde 5fe2eb9 c010ef4 c7f3351 c010ef4 86e7d18 c010ef4 86e7d18 c010ef4 2ffc7e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import num2words
import re
def preprocess_text(text: str) -> str:
    """Normalize *text* into lowercase, spelled-out Ukrainian-oriented text.

    Steps, in order:

    1. Lowercase the whole string.
    2. Replace the currency signs ``$``, ``₴`` and ``€`` with Ukrainian words.
    3. Unify apostrophe, ellipsis, quote and dash variants.
    4. Join digits separated only by whitespace, then spell out numeric
       tokens with ``num2words`` (``lang="uk"``).
    5. Spell out any digit that is still left, one by one.
    6. Transliterate Latin letters to Cyrillic using a fixed table.

    Args:
        text: The raw input string.

    Returns:
        The normalized string. Digits handled by the per-digit fallback are
        each followed by a space, so the result may contain extra spaces.
    """
    text = text.lower()

    # currencies
    for sign, word in (("$", "долар"), ("₴", "гривня"), ("€", "євро")):
        text = text.replace(sign, word)

    # replace apostrophe
    text = text.replace("`", "'")
    text = text.replace("ʼ", "'")

    text = text.replace("…", "...")

    # Collapse typographic quote and dash variants to plain ASCII ones.
    symbols = {
        "”": '"',
        "“": '"',
        "’": '"',
        "‘": '"',
        "«": '"',
        "»": '"',
        "–": "-",
        "—": "-",
        "―": "-",
    }
    for symbol, value in symbols.items():
        text = text.replace(symbol, value)

    # numbers
    # Join digits separated only by whitespace ("10 000" -> "10000") so the
    # group is spelled out as one number below.
    text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)

    digits = "0123456789"
    numeric_chars = digits + ",."

    def detect_num_and_convert(word):
        """Spell out every numeric piece of a (possibly hyphenated) word."""
        result = []
        for part in word.split("-"):  # for handling complex words
            # A part counts as a number only if it is built from digits and
            # separators AND holds at least one digit; empty parts and bare
            # punctuation such as "..." are passed through untouched.
            is_number = all(ch in numeric_chars for ch in part) and any(
                ch in digits for ch in part
            )
            if not is_number:
                result.append(part)
                continue
            try:
                result.append(num2words.num2words(part, lang="uk"))
            except Exception:
                # Best effort: whatever num2words cannot parse is kept as-is
                # and picked up by the per-digit fallback below.
                result.append(part)
        return "-".join(result)

    text = " ".join(detect_num_and_convert(word) for word in text.split(" "))

    # fallback numbers: digits glued to letters or left over from a failed
    # conversion are spelled out one at a time.
    fallback_digits = {
        "1": "один ",
        "2": "два ",
        "3": "три ",
        "4": "чотири ",
        "5": "п'ять ",
        "6": "шість ",
        "7": "сім ",
        "8": "вісім ",
        "9": "дев'ять ",
        "0": "нуль ",
    }
    for digit, spelled in fallback_digits.items():
        text = text.replace(digit, spelled)

    # speak english alphabet using brute force transliteration
    # Digraphs come first: dicts keep insertion order, so "ch" is replaced
    # before the single letters "c" and "h" can break it apart.
    english = {
        "qu": "кв",
        "ch": "ч",
        "sh": "ш",
        "ph": "ф",
        "kh": "х",
        "a": "а",
        "b": "б",
        "c": "ц",
        "d": "д",
        "e": "е",
        "f": "ф",
        "g": "ґ",
        "h": "г",
        "i": "і",
        "j": "дж",
        "k": "к",
        "l": "л",
        "m": "м",
        "n": "н",
        "o": "о",
        "p": "п",
        "q": "кв",
        "r": "р",
        "s": "с",
        "t": "т",
        "u": "ю",
        "v": "в",
        "w": "в",
        "x": "кс",
        "y": "і",
        "z": "з",
    }
    # The text was lowercased at the top, so only the lowercase forms can
    # occur here; no separate uppercase pass is needed.
    for english_char, english_value in english.items():
        text = text.replace(english_char, english_value)

    return text
|