# SPDX-FileCopyrightText: 2024 Idiap Research Institute
# SPDX-FileContributor: Karl El Hajal
#
# SPDX-License-Identifier: MIT

"""Text clean-up helpers: punctuation normalisation and number spelling."""

import re

from num2words import num2words

# Characters that are rewritten as a spaced comma (" , ") before splitting.
PUNCTUATION_TO_REPLACE_WITH_COMMA = ["(", ")", ":"]
# Characters that terminate a segment when splitting text.
PUNCTUATION = ".!?,"
SPACE = " "

# Compiled once at import time; matches each maximal run of digits.
_NUMBER_PATTERN = re.compile(r"\d+")


def replace_some_punctuation_with_comma(text):
    """Replace every character in PUNCTUATION_TO_REPLACE_WITH_COMMA by " , ".

    Args:
        text: The string to process.

    Returns:
        A new string with each listed character replaced by a comma
        surrounded by single spaces.
    """
    for punct in PUNCTUATION_TO_REPLACE_WITH_COMMA:
        text = text.replace(punct, " , ")
    return text


def split_string_on_punctuation(s):
    """Split ``s`` into segments, each ending at a PUNCTUATION character.

    The punctuation character stays attached to the end of its segment and
    every segment is stripped of surrounding whitespace. Text following the
    last punctuation character forms a final segment; it is stripped like
    the others and omitted when nothing but whitespace remains.

    Args:
        s: The string to split.

    Returns:
        A list of stripped segments in their original order.
    """
    substrings = []
    start = 0
    for index, char in enumerate(s):
        if char in PUNCTUATION:
            # Slice up to and including the punctuation character.
            substrings.append(s[start:index + 1].strip())
            start = index + 1

    # Treat the trailing remainder consistently with the other segments so
    # callers never receive leading/trailing whitespace or blank entries.
    remainder = s[start:].strip()
    if remainder:
        substrings.append(remainder)
    return substrings


def remove_punctuation_only_substrings(substrings):
    """Drop segments made up solely of PUNCTUATION characters and spaces.

    Empty strings are dropped as well, since they contain no other character.

    Args:
        substrings: An iterable of strings.

    Returns:
        A new list holding only the strings that contain at least one
        character that is neither in PUNCTUATION nor a space.
    """
    ignorable = PUNCTUATION + SPACE
    return [
        substring
        for substring in substrings
        if any(char not in ignorable for char in substring)
    ]


def clean_punctuation(text):
    """Normalise the punctuation of ``text``.

    Brackets and colons are turned into spaced commas, the text is split
    into punctuation-terminated segments, segments without any real content
    are discarded, and the rest are joined back with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = replace_some_punctuation_with_comma(text)
    substrings = split_string_on_punctuation(text)
    substrings = remove_punctuation_only_substrings(substrings)
    return SPACE.join(substrings)


def _spell_out_number(match):
    """Return the digits matched by ``match`` spelled out via num2words."""
    return num2words(int(match.group()))


def clean_input_text(text):
    """Lower-case, clean and normalise ``text``.

    Steps, in order: lower-case and strip the input, normalise punctuation
    with ``clean_punctuation``, replace every run of digits by the output of
    ``num2words``, collapse all whitespace runs to single spaces, and append
    a final "." when the non-empty result does not already end with a
    PUNCTUATION character.

    For example, "Hello (world)" becomes "hello , world ,".

    Args:
        text: The raw input string.

    Returns:
        The normalised string; an empty string if nothing is left.
    """
    text = text.lower().strip()
    text = clean_punctuation(text)
    text = _NUMBER_PATTERN.sub(_spell_out_number, text)
    text = " ".join(text.split())  # Remove extra spaces
    if text and text[-1] not in PUNCTUATION:
        text += "."
    return text