Update app.py
Browse files
app.py
CHANGED
@@ -27,10 +27,12 @@ initial_list_of_models = [
27 |       "CohereForAI/c4ai-command-r-v01",
28 |       "CohereForAI/c4ai-command-r-plus",
29 |       "CohereForAI/aya-101",
30 |       "aubmindlab/bert-base-arabertv02",
31 |       "aubmindlab/bert-base-arabertv2",
32 |       "aubmindlab/bert-base-arabertv01",
33 | -     "aubmindlab/bert-base-arabert"
34 |   ]
35 |
36 |   dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
@@ -63,7 +65,7 @@ def benchmark_tokenizer(model_name) -> float:
63 |           model_name, use_fast=True, trust_remote_code=True
64 |       )
65 |       vocab_size = tokenizer.vocab_size
66 | -     if "arabert" in model_name:
67 |           arabert_prep = ArabertPreprocessor(model_name=model_name)
68 |           arabic_dataset1_preped = [ arabert_prep.preprocess(x) for x in tqdm(arabic_dataset1, desc="Arabert Prep Dataset 1")]
69 |           arabic_dataset2_preped = [ arabert_prep.preprocess(x) for x in tqdm(arabic_dataset2, desc="Arabert Prep Dataset 2")]
@@ -192,7 +194,7 @@ def decode_bpe_tokens(tokens):
192 |
193 |   def tokenize_text(text, chosen_model, better_tokenization=False):
194 |       tokenizer = AutoTokenizer.from_pretrained(chosen_model)
195 | -     if "arabert" in chosen_model:
196 |           arabert_prep = ArabertPreprocessor(model_name=chosen_model)
197 |           text = arabert_prep.preprocess(text)
198 |       tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
|
|
27 |       "CohereForAI/c4ai-command-r-v01",
28 |       "CohereForAI/c4ai-command-r-plus",
29 |       "CohereForAI/aya-101",
30 | +     "aubmindlab/bert-base-arabertv02-twitter",
31 |       "aubmindlab/bert-base-arabertv02",
32 |       "aubmindlab/bert-base-arabertv2",
33 |       "aubmindlab/bert-base-arabertv01",
34 | +     "aubmindlab/bert-base-arabert",
35 | +     "aubmindlab/aragpt2-mega"
36 |   ]
37 |
38 |   dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
|
|
65 |           model_name, use_fast=True, trust_remote_code=True
66 |       )
67 |       vocab_size = tokenizer.vocab_size
68 | +     if "arabert" in model_name or "aragpt2" in model_name:
69 |           arabert_prep = ArabertPreprocessor(model_name=model_name)
70 |           arabic_dataset1_preped = [ arabert_prep.preprocess(x) for x in tqdm(arabic_dataset1, desc="Arabert Prep Dataset 1")]
71 |           arabic_dataset2_preped = [ arabert_prep.preprocess(x) for x in tqdm(arabic_dataset2, desc="Arabert Prep Dataset 2")]
|
|
194 |
195 |   def tokenize_text(text, chosen_model, better_tokenization=False):
196 |       tokenizer = AutoTokenizer.from_pretrained(chosen_model)
197 | +     if "arabert" in chosen_model or "aragpt2" in chosen_model:
198 |           arabert_prep = ArabertPreprocessor(model_name=chosen_model)
199 |           text = arabert_prep.preprocess(text)
200 |       tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))