wissamantoun committed on
Commit de21796 · verified · 1 Parent(s): ba06000

Update app.py

Files changed (1):
  1. app.py +5 -3
app.py CHANGED
@@ -27,10 +27,12 @@ initial_list_of_models = [
     "CohereForAI/c4ai-command-r-v01",
     "CohereForAI/c4ai-command-r-plus",
     "CohereForAI/aya-101",
+    "aubmindlab/bert-base-arabertv02-twitter",
     "aubmindlab/bert-base-arabertv02",
     "aubmindlab/bert-base-arabertv2",
     "aubmindlab/bert-base-arabertv01",
-    "aubmindlab/bert-base-arabert"
+    "aubmindlab/bert-base-arabert",
+    "aubmindlab/aragpt2-mega"
 ]
 
 dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
@@ -63,7 +65,7 @@ def benchmark_tokenizer(model_name) -> float:
         model_name, use_fast=True, trust_remote_code=True
     )
     vocab_size = tokenizer.vocab_size
-    if "arabert" in model_name:
+    if "arabert" in model_name or "aragpt2" in model_name:
         arabert_prep = ArabertPreprocessor(model_name=model_name)
         arabic_dataset1_preped = [ arabert_prep.preprocess(x) for x in tqdm(arabic_dataset1, desc="Arabert Prep Dataset 1")]
         arabic_dataset2_preped = [ arabert_prep.preprocess(x) for x in tqdm(arabic_dataset2, desc="Arabert Prep Dataset 2")]
@@ -192,7 +194,7 @@ def decode_bpe_tokens(tokens):
 
 def tokenize_text(text, chosen_model, better_tokenization=False):
     tokenizer = AutoTokenizer.from_pretrained(chosen_model)
-    if "arabert" in chosen_model:
+    if "arabert" in chosen_model or "aragpt2" in chosen_model:
         arabert_prep = ArabertPreprocessor(model_name=chosen_model)
         text = arabert_prep.preprocess(text)
     tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
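
The commit adds AraGPT2 to the leaderboard's model list and routes it through the same AraBERT preprocessing step before tokenization. Below is a minimal standalone sketch of that flow, not the app itself; it assumes the arabert package's documented import path and uses the aubmindlab/aragpt2-mega checkpoint added above:

    # Sketch only: preprocess Arabic text, then tokenize it, mirroring the
    # "arabert"/"aragpt2" branch introduced in this commit.
    # Assumes `pip install arabert transformers`; the import path below follows
    # the arabert package docs and is not part of this diff.
    from arabert.preprocess import ArabertPreprocessor
    from transformers import AutoTokenizer

    model_name = "aubmindlab/aragpt2-mega"
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )

    text = "مرحبا بالعالم"  # any Arabic input string
    if "arabert" in model_name or "aragpt2" in model_name:
        prep = ArabertPreprocessor(model_name=model_name)  # model-aware normalization
        text = prep.preprocess(text)

    tokens = tokenizer.tokenize(text)
    print(len(tokens), tokens)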