|
--- |
|
license: gemma |
|
datasets: |
|
- ChallengerSpaceShuttle/zulu-pretraining-dataset |
|
language: |
|
- zu |
|
base_model: google/gemma-2-2b |
|
pipeline_tag: text-generation |
|
--- |
|
|
|
# BafoGPT-3B |
|
|
|
This model is the result of continued pretraining of the gemma-2-2b base model on the [ChallengerSpaceShuttle/zulu-pretraining-dataset](https://huggingface.co/datasets/ChallengerSpaceShuttle/zulu-pretraining-dataset) dataset.
|
|
|
This is the first iteration in an effort to build isiZulu models that can attain performance comparable to models that typically cost millions of dollars to train from scratch.
|
|
|
## Applications
|
|
|
This is the base model, with a context length of 8k tokens. It can generate coherent isiZulu text and can be fine-tuned on instruction datasets.
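
Below is a minimal generation sketch using `transformers`. The repo id is a placeholder used for illustration; replace it with the actual Hugging Face model id of this checkpoint.

```python
# Minimal generation sketch (the repo id below is a placeholder, not the official one).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "ChallengerSpaceShuttle/BafoGPT-3B"  # placeholder: use the actual model id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # matches the bf16-mixed training precision
    device_map="auto",
)

prompt = "Ngelinye ilanga"  # isiZulu for "One day"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```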
|
|
|
## Quantized models
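
No quantized checkpoints are listed here yet. As a sketch, the bf16 weights could be quantized on the fly at load time with `bitsandbytes` (this is an assumption about one possible workflow, not an official release; the repo id is again a placeholder):

```python
# Sketch: on-the-fly 4-bit quantization with bitsandbytes (not an official quantized release).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "ChallengerSpaceShuttle/BafoGPT-3B"  # placeholder: use the actual model id

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
```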
|
|
|
## Evaluation
|
|
|
## Configuration
|
|
|
The code used to train the model is available at [BafoGPT](https://github.com/Motsepe-Jr/bafoGPT/tree/main), and training was run with the following configuration.
|
|
|
```yaml
model_name: google/gemma-2-2b
out_dir: pretrained_model/models
precision: bf16-mixed
initial_checkpoint_dir: google/gemma-2-2b
resume: false
data:
  class_path: litgpt.data.LitData
  init_args:
    data_path: data
    seed: 42
    num_workers: 8
train:
  save_interval: 1000
  log_interval: 1
  global_batch_size: 4
  micro_batch_size: 1
  lr_warmup_steps: 2000
  max_tokens: 156800708
  max_seq_length: 2048
  tie_embeddings: false
  max_norm: 1.0
  min_lr: 4.0e-05
eval:
  interval: 1000
  max_iters: 100
  initial_validation: false
  final_validation: true
optimizer: AdamW
devices: auto
num_nodes: 1
tokenizer_dir: google/gemma-2-2b
logger_name: tensorboard
seed: 42
```
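
A few back-of-the-envelope numbers implied by the config above (a rough sketch; LitGPT derives the exact values internally, and gradient accumulation also depends on the number of devices):

```python
# Rough training-budget arithmetic implied by the config above (illustrative only).
global_batch_size = 4          # sequences per optimizer step
micro_batch_size = 1           # sequences per forward/backward pass
max_seq_length = 2048          # tokens per sequence
max_tokens = 156_800_708       # total token budget for continued pretraining

# Assuming a single device; LitGPT also divides by the device count.
grad_accum_steps = global_batch_size // micro_batch_size   # 4
tokens_per_step = global_batch_size * max_seq_length       # 8,192 tokens per optimizer step
optimizer_steps = max_tokens // tokens_per_step            # ~19,140 steps

print(grad_accum_steps, tokens_per_step, optimizer_steps)
```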
|
|
|
Architecture Config |
|
|
|
```json
{
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dtype": "float32",
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 288256
}
```
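
A small sketch for inspecting this architecture (e.g., its parameter count) without downloading weights, assuming the JSON above is saved locally as `config.json`:

```python
# Sketch: inspect the architecture from the config above without downloading weights.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("config.json")  # or pass the model's repo id
print(config.model_type, config.vocab_size, config.max_position_embeddings)

# Instantiate on the meta device so no real memory is allocated for the weights.
with torch.device("meta"):
    model = AutoModelForCausalLM.from_config(config)

total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params / 1e9:.2f}B parameters")
```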
|
|
|
|