End of training
- README.md +19 -19
- pytorch_model-00001-of-00004.bin +1 -1
- pytorch_model-00002-of-00004.bin +1 -1
- pytorch_model-00003-of-00004.bin +1 -1
- pytorch_model-00004-of-00004.bin +1 -1
README.md
CHANGED
@@ -1,10 +1,11 @@
 ---
-license:
-base_model:
+license: llama3
+base_model: meta-llama/Meta-Llama-3-8B
 tags:
+- axolotl
 - generated_from_trainer
 model-index:
-- name:
+- name: Llama-3-8B-NuminaCoT
   results: []
 ---
 
@@ -16,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.4.1`
 ```yaml
-base_model:
+base_model: meta-llama/Meta-Llama-3-8B
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
 
@@ -27,22 +28,23 @@ strict: false
 datasets:
   - path: AI-MO/NuminaMath-CoT
     type: sharegpt.load_ultrachat
+
+chat_template: llama3
 dataset_prepared_path: /scratch/bf996/axolotl/datasets/numina
 val_set_size: 0.001
 output_dir: /scratch/bf996/axolotl/outputs/numina
-chat_template: llama3
 sequence_len: 8192
 sample_packing: true
 eval_sample_packing: false
 pad_to_sequence_len: true
 
-wandb_project:
+wandb_project: lm-evals
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_name: Llama-3-8B-NuminaCoT
 wandb_log_model:
+hub_model_id: penfever/Llama-3-8B-NuminaCoT
 
-shuffle_merged_datasets: true
 
 gradient_accumulation_steps: 8
 micro_batch_size: 1
@@ -50,7 +52,7 @@ num_epochs: 2
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 2e-5
-max_steps:
+max_steps: 10000
 
 train_on_inputs: false
 group_by_length: false
@@ -68,10 +70,10 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 100
-evals_per_epoch:
+evals_per_epoch: 0
 eval_table_size:
 save_strategy: steps
-save_steps:
+save_steps: 500
 save_total_limit: 5
 debug:
 deepspeed:
@@ -85,11 +87,12 @@ special_tokens:
 
 </details><br>
 
-
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/nyu-dice-lab/lm-evals/runs/ghe48g78)
+# Llama-3-8B-NuminaCoT
 
-This model is a fine-tuned version of [
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.
+- Loss: 0.3943
 
 ## Model description
 
@@ -126,11 +129,8 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| 0.
-| 0.
-| 0.47 | 0.9597 | 1730 | 0.4013 |
-| 0.3877 | 1.4265 | 2595 | 0.3950 |
-| 0.3924 | 1.9064 | 3460 | 0.3942 |
+| 0.4379 | 1.0130 | 1826 | 0.3994 |
+| 0.3928 | 1.9064 | 3460 | 0.3943 |
 
 
 ### Framework versions
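The regenerated README pins `hub_model_id: penfever/Llama-3-8B-NuminaCoT` and `chat_template: llama3`. As a hedged illustration only (the prompt, dtype, generation settings, and the presence of a saved chat template are assumptions, not part of this commit), loading the published checkpoint with `transformers` might look like this:

```python
# Minimal sketch (not from this repo): load the published checkpoint and
# prompt it, assuming the saved tokenizer carries the llama3 chat template.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "penfever/Llama-3-8B-NuminaCoT"  # hub_model_id from the config above

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# Placeholder math prompt; the actual evaluation prompts are not part of this commit.
messages = [{"role": "user", "content": "Solve step by step: what is 12 * 17?"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True))
```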
pytorch_model-00001-of-00004.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5e0cc9f6527c9af1389c4c041f4740ff6b6bed5353b7f21e2015cbec77ed78d4
 size 4976718466
pytorch_model-00002-of-00004.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:91c6851988a44e6d4a31111486ddbe18a14e4f376981b7650c49791ecf838515
 size 4999827718
pytorch_model-00003-of-00004.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0b0669b36ff3a762555ac39b4edd3242fb42ff73bb81c366fed0b7b802a44596
 size 4915940170
pytorch_model-00004-of-00004.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5337e1c352bea82087ced3063c1df19dd6dd69fc631d9aaaa4e296cadcfb6e0b
 size 1168140873
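Each shard diff only swaps the `oid sha256` digest inside the Git LFS pointer file; the recorded sizes are unchanged. A minimal sketch (the local filename is an assumption) for checking a downloaded shard against the digest recorded for shard 1 above:

```python
# Minimal sketch: hash a downloaded shard and compare it with the sha256
# from its Git LFS pointer (value shown for shard 1 of 4 in the diff above).
import hashlib

expected = "5e0cc9f6527c9af1389c4c041f4740ff6b6bed5353b7f21e2015cbec77ed78d4"

h = hashlib.sha256()
with open("pytorch_model-00001-of-00004.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)

print("OK" if h.hexdigest() == expected else "MISMATCH")
```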