samuellimabraz committed on
Commit
fa277b4
·
verified ·
1 Parent(s): 57d54f7

End of training

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/Qwen2-VL-2B-Instruct
3
+ library_name: transformers
4
+ model_name: Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned
5
+ tags:
6
+ - generated_from_trainer
7
+ - unsloth
8
+ - trl
9
+ - sft
10
+ licence: license
11
+ ---
12
+
13
+ # Model Card for Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned
14
+
15
+ This model is a fine-tuned version of [unsloth/Qwen2-VL-2B-Instruct](https://huggingface.co/unsloth/Qwen2-VL-2B-Instruct).
16
+ It has been trained using [TRL](https://github.com/huggingface/trl).
17
+
18
+ ## Quick start
19
+
20
+ ```python
21
+ from transformers import pipeline
22
+
23
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
24
+ generator = pipeline("text-generation", model="tech4humans/Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned", device="cuda")
25
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
26
+ print(output["generated_text"])
27
+ ```
28
+
29
+ ## Training procedure
30
+
31
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/samuel-lima-tech4humans/ocr-finetuning-v2/runs/rqtgmysz)
32
+
33
+
34
+ This model was trained with SFT.
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 0.15.1
39
+ - Transformers: 4.49.0.dev0
40
+ - PyTorch: 2.6.0
41
+ - Datasets: 3.3.2
42
+ - Tokenizers: 0.21.0
43
+
44
+ ## Citations
45
+
46
+
47
+
48
+ Cite TRL as:
49
+
50
+ ```bibtex
51
+ @misc{vonwerra2022trl,
52
+ title = {{TRL: Transformer Reinforcement Learning}},
53
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
54
+ year = 2020,
55
+ journal = {GitHub repository},
56
+ publisher = {GitHub},
57
+ howpublished = {\url{https://github.com/huggingface/trl}}
58
+ }
59
+ ```
adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/Qwen2-VL-2B-Instruct",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": false,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 8,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 4,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn).*?(?:qkv|proj|fc1|fc2|q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn)\\.(?:(?:qkv|proj|fc1|fc2|q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)))",
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73b26ff3aeea093ee7f91cbc4a169a4f2e36af3fc848c9bf5ee510db88ac9717
3
+ size 8335688
added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "all_params": 2211058176,
3
+ "memory_footprint": 4426269008,
4
+ "total_flos": 2.136820048293888e+16,
5
+ "train_loss": 0.2946822406862069,
6
+ "train_runtime": 4276.0328,
7
+ "train_samples_per_second": 0.745,
8
+ "train_steps_per_second": 0.093,
9
+ "trainable_params": 2072576,
10
+ "trainable_params_percent": 0.09373683707180756
11
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 12845056,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "longest_edge": 12845056,
26
+ "shortest_edge": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|vision_pad|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:948c45c29a91dd2e6ae77d6f5a324a3d408bcca6ad443365b2e79986f1422771
3
+ size 11420540
tokenizer_config.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
134
+ "clean_up_tokenization_spaces": false,
135
+ "eos_token": "<|im_end|>",
136
+ "errors": "replace",
137
+ "extra_special_tokens": {},
138
+ "model_max_length": 32768,
139
+ "pad_token": "<|vision_pad|>",
140
+ "padding_side": "right",
141
+ "processor_class": "Qwen2VLProcessor",
142
+ "split_special_tokens": false,
143
+ "tokenizer_class": "Qwen2Tokenizer",
144
+ "unk_token": null
145
+ }
train_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "all_params": 2211058176,
3
+ "memory_footprint": 4426269008,
4
+ "total_flos": 2.136820048293888e+16,
5
+ "train_loss": 0.2946822406862069,
6
+ "train_runtime": 4276.0328,
7
+ "train_samples_per_second": 0.745,
8
+ "train_steps_per_second": 0.093,
9
+ "trainable_params": 2072576,
10
+ "trainable_params_percent": 0.09373683707180756
11
+ }
trainer_state.json ADDED
@@ -0,0 +1,2157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.05344419553875923,
3
+ "best_model_checkpoint": "/content/train/Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned/checkpoint-270",
4
+ "epoch": 2.0451977401129944,
5
+ "eval_steps": 10,
6
+ "global_step": 270,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.007532956685499058,
13
+ "grad_norm": 0.68587327003479,
14
+ "learning_rate": 2e-05,
15
+ "loss": 1.6782,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.015065913370998116,
20
+ "grad_norm": 0.7287677526473999,
21
+ "learning_rate": 4e-05,
22
+ "loss": 1.7932,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.022598870056497175,
27
+ "grad_norm": 0.7718816995620728,
28
+ "learning_rate": 6e-05,
29
+ "loss": 1.6757,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.030131826741996232,
34
+ "grad_norm": 0.7753613591194153,
35
+ "learning_rate": 8e-05,
36
+ "loss": 1.7695,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.03766478342749529,
41
+ "grad_norm": 1.235795259475708,
42
+ "learning_rate": 0.0001,
43
+ "loss": 1.9245,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.04519774011299435,
48
+ "grad_norm": 0.569118082523346,
49
+ "learning_rate": 0.00012,
50
+ "loss": 1.451,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.05273069679849341,
55
+ "grad_norm": 0.6638339757919312,
56
+ "learning_rate": 0.00014,
57
+ "loss": 1.6576,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.060263653483992465,
62
+ "grad_norm": 0.6843408942222595,
63
+ "learning_rate": 0.00016,
64
+ "loss": 1.6339,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.06779661016949153,
69
+ "grad_norm": 0.5259923934936523,
70
+ "learning_rate": 0.00018,
71
+ "loss": 1.5687,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.07532956685499058,
76
+ "grad_norm": 0.655581533908844,
77
+ "learning_rate": 0.0002,
78
+ "loss": 1.6655,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.07532956685499058,
83
+ "eval_loss": 1.5389481782913208,
84
+ "eval_runtime": 47.9411,
85
+ "eval_samples_per_second": 1.564,
86
+ "eval_steps_per_second": 0.793,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 0.08286252354048965,
91
+ "grad_norm": 1.65678870677948,
92
+ "learning_rate": 0.0001999966879815833,
93
+ "loss": 1.7139,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 0.0903954802259887,
98
+ "grad_norm": 0.4999409019947052,
99
+ "learning_rate": 0.0001999867521457224,
100
+ "loss": 1.4695,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 0.09792843691148775,
105
+ "grad_norm": 0.6279143691062927,
106
+ "learning_rate": 0.0001999701931505708,
107
+ "loss": 1.42,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 0.10546139359698682,
112
+ "grad_norm": 0.47573018074035645,
113
+ "learning_rate": 0.00019994701209300245,
114
+ "loss": 1.3877,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.11299435028248588,
119
+ "grad_norm": 0.5120630264282227,
120
+ "learning_rate": 0.00019991721050853907,
121
+ "loss": 1.4014,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.12052730696798493,
126
+ "grad_norm": 0.4641444981098175,
127
+ "learning_rate": 0.00019988079037124864,
128
+ "loss": 1.2456,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 0.128060263653484,
133
+ "grad_norm": 0.5229088664054871,
134
+ "learning_rate": 0.00019983775409361447,
135
+ "loss": 1.3617,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 0.13559322033898305,
140
+ "grad_norm": 0.6793835759162903,
141
+ "learning_rate": 0.00019978810452637543,
142
+ "loss": 1.4584,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 0.1431261770244821,
147
+ "grad_norm": 0.530450165271759,
148
+ "learning_rate": 0.00019973184495833716,
149
+ "loss": 1.2412,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 0.15065913370998116,
154
+ "grad_norm": 0.5695556998252869,
155
+ "learning_rate": 0.00019966897911615416,
156
+ "loss": 1.2738,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 0.15065913370998116,
161
+ "eval_loss": 1.2209211587905884,
162
+ "eval_runtime": 37.1065,
163
+ "eval_samples_per_second": 2.021,
164
+ "eval_steps_per_second": 1.024,
165
+ "step": 20
166
+ },
167
+ {
168
+ "epoch": 0.15819209039548024,
169
+ "grad_norm": 0.5769656896591187,
170
+ "learning_rate": 0.00019959951116408294,
171
+ "loss": 1.2751,
172
+ "step": 21
173
+ },
174
+ {
175
+ "epoch": 0.1657250470809793,
176
+ "grad_norm": 0.6152491569519043,
177
+ "learning_rate": 0.0001995234457037063,
178
+ "loss": 1.2145,
179
+ "step": 22
180
+ },
181
+ {
182
+ "epoch": 0.17325800376647835,
183
+ "grad_norm": 0.6578372120857239,
184
+ "learning_rate": 0.00019944078777362826,
185
+ "loss": 1.1845,
186
+ "step": 23
187
+ },
188
+ {
189
+ "epoch": 0.1807909604519774,
190
+ "grad_norm": 0.5556841492652893,
191
+ "learning_rate": 0.00019935154284914065,
192
+ "loss": 1.0926,
193
+ "step": 24
194
+ },
195
+ {
196
+ "epoch": 0.18832391713747645,
197
+ "grad_norm": 0.7302567958831787,
198
+ "learning_rate": 0.00019925571684186006,
199
+ "loss": 1.1249,
200
+ "step": 25
201
+ },
202
+ {
203
+ "epoch": 0.1958568738229755,
204
+ "grad_norm": 0.6284404993057251,
205
+ "learning_rate": 0.00019915331609933657,
206
+ "loss": 0.9404,
207
+ "step": 26
208
+ },
209
+ {
210
+ "epoch": 0.2033898305084746,
211
+ "grad_norm": 0.776946485042572,
212
+ "learning_rate": 0.00019904434740463306,
213
+ "loss": 1.044,
214
+ "step": 27
215
+ },
216
+ {
217
+ "epoch": 0.21092278719397364,
218
+ "grad_norm": 0.7142918705940247,
219
+ "learning_rate": 0.00019892881797587601,
220
+ "loss": 0.9695,
221
+ "step": 28
222
+ },
223
+ {
224
+ "epoch": 0.2184557438794727,
225
+ "grad_norm": 0.8852341175079346,
226
+ "learning_rate": 0.0001988067354657773,
227
+ "loss": 0.8989,
228
+ "step": 29
229
+ },
230
+ {
231
+ "epoch": 0.22598870056497175,
232
+ "grad_norm": 0.8206908106803894,
233
+ "learning_rate": 0.00019867810796112744,
234
+ "loss": 0.8154,
235
+ "step": 30
236
+ },
237
+ {
238
+ "epoch": 0.22598870056497175,
239
+ "eval_loss": 0.8218569755554199,
240
+ "eval_runtime": 37.094,
241
+ "eval_samples_per_second": 2.022,
242
+ "eval_steps_per_second": 1.024,
243
+ "step": 30
244
+ },
245
+ {
246
+ "epoch": 0.2335216572504708,
247
+ "grad_norm": 0.9797173142433167,
248
+ "learning_rate": 0.0001985429439822596,
249
+ "loss": 0.7847,
250
+ "step": 31
251
+ },
252
+ {
253
+ "epoch": 0.24105461393596986,
254
+ "grad_norm": 1.0684410333633423,
255
+ "learning_rate": 0.00019840125248248564,
256
+ "loss": 0.823,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 0.24858757062146894,
261
+ "grad_norm": 1.009280800819397,
262
+ "learning_rate": 0.00019825304284750263,
263
+ "loss": 0.883,
264
+ "step": 33
265
+ },
266
+ {
267
+ "epoch": 0.256120527306968,
268
+ "grad_norm": 0.8165304660797119,
269
+ "learning_rate": 0.00019809832489477142,
270
+ "loss": 0.7012,
271
+ "step": 34
272
+ },
273
+ {
274
+ "epoch": 0.263653483992467,
275
+ "grad_norm": 0.794262707233429,
276
+ "learning_rate": 0.00019793710887286615,
277
+ "loss": 0.6529,
278
+ "step": 35
279
+ },
280
+ {
281
+ "epoch": 0.2711864406779661,
282
+ "grad_norm": 0.727675199508667,
283
+ "learning_rate": 0.0001977694054607955,
284
+ "loss": 0.6809,
285
+ "step": 36
286
+ },
287
+ {
288
+ "epoch": 0.2787193973634652,
289
+ "grad_norm": 0.7391637563705444,
290
+ "learning_rate": 0.00019759522576729533,
291
+ "loss": 0.6308,
292
+ "step": 37
293
+ },
294
+ {
295
+ "epoch": 0.2862523540489642,
296
+ "grad_norm": 0.7500622868537903,
297
+ "learning_rate": 0.00019741458133009258,
298
+ "loss": 0.5628,
299
+ "step": 38
300
+ },
301
+ {
302
+ "epoch": 0.2937853107344633,
303
+ "grad_norm": 0.962188184261322,
304
+ "learning_rate": 0.00019722748411514135,
305
+ "loss": 0.5857,
306
+ "step": 39
307
+ },
308
+ {
309
+ "epoch": 0.3013182674199623,
310
+ "grad_norm": 0.7300134301185608,
311
+ "learning_rate": 0.0001970339465158301,
312
+ "loss": 0.5631,
313
+ "step": 40
314
+ },
315
+ {
316
+ "epoch": 0.3013182674199623,
317
+ "eval_loss": 0.5341230630874634,
318
+ "eval_runtime": 37.0912,
319
+ "eval_samples_per_second": 2.022,
320
+ "eval_steps_per_second": 1.025,
321
+ "step": 40
322
+ },
323
+ {
324
+ "epoch": 0.3088512241054614,
325
+ "grad_norm": 0.5163620710372925,
326
+ "learning_rate": 0.00019683398135216066,
327
+ "loss": 0.528,
328
+ "step": 41
329
+ },
330
+ {
331
+ "epoch": 0.3163841807909605,
332
+ "grad_norm": 0.38112568855285645,
333
+ "learning_rate": 0.00019662760186989913,
334
+ "loss": 0.5219,
335
+ "step": 42
336
+ },
337
+ {
338
+ "epoch": 0.3239171374764595,
339
+ "grad_norm": 0.389498233795166,
340
+ "learning_rate": 0.00019641482173969848,
341
+ "loss": 0.5172,
342
+ "step": 43
343
+ },
344
+ {
345
+ "epoch": 0.3314500941619586,
346
+ "grad_norm": 0.5581079125404358,
347
+ "learning_rate": 0.00019619565505619288,
348
+ "loss": 0.5106,
349
+ "step": 44
350
+ },
351
+ {
352
+ "epoch": 0.3389830508474576,
353
+ "grad_norm": 0.38179025053977966,
354
+ "learning_rate": 0.00019597011633706415,
355
+ "loss": 0.5374,
356
+ "step": 45
357
+ },
358
+ {
359
+ "epoch": 0.3465160075329567,
360
+ "grad_norm": 0.40401706099510193,
361
+ "learning_rate": 0.00019573822052208013,
362
+ "loss": 0.4814,
363
+ "step": 46
364
+ },
365
+ {
366
+ "epoch": 0.3540489642184557,
367
+ "grad_norm": 0.3594434857368469,
368
+ "learning_rate": 0.00019549998297210502,
369
+ "loss": 0.4933,
370
+ "step": 47
371
+ },
372
+ {
373
+ "epoch": 0.3615819209039548,
374
+ "grad_norm": 0.34325098991394043,
375
+ "learning_rate": 0.00019525541946808188,
376
+ "loss": 0.4893,
377
+ "step": 48
378
+ },
379
+ {
380
+ "epoch": 0.3691148775894539,
381
+ "grad_norm": 0.3423003852367401,
382
+ "learning_rate": 0.00019500454620998732,
383
+ "loss": 0.4584,
384
+ "step": 49
385
+ },
386
+ {
387
+ "epoch": 0.3766478342749529,
388
+ "grad_norm": 0.3735145330429077,
389
+ "learning_rate": 0.00019474737981575832,
390
+ "loss": 0.4078,
391
+ "step": 50
392
+ },
393
+ {
394
+ "epoch": 0.3766478342749529,
395
+ "eval_loss": 0.4520163834095001,
396
+ "eval_runtime": 37.128,
397
+ "eval_samples_per_second": 2.02,
398
+ "eval_steps_per_second": 1.023,
399
+ "step": 50
400
+ },
401
+ {
402
+ "epoch": 0.384180790960452,
403
+ "grad_norm": 0.5755606293678284,
404
+ "learning_rate": 0.0001944839373201916,
405
+ "loss": 0.4468,
406
+ "step": 51
407
+ },
408
+ {
409
+ "epoch": 0.391713747645951,
410
+ "grad_norm": 0.3776421546936035,
411
+ "learning_rate": 0.00019421423617381508,
412
+ "loss": 0.5,
413
+ "step": 52
414
+ },
415
+ {
416
+ "epoch": 0.3992467043314501,
417
+ "grad_norm": 0.3351342976093292,
418
+ "learning_rate": 0.00019393829424173205,
419
+ "loss": 0.4443,
420
+ "step": 53
421
+ },
422
+ {
423
+ "epoch": 0.4067796610169492,
424
+ "grad_norm": 0.6081859469413757,
425
+ "learning_rate": 0.0001936561298024377,
426
+ "loss": 0.393,
427
+ "step": 54
428
+ },
429
+ {
430
+ "epoch": 0.4143126177024482,
431
+ "grad_norm": 0.40104803442955017,
432
+ "learning_rate": 0.00019336776154660841,
433
+ "loss": 0.4274,
434
+ "step": 55
435
+ },
436
+ {
437
+ "epoch": 0.4218455743879473,
438
+ "grad_norm": 0.44677644968032837,
439
+ "learning_rate": 0.00019307320857586376,
440
+ "loss": 0.4133,
441
+ "step": 56
442
+ },
443
+ {
444
+ "epoch": 0.4293785310734463,
445
+ "grad_norm": 0.36069607734680176,
446
+ "learning_rate": 0.00019277249040150092,
447
+ "loss": 0.3849,
448
+ "step": 57
449
+ },
450
+ {
451
+ "epoch": 0.4369114877589454,
452
+ "grad_norm": 1.2188339233398438,
453
+ "learning_rate": 0.00019246562694320255,
454
+ "loss": 0.4041,
455
+ "step": 58
456
+ },
457
+ {
458
+ "epoch": 0.4444444444444444,
459
+ "grad_norm": 0.4592845141887665,
460
+ "learning_rate": 0.00019215263852771718,
461
+ "loss": 0.4183,
462
+ "step": 59
463
+ },
464
+ {
465
+ "epoch": 0.4519774011299435,
466
+ "grad_norm": 1.6102626323699951,
467
+ "learning_rate": 0.00019183354588751271,
468
+ "loss": 0.4038,
469
+ "step": 60
470
+ },
471
+ {
472
+ "epoch": 0.4519774011299435,
473
+ "eval_loss": 0.38410142064094543,
474
+ "eval_runtime": 37.1151,
475
+ "eval_samples_per_second": 2.021,
476
+ "eval_steps_per_second": 1.024,
477
+ "step": 60
478
+ },
479
+ {
480
+ "epoch": 0.4595103578154426,
481
+ "grad_norm": 0.4766036868095398,
482
+ "learning_rate": 0.00019150837015940322,
483
+ "loss": 0.4346,
484
+ "step": 61
485
+ },
486
+ {
487
+ "epoch": 0.4670433145009416,
488
+ "grad_norm": 0.4366019368171692,
489
+ "learning_rate": 0.00019117713288314863,
490
+ "loss": 0.3804,
491
+ "step": 62
492
+ },
493
+ {
494
+ "epoch": 0.4745762711864407,
495
+ "grad_norm": 0.792560338973999,
496
+ "learning_rate": 0.00019083985600002818,
497
+ "loss": 0.3856,
498
+ "step": 63
499
+ },
500
+ {
501
+ "epoch": 0.4821092278719397,
502
+ "grad_norm": 0.427386075258255,
503
+ "learning_rate": 0.0001904965618513868,
504
+ "loss": 0.3906,
505
+ "step": 64
506
+ },
507
+ {
508
+ "epoch": 0.4896421845574388,
509
+ "grad_norm": 0.6638129949569702,
510
+ "learning_rate": 0.00019014727317715537,
511
+ "loss": 0.4039,
512
+ "step": 65
513
+ },
514
+ {
515
+ "epoch": 0.4971751412429379,
516
+ "grad_norm": 0.46441903710365295,
517
+ "learning_rate": 0.00018979201311434434,
518
+ "loss": 0.422,
519
+ "step": 66
520
+ },
521
+ {
522
+ "epoch": 0.504708097928437,
523
+ "grad_norm": 0.4845605790615082,
524
+ "learning_rate": 0.00018943080519551108,
525
+ "loss": 0.358,
526
+ "step": 67
527
+ },
528
+ {
529
+ "epoch": 0.512241054613936,
530
+ "grad_norm": 0.7461917400360107,
531
+ "learning_rate": 0.00018906367334720124,
532
+ "loss": 0.3956,
533
+ "step": 68
534
+ },
535
+ {
536
+ "epoch": 0.519774011299435,
537
+ "grad_norm": 0.6427743434906006,
538
+ "learning_rate": 0.0001886906418883636,
539
+ "loss": 0.3141,
540
+ "step": 69
541
+ },
542
+ {
543
+ "epoch": 0.527306967984934,
544
+ "grad_norm": 0.6577739119529724,
545
+ "learning_rate": 0.00018831173552873946,
546
+ "loss": 0.3455,
547
+ "step": 70
548
+ },
549
+ {
550
+ "epoch": 0.527306967984934,
551
+ "eval_loss": 0.3052687644958496,
552
+ "eval_runtime": 37.0939,
553
+ "eval_samples_per_second": 2.022,
554
+ "eval_steps_per_second": 1.024,
555
+ "step": 70
556
+ },
557
+ {
558
+ "epoch": 0.5348399246704332,
559
+ "grad_norm": 0.7122016549110413,
560
+ "learning_rate": 0.00018792697936722563,
561
+ "loss": 0.3519,
562
+ "step": 71
563
+ },
564
+ {
565
+ "epoch": 0.5423728813559322,
566
+ "grad_norm": 0.5734298229217529,
567
+ "learning_rate": 0.00018753639889021196,
568
+ "loss": 0.3051,
569
+ "step": 72
570
+ },
571
+ {
572
+ "epoch": 0.5499058380414312,
573
+ "grad_norm": 0.8871021270751953,
574
+ "learning_rate": 0.00018714001996989312,
575
+ "loss": 0.2803,
576
+ "step": 73
577
+ },
578
+ {
579
+ "epoch": 0.5574387947269304,
580
+ "grad_norm": 0.7467854022979736,
581
+ "learning_rate": 0.00018673786886255476,
582
+ "loss": 0.2741,
583
+ "step": 74
584
+ },
585
+ {
586
+ "epoch": 0.5649717514124294,
587
+ "grad_norm": 0.549818754196167,
588
+ "learning_rate": 0.0001863299722068344,
589
+ "loss": 0.2779,
590
+ "step": 75
591
+ },
592
+ {
593
+ "epoch": 0.5725047080979284,
594
+ "grad_norm": 0.5196639895439148,
595
+ "learning_rate": 0.00018591635702195673,
596
+ "loss": 0.3036,
597
+ "step": 76
598
+ },
599
+ {
600
+ "epoch": 0.5800376647834274,
601
+ "grad_norm": 0.532467782497406,
602
+ "learning_rate": 0.00018549705070594396,
603
+ "loss": 0.2767,
604
+ "step": 77
605
+ },
606
+ {
607
+ "epoch": 0.5875706214689266,
608
+ "grad_norm": 0.8568252325057983,
609
+ "learning_rate": 0.00018507208103380092,
610
+ "loss": 0.2224,
611
+ "step": 78
612
+ },
613
+ {
614
+ "epoch": 0.5951035781544256,
615
+ "grad_norm": 0.557944118976593,
616
+ "learning_rate": 0.00018464147615567517,
617
+ "loss": 0.2269,
618
+ "step": 79
619
+ },
620
+ {
621
+ "epoch": 0.6026365348399246,
622
+ "grad_norm": 0.886238157749176,
623
+ "learning_rate": 0.0001842052645949925,
624
+ "loss": 0.2658,
625
+ "step": 80
626
+ },
627
+ {
628
+ "epoch": 0.6026365348399246,
629
+ "eval_loss": 0.22510449588298798,
630
+ "eval_runtime": 37.1134,
631
+ "eval_samples_per_second": 2.021,
632
+ "eval_steps_per_second": 1.024,
633
+ "step": 80
634
+ },
635
+ {
636
+ "epoch": 0.6101694915254238,
637
+ "grad_norm": 0.5309122204780579,
638
+ "learning_rate": 0.00018376347524656734,
639
+ "loss": 0.2168,
640
+ "step": 81
641
+ },
642
+ {
643
+ "epoch": 0.6177024482109228,
644
+ "grad_norm": 0.5083054304122925,
645
+ "learning_rate": 0.00018331613737468887,
646
+ "loss": 0.2312,
647
+ "step": 82
648
+ },
649
+ {
650
+ "epoch": 0.6252354048964218,
651
+ "grad_norm": 9.135035514831543,
652
+ "learning_rate": 0.00018286328061118244,
653
+ "loss": 0.246,
654
+ "step": 83
655
+ },
656
+ {
657
+ "epoch": 0.632768361581921,
658
+ "grad_norm": 0.771587610244751,
659
+ "learning_rate": 0.00018240493495344694,
660
+ "loss": 0.2207,
661
+ "step": 84
662
+ },
663
+ {
664
+ "epoch": 0.64030131826742,
665
+ "grad_norm": 0.8555005788803101,
666
+ "learning_rate": 0.00018194113076246753,
667
+ "loss": 0.223,
668
+ "step": 85
669
+ },
670
+ {
671
+ "epoch": 0.647834274952919,
672
+ "grad_norm": 0.5555715560913086,
673
+ "learning_rate": 0.00018147189876080463,
674
+ "loss": 0.2114,
675
+ "step": 86
676
+ },
677
+ {
678
+ "epoch": 0.655367231638418,
679
+ "grad_norm": 0.6347367167472839,
680
+ "learning_rate": 0.00018099727003055894,
681
+ "loss": 0.2326,
682
+ "step": 87
683
+ },
684
+ {
685
+ "epoch": 0.6629001883239172,
686
+ "grad_norm": 0.7266764640808105,
687
+ "learning_rate": 0.00018051727601131227,
688
+ "loss": 0.257,
689
+ "step": 88
690
+ },
691
+ {
692
+ "epoch": 0.6704331450094162,
693
+ "grad_norm": 0.7240170240402222,
694
+ "learning_rate": 0.00018003194849804534,
695
+ "loss": 0.2001,
696
+ "step": 89
697
+ },
698
+ {
699
+ "epoch": 0.6779661016949152,
700
+ "grad_norm": 0.7595257759094238,
701
+ "learning_rate": 0.00017954131963903133,
702
+ "loss": 0.1747,
703
+ "step": 90
704
+ },
705
+ {
706
+ "epoch": 0.6779661016949152,
707
+ "eval_loss": 0.17160969972610474,
708
+ "eval_runtime": 37.1302,
709
+ "eval_samples_per_second": 2.02,
710
+ "eval_steps_per_second": 1.023,
711
+ "step": 90
712
+ },
713
+ {
714
+ "epoch": 0.6854990583804144,
715
+ "grad_norm": 0.7588114738464355,
716
+ "learning_rate": 0.00017904542193370663,
717
+ "loss": 0.1372,
718
+ "step": 91
719
+ },
720
+ {
721
+ "epoch": 0.6930320150659134,
722
+ "grad_norm": 0.7313429713249207,
723
+ "learning_rate": 0.0001785442882305179,
724
+ "loss": 0.2234,
725
+ "step": 92
726
+ },
727
+ {
728
+ "epoch": 0.7005649717514124,
729
+ "grad_norm": 0.8581743836402893,
730
+ "learning_rate": 0.0001780379517247462,
731
+ "loss": 0.1712,
732
+ "step": 93
733
+ },
734
+ {
735
+ "epoch": 0.7080979284369114,
736
+ "grad_norm": 1.0297523736953735,
737
+ "learning_rate": 0.0001775264459563081,
738
+ "loss": 0.1769,
739
+ "step": 94
740
+ },
741
+ {
742
+ "epoch": 0.7156308851224106,
743
+ "grad_norm": 0.5627338290214539,
744
+ "learning_rate": 0.00017700980480753423,
745
+ "loss": 0.1864,
746
+ "step": 95
747
+ },
748
+ {
749
+ "epoch": 0.7231638418079096,
750
+ "grad_norm": 1.0914690494537354,
751
+ "learning_rate": 0.0001764880625009245,
752
+ "loss": 0.1786,
753
+ "step": 96
754
+ },
755
+ {
756
+ "epoch": 0.7306967984934086,
757
+ "grad_norm": 0.6584937572479248,
758
+ "learning_rate": 0.00017596125359688154,
759
+ "loss": 0.131,
760
+ "step": 97
761
+ },
762
+ {
763
+ "epoch": 0.7382297551789078,
764
+ "grad_norm": 1.1257890462875366,
765
+ "learning_rate": 0.00017542941299142112,
766
+ "loss": 0.1678,
767
+ "step": 98
768
+ },
769
+ {
770
+ "epoch": 0.7457627118644068,
771
+ "grad_norm": 0.5444011688232422,
772
+ "learning_rate": 0.00017489257591386093,
773
+ "loss": 0.1562,
774
+ "step": 99
775
+ },
776
+ {
777
+ "epoch": 0.7532956685499058,
778
+ "grad_norm": 0.665874183177948,
779
+ "learning_rate": 0.00017435077792448664,
780
+ "loss": 0.189,
781
+ "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.7532956685499058,
785
+ "eval_loss": 0.13106876611709595,
786
+ "eval_runtime": 37.0741,
787
+ "eval_samples_per_second": 2.023,
788
+ "eval_steps_per_second": 1.025,
789
+ "step": 100
790
+ },
791
+ {
792
+ "epoch": 0.7608286252354048,
793
+ "grad_norm": 0.6252754330635071,
794
+ "learning_rate": 0.0001738040549121967,
795
+ "loss": 0.104,
796
+ "step": 101
797
+ },
798
+ {
799
+ "epoch": 0.768361581920904,
800
+ "grad_norm": 0.6250944137573242,
801
+ "learning_rate": 0.00017325244309212475,
802
+ "loss": 0.1582,
803
+ "step": 102
804
+ },
805
+ {
806
+ "epoch": 0.775894538606403,
807
+ "grad_norm": 0.7759442329406738,
808
+ "learning_rate": 0.00017269597900324097,
809
+ "loss": 0.1888,
810
+ "step": 103
811
+ },
812
+ {
813
+ "epoch": 0.783427495291902,
814
+ "grad_norm": 0.5639198422431946,
815
+ "learning_rate": 0.00017213469950593156,
816
+ "loss": 0.1223,
817
+ "step": 104
818
+ },
819
+ {
820
+ "epoch": 0.7909604519774012,
821
+ "grad_norm": 0.5083601474761963,
822
+ "learning_rate": 0.00017156864177955719,
823
+ "loss": 0.0838,
824
+ "step": 105
825
+ },
826
+ {
827
+ "epoch": 0.7984934086629002,
828
+ "grad_norm": 0.5559635758399963,
829
+ "learning_rate": 0.0001709978433199901,
830
+ "loss": 0.0855,
831
+ "step": 106
832
+ },
833
+ {
834
+ "epoch": 0.8060263653483992,
835
+ "grad_norm": 0.6353676319122314,
836
+ "learning_rate": 0.00017042234193713056,
837
+ "loss": 0.1105,
838
+ "step": 107
839
+ },
840
+ {
841
+ "epoch": 0.8135593220338984,
842
+ "grad_norm": 0.7712072134017944,
843
+ "learning_rate": 0.0001698421757524021,
844
+ "loss": 0.1402,
845
+ "step": 108
846
+ },
847
+ {
848
+ "epoch": 0.8210922787193974,
849
+ "grad_norm": 0.7416761517524719,
850
+ "learning_rate": 0.00016925738319622654,
851
+ "loss": 0.0932,
852
+ "step": 109
853
+ },
854
+ {
855
+ "epoch": 0.8286252354048964,
856
+ "grad_norm": 0.7182126045227051,
857
+ "learning_rate": 0.00016866800300547813,
858
+ "loss": 0.131,
859
+ "step": 110
860
+ },
861
+ {
862
+ "epoch": 0.8286252354048964,
863
+ "eval_loss": 0.09729403257369995,
864
+ "eval_runtime": 37.1303,
865
+ "eval_samples_per_second": 2.02,
866
+ "eval_steps_per_second": 1.023,
867
+ "step": 110
868
+ },
869
+ {
870
+ "epoch": 0.8361581920903954,
871
+ "grad_norm": 0.6551967263221741,
872
+ "learning_rate": 0.00016807407422091784,
873
+ "loss": 0.1161,
874
+ "step": 111
875
+ },
876
+ {
877
+ "epoch": 0.8436911487758946,
878
+ "grad_norm": 0.6405838131904602,
879
+ "learning_rate": 0.0001674756361846071,
880
+ "loss": 0.1454,
881
+ "step": 112
882
+ },
883
+ {
884
+ "epoch": 0.8512241054613936,
885
+ "grad_norm": 0.4275994598865509,
886
+ "learning_rate": 0.00016687272853730192,
887
+ "loss": 0.0897,
888
+ "step": 113
889
+ },
890
+ {
891
+ "epoch": 0.8587570621468926,
892
+ "grad_norm": 0.6592651605606079,
893
+ "learning_rate": 0.00016626539121582685,
894
+ "loss": 0.0534,
895
+ "step": 114
896
+ },
897
+ {
898
+ "epoch": 0.8662900188323918,
899
+ "grad_norm": 0.6205569505691528,
900
+ "learning_rate": 0.0001656536644504298,
901
+ "loss": 0.1361,
902
+ "step": 115
903
+ },
904
+ {
905
+ "epoch": 0.8738229755178908,
906
+ "grad_norm": 0.5345686078071594,
907
+ "learning_rate": 0.0001650375887621171,
908
+ "loss": 0.0923,
909
+ "step": 116
910
+ },
911
+ {
912
+ "epoch": 0.8813559322033898,
913
+ "grad_norm": 1.0165270566940308,
914
+ "learning_rate": 0.00016441720495996912,
915
+ "loss": 0.0852,
916
+ "step": 117
917
+ },
918
+ {
919
+ "epoch": 0.8888888888888888,
920
+ "grad_norm": 0.48809266090393066,
921
+ "learning_rate": 0.00016379255413843754,
922
+ "loss": 0.0839,
923
+ "step": 118
924
+ },
925
+ {
926
+ "epoch": 0.896421845574388,
927
+ "grad_norm": 0.650384247303009,
928
+ "learning_rate": 0.0001631636776746228,
929
+ "loss": 0.102,
930
+ "step": 119
931
+ },
932
+ {
933
+ "epoch": 0.903954802259887,
934
+ "grad_norm": 0.6523996591567993,
935
+ "learning_rate": 0.00016253061722553355,
936
+ "loss": 0.0661,
937
+ "step": 120
938
+ },
939
+ {
940
+ "epoch": 0.903954802259887,
941
+ "eval_loss": 0.07857384532690048,
942
+ "eval_runtime": 37.0902,
943
+ "eval_samples_per_second": 2.022,
944
+ "eval_steps_per_second": 1.025,
945
+ "step": 120
946
+ },
947
+ {
948
+ "epoch": 0.911487758945386,
949
+ "grad_norm": 0.4382803738117218,
950
+ "learning_rate": 0.00016189341472532705,
951
+ "loss": 0.0582,
952
+ "step": 121
953
+ },
954
+ {
955
+ "epoch": 0.9190207156308852,
956
+ "grad_norm": 0.6267339587211609,
957
+ "learning_rate": 0.0001612521123825317,
958
+ "loss": 0.079,
959
+ "step": 122
960
+ },
961
+ {
962
+ "epoch": 0.9265536723163842,
963
+ "grad_norm": 0.700908899307251,
964
+ "learning_rate": 0.00016060675267725083,
965
+ "loss": 0.1022,
966
+ "step": 123
967
+ },
968
+ {
969
+ "epoch": 0.9340866290018832,
970
+ "grad_norm": 0.4881342351436615,
971
+ "learning_rate": 0.00015995737835834906,
972
+ "loss": 0.063,
973
+ "step": 124
974
+ },
975
+ {
976
+ "epoch": 0.9416195856873822,
977
+ "grad_norm": 0.4968627989292145,
978
+ "learning_rate": 0.00015930403244062043,
979
+ "loss": 0.0675,
980
+ "step": 125
981
+ },
982
+ {
983
+ "epoch": 0.9491525423728814,
984
+ "grad_norm": 0.4240921437740326,
985
+ "learning_rate": 0.00015864675820193922,
986
+ "loss": 0.0531,
987
+ "step": 126
988
+ },
989
+ {
990
+ "epoch": 0.9566854990583804,
991
+ "grad_norm": 0.3779008984565735,
992
+ "learning_rate": 0.00015798559918039307,
993
+ "loss": 0.0481,
994
+ "step": 127
995
+ },
996
+ {
997
+ "epoch": 0.9642184557438794,
998
+ "grad_norm": 0.471587210893631,
999
+ "learning_rate": 0.00015732059917139912,
1000
+ "loss": 0.0698,
1001
+ "step": 128
1002
+ },
1003
+ {
1004
+ "epoch": 0.9717514124293786,
1005
+ "grad_norm": 0.44407761096954346,
1006
+ "learning_rate": 0.0001566518022248029,
1007
+ "loss": 0.1005,
1008
+ "step": 129
1009
+ },
1010
+ {
1011
+ "epoch": 0.9792843691148776,
1012
+ "grad_norm": 0.4785122275352478,
1013
+ "learning_rate": 0.00015597925264196049,
1014
+ "loss": 0.0784,
1015
+ "step": 130
1016
+ },
1017
+ {
1018
+ "epoch": 0.9792843691148776,
1019
+ "eval_loss": 0.07214296609163284,
1020
+ "eval_runtime": 37.1186,
1021
+ "eval_samples_per_second": 2.021,
1022
+ "eval_steps_per_second": 1.024,
1023
+ "step": 130
1024
+ },
1025
+ {
1026
+ "epoch": 0.9868173258003766,
1027
+ "grad_norm": 0.4364639222621918,
1028
+ "learning_rate": 0.00015530299497280395,
1029
+ "loss": 0.046,
1030
+ "step": 131
1031
+ },
1032
+ {
1033
+ "epoch": 0.9943502824858758,
1034
+ "grad_norm": 0.6649202108383179,
1035
+ "learning_rate": 0.0001546230740128904,
1036
+ "loss": 0.0618,
1037
+ "step": 132
1038
+ },
1039
+ {
1040
+ "epoch": 1.0075329566854991,
1041
+ "grad_norm": 0.662251353263855,
1042
+ "learning_rate": 0.00015393953480043467,
1043
+ "loss": 0.1003,
1044
+ "step": 133
1045
+ },
1046
+ {
1047
+ "epoch": 1.015065913370998,
1048
+ "grad_norm": 0.36134594678878784,
1049
+ "learning_rate": 0.000153252422613326,
1050
+ "loss": 0.0403,
1051
+ "step": 134
1052
+ },
1053
+ {
1054
+ "epoch": 1.0225988700564972,
1055
+ "grad_norm": 0.512718677520752,
1056
+ "learning_rate": 0.00015256178296612868,
1057
+ "loss": 0.0673,
1058
+ "step": 135
1059
+ },
1060
+ {
1061
+ "epoch": 1.0301318267419963,
1062
+ "grad_norm": 0.4086618721485138,
1063
+ "learning_rate": 0.0001518676616070674,
1064
+ "loss": 0.0943,
1065
+ "step": 136
1066
+ },
1067
+ {
1068
+ "epoch": 1.0376647834274952,
1069
+ "grad_norm": 0.3207029402256012,
1070
+ "learning_rate": 0.00015117010451499654,
1071
+ "loss": 0.0865,
1072
+ "step": 137
1073
+ },
1074
+ {
1075
+ "epoch": 1.0451977401129944,
1076
+ "grad_norm": 0.2941770553588867,
1077
+ "learning_rate": 0.0001504691578963549,
1078
+ "loss": 0.0374,
1079
+ "step": 138
1080
+ },
1081
+ {
1082
+ "epoch": 1.0527306967984935,
1083
+ "grad_norm": 0.4340198040008545,
1084
+ "learning_rate": 0.00014976486818210467,
1085
+ "loss": 0.077,
1086
+ "step": 139
1087
+ },
1088
+ {
1089
+ "epoch": 1.0602636534839924,
1090
+ "grad_norm": 0.54200679063797,
1091
+ "learning_rate": 0.00014905728202465595,
1092
+ "loss": 0.086,
1093
+ "step": 140
1094
+ },
1095
+ {
1096
+ "epoch": 1.0602636534839924,
1097
+ "eval_loss": 0.0658058300614357,
1098
+ "eval_runtime": 37.189,
1099
+ "eval_samples_per_second": 2.017,
1100
+ "eval_steps_per_second": 1.022,
1101
+ "step": 140
1102
+ },
1103
+ {
1104
+ "epoch": 1.0677966101694916,
1105
+ "grad_norm": 0.48267418146133423,
1106
+ "learning_rate": 0.00014834644629477644,
1107
+ "loss": 0.0502,
1108
+ "step": 141
1109
+ },
1110
+ {
1111
+ "epoch": 1.0753295668549905,
1112
+ "grad_norm": 0.5690019726753235,
1113
+ "learning_rate": 0.00014763240807848666,
1114
+ "loss": 0.0617,
1115
+ "step": 142
1116
+ },
1117
+ {
1118
+ "epoch": 1.0828625235404896,
1119
+ "grad_norm": 0.4100703299045563,
1120
+ "learning_rate": 0.0001469152146739411,
1121
+ "loss": 0.0562,
1122
+ "step": 143
1123
+ },
1124
+ {
1125
+ "epoch": 1.0903954802259888,
1126
+ "grad_norm": 0.49852266907691956,
1127
+ "learning_rate": 0.000146194913588295,
1128
+ "loss": 0.0751,
1129
+ "step": 144
1130
+ },
1131
+ {
1132
+ "epoch": 1.0979284369114877,
1133
+ "grad_norm": 0.4217350482940674,
1134
+ "learning_rate": 0.00014547155253455768,
1135
+ "loss": 0.0803,
1136
+ "step": 145
1137
+ },
1138
+ {
1139
+ "epoch": 1.1054613935969868,
1140
+ "grad_norm": 0.4313773810863495,
1141
+ "learning_rate": 0.00014474517942843175,
1142
+ "loss": 0.0447,
1143
+ "step": 146
1144
+ },
1145
+ {
1146
+ "epoch": 1.112994350282486,
1147
+ "grad_norm": 0.5009363889694214,
1148
+ "learning_rate": 0.0001440158423851392,
1149
+ "loss": 0.0415,
1150
+ "step": 147
1151
+ },
1152
+ {
1153
+ "epoch": 1.1205273069679849,
1154
+ "grad_norm": 0.8885876536369324,
1155
+ "learning_rate": 0.00014328358971623455,
1156
+ "loss": 0.0603,
1157
+ "step": 148
1158
+ },
1159
+ {
1160
+ "epoch": 1.128060263653484,
1161
+ "grad_norm": 1.5320378541946411,
1162
+ "learning_rate": 0.00014254846992640423,
1163
+ "loss": 0.0665,
1164
+ "step": 149
1165
+ },
1166
+ {
1167
+ "epoch": 1.1355932203389831,
1168
+ "grad_norm": 0.45557519793510437,
1169
+ "learning_rate": 0.00014181053171025392,
1170
+ "loss": 0.0855,
1171
+ "step": 150
1172
+ },
1173
+ {
1174
+ "epoch": 1.1355932203389831,
1175
+ "eval_loss": 0.06454955041408539,
1176
+ "eval_runtime": 37.1265,
1177
+ "eval_samples_per_second": 2.02,
1178
+ "eval_steps_per_second": 1.024,
1179
+ "step": 150
1180
+ },
1181
+ {
1182
+ "epoch": 1.143126177024482,
1183
+ "grad_norm": 0.2571490406990051,
1184
+ "learning_rate": 0.00014106982394908283,
1185
+ "loss": 0.0402,
1186
+ "step": 151
1187
+ },
1188
+ {
1189
+ "epoch": 1.1506591337099812,
1190
+ "grad_norm": 0.4380505084991455,
1191
+ "learning_rate": 0.00014032639570764593,
1192
+ "loss": 0.086,
1193
+ "step": 152
1194
+ },
1195
+ {
1196
+ "epoch": 1.1581920903954803,
1197
+ "grad_norm": 0.4073718190193176,
1198
+ "learning_rate": 0.00013958029623090378,
1199
+ "loss": 0.0491,
1200
+ "step": 153
1201
+ },
1202
+ {
1203
+ "epoch": 1.1657250470809792,
1204
+ "grad_norm": 0.40776053071022034,
1205
+ "learning_rate": 0.00013883157494076046,
1206
+ "loss": 0.072,
1207
+ "step": 154
1208
+ },
1209
+ {
1210
+ "epoch": 1.1732580037664784,
1211
+ "grad_norm": 0.31324324011802673,
1212
+ "learning_rate": 0.00013808028143279006,
1213
+ "loss": 0.0342,
1214
+ "step": 155
1215
+ },
1216
+ {
1217
+ "epoch": 1.1807909604519775,
1218
+ "grad_norm": 0.517558753490448,
1219
+ "learning_rate": 0.00013732646547295126,
1220
+ "loss": 0.0579,
1221
+ "step": 156
1222
+ },
1223
+ {
1224
+ "epoch": 1.1883239171374764,
1225
+ "grad_norm": 0.3593922257423401,
1226
+ "learning_rate": 0.00013657017699429092,
1227
+ "loss": 0.0749,
1228
+ "step": 157
1229
+ },
1230
+ {
1231
+ "epoch": 1.1958568738229756,
1232
+ "grad_norm": 0.26723143458366394,
1233
+ "learning_rate": 0.0001358114660936364,
1234
+ "loss": 0.0372,
1235
+ "step": 158
1236
+ },
1237
+ {
1238
+ "epoch": 1.2033898305084745,
1239
+ "grad_norm": 0.3371814489364624,
1240
+ "learning_rate": 0.00013505038302827723,
1241
+ "loss": 0.0486,
1242
+ "step": 159
1243
+ },
1244
+ {
1245
+ "epoch": 1.2109227871939736,
1246
+ "grad_norm": 0.3006036579608917,
1247
+ "learning_rate": 0.000134286978212636,
1248
+ "loss": 0.0882,
1249
+ "step": 160
1250
+ },
1251
+ {
1252
+ "epoch": 1.2109227871939736,
1253
+ "eval_loss": 0.06150702014565468,
1254
+ "eval_runtime": 37.13,
1255
+ "eval_samples_per_second": 2.02,
1256
+ "eval_steps_per_second": 1.023,
1257
+ "step": 160
1258
+ },
1259
+ {
1260
+ "epoch": 1.2184557438794728,
1261
+ "grad_norm": 0.3491075932979584,
1262
+ "learning_rate": 0.0001335213022149289,
1263
+ "loss": 0.0656,
1264
+ "step": 161
1265
+ },
1266
+ {
1267
+ "epoch": 1.2259887005649717,
1268
+ "grad_norm": 0.3559153378009796,
1269
+ "learning_rate": 0.00013275340575381598,
1270
+ "loss": 0.0601,
1271
+ "step": 162
1272
+ },
1273
+ {
1274
+ "epoch": 1.2335216572504708,
1275
+ "grad_norm": 0.41236844658851624,
1276
+ "learning_rate": 0.00013198333969504175,
1277
+ "loss": 0.0383,
1278
+ "step": 163
1279
+ },
1280
+ {
1281
+ "epoch": 1.24105461393597,
1282
+ "grad_norm": 0.3909653425216675,
1283
+ "learning_rate": 0.00013121115504806553,
1284
+ "loss": 0.1066,
1285
+ "step": 164
1286
+ },
1287
+ {
1288
+ "epoch": 1.2485875706214689,
1289
+ "grad_norm": 0.2600908875465393,
1290
+ "learning_rate": 0.0001304369029626828,
1291
+ "loss": 0.0361,
1292
+ "step": 165
1293
+ },
1294
+ {
1295
+ "epoch": 1.256120527306968,
1296
+ "grad_norm": 0.27978697419166565,
1297
+ "learning_rate": 0.00012966063472563685,
1298
+ "loss": 0.0301,
1299
+ "step": 166
1300
+ },
1301
+ {
1302
+ "epoch": 1.2636534839924671,
1303
+ "grad_norm": 0.3649253249168396,
1304
+ "learning_rate": 0.00012888240175722162,
1305
+ "loss": 0.0508,
1306
+ "step": 167
1307
+ },
1308
+ {
1309
+ "epoch": 1.271186440677966,
1310
+ "grad_norm": 0.34710630774497986,
1311
+ "learning_rate": 0.0001281022556078756,
1312
+ "loss": 0.0573,
1313
+ "step": 168
1314
+ },
1315
+ {
1316
+ "epoch": 1.2787193973634652,
1317
+ "grad_norm": 0.3954513669013977,
1318
+ "learning_rate": 0.0001273202479547671,
1319
+ "loss": 0.0708,
1320
+ "step": 169
1321
+ },
1322
+ {
1323
+ "epoch": 1.286252354048964,
1324
+ "grad_norm": 0.3171145021915436,
1325
+ "learning_rate": 0.00012653643059837107,
1326
+ "loss": 0.0835,
1327
+ "step": 170
1328
+ },
1329
+ {
1330
+ "epoch": 1.286252354048964,
1331
+ "eval_loss": 0.06033060699701309,
1332
+ "eval_runtime": 37.0879,
1333
+ "eval_samples_per_second": 2.022,
1334
+ "eval_steps_per_second": 1.025,
1335
+ "step": 170
1336
+ },
1337
+ {
1338
+ "epoch": 1.2937853107344632,
1339
+ "grad_norm": 0.3680741786956787,
1340
+ "learning_rate": 0.00012575085545903794,
1341
+ "loss": 0.077,
1342
+ "step": 171
1343
+ },
1344
+ {
1345
+ "epoch": 1.3013182674199624,
1346
+ "grad_norm": 0.3026699423789978,
1347
+ "learning_rate": 0.00012496357457355422,
1348
+ "loss": 0.0778,
1349
+ "step": 172
1350
+ },
1351
+ {
1352
+ "epoch": 1.3088512241054615,
1353
+ "grad_norm": 0.28971561789512634,
1354
+ "learning_rate": 0.00012417464009169583,
1355
+ "loss": 0.05,
1356
+ "step": 173
1357
+ },
1358
+ {
1359
+ "epoch": 1.3163841807909604,
1360
+ "grad_norm": 0.4369751513004303,
1361
+ "learning_rate": 0.0001233841042727734,
1362
+ "loss": 0.0755,
1363
+ "step": 174
1364
+ },
1365
+ {
1366
+ "epoch": 1.3239171374764596,
1367
+ "grad_norm": 0.2916516661643982,
1368
+ "learning_rate": 0.00012259201948217077,
1369
+ "loss": 0.0538,
1370
+ "step": 175
1371
+ },
1372
+ {
1373
+ "epoch": 1.3314500941619585,
1374
+ "grad_norm": 0.6259362697601318,
1375
+ "learning_rate": 0.00012179843818787624,
1376
+ "loss": 0.0878,
1377
+ "step": 176
1378
+ },
1379
+ {
1380
+ "epoch": 1.3389830508474576,
1381
+ "grad_norm": 0.2717919647693634,
1382
+ "learning_rate": 0.00012100341295700702,
1383
+ "loss": 0.0545,
1384
+ "step": 177
1385
+ },
1386
+ {
1387
+ "epoch": 1.3465160075329567,
1388
+ "grad_norm": 0.47408613562583923,
1389
+ "learning_rate": 0.00012020699645232721,
1390
+ "loss": 0.0969,
1391
+ "step": 178
1392
+ },
1393
+ {
1394
+ "epoch": 1.3540489642184557,
1395
+ "grad_norm": 0.2807871997356415,
1396
+ "learning_rate": 0.00011940924142875947,
1397
+ "loss": 0.0328,
1398
+ "step": 179
1399
+ },
1400
+ {
1401
+ "epoch": 1.3615819209039548,
1402
+ "grad_norm": 0.4400388300418854,
1403
+ "learning_rate": 0.0001186102007298904,
1404
+ "loss": 0.0585,
1405
+ "step": 180
1406
+ },
1407
+ {
1408
+ "epoch": 1.3615819209039548,
1409
+ "eval_loss": 0.05832603573799133,
1410
+ "eval_runtime": 37.1319,
1411
+ "eval_samples_per_second": 2.02,
1412
+ "eval_steps_per_second": 1.023,
1413
+ "step": 180
1414
+ },
1415
+ {
1416
+ "epoch": 1.369114877589454,
1417
+ "grad_norm": 0.38300102949142456,
1418
+ "learning_rate": 0.00011780992728447018,
1419
+ "loss": 0.0655,
1420
+ "step": 181
1421
+ },
1422
+ {
1423
+ "epoch": 1.3766478342749529,
1424
+ "grad_norm": 0.39059555530548096,
1425
+ "learning_rate": 0.00011700847410290667,
1426
+ "loss": 0.0617,
1427
+ "step": 182
1428
+ },
1429
+ {
1430
+ "epoch": 1.384180790960452,
1431
+ "grad_norm": 0.36025285720825195,
1432
+ "learning_rate": 0.00011620589427375375,
1433
+ "loss": 0.1054,
1434
+ "step": 183
1435
+ },
1436
+ {
1437
+ "epoch": 1.3917137476459511,
1438
+ "grad_norm": 0.24352721869945526,
1439
+ "learning_rate": 0.00011540224096019494,
1440
+ "loss": 0.0298,
1441
+ "step": 184
1442
+ },
1443
+ {
1444
+ "epoch": 1.39924670433145,
1445
+ "grad_norm": 0.2885790169239044,
1446
+ "learning_rate": 0.00011459756739652175,
1447
+ "loss": 0.0696,
1448
+ "step": 185
1449
+ },
1450
+ {
1451
+ "epoch": 1.4067796610169492,
1452
+ "grad_norm": 0.2957116961479187,
1453
+ "learning_rate": 0.0001137919268846074,
1454
+ "loss": 0.0449,
1455
+ "step": 186
1456
+ },
1457
+ {
1458
+ "epoch": 1.414312617702448,
1459
+ "grad_norm": 0.32375454902648926,
1460
+ "learning_rate": 0.0001129853727903762,
1461
+ "loss": 0.0535,
1462
+ "step": 187
1463
+ },
1464
+ {
1465
+ "epoch": 1.4218455743879472,
1466
+ "grad_norm": 0.35646215081214905,
1467
+ "learning_rate": 0.0001121779585402684,
1468
+ "loss": 0.037,
1469
+ "step": 188
1470
+ },
1471
+ {
1472
+ "epoch": 1.4293785310734464,
1473
+ "grad_norm": 0.25164303183555603,
1474
+ "learning_rate": 0.00011136973761770136,
1475
+ "loss": 0.036,
1476
+ "step": 189
1477
+ },
1478
+ {
1479
+ "epoch": 1.4369114877589455,
1480
+ "grad_norm": 0.24905888736248016,
1481
+ "learning_rate": 0.0001105607635595266,
1482
+ "loss": 0.0344,
1483
+ "step": 190
1484
+ },
1485
+ {
1486
+ "epoch": 1.4369114877589455,
1487
+ "eval_loss": 0.05805233120918274,
1488
+ "eval_runtime": 37.1095,
1489
+ "eval_samples_per_second": 2.021,
1490
+ "eval_steps_per_second": 1.024,
1491
+ "step": 190
1492
+ },
1493
+ {
1494
+ "epoch": 1.4444444444444444,
1495
+ "grad_norm": 0.3525996506214142,
1496
+ "learning_rate": 0.00010975108995248378,
1497
+ "loss": 0.0576,
1498
+ "step": 191
1499
+ },
1500
+ {
1501
+ "epoch": 1.4519774011299436,
1502
+ "grad_norm": 0.2925921082496643,
1503
+ "learning_rate": 0.00010894077042965083,
1504
+ "loss": 0.0645,
1505
+ "step": 192
1506
+ },
1507
+ {
1508
+ "epoch": 1.4595103578154425,
1509
+ "grad_norm": 0.47334054112434387,
1510
+ "learning_rate": 0.00010812985866689142,
1511
+ "loss": 0.1769,
1512
+ "step": 193
1513
+ },
1514
+ {
1515
+ "epoch": 1.4670433145009416,
1516
+ "grad_norm": 0.3007245659828186,
1517
+ "learning_rate": 0.00010731840837929946,
1518
+ "loss": 0.0565,
1519
+ "step": 194
1520
+ },
1521
+ {
1522
+ "epoch": 1.4745762711864407,
1523
+ "grad_norm": 0.3107605576515198,
1524
+ "learning_rate": 0.00010650647331764079,
1525
+ "loss": 0.0504,
1526
+ "step": 195
1527
+ },
1528
+ {
1529
+ "epoch": 1.4821092278719397,
1530
+ "grad_norm": 0.3428517282009125,
1531
+ "learning_rate": 0.000105694107264793,
1532
+ "loss": 0.0749,
1533
+ "step": 196
1534
+ },
1535
+ {
1536
+ "epoch": 1.4896421845574388,
1537
+ "grad_norm": 0.3695080280303955,
1538
+ "learning_rate": 0.00010488136403218265,
1539
+ "loss": 0.0604,
1540
+ "step": 197
1541
+ },
1542
+ {
1543
+ "epoch": 1.497175141242938,
1544
+ "grad_norm": 0.33667024970054626,
1545
+ "learning_rate": 0.00010406829745622085,
1546
+ "loss": 0.0739,
1547
+ "step": 198
1548
+ },
1549
+ {
1550
+ "epoch": 1.5047080979284368,
1551
+ "grad_norm": 0.4697053134441376,
1552
+ "learning_rate": 0.00010325496139473702,
1553
+ "loss": 0.0588,
1554
+ "step": 199
1555
+ },
1556
+ {
1557
+ "epoch": 1.512241054613936,
1558
+ "grad_norm": 0.36798229813575745,
1559
+ "learning_rate": 0.00010244140972341155,
1560
+ "loss": 0.0401,
1561
+ "step": 200
1562
+ },
1563
+ {
1564
+ "epoch": 1.512241054613936,
1565
+ "eval_loss": 0.05732857435941696,
1566
+ "eval_runtime": 37.158,
1567
+ "eval_samples_per_second": 2.018,
1568
+ "eval_steps_per_second": 1.023,
1569
+ "step": 200
1570
+ },
1571
+ {
1572
+ "epoch": 1.5197740112994351,
1573
+ "grad_norm": 0.29147714376449585,
1574
+ "learning_rate": 0.00010162769633220672,
1575
+ "loss": 0.0692,
1576
+ "step": 201
1577
+ },
1578
+ {
1579
+ "epoch": 1.527306967984934,
1580
+ "grad_norm": 0.2551415264606476,
1581
+ "learning_rate": 0.00010081387512179729,
1582
+ "loss": 0.0495,
1583
+ "step": 202
1584
+ },
1585
+ {
1586
+ "epoch": 1.5348399246704332,
1587
+ "grad_norm": 0.4365129768848419,
1588
+ "learning_rate": 0.0001,
1589
+ "loss": 0.0905,
1590
+ "step": 203
1591
+ },
1592
+ {
1593
+ "epoch": 1.542372881355932,
1594
+ "grad_norm": 0.256455659866333,
1595
+ "learning_rate": 9.918612487820273e-05,
1596
+ "loss": 0.0441,
1597
+ "step": 204
1598
+ },
1599
+ {
1600
+ "epoch": 1.5499058380414312,
1601
+ "grad_norm": 0.33844852447509766,
1602
+ "learning_rate": 9.83723036677933e-05,
1603
+ "loss": 0.0517,
1604
+ "step": 205
1605
+ },
1606
+ {
1607
+ "epoch": 1.5574387947269304,
1608
+ "grad_norm": 0.28650492429733276,
1609
+ "learning_rate": 9.755859027658848e-05,
1610
+ "loss": 0.0473,
1611
+ "step": 206
1612
+ },
1613
+ {
1614
+ "epoch": 1.5649717514124295,
1615
+ "grad_norm": 0.2910935580730438,
1616
+ "learning_rate": 9.674503860526297e-05,
1617
+ "loss": 0.0501,
1618
+ "step": 207
1619
+ },
1620
+ {
1621
+ "epoch": 1.5725047080979284,
1622
+ "grad_norm": 0.49296438694000244,
1623
+ "learning_rate": 9.593170254377916e-05,
1624
+ "loss": 0.0624,
1625
+ "step": 208
1626
+ },
1627
+ {
1628
+ "epoch": 1.5800376647834273,
1629
+ "grad_norm": 0.3825702965259552,
1630
+ "learning_rate": 9.511863596781734e-05,
1631
+ "loss": 0.0768,
1632
+ "step": 209
1633
+ },
1634
+ {
1635
+ "epoch": 1.5875706214689265,
1636
+ "grad_norm": 0.2868608832359314,
1637
+ "learning_rate": 9.430589273520703e-05,
1638
+ "loss": 0.054,
1639
+ "step": 210
1640
+ },
1641
+ {
1642
+ "epoch": 1.5875706214689265,
1643
+ "eval_loss": 0.05658142268657684,
1644
+ "eval_runtime": 37.107,
1645
+ "eval_samples_per_second": 2.021,
1646
+ "eval_steps_per_second": 1.024,
1647
+ "step": 210
1648
+ },
1649
+ {
1650
+ "epoch": 1.5951035781544256,
1651
+ "grad_norm": 0.22975075244903564,
1652
+ "learning_rate": 9.349352668235925e-05,
1653
+ "loss": 0.0375,
1654
+ "step": 211
1655
+ },
1656
+ {
1657
+ "epoch": 1.6026365348399247,
1658
+ "grad_norm": 0.29614976048469543,
1659
+ "learning_rate": 9.268159162070058e-05,
1660
+ "loss": 0.0768,
1661
+ "step": 212
1662
+ },
1663
+ {
1664
+ "epoch": 1.6101694915254239,
1665
+ "grad_norm": 0.2965467870235443,
1666
+ "learning_rate": 9.18701413331086e-05,
1667
+ "loss": 0.0444,
1668
+ "step": 213
1669
+ },
1670
+ {
1671
+ "epoch": 1.6177024482109228,
1672
+ "grad_norm": 0.3394235670566559,
1673
+ "learning_rate": 9.10592295703492e-05,
1674
+ "loss": 0.0549,
1675
+ "step": 214
1676
+ },
1677
+ {
1678
+ "epoch": 1.6252354048964217,
1679
+ "grad_norm": 0.3029539883136749,
1680
+ "learning_rate": 9.024891004751626e-05,
1681
+ "loss": 0.0451,
1682
+ "step": 215
1683
+ },
1684
+ {
1685
+ "epoch": 1.6327683615819208,
1686
+ "grad_norm": 0.28490352630615234,
1687
+ "learning_rate": 8.943923644047342e-05,
1688
+ "loss": 0.0272,
1689
+ "step": 216
1690
+ },
1691
+ {
1692
+ "epoch": 1.64030131826742,
1693
+ "grad_norm": 0.3418651819229126,
1694
+ "learning_rate": 8.863026238229868e-05,
1695
+ "loss": 0.1127,
1696
+ "step": 217
1697
+ },
1698
+ {
1699
+ "epoch": 1.6478342749529191,
1700
+ "grad_norm": 0.32494044303894043,
1701
+ "learning_rate": 8.782204145973162e-05,
1702
+ "loss": 0.0976,
1703
+ "step": 218
1704
+ },
1705
+ {
1706
+ "epoch": 1.655367231638418,
1707
+ "grad_norm": 0.5956616997718811,
1708
+ "learning_rate": 8.701462720962381e-05,
1709
+ "loss": 0.0509,
1710
+ "step": 219
1711
+ },
1712
+ {
1713
+ "epoch": 1.6629001883239172,
1714
+ "grad_norm": 0.35732752084732056,
1715
+ "learning_rate": 8.620807311539259e-05,
1716
+ "loss": 0.1967,
1717
+ "step": 220
1718
+ },
1719
+ {
1720
+ "epoch": 1.6629001883239172,
1721
+ "eval_loss": 0.055175162851810455,
1722
+ "eval_runtime": 37.1192,
1723
+ "eval_samples_per_second": 2.021,
1724
+ "eval_steps_per_second": 1.024,
1725
+ "step": 220
1726
+ },
1727
+ {
1728
+ "epoch": 1.670433145009416,
1729
+ "grad_norm": 0.4732244610786438,
1730
+ "learning_rate": 8.540243260347826e-05,
1731
+ "loss": 0.0693,
1732
+ "step": 221
1733
+ },
1734
+ {
1735
+ "epoch": 1.6779661016949152,
1736
+ "grad_norm": 0.27817562222480774,
1737
+ "learning_rate": 8.45977590398051e-05,
1738
+ "loss": 0.0616,
1739
+ "step": 222
1740
+ },
1741
+ {
1742
+ "epoch": 1.6854990583804144,
1743
+ "grad_norm": 0.28534531593322754,
1744
+ "learning_rate": 8.379410572624628e-05,
1745
+ "loss": 0.0392,
1746
+ "step": 223
1747
+ },
1748
+ {
1749
+ "epoch": 1.6930320150659135,
1750
+ "grad_norm": 0.20350764691829681,
1751
+ "learning_rate": 8.299152589709336e-05,
1752
+ "loss": 0.0348,
1753
+ "step": 224
1754
+ },
1755
+ {
1756
+ "epoch": 1.7005649717514124,
1757
+ "grad_norm": 0.22657251358032227,
1758
+ "learning_rate": 8.219007271552983e-05,
1759
+ "loss": 0.0393,
1760
+ "step": 225
1761
+ },
1762
+ {
1763
+ "epoch": 1.7080979284369113,
1764
+ "grad_norm": 0.3810754418373108,
1765
+ "learning_rate": 8.138979927010964e-05,
1766
+ "loss": 0.0661,
1767
+ "step": 226
1768
+ },
1769
+ {
1770
+ "epoch": 1.7156308851224105,
1771
+ "grad_norm": 0.23370787501335144,
1772
+ "learning_rate": 8.059075857124056e-05,
1773
+ "loss": 0.0519,
1774
+ "step": 227
1775
+ },
1776
+ {
1777
+ "epoch": 1.7231638418079096,
1778
+ "grad_norm": 0.2558518648147583,
1779
+ "learning_rate": 7.97930035476728e-05,
1780
+ "loss": 0.0419,
1781
+ "step": 228
1782
+ },
1783
+ {
1784
+ "epoch": 1.7306967984934087,
1785
+ "grad_norm": 0.24495276808738708,
1786
+ "learning_rate": 7.899658704299301e-05,
1787
+ "loss": 0.0768,
1788
+ "step": 229
1789
+ },
1790
+ {
1791
+ "epoch": 1.7382297551789079,
1792
+ "grad_norm": 0.31314679980278015,
1793
+ "learning_rate": 7.820156181212379e-05,
1794
+ "loss": 0.0987,
1795
+ "step": 230
1796
+ },
1797
+ {
1798
+ "epoch": 1.7382297551789079,
1799
+ "eval_loss": 0.055335018783807755,
1800
+ "eval_runtime": 37.1237,
1801
+ "eval_samples_per_second": 2.02,
1802
+ "eval_steps_per_second": 1.024,
1803
+ "step": 230
1804
+ },
1805
+ {
1806
+ "epoch": 1.7457627118644068,
1807
+ "grad_norm": 0.2738696038722992,
1808
+ "learning_rate": 7.740798051782923e-05,
1809
+ "loss": 0.1045,
1810
+ "step": 231
1811
+ },
1812
+ {
1813
+ "epoch": 1.7532956685499057,
1814
+ "grad_norm": 0.22097167372703552,
1815
+ "learning_rate": 7.66158957272266e-05,
1816
+ "loss": 0.0384,
1817
+ "step": 232
1818
+ },
1819
+ {
1820
+ "epoch": 1.7608286252354048,
1821
+ "grad_norm": 0.319528728723526,
1822
+ "learning_rate": 7.582535990830415e-05,
1823
+ "loss": 0.0513,
1824
+ "step": 233
1825
+ },
1826
+ {
1827
+ "epoch": 1.768361581920904,
1828
+ "grad_norm": 0.28677770495414734,
1829
+ "learning_rate": 7.503642542644581e-05,
1830
+ "loss": 0.0616,
1831
+ "step": 234
1832
+ },
1833
+ {
1834
+ "epoch": 1.7758945386064031,
1835
+ "grad_norm": 0.3826892673969269,
1836
+ "learning_rate": 7.424914454096211e-05,
1837
+ "loss": 0.0606,
1838
+ "step": 235
1839
+ },
1840
+ {
1841
+ "epoch": 1.783427495291902,
1842
+ "grad_norm": 0.3082129955291748,
1843
+ "learning_rate": 7.346356940162895e-05,
1844
+ "loss": 0.0566,
1845
+ "step": 236
1846
+ },
1847
+ {
1848
+ "epoch": 1.7909604519774012,
1849
+ "grad_norm": 0.25097185373306274,
1850
+ "learning_rate": 7.267975204523295e-05,
1851
+ "loss": 0.0431,
1852
+ "step": 237
1853
+ },
1854
+ {
1855
+ "epoch": 1.7984934086629,
1856
+ "grad_norm": 0.4633219838142395,
1857
+ "learning_rate": 7.189774439212442e-05,
1858
+ "loss": 0.0546,
1859
+ "step": 238
1860
+ },
1861
+ {
1862
+ "epoch": 1.8060263653483992,
1863
+ "grad_norm": 0.3444885313510895,
1864
+ "learning_rate": 7.11175982427784e-05,
1865
+ "loss": 0.1409,
1866
+ "step": 239
1867
+ },
1868
+ {
1869
+ "epoch": 1.8135593220338984,
1870
+ "grad_norm": 0.3237282633781433,
1871
+ "learning_rate": 7.033936527436318e-05,
1872
+ "loss": 0.0659,
1873
+ "step": 240
1874
+ },
1875
+ {
1876
+ "epoch": 1.8135593220338984,
1877
+ "eval_loss": 0.05429178848862648,
1878
+ "eval_runtime": 37.0949,
1879
+ "eval_samples_per_second": 2.022,
1880
+ "eval_steps_per_second": 1.024,
1881
+ "step": 240
1882
+ },
1883
+ {
1884
+ "epoch": 1.8210922787193975,
1885
+ "grad_norm": 0.2055477797985077,
1886
+ "learning_rate": 6.95630970373172e-05,
1887
+ "loss": 0.0378,
1888
+ "step": 241
1889
+ },
1890
+ {
1891
+ "epoch": 1.8286252354048964,
1892
+ "grad_norm": 0.27016931772232056,
1893
+ "learning_rate": 6.878884495193448e-05,
1894
+ "loss": 0.0507,
1895
+ "step": 242
1896
+ },
1897
+ {
1898
+ "epoch": 1.8361581920903953,
1899
+ "grad_norm": 0.2610904574394226,
1900
+ "learning_rate": 6.801666030495826e-05,
1901
+ "loss": 0.0389,
1902
+ "step": 243
1903
+ },
1904
+ {
1905
+ "epoch": 1.8436911487758945,
1906
+ "grad_norm": 0.2465640753507614,
1907
+ "learning_rate": 6.724659424618401e-05,
1908
+ "loss": 0.0843,
1909
+ "step": 244
1910
+ },
1911
+ {
1912
+ "epoch": 1.8512241054613936,
1913
+ "grad_norm": 0.24705246090888977,
1914
+ "learning_rate": 6.647869778507112e-05,
1915
+ "loss": 0.0493,
1916
+ "step": 245
1917
+ },
1918
+ {
1919
+ "epoch": 1.8587570621468927,
1920
+ "grad_norm": 0.7887628078460693,
1921
+ "learning_rate": 6.571302178736404e-05,
1922
+ "loss": 0.0511,
1923
+ "step": 246
1924
+ },
1925
+ {
1926
+ "epoch": 1.8662900188323919,
1927
+ "grad_norm": 0.3609479069709778,
1928
+ "learning_rate": 6.494961697172279e-05,
1929
+ "loss": 0.0292,
1930
+ "step": 247
1931
+ },
1932
+ {
1933
+ "epoch": 1.8738229755178908,
1934
+ "grad_norm": 0.23038731515407562,
1935
+ "learning_rate": 6.418853390636364e-05,
1936
+ "loss": 0.0361,
1937
+ "step": 248
1938
+ },
1939
+ {
1940
+ "epoch": 1.8813559322033897,
1941
+ "grad_norm": 0.3310745060443878,
1942
+ "learning_rate": 6.342982300570912e-05,
1943
+ "loss": 0.103,
1944
+ "step": 249
1945
+ },
1946
+ {
1947
+ "epoch": 1.8888888888888888,
1948
+ "grad_norm": 0.2912939786911011,
1949
+ "learning_rate": 6.267353452704876e-05,
1950
+ "loss": 0.0391,
1951
+ "step": 250
1952
+ },
1953
+ {
1954
+ "epoch": 1.8888888888888888,
1955
+ "eval_loss": 0.05423182249069214,
1956
+ "eval_runtime": 37.1201,
1957
+ "eval_samples_per_second": 2.02,
1958
+ "eval_steps_per_second": 1.024,
1959
+ "step": 250
1960
+ },
1961
+ {
1962
+ "epoch": 1.896421845574388,
1963
+ "grad_norm": 0.3608611822128296,
1964
+ "learning_rate": 6.191971856720997e-05,
1965
+ "loss": 0.0474,
1966
+ "step": 251
1967
+ },
1968
+ {
1969
+ "epoch": 1.9039548022598871,
1970
+ "grad_norm": 0.2649577856063843,
1971
+ "learning_rate": 6.116842505923955e-05,
1972
+ "loss": 0.0352,
1973
+ "step": 252
1974
+ },
1975
+ {
1976
+ "epoch": 1.911487758945386,
1977
+ "grad_norm": 1.2930629253387451,
1978
+ "learning_rate": 6.0419703769096235e-05,
1979
+ "loss": 0.0672,
1980
+ "step": 253
1981
+ },
1982
+ {
1983
+ "epoch": 1.9190207156308852,
1984
+ "grad_norm": 0.4104057252407074,
1985
+ "learning_rate": 5.967360429235407e-05,
1986
+ "loss": 0.07,
1987
+ "step": 254
1988
+ },
1989
+ {
1990
+ "epoch": 1.926553672316384,
1991
+ "grad_norm": 0.375598281621933,
1992
+ "learning_rate": 5.893017605091717e-05,
1993
+ "loss": 0.0904,
1994
+ "step": 255
1995
+ },
1996
+ {
1997
+ "epoch": 1.9340866290018832,
1998
+ "grad_norm": 0.20128563046455383,
1999
+ "learning_rate": 5.818946828974607e-05,
2000
+ "loss": 0.0288,
2001
+ "step": 256
2002
+ },
2003
+ {
2004
+ "epoch": 1.9416195856873824,
2005
+ "grad_norm": 0.37956199049949646,
2006
+ "learning_rate": 5.7451530073595785e-05,
2007
+ "loss": 0.0575,
2008
+ "step": 257
2009
+ },
2010
+ {
2011
+ "epoch": 1.9491525423728815,
2012
+ "grad_norm": 0.40059077739715576,
2013
+ "learning_rate": 5.671641028376546e-05,
2014
+ "loss": 0.0586,
2015
+ "step": 258
2016
+ },
2017
+ {
2018
+ "epoch": 1.9566854990583804,
2019
+ "grad_norm": 0.3582189381122589,
2020
+ "learning_rate": 5.5984157614860845e-05,
2021
+ "loss": 0.0682,
2022
+ "step": 259
2023
+ },
2024
+ {
2025
+ "epoch": 1.9642184557438793,
2026
+ "grad_norm": 0.3279203474521637,
2027
+ "learning_rate": 5.5254820571568325e-05,
2028
+ "loss": 0.0953,
2029
+ "step": 260
2030
+ },
2031
+ {
2032
+ "epoch": 1.9642184557438793,
2033
+ "eval_loss": 0.05431414395570755,
2034
+ "eval_runtime": 37.1186,
2035
+ "eval_samples_per_second": 2.021,
2036
+ "eval_steps_per_second": 1.024,
2037
+ "step": 260
2038
+ },
2039
+ {
2040
+ "epoch": 1.9717514124293785,
2041
+ "grad_norm": 0.27801838517189026,
2042
+ "learning_rate": 5.4528447465442334e-05,
2043
+ "loss": 0.0383,
2044
+ "step": 261
2045
+ },
2046
+ {
2047
+ "epoch": 1.9792843691148776,
2048
+ "grad_norm": 0.26640596985816956,
2049
+ "learning_rate": 5.3805086411704985e-05,
2050
+ "loss": 0.0624,
2051
+ "step": 262
2052
+ },
2053
+ {
2054
+ "epoch": 1.9868173258003767,
2055
+ "grad_norm": 0.3783319294452667,
2056
+ "learning_rate": 5.3084785326058925e-05,
2057
+ "loss": 0.0739,
2058
+ "step": 263
2059
+ },
2060
+ {
2061
+ "epoch": 1.9943502824858759,
2062
+ "grad_norm": 0.2667982280254364,
2063
+ "learning_rate": 5.236759192151336e-05,
2064
+ "loss": 0.04,
2065
+ "step": 264
2066
+ },
2067
+ {
2068
+ "epoch": 2.007532956685499,
2069
+ "grad_norm": 0.9414636492729187,
2070
+ "learning_rate": 5.165355370522358e-05,
2071
+ "loss": 0.1447,
2072
+ "step": 265
2073
+ },
2074
+ {
2075
+ "epoch": 2.0150659133709983,
2076
+ "grad_norm": 0.3006013035774231,
2077
+ "learning_rate": 5.0942717975344035e-05,
2078
+ "loss": 0.0482,
2079
+ "step": 266
2080
+ },
2081
+ {
2082
+ "epoch": 2.022598870056497,
2083
+ "grad_norm": 0.20572024583816528,
2084
+ "learning_rate": 5.02351318178953e-05,
2085
+ "loss": 0.0329,
2086
+ "step": 267
2087
+ },
2088
+ {
2089
+ "epoch": 2.030131826741996,
2090
+ "grad_norm": 0.2508287727832794,
2091
+ "learning_rate": 4.953084210364508e-05,
2092
+ "loss": 0.0352,
2093
+ "step": 268
2094
+ },
2095
+ {
2096
+ "epoch": 2.0376647834274952,
2097
+ "grad_norm": 0.2693123519420624,
2098
+ "learning_rate": 4.882989548500349e-05,
2099
+ "loss": 0.0408,
2100
+ "step": 269
2101
+ },
2102
+ {
2103
+ "epoch": 2.0451977401129944,
2104
+ "grad_norm": 0.3008269965648651,
2105
+ "learning_rate": 4.813233839293265e-05,
2106
+ "loss": 0.0362,
2107
+ "step": 270
2108
+ },
2109
+ {
2110
+ "epoch": 2.0451977401129944,
2111
+ "eval_loss": 0.05344419553875923,
2112
+ "eval_runtime": 37.1115,
2113
+ "eval_samples_per_second": 2.021,
2114
+ "eval_steps_per_second": 1.024,
2115
+ "step": 270
2116
+ },
2117
+ {
2118
+ "epoch": 2.0451977401129944,
2119
+ "step": 270,
2120
+ "total_flos": 2.136820048293888e+16,
2121
+ "train_loss": 0.2946822406862069,
2122
+ "train_runtime": 4276.0328,
2123
+ "train_samples_per_second": 0.745,
2124
+ "train_steps_per_second": 0.093
2125
+ }
2126
+ ],
2127
+ "logging_steps": 1,
2128
+ "max_steps": 396,
2129
+ "num_input_tokens_seen": 0,
2130
+ "num_train_epochs": 3,
2131
+ "save_steps": 500,
2132
+ "stateful_callbacks": {
2133
+ "EarlyStoppingCallback": {
2134
+ "args": {
2135
+ "early_stopping_patience": 5,
2136
+ "early_stopping_threshold": 0.001
2137
+ },
2138
+ "attributes": {
2139
+ "early_stopping_patience_counter": 5
2140
+ }
2141
+ },
2142
+ "TrainerControl": {
2143
+ "args": {
2144
+ "should_epoch_stop": false,
2145
+ "should_evaluate": false,
2146
+ "should_log": false,
2147
+ "should_save": true,
2148
+ "should_training_stop": true
2149
+ },
2150
+ "attributes": {}
2151
+ }
2152
+ },
2153
+ "total_flos": 2.136820048293888e+16,
2154
+ "train_batch_size": 2,
2155
+ "trial_name": null,
2156
+ "trial_params": null
2157
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a77af18f00c34a759a24dd16f355f28486619b3592f07abfbb0f7b9b13205220
3
+ size 5880
vocab.json ADDED
The diff for this file is too large to render. See raw diff