End of training

Browse files

Files changed (16) hide show

.gitattributes +1 -0
README.md +59 -0
adapter_config.json +29 -0
adapter_model.safetensors +3 -0
added_tokens.json +16 -0
all_results.json +11 -0
chat_template.json +3 -0
merges.txt +0 -0
preprocessor_config.json +29 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +145 -0
train_results.json +11 -0
trainer_state.json +2157 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,59 @@

+---
+base_model: unsloth/Qwen2-VL-2B-Instruct
+library_name: transformers
+model_name: Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned
+tags:
+- generated_from_trainer
+- unsloth
+- trl
+- sft
+licence: license
+---
+# Model Card for Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned
+This model is a fine-tuned version of [unsloth/Qwen2-VL-2B-Instruct](https://huggingface.co/unsloth/Qwen2-VL-2B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="tech4humans/Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/samuel-lima-tech4humans/ocr-finetuning-v2/runs/rqtgmysz)
+This model was trained with supervised fine-tuning (SFT).
+### Framework versions
+- TRL: 0.15.1
+- Transformers: 4.49.0.dev0
+- PyTorch: 2.6.0
+- Datasets: 3.3.2
+- Tokenizers: 0.21.0
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "unsloth/Qwen2-VL-2B-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 4,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn).*?(?:qkv|proj|fc1|fc2|q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn)\\.(?:(?:qkv|proj|fc1|fc2|q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)))",
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73b26ff3aeea093ee7f91cbc4a169a4f2e36af3fc848c9bf5ee510db88ac9717
+size 8335688

added_tokens.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

all_results.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "all_params": 2211058176,
+    "memory_footprint": 4426269008,
+    "total_flos": 2.136820048293888e+16,
+    "train_loss": 0.2946822406862069,
+    "train_runtime": 4276.0328,
+    "train_samples_per_second": 0.745,
+    "train_steps_per_second": 0.093,
+    "trainable_params": 2072576,
+    "trainable_params_percent": 0.09373683707180756
+}

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "patch_size": 14,
+  "processor_class": "Qwen2VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|vision_pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:948c45c29a91dd2e6ae77d6f5a324a3d408bcca6ad443365b2e79986f1422771
+size 11420540

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,145 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|vision_pad|>",
+  "padding_side": "right",
+  "processor_class": "Qwen2VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "all_params": 2211058176,
+    "memory_footprint": 4426269008,
+    "total_flos": 2.136820048293888e+16,
+    "train_loss": 0.2946822406862069,
+    "train_runtime": 4276.0328,
+    "train_samples_per_second": 0.745,
+    "train_steps_per_second": 0.093,
+    "trainable_params": 2072576,
+    "trainable_params_percent": 0.09373683707180756
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2157 @@

+{
+  "best_metric": 0.05344419553875923,
+  "best_model_checkpoint": "/content/train/Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned/checkpoint-270",
+  "epoch": 2.0451977401129944,
+  "eval_steps": 10,
+  "global_step": 270,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007532956685499058,
+      "grad_norm": 0.68587327003479,
+      "learning_rate": 2e-05,
+      "loss": 1.6782,
+      "step": 1
+    },
+    {
+      "epoch": 0.015065913370998116,
+      "grad_norm": 0.7287677526473999,
+      "learning_rate": 4e-05,
+      "loss": 1.7932,
+      "step": 2
+    },
+    {
+      "epoch": 0.022598870056497175,
+      "grad_norm": 0.7718816995620728,
+      "learning_rate": 6e-05,
+      "loss": 1.6757,
+      "step": 3
+    },
+    {
+      "epoch": 0.030131826741996232,
+      "grad_norm": 0.7753613591194153,
+      "learning_rate": 8e-05,
+      "loss": 1.7695,
+      "step": 4
+    },
+    {
+      "epoch": 0.03766478342749529,
+      "grad_norm": 1.235795259475708,
+      "learning_rate": 0.0001,
+      "loss": 1.9245,
+      "step": 5
+    },
+    {
+      "epoch": 0.04519774011299435,
+      "grad_norm": 0.569118082523346,
+      "learning_rate": 0.00012,
+      "loss": 1.451,
+      "step": 6
+    },
+    {
+      "epoch": 0.05273069679849341,
+      "grad_norm": 0.6638339757919312,
+      "learning_rate": 0.00014,
+      "loss": 1.6576,
+      "step": 7
+    },
+    {
+      "epoch": 0.060263653483992465,
+      "grad_norm": 0.6843408942222595,
+      "learning_rate": 0.00016,
+      "loss": 1.6339,
+      "step": 8
+    },
+    {
+      "epoch": 0.06779661016949153,
+      "grad_norm": 0.5259923934936523,
+      "learning_rate": 0.00018,
+      "loss": 1.5687,
+      "step": 9
+    },
+    {
+      "epoch": 0.07532956685499058,
+      "grad_norm": 0.655581533908844,
+      "learning_rate": 0.0002,
+      "loss": 1.6655,
+      "step": 10
+    },
+    {
+      "epoch": 0.07532956685499058,
+      "eval_loss": 1.5389481782913208,
+      "eval_runtime": 47.9411,
+      "eval_samples_per_second": 1.564,
+      "eval_steps_per_second": 0.793,
+      "step": 10
+    },
+    {
+      "epoch": 0.08286252354048965,
+      "grad_norm": 1.65678870677948,
+      "learning_rate": 0.0001999966879815833,
+      "loss": 1.7139,
+      "step": 11
+    },
+    {
+      "epoch": 0.0903954802259887,
+      "grad_norm": 0.4999409019947052,
+      "learning_rate": 0.0001999867521457224,
+      "loss": 1.4695,
+      "step": 12
+    },
+    {
+      "epoch": 0.09792843691148775,
+      "grad_norm": 0.6279143691062927,
+      "learning_rate": 0.0001999701931505708,
+      "loss": 1.42,
+      "step": 13
+    },
+    {
+      "epoch": 0.10546139359698682,
+      "grad_norm": 0.47573018074035645,
+      "learning_rate": 0.00019994701209300245,
+      "loss": 1.3877,
+      "step": 14
+    },
+    {
+      "epoch": 0.11299435028248588,
+      "grad_norm": 0.5120630264282227,
+      "learning_rate": 0.00019991721050853907,
+      "loss": 1.4014,
+      "step": 15
+    },
+    {
+      "epoch": 0.12052730696798493,
+      "grad_norm": 0.4641444981098175,
+      "learning_rate": 0.00019988079037124864,
+      "loss": 1.2456,
+      "step": 16
+    },
+    {
+      "epoch": 0.128060263653484,
+      "grad_norm": 0.5229088664054871,
+      "learning_rate": 0.00019983775409361447,
+      "loss": 1.3617,
+      "step": 17
+    },
+    {
+      "epoch": 0.13559322033898305,
+      "grad_norm": 0.6793835759162903,
+      "learning_rate": 0.00019978810452637543,
+      "loss": 1.4584,
+      "step": 18
+    },
+    {
+      "epoch": 0.1431261770244821,
+      "grad_norm": 0.530450165271759,
+      "learning_rate": 0.00019973184495833716,
+      "loss": 1.2412,
+      "step": 19
+    },
+    {
+      "epoch": 0.15065913370998116,
+      "grad_norm": 0.5695556998252869,
+      "learning_rate": 0.00019966897911615416,
+      "loss": 1.2738,
+      "step": 20
+    },
+    {
+      "epoch": 0.15065913370998116,
+      "eval_loss": 1.2209211587905884,
+      "eval_runtime": 37.1065,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 20
+    },
+    {
+      "epoch": 0.15819209039548024,
+      "grad_norm": 0.5769656896591187,
+      "learning_rate": 0.00019959951116408294,
+      "loss": 1.2751,
+      "step": 21
+    },
+    {
+      "epoch": 0.1657250470809793,
+      "grad_norm": 0.6152491569519043,
+      "learning_rate": 0.0001995234457037063,
+      "loss": 1.2145,
+      "step": 22
+    },
+    {
+      "epoch": 0.17325800376647835,
+      "grad_norm": 0.6578372120857239,
+      "learning_rate": 0.00019944078777362826,
+      "loss": 1.1845,
+      "step": 23
+    },
+    {
+      "epoch": 0.1807909604519774,
+      "grad_norm": 0.5556841492652893,
+      "learning_rate": 0.00019935154284914065,
+      "loss": 1.0926,
+      "step": 24
+    },
+    {
+      "epoch": 0.18832391713747645,
+      "grad_norm": 0.7302567958831787,
+      "learning_rate": 0.00019925571684186006,
+      "loss": 1.1249,
+      "step": 25
+    },
+    {
+      "epoch": 0.1958568738229755,
+      "grad_norm": 0.6284404993057251,
+      "learning_rate": 0.00019915331609933657,
+      "loss": 0.9404,
+      "step": 26
+    },
+    {
+      "epoch": 0.2033898305084746,
+      "grad_norm": 0.776946485042572,
+      "learning_rate": 0.00019904434740463306,
+      "loss": 1.044,
+      "step": 27
+    },
+    {
+      "epoch": 0.21092278719397364,
+      "grad_norm": 0.7142918705940247,
+      "learning_rate": 0.00019892881797587601,
+      "loss": 0.9695,
+      "step": 28
+    },
+    {
+      "epoch": 0.2184557438794727,
+      "grad_norm": 0.8852341175079346,
+      "learning_rate": 0.0001988067354657773,
+      "loss": 0.8989,
+      "step": 29
+    },
+    {
+      "epoch": 0.22598870056497175,
+      "grad_norm": 0.8206908106803894,
+      "learning_rate": 0.00019867810796112744,
+      "loss": 0.8154,
+      "step": 30
+    },
+    {
+      "epoch": 0.22598870056497175,
+      "eval_loss": 0.8218569755554199,
+      "eval_runtime": 37.094,
+      "eval_samples_per_second": 2.022,
+      "eval_steps_per_second": 1.024,
+      "step": 30
+    },
+    {
+      "epoch": 0.2335216572504708,
+      "grad_norm": 0.9797173142433167,
+      "learning_rate": 0.0001985429439822596,
+      "loss": 0.7847,
+      "step": 31
+    },
+    {
+      "epoch": 0.24105461393596986,
+      "grad_norm": 1.0684410333633423,
+      "learning_rate": 0.00019840125248248564,
+      "loss": 0.823,
+      "step": 32
+    },
+    {
+      "epoch": 0.24858757062146894,
+      "grad_norm": 1.009280800819397,
+      "learning_rate": 0.00019825304284750263,
+      "loss": 0.883,
+      "step": 33
+    },
+    {
+      "epoch": 0.256120527306968,
+      "grad_norm": 0.8165304660797119,
+      "learning_rate": 0.00019809832489477142,
+      "loss": 0.7012,
+      "step": 34
+    },
+    {
+      "epoch": 0.263653483992467,
+      "grad_norm": 0.794262707233429,
+      "learning_rate": 0.00019793710887286615,
+      "loss": 0.6529,
+      "step": 35
+    },
+    {
+      "epoch": 0.2711864406779661,
+      "grad_norm": 0.727675199508667,
+      "learning_rate": 0.0001977694054607955,
+      "loss": 0.6809,
+      "step": 36
+    },
+    {
+      "epoch": 0.2787193973634652,
+      "grad_norm": 0.7391637563705444,
+      "learning_rate": 0.00019759522576729533,
+      "loss": 0.6308,
+      "step": 37
+    },
+    {
+      "epoch": 0.2862523540489642,
+      "grad_norm": 0.7500622868537903,
+      "learning_rate": 0.00019741458133009258,
+      "loss": 0.5628,
+      "step": 38
+    },
+    {
+      "epoch": 0.2937853107344633,
+      "grad_norm": 0.962188184261322,
+      "learning_rate": 0.00019722748411514135,
+      "loss": 0.5857,
+      "step": 39
+    },
+    {
+      "epoch": 0.3013182674199623,
+      "grad_norm": 0.7300134301185608,
+      "learning_rate": 0.0001970339465158301,
+      "loss": 0.5631,
+      "step": 40
+    },
+    {
+      "epoch": 0.3013182674199623,
+      "eval_loss": 0.5341230630874634,
+      "eval_runtime": 37.0912,
+      "eval_samples_per_second": 2.022,
+      "eval_steps_per_second": 1.025,
+      "step": 40
+    },
+    {
+      "epoch": 0.3088512241054614,
+      "grad_norm": 0.5163620710372925,
+      "learning_rate": 0.00019683398135216066,
+      "loss": 0.528,
+      "step": 41
+    },
+    {
+      "epoch": 0.3163841807909605,
+      "grad_norm": 0.38112568855285645,
+      "learning_rate": 0.00019662760186989913,
+      "loss": 0.5219,
+      "step": 42
+    },
+    {
+      "epoch": 0.3239171374764595,
+      "grad_norm": 0.389498233795166,
+      "learning_rate": 0.00019641482173969848,
+      "loss": 0.5172,
+      "step": 43
+    },
+    {
+      "epoch": 0.3314500941619586,
+      "grad_norm": 0.5581079125404358,
+      "learning_rate": 0.00019619565505619288,
+      "loss": 0.5106,
+      "step": 44
+    },
+    {
+      "epoch": 0.3389830508474576,
+      "grad_norm": 0.38179025053977966,
+      "learning_rate": 0.00019597011633706415,
+      "loss": 0.5374,
+      "step": 45
+    },
+    {
+      "epoch": 0.3465160075329567,
+      "grad_norm": 0.40401706099510193,
+      "learning_rate": 0.00019573822052208013,
+      "loss": 0.4814,
+      "step": 46
+    },
+    {
+      "epoch": 0.3540489642184557,
+      "grad_norm": 0.3594434857368469,
+      "learning_rate": 0.00019549998297210502,
+      "loss": 0.4933,
+      "step": 47
+    },
+    {
+      "epoch": 0.3615819209039548,
+      "grad_norm": 0.34325098991394043,
+      "learning_rate": 0.00019525541946808188,
+      "loss": 0.4893,
+      "step": 48
+    },
+    {
+      "epoch": 0.3691148775894539,
+      "grad_norm": 0.3423003852367401,
+      "learning_rate": 0.00019500454620998732,
+      "loss": 0.4584,
+      "step": 49
+    },
+    {
+      "epoch": 0.3766478342749529,
+      "grad_norm": 0.3735145330429077,
+      "learning_rate": 0.00019474737981575832,
+      "loss": 0.4078,
+      "step": 50
+    },
+    {
+      "epoch": 0.3766478342749529,
+      "eval_loss": 0.4520163834095001,
+      "eval_runtime": 37.128,
+      "eval_samples_per_second": 2.02,
+      "eval_steps_per_second": 1.023,
+      "step": 50
+    },
+    {
+      "epoch": 0.384180790960452,
+      "grad_norm": 0.5755606293678284,
+      "learning_rate": 0.0001944839373201916,
+      "loss": 0.4468,
+      "step": 51
+    },
+    {
+      "epoch": 0.391713747645951,
+      "grad_norm": 0.3776421546936035,
+      "learning_rate": 0.00019421423617381508,
+      "loss": 0.5,
+      "step": 52
+    },
+    {
+      "epoch": 0.3992467043314501,
+      "grad_norm": 0.3351342976093292,
+      "learning_rate": 0.00019393829424173205,
+      "loss": 0.4443,
+      "step": 53
+    },
+    {
+      "epoch": 0.4067796610169492,
+      "grad_norm": 0.6081859469413757,
+      "learning_rate": 0.0001936561298024377,
+      "loss": 0.393,
+      "step": 54
+    },
+    {
+      "epoch": 0.4143126177024482,
+      "grad_norm": 0.40104803442955017,
+      "learning_rate": 0.00019336776154660841,
+      "loss": 0.4274,
+      "step": 55
+    },
+    {
+      "epoch": 0.4218455743879473,
+      "grad_norm": 0.44677644968032837,
+      "learning_rate": 0.00019307320857586376,
+      "loss": 0.4133,
+      "step": 56
+    },
+    {
+      "epoch": 0.4293785310734463,
+      "grad_norm": 0.36069607734680176,
+      "learning_rate": 0.00019277249040150092,
+      "loss": 0.3849,
+      "step": 57
+    },
+    {
+      "epoch": 0.4369114877589454,
+      "grad_norm": 1.2188339233398438,
+      "learning_rate": 0.00019246562694320255,
+      "loss": 0.4041,
+      "step": 58
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.4592845141887665,
+      "learning_rate": 0.00019215263852771718,
+      "loss": 0.4183,
+      "step": 59
+    },
+    {
+      "epoch": 0.4519774011299435,
+      "grad_norm": 1.6102626323699951,
+      "learning_rate": 0.00019183354588751271,
+      "loss": 0.4038,
+      "step": 60
+    },
+    {
+      "epoch": 0.4519774011299435,
+      "eval_loss": 0.38410142064094543,
+      "eval_runtime": 37.1151,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 60
+    },
+    {
+      "epoch": 0.4595103578154426,
+      "grad_norm": 0.4766036868095398,
+      "learning_rate": 0.00019150837015940322,
+      "loss": 0.4346,
+      "step": 61
+    },
+    {
+      "epoch": 0.4670433145009416,
+      "grad_norm": 0.4366019368171692,
+      "learning_rate": 0.00019117713288314863,
+      "loss": 0.3804,
+      "step": 62
+    },
+    {
+      "epoch": 0.4745762711864407,
+      "grad_norm": 0.792560338973999,
+      "learning_rate": 0.00019083985600002818,
+      "loss": 0.3856,
+      "step": 63
+    },
+    {
+      "epoch": 0.4821092278719397,
+      "grad_norm": 0.427386075258255,
+      "learning_rate": 0.0001904965618513868,
+      "loss": 0.3906,
+      "step": 64
+    },
+    {
+      "epoch": 0.4896421845574388,
+      "grad_norm": 0.6638129949569702,
+      "learning_rate": 0.00019014727317715537,
+      "loss": 0.4039,
+      "step": 65
+    },
+    {
+      "epoch": 0.4971751412429379,
+      "grad_norm": 0.46441903710365295,
+      "learning_rate": 0.00018979201311434434,
+      "loss": 0.422,
+      "step": 66
+    },
+    {
+      "epoch": 0.504708097928437,
+      "grad_norm": 0.4845605790615082,
+      "learning_rate": 0.00018943080519551108,
+      "loss": 0.358,
+      "step": 67
+    },
+    {
+      "epoch": 0.512241054613936,
+      "grad_norm": 0.7461917400360107,
+      "learning_rate": 0.00018906367334720124,
+      "loss": 0.3956,
+      "step": 68
+    },
+    {
+      "epoch": 0.519774011299435,
+      "grad_norm": 0.6427743434906006,
+      "learning_rate": 0.0001886906418883636,
+      "loss": 0.3141,
+      "step": 69
+    },
+    {
+      "epoch": 0.527306967984934,
+      "grad_norm": 0.6577739119529724,
+      "learning_rate": 0.00018831173552873946,
+      "loss": 0.3455,
+      "step": 70
+    },
+    {
+      "epoch": 0.527306967984934,
+      "eval_loss": 0.3052687644958496,
+      "eval_runtime": 37.0939,
+      "eval_samples_per_second": 2.022,
+      "eval_steps_per_second": 1.024,
+      "step": 70
+    },
+    {
+      "epoch": 0.5348399246704332,
+      "grad_norm": 0.7122016549110413,
+      "learning_rate": 0.00018792697936722563,
+      "loss": 0.3519,
+      "step": 71
+    },
+    {
+      "epoch": 0.5423728813559322,
+      "grad_norm": 0.5734298229217529,
+      "learning_rate": 0.00018753639889021196,
+      "loss": 0.3051,
+      "step": 72
+    },
+    {
+      "epoch": 0.5499058380414312,
+      "grad_norm": 0.8871021270751953,
+      "learning_rate": 0.00018714001996989312,
+      "loss": 0.2803,
+      "step": 73
+    },
+    {
+      "epoch": 0.5574387947269304,
+      "grad_norm": 0.7467854022979736,
+      "learning_rate": 0.00018673786886255476,
+      "loss": 0.2741,
+      "step": 74
+    },
+    {
+      "epoch": 0.5649717514124294,
+      "grad_norm": 0.549818754196167,
+      "learning_rate": 0.0001863299722068344,
+      "loss": 0.2779,
+      "step": 75
+    },
+    {
+      "epoch": 0.5725047080979284,
+      "grad_norm": 0.5196639895439148,
+      "learning_rate": 0.00018591635702195673,
+      "loss": 0.3036,
+      "step": 76
+    },
+    {
+      "epoch": 0.5800376647834274,
+      "grad_norm": 0.532467782497406,
+      "learning_rate": 0.00018549705070594396,
+      "loss": 0.2767,
+      "step": 77
+    },
+    {
+      "epoch": 0.5875706214689266,
+      "grad_norm": 0.8568252325057983,
+      "learning_rate": 0.00018507208103380092,
+      "loss": 0.2224,
+      "step": 78
+    },
+    {
+      "epoch": 0.5951035781544256,
+      "grad_norm": 0.557944118976593,
+      "learning_rate": 0.00018464147615567517,
+      "loss": 0.2269,
+      "step": 79
+    },
+    {
+      "epoch": 0.6026365348399246,
+      "grad_norm": 0.886238157749176,
+      "learning_rate": 0.0001842052645949925,
+      "loss": 0.2658,
+      "step": 80
+    },
+    {
+      "epoch": 0.6026365348399246,
+      "eval_loss": 0.22510449588298798,
+      "eval_runtime": 37.1134,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 80
+    },
+    {
+      "epoch": 0.6101694915254238,
+      "grad_norm": 0.5309122204780579,
+      "learning_rate": 0.00018376347524656734,
+      "loss": 0.2168,
+      "step": 81
+    },
+    {
+      "epoch": 0.6177024482109228,
+      "grad_norm": 0.5083054304122925,
+      "learning_rate": 0.00018331613737468887,
+      "loss": 0.2312,
+      "step": 82
+    },
+    {
+      "epoch": 0.6252354048964218,
+      "grad_norm": 9.135035514831543,
+      "learning_rate": 0.00018286328061118244,
+      "loss": 0.246,
+      "step": 83
+    },
+    {
+      "epoch": 0.632768361581921,
+      "grad_norm": 0.771587610244751,
+      "learning_rate": 0.00018240493495344694,
+      "loss": 0.2207,
+      "step": 84
+    },
+    {
+      "epoch": 0.64030131826742,
+      "grad_norm": 0.8555005788803101,
+      "learning_rate": 0.00018194113076246753,
+      "loss": 0.223,
+      "step": 85
+    },
+    {
+      "epoch": 0.647834274952919,
+      "grad_norm": 0.5555715560913086,
+      "learning_rate": 0.00018147189876080463,
+      "loss": 0.2114,
+      "step": 86
+    },
+    {
+      "epoch": 0.655367231638418,
+      "grad_norm": 0.6347367167472839,
+      "learning_rate": 0.00018099727003055894,
+      "loss": 0.2326,
+      "step": 87
+    },
+    {
+      "epoch": 0.6629001883239172,
+      "grad_norm": 0.7266764640808105,
+      "learning_rate": 0.00018051727601131227,
+      "loss": 0.257,
+      "step": 88
+    },
+    {
+      "epoch": 0.6704331450094162,
+      "grad_norm": 0.7240170240402222,
+      "learning_rate": 0.00018003194849804534,
+      "loss": 0.2001,
+      "step": 89
+    },
+    {
+      "epoch": 0.6779661016949152,
+      "grad_norm": 0.7595257759094238,
+      "learning_rate": 0.00017954131963903133,
+      "loss": 0.1747,
+      "step": 90
+    },
+    {
+      "epoch": 0.6779661016949152,
+      "eval_loss": 0.17160969972610474,
+      "eval_runtime": 37.1302,
+      "eval_samples_per_second": 2.02,
+      "eval_steps_per_second": 1.023,
+      "step": 90
+    },
+    {
+      "epoch": 0.6854990583804144,
+      "grad_norm": 0.7588114738464355,
+      "learning_rate": 0.00017904542193370663,
+      "loss": 0.1372,
+      "step": 91
+    },
+    {
+      "epoch": 0.6930320150659134,
+      "grad_norm": 0.7313429713249207,
+      "learning_rate": 0.0001785442882305179,
+      "loss": 0.2234,
+      "step": 92
+    },
+    {
+      "epoch": 0.7005649717514124,
+      "grad_norm": 0.8581743836402893,
+      "learning_rate": 0.0001780379517247462,
+      "loss": 0.1712,
+      "step": 93
+    },
+    {
+      "epoch": 0.7080979284369114,
+      "grad_norm": 1.0297523736953735,
+      "learning_rate": 0.0001775264459563081,
+      "loss": 0.1769,
+      "step": 94
+    },
+    {
+      "epoch": 0.7156308851224106,
+      "grad_norm": 0.5627338290214539,
+      "learning_rate": 0.00017700980480753423,
+      "loss": 0.1864,
+      "step": 95
+    },
+    {
+      "epoch": 0.7231638418079096,
+      "grad_norm": 1.0914690494537354,
+      "learning_rate": 0.0001764880625009245,
+      "loss": 0.1786,
+      "step": 96
+    },
+    {
+      "epoch": 0.7306967984934086,
+      "grad_norm": 0.6584937572479248,
+      "learning_rate": 0.00017596125359688154,
+      "loss": 0.131,
+      "step": 97
+    },
+    {
+      "epoch": 0.7382297551789078,
+      "grad_norm": 1.1257890462875366,
+      "learning_rate": 0.00017542941299142112,
+      "loss": 0.1678,
+      "step": 98
+    },
+    {
+      "epoch": 0.7457627118644068,
+      "grad_norm": 0.5444011688232422,
+      "learning_rate": 0.00017489257591386093,
+      "loss": 0.1562,
+      "step": 99
+    },
+    {
+      "epoch": 0.7532956685499058,
+      "grad_norm": 0.665874183177948,
+      "learning_rate": 0.00017435077792448664,
+      "loss": 0.189,
+      "step": 100
+    },
+    {
+      "epoch": 0.7532956685499058,
+      "eval_loss": 0.13106876611709595,
+      "eval_runtime": 37.0741,
+      "eval_samples_per_second": 2.023,
+      "eval_steps_per_second": 1.025,
+      "step": 100
+    },
+    {
+      "epoch": 0.7608286252354048,
+      "grad_norm": 0.6252754330635071,
+      "learning_rate": 0.0001738040549121967,
+      "loss": 0.104,
+      "step": 101
+    },
+    {
+      "epoch": 0.768361581920904,
+      "grad_norm": 0.6250944137573242,
+      "learning_rate": 0.00017325244309212475,
+      "loss": 0.1582,
+      "step": 102
+    },
+    {
+      "epoch": 0.775894538606403,
+      "grad_norm": 0.7759442329406738,
+      "learning_rate": 0.00017269597900324097,
+      "loss": 0.1888,
+      "step": 103
+    },
+    {
+      "epoch": 0.783427495291902,
+      "grad_norm": 0.5639198422431946,
+      "learning_rate": 0.00017213469950593156,
+      "loss": 0.1223,
+      "step": 104
+    },
+    {
+      "epoch": 0.7909604519774012,
+      "grad_norm": 0.5083601474761963,
+      "learning_rate": 0.00017156864177955719,
+      "loss": 0.0838,
+      "step": 105
+    },
+    {
+      "epoch": 0.7984934086629002,
+      "grad_norm": 0.5559635758399963,
+      "learning_rate": 0.0001709978433199901,
+      "loss": 0.0855,
+      "step": 106
+    },
+    {
+      "epoch": 0.8060263653483992,
+      "grad_norm": 0.6353676319122314,
+      "learning_rate": 0.00017042234193713056,
+      "loss": 0.1105,
+      "step": 107
+    },
+    {
+      "epoch": 0.8135593220338984,
+      "grad_norm": 0.7712072134017944,
+      "learning_rate": 0.0001698421757524021,
+      "loss": 0.1402,
+      "step": 108
+    },
+    {
+      "epoch": 0.8210922787193974,
+      "grad_norm": 0.7416761517524719,
+      "learning_rate": 0.00016925738319622654,
+      "loss": 0.0932,
+      "step": 109
+    },
+    {
+      "epoch": 0.8286252354048964,
+      "grad_norm": 0.7182126045227051,
+      "learning_rate": 0.00016866800300547813,
+      "loss": 0.131,
+      "step": 110
+    },
+    {
+      "epoch": 0.8286252354048964,
+      "eval_loss": 0.09729403257369995,
+      "eval_runtime": 37.1303,
+      "eval_samples_per_second": 2.02,
+      "eval_steps_per_second": 1.023,
+      "step": 110
+    },
+    {
+      "epoch": 0.8361581920903954,
+      "grad_norm": 0.6551967263221741,
+      "learning_rate": 0.00016807407422091784,
+      "loss": 0.1161,
+      "step": 111
+    },
+    {
+      "epoch": 0.8436911487758946,
+      "grad_norm": 0.6405838131904602,
+      "learning_rate": 0.0001674756361846071,
+      "loss": 0.1454,
+      "step": 112
+    },
+    {
+      "epoch": 0.8512241054613936,
+      "grad_norm": 0.4275994598865509,
+      "learning_rate": 0.00016687272853730192,
+      "loss": 0.0897,
+      "step": 113
+    },
+    {
+      "epoch": 0.8587570621468926,
+      "grad_norm": 0.6592651605606079,
+      "learning_rate": 0.00016626539121582685,
+      "loss": 0.0534,
+      "step": 114
+    },
+    {
+      "epoch": 0.8662900188323918,
+      "grad_norm": 0.6205569505691528,
+      "learning_rate": 0.0001656536644504298,
+      "loss": 0.1361,
+      "step": 115
+    },
+    {
+      "epoch": 0.8738229755178908,
+      "grad_norm": 0.5345686078071594,
+      "learning_rate": 0.0001650375887621171,
+      "loss": 0.0923,
+      "step": 116
+    },
+    {
+      "epoch": 0.8813559322033898,
+      "grad_norm": 1.0165270566940308,
+      "learning_rate": 0.00016441720495996912,
+      "loss": 0.0852,
+      "step": 117
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.48809266090393066,
+      "learning_rate": 0.00016379255413843754,
+      "loss": 0.0839,
+      "step": 118
+    },
+    {
+      "epoch": 0.896421845574388,
+      "grad_norm": 0.650384247303009,
+      "learning_rate": 0.0001631636776746228,
+      "loss": 0.102,
+      "step": 119
+    },
+    {
+      "epoch": 0.903954802259887,
+      "grad_norm": 0.6523996591567993,
+      "learning_rate": 0.00016253061722553355,
+      "loss": 0.0661,
+      "step": 120
+    },
+    {
+      "epoch": 0.903954802259887,
+      "eval_loss": 0.07857384532690048,
+      "eval_runtime": 37.0902,
+      "eval_samples_per_second": 2.022,
+      "eval_steps_per_second": 1.025,
+      "step": 120
+    },
+    {
+      "epoch": 0.911487758945386,
+      "grad_norm": 0.4382803738117218,
+      "learning_rate": 0.00016189341472532705,
+      "loss": 0.0582,
+      "step": 121
+    },
+    {
+      "epoch": 0.9190207156308852,
+      "grad_norm": 0.6267339587211609,
+      "learning_rate": 0.0001612521123825317,
+      "loss": 0.079,
+      "step": 122
+    },
+    {
+      "epoch": 0.9265536723163842,
+      "grad_norm": 0.700908899307251,
+      "learning_rate": 0.00016060675267725083,
+      "loss": 0.1022,
+      "step": 123
+    },
+    {
+      "epoch": 0.9340866290018832,
+      "grad_norm": 0.4881342351436615,
+      "learning_rate": 0.00015995737835834906,
+      "loss": 0.063,
+      "step": 124
+    },
+    {
+      "epoch": 0.9416195856873822,
+      "grad_norm": 0.4968627989292145,
+      "learning_rate": 0.00015930403244062043,
+      "loss": 0.0675,
+      "step": 125
+    },
+    {
+      "epoch": 0.9491525423728814,
+      "grad_norm": 0.4240921437740326,
+      "learning_rate": 0.00015864675820193922,
+      "loss": 0.0531,
+      "step": 126
+    },
+    {
+      "epoch": 0.9566854990583804,
+      "grad_norm": 0.3779008984565735,
+      "learning_rate": 0.00015798559918039307,
+      "loss": 0.0481,
+      "step": 127
+    },
+    {
+      "epoch": 0.9642184557438794,
+      "grad_norm": 0.471587210893631,
+      "learning_rate": 0.00015732059917139912,
+      "loss": 0.0698,
+      "step": 128
+    },
+    {
+      "epoch": 0.9717514124293786,
+      "grad_norm": 0.44407761096954346,
+      "learning_rate": 0.0001566518022248029,
+      "loss": 0.1005,
+      "step": 129
+    },
+    {
+      "epoch": 0.9792843691148776,
+      "grad_norm": 0.4785122275352478,
+      "learning_rate": 0.00015597925264196049,
+      "loss": 0.0784,
+      "step": 130
+    },
+    {
+      "epoch": 0.9792843691148776,
+      "eval_loss": 0.07214296609163284,
+      "eval_runtime": 37.1186,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 130
+    },
+    {
+      "epoch": 0.9868173258003766,
+      "grad_norm": 0.4364639222621918,
+      "learning_rate": 0.00015530299497280395,
+      "loss": 0.046,
+      "step": 131
+    },
+    {
+      "epoch": 0.9943502824858758,
+      "grad_norm": 0.6649202108383179,
+      "learning_rate": 0.0001546230740128904,
+      "loss": 0.0618,
+      "step": 132
+    },
+    {
+      "epoch": 1.0075329566854991,
+      "grad_norm": 0.662251353263855,
+      "learning_rate": 0.00015393953480043467,
+      "loss": 0.1003,
+      "step": 133
+    },
+    {
+      "epoch": 1.015065913370998,
+      "grad_norm": 0.36134594678878784,
+      "learning_rate": 0.000153252422613326,
+      "loss": 0.0403,
+      "step": 134
+    },
+    {
+      "epoch": 1.0225988700564972,
+      "grad_norm": 0.512718677520752,
+      "learning_rate": 0.00015256178296612868,
+      "loss": 0.0673,
+      "step": 135
+    },
+    {
+      "epoch": 1.0301318267419963,
+      "grad_norm": 0.4086618721485138,
+      "learning_rate": 0.0001518676616070674,
+      "loss": 0.0943,
+      "step": 136
+    },
+    {
+      "epoch": 1.0376647834274952,
+      "grad_norm": 0.3207029402256012,
+      "learning_rate": 0.00015117010451499654,
+      "loss": 0.0865,
+      "step": 137
+    },
+    {
+      "epoch": 1.0451977401129944,
+      "grad_norm": 0.2941770553588867,
+      "learning_rate": 0.0001504691578963549,
+      "loss": 0.0374,
+      "step": 138
+    },
+    {
+      "epoch": 1.0527306967984935,
+      "grad_norm": 0.4340198040008545,
+      "learning_rate": 0.00014976486818210467,
+      "loss": 0.077,
+      "step": 139
+    },
+    {
+      "epoch": 1.0602636534839924,
+      "grad_norm": 0.54200679063797,
+      "learning_rate": 0.00014905728202465595,
+      "loss": 0.086,
+      "step": 140
+    },
+    {
+      "epoch": 1.0602636534839924,
+      "eval_loss": 0.0658058300614357,
+      "eval_runtime": 37.189,
+      "eval_samples_per_second": 2.017,
+      "eval_steps_per_second": 1.022,
+      "step": 140
+    },
+    {
+      "epoch": 1.0677966101694916,
+      "grad_norm": 0.48267418146133423,
+      "learning_rate": 0.00014834644629477644,
+      "loss": 0.0502,
+      "step": 141
+    },
+    {
+      "epoch": 1.0753295668549905,
+      "grad_norm": 0.5690019726753235,
+      "learning_rate": 0.00014763240807848666,
+      "loss": 0.0617,
+      "step": 142
+    },
+    {
+      "epoch": 1.0828625235404896,
+      "grad_norm": 0.4100703299045563,
+      "learning_rate": 0.0001469152146739411,
+      "loss": 0.0562,
+      "step": 143
+    },
+    {
+      "epoch": 1.0903954802259888,
+      "grad_norm": 0.49852266907691956,
+      "learning_rate": 0.000146194913588295,
+      "loss": 0.0751,
+      "step": 144
+    },
+    {
+      "epoch": 1.0979284369114877,
+      "grad_norm": 0.4217350482940674,
+      "learning_rate": 0.00014547155253455768,
+      "loss": 0.0803,
+      "step": 145
+    },
+    {
+      "epoch": 1.1054613935969868,
+      "grad_norm": 0.4313773810863495,
+      "learning_rate": 0.00014474517942843175,
+      "loss": 0.0447,
+      "step": 146
+    },
+    {
+      "epoch": 1.112994350282486,
+      "grad_norm": 0.5009363889694214,
+      "learning_rate": 0.0001440158423851392,
+      "loss": 0.0415,
+      "step": 147
+    },
+    {
+      "epoch": 1.1205273069679849,
+      "grad_norm": 0.8885876536369324,
+      "learning_rate": 0.00014328358971623455,
+      "loss": 0.0603,
+      "step": 148
+    },
+    {
+      "epoch": 1.128060263653484,
+      "grad_norm": 1.5320378541946411,
+      "learning_rate": 0.00014254846992640423,
+      "loss": 0.0665,
+      "step": 149
+    },
+    {
+      "epoch": 1.1355932203389831,
+      "grad_norm": 0.45557519793510437,
+      "learning_rate": 0.00014181053171025392,
+      "loss": 0.0855,
+      "step": 150
+    },
+    {
+      "epoch": 1.1355932203389831,
+      "eval_loss": 0.06454955041408539,
+      "eval_runtime": 37.1265,
+      "eval_samples_per_second": 2.02,
+      "eval_steps_per_second": 1.024,
+      "step": 150
+    },
+    {
+      "epoch": 1.143126177024482,
+      "grad_norm": 0.2571490406990051,
+      "learning_rate": 0.00014106982394908283,
+      "loss": 0.0402,
+      "step": 151
+    },
+    {
+      "epoch": 1.1506591337099812,
+      "grad_norm": 0.4380505084991455,
+      "learning_rate": 0.00014032639570764593,
+      "loss": 0.086,
+      "step": 152
+    },
+    {
+      "epoch": 1.1581920903954803,
+      "grad_norm": 0.4073718190193176,
+      "learning_rate": 0.00013958029623090378,
+      "loss": 0.0491,
+      "step": 153
+    },
+    {
+      "epoch": 1.1657250470809792,
+      "grad_norm": 0.40776053071022034,
+      "learning_rate": 0.00013883157494076046,
+      "loss": 0.072,
+      "step": 154
+    },
+    {
+      "epoch": 1.1732580037664784,
+      "grad_norm": 0.31324324011802673,
+      "learning_rate": 0.00013808028143279006,
+      "loss": 0.0342,
+      "step": 155
+    },
+    {
+      "epoch": 1.1807909604519775,
+      "grad_norm": 0.517558753490448,
+      "learning_rate": 0.00013732646547295126,
+      "loss": 0.0579,
+      "step": 156
+    },
+    {
+      "epoch": 1.1883239171374764,
+      "grad_norm": 0.3593922257423401,
+      "learning_rate": 0.00013657017699429092,
+      "loss": 0.0749,
+      "step": 157
+    },
+    {
+      "epoch": 1.1958568738229756,
+      "grad_norm": 0.26723143458366394,
+      "learning_rate": 0.0001358114660936364,
+      "loss": 0.0372,
+      "step": 158
+    },
+    {
+      "epoch": 1.2033898305084745,
+      "grad_norm": 0.3371814489364624,
+      "learning_rate": 0.00013505038302827723,
+      "loss": 0.0486,
+      "step": 159
+    },
+    {
+      "epoch": 1.2109227871939736,
+      "grad_norm": 0.3006036579608917,
+      "learning_rate": 0.000134286978212636,
+      "loss": 0.0882,
+      "step": 160
+    },
+    {
+      "epoch": 1.2109227871939736,
+      "eval_loss": 0.06150702014565468,
+      "eval_runtime": 37.13,
+      "eval_samples_per_second": 2.02,
+      "eval_steps_per_second": 1.023,
+      "step": 160
+    },
+    {
+      "epoch": 1.2184557438794728,
+      "grad_norm": 0.3491075932979584,
+      "learning_rate": 0.0001335213022149289,
+      "loss": 0.0656,
+      "step": 161
+    },
+    {
+      "epoch": 1.2259887005649717,
+      "grad_norm": 0.3559153378009796,
+      "learning_rate": 0.00013275340575381598,
+      "loss": 0.0601,
+      "step": 162
+    },
+    {
+      "epoch": 1.2335216572504708,
+      "grad_norm": 0.41236844658851624,
+      "learning_rate": 0.00013198333969504175,
+      "loss": 0.0383,
+      "step": 163
+    },
+    {
+      "epoch": 1.24105461393597,
+      "grad_norm": 0.3909653425216675,
+      "learning_rate": 0.00013121115504806553,
+      "loss": 0.1066,
+      "step": 164
+    },
+    {
+      "epoch": 1.2485875706214689,
+      "grad_norm": 0.2600908875465393,
+      "learning_rate": 0.0001304369029626828,
+      "loss": 0.0361,
+      "step": 165
+    },
+    {
+      "epoch": 1.256120527306968,
+      "grad_norm": 0.27978697419166565,
+      "learning_rate": 0.00012966063472563685,
+      "loss": 0.0301,
+      "step": 166
+    },
+    {
+      "epoch": 1.2636534839924671,
+      "grad_norm": 0.3649253249168396,
+      "learning_rate": 0.00012888240175722162,
+      "loss": 0.0508,
+      "step": 167
+    },
+    {
+      "epoch": 1.271186440677966,
+      "grad_norm": 0.34710630774497986,
+      "learning_rate": 0.0001281022556078756,
+      "loss": 0.0573,
+      "step": 168
+    },
+    {
+      "epoch": 1.2787193973634652,
+      "grad_norm": 0.3954513669013977,
+      "learning_rate": 0.0001273202479547671,
+      "loss": 0.0708,
+      "step": 169
+    },
+    {
+      "epoch": 1.286252354048964,
+      "grad_norm": 0.3171145021915436,
+      "learning_rate": 0.00012653643059837107,
+      "loss": 0.0835,
+      "step": 170
+    },
+    {
+      "epoch": 1.286252354048964,
+      "eval_loss": 0.06033060699701309,
+      "eval_runtime": 37.0879,
+      "eval_samples_per_second": 2.022,
+      "eval_steps_per_second": 1.025,
+      "step": 170
+    },
+    {
+      "epoch": 1.2937853107344632,
+      "grad_norm": 0.3680741786956787,
+      "learning_rate": 0.00012575085545903794,
+      "loss": 0.077,
+      "step": 171
+    },
+    {
+      "epoch": 1.3013182674199624,
+      "grad_norm": 0.3026699423789978,
+      "learning_rate": 0.00012496357457355422,
+      "loss": 0.0778,
+      "step": 172
+    },
+    {
+      "epoch": 1.3088512241054615,
+      "grad_norm": 0.28971561789512634,
+      "learning_rate": 0.00012417464009169583,
+      "loss": 0.05,
+      "step": 173
+    },
+    {
+      "epoch": 1.3163841807909604,
+      "grad_norm": 0.4369751513004303,
+      "learning_rate": 0.0001233841042727734,
+      "loss": 0.0755,
+      "step": 174
+    },
+    {
+      "epoch": 1.3239171374764596,
+      "grad_norm": 0.2916516661643982,
+      "learning_rate": 0.00012259201948217077,
+      "loss": 0.0538,
+      "step": 175
+    },
+    {
+      "epoch": 1.3314500941619585,
+      "grad_norm": 0.6259362697601318,
+      "learning_rate": 0.00012179843818787624,
+      "loss": 0.0878,
+      "step": 176
+    },
+    {
+      "epoch": 1.3389830508474576,
+      "grad_norm": 0.2717919647693634,
+      "learning_rate": 0.00012100341295700702,
+      "loss": 0.0545,
+      "step": 177
+    },
+    {
+      "epoch": 1.3465160075329567,
+      "grad_norm": 0.47408613562583923,
+      "learning_rate": 0.00012020699645232721,
+      "loss": 0.0969,
+      "step": 178
+    },
+    {
+      "epoch": 1.3540489642184557,
+      "grad_norm": 0.2807871997356415,
+      "learning_rate": 0.00011940924142875947,
+      "loss": 0.0328,
+      "step": 179
+    },
+    {
+      "epoch": 1.3615819209039548,
+      "grad_norm": 0.4400388300418854,
+      "learning_rate": 0.0001186102007298904,
+      "loss": 0.0585,
+      "step": 180
+    },
+    {
+      "epoch": 1.3615819209039548,
+      "eval_loss": 0.05832603573799133,
+      "eval_runtime": 37.1319,
+      "eval_samples_per_second": 2.02,
+      "eval_steps_per_second": 1.023,
+      "step": 180
+    },
+    {
+      "epoch": 1.369114877589454,
+      "grad_norm": 0.38300102949142456,
+      "learning_rate": 0.00011780992728447018,
+      "loss": 0.0655,
+      "step": 181
+    },
+    {
+      "epoch": 1.3766478342749529,
+      "grad_norm": 0.39059555530548096,
+      "learning_rate": 0.00011700847410290667,
+      "loss": 0.0617,
+      "step": 182
+    },
+    {
+      "epoch": 1.384180790960452,
+      "grad_norm": 0.36025285720825195,
+      "learning_rate": 0.00011620589427375375,
+      "loss": 0.1054,
+      "step": 183
+    },
+    {
+      "epoch": 1.3917137476459511,
+      "grad_norm": 0.24352721869945526,
+      "learning_rate": 0.00011540224096019494,
+      "loss": 0.0298,
+      "step": 184
+    },
+    {
+      "epoch": 1.39924670433145,
+      "grad_norm": 0.2885790169239044,
+      "learning_rate": 0.00011459756739652175,
+      "loss": 0.0696,
+      "step": 185
+    },
+    {
+      "epoch": 1.4067796610169492,
+      "grad_norm": 0.2957116961479187,
+      "learning_rate": 0.0001137919268846074,
+      "loss": 0.0449,
+      "step": 186
+    },
+    {
+      "epoch": 1.414312617702448,
+      "grad_norm": 0.32375454902648926,
+      "learning_rate": 0.0001129853727903762,
+      "loss": 0.0535,
+      "step": 187
+    },
+    {
+      "epoch": 1.4218455743879472,
+      "grad_norm": 0.35646215081214905,
+      "learning_rate": 0.0001121779585402684,
+      "loss": 0.037,
+      "step": 188
+    },
+    {
+      "epoch": 1.4293785310734464,
+      "grad_norm": 0.25164303183555603,
+      "learning_rate": 0.00011136973761770136,
+      "loss": 0.036,
+      "step": 189
+    },
+    {
+      "epoch": 1.4369114877589455,
+      "grad_norm": 0.24905888736248016,
+      "learning_rate": 0.0001105607635595266,
+      "loss": 0.0344,
+      "step": 190
+    },
+    {
+      "epoch": 1.4369114877589455,
+      "eval_loss": 0.05805233120918274,
+      "eval_runtime": 37.1095,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 190
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 0.3525996506214142,
+      "learning_rate": 0.00010975108995248378,
+      "loss": 0.0576,
+      "step": 191
+    },
+    {
+      "epoch": 1.4519774011299436,
+      "grad_norm": 0.2925921082496643,
+      "learning_rate": 0.00010894077042965083,
+      "loss": 0.0645,
+      "step": 192
+    },
+    {
+      "epoch": 1.4595103578154425,
+      "grad_norm": 0.47334054112434387,
+      "learning_rate": 0.00010812985866689142,
+      "loss": 0.1769,
+      "step": 193
+    },
+    {
+      "epoch": 1.4670433145009416,
+      "grad_norm": 0.3007245659828186,
+      "learning_rate": 0.00010731840837929946,
+      "loss": 0.0565,
+      "step": 194
+    },
+    {
+      "epoch": 1.4745762711864407,
+      "grad_norm": 0.3107605576515198,
+      "learning_rate": 0.00010650647331764079,
+      "loss": 0.0504,
+      "step": 195
+    },
+    {
+      "epoch": 1.4821092278719397,
+      "grad_norm": 0.3428517282009125,
+      "learning_rate": 0.000105694107264793,
+      "loss": 0.0749,
+      "step": 196
+    },
+    {
+      "epoch": 1.4896421845574388,
+      "grad_norm": 0.3695080280303955,
+      "learning_rate": 0.00010488136403218265,
+      "loss": 0.0604,
+      "step": 197
+    },
+    {
+      "epoch": 1.497175141242938,
+      "grad_norm": 0.33667024970054626,
+      "learning_rate": 0.00010406829745622085,
+      "loss": 0.0739,
+      "step": 198
+    },
+    {
+      "epoch": 1.5047080979284368,
+      "grad_norm": 0.4697053134441376,
+      "learning_rate": 0.00010325496139473702,
+      "loss": 0.0588,
+      "step": 199
+    },
+    {
+      "epoch": 1.512241054613936,
+      "grad_norm": 0.36798229813575745,
+      "learning_rate": 0.00010244140972341155,
+      "loss": 0.0401,
+      "step": 200
+    },
+    {
+      "epoch": 1.512241054613936,
+      "eval_loss": 0.05732857435941696,
+      "eval_runtime": 37.158,
+      "eval_samples_per_second": 2.018,
+      "eval_steps_per_second": 1.023,
+      "step": 200
+    },
+    {
+      "epoch": 1.5197740112994351,
+      "grad_norm": 0.29147714376449585,
+      "learning_rate": 0.00010162769633220672,
+      "loss": 0.0692,
+      "step": 201
+    },
+    {
+      "epoch": 1.527306967984934,
+      "grad_norm": 0.2551415264606476,
+      "learning_rate": 0.00010081387512179729,
+      "loss": 0.0495,
+      "step": 202
+    },
+    {
+      "epoch": 1.5348399246704332,
+      "grad_norm": 0.4365129768848419,
+      "learning_rate": 0.0001,
+      "loss": 0.0905,
+      "step": 203
+    },
+    {
+      "epoch": 1.542372881355932,
+      "grad_norm": 0.256455659866333,
+      "learning_rate": 9.918612487820273e-05,
+      "loss": 0.0441,
+      "step": 204
+    },
+    {
+      "epoch": 1.5499058380414312,
+      "grad_norm": 0.33844852447509766,
+      "learning_rate": 9.83723036677933e-05,
+      "loss": 0.0517,
+      "step": 205
+    },
+    {
+      "epoch": 1.5574387947269304,
+      "grad_norm": 0.28650492429733276,
+      "learning_rate": 9.755859027658848e-05,
+      "loss": 0.0473,
+      "step": 206
+    },
+    {
+      "epoch": 1.5649717514124295,
+      "grad_norm": 0.2910935580730438,
+      "learning_rate": 9.674503860526297e-05,
+      "loss": 0.0501,
+      "step": 207
+    },
+    {
+      "epoch": 1.5725047080979284,
+      "grad_norm": 0.49296438694000244,
+      "learning_rate": 9.593170254377916e-05,
+      "loss": 0.0624,
+      "step": 208
+    },
+    {
+      "epoch": 1.5800376647834273,
+      "grad_norm": 0.3825702965259552,
+      "learning_rate": 9.511863596781734e-05,
+      "loss": 0.0768,
+      "step": 209
+    },
+    {
+      "epoch": 1.5875706214689265,
+      "grad_norm": 0.2868608832359314,
+      "learning_rate": 9.430589273520703e-05,
+      "loss": 0.054,
+      "step": 210
+    },
+    {
+      "epoch": 1.5875706214689265,
+      "eval_loss": 0.05658142268657684,
+      "eval_runtime": 37.107,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 210
+    },
+    {
+      "epoch": 1.5951035781544256,
+      "grad_norm": 0.22975075244903564,
+      "learning_rate": 9.349352668235925e-05,
+      "loss": 0.0375,
+      "step": 211
+    },
+    {
+      "epoch": 1.6026365348399247,
+      "grad_norm": 0.29614976048469543,
+      "learning_rate": 9.268159162070058e-05,
+      "loss": 0.0768,
+      "step": 212
+    },
+    {
+      "epoch": 1.6101694915254239,
+      "grad_norm": 0.2965467870235443,
+      "learning_rate": 9.18701413331086e-05,
+      "loss": 0.0444,
+      "step": 213
+    },
+    {
+      "epoch": 1.6177024482109228,
+      "grad_norm": 0.3394235670566559,
+      "learning_rate": 9.10592295703492e-05,
+      "loss": 0.0549,
+      "step": 214
+    },
+    {
+      "epoch": 1.6252354048964217,
+      "grad_norm": 0.3029539883136749,
+      "learning_rate": 9.024891004751626e-05,
+      "loss": 0.0451,
+      "step": 215
+    },
+    {
+      "epoch": 1.6327683615819208,
+      "grad_norm": 0.28490352630615234,
+      "learning_rate": 8.943923644047342e-05,
+      "loss": 0.0272,
+      "step": 216
+    },
+    {
+      "epoch": 1.64030131826742,
+      "grad_norm": 0.3418651819229126,
+      "learning_rate": 8.863026238229868e-05,
+      "loss": 0.1127,
+      "step": 217
+    },
+    {
+      "epoch": 1.6478342749529191,
+      "grad_norm": 0.32494044303894043,
+      "learning_rate": 8.782204145973162e-05,
+      "loss": 0.0976,
+      "step": 218
+    },
+    {
+      "epoch": 1.655367231638418,
+      "grad_norm": 0.5956616997718811,
+      "learning_rate": 8.701462720962381e-05,
+      "loss": 0.0509,
+      "step": 219
+    },
+    {
+      "epoch": 1.6629001883239172,
+      "grad_norm": 0.35732752084732056,
+      "learning_rate": 8.620807311539259e-05,
+      "loss": 0.1967,
+      "step": 220
+    },
+    {
+      "epoch": 1.6629001883239172,
+      "eval_loss": 0.055175162851810455,
+      "eval_runtime": 37.1192,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 220
+    },
+    {
+      "epoch": 1.670433145009416,
+      "grad_norm": 0.4732244610786438,
+      "learning_rate": 8.540243260347826e-05,
+      "loss": 0.0693,
+      "step": 221
+    },
+    {
+      "epoch": 1.6779661016949152,
+      "grad_norm": 0.27817562222480774,
+      "learning_rate": 8.45977590398051e-05,
+      "loss": 0.0616,
+      "step": 222
+    },
+    {
+      "epoch": 1.6854990583804144,
+      "grad_norm": 0.28534531593322754,
+      "learning_rate": 8.379410572624628e-05,
+      "loss": 0.0392,
+      "step": 223
+    },
+    {
+      "epoch": 1.6930320150659135,
+      "grad_norm": 0.20350764691829681,
+      "learning_rate": 8.299152589709336e-05,
+      "loss": 0.0348,
+      "step": 224
+    },
+    {
+      "epoch": 1.7005649717514124,
+      "grad_norm": 0.22657251358032227,
+      "learning_rate": 8.219007271552983e-05,
+      "loss": 0.0393,
+      "step": 225
+    },
+    {
+      "epoch": 1.7080979284369113,
+      "grad_norm": 0.3810754418373108,
+      "learning_rate": 8.138979927010964e-05,
+      "loss": 0.0661,
+      "step": 226
+    },
+    {
+      "epoch": 1.7156308851224105,
+      "grad_norm": 0.23370787501335144,
+      "learning_rate": 8.059075857124056e-05,
+      "loss": 0.0519,
+      "step": 227
+    },
+    {
+      "epoch": 1.7231638418079096,
+      "grad_norm": 0.2558518648147583,
+      "learning_rate": 7.97930035476728e-05,
+      "loss": 0.0419,
+      "step": 228
+    },
+    {
+      "epoch": 1.7306967984934087,
+      "grad_norm": 0.24495276808738708,
+      "learning_rate": 7.899658704299301e-05,
+      "loss": 0.0768,
+      "step": 229
+    },
+    {
+      "epoch": 1.7382297551789079,
+      "grad_norm": 0.31314679980278015,
+      "learning_rate": 7.820156181212379e-05,
+      "loss": 0.0987,
+      "step": 230
+    },
+    {
+      "epoch": 1.7382297551789079,
+      "eval_loss": 0.055335018783807755,
+      "eval_runtime": 37.1237,
+      "eval_samples_per_second": 2.02,
+      "eval_steps_per_second": 1.024,
+      "step": 230
+    },
+    {
+      "epoch": 1.7457627118644068,
+      "grad_norm": 0.2738696038722992,
+      "learning_rate": 7.740798051782923e-05,
+      "loss": 0.1045,
+      "step": 231
+    },
+    {
+      "epoch": 1.7532956685499057,
+      "grad_norm": 0.22097167372703552,
+      "learning_rate": 7.66158957272266e-05,
+      "loss": 0.0384,
+      "step": 232
+    },
+    {
+      "epoch": 1.7608286252354048,
+      "grad_norm": 0.319528728723526,
+      "learning_rate": 7.582535990830415e-05,
+      "loss": 0.0513,
+      "step": 233
+    },
+    {
+      "epoch": 1.768361581920904,
+      "grad_norm": 0.28677770495414734,
+      "learning_rate": 7.503642542644581e-05,
+      "loss": 0.0616,
+      "step": 234
+    },
+    {
+      "epoch": 1.7758945386064031,
+      "grad_norm": 0.3826892673969269,
+      "learning_rate": 7.424914454096211e-05,
+      "loss": 0.0606,
+      "step": 235
+    },
+    {
+      "epoch": 1.783427495291902,
+      "grad_norm": 0.3082129955291748,
+      "learning_rate": 7.346356940162895e-05,
+      "loss": 0.0566,
+      "step": 236
+    },
+    {
+      "epoch": 1.7909604519774012,
+      "grad_norm": 0.25097185373306274,
+      "learning_rate": 7.267975204523295e-05,
+      "loss": 0.0431,
+      "step": 237
+    },
+    {
+      "epoch": 1.7984934086629,
+      "grad_norm": 0.4633219838142395,
+      "learning_rate": 7.189774439212442e-05,
+      "loss": 0.0546,
+      "step": 238
+    },
+    {
+      "epoch": 1.8060263653483992,
+      "grad_norm": 0.3444885313510895,
+      "learning_rate": 7.11175982427784e-05,
+      "loss": 0.1409,
+      "step": 239
+    },
+    {
+      "epoch": 1.8135593220338984,
+      "grad_norm": 0.3237282633781433,
+      "learning_rate": 7.033936527436318e-05,
+      "loss": 0.0659,
+      "step": 240
+    },
+    {
+      "epoch": 1.8135593220338984,
+      "eval_loss": 0.05429178848862648,
+      "eval_runtime": 37.0949,
+      "eval_samples_per_second": 2.022,
+      "eval_steps_per_second": 1.024,
+      "step": 240
+    },
+    {
+      "epoch": 1.8210922787193975,
+      "grad_norm": 0.2055477797985077,
+      "learning_rate": 6.95630970373172e-05,
+      "loss": 0.0378,
+      "step": 241
+    },
+    {
+      "epoch": 1.8286252354048964,
+      "grad_norm": 0.27016931772232056,
+      "learning_rate": 6.878884495193448e-05,
+      "loss": 0.0507,
+      "step": 242
+    },
+    {
+      "epoch": 1.8361581920903953,
+      "grad_norm": 0.2610904574394226,
+      "learning_rate": 6.801666030495826e-05,
+      "loss": 0.0389,
+      "step": 243
+    },
+    {
+      "epoch": 1.8436911487758945,
+      "grad_norm": 0.2465640753507614,
+      "learning_rate": 6.724659424618401e-05,
+      "loss": 0.0843,
+      "step": 244
+    },
+    {
+      "epoch": 1.8512241054613936,
+      "grad_norm": 0.24705246090888977,
+      "learning_rate": 6.647869778507112e-05,
+      "loss": 0.0493,
+      "step": 245
+    },
+    {
+      "epoch": 1.8587570621468927,
+      "grad_norm": 0.7887628078460693,
+      "learning_rate": 6.571302178736404e-05,
+      "loss": 0.0511,
+      "step": 246
+    },
+    {
+      "epoch": 1.8662900188323919,
+      "grad_norm": 0.3609479069709778,
+      "learning_rate": 6.494961697172279e-05,
+      "loss": 0.0292,
+      "step": 247
+    },
+    {
+      "epoch": 1.8738229755178908,
+      "grad_norm": 0.23038731515407562,
+      "learning_rate": 6.418853390636364e-05,
+      "loss": 0.0361,
+      "step": 248
+    },
+    {
+      "epoch": 1.8813559322033897,
+      "grad_norm": 0.3310745060443878,
+      "learning_rate": 6.342982300570912e-05,
+      "loss": 0.103,
+      "step": 249
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "grad_norm": 0.2912939786911011,
+      "learning_rate": 6.267353452704876e-05,
+      "loss": 0.0391,
+      "step": 250
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "eval_loss": 0.05423182249069214,
+      "eval_runtime": 37.1201,
+      "eval_samples_per_second": 2.02,
+      "eval_steps_per_second": 1.024,
+      "step": 250
+    },
+    {
+      "epoch": 1.896421845574388,
+      "grad_norm": 0.3608611822128296,
+      "learning_rate": 6.191971856720997e-05,
+      "loss": 0.0474,
+      "step": 251
+    },
+    {
+      "epoch": 1.9039548022598871,
+      "grad_norm": 0.2649577856063843,
+      "learning_rate": 6.116842505923955e-05,
+      "loss": 0.0352,
+      "step": 252
+    },
+    {
+      "epoch": 1.911487758945386,
+      "grad_norm": 1.2930629253387451,
+      "learning_rate": 6.0419703769096235e-05,
+      "loss": 0.0672,
+      "step": 253
+    },
+    {
+      "epoch": 1.9190207156308852,
+      "grad_norm": 0.4104057252407074,
+      "learning_rate": 5.967360429235407e-05,
+      "loss": 0.07,
+      "step": 254
+    },
+    {
+      "epoch": 1.926553672316384,
+      "grad_norm": 0.375598281621933,
+      "learning_rate": 5.893017605091717e-05,
+      "loss": 0.0904,
+      "step": 255
+    },
+    {
+      "epoch": 1.9340866290018832,
+      "grad_norm": 0.20128563046455383,
+      "learning_rate": 5.818946828974607e-05,
+      "loss": 0.0288,
+      "step": 256
+    },
+    {
+      "epoch": 1.9416195856873824,
+      "grad_norm": 0.37956199049949646,
+      "learning_rate": 5.7451530073595785e-05,
+      "loss": 0.0575,
+      "step": 257
+    },
+    {
+      "epoch": 1.9491525423728815,
+      "grad_norm": 0.40059077739715576,
+      "learning_rate": 5.671641028376546e-05,
+      "loss": 0.0586,
+      "step": 258
+    },
+    {
+      "epoch": 1.9566854990583804,
+      "grad_norm": 0.3582189381122589,
+      "learning_rate": 5.5984157614860845e-05,
+      "loss": 0.0682,
+      "step": 259
+    },
+    {
+      "epoch": 1.9642184557438793,
+      "grad_norm": 0.3279203474521637,
+      "learning_rate": 5.5254820571568325e-05,
+      "loss": 0.0953,
+      "step": 260
+    },
+    {
+      "epoch": 1.9642184557438793,
+      "eval_loss": 0.05431414395570755,
+      "eval_runtime": 37.1186,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 260
+    },
+    {
+      "epoch": 1.9717514124293785,
+      "grad_norm": 0.27801838517189026,
+      "learning_rate": 5.4528447465442334e-05,
+      "loss": 0.0383,
+      "step": 261
+    },
+    {
+      "epoch": 1.9792843691148776,
+      "grad_norm": 0.26640596985816956,
+      "learning_rate": 5.3805086411704985e-05,
+      "loss": 0.0624,
+      "step": 262
+    },
+    {
+      "epoch": 1.9868173258003767,
+      "grad_norm": 0.3783319294452667,
+      "learning_rate": 5.3084785326058925e-05,
+      "loss": 0.0739,
+      "step": 263
+    },
+    {
+      "epoch": 1.9943502824858759,
+      "grad_norm": 0.2667982280254364,
+      "learning_rate": 5.236759192151336e-05,
+      "loss": 0.04,
+      "step": 264
+    },
+    {
+      "epoch": 2.007532956685499,
+      "grad_norm": 0.9414636492729187,
+      "learning_rate": 5.165355370522358e-05,
+      "loss": 0.1447,
+      "step": 265
+    },
+    {
+      "epoch": 2.0150659133709983,
+      "grad_norm": 0.3006013035774231,
+      "learning_rate": 5.0942717975344035e-05,
+      "loss": 0.0482,
+      "step": 266
+    },
+    {
+      "epoch": 2.022598870056497,
+      "grad_norm": 0.20572024583816528,
+      "learning_rate": 5.02351318178953e-05,
+      "loss": 0.0329,
+      "step": 267
+    },
+    {
+      "epoch": 2.030131826741996,
+      "grad_norm": 0.2508287727832794,
+      "learning_rate": 4.953084210364508e-05,
+      "loss": 0.0352,
+      "step": 268
+    },
+    {
+      "epoch": 2.0376647834274952,
+      "grad_norm": 0.2693123519420624,
+      "learning_rate": 4.882989548500349e-05,
+      "loss": 0.0408,
+      "step": 269
+    },
+    {
+      "epoch": 2.0451977401129944,
+      "grad_norm": 0.3008269965648651,
+      "learning_rate": 4.813233839293265e-05,
+      "loss": 0.0362,
+      "step": 270
+    },
+    {
+      "epoch": 2.0451977401129944,
+      "eval_loss": 0.05344419553875923,
+      "eval_runtime": 37.1115,
+      "eval_samples_per_second": 2.021,
+      "eval_steps_per_second": 1.024,
+      "step": 270
+    },
+    {
+      "epoch": 2.0451977401129944,
+      "step": 270,
+      "total_flos": 2.136820048293888e+16,
+      "train_loss": 0.2946822406862069,
+      "train_runtime": 4276.0328,
+      "train_samples_per_second": 0.745,
+      "train_steps_per_second": 0.093
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 396,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 5,
+        "early_stopping_threshold": 0.001
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 5
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.136820048293888e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a77af18f00c34a759a24dd16f355f28486619b3592f07abfbb0f7b9b13205220
+size 5880

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff