Upload 14 files (#1)

Browse files

- Upload 14 files (7f8f19cbedb82d5682a41d9c0e7380a52ef0871e)

Files changed (15) hide show

.gitattributes +1 -0
Modelfile +23 -0
README.md +93 -3
adapter_config.json +38 -0
adapter_model.safetensors +3 -0
codellama-7b-qml.gguf +3 -0
optimizer.pt +3 -0
rest.safetensors +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer_config.json +84 -0
trainer_state.json +3114 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+codellama-7b-qml.gguf filter=lfs diff=lfs merge=lfs -text

Modelfile ADDED Viewed

	@@ -0,0 +1,23 @@

+FROM codellama:7b-code
+# Base model name and adapter
+ADAPTER ./codellama-7b-qml.gguf
+# Parameters optimized for code generation
+PARAMETER temperature 0.2
+PARAMETER num_predict 500
+PARAMETER top_p 0.9
+PARAMETER stop "<SUF>"
+PARAMETER stop "<PRE>"
+PARAMETER stop "</PRE>"
+PARAMETER stop "</SUF>"
+PARAMETER stop "< EOT >"
+PARAMETER stop "\\end"
+PARAMETER stop "<MID>"
+PARAMETER stop "</MID>"
+PARAMETER stop "##"
+# Template format for code interactions
+TEMPLATE """{{if .Prompt}}{{ .Prompt }}{{end}}{{if .Response}}{{ .Response }}{{end}}"""

README.md CHANGED Viewed

@@ -1,3 +1,93 @@
----
-license: llama3.2
----

+---
+license: llama2
+base_model:
+- meta-llama/CodeLlama-7b-hf
+base_model_relation: adapter
+tags:
+- QML
+- Code-Completion
+---
+# Model Overview
+## Description:
+CodeLlama-7B-QML is a large language model customized by the Qt Company for Fill-In-The-Middle code completion tasks in the QML programming language, especially for Qt Quick Controls compliant with Qt 6 releases. The CodeLlama-7B-QML model is designed for companies and individuals that want to self-host their LLM for HMI (Human Machine Interface) software development instead of relying on third-party hosted LLMs.
+This model reaches a score of 79% on the QML100 Fill-In-The-Middle code completion benchmark for Qt 6-compliant code. In comparison, other models scored:
+- CodeLlama-13B-QML (finetuned model from Qt): 79%,
+- Claude 3.7 Sonnet: 76%,
+- Claude 3.5 Sonnet: 68%,
+- CodeLlama 13B: 66%,
+- GPT-4o: 62%,
+- CodeLlama 7B: 61%.
+This model was fine-tuned based on raw data from over 5000 human-created QML code snippets using the LoRA fine-tuning method. CodeLlama-7B-QML is not optimised for the creation of Qt5-release compliant, C++, or Python code.
+## Terms of use:
+By accessing this model, you are agreeing to the Llama 2 terms and conditions of the [license](https://github.com/meta-llama/llama/blob/main/LICENSE), [acceptable use policy](https://github.com/meta-llama/llama/blob/main/USE_POLICY.md) and [Meta’s privacy policy](https://www.facebook.com/privacy/policy/). By using this model, you are furthermore agreeing to the [Qt AI Model terms & conditions](https://www.qt.io/terms-conditions/ai-services/model-use).
+## Usage:
+CodeLlama-7B-QML is a medium-sized Language Model that requires significant computing resources to achieve inference (response) times suitable for automatic code completion. Therefore, it should be used with a GPU accelerator, either in a cloud environment such as AWS, Google Cloud, or Microsoft Azure, or locally.
+Large Language Models, including CodeLlama-7B-QML, are not designed to be deployed in isolation but instead should be deployed as part of an overall AI system with additional safety guardrails as required. Developers are expected to deploy system safeguards when building AI systems.
+The repository contains multiple files with adapters.
+## How to run CodeLlama-7B-QML in cloud deployment:
+The configuration depends on the chosen cloud technology.
+Running CodeLlama-7B-QML in the cloud requires working with Docker and vLLM for optimal performance. Make sure all required dependencies are installed (transformers, accelerate and peft modules). Use bfloat16 precision. The setup leverages the base model from Hugging Face (requiring an access token) combined with adapter weights from the repository. Using vLLM enables efficient inference with an OpenAI-compatible API endpoint, making integration straightforward. vLLM serves as a highly optimized backend that implements request batching and queuing mechanisms, providing excellent serving optimization. The Docker container should be run on an instance with a GPU accelerator. The configuration has been thoroughly tested on Ubuntu 22.04 LTS with NVIDIA drivers and A100 80GB GPUs, demonstrating stable and efficient performance.
+## How to run CodeLlama-7B-QML in ollama:
+The model can be downloaded either from Hugging Face or Ollama. If the choice is Hugging Face, follow all the instruction steps. In case of Ollama, execute steps 1 and 5.
+#### 1. Install ollama
+https://ollama.com/download
+#### 2. Clone the model repository
+#### 3. Open the terminal and go to the repository
+#### 4. Build the model in ollama
+```
+ollama create theqtcompany/codellama-7b-qml -f Modelfile
+```
+The model's name must be exactly as above if one wants to use the model in Qt Creator.
+#### 5. Run the model
+```
+ollama run theqtcompany/codellama-7b-qml
+```
+You can start writing prompts in the terminal or send curl requests now.
+Here is a curl request example:
+```
+curl -X POST http://localhost:11434/api/generate -d '{
+  "model": "theqtcompany/codellama-7b-qml",
+  "prompt": "<SUF>\n    title: qsTr(\"Hello World\")\n}<PRE>import QtQuick\n\nWindow {\n    width: 640\n    height: 480\n    visible: true\n<MID>",
+  "stream": false,
+  "options": {
+    "temperature": 0.2,
+    "top_p": 0.9,
+    "num_predict": 500,
+    "stop": ["<SUF>", "<PRE>", "</PRE>", "</SUF>", "< EOT >", "\\end", "<MID>", "</MID>", "##"]
+  }
+}'
+```
+The prompt format:
+```
+"<SUF>{suffix}<PRE>{prefix}<MID>"
+```
+If there is no suffix, please use:
+```
+"<PRE>{prefix}<MID>"
+```
+## Model Version:
+v1.0
+## Attribution:
+CodeLlama-7B is a model of the Llama 2 family. Llama 2 is licensed under the LLAMA 2 Community License, Copyright (c) Meta Platforms, Inc. All Rights Reserved.

adapter_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-hf",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "lm_head",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec4924dfac7c38ed7e78d23d166c1bac46cc7cde6ef2ec3d70c741a4d5b11ea7
+size 648936760

codellama-7b-qml.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87a033f1b70147091d4a52d8d7a3938ca182b5efa701418adaddcaf7331dce4e
+size 648907072

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8da06080bb75f0fe64cb3bb6897ff13b9b2f17a551222d1a4883a09ca52cf17e
+size 325614754

rest.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:554787be54f537266241b356a8bbc672062afe4d218d66ca5d2ece3b41d590b7
+size 524550264

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:054177611f2412e52a8cd0a4d0895e2cb9e4fef94374378f0bf6c4059d506903
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef1fe48b040bf7678711805e0ccda18391a9ad1db687676a750066fbac4eecfb
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "additional_special_tokens": [
+    "▁<PRE>",
+    "▁<MID>",
+    "▁<SUF>",
+    "▁<EOT>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,84 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁<PRE>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁<SUF>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁<MID>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁<EOT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁<PRE>",
+    "▁<MID>",
+    "▁<SUF>",
+    "▁<EOT>"
+  ],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "eot_token": "▁<EOT>",
+  "extra_special_tokens": {},
+  "fill_token": "<FILL_ME>",
+  "legacy": null,
+  "middle_token": "▁<MID>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "prefix_token": "▁<PRE>",
+  "sp_model_kwargs": {},
+  "suffix_token": "▁<SUF>",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3114 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 20,
+  "global_step": 8786,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0022765430693491933,
+      "grad_norm": 0.469247430562973,
+      "learning_rate": 0.0002,
+      "loss": 1.9469,
+      "step": 20
+    },
+    {
+      "epoch": 0.004553086138698387,
+      "grad_norm": 0.6239348649978638,
+      "learning_rate": 0.0002,
+      "loss": 1.556,
+      "step": 40
+    },
+    {
+      "epoch": 0.006829629208047579,
+      "grad_norm": 0.4587397277355194,
+      "learning_rate": 0.0002,
+      "loss": 1.4108,
+      "step": 60
+    },
+    {
+      "epoch": 0.009106172277396773,
+      "grad_norm": 0.42919760942459106,
+      "learning_rate": 0.0002,
+      "loss": 1.3352,
+      "step": 80
+    },
+    {
+      "epoch": 0.011382715346745967,
+      "grad_norm": 0.46492573618888855,
+      "learning_rate": 0.0002,
+      "loss": 1.3388,
+      "step": 100
+    },
+    {
+      "epoch": 0.013659258416095159,
+      "grad_norm": 0.453070729970932,
+      "learning_rate": 0.0002,
+      "loss": 1.2295,
+      "step": 120
+    },
+    {
+      "epoch": 0.015935801485444354,
+      "grad_norm": 0.4760678708553314,
+      "learning_rate": 0.0002,
+      "loss": 1.2493,
+      "step": 140
+    },
+    {
+      "epoch": 0.018212344554793546,
+      "grad_norm": 0.4545675814151764,
+      "learning_rate": 0.0002,
+      "loss": 1.215,
+      "step": 160
+    },
+    {
+      "epoch": 0.020488887624142738,
+      "grad_norm": 0.4772235155105591,
+      "learning_rate": 0.0002,
+      "loss": 1.2173,
+      "step": 180
+    },
+    {
+      "epoch": 0.022765430693491934,
+      "grad_norm": 0.4403541088104248,
+      "learning_rate": 0.0002,
+      "loss": 1.1058,
+      "step": 200
+    },
+    {
+      "epoch": 0.025041973762841126,
+      "grad_norm": 0.511401355266571,
+      "learning_rate": 0.0002,
+      "loss": 1.1049,
+      "step": 220
+    },
+    {
+      "epoch": 0.027318516832190318,
+      "grad_norm": 0.3809013366699219,
+      "learning_rate": 0.0002,
+      "loss": 1.0498,
+      "step": 240
+    },
+    {
+      "epoch": 0.029595059901539513,
+      "grad_norm": 0.3980010449886322,
+      "learning_rate": 0.0002,
+      "loss": 0.9842,
+      "step": 260
+    },
+    {
+      "epoch": 0.03187160297088871,
+      "grad_norm": 0.5747793316841125,
+      "learning_rate": 0.0002,
+      "loss": 1.0988,
+      "step": 280
+    },
+    {
+      "epoch": 0.0341481460402379,
+      "grad_norm": 0.46827971935272217,
+      "learning_rate": 0.0002,
+      "loss": 1.0367,
+      "step": 300
+    },
+    {
+      "epoch": 0.03642468910958709,
+      "grad_norm": 0.4702209532260895,
+      "learning_rate": 0.0002,
+      "loss": 1.066,
+      "step": 320
+    },
+    {
+      "epoch": 0.038701232178936285,
+      "grad_norm": 0.5084996223449707,
+      "learning_rate": 0.0002,
+      "loss": 1.0652,
+      "step": 340
+    },
+    {
+      "epoch": 0.040977775248285477,
+      "grad_norm": 0.3944012522697449,
+      "learning_rate": 0.0002,
+      "loss": 0.9642,
+      "step": 360
+    },
+    {
+      "epoch": 0.04325431831763467,
+      "grad_norm": 0.40287718176841736,
+      "learning_rate": 0.0002,
+      "loss": 0.9431,
+      "step": 380
+    },
+    {
+      "epoch": 0.04553086138698387,
+      "grad_norm": 0.4629077613353729,
+      "learning_rate": 0.0002,
+      "loss": 0.9615,
+      "step": 400
+    },
+    {
+      "epoch": 0.04780740445633306,
+      "grad_norm": 0.44827452301979065,
+      "learning_rate": 0.0002,
+      "loss": 0.9434,
+      "step": 420
+    },
+    {
+      "epoch": 0.05008394752568225,
+      "grad_norm": 0.41644710302352905,
+      "learning_rate": 0.0002,
+      "loss": 0.9241,
+      "step": 440
+    },
+    {
+      "epoch": 0.05236049059503144,
+      "grad_norm": 0.4760611057281494,
+      "learning_rate": 0.0002,
+      "loss": 0.8475,
+      "step": 460
+    },
+    {
+      "epoch": 0.054637033664380635,
+      "grad_norm": 0.45987364649772644,
+      "learning_rate": 0.0002,
+      "loss": 0.898,
+      "step": 480
+    },
+    {
+      "epoch": 0.056913576733729834,
+      "grad_norm": 0.4840068817138672,
+      "learning_rate": 0.0002,
+      "loss": 0.9611,
+      "step": 500
+    },
+    {
+      "epoch": 0.059190119803079026,
+      "grad_norm": 0.40314286947250366,
+      "learning_rate": 0.0002,
+      "loss": 0.8884,
+      "step": 520
+    },
+    {
+      "epoch": 0.06146666287242822,
+      "grad_norm": 0.5458106398582458,
+      "learning_rate": 0.0002,
+      "loss": 0.8939,
+      "step": 540
+    },
+    {
+      "epoch": 0.06374320594177742,
+      "grad_norm": 0.5420896410942078,
+      "learning_rate": 0.0002,
+      "loss": 0.8265,
+      "step": 560
+    },
+    {
+      "epoch": 0.0660197490111266,
+      "grad_norm": 0.5356529355049133,
+      "learning_rate": 0.0002,
+      "loss": 0.8432,
+      "step": 580
+    },
+    {
+      "epoch": 0.0682962920804758,
+      "grad_norm": 0.5064826011657715,
+      "learning_rate": 0.0002,
+      "loss": 0.8272,
+      "step": 600
+    },
+    {
+      "epoch": 0.07057283514982499,
+      "grad_norm": 0.4143005311489105,
+      "learning_rate": 0.0002,
+      "loss": 0.7854,
+      "step": 620
+    },
+    {
+      "epoch": 0.07284937821917419,
+      "grad_norm": 0.3817225396633148,
+      "learning_rate": 0.0002,
+      "loss": 0.8219,
+      "step": 640
+    },
+    {
+      "epoch": 0.07512592128852338,
+      "grad_norm": 0.5336936712265015,
+      "learning_rate": 0.0002,
+      "loss": 0.7977,
+      "step": 660
+    },
+    {
+      "epoch": 0.07740246435787257,
+      "grad_norm": 0.5397001504898071,
+      "learning_rate": 0.0002,
+      "loss": 0.8117,
+      "step": 680
+    },
+    {
+      "epoch": 0.07967900742722177,
+      "grad_norm": 0.4968530535697937,
+      "learning_rate": 0.0002,
+      "loss": 0.7527,
+      "step": 700
+    },
+    {
+      "epoch": 0.08195555049657095,
+      "grad_norm": 0.4084935784339905,
+      "learning_rate": 0.0002,
+      "loss": 0.651,
+      "step": 720
+    },
+    {
+      "epoch": 0.08423209356592015,
+      "grad_norm": 0.48406732082366943,
+      "learning_rate": 0.0002,
+      "loss": 0.7352,
+      "step": 740
+    },
+    {
+      "epoch": 0.08650863663526934,
+      "grad_norm": 0.5246301293373108,
+      "learning_rate": 0.0002,
+      "loss": 0.7785,
+      "step": 760
+    },
+    {
+      "epoch": 0.08878517970461854,
+      "grad_norm": 0.5729619264602661,
+      "learning_rate": 0.0002,
+      "loss": 0.7646,
+      "step": 780
+    },
+    {
+      "epoch": 0.09106172277396773,
+      "grad_norm": 0.5675190687179565,
+      "learning_rate": 0.0002,
+      "loss": 0.7784,
+      "step": 800
+    },
+    {
+      "epoch": 0.09333826584331692,
+      "grad_norm": 0.4682878255844116,
+      "learning_rate": 0.0002,
+      "loss": 0.7284,
+      "step": 820
+    },
+    {
+      "epoch": 0.09561480891266612,
+      "grad_norm": 0.5388545393943787,
+      "learning_rate": 0.0002,
+      "loss": 0.6959,
+      "step": 840
+    },
+    {
+      "epoch": 0.0978913519820153,
+      "grad_norm": 0.48806509375572205,
+      "learning_rate": 0.0002,
+      "loss": 0.7585,
+      "step": 860
+    },
+    {
+      "epoch": 0.1001678950513645,
+      "grad_norm": 0.4149261713027954,
+      "learning_rate": 0.0002,
+      "loss": 0.6978,
+      "step": 880
+    },
+    {
+      "epoch": 0.1024444381207137,
+      "grad_norm": 0.4971105754375458,
+      "learning_rate": 0.0002,
+      "loss": 0.7103,
+      "step": 900
+    },
+    {
+      "epoch": 0.10472098119006289,
+      "grad_norm": 0.5066735744476318,
+      "learning_rate": 0.0002,
+      "loss": 0.6854,
+      "step": 920
+    },
+    {
+      "epoch": 0.10699752425941209,
+      "grad_norm": 0.4922661781311035,
+      "learning_rate": 0.0002,
+      "loss": 0.6231,
+      "step": 940
+    },
+    {
+      "epoch": 0.10927406732876127,
+      "grad_norm": 0.5949555039405823,
+      "learning_rate": 0.0002,
+      "loss": 0.6813,
+      "step": 960
+    },
+    {
+      "epoch": 0.11155061039811047,
+      "grad_norm": 0.581446647644043,
+      "learning_rate": 0.0002,
+      "loss": 0.6174,
+      "step": 980
+    },
+    {
+      "epoch": 0.11382715346745967,
+      "grad_norm": 0.6152529716491699,
+      "learning_rate": 0.0002,
+      "loss": 0.6405,
+      "step": 1000
+    },
+    {
+      "epoch": 0.11610369653680885,
+      "grad_norm": 0.5986836552619934,
+      "learning_rate": 0.0002,
+      "loss": 0.5776,
+      "step": 1020
+    },
+    {
+      "epoch": 0.11838023960615805,
+      "grad_norm": 0.4255094528198242,
+      "learning_rate": 0.0002,
+      "loss": 0.6576,
+      "step": 1040
+    },
+    {
+      "epoch": 0.12065678267550724,
+      "grad_norm": 0.4563849866390228,
+      "learning_rate": 0.0002,
+      "loss": 0.6647,
+      "step": 1060
+    },
+    {
+      "epoch": 0.12293332574485644,
+      "grad_norm": 0.593227744102478,
+      "learning_rate": 0.0002,
+      "loss": 0.6043,
+      "step": 1080
+    },
+    {
+      "epoch": 0.12520986881420562,
+      "grad_norm": 0.47059598565101624,
+      "learning_rate": 0.0002,
+      "loss": 0.591,
+      "step": 1100
+    },
+    {
+      "epoch": 0.12748641188355483,
+      "grad_norm": 0.5013225674629211,
+      "learning_rate": 0.0002,
+      "loss": 0.5947,
+      "step": 1120
+    },
+    {
+      "epoch": 0.12976295495290402,
+      "grad_norm": 0.46772757172584534,
+      "learning_rate": 0.0002,
+      "loss": 0.6292,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1320394980222532,
+      "grad_norm": 0.5844313502311707,
+      "learning_rate": 0.0002,
+      "loss": 0.6128,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1343160410916024,
+      "grad_norm": 0.5295489430427551,
+      "learning_rate": 0.0002,
+      "loss": 0.6064,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1365925841609516,
+      "grad_norm": 0.4482004642486572,
+      "learning_rate": 0.0002,
+      "loss": 0.5899,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1388691272303008,
+      "grad_norm": 0.6281692981719971,
+      "learning_rate": 0.0002,
+      "loss": 0.6109,
+      "step": 1220
+    },
+    {
+      "epoch": 0.14114567029964997,
+      "grad_norm": 0.4718242585659027,
+      "learning_rate": 0.0002,
+      "loss": 0.5857,
+      "step": 1240
+    },
+    {
+      "epoch": 0.14342221336899919,
+      "grad_norm": 0.5219341516494751,
+      "learning_rate": 0.0002,
+      "loss": 0.5581,
+      "step": 1260
+    },
+    {
+      "epoch": 0.14569875643834837,
+      "grad_norm": 0.47050580382347107,
+      "learning_rate": 0.0002,
+      "loss": 0.6368,
+      "step": 1280
+    },
+    {
+      "epoch": 0.14797529950769756,
+      "grad_norm": 0.5425338745117188,
+      "learning_rate": 0.0002,
+      "loss": 0.5626,
+      "step": 1300
+    },
+    {
+      "epoch": 0.15025184257704677,
+      "grad_norm": 0.4944934844970703,
+      "learning_rate": 0.0002,
+      "loss": 0.5337,
+      "step": 1320
+    },
+    {
+      "epoch": 0.15252838564639595,
+      "grad_norm": 0.5921599864959717,
+      "learning_rate": 0.0002,
+      "loss": 0.5672,
+      "step": 1340
+    },
+    {
+      "epoch": 0.15480492871574514,
+      "grad_norm": 0.4866751730442047,
+      "learning_rate": 0.0002,
+      "loss": 0.5305,
+      "step": 1360
+    },
+    {
+      "epoch": 0.15708147178509432,
+      "grad_norm": 0.62166827917099,
+      "learning_rate": 0.0002,
+      "loss": 0.5737,
+      "step": 1380
+    },
+    {
+      "epoch": 0.15935801485444354,
+      "grad_norm": 0.5006982684135437,
+      "learning_rate": 0.0002,
+      "loss": 0.5542,
+      "step": 1400
+    },
+    {
+      "epoch": 0.16163455792379272,
+      "grad_norm": 0.6090095043182373,
+      "learning_rate": 0.0002,
+      "loss": 0.5215,
+      "step": 1420
+    },
+    {
+      "epoch": 0.1639111009931419,
+      "grad_norm": 0.4260309636592865,
+      "learning_rate": 0.0002,
+      "loss": 0.5535,
+      "step": 1440
+    },
+    {
+      "epoch": 0.16618764406249112,
+      "grad_norm": 0.48657718300819397,
+      "learning_rate": 0.0002,
+      "loss": 0.5441,
+      "step": 1460
+    },
+    {
+      "epoch": 0.1684641871318403,
+      "grad_norm": 0.43275007605552673,
+      "learning_rate": 0.0002,
+      "loss": 0.5161,
+      "step": 1480
+    },
+    {
+      "epoch": 0.1707407302011895,
+      "grad_norm": 0.4225006699562073,
+      "learning_rate": 0.0002,
+      "loss": 0.512,
+      "step": 1500
+    },
+    {
+      "epoch": 0.17301727327053867,
+      "grad_norm": 0.5176346302032471,
+      "learning_rate": 0.0002,
+      "loss": 0.5384,
+      "step": 1520
+    },
+    {
+      "epoch": 0.1752938163398879,
+      "grad_norm": 0.6492679715156555,
+      "learning_rate": 0.0002,
+      "loss": 0.4981,
+      "step": 1540
+    },
+    {
+      "epoch": 0.17757035940923707,
+      "grad_norm": 0.5511758327484131,
+      "learning_rate": 0.0002,
+      "loss": 0.5289,
+      "step": 1560
+    },
+    {
+      "epoch": 0.17984690247858626,
+      "grad_norm": 0.5211341977119446,
+      "learning_rate": 0.0002,
+      "loss": 0.5002,
+      "step": 1580
+    },
+    {
+      "epoch": 0.18212344554793547,
+      "grad_norm": 0.5488260984420776,
+      "learning_rate": 0.0002,
+      "loss": 0.5178,
+      "step": 1600
+    },
+    {
+      "epoch": 0.18439998861728465,
+      "grad_norm": 0.6779264211654663,
+      "learning_rate": 0.0002,
+      "loss": 0.5155,
+      "step": 1620
+    },
+    {
+      "epoch": 0.18667653168663384,
+      "grad_norm": 0.502919614315033,
+      "learning_rate": 0.0002,
+      "loss": 0.4923,
+      "step": 1640
+    },
+    {
+      "epoch": 0.18895307475598305,
+      "grad_norm": 0.4989205300807953,
+      "learning_rate": 0.0002,
+      "loss": 0.4825,
+      "step": 1660
+    },
+    {
+      "epoch": 0.19122961782533224,
+      "grad_norm": 0.5155315399169922,
+      "learning_rate": 0.0002,
+      "loss": 0.4796,
+      "step": 1680
+    },
+    {
+      "epoch": 0.19350616089468142,
+      "grad_norm": 0.5648865699768066,
+      "learning_rate": 0.0002,
+      "loss": 0.4985,
+      "step": 1700
+    },
+    {
+      "epoch": 0.1957827039640306,
+      "grad_norm": 0.606176495552063,
+      "learning_rate": 0.0002,
+      "loss": 0.4819,
+      "step": 1720
+    },
+    {
+      "epoch": 0.19805924703337982,
+      "grad_norm": 0.5440786480903625,
+      "learning_rate": 0.0002,
+      "loss": 0.5213,
+      "step": 1740
+    },
+    {
+      "epoch": 0.200335790102729,
+      "grad_norm": 0.43152502179145813,
+      "learning_rate": 0.0002,
+      "loss": 0.4429,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2026123331720782,
+      "grad_norm": 0.5701313614845276,
+      "learning_rate": 0.0002,
+      "loss": 0.4486,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2048888762414274,
+      "grad_norm": 0.565666913986206,
+      "learning_rate": 0.0002,
+      "loss": 0.4561,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2071654193107766,
+      "grad_norm": 0.5725598931312561,
+      "learning_rate": 0.0002,
+      "loss": 0.4757,
+      "step": 1820
+    },
+    {
+      "epoch": 0.20944196238012577,
+      "grad_norm": 0.4642520248889923,
+      "learning_rate": 0.0002,
+      "loss": 0.438,
+      "step": 1840
+    },
+    {
+      "epoch": 0.21171850544947496,
+      "grad_norm": 0.6077229976654053,
+      "learning_rate": 0.0002,
+      "loss": 0.4295,
+      "step": 1860
+    },
+    {
+      "epoch": 0.21399504851882417,
+      "grad_norm": 0.6314090490341187,
+      "learning_rate": 0.0002,
+      "loss": 0.449,
+      "step": 1880
+    },
+    {
+      "epoch": 0.21627159158817336,
+      "grad_norm": 0.4416756331920624,
+      "learning_rate": 0.0002,
+      "loss": 0.4554,
+      "step": 1900
+    },
+    {
+      "epoch": 0.21854813465752254,
+      "grad_norm": 0.5278882384300232,
+      "learning_rate": 0.0002,
+      "loss": 0.4554,
+      "step": 1920
+    },
+    {
+      "epoch": 0.22082467772687175,
+      "grad_norm": 0.45619043707847595,
+      "learning_rate": 0.0002,
+      "loss": 0.4868,
+      "step": 1940
+    },
+    {
+      "epoch": 0.22310122079622094,
+      "grad_norm": 0.5881581902503967,
+      "learning_rate": 0.0002,
+      "loss": 0.4672,
+      "step": 1960
+    },
+    {
+      "epoch": 0.22537776386557012,
+      "grad_norm": 0.5379284024238586,
+      "learning_rate": 0.0002,
+      "loss": 0.4531,
+      "step": 1980
+    },
+    {
+      "epoch": 0.22765430693491934,
+      "grad_norm": 0.5562624931335449,
+      "learning_rate": 0.0002,
+      "loss": 0.464,
+      "step": 2000
+    },
+    {
+      "epoch": 0.22993085000426852,
+      "grad_norm": 0.554499626159668,
+      "learning_rate": 0.0002,
+      "loss": 0.446,
+      "step": 2020
+    },
+    {
+      "epoch": 0.2322073930736177,
+      "grad_norm": 0.509219229221344,
+      "learning_rate": 0.0002,
+      "loss": 0.4417,
+      "step": 2040
+    },
+    {
+      "epoch": 0.2344839361429669,
+      "grad_norm": 0.5206849575042725,
+      "learning_rate": 0.0002,
+      "loss": 0.4118,
+      "step": 2060
+    },
+    {
+      "epoch": 0.2367604792123161,
+      "grad_norm": 0.548729658126831,
+      "learning_rate": 0.0002,
+      "loss": 0.4067,
+      "step": 2080
+    },
+    {
+      "epoch": 0.2390370222816653,
+      "grad_norm": 0.4220084846019745,
+      "learning_rate": 0.0002,
+      "loss": 0.428,
+      "step": 2100
+    },
+    {
+      "epoch": 0.24131356535101448,
+      "grad_norm": 0.5507292747497559,
+      "learning_rate": 0.0002,
+      "loss": 0.4176,
+      "step": 2120
+    },
+    {
+      "epoch": 0.2435901084203637,
+      "grad_norm": 0.5605701208114624,
+      "learning_rate": 0.0002,
+      "loss": 0.4661,
+      "step": 2140
+    },
+    {
+      "epoch": 0.24586665148971287,
+      "grad_norm": 0.43142881989479065,
+      "learning_rate": 0.0002,
+      "loss": 0.4197,
+      "step": 2160
+    },
+    {
+      "epoch": 0.24814319455906206,
+      "grad_norm": 0.47790080308914185,
+      "learning_rate": 0.0002,
+      "loss": 0.4568,
+      "step": 2180
+    },
+    {
+      "epoch": 0.25041973762841124,
+      "grad_norm": 0.6048968434333801,
+      "learning_rate": 0.0002,
+      "loss": 0.4199,
+      "step": 2200
+    },
+    {
+      "epoch": 0.25269628069776046,
+      "grad_norm": 0.4925907850265503,
+      "learning_rate": 0.0002,
+      "loss": 0.4325,
+      "step": 2220
+    },
+    {
+      "epoch": 0.25497282376710967,
+      "grad_norm": 0.5463051199913025,
+      "learning_rate": 0.0002,
+      "loss": 0.4549,
+      "step": 2240
+    },
+    {
+      "epoch": 0.2572493668364588,
+      "grad_norm": 0.4631319046020508,
+      "learning_rate": 0.0002,
+      "loss": 0.3977,
+      "step": 2260
+    },
+    {
+      "epoch": 0.25952590990580804,
+      "grad_norm": 0.4965234398841858,
+      "learning_rate": 0.0002,
+      "loss": 0.4285,
+      "step": 2280
+    },
+    {
+      "epoch": 0.2618024529751572,
+      "grad_norm": 0.5436238646507263,
+      "learning_rate": 0.0002,
+      "loss": 0.4039,
+      "step": 2300
+    },
+    {
+      "epoch": 0.2640789960445064,
+      "grad_norm": 0.5218191742897034,
+      "learning_rate": 0.0002,
+      "loss": 0.4092,
+      "step": 2320
+    },
+    {
+      "epoch": 0.2663555391138556,
+      "grad_norm": 0.5417261719703674,
+      "learning_rate": 0.0002,
+      "loss": 0.3825,
+      "step": 2340
+    },
+    {
+      "epoch": 0.2686320821832048,
+      "grad_norm": 0.6126281023025513,
+      "learning_rate": 0.0002,
+      "loss": 0.4391,
+      "step": 2360
+    },
+    {
+      "epoch": 0.270908625252554,
+      "grad_norm": 0.4734433889389038,
+      "learning_rate": 0.0002,
+      "loss": 0.4151,
+      "step": 2380
+    },
+    {
+      "epoch": 0.2731851683219032,
+      "grad_norm": 0.4501429796218872,
+      "learning_rate": 0.0002,
+      "loss": 0.4178,
+      "step": 2400
+    },
+    {
+      "epoch": 0.27546171139125236,
+      "grad_norm": 0.5258509516716003,
+      "learning_rate": 0.0002,
+      "loss": 0.4007,
+      "step": 2420
+    },
+    {
+      "epoch": 0.2777382544606016,
+      "grad_norm": 0.47874951362609863,
+      "learning_rate": 0.0002,
+      "loss": 0.4245,
+      "step": 2440
+    },
+    {
+      "epoch": 0.2800147975299508,
+      "grad_norm": 0.528533399105072,
+      "learning_rate": 0.0002,
+      "loss": 0.3794,
+      "step": 2460
+    },
+    {
+      "epoch": 0.28229134059929994,
+      "grad_norm": 0.46465063095092773,
+      "learning_rate": 0.0002,
+      "loss": 0.4019,
+      "step": 2480
+    },
+    {
+      "epoch": 0.28456788366864916,
+      "grad_norm": 0.5217177867889404,
+      "learning_rate": 0.0002,
+      "loss": 0.4104,
+      "step": 2500
+    },
+    {
+      "epoch": 0.28684442673799837,
+      "grad_norm": 0.510036289691925,
+      "learning_rate": 0.0002,
+      "loss": 0.389,
+      "step": 2520
+    },
+    {
+      "epoch": 0.2891209698073475,
+      "grad_norm": 0.6968228220939636,
+      "learning_rate": 0.0002,
+      "loss": 0.4152,
+      "step": 2540
+    },
+    {
+      "epoch": 0.29139751287669674,
+      "grad_norm": 0.4529867470264435,
+      "learning_rate": 0.0002,
+      "loss": 0.3987,
+      "step": 2560
+    },
+    {
+      "epoch": 0.29367405594604595,
+      "grad_norm": 0.5680263638496399,
+      "learning_rate": 0.0002,
+      "loss": 0.3828,
+      "step": 2580
+    },
+    {
+      "epoch": 0.2959505990153951,
+      "grad_norm": 0.4892405867576599,
+      "learning_rate": 0.0002,
+      "loss": 0.4006,
+      "step": 2600
+    },
+    {
+      "epoch": 0.2982271420847443,
+      "grad_norm": 0.47588276863098145,
+      "learning_rate": 0.0002,
+      "loss": 0.4197,
+      "step": 2620
+    },
+    {
+      "epoch": 0.30050368515409354,
+      "grad_norm": 0.5624070167541504,
+      "learning_rate": 0.0002,
+      "loss": 0.3997,
+      "step": 2640
+    },
+    {
+      "epoch": 0.3027802282234427,
+      "grad_norm": 0.5434039831161499,
+      "learning_rate": 0.0002,
+      "loss": 0.3977,
+      "step": 2660
+    },
+    {
+      "epoch": 0.3050567712927919,
+      "grad_norm": 0.5572277903556824,
+      "learning_rate": 0.0002,
+      "loss": 0.3966,
+      "step": 2680
+    },
+    {
+      "epoch": 0.30733331436214106,
+      "grad_norm": 0.5533374547958374,
+      "learning_rate": 0.0002,
+      "loss": 0.3803,
+      "step": 2700
+    },
+    {
+      "epoch": 0.3096098574314903,
+      "grad_norm": 0.40596967935562134,
+      "learning_rate": 0.0002,
+      "loss": 0.3682,
+      "step": 2720
+    },
+    {
+      "epoch": 0.3118864005008395,
+      "grad_norm": 0.4737823009490967,
+      "learning_rate": 0.0002,
+      "loss": 0.3761,
+      "step": 2740
+    },
+    {
+      "epoch": 0.31416294357018865,
+      "grad_norm": 0.4295174777507782,
+      "learning_rate": 0.0002,
+      "loss": 0.4035,
+      "step": 2760
+    },
+    {
+      "epoch": 0.31643948663953786,
+      "grad_norm": 0.5348454713821411,
+      "learning_rate": 0.0002,
+      "loss": 0.404,
+      "step": 2780
+    },
+    {
+      "epoch": 0.31871602970888707,
+      "grad_norm": 0.4819965362548828,
+      "learning_rate": 0.0002,
+      "loss": 0.3929,
+      "step": 2800
+    },
+    {
+      "epoch": 0.32099257277823623,
+      "grad_norm": 0.5920088291168213,
+      "learning_rate": 0.0002,
+      "loss": 0.3798,
+      "step": 2820
+    },
+    {
+      "epoch": 0.32326911584758544,
+      "grad_norm": 0.4936531186103821,
+      "learning_rate": 0.0002,
+      "loss": 0.3995,
+      "step": 2840
+    },
+    {
+      "epoch": 0.32554565891693465,
+      "grad_norm": 0.5252315998077393,
+      "learning_rate": 0.0002,
+      "loss": 0.3842,
+      "step": 2860
+    },
+    {
+      "epoch": 0.3278222019862838,
+      "grad_norm": 0.5818414688110352,
+      "learning_rate": 0.0002,
+      "loss": 0.3533,
+      "step": 2880
+    },
+    {
+      "epoch": 0.330098745055633,
+      "grad_norm": 0.44053876399993896,
+      "learning_rate": 0.0002,
+      "loss": 0.3402,
+      "step": 2900
+    },
+    {
+      "epoch": 0.33237528812498224,
+      "grad_norm": 0.5421345233917236,
+      "learning_rate": 0.0002,
+      "loss": 0.3542,
+      "step": 2920
+    },
+    {
+      "epoch": 0.3346518311943314,
+      "grad_norm": 0.4642751216888428,
+      "learning_rate": 0.0002,
+      "loss": 0.3755,
+      "step": 2940
+    },
+    {
+      "epoch": 0.3369283742636806,
+      "grad_norm": 0.5137833952903748,
+      "learning_rate": 0.0002,
+      "loss": 0.3602,
+      "step": 2960
+    },
+    {
+      "epoch": 0.3392049173330298,
+      "grad_norm": 0.5032792687416077,
+      "learning_rate": 0.0002,
+      "loss": 0.3451,
+      "step": 2980
+    },
+    {
+      "epoch": 0.341481460402379,
+      "grad_norm": 0.4932720363140106,
+      "learning_rate": 0.0002,
+      "loss": 0.384,
+      "step": 3000
+    },
+    {
+      "epoch": 0.3437580034717282,
+      "grad_norm": 0.49986231327056885,
+      "learning_rate": 0.0002,
+      "loss": 0.3826,
+      "step": 3020
+    },
+    {
+      "epoch": 0.34603454654107735,
+      "grad_norm": 0.6325618624687195,
+      "learning_rate": 0.0002,
+      "loss": 0.3582,
+      "step": 3040
+    },
+    {
+      "epoch": 0.34831108961042656,
+      "grad_norm": 0.5402369499206543,
+      "learning_rate": 0.0002,
+      "loss": 0.3706,
+      "step": 3060
+    },
+    {
+      "epoch": 0.3505876326797758,
+      "grad_norm": 0.4967012107372284,
+      "learning_rate": 0.0002,
+      "loss": 0.3456,
+      "step": 3080
+    },
+    {
+      "epoch": 0.35286417574912493,
+      "grad_norm": 0.4491735100746155,
+      "learning_rate": 0.0002,
+      "loss": 0.347,
+      "step": 3100
+    },
+    {
+      "epoch": 0.35514071881847414,
+      "grad_norm": 0.9062516093254089,
+      "learning_rate": 0.0002,
+      "loss": 0.3617,
+      "step": 3120
+    },
+    {
+      "epoch": 0.35741726188782336,
+      "grad_norm": 0.5253359079360962,
+      "learning_rate": 0.0002,
+      "loss": 0.3512,
+      "step": 3140
+    },
+    {
+      "epoch": 0.3596938049571725,
+      "grad_norm": 0.4836867153644562,
+      "learning_rate": 0.0002,
+      "loss": 0.3585,
+      "step": 3160
+    },
+    {
+      "epoch": 0.3619703480265217,
+      "grad_norm": 0.49537473917007446,
+      "learning_rate": 0.0002,
+      "loss": 0.364,
+      "step": 3180
+    },
+    {
+      "epoch": 0.36424689109587094,
+      "grad_norm": 0.6098095178604126,
+      "learning_rate": 0.0002,
+      "loss": 0.3455,
+      "step": 3200
+    },
+    {
+      "epoch": 0.3665234341652201,
+      "grad_norm": 0.5926884412765503,
+      "learning_rate": 0.0002,
+      "loss": 0.3406,
+      "step": 3220
+    },
+    {
+      "epoch": 0.3687999772345693,
+      "grad_norm": 0.5868669152259827,
+      "learning_rate": 0.0002,
+      "loss": 0.3643,
+      "step": 3240
+    },
+    {
+      "epoch": 0.3710765203039185,
+      "grad_norm": 0.42670106887817383,
+      "learning_rate": 0.0002,
+      "loss": 0.344,
+      "step": 3260
+    },
+    {
+      "epoch": 0.3733530633732677,
+      "grad_norm": 0.5992838740348816,
+      "learning_rate": 0.0002,
+      "loss": 0.3588,
+      "step": 3280
+    },
+    {
+      "epoch": 0.3756296064426169,
+      "grad_norm": 0.4388341009616852,
+      "learning_rate": 0.0002,
+      "loss": 0.3375,
+      "step": 3300
+    },
+    {
+      "epoch": 0.3779061495119661,
+      "grad_norm": 0.596488893032074,
+      "learning_rate": 0.0002,
+      "loss": 0.3425,
+      "step": 3320
+    },
+    {
+      "epoch": 0.38018269258131526,
+      "grad_norm": 0.4572538137435913,
+      "learning_rate": 0.0002,
+      "loss": 0.3711,
+      "step": 3340
+    },
+    {
+      "epoch": 0.3824592356506645,
+      "grad_norm": 0.5661656856536865,
+      "learning_rate": 0.0002,
+      "loss": 0.3415,
+      "step": 3360
+    },
+    {
+      "epoch": 0.38473577872001363,
+      "grad_norm": 0.45082923769950867,
+      "learning_rate": 0.0002,
+      "loss": 0.3495,
+      "step": 3380
+    },
+    {
+      "epoch": 0.38701232178936285,
+      "grad_norm": 0.4995211660861969,
+      "learning_rate": 0.0002,
+      "loss": 0.3311,
+      "step": 3400
+    },
+    {
+      "epoch": 0.38928886485871206,
+      "grad_norm": 0.5004004240036011,
+      "learning_rate": 0.0002,
+      "loss": 0.3506,
+      "step": 3420
+    },
+    {
+      "epoch": 0.3915654079280612,
+      "grad_norm": 0.5676460266113281,
+      "learning_rate": 0.0002,
+      "loss": 0.3383,
+      "step": 3440
+    },
+    {
+      "epoch": 0.39384195099741043,
+      "grad_norm": 0.4805515706539154,
+      "learning_rate": 0.0002,
+      "loss": 0.3382,
+      "step": 3460
+    },
+    {
+      "epoch": 0.39611849406675964,
+      "grad_norm": 0.47675764560699463,
+      "learning_rate": 0.0002,
+      "loss": 0.3021,
+      "step": 3480
+    },
+    {
+      "epoch": 0.3983950371361088,
+      "grad_norm": 0.6285260915756226,
+      "learning_rate": 0.0002,
+      "loss": 0.3467,
+      "step": 3500
+    },
+    {
+      "epoch": 0.400671580205458,
+      "grad_norm": 0.5657575130462646,
+      "learning_rate": 0.0002,
+      "loss": 0.3382,
+      "step": 3520
+    },
+    {
+      "epoch": 0.4029481232748072,
+      "grad_norm": 0.6148316860198975,
+      "learning_rate": 0.0002,
+      "loss": 0.3396,
+      "step": 3540
+    },
+    {
+      "epoch": 0.4052246663441564,
+      "grad_norm": 0.5819992423057556,
+      "learning_rate": 0.0002,
+      "loss": 0.3373,
+      "step": 3560
+    },
+    {
+      "epoch": 0.4075012094135056,
+      "grad_norm": 0.6080338954925537,
+      "learning_rate": 0.0002,
+      "loss": 0.3463,
+      "step": 3580
+    },
+    {
+      "epoch": 0.4097777524828548,
+      "grad_norm": 0.6103864312171936,
+      "learning_rate": 0.0002,
+      "loss": 0.3441,
+      "step": 3600
+    },
+    {
+      "epoch": 0.41205429555220396,
+      "grad_norm": 0.5234800577163696,
+      "learning_rate": 0.0002,
+      "loss": 0.3272,
+      "step": 3620
+    },
+    {
+      "epoch": 0.4143308386215532,
+      "grad_norm": 0.5393822193145752,
+      "learning_rate": 0.0002,
+      "loss": 0.3308,
+      "step": 3640
+    },
+    {
+      "epoch": 0.4166073816909024,
+      "grad_norm": 0.4853431284427643,
+      "learning_rate": 0.0002,
+      "loss": 0.3152,
+      "step": 3660
+    },
+    {
+      "epoch": 0.41888392476025155,
+      "grad_norm": 0.5507264733314514,
+      "learning_rate": 0.0002,
+      "loss": 0.3229,
+      "step": 3680
+    },
+    {
+      "epoch": 0.42116046782960076,
+      "grad_norm": 0.44306129217147827,
+      "learning_rate": 0.0002,
+      "loss": 0.3389,
+      "step": 3700
+    },
+    {
+      "epoch": 0.4234370108989499,
+      "grad_norm": 0.4574294984340668,
+      "learning_rate": 0.0002,
+      "loss": 0.3516,
+      "step": 3720
+    },
+    {
+      "epoch": 0.42571355396829913,
+      "grad_norm": 0.5367994904518127,
+      "learning_rate": 0.0002,
+      "loss": 0.3576,
+      "step": 3740
+    },
+    {
+      "epoch": 0.42799009703764834,
+      "grad_norm": 0.5044491291046143,
+      "learning_rate": 0.0002,
+      "loss": 0.3449,
+      "step": 3760
+    },
+    {
+      "epoch": 0.4302666401069975,
+      "grad_norm": 0.41715556383132935,
+      "learning_rate": 0.0002,
+      "loss": 0.3128,
+      "step": 3780
+    },
+    {
+      "epoch": 0.4325431831763467,
+      "grad_norm": 0.4355817437171936,
+      "learning_rate": 0.0002,
+      "loss": 0.3131,
+      "step": 3800
+    },
+    {
+      "epoch": 0.4348197262456959,
+      "grad_norm": 0.5237382650375366,
+      "learning_rate": 0.0002,
+      "loss": 0.3281,
+      "step": 3820
+    },
+    {
+      "epoch": 0.4370962693150451,
+      "grad_norm": 0.6210081577301025,
+      "learning_rate": 0.0002,
+      "loss": 0.3195,
+      "step": 3840
+    },
+    {
+      "epoch": 0.4393728123843943,
+      "grad_norm": 0.5145352482795715,
+      "learning_rate": 0.0002,
+      "loss": 0.3107,
+      "step": 3860
+    },
+    {
+      "epoch": 0.4416493554537435,
+      "grad_norm": 0.5554608106613159,
+      "learning_rate": 0.0002,
+      "loss": 0.3418,
+      "step": 3880
+    },
+    {
+      "epoch": 0.44392589852309267,
+      "grad_norm": 0.4971628487110138,
+      "learning_rate": 0.0002,
+      "loss": 0.3293,
+      "step": 3900
+    },
+    {
+      "epoch": 0.4462024415924419,
+      "grad_norm": 0.49732130765914917,
+      "learning_rate": 0.0002,
+      "loss": 0.3138,
+      "step": 3920
+    },
+    {
+      "epoch": 0.4484789846617911,
+      "grad_norm": 0.5883257985115051,
+      "learning_rate": 0.0002,
+      "loss": 0.3357,
+      "step": 3940
+    },
+    {
+      "epoch": 0.45075552773114025,
+      "grad_norm": 0.5349528193473816,
+      "learning_rate": 0.0002,
+      "loss": 0.3381,
+      "step": 3960
+    },
+    {
+      "epoch": 0.45303207080048946,
+      "grad_norm": 0.5360047221183777,
+      "learning_rate": 0.0002,
+      "loss": 0.3116,
+      "step": 3980
+    },
+    {
+      "epoch": 0.4553086138698387,
+      "grad_norm": 0.4889732003211975,
+      "learning_rate": 0.0002,
+      "loss": 0.3154,
+      "step": 4000
+    },
+    {
+      "epoch": 0.45758515693918783,
+      "grad_norm": 0.4912421703338623,
+      "learning_rate": 0.0002,
+      "loss": 0.3054,
+      "step": 4020
+    },
+    {
+      "epoch": 0.45986170000853704,
+      "grad_norm": 0.4449983835220337,
+      "learning_rate": 0.0002,
+      "loss": 0.3079,
+      "step": 4040
+    },
+    {
+      "epoch": 0.46213824307788626,
+      "grad_norm": 0.4488675892353058,
+      "learning_rate": 0.0002,
+      "loss": 0.3027,
+      "step": 4060
+    },
+    {
+      "epoch": 0.4644147861472354,
+      "grad_norm": 0.5412561893463135,
+      "learning_rate": 0.0002,
+      "loss": 0.2932,
+      "step": 4080
+    },
+    {
+      "epoch": 0.4666913292165846,
+      "grad_norm": 0.41218650341033936,
+      "learning_rate": 0.0002,
+      "loss": 0.3087,
+      "step": 4100
+    },
+    {
+      "epoch": 0.4689678722859338,
+      "grad_norm": 0.5233949422836304,
+      "learning_rate": 0.0002,
+      "loss": 0.3157,
+      "step": 4120
+    },
+    {
+      "epoch": 0.471244415355283,
+      "grad_norm": 0.5676075220108032,
+      "learning_rate": 0.0002,
+      "loss": 0.3267,
+      "step": 4140
+    },
+    {
+      "epoch": 0.4735209584246322,
+      "grad_norm": 0.5336834788322449,
+      "learning_rate": 0.0002,
+      "loss": 0.3185,
+      "step": 4160
+    },
+    {
+      "epoch": 0.47579750149398137,
+      "grad_norm": 0.5505925416946411,
+      "learning_rate": 0.0002,
+      "loss": 0.3116,
+      "step": 4180
+    },
+    {
+      "epoch": 0.4780740445633306,
+      "grad_norm": 0.5440223813056946,
+      "learning_rate": 0.0002,
+      "loss": 0.3234,
+      "step": 4200
+    },
+    {
+      "epoch": 0.4803505876326798,
+      "grad_norm": 0.46334293484687805,
+      "learning_rate": 0.0002,
+      "loss": 0.3209,
+      "step": 4220
+    },
+    {
+      "epoch": 0.48262713070202895,
+      "grad_norm": 0.452364444732666,
+      "learning_rate": 0.0002,
+      "loss": 0.3056,
+      "step": 4240
+    },
+    {
+      "epoch": 0.48490367377137816,
+      "grad_norm": 0.5037956833839417,
+      "learning_rate": 0.0002,
+      "loss": 0.3141,
+      "step": 4260
+    },
+    {
+      "epoch": 0.4871802168407274,
+      "grad_norm": 0.4308939278125763,
+      "learning_rate": 0.0002,
+      "loss": 0.2948,
+      "step": 4280
+    },
+    {
+      "epoch": 0.48945675991007653,
+      "grad_norm": 0.45019960403442383,
+      "learning_rate": 0.0002,
+      "loss": 0.3142,
+      "step": 4300
+    },
+    {
+      "epoch": 0.49173330297942575,
+      "grad_norm": 0.4351404011249542,
+      "learning_rate": 0.0002,
+      "loss": 0.31,
+      "step": 4320
+    },
+    {
+      "epoch": 0.49400984604877496,
+      "grad_norm": 0.38306841254234314,
+      "learning_rate": 0.0002,
+      "loss": 0.2889,
+      "step": 4340
+    },
+    {
+      "epoch": 0.4962863891181241,
+      "grad_norm": 0.545360803604126,
+      "learning_rate": 0.0002,
+      "loss": 0.311,
+      "step": 4360
+    },
+    {
+      "epoch": 0.49856293218747333,
+      "grad_norm": 0.44942232966423035,
+      "learning_rate": 0.0002,
+      "loss": 0.2899,
+      "step": 4380
+    },
+    {
+      "epoch": 0.5008394752568225,
+      "grad_norm": 0.46564239263534546,
+      "learning_rate": 0.0002,
+      "loss": 0.3013,
+      "step": 4400
+    },
+    {
+      "epoch": 0.5031160183261717,
+      "grad_norm": 0.5398554801940918,
+      "learning_rate": 0.0002,
+      "loss": 0.3104,
+      "step": 4420
+    },
+    {
+      "epoch": 0.5053925613955209,
+      "grad_norm": 0.47367504239082336,
+      "learning_rate": 0.0002,
+      "loss": 0.2945,
+      "step": 4440
+    },
+    {
+      "epoch": 0.5076691044648701,
+      "grad_norm": 0.45659711956977844,
+      "learning_rate": 0.0002,
+      "loss": 0.304,
+      "step": 4460
+    },
+    {
+      "epoch": 0.5099456475342193,
+      "grad_norm": 0.4942033290863037,
+      "learning_rate": 0.0002,
+      "loss": 0.2969,
+      "step": 4480
+    },
+    {
+      "epoch": 0.5122221906035684,
+      "grad_norm": 0.46578243374824524,
+      "learning_rate": 0.0002,
+      "loss": 0.2935,
+      "step": 4500
+    },
+    {
+      "epoch": 0.5144987336729177,
+      "grad_norm": 0.6523891687393188,
+      "learning_rate": 0.0002,
+      "loss": 0.2823,
+      "step": 4520
+    },
+    {
+      "epoch": 0.5167752767422669,
+      "grad_norm": 0.4787238538265228,
+      "learning_rate": 0.0002,
+      "loss": 0.3148,
+      "step": 4540
+    },
+    {
+      "epoch": 0.5190518198116161,
+      "grad_norm": 0.46825891733169556,
+      "learning_rate": 0.0002,
+      "loss": 0.3089,
+      "step": 4560
+    },
+    {
+      "epoch": 0.5213283628809653,
+      "grad_norm": 0.46605536341667175,
+      "learning_rate": 0.0002,
+      "loss": 0.3012,
+      "step": 4580
+    },
+    {
+      "epoch": 0.5236049059503144,
+      "grad_norm": 0.5826888680458069,
+      "learning_rate": 0.0002,
+      "loss": 0.3043,
+      "step": 4600
+    },
+    {
+      "epoch": 0.5258814490196636,
+      "grad_norm": 0.48641151189804077,
+      "learning_rate": 0.0002,
+      "loss": 0.2952,
+      "step": 4620
+    },
+    {
+      "epoch": 0.5281579920890128,
+      "grad_norm": 0.5396175384521484,
+      "learning_rate": 0.0002,
+      "loss": 0.2926,
+      "step": 4640
+    },
+    {
+      "epoch": 0.530434535158362,
+      "grad_norm": 0.5584241151809692,
+      "learning_rate": 0.0002,
+      "loss": 0.3048,
+      "step": 4660
+    },
+    {
+      "epoch": 0.5327110782277112,
+      "grad_norm": 0.5832685232162476,
+      "learning_rate": 0.0002,
+      "loss": 0.2948,
+      "step": 4680
+    },
+    {
+      "epoch": 0.5349876212970605,
+      "grad_norm": 0.4676337242126465,
+      "learning_rate": 0.0002,
+      "loss": 0.3043,
+      "step": 4700
+    },
+    {
+      "epoch": 0.5372641643664096,
+      "grad_norm": 0.4440428614616394,
+      "learning_rate": 0.0002,
+      "loss": 0.288,
+      "step": 4720
+    },
+    {
+      "epoch": 0.5395407074357588,
+      "grad_norm": 0.49934279918670654,
+      "learning_rate": 0.0002,
+      "loss": 0.2882,
+      "step": 4740
+    },
+    {
+      "epoch": 0.541817250505108,
+      "grad_norm": 0.5172054171562195,
+      "learning_rate": 0.0002,
+      "loss": 0.3225,
+      "step": 4760
+    },
+    {
+      "epoch": 0.5440937935744572,
+      "grad_norm": 0.4527619183063507,
+      "learning_rate": 0.0002,
+      "loss": 0.2869,
+      "step": 4780
+    },
+    {
+      "epoch": 0.5463703366438064,
+      "grad_norm": 0.548918604850769,
+      "learning_rate": 0.0002,
+      "loss": 0.3105,
+      "step": 4800
+    },
+    {
+      "epoch": 0.5486468797131556,
+      "grad_norm": 0.48801419138908386,
+      "learning_rate": 0.0002,
+      "loss": 0.2835,
+      "step": 4820
+    },
+    {
+      "epoch": 0.5509234227825047,
+      "grad_norm": 0.49810609221458435,
+      "learning_rate": 0.0002,
+      "loss": 0.3227,
+      "step": 4840
+    },
+    {
+      "epoch": 0.5531999658518539,
+      "grad_norm": 0.49763086438179016,
+      "learning_rate": 0.0002,
+      "loss": 0.2786,
+      "step": 4860
+    },
+    {
+      "epoch": 0.5554765089212031,
+      "grad_norm": 0.48815059661865234,
+      "learning_rate": 0.0002,
+      "loss": 0.2802,
+      "step": 4880
+    },
+    {
+      "epoch": 0.5577530519905524,
+      "grad_norm": 0.3571115732192993,
+      "learning_rate": 0.0002,
+      "loss": 0.2796,
+      "step": 4900
+    },
+    {
+      "epoch": 0.5600295950599016,
+      "grad_norm": 0.6448425650596619,
+      "learning_rate": 0.0002,
+      "loss": 0.2844,
+      "step": 4920
+    },
+    {
+      "epoch": 0.5623061381292508,
+      "grad_norm": 0.49660468101501465,
+      "learning_rate": 0.0002,
+      "loss": 0.2892,
+      "step": 4940
+    },
+    {
+      "epoch": 0.5645826811985999,
+      "grad_norm": 0.47702720761299133,
+      "learning_rate": 0.0002,
+      "loss": 0.3111,
+      "step": 4960
+    },
+    {
+      "epoch": 0.5668592242679491,
+      "grad_norm": 0.5281921029090881,
+      "learning_rate": 0.0002,
+      "loss": 0.2908,
+      "step": 4980
+    },
+    {
+      "epoch": 0.5691357673372983,
+      "grad_norm": 0.6427987813949585,
+      "learning_rate": 0.0002,
+      "loss": 0.2848,
+      "step": 5000
+    },
+    {
+      "epoch": 0.5714123104066475,
+      "grad_norm": 0.5437233448028564,
+      "learning_rate": 0.0002,
+      "loss": 0.3023,
+      "step": 5020
+    },
+    {
+      "epoch": 0.5736888534759967,
+      "grad_norm": 0.517444372177124,
+      "learning_rate": 0.0002,
+      "loss": 0.2876,
+      "step": 5040
+    },
+    {
+      "epoch": 0.5759653965453458,
+      "grad_norm": 0.5197298526763916,
+      "learning_rate": 0.0002,
+      "loss": 0.304,
+      "step": 5060
+    },
+    {
+      "epoch": 0.578241939614695,
+      "grad_norm": 0.3452152907848358,
+      "learning_rate": 0.0002,
+      "loss": 0.2794,
+      "step": 5080
+    },
+    {
+      "epoch": 0.5805184826840443,
+      "grad_norm": 0.5630306601524353,
+      "learning_rate": 0.0002,
+      "loss": 0.2979,
+      "step": 5100
+    },
+    {
+      "epoch": 0.5827950257533935,
+      "grad_norm": 0.5696737170219421,
+      "learning_rate": 0.0002,
+      "loss": 0.3035,
+      "step": 5120
+    },
+    {
+      "epoch": 0.5850715688227427,
+      "grad_norm": 0.5024551153182983,
+      "learning_rate": 0.0002,
+      "loss": 0.2717,
+      "step": 5140
+    },
+    {
+      "epoch": 0.5873481118920919,
+      "grad_norm": 0.4166383147239685,
+      "learning_rate": 0.0002,
+      "loss": 0.3065,
+      "step": 5160
+    },
+    {
+      "epoch": 0.589624654961441,
+      "grad_norm": 0.36780408024787903,
+      "learning_rate": 0.0002,
+      "loss": 0.2864,
+      "step": 5180
+    },
+    {
+      "epoch": 0.5919011980307902,
+      "grad_norm": 0.436526894569397,
+      "learning_rate": 0.0002,
+      "loss": 0.2764,
+      "step": 5200
+    },
+    {
+      "epoch": 0.5941777411001394,
+      "grad_norm": 0.43115249276161194,
+      "learning_rate": 0.0002,
+      "loss": 0.2791,
+      "step": 5220
+    },
+    {
+      "epoch": 0.5964542841694886,
+      "grad_norm": 0.359739750623703,
+      "learning_rate": 0.0002,
+      "loss": 0.3108,
+      "step": 5240
+    },
+    {
+      "epoch": 0.5987308272388379,
+      "grad_norm": 0.4555259644985199,
+      "learning_rate": 0.0002,
+      "loss": 0.2623,
+      "step": 5260
+    },
+    {
+      "epoch": 0.6010073703081871,
+      "grad_norm": 0.4587076008319855,
+      "learning_rate": 0.0002,
+      "loss": 0.293,
+      "step": 5280
+    },
+    {
+      "epoch": 0.6032839133775362,
+      "grad_norm": 0.5236973166465759,
+      "learning_rate": 0.0002,
+      "loss": 0.2888,
+      "step": 5300
+    },
+    {
+      "epoch": 0.6055604564468854,
+      "grad_norm": 0.46685513854026794,
+      "learning_rate": 0.0002,
+      "loss": 0.2731,
+      "step": 5320
+    },
+    {
+      "epoch": 0.6078369995162346,
+      "grad_norm": 0.5701884627342224,
+      "learning_rate": 0.0002,
+      "loss": 0.28,
+      "step": 5340
+    },
+    {
+      "epoch": 0.6101135425855838,
+      "grad_norm": 0.5002717971801758,
+      "learning_rate": 0.0002,
+      "loss": 0.2777,
+      "step": 5360
+    },
+    {
+      "epoch": 0.612390085654933,
+      "grad_norm": 0.5896885395050049,
+      "learning_rate": 0.0002,
+      "loss": 0.3048,
+      "step": 5380
+    },
+    {
+      "epoch": 0.6146666287242821,
+      "grad_norm": 0.49014943838119507,
+      "learning_rate": 0.0002,
+      "loss": 0.2642,
+      "step": 5400
+    },
+    {
+      "epoch": 0.6169431717936313,
+      "grad_norm": 0.5924846529960632,
+      "learning_rate": 0.0002,
+      "loss": 0.2943,
+      "step": 5420
+    },
+    {
+      "epoch": 0.6192197148629806,
+      "grad_norm": 0.49827829003334045,
+      "learning_rate": 0.0002,
+      "loss": 0.2879,
+      "step": 5440
+    },
+    {
+      "epoch": 0.6214962579323298,
+      "grad_norm": 0.45312178134918213,
+      "learning_rate": 0.0002,
+      "loss": 0.2728,
+      "step": 5460
+    },
+    {
+      "epoch": 0.623772801001679,
+      "grad_norm": 0.3595191538333893,
+      "learning_rate": 0.0002,
+      "loss": 0.2713,
+      "step": 5480
+    },
+    {
+      "epoch": 0.6260493440710282,
+      "grad_norm": 0.6547619104385376,
+      "learning_rate": 0.0002,
+      "loss": 0.2855,
+      "step": 5500
+    },
+    {
+      "epoch": 0.6283258871403773,
+      "grad_norm": 0.4659534692764282,
+      "learning_rate": 0.0002,
+      "loss": 0.2908,
+      "step": 5520
+    },
+    {
+      "epoch": 0.6306024302097265,
+      "grad_norm": 0.4027460813522339,
+      "learning_rate": 0.0002,
+      "loss": 0.2651,
+      "step": 5540
+    },
+    {
+      "epoch": 0.6328789732790757,
+      "grad_norm": 0.36129653453826904,
+      "learning_rate": 0.0002,
+      "loss": 0.2915,
+      "step": 5560
+    },
+    {
+      "epoch": 0.6351555163484249,
+      "grad_norm": 0.5963912010192871,
+      "learning_rate": 0.0002,
+      "loss": 0.2968,
+      "step": 5580
+    },
+    {
+      "epoch": 0.6374320594177741,
+      "grad_norm": 0.49669450521469116,
+      "learning_rate": 0.0002,
+      "loss": 0.2965,
+      "step": 5600
+    },
+    {
+      "epoch": 0.6397086024871234,
+      "grad_norm": 0.5784302353858948,
+      "learning_rate": 0.0002,
+      "loss": 0.2626,
+      "step": 5620
+    },
+    {
+      "epoch": 0.6419851455564725,
+      "grad_norm": 0.5651645660400391,
+      "learning_rate": 0.0002,
+      "loss": 0.2738,
+      "step": 5640
+    },
+    {
+      "epoch": 0.6442616886258217,
+      "grad_norm": 0.45475292205810547,
+      "learning_rate": 0.0002,
+      "loss": 0.2653,
+      "step": 5660
+    },
+    {
+      "epoch": 0.6465382316951709,
+      "grad_norm": 0.4691898822784424,
+      "learning_rate": 0.0002,
+      "loss": 0.2634,
+      "step": 5680
+    },
+    {
+      "epoch": 0.6488147747645201,
+      "grad_norm": 0.4604431092739105,
+      "learning_rate": 0.0002,
+      "loss": 0.2838,
+      "step": 5700
+    },
+    {
+      "epoch": 0.6510913178338693,
+      "grad_norm": 0.506804883480072,
+      "learning_rate": 0.0002,
+      "loss": 0.2657,
+      "step": 5720
+    },
+    {
+      "epoch": 0.6533678609032184,
+      "grad_norm": 0.5051881670951843,
+      "learning_rate": 0.0002,
+      "loss": 0.2976,
+      "step": 5740
+    },
+    {
+      "epoch": 0.6556444039725676,
+      "grad_norm": 0.4780672788619995,
+      "learning_rate": 0.0002,
+      "loss": 0.2828,
+      "step": 5760
+    },
+    {
+      "epoch": 0.6579209470419168,
+      "grad_norm": 0.4695095121860504,
+      "learning_rate": 0.0002,
+      "loss": 0.2685,
+      "step": 5780
+    },
+    {
+      "epoch": 0.660197490111266,
+      "grad_norm": 0.4259052276611328,
+      "learning_rate": 0.0002,
+      "loss": 0.2635,
+      "step": 5800
+    },
+    {
+      "epoch": 0.6624740331806153,
+      "grad_norm": 0.5684182643890381,
+      "learning_rate": 0.0002,
+      "loss": 0.2879,
+      "step": 5820
+    },
+    {
+      "epoch": 0.6647505762499645,
+      "grad_norm": 0.42193594574928284,
+      "learning_rate": 0.0002,
+      "loss": 0.2678,
+      "step": 5840
+    },
+    {
+      "epoch": 0.6670271193193136,
+      "grad_norm": 0.5095034241676331,
+      "learning_rate": 0.0002,
+      "loss": 0.2677,
+      "step": 5860
+    },
+    {
+      "epoch": 0.6693036623886628,
+      "grad_norm": 0.46626052260398865,
+      "learning_rate": 0.0002,
+      "loss": 0.2906,
+      "step": 5880
+    },
+    {
+      "epoch": 0.671580205458012,
+      "grad_norm": 0.5086765289306641,
+      "learning_rate": 0.0002,
+      "loss": 0.2775,
+      "step": 5900
+    },
+    {
+      "epoch": 0.6738567485273612,
+      "grad_norm": 0.44444966316223145,
+      "learning_rate": 0.0002,
+      "loss": 0.2764,
+      "step": 5920
+    },
+    {
+      "epoch": 0.6761332915967104,
+      "grad_norm": 0.4477381706237793,
+      "learning_rate": 0.0002,
+      "loss": 0.2729,
+      "step": 5940
+    },
+    {
+      "epoch": 0.6784098346660596,
+      "grad_norm": 0.46984028816223145,
+      "learning_rate": 0.0002,
+      "loss": 0.273,
+      "step": 5960
+    },
+    {
+      "epoch": 0.6806863777354087,
+      "grad_norm": 0.417084276676178,
+      "learning_rate": 0.0002,
+      "loss": 0.2744,
+      "step": 5980
+    },
+    {
+      "epoch": 0.682962920804758,
+      "grad_norm": 0.4144213795661926,
+      "learning_rate": 0.0002,
+      "loss": 0.2704,
+      "step": 6000
+    },
+    {
+      "epoch": 0.6852394638741072,
+      "grad_norm": 0.5844799876213074,
+      "learning_rate": 0.0002,
+      "loss": 0.2635,
+      "step": 6020
+    },
+    {
+      "epoch": 0.6875160069434564,
+      "grad_norm": 0.39512693881988525,
+      "learning_rate": 0.0002,
+      "loss": 0.2471,
+      "step": 6040
+    },
+    {
+      "epoch": 0.6897925500128056,
+      "grad_norm": 0.5299990773200989,
+      "learning_rate": 0.0002,
+      "loss": 0.2648,
+      "step": 6060
+    },
+    {
+      "epoch": 0.6920690930821547,
+      "grad_norm": 0.4980265498161316,
+      "learning_rate": 0.0002,
+      "loss": 0.2725,
+      "step": 6080
+    },
+    {
+      "epoch": 0.6943456361515039,
+      "grad_norm": 0.4003869891166687,
+      "learning_rate": 0.0002,
+      "loss": 0.2768,
+      "step": 6100
+    },
+    {
+      "epoch": 0.6966221792208531,
+      "grad_norm": 0.5103460550308228,
+      "learning_rate": 0.0002,
+      "loss": 0.2638,
+      "step": 6120
+    },
+    {
+      "epoch": 0.6988987222902023,
+      "grad_norm": 0.737101137638092,
+      "learning_rate": 0.0002,
+      "loss": 0.2779,
+      "step": 6140
+    },
+    {
+      "epoch": 0.7011752653595515,
+      "grad_norm": 0.4731826186180115,
+      "learning_rate": 0.0002,
+      "loss": 0.2691,
+      "step": 6160
+    },
+    {
+      "epoch": 0.7034518084289008,
+      "grad_norm": 0.5234053730964661,
+      "learning_rate": 0.0002,
+      "loss": 0.2739,
+      "step": 6180
+    },
+    {
+      "epoch": 0.7057283514982499,
+      "grad_norm": 0.5235525369644165,
+      "learning_rate": 0.0002,
+      "loss": 0.2754,
+      "step": 6200
+    },
+    {
+      "epoch": 0.7080048945675991,
+      "grad_norm": 0.4453619122505188,
+      "learning_rate": 0.0002,
+      "loss": 0.2833,
+      "step": 6220
+    },
+    {
+      "epoch": 0.7102814376369483,
+      "grad_norm": 0.4025666117668152,
+      "learning_rate": 0.0002,
+      "loss": 0.2713,
+      "step": 6240
+    },
+    {
+      "epoch": 0.7125579807062975,
+      "grad_norm": 0.35240331292152405,
+      "learning_rate": 0.0002,
+      "loss": 0.2786,
+      "step": 6260
+    },
+    {
+      "epoch": 0.7148345237756467,
+      "grad_norm": 0.4521905779838562,
+      "learning_rate": 0.0002,
+      "loss": 0.2639,
+      "step": 6280
+    },
+    {
+      "epoch": 0.7171110668449959,
+      "grad_norm": 0.5230519771575928,
+      "learning_rate": 0.0002,
+      "loss": 0.2517,
+      "step": 6300
+    },
+    {
+      "epoch": 0.719387609914345,
+      "grad_norm": 0.5415637493133545,
+      "learning_rate": 0.0002,
+      "loss": 0.2739,
+      "step": 6320
+    },
+    {
+      "epoch": 0.7216641529836942,
+      "grad_norm": 0.4067966341972351,
+      "learning_rate": 0.0002,
+      "loss": 0.2751,
+      "step": 6340
+    },
+    {
+      "epoch": 0.7239406960530435,
+      "grad_norm": 0.4670214354991913,
+      "learning_rate": 0.0002,
+      "loss": 0.2644,
+      "step": 6360
+    },
+    {
+      "epoch": 0.7262172391223927,
+      "grad_norm": 0.5316203236579895,
+      "learning_rate": 0.0002,
+      "loss": 0.2746,
+      "step": 6380
+    },
+    {
+      "epoch": 0.7284937821917419,
+      "grad_norm": 0.46312493085861206,
+      "learning_rate": 0.0002,
+      "loss": 0.2539,
+      "step": 6400
+    },
+    {
+      "epoch": 0.730770325261091,
+      "grad_norm": 0.465279221534729,
+      "learning_rate": 0.0002,
+      "loss": 0.2742,
+      "step": 6420
+    },
+    {
+      "epoch": 0.7330468683304402,
+      "grad_norm": 0.5096962451934814,
+      "learning_rate": 0.0002,
+      "loss": 0.2546,
+      "step": 6440
+    },
+    {
+      "epoch": 0.7353234113997894,
+      "grad_norm": 0.4525590240955353,
+      "learning_rate": 0.0002,
+      "loss": 0.2694,
+      "step": 6460
+    },
+    {
+      "epoch": 0.7375999544691386,
+      "grad_norm": 0.5033881664276123,
+      "learning_rate": 0.0002,
+      "loss": 0.2627,
+      "step": 6480
+    },
+    {
+      "epoch": 0.7398764975384878,
+      "grad_norm": 0.44053900241851807,
+      "learning_rate": 0.0002,
+      "loss": 0.258,
+      "step": 6500
+    },
+    {
+      "epoch": 0.742153040607837,
+      "grad_norm": 0.4677462875843048,
+      "learning_rate": 0.0002,
+      "loss": 0.2659,
+      "step": 6520
+    },
+    {
+      "epoch": 0.7444295836771861,
+      "grad_norm": 0.5687553882598877,
+      "learning_rate": 0.0002,
+      "loss": 0.271,
+      "step": 6540
+    },
+    {
+      "epoch": 0.7467061267465354,
+      "grad_norm": 0.4980468451976776,
+      "learning_rate": 0.0002,
+      "loss": 0.265,
+      "step": 6560
+    },
+    {
+      "epoch": 0.7489826698158846,
+      "grad_norm": 0.5155619382858276,
+      "learning_rate": 0.0002,
+      "loss": 0.2491,
+      "step": 6580
+    },
+    {
+      "epoch": 0.7512592128852338,
+      "grad_norm": 0.5364673733711243,
+      "learning_rate": 0.0002,
+      "loss": 0.2564,
+      "step": 6600
+    },
+    {
+      "epoch": 0.753535755954583,
+      "grad_norm": 0.421838641166687,
+      "learning_rate": 0.0002,
+      "loss": 0.267,
+      "step": 6620
+    },
+    {
+      "epoch": 0.7558122990239322,
+      "grad_norm": 0.46299833059310913,
+      "learning_rate": 0.0002,
+      "loss": 0.2461,
+      "step": 6640
+    },
+    {
+      "epoch": 0.7580888420932813,
+      "grad_norm": 0.3832832872867584,
+      "learning_rate": 0.0002,
+      "loss": 0.265,
+      "step": 6660
+    },
+    {
+      "epoch": 0.7603653851626305,
+      "grad_norm": 0.5560947060585022,
+      "learning_rate": 0.0002,
+      "loss": 0.253,
+      "step": 6680
+    },
+    {
+      "epoch": 0.7626419282319797,
+      "grad_norm": 0.4832628667354584,
+      "learning_rate": 0.0002,
+      "loss": 0.2515,
+      "step": 6700
+    },
+    {
+      "epoch": 0.764918471301329,
+      "grad_norm": 0.44354599714279175,
+      "learning_rate": 0.0002,
+      "loss": 0.2687,
+      "step": 6720
+    },
+    {
+      "epoch": 0.7671950143706782,
+      "grad_norm": 0.3746070861816406,
+      "learning_rate": 0.0002,
+      "loss": 0.2481,
+      "step": 6740
+    },
+    {
+      "epoch": 0.7694715574400273,
+      "grad_norm": 0.3048388659954071,
+      "learning_rate": 0.0002,
+      "loss": 0.269,
+      "step": 6760
+    },
+    {
+      "epoch": 0.7717481005093765,
+      "grad_norm": 0.46471843123435974,
+      "learning_rate": 0.0002,
+      "loss": 0.2642,
+      "step": 6780
+    },
+    {
+      "epoch": 0.7740246435787257,
+      "grad_norm": 0.44309428334236145,
+      "learning_rate": 0.0002,
+      "loss": 0.2565,
+      "step": 6800
+    },
+    {
+      "epoch": 0.7763011866480749,
+      "grad_norm": 0.4174291789531708,
+      "learning_rate": 0.0002,
+      "loss": 0.262,
+      "step": 6820
+    },
+    {
+      "epoch": 0.7785777297174241,
+      "grad_norm": 0.42592549324035645,
+      "learning_rate": 0.0002,
+      "loss": 0.2608,
+      "step": 6840
+    },
+    {
+      "epoch": 0.7808542727867733,
+      "grad_norm": 0.4378054141998291,
+      "learning_rate": 0.0002,
+      "loss": 0.2765,
+      "step": 6860
+    },
+    {
+      "epoch": 0.7831308158561224,
+      "grad_norm": 0.4560708701610565,
+      "learning_rate": 0.0002,
+      "loss": 0.2381,
+      "step": 6880
+    },
+    {
+      "epoch": 0.7854073589254716,
+      "grad_norm": 0.4595545828342438,
+      "learning_rate": 0.0002,
+      "loss": 0.2561,
+      "step": 6900
+    },
+    {
+      "epoch": 0.7876839019948209,
+      "grad_norm": 0.45213592052459717,
+      "learning_rate": 0.0002,
+      "loss": 0.2645,
+      "step": 6920
+    },
+    {
+      "epoch": 0.7899604450641701,
+      "grad_norm": 0.4857342839241028,
+      "learning_rate": 0.0002,
+      "loss": 0.2687,
+      "step": 6940
+    },
+    {
+      "epoch": 0.7922369881335193,
+      "grad_norm": 0.4939437508583069,
+      "learning_rate": 0.0002,
+      "loss": 0.2642,
+      "step": 6960
+    },
+    {
+      "epoch": 0.7945135312028685,
+      "grad_norm": 0.46244382858276367,
+      "learning_rate": 0.0002,
+      "loss": 0.2536,
+      "step": 6980
+    },
+    {
+      "epoch": 0.7967900742722176,
+      "grad_norm": 0.5876993536949158,
+      "learning_rate": 0.0002,
+      "loss": 0.2492,
+      "step": 7000
+    },
+    {
+      "epoch": 0.7990666173415668,
+      "grad_norm": 0.5170072913169861,
+      "learning_rate": 0.0002,
+      "loss": 0.2548,
+      "step": 7020
+    },
+    {
+      "epoch": 0.801343160410916,
+      "grad_norm": 0.394380658864975,
+      "learning_rate": 0.0002,
+      "loss": 0.2524,
+      "step": 7040
+    },
+    {
+      "epoch": 0.8036197034802652,
+      "grad_norm": 0.4716455340385437,
+      "learning_rate": 0.0002,
+      "loss": 0.2573,
+      "step": 7060
+    },
+    {
+      "epoch": 0.8058962465496144,
+      "grad_norm": 0.34525179862976074,
+      "learning_rate": 0.0002,
+      "loss": 0.246,
+      "step": 7080
+    },
+    {
+      "epoch": 0.8081727896189635,
+      "grad_norm": 0.5030418038368225,
+      "learning_rate": 0.0002,
+      "loss": 0.2596,
+      "step": 7100
+    },
+    {
+      "epoch": 0.8104493326883128,
+      "grad_norm": 0.5586132407188416,
+      "learning_rate": 0.0002,
+      "loss": 0.2568,
+      "step": 7120
+    },
+    {
+      "epoch": 0.812725875757662,
+      "grad_norm": 0.47025129199028015,
+      "learning_rate": 0.0002,
+      "loss": 0.265,
+      "step": 7140
+    },
+    {
+      "epoch": 0.8150024188270112,
+      "grad_norm": 0.5654832720756531,
+      "learning_rate": 0.0002,
+      "loss": 0.2468,
+      "step": 7160
+    },
+    {
+      "epoch": 0.8172789618963604,
+      "grad_norm": 0.4701017141342163,
+      "learning_rate": 0.0002,
+      "loss": 0.2538,
+      "step": 7180
+    },
+    {
+      "epoch": 0.8195555049657096,
+      "grad_norm": 0.47270438075065613,
+      "learning_rate": 0.0002,
+      "loss": 0.2529,
+      "step": 7200
+    },
+    {
+      "epoch": 0.8218320480350587,
+      "grad_norm": 0.39433714747428894,
+      "learning_rate": 0.0002,
+      "loss": 0.2445,
+      "step": 7220
+    },
+    {
+      "epoch": 0.8241085911044079,
+      "grad_norm": 0.4521467685699463,
+      "learning_rate": 0.0002,
+      "loss": 0.2556,
+      "step": 7240
+    },
+    {
+      "epoch": 0.8263851341737571,
+      "grad_norm": 0.28483667969703674,
+      "learning_rate": 0.0002,
+      "loss": 0.2451,
+      "step": 7260
+    },
+    {
+      "epoch": 0.8286616772431064,
+      "grad_norm": 0.4298310875892639,
+      "learning_rate": 0.0002,
+      "loss": 0.2599,
+      "step": 7280
+    },
+    {
+      "epoch": 0.8309382203124556,
+      "grad_norm": 0.39677906036376953,
+      "learning_rate": 0.0002,
+      "loss": 0.2539,
+      "step": 7300
+    },
+    {
+      "epoch": 0.8332147633818048,
+      "grad_norm": 0.5800175666809082,
+      "learning_rate": 0.0002,
+      "loss": 0.2463,
+      "step": 7320
+    },
+    {
+      "epoch": 0.8354913064511539,
+      "grad_norm": 0.42742472887039185,
+      "learning_rate": 0.0002,
+      "loss": 0.2593,
+      "step": 7340
+    },
+    {
+      "epoch": 0.8377678495205031,
+      "grad_norm": 0.5521807670593262,
+      "learning_rate": 0.0002,
+      "loss": 0.253,
+      "step": 7360
+    },
+    {
+      "epoch": 0.8400443925898523,
+      "grad_norm": 0.5068047046661377,
+      "learning_rate": 0.0002,
+      "loss": 0.2503,
+      "step": 7380
+    },
+    {
+      "epoch": 0.8423209356592015,
+      "grad_norm": 0.4325120151042938,
+      "learning_rate": 0.0002,
+      "loss": 0.2466,
+      "step": 7400
+    },
+    {
+      "epoch": 0.8445974787285507,
+      "grad_norm": 0.5130394101142883,
+      "learning_rate": 0.0002,
+      "loss": 0.2521,
+      "step": 7420
+    },
+    {
+      "epoch": 0.8468740217978998,
+      "grad_norm": 0.5091120600700378,
+      "learning_rate": 0.0002,
+      "loss": 0.2429,
+      "step": 7440
+    },
+    {
+      "epoch": 0.849150564867249,
+      "grad_norm": 0.4635036289691925,
+      "learning_rate": 0.0002,
+      "loss": 0.235,
+      "step": 7460
+    },
+    {
+      "epoch": 0.8514271079365983,
+      "grad_norm": 0.3827108144760132,
+      "learning_rate": 0.0002,
+      "loss": 0.2487,
+      "step": 7480
+    },
+    {
+      "epoch": 0.8537036510059475,
+      "grad_norm": 0.3880899250507355,
+      "learning_rate": 0.0002,
+      "loss": 0.2469,
+      "step": 7500
+    },
+    {
+      "epoch": 0.8559801940752967,
+      "grad_norm": 0.408933162689209,
+      "learning_rate": 0.0002,
+      "loss": 0.2499,
+      "step": 7520
+    },
+    {
+      "epoch": 0.8582567371446459,
+      "grad_norm": 0.5049706101417542,
+      "learning_rate": 0.0002,
+      "loss": 0.2418,
+      "step": 7540
+    },
+    {
+      "epoch": 0.860533280213995,
+      "grad_norm": 0.43551701307296753,
+      "learning_rate": 0.0002,
+      "loss": 0.2478,
+      "step": 7560
+    },
+    {
+      "epoch": 0.8628098232833442,
+      "grad_norm": 0.5024411678314209,
+      "learning_rate": 0.0002,
+      "loss": 0.2538,
+      "step": 7580
+    },
+    {
+      "epoch": 0.8650863663526934,
+      "grad_norm": 0.36361223459243774,
+      "learning_rate": 0.0002,
+      "loss": 0.2536,
+      "step": 7600
+    },
+    {
+      "epoch": 0.8673629094220426,
+      "grad_norm": 0.4526277482509613,
+      "learning_rate": 0.0002,
+      "loss": 0.242,
+      "step": 7620
+    },
+    {
+      "epoch": 0.8696394524913919,
+      "grad_norm": 0.5677676200866699,
+      "learning_rate": 0.0002,
+      "loss": 0.2572,
+      "step": 7640
+    },
+    {
+      "epoch": 0.8719159955607411,
+      "grad_norm": 0.4915711283683777,
+      "learning_rate": 0.0002,
+      "loss": 0.2562,
+      "step": 7660
+    },
+    {
+      "epoch": 0.8741925386300902,
+      "grad_norm": 0.36850452423095703,
+      "learning_rate": 0.0002,
+      "loss": 0.2523,
+      "step": 7680
+    },
+    {
+      "epoch": 0.8764690816994394,
+      "grad_norm": 0.38313761353492737,
+      "learning_rate": 0.0002,
+      "loss": 0.2596,
+      "step": 7700
+    },
+    {
+      "epoch": 0.8787456247687886,
+      "grad_norm": 0.5384640097618103,
+      "learning_rate": 0.0002,
+      "loss": 0.2455,
+      "step": 7720
+    },
+    {
+      "epoch": 0.8810221678381378,
+      "grad_norm": 0.5308900475502014,
+      "learning_rate": 0.0002,
+      "loss": 0.2439,
+      "step": 7740
+    },
+    {
+      "epoch": 0.883298710907487,
+      "grad_norm": 0.5488154292106628,
+      "learning_rate": 0.0002,
+      "loss": 0.2428,
+      "step": 7760
+    },
+    {
+      "epoch": 0.8855752539768362,
+      "grad_norm": 0.5271242260932922,
+      "learning_rate": 0.0002,
+      "loss": 0.2372,
+      "step": 7780
+    },
+    {
+      "epoch": 0.8878517970461853,
+      "grad_norm": 0.46171802282333374,
+      "learning_rate": 0.0002,
+      "loss": 0.2506,
+      "step": 7800
+    },
+    {
+      "epoch": 0.8901283401155345,
+      "grad_norm": 0.45436665415763855,
+      "learning_rate": 0.0002,
+      "loss": 0.2414,
+      "step": 7820
+    },
+    {
+      "epoch": 0.8924048831848838,
+      "grad_norm": 0.4920847415924072,
+      "learning_rate": 0.0002,
+      "loss": 0.2669,
+      "step": 7840
+    },
+    {
+      "epoch": 0.894681426254233,
+      "grad_norm": 0.5913518071174622,
+      "learning_rate": 0.0002,
+      "loss": 0.2552,
+      "step": 7860
+    },
+    {
+      "epoch": 0.8969579693235822,
+      "grad_norm": 0.6011972427368164,
+      "learning_rate": 0.0002,
+      "loss": 0.2533,
+      "step": 7880
+    },
+    {
+      "epoch": 0.8992345123929313,
+      "grad_norm": 0.4650927186012268,
+      "learning_rate": 0.0002,
+      "loss": 0.2448,
+      "step": 7900
+    },
+    {
+      "epoch": 0.9015110554622805,
+      "grad_norm": 0.5828790664672852,
+      "learning_rate": 0.0002,
+      "loss": 0.2381,
+      "step": 7920
+    },
+    {
+      "epoch": 0.9037875985316297,
+      "grad_norm": 0.5178338885307312,
+      "learning_rate": 0.0002,
+      "loss": 0.2619,
+      "step": 7940
+    },
+    {
+      "epoch": 0.9060641416009789,
+      "grad_norm": 0.5147708058357239,
+      "learning_rate": 0.0002,
+      "loss": 0.258,
+      "step": 7960
+    },
+    {
+      "epoch": 0.9083406846703281,
+      "grad_norm": 0.45790836215019226,
+      "learning_rate": 0.0002,
+      "loss": 0.2474,
+      "step": 7980
+    },
+    {
+      "epoch": 0.9106172277396773,
+      "grad_norm": 0.3837074935436249,
+      "learning_rate": 0.0002,
+      "loss": 0.2356,
+      "step": 8000
+    },
+    {
+      "epoch": 0.9128937708090265,
+      "grad_norm": 0.4466090500354767,
+      "learning_rate": 0.0002,
+      "loss": 0.237,
+      "step": 8020
+    },
+    {
+      "epoch": 0.9151703138783757,
+      "grad_norm": 0.5893344283103943,
+      "learning_rate": 0.0002,
+      "loss": 0.2399,
+      "step": 8040
+    },
+    {
+      "epoch": 0.9174468569477249,
+      "grad_norm": 0.49547362327575684,
+      "learning_rate": 0.0002,
+      "loss": 0.2526,
+      "step": 8060
+    },
+    {
+      "epoch": 0.9197234000170741,
+      "grad_norm": 0.47068551182746887,
+      "learning_rate": 0.0002,
+      "loss": 0.2631,
+      "step": 8080
+    },
+    {
+      "epoch": 0.9219999430864233,
+      "grad_norm": 0.3512951135635376,
+      "learning_rate": 0.0002,
+      "loss": 0.2395,
+      "step": 8100
+    },
+    {
+      "epoch": 0.9242764861557725,
+      "grad_norm": 0.3996793031692505,
+      "learning_rate": 0.0002,
+      "loss": 0.2424,
+      "step": 8120
+    },
+    {
+      "epoch": 0.9265530292251216,
+      "grad_norm": 0.5782022476196289,
+      "learning_rate": 0.0002,
+      "loss": 0.2549,
+      "step": 8140
+    },
+    {
+      "epoch": 0.9288295722944708,
+      "grad_norm": 0.450860857963562,
+      "learning_rate": 0.0002,
+      "loss": 0.2465,
+      "step": 8160
+    },
+    {
+      "epoch": 0.93110611536382,
+      "grad_norm": 0.4679816663265228,
+      "learning_rate": 0.0002,
+      "loss": 0.2326,
+      "step": 8180
+    },
+    {
+      "epoch": 0.9333826584331693,
+      "grad_norm": 0.5497337579727173,
+      "learning_rate": 0.0002,
+      "loss": 0.2457,
+      "step": 8200
+    },
+    {
+      "epoch": 0.9356592015025185,
+      "grad_norm": 0.3775748312473297,
+      "learning_rate": 0.0002,
+      "loss": 0.2331,
+      "step": 8220
+    },
+    {
+      "epoch": 0.9379357445718676,
+      "grad_norm": 0.5428327918052673,
+      "learning_rate": 0.0002,
+      "loss": 0.2399,
+      "step": 8240
+    },
+    {
+      "epoch": 0.9402122876412168,
+      "grad_norm": 0.4089830219745636,
+      "learning_rate": 0.0002,
+      "loss": 0.246,
+      "step": 8260
+    },
+    {
+      "epoch": 0.942488830710566,
+      "grad_norm": 0.5781340003013611,
+      "learning_rate": 0.0002,
+      "loss": 0.2451,
+      "step": 8280
+    },
+    {
+      "epoch": 0.9447653737799152,
+      "grad_norm": 0.5869989395141602,
+      "learning_rate": 0.0002,
+      "loss": 0.2541,
+      "step": 8300
+    },
+    {
+      "epoch": 0.9470419168492644,
+      "grad_norm": 0.47708019614219666,
+      "learning_rate": 0.0002,
+      "loss": 0.2559,
+      "step": 8320
+    },
+    {
+      "epoch": 0.9493184599186136,
+      "grad_norm": 0.5445525050163269,
+      "learning_rate": 0.0002,
+      "loss": 0.2466,
+      "step": 8340
+    },
+    {
+      "epoch": 0.9515950029879627,
+      "grad_norm": 0.480214387178421,
+      "learning_rate": 0.0002,
+      "loss": 0.236,
+      "step": 8360
+    },
+    {
+      "epoch": 0.953871546057312,
+      "grad_norm": 0.5392053127288818,
+      "learning_rate": 0.0002,
+      "loss": 0.2383,
+      "step": 8380
+    },
+    {
+      "epoch": 0.9561480891266612,
+      "grad_norm": 0.4515858292579651,
+      "learning_rate": 0.0002,
+      "loss": 0.238,
+      "step": 8400
+    },
+    {
+      "epoch": 0.9584246321960104,
+      "grad_norm": 0.5461826324462891,
+      "learning_rate": 0.0002,
+      "loss": 0.2442,
+      "step": 8420
+    },
+    {
+      "epoch": 0.9607011752653596,
+      "grad_norm": 0.44309332966804504,
+      "learning_rate": 0.0002,
+      "loss": 0.2622,
+      "step": 8440
+    },
+    {
+      "epoch": 0.9629777183347088,
+      "grad_norm": 0.5409505367279053,
+      "learning_rate": 0.0002,
+      "loss": 0.2303,
+      "step": 8460
+    },
+    {
+      "epoch": 0.9652542614040579,
+      "grad_norm": 0.3868342638015747,
+      "learning_rate": 0.0002,
+      "loss": 0.2624,
+      "step": 8480
+    },
+    {
+      "epoch": 0.9675308044734071,
+      "grad_norm": 0.38888975977897644,
+      "learning_rate": 0.0002,
+      "loss": 0.246,
+      "step": 8500
+    },
+    {
+      "epoch": 0.9698073475427563,
+      "grad_norm": 0.38946032524108887,
+      "learning_rate": 0.0002,
+      "loss": 0.2503,
+      "step": 8520
+    },
+    {
+      "epoch": 0.9720838906121055,
+      "grad_norm": 0.42425817251205444,
+      "learning_rate": 0.0002,
+      "loss": 0.2556,
+      "step": 8540
+    },
+    {
+      "epoch": 0.9743604336814548,
+      "grad_norm": 0.41515296697616577,
+      "learning_rate": 0.0002,
+      "loss": 0.2437,
+      "step": 8560
+    },
+    {
+      "epoch": 0.9766369767508039,
+      "grad_norm": 0.4085826575756073,
+      "learning_rate": 0.0002,
+      "loss": 0.2293,
+      "step": 8580
+    },
+    {
+      "epoch": 0.9789135198201531,
+      "grad_norm": 0.3404542803764343,
+      "learning_rate": 0.0002,
+      "loss": 0.242,
+      "step": 8600
+    },
+    {
+      "epoch": 0.9811900628895023,
+      "grad_norm": 0.43266579508781433,
+      "learning_rate": 0.0002,
+      "loss": 0.2513,
+      "step": 8620
+    },
+    {
+      "epoch": 0.9834666059588515,
+      "grad_norm": 0.42724549770355225,
+      "learning_rate": 0.0002,
+      "loss": 0.2384,
+      "step": 8640
+    },
+    {
+      "epoch": 0.9857431490282007,
+      "grad_norm": 0.5089221596717834,
+      "learning_rate": 0.0002,
+      "loss": 0.2409,
+      "step": 8660
+    },
+    {
+      "epoch": 0.9880196920975499,
+      "grad_norm": 0.519223690032959,
+      "learning_rate": 0.0002,
+      "loss": 0.2353,
+      "step": 8680
+    },
+    {
+      "epoch": 0.990296235166899,
+      "grad_norm": 0.5701056122779846,
+      "learning_rate": 0.0002,
+      "loss": 0.2486,
+      "step": 8700
+    },
+    {
+      "epoch": 0.9925727782362482,
+      "grad_norm": 0.4519595503807068,
+      "learning_rate": 0.0002,
+      "loss": 0.2374,
+      "step": 8720
+    },
+    {
+      "epoch": 0.9948493213055974,
+      "grad_norm": 0.4883946180343628,
+      "learning_rate": 0.0002,
+      "loss": 0.2441,
+      "step": 8740
+    },
+    {
+      "epoch": 0.9971258643749467,
+      "grad_norm": 0.6918900012969971,
+      "learning_rate": 0.0002,
+      "loss": 0.2403,
+      "step": 8760
+    },
+    {
+      "epoch": 0.9994024074442959,
+      "grad_norm": 0.4810091555118561,
+      "learning_rate": 0.0002,
+      "loss": 0.2334,
+      "step": 8780
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.30941203236579895,
+      "eval_runtime": 408.7196,
+      "eval_samples_per_second": 7.083,
+      "eval_steps_per_second": 0.886,
+      "step": 8786
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 13000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 77,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.923169198364426e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e81583ff738f437f9b8ba61f8cd63306401c4b51ce22ce038811cf0a2a0f493e
+size 5816