diff --git "a/checkpoint-2916/trainer_state.json" "b/checkpoint-2916/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2916/trainer_state.json" @@ -0,0 +1,21345 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9724770642201834, + "eval_steps": 98, + "global_step": 2916, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010193679918450561, + "grad_norm": 24.08992576599121, + "learning_rate": 0.0, + "loss": 8.5798, + "step": 1 + }, + { + "epoch": 0.0020387359836901123, + "grad_norm": 22.663619995117188, + "learning_rate": 2.2624434389140275e-07, + "loss": 8.4222, + "step": 2 + }, + { + "epoch": 0.0030581039755351682, + "grad_norm": 25.377544403076172, + "learning_rate": 4.524886877828055e-07, + "loss": 9.392, + "step": 3 + }, + { + "epoch": 0.004077471967380225, + "grad_norm": 22.122257232666016, + "learning_rate": 6.787330316742082e-07, + "loss": 8.4193, + "step": 4 + }, + { + "epoch": 0.0050968399592252805, + "grad_norm": null, + "learning_rate": 9.04977375565611e-07, + "loss": 8.808, + "step": 5 + }, + { + "epoch": 0.0061162079510703364, + "grad_norm": 22.832090377807617, + "learning_rate": 9.04977375565611e-07, + "loss": 10.5002, + "step": 6 + }, + { + "epoch": 0.007135575942915392, + "grad_norm": 20.933177947998047, + "learning_rate": 1.1312217194570136e-06, + "loss": 9.5956, + "step": 7 + }, + { + "epoch": 0.00815494393476045, + "grad_norm": 20.44132423400879, + "learning_rate": 1.3574660633484164e-06, + "loss": 8.5526, + "step": 8 + }, + { + "epoch": 0.009174311926605505, + "grad_norm": 22.630067825317383, + "learning_rate": 1.583710407239819e-06, + "loss": 9.8255, + "step": 9 + }, + { + "epoch": 0.010193679918450561, + "grad_norm": 22.625064849853516, + "learning_rate": 1.809954751131222e-06, + "loss": 9.2067, + "step": 10 + }, + { + "epoch": 0.011213047910295617, + "grad_norm": 
19.427106857299805, + "learning_rate": 2.0361990950226245e-06, + "loss": 8.667, + "step": 11 + }, + { + "epoch": 0.012232415902140673, + "grad_norm": 24.354387283325195, + "learning_rate": 2.2624434389140273e-06, + "loss": 10.396, + "step": 12 + }, + { + "epoch": 0.013251783893985729, + "grad_norm": 21.99860191345215, + "learning_rate": 2.48868778280543e-06, + "loss": 8.9036, + "step": 13 + }, + { + "epoch": 0.014271151885830785, + "grad_norm": 21.275592803955078, + "learning_rate": 2.7149321266968327e-06, + "loss": 7.9307, + "step": 14 + }, + { + "epoch": 0.01529051987767584, + "grad_norm": 20.020435333251953, + "learning_rate": 2.9411764705882355e-06, + "loss": 9.0655, + "step": 15 + }, + { + "epoch": 0.0163098878695209, + "grad_norm": 20.713603973388672, + "learning_rate": 3.167420814479638e-06, + "loss": 8.6598, + "step": 16 + }, + { + "epoch": 0.017329255861365953, + "grad_norm": 22.857194900512695, + "learning_rate": 3.3936651583710405e-06, + "loss": 9.7562, + "step": 17 + }, + { + "epoch": 0.01834862385321101, + "grad_norm": 19.031551361083984, + "learning_rate": 3.619909502262444e-06, + "loss": 9.2297, + "step": 18 + }, + { + "epoch": 0.019367991845056064, + "grad_norm": 19.30624008178711, + "learning_rate": 3.846153846153847e-06, + "loss": 8.6939, + "step": 19 + }, + { + "epoch": 0.020387359836901122, + "grad_norm": 17.09296417236328, + "learning_rate": 4.072398190045249e-06, + "loss": 8.1317, + "step": 20 + }, + { + "epoch": 0.021406727828746176, + "grad_norm": 19.199600219726562, + "learning_rate": 4.298642533936651e-06, + "loss": 8.3585, + "step": 21 + }, + { + "epoch": 0.022426095820591234, + "grad_norm": 18.50484275817871, + "learning_rate": 4.5248868778280546e-06, + "loss": 8.4533, + "step": 22 + }, + { + "epoch": 0.023445463812436288, + "grad_norm": 19.170618057250977, + "learning_rate": 4.751131221719457e-06, + "loss": 9.3014, + "step": 23 + }, + { + "epoch": 0.024464831804281346, + "grad_norm": 17.692346572875977, + "learning_rate": 
4.97737556561086e-06, + "loss": 8.18, + "step": 24 + }, + { + "epoch": 0.0254841997961264, + "grad_norm": 18.87356185913086, + "learning_rate": 5.203619909502263e-06, + "loss": 7.8485, + "step": 25 + }, + { + "epoch": 0.026503567787971458, + "grad_norm": 16.432092666625977, + "learning_rate": 5.4298642533936655e-06, + "loss": 8.9669, + "step": 26 + }, + { + "epoch": 0.027522935779816515, + "grad_norm": 17.064382553100586, + "learning_rate": 5.656108597285068e-06, + "loss": 10.1397, + "step": 27 + }, + { + "epoch": 0.02854230377166157, + "grad_norm": 17.96854591369629, + "learning_rate": 5.882352941176471e-06, + "loss": 10.5216, + "step": 28 + }, + { + "epoch": 0.029561671763506627, + "grad_norm": 16.348352432250977, + "learning_rate": 6.108597285067873e-06, + "loss": 7.4782, + "step": 29 + }, + { + "epoch": 0.03058103975535168, + "grad_norm": 15.834653854370117, + "learning_rate": 6.334841628959276e-06, + "loss": 8.0439, + "step": 30 + }, + { + "epoch": 0.03160040774719674, + "grad_norm": 15.115158081054688, + "learning_rate": 6.5610859728506795e-06, + "loss": 7.1911, + "step": 31 + }, + { + "epoch": 0.0326197757390418, + "grad_norm": 17.570573806762695, + "learning_rate": 6.787330316742081e-06, + "loss": 8.5735, + "step": 32 + }, + { + "epoch": 0.03363914373088685, + "grad_norm": 15.224530220031738, + "learning_rate": 7.013574660633485e-06, + "loss": 8.3855, + "step": 33 + }, + { + "epoch": 0.034658511722731905, + "grad_norm": 16.47282600402832, + "learning_rate": 7.239819004524888e-06, + "loss": 8.4305, + "step": 34 + }, + { + "epoch": 0.03567787971457696, + "grad_norm": 16.739215850830078, + "learning_rate": 7.46606334841629e-06, + "loss": 9.4608, + "step": 35 + }, + { + "epoch": 0.03669724770642202, + "grad_norm": 13.741637229919434, + "learning_rate": 7.692307692307694e-06, + "loss": 8.1572, + "step": 36 + }, + { + "epoch": 0.03771661569826707, + "grad_norm": 14.70285701751709, + "learning_rate": 7.918552036199094e-06, + "loss": 8.1456, + "step": 37 + }, + { + 
"epoch": 0.03873598369011213, + "grad_norm": 11.470185279846191, + "learning_rate": 8.144796380090498e-06, + "loss": 7.3833, + "step": 38 + }, + { + "epoch": 0.039755351681957186, + "grad_norm": 13.029812812805176, + "learning_rate": 8.3710407239819e-06, + "loss": 8.8539, + "step": 39 + }, + { + "epoch": 0.040774719673802244, + "grad_norm": 12.46716594696045, + "learning_rate": 8.597285067873303e-06, + "loss": 8.9349, + "step": 40 + }, + { + "epoch": 0.0417940876656473, + "grad_norm": 12.875706672668457, + "learning_rate": 8.823529411764707e-06, + "loss": 8.1803, + "step": 41 + }, + { + "epoch": 0.04281345565749235, + "grad_norm": 12.646770477294922, + "learning_rate": 9.049773755656109e-06, + "loss": 6.7532, + "step": 42 + }, + { + "epoch": 0.04383282364933741, + "grad_norm": 13.792744636535645, + "learning_rate": 9.276018099547511e-06, + "loss": 7.127, + "step": 43 + }, + { + "epoch": 0.04485219164118247, + "grad_norm": 11.656695365905762, + "learning_rate": 9.502262443438914e-06, + "loss": 7.5565, + "step": 44 + }, + { + "epoch": 0.045871559633027525, + "grad_norm": 11.562976837158203, + "learning_rate": 9.728506787330318e-06, + "loss": 7.6078, + "step": 45 + }, + { + "epoch": 0.046890927624872576, + "grad_norm": 11.516715049743652, + "learning_rate": 9.95475113122172e-06, + "loss": 8.4153, + "step": 46 + }, + { + "epoch": 0.047910295616717634, + "grad_norm": 11.569866180419922, + "learning_rate": 1.0180995475113122e-05, + "loss": 7.1062, + "step": 47 + }, + { + "epoch": 0.04892966360856269, + "grad_norm": 11.088666915893555, + "learning_rate": 1.0407239819004526e-05, + "loss": 6.8482, + "step": 48 + }, + { + "epoch": 0.04994903160040775, + "grad_norm": 11.396224021911621, + "learning_rate": 1.0633484162895929e-05, + "loss": 7.2262, + "step": 49 + }, + { + "epoch": 0.0509683995922528, + "grad_norm": 11.868388175964355, + "learning_rate": 1.0859728506787331e-05, + "loss": 8.0207, + "step": 50 + }, + { + "epoch": 0.05198776758409786, + "grad_norm": 
10.022957801818848, + "learning_rate": 1.1085972850678733e-05, + "loss": 7.6895, + "step": 51 + }, + { + "epoch": 0.053007135575942915, + "grad_norm": 11.007475852966309, + "learning_rate": 1.1312217194570136e-05, + "loss": 7.6185, + "step": 52 + }, + { + "epoch": 0.05402650356778797, + "grad_norm": 10.026458740234375, + "learning_rate": 1.153846153846154e-05, + "loss": 8.8153, + "step": 53 + }, + { + "epoch": 0.05504587155963303, + "grad_norm": 10.358866691589355, + "learning_rate": 1.1764705882352942e-05, + "loss": 7.7666, + "step": 54 + }, + { + "epoch": 0.05606523955147808, + "grad_norm": 10.722491264343262, + "learning_rate": 1.1990950226244344e-05, + "loss": 7.1431, + "step": 55 + }, + { + "epoch": 0.05708460754332314, + "grad_norm": 10.623186111450195, + "learning_rate": 1.2217194570135746e-05, + "loss": 6.3969, + "step": 56 + }, + { + "epoch": 0.0581039755351682, + "grad_norm": 10.13591480255127, + "learning_rate": 1.244343891402715e-05, + "loss": 8.1643, + "step": 57 + }, + { + "epoch": 0.059123343527013254, + "grad_norm": 9.476139068603516, + "learning_rate": 1.2669683257918553e-05, + "loss": 7.1228, + "step": 58 + }, + { + "epoch": 0.060142711518858305, + "grad_norm": 8.608465194702148, + "learning_rate": 1.2895927601809957e-05, + "loss": 6.9228, + "step": 59 + }, + { + "epoch": 0.06116207951070336, + "grad_norm": 10.69497299194336, + "learning_rate": 1.3122171945701359e-05, + "loss": 10.2251, + "step": 60 + }, + { + "epoch": 0.06218144750254842, + "grad_norm": 9.309306144714355, + "learning_rate": 1.3348416289592761e-05, + "loss": 7.1105, + "step": 61 + }, + { + "epoch": 0.06320081549439348, + "grad_norm": 9.268863677978516, + "learning_rate": 1.3574660633484162e-05, + "loss": 7.1156, + "step": 62 + }, + { + "epoch": 0.06422018348623854, + "grad_norm": 10.207130432128906, + "learning_rate": 1.3800904977375568e-05, + "loss": 6.5522, + "step": 63 + }, + { + "epoch": 0.0652395514780836, + "grad_norm": 9.29359245300293, + "learning_rate": 
1.402714932126697e-05, + "loss": 6.734, + "step": 64 + }, + { + "epoch": 0.06625891946992865, + "grad_norm": 8.38429069519043, + "learning_rate": 1.425339366515837e-05, + "loss": 8.1303, + "step": 65 + }, + { + "epoch": 0.0672782874617737, + "grad_norm": 9.689257621765137, + "learning_rate": 1.4479638009049776e-05, + "loss": 7.298, + "step": 66 + }, + { + "epoch": 0.06829765545361875, + "grad_norm": 8.886714935302734, + "learning_rate": 1.4705882352941177e-05, + "loss": 6.1227, + "step": 67 + }, + { + "epoch": 0.06931702344546381, + "grad_norm": 9.28791332244873, + "learning_rate": 1.493212669683258e-05, + "loss": 6.7938, + "step": 68 + }, + { + "epoch": 0.07033639143730887, + "grad_norm": 9.196669578552246, + "learning_rate": 1.5158371040723981e-05, + "loss": 6.4562, + "step": 69 + }, + { + "epoch": 0.07135575942915393, + "grad_norm": 10.716215133666992, + "learning_rate": 1.5384615384615387e-05, + "loss": 8.0389, + "step": 70 + }, + { + "epoch": 0.07237512742099898, + "grad_norm": 9.852572441101074, + "learning_rate": 1.5610859728506788e-05, + "loss": 8.7218, + "step": 71 + }, + { + "epoch": 0.07339449541284404, + "grad_norm": 8.59492301940918, + "learning_rate": 1.583710407239819e-05, + "loss": 6.1906, + "step": 72 + }, + { + "epoch": 0.0744138634046891, + "grad_norm": 9.830521583557129, + "learning_rate": 1.6063348416289596e-05, + "loss": 6.7222, + "step": 73 + }, + { + "epoch": 0.07543323139653414, + "grad_norm": 9.12816047668457, + "learning_rate": 1.6289592760180996e-05, + "loss": 7.0611, + "step": 74 + }, + { + "epoch": 0.0764525993883792, + "grad_norm": 10.391504287719727, + "learning_rate": 1.6515837104072397e-05, + "loss": 7.8241, + "step": 75 + }, + { + "epoch": 0.07747196738022426, + "grad_norm": 9.0382719039917, + "learning_rate": 1.67420814479638e-05, + "loss": 6.3791, + "step": 76 + }, + { + "epoch": 0.07849133537206932, + "grad_norm": 11.495955467224121, + "learning_rate": 1.6968325791855205e-05, + "loss": 6.8864, + "step": 77 + }, + { + "epoch": 
0.07951070336391437, + "grad_norm": 9.282613754272461, + "learning_rate": 1.7194570135746606e-05, + "loss": 6.8356, + "step": 78 + }, + { + "epoch": 0.08053007135575943, + "grad_norm": 9.06067180633545, + "learning_rate": 1.742081447963801e-05, + "loss": 6.168, + "step": 79 + }, + { + "epoch": 0.08154943934760449, + "grad_norm": 10.343846321105957, + "learning_rate": 1.7647058823529414e-05, + "loss": 8.6845, + "step": 80 + }, + { + "epoch": 0.08256880733944955, + "grad_norm": 10.185526847839355, + "learning_rate": 1.7873303167420814e-05, + "loss": 5.9739, + "step": 81 + }, + { + "epoch": 0.0835881753312946, + "grad_norm": 12.164653778076172, + "learning_rate": 1.8099547511312218e-05, + "loss": 6.2423, + "step": 82 + }, + { + "epoch": 0.08460754332313965, + "grad_norm": 10.543149948120117, + "learning_rate": 1.832579185520362e-05, + "loss": 7.6247, + "step": 83 + }, + { + "epoch": 0.0856269113149847, + "grad_norm": 10.210731506347656, + "learning_rate": 1.8552036199095023e-05, + "loss": 5.8418, + "step": 84 + }, + { + "epoch": 0.08664627930682976, + "grad_norm": 11.613642692565918, + "learning_rate": 1.8778280542986427e-05, + "loss": 7.0948, + "step": 85 + }, + { + "epoch": 0.08766564729867482, + "grad_norm": 12.590648651123047, + "learning_rate": 1.9004524886877827e-05, + "loss": 6.7457, + "step": 86 + }, + { + "epoch": 0.08868501529051988, + "grad_norm": 12.547815322875977, + "learning_rate": 1.923076923076923e-05, + "loss": 5.6837, + "step": 87 + }, + { + "epoch": 0.08970438328236494, + "grad_norm": 14.212437629699707, + "learning_rate": 1.9457013574660635e-05, + "loss": 6.0757, + "step": 88 + }, + { + "epoch": 0.09072375127421, + "grad_norm": 14.821358680725098, + "learning_rate": 1.9683257918552036e-05, + "loss": 6.523, + "step": 89 + }, + { + "epoch": 0.09174311926605505, + "grad_norm": 14.133096694946289, + "learning_rate": 1.990950226244344e-05, + "loss": 6.2917, + "step": 90 + }, + { + "epoch": 0.09276248725790011, + "grad_norm": 14.283154487609863, + 
"learning_rate": 2.0135746606334844e-05, + "loss": 6.0509, + "step": 91 + }, + { + "epoch": 0.09378185524974515, + "grad_norm": 15.914741516113281, + "learning_rate": 2.0361990950226245e-05, + "loss": 6.8298, + "step": 92 + }, + { + "epoch": 0.09480122324159021, + "grad_norm": 18.067726135253906, + "learning_rate": 2.058823529411765e-05, + "loss": 7.855, + "step": 93 + }, + { + "epoch": 0.09582059123343527, + "grad_norm": 17.288843154907227, + "learning_rate": 2.0814479638009053e-05, + "loss": 6.6372, + "step": 94 + }, + { + "epoch": 0.09683995922528033, + "grad_norm": 22.13617515563965, + "learning_rate": 2.1040723981900453e-05, + "loss": 5.7468, + "step": 95 + }, + { + "epoch": 0.09785932721712538, + "grad_norm": 22.20960235595703, + "learning_rate": 2.1266968325791857e-05, + "loss": 7.5522, + "step": 96 + }, + { + "epoch": 0.09887869520897044, + "grad_norm": 23.28131103515625, + "learning_rate": 2.149321266968326e-05, + "loss": 7.7825, + "step": 97 + }, + { + "epoch": 0.0998980632008155, + "grad_norm": 29.695850372314453, + "learning_rate": 2.1719457013574662e-05, + "loss": 8.7452, + "step": 98 + }, + { + "epoch": 0.0998980632008155, + "eval_Qnli-dev-1024_cosine_accuracy": 0.6458333333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.995652437210083, + "eval_Qnli-dev-1024_cosine_ap": 0.6274798374964984, + "eval_Qnli-dev-1024_cosine_f1": 0.6518518518518519, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.9515509605407715, + "eval_Qnli-dev-1024_cosine_mcc": 0.1563007361345257, + "eval_Qnli-dev-1024_cosine_precision": 0.4888888888888889, + "eval_Qnli-dev-1024_cosine_recall": 0.9777777777777777, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8860945701599121, + "eval_Qnli-dev_cosine_ap": 0.7645314494110582, + "eval_Qnli-dev_cosine_f1": 0.7500000000000001, + "eval_Qnli-dev_cosine_f1_threshold": 0.8442017436027527, + "eval_Qnli-dev_cosine_mcc": 0.48653004754089046, + "eval_Qnli-dev_cosine_precision": 
0.6610169491525424, + "eval_Qnli-dev_cosine_recall": 0.8666666666666667, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.7291666865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 2.297825574874878, + "eval_global_dataset_runtime": 104.2196, + "eval_global_dataset_samples_per_second": 7.705, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.7291666865348816, + "eval_sts-test-1024_pearson_cosine": 0.470983874633109, + "eval_sts-test-1024_spearman_cosine": 0.7146928621162676, + "eval_sts-test_pearson_cosine": 0.904138891044396, + "eval_sts-test_spearman_cosine": 0.9172742489825538, + "step": 98 + }, + { + "epoch": 0.10091743119266056, + "grad_norm": 21.82425880432129, + "learning_rate": 2.1945701357466062e-05, + "loss": 6.2322, + "step": 99 + }, + { + "epoch": 0.1019367991845056, + "grad_norm": 25.734025955200195, + "learning_rate": 2.2171945701357466e-05, + "loss": 4.8433, + "step": 100 + }, + { + "epoch": 0.10295616717635066, + "grad_norm": 28.17144775390625, + "learning_rate": 2.239819004524887e-05, + "loss": 4.8335, + "step": 101 + }, + { + "epoch": 0.10397553516819572, + "grad_norm": 27.875871658325195, + "learning_rate": 2.262443438914027e-05, + "loss": 5.138, + "step": 102 + }, + { + "epoch": 0.10499490316004077, + "grad_norm": 31.503034591674805, + "learning_rate": 2.2850678733031675e-05, + "loss": 4.8609, + "step": 103 + }, + { + "epoch": 0.10601427115188583, + "grad_norm": 26.674440383911133, + "learning_rate": 2.307692307692308e-05, + "loss": 4.6204, + "step": 104 + }, + { + "epoch": 0.10703363914373089, + "grad_norm": 25.039222717285156, + "learning_rate": 2.330316742081448e-05, + "loss": 4.3809, + "step": 105 + }, + { + "epoch": 0.10805300713557595, + "grad_norm": 26.333913803100586, + "learning_rate": 2.3529411764705884e-05, + "loss": 5.6703, + "step": 106 + }, + { + "epoch": 0.109072375127421, + "grad_norm": 23.51517105102539, + "learning_rate": 2.3755656108597284e-05, + "loss": 
5.0237, + "step": 107 + }, + { + "epoch": 0.11009174311926606, + "grad_norm": 18.25855255126953, + "learning_rate": 2.3981900452488688e-05, + "loss": 4.002, + "step": 108 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 19.852886199951172, + "learning_rate": 2.4208144796380092e-05, + "loss": 5.2532, + "step": 109 + }, + { + "epoch": 0.11213047910295616, + "grad_norm": 17.45444107055664, + "learning_rate": 2.4434389140271493e-05, + "loss": 4.1033, + "step": 110 + }, + { + "epoch": 0.11314984709480122, + "grad_norm": 14.521421432495117, + "learning_rate": 2.4660633484162897e-05, + "loss": 4.0818, + "step": 111 + }, + { + "epoch": 0.11416921508664628, + "grad_norm": 12.525910377502441, + "learning_rate": 2.48868778280543e-05, + "loss": 3.458, + "step": 112 + }, + { + "epoch": 0.11518858307849134, + "grad_norm": 14.503193855285645, + "learning_rate": 2.51131221719457e-05, + "loss": 4.3372, + "step": 113 + }, + { + "epoch": 0.1162079510703364, + "grad_norm": 14.2279634475708, + "learning_rate": 2.5339366515837106e-05, + "loss": 4.9513, + "step": 114 + }, + { + "epoch": 0.11722731906218145, + "grad_norm": 15.238719940185547, + "learning_rate": 2.5565610859728506e-05, + "loss": 5.2602, + "step": 115 + }, + { + "epoch": 0.11824668705402651, + "grad_norm": 11.11528491973877, + "learning_rate": 2.5791855203619913e-05, + "loss": 3.1741, + "step": 116 + }, + { + "epoch": 0.11926605504587157, + "grad_norm": 12.077157974243164, + "learning_rate": 2.6018099547511314e-05, + "loss": 4.1914, + "step": 117 + }, + { + "epoch": 0.12028542303771661, + "grad_norm": 11.872669219970703, + "learning_rate": 2.6244343891402718e-05, + "loss": 2.8383, + "step": 118 + }, + { + "epoch": 0.12130479102956167, + "grad_norm": 9.008302688598633, + "learning_rate": 2.647058823529412e-05, + "loss": 3.4165, + "step": 119 + }, + { + "epoch": 0.12232415902140673, + "grad_norm": 10.702130317687988, + "learning_rate": 2.6696832579185523e-05, + "loss": 3.5085, + "step": 120 + }, + { + "epoch": 
0.12334352701325178, + "grad_norm": 10.306276321411133, + "learning_rate": 2.6923076923076923e-05, + "loss": 2.3992, + "step": 121 + }, + { + "epoch": 0.12436289500509684, + "grad_norm": 9.035378456115723, + "learning_rate": 2.7149321266968324e-05, + "loss": 2.4849, + "step": 122 + }, + { + "epoch": 0.12538226299694188, + "grad_norm": 8.996299743652344, + "learning_rate": 2.737556561085973e-05, + "loss": 2.2839, + "step": 123 + }, + { + "epoch": 0.12640163098878696, + "grad_norm": 8.635661125183105, + "learning_rate": 2.7601809954751135e-05, + "loss": 2.567, + "step": 124 + }, + { + "epoch": 0.127420998980632, + "grad_norm": 10.015826225280762, + "learning_rate": 2.7828054298642536e-05, + "loss": 4.5119, + "step": 125 + }, + { + "epoch": 0.12844036697247707, + "grad_norm": 8.679932594299316, + "learning_rate": 2.805429864253394e-05, + "loss": 2.767, + "step": 126 + }, + { + "epoch": 0.12945973496432212, + "grad_norm": 10.05739688873291, + "learning_rate": 2.828054298642534e-05, + "loss": 4.0225, + "step": 127 + }, + { + "epoch": 0.1304791029561672, + "grad_norm": 9.361485481262207, + "learning_rate": 2.850678733031674e-05, + "loss": 1.8294, + "step": 128 + }, + { + "epoch": 0.13149847094801223, + "grad_norm": 9.865928649902344, + "learning_rate": 2.8733031674208145e-05, + "loss": 4.4174, + "step": 129 + }, + { + "epoch": 0.1325178389398573, + "grad_norm": 10.055468559265137, + "learning_rate": 2.8959276018099553e-05, + "loss": 2.0112, + "step": 130 + }, + { + "epoch": 0.13353720693170235, + "grad_norm": 9.528116226196289, + "learning_rate": 2.9185520361990953e-05, + "loss": 1.7772, + "step": 131 + }, + { + "epoch": 0.1345565749235474, + "grad_norm": 9.870166778564453, + "learning_rate": 2.9411764705882354e-05, + "loss": 3.1912, + "step": 132 + }, + { + "epoch": 0.13557594291539246, + "grad_norm": 10.1703462600708, + "learning_rate": 2.9638009049773758e-05, + "loss": 2.4527, + "step": 133 + }, + { + "epoch": 0.1365953109072375, + "grad_norm": 7.443604469299316, + 
"learning_rate": 2.986425339366516e-05, + "loss": 1.6424, + "step": 134 + }, + { + "epoch": 0.13761467889908258, + "grad_norm": 10.003544807434082, + "learning_rate": 3.0090497737556562e-05, + "loss": 2.6143, + "step": 135 + }, + { + "epoch": 0.13863404689092762, + "grad_norm": 9.352860450744629, + "learning_rate": 3.0316742081447963e-05, + "loss": 2.0498, + "step": 136 + }, + { + "epoch": 0.1396534148827727, + "grad_norm": 7.393095970153809, + "learning_rate": 3.0542986425339374e-05, + "loss": 1.962, + "step": 137 + }, + { + "epoch": 0.14067278287461774, + "grad_norm": 8.278059959411621, + "learning_rate": 3.0769230769230774e-05, + "loss": 1.789, + "step": 138 + }, + { + "epoch": 0.14169215086646278, + "grad_norm": 6.577699184417725, + "learning_rate": 3.0995475113122175e-05, + "loss": 1.459, + "step": 139 + }, + { + "epoch": 0.14271151885830785, + "grad_norm": 8.23404312133789, + "learning_rate": 3.1221719457013576e-05, + "loss": 1.2479, + "step": 140 + }, + { + "epoch": 0.1437308868501529, + "grad_norm": 9.47106647491455, + "learning_rate": 3.1447963800904976e-05, + "loss": 2.5413, + "step": 141 + }, + { + "epoch": 0.14475025484199797, + "grad_norm": 7.330000400543213, + "learning_rate": 3.167420814479638e-05, + "loss": 1.4077, + "step": 142 + }, + { + "epoch": 0.145769622833843, + "grad_norm": 9.64534854888916, + "learning_rate": 3.1900452488687784e-05, + "loss": 2.6988, + "step": 143 + }, + { + "epoch": 0.14678899082568808, + "grad_norm": 8.404465675354004, + "learning_rate": 3.212669683257919e-05, + "loss": 2.9772, + "step": 144 + }, + { + "epoch": 0.14780835881753313, + "grad_norm": 8.019698143005371, + "learning_rate": 3.235294117647059e-05, + "loss": 1.6265, + "step": 145 + }, + { + "epoch": 0.1488277268093782, + "grad_norm": 7.635079860687256, + "learning_rate": 3.257918552036199e-05, + "loss": 1.9404, + "step": 146 + }, + { + "epoch": 0.14984709480122324, + "grad_norm": 7.929011821746826, + "learning_rate": 3.2805429864253393e-05, + "loss": 1.4251, + 
"step": 147 + }, + { + "epoch": 0.15086646279306828, + "grad_norm": 7.869425296783447, + "learning_rate": 3.3031674208144794e-05, + "loss": 2.6657, + "step": 148 + }, + { + "epoch": 0.15188583078491336, + "grad_norm": 8.369176864624023, + "learning_rate": 3.32579185520362e-05, + "loss": 2.1576, + "step": 149 + }, + { + "epoch": 0.1529051987767584, + "grad_norm": 9.128487586975098, + "learning_rate": 3.34841628959276e-05, + "loss": 1.53, + "step": 150 + }, + { + "epoch": 0.15392456676860347, + "grad_norm": 7.673459529876709, + "learning_rate": 3.371040723981901e-05, + "loss": 1.2642, + "step": 151 + }, + { + "epoch": 0.15494393476044852, + "grad_norm": 9.104422569274902, + "learning_rate": 3.393665158371041e-05, + "loss": 2.4846, + "step": 152 + }, + { + "epoch": 0.1559633027522936, + "grad_norm": 8.658594131469727, + "learning_rate": 3.416289592760181e-05, + "loss": 1.4979, + "step": 153 + }, + { + "epoch": 0.15698267074413863, + "grad_norm": 9.34330940246582, + "learning_rate": 3.438914027149321e-05, + "loss": 1.8149, + "step": 154 + }, + { + "epoch": 0.1580020387359837, + "grad_norm": 9.401769638061523, + "learning_rate": 3.461538461538462e-05, + "loss": 1.4693, + "step": 155 + }, + { + "epoch": 0.15902140672782875, + "grad_norm": 10.389461517333984, + "learning_rate": 3.484162895927602e-05, + "loss": 2.0114, + "step": 156 + }, + { + "epoch": 0.1600407747196738, + "grad_norm": 9.321866989135742, + "learning_rate": 3.506787330316742e-05, + "loss": 1.5511, + "step": 157 + }, + { + "epoch": 0.16106014271151886, + "grad_norm": 10.052262306213379, + "learning_rate": 3.529411764705883e-05, + "loss": 1.621, + "step": 158 + }, + { + "epoch": 0.1620795107033639, + "grad_norm": 7.535787105560303, + "learning_rate": 3.552036199095023e-05, + "loss": 2.1122, + "step": 159 + }, + { + "epoch": 0.16309887869520898, + "grad_norm": 9.70533275604248, + "learning_rate": 3.574660633484163e-05, + "loss": 1.8148, + "step": 160 + }, + { + "epoch": 0.16411824668705402, + "grad_norm": 
7.81204080581665, + "learning_rate": 3.5972850678733036e-05, + "loss": 1.9861, + "step": 161 + }, + { + "epoch": 0.1651376146788991, + "grad_norm": 7.583981513977051, + "learning_rate": 3.6199095022624436e-05, + "loss": 1.3943, + "step": 162 + }, + { + "epoch": 0.16615698267074414, + "grad_norm": 8.344895362854004, + "learning_rate": 3.642533936651584e-05, + "loss": 1.7317, + "step": 163 + }, + { + "epoch": 0.1671763506625892, + "grad_norm": 7.1097331047058105, + "learning_rate": 3.665158371040724e-05, + "loss": 1.6, + "step": 164 + }, + { + "epoch": 0.16819571865443425, + "grad_norm": 7.911113739013672, + "learning_rate": 3.6877828054298645e-05, + "loss": 1.2222, + "step": 165 + }, + { + "epoch": 0.1692150866462793, + "grad_norm": 9.282394409179688, + "learning_rate": 3.7104072398190046e-05, + "loss": 1.6152, + "step": 166 + }, + { + "epoch": 0.17023445463812437, + "grad_norm": 7.449146270751953, + "learning_rate": 3.733031674208145e-05, + "loss": 1.0374, + "step": 167 + }, + { + "epoch": 0.1712538226299694, + "grad_norm": 9.164731979370117, + "learning_rate": 3.7556561085972854e-05, + "loss": 1.2844, + "step": 168 + }, + { + "epoch": 0.17227319062181448, + "grad_norm": 6.987304210662842, + "learning_rate": 3.7782805429864254e-05, + "loss": 1.8805, + "step": 169 + }, + { + "epoch": 0.17329255861365953, + "grad_norm": 7.447988033294678, + "learning_rate": 3.8009049773755655e-05, + "loss": 1.0972, + "step": 170 + }, + { + "epoch": 0.1743119266055046, + "grad_norm": 7.7849321365356445, + "learning_rate": 3.8235294117647055e-05, + "loss": 1.7012, + "step": 171 + }, + { + "epoch": 0.17533129459734964, + "grad_norm": 7.341614246368408, + "learning_rate": 3.846153846153846e-05, + "loss": 1.4182, + "step": 172 + }, + { + "epoch": 0.1763506625891947, + "grad_norm": 8.514887809753418, + "learning_rate": 3.868778280542987e-05, + "loss": 2.6053, + "step": 173 + }, + { + "epoch": 0.17737003058103976, + "grad_norm": 7.384711265563965, + "learning_rate": 3.891402714932127e-05, + 
"loss": 1.4193, + "step": 174 + }, + { + "epoch": 0.1783893985728848, + "grad_norm": 8.553336143493652, + "learning_rate": 3.914027149321267e-05, + "loss": 2.251, + "step": 175 + }, + { + "epoch": 0.17940876656472987, + "grad_norm": 8.517749786376953, + "learning_rate": 3.936651583710407e-05, + "loss": 1.9057, + "step": 176 + }, + { + "epoch": 0.18042813455657492, + "grad_norm": 8.444558143615723, + "learning_rate": 3.959276018099547e-05, + "loss": 1.1228, + "step": 177 + }, + { + "epoch": 0.18144750254842, + "grad_norm": 12.253990173339844, + "learning_rate": 3.981900452488688e-05, + "loss": 4.0905, + "step": 178 + }, + { + "epoch": 0.18246687054026503, + "grad_norm": 5.70052433013916, + "learning_rate": 4.004524886877829e-05, + "loss": 0.9007, + "step": 179 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 9.525473594665527, + "learning_rate": 4.027149321266969e-05, + "loss": 2.0665, + "step": 180 + }, + { + "epoch": 0.18450560652395515, + "grad_norm": 6.146080493927002, + "learning_rate": 4.049773755656109e-05, + "loss": 1.0946, + "step": 181 + }, + { + "epoch": 0.18552497451580022, + "grad_norm": 7.736543655395508, + "learning_rate": 4.072398190045249e-05, + "loss": 1.7479, + "step": 182 + }, + { + "epoch": 0.18654434250764526, + "grad_norm": 8.404258728027344, + "learning_rate": 4.095022624434389e-05, + "loss": 2.0877, + "step": 183 + }, + { + "epoch": 0.1875637104994903, + "grad_norm": 5.705750942230225, + "learning_rate": 4.11764705882353e-05, + "loss": 0.9239, + "step": 184 + }, + { + "epoch": 0.18858307849133538, + "grad_norm": 7.753995895385742, + "learning_rate": 4.14027149321267e-05, + "loss": 1.7865, + "step": 185 + }, + { + "epoch": 0.18960244648318042, + "grad_norm": 9.15240478515625, + "learning_rate": 4.1628959276018105e-05, + "loss": 2.1053, + "step": 186 + }, + { + "epoch": 0.1906218144750255, + "grad_norm": 7.2251129150390625, + "learning_rate": 4.1855203619909506e-05, + "loss": 1.5273, + "step": 187 + }, + { + "epoch": 0.19164118246687054, 
+ "grad_norm": 6.803040981292725, + "learning_rate": 4.2081447963800907e-05, + "loss": 1.8726, + "step": 188 + }, + { + "epoch": 0.1926605504587156, + "grad_norm": 5.646162509918213, + "learning_rate": 4.230769230769231e-05, + "loss": 1.4663, + "step": 189 + }, + { + "epoch": 0.19367991845056065, + "grad_norm": 7.599930286407471, + "learning_rate": 4.2533936651583714e-05, + "loss": 1.0136, + "step": 190 + }, + { + "epoch": 0.1946992864424057, + "grad_norm": 7.882979393005371, + "learning_rate": 4.2760180995475115e-05, + "loss": 1.121, + "step": 191 + }, + { + "epoch": 0.19571865443425077, + "grad_norm": 8.919268608093262, + "learning_rate": 4.298642533936652e-05, + "loss": 1.6074, + "step": 192 + }, + { + "epoch": 0.1967380224260958, + "grad_norm": 8.914848327636719, + "learning_rate": 4.321266968325792e-05, + "loss": 2.1956, + "step": 193 + }, + { + "epoch": 0.19775739041794088, + "grad_norm": 8.603778839111328, + "learning_rate": 4.3438914027149324e-05, + "loss": 1.5425, + "step": 194 + }, + { + "epoch": 0.19877675840978593, + "grad_norm": 8.500616073608398, + "learning_rate": 4.3665158371040724e-05, + "loss": 1.4552, + "step": 195 + }, + { + "epoch": 0.199796126401631, + "grad_norm": 7.815979957580566, + "learning_rate": 4.3891402714932125e-05, + "loss": 1.2635, + "step": 196 + }, + { + "epoch": 0.199796126401631, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8570283651351929, + "eval_Qnli-dev-1024_cosine_ap": 0.7434694144471753, + "eval_Qnli-dev-1024_cosine_f1": 0.7207207207207208, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7906914353370667, + "eval_Qnli-dev-1024_cosine_mcc": 0.4081269865567241, + "eval_Qnli-dev-1024_cosine_precision": 0.6060606060606061, + "eval_Qnli-dev-1024_cosine_recall": 0.8888888888888888, + "eval_Qnli-dev_cosine_accuracy": 0.75, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8287814855575562, + "eval_Qnli-dev_cosine_ap": 0.7646453733471359, + "eval_Qnli-dev_cosine_f1": 
0.7378640776699029, + "eval_Qnli-dev_cosine_f1_threshold": 0.7745069265365601, + "eval_Qnli-dev_cosine_mcc": 0.46153029495329345, + "eval_Qnli-dev_cosine_precision": 0.6551724137931034, + "eval_Qnli-dev_cosine_recall": 0.8444444444444444, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9166666865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.9583333134651184, + "eval_global_dataset_loss": 0.5179261565208435, + "eval_global_dataset_runtime": 104.2216, + "eval_global_dataset_samples_per_second": 7.705, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.9166666865348816, + "eval_sts-test-1024_pearson_cosine": 0.8476975008591285, + "eval_sts-test-1024_spearman_cosine": 0.8973182534732806, + "eval_sts-test_pearson_cosine": 0.9039400681490469, + "eval_sts-test_spearman_cosine": 0.9185431775441114, + "step": 196 + }, + { + "epoch": 0.20081549439347604, + "grad_norm": 9.34125804901123, + "learning_rate": 4.411764705882353e-05, + "loss": 1.7222, + "step": 197 + }, + { + "epoch": 0.2018348623853211, + "grad_norm": 10.679852485656738, + "learning_rate": 4.434389140271493e-05, + "loss": 2.377, + "step": 198 + }, + { + "epoch": 0.20285423037716616, + "grad_norm": 7.775190830230713, + "learning_rate": 4.457013574660634e-05, + "loss": 1.5317, + "step": 199 + }, + { + "epoch": 0.2038735983690112, + "grad_norm": 6.390950679779053, + "learning_rate": 4.479638009049774e-05, + "loss": 1.0494, + "step": 200 + }, + { + "epoch": 0.20489296636085627, + "grad_norm": 9.170794486999512, + "learning_rate": 4.502262443438914e-05, + "loss": 1.7392, + "step": 201 + }, + { + "epoch": 0.20591233435270132, + "grad_norm": 7.37787389755249, + "learning_rate": 4.524886877828054e-05, + "loss": 1.2924, + "step": 202 + }, + { + "epoch": 0.2069317023445464, + "grad_norm": 6.836249828338623, + "learning_rate": 4.547511312217195e-05, + "loss": 0.9413, + "step": 203 + }, + { + "epoch": 0.20795107033639143, + "grad_norm": 9.543895721435547, + "learning_rate": 
4.570135746606335e-05, + "loss": 2.1448, + "step": 204 + }, + { + "epoch": 0.2089704383282365, + "grad_norm": 7.8430495262146, + "learning_rate": 4.592760180995475e-05, + "loss": 1.0357, + "step": 205 + }, + { + "epoch": 0.20998980632008155, + "grad_norm": 9.558221817016602, + "learning_rate": 4.615384615384616e-05, + "loss": 1.3534, + "step": 206 + }, + { + "epoch": 0.21100917431192662, + "grad_norm": 5.715826034545898, + "learning_rate": 4.638009049773756e-05, + "loss": 1.0564, + "step": 207 + }, + { + "epoch": 0.21202854230377166, + "grad_norm": 8.720932960510254, + "learning_rate": 4.660633484162896e-05, + "loss": 0.9259, + "step": 208 + }, + { + "epoch": 0.2130479102956167, + "grad_norm": 9.008890151977539, + "learning_rate": 4.683257918552037e-05, + "loss": 1.3813, + "step": 209 + }, + { + "epoch": 0.21406727828746178, + "grad_norm": 7.1262006759643555, + "learning_rate": 4.705882352941177e-05, + "loss": 0.8828, + "step": 210 + }, + { + "epoch": 0.21508664627930682, + "grad_norm": 12.986166000366211, + "learning_rate": 4.728506787330317e-05, + "loss": 2.9147, + "step": 211 + }, + { + "epoch": 0.2161060142711519, + "grad_norm": 6.804072380065918, + "learning_rate": 4.751131221719457e-05, + "loss": 0.6539, + "step": 212 + }, + { + "epoch": 0.21712538226299694, + "grad_norm": 9.138653755187988, + "learning_rate": 4.7737556561085976e-05, + "loss": 1.3092, + "step": 213 + }, + { + "epoch": 0.218144750254842, + "grad_norm": 7.303668975830078, + "learning_rate": 4.7963800904977377e-05, + "loss": 1.1562, + "step": 214 + }, + { + "epoch": 0.21916411824668705, + "grad_norm": 7.368769645690918, + "learning_rate": 4.8190045248868784e-05, + "loss": 0.9509, + "step": 215 + }, + { + "epoch": 0.22018348623853212, + "grad_norm": 5.067785263061523, + "learning_rate": 4.8416289592760185e-05, + "loss": 0.6664, + "step": 216 + }, + { + "epoch": 0.22120285423037717, + "grad_norm": 5.643320083618164, + "learning_rate": 4.8642533936651585e-05, + "loss": 1.2315, + "step": 217 + }, + 
{ + "epoch": 0.2222222222222222, + "grad_norm": 6.596173286437988, + "learning_rate": 4.8868778280542986e-05, + "loss": 0.9855, + "step": 218 + }, + { + "epoch": 0.22324159021406728, + "grad_norm": 6.5434770584106445, + "learning_rate": 4.9095022624434386e-05, + "loss": 0.9258, + "step": 219 + }, + { + "epoch": 0.22426095820591233, + "grad_norm": 11.537922859191895, + "learning_rate": 4.9321266968325794e-05, + "loss": 1.6578, + "step": 220 + }, + { + "epoch": 0.2252803261977574, + "grad_norm": 7.364137172698975, + "learning_rate": 4.95475113122172e-05, + "loss": 0.9666, + "step": 221 + }, + { + "epoch": 0.22629969418960244, + "grad_norm": 8.102925300598145, + "learning_rate": 4.97737556561086e-05, + "loss": 0.9808, + "step": 222 + }, + { + "epoch": 0.2273190621814475, + "grad_norm": 10.013775825500488, + "learning_rate": 5e-05, + "loss": 2.4156, + "step": 223 + }, + { + "epoch": 0.22833843017329256, + "grad_norm": 7.974793434143066, + "learning_rate": 5.02262443438914e-05, + "loss": 1.1297, + "step": 224 + }, + { + "epoch": 0.22935779816513763, + "grad_norm": 7.710846424102783, + "learning_rate": 5.0452488687782804e-05, + "loss": 1.3063, + "step": 225 + }, + { + "epoch": 0.23037716615698267, + "grad_norm": 5.633566856384277, + "learning_rate": 5.067873303167421e-05, + "loss": 0.5567, + "step": 226 + }, + { + "epoch": 0.23139653414882771, + "grad_norm": 9.50987720489502, + "learning_rate": 5.090497737556561e-05, + "loss": 1.3551, + "step": 227 + }, + { + "epoch": 0.2324159021406728, + "grad_norm": 10.309268951416016, + "learning_rate": 5.113122171945701e-05, + "loss": 1.4079, + "step": 228 + }, + { + "epoch": 0.23343527013251783, + "grad_norm": 7.812633037567139, + "learning_rate": 5.135746606334841e-05, + "loss": 0.948, + "step": 229 + }, + { + "epoch": 0.2344546381243629, + "grad_norm": 8.013436317443848, + "learning_rate": 5.158371040723983e-05, + "loss": 0.9288, + "step": 230 + }, + { + "epoch": 0.23547400611620795, + "grad_norm": 7.550686359405518, + 
"learning_rate": 5.180995475113123e-05, + "loss": 1.0077, + "step": 231 + }, + { + "epoch": 0.23649337410805302, + "grad_norm": 7.249583721160889, + "learning_rate": 5.203619909502263e-05, + "loss": 1.0674, + "step": 232 + }, + { + "epoch": 0.23751274209989806, + "grad_norm": 7.766678810119629, + "learning_rate": 5.2262443438914036e-05, + "loss": 1.3354, + "step": 233 + }, + { + "epoch": 0.23853211009174313, + "grad_norm": 7.417704105377197, + "learning_rate": 5.2488687782805436e-05, + "loss": 1.0076, + "step": 234 + }, + { + "epoch": 0.23955147808358818, + "grad_norm": 8.414839744567871, + "learning_rate": 5.271493212669684e-05, + "loss": 0.8814, + "step": 235 + }, + { + "epoch": 0.24057084607543322, + "grad_norm": 9.537981986999512, + "learning_rate": 5.294117647058824e-05, + "loss": 1.7839, + "step": 236 + }, + { + "epoch": 0.2415902140672783, + "grad_norm": 6.3290886878967285, + "learning_rate": 5.316742081447964e-05, + "loss": 1.2614, + "step": 237 + }, + { + "epoch": 0.24260958205912334, + "grad_norm": 8.181835174560547, + "learning_rate": 5.3393665158371045e-05, + "loss": 0.8655, + "step": 238 + }, + { + "epoch": 0.2436289500509684, + "grad_norm": 8.01684856414795, + "learning_rate": 5.3619909502262446e-05, + "loss": 1.8418, + "step": 239 + }, + { + "epoch": 0.24464831804281345, + "grad_norm": 7.891118049621582, + "learning_rate": 5.384615384615385e-05, + "loss": 1.4465, + "step": 240 + }, + { + "epoch": 0.24566768603465852, + "grad_norm": 8.080881118774414, + "learning_rate": 5.407239819004525e-05, + "loss": 0.8695, + "step": 241 + }, + { + "epoch": 0.24668705402650357, + "grad_norm": 6.881638050079346, + "learning_rate": 5.429864253393665e-05, + "loss": 0.9695, + "step": 242 + }, + { + "epoch": 0.24770642201834864, + "grad_norm": 10.03598690032959, + "learning_rate": 5.4524886877828055e-05, + "loss": 1.3374, + "step": 243 + }, + { + "epoch": 0.24872579001019368, + "grad_norm": 7.844127178192139, + "learning_rate": 5.475113122171946e-05, + "loss": 0.6716, + 
"step": 244 + }, + { + "epoch": 0.24974515800203873, + "grad_norm": 8.654071807861328, + "learning_rate": 5.497737556561087e-05, + "loss": 1.032, + "step": 245 + }, + { + "epoch": 0.25076452599388377, + "grad_norm": 6.731460094451904, + "learning_rate": 5.520361990950227e-05, + "loss": 0.8033, + "step": 246 + }, + { + "epoch": 0.25178389398572887, + "grad_norm": 9.436687469482422, + "learning_rate": 5.542986425339367e-05, + "loss": 0.9257, + "step": 247 + }, + { + "epoch": 0.2528032619775739, + "grad_norm": 7.817379474639893, + "learning_rate": 5.565610859728507e-05, + "loss": 0.8311, + "step": 248 + }, + { + "epoch": 0.25382262996941896, + "grad_norm": 6.328183650970459, + "learning_rate": 5.588235294117647e-05, + "loss": 0.5609, + "step": 249 + }, + { + "epoch": 0.254841997961264, + "grad_norm": 8.576601028442383, + "learning_rate": 5.610859728506788e-05, + "loss": 1.5985, + "step": 250 + }, + { + "epoch": 0.2558613659531091, + "grad_norm": 9.092324256896973, + "learning_rate": 5.633484162895928e-05, + "loss": 1.0025, + "step": 251 + }, + { + "epoch": 0.25688073394495414, + "grad_norm": 11.906094551086426, + "learning_rate": 5.656108597285068e-05, + "loss": 2.0499, + "step": 252 + }, + { + "epoch": 0.2579001019367992, + "grad_norm": 7.968968868255615, + "learning_rate": 5.678733031674208e-05, + "loss": 1.3116, + "step": 253 + }, + { + "epoch": 0.25891946992864423, + "grad_norm": 5.355049133300781, + "learning_rate": 5.701357466063348e-05, + "loss": 0.5969, + "step": 254 + }, + { + "epoch": 0.2599388379204893, + "grad_norm": 8.151896476745605, + "learning_rate": 5.723981900452488e-05, + "loss": 1.1107, + "step": 255 + }, + { + "epoch": 0.2609582059123344, + "grad_norm": 9.651622772216797, + "learning_rate": 5.746606334841629e-05, + "loss": 1.8581, + "step": 256 + }, + { + "epoch": 0.2619775739041794, + "grad_norm": 7.1527533531188965, + "learning_rate": 5.769230769230769e-05, + "loss": 0.572, + "step": 257 + }, + { + "epoch": 0.26299694189602446, + "grad_norm": 
6.141374111175537, + "learning_rate": 5.7918552036199105e-05, + "loss": 0.9267, + "step": 258 + }, + { + "epoch": 0.2640163098878695, + "grad_norm": 7.274891376495361, + "learning_rate": 5.8144796380090506e-05, + "loss": 0.6255, + "step": 259 + }, + { + "epoch": 0.2650356778797146, + "grad_norm": 5.81080436706543, + "learning_rate": 5.8371040723981906e-05, + "loss": 0.7615, + "step": 260 + }, + { + "epoch": 0.26605504587155965, + "grad_norm": 6.9981279373168945, + "learning_rate": 5.859728506787331e-05, + "loss": 0.6026, + "step": 261 + }, + { + "epoch": 0.2670744138634047, + "grad_norm": 5.718660831451416, + "learning_rate": 5.882352941176471e-05, + "loss": 0.7263, + "step": 262 + }, + { + "epoch": 0.26809378185524974, + "grad_norm": 5.391998767852783, + "learning_rate": 5.9049773755656115e-05, + "loss": 0.4643, + "step": 263 + }, + { + "epoch": 0.2691131498470948, + "grad_norm": 6.843007564544678, + "learning_rate": 5.9276018099547516e-05, + "loss": 0.5101, + "step": 264 + }, + { + "epoch": 0.2701325178389399, + "grad_norm": 5.087254047393799, + "learning_rate": 5.9502262443438916e-05, + "loss": 0.5562, + "step": 265 + }, + { + "epoch": 0.2711518858307849, + "grad_norm": 7.482615947723389, + "learning_rate": 5.972850678733032e-05, + "loss": 1.256, + "step": 266 + }, + { + "epoch": 0.27217125382262997, + "grad_norm": 6.911371231079102, + "learning_rate": 5.995475113122172e-05, + "loss": 0.6543, + "step": 267 + }, + { + "epoch": 0.273190621814475, + "grad_norm": 7.643139839172363, + "learning_rate": 6.0180995475113125e-05, + "loss": 0.6698, + "step": 268 + }, + { + "epoch": 0.2742099898063201, + "grad_norm": 9.08658504486084, + "learning_rate": 6.0407239819004525e-05, + "loss": 1.3843, + "step": 269 + }, + { + "epoch": 0.27522935779816515, + "grad_norm": 8.890534400939941, + "learning_rate": 6.0633484162895926e-05, + "loss": 1.1421, + "step": 270 + }, + { + "epoch": 0.2762487257900102, + "grad_norm": 9.855698585510254, + "learning_rate": 6.0859728506787327e-05, + 
"loss": 1.1558, + "step": 271 + }, + { + "epoch": 0.27726809378185524, + "grad_norm": 8.32972526550293, + "learning_rate": 6.108597285067875e-05, + "loss": 1.603, + "step": 272 + }, + { + "epoch": 0.2782874617737003, + "grad_norm": 8.393510818481445, + "learning_rate": 6.131221719457015e-05, + "loss": 0.7985, + "step": 273 + }, + { + "epoch": 0.2793068297655454, + "grad_norm": 7.992040157318115, + "learning_rate": 6.153846153846155e-05, + "loss": 1.3884, + "step": 274 + }, + { + "epoch": 0.2803261977573904, + "grad_norm": 8.646651268005371, + "learning_rate": 6.176470588235295e-05, + "loss": 1.0337, + "step": 275 + }, + { + "epoch": 0.28134556574923547, + "grad_norm": 7.3104329109191895, + "learning_rate": 6.199095022624435e-05, + "loss": 1.0917, + "step": 276 + }, + { + "epoch": 0.2823649337410805, + "grad_norm": 12.030378341674805, + "learning_rate": 6.221719457013575e-05, + "loss": 2.4149, + "step": 277 + }, + { + "epoch": 0.28338430173292556, + "grad_norm": 4.781021595001221, + "learning_rate": 6.244343891402715e-05, + "loss": 0.4301, + "step": 278 + }, + { + "epoch": 0.28440366972477066, + "grad_norm": 4.352090358734131, + "learning_rate": 6.266968325791855e-05, + "loss": 0.5084, + "step": 279 + }, + { + "epoch": 0.2854230377166157, + "grad_norm": 5.88839864730835, + "learning_rate": 6.289592760180995e-05, + "loss": 0.7202, + "step": 280 + }, + { + "epoch": 0.28644240570846075, + "grad_norm": 11.228419303894043, + "learning_rate": 6.312217194570135e-05, + "loss": 1.7983, + "step": 281 + }, + { + "epoch": 0.2874617737003058, + "grad_norm": 6.119421005249023, + "learning_rate": 6.334841628959275e-05, + "loss": 0.673, + "step": 282 + }, + { + "epoch": 0.2884811416921509, + "grad_norm": 6.405134677886963, + "learning_rate": 6.357466063348417e-05, + "loss": 0.6655, + "step": 283 + }, + { + "epoch": 0.28950050968399593, + "grad_norm": 6.735506534576416, + "learning_rate": 6.380090497737557e-05, + "loss": 0.9121, + "step": 284 + }, + { + "epoch": 0.290519877675841, + 
"grad_norm": 11.012415885925293, + "learning_rate": 6.402714932126697e-05, + "loss": 1.5978, + "step": 285 + }, + { + "epoch": 0.291539245667686, + "grad_norm": 13.007187843322754, + "learning_rate": 6.425339366515838e-05, + "loss": 1.8536, + "step": 286 + }, + { + "epoch": 0.29255861365953106, + "grad_norm": 12.273601531982422, + "learning_rate": 6.447963800904978e-05, + "loss": 1.6397, + "step": 287 + }, + { + "epoch": 0.29357798165137616, + "grad_norm": 9.6339750289917, + "learning_rate": 6.470588235294118e-05, + "loss": 0.8275, + "step": 288 + }, + { + "epoch": 0.2945973496432212, + "grad_norm": 6.717658996582031, + "learning_rate": 6.493212669683258e-05, + "loss": 0.6003, + "step": 289 + }, + { + "epoch": 0.29561671763506625, + "grad_norm": 8.443256378173828, + "learning_rate": 6.515837104072399e-05, + "loss": 0.9834, + "step": 290 + }, + { + "epoch": 0.2966360856269113, + "grad_norm": 8.823105812072754, + "learning_rate": 6.538461538461539e-05, + "loss": 0.603, + "step": 291 + }, + { + "epoch": 0.2976554536187564, + "grad_norm": 6.8099141120910645, + "learning_rate": 6.561085972850679e-05, + "loss": 0.6597, + "step": 292 + }, + { + "epoch": 0.29867482161060144, + "grad_norm": 6.705087661743164, + "learning_rate": 6.583710407239819e-05, + "loss": 0.739, + "step": 293 + }, + { + "epoch": 0.2996941896024465, + "grad_norm": 7.209024906158447, + "learning_rate": 6.606334841628959e-05, + "loss": 1.2564, + "step": 294 + }, + { + "epoch": 0.2996941896024465, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8565528392791748, + "eval_Qnli-dev-1024_cosine_ap": 0.7531377591671699, + "eval_Qnli-dev-1024_cosine_f1": 0.7254901960784313, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.8200148344039917, + "eval_Qnli-dev-1024_cosine_mcc": 0.43697448216965834, + "eval_Qnli-dev-1024_cosine_precision": 0.6491228070175439, + "eval_Qnli-dev-1024_cosine_recall": 0.8222222222222222, + "eval_Qnli-dev_cosine_accuracy": 
0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7719540596008301, + "eval_Qnli-dev_cosine_ap": 0.7588639733888536, + "eval_Qnli-dev_cosine_f1": 0.7454545454545455, + "eval_Qnli-dev_cosine_f1_threshold": 0.7090869545936584, + "eval_Qnli-dev_cosine_mcc": 0.47013467657639685, + "eval_Qnli-dev_cosine_precision": 0.6307692307692307, + "eval_Qnli-dev_cosine_recall": 0.9111111111111111, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.8854166865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.9583333134651184, + "eval_global_dataset_loss": 0.4855804145336151, + "eval_global_dataset_runtime": 104.3189, + "eval_global_dataset_samples_per_second": 7.698, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.8854166865348816, + "eval_sts-test-1024_pearson_cosine": 0.8681461030339531, + "eval_sts-test-1024_spearman_cosine": 0.9053809631987397, + "eval_sts-test_pearson_cosine": 0.9051731986667259, + "eval_sts-test_spearman_cosine": 0.920630429781229, + "step": 294 + }, + { + "epoch": 0.3007135575942915, + "grad_norm": 7.649487495422363, + "learning_rate": 6.6289592760181e-05, + "loss": 1.4442, + "step": 295 + }, + { + "epoch": 0.30173292558613657, + "grad_norm": 7.740142822265625, + "learning_rate": 6.65158371040724e-05, + "loss": 1.4064, + "step": 296 + }, + { + "epoch": 0.30275229357798167, + "grad_norm": 8.009271621704102, + "learning_rate": 6.67420814479638e-05, + "loss": 0.8456, + "step": 297 + }, + { + "epoch": 0.3037716615698267, + "grad_norm": 5.718809604644775, + "learning_rate": 6.69683257918552e-05, + "loss": 0.7772, + "step": 298 + }, + { + "epoch": 0.30479102956167176, + "grad_norm": 7.34658145904541, + "learning_rate": 6.719457013574662e-05, + "loss": 0.7619, + "step": 299 + }, + { + "epoch": 0.3058103975535168, + "grad_norm": 8.556058883666992, + "learning_rate": 6.742081447963802e-05, + "loss": 1.002, + "step": 300 + }, + { + "epoch": 0.3068297655453619, + "grad_norm": 8.995348930358887, + "learning_rate": 
6.764705882352942e-05, + "loss": 1.4774, + "step": 301 + }, + { + "epoch": 0.30784913353720694, + "grad_norm": 8.271109580993652, + "learning_rate": 6.787330316742082e-05, + "loss": 0.6814, + "step": 302 + }, + { + "epoch": 0.308868501529052, + "grad_norm": 8.896450996398926, + "learning_rate": 6.809954751131222e-05, + "loss": 1.2286, + "step": 303 + }, + { + "epoch": 0.30988786952089703, + "grad_norm": 9.076520919799805, + "learning_rate": 6.832579185520362e-05, + "loss": 1.8546, + "step": 304 + }, + { + "epoch": 0.3109072375127421, + "grad_norm": 6.780123233795166, + "learning_rate": 6.855203619909502e-05, + "loss": 0.7547, + "step": 305 + }, + { + "epoch": 0.3119266055045872, + "grad_norm": 7.728740692138672, + "learning_rate": 6.877828054298642e-05, + "loss": 1.3638, + "step": 306 + }, + { + "epoch": 0.3129459734964322, + "grad_norm": 8.941544532775879, + "learning_rate": 6.900452488687784e-05, + "loss": 1.2604, + "step": 307 + }, + { + "epoch": 0.31396534148827726, + "grad_norm": 6.70719575881958, + "learning_rate": 6.923076923076924e-05, + "loss": 0.5111, + "step": 308 + }, + { + "epoch": 0.3149847094801223, + "grad_norm": 7.599255561828613, + "learning_rate": 6.945701357466064e-05, + "loss": 0.7153, + "step": 309 + }, + { + "epoch": 0.3160040774719674, + "grad_norm": 7.323727607727051, + "learning_rate": 6.968325791855204e-05, + "loss": 0.8367, + "step": 310 + }, + { + "epoch": 0.31702344546381245, + "grad_norm": 7.314160346984863, + "learning_rate": 6.990950226244344e-05, + "loss": 0.6643, + "step": 311 + }, + { + "epoch": 0.3180428134556575, + "grad_norm": 8.265671730041504, + "learning_rate": 7.013574660633484e-05, + "loss": 1.0404, + "step": 312 + }, + { + "epoch": 0.31906218144750254, + "grad_norm": 10.820046424865723, + "learning_rate": 7.036199095022625e-05, + "loss": 1.122, + "step": 313 + }, + { + "epoch": 0.3200815494393476, + "grad_norm": 7.194378852844238, + "learning_rate": 7.058823529411765e-05, + "loss": 0.7091, + "step": 314 + }, + { + 
"epoch": 0.3211009174311927, + "grad_norm": 7.764474868774414, + "learning_rate": 7.081447963800906e-05, + "loss": 1.07, + "step": 315 + }, + { + "epoch": 0.3221202854230377, + "grad_norm": 7.757960796356201, + "learning_rate": 7.104072398190046e-05, + "loss": 0.7246, + "step": 316 + }, + { + "epoch": 0.32313965341488277, + "grad_norm": 9.631681442260742, + "learning_rate": 7.126696832579186e-05, + "loss": 0.821, + "step": 317 + }, + { + "epoch": 0.3241590214067278, + "grad_norm": 6.478396892547607, + "learning_rate": 7.149321266968326e-05, + "loss": 0.9556, + "step": 318 + }, + { + "epoch": 0.3251783893985729, + "grad_norm": 8.858171463012695, + "learning_rate": 7.171945701357467e-05, + "loss": 1.2634, + "step": 319 + }, + { + "epoch": 0.32619775739041795, + "grad_norm": 8.02340030670166, + "learning_rate": 7.194570135746607e-05, + "loss": 0.8621, + "step": 320 + }, + { + "epoch": 0.327217125382263, + "grad_norm": 8.634239196777344, + "learning_rate": 7.217194570135747e-05, + "loss": 1.4215, + "step": 321 + }, + { + "epoch": 0.32823649337410804, + "grad_norm": 8.96740436553955, + "learning_rate": 7.239819004524887e-05, + "loss": 0.8894, + "step": 322 + }, + { + "epoch": 0.3292558613659531, + "grad_norm": 4.730165958404541, + "learning_rate": 7.262443438914027e-05, + "loss": 0.4134, + "step": 323 + }, + { + "epoch": 0.3302752293577982, + "grad_norm": 6.1243181228637695, + "learning_rate": 7.285067873303167e-05, + "loss": 0.4147, + "step": 324 + }, + { + "epoch": 0.3312945973496432, + "grad_norm": 7.8853607177734375, + "learning_rate": 7.307692307692307e-05, + "loss": 0.5721, + "step": 325 + }, + { + "epoch": 0.33231396534148827, + "grad_norm": 9.193514823913574, + "learning_rate": 7.330316742081448e-05, + "loss": 0.8541, + "step": 326 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 12.314509391784668, + "learning_rate": 7.352941176470589e-05, + "loss": 2.2959, + "step": 327 + }, + { + "epoch": 0.3343527013251784, + "grad_norm": 4.384552955627441, + 
"learning_rate": 7.375565610859729e-05, + "loss": 0.4452, + "step": 328 + }, + { + "epoch": 0.33537206931702346, + "grad_norm": 5.677075386047363, + "learning_rate": 7.398190045248869e-05, + "loss": 0.5008, + "step": 329 + }, + { + "epoch": 0.3363914373088685, + "grad_norm": 6.752626419067383, + "learning_rate": 7.420814479638009e-05, + "loss": 0.4106, + "step": 330 + }, + { + "epoch": 0.33741080530071355, + "grad_norm": 10.971478462219238, + "learning_rate": 7.44343891402715e-05, + "loss": 0.9237, + "step": 331 + }, + { + "epoch": 0.3384301732925586, + "grad_norm": 7.574080944061279, + "learning_rate": 7.46606334841629e-05, + "loss": 0.6275, + "step": 332 + }, + { + "epoch": 0.3394495412844037, + "grad_norm": 9.538507461547852, + "learning_rate": 7.48868778280543e-05, + "loss": 1.5184, + "step": 333 + }, + { + "epoch": 0.34046890927624873, + "grad_norm": 9.139626502990723, + "learning_rate": 7.511312217194571e-05, + "loss": 1.7865, + "step": 334 + }, + { + "epoch": 0.3414882772680938, + "grad_norm": 9.275596618652344, + "learning_rate": 7.533936651583711e-05, + "loss": 1.5947, + "step": 335 + }, + { + "epoch": 0.3425076452599388, + "grad_norm": 9.375283241271973, + "learning_rate": 7.556561085972851e-05, + "loss": 1.0249, + "step": 336 + }, + { + "epoch": 0.3435270132517839, + "grad_norm": 7.951083660125732, + "learning_rate": 7.579185520361991e-05, + "loss": 1.0227, + "step": 337 + }, + { + "epoch": 0.34454638124362896, + "grad_norm": 9.579297065734863, + "learning_rate": 7.601809954751131e-05, + "loss": 1.28, + "step": 338 + }, + { + "epoch": 0.345565749235474, + "grad_norm": 5.935997486114502, + "learning_rate": 7.624434389140271e-05, + "loss": 0.798, + "step": 339 + }, + { + "epoch": 0.34658511722731905, + "grad_norm": 7.16936731338501, + "learning_rate": 7.647058823529411e-05, + "loss": 1.0408, + "step": 340 + }, + { + "epoch": 0.3476044852191641, + "grad_norm": 9.448662757873535, + "learning_rate": 7.669683257918553e-05, + "loss": 0.9732, + "step": 341 + }, 
+ { + "epoch": 0.3486238532110092, + "grad_norm": 7.747692584991455, + "learning_rate": 7.692307692307693e-05, + "loss": 0.7588, + "step": 342 + }, + { + "epoch": 0.34964322120285424, + "grad_norm": 10.198869705200195, + "learning_rate": 7.714932126696833e-05, + "loss": 0.9615, + "step": 343 + }, + { + "epoch": 0.3506625891946993, + "grad_norm": 8.069470405578613, + "learning_rate": 7.737556561085974e-05, + "loss": 0.9895, + "step": 344 + }, + { + "epoch": 0.3516819571865443, + "grad_norm": 10.662049293518066, + "learning_rate": 7.760180995475114e-05, + "loss": 1.923, + "step": 345 + }, + { + "epoch": 0.3527013251783894, + "grad_norm": 6.53238582611084, + "learning_rate": 7.782805429864254e-05, + "loss": 0.615, + "step": 346 + }, + { + "epoch": 0.35372069317023447, + "grad_norm": 11.10132122039795, + "learning_rate": 7.805429864253394e-05, + "loss": 1.4572, + "step": 347 + }, + { + "epoch": 0.3547400611620795, + "grad_norm": 7.372711181640625, + "learning_rate": 7.828054298642534e-05, + "loss": 1.0083, + "step": 348 + }, + { + "epoch": 0.35575942915392456, + "grad_norm": 7.358077526092529, + "learning_rate": 7.850678733031674e-05, + "loss": 0.922, + "step": 349 + }, + { + "epoch": 0.3567787971457696, + "grad_norm": 8.45017147064209, + "learning_rate": 7.873303167420814e-05, + "loss": 1.3767, + "step": 350 + }, + { + "epoch": 0.3577981651376147, + "grad_norm": 4.858506679534912, + "learning_rate": 7.895927601809954e-05, + "loss": 0.6378, + "step": 351 + }, + { + "epoch": 0.35881753312945974, + "grad_norm": 5.764273643493652, + "learning_rate": 7.918552036199095e-05, + "loss": 0.4063, + "step": 352 + }, + { + "epoch": 0.3598369011213048, + "grad_norm": 8.656686782836914, + "learning_rate": 7.941176470588235e-05, + "loss": 1.0834, + "step": 353 + }, + { + "epoch": 0.36085626911314983, + "grad_norm": 5.824944496154785, + "learning_rate": 7.963800904977376e-05, + "loss": 0.807, + "step": 354 + }, + { + "epoch": 0.36187563710499493, + "grad_norm": 6.73368501663208, + 
"learning_rate": 7.986425339366516e-05, + "loss": 1.0293, + "step": 355 + }, + { + "epoch": 0.36289500509684, + "grad_norm": 5.860096454620361, + "learning_rate": 8.009049773755657e-05, + "loss": 0.4371, + "step": 356 + }, + { + "epoch": 0.363914373088685, + "grad_norm": 5.65436315536499, + "learning_rate": 8.031674208144798e-05, + "loss": 0.4334, + "step": 357 + }, + { + "epoch": 0.36493374108053006, + "grad_norm": 7.566843509674072, + "learning_rate": 8.054298642533938e-05, + "loss": 0.949, + "step": 358 + }, + { + "epoch": 0.3659531090723751, + "grad_norm": 6.286118984222412, + "learning_rate": 8.076923076923078e-05, + "loss": 0.5788, + "step": 359 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 10.212640762329102, + "learning_rate": 8.099547511312218e-05, + "loss": 0.8535, + "step": 360 + }, + { + "epoch": 0.36799184505606525, + "grad_norm": 9.267760276794434, + "learning_rate": 8.122171945701358e-05, + "loss": 1.2529, + "step": 361 + }, + { + "epoch": 0.3690112130479103, + "grad_norm": 8.794651985168457, + "learning_rate": 8.144796380090498e-05, + "loss": 0.8974, + "step": 362 + }, + { + "epoch": 0.37003058103975534, + "grad_norm": 13.3441162109375, + "learning_rate": 8.167420814479638e-05, + "loss": 1.9105, + "step": 363 + }, + { + "epoch": 0.37104994903160043, + "grad_norm": 9.258030891418457, + "learning_rate": 8.190045248868778e-05, + "loss": 0.7717, + "step": 364 + }, + { + "epoch": 0.3720693170234455, + "grad_norm": 6.051854610443115, + "learning_rate": 8.212669683257918e-05, + "loss": 1.1052, + "step": 365 + }, + { + "epoch": 0.3730886850152905, + "grad_norm": 9.53382682800293, + "learning_rate": 8.23529411764706e-05, + "loss": 0.7298, + "step": 366 + }, + { + "epoch": 0.37410805300713557, + "grad_norm": 6.723752498626709, + "learning_rate": 8.2579185520362e-05, + "loss": 0.7039, + "step": 367 + }, + { + "epoch": 0.3751274209989806, + "grad_norm": 6.844725608825684, + "learning_rate": 8.28054298642534e-05, + "loss": 0.8536, + "step": 368 + }, + { 
+ "epoch": 0.3761467889908257, + "grad_norm": 5.233691692352295, + "learning_rate": 8.303167420814481e-05, + "loss": 0.4774, + "step": 369 + }, + { + "epoch": 0.37716615698267075, + "grad_norm": 4.231795787811279, + "learning_rate": 8.325791855203621e-05, + "loss": 0.3297, + "step": 370 + }, + { + "epoch": 0.3781855249745158, + "grad_norm": 11.760458946228027, + "learning_rate": 8.348416289592761e-05, + "loss": 1.693, + "step": 371 + }, + { + "epoch": 0.37920489296636084, + "grad_norm": 10.05996036529541, + "learning_rate": 8.371040723981901e-05, + "loss": 0.853, + "step": 372 + }, + { + "epoch": 0.38022426095820594, + "grad_norm": 8.649154663085938, + "learning_rate": 8.393665158371041e-05, + "loss": 0.7242, + "step": 373 + }, + { + "epoch": 0.381243628950051, + "grad_norm": 6.6194748878479, + "learning_rate": 8.416289592760181e-05, + "loss": 0.5019, + "step": 374 + }, + { + "epoch": 0.382262996941896, + "grad_norm": 8.058365821838379, + "learning_rate": 8.438914027149321e-05, + "loss": 0.6206, + "step": 375 + }, + { + "epoch": 0.38328236493374107, + "grad_norm": 6.66504430770874, + "learning_rate": 8.461538461538461e-05, + "loss": 0.4872, + "step": 376 + }, + { + "epoch": 0.3843017329255861, + "grad_norm": 5.8679518699646, + "learning_rate": 8.484162895927601e-05, + "loss": 0.4515, + "step": 377 + }, + { + "epoch": 0.3853211009174312, + "grad_norm": 9.830297470092773, + "learning_rate": 8.506787330316743e-05, + "loss": 1.4657, + "step": 378 + }, + { + "epoch": 0.38634046890927626, + "grad_norm": 8.260361671447754, + "learning_rate": 8.529411764705883e-05, + "loss": 0.8411, + "step": 379 + }, + { + "epoch": 0.3873598369011213, + "grad_norm": 8.48035717010498, + "learning_rate": 8.552036199095023e-05, + "loss": 0.7654, + "step": 380 + }, + { + "epoch": 0.38837920489296635, + "grad_norm": 7.481667518615723, + "learning_rate": 8.574660633484163e-05, + "loss": 0.5413, + "step": 381 + }, + { + "epoch": 0.3893985728848114, + "grad_norm": 5.923032760620117, + 
"learning_rate": 8.597285067873304e-05, + "loss": 0.4594, + "step": 382 + }, + { + "epoch": 0.3904179408766565, + "grad_norm": 11.383003234863281, + "learning_rate": 8.619909502262445e-05, + "loss": 1.2656, + "step": 383 + }, + { + "epoch": 0.39143730886850153, + "grad_norm": 9.154252052307129, + "learning_rate": 8.642533936651585e-05, + "loss": 0.6881, + "step": 384 + }, + { + "epoch": 0.3924566768603466, + "grad_norm": 8.656584739685059, + "learning_rate": 8.665158371040725e-05, + "loss": 0.8169, + "step": 385 + }, + { + "epoch": 0.3934760448521916, + "grad_norm": 9.6775541305542, + "learning_rate": 8.687782805429865e-05, + "loss": 0.937, + "step": 386 + }, + { + "epoch": 0.3944954128440367, + "grad_norm": 12.836816787719727, + "learning_rate": 8.710407239819005e-05, + "loss": 2.1343, + "step": 387 + }, + { + "epoch": 0.39551478083588176, + "grad_norm": 6.1532487869262695, + "learning_rate": 8.733031674208145e-05, + "loss": 0.3644, + "step": 388 + }, + { + "epoch": 0.3965341488277268, + "grad_norm": 6.3952555656433105, + "learning_rate": 8.755656108597285e-05, + "loss": 0.4406, + "step": 389 + }, + { + "epoch": 0.39755351681957185, + "grad_norm": 7.005934238433838, + "learning_rate": 8.778280542986425e-05, + "loss": 0.5444, + "step": 390 + }, + { + "epoch": 0.3985728848114169, + "grad_norm": 8.97732925415039, + "learning_rate": 8.800904977375566e-05, + "loss": 1.3891, + "step": 391 + }, + { + "epoch": 0.399592252803262, + "grad_norm": 6.8778181076049805, + "learning_rate": 8.823529411764706e-05, + "loss": 0.6287, + "step": 392 + }, + { + "epoch": 0.399592252803262, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8465664982795715, + "eval_Qnli-dev-1024_cosine_ap": 0.7683064400770494, + "eval_Qnli-dev-1024_cosine_f1": 0.6976744186046511, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.834477424621582, + "eval_Qnli-dev-1024_cosine_mcc": 0.45496263625850347, + "eval_Qnli-dev-1024_cosine_precision": 
0.7317073170731707, + "eval_Qnli-dev-1024_cosine_recall": 0.6666666666666666, + "eval_Qnli-dev_cosine_accuracy": 0.75, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.74493807554245, + "eval_Qnli-dev_cosine_ap": 0.7575725381948821, + "eval_Qnli-dev_cosine_f1": 0.7476635514018692, + "eval_Qnli-dev_cosine_f1_threshold": 0.7015562057495117, + "eval_Qnli-dev_cosine_mcc": 0.47737827504723207, + "eval_Qnli-dev_cosine_precision": 0.6451612903225806, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9270833134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9479166865348816, + "eval_global_dataset_loss": 0.3703947365283966, + "eval_global_dataset_runtime": 104.3143, + "eval_global_dataset_samples_per_second": 7.698, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.9270833134651184, + "eval_sts-test-1024_pearson_cosine": 0.8782079507952609, + "eval_sts-test-1024_spearman_cosine": 0.9080003485202497, + "eval_sts-test_pearson_cosine": 0.9052799671643099, + "eval_sts-test_spearman_cosine": 0.9200953636370672, + "step": 392 + }, + { + "epoch": 0.40061162079510704, + "grad_norm": 7.236085414886475, + "learning_rate": 8.846153846153847e-05, + "loss": 1.066, + "step": 393 + }, + { + "epoch": 0.4016309887869521, + "grad_norm": 7.638827323913574, + "learning_rate": 8.868778280542987e-05, + "loss": 1.0406, + "step": 394 + }, + { + "epoch": 0.4026503567787971, + "grad_norm": 6.2278876304626465, + "learning_rate": 8.891402714932127e-05, + "loss": 0.819, + "step": 395 + }, + { + "epoch": 0.4036697247706422, + "grad_norm": 7.04884147644043, + "learning_rate": 8.914027149321268e-05, + "loss": 0.5826, + "step": 396 + }, + { + "epoch": 0.40468909276248727, + "grad_norm": 8.24869441986084, + "learning_rate": 8.936651583710408e-05, + "loss": 0.6355, + "step": 397 + }, + { + "epoch": 0.4057084607543323, + "grad_norm": 9.9276704788208, + "learning_rate": 8.959276018099548e-05, + "loss": 0.7566, + "step": 398 + 
}, + { + "epoch": 0.40672782874617736, + "grad_norm": 8.717905044555664, + "learning_rate": 8.981900452488688e-05, + "loss": 0.8174, + "step": 399 + }, + { + "epoch": 0.4077471967380224, + "grad_norm": 8.515538215637207, + "learning_rate": 9.004524886877828e-05, + "loss": 0.8905, + "step": 400 + }, + { + "epoch": 0.4087665647298675, + "grad_norm": 6.506967067718506, + "learning_rate": 9.027149321266968e-05, + "loss": 0.6646, + "step": 401 + }, + { + "epoch": 0.40978593272171254, + "grad_norm": 9.33711051940918, + "learning_rate": 9.049773755656108e-05, + "loss": 0.9056, + "step": 402 + }, + { + "epoch": 0.4108053007135576, + "grad_norm": 5.124199867248535, + "learning_rate": 9.07239819004525e-05, + "loss": 0.3689, + "step": 403 + }, + { + "epoch": 0.41182466870540263, + "grad_norm": 5.597712516784668, + "learning_rate": 9.09502262443439e-05, + "loss": 0.3709, + "step": 404 + }, + { + "epoch": 0.41284403669724773, + "grad_norm": 7.897356033325195, + "learning_rate": 9.11764705882353e-05, + "loss": 0.6708, + "step": 405 + }, + { + "epoch": 0.4138634046890928, + "grad_norm": 8.37096881866455, + "learning_rate": 9.14027149321267e-05, + "loss": 1.0531, + "step": 406 + }, + { + "epoch": 0.4148827726809378, + "grad_norm": 7.530358791351318, + "learning_rate": 9.16289592760181e-05, + "loss": 1.1355, + "step": 407 + }, + { + "epoch": 0.41590214067278286, + "grad_norm": 10.304217338562012, + "learning_rate": 9.18552036199095e-05, + "loss": 0.8042, + "step": 408 + }, + { + "epoch": 0.4169215086646279, + "grad_norm": 7.292766094207764, + "learning_rate": 9.20814479638009e-05, + "loss": 0.3915, + "step": 409 + }, + { + "epoch": 0.417940876656473, + "grad_norm": 10.453197479248047, + "learning_rate": 9.230769230769232e-05, + "loss": 1.9388, + "step": 410 + }, + { + "epoch": 0.41896024464831805, + "grad_norm": 2.7471694946289062, + "learning_rate": 9.253393665158372e-05, + "loss": 0.3044, + "step": 411 + }, + { + "epoch": 0.4199796126401631, + "grad_norm": 5.923367023468018, + 
"learning_rate": 9.276018099547512e-05, + "loss": 0.6153, + "step": 412 + }, + { + "epoch": 0.42099898063200814, + "grad_norm": 8.176202774047852, + "learning_rate": 9.298642533936652e-05, + "loss": 0.9407, + "step": 413 + }, + { + "epoch": 0.42201834862385323, + "grad_norm": 8.41361141204834, + "learning_rate": 9.321266968325792e-05, + "loss": 0.6876, + "step": 414 + }, + { + "epoch": 0.4230377166156983, + "grad_norm": 9.516852378845215, + "learning_rate": 9.343891402714933e-05, + "loss": 0.9694, + "step": 415 + }, + { + "epoch": 0.4240570846075433, + "grad_norm": 7.201638698577881, + "learning_rate": 9.366515837104073e-05, + "loss": 0.7868, + "step": 416 + }, + { + "epoch": 0.42507645259938837, + "grad_norm": 9.961840629577637, + "learning_rate": 9.389140271493213e-05, + "loss": 0.7735, + "step": 417 + }, + { + "epoch": 0.4260958205912334, + "grad_norm": 10.842241287231445, + "learning_rate": 9.411764705882353e-05, + "loss": 1.1682, + "step": 418 + }, + { + "epoch": 0.4271151885830785, + "grad_norm": 5.817572116851807, + "learning_rate": 9.434389140271494e-05, + "loss": 0.3465, + "step": 419 + }, + { + "epoch": 0.42813455657492355, + "grad_norm": 6.870133399963379, + "learning_rate": 9.457013574660634e-05, + "loss": 0.5699, + "step": 420 + }, + { + "epoch": 0.4291539245667686, + "grad_norm": 6.472342014312744, + "learning_rate": 9.479638009049774e-05, + "loss": 0.6128, + "step": 421 + }, + { + "epoch": 0.43017329255861364, + "grad_norm": 6.5723795890808105, + "learning_rate": 9.502262443438914e-05, + "loss": 0.8886, + "step": 422 + }, + { + "epoch": 0.43119266055045874, + "grad_norm": 6.1384429931640625, + "learning_rate": 9.524886877828054e-05, + "loss": 0.5124, + "step": 423 + }, + { + "epoch": 0.4322120285423038, + "grad_norm": 6.241471290588379, + "learning_rate": 9.547511312217195e-05, + "loss": 0.4409, + "step": 424 + }, + { + "epoch": 0.4332313965341488, + "grad_norm": 9.087861061096191, + "learning_rate": 9.570135746606335e-05, + "loss": 0.6368, + "step": 
425 + }, + { + "epoch": 0.43425076452599387, + "grad_norm": 9.653539657592773, + "learning_rate": 9.592760180995475e-05, + "loss": 0.9874, + "step": 426 + }, + { + "epoch": 0.4352701325178389, + "grad_norm": 13.366517066955566, + "learning_rate": 9.615384615384617e-05, + "loss": 1.6544, + "step": 427 + }, + { + "epoch": 0.436289500509684, + "grad_norm": 6.302597522735596, + "learning_rate": 9.638009049773757e-05, + "loss": 0.4561, + "step": 428 + }, + { + "epoch": 0.43730886850152906, + "grad_norm": 7.133030891418457, + "learning_rate": 9.660633484162897e-05, + "loss": 0.5443, + "step": 429 + }, + { + "epoch": 0.4383282364933741, + "grad_norm": 6.341556072235107, + "learning_rate": 9.683257918552037e-05, + "loss": 0.5183, + "step": 430 + }, + { + "epoch": 0.43934760448521915, + "grad_norm": 10.657116889953613, + "learning_rate": 9.705882352941177e-05, + "loss": 1.1585, + "step": 431 + }, + { + "epoch": 0.44036697247706424, + "grad_norm": 7.707142353057861, + "learning_rate": 9.728506787330317e-05, + "loss": 1.4285, + "step": 432 + }, + { + "epoch": 0.4413863404689093, + "grad_norm": 8.27905559539795, + "learning_rate": 9.751131221719457e-05, + "loss": 1.0638, + "step": 433 + }, + { + "epoch": 0.44240570846075433, + "grad_norm": 5.601058483123779, + "learning_rate": 9.773755656108597e-05, + "loss": 0.553, + "step": 434 + }, + { + "epoch": 0.4434250764525994, + "grad_norm": 9.084299087524414, + "learning_rate": 9.796380090497737e-05, + "loss": 1.0009, + "step": 435 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 5.231532573699951, + "learning_rate": 9.819004524886877e-05, + "loss": 0.5211, + "step": 436 + }, + { + "epoch": 0.4454638124362895, + "grad_norm": 7.0478715896606445, + "learning_rate": 9.841628959276019e-05, + "loss": 0.6483, + "step": 437 + }, + { + "epoch": 0.44648318042813456, + "grad_norm": 8.44166088104248, + "learning_rate": 9.864253393665159e-05, + "loss": 1.2634, + "step": 438 + }, + { + "epoch": 0.4475025484199796, + "grad_norm": 
7.2984771728515625, + "learning_rate": 9.8868778280543e-05, + "loss": 0.5242, + "step": 439 + }, + { + "epoch": 0.44852191641182465, + "grad_norm": 9.091867446899414, + "learning_rate": 9.90950226244344e-05, + "loss": 1.3739, + "step": 440 + }, + { + "epoch": 0.44954128440366975, + "grad_norm": 8.1068115234375, + "learning_rate": 9.93212669683258e-05, + "loss": 1.0153, + "step": 441 + }, + { + "epoch": 0.4505606523955148, + "grad_norm": 7.902680397033691, + "learning_rate": 9.95475113122172e-05, + "loss": 0.7174, + "step": 442 + }, + { + "epoch": 0.45158002038735984, + "grad_norm": 8.784537315368652, + "learning_rate": 9.97737556561086e-05, + "loss": 0.8631, + "step": 443 + }, + { + "epoch": 0.4525993883792049, + "grad_norm": 8.205148696899414, + "learning_rate": 0.0001, + "loss": 1.0721, + "step": 444 + }, + { + "epoch": 0.4536187563710499, + "grad_norm": 4.789169788360596, + "learning_rate": 9.999964497873585e-05, + "loss": 0.3682, + "step": 445 + }, + { + "epoch": 0.454638124362895, + "grad_norm": 11.335341453552246, + "learning_rate": 9.999857991998499e-05, + "loss": 1.3278, + "step": 446 + }, + { + "epoch": 0.45565749235474007, + "grad_norm": 8.901962280273438, + "learning_rate": 9.999680483887217e-05, + "loss": 0.665, + "step": 447 + }, + { + "epoch": 0.4566768603465851, + "grad_norm": 6.525248050689697, + "learning_rate": 9.999431976060504e-05, + "loss": 0.77, + "step": 448 + }, + { + "epoch": 0.45769622833843016, + "grad_norm": 7.658937931060791, + "learning_rate": 9.999112472047386e-05, + "loss": 0.9903, + "step": 449 + }, + { + "epoch": 0.45871559633027525, + "grad_norm": 5.406915664672852, + "learning_rate": 9.998721976385087e-05, + "loss": 0.3372, + "step": 450 + }, + { + "epoch": 0.4597349643221203, + "grad_norm": 5.920129299163818, + "learning_rate": 9.998260494618979e-05, + "loss": 0.6911, + "step": 451 + }, + { + "epoch": 0.46075433231396534, + "grad_norm": 7.490262985229492, + "learning_rate": 9.997728033302496e-05, + "loss": 0.505, + "step": 452 + 
}, + { + "epoch": 0.4617737003058104, + "grad_norm": 8.21649169921875, + "learning_rate": 9.997124599997043e-05, + "loss": 1.3397, + "step": 453 + }, + { + "epoch": 0.46279306829765543, + "grad_norm": 5.116532802581787, + "learning_rate": 9.996450203271886e-05, + "loss": 0.2853, + "step": 454 + }, + { + "epoch": 0.46381243628950053, + "grad_norm": 7.29067325592041, + "learning_rate": 9.995704852704029e-05, + "loss": 1.198, + "step": 455 + }, + { + "epoch": 0.4648318042813456, + "grad_norm": 10.033268928527832, + "learning_rate": 9.994888558878086e-05, + "loss": 1.7965, + "step": 456 + }, + { + "epoch": 0.4658511722731906, + "grad_norm": 5.4102606773376465, + "learning_rate": 9.994001333386125e-05, + "loss": 0.2987, + "step": 457 + }, + { + "epoch": 0.46687054026503566, + "grad_norm": 8.109895706176758, + "learning_rate": 9.993043188827501e-05, + "loss": 0.6864, + "step": 458 + }, + { + "epoch": 0.46788990825688076, + "grad_norm": 9.893292427062988, + "learning_rate": 9.992014138808682e-05, + "loss": 0.9016, + "step": 459 + }, + { + "epoch": 0.4689092762487258, + "grad_norm": 7.73169469833374, + "learning_rate": 9.990914197943053e-05, + "loss": 0.7314, + "step": 460 + }, + { + "epoch": 0.46992864424057085, + "grad_norm": 8.335735321044922, + "learning_rate": 9.989743381850711e-05, + "loss": 0.6633, + "step": 461 + }, + { + "epoch": 0.4709480122324159, + "grad_norm": 8.655631065368652, + "learning_rate": 9.988501707158243e-05, + "loss": 0.9783, + "step": 462 + }, + { + "epoch": 0.47196738022426094, + "grad_norm": 9.166102409362793, + "learning_rate": 9.987189191498479e-05, + "loss": 1.1307, + "step": 463 + }, + { + "epoch": 0.47298674821610603, + "grad_norm": 10.597552299499512, + "learning_rate": 9.985805853510262e-05, + "loss": 1.4662, + "step": 464 + }, + { + "epoch": 0.4740061162079511, + "grad_norm": 10.318975448608398, + "learning_rate": 9.984351712838167e-05, + "loss": 1.4666, + "step": 465 + }, + { + "epoch": 0.4750254841997961, + "grad_norm": 
7.259106636047363, + "learning_rate": 9.98282679013223e-05, + "loss": 0.5918, + "step": 466 + }, + { + "epoch": 0.47604485219164117, + "grad_norm": 10.192667961120605, + "learning_rate": 9.981231107047648e-05, + "loss": 1.5836, + "step": 467 + }, + { + "epoch": 0.47706422018348627, + "grad_norm": 6.506603717803955, + "learning_rate": 9.97956468624448e-05, + "loss": 0.5698, + "step": 468 + }, + { + "epoch": 0.4780835881753313, + "grad_norm": 5.789127349853516, + "learning_rate": 9.977827551387318e-05, + "loss": 0.4654, + "step": 469 + }, + { + "epoch": 0.47910295616717635, + "grad_norm": 8.615316390991211, + "learning_rate": 9.976019727144956e-05, + "loss": 0.9522, + "step": 470 + }, + { + "epoch": 0.4801223241590214, + "grad_norm": 4.792436599731445, + "learning_rate": 9.974141239190034e-05, + "loss": 0.4748, + "step": 471 + }, + { + "epoch": 0.48114169215086644, + "grad_norm": 9.958406448364258, + "learning_rate": 9.972192114198677e-05, + "loss": 2.0766, + "step": 472 + }, + { + "epoch": 0.48216106014271154, + "grad_norm": 4.340735912322998, + "learning_rate": 9.970172379850122e-05, + "loss": 0.7071, + "step": 473 + }, + { + "epoch": 0.4831804281345566, + "grad_norm": 7.170680999755859, + "learning_rate": 9.968082064826314e-05, + "loss": 0.435, + "step": 474 + }, + { + "epoch": 0.4841997961264016, + "grad_norm": 3.927189350128174, + "learning_rate": 9.965921198811501e-05, + "loss": 0.4551, + "step": 475 + }, + { + "epoch": 0.48521916411824667, + "grad_norm": 10.183062553405762, + "learning_rate": 9.96368981249182e-05, + "loss": 1.1758, + "step": 476 + }, + { + "epoch": 0.48623853211009177, + "grad_norm": 9.819293022155762, + "learning_rate": 9.961387937554857e-05, + "loss": 0.9995, + "step": 477 + }, + { + "epoch": 0.4872579001019368, + "grad_norm": 11.188612937927246, + "learning_rate": 9.95901560668919e-05, + "loss": 1.6207, + "step": 478 + }, + { + "epoch": 0.48827726809378186, + "grad_norm": 7.268994331359863, + "learning_rate": 9.95657285358394e-05, + "loss": 
0.6978, + "step": 479 + }, + { + "epoch": 0.4892966360856269, + "grad_norm": 5.575627326965332, + "learning_rate": 9.954059712928275e-05, + "loss": 0.4236, + "step": 480 + }, + { + "epoch": 0.49031600407747195, + "grad_norm": 9.621591567993164, + "learning_rate": 9.951476220410929e-05, + "loss": 1.8218, + "step": 481 + }, + { + "epoch": 0.49133537206931704, + "grad_norm": 7.322023391723633, + "learning_rate": 9.948822412719697e-05, + "loss": 0.8749, + "step": 482 + }, + { + "epoch": 0.4923547400611621, + "grad_norm": 8.407424926757812, + "learning_rate": 9.946098327540902e-05, + "loss": 1.1704, + "step": 483 + }, + { + "epoch": 0.49337410805300713, + "grad_norm": 4.8855438232421875, + "learning_rate": 9.943304003558873e-05, + "loss": 0.5327, + "step": 484 + }, + { + "epoch": 0.4943934760448522, + "grad_norm": 8.738515853881836, + "learning_rate": 9.940439480455386e-05, + "loss": 1.2009, + "step": 485 + }, + { + "epoch": 0.4954128440366973, + "grad_norm": 7.554356575012207, + "learning_rate": 9.937504798909106e-05, + "loss": 0.5427, + "step": 486 + }, + { + "epoch": 0.4964322120285423, + "grad_norm": 8.203272819519043, + "learning_rate": 9.934500000595008e-05, + "loss": 0.5893, + "step": 487 + }, + { + "epoch": 0.49745158002038736, + "grad_norm": 8.477286338806152, + "learning_rate": 9.931425128183782e-05, + "loss": 1.061, + "step": 488 + }, + { + "epoch": 0.4984709480122324, + "grad_norm": 7.389923095703125, + "learning_rate": 9.928280225341232e-05, + "loss": 0.5465, + "step": 489 + }, + { + "epoch": 0.49949031600407745, + "grad_norm": 10.051106452941895, + "learning_rate": 9.925065336727654e-05, + "loss": 0.7035, + "step": 490 + }, + { + "epoch": 0.49949031600407745, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.817620038986206, + "eval_Qnli-dev-1024_cosine_ap": 0.7443202788050278, + "eval_Qnli-dev-1024_cosine_f1": 0.7291666666666667, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.817620038986206, + 
"eval_Qnli-dev-1024_cosine_mcc": 0.46405228758169936, + "eval_Qnli-dev-1024_cosine_precision": 0.6862745098039216, + "eval_Qnli-dev-1024_cosine_recall": 0.7777777777777778, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7795548439025879, + "eval_Qnli-dev_cosine_ap": 0.7446338608862075, + "eval_Qnli-dev_cosine_f1": 0.7378640776699029, + "eval_Qnli-dev_cosine_f1_threshold": 0.6985307335853577, + "eval_Qnli-dev_cosine_mcc": 0.46153029495329345, + "eval_Qnli-dev_cosine_precision": 0.6551724137931034, + "eval_Qnli-dev_cosine_recall": 0.8444444444444444, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.90625, + "eval_allNLI-triplets_cosine_accuracy": 0.9375, + "eval_global_dataset_loss": 0.34814590215682983, + "eval_global_dataset_runtime": 104.2751, + "eval_global_dataset_samples_per_second": 7.701, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.90625, + "eval_sts-test-1024_pearson_cosine": 0.8574057933500303, + "eval_sts-test-1024_spearman_cosine": 0.8986116241995802, + "eval_sts-test_pearson_cosine": 0.9019111579722014, + "eval_sts-test_spearman_cosine": 0.9181479205822737, + "step": 490 + }, + { + "epoch": 0.5005096839959225, + "grad_norm": 10.88537311553955, + "learning_rate": 9.921780507997202e-05, + "loss": 1.6596, + "step": 491 + }, + { + "epoch": 0.5015290519877675, + "grad_norm": 5.0818891525268555, + "learning_rate": 9.918425785797235e-05, + "loss": 0.4475, + "step": 492 + }, + { + "epoch": 0.5025484199796126, + "grad_norm": 12.540839195251465, + "learning_rate": 9.915001217767663e-05, + "loss": 2.0803, + "step": 493 + }, + { + "epoch": 0.5035677879714577, + "grad_norm": 6.171934604644775, + "learning_rate": 9.911506852540267e-05, + "loss": 0.4296, + "step": 494 + }, + { + "epoch": 0.5045871559633027, + "grad_norm": 9.624109268188477, + "learning_rate": 9.907942739738001e-05, + "loss": 1.5435, + "step": 495 + }, + { + "epoch": 0.5056065239551478, + "grad_norm": 
6.020090579986572, + "learning_rate": 9.904308929974302e-05, + "loss": 0.6073, + "step": 496 + }, + { + "epoch": 0.5066258919469928, + "grad_norm": 8.587658882141113, + "learning_rate": 9.900605474852358e-05, + "loss": 1.1774, + "step": 497 + }, + { + "epoch": 0.5076452599388379, + "grad_norm": 6.535181999206543, + "learning_rate": 9.896832426964382e-05, + "loss": 0.8951, + "step": 498 + }, + { + "epoch": 0.508664627930683, + "grad_norm": 5.945138454437256, + "learning_rate": 9.892989839890863e-05, + "loss": 0.3775, + "step": 499 + }, + { + "epoch": 0.509683995922528, + "grad_norm": 7.641120433807373, + "learning_rate": 9.889077768199806e-05, + "loss": 0.8086, + "step": 500 + }, + { + "epoch": 0.5107033639143731, + "grad_norm": 8.10549545288086, + "learning_rate": 9.885096267445957e-05, + "loss": 0.3864, + "step": 501 + }, + { + "epoch": 0.5117227319062182, + "grad_norm": 7.266530990600586, + "learning_rate": 9.881045394170012e-05, + "loss": 0.8865, + "step": 502 + }, + { + "epoch": 0.5127420998980632, + "grad_norm": 9.056779861450195, + "learning_rate": 9.876925205897818e-05, + "loss": 0.567, + "step": 503 + }, + { + "epoch": 0.5137614678899083, + "grad_norm": 7.140566349029541, + "learning_rate": 9.872735761139554e-05, + "loss": 0.9304, + "step": 504 + }, + { + "epoch": 0.5147808358817533, + "grad_norm": 11.422016143798828, + "learning_rate": 9.868477119388896e-05, + "loss": 0.6977, + "step": 505 + }, + { + "epoch": 0.5158002038735984, + "grad_norm": 11.155719757080078, + "learning_rate": 9.864149341122181e-05, + "loss": 1.3174, + "step": 506 + }, + { + "epoch": 0.5168195718654435, + "grad_norm": 8.781103134155273, + "learning_rate": 9.859752487797542e-05, + "loss": 1.2481, + "step": 507 + }, + { + "epoch": 0.5178389398572885, + "grad_norm": 5.503263473510742, + "learning_rate": 9.855286621854034e-05, + "loss": 0.4894, + "step": 508 + }, + { + "epoch": 0.5188583078491336, + "grad_norm": 7.503839015960693, + "learning_rate": 9.850751806710753e-05, + "loss": 
0.8095, + "step": 509 + }, + { + "epoch": 0.5198776758409785, + "grad_norm": 5.623706817626953, + "learning_rate": 9.846148106765933e-05, + "loss": 0.388, + "step": 510 + }, + { + "epoch": 0.5208970438328236, + "grad_norm": 5.178555965423584, + "learning_rate": 9.841475587396028e-05, + "loss": 0.6725, + "step": 511 + }, + { + "epoch": 0.5219164118246687, + "grad_norm": 7.296833038330078, + "learning_rate": 9.836734314954785e-05, + "loss": 0.4804, + "step": 512 + }, + { + "epoch": 0.5229357798165137, + "grad_norm": 8.692532539367676, + "learning_rate": 9.831924356772308e-05, + "loss": 1.2414, + "step": 513 + }, + { + "epoch": 0.5239551478083588, + "grad_norm": 9.865914344787598, + "learning_rate": 9.827045781154093e-05, + "loss": 1.0319, + "step": 514 + }, + { + "epoch": 0.5249745158002038, + "grad_norm": 14.857895851135254, + "learning_rate": 9.822098657380065e-05, + "loss": 2.0732, + "step": 515 + }, + { + "epoch": 0.5259938837920489, + "grad_norm": 6.85409688949585, + "learning_rate": 9.817083055703587e-05, + "loss": 1.1168, + "step": 516 + }, + { + "epoch": 0.527013251783894, + "grad_norm": 10.668725967407227, + "learning_rate": 9.811999047350471e-05, + "loss": 0.8056, + "step": 517 + }, + { + "epoch": 0.528032619775739, + "grad_norm": 6.76224946975708, + "learning_rate": 9.806846704517957e-05, + "loss": 0.5322, + "step": 518 + }, + { + "epoch": 0.5290519877675841, + "grad_norm": 4.4465789794921875, + "learning_rate": 9.801626100373699e-05, + "loss": 0.4348, + "step": 519 + }, + { + "epoch": 0.5300713557594292, + "grad_norm": 8.388195991516113, + "learning_rate": 9.796337309054717e-05, + "loss": 0.6316, + "step": 520 + }, + { + "epoch": 0.5310907237512742, + "grad_norm": 5.859539031982422, + "learning_rate": 9.790980405666344e-05, + "loss": 0.3212, + "step": 521 + }, + { + "epoch": 0.5321100917431193, + "grad_norm": 6.299170017242432, + "learning_rate": 9.785555466281162e-05, + "loss": 0.4739, + "step": 522 + }, + { + "epoch": 0.5331294597349643, + "grad_norm": 
9.609426498413086, + "learning_rate": 9.780062567937928e-05, + "loss": 1.1692, + "step": 523 + }, + { + "epoch": 0.5341488277268094, + "grad_norm": 9.116230964660645, + "learning_rate": 9.774501788640471e-05, + "loss": 0.941, + "step": 524 + }, + { + "epoch": 0.5351681957186545, + "grad_norm": 5.024673938751221, + "learning_rate": 9.768873207356586e-05, + "loss": 0.3767, + "step": 525 + }, + { + "epoch": 0.5361875637104995, + "grad_norm": 7.534763336181641, + "learning_rate": 9.763176904016913e-05, + "loss": 0.5264, + "step": 526 + }, + { + "epoch": 0.5372069317023446, + "grad_norm": 7.897163391113281, + "learning_rate": 9.757412959513807e-05, + "loss": 0.4345, + "step": 527 + }, + { + "epoch": 0.5382262996941896, + "grad_norm": 8.391239166259766, + "learning_rate": 9.751581455700181e-05, + "loss": 1.0352, + "step": 528 + }, + { + "epoch": 0.5392456676860347, + "grad_norm": 6.951046466827393, + "learning_rate": 9.745682475388348e-05, + "loss": 1.1014, + "step": 529 + }, + { + "epoch": 0.5402650356778798, + "grad_norm": 6.4283671379089355, + "learning_rate": 9.73971610234885e-05, + "loss": 0.7368, + "step": 530 + }, + { + "epoch": 0.5412844036697247, + "grad_norm": 7.643414497375488, + "learning_rate": 9.733682421309256e-05, + "loss": 0.5324, + "step": 531 + }, + { + "epoch": 0.5423037716615698, + "grad_norm": 7.95609188079834, + "learning_rate": 9.727581517952969e-05, + "loss": 0.5351, + "step": 532 + }, + { + "epoch": 0.5433231396534148, + "grad_norm": 11.28146743774414, + "learning_rate": 9.721413478918007e-05, + "loss": 1.6815, + "step": 533 + }, + { + "epoch": 0.5443425076452599, + "grad_norm": 8.346885681152344, + "learning_rate": 9.715178391795769e-05, + "loss": 0.8125, + "step": 534 + }, + { + "epoch": 0.545361875637105, + "grad_norm": 8.147517204284668, + "learning_rate": 9.708876345129797e-05, + "loss": 0.8629, + "step": 535 + }, + { + "epoch": 0.54638124362895, + "grad_norm": 10.061439514160156, + "learning_rate": 9.702507428414513e-05, + "loss": 1.3161, 
+ "step": 536 + }, + { + "epoch": 0.5474006116207951, + "grad_norm": 8.882964134216309, + "learning_rate": 9.696071732093952e-05, + "loss": 1.0465, + "step": 537 + }, + { + "epoch": 0.5484199796126402, + "grad_norm": 5.954410076141357, + "learning_rate": 9.689569347560475e-05, + "loss": 0.4531, + "step": 538 + }, + { + "epoch": 0.5494393476044852, + "grad_norm": 10.33085823059082, + "learning_rate": 9.683000367153474e-05, + "loss": 0.5567, + "step": 539 + }, + { + "epoch": 0.5504587155963303, + "grad_norm": 5.265343189239502, + "learning_rate": 9.676364884158058e-05, + "loss": 0.7093, + "step": 540 + }, + { + "epoch": 0.5514780835881753, + "grad_norm": 10.214452743530273, + "learning_rate": 9.66966299280373e-05, + "loss": 1.9339, + "step": 541 + }, + { + "epoch": 0.5524974515800204, + "grad_norm": 7.001688480377197, + "learning_rate": 9.662894788263044e-05, + "loss": 0.3659, + "step": 542 + }, + { + "epoch": 0.5535168195718655, + "grad_norm": 6.640339374542236, + "learning_rate": 9.656060366650267e-05, + "loss": 1.0505, + "step": 543 + }, + { + "epoch": 0.5545361875637105, + "grad_norm": 9.303877830505371, + "learning_rate": 9.649159825019996e-05, + "loss": 0.8766, + "step": 544 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 8.21275806427002, + "learning_rate": 9.642193261365791e-05, + "loss": 0.6526, + "step": 545 + }, + { + "epoch": 0.5565749235474006, + "grad_norm": 6.97646427154541, + "learning_rate": 9.635160774618782e-05, + "loss": 0.5529, + "step": 546 + }, + { + "epoch": 0.5575942915392457, + "grad_norm": 6.77686071395874, + "learning_rate": 9.628062464646264e-05, + "loss": 0.4817, + "step": 547 + }, + { + "epoch": 0.5586136595310908, + "grad_norm": 3.5217092037200928, + "learning_rate": 9.620898432250272e-05, + "loss": 0.4804, + "step": 548 + }, + { + "epoch": 0.5596330275229358, + "grad_norm": 5.6369476318359375, + "learning_rate": 9.613668779166165e-05, + "loss": 0.4508, + "step": 549 + }, + { + "epoch": 0.5606523955147809, + "grad_norm": 
5.534257888793945, + "learning_rate": 9.606373608061162e-05, + "loss": 0.4339, + "step": 550 + }, + { + "epoch": 0.5616717635066258, + "grad_norm": 10.922380447387695, + "learning_rate": 9.5990130225329e-05, + "loss": 0.712, + "step": 551 + }, + { + "epoch": 0.5626911314984709, + "grad_norm": 6.2288360595703125, + "learning_rate": 9.59158712710795e-05, + "loss": 0.3974, + "step": 552 + }, + { + "epoch": 0.563710499490316, + "grad_norm": 11.958196640014648, + "learning_rate": 9.58409602724035e-05, + "loss": 1.0016, + "step": 553 + }, + { + "epoch": 0.564729867482161, + "grad_norm": 8.267114639282227, + "learning_rate": 9.576539829310085e-05, + "loss": 0.5751, + "step": 554 + }, + { + "epoch": 0.5657492354740061, + "grad_norm": 11.533574104309082, + "learning_rate": 9.568918640621594e-05, + "loss": 1.111, + "step": 555 + }, + { + "epoch": 0.5667686034658511, + "grad_norm": 6.519062519073486, + "learning_rate": 9.561232569402239e-05, + "loss": 0.4202, + "step": 556 + }, + { + "epoch": 0.5677879714576962, + "grad_norm": 9.009593963623047, + "learning_rate": 9.553481724800768e-05, + "loss": 0.7822, + "step": 557 + }, + { + "epoch": 0.5688073394495413, + "grad_norm": 12.121257781982422, + "learning_rate": 9.545666216885767e-05, + "loss": 1.3844, + "step": 558 + }, + { + "epoch": 0.5698267074413863, + "grad_norm": 5.953427314758301, + "learning_rate": 9.537786156644097e-05, + "loss": 0.3881, + "step": 559 + }, + { + "epoch": 0.5708460754332314, + "grad_norm": 7.334780216217041, + "learning_rate": 9.529841655979315e-05, + "loss": 0.6317, + "step": 560 + }, + { + "epoch": 0.5718654434250765, + "grad_norm": 5.987368583679199, + "learning_rate": 9.521832827710088e-05, + "loss": 0.4976, + "step": 561 + }, + { + "epoch": 0.5728848114169215, + "grad_norm": 3.9462735652923584, + "learning_rate": 9.51375978556859e-05, + "loss": 0.2741, + "step": 562 + }, + { + "epoch": 0.5739041794087666, + "grad_norm": 6.374652862548828, + "learning_rate": 9.505622644198885e-05, + "loss": 0.6232, 
+ "step": 563 + }, + { + "epoch": 0.5749235474006116, + "grad_norm": 3.525486707687378, + "learning_rate": 9.497421519155303e-05, + "loss": 0.2083, + "step": 564 + }, + { + "epoch": 0.5759429153924567, + "grad_norm": 9.60029125213623, + "learning_rate": 9.489156526900795e-05, + "loss": 1.0605, + "step": 565 + }, + { + "epoch": 0.5769622833843018, + "grad_norm": 12.22358226776123, + "learning_rate": 9.480827784805278e-05, + "loss": 1.2086, + "step": 566 + }, + { + "epoch": 0.5779816513761468, + "grad_norm": 4.388841152191162, + "learning_rate": 9.472435411143978e-05, + "loss": 0.2217, + "step": 567 + }, + { + "epoch": 0.5790010193679919, + "grad_norm": 5.581283092498779, + "learning_rate": 9.463979525095738e-05, + "loss": 0.4215, + "step": 568 + }, + { + "epoch": 0.5800203873598369, + "grad_norm": 7.996876239776611, + "learning_rate": 9.455460246741331e-05, + "loss": 0.663, + "step": 569 + }, + { + "epoch": 0.581039755351682, + "grad_norm": 9.21956729888916, + "learning_rate": 9.446877697061757e-05, + "loss": 0.653, + "step": 570 + }, + { + "epoch": 0.582059123343527, + "grad_norm": 8.46827220916748, + "learning_rate": 9.43823199793652e-05, + "loss": 0.6895, + "step": 571 + }, + { + "epoch": 0.583078491335372, + "grad_norm": 9.72203540802002, + "learning_rate": 9.429523272141903e-05, + "loss": 1.1101, + "step": 572 + }, + { + "epoch": 0.5840978593272171, + "grad_norm": 8.79525089263916, + "learning_rate": 9.420751643349219e-05, + "loss": 1.2991, + "step": 573 + }, + { + "epoch": 0.5851172273190621, + "grad_norm": 6.719937801361084, + "learning_rate": 9.411917236123059e-05, + "loss": 0.4072, + "step": 574 + }, + { + "epoch": 0.5861365953109072, + "grad_norm": 8.360040664672852, + "learning_rate": 9.403020175919517e-05, + "loss": 1.169, + "step": 575 + }, + { + "epoch": 0.5871559633027523, + "grad_norm": 5.402820587158203, + "learning_rate": 9.394060589084417e-05, + "loss": 0.3374, + "step": 576 + }, + { + "epoch": 0.5881753312945973, + "grad_norm": 9.037818908691406, 
+ "learning_rate": 9.385038602851515e-05, + "loss": 0.6785, + "step": 577 + }, + { + "epoch": 0.5891946992864424, + "grad_norm": 9.151761054992676, + "learning_rate": 9.375954345340685e-05, + "loss": 1.2757, + "step": 578 + }, + { + "epoch": 0.5902140672782875, + "grad_norm": 5.834461212158203, + "learning_rate": 9.366807945556113e-05, + "loss": 0.5899, + "step": 579 + }, + { + "epoch": 0.5912334352701325, + "grad_norm": 5.722581386566162, + "learning_rate": 9.357599533384453e-05, + "loss": 0.3389, + "step": 580 + }, + { + "epoch": 0.5922528032619776, + "grad_norm": 10.132628440856934, + "learning_rate": 9.348329239592995e-05, + "loss": 1.631, + "step": 581 + }, + { + "epoch": 0.5932721712538226, + "grad_norm": 9.922087669372559, + "learning_rate": 9.338997195827792e-05, + "loss": 1.3975, + "step": 582 + }, + { + "epoch": 0.5942915392456677, + "grad_norm": 8.382550239562988, + "learning_rate": 9.329603534611806e-05, + "loss": 0.4654, + "step": 583 + }, + { + "epoch": 0.5953109072375128, + "grad_norm": 8.080007553100586, + "learning_rate": 9.32014838934301e-05, + "loss": 0.56, + "step": 584 + }, + { + "epoch": 0.5963302752293578, + "grad_norm": 5.616114616394043, + "learning_rate": 9.310631894292518e-05, + "loss": 0.2282, + "step": 585 + }, + { + "epoch": 0.5973496432212029, + "grad_norm": 10.813580513000488, + "learning_rate": 9.301054184602647e-05, + "loss": 1.0754, + "step": 586 + }, + { + "epoch": 0.5983690112130479, + "grad_norm": 8.062788963317871, + "learning_rate": 9.291415396285024e-05, + "loss": 0.4411, + "step": 587 + }, + { + "epoch": 0.599388379204893, + "grad_norm": 8.6395845413208, + "learning_rate": 9.281715666218643e-05, + "loss": 0.9243, + "step": 588 + }, + { + "epoch": 0.599388379204893, + "eval_Qnli-dev-1024_cosine_accuracy": 0.71875, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8436912298202515, + "eval_Qnli-dev-1024_cosine_ap": 0.7587494204458187, + "eval_Qnli-dev-1024_cosine_f1": 0.6875, + "eval_Qnli-dev-1024_cosine_f1_threshold": 
0.8028630018234253, + "eval_Qnli-dev-1024_cosine_mcc": 0.3803921568627451, + "eval_Qnli-dev-1024_cosine_precision": 0.6470588235294118, + "eval_Qnli-dev-1024_cosine_recall": 0.7333333333333333, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8078321218490601, + "eval_Qnli-dev_cosine_ap": 0.7321739553695406, + "eval_Qnli-dev_cosine_f1": 0.7339449541284404, + "eval_Qnli-dev_cosine_f1_threshold": 0.6781572699546814, + "eval_Qnli-dev_cosine_mcc": 0.4428074427700477, + "eval_Qnli-dev_cosine_precision": 0.625, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.90625, + "eval_allNLI-triplets_cosine_accuracy": 0.9375, + "eval_global_dataset_loss": 0.36118289828300476, + "eval_global_dataset_runtime": 104.3983, + "eval_global_dataset_samples_per_second": 7.692, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.90625, + "eval_sts-test-1024_pearson_cosine": 0.8631921152381832, + "eval_sts-test-1024_spearman_cosine": 0.9009700758334896, + "eval_sts-test_pearson_cosine": 0.9009355736320144, + "eval_sts-test_spearman_cosine": 0.9171725695772274, + "step": 588 + }, + { + "epoch": 0.6004077471967381, + "grad_norm": 6.184821128845215, + "learning_rate": 9.271955132147916e-05, + "loss": 0.3572, + "step": 589 + }, + { + "epoch": 0.601427115188583, + "grad_norm": 8.318941116333008, + "learning_rate": 9.262133932680733e-05, + "loss": 0.6761, + "step": 590 + }, + { + "epoch": 0.6024464831804281, + "grad_norm": 7.54533052444458, + "learning_rate": 9.252252207286479e-05, + "loss": 0.5754, + "step": 591 + }, + { + "epoch": 0.6034658511722731, + "grad_norm": 4.341547012329102, + "learning_rate": 9.24231009629406e-05, + "loss": 0.3664, + "step": 592 + }, + { + "epoch": 0.6044852191641182, + "grad_norm": 7.616749286651611, + "learning_rate": 9.232307740889909e-05, + "loss": 0.5391, + "step": 593 + }, + { + "epoch": 0.6055045871559633, + "grad_norm": 4.843873977661133, + 
"learning_rate": 9.222245283115979e-05, + "loss": 0.518, + "step": 594 + }, + { + "epoch": 0.6065239551478083, + "grad_norm": 8.295080184936523, + "learning_rate": 9.21212286586773e-05, + "loss": 0.5263, + "step": 595 + }, + { + "epoch": 0.6075433231396534, + "grad_norm": 3.873260736465454, + "learning_rate": 9.201940632892096e-05, + "loss": 0.2995, + "step": 596 + }, + { + "epoch": 0.6085626911314985, + "grad_norm": 4.403683185577393, + "learning_rate": 9.191698728785448e-05, + "loss": 0.4181, + "step": 597 + }, + { + "epoch": 0.6095820591233435, + "grad_norm": 7.282264709472656, + "learning_rate": 9.181397298991532e-05, + "loss": 0.5087, + "step": 598 + }, + { + "epoch": 0.6106014271151886, + "grad_norm": 5.132986068725586, + "learning_rate": 9.171036489799416e-05, + "loss": 0.4344, + "step": 599 + }, + { + "epoch": 0.6116207951070336, + "grad_norm": 11.096871376037598, + "learning_rate": 9.160616448341403e-05, + "loss": 1.6529, + "step": 600 + }, + { + "epoch": 0.6126401630988787, + "grad_norm": 4.306335926055908, + "learning_rate": 9.150137322590944e-05, + "loss": 0.4079, + "step": 601 + }, + { + "epoch": 0.6136595310907238, + "grad_norm": 5.622674942016602, + "learning_rate": 9.139599261360537e-05, + "loss": 0.3123, + "step": 602 + }, + { + "epoch": 0.6146788990825688, + "grad_norm": 10.172139167785645, + "learning_rate": 9.129002414299617e-05, + "loss": 1.4398, + "step": 603 + }, + { + "epoch": 0.6156982670744139, + "grad_norm": 10.175543785095215, + "learning_rate": 9.118346931892423e-05, + "loss": 1.5553, + "step": 604 + }, + { + "epoch": 0.6167176350662589, + "grad_norm": 7.616044521331787, + "learning_rate": 9.10763296545587e-05, + "loss": 0.7958, + "step": 605 + }, + { + "epoch": 0.617737003058104, + "grad_norm": 5.390756607055664, + "learning_rate": 9.096860667137394e-05, + "loss": 0.3815, + "step": 606 + }, + { + "epoch": 0.6187563710499491, + "grad_norm": 6.750911235809326, + "learning_rate": 9.086030189912794e-05, + "loss": 0.787, + "step": 607 + }, 
+ { + "epoch": 0.6197757390417941, + "grad_norm": 5.695408344268799, + "learning_rate": 9.075141687584057e-05, + "loss": 0.2352, + "step": 608 + }, + { + "epoch": 0.6207951070336392, + "grad_norm": 12.017024040222168, + "learning_rate": 9.06419531477718e-05, + "loss": 0.6469, + "step": 609 + }, + { + "epoch": 0.6218144750254841, + "grad_norm": 9.70870304107666, + "learning_rate": 9.053191226939965e-05, + "loss": 1.0997, + "step": 610 + }, + { + "epoch": 0.6228338430173292, + "grad_norm": 8.183333396911621, + "learning_rate": 9.042129580339822e-05, + "loss": 0.8762, + "step": 611 + }, + { + "epoch": 0.6238532110091743, + "grad_norm": 8.237792015075684, + "learning_rate": 9.031010532061538e-05, + "loss": 0.6259, + "step": 612 + }, + { + "epoch": 0.6248725790010193, + "grad_norm": 7.553733825683594, + "learning_rate": 9.019834240005058e-05, + "loss": 0.8133, + "step": 613 + }, + { + "epoch": 0.6258919469928644, + "grad_norm": 8.876506805419922, + "learning_rate": 9.008600862883235e-05, + "loss": 1.2119, + "step": 614 + }, + { + "epoch": 0.6269113149847095, + "grad_norm": 6.738461971282959, + "learning_rate": 8.997310560219578e-05, + "loss": 0.5068, + "step": 615 + }, + { + "epoch": 0.6279306829765545, + "grad_norm": 9.400090217590332, + "learning_rate": 8.985963492345991e-05, + "loss": 0.7723, + "step": 616 + }, + { + "epoch": 0.6289500509683996, + "grad_norm": 8.690120697021484, + "learning_rate": 8.974559820400486e-05, + "loss": 1.0791, + "step": 617 + }, + { + "epoch": 0.6299694189602446, + "grad_norm": 5.441365718841553, + "learning_rate": 8.963099706324904e-05, + "loss": 0.348, + "step": 618 + }, + { + "epoch": 0.6309887869520897, + "grad_norm": 8.373964309692383, + "learning_rate": 8.951583312862616e-05, + "loss": 0.6523, + "step": 619 + }, + { + "epoch": 0.6320081549439348, + "grad_norm": 8.361169815063477, + "learning_rate": 8.9400108035562e-05, + "loss": 0.6241, + "step": 620 + }, + { + "epoch": 0.6330275229357798, + "grad_norm": 8.095520973205566, + 
"learning_rate": 8.928382342745137e-05, + "loss": 0.5039, + "step": 621 + }, + { + "epoch": 0.6340468909276249, + "grad_norm": 9.879805564880371, + "learning_rate": 8.916698095563453e-05, + "loss": 1.0113, + "step": 622 + }, + { + "epoch": 0.6350662589194699, + "grad_norm": 11.630424499511719, + "learning_rate": 8.904958227937406e-05, + "loss": 1.0527, + "step": 623 + }, + { + "epoch": 0.636085626911315, + "grad_norm": 9.939377784729004, + "learning_rate": 8.893162906583094e-05, + "loss": 1.3893, + "step": 624 + }, + { + "epoch": 0.6371049949031601, + "grad_norm": 7.852113723754883, + "learning_rate": 8.881312299004117e-05, + "loss": 0.7191, + "step": 625 + }, + { + "epoch": 0.6381243628950051, + "grad_norm": 7.134123802185059, + "learning_rate": 8.86940657348918e-05, + "loss": 0.3591, + "step": 626 + }, + { + "epoch": 0.6391437308868502, + "grad_norm": 5.795046806335449, + "learning_rate": 8.857445899109715e-05, + "loss": 0.9856, + "step": 627 + }, + { + "epoch": 0.6401630988786952, + "grad_norm": 7.4533610343933105, + "learning_rate": 8.845430445717469e-05, + "loss": 0.7603, + "step": 628 + }, + { + "epoch": 0.6411824668705403, + "grad_norm": 9.926379203796387, + "learning_rate": 8.8333603839421e-05, + "loss": 1.1553, + "step": 629 + }, + { + "epoch": 0.6422018348623854, + "grad_norm": 7.032261371612549, + "learning_rate": 8.821235885188754e-05, + "loss": 0.5608, + "step": 630 + }, + { + "epoch": 0.6432212028542303, + "grad_norm": 6.283802509307861, + "learning_rate": 8.809057121635624e-05, + "loss": 0.4338, + "step": 631 + }, + { + "epoch": 0.6442405708460754, + "grad_norm": 2.8640384674072266, + "learning_rate": 8.796824266231511e-05, + "loss": 0.1376, + "step": 632 + }, + { + "epoch": 0.6452599388379205, + "grad_norm": 7.722833633422852, + "learning_rate": 8.784537492693368e-05, + "loss": 0.6539, + "step": 633 + }, + { + "epoch": 0.6462793068297655, + "grad_norm": 7.714670658111572, + "learning_rate": 8.772196975503828e-05, + "loss": 0.5017, + "step": 634 + }, 
+ { + "epoch": 0.6472986748216106, + "grad_norm": 4.0773091316223145, + "learning_rate": 8.759802889908733e-05, + "loss": 0.1888, + "step": 635 + }, + { + "epoch": 0.6483180428134556, + "grad_norm": 12.99943733215332, + "learning_rate": 8.747355411914642e-05, + "loss": 1.6077, + "step": 636 + }, + { + "epoch": 0.6493374108053007, + "grad_norm": 10.86596393585205, + "learning_rate": 8.734854718286327e-05, + "loss": 0.9635, + "step": 637 + }, + { + "epoch": 0.6503567787971458, + "grad_norm": 9.243484497070312, + "learning_rate": 8.722300986544272e-05, + "loss": 0.9786, + "step": 638 + }, + { + "epoch": 0.6513761467889908, + "grad_norm": 10.92319393157959, + "learning_rate": 8.709694394962142e-05, + "loss": 0.6728, + "step": 639 + }, + { + "epoch": 0.6523955147808359, + "grad_norm": 11.628253936767578, + "learning_rate": 8.697035122564266e-05, + "loss": 0.8592, + "step": 640 + }, + { + "epoch": 0.6534148827726809, + "grad_norm": 5.602497100830078, + "learning_rate": 8.684323349123075e-05, + "loss": 0.3945, + "step": 641 + }, + { + "epoch": 0.654434250764526, + "grad_norm": 7.681665420532227, + "learning_rate": 8.671559255156567e-05, + "loss": 0.7486, + "step": 642 + }, + { + "epoch": 0.6554536187563711, + "grad_norm": 9.017338752746582, + "learning_rate": 8.658743021925733e-05, + "loss": 0.7793, + "step": 643 + }, + { + "epoch": 0.6564729867482161, + "grad_norm": 5.24987268447876, + "learning_rate": 8.645874831431982e-05, + "loss": 0.4401, + "step": 644 + }, + { + "epoch": 0.6574923547400612, + "grad_norm": 10.270877838134766, + "learning_rate": 8.632954866414567e-05, + "loss": 0.6189, + "step": 645 + }, + { + "epoch": 0.6585117227319062, + "grad_norm": 8.378297805786133, + "learning_rate": 8.619983310347982e-05, + "loss": 0.7339, + "step": 646 + }, + { + "epoch": 0.6595310907237513, + "grad_norm": 6.045844554901123, + "learning_rate": 8.606960347439355e-05, + "loss": 0.4089, + "step": 647 + }, + { + "epoch": 0.6605504587155964, + "grad_norm": 10.432483673095703, + 
"learning_rate": 8.593886162625835e-05, + "loss": 1.1412, + "step": 648 + }, + { + "epoch": 0.6615698267074414, + "grad_norm": 5.939512729644775, + "learning_rate": 8.580760941571967e-05, + "loss": 0.798, + "step": 649 + }, + { + "epoch": 0.6625891946992865, + "grad_norm": 12.093332290649414, + "learning_rate": 8.567584870667056e-05, + "loss": 1.0588, + "step": 650 + }, + { + "epoch": 0.6636085626911316, + "grad_norm": 8.624043464660645, + "learning_rate": 8.554358137022513e-05, + "loss": 0.9044, + "step": 651 + }, + { + "epoch": 0.6646279306829765, + "grad_norm": 7.735975742340088, + "learning_rate": 8.54108092846921e-05, + "loss": 0.4464, + "step": 652 + }, + { + "epoch": 0.6656472986748216, + "grad_norm": 3.8205575942993164, + "learning_rate": 8.527753433554797e-05, + "loss": 0.2756, + "step": 653 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 10.537273406982422, + "learning_rate": 8.51437584154104e-05, + "loss": 1.123, + "step": 654 + }, + { + "epoch": 0.6676860346585117, + "grad_norm": 6.052632808685303, + "learning_rate": 8.500948342401124e-05, + "loss": 0.5377, + "step": 655 + }, + { + "epoch": 0.6687054026503568, + "grad_norm": 7.787528991699219, + "learning_rate": 8.48747112681696e-05, + "loss": 0.5164, + "step": 656 + }, + { + "epoch": 0.6697247706422018, + "grad_norm": 10.115964889526367, + "learning_rate": 8.473944386176469e-05, + "loss": 0.7155, + "step": 657 + }, + { + "epoch": 0.6707441386340469, + "grad_norm": 6.880122184753418, + "learning_rate": 8.460368312570873e-05, + "loss": 0.4512, + "step": 658 + }, + { + "epoch": 0.6717635066258919, + "grad_norm": 8.106338500976562, + "learning_rate": 8.446743098791969e-05, + "loss": 0.6199, + "step": 659 + }, + { + "epoch": 0.672782874617737, + "grad_norm": 11.035154342651367, + "learning_rate": 8.433068938329376e-05, + "loss": 0.6673, + "step": 660 + }, + { + "epoch": 0.6738022426095821, + "grad_norm": 4.484703540802002, + "learning_rate": 8.419346025367809e-05, + "loss": 0.5934, + "step": 661 + }, 
+ { + "epoch": 0.6748216106014271, + "grad_norm": 6.977105140686035, + "learning_rate": 8.4055745547843e-05, + "loss": 0.5034, + "step": 662 + }, + { + "epoch": 0.6758409785932722, + "grad_norm": 5.447470664978027, + "learning_rate": 8.391754722145449e-05, + "loss": 0.4161, + "step": 663 + }, + { + "epoch": 0.6768603465851172, + "grad_norm": 13.200489044189453, + "learning_rate": 8.37788672370463e-05, + "loss": 0.9848, + "step": 664 + }, + { + "epoch": 0.6778797145769623, + "grad_norm": 6.03376579284668, + "learning_rate": 8.36397075639922e-05, + "loss": 0.356, + "step": 665 + }, + { + "epoch": 0.6788990825688074, + "grad_norm": 6.075347900390625, + "learning_rate": 8.350007017847788e-05, + "loss": 0.3031, + "step": 666 + }, + { + "epoch": 0.6799184505606524, + "grad_norm": 5.790109157562256, + "learning_rate": 8.335995706347299e-05, + "loss": 0.254, + "step": 667 + }, + { + "epoch": 0.6809378185524975, + "grad_norm": 11.979147911071777, + "learning_rate": 8.321937020870296e-05, + "loss": 0.8646, + "step": 668 + }, + { + "epoch": 0.6819571865443425, + "grad_norm": 9.445723533630371, + "learning_rate": 8.30783116106207e-05, + "loss": 0.7303, + "step": 669 + }, + { + "epoch": 0.6829765545361876, + "grad_norm": 8.001054763793945, + "learning_rate": 8.293678327237827e-05, + "loss": 0.4105, + "step": 670 + }, + { + "epoch": 0.6839959225280327, + "grad_norm": 4.437264919281006, + "learning_rate": 8.279478720379845e-05, + "loss": 0.2874, + "step": 671 + }, + { + "epoch": 0.6850152905198776, + "grad_norm": 4.547714710235596, + "learning_rate": 8.265232542134622e-05, + "loss": 0.2112, + "step": 672 + }, + { + "epoch": 0.6860346585117227, + "grad_norm": 7.875749588012695, + "learning_rate": 8.250939994810003e-05, + "loss": 1.0919, + "step": 673 + }, + { + "epoch": 0.6870540265035678, + "grad_norm": 5.349310874938965, + "learning_rate": 8.236601281372319e-05, + "loss": 0.5927, + "step": 674 + }, + { + "epoch": 0.6880733944954128, + "grad_norm": 11.490046501159668, + 
"learning_rate": 8.222216605443496e-05, + "loss": 1.011, + "step": 675 + }, + { + "epoch": 0.6890927624872579, + "grad_norm": 7.11298942565918, + "learning_rate": 8.207786171298166e-05, + "loss": 0.5656, + "step": 676 + }, + { + "epoch": 0.6901121304791029, + "grad_norm": 10.48589038848877, + "learning_rate": 8.193310183860771e-05, + "loss": 0.7199, + "step": 677 + }, + { + "epoch": 0.691131498470948, + "grad_norm": 9.364179611206055, + "learning_rate": 8.178788848702643e-05, + "loss": 0.7506, + "step": 678 + }, + { + "epoch": 0.6921508664627931, + "grad_norm": 6.678390026092529, + "learning_rate": 8.164222372039092e-05, + "loss": 0.5386, + "step": 679 + }, + { + "epoch": 0.6931702344546381, + "grad_norm": 6.151979446411133, + "learning_rate": 8.149610960726479e-05, + "loss": 0.6678, + "step": 680 + }, + { + "epoch": 0.6941896024464832, + "grad_norm": 6.415065765380859, + "learning_rate": 8.134954822259271e-05, + "loss": 0.4834, + "step": 681 + }, + { + "epoch": 0.6952089704383282, + "grad_norm": 4.4640326499938965, + "learning_rate": 8.120254164767101e-05, + "loss": 0.2411, + "step": 682 + }, + { + "epoch": 0.6962283384301733, + "grad_norm": 6.626987457275391, + "learning_rate": 8.105509197011807e-05, + "loss": 0.5011, + "step": 683 + }, + { + "epoch": 0.6972477064220184, + "grad_norm": 7.628388404846191, + "learning_rate": 8.090720128384475e-05, + "loss": 0.6573, + "step": 684 + }, + { + "epoch": 0.6982670744138634, + "grad_norm": 3.4043076038360596, + "learning_rate": 8.075887168902459e-05, + "loss": 0.2798, + "step": 685 + }, + { + "epoch": 0.6992864424057085, + "grad_norm": 5.682481288909912, + "learning_rate": 8.061010529206398e-05, + "loss": 0.5887, + "step": 686 + }, + { + "epoch": 0.6992864424057085, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8446075320243835, + "eval_Qnli-dev-1024_cosine_ap": 0.7501532568375827, + "eval_Qnli-dev-1024_cosine_f1": 0.7207207207207208, + 
"eval_Qnli-dev-1024_cosine_f1_threshold": 0.756614089012146, + "eval_Qnli-dev-1024_cosine_mcc": 0.4081269865567241, + "eval_Qnli-dev-1024_cosine_precision": 0.6060606060606061, + "eval_Qnli-dev-1024_cosine_recall": 0.8888888888888888, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7564685344696045, + "eval_Qnli-dev_cosine_ap": 0.731843650475666, + "eval_Qnli-dev_cosine_f1": 0.7378640776699029, + "eval_Qnli-dev_cosine_f1_threshold": 0.6987220048904419, + "eval_Qnli-dev_cosine_mcc": 0.46153029495329345, + "eval_Qnli-dev_cosine_precision": 0.6551724137931034, + "eval_Qnli-dev_cosine_recall": 0.8444444444444444, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9270833134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9583333134651184, + "eval_global_dataset_loss": 0.29955434799194336, + "eval_global_dataset_runtime": 104.3655, + "eval_global_dataset_samples_per_second": 7.694, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.9270833134651184, + "eval_sts-test-1024_pearson_cosine": 0.8628190908797548, + "eval_sts-test-1024_spearman_cosine": 0.9062196010289961, + "eval_sts-test_pearson_cosine": 0.9012940791829644, + "eval_sts-test_spearman_cosine": 0.9179642241352577, + "step": 686 + }, + { + "epoch": 0.7003058103975535, + "grad_norm": 5.198816299438477, + "learning_rate": 8.046090420557231e-05, + "loss": 0.6328, + "step": 687 + }, + { + "epoch": 0.7013251783893986, + "grad_norm": 7.044593811035156, + "learning_rate": 8.031127054833189e-05, + "loss": 0.4322, + "step": 688 + }, + { + "epoch": 0.7023445463812437, + "grad_norm": 13.891091346740723, + "learning_rate": 8.016120644526797e-05, + "loss": 1.1473, + "step": 689 + }, + { + "epoch": 0.7033639143730887, + "grad_norm": 9.329078674316406, + "learning_rate": 8.001071402741842e-05, + "loss": 1.0135, + "step": 690 + }, + { + "epoch": 0.7043832823649337, + "grad_norm": 4.1149210929870605, + "learning_rate": 7.985979543190358e-05, + "loss": 
0.2655, + "step": 691 + }, + { + "epoch": 0.7054026503567788, + "grad_norm": 7.722234725952148, + "learning_rate": 7.970845280189586e-05, + "loss": 0.5053, + "step": 692 + }, + { + "epoch": 0.7064220183486238, + "grad_norm": 6.9180216789245605, + "learning_rate": 7.955668828658937e-05, + "loss": 0.8647, + "step": 693 + }, + { + "epoch": 0.7074413863404689, + "grad_norm": 5.709589004516602, + "learning_rate": 7.940450404116928e-05, + "loss": 0.4423, + "step": 694 + }, + { + "epoch": 0.7084607543323139, + "grad_norm": 4.812499523162842, + "learning_rate": 7.925190222678133e-05, + "loss": 0.3673, + "step": 695 + }, + { + "epoch": 0.709480122324159, + "grad_norm": 11.944628715515137, + "learning_rate": 7.909888501050109e-05, + "loss": 1.1714, + "step": 696 + }, + { + "epoch": 0.7104994903160041, + "grad_norm": 7.61957311630249, + "learning_rate": 7.894545456530316e-05, + "loss": 0.8142, + "step": 697 + }, + { + "epoch": 0.7115188583078491, + "grad_norm": 9.580735206604004, + "learning_rate": 7.879161307003038e-05, + "loss": 0.8027, + "step": 698 + }, + { + "epoch": 0.7125382262996942, + "grad_norm": 7.831961154937744, + "learning_rate": 7.863736270936284e-05, + "loss": 0.4514, + "step": 699 + }, + { + "epoch": 0.7135575942915392, + "grad_norm": 9.805893898010254, + "learning_rate": 7.848270567378686e-05, + "loss": 0.8798, + "step": 700 + }, + { + "epoch": 0.7145769622833843, + "grad_norm": 8.573545455932617, + "learning_rate": 7.832764415956389e-05, + "loss": 0.7718, + "step": 701 + }, + { + "epoch": 0.7155963302752294, + "grad_norm": 6.185779571533203, + "learning_rate": 7.817218036869932e-05, + "loss": 0.4094, + "step": 702 + }, + { + "epoch": 0.7166156982670744, + "grad_norm": 9.415246963500977, + "learning_rate": 7.80163165089112e-05, + "loss": 0.5358, + "step": 703 + }, + { + "epoch": 0.7176350662589195, + "grad_norm": 5.7925543785095215, + "learning_rate": 7.78600547935989e-05, + "loss": 0.5728, + "step": 704 + }, + { + "epoch": 0.7186544342507645, + "grad_norm": 
8.365612983703613, + "learning_rate": 7.770339744181175e-05, + "loss": 0.4349, + "step": 705 + }, + { + "epoch": 0.7196738022426096, + "grad_norm": 11.040353775024414, + "learning_rate": 7.754634667821734e-05, + "loss": 1.0107, + "step": 706 + }, + { + "epoch": 0.7206931702344547, + "grad_norm": 10.400522232055664, + "learning_rate": 7.73889047330701e-05, + "loss": 1.3393, + "step": 707 + }, + { + "epoch": 0.7217125382262997, + "grad_norm": 6.314993381500244, + "learning_rate": 7.723107384217958e-05, + "loss": 0.5175, + "step": 708 + }, + { + "epoch": 0.7227319062181448, + "grad_norm": 7.7337541580200195, + "learning_rate": 7.70728562468787e-05, + "loss": 0.3906, + "step": 709 + }, + { + "epoch": 0.7237512742099899, + "grad_norm": 8.559732437133789, + "learning_rate": 7.691425419399183e-05, + "loss": 0.726, + "step": 710 + }, + { + "epoch": 0.7247706422018348, + "grad_norm": 5.824985504150391, + "learning_rate": 7.675526993580306e-05, + "loss": 0.4299, + "step": 711 + }, + { + "epoch": 0.72579001019368, + "grad_norm": 9.804418563842773, + "learning_rate": 7.659590573002407e-05, + "loss": 0.7486, + "step": 712 + }, + { + "epoch": 0.7268093781855249, + "grad_norm": 5.5835957527160645, + "learning_rate": 7.643616383976214e-05, + "loss": 0.3316, + "step": 713 + }, + { + "epoch": 0.72782874617737, + "grad_norm": 8.719099044799805, + "learning_rate": 7.627604653348796e-05, + "loss": 0.5444, + "step": 714 + }, + { + "epoch": 0.7288481141692151, + "grad_norm": 7.16873025894165, + "learning_rate": 7.611555608500351e-05, + "loss": 0.4717, + "step": 715 + }, + { + "epoch": 0.7298674821610601, + "grad_norm": 8.529095649719238, + "learning_rate": 7.595469477340965e-05, + "loss": 0.5413, + "step": 716 + }, + { + "epoch": 0.7308868501529052, + "grad_norm": 4.7856245040893555, + "learning_rate": 7.579346488307379e-05, + "loss": 0.2207, + "step": 717 + }, + { + "epoch": 0.7319062181447502, + "grad_norm": 8.381448745727539, + "learning_rate": 7.563186870359758e-05, + "loss": 0.8042, 
+ "step": 718 + }, + { + "epoch": 0.7329255861365953, + "grad_norm": 6.099252700805664, + "learning_rate": 7.546990852978415e-05, + "loss": 0.3666, + "step": 719 + }, + { + "epoch": 0.7339449541284404, + "grad_norm": 6.979067802429199, + "learning_rate": 7.530758666160577e-05, + "loss": 0.4511, + "step": 720 + }, + { + "epoch": 0.7349643221202854, + "grad_norm": 8.355476379394531, + "learning_rate": 7.514490540417103e-05, + "loss": 0.782, + "step": 721 + }, + { + "epoch": 0.7359836901121305, + "grad_norm": 3.4271693229675293, + "learning_rate": 7.498186706769213e-05, + "loss": 0.2947, + "step": 722 + }, + { + "epoch": 0.7370030581039755, + "grad_norm": 10.753888130187988, + "learning_rate": 7.481847396745215e-05, + "loss": 1.6206, + "step": 723 + }, + { + "epoch": 0.7380224260958206, + "grad_norm": 10.323583602905273, + "learning_rate": 7.465472842377206e-05, + "loss": 0.6216, + "step": 724 + }, + { + "epoch": 0.7390417940876657, + "grad_norm": 5.268289566040039, + "learning_rate": 7.449063276197789e-05, + "loss": 0.2819, + "step": 725 + }, + { + "epoch": 0.7400611620795107, + "grad_norm": 8.948394775390625, + "learning_rate": 7.432618931236759e-05, + "loss": 0.4797, + "step": 726 + }, + { + "epoch": 0.7410805300713558, + "grad_norm": 4.78109884262085, + "learning_rate": 7.416140041017802e-05, + "loss": 0.3875, + "step": 727 + }, + { + "epoch": 0.7420998980632009, + "grad_norm": 7.643434047698975, + "learning_rate": 7.399626839555176e-05, + "loss": 0.6888, + "step": 728 + }, + { + "epoch": 0.7431192660550459, + "grad_norm": 4.128391742706299, + "learning_rate": 7.383079561350386e-05, + "loss": 0.3023, + "step": 729 + }, + { + "epoch": 0.744138634046891, + "grad_norm": 8.254578590393066, + "learning_rate": 7.36649844138886e-05, + "loss": 0.6654, + "step": 730 + }, + { + "epoch": 0.745158002038736, + "grad_norm": 10.747797966003418, + "learning_rate": 7.3498837151366e-05, + "loss": 0.6517, + "step": 731 + }, + { + "epoch": 0.746177370030581, + "grad_norm": 
6.274332046508789, + "learning_rate": 7.333235618536856e-05, + "loss": 0.4537, + "step": 732 + }, + { + "epoch": 0.7471967380224261, + "grad_norm": 8.256685256958008, + "learning_rate": 7.316554388006756e-05, + "loss": 0.7224, + "step": 733 + }, + { + "epoch": 0.7482161060142711, + "grad_norm": 7.657110214233398, + "learning_rate": 7.299840260433965e-05, + "loss": 0.4447, + "step": 734 + }, + { + "epoch": 0.7492354740061162, + "grad_norm": 6.170997142791748, + "learning_rate": 7.283093473173307e-05, + "loss": 0.4127, + "step": 735 + }, + { + "epoch": 0.7502548419979612, + "grad_norm": 5.84876823425293, + "learning_rate": 7.26631426404341e-05, + "loss": 0.3297, + "step": 736 + }, + { + "epoch": 0.7512742099898063, + "grad_norm": 5.986436367034912, + "learning_rate": 7.249502871323314e-05, + "loss": 0.3664, + "step": 737 + }, + { + "epoch": 0.7522935779816514, + "grad_norm": 9.613632202148438, + "learning_rate": 7.232659533749092e-05, + "loss": 0.7934, + "step": 738 + }, + { + "epoch": 0.7533129459734964, + "grad_norm": 5.5741286277771, + "learning_rate": 7.215784490510468e-05, + "loss": 0.4214, + "step": 739 + }, + { + "epoch": 0.7543323139653415, + "grad_norm": 8.343430519104004, + "learning_rate": 7.198877981247406e-05, + "loss": 0.6174, + "step": 740 + }, + { + "epoch": 0.7553516819571865, + "grad_norm": 11.505045890808105, + "learning_rate": 7.18194024604672e-05, + "loss": 0.7011, + "step": 741 + }, + { + "epoch": 0.7563710499490316, + "grad_norm": 9.192388534545898, + "learning_rate": 7.164971525438657e-05, + "loss": 0.6472, + "step": 742 + }, + { + "epoch": 0.7573904179408767, + "grad_norm": 10.685009002685547, + "learning_rate": 7.147972060393478e-05, + "loss": 0.9555, + "step": 743 + }, + { + "epoch": 0.7584097859327217, + "grad_norm": 9.81982421875, + "learning_rate": 7.130942092318051e-05, + "loss": 1.1771, + "step": 744 + }, + { + "epoch": 0.7594291539245668, + "grad_norm": 7.654698848724365, + "learning_rate": 7.113881863052407e-05, + "loss": 0.6876, + 
"step": 745 + }, + { + "epoch": 0.7604485219164119, + "grad_norm": 10.608144760131836, + "learning_rate": 7.096791614866309e-05, + "loss": 0.6737, + "step": 746 + }, + { + "epoch": 0.7614678899082569, + "grad_norm": 8.949767112731934, + "learning_rate": 7.079671590455821e-05, + "loss": 0.9648, + "step": 747 + }, + { + "epoch": 0.762487257900102, + "grad_norm": 5.873875141143799, + "learning_rate": 7.06252203293985e-05, + "loss": 0.3267, + "step": 748 + }, + { + "epoch": 0.763506625891947, + "grad_norm": 3.814371347427368, + "learning_rate": 7.045343185856701e-05, + "loss": 0.2244, + "step": 749 + }, + { + "epoch": 0.764525993883792, + "grad_norm": 5.834865570068359, + "learning_rate": 7.028135293160611e-05, + "loss": 0.305, + "step": 750 + }, + { + "epoch": 0.7655453618756372, + "grad_norm": 8.765941619873047, + "learning_rate": 7.010898599218296e-05, + "loss": 0.5588, + "step": 751 + }, + { + "epoch": 0.7665647298674821, + "grad_norm": 8.091228485107422, + "learning_rate": 6.99363334880547e-05, + "loss": 1.0974, + "step": 752 + }, + { + "epoch": 0.7675840978593272, + "grad_norm": 7.041286468505859, + "learning_rate": 6.976339787103373e-05, + "loss": 0.603, + "step": 753 + }, + { + "epoch": 0.7686034658511722, + "grad_norm": 6.676450729370117, + "learning_rate": 6.959018159695293e-05, + "loss": 0.6972, + "step": 754 + }, + { + "epoch": 0.7696228338430173, + "grad_norm": 9.935379981994629, + "learning_rate": 6.94166871256307e-05, + "loss": 0.958, + "step": 755 + }, + { + "epoch": 0.7706422018348624, + "grad_norm": 6.536661624908447, + "learning_rate": 6.92429169208361e-05, + "loss": 0.2937, + "step": 756 + }, + { + "epoch": 0.7716615698267074, + "grad_norm": 5.736427307128906, + "learning_rate": 6.906887345025385e-05, + "loss": 0.3384, + "step": 757 + }, + { + "epoch": 0.7726809378185525, + "grad_norm": 5.628017425537109, + "learning_rate": 6.88945591854493e-05, + "loss": 0.3321, + "step": 758 + }, + { + "epoch": 0.7737003058103975, + "grad_norm": 9.1480712890625, + 
"learning_rate": 6.87199766018332e-05, + "loss": 0.8029, + "step": 759 + }, + { + "epoch": 0.7747196738022426, + "grad_norm": 7.8731770515441895, + "learning_rate": 6.85451281786268e-05, + "loss": 0.7043, + "step": 760 + }, + { + "epoch": 0.7757390417940877, + "grad_norm": 13.733153343200684, + "learning_rate": 6.837001639882641e-05, + "loss": 1.6068, + "step": 761 + }, + { + "epoch": 0.7767584097859327, + "grad_norm": 9.02813720703125, + "learning_rate": 6.819464374916823e-05, + "loss": 1.1273, + "step": 762 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 8.211151123046875, + "learning_rate": 6.801901272009307e-05, + "loss": 0.5429, + "step": 763 + }, + { + "epoch": 0.7787971457696228, + "grad_norm": 5.243499755859375, + "learning_rate": 6.784312580571091e-05, + "loss": 0.2976, + "step": 764 + }, + { + "epoch": 0.7798165137614679, + "grad_norm": 11.219100952148438, + "learning_rate": 6.766698550376556e-05, + "loss": 0.9183, + "step": 765 + }, + { + "epoch": 0.780835881753313, + "grad_norm": 7.10944938659668, + "learning_rate": 6.749059431559913e-05, + "loss": 0.4734, + "step": 766 + }, + { + "epoch": 0.781855249745158, + "grad_norm": 7.810965061187744, + "learning_rate": 6.731395474611649e-05, + "loss": 0.5437, + "step": 767 + }, + { + "epoch": 0.7828746177370031, + "grad_norm": 6.063333034515381, + "learning_rate": 6.71370693037498e-05, + "loss": 0.3382, + "step": 768 + }, + { + "epoch": 0.7838939857288482, + "grad_norm": 5.784426689147949, + "learning_rate": 6.695994050042277e-05, + "loss": 0.3925, + "step": 769 + }, + { + "epoch": 0.7849133537206932, + "grad_norm": 7.640711784362793, + "learning_rate": 6.678257085151509e-05, + "loss": 0.4345, + "step": 770 + }, + { + "epoch": 0.7859327217125383, + "grad_norm": 9.467418670654297, + "learning_rate": 6.660496287582667e-05, + "loss": 0.9237, + "step": 771 + }, + { + "epoch": 0.7869520897043832, + "grad_norm": 4.449363708496094, + "learning_rate": 6.642711909554174e-05, + "loss": 0.3875, + "step": 772 + }, + 
{ + "epoch": 0.7879714576962283, + "grad_norm": 7.483307838439941, + "learning_rate": 6.624904203619333e-05, + "loss": 0.533, + "step": 773 + }, + { + "epoch": 0.7889908256880734, + "grad_norm": 4.827091693878174, + "learning_rate": 6.607073422662711e-05, + "loss": 0.4211, + "step": 774 + }, + { + "epoch": 0.7900101936799184, + "grad_norm": 6.135465621948242, + "learning_rate": 6.589219819896565e-05, + "loss": 0.5421, + "step": 775 + }, + { + "epoch": 0.7910295616717635, + "grad_norm": 9.622929573059082, + "learning_rate": 6.571343648857242e-05, + "loss": 0.8904, + "step": 776 + }, + { + "epoch": 0.7920489296636085, + "grad_norm": 5.664134502410889, + "learning_rate": 6.553445163401571e-05, + "loss": 0.4604, + "step": 777 + }, + { + "epoch": 0.7930682976554536, + "grad_norm": 9.634468078613281, + "learning_rate": 6.535524617703273e-05, + "loss": 0.7431, + "step": 778 + }, + { + "epoch": 0.7940876656472987, + "grad_norm": 10.855483055114746, + "learning_rate": 6.517582266249336e-05, + "loss": 1.0159, + "step": 779 + }, + { + "epoch": 0.7951070336391437, + "grad_norm": 9.945262908935547, + "learning_rate": 6.499618363836417e-05, + "loss": 0.6554, + "step": 780 + }, + { + "epoch": 0.7961264016309888, + "grad_norm": 7.224388599395752, + "learning_rate": 6.481633165567207e-05, + "loss": 0.8539, + "step": 781 + }, + { + "epoch": 0.7971457696228338, + "grad_norm": 8.917383193969727, + "learning_rate": 6.463626926846817e-05, + "loss": 0.4543, + "step": 782 + }, + { + "epoch": 0.7981651376146789, + "grad_norm": 4.411260604858398, + "learning_rate": 6.445599903379154e-05, + "loss": 0.2281, + "step": 783 + }, + { + "epoch": 0.799184505606524, + "grad_norm": 8.85741138458252, + "learning_rate": 6.427552351163286e-05, + "loss": 1.0334, + "step": 784 + }, + { + "epoch": 0.799184505606524, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8376107215881348, + "eval_Qnli-dev-1024_cosine_ap": 0.7815698422458957, + 
"eval_Qnli-dev-1024_cosine_f1": 0.7222222222222222, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7544304132461548, + "eval_Qnli-dev-1024_cosine_mcc": 0.41614558708189836, + "eval_Qnli-dev-1024_cosine_precision": 0.6190476190476191, + "eval_Qnli-dev-1024_cosine_recall": 0.8666666666666667, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7388297319412231, + "eval_Qnli-dev_cosine_ap": 0.7636341718424307, + "eval_Qnli-dev_cosine_f1": 0.7450980392156862, + "eval_Qnli-dev_cosine_f1_threshold": 0.695953369140625, + "eval_Qnli-dev_cosine_mcc": 0.4794765594627558, + "eval_Qnli-dev_cosine_precision": 0.6666666666666666, + "eval_Qnli-dev_cosine_recall": 0.8444444444444444, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9583333134651184, + "eval_global_dataset_loss": 0.3045359253883362, + "eval_global_dataset_runtime": 103.772, + "eval_global_dataset_samples_per_second": 7.738, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8706480103495355, + "eval_sts-test-1024_spearman_cosine": 0.9094148980677476, + "eval_sts-test_pearson_cosine": 0.9036838203711135, + "eval_sts-test_spearman_cosine": 0.9196077696084266, + "step": 784 + }, + { + "epoch": 0.800203873598369, + "grad_norm": 10.137900352478027, + "learning_rate": 6.409484526489805e-05, + "loss": 0.9697, + "step": 785 + }, + { + "epoch": 0.8012232415902141, + "grad_norm": 8.976780891418457, + "learning_rate": 6.391396685937186e-05, + "loss": 0.7048, + "step": 786 + }, + { + "epoch": 0.8022426095820592, + "grad_norm": 8.672534942626953, + "learning_rate": 6.373289086368151e-05, + "loss": 0.5263, + "step": 787 + }, + { + "epoch": 0.8032619775739042, + "grad_norm": 9.115574836730957, + "learning_rate": 6.355161984926019e-05, + "loss": 0.5056, + "step": 788 + }, + { + "epoch": 0.8042813455657493, + "grad_norm": 
5.47214412689209, + "learning_rate": 6.337015639031044e-05, + "loss": 0.3826, + "step": 789 + }, + { + "epoch": 0.8053007135575942, + "grad_norm": 4.726554870605469, + "learning_rate": 6.318850306376777e-05, + "loss": 0.3029, + "step": 790 + }, + { + "epoch": 0.8063200815494393, + "grad_norm": 9.025796890258789, + "learning_rate": 6.300666244926387e-05, + "loss": 0.7712, + "step": 791 + }, + { + "epoch": 0.8073394495412844, + "grad_norm": 8.51115894317627, + "learning_rate": 6.282463712909018e-05, + "loss": 0.5587, + "step": 792 + }, + { + "epoch": 0.8083588175331294, + "grad_norm": 4.170646667480469, + "learning_rate": 6.264242968816106e-05, + "loss": 0.2386, + "step": 793 + }, + { + "epoch": 0.8093781855249745, + "grad_norm": 7.225284576416016, + "learning_rate": 6.246004271397713e-05, + "loss": 0.5662, + "step": 794 + }, + { + "epoch": 0.8103975535168195, + "grad_norm": 8.109657287597656, + "learning_rate": 6.227747879658859e-05, + "loss": 0.5322, + "step": 795 + }, + { + "epoch": 0.8114169215086646, + "grad_norm": 8.729584693908691, + "learning_rate": 6.20947405285583e-05, + "loss": 0.5122, + "step": 796 + }, + { + "epoch": 0.8124362895005097, + "grad_norm": 6.562040328979492, + "learning_rate": 6.191183050492515e-05, + "loss": 0.5094, + "step": 797 + }, + { + "epoch": 0.8134556574923547, + "grad_norm": 8.552765846252441, + "learning_rate": 6.172875132316703e-05, + "loss": 0.8412, + "step": 798 + }, + { + "epoch": 0.8144750254841998, + "grad_norm": 8.517980575561523, + "learning_rate": 6.154550558316405e-05, + "loss": 0.3771, + "step": 799 + }, + { + "epoch": 0.8154943934760448, + "grad_norm": 9.862586975097656, + "learning_rate": 6.136209588716155e-05, + "loss": 0.626, + "step": 800 + }, + { + "epoch": 0.8165137614678899, + "grad_norm": 11.597122192382812, + "learning_rate": 6.117852483973325e-05, + "loss": 0.8902, + "step": 801 + }, + { + "epoch": 0.817533129459735, + "grad_norm": 4.268974781036377, + "learning_rate": 6.0994795047744144e-05, + "loss": 0.2301, 
+ "step": 802 + }, + { + "epoch": 0.81855249745158, + "grad_norm": 2.586038112640381, + "learning_rate": 6.081090912031358e-05, + "loss": 0.16, + "step": 803 + }, + { + "epoch": 0.8195718654434251, + "grad_norm": 6.814731121063232, + "learning_rate": 6.0626869668778085e-05, + "loss": 0.4375, + "step": 804 + }, + { + "epoch": 0.8205912334352702, + "grad_norm": 9.699979782104492, + "learning_rate": 6.044267930665446e-05, + "loss": 0.9554, + "step": 805 + }, + { + "epoch": 0.8216106014271152, + "grad_norm": 7.751320838928223, + "learning_rate": 6.025834064960247e-05, + "loss": 0.4906, + "step": 806 + }, + { + "epoch": 0.8226299694189603, + "grad_norm": 8.852093696594238, + "learning_rate": 6.007385631538787e-05, + "loss": 0.478, + "step": 807 + }, + { + "epoch": 0.8236493374108053, + "grad_norm": 5.510447025299072, + "learning_rate": 5.988922892384513e-05, + "loss": 0.6057, + "step": 808 + }, + { + "epoch": 0.8246687054026504, + "grad_norm": 6.745148658752441, + "learning_rate": 5.9704461096840204e-05, + "loss": 0.5003, + "step": 809 + }, + { + "epoch": 0.8256880733944955, + "grad_norm": 11.509452819824219, + "learning_rate": 5.9519555458233436e-05, + "loss": 1.0844, + "step": 810 + }, + { + "epoch": 0.8267074413863404, + "grad_norm": 9.71648120880127, + "learning_rate": 5.933451463384213e-05, + "loss": 1.0267, + "step": 811 + }, + { + "epoch": 0.8277268093781855, + "grad_norm": 9.810832023620605, + "learning_rate": 5.91493412514034e-05, + "loss": 0.5415, + "step": 812 + }, + { + "epoch": 0.8287461773700305, + "grad_norm": 5.600392818450928, + "learning_rate": 5.896403794053679e-05, + "loss": 0.3295, + "step": 813 + }, + { + "epoch": 0.8297655453618756, + "grad_norm": 7.511580944061279, + "learning_rate": 5.877860733270692e-05, + "loss": 0.5511, + "step": 814 + }, + { + "epoch": 0.8307849133537207, + "grad_norm": 5.374726295471191, + "learning_rate": 5.8593052061186125e-05, + "loss": 0.3234, + "step": 815 + }, + { + "epoch": 0.8318042813455657, + "grad_norm": 
4.7778639793396, + "learning_rate": 5.8407374761017105e-05, + "loss": 0.2917, + "step": 816 + }, + { + "epoch": 0.8328236493374108, + "grad_norm": 4.155742645263672, + "learning_rate": 5.822157806897548e-05, + "loss": 0.3865, + "step": 817 + }, + { + "epoch": 0.8338430173292558, + "grad_norm": 5.087594032287598, + "learning_rate": 5.803566462353225e-05, + "loss": 0.2401, + "step": 818 + }, + { + "epoch": 0.8348623853211009, + "grad_norm": 3.707869529724121, + "learning_rate": 5.7849637064816496e-05, + "loss": 0.1582, + "step": 819 + }, + { + "epoch": 0.835881753312946, + "grad_norm": 7.63162899017334, + "learning_rate": 5.76634980345778e-05, + "loss": 0.5475, + "step": 820 + }, + { + "epoch": 0.836901121304791, + "grad_norm": 5.092942237854004, + "learning_rate": 5.747725017614869e-05, + "loss": 0.3291, + "step": 821 + }, + { + "epoch": 0.8379204892966361, + "grad_norm": 6.86021089553833, + "learning_rate": 5.72908961344072e-05, + "loss": 0.6867, + "step": 822 + }, + { + "epoch": 0.8389398572884812, + "grad_norm": 9.336700439453125, + "learning_rate": 5.710443855573919e-05, + "loss": 0.9519, + "step": 823 + }, + { + "epoch": 0.8399592252803262, + "grad_norm": 6.382976055145264, + "learning_rate": 5.6917880088000894e-05, + "loss": 0.4898, + "step": 824 + }, + { + "epoch": 0.8409785932721713, + "grad_norm": 8.171992301940918, + "learning_rate": 5.6731223380481257e-05, + "loss": 0.3361, + "step": 825 + }, + { + "epoch": 0.8419979612640163, + "grad_norm": 11.304964065551758, + "learning_rate": 5.6544471083864245e-05, + "loss": 1.0131, + "step": 826 + }, + { + "epoch": 0.8430173292558614, + "grad_norm": 7.883802890777588, + "learning_rate": 5.635762585019136e-05, + "loss": 0.4988, + "step": 827 + }, + { + "epoch": 0.8440366972477065, + "grad_norm": 5.304625988006592, + "learning_rate": 5.61706903328238e-05, + "loss": 0.2737, + "step": 828 + }, + { + "epoch": 0.8450560652395515, + "grad_norm": 8.170361518859863, + "learning_rate": 5.598366718640494e-05, + "loss": 0.5214, 
+ "step": 829 + }, + { + "epoch": 0.8460754332313966, + "grad_norm": 7.193360805511475, + "learning_rate": 5.579655906682255e-05, + "loss": 0.5261, + "step": 830 + }, + { + "epoch": 0.8470948012232415, + "grad_norm": 5.908787250518799, + "learning_rate": 5.5609368631171035e-05, + "loss": 0.4337, + "step": 831 + }, + { + "epoch": 0.8481141692150866, + "grad_norm": 11.470138549804688, + "learning_rate": 5.5422098537713815e-05, + "loss": 1.0523, + "step": 832 + }, + { + "epoch": 0.8491335372069317, + "grad_norm": 5.7633514404296875, + "learning_rate": 5.52347514458455e-05, + "loss": 0.59, + "step": 833 + }, + { + "epoch": 0.8501529051987767, + "grad_norm": 9.171930313110352, + "learning_rate": 5.5047330016054154e-05, + "loss": 0.9984, + "step": 834 + }, + { + "epoch": 0.8511722731906218, + "grad_norm": 7.584822177886963, + "learning_rate": 5.48598369098835e-05, + "loss": 1.0533, + "step": 835 + }, + { + "epoch": 0.8521916411824668, + "grad_norm": 5.429177761077881, + "learning_rate": 5.4672274789895104e-05, + "loss": 0.266, + "step": 836 + }, + { + "epoch": 0.8532110091743119, + "grad_norm": 7.292309284210205, + "learning_rate": 5.4484646319630636e-05, + "loss": 0.3497, + "step": 837 + }, + { + "epoch": 0.854230377166157, + "grad_norm": 7.126836776733398, + "learning_rate": 5.429695416357392e-05, + "loss": 0.5161, + "step": 838 + }, + { + "epoch": 0.855249745158002, + "grad_norm": 6.357126235961914, + "learning_rate": 5.410920098711323e-05, + "loss": 0.4256, + "step": 839 + }, + { + "epoch": 0.8562691131498471, + "grad_norm": 6.682480335235596, + "learning_rate": 5.392138945650339e-05, + "loss": 0.3334, + "step": 840 + }, + { + "epoch": 0.8572884811416922, + "grad_norm": 6.9180521965026855, + "learning_rate": 5.373352223882787e-05, + "loss": 0.5704, + "step": 841 + }, + { + "epoch": 0.8583078491335372, + "grad_norm": 6.871384620666504, + "learning_rate": 5.354560200196094e-05, + "loss": 0.3803, + "step": 842 + }, + { + "epoch": 0.8593272171253823, + "grad_norm": 
9.186737060546875, + "learning_rate": 5.335763141452982e-05, + "loss": 0.7648, + "step": 843 + }, + { + "epoch": 0.8603465851172273, + "grad_norm": 8.700101852416992, + "learning_rate": 5.3169613145876714e-05, + "loss": 0.7548, + "step": 844 + }, + { + "epoch": 0.8613659531090724, + "grad_norm": 7.032200336456299, + "learning_rate": 5.2981549866020975e-05, + "loss": 0.7275, + "step": 845 + }, + { + "epoch": 0.8623853211009175, + "grad_norm": 13.48193359375, + "learning_rate": 5.2793444245621146e-05, + "loss": 1.1788, + "step": 846 + }, + { + "epoch": 0.8634046890927625, + "grad_norm": 9.682479858398438, + "learning_rate": 5.260529895593702e-05, + "loss": 0.7809, + "step": 847 + }, + { + "epoch": 0.8644240570846076, + "grad_norm": 8.730304718017578, + "learning_rate": 5.241711666879172e-05, + "loss": 0.6487, + "step": 848 + }, + { + "epoch": 0.8654434250764526, + "grad_norm": 6.570590972900391, + "learning_rate": 5.2228900056533836e-05, + "loss": 0.561, + "step": 849 + }, + { + "epoch": 0.8664627930682977, + "grad_norm": 8.695535659790039, + "learning_rate": 5.204065179199931e-05, + "loss": 0.5906, + "step": 850 + }, + { + "epoch": 0.8674821610601428, + "grad_norm": 5.353935241699219, + "learning_rate": 5.1852374548473614e-05, + "loss": 0.5192, + "step": 851 + }, + { + "epoch": 0.8685015290519877, + "grad_norm": 10.60522174835205, + "learning_rate": 5.1664070999653766e-05, + "loss": 0.8094, + "step": 852 + }, + { + "epoch": 0.8695208970438328, + "grad_norm": 3.7188539505004883, + "learning_rate": 5.147574381961032e-05, + "loss": 0.2399, + "step": 853 + }, + { + "epoch": 0.8705402650356778, + "grad_norm": 5.648993492126465, + "learning_rate": 5.128739568274944e-05, + "loss": 0.4103, + "step": 854 + }, + { + "epoch": 0.8715596330275229, + "grad_norm": 6.711026668548584, + "learning_rate": 5.109902926377482e-05, + "loss": 0.4969, + "step": 855 + }, + { + "epoch": 0.872579001019368, + "grad_norm": 5.686347961425781, + "learning_rate": 5.091064723764987e-05, + "loss": 
0.37, + "step": 856 + }, + { + "epoch": 0.873598369011213, + "grad_norm": 4.857931613922119, + "learning_rate": 5.072225227955959e-05, + "loss": 0.4109, + "step": 857 + }, + { + "epoch": 0.8746177370030581, + "grad_norm": 8.75938606262207, + "learning_rate": 5.053384706487261e-05, + "loss": 0.525, + "step": 858 + }, + { + "epoch": 0.8756371049949032, + "grad_norm": 5.874378204345703, + "learning_rate": 5.034543426910324e-05, + "loss": 0.5958, + "step": 859 + }, + { + "epoch": 0.8766564729867482, + "grad_norm": 5.085257530212402, + "learning_rate": 5.0157016567873424e-05, + "loss": 0.4708, + "step": 860 + }, + { + "epoch": 0.8776758409785933, + "grad_norm": 7.9917707443237305, + "learning_rate": 4.996859663687479e-05, + "loss": 0.6881, + "step": 861 + }, + { + "epoch": 0.8786952089704383, + "grad_norm": 8.1506929397583, + "learning_rate": 4.9780177151830634e-05, + "loss": 0.5545, + "step": 862 + }, + { + "epoch": 0.8797145769622834, + "grad_norm": 9.375650405883789, + "learning_rate": 4.959176078845789e-05, + "loss": 0.645, + "step": 863 + }, + { + "epoch": 0.8807339449541285, + "grad_norm": 4.8143310546875, + "learning_rate": 4.9403350222429184e-05, + "loss": 0.4112, + "step": 864 + }, + { + "epoch": 0.8817533129459735, + "grad_norm": 7.862481594085693, + "learning_rate": 4.92149481293348e-05, + "loss": 0.4178, + "step": 865 + }, + { + "epoch": 0.8827726809378186, + "grad_norm": 5.252464771270752, + "learning_rate": 4.902655718464473e-05, + "loss": 0.2857, + "step": 866 + }, + { + "epoch": 0.8837920489296636, + "grad_norm": 6.06905460357666, + "learning_rate": 4.883818006367062e-05, + "loss": 0.3374, + "step": 867 + }, + { + "epoch": 0.8848114169215087, + "grad_norm": 6.810131072998047, + "learning_rate": 4.86498194415278e-05, + "loss": 0.5303, + "step": 868 + }, + { + "epoch": 0.8858307849133538, + "grad_norm": 7.676322937011719, + "learning_rate": 4.846147799309734e-05, + "loss": 0.7438, + "step": 869 + }, + { + "epoch": 0.8868501529051988, + "grad_norm": 
11.570023536682129, + "learning_rate": 4.8273158392987986e-05, + "loss": 1.0872, + "step": 870 + }, + { + "epoch": 0.8878695208970439, + "grad_norm": 6.312341213226318, + "learning_rate": 4.8084863315498234e-05, + "loss": 0.4497, + "step": 871 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 7.389033794403076, + "learning_rate": 4.7896595434578356e-05, + "loss": 0.4171, + "step": 872 + }, + { + "epoch": 0.8899082568807339, + "grad_norm": 8.600625038146973, + "learning_rate": 4.770835742379239e-05, + "loss": 0.4417, + "step": 873 + }, + { + "epoch": 0.890927624872579, + "grad_norm": 7.350024223327637, + "learning_rate": 4.7520151956280227e-05, + "loss": 0.7023, + "step": 874 + }, + { + "epoch": 0.891946992864424, + "grad_norm": 12.617684364318848, + "learning_rate": 4.733198170471953e-05, + "loss": 1.0547, + "step": 875 + }, + { + "epoch": 0.8929663608562691, + "grad_norm": 5.219171524047852, + "learning_rate": 4.714384934128796e-05, + "loss": 0.3526, + "step": 876 + }, + { + "epoch": 0.8939857288481141, + "grad_norm": 10.923335075378418, + "learning_rate": 4.6955757537625104e-05, + "loss": 0.7315, + "step": 877 + }, + { + "epoch": 0.8950050968399592, + "grad_norm": 4.7785325050354, + "learning_rate": 4.6767708964794526e-05, + "loss": 0.4082, + "step": 878 + }, + { + "epoch": 0.8960244648318043, + "grad_norm": 7.037627696990967, + "learning_rate": 4.6579706293245944e-05, + "loss": 0.8155, + "step": 879 + }, + { + "epoch": 0.8970438328236493, + "grad_norm": 7.149205207824707, + "learning_rate": 4.6391752192777164e-05, + "loss": 0.5083, + "step": 880 + }, + { + "epoch": 0.8980632008154944, + "grad_norm": 5.331564426422119, + "learning_rate": 4.620384933249631e-05, + "loss": 0.655, + "step": 881 + }, + { + "epoch": 0.8990825688073395, + "grad_norm": 10.019486427307129, + "learning_rate": 4.6016000380783805e-05, + "loss": 0.7207, + "step": 882 + }, + { + "epoch": 0.8990825688073395, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7708333333333334, + 
"eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8360349535942078, + "eval_Qnli-dev-1024_cosine_ap": 0.8011558872452826, + "eval_Qnli-dev-1024_cosine_f1": 0.7250000000000001, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.8360349535942078, + "eval_Qnli-dev-1024_cosine_mcc": 0.5461802806126049, + "eval_Qnli-dev-1024_cosine_precision": 0.8285714285714286, + "eval_Qnli-dev-1024_cosine_recall": 0.6444444444444445, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7521146535873413, + "eval_Qnli-dev_cosine_ap": 0.7712094779135136, + "eval_Qnli-dev_cosine_f1": 0.7500000000000001, + "eval_Qnli-dev_cosine_f1_threshold": 0.6768573522567749, + "eval_Qnli-dev_cosine_mcc": 0.48653004754089046, + "eval_Qnli-dev_cosine_precision": 0.6610169491525424, + "eval_Qnli-dev_cosine_recall": 0.8666666666666667, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9479166865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.9583333134651184, + "eval_global_dataset_loss": 0.26387155055999756, + "eval_global_dataset_runtime": 103.9177, + "eval_global_dataset_samples_per_second": 7.727, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9479166865348816, + "eval_sts-test-1024_pearson_cosine": 0.8810824372715894, + "eval_sts-test-1024_spearman_cosine": 0.9117642789427417, + "eval_sts-test_pearson_cosine": 0.9044525796924666, + "eval_sts-test_spearman_cosine": 0.9182572042166309, + "step": 882 + }, + { + "epoch": 0.9001019367991845, + "grad_norm": 7.724752426147461, + "learning_rate": 4.582820800525455e-05, + "loss": 0.4898, + "step": 883 + }, + { + "epoch": 0.9011213047910296, + "grad_norm": 9.442131042480469, + "learning_rate": 4.564047487272001e-05, + "loss": 0.5506, + "step": 884 + }, + { + "epoch": 0.9021406727828746, + "grad_norm": 8.832263946533203, + "learning_rate": 4.5452803649150324e-05, + "loss": 0.6206, + "step": 885 + }, + { + "epoch": 0.9031600407747197, + "grad_norm": 12.987079620361328, + 
"learning_rate": 4.5265196999636535e-05, + "loss": 1.9669, + "step": 886 + }, + { + "epoch": 0.9041794087665648, + "grad_norm": 9.050309181213379, + "learning_rate": 4.5077657588352595e-05, + "loss": 0.8493, + "step": 887 + }, + { + "epoch": 0.9051987767584098, + "grad_norm": 9.642857551574707, + "learning_rate": 4.489018807851769e-05, + "loss": 0.9698, + "step": 888 + }, + { + "epoch": 0.9062181447502549, + "grad_norm": 7.444589614868164, + "learning_rate": 4.4702791132358314e-05, + "loss": 0.7322, + "step": 889 + }, + { + "epoch": 0.9072375127420998, + "grad_norm": 9.99152946472168, + "learning_rate": 4.451546941107046e-05, + "loss": 0.484, + "step": 890 + }, + { + "epoch": 0.908256880733945, + "grad_norm": 6.232360363006592, + "learning_rate": 4.432822557478194e-05, + "loss": 0.5604, + "step": 891 + }, + { + "epoch": 0.90927624872579, + "grad_norm": 3.1541106700897217, + "learning_rate": 4.414106228251446e-05, + "loss": 0.2633, + "step": 892 + }, + { + "epoch": 0.910295616717635, + "grad_norm": 5.661106109619141, + "learning_rate": 4.3953982192146006e-05, + "loss": 0.2417, + "step": 893 + }, + { + "epoch": 0.9113149847094801, + "grad_norm": 4.497067451477051, + "learning_rate": 4.3766987960372956e-05, + "loss": 0.4481, + "step": 894 + }, + { + "epoch": 0.9123343527013251, + "grad_norm": 8.505694389343262, + "learning_rate": 4.358008224267245e-05, + "loss": 0.7402, + "step": 895 + }, + { + "epoch": 0.9133537206931702, + "grad_norm": 5.820054054260254, + "learning_rate": 4.3393267693264686e-05, + "loss": 0.4897, + "step": 896 + }, + { + "epoch": 0.9143730886850153, + "grad_norm": 7.943095684051514, + "learning_rate": 4.320654696507511e-05, + "loss": 0.5863, + "step": 897 + }, + { + "epoch": 0.9153924566768603, + "grad_norm": 10.6437349319458, + "learning_rate": 4.301992270969692e-05, + "loss": 0.7101, + "step": 898 + }, + { + "epoch": 0.9164118246687054, + "grad_norm": 3.8055593967437744, + "learning_rate": 4.2833397577353284e-05, + "loss": 0.2404, + "step": 899 + 
}, + { + "epoch": 0.9174311926605505, + "grad_norm": 8.539854049682617, + "learning_rate": 4.26469742168597e-05, + "loss": 0.5594, + "step": 900 + }, + { + "epoch": 0.9184505606523955, + "grad_norm": 5.611748218536377, + "learning_rate": 4.2460655275586494e-05, + "loss": 0.4047, + "step": 901 + }, + { + "epoch": 0.9194699286442406, + "grad_norm": 4.898343086242676, + "learning_rate": 4.227444339942107e-05, + "loss": 0.4865, + "step": 902 + }, + { + "epoch": 0.9204892966360856, + "grad_norm": 8.28711986541748, + "learning_rate": 4.208834123273047e-05, + "loss": 0.3909, + "step": 903 + }, + { + "epoch": 0.9215086646279307, + "grad_norm": 6.98935604095459, + "learning_rate": 4.190235141832375e-05, + "loss": 0.2808, + "step": 904 + }, + { + "epoch": 0.9225280326197758, + "grad_norm": 9.016980171203613, + "learning_rate": 4.171647659741448e-05, + "loss": 0.7509, + "step": 905 + }, + { + "epoch": 0.9235474006116208, + "grad_norm": 5.859550476074219, + "learning_rate": 4.153071940958321e-05, + "loss": 0.325, + "step": 906 + }, + { + "epoch": 0.9245667686034659, + "grad_norm": 7.970040321350098, + "learning_rate": 4.134508249274002e-05, + "loss": 0.5335, + "step": 907 + }, + { + "epoch": 0.9255861365953109, + "grad_norm": 6.2324981689453125, + "learning_rate": 4.1159568483087e-05, + "loss": 0.6193, + "step": 908 + }, + { + "epoch": 0.926605504587156, + "grad_norm": 5.227268218994141, + "learning_rate": 4.0974180015080897e-05, + "loss": 0.2974, + "step": 909 + }, + { + "epoch": 0.9276248725790011, + "grad_norm": 9.293944358825684, + "learning_rate": 4.078891972139564e-05, + "loss": 0.6725, + "step": 910 + }, + { + "epoch": 0.928644240570846, + "grad_norm": 10.003561019897461, + "learning_rate": 4.060379023288495e-05, + "loss": 0.8828, + "step": 911 + }, + { + "epoch": 0.9296636085626911, + "grad_norm": 9.07729721069336, + "learning_rate": 4.0418794178545076e-05, + "loss": 0.8751, + "step": 912 + }, + { + "epoch": 0.9306829765545361, + "grad_norm": 7.200821876525879, + 
"learning_rate": 4.023393418547732e-05, + "loss": 0.7019, + "step": 913 + }, + { + "epoch": 0.9317023445463812, + "grad_norm": 10.154699325561523, + "learning_rate": 4.0049212878850793e-05, + "loss": 0.7131, + "step": 914 + }, + { + "epoch": 0.9327217125382263, + "grad_norm": 7.271543025970459, + "learning_rate": 3.98646328818652e-05, + "loss": 0.2849, + "step": 915 + }, + { + "epoch": 0.9337410805300713, + "grad_norm": 9.933566093444824, + "learning_rate": 3.96801968157135e-05, + "loss": 0.8097, + "step": 916 + }, + { + "epoch": 0.9347604485219164, + "grad_norm": 5.370792865753174, + "learning_rate": 3.949590729954467e-05, + "loss": 0.3447, + "step": 917 + }, + { + "epoch": 0.9357798165137615, + "grad_norm": 8.846680641174316, + "learning_rate": 3.931176695042664e-05, + "loss": 0.8601, + "step": 918 + }, + { + "epoch": 0.9367991845056065, + "grad_norm": 5.936051368713379, + "learning_rate": 3.912777838330893e-05, + "loss": 0.4467, + "step": 919 + }, + { + "epoch": 0.9378185524974516, + "grad_norm": 10.40077018737793, + "learning_rate": 3.8943944210985735e-05, + "loss": 0.8137, + "step": 920 + }, + { + "epoch": 0.9388379204892966, + "grad_norm": 7.319591999053955, + "learning_rate": 3.876026704405866e-05, + "loss": 0.4527, + "step": 921 + }, + { + "epoch": 0.9398572884811417, + "grad_norm": 8.947883605957031, + "learning_rate": 3.8576749490899686e-05, + "loss": 0.7656, + "step": 922 + }, + { + "epoch": 0.9408766564729868, + "grad_norm": 10.776662826538086, + "learning_rate": 3.839339415761416e-05, + "loss": 1.1218, + "step": 923 + }, + { + "epoch": 0.9418960244648318, + "grad_norm": 2.9248359203338623, + "learning_rate": 3.821020364800379e-05, + "loss": 0.188, + "step": 924 + }, + { + "epoch": 0.9429153924566769, + "grad_norm": 9.73752212524414, + "learning_rate": 3.8027180563529616e-05, + "loss": 0.8454, + "step": 925 + }, + { + "epoch": 0.9439347604485219, + "grad_norm": 6.643280506134033, + "learning_rate": 3.7844327503275136e-05, + "loss": 0.5368, + "step": 926 
+ }, + { + "epoch": 0.944954128440367, + "grad_norm": 9.299040794372559, + "learning_rate": 3.7661647063909294e-05, + "loss": 0.7602, + "step": 927 + }, + { + "epoch": 0.9459734964322121, + "grad_norm": 6.660792827606201, + "learning_rate": 3.747914183964974e-05, + "loss": 0.4733, + "step": 928 + }, + { + "epoch": 0.9469928644240571, + "grad_norm": 5.206737995147705, + "learning_rate": 3.729681442222587e-05, + "loss": 0.2305, + "step": 929 + }, + { + "epoch": 0.9480122324159022, + "grad_norm": 9.746971130371094, + "learning_rate": 3.711466740084211e-05, + "loss": 0.7775, + "step": 930 + }, + { + "epoch": 0.9490316004077471, + "grad_norm": 9.825338363647461, + "learning_rate": 3.6932703362141084e-05, + "loss": 0.8859, + "step": 931 + }, + { + "epoch": 0.9500509683995922, + "grad_norm": 7.335731506347656, + "learning_rate": 3.6750924890166914e-05, + "loss": 0.3918, + "step": 932 + }, + { + "epoch": 0.9510703363914373, + "grad_norm": 6.4724931716918945, + "learning_rate": 3.656933456632853e-05, + "loss": 0.3842, + "step": 933 + }, + { + "epoch": 0.9520897043832823, + "grad_norm": 4.886312484741211, + "learning_rate": 3.638793496936296e-05, + "loss": 0.3719, + "step": 934 + }, + { + "epoch": 0.9531090723751274, + "grad_norm": 8.522834777832031, + "learning_rate": 3.620672867529878e-05, + "loss": 0.8043, + "step": 935 + }, + { + "epoch": 0.9541284403669725, + "grad_norm": 9.507696151733398, + "learning_rate": 3.602571825741953e-05, + "loss": 0.8282, + "step": 936 + }, + { + "epoch": 0.9551478083588175, + "grad_norm": 4.895750045776367, + "learning_rate": 3.584490628622705e-05, + "loss": 0.4599, + "step": 937 + }, + { + "epoch": 0.9561671763506626, + "grad_norm": 7.197470664978027, + "learning_rate": 3.566429532940518e-05, + "loss": 0.649, + "step": 938 + }, + { + "epoch": 0.9571865443425076, + "grad_norm": 6.60915470123291, + "learning_rate": 3.548388795178307e-05, + "loss": 0.4325, + "step": 939 + }, + { + "epoch": 0.9582059123343527, + "grad_norm": 10.626359939575195, 
+ "learning_rate": 3.5303686715298955e-05, + "loss": 1.3108, + "step": 940 + }, + { + "epoch": 0.9592252803261978, + "grad_norm": 6.316555023193359, + "learning_rate": 3.51236941789637e-05, + "loss": 0.3018, + "step": 941 + }, + { + "epoch": 0.9602446483180428, + "grad_norm": 7.12025785446167, + "learning_rate": 3.494391289882435e-05, + "loss": 0.6258, + "step": 942 + }, + { + "epoch": 0.9612640163098879, + "grad_norm": 10.008544921875, + "learning_rate": 3.476434542792805e-05, + "loss": 1.2266, + "step": 943 + }, + { + "epoch": 0.9622833843017329, + "grad_norm": 8.917716979980469, + "learning_rate": 3.4584994316285604e-05, + "loss": 0.6593, + "step": 944 + }, + { + "epoch": 0.963302752293578, + "grad_norm": 5.837446689605713, + "learning_rate": 3.4405862110835364e-05, + "loss": 0.3096, + "step": 945 + }, + { + "epoch": 0.9643221202854231, + "grad_norm": 4.312796115875244, + "learning_rate": 3.422695135540697e-05, + "loss": 0.3436, + "step": 946 + }, + { + "epoch": 0.9653414882772681, + "grad_norm": 4.772927284240723, + "learning_rate": 3.404826459068536e-05, + "loss": 0.2497, + "step": 947 + }, + { + "epoch": 0.9663608562691132, + "grad_norm": 3.3676137924194336, + "learning_rate": 3.386980435417457e-05, + "loss": 0.1653, + "step": 948 + }, + { + "epoch": 0.9673802242609582, + "grad_norm": 6.203863143920898, + "learning_rate": 3.369157318016176e-05, + "loss": 0.469, + "step": 949 + }, + { + "epoch": 0.9683995922528033, + "grad_norm": 7.628493309020996, + "learning_rate": 3.351357359968117e-05, + "loss": 0.4919, + "step": 950 + }, + { + "epoch": 0.9694189602446484, + "grad_norm": 7.940287113189697, + "learning_rate": 3.333580814047826e-05, + "loss": 0.4788, + "step": 951 + }, + { + "epoch": 0.9704383282364933, + "grad_norm": 6.046499729156494, + "learning_rate": 3.3158279326973766e-05, + "loss": 0.3041, + "step": 952 + }, + { + "epoch": 0.9714576962283384, + "grad_norm": 4.314492225646973, + "learning_rate": 3.298098968022782e-05, + "loss": 0.3138, + "step": 953 + 
}, + { + "epoch": 0.9724770642201835, + "grad_norm": 8.91407585144043, + "learning_rate": 3.2803941717904216e-05, + "loss": 0.7758, + "step": 954 + }, + { + "epoch": 0.9734964322120285, + "grad_norm": 11.913896560668945, + "learning_rate": 3.26271379542346e-05, + "loss": 0.6974, + "step": 955 + }, + { + "epoch": 0.9745158002038736, + "grad_norm": 4.831221580505371, + "learning_rate": 3.2450580899982795e-05, + "loss": 0.2964, + "step": 956 + }, + { + "epoch": 0.9755351681957186, + "grad_norm": 6.116502285003662, + "learning_rate": 3.2274273062409154e-05, + "loss": 0.3473, + "step": 957 + }, + { + "epoch": 0.9765545361875637, + "grad_norm": 11.75236988067627, + "learning_rate": 3.2098216945234946e-05, + "loss": 0.8905, + "step": 958 + }, + { + "epoch": 0.9775739041794088, + "grad_norm": 3.468975067138672, + "learning_rate": 3.192241504860675e-05, + "loss": 0.2521, + "step": 959 + }, + { + "epoch": 0.9785932721712538, + "grad_norm": 7.624709606170654, + "learning_rate": 3.1746869869061063e-05, + "loss": 0.4462, + "step": 960 + }, + { + "epoch": 0.9796126401630989, + "grad_norm": 9.019265174865723, + "learning_rate": 3.157158389948871e-05, + "loss": 0.7842, + "step": 961 + }, + { + "epoch": 0.9806320081549439, + "grad_norm": 4.77131986618042, + "learning_rate": 3.1396559629099574e-05, + "loss": 0.2973, + "step": 962 + }, + { + "epoch": 0.981651376146789, + "grad_norm": 8.40596866607666, + "learning_rate": 3.122179954338716e-05, + "loss": 0.6026, + "step": 963 + }, + { + "epoch": 0.9826707441386341, + "grad_norm": 6.705322265625, + "learning_rate": 3.1047306124093335e-05, + "loss": 0.4026, + "step": 964 + }, + { + "epoch": 0.9836901121304791, + "grad_norm": 10.35732364654541, + "learning_rate": 3.087308184917308e-05, + "loss": 0.9181, + "step": 965 + }, + { + "epoch": 0.9847094801223242, + "grad_norm": 6.806704998016357, + "learning_rate": 3.069912919275926e-05, + "loss": 0.473, + "step": 966 + }, + { + "epoch": 0.9857288481141692, + "grad_norm": 10.28345012664795, + 
"learning_rate": 3.0525450625127575e-05, + "loss": 0.7152, + "step": 967 + }, + { + "epoch": 0.9867482161060143, + "grad_norm": 11.785171508789062, + "learning_rate": 3.0352048612661416e-05, + "loss": 0.9519, + "step": 968 + }, + { + "epoch": 0.9877675840978594, + "grad_norm": 8.55274772644043, + "learning_rate": 3.017892561781682e-05, + "loss": 0.5322, + "step": 969 + }, + { + "epoch": 0.9887869520897044, + "grad_norm": 8.597644805908203, + "learning_rate": 3.0006084099087595e-05, + "loss": 0.8257, + "step": 970 + }, + { + "epoch": 0.9898063200815495, + "grad_norm": 6.743808746337891, + "learning_rate": 2.983352651097031e-05, + "loss": 0.5648, + "step": 971 + }, + { + "epoch": 0.9908256880733946, + "grad_norm": 10.981080055236816, + "learning_rate": 2.9661255303929486e-05, + "loss": 0.909, + "step": 972 + }, + { + "epoch": 0.9918450560652395, + "grad_norm": 9.426305770874023, + "learning_rate": 2.948927292436281e-05, + "loss": 0.5531, + "step": 973 + }, + { + "epoch": 0.9928644240570846, + "grad_norm": 5.883917331695557, + "learning_rate": 2.9317581814566323e-05, + "loss": 0.2709, + "step": 974 + }, + { + "epoch": 0.9938837920489296, + "grad_norm": 7.118516445159912, + "learning_rate": 2.9146184412699855e-05, + "loss": 0.7348, + "step": 975 + }, + { + "epoch": 0.9949031600407747, + "grad_norm": 5.449122428894043, + "learning_rate": 2.8975083152752258e-05, + "loss": 0.2765, + "step": 976 + }, + { + "epoch": 0.9959225280326198, + "grad_norm": 7.071670055389404, + "learning_rate": 2.880428046450699e-05, + "loss": 0.4227, + "step": 977 + }, + { + "epoch": 0.9969418960244648, + "grad_norm": 4.812356948852539, + "learning_rate": 2.863377877350747e-05, + "loss": 0.252, + "step": 978 + }, + { + "epoch": 0.9979612640163099, + "grad_norm": 5.169205188751221, + "learning_rate": 2.8463580501022748e-05, + "loss": 0.3303, + "step": 979 + }, + { + "epoch": 0.9989806320081549, + "grad_norm": 7.911106109619141, + "learning_rate": 2.8293688064013062e-05, + "loss": 0.4636, + "step": 
980 + }, + { + "epoch": 0.9989806320081549, + "eval_Qnli-dev-1024_cosine_accuracy": 0.75, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8305908441543579, + "eval_Qnli-dev-1024_cosine_ap": 0.7944254663147428, + "eval_Qnli-dev-1024_cosine_f1": 0.7474747474747475, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7933956980705261, + "eval_Qnli-dev-1024_cosine_mcc": 0.49179033209958445, + "eval_Qnli-dev-1024_cosine_precision": 0.6851851851851852, + "eval_Qnli-dev-1024_cosine_recall": 0.8222222222222222, + "eval_Qnli-dev_cosine_accuracy": 0.7604166666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7133668661117554, + "eval_Qnli-dev_cosine_ap": 0.7661051082646182, + "eval_Qnli-dev_cosine_f1": 0.7526881720430108, + "eval_Qnli-dev_cosine_f1_threshold": 0.7133668661117554, + "eval_Qnli-dev_cosine_mcc": 0.5218535759042912, + "eval_Qnli-dev_cosine_precision": 0.7291666666666666, + "eval_Qnli-dev_cosine_recall": 0.7777777777777778, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9479166865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.9479166865348816, + "eval_global_dataset_loss": 0.3123040199279785, + "eval_global_dataset_runtime": 103.8412, + "eval_global_dataset_samples_per_second": 7.733, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9479166865348816, + "eval_sts-test-1024_pearson_cosine": 0.8847345798228932, + "eval_sts-test-1024_spearman_cosine": 0.9153180610643749, + "eval_sts-test_pearson_cosine": 0.9064074468577172, + "eval_sts-test_spearman_cosine": 0.9206390660127711, + "step": 980 + }, + { + "epoch": 1.0, + "grad_norm": 12.788724899291992, + "learning_rate": 2.8124103875095527e-05, + "loss": 1.3317, + "step": 981 + }, + { + "epoch": 1.001019367991845, + "grad_norm": 4.531240940093994, + "learning_rate": 2.7954830342509875e-05, + "loss": 0.2939, + "step": 982 + }, + { + "epoch": 1.0020387359836902, + "grad_norm": 4.971615314483643, + "learning_rate": 2.7785869870084213e-05, + "loss": 0.3436, + "step": 983 + }, + { + 
"epoch": 1.003058103975535, + "grad_norm": 6.3732805252075195, + "learning_rate": 2.761722485720099e-05, + "loss": 0.6107, + "step": 984 + }, + { + "epoch": 1.0040774719673802, + "grad_norm": 6.102477550506592, + "learning_rate": 2.744889769876282e-05, + "loss": 0.4759, + "step": 985 + }, + { + "epoch": 1.0050968399592253, + "grad_norm": 8.986217498779297, + "learning_rate": 2.728089078515854e-05, + "loss": 0.4255, + "step": 986 + }, + { + "epoch": 1.0061162079510704, + "grad_norm": 4.163675308227539, + "learning_rate": 2.7113206502229232e-05, + "loss": 0.274, + "step": 987 + }, + { + "epoch": 1.0071355759429155, + "grad_norm": 8.744937896728516, + "learning_rate": 2.69458472312343e-05, + "loss": 0.7007, + "step": 988 + }, + { + "epoch": 1.0081549439347604, + "grad_norm": 3.9354403018951416, + "learning_rate": 2.6778815348817776e-05, + "loss": 0.2389, + "step": 989 + }, + { + "epoch": 1.0091743119266054, + "grad_norm": 8.15425968170166, + "learning_rate": 2.6612113226974443e-05, + "loss": 0.6767, + "step": 990 + }, + { + "epoch": 1.0101936799184505, + "grad_norm": 2.782219648361206, + "learning_rate": 2.6445743233016234e-05, + "loss": 0.1771, + "step": 991 + }, + { + "epoch": 1.0112130479102956, + "grad_norm": 7.419161319732666, + "learning_rate": 2.6279707729538592e-05, + "loss": 0.3375, + "step": 992 + }, + { + "epoch": 1.0122324159021407, + "grad_norm": 4.217768669128418, + "learning_rate": 2.6114009074386846e-05, + "loss": 0.3619, + "step": 993 + }, + { + "epoch": 1.0132517838939856, + "grad_norm": 1.7565678358078003, + "learning_rate": 2.5948649620622868e-05, + "loss": 0.1643, + "step": 994 + }, + { + "epoch": 1.0142711518858307, + "grad_norm": 6.78450870513916, + "learning_rate": 2.5783631716491553e-05, + "loss": 0.5588, + "step": 995 + }, + { + "epoch": 1.0152905198776758, + "grad_norm": 6.4786577224731445, + "learning_rate": 2.5618957705387465e-05, + "loss": 0.4759, + "step": 996 + }, + { + "epoch": 1.016309887869521, + "grad_norm": 3.9980854988098145, + 
"learning_rate": 2.5454629925821673e-05, + "loss": 0.2266, + "step": 997 + }, + { + "epoch": 1.017329255861366, + "grad_norm": 4.7267608642578125, + "learning_rate": 2.5290650711388374e-05, + "loss": 0.3953, + "step": 998 + }, + { + "epoch": 1.018348623853211, + "grad_norm": 3.118359088897705, + "learning_rate": 2.5127022390731913e-05, + "loss": 0.2146, + "step": 999 + }, + { + "epoch": 1.019367991845056, + "grad_norm": 7.00241231918335, + "learning_rate": 2.4963747287513633e-05, + "loss": 0.8455, + "step": 1000 + }, + { + "epoch": 1.020387359836901, + "grad_norm": 4.788176536560059, + "learning_rate": 2.4800827720378843e-05, + "loss": 0.3804, + "step": 1001 + }, + { + "epoch": 1.0214067278287462, + "grad_norm": 6.446220397949219, + "learning_rate": 2.4638266002923983e-05, + "loss": 0.429, + "step": 1002 + }, + { + "epoch": 1.0224260958205913, + "grad_norm": 8.392274856567383, + "learning_rate": 2.4476064443663714e-05, + "loss": 0.5209, + "step": 1003 + }, + { + "epoch": 1.0234454638124364, + "grad_norm": 11.21457290649414, + "learning_rate": 2.431422534599815e-05, + "loss": 0.7556, + "step": 1004 + }, + { + "epoch": 1.0244648318042813, + "grad_norm": 5.634324073791504, + "learning_rate": 2.415275100818013e-05, + "loss": 0.4399, + "step": 1005 + }, + { + "epoch": 1.0254841997961264, + "grad_norm": 6.49767541885376, + "learning_rate": 2.3991643723282576e-05, + "loss": 0.3612, + "step": 1006 + }, + { + "epoch": 1.0265035677879715, + "grad_norm": 3.142667531967163, + "learning_rate": 2.3830905779165997e-05, + "loss": 0.175, + "step": 1007 + }, + { + "epoch": 1.0275229357798166, + "grad_norm": 2.9294066429138184, + "learning_rate": 2.3670539458445883e-05, + "loss": 0.1849, + "step": 1008 + }, + { + "epoch": 1.0285423037716617, + "grad_norm": 10.674302101135254, + "learning_rate": 2.3510547038460405e-05, + "loss": 1.0634, + "step": 1009 + }, + { + "epoch": 1.0295616717635065, + "grad_norm": 5.9518818855285645, + "learning_rate": 2.3350930791238012e-05, + "loss": 0.6881, 
+ "step": 1010 + }, + { + "epoch": 1.0305810397553516, + "grad_norm": 4.818733215332031, + "learning_rate": 2.319169298346518e-05, + "loss": 0.2433, + "step": 1011 + }, + { + "epoch": 1.0316004077471967, + "grad_norm": 4.729939937591553, + "learning_rate": 2.303283587645424e-05, + "loss": 0.2113, + "step": 1012 + }, + { + "epoch": 1.0326197757390418, + "grad_norm": 2.8469977378845215, + "learning_rate": 2.2874361726111194e-05, + "loss": 0.1412, + "step": 1013 + }, + { + "epoch": 1.033639143730887, + "grad_norm": 6.6002655029296875, + "learning_rate": 2.2716272782903806e-05, + "loss": 0.4273, + "step": 1014 + }, + { + "epoch": 1.0346585117227318, + "grad_norm": 5.73666524887085, + "learning_rate": 2.25585712918295e-05, + "loss": 0.2649, + "step": 1015 + }, + { + "epoch": 1.035677879714577, + "grad_norm": 10.085776329040527, + "learning_rate": 2.2401259492383593e-05, + "loss": 0.6387, + "step": 1016 + }, + { + "epoch": 1.036697247706422, + "grad_norm": 6.065640449523926, + "learning_rate": 2.224433961852747e-05, + "loss": 0.2274, + "step": 1017 + }, + { + "epoch": 1.0377166156982671, + "grad_norm": 4.713268280029297, + "learning_rate": 2.2087813898656774e-05, + "loss": 0.2802, + "step": 1018 + }, + { + "epoch": 1.0387359836901122, + "grad_norm": 2.990431547164917, + "learning_rate": 2.1931684555569877e-05, + "loss": 0.1943, + "step": 1019 + }, + { + "epoch": 1.039755351681957, + "grad_norm": 8.173871040344238, + "learning_rate": 2.1775953806436265e-05, + "loss": 0.4127, + "step": 1020 + }, + { + "epoch": 1.0407747196738022, + "grad_norm": 3.7410459518432617, + "learning_rate": 2.1620623862765006e-05, + "loss": 0.1905, + "step": 1021 + }, + { + "epoch": 1.0417940876656473, + "grad_norm": 6.727060794830322, + "learning_rate": 2.146569693037343e-05, + "loss": 0.3222, + "step": 1022 + }, + { + "epoch": 1.0428134556574924, + "grad_norm": 4.430773735046387, + "learning_rate": 2.1311175209355755e-05, + "loss": 0.1848, + "step": 1023 + }, + { + "epoch": 1.0438328236493375, + 
"grad_norm": 2.9469778537750244, + "learning_rate": 2.115706089405185e-05, + "loss": 0.177, + "step": 1024 + }, + { + "epoch": 1.0448521916411824, + "grad_norm": 7.147342681884766, + "learning_rate": 2.1003356173016098e-05, + "loss": 0.2783, + "step": 1025 + }, + { + "epoch": 1.0458715596330275, + "grad_norm": 8.615937232971191, + "learning_rate": 2.0850063228986234e-05, + "loss": 0.3611, + "step": 1026 + }, + { + "epoch": 1.0468909276248726, + "grad_norm": 5.674496173858643, + "learning_rate": 2.0697184238852468e-05, + "loss": 0.2966, + "step": 1027 + }, + { + "epoch": 1.0479102956167177, + "grad_norm": 6.515853404998779, + "learning_rate": 2.054472137362649e-05, + "loss": 0.2342, + "step": 1028 + }, + { + "epoch": 1.0489296636085628, + "grad_norm": 2.8067984580993652, + "learning_rate": 2.0392676798410677e-05, + "loss": 0.1982, + "step": 1029 + }, + { + "epoch": 1.0499490316004076, + "grad_norm": 5.038998126983643, + "learning_rate": 2.0241052672367327e-05, + "loss": 0.2343, + "step": 1030 + }, + { + "epoch": 1.0509683995922527, + "grad_norm": 8.588050842285156, + "learning_rate": 2.0089851148687965e-05, + "loss": 0.5463, + "step": 1031 + }, + { + "epoch": 1.0519877675840978, + "grad_norm": 6.627236843109131, + "learning_rate": 1.993907437456285e-05, + "loss": 0.5568, + "step": 1032 + }, + { + "epoch": 1.053007135575943, + "grad_norm": 7.842597484588623, + "learning_rate": 1.9788724491150423e-05, + "loss": 0.5468, + "step": 1033 + }, + { + "epoch": 1.054026503567788, + "grad_norm": 3.9404547214508057, + "learning_rate": 1.9638803633546933e-05, + "loss": 0.2003, + "step": 1034 + }, + { + "epoch": 1.0550458715596331, + "grad_norm": 6.290144443511963, + "learning_rate": 1.948931393075603e-05, + "loss": 0.2297, + "step": 1035 + }, + { + "epoch": 1.056065239551478, + "grad_norm": 6.990847110748291, + "learning_rate": 1.9340257505658667e-05, + "loss": 0.5335, + "step": 1036 + }, + { + "epoch": 1.0570846075433231, + "grad_norm": 7.3486247062683105, + "learning_rate": 
1.9191636474982883e-05, + "loss": 0.5115, + "step": 1037 + }, + { + "epoch": 1.0581039755351682, + "grad_norm": 6.156608581542969, + "learning_rate": 1.9043452949273687e-05, + "loss": 0.4874, + "step": 1038 + }, + { + "epoch": 1.0591233435270133, + "grad_norm": 8.564006805419922, + "learning_rate": 1.889570903286322e-05, + "loss": 0.8818, + "step": 1039 + }, + { + "epoch": 1.0601427115188584, + "grad_norm": 3.6055350303649902, + "learning_rate": 1.8748406823840726e-05, + "loss": 0.1726, + "step": 1040 + }, + { + "epoch": 1.0611620795107033, + "grad_norm": 5.525221824645996, + "learning_rate": 1.860154841402288e-05, + "loss": 0.3581, + "step": 1041 + }, + { + "epoch": 1.0621814475025484, + "grad_norm": 3.264824390411377, + "learning_rate": 1.8455135888924013e-05, + "loss": 0.1545, + "step": 1042 + }, + { + "epoch": 1.0632008154943935, + "grad_norm": 6.2633891105651855, + "learning_rate": 1.8309171327726522e-05, + "loss": 0.6471, + "step": 1043 + }, + { + "epoch": 1.0642201834862386, + "grad_norm": 8.516809463500977, + "learning_rate": 1.816365680325134e-05, + "loss": 0.8047, + "step": 1044 + }, + { + "epoch": 1.0652395514780837, + "grad_norm": 6.006096839904785, + "learning_rate": 1.8018594381928444e-05, + "loss": 0.3753, + "step": 1045 + }, + { + "epoch": 1.0662589194699286, + "grad_norm": 8.395868301391602, + "learning_rate": 1.7873986123767648e-05, + "loss": 0.4489, + "step": 1046 + }, + { + "epoch": 1.0672782874617737, + "grad_norm": 2.2208871841430664, + "learning_rate": 1.7729834082329184e-05, + "loss": 0.133, + "step": 1047 + }, + { + "epoch": 1.0682976554536188, + "grad_norm": 4.6480913162231445, + "learning_rate": 1.7586140304694655e-05, + "loss": 0.3512, + "step": 1048 + }, + { + "epoch": 1.0693170234454639, + "grad_norm": 5.258199214935303, + "learning_rate": 1.7442906831437927e-05, + "loss": 0.2497, + "step": 1049 + }, + { + "epoch": 1.070336391437309, + "grad_norm": 5.950115203857422, + "learning_rate": 1.730013569659616e-05, + "loss": 0.298, + "step": 
1050 + }, + { + "epoch": 1.0713557594291538, + "grad_norm": 5.7246994972229, + "learning_rate": 1.715782892764092e-05, + "loss": 0.2226, + "step": 1051 + }, + { + "epoch": 1.072375127420999, + "grad_norm": 4.580132484436035, + "learning_rate": 1.7015988545449318e-05, + "loss": 0.3375, + "step": 1052 + }, + { + "epoch": 1.073394495412844, + "grad_norm": 3.1227149963378906, + "learning_rate": 1.6874616564275463e-05, + "loss": 0.2353, + "step": 1053 + }, + { + "epoch": 1.0744138634046891, + "grad_norm": 6.9805908203125, + "learning_rate": 1.673371499172174e-05, + "loss": 0.3775, + "step": 1054 + }, + { + "epoch": 1.0754332313965342, + "grad_norm": 5.377047538757324, + "learning_rate": 1.6593285828710298e-05, + "loss": 0.4914, + "step": 1055 + }, + { + "epoch": 1.0764525993883791, + "grad_norm": 5.249961853027344, + "learning_rate": 1.6453331069454718e-05, + "loss": 0.2293, + "step": 1056 + }, + { + "epoch": 1.0774719673802242, + "grad_norm": 4.464038848876953, + "learning_rate": 1.6313852701431597e-05, + "loss": 0.3663, + "step": 1057 + }, + { + "epoch": 1.0784913353720693, + "grad_norm": 3.9056015014648438, + "learning_rate": 1.6174852705352418e-05, + "loss": 0.4417, + "step": 1058 + }, + { + "epoch": 1.0795107033639144, + "grad_norm": 2.9351861476898193, + "learning_rate": 1.603633305513536e-05, + "loss": 0.1897, + "step": 1059 + }, + { + "epoch": 1.0805300713557595, + "grad_norm": 7.004980564117432, + "learning_rate": 1.5898295717877255e-05, + "loss": 0.6545, + "step": 1060 + }, + { + "epoch": 1.0815494393476044, + "grad_norm": 3.4823687076568604, + "learning_rate": 1.5760742653825706e-05, + "loss": 0.2848, + "step": 1061 + }, + { + "epoch": 1.0825688073394495, + "grad_norm": 7.051894187927246, + "learning_rate": 1.5623675816351224e-05, + "loss": 0.4413, + "step": 1062 + }, + { + "epoch": 1.0835881753312946, + "grad_norm": 4.416322708129883, + "learning_rate": 1.5487097151919494e-05, + "loss": 0.2769, + "step": 1063 + }, + { + "epoch": 1.0846075433231397, + 
"grad_norm": 3.4366087913513184, + "learning_rate": 1.5351008600063728e-05, + "loss": 0.2127, + "step": 1064 + }, + { + "epoch": 1.0856269113149848, + "grad_norm": 4.421938419342041, + "learning_rate": 1.5215412093357084e-05, + "loss": 0.3111, + "step": 1065 + }, + { + "epoch": 1.0866462793068297, + "grad_norm": 3.5228912830352783, + "learning_rate": 1.5080309557385303e-05, + "loss": 0.2097, + "step": 1066 + }, + { + "epoch": 1.0876656472986748, + "grad_norm": 4.430333614349365, + "learning_rate": 1.4945702910719334e-05, + "loss": 0.2142, + "step": 1067 + }, + { + "epoch": 1.0886850152905199, + "grad_norm": 3.3618974685668945, + "learning_rate": 1.4811594064888019e-05, + "loss": 0.3016, + "step": 1068 + }, + { + "epoch": 1.089704383282365, + "grad_norm": 6.342488765716553, + "learning_rate": 1.467798492435104e-05, + "loss": 0.1612, + "step": 1069 + }, + { + "epoch": 1.09072375127421, + "grad_norm": 5.643674850463867, + "learning_rate": 1.4544877386471856e-05, + "loss": 0.2357, + "step": 1070 + }, + { + "epoch": 1.091743119266055, + "grad_norm": 6.614534378051758, + "learning_rate": 1.4412273341490706e-05, + "loss": 0.6814, + "step": 1071 + }, + { + "epoch": 1.0927624872579, + "grad_norm": 1.9248261451721191, + "learning_rate": 1.4280174672497837e-05, + "loss": 0.0978, + "step": 1072 + }, + { + "epoch": 1.0937818552497451, + "grad_norm": 9.543230056762695, + "learning_rate": 1.4148583255406684e-05, + "loss": 0.8243, + "step": 1073 + }, + { + "epoch": 1.0948012232415902, + "grad_norm": 3.6566359996795654, + "learning_rate": 1.4017500958927298e-05, + "loss": 0.1563, + "step": 1074 + }, + { + "epoch": 1.0958205912334353, + "grad_norm": 7.203381538391113, + "learning_rate": 1.3886929644539798e-05, + "loss": 0.2596, + "step": 1075 + }, + { + "epoch": 1.0968399592252802, + "grad_norm": 3.83259654045105, + "learning_rate": 1.3756871166467894e-05, + "loss": 0.1584, + "step": 1076 + }, + { + "epoch": 1.0978593272171253, + "grad_norm": 6.46610164642334, + "learning_rate": 
1.3627327371652643e-05, + "loss": 0.4703, + "step": 1077 + }, + { + "epoch": 1.0988786952089704, + "grad_norm": 11.403766632080078, + "learning_rate": 1.3498300099726042e-05, + "loss": 1.0745, + "step": 1078 + }, + { + "epoch": 1.0988786952089704, + "eval_Qnli-dev-1024_cosine_accuracy": 0.75, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8006778955459595, + "eval_Qnli-dev-1024_cosine_ap": 0.7824068286827124, + "eval_Qnli-dev-1024_cosine_f1": 0.7289719626168225, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7280842661857605, + "eval_Qnli-dev-1024_cosine_mcc": 0.43373226132862797, + "eval_Qnli-dev-1024_cosine_precision": 0.6290322580645161, + "eval_Qnli-dev-1024_cosine_recall": 0.8666666666666667, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6900060772895813, + "eval_Qnli-dev_cosine_ap": 0.7604401746827563, + "eval_Qnli-dev_cosine_f1": 0.7422680412371134, + "eval_Qnli-dev_cosine_f1_threshold": 0.6807612776756287, + "eval_Qnli-dev_cosine_mcc": 0.48701780569984915, + "eval_Qnli-dev_cosine_precision": 0.6923076923076923, + "eval_Qnli-dev_cosine_recall": 0.8, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9479166865348816, + "eval_global_dataset_loss": 0.3292054533958435, + "eval_global_dataset_runtime": 103.8809, + "eval_global_dataset_samples_per_second": 7.73, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8872344763338829, + "eval_sts-test-1024_spearman_cosine": 0.913351335857396, + "eval_sts-test_pearson_cosine": 0.9070699316238475, + "eval_sts-test_spearman_cosine": 0.9203335932694187, + "step": 1078 + }, + { + "epoch": 1.0998980632008155, + "grad_norm": 5.55651330947876, + "learning_rate": 1.3369791182985136e-05, + "loss": 0.3291, + "step": 1079 + }, + { + "epoch": 1.1009174311926606, + "grad_norm": 4.600091934204102, + "learning_rate": 
1.3241802446365853e-05, + "loss": 0.2577, + "step": 1080 + }, + { + "epoch": 1.1019367991845055, + "grad_norm": 3.14005446434021, + "learning_rate": 1.3114335707417108e-05, + "loss": 0.3184, + "step": 1081 + }, + { + "epoch": 1.1029561671763506, + "grad_norm": 8.673876762390137, + "learning_rate": 1.2987392776275025e-05, + "loss": 0.5004, + "step": 1082 + }, + { + "epoch": 1.1039755351681957, + "grad_norm": 3.660937547683716, + "learning_rate": 1.286097545563718e-05, + "loss": 0.1591, + "step": 1083 + }, + { + "epoch": 1.1049949031600408, + "grad_norm": 6.43077278137207, + "learning_rate": 1.2735085540737063e-05, + "loss": 0.2775, + "step": 1084 + }, + { + "epoch": 1.1060142711518859, + "grad_norm": 5.092467308044434, + "learning_rate": 1.2609724819318542e-05, + "loss": 0.4079, + "step": 1085 + }, + { + "epoch": 1.107033639143731, + "grad_norm": 9.57333755493164, + "learning_rate": 1.2484895071610486e-05, + "loss": 0.7756, + "step": 1086 + }, + { + "epoch": 1.1080530071355759, + "grad_norm": 8.876251220703125, + "learning_rate": 1.236059807030151e-05, + "loss": 0.7169, + "step": 1087 + }, + { + "epoch": 1.109072375127421, + "grad_norm": 2.4624123573303223, + "learning_rate": 1.2236835580514717e-05, + "loss": 0.1473, + "step": 1088 + }, + { + "epoch": 1.110091743119266, + "grad_norm": 6.119423866271973, + "learning_rate": 1.2113609359782757e-05, + "loss": 0.4862, + "step": 1089 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 7.974935054779053, + "learning_rate": 1.1990921158022766e-05, + "loss": 0.6355, + "step": 1090 + }, + { + "epoch": 1.1121304791029563, + "grad_norm": 4.422234535217285, + "learning_rate": 1.1868772717511568e-05, + "loss": 0.2046, + "step": 1091 + }, + { + "epoch": 1.1131498470948011, + "grad_norm": 3.5344936847686768, + "learning_rate": 1.1747165772860941e-05, + "loss": 0.2609, + "step": 1092 + }, + { + "epoch": 1.1141692150866462, + "grad_norm": 10.226187705993652, + "learning_rate": 1.1626102050992877e-05, + "loss": 0.6422, + "step": 
1093 + }, + { + "epoch": 1.1151885830784913, + "grad_norm": 4.9040961265563965, + "learning_rate": 1.1505583271115228e-05, + "loss": 0.2296, + "step": 1094 + }, + { + "epoch": 1.1162079510703364, + "grad_norm": 3.2139878273010254, + "learning_rate": 1.1385611144697178e-05, + "loss": 0.2221, + "step": 1095 + }, + { + "epoch": 1.1172273190621815, + "grad_norm": 2.8489925861358643, + "learning_rate": 1.1266187375444964e-05, + "loss": 0.142, + "step": 1096 + }, + { + "epoch": 1.1182466870540264, + "grad_norm": 4.342340469360352, + "learning_rate": 1.1147313659277715e-05, + "loss": 0.4458, + "step": 1097 + }, + { + "epoch": 1.1192660550458715, + "grad_norm": 7.662566661834717, + "learning_rate": 1.1028991684303259e-05, + "loss": 0.6697, + "step": 1098 + }, + { + "epoch": 1.1202854230377166, + "grad_norm": 6.858183860778809, + "learning_rate": 1.0911223130794307e-05, + "loss": 0.3055, + "step": 1099 + }, + { + "epoch": 1.1213047910295617, + "grad_norm": 7.914067268371582, + "learning_rate": 1.0794009671164484e-05, + "loss": 0.7791, + "step": 1100 + }, + { + "epoch": 1.1223241590214068, + "grad_norm": 3.899747610092163, + "learning_rate": 1.0677352969944627e-05, + "loss": 0.2956, + "step": 1101 + }, + { + "epoch": 1.1233435270132517, + "grad_norm": 6.076767444610596, + "learning_rate": 1.0561254683759114e-05, + "loss": 0.1481, + "step": 1102 + }, + { + "epoch": 1.1243628950050968, + "grad_norm": 8.103065490722656, + "learning_rate": 1.0445716461302347e-05, + "loss": 0.6428, + "step": 1103 + }, + { + "epoch": 1.1253822629969419, + "grad_norm": 5.3769636154174805, + "learning_rate": 1.033073994331536e-05, + "loss": 0.5238, + "step": 1104 + }, + { + "epoch": 1.126401630988787, + "grad_norm": 7.144674777984619, + "learning_rate": 1.0216326762562512e-05, + "loss": 0.6283, + "step": 1105 + }, + { + "epoch": 1.127420998980632, + "grad_norm": 8.02340030670166, + "learning_rate": 1.010247854380828e-05, + "loss": 0.442, + "step": 1106 + }, + { + "epoch": 1.1284403669724772, + 
"grad_norm": 3.026409149169922, + "learning_rate": 9.989196903794217e-06, + "loss": 0.2222, + "step": 1107 + }, + { + "epoch": 1.129459734964322, + "grad_norm": 6.199506759643555, + "learning_rate": 9.876483451215945e-06, + "loss": 0.3275, + "step": 1108 + }, + { + "epoch": 1.1304791029561672, + "grad_norm": 3.136950969696045, + "learning_rate": 9.764339786700372e-06, + "loss": 0.1822, + "step": 1109 + }, + { + "epoch": 1.1314984709480123, + "grad_norm": 7.81477689743042, + "learning_rate": 9.652767502782916e-06, + "loss": 0.4456, + "step": 1110 + }, + { + "epoch": 1.1325178389398574, + "grad_norm": 4.547238349914551, + "learning_rate": 9.541768183884913e-06, + "loss": 0.3127, + "step": 1111 + }, + { + "epoch": 1.1335372069317025, + "grad_norm": 4.855820655822754, + "learning_rate": 9.431343406291115e-06, + "loss": 0.2407, + "step": 1112 + }, + { + "epoch": 1.1345565749235473, + "grad_norm": 4.511462211608887, + "learning_rate": 9.321494738127257e-06, + "loss": 0.2194, + "step": 1113 + }, + { + "epoch": 1.1355759429153924, + "grad_norm": 12.011505126953125, + "learning_rate": 9.212223739337883e-06, + "loss": 0.9654, + "step": 1114 + }, + { + "epoch": 1.1365953109072375, + "grad_norm": 8.190558433532715, + "learning_rate": 9.103531961664118e-06, + "loss": 0.6017, + "step": 1115 + }, + { + "epoch": 1.1376146788990826, + "grad_norm": 7.281464576721191, + "learning_rate": 8.99542094862164e-06, + "loss": 0.2722, + "step": 1116 + }, + { + "epoch": 1.1386340468909277, + "grad_norm": 2.0321619510650635, + "learning_rate": 8.887892235478817e-06, + "loss": 0.1289, + "step": 1117 + }, + { + "epoch": 1.1396534148827726, + "grad_norm": 9.805882453918457, + "learning_rate": 8.780947349234797e-06, + "loss": 0.6156, + "step": 1118 + }, + { + "epoch": 1.1406727828746177, + "grad_norm": 5.596035003662109, + "learning_rate": 8.67458780859795e-06, + "loss": 0.3163, + "step": 1119 + }, + { + "epoch": 1.1416921508664628, + "grad_norm": 5.5199785232543945, + "learning_rate": 
8.568815123964225e-06, + "loss": 0.3798, + "step": 1120 + }, + { + "epoch": 1.142711518858308, + "grad_norm": 6.384104251861572, + "learning_rate": 8.463630797395705e-06, + "loss": 0.3098, + "step": 1121 + }, + { + "epoch": 1.143730886850153, + "grad_norm": 3.7240169048309326, + "learning_rate": 8.35903632259929e-06, + "loss": 0.1738, + "step": 1122 + }, + { + "epoch": 1.1447502548419979, + "grad_norm": 4.985114097595215, + "learning_rate": 8.255033184905481e-06, + "loss": 0.3891, + "step": 1123 + }, + { + "epoch": 1.145769622833843, + "grad_norm": 2.9914612770080566, + "learning_rate": 8.151622861247304e-06, + "loss": 0.2051, + "step": 1124 + }, + { + "epoch": 1.146788990825688, + "grad_norm": 7.231305122375488, + "learning_rate": 8.04880682013931e-06, + "loss": 0.4286, + "step": 1125 + }, + { + "epoch": 1.1478083588175332, + "grad_norm": 4.831548690795898, + "learning_rate": 7.946586521656751e-06, + "loss": 0.4055, + "step": 1126 + }, + { + "epoch": 1.1488277268093783, + "grad_norm": 7.110738754272461, + "learning_rate": 7.84496341741478e-06, + "loss": 0.6177, + "step": 1127 + }, + { + "epoch": 1.1498470948012232, + "grad_norm": 4.9150919914245605, + "learning_rate": 7.743938950547925e-06, + "loss": 0.2766, + "step": 1128 + }, + { + "epoch": 1.1508664627930683, + "grad_norm": 3.1030569076538086, + "learning_rate": 7.643514555689552e-06, + "loss": 0.1609, + "step": 1129 + }, + { + "epoch": 1.1518858307849134, + "grad_norm": 6.362361431121826, + "learning_rate": 7.543691658951479e-06, + "loss": 0.2926, + "step": 1130 + }, + { + "epoch": 1.1529051987767585, + "grad_norm": 6.581395626068115, + "learning_rate": 7.444471677903775e-06, + "loss": 0.3756, + "step": 1131 + }, + { + "epoch": 1.1539245667686036, + "grad_norm": 7.190741062164307, + "learning_rate": 7.345856021554509e-06, + "loss": 0.3885, + "step": 1132 + }, + { + "epoch": 1.1549439347604484, + "grad_norm": 6.979515552520752, + "learning_rate": 7.247846090329913e-06, + "loss": 0.2404, + "step": 1133 + }, + { 
+ "epoch": 1.1559633027522935, + "grad_norm": 5.002467632293701, + "learning_rate": 7.150443276054369e-06, + "loss": 0.2946, + "step": 1134 + }, + { + "epoch": 1.1569826707441386, + "grad_norm": 3.771885395050049, + "learning_rate": 7.053648961930681e-06, + "loss": 0.4079, + "step": 1135 + }, + { + "epoch": 1.1580020387359837, + "grad_norm": 8.6467866897583, + "learning_rate": 6.9574645225204735e-06, + "loss": 0.3785, + "step": 1136 + }, + { + "epoch": 1.1590214067278288, + "grad_norm": 6.50823974609375, + "learning_rate": 6.861891323724551e-06, + "loss": 0.2116, + "step": 1137 + }, + { + "epoch": 1.1600407747196737, + "grad_norm": 3.70011568069458, + "learning_rate": 6.766930722763642e-06, + "loss": 0.2715, + "step": 1138 + }, + { + "epoch": 1.1610601427115188, + "grad_norm": 3.5144193172454834, + "learning_rate": 6.672584068159055e-06, + "loss": 0.1894, + "step": 1139 + }, + { + "epoch": 1.162079510703364, + "grad_norm": 9.98049545288086, + "learning_rate": 6.578852699713539e-06, + "loss": 0.4822, + "step": 1140 + }, + { + "epoch": 1.163098878695209, + "grad_norm": 5.186913967132568, + "learning_rate": 6.4857379484922486e-06, + "loss": 0.2893, + "step": 1141 + }, + { + "epoch": 1.164118246687054, + "grad_norm": 11.172896385192871, + "learning_rate": 6.3932411368038455e-06, + "loss": 0.993, + "step": 1142 + }, + { + "epoch": 1.165137614678899, + "grad_norm": 8.763799667358398, + "learning_rate": 6.3013635781817234e-06, + "loss": 0.5991, + "step": 1143 + }, + { + "epoch": 1.166156982670744, + "grad_norm": 8.425676345825195, + "learning_rate": 6.210106577365382e-06, + "loss": 0.7188, + "step": 1144 + }, + { + "epoch": 1.1671763506625892, + "grad_norm": 3.8981945514678955, + "learning_rate": 6.119471430281837e-06, + "loss": 0.1915, + "step": 1145 + }, + { + "epoch": 1.1681957186544343, + "grad_norm": 9.766255378723145, + "learning_rate": 6.0294594240272895e-06, + "loss": 0.7058, + "step": 1146 + }, + { + "epoch": 1.1692150866462794, + "grad_norm": 6.730464935302734, 
+ "learning_rate": 5.940071836848759e-06, + "loss": 0.5334, + "step": 1147 + }, + { + "epoch": 1.1702344546381243, + "grad_norm": 5.2849884033203125, + "learning_rate": 5.851309938126031e-06, + "loss": 0.4142, + "step": 1148 + }, + { + "epoch": 1.1712538226299694, + "grad_norm": 9.674934387207031, + "learning_rate": 5.763174988353565e-06, + "loss": 0.4762, + "step": 1149 + }, + { + "epoch": 1.1722731906218145, + "grad_norm": 8.052671432495117, + "learning_rate": 5.675668239122606e-06, + "loss": 0.4203, + "step": 1150 + }, + { + "epoch": 1.1732925586136596, + "grad_norm": 5.496038436889648, + "learning_rate": 5.588790933103444e-06, + "loss": 0.2975, + "step": 1151 + }, + { + "epoch": 1.1743119266055047, + "grad_norm": 6.982463359832764, + "learning_rate": 5.502544304027701e-06, + "loss": 0.3294, + "step": 1152 + }, + { + "epoch": 1.1753312945973495, + "grad_norm": 6.534660816192627, + "learning_rate": 5.41692957667086e-06, + "loss": 0.4958, + "step": 1153 + }, + { + "epoch": 1.1763506625891946, + "grad_norm": 5.204538345336914, + "learning_rate": 5.3319479668348775e-06, + "loss": 0.2991, + "step": 1154 + }, + { + "epoch": 1.1773700305810397, + "grad_norm": 6.391998767852783, + "learning_rate": 5.247600681330905e-06, + "loss": 0.3466, + "step": 1155 + }, + { + "epoch": 1.1783893985728848, + "grad_norm": 5.729480266571045, + "learning_rate": 5.16388891796214e-06, + "loss": 0.3326, + "step": 1156 + }, + { + "epoch": 1.17940876656473, + "grad_norm": 7.996901988983154, + "learning_rate": 5.0808138655068115e-06, + "loss": 0.4252, + "step": 1157 + }, + { + "epoch": 1.1804281345565748, + "grad_norm": 1.6697102785110474, + "learning_rate": 4.99837670370133e-06, + "loss": 0.1011, + "step": 1158 + }, + { + "epoch": 1.18144750254842, + "grad_norm": 6.206373691558838, + "learning_rate": 4.916578603223515e-06, + "loss": 0.3505, + "step": 1159 + }, + { + "epoch": 1.182466870540265, + "grad_norm": 8.138467788696289, + "learning_rate": 4.835420725675965e-06, + "loss": 0.2831, + 
"step": 1160 + }, + { + "epoch": 1.18348623853211, + "grad_norm": 4.128909587860107, + "learning_rate": 4.754904223569584e-06, + "loss": 0.2483, + "step": 1161 + }, + { + "epoch": 1.1845056065239552, + "grad_norm": 5.585041522979736, + "learning_rate": 4.67503024030716e-06, + "loss": 0.2343, + "step": 1162 + }, + { + "epoch": 1.1855249745158003, + "grad_norm": 9.036812782287598, + "learning_rate": 4.5957999101672145e-06, + "loss": 0.4812, + "step": 1163 + }, + { + "epoch": 1.1865443425076452, + "grad_norm": 7.6726765632629395, + "learning_rate": 4.517214358287825e-06, + "loss": 0.6771, + "step": 1164 + }, + { + "epoch": 1.1875637104994903, + "grad_norm": 9.784526824951172, + "learning_rate": 4.439274700650659e-06, + "loss": 0.6021, + "step": 1165 + }, + { + "epoch": 1.1885830784913354, + "grad_norm": 4.780755996704102, + "learning_rate": 4.361982044065166e-06, + "loss": 0.3105, + "step": 1166 + }, + { + "epoch": 1.1896024464831805, + "grad_norm": 5.239990711212158, + "learning_rate": 4.2853374861527905e-06, + "loss": 0.3328, + "step": 1167 + }, + { + "epoch": 1.1906218144750256, + "grad_norm": 8.040908813476562, + "learning_rate": 4.209342115331455e-06, + "loss": 0.6799, + "step": 1168 + }, + { + "epoch": 1.1916411824668705, + "grad_norm": 9.09835147857666, + "learning_rate": 4.133997010800072e-06, + "loss": 0.3449, + "step": 1169 + }, + { + "epoch": 1.1926605504587156, + "grad_norm": 1.4003539085388184, + "learning_rate": 4.0593032425231995e-06, + "loss": 0.0486, + "step": 1170 + }, + { + "epoch": 1.1936799184505607, + "grad_norm": 4.901974201202393, + "learning_rate": 3.985261871215906e-06, + "loss": 0.216, + "step": 1171 + }, + { + "epoch": 1.1946992864424058, + "grad_norm": 4.057441711425781, + "learning_rate": 3.9118739483285985e-06, + "loss": 0.241, + "step": 1172 + }, + { + "epoch": 1.1957186544342508, + "grad_norm": 3.083153247833252, + "learning_rate": 3.83914051603223e-06, + "loss": 0.1816, + "step": 1173 + }, + { + "epoch": 1.1967380224260957, + 
"grad_norm": 4.460226535797119, + "learning_rate": 3.767062607203392e-06, + "loss": 0.1868, + "step": 1174 + }, + { + "epoch": 1.1977573904179408, + "grad_norm": 4.221453666687012, + "learning_rate": 3.695641245409709e-06, + "loss": 0.2032, + "step": 1175 + }, + { + "epoch": 1.198776758409786, + "grad_norm": 4.9512434005737305, + "learning_rate": 3.624877444895275e-06, + "loss": 0.3835, + "step": 1176 + }, + { + "epoch": 1.198776758409786, + "eval_Qnli-dev-1024_cosine_accuracy": 0.75, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7976787090301514, + "eval_Qnli-dev-1024_cosine_ap": 0.7822748522324331, + "eval_Qnli-dev-1024_cosine_f1": 0.7238095238095237, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.728473961353302, + "eval_Qnli-dev-1024_cosine_mcc": 0.42578476395267345, + "eval_Qnli-dev-1024_cosine_precision": 0.6333333333333333, + "eval_Qnli-dev-1024_cosine_recall": 0.8444444444444444, + "eval_Qnli-dev_cosine_accuracy": 0.75, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6868179440498352, + "eval_Qnli-dev_cosine_ap": 0.7613974691552876, + "eval_Qnli-dev_cosine_f1": 0.7422680412371134, + "eval_Qnli-dev_cosine_f1_threshold": 0.6748286485671997, + "eval_Qnli-dev_cosine_mcc": 0.48701780569984915, + "eval_Qnli-dev_cosine_precision": 0.6923076923076923, + "eval_Qnli-dev_cosine_recall": 0.8, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9479166865348816, + "eval_global_dataset_loss": 0.31731289625167847, + "eval_global_dataset_runtime": 103.9541, + "eval_global_dataset_samples_per_second": 7.725, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8887238353668264, + "eval_sts-test-1024_spearman_cosine": 0.9149437700482935, + "eval_sts-test_pearson_cosine": 0.9069731992077504, + "eval_sts-test_spearman_cosine": 0.9201751537259693, + "step": 1176 + }, + { + "epoch": 1.199796126401631, + "grad_norm": 
3.4483885765075684, + "learning_rate": 3.554772210566221e-06, + "loss": 0.1712, + "step": 1177 + }, + { + "epoch": 1.2008154943934761, + "grad_norm": 2.9189963340759277, + "learning_rate": 3.4853265379765133e-06, + "loss": 0.2074, + "step": 1178 + }, + { + "epoch": 1.2018348623853212, + "grad_norm": 7.722777366638184, + "learning_rate": 3.4165414133137728e-06, + "loss": 0.6232, + "step": 1179 + }, + { + "epoch": 1.202854230377166, + "grad_norm": 5.288763046264648, + "learning_rate": 3.348417813385274e-06, + "loss": 0.4244, + "step": 1180 + }, + { + "epoch": 1.2038735983690112, + "grad_norm": 9.635140419006348, + "learning_rate": 3.2809567056040937e-06, + "loss": 0.3264, + "step": 1181 + }, + { + "epoch": 1.2048929663608563, + "grad_norm": 8.942253112792969, + "learning_rate": 3.2141590479753236e-06, + "loss": 0.608, + "step": 1182 + }, + { + "epoch": 1.2059123343527014, + "grad_norm": 6.490620136260986, + "learning_rate": 3.1480257890825205e-06, + "loss": 0.305, + "step": 1183 + }, + { + "epoch": 1.2069317023445465, + "grad_norm": 4.955615043640137, + "learning_rate": 3.082557868074221e-06, + "loss": 0.275, + "step": 1184 + }, + { + "epoch": 1.2079510703363914, + "grad_norm": 7.741005897521973, + "learning_rate": 3.0177562146505856e-06, + "loss": 0.329, + "step": 1185 + }, + { + "epoch": 1.2089704383282365, + "grad_norm": 5.985404968261719, + "learning_rate": 2.953621749050206e-06, + "loss": 0.7011, + "step": 1186 + }, + { + "epoch": 1.2099898063200816, + "grad_norm": 7.798089027404785, + "learning_rate": 2.8901553820370463e-06, + "loss": 0.4506, + "step": 1187 + }, + { + "epoch": 1.2110091743119267, + "grad_norm": 1.5254271030426025, + "learning_rate": 2.827358014887499e-06, + "loss": 0.1123, + "step": 1188 + }, + { + "epoch": 1.2120285423037718, + "grad_norm": 9.104569435119629, + "learning_rate": 2.7652305393775947e-06, + "loss": 0.407, + "step": 1189 + }, + { + "epoch": 1.2130479102956166, + "grad_norm": 2.4905483722686768, + "learning_rate": 
2.70377383777034e-06, + "loss": 0.1889, + "step": 1190 + }, + { + "epoch": 1.2140672782874617, + "grad_norm": 5.599071979522705, + "learning_rate": 2.6429887828031407e-06, + "loss": 0.4215, + "step": 1191 + }, + { + "epoch": 1.2150866462793068, + "grad_norm": 3.8303184509277344, + "learning_rate": 2.5828762376755024e-06, + "loss": 0.21, + "step": 1192 + }, + { + "epoch": 1.216106014271152, + "grad_norm": 8.759214401245117, + "learning_rate": 2.523437056036687e-06, + "loss": 0.3706, + "step": 1193 + }, + { + "epoch": 1.217125382262997, + "grad_norm": 5.240026950836182, + "learning_rate": 2.4646720819736344e-06, + "loss": 0.3331, + "step": 1194 + }, + { + "epoch": 1.218144750254842, + "grad_norm": 6.483129024505615, + "learning_rate": 2.4065821499989647e-06, + "loss": 0.5147, + "step": 1195 + }, + { + "epoch": 1.219164118246687, + "grad_norm": 8.765278816223145, + "learning_rate": 2.3491680850391105e-06, + "loss": 0.4573, + "step": 1196 + }, + { + "epoch": 1.2201834862385321, + "grad_norm": 6.686453342437744, + "learning_rate": 2.2924307024226322e-06, + "loss": 0.3008, + "step": 1197 + }, + { + "epoch": 1.2212028542303772, + "grad_norm": 9.20877742767334, + "learning_rate": 2.2363708078686263e-06, + "loss": 0.9406, + "step": 1198 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 1.597818374633789, + "learning_rate": 2.1809891974752694e-06, + "loss": 0.0947, + "step": 1199 + }, + { + "epoch": 1.2232415902140672, + "grad_norm": 2.858272075653076, + "learning_rate": 2.126286657708548e-06, + "loss": 0.1345, + "step": 1200 + }, + { + "epoch": 1.2242609582059123, + "grad_norm": 3.3939218521118164, + "learning_rate": 2.072263965391047e-06, + "loss": 0.2185, + "step": 1201 + }, + { + "epoch": 1.2252803261977574, + "grad_norm": 3.4250073432922363, + "learning_rate": 2.0189218876909444e-06, + "loss": 0.289, + "step": 1202 + }, + { + "epoch": 1.2262996941896025, + "grad_norm": 4.34546422958374, + "learning_rate": 1.9662611821111122e-06, + "loss": 0.3703, + "step": 1203 + 
}, + { + "epoch": 1.2273190621814476, + "grad_norm": 8.702229499816895, + "learning_rate": 1.914282596478373e-06, + "loss": 0.4135, + "step": 1204 + }, + { + "epoch": 1.2283384301732925, + "grad_norm": 5.491823673248291, + "learning_rate": 1.8629868689328533e-06, + "loss": 0.34, + "step": 1205 + }, + { + "epoch": 1.2293577981651376, + "grad_norm": 6.028850078582764, + "learning_rate": 1.8123747279174986e-06, + "loss": 0.2804, + "step": 1206 + }, + { + "epoch": 1.2303771661569827, + "grad_norm": 4.991639614105225, + "learning_rate": 1.7624468921677738e-06, + "loss": 0.1493, + "step": 1207 + }, + { + "epoch": 1.2313965341488278, + "grad_norm": 7.220829963684082, + "learning_rate": 1.713204070701413e-06, + "loss": 0.6822, + "step": 1208 + }, + { + "epoch": 1.2324159021406729, + "grad_norm": 4.651946544647217, + "learning_rate": 1.6646469628083583e-06, + "loss": 0.2489, + "step": 1209 + }, + { + "epoch": 1.2334352701325177, + "grad_norm": 3.441234827041626, + "learning_rate": 1.6167762580408585e-06, + "loss": 0.2702, + "step": 1210 + }, + { + "epoch": 1.2344546381243628, + "grad_norm": 6.1341776847839355, + "learning_rate": 1.5695926362036205e-06, + "loss": 0.22, + "step": 1211 + }, + { + "epoch": 1.235474006116208, + "grad_norm": 5.035029888153076, + "learning_rate": 1.5230967673442066e-06, + "loss": 0.1727, + "step": 1212 + }, + { + "epoch": 1.236493374108053, + "grad_norm": 7.472972393035889, + "learning_rate": 1.4772893117435127e-06, + "loss": 0.5975, + "step": 1213 + }, + { + "epoch": 1.2375127420998981, + "grad_norm": 4.070760250091553, + "learning_rate": 1.432170919906367e-06, + "loss": 0.175, + "step": 1214 + }, + { + "epoch": 1.238532110091743, + "grad_norm": 3.1894075870513916, + "learning_rate": 1.3877422325523247e-06, + "loss": 0.1245, + "step": 1215 + }, + { + "epoch": 1.2395514780835881, + "grad_norm": 7.860419273376465, + "learning_rate": 1.3440038806065091e-06, + "loss": 0.9057, + "step": 1216 + }, + { + "epoch": 1.2405708460754332, + "grad_norm": 
2.675111770629883, + "learning_rate": 1.3009564851907297e-06, + "loss": 0.1773, + "step": 1217 + }, + { + "epoch": 1.2415902140672783, + "grad_norm": 7.003297805786133, + "learning_rate": 1.258600657614617e-06, + "loss": 0.6302, + "step": 1218 + }, + { + "epoch": 1.2426095820591234, + "grad_norm": 5.752772331237793, + "learning_rate": 1.2169369993669578e-06, + "loss": 0.4958, + "step": 1219 + }, + { + "epoch": 1.2436289500509683, + "grad_norm": 5.576023578643799, + "learning_rate": 1.1759661021071288e-06, + "loss": 0.3934, + "step": 1220 + }, + { + "epoch": 1.2446483180428134, + "grad_norm": 7.377546787261963, + "learning_rate": 1.1356885476567214e-06, + "loss": 0.6317, + "step": 1221 + }, + { + "epoch": 1.2456676860346585, + "grad_norm": 4.526902198791504, + "learning_rate": 1.0961049079912633e-06, + "loss": 0.2713, + "step": 1222 + }, + { + "epoch": 1.2466870540265036, + "grad_norm": 5.726096153259277, + "learning_rate": 1.0572157452321097e-06, + "loss": 0.4534, + "step": 1223 + }, + { + "epoch": 1.2477064220183487, + "grad_norm": 5.3342413902282715, + "learning_rate": 1.0190216116384488e-06, + "loss": 0.2045, + "step": 1224 + }, + { + "epoch": 1.2487257900101936, + "grad_norm": 2.1736345291137695, + "learning_rate": 9.8152304959947e-07, + "loss": 0.2142, + "step": 1225 + }, + { + "epoch": 1.2497451580020387, + "grad_norm": 8.47685718536377, + "learning_rate": 9.447205916266411e-07, + "loss": 0.6366, + "step": 1226 + }, + { + "epoch": 1.2507645259938838, + "grad_norm": 7.194876670837402, + "learning_rate": 9.086147603461714e-07, + "loss": 0.5906, + "step": 1227 + }, + { + "epoch": 1.2517838939857289, + "grad_norm": 4.089115142822266, + "learning_rate": 8.732060684915721e-07, + "loss": 0.1791, + "step": 1228 + }, + { + "epoch": 1.252803261977574, + "grad_norm": 7.872777938842773, + "learning_rate": 8.384950188963902e-07, + "loss": 0.2664, + "step": 1229 + }, + { + "epoch": 1.2538226299694188, + "grad_norm": 4.340756893157959, + "learning_rate": 
8.044821044870642e-07, + "loss": 0.3745, + "step": 1230 + }, + { + "epoch": 1.254841997961264, + "grad_norm": 3.608949899673462, + "learning_rate": 7.711678082758855e-07, + "loss": 0.3423, + "step": 1231 + }, + { + "epoch": 1.255861365953109, + "grad_norm": 6.383975505828857, + "learning_rate": 7.38552603354209e-07, + "loss": 0.2945, + "step": 1232 + }, + { + "epoch": 1.2568807339449541, + "grad_norm": 8.53671646118164, + "learning_rate": 7.066369528856809e-07, + "loss": 0.4634, + "step": 1233 + }, + { + "epoch": 1.2579001019367992, + "grad_norm": 7.338963985443115, + "learning_rate": 6.754213100996942e-07, + "loss": 0.3154, + "step": 1234 + }, + { + "epoch": 1.2589194699286441, + "grad_norm": 3.15653920173645, + "learning_rate": 6.449061182849215e-07, + "loss": 0.2901, + "step": 1235 + }, + { + "epoch": 1.2599388379204892, + "grad_norm": 5.7759928703308105, + "learning_rate": 6.150918107830361e-07, + "loss": 0.5114, + "step": 1236 + }, + { + "epoch": 1.2609582059123343, + "grad_norm": 4.276795387268066, + "learning_rate": 5.859788109825793e-07, + "loss": 0.1735, + "step": 1237 + }, + { + "epoch": 1.2619775739041794, + "grad_norm": 6.433792591094971, + "learning_rate": 5.575675323128915e-07, + "loss": 0.3959, + "step": 1238 + }, + { + "epoch": 1.2629969418960245, + "grad_norm": 4.392580986022949, + "learning_rate": 5.298583782383071e-07, + "loss": 0.3166, + "step": 1239 + }, + { + "epoch": 1.2640163098878694, + "grad_norm": 6.252894878387451, + "learning_rate": 5.028517422523749e-07, + "loss": 0.3007, + "step": 1240 + }, + { + "epoch": 1.2650356778797147, + "grad_norm": 2.0571022033691406, + "learning_rate": 4.7654800787230723e-07, + "loss": 0.0804, + "step": 1241 + }, + { + "epoch": 1.2660550458715596, + "grad_norm": 9.38656234741211, + "learning_rate": 4.509475486335013e-07, + "loss": 0.3005, + "step": 1242 + }, + { + "epoch": 1.2670744138634047, + "grad_norm": 7.827151775360107, + "learning_rate": 4.260507280842485e-07, + "loss": 0.3268, + "step": 1243 + }, + { 
+ "epoch": 1.2680937818552498, + "grad_norm": 4.051625728607178, + "learning_rate": 4.0185789978057774e-07, + "loss": 0.2535, + "step": 1244 + }, + { + "epoch": 1.2691131498470947, + "grad_norm": 7.677860736846924, + "learning_rate": 3.7836940728123716e-07, + "loss": 0.736, + "step": 1245 + }, + { + "epoch": 1.27013251783894, + "grad_norm": 5.909502029418945, + "learning_rate": 3.555855841427869e-07, + "loss": 0.3688, + "step": 1246 + }, + { + "epoch": 1.2711518858307849, + "grad_norm": 3.4724619388580322, + "learning_rate": 3.335067539149084e-07, + "loss": 0.2656, + "step": 1247 + }, + { + "epoch": 1.27217125382263, + "grad_norm": 5.613596439361572, + "learning_rate": 3.1213323013575825e-07, + "loss": 0.2453, + "step": 1248 + }, + { + "epoch": 1.273190621814475, + "grad_norm": 9.458756446838379, + "learning_rate": 2.914653163275549e-07, + "loss": 0.5073, + "step": 1249 + }, + { + "epoch": 1.2742099898063202, + "grad_norm": 6.05524206161499, + "learning_rate": 2.7150330599226e-07, + "loss": 0.573, + "step": 1250 + }, + { + "epoch": 1.2752293577981653, + "grad_norm": 3.4236321449279785, + "learning_rate": 2.5224748260739284e-07, + "loss": 0.2673, + "step": 1251 + }, + { + "epoch": 1.2762487257900101, + "grad_norm": 8.313695907592773, + "learning_rate": 2.3369811962203335e-07, + "loss": 0.5646, + "step": 1252 + }, + { + "epoch": 1.2772680937818552, + "grad_norm": 9.496676445007324, + "learning_rate": 2.1585548045290337e-07, + "loss": 0.3595, + "step": 1253 + }, + { + "epoch": 1.2782874617737003, + "grad_norm": 8.075193405151367, + "learning_rate": 1.987198184806638e-07, + "loss": 0.3005, + "step": 1254 + }, + { + "epoch": 1.2793068297655454, + "grad_norm": 5.755889892578125, + "learning_rate": 1.8229137704627864e-07, + "loss": 0.189, + "step": 1255 + }, + { + "epoch": 1.2803261977573905, + "grad_norm": 3.0507397651672363, + "learning_rate": 1.6657038944759563e-07, + "loss": 0.2303, + "step": 1256 + }, + { + "epoch": 1.2813455657492354, + "grad_norm": 
7.6736860275268555, + "learning_rate": 1.5155707893601546e-07, + "loss": 0.3083, + "step": 1257 + }, + { + "epoch": 1.2823649337410805, + "grad_norm": 4.9907050132751465, + "learning_rate": 1.3725165871331103e-07, + "loss": 0.4482, + "step": 1258 + }, + { + "epoch": 1.2833843017329256, + "grad_norm": 9.572212219238281, + "learning_rate": 1.236543319286243e-07, + "loss": 0.6032, + "step": 1259 + }, + { + "epoch": 1.2844036697247707, + "grad_norm": 4.888943195343018, + "learning_rate": 1.1076529167554639e-07, + "loss": 0.2635, + "step": 1260 + }, + { + "epoch": 1.2854230377166158, + "grad_norm": 6.444303512573242, + "learning_rate": 9.858472098942528e-08, + "loss": 0.3321, + "step": 1261 + }, + { + "epoch": 1.2864424057084607, + "grad_norm": 6.6462626457214355, + "learning_rate": 8.71127928447235e-08, + "loss": 0.4375, + "step": 1262 + }, + { + "epoch": 1.2874617737003058, + "grad_norm": 9.320449829101562, + "learning_rate": 7.63496701525701e-08, + "loss": 0.7222, + "step": 1263 + }, + { + "epoch": 1.2884811416921509, + "grad_norm": 9.66598129272461, + "learning_rate": 6.629550575847354e-08, + "loss": 0.5028, + "step": 1264 + }, + { + "epoch": 1.289500509683996, + "grad_norm": 8.09259033203125, + "learning_rate": 5.695044244011238e-08, + "loss": 0.5178, + "step": 1265 + }, + { + "epoch": 1.290519877675841, + "grad_norm": 5.617757320404053, + "learning_rate": 4.83146129053369e-08, + "loss": 0.4406, + "step": 1266 + }, + { + "epoch": 1.291539245667686, + "grad_norm": 2.1872336864471436, + "learning_rate": 4.038813979027056e-08, + "loss": 0.1136, + "step": 1267 + }, + { + "epoch": 1.292558613659531, + "grad_norm": 6.6291608810424805, + "learning_rate": 3.3171135657572575e-08, + "loss": 0.2167, + "step": 1268 + }, + { + "epoch": 1.2935779816513762, + "grad_norm": 5.140267848968506, + "learning_rate": 2.6663702994844664e-08, + "loss": 0.2206, + "step": 1269 + }, + { + "epoch": 1.2945973496432213, + "grad_norm": 4.506137371063232, + "learning_rate": 2.0865934213160078e-08, 
+ "loss": 0.2313, + "step": 1270 + }, + { + "epoch": 1.2956167176350664, + "grad_norm": 3.7297956943511963, + "learning_rate": 1.577791164577014e-08, + "loss": 0.248, + "step": 1271 + }, + { + "epoch": 1.2966360856269112, + "grad_norm": 4.870246887207031, + "learning_rate": 1.1399707546921879e-08, + "loss": 0.285, + "step": 1272 + }, + { + "epoch": 1.2976554536187563, + "grad_norm": 2.5686280727386475, + "learning_rate": 7.731384090842176e-09, + "loss": 0.127, + "step": 1273 + }, + { + "epoch": 1.2986748216106014, + "grad_norm": 11.340642929077148, + "learning_rate": 4.772993370832923e-09, + "loss": 0.5928, + "step": 1274 + }, + { + "epoch": 1.2986748216106014, + "eval_Qnli-dev-1024_cosine_accuracy": 0.75, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8024710416793823, + "eval_Qnli-dev-1024_cosine_ap": 0.7863440662349561, + "eval_Qnli-dev-1024_cosine_f1": 0.7222222222222222, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7177085280418396, + "eval_Qnli-dev-1024_cosine_mcc": 0.41614558708189836, + "eval_Qnli-dev-1024_cosine_precision": 0.6190476190476191, + "eval_Qnli-dev-1024_cosine_recall": 0.8666666666666667, + "eval_Qnli-dev_cosine_accuracy": 0.75, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6857460737228394, + "eval_Qnli-dev_cosine_ap": 0.761525183076105, + "eval_Qnli-dev_cosine_f1": 0.7422680412371134, + "eval_Qnli-dev_cosine_f1_threshold": 0.6738643646240234, + "eval_Qnli-dev_cosine_mcc": 0.48701780569984915, + "eval_Qnli-dev_cosine_precision": 0.6923076923076923, + "eval_Qnli-dev_cosine_recall": 0.8, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9479166865348816, + "eval_global_dataset_loss": 0.2720896899700165, + "eval_global_dataset_runtime": 103.9022, + "eval_global_dataset_samples_per_second": 7.728, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8890098652399769, + 
"eval_sts-test-1024_spearman_cosine": 0.9150082430472904, + "eval_sts-test_pearson_cosine": 0.906966954203853, + "eval_sts-test_spearman_cosine": 0.9201041628163744, + "step": 1274 + }, + { + "epoch": 1.2996941896024465, + "grad_norm": 5.426166534423828, + "learning_rate": 2.5245773985715927e-09, + "loss": 0.2689, + "step": 1275 + }, + { + "epoch": 1.3007135575942916, + "grad_norm": 6.4207868576049805, + "learning_rate": 9.861681034672997e-10, + "loss": 0.3258, + "step": 1276 + }, + { + "epoch": 1.3017329255861365, + "grad_norm": 4.865072727203369, + "learning_rate": 1.5778733225002562e-10, + "loss": 0.1809, + "step": 1277 + }, + { + "epoch": 1.3027522935779816, + "grad_norm": 5.379539489746094, + "learning_rate": 9.999996055315139e-05, + "loss": 0.274, + "step": 1278 + }, + { + "epoch": 1.3037716615698267, + "grad_norm": 5.370091915130615, + "learning_rate": 9.999936885166688e-05, + "loss": 0.4674, + "step": 1279 + }, + { + "epoch": 1.3047910295616718, + "grad_norm": 7.655372619628906, + "learning_rate": 9.999806711661691e-05, + "loss": 0.2996, + "step": 1280 + }, + { + "epoch": 1.305810397553517, + "grad_norm": 8.584575653076172, + "learning_rate": 9.999605536648723e-05, + "loss": 0.6664, + "step": 1281 + }, + { + "epoch": 1.3068297655453618, + "grad_norm": 7.809893608093262, + "learning_rate": 9.999333362984638e-05, + "loss": 0.5189, + "step": 1282 + }, + { + "epoch": 1.3078491335372069, + "grad_norm": 10.140406608581543, + "learning_rate": 9.998990194534536e-05, + "loss": 1.0369, + "step": 1283 + }, + { + "epoch": 1.308868501529052, + "grad_norm": 4.876784801483154, + "learning_rate": 9.998576036171699e-05, + "loss": 0.3865, + "step": 1284 + }, + { + "epoch": 1.309887869520897, + "grad_norm": 2.4785141944885254, + "learning_rate": 9.99809089377753e-05, + "loss": 0.2092, + "step": 1285 + }, + { + "epoch": 1.3109072375127422, + "grad_norm": 3.415043592453003, + "learning_rate": 9.997534774241461e-05, + "loss": 0.1867, + "step": 1286 + }, + { + "epoch": 
1.311926605504587, + "grad_norm": 5.118708610534668, + "learning_rate": 9.996907685460863e-05, + "loss": 0.2461, + "step": 1287 + }, + { + "epoch": 1.3129459734964322, + "grad_norm": 4.022343635559082, + "learning_rate": 9.996209636340933e-05, + "loss": 0.1819, + "step": 1288 + }, + { + "epoch": 1.3139653414882773, + "grad_norm": 5.242366790771484, + "learning_rate": 9.99544063679456e-05, + "loss": 0.3707, + "step": 1289 + }, + { + "epoch": 1.3149847094801224, + "grad_norm": 3.762380838394165, + "learning_rate": 9.994600697742192e-05, + "loss": 0.2654, + "step": 1290 + }, + { + "epoch": 1.3160040774719675, + "grad_norm": 8.3553466796875, + "learning_rate": 9.993689831111675e-05, + "loss": 0.4869, + "step": 1291 + }, + { + "epoch": 1.3170234454638123, + "grad_norm": 4.632368087768555, + "learning_rate": 9.992708049838096e-05, + "loss": 0.2443, + "step": 1292 + }, + { + "epoch": 1.3180428134556574, + "grad_norm": 11.02981185913086, + "learning_rate": 9.99165536786358e-05, + "loss": 0.6129, + "step": 1293 + }, + { + "epoch": 1.3190621814475025, + "grad_norm": 6.492003440856934, + "learning_rate": 9.990531800137104e-05, + "loss": 0.5258, + "step": 1294 + }, + { + "epoch": 1.3200815494393476, + "grad_norm": 7.420266628265381, + "learning_rate": 9.989337362614292e-05, + "loss": 0.3551, + "step": 1295 + }, + { + "epoch": 1.3211009174311927, + "grad_norm": 5.672893047332764, + "learning_rate": 9.988072072257168e-05, + "loss": 0.1857, + "step": 1296 + }, + { + "epoch": 1.3221202854230376, + "grad_norm": 14.246613502502441, + "learning_rate": 9.986735947033934e-05, + "loss": 0.8164, + "step": 1297 + }, + { + "epoch": 1.3231396534148827, + "grad_norm": 7.219099521636963, + "learning_rate": 9.985329005918702e-05, + "loss": 0.4561, + "step": 1298 + }, + { + "epoch": 1.3241590214067278, + "grad_norm": 7.1640448570251465, + "learning_rate": 9.983851268891235e-05, + "loss": 0.3761, + "step": 1299 + }, + { + "epoch": 1.325178389398573, + "grad_norm": 6.6731181144714355, + 
"learning_rate": 9.982302756936654e-05, + "loss": 0.4334, + "step": 1300 + }, + { + "epoch": 1.326197757390418, + "grad_norm": 5.247565746307373, + "learning_rate": 9.980683492045146e-05, + "loss": 0.2473, + "step": 1301 + }, + { + "epoch": 1.3272171253822629, + "grad_norm": 6.342030048370361, + "learning_rate": 9.978993497211651e-05, + "loss": 0.3142, + "step": 1302 + }, + { + "epoch": 1.328236493374108, + "grad_norm": 9.803683280944824, + "learning_rate": 9.977232796435532e-05, + "loss": 0.8421, + "step": 1303 + }, + { + "epoch": 1.329255861365953, + "grad_norm": 8.165091514587402, + "learning_rate": 9.975401414720238e-05, + "loss": 0.565, + "step": 1304 + }, + { + "epoch": 1.3302752293577982, + "grad_norm": 3.9490790367126465, + "learning_rate": 9.973499378072945e-05, + "loss": 0.235, + "step": 1305 + }, + { + "epoch": 1.3312945973496433, + "grad_norm": 3.1755430698394775, + "learning_rate": 9.971526713504195e-05, + "loss": 0.1707, + "step": 1306 + }, + { + "epoch": 1.3323139653414882, + "grad_norm": 7.461353778839111, + "learning_rate": 9.969483449027502e-05, + "loss": 0.3428, + "step": 1307 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 6.9499664306640625, + "learning_rate": 9.967369613658955e-05, + "loss": 0.349, + "step": 1308 + }, + { + "epoch": 1.3343527013251784, + "grad_norm": 5.956964492797852, + "learning_rate": 9.965185237416821e-05, + "loss": 0.3372, + "step": 1309 + }, + { + "epoch": 1.3353720693170235, + "grad_norm": 10.764342308044434, + "learning_rate": 9.962930351321095e-05, + "loss": 0.8009, + "step": 1310 + }, + { + "epoch": 1.3363914373088686, + "grad_norm": 6.670104503631592, + "learning_rate": 9.960604987393081e-05, + "loss": 0.5033, + "step": 1311 + }, + { + "epoch": 1.3374108053007134, + "grad_norm": 5.372352600097656, + "learning_rate": 9.958209178654921e-05, + "loss": 0.352, + "step": 1312 + }, + { + "epoch": 1.3384301732925585, + "grad_norm": 7.409302234649658, + "learning_rate": 9.955742959129142e-05, + "loss": 0.4625, + 
"step": 1313 + }, + { + "epoch": 1.3394495412844036, + "grad_norm": 6.971982002258301, + "learning_rate": 9.953206363838155e-05, + "loss": 0.3854, + "step": 1314 + }, + { + "epoch": 1.3404689092762487, + "grad_norm": 9.18579387664795, + "learning_rate": 9.95059942880377e-05, + "loss": 0.6175, + "step": 1315 + }, + { + "epoch": 1.3414882772680938, + "grad_norm": 5.547621250152588, + "learning_rate": 9.947922191046686e-05, + "loss": 0.3714, + "step": 1316 + }, + { + "epoch": 1.3425076452599387, + "grad_norm": 4.560646057128906, + "learning_rate": 9.945174688585954e-05, + "loss": 0.3143, + "step": 1317 + }, + { + "epoch": 1.343527013251784, + "grad_norm": 4.778387069702148, + "learning_rate": 9.942356960438447e-05, + "loss": 0.31, + "step": 1318 + }, + { + "epoch": 1.344546381243629, + "grad_norm": 7.907627105712891, + "learning_rate": 9.939469046618297e-05, + "loss": 0.9255, + "step": 1319 + }, + { + "epoch": 1.345565749235474, + "grad_norm": 8.810420989990234, + "learning_rate": 9.93651098813634e-05, + "loss": 0.412, + "step": 1320 + }, + { + "epoch": 1.346585117227319, + "grad_norm": 6.384575843811035, + "learning_rate": 9.933482826999525e-05, + "loss": 0.6548, + "step": 1321 + }, + { + "epoch": 1.347604485219164, + "grad_norm": 3.368100881576538, + "learning_rate": 9.930384606210312e-05, + "loss": 0.2306, + "step": 1322 + }, + { + "epoch": 1.3486238532110093, + "grad_norm": 6.657802581787109, + "learning_rate": 9.927216369766071e-05, + "loss": 0.3323, + "step": 1323 + }, + { + "epoch": 1.3496432212028542, + "grad_norm": 6.96447229385376, + "learning_rate": 9.923978162658459e-05, + "loss": 0.6847, + "step": 1324 + }, + { + "epoch": 1.3506625891946993, + "grad_norm": 5.063149452209473, + "learning_rate": 9.920670030872765e-05, + "loss": 0.1593, + "step": 1325 + }, + { + "epoch": 1.3516819571865444, + "grad_norm": 8.810893058776855, + "learning_rate": 9.917292021387277e-05, + "loss": 0.9157, + "step": 1326 + }, + { + "epoch": 1.3527013251783895, + "grad_norm": 
7.74990177154541, + "learning_rate": 9.913844182172604e-05, + "loss": 0.3446, + "step": 1327 + }, + { + "epoch": 1.3537206931702346, + "grad_norm": 4.324987888336182, + "learning_rate": 9.910326562190997e-05, + "loss": 0.4183, + "step": 1328 + }, + { + "epoch": 1.3547400611620795, + "grad_norm": 5.610269546508789, + "learning_rate": 9.906739211395648e-05, + "loss": 0.3208, + "step": 1329 + }, + { + "epoch": 1.3557594291539246, + "grad_norm": 4.970494270324707, + "learning_rate": 9.90308218072999e-05, + "loss": 0.2009, + "step": 1330 + }, + { + "epoch": 1.3567787971457697, + "grad_norm": 5.053140640258789, + "learning_rate": 9.89935552212697e-05, + "loss": 0.3931, + "step": 1331 + }, + { + "epoch": 1.3577981651376148, + "grad_norm": 4.1161932945251465, + "learning_rate": 9.895559288508309e-05, + "loss": 0.1704, + "step": 1332 + }, + { + "epoch": 1.3588175331294599, + "grad_norm": 1.7614240646362305, + "learning_rate": 9.891693533783756e-05, + "loss": 0.1302, + "step": 1333 + }, + { + "epoch": 1.3598369011213047, + "grad_norm": 5.90859842300415, + "learning_rate": 9.887758312850312e-05, + "loss": 0.3544, + "step": 1334 + }, + { + "epoch": 1.3608562691131498, + "grad_norm": 12.01405143737793, + "learning_rate": 9.883753681591467e-05, + "loss": 1.0104, + "step": 1335 + }, + { + "epoch": 1.361875637104995, + "grad_norm": 3.6148080825805664, + "learning_rate": 9.879679696876384e-05, + "loss": 0.1455, + "step": 1336 + }, + { + "epoch": 1.36289500509684, + "grad_norm": 8.55483341217041, + "learning_rate": 9.875536416559118e-05, + "loss": 0.5414, + "step": 1337 + }, + { + "epoch": 1.3639143730886851, + "grad_norm": 9.5636568069458, + "learning_rate": 9.871323899477769e-05, + "loss": 0.5544, + "step": 1338 + }, + { + "epoch": 1.36493374108053, + "grad_norm": 4.816516876220703, + "learning_rate": 9.867042205453665e-05, + "loss": 0.2696, + "step": 1339 + }, + { + "epoch": 1.365953109072375, + "grad_norm": 5.621625900268555, + "learning_rate": 9.862691395290502e-05, + "loss": 
0.2817, + "step": 1340 + }, + { + "epoch": 1.3669724770642202, + "grad_norm": 11.75194263458252, + "learning_rate": 9.858271530773486e-05, + "loss": 0.8851, + "step": 1341 + }, + { + "epoch": 1.3679918450560653, + "grad_norm": 6.325568675994873, + "learning_rate": 9.85378267466845e-05, + "loss": 0.3788, + "step": 1342 + }, + { + "epoch": 1.3690112130479104, + "grad_norm": 6.309666156768799, + "learning_rate": 9.849224890720972e-05, + "loss": 0.4749, + "step": 1343 + }, + { + "epoch": 1.3700305810397553, + "grad_norm": 7.9817423820495605, + "learning_rate": 9.844598243655458e-05, + "loss": 0.5644, + "step": 1344 + }, + { + "epoch": 1.3710499490316004, + "grad_norm": 8.903715133666992, + "learning_rate": 9.839902799174232e-05, + "loss": 0.8262, + "step": 1345 + }, + { + "epoch": 1.3720693170234455, + "grad_norm": 11.527419090270996, + "learning_rate": 9.835138623956601e-05, + "loss": 1.1799, + "step": 1346 + }, + { + "epoch": 1.3730886850152906, + "grad_norm": 6.788682460784912, + "learning_rate": 9.830305785657905e-05, + "loss": 0.4572, + "step": 1347 + }, + { + "epoch": 1.3741080530071357, + "grad_norm": 5.4351091384887695, + "learning_rate": 9.82540435290856e-05, + "loss": 0.4109, + "step": 1348 + }, + { + "epoch": 1.3751274209989806, + "grad_norm": 4.732766628265381, + "learning_rate": 9.820434395313076e-05, + "loss": 0.2398, + "step": 1349 + }, + { + "epoch": 1.3761467889908257, + "grad_norm": 5.804527282714844, + "learning_rate": 9.81539598344908e-05, + "loss": 0.4608, + "step": 1350 + }, + { + "epoch": 1.3771661569826708, + "grad_norm": 10.281136512756348, + "learning_rate": 9.810289188866307e-05, + "loss": 0.8431, + "step": 1351 + }, + { + "epoch": 1.3781855249745159, + "grad_norm": 5.0923991203308105, + "learning_rate": 9.805114084085581e-05, + "loss": 0.3761, + "step": 1352 + }, + { + "epoch": 1.379204892966361, + "grad_norm": 2.6065218448638916, + "learning_rate": 9.799870742597796e-05, + "loss": 0.1865, + "step": 1353 + }, + { + "epoch": 
1.3802242609582058, + "grad_norm": 7.639798164367676, + "learning_rate": 9.794559238862857e-05, + "loss": 0.4188, + "step": 1354 + }, + { + "epoch": 1.381243628950051, + "grad_norm": 7.440917015075684, + "learning_rate": 9.789179648308637e-05, + "loss": 0.3582, + "step": 1355 + }, + { + "epoch": 1.382262996941896, + "grad_norm": 8.206061363220215, + "learning_rate": 9.783732047329897e-05, + "loss": 0.4767, + "step": 1356 + }, + { + "epoch": 1.3832823649337411, + "grad_norm": 6.895366191864014, + "learning_rate": 9.778216513287204e-05, + "loss": 0.5777, + "step": 1357 + }, + { + "epoch": 1.3843017329255862, + "grad_norm": 6.850992202758789, + "learning_rate": 9.772633124505834e-05, + "loss": 0.2647, + "step": 1358 + }, + { + "epoch": 1.385321100917431, + "grad_norm": 6.088956356048584, + "learning_rate": 9.766981960274653e-05, + "loss": 0.3234, + "step": 1359 + }, + { + "epoch": 1.3863404689092762, + "grad_norm": 6.445374011993408, + "learning_rate": 9.761263100845005e-05, + "loss": 0.5274, + "step": 1360 + }, + { + "epoch": 1.3873598369011213, + "grad_norm": 4.1013031005859375, + "learning_rate": 9.755476627429554e-05, + "loss": 0.3592, + "step": 1361 + }, + { + "epoch": 1.3883792048929664, + "grad_norm": 5.010069847106934, + "learning_rate": 9.749622622201149e-05, + "loss": 0.3388, + "step": 1362 + }, + { + "epoch": 1.3893985728848115, + "grad_norm": 7.896244049072266, + "learning_rate": 9.743701168291638e-05, + "loss": 0.5058, + "step": 1363 + }, + { + "epoch": 1.3904179408766564, + "grad_norm": 3.5064926147460938, + "learning_rate": 9.737712349790706e-05, + "loss": 0.1924, + "step": 1364 + }, + { + "epoch": 1.3914373088685015, + "grad_norm": 9.105462074279785, + "learning_rate": 9.73165625174467e-05, + "loss": 0.7132, + "step": 1365 + }, + { + "epoch": 1.3924566768603466, + "grad_norm": 5.037233829498291, + "learning_rate": 9.725532960155272e-05, + "loss": 0.3511, + "step": 1366 + }, + { + "epoch": 1.3934760448521917, + "grad_norm": 5.967740535736084, + 
"learning_rate": 9.719342561978462e-05, + "loss": 0.5524, + "step": 1367 + }, + { + "epoch": 1.3944954128440368, + "grad_norm": 7.187025547027588, + "learning_rate": 9.713085145123158e-05, + "loss": 0.3859, + "step": 1368 + }, + { + "epoch": 1.3955147808358817, + "grad_norm": 9.78358268737793, + "learning_rate": 9.706760798450004e-05, + "loss": 0.6256, + "step": 1369 + }, + { + "epoch": 1.3965341488277268, + "grad_norm": 9.806771278381348, + "learning_rate": 9.700369611770099e-05, + "loss": 0.7257, + "step": 1370 + }, + { + "epoch": 1.3975535168195719, + "grad_norm": 8.946002006530762, + "learning_rate": 9.693911675843732e-05, + "loss": 0.9092, + "step": 1371 + }, + { + "epoch": 1.398572884811417, + "grad_norm": 4.351489067077637, + "learning_rate": 9.687387082379085e-05, + "loss": 0.1719, + "step": 1372 + }, + { + "epoch": 1.398572884811417, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7861621975898743, + "eval_Qnli-dev-1024_cosine_ap": 0.7440149504607769, + "eval_Qnli-dev-1024_cosine_f1": 0.7157894736842104, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.756857693195343, + "eval_Qnli-dev-1024_cosine_mcc": 0.44134955399887316, + "eval_Qnli-dev-1024_cosine_precision": 0.68, + "eval_Qnli-dev-1024_cosine_recall": 0.7555555555555555, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7261146306991577, + "eval_Qnli-dev_cosine_ap": 0.7580605820681848, + "eval_Qnli-dev_cosine_f1": 0.7454545454545455, + "eval_Qnli-dev_cosine_f1_threshold": 0.6269410848617554, + "eval_Qnli-dev_cosine_mcc": 0.47013467657639685, + "eval_Qnli-dev_cosine_precision": 0.6307692307692307, + "eval_Qnli-dev_cosine_recall": 0.9111111111111111, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9791666865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.39148974418640137, + "eval_global_dataset_runtime": 103.9064, + 
"eval_global_dataset_samples_per_second": 7.728, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9791666865348816, + "eval_sts-test-1024_pearson_cosine": 0.8610388110383311, + "eval_sts-test-1024_spearman_cosine": 0.897175185335658, + "eval_sts-test_pearson_cosine": 0.9050122455904925, + "eval_sts-test_spearman_cosine": 0.9224667967762508, + "step": 1372 + }, + { + "epoch": 1.399592252803262, + "grad_norm": 6.123342990875244, + "learning_rate": 9.680795924030936e-05, + "loss": 0.5039, + "step": 1373 + }, + { + "epoch": 1.400611620795107, + "grad_norm": 6.790772438049316, + "learning_rate": 9.67413829439934e-05, + "loss": 0.4752, + "step": 1374 + }, + { + "epoch": 1.401630988786952, + "grad_norm": 10.278209686279297, + "learning_rate": 9.6674142880283e-05, + "loss": 0.7859, + "step": 1375 + }, + { + "epoch": 1.4026503567787971, + "grad_norm": 9.054181098937988, + "learning_rate": 9.660624000404423e-05, + "loss": 0.5784, + "step": 1376 + }, + { + "epoch": 1.4036697247706422, + "grad_norm": 9.129789352416992, + "learning_rate": 9.653767527955574e-05, + "loss": 0.5278, + "step": 1377 + }, + { + "epoch": 1.4046890927624873, + "grad_norm": 4.569271087646484, + "learning_rate": 9.646844968049488e-05, + "loss": 0.215, + "step": 1378 + }, + { + "epoch": 1.4057084607543322, + "grad_norm": 12.985228538513184, + "learning_rate": 9.639856418992409e-05, + "loss": 0.7942, + "step": 1379 + }, + { + "epoch": 1.4067278287461773, + "grad_norm": 8.284036636352539, + "learning_rate": 9.632801980027672e-05, + "loss": 0.4433, + "step": 1380 + }, + { + "epoch": 1.4077471967380224, + "grad_norm": 6.351473331451416, + "learning_rate": 9.625681751334319e-05, + "loss": 0.3836, + "step": 1381 + }, + { + "epoch": 1.4087665647298675, + "grad_norm": 8.75439167022705, + "learning_rate": 9.618495834025646e-05, + "loss": 0.4579, + "step": 1382 + }, + { + "epoch": 1.4097859327217126, + "grad_norm": 5.620606422424316, + "learning_rate": 9.611244330147793e-05, + "loss": 
0.3356, + "step": 1383 + }, + { + "epoch": 1.4108053007135575, + "grad_norm": 5.859284400939941, + "learning_rate": 9.603927342678285e-05, + "loss": 0.3149, + "step": 1384 + }, + { + "epoch": 1.4118246687054026, + "grad_norm": 4.390924453735352, + "learning_rate": 9.596544975524565e-05, + "loss": 0.209, + "step": 1385 + }, + { + "epoch": 1.4128440366972477, + "grad_norm": 4.031999588012695, + "learning_rate": 9.589097333522528e-05, + "loss": 0.1533, + "step": 1386 + }, + { + "epoch": 1.4138634046890928, + "grad_norm": 7.579880714416504, + "learning_rate": 9.581584522435026e-05, + "loss": 0.3178, + "step": 1387 + }, + { + "epoch": 1.4148827726809379, + "grad_norm": 8.35764217376709, + "learning_rate": 9.574006648950362e-05, + "loss": 0.431, + "step": 1388 + }, + { + "epoch": 1.4159021406727827, + "grad_norm": 5.463128566741943, + "learning_rate": 9.566363820680787e-05, + "loss": 0.3837, + "step": 1389 + }, + { + "epoch": 1.4169215086646278, + "grad_norm": 10.94266128540039, + "learning_rate": 9.558656146160964e-05, + "loss": 0.6833, + "step": 1390 + }, + { + "epoch": 1.417940876656473, + "grad_norm": 7.073869228363037, + "learning_rate": 9.550883734846427e-05, + "loss": 0.2991, + "step": 1391 + }, + { + "epoch": 1.418960244648318, + "grad_norm": 5.472149848937988, + "learning_rate": 9.54304669711203e-05, + "loss": 0.4607, + "step": 1392 + }, + { + "epoch": 1.4199796126401631, + "grad_norm": 6.09562349319458, + "learning_rate": 9.53514514425037e-05, + "loss": 0.4161, + "step": 1393 + }, + { + "epoch": 1.420998980632008, + "grad_norm": 4.8968939781188965, + "learning_rate": 9.527179188470222e-05, + "loss": 0.3701, + "step": 1394 + }, + { + "epoch": 1.4220183486238533, + "grad_norm": 5.902759075164795, + "learning_rate": 9.51914894289493e-05, + "loss": 0.2434, + "step": 1395 + }, + { + "epoch": 1.4230377166156982, + "grad_norm": 7.825418949127197, + "learning_rate": 9.511054521560816e-05, + "loss": 0.471, + "step": 1396 + }, + { + "epoch": 1.4240570846075433, + 
"grad_norm": 8.981621742248535, + "learning_rate": 9.502896039415545e-05, + "loss": 0.692, + "step": 1397 + }, + { + "epoch": 1.4250764525993884, + "grad_norm": 4.354804992675781, + "learning_rate": 9.494673612316505e-05, + "loss": 0.3583, + "step": 1398 + }, + { + "epoch": 1.4260958205912333, + "grad_norm": 3.310420513153076, + "learning_rate": 9.486387357029148e-05, + "loss": 0.2116, + "step": 1399 + }, + { + "epoch": 1.4271151885830786, + "grad_norm": 5.200766563415527, + "learning_rate": 9.478037391225356e-05, + "loss": 0.2882, + "step": 1400 + }, + { + "epoch": 1.4281345565749235, + "grad_norm": 7.121079444885254, + "learning_rate": 9.46962383348174e-05, + "loss": 0.5388, + "step": 1401 + }, + { + "epoch": 1.4291539245667686, + "grad_norm": 9.046875953674316, + "learning_rate": 9.461146803277979e-05, + "loss": 0.534, + "step": 1402 + }, + { + "epoch": 1.4301732925586137, + "grad_norm": 4.394150733947754, + "learning_rate": 9.45260642099511e-05, + "loss": 0.189, + "step": 1403 + }, + { + "epoch": 1.4311926605504588, + "grad_norm": 8.304125785827637, + "learning_rate": 9.444002807913828e-05, + "loss": 0.5505, + "step": 1404 + }, + { + "epoch": 1.432212028542304, + "grad_norm": 8.639719009399414, + "learning_rate": 9.435336086212753e-05, + "loss": 0.6606, + "step": 1405 + }, + { + "epoch": 1.4332313965341488, + "grad_norm": 9.554101943969727, + "learning_rate": 9.426606378966707e-05, + "loss": 0.9008, + "step": 1406 + }, + { + "epoch": 1.4342507645259939, + "grad_norm": 4.584405422210693, + "learning_rate": 9.417813810144962e-05, + "loss": 0.3275, + "step": 1407 + }, + { + "epoch": 1.435270132517839, + "grad_norm": 5.24059534072876, + "learning_rate": 9.408958504609466e-05, + "loss": 0.3699, + "step": 1408 + }, + { + "epoch": 1.436289500509684, + "grad_norm": 6.843227863311768, + "learning_rate": 9.400040588113095e-05, + "loss": 0.5497, + "step": 1409 + }, + { + "epoch": 1.4373088685015292, + "grad_norm": 6.056573390960693, + "learning_rate": 
9.391060187297846e-05, + "loss": 0.2722, + "step": 1410 + }, + { + "epoch": 1.438328236493374, + "grad_norm": 10.264248847961426, + "learning_rate": 9.382017429693053e-05, + "loss": 0.8038, + "step": 1411 + }, + { + "epoch": 1.4393476044852191, + "grad_norm": 4.382177352905273, + "learning_rate": 9.372912443713561e-05, + "loss": 0.1399, + "step": 1412 + }, + { + "epoch": 1.4403669724770642, + "grad_norm": 7.841924667358398, + "learning_rate": 9.363745358657917e-05, + "loss": 0.2747, + "step": 1413 + }, + { + "epoch": 1.4413863404689093, + "grad_norm": 6.299066543579102, + "learning_rate": 9.354516304706527e-05, + "loss": 0.3525, + "step": 1414 + }, + { + "epoch": 1.4424057084607544, + "grad_norm": 4.2002949714660645, + "learning_rate": 9.345225412919803e-05, + "loss": 0.353, + "step": 1415 + }, + { + "epoch": 1.4434250764525993, + "grad_norm": 5.748711109161377, + "learning_rate": 9.335872815236315e-05, + "loss": 0.3674, + "step": 1416 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 10.88302230834961, + "learning_rate": 9.326458644470907e-05, + "loss": 0.826, + "step": 1417 + }, + { + "epoch": 1.4454638124362895, + "grad_norm": 7.587636947631836, + "learning_rate": 9.316983034312804e-05, + "loss": 0.6642, + "step": 1418 + }, + { + "epoch": 1.4464831804281346, + "grad_norm": 3.749462842941284, + "learning_rate": 9.307446119323738e-05, + "loss": 0.2859, + "step": 1419 + }, + { + "epoch": 1.4475025484199797, + "grad_norm": 7.171744346618652, + "learning_rate": 9.297848034936005e-05, + "loss": 0.3904, + "step": 1420 + }, + { + "epoch": 1.4485219164118246, + "grad_norm": 8.47977066040039, + "learning_rate": 9.288188917450575e-05, + "loss": 0.612, + "step": 1421 + }, + { + "epoch": 1.4495412844036697, + "grad_norm": 2.8732564449310303, + "learning_rate": 9.278468904035129e-05, + "loss": 0.1087, + "step": 1422 + }, + { + "epoch": 1.4505606523955148, + "grad_norm": 5.59121036529541, + "learning_rate": 9.268688132722124e-05, + "loss": 0.2336, + "step": 1423 + }, + { 
+ "epoch": 1.45158002038736, + "grad_norm": 8.162138938903809, + "learning_rate": 9.258846742406833e-05, + "loss": 0.2767, + "step": 1424 + }, + { + "epoch": 1.452599388379205, + "grad_norm": 1.6772691011428833, + "learning_rate": 9.248944872845369e-05, + "loss": 0.1598, + "step": 1425 + }, + { + "epoch": 1.4536187563710499, + "grad_norm": 6.238204479217529, + "learning_rate": 9.238982664652701e-05, + "loss": 0.3641, + "step": 1426 + }, + { + "epoch": 1.454638124362895, + "grad_norm": 5.3337202072143555, + "learning_rate": 9.228960259300662e-05, + "loss": 0.5724, + "step": 1427 + }, + { + "epoch": 1.45565749235474, + "grad_norm": 5.208616733551025, + "learning_rate": 9.21887779911593e-05, + "loss": 0.3569, + "step": 1428 + }, + { + "epoch": 1.4566768603465852, + "grad_norm": 4.691967010498047, + "learning_rate": 9.208735427278014e-05, + "loss": 0.1306, + "step": 1429 + }, + { + "epoch": 1.4576962283384303, + "grad_norm": 5.81736421585083, + "learning_rate": 9.198533287817223e-05, + "loss": 0.2527, + "step": 1430 + }, + { + "epoch": 1.4587155963302751, + "grad_norm": 7.3892340660095215, + "learning_rate": 9.188271525612615e-05, + "loss": 0.3424, + "step": 1431 + }, + { + "epoch": 1.4597349643221202, + "grad_norm": 8.754351615905762, + "learning_rate": 9.177950286389942e-05, + "loss": 0.5734, + "step": 1432 + }, + { + "epoch": 1.4607543323139653, + "grad_norm": 4.002055644989014, + "learning_rate": 9.167569716719579e-05, + "loss": 0.2381, + "step": 1433 + }, + { + "epoch": 1.4617737003058104, + "grad_norm": 9.790334701538086, + "learning_rate": 9.157129964014445e-05, + "loss": 0.7411, + "step": 1434 + }, + { + "epoch": 1.4627930682976555, + "grad_norm": 6.292593955993652, + "learning_rate": 9.146631176527906e-05, + "loss": 0.3854, + "step": 1435 + }, + { + "epoch": 1.4638124362895004, + "grad_norm": 5.826279163360596, + "learning_rate": 9.136073503351679e-05, + "loss": 0.4475, + "step": 1436 + }, + { + "epoch": 1.4648318042813455, + "grad_norm": 7.841248035430908, + 
"learning_rate": 9.125457094413698e-05, + "loss": 0.5616, + "step": 1437 + }, + { + "epoch": 1.4658511722731906, + "grad_norm": 6.76909065246582, + "learning_rate": 9.114782100476005e-05, + "loss": 0.4432, + "step": 1438 + }, + { + "epoch": 1.4668705402650357, + "grad_norm": 6.64108419418335, + "learning_rate": 9.104048673132587e-05, + "loss": 0.5708, + "step": 1439 + }, + { + "epoch": 1.4678899082568808, + "grad_norm": 6.3219218254089355, + "learning_rate": 9.093256964807249e-05, + "loss": 0.3974, + "step": 1440 + }, + { + "epoch": 1.4689092762487257, + "grad_norm": 5.191364765167236, + "learning_rate": 9.082407128751423e-05, + "loss": 0.2253, + "step": 1441 + }, + { + "epoch": 1.4699286442405708, + "grad_norm": 2.702972412109375, + "learning_rate": 9.071499319042011e-05, + "loss": 0.1825, + "step": 1442 + }, + { + "epoch": 1.470948012232416, + "grad_norm": 7.462799072265625, + "learning_rate": 9.060533690579191e-05, + "loss": 0.5323, + "step": 1443 + }, + { + "epoch": 1.471967380224261, + "grad_norm": 6.826075553894043, + "learning_rate": 9.049510399084211e-05, + "loss": 0.2, + "step": 1444 + }, + { + "epoch": 1.472986748216106, + "grad_norm": 2.9609243869781494, + "learning_rate": 9.038429601097187e-05, + "loss": 0.2048, + "step": 1445 + }, + { + "epoch": 1.474006116207951, + "grad_norm": 8.720600128173828, + "learning_rate": 9.027291453974877e-05, + "loss": 0.964, + "step": 1446 + }, + { + "epoch": 1.475025484199796, + "grad_norm": 5.362283229827881, + "learning_rate": 9.016096115888443e-05, + "loss": 0.2534, + "step": 1447 + }, + { + "epoch": 1.4760448521916412, + "grad_norm": 6.537226676940918, + "learning_rate": 9.004843745821207e-05, + "loss": 0.3222, + "step": 1448 + }, + { + "epoch": 1.4770642201834863, + "grad_norm": 8.83950424194336, + "learning_rate": 8.993534503566397e-05, + "loss": 0.4912, + "step": 1449 + }, + { + "epoch": 1.4780835881753314, + "grad_norm": 7.760464191436768, + "learning_rate": 8.982168549724869e-05, + "loss": 0.7533, + "step": 1450 
+ }, + { + "epoch": 1.4791029561671762, + "grad_norm": 2.361077308654785, + "learning_rate": 8.970746045702841e-05, + "loss": 0.161, + "step": 1451 + }, + { + "epoch": 1.4801223241590213, + "grad_norm": 12.053560256958008, + "learning_rate": 8.959267153709578e-05, + "loss": 0.9155, + "step": 1452 + }, + { + "epoch": 1.4811416921508664, + "grad_norm": 7.693728446960449, + "learning_rate": 8.947732036755114e-05, + "loss": 1.0607, + "step": 1453 + }, + { + "epoch": 1.4821610601427115, + "grad_norm": 6.32127046585083, + "learning_rate": 8.936140858647923e-05, + "loss": 0.5694, + "step": 1454 + }, + { + "epoch": 1.4831804281345566, + "grad_norm": 5.448962211608887, + "learning_rate": 8.924493783992589e-05, + "loss": 0.5814, + "step": 1455 + }, + { + "epoch": 1.4841997961264015, + "grad_norm": 11.943927764892578, + "learning_rate": 8.91279097818748e-05, + "loss": 0.8297, + "step": 1456 + }, + { + "epoch": 1.4852191641182466, + "grad_norm": 6.5847930908203125, + "learning_rate": 8.901032607422397e-05, + "loss": 0.4661, + "step": 1457 + }, + { + "epoch": 1.4862385321100917, + "grad_norm": 10.038482666015625, + "learning_rate": 8.889218838676198e-05, + "loss": 1.035, + "step": 1458 + }, + { + "epoch": 1.4872579001019368, + "grad_norm": 5.818447589874268, + "learning_rate": 8.877349839714454e-05, + "loss": 0.2972, + "step": 1459 + }, + { + "epoch": 1.488277268093782, + "grad_norm": 8.625869750976562, + "learning_rate": 8.865425779087042e-05, + "loss": 0.5676, + "step": 1460 + }, + { + "epoch": 1.4892966360856268, + "grad_norm": 7.4677910804748535, + "learning_rate": 8.85344682612577e-05, + "loss": 0.3514, + "step": 1461 + }, + { + "epoch": 1.490316004077472, + "grad_norm": 11.1100435256958, + "learning_rate": 8.841413150941954e-05, + "loss": 1.017, + "step": 1462 + }, + { + "epoch": 1.491335372069317, + "grad_norm": 5.164977073669434, + "learning_rate": 8.829324924424016e-05, + "loss": 0.4169, + "step": 1463 + }, + { + "epoch": 1.492354740061162, + "grad_norm": 
5.428584098815918, + "learning_rate": 8.817182318235059e-05, + "loss": 0.4397, + "step": 1464 + }, + { + "epoch": 1.4933741080530072, + "grad_norm": 5.453851222991943, + "learning_rate": 8.804985504810416e-05, + "loss": 0.3873, + "step": 1465 + }, + { + "epoch": 1.494393476044852, + "grad_norm": 3.758465528488159, + "learning_rate": 8.792734657355217e-05, + "loss": 0.2653, + "step": 1466 + }, + { + "epoch": 1.4954128440366974, + "grad_norm": 5.696004867553711, + "learning_rate": 8.780429949841908e-05, + "loss": 0.3229, + "step": 1467 + }, + { + "epoch": 1.4964322120285423, + "grad_norm": 7.936735153198242, + "learning_rate": 8.768071557007806e-05, + "loss": 0.4697, + "step": 1468 + }, + { + "epoch": 1.4974515800203874, + "grad_norm": 8.697381019592285, + "learning_rate": 8.755659654352599e-05, + "loss": 0.595, + "step": 1469 + }, + { + "epoch": 1.4984709480122325, + "grad_norm": 5.7998528480529785, + "learning_rate": 8.743194418135865e-05, + "loss": 0.4291, + "step": 1470 + }, + { + "epoch": 1.4984709480122325, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8078973293304443, + "eval_Qnli-dev-1024_cosine_ap": 0.7609733003079024, + "eval_Qnli-dev-1024_cosine_f1": 0.7128712871287128, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7630533576011658, + "eval_Qnli-dev-1024_cosine_mcc": 0.41281977673947123, + "eval_Qnli-dev-1024_cosine_precision": 0.6428571428571429, + "eval_Qnli-dev-1024_cosine_recall": 0.8, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.693989634513855, + "eval_Qnli-dev_cosine_ap": 0.7635441957451561, + "eval_Qnli-dev_cosine_f1": 0.7289719626168225, + "eval_Qnli-dev_cosine_f1_threshold": 0.6386604905128479, + "eval_Qnli-dev_cosine_mcc": 0.43373226132862797, + "eval_Qnli-dev_cosine_precision": 0.6290322580645161, + "eval_Qnli-dev_cosine_recall": 0.8666666666666667, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + 
"eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.3019393980503082, + "eval_global_dataset_runtime": 103.8441, + "eval_global_dataset_samples_per_second": 7.733, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.874585602380735, + "eval_sts-test-1024_spearman_cosine": 0.9074874432721208, + "eval_sts-test_pearson_cosine": 0.9044927301887464, + "eval_sts-test_spearman_cosine": 0.9217758439369574, + "step": 1470 + }, + { + "epoch": 1.4994903160040773, + "grad_norm": 6.363936901092529, + "learning_rate": 8.73067602537456e-05, + "loss": 0.472, + "step": 1471 + }, + { + "epoch": 1.5005096839959227, + "grad_norm": 21.527421951293945, + "learning_rate": 8.718104653840506e-05, + "loss": 1.5565, + "step": 1472 + }, + { + "epoch": 1.5015290519877675, + "grad_norm": 5.951153755187988, + "learning_rate": 8.705480482057875e-05, + "loss": 0.226, + "step": 1473 + }, + { + "epoch": 1.5025484199796126, + "grad_norm": 3.3359270095825195, + "learning_rate": 8.692803689300641e-05, + "loss": 0.2818, + "step": 1474 + }, + { + "epoch": 1.5035677879714577, + "grad_norm": 4.292994022369385, + "learning_rate": 8.680074455590045e-05, + "loss": 0.331, + "step": 1475 + }, + { + "epoch": 1.5045871559633026, + "grad_norm": 8.305481910705566, + "learning_rate": 8.667292961692035e-05, + "loss": 0.5533, + "step": 1476 + }, + { + "epoch": 1.505606523955148, + "grad_norm": 12.950459480285645, + "learning_rate": 8.65445938911469e-05, + "loss": 0.9666, + "step": 1477 + }, + { + "epoch": 1.5066258919469928, + "grad_norm": 13.390387535095215, + "learning_rate": 8.641573920105664e-05, + "loss": 0.9593, + "step": 1478 + }, + { + "epoch": 1.507645259938838, + "grad_norm": 5.181076526641846, + "learning_rate": 8.628636737649569e-05, + "loss": 0.2917, + "step": 1479 + }, + { + "epoch": 1.508664627930683, + "grad_norm": 5.981019020080566, + "learning_rate": 8.615648025465409e-05, + "loss": 
0.396, + "step": 1480 + }, + { + "epoch": 1.5096839959225279, + "grad_norm": 9.361347198486328, + "learning_rate": 8.602607968003935e-05, + "loss": 0.7183, + "step": 1481 + }, + { + "epoch": 1.5107033639143732, + "grad_norm": 6.163973331451416, + "learning_rate": 8.589516750445061e-05, + "loss": 0.3434, + "step": 1482 + }, + { + "epoch": 1.511722731906218, + "grad_norm": 3.649142265319824, + "learning_rate": 8.576374558695208e-05, + "loss": 0.218, + "step": 1483 + }, + { + "epoch": 1.5127420998980632, + "grad_norm": 3.3364171981811523, + "learning_rate": 8.563181579384679e-05, + "loss": 0.1735, + "step": 1484 + }, + { + "epoch": 1.5137614678899083, + "grad_norm": 4.01456356048584, + "learning_rate": 8.549937999865001e-05, + "loss": 0.4277, + "step": 1485 + }, + { + "epoch": 1.5147808358817532, + "grad_norm": 5.251349449157715, + "learning_rate": 8.53664400820627e-05, + "loss": 0.2979, + "step": 1486 + }, + { + "epoch": 1.5158002038735985, + "grad_norm": 10.812429428100586, + "learning_rate": 8.523299793194471e-05, + "loss": 0.8903, + "step": 1487 + }, + { + "epoch": 1.5168195718654434, + "grad_norm": 9.290162086486816, + "learning_rate": 8.509905544328808e-05, + "loss": 0.5525, + "step": 1488 + }, + { + "epoch": 1.5178389398572885, + "grad_norm": 3.046337604522705, + "learning_rate": 8.496461451819009e-05, + "loss": 0.2197, + "step": 1489 + }, + { + "epoch": 1.5188583078491336, + "grad_norm": 4.534480571746826, + "learning_rate": 8.482967706582623e-05, + "loss": 0.2638, + "step": 1490 + }, + { + "epoch": 1.5198776758409784, + "grad_norm": 6.42506217956543, + "learning_rate": 8.46942450024231e-05, + "loss": 0.3562, + "step": 1491 + }, + { + "epoch": 1.5208970438328238, + "grad_norm": 6.858558177947998, + "learning_rate": 8.455832025123119e-05, + "loss": 0.4361, + "step": 1492 + }, + { + "epoch": 1.5219164118246686, + "grad_norm": 7.202121734619141, + "learning_rate": 8.442190474249755e-05, + "loss": 0.6902, + "step": 1493 + }, + { + "epoch": 1.5229357798165137, + 
"grad_norm": 5.493877410888672, + "learning_rate": 8.428500041343847e-05, + "loss": 0.2922, + "step": 1494 + }, + { + "epoch": 1.5239551478083588, + "grad_norm": 9.43299674987793, + "learning_rate": 8.414760920821185e-05, + "loss": 0.9379, + "step": 1495 + }, + { + "epoch": 1.5249745158002037, + "grad_norm": 10.610505104064941, + "learning_rate": 8.400973307788968e-05, + "loss": 0.7092, + "step": 1496 + }, + { + "epoch": 1.525993883792049, + "grad_norm": 8.512781143188477, + "learning_rate": 8.387137398043031e-05, + "loss": 0.5948, + "step": 1497 + }, + { + "epoch": 1.527013251783894, + "grad_norm": 8.651867866516113, + "learning_rate": 8.37325338806505e-05, + "loss": 0.4054, + "step": 1498 + }, + { + "epoch": 1.528032619775739, + "grad_norm": 7.8862199783325195, + "learning_rate": 8.35932147501979e-05, + "loss": 0.5618, + "step": 1499 + }, + { + "epoch": 1.529051987767584, + "grad_norm": 10.186121940612793, + "learning_rate": 8.345341856752254e-05, + "loss": 0.7368, + "step": 1500 + }, + { + "epoch": 1.5300713557594292, + "grad_norm": 5.780384063720703, + "learning_rate": 8.331314731784922e-05, + "loss": 0.5013, + "step": 1501 + }, + { + "epoch": 1.5310907237512743, + "grad_norm": 5.746453762054443, + "learning_rate": 8.317240299314894e-05, + "loss": 0.4127, + "step": 1502 + }, + { + "epoch": 1.5321100917431192, + "grad_norm": 6.575455188751221, + "learning_rate": 8.303118759211082e-05, + "loss": 0.5177, + "step": 1503 + }, + { + "epoch": 1.5331294597349643, + "grad_norm": 8.351615905761719, + "learning_rate": 8.288950312011368e-05, + "loss": 0.5595, + "step": 1504 + }, + { + "epoch": 1.5341488277268094, + "grad_norm": 5.2061767578125, + "learning_rate": 8.274735158919757e-05, + "loss": 0.1897, + "step": 1505 + }, + { + "epoch": 1.5351681957186545, + "grad_norm": 8.74786376953125, + "learning_rate": 8.260473501803508e-05, + "loss": 0.5909, + "step": 1506 + }, + { + "epoch": 1.5361875637104996, + "grad_norm": 8.112961769104004, + "learning_rate": 
8.246165543190285e-05, + "loss": 0.7854, + "step": 1507 + }, + { + "epoch": 1.5372069317023445, + "grad_norm": 5.495329856872559, + "learning_rate": 8.231811486265271e-05, + "loss": 0.3179, + "step": 1508 + }, + { + "epoch": 1.5382262996941896, + "grad_norm": 5.7515668869018555, + "learning_rate": 8.217411534868281e-05, + "loss": 0.2756, + "step": 1509 + }, + { + "epoch": 1.5392456676860347, + "grad_norm": 3.9375154972076416, + "learning_rate": 8.202965893490878e-05, + "loss": 0.2375, + "step": 1510 + }, + { + "epoch": 1.5402650356778798, + "grad_norm": 4.8834757804870605, + "learning_rate": 8.18847476727345e-05, + "loss": 0.4164, + "step": 1511 + }, + { + "epoch": 1.5412844036697249, + "grad_norm": 5.144208908081055, + "learning_rate": 8.173938362002318e-05, + "loss": 0.2501, + "step": 1512 + }, + { + "epoch": 1.5423037716615697, + "grad_norm": 2.856844186782837, + "learning_rate": 8.159356884106802e-05, + "loss": 0.1618, + "step": 1513 + }, + { + "epoch": 1.5433231396534148, + "grad_norm": 8.247689247131348, + "learning_rate": 8.14473054065629e-05, + "loss": 0.6395, + "step": 1514 + }, + { + "epoch": 1.54434250764526, + "grad_norm": 7.794886112213135, + "learning_rate": 8.130059539357297e-05, + "loss": 0.5933, + "step": 1515 + }, + { + "epoch": 1.545361875637105, + "grad_norm": 11.686348915100098, + "learning_rate": 8.115344088550526e-05, + "loss": 0.8926, + "step": 1516 + }, + { + "epoch": 1.5463812436289501, + "grad_norm": 6.921617031097412, + "learning_rate": 8.100584397207886e-05, + "loss": 0.3411, + "step": 1517 + }, + { + "epoch": 1.547400611620795, + "grad_norm": 3.175189256668091, + "learning_rate": 8.08578067492956e-05, + "loss": 0.1612, + "step": 1518 + }, + { + "epoch": 1.5484199796126403, + "grad_norm": 5.199557304382324, + "learning_rate": 8.070933131940982e-05, + "loss": 0.2548, + "step": 1519 + }, + { + "epoch": 1.5494393476044852, + "grad_norm": 9.249714851379395, + "learning_rate": 8.056041979089905e-05, + "loss": 0.892, + "step": 1520 + }, + { + 
"epoch": 1.5504587155963303, + "grad_norm": 8.709851264953613, + "learning_rate": 8.041107427843357e-05, + "loss": 0.3798, + "step": 1521 + }, + { + "epoch": 1.5514780835881754, + "grad_norm": 9.481815338134766, + "learning_rate": 8.026129690284669e-05, + "loss": 0.5753, + "step": 1522 + }, + { + "epoch": 1.5524974515800203, + "grad_norm": 10.316771507263184, + "learning_rate": 8.011108979110457e-05, + "loss": 1.2305, + "step": 1523 + }, + { + "epoch": 1.5535168195718656, + "grad_norm": 9.717813491821289, + "learning_rate": 7.996045507627594e-05, + "loss": 0.7169, + "step": 1524 + }, + { + "epoch": 1.5545361875637105, + "grad_norm": 9.637219429016113, + "learning_rate": 7.98093948975019e-05, + "loss": 0.7199, + "step": 1525 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 10.511667251586914, + "learning_rate": 7.965791139996543e-05, + "loss": 0.7569, + "step": 1526 + }, + { + "epoch": 1.5565749235474007, + "grad_norm": 8.276548385620117, + "learning_rate": 7.950600673486106e-05, + "loss": 0.5522, + "step": 1527 + }, + { + "epoch": 1.5575942915392456, + "grad_norm": 10.812773704528809, + "learning_rate": 7.935368305936425e-05, + "loss": 0.9574, + "step": 1528 + }, + { + "epoch": 1.5586136595310909, + "grad_norm": 4.123507499694824, + "learning_rate": 7.920094253660074e-05, + "loss": 0.241, + "step": 1529 + }, + { + "epoch": 1.5596330275229358, + "grad_norm": 7.355940818786621, + "learning_rate": 7.904778733561591e-05, + "loss": 0.3016, + "step": 1530 + }, + { + "epoch": 1.5606523955147809, + "grad_norm": 7.6690802574157715, + "learning_rate": 7.889421963134383e-05, + "loss": 0.342, + "step": 1531 + }, + { + "epoch": 1.561671763506626, + "grad_norm": 11.40833568572998, + "learning_rate": 7.874024160457652e-05, + "loss": 0.6452, + "step": 1532 + }, + { + "epoch": 1.5626911314984708, + "grad_norm": 1.7198033332824707, + "learning_rate": 7.858585544193297e-05, + "loss": 0.0948, + "step": 1533 + }, + { + "epoch": 1.5637104994903162, + "grad_norm": 
7.055710315704346, + "learning_rate": 7.843106333582796e-05, + "loss": 0.4199, + "step": 1534 + }, + { + "epoch": 1.564729867482161, + "grad_norm": 6.451409816741943, + "learning_rate": 7.827586748444114e-05, + "loss": 0.4689, + "step": 1535 + }, + { + "epoch": 1.5657492354740061, + "grad_norm": 5.309058666229248, + "learning_rate": 7.812027009168546e-05, + "loss": 0.3849, + "step": 1536 + }, + { + "epoch": 1.5667686034658512, + "grad_norm": 6.170285224914551, + "learning_rate": 7.79642733671764e-05, + "loss": 0.3263, + "step": 1537 + }, + { + "epoch": 1.567787971457696, + "grad_norm": 7.963518142700195, + "learning_rate": 7.78078795262e-05, + "loss": 0.2742, + "step": 1538 + }, + { + "epoch": 1.5688073394495414, + "grad_norm": 5.102199554443359, + "learning_rate": 7.765109078968193e-05, + "loss": 0.2751, + "step": 1539 + }, + { + "epoch": 1.5698267074413863, + "grad_norm": 9.289502143859863, + "learning_rate": 7.749390938415556e-05, + "loss": 0.4461, + "step": 1540 + }, + { + "epoch": 1.5708460754332314, + "grad_norm": 7.5046210289001465, + "learning_rate": 7.733633754173053e-05, + "loss": 0.454, + "step": 1541 + }, + { + "epoch": 1.5718654434250765, + "grad_norm": 4.898295879364014, + "learning_rate": 7.717837750006105e-05, + "loss": 0.3172, + "step": 1542 + }, + { + "epoch": 1.5728848114169214, + "grad_norm": 5.772686004638672, + "learning_rate": 7.702003150231407e-05, + "loss": 0.3302, + "step": 1543 + }, + { + "epoch": 1.5739041794087667, + "grad_norm": 6.3255791664123535, + "learning_rate": 7.686130179713742e-05, + "loss": 0.4316, + "step": 1544 + }, + { + "epoch": 1.5749235474006116, + "grad_norm": 3.258190155029297, + "learning_rate": 7.670219063862798e-05, + "loss": 0.1803, + "step": 1545 + }, + { + "epoch": 1.5759429153924567, + "grad_norm": 6.360843658447266, + "learning_rate": 7.654270028629943e-05, + "loss": 0.4801, + "step": 1546 + }, + { + "epoch": 1.5769622833843018, + "grad_norm": 2.4267330169677734, + "learning_rate": 7.638283300505052e-05, + 
"loss": 0.0975, + "step": 1547 + }, + { + "epoch": 1.5779816513761467, + "grad_norm": 8.417614936828613, + "learning_rate": 7.622259106513259e-05, + "loss": 0.6647, + "step": 1548 + }, + { + "epoch": 1.579001019367992, + "grad_norm": 7.388288974761963, + "learning_rate": 7.606197674211747e-05, + "loss": 0.3962, + "step": 1549 + }, + { + "epoch": 1.5800203873598369, + "grad_norm": 3.0918948650360107, + "learning_rate": 7.590099231686524e-05, + "loss": 0.1611, + "step": 1550 + }, + { + "epoch": 1.581039755351682, + "grad_norm": 4.632137775421143, + "learning_rate": 7.573964007549155e-05, + "loss": 0.4832, + "step": 1551 + }, + { + "epoch": 1.582059123343527, + "grad_norm": 3.659292697906494, + "learning_rate": 7.557792230933552e-05, + "loss": 0.2286, + "step": 1552 + }, + { + "epoch": 1.583078491335372, + "grad_norm": 7.091201305389404, + "learning_rate": 7.541584131492701e-05, + "loss": 0.3312, + "step": 1553 + }, + { + "epoch": 1.5840978593272173, + "grad_norm": 10.971521377563477, + "learning_rate": 7.525339939395394e-05, + "loss": 0.5886, + "step": 1554 + }, + { + "epoch": 1.5851172273190621, + "grad_norm": 9.640290260314941, + "learning_rate": 7.50905988532298e-05, + "loss": 0.3686, + "step": 1555 + }, + { + "epoch": 1.5861365953109072, + "grad_norm": 3.686574697494507, + "learning_rate": 7.492744200466075e-05, + "loss": 0.2189, + "step": 1556 + }, + { + "epoch": 1.5871559633027523, + "grad_norm": 5.4901018142700195, + "learning_rate": 7.476393116521276e-05, + "loss": 0.1875, + "step": 1557 + }, + { + "epoch": 1.5881753312945972, + "grad_norm": 8.475384712219238, + "learning_rate": 7.46000686568789e-05, + "loss": 0.4654, + "step": 1558 + }, + { + "epoch": 1.5891946992864425, + "grad_norm": 7.7796759605407715, + "learning_rate": 7.443585680664607e-05, + "loss": 0.4154, + "step": 1559 + }, + { + "epoch": 1.5902140672782874, + "grad_norm": 9.805006980895996, + "learning_rate": 7.427129794646234e-05, + "loss": 0.5811, + "step": 1560 + }, + { + "epoch": 
1.5912334352701325, + "grad_norm": 10.294364929199219, + "learning_rate": 7.410639441320339e-05, + "loss": 0.8518, + "step": 1561 + }, + { + "epoch": 1.5922528032619776, + "grad_norm": 2.761817693710327, + "learning_rate": 7.39411485486397e-05, + "loss": 0.1038, + "step": 1562 + }, + { + "epoch": 1.5932721712538225, + "grad_norm": 8.296953201293945, + "learning_rate": 7.37755626994031e-05, + "loss": 0.6343, + "step": 1563 + }, + { + "epoch": 1.5942915392456678, + "grad_norm": 4.766301155090332, + "learning_rate": 7.360963921695344e-05, + "loss": 0.1857, + "step": 1564 + }, + { + "epoch": 1.5953109072375127, + "grad_norm": 5.613104343414307, + "learning_rate": 7.34433804575454e-05, + "loss": 0.4422, + "step": 1565 + }, + { + "epoch": 1.5963302752293578, + "grad_norm": 7.4801459312438965, + "learning_rate": 7.327678878219467e-05, + "loss": 0.4847, + "step": 1566 + }, + { + "epoch": 1.5973496432212029, + "grad_norm": 5.0312886238098145, + "learning_rate": 7.31098665566448e-05, + "loss": 0.2852, + "step": 1567 + }, + { + "epoch": 1.5983690112130478, + "grad_norm": 7.759483337402344, + "learning_rate": 7.294261615133333e-05, + "loss": 0.5726, + "step": 1568 + }, + { + "epoch": 1.5983690112130478, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8175796270370483, + "eval_Qnli-dev-1024_cosine_ap": 0.7381325092579727, + "eval_Qnli-dev-1024_cosine_f1": 0.6862745098039216, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.72412109375, + "eval_Qnli-dev-1024_cosine_mcc": 0.3519703275834634, + "eval_Qnli-dev-1024_cosine_precision": 0.6140350877192983, + "eval_Qnli-dev-1024_cosine_recall": 0.7777777777777778, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7137888669967651, + "eval_Qnli-dev_cosine_ap": 0.7705408016969871, + "eval_Qnli-dev_cosine_f1": 0.7192982456140351, + "eval_Qnli-dev_cosine_f1_threshold": 0.621550440788269, + "eval_Qnli-dev_cosine_mcc": 
0.401886346014753, + "eval_Qnli-dev_cosine_precision": 0.5942028985507246, + "eval_Qnli-dev_cosine_recall": 0.9111111111111111, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.96875, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.35073724389076233, + "eval_global_dataset_runtime": 104.0769, + "eval_global_dataset_samples_per_second": 7.715, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.96875, + "eval_sts-test-1024_pearson_cosine": 0.8847218700205163, + "eval_sts-test-1024_spearman_cosine": 0.9125263313234735, + "eval_sts-test_pearson_cosine": 0.9057930208914168, + "eval_sts-test_spearman_cosine": 0.9222613739605754, + "step": 1568 + }, + { + "epoch": 1.599388379204893, + "grad_norm": 5.494419574737549, + "learning_rate": 7.277503994135835e-05, + "loss": 0.2879, + "step": 1569 + }, + { + "epoch": 1.600407747196738, + "grad_norm": 6.195016860961914, + "learning_rate": 7.260714030644449e-05, + "loss": 0.2818, + "step": 1570 + }, + { + "epoch": 1.601427115188583, + "grad_norm": 7.059222221374512, + "learning_rate": 7.24389196309094e-05, + "loss": 0.7579, + "step": 1571 + }, + { + "epoch": 1.6024464831804281, + "grad_norm": 9.059467315673828, + "learning_rate": 7.227038030362979e-05, + "loss": 0.3801, + "step": 1572 + }, + { + "epoch": 1.603465851172273, + "grad_norm": 6.267899990081787, + "learning_rate": 7.210152471800741e-05, + "loss": 0.3066, + "step": 1573 + }, + { + "epoch": 1.6044852191641183, + "grad_norm": 6.997797966003418, + "learning_rate": 7.193235527193523e-05, + "loss": 0.2659, + "step": 1574 + }, + { + "epoch": 1.6055045871559632, + "grad_norm": 5.461020469665527, + "learning_rate": 7.176287436776333e-05, + "loss": 0.3371, + "step": 1575 + }, + { + "epoch": 1.6065239551478083, + "grad_norm": 2.6381993293762207, + "learning_rate": 7.159308441226455e-05, + "loss": 0.0823, + "step": 1576 + }, + { + "epoch": 1.6075433231396534, + "grad_norm": 6.970646858215332, + "learning_rate": 
7.142298781660082e-05, + "loss": 0.4826, + "step": 1577 + }, + { + "epoch": 1.6085626911314985, + "grad_norm": 9.13263988494873, + "learning_rate": 7.12525869962884e-05, + "loss": 0.5215, + "step": 1578 + }, + { + "epoch": 1.6095820591233436, + "grad_norm": 3.3016436100006104, + "learning_rate": 7.108188437116394e-05, + "loss": 0.2161, + "step": 1579 + }, + { + "epoch": 1.6106014271151885, + "grad_norm": 9.76569938659668, + "learning_rate": 7.091088236534985e-05, + "loss": 0.6502, + "step": 1580 + }, + { + "epoch": 1.6116207951070336, + "grad_norm": 5.767617702484131, + "learning_rate": 7.073958340722008e-05, + "loss": 0.4971, + "step": 1581 + }, + { + "epoch": 1.6126401630988787, + "grad_norm": 6.001529216766357, + "learning_rate": 7.056798992936555e-05, + "loss": 0.4015, + "step": 1582 + }, + { + "epoch": 1.6136595310907238, + "grad_norm": 6.167148590087891, + "learning_rate": 7.039610436855957e-05, + "loss": 0.2794, + "step": 1583 + }, + { + "epoch": 1.614678899082569, + "grad_norm": 8.581472396850586, + "learning_rate": 7.022392916572336e-05, + "loss": 0.9008, + "step": 1584 + }, + { + "epoch": 1.6156982670744138, + "grad_norm": 4.615275859832764, + "learning_rate": 7.005146676589118e-05, + "loss": 0.2382, + "step": 1585 + }, + { + "epoch": 1.6167176350662589, + "grad_norm": 7.006135940551758, + "learning_rate": 6.987871961817581e-05, + "loss": 0.3115, + "step": 1586 + }, + { + "epoch": 1.617737003058104, + "grad_norm": 10.424875259399414, + "learning_rate": 6.970569017573371e-05, + "loss": 0.7609, + "step": 1587 + }, + { + "epoch": 1.618756371049949, + "grad_norm": 8.667938232421875, + "learning_rate": 6.953238089573012e-05, + "loss": 0.475, + "step": 1588 + }, + { + "epoch": 1.6197757390417942, + "grad_norm": 8.349859237670898, + "learning_rate": 6.935879423930426e-05, + "loss": 0.714, + "step": 1589 + }, + { + "epoch": 1.620795107033639, + "grad_norm": 3.482165575027466, + "learning_rate": 6.918493267153424e-05, + "loss": 0.2345, + "step": 1590 + }, + { + 
"epoch": 1.6218144750254841, + "grad_norm": 5.844244003295898, + "learning_rate": 6.901079866140222e-05, + "loss": 0.3892, + "step": 1591 + }, + { + "epoch": 1.6228338430173292, + "grad_norm": 6.793012619018555, + "learning_rate": 6.883639468175927e-05, + "loss": 0.3771, + "step": 1592 + }, + { + "epoch": 1.6238532110091743, + "grad_norm": 5.695591449737549, + "learning_rate": 6.866172320929022e-05, + "loss": 0.4352, + "step": 1593 + }, + { + "epoch": 1.6248725790010194, + "grad_norm": 8.875423431396484, + "learning_rate": 6.848678672447863e-05, + "loss": 1.0865, + "step": 1594 + }, + { + "epoch": 1.6258919469928643, + "grad_norm": 4.769944190979004, + "learning_rate": 6.831158771157124e-05, + "loss": 0.3441, + "step": 1595 + }, + { + "epoch": 1.6269113149847096, + "grad_norm": 9.363751411437988, + "learning_rate": 6.81361286585432e-05, + "loss": 0.8453, + "step": 1596 + }, + { + "epoch": 1.6279306829765545, + "grad_norm": 5.906173229217529, + "learning_rate": 6.796041205706216e-05, + "loss": 0.2858, + "step": 1597 + }, + { + "epoch": 1.6289500509683996, + "grad_norm": 6.232714653015137, + "learning_rate": 6.778444040245345e-05, + "loss": 0.5172, + "step": 1598 + }, + { + "epoch": 1.6299694189602447, + "grad_norm": 6.986159801483154, + "learning_rate": 6.760821619366415e-05, + "loss": 0.2466, + "step": 1599 + }, + { + "epoch": 1.6309887869520896, + "grad_norm": 4.346200942993164, + "learning_rate": 6.743174193322796e-05, + "loss": 0.2753, + "step": 1600 + }, + { + "epoch": 1.632008154943935, + "grad_norm": 6.619315147399902, + "learning_rate": 6.725502012722948e-05, + "loss": 0.6937, + "step": 1601 + }, + { + "epoch": 1.6330275229357798, + "grad_norm": 5.2573723793029785, + "learning_rate": 6.707805328526864e-05, + "loss": 0.4227, + "step": 1602 + }, + { + "epoch": 1.634046890927625, + "grad_norm": 10.570611000061035, + "learning_rate": 6.690084392042513e-05, + "loss": 0.7107, + "step": 1603 + }, + { + "epoch": 1.63506625891947, + "grad_norm": 5.327670574188232, + 
"learning_rate": 6.67233945492227e-05, + "loss": 0.4683, + "step": 1604 + }, + { + "epoch": 1.6360856269113149, + "grad_norm": 7.018125057220459, + "learning_rate": 6.654570769159328e-05, + "loss": 0.3031, + "step": 1605 + }, + { + "epoch": 1.6371049949031602, + "grad_norm": 9.383675575256348, + "learning_rate": 6.636778587084142e-05, + "loss": 0.4622, + "step": 1606 + }, + { + "epoch": 1.638124362895005, + "grad_norm": 3.507168769836426, + "learning_rate": 6.618963161360832e-05, + "loss": 0.145, + "step": 1607 + }, + { + "epoch": 1.6391437308868502, + "grad_norm": 8.523704528808594, + "learning_rate": 6.601124744983596e-05, + "loss": 0.3465, + "step": 1608 + }, + { + "epoch": 1.6401630988786953, + "grad_norm": 6.723085880279541, + "learning_rate": 6.583263591273121e-05, + "loss": 0.508, + "step": 1609 + }, + { + "epoch": 1.6411824668705401, + "grad_norm": 5.034709453582764, + "learning_rate": 6.565379953872977e-05, + "loss": 0.205, + "step": 1610 + }, + { + "epoch": 1.6422018348623855, + "grad_norm": 10.521743774414062, + "learning_rate": 6.547474086746028e-05, + "loss": 1.0824, + "step": 1611 + }, + { + "epoch": 1.6432212028542303, + "grad_norm": 8.010995864868164, + "learning_rate": 6.529546244170818e-05, + "loss": 0.4191, + "step": 1612 + }, + { + "epoch": 1.6442405708460754, + "grad_norm": 5.811416149139404, + "learning_rate": 6.51159668073796e-05, + "loss": 0.4591, + "step": 1613 + }, + { + "epoch": 1.6452599388379205, + "grad_norm": 3.7815189361572266, + "learning_rate": 6.493625651346523e-05, + "loss": 0.3046, + "step": 1614 + }, + { + "epoch": 1.6462793068297654, + "grad_norm": 3.966926097869873, + "learning_rate": 6.475633411200414e-05, + "loss": 0.2447, + "step": 1615 + }, + { + "epoch": 1.6472986748216107, + "grad_norm": 5.435057640075684, + "learning_rate": 6.457620215804734e-05, + "loss": 0.2797, + "step": 1616 + }, + { + "epoch": 1.6483180428134556, + "grad_norm": 11.412252426147461, + "learning_rate": 6.439586320962194e-05, + "loss": 1.2793, + 
"step": 1617 + }, + { + "epoch": 1.6493374108053007, + "grad_norm": 8.497121810913086, + "learning_rate": 6.421531982769427e-05, + "loss": 0.4026, + "step": 1618 + }, + { + "epoch": 1.6503567787971458, + "grad_norm": 10.641962051391602, + "learning_rate": 6.403457457613404e-05, + "loss": 0.6297, + "step": 1619 + }, + { + "epoch": 1.6513761467889907, + "grad_norm": 9.194568634033203, + "learning_rate": 6.385363002167746e-05, + "loss": 0.9285, + "step": 1620 + }, + { + "epoch": 1.652395514780836, + "grad_norm": 4.320868968963623, + "learning_rate": 6.367248873389115e-05, + "loss": 0.327, + "step": 1621 + }, + { + "epoch": 1.653414882772681, + "grad_norm": 5.681830883026123, + "learning_rate": 6.349115328513545e-05, + "loss": 0.3873, + "step": 1622 + }, + { + "epoch": 1.654434250764526, + "grad_norm": 3.3009400367736816, + "learning_rate": 6.330962625052798e-05, + "loss": 0.2392, + "step": 1623 + }, + { + "epoch": 1.655453618756371, + "grad_norm": 7.098931312561035, + "learning_rate": 6.312791020790709e-05, + "loss": 0.4528, + "step": 1624 + }, + { + "epoch": 1.656472986748216, + "grad_norm": 3.843114137649536, + "learning_rate": 6.294600773779504e-05, + "loss": 0.1439, + "step": 1625 + }, + { + "epoch": 1.6574923547400613, + "grad_norm": 7.354673862457275, + "learning_rate": 6.276392142336168e-05, + "loss": 0.4292, + "step": 1626 + }, + { + "epoch": 1.6585117227319062, + "grad_norm": 5.355445861816406, + "learning_rate": 6.258165385038755e-05, + "loss": 0.1736, + "step": 1627 + }, + { + "epoch": 1.6595310907237513, + "grad_norm": 5.2437849044799805, + "learning_rate": 6.239920760722722e-05, + "loss": 0.5714, + "step": 1628 + }, + { + "epoch": 1.6605504587155964, + "grad_norm": 3.639320135116577, + "learning_rate": 6.221658528477255e-05, + "loss": 0.2066, + "step": 1629 + }, + { + "epoch": 1.6615698267074412, + "grad_norm": 6.4522247314453125, + "learning_rate": 6.203378947641581e-05, + "loss": 0.3611, + "step": 1630 + }, + { + "epoch": 1.6625891946992866, + 
"grad_norm": 6.9241461753845215, + "learning_rate": 6.185082277801294e-05, + "loss": 0.3273, + "step": 1631 + }, + { + "epoch": 1.6636085626911314, + "grad_norm": 5.387557029724121, + "learning_rate": 6.166768778784673e-05, + "loss": 0.1974, + "step": 1632 + }, + { + "epoch": 1.6646279306829765, + "grad_norm": 8.088689804077148, + "learning_rate": 6.148438710658978e-05, + "loss": 0.3644, + "step": 1633 + }, + { + "epoch": 1.6656472986748216, + "grad_norm": 9.129353523254395, + "learning_rate": 6.130092333726773e-05, + "loss": 0.4186, + "step": 1634 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 6.403126239776611, + "learning_rate": 6.111729908522203e-05, + "loss": 0.4829, + "step": 1635 + }, + { + "epoch": 1.6676860346585118, + "grad_norm": 7.876978874206543, + "learning_rate": 6.093351695807339e-05, + "loss": 0.3506, + "step": 1636 + }, + { + "epoch": 1.6687054026503567, + "grad_norm": 5.055951118469238, + "learning_rate": 6.074957956568421e-05, + "loss": 0.2133, + "step": 1637 + }, + { + "epoch": 1.6697247706422018, + "grad_norm": 8.396081924438477, + "learning_rate": 6.056548952012204e-05, + "loss": 0.4685, + "step": 1638 + }, + { + "epoch": 1.670744138634047, + "grad_norm": 6.602619171142578, + "learning_rate": 6.038124943562199e-05, + "loss": 0.3484, + "step": 1639 + }, + { + "epoch": 1.6717635066258918, + "grad_norm": 8.266304016113281, + "learning_rate": 6.019686192855002e-05, + "loss": 0.6875, + "step": 1640 + }, + { + "epoch": 1.6727828746177371, + "grad_norm": 4.310380935668945, + "learning_rate": 6.001232961736555e-05, + "loss": 0.171, + "step": 1641 + }, + { + "epoch": 1.673802242609582, + "grad_norm": 6.7401652336120605, + "learning_rate": 5.982765512258437e-05, + "loss": 0.4644, + "step": 1642 + }, + { + "epoch": 1.674821610601427, + "grad_norm": 5.405831336975098, + "learning_rate": 5.9642841066741415e-05, + "loss": 0.2419, + "step": 1643 + }, + { + "epoch": 1.6758409785932722, + "grad_norm": 5.0712714195251465, + "learning_rate": 
5.9457890074353404e-05, + "loss": 0.2566, + "step": 1644 + }, + { + "epoch": 1.676860346585117, + "grad_norm": 3.400216579437256, + "learning_rate": 5.9272804771881776e-05, + "loss": 0.125, + "step": 1645 + }, + { + "epoch": 1.6778797145769624, + "grad_norm": 8.119034767150879, + "learning_rate": 5.9087587787695244e-05, + "loss": 0.5321, + "step": 1646 + }, + { + "epoch": 1.6788990825688073, + "grad_norm": 5.170970916748047, + "learning_rate": 5.8902241752032536e-05, + "loss": 0.3017, + "step": 1647 + }, + { + "epoch": 1.6799184505606524, + "grad_norm": 8.949460983276367, + "learning_rate": 5.871676929696506e-05, + "loss": 0.7024, + "step": 1648 + }, + { + "epoch": 1.6809378185524975, + "grad_norm": 7.251186370849609, + "learning_rate": 5.853117305635932e-05, + "loss": 0.4489, + "step": 1649 + }, + { + "epoch": 1.6819571865443423, + "grad_norm": 7.259670734405518, + "learning_rate": 5.834545566583986e-05, + "loss": 0.3247, + "step": 1650 + }, + { + "epoch": 1.6829765545361877, + "grad_norm": 10.1055326461792, + "learning_rate": 5.815961976275158e-05, + "loss": 0.6301, + "step": 1651 + }, + { + "epoch": 1.6839959225280325, + "grad_norm": 8.810796737670898, + "learning_rate": 5.797366798612237e-05, + "loss": 0.52, + "step": 1652 + }, + { + "epoch": 1.6850152905198776, + "grad_norm": 7.598231315612793, + "learning_rate": 5.778760297662567e-05, + "loss": 0.7236, + "step": 1653 + }, + { + "epoch": 1.6860346585117227, + "grad_norm": 3.1995418071746826, + "learning_rate": 5.760142737654275e-05, + "loss": 0.146, + "step": 1654 + }, + { + "epoch": 1.6870540265035678, + "grad_norm": 10.069400787353516, + "learning_rate": 5.7415143829725634e-05, + "loss": 0.649, + "step": 1655 + }, + { + "epoch": 1.688073394495413, + "grad_norm": 7.260120391845703, + "learning_rate": 5.722875498155901e-05, + "loss": 0.7242, + "step": 1656 + }, + { + "epoch": 1.6890927624872578, + "grad_norm": 6.980922698974609, + "learning_rate": 5.704226347892319e-05, + "loss": 0.3496, + "step": 1657 + }, + 
{ + "epoch": 1.690112130479103, + "grad_norm": 8.837504386901855, + "learning_rate": 5.6855671970156e-05, + "loss": 0.9688, + "step": 1658 + }, + { + "epoch": 1.691131498470948, + "grad_norm": 7.200884819030762, + "learning_rate": 5.6668983105015635e-05, + "loss": 0.7482, + "step": 1659 + }, + { + "epoch": 1.6921508664627931, + "grad_norm": 6.389528274536133, + "learning_rate": 5.6482199534642775e-05, + "loss": 0.2365, + "step": 1660 + }, + { + "epoch": 1.6931702344546382, + "grad_norm": 1.9209339618682861, + "learning_rate": 5.629532391152298e-05, + "loss": 0.1193, + "step": 1661 + }, + { + "epoch": 1.694189602446483, + "grad_norm": 11.277609825134277, + "learning_rate": 5.6108358889449055e-05, + "loss": 0.864, + "step": 1662 + }, + { + "epoch": 1.6952089704383282, + "grad_norm": 8.00525188446045, + "learning_rate": 5.5921307123483365e-05, + "loss": 0.5214, + "step": 1663 + }, + { + "epoch": 1.6962283384301733, + "grad_norm": 8.152433395385742, + "learning_rate": 5.573417126992003e-05, + "loss": 0.5691, + "step": 1664 + }, + { + "epoch": 1.6972477064220184, + "grad_norm": 6.9097490310668945, + "learning_rate": 5.5546953986247366e-05, + "loss": 0.2645, + "step": 1665 + }, + { + "epoch": 1.6982670744138635, + "grad_norm": 3.46396541595459, + "learning_rate": 5.535965793111004e-05, + "loss": 0.1646, + "step": 1666 + }, + { + "epoch": 1.6982670744138635, + "eval_Qnli-dev-1024_cosine_accuracy": 0.71875, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7981840372085571, + "eval_Qnli-dev-1024_cosine_ap": 0.7241242792005642, + "eval_Qnli-dev-1024_cosine_f1": 0.7200000000000001, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7665559649467468, + "eval_Qnli-dev-1024_cosine_mcc": 0.43122545523632066, + "eval_Qnli-dev-1024_cosine_precision": 0.6545454545454545, + "eval_Qnli-dev-1024_cosine_recall": 0.8, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6936626434326172, + "eval_Qnli-dev_cosine_ap": 0.7608245383505858, + 
"eval_Qnli-dev_cosine_f1": 0.7454545454545455, + "eval_Qnli-dev_cosine_f1_threshold": 0.6196604371070862, + "eval_Qnli-dev_cosine_mcc": 0.47013467657639685, + "eval_Qnli-dev_cosine_precision": 0.6307692307692307, + "eval_Qnli-dev_cosine_recall": 0.9111111111111111, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.35253724455833435, + "eval_global_dataset_runtime": 104.1307, + "eval_global_dataset_samples_per_second": 7.711, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8808346155517819, + "eval_sts-test-1024_spearman_cosine": 0.9112034670517186, + "eval_sts-test_pearson_cosine": 0.9053504773307102, + "eval_sts-test_spearman_cosine": 0.9205147911840388, + "step": 1666 + }, + { + "epoch": 1.6992864424057084, + "grad_norm": 4.240916728973389, + "learning_rate": 5.517228576427137e-05, + "loss": 0.1995, + "step": 1667 + }, + { + "epoch": 1.7003058103975535, + "grad_norm": 3.9136712551116943, + "learning_rate": 5.49848401465755e-05, + "loss": 0.2672, + "step": 1668 + }, + { + "epoch": 1.7013251783893986, + "grad_norm": 7.760051727294922, + "learning_rate": 5.479732373990958e-05, + "loss": 0.3449, + "step": 1669 + }, + { + "epoch": 1.7023445463812437, + "grad_norm": 5.199024677276611, + "learning_rate": 5.46097392071661e-05, + "loss": 0.4137, + "step": 1670 + }, + { + "epoch": 1.7033639143730888, + "grad_norm": 5.0411906242370605, + "learning_rate": 5.4422089212204994e-05, + "loss": 0.2984, + "step": 1671 + }, + { + "epoch": 1.7043832823649336, + "grad_norm": 2.470310688018799, + "learning_rate": 5.4234376419815805e-05, + "loss": 0.1743, + "step": 1672 + }, + { + "epoch": 1.705402650356779, + "grad_norm": 5.406039714813232, + "learning_rate": 5.4046603495679825e-05, + "loss": 0.2739, + "step": 1673 + }, + { + "epoch": 1.7064220183486238, + "grad_norm": 4.936440944671631, + 
"learning_rate": 5.385877310633234e-05, + "loss": 0.1755, + "step": 1674 + }, + { + "epoch": 1.707441386340469, + "grad_norm": 8.654784202575684, + "learning_rate": 5.367088791912454e-05, + "loss": 0.5298, + "step": 1675 + }, + { + "epoch": 1.708460754332314, + "grad_norm": 6.0031938552856445, + "learning_rate": 5.348295060218603e-05, + "loss": 0.3842, + "step": 1676 + }, + { + "epoch": 1.709480122324159, + "grad_norm": 2.8984975814819336, + "learning_rate": 5.329496382438641e-05, + "loss": 0.1637, + "step": 1677 + }, + { + "epoch": 1.7104994903160042, + "grad_norm": 7.161764144897461, + "learning_rate": 5.310693025529797e-05, + "loss": 0.6203, + "step": 1678 + }, + { + "epoch": 1.7115188583078491, + "grad_norm": 3.71823787689209, + "learning_rate": 5.2918852565157216e-05, + "loss": 0.1549, + "step": 1679 + }, + { + "epoch": 1.7125382262996942, + "grad_norm": 10.297740936279297, + "learning_rate": 5.273073342482736e-05, + "loss": 0.4458, + "step": 1680 + }, + { + "epoch": 1.7135575942915393, + "grad_norm": 6.423152923583984, + "learning_rate": 5.254257550576021e-05, + "loss": 0.4041, + "step": 1681 + }, + { + "epoch": 1.7145769622833842, + "grad_norm": 6.07492733001709, + "learning_rate": 5.235438147995824e-05, + "loss": 0.3155, + "step": 1682 + }, + { + "epoch": 1.7155963302752295, + "grad_norm": 4.876982688903809, + "learning_rate": 5.216615401993674e-05, + "loss": 0.1794, + "step": 1683 + }, + { + "epoch": 1.7166156982670744, + "grad_norm": 8.693746566772461, + "learning_rate": 5.1977895798685664e-05, + "loss": 0.6088, + "step": 1684 + }, + { + "epoch": 1.7176350662589195, + "grad_norm": 8.894753456115723, + "learning_rate": 5.178960948963191e-05, + "loss": 0.3606, + "step": 1685 + }, + { + "epoch": 1.7186544342507646, + "grad_norm": 3.4491775035858154, + "learning_rate": 5.160129776660123e-05, + "loss": 0.1251, + "step": 1686 + }, + { + "epoch": 1.7196738022426095, + "grad_norm": 5.475042819976807, + "learning_rate": 5.141296330378025e-05, + "loss": 0.2667, + 
"step": 1687 + }, + { + "epoch": 1.7206931702344548, + "grad_norm": 6.056704998016357, + "learning_rate": 5.122460877567857e-05, + "loss": 0.4352, + "step": 1688 + }, + { + "epoch": 1.7217125382262997, + "grad_norm": 10.229000091552734, + "learning_rate": 5.103623685709063e-05, + "loss": 0.6197, + "step": 1689 + }, + { + "epoch": 1.7227319062181448, + "grad_norm": 5.402790546417236, + "learning_rate": 5.0847850223057936e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 1.7237512742099899, + "grad_norm": 2.1127982139587402, + "learning_rate": 5.0659451548830917e-05, + "loss": 0.1342, + "step": 1691 + }, + { + "epoch": 1.7247706422018347, + "grad_norm": 4.332535743713379, + "learning_rate": 5.0471043509831e-05, + "loss": 0.2149, + "step": 1692 + }, + { + "epoch": 1.72579001019368, + "grad_norm": 8.170999526977539, + "learning_rate": 5.028262878161262e-05, + "loss": 0.5404, + "step": 1693 + }, + { + "epoch": 1.726809378185525, + "grad_norm": 3.3618481159210205, + "learning_rate": 5.009421003982508e-05, + "loss": 0.1522, + "step": 1694 + }, + { + "epoch": 1.72782874617737, + "grad_norm": 5.414154529571533, + "learning_rate": 4.990578996017493e-05, + "loss": 0.2736, + "step": 1695 + }, + { + "epoch": 1.7288481141692151, + "grad_norm": 4.799069881439209, + "learning_rate": 4.971737121838739e-05, + "loss": 0.3062, + "step": 1696 + }, + { + "epoch": 1.72986748216106, + "grad_norm": 5.350613594055176, + "learning_rate": 4.952895649016901e-05, + "loss": 0.4809, + "step": 1697 + }, + { + "epoch": 1.7308868501529053, + "grad_norm": 3.9708356857299805, + "learning_rate": 4.934054845116906e-05, + "loss": 0.3985, + "step": 1698 + }, + { + "epoch": 1.7319062181447502, + "grad_norm": 6.353355407714844, + "learning_rate": 4.9152149776942076e-05, + "loss": 0.3152, + "step": 1699 + }, + { + "epoch": 1.7329255861365953, + "grad_norm": 9.268322944641113, + "learning_rate": 4.896376314290941e-05, + "loss": 0.6119, + "step": 1700 + }, + { + "epoch": 1.7339449541284404, + 
"grad_norm": 5.95634651184082, + "learning_rate": 4.877539122432147e-05, + "loss": 0.3004, + "step": 1701 + }, + { + "epoch": 1.7349643221202853, + "grad_norm": 9.091329574584961, + "learning_rate": 4.8587036696219754e-05, + "loss": 0.7715, + "step": 1702 + }, + { + "epoch": 1.7359836901121306, + "grad_norm": 7.045917987823486, + "learning_rate": 4.839870223339878e-05, + "loss": 0.3693, + "step": 1703 + }, + { + "epoch": 1.7370030581039755, + "grad_norm": 6.63254976272583, + "learning_rate": 4.82103905103681e-05, + "loss": 0.546, + "step": 1704 + }, + { + "epoch": 1.7380224260958206, + "grad_norm": 8.315082550048828, + "learning_rate": 4.8022104201314354e-05, + "loss": 0.6736, + "step": 1705 + }, + { + "epoch": 1.7390417940876657, + "grad_norm": 5.275691986083984, + "learning_rate": 4.783384598006327e-05, + "loss": 0.2598, + "step": 1706 + }, + { + "epoch": 1.7400611620795106, + "grad_norm": 3.820236921310425, + "learning_rate": 4.764561852004173e-05, + "loss": 0.1398, + "step": 1707 + }, + { + "epoch": 1.7410805300713559, + "grad_norm": 7.040982723236084, + "learning_rate": 4.745742449423977e-05, + "loss": 0.379, + "step": 1708 + }, + { + "epoch": 1.7420998980632008, + "grad_norm": 6.6788482666015625, + "learning_rate": 4.726926657517262e-05, + "loss": 0.5142, + "step": 1709 + }, + { + "epoch": 1.7431192660550459, + "grad_norm": 6.863526344299316, + "learning_rate": 4.708114743484282e-05, + "loss": 0.3975, + "step": 1710 + }, + { + "epoch": 1.744138634046891, + "grad_norm": 8.351316452026367, + "learning_rate": 4.6893069744702045e-05, + "loss": 0.3153, + "step": 1711 + }, + { + "epoch": 1.7451580020387358, + "grad_norm": 10.181187629699707, + "learning_rate": 4.6705036175613606e-05, + "loss": 0.8402, + "step": 1712 + }, + { + "epoch": 1.7461773700305812, + "grad_norm": 6.891512393951416, + "learning_rate": 4.651704939781398e-05, + "loss": 0.4639, + "step": 1713 + }, + { + "epoch": 1.747196738022426, + "grad_norm": 8.785947799682617, + "learning_rate": 
4.6329112080875474e-05, + "loss": 0.5627, + "step": 1714 + }, + { + "epoch": 1.7482161060142711, + "grad_norm": 7.0134196281433105, + "learning_rate": 4.614122689366767e-05, + "loss": 0.4902, + "step": 1715 + }, + { + "epoch": 1.7492354740061162, + "grad_norm": 8.736035346984863, + "learning_rate": 4.5953396504320186e-05, + "loss": 0.6778, + "step": 1716 + }, + { + "epoch": 1.750254841997961, + "grad_norm": 3.7078139781951904, + "learning_rate": 4.576562358018418e-05, + "loss": 0.4115, + "step": 1717 + }, + { + "epoch": 1.7512742099898064, + "grad_norm": 2.6372509002685547, + "learning_rate": 4.557791078779502e-05, + "loss": 0.1016, + "step": 1718 + }, + { + "epoch": 1.7522935779816513, + "grad_norm": 4.356749057769775, + "learning_rate": 4.539026079283388e-05, + "loss": 0.2225, + "step": 1719 + }, + { + "epoch": 1.7533129459734964, + "grad_norm": 5.2822418212890625, + "learning_rate": 4.520267626009047e-05, + "loss": 0.3715, + "step": 1720 + }, + { + "epoch": 1.7543323139653415, + "grad_norm": 9.365150451660156, + "learning_rate": 4.5015159853424546e-05, + "loss": 0.7597, + "step": 1721 + }, + { + "epoch": 1.7553516819571864, + "grad_norm": 6.232312202453613, + "learning_rate": 4.4827714235728635e-05, + "loss": 0.2513, + "step": 1722 + }, + { + "epoch": 1.7563710499490317, + "grad_norm": 7.213202476501465, + "learning_rate": 4.4640342068889964e-05, + "loss": 0.6002, + "step": 1723 + }, + { + "epoch": 1.7573904179408766, + "grad_norm": 4.853435039520264, + "learning_rate": 4.445304601375264e-05, + "loss": 0.1624, + "step": 1724 + }, + { + "epoch": 1.7584097859327217, + "grad_norm": 4.7718892097473145, + "learning_rate": 4.4265828730079987e-05, + "loss": 0.286, + "step": 1725 + }, + { + "epoch": 1.7594291539245668, + "grad_norm": 4.1447038650512695, + "learning_rate": 4.407869287651664e-05, + "loss": 0.206, + "step": 1726 + }, + { + "epoch": 1.7604485219164119, + "grad_norm": 5.51598596572876, + "learning_rate": 4.389164111055092e-05, + "loss": 0.211, + "step": 1727 
+ }, + { + "epoch": 1.761467889908257, + "grad_norm": 8.03819751739502, + "learning_rate": 4.370467608847699e-05, + "loss": 0.3425, + "step": 1728 + }, + { + "epoch": 1.7624872579001019, + "grad_norm": 6.400393962860107, + "learning_rate": 4.3517800465357264e-05, + "loss": 0.5059, + "step": 1729 + }, + { + "epoch": 1.763506625891947, + "grad_norm": 9.561805725097656, + "learning_rate": 4.333101689498437e-05, + "loss": 0.5656, + "step": 1730 + }, + { + "epoch": 1.764525993883792, + "grad_norm": 8.619925498962402, + "learning_rate": 4.314432802984406e-05, + "loss": 0.4599, + "step": 1731 + }, + { + "epoch": 1.7655453618756372, + "grad_norm": 3.827829122543335, + "learning_rate": 4.295773652107683e-05, + "loss": 0.2513, + "step": 1732 + }, + { + "epoch": 1.7665647298674823, + "grad_norm": 8.728647232055664, + "learning_rate": 4.2771245018441e-05, + "loss": 0.5087, + "step": 1733 + }, + { + "epoch": 1.7675840978593271, + "grad_norm": 6.656282424926758, + "learning_rate": 4.258485617027437e-05, + "loss": 0.4029, + "step": 1734 + }, + { + "epoch": 1.7686034658511722, + "grad_norm": 3.345684051513672, + "learning_rate": 4.239857262345726e-05, + "loss": 0.1634, + "step": 1735 + }, + { + "epoch": 1.7696228338430173, + "grad_norm": 4.737642765045166, + "learning_rate": 4.221239702337434e-05, + "loss": 0.3575, + "step": 1736 + }, + { + "epoch": 1.7706422018348624, + "grad_norm": 5.47233772277832, + "learning_rate": 4.2026332013877634e-05, + "loss": 0.2304, + "step": 1737 + }, + { + "epoch": 1.7716615698267075, + "grad_norm": 4.983118534088135, + "learning_rate": 4.18403802372484e-05, + "loss": 0.2056, + "step": 1738 + }, + { + "epoch": 1.7726809378185524, + "grad_norm": 7.349947452545166, + "learning_rate": 4.165454433416018e-05, + "loss": 0.5025, + "step": 1739 + }, + { + "epoch": 1.7737003058103975, + "grad_norm": 5.841440200805664, + "learning_rate": 4.1468826943640724e-05, + "loss": 0.355, + "step": 1740 + }, + { + "epoch": 1.7747196738022426, + "grad_norm": 
3.5881755352020264, + "learning_rate": 4.128323070303499e-05, + "loss": 0.2275, + "step": 1741 + }, + { + "epoch": 1.7757390417940877, + "grad_norm": 5.0039963722229, + "learning_rate": 4.109775824796747e-05, + "loss": 0.2257, + "step": 1742 + }, + { + "epoch": 1.7767584097859328, + "grad_norm": 5.417962074279785, + "learning_rate": 4.091241221230476e-05, + "loss": 0.2303, + "step": 1743 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 6.626123428344727, + "learning_rate": 4.072719522811824e-05, + "loss": 0.5247, + "step": 1744 + }, + { + "epoch": 1.7787971457696228, + "grad_norm": 4.882989406585693, + "learning_rate": 4.054210992564661e-05, + "loss": 0.2173, + "step": 1745 + }, + { + "epoch": 1.7798165137614679, + "grad_norm": 4.634632110595703, + "learning_rate": 4.0357158933258596e-05, + "loss": 0.4443, + "step": 1746 + }, + { + "epoch": 1.780835881753313, + "grad_norm": 6.226490020751953, + "learning_rate": 4.017234487741561e-05, + "loss": 0.569, + "step": 1747 + }, + { + "epoch": 1.781855249745158, + "grad_norm": 6.3392767906188965, + "learning_rate": 3.998767038263442e-05, + "loss": 0.3075, + "step": 1748 + }, + { + "epoch": 1.782874617737003, + "grad_norm": 4.992177963256836, + "learning_rate": 3.9803138071449996e-05, + "loss": 0.3494, + "step": 1749 + }, + { + "epoch": 1.7838939857288483, + "grad_norm": 7.793630599975586, + "learning_rate": 3.9618750564378064e-05, + "loss": 0.4004, + "step": 1750 + }, + { + "epoch": 1.7849133537206932, + "grad_norm": 4.538532733917236, + "learning_rate": 3.9434510479877975e-05, + "loss": 0.1982, + "step": 1751 + }, + { + "epoch": 1.7859327217125383, + "grad_norm": 7.877995491027832, + "learning_rate": 3.9250420434315806e-05, + "loss": 0.4556, + "step": 1752 + }, + { + "epoch": 1.7869520897043834, + "grad_norm": 6.7634053230285645, + "learning_rate": 3.9066483041926616e-05, + "loss": 0.4251, + "step": 1753 + }, + { + "epoch": 1.7879714576962282, + "grad_norm": 3.238656759262085, + "learning_rate": 
3.888270091477798e-05, + "loss": 0.1181, + "step": 1754 + }, + { + "epoch": 1.7889908256880735, + "grad_norm": 9.58579158782959, + "learning_rate": 3.8699076662732284e-05, + "loss": 0.5456, + "step": 1755 + }, + { + "epoch": 1.7900101936799184, + "grad_norm": 6.472123622894287, + "learning_rate": 3.851561289341023e-05, + "loss": 0.379, + "step": 1756 + }, + { + "epoch": 1.7910295616717635, + "grad_norm": 4.2044172286987305, + "learning_rate": 3.833231221215325e-05, + "loss": 0.2473, + "step": 1757 + }, + { + "epoch": 1.7920489296636086, + "grad_norm": 3.803764581680298, + "learning_rate": 3.814917722198707e-05, + "loss": 0.2603, + "step": 1758 + }, + { + "epoch": 1.7930682976554535, + "grad_norm": 7.514235973358154, + "learning_rate": 3.7966210523584245e-05, + "loss": 0.5655, + "step": 1759 + }, + { + "epoch": 1.7940876656472988, + "grad_norm": 7.211087226867676, + "learning_rate": 3.778341471522749e-05, + "loss": 0.5536, + "step": 1760 + }, + { + "epoch": 1.7951070336391437, + "grad_norm": 8.897974014282227, + "learning_rate": 3.7600792392772795e-05, + "loss": 0.645, + "step": 1761 + }, + { + "epoch": 1.7961264016309888, + "grad_norm": 7.907827377319336, + "learning_rate": 3.741834614961246e-05, + "loss": 0.4652, + "step": 1762 + }, + { + "epoch": 1.797145769622834, + "grad_norm": 5.5902791023254395, + "learning_rate": 3.7236078576638334e-05, + "loss": 0.4776, + "step": 1763 + }, + { + "epoch": 1.7981651376146788, + "grad_norm": 3.502904176712036, + "learning_rate": 3.705399226220497e-05, + "loss": 0.1951, + "step": 1764 + }, + { + "epoch": 1.7981651376146788, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8075562715530396, + "eval_Qnli-dev-1024_cosine_ap": 0.749513188759678, + "eval_Qnli-dev-1024_cosine_f1": 0.7272727272727272, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7558927536010742, + "eval_Qnli-dev-1024_cosine_mcc": 0.4497120149145933, + "eval_Qnli-dev-1024_cosine_precision": 
0.6666666666666666, + "eval_Qnli-dev-1024_cosine_recall": 0.8, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6980013251304626, + "eval_Qnli-dev_cosine_ap": 0.7628031195186381, + "eval_Qnli-dev_cosine_f1": 0.7500000000000001, + "eval_Qnli-dev_cosine_f1_threshold": 0.636489748954773, + "eval_Qnli-dev_cosine_mcc": 0.48653004754089046, + "eval_Qnli-dev_cosine_precision": 0.6610169491525424, + "eval_Qnli-dev_cosine_recall": 0.8666666666666667, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9479166865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.28949180245399475, + "eval_global_dataset_runtime": 103.9623, + "eval_global_dataset_samples_per_second": 7.724, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9479166865348816, + "eval_sts-test-1024_pearson_cosine": 0.8750724336289886, + "eval_sts-test-1024_spearman_cosine": 0.9060820187994599, + "eval_sts-test_pearson_cosine": 0.9050891583995959, + "eval_sts-test_spearman_cosine": 0.9208979900112706, + "step": 1764 + }, + { + "epoch": 1.799184505606524, + "grad_norm": 5.523859977722168, + "learning_rate": 3.6872089792092925e-05, + "loss": 0.2023, + "step": 1765 + }, + { + "epoch": 1.800203873598369, + "grad_norm": 7.787282943725586, + "learning_rate": 3.669037374947199e-05, + "loss": 0.6349, + "step": 1766 + }, + { + "epoch": 1.801223241590214, + "grad_norm": 4.277161121368408, + "learning_rate": 3.650884671486453e-05, + "loss": 0.2479, + "step": 1767 + }, + { + "epoch": 1.8022426095820592, + "grad_norm": 11.096944808959961, + "learning_rate": 3.6327511266108835e-05, + "loss": 0.7766, + "step": 1768 + }, + { + "epoch": 1.803261977573904, + "grad_norm": 5.538233280181885, + "learning_rate": 3.614636997832259e-05, + "loss": 0.2044, + "step": 1769 + }, + { + "epoch": 1.8042813455657494, + "grad_norm": 3.1658334732055664, + "learning_rate": 3.596542542386597e-05, + "loss": 0.2323, + "step": 1770 + 
}, + { + "epoch": 1.8053007135575942, + "grad_norm": 4.729072570800781, + "learning_rate": 3.578468017230575e-05, + "loss": 0.242, + "step": 1771 + }, + { + "epoch": 1.8063200815494393, + "grad_norm": 6.442706108093262, + "learning_rate": 3.560413679037807e-05, + "loss": 0.2719, + "step": 1772 + }, + { + "epoch": 1.8073394495412844, + "grad_norm": 7.747926712036133, + "learning_rate": 3.542379784195267e-05, + "loss": 0.2921, + "step": 1773 + }, + { + "epoch": 1.8083588175331293, + "grad_norm": 5.683032512664795, + "learning_rate": 3.524366588799588e-05, + "loss": 0.3284, + "step": 1774 + }, + { + "epoch": 1.8093781855249746, + "grad_norm": 11.293792724609375, + "learning_rate": 3.5063743486534775e-05, + "loss": 1.0605, + "step": 1775 + }, + { + "epoch": 1.8103975535168195, + "grad_norm": 5.910725116729736, + "learning_rate": 3.488403319262037e-05, + "loss": 0.291, + "step": 1776 + }, + { + "epoch": 1.8114169215086646, + "grad_norm": 4.038328170776367, + "learning_rate": 3.470453755829183e-05, + "loss": 0.2033, + "step": 1777 + }, + { + "epoch": 1.8124362895005097, + "grad_norm": 2.819408655166626, + "learning_rate": 3.45252591325397e-05, + "loss": 0.2328, + "step": 1778 + }, + { + "epoch": 1.8134556574923546, + "grad_norm": 5.704439640045166, + "learning_rate": 3.4346200461270284e-05, + "loss": 0.4924, + "step": 1779 + }, + { + "epoch": 1.8144750254842, + "grad_norm": 6.023610591888428, + "learning_rate": 3.416736408726884e-05, + "loss": 0.3924, + "step": 1780 + }, + { + "epoch": 1.8154943934760448, + "grad_norm": 7.2247490882873535, + "learning_rate": 3.398875255016405e-05, + "loss": 0.6327, + "step": 1781 + }, + { + "epoch": 1.81651376146789, + "grad_norm": 2.289973735809326, + "learning_rate": 3.381036838639169e-05, + "loss": 0.1877, + "step": 1782 + }, + { + "epoch": 1.817533129459735, + "grad_norm": 1.9493658542633057, + "learning_rate": 3.363221412915858e-05, + "loss": 0.1135, + "step": 1783 + }, + { + "epoch": 1.8185524974515799, + "grad_norm": 
4.736155986785889, + "learning_rate": 3.345429230840672e-05, + "loss": 0.3404, + "step": 1784 + }, + { + "epoch": 1.8195718654434252, + "grad_norm": 6.019724369049072, + "learning_rate": 3.327660545077731e-05, + "loss": 0.2669, + "step": 1785 + }, + { + "epoch": 1.82059123343527, + "grad_norm": 9.922126770019531, + "learning_rate": 3.309915607957485e-05, + "loss": 0.5143, + "step": 1786 + }, + { + "epoch": 1.8216106014271152, + "grad_norm": 5.188144683837891, + "learning_rate": 3.292194671473135e-05, + "loss": 0.3719, + "step": 1787 + }, + { + "epoch": 1.8226299694189603, + "grad_norm": 6.299932956695557, + "learning_rate": 3.2744979872770506e-05, + "loss": 0.3536, + "step": 1788 + }, + { + "epoch": 1.8236493374108051, + "grad_norm": 4.642245769500732, + "learning_rate": 3.256825806677205e-05, + "loss": 0.2735, + "step": 1789 + }, + { + "epoch": 1.8246687054026505, + "grad_norm": 4.521740913391113, + "learning_rate": 3.2391783806335885e-05, + "loss": 0.1971, + "step": 1790 + }, + { + "epoch": 1.8256880733944953, + "grad_norm": 4.655755996704102, + "learning_rate": 3.221555959754656e-05, + "loss": 0.4326, + "step": 1791 + }, + { + "epoch": 1.8267074413863404, + "grad_norm": 0.8639253377914429, + "learning_rate": 3.2039587942937855e-05, + "loss": 0.0688, + "step": 1792 + }, + { + "epoch": 1.8277268093781855, + "grad_norm": 4.769573211669922, + "learning_rate": 3.186387134145682e-05, + "loss": 0.2317, + "step": 1793 + }, + { + "epoch": 1.8287461773700304, + "grad_norm": 5.148104667663574, + "learning_rate": 3.168841228842877e-05, + "loss": 0.3941, + "step": 1794 + }, + { + "epoch": 1.8297655453618757, + "grad_norm": 5.8588080406188965, + "learning_rate": 3.1513213275521384e-05, + "loss": 0.3326, + "step": 1795 + }, + { + "epoch": 1.8307849133537206, + "grad_norm": 4.889102935791016, + "learning_rate": 3.1338276790709775e-05, + "loss": 0.2981, + "step": 1796 + }, + { + "epoch": 1.8318042813455657, + "grad_norm": 10.12937068939209, + "learning_rate": 
3.1163605318240715e-05, + "loss": 0.4302, + "step": 1797 + }, + { + "epoch": 1.8328236493374108, + "grad_norm": 4.488791465759277, + "learning_rate": 3.098920133859783e-05, + "loss": 0.1922, + "step": 1798 + }, + { + "epoch": 1.8338430173292557, + "grad_norm": 7.872842311859131, + "learning_rate": 3.0815067328465816e-05, + "loss": 0.6065, + "step": 1799 + }, + { + "epoch": 1.834862385321101, + "grad_norm": 9.769292831420898, + "learning_rate": 3.064120576069579e-05, + "loss": 0.7834, + "step": 1800 + }, + { + "epoch": 1.835881753312946, + "grad_norm": 7.953698635101318, + "learning_rate": 3.0467619104269896e-05, + "loss": 0.521, + "step": 1801 + }, + { + "epoch": 1.836901121304791, + "grad_norm": 5.160398960113525, + "learning_rate": 3.0294309824266298e-05, + "loss": 0.3919, + "step": 1802 + }, + { + "epoch": 1.837920489296636, + "grad_norm": 7.545567989349365, + "learning_rate": 3.012128038182419e-05, + "loss": 0.303, + "step": 1803 + }, + { + "epoch": 1.8389398572884812, + "grad_norm": 7.457415580749512, + "learning_rate": 2.9948533234108834e-05, + "loss": 0.4616, + "step": 1804 + }, + { + "epoch": 1.8399592252803263, + "grad_norm": 7.828042030334473, + "learning_rate": 2.9776070834276647e-05, + "loss": 0.5165, + "step": 1805 + }, + { + "epoch": 1.8409785932721712, + "grad_norm": 2.9092535972595215, + "learning_rate": 2.9603895631440405e-05, + "loss": 0.1552, + "step": 1806 + }, + { + "epoch": 1.8419979612640163, + "grad_norm": 4.436439514160156, + "learning_rate": 2.943201007063443e-05, + "loss": 0.2466, + "step": 1807 + }, + { + "epoch": 1.8430173292558614, + "grad_norm": 6.84224796295166, + "learning_rate": 2.9260416592779934e-05, + "loss": 0.5846, + "step": 1808 + }, + { + "epoch": 1.8440366972477065, + "grad_norm": 6.275019645690918, + "learning_rate": 2.9089117634650192e-05, + "loss": 0.4507, + "step": 1809 + }, + { + "epoch": 1.8450560652395516, + "grad_norm": 7.318490982055664, + "learning_rate": 2.8918115628836062e-05, + "loss": 0.2341, + "step": 1810 + 
}, + { + "epoch": 1.8460754332313964, + "grad_norm": 4.57275915145874, + "learning_rate": 2.8747413003711614e-05, + "loss": 0.1863, + "step": 1811 + }, + { + "epoch": 1.8470948012232415, + "grad_norm": 7.398787975311279, + "learning_rate": 2.8577012183399164e-05, + "loss": 0.2505, + "step": 1812 + }, + { + "epoch": 1.8481141692150866, + "grad_norm": 6.0185370445251465, + "learning_rate": 2.8406915587735466e-05, + "loss": 0.2833, + "step": 1813 + }, + { + "epoch": 1.8491335372069317, + "grad_norm": 9.213631629943848, + "learning_rate": 2.8237125632236704e-05, + "loss": 0.3884, + "step": 1814 + }, + { + "epoch": 1.8501529051987768, + "grad_norm": 14.965542793273926, + "learning_rate": 2.8067644728064767e-05, + "loss": 0.8776, + "step": 1815 + }, + { + "epoch": 1.8511722731906217, + "grad_norm": 5.327033996582031, + "learning_rate": 2.7898475281992575e-05, + "loss": 0.4291, + "step": 1816 + }, + { + "epoch": 1.8521916411824668, + "grad_norm": 4.308135986328125, + "learning_rate": 2.7729619696370223e-05, + "loss": 0.3203, + "step": 1817 + }, + { + "epoch": 1.853211009174312, + "grad_norm": 6.840002059936523, + "learning_rate": 2.756108036909064e-05, + "loss": 0.3705, + "step": 1818 + }, + { + "epoch": 1.854230377166157, + "grad_norm": 10.1884765625, + "learning_rate": 2.7392859693555555e-05, + "loss": 0.6378, + "step": 1819 + }, + { + "epoch": 1.8552497451580021, + "grad_norm": 8.687849998474121, + "learning_rate": 2.7224960058641692e-05, + "loss": 1.005, + "step": 1820 + }, + { + "epoch": 1.856269113149847, + "grad_norm": 8.345951080322266, + "learning_rate": 2.7057383848666677e-05, + "loss": 0.4776, + "step": 1821 + }, + { + "epoch": 1.8572884811416923, + "grad_norm": 7.518494606018066, + "learning_rate": 2.6890133443355224e-05, + "loss": 0.4999, + "step": 1822 + }, + { + "epoch": 1.8583078491335372, + "grad_norm": 7.342522144317627, + "learning_rate": 2.6723211217805343e-05, + "loss": 0.2922, + "step": 1823 + }, + { + "epoch": 1.8593272171253823, + "grad_norm": 
10.63000774383545, + "learning_rate": 2.655661954245462e-05, + "loss": 0.4653, + "step": 1824 + }, + { + "epoch": 1.8603465851172274, + "grad_norm": 5.479205131530762, + "learning_rate": 2.6390360783046535e-05, + "loss": 0.3264, + "step": 1825 + }, + { + "epoch": 1.8613659531090723, + "grad_norm": 6.947875499725342, + "learning_rate": 2.6224437300596892e-05, + "loss": 0.5453, + "step": 1826 + }, + { + "epoch": 1.8623853211009176, + "grad_norm": 12.862199783325195, + "learning_rate": 2.6058851451360278e-05, + "loss": 1.0997, + "step": 1827 + }, + { + "epoch": 1.8634046890927625, + "grad_norm": 6.396128177642822, + "learning_rate": 2.589360558679664e-05, + "loss": 0.5142, + "step": 1828 + }, + { + "epoch": 1.8644240570846076, + "grad_norm": 3.3630640506744385, + "learning_rate": 2.5728702053537668e-05, + "loss": 0.1527, + "step": 1829 + }, + { + "epoch": 1.8654434250764527, + "grad_norm": 3.5490384101867676, + "learning_rate": 2.5564143193353928e-05, + "loss": 0.2184, + "step": 1830 + }, + { + "epoch": 1.8664627930682975, + "grad_norm": 6.618597507476807, + "learning_rate": 2.539993134312111e-05, + "loss": 0.3838, + "step": 1831 + }, + { + "epoch": 1.8674821610601429, + "grad_norm": 5.235430717468262, + "learning_rate": 2.5236068834787263e-05, + "loss": 0.4265, + "step": 1832 + }, + { + "epoch": 1.8685015290519877, + "grad_norm": 10.94070053100586, + "learning_rate": 2.507255799533925e-05, + "loss": 0.5124, + "step": 1833 + }, + { + "epoch": 1.8695208970438328, + "grad_norm": 2.8730661869049072, + "learning_rate": 2.490940114677022e-05, + "loss": 0.1508, + "step": 1834 + }, + { + "epoch": 1.870540265035678, + "grad_norm": 5.6821980476379395, + "learning_rate": 2.4746600606046037e-05, + "loss": 0.2766, + "step": 1835 + }, + { + "epoch": 1.8715596330275228, + "grad_norm": 8.820428848266602, + "learning_rate": 2.4584158685073024e-05, + "loss": 0.5235, + "step": 1836 + }, + { + "epoch": 1.8725790010193681, + "grad_norm": 2.182717800140381, + "learning_rate": 
2.4422077690664446e-05, + "loss": 0.2394, + "step": 1837 + }, + { + "epoch": 1.873598369011213, + "grad_norm": 6.691490650177002, + "learning_rate": 2.426035992450848e-05, + "loss": 0.326, + "step": 1838 + }, + { + "epoch": 1.8746177370030581, + "grad_norm": 4.4693217277526855, + "learning_rate": 2.4099007683134796e-05, + "loss": 0.3008, + "step": 1839 + }, + { + "epoch": 1.8756371049949032, + "grad_norm": 3.999272108078003, + "learning_rate": 2.3938023257882514e-05, + "loss": 0.1525, + "step": 1840 + }, + { + "epoch": 1.876656472986748, + "grad_norm": 9.120763778686523, + "learning_rate": 2.3777408934867424e-05, + "loss": 0.39, + "step": 1841 + }, + { + "epoch": 1.8776758409785934, + "grad_norm": 1.8546042442321777, + "learning_rate": 2.3617166994949493e-05, + "loss": 0.0774, + "step": 1842 + }, + { + "epoch": 1.8786952089704383, + "grad_norm": 4.0527825355529785, + "learning_rate": 2.3457299713700577e-05, + "loss": 0.2674, + "step": 1843 + }, + { + "epoch": 1.8797145769622834, + "grad_norm": 3.430372714996338, + "learning_rate": 2.329780936137205e-05, + "loss": 0.1116, + "step": 1844 + }, + { + "epoch": 1.8807339449541285, + "grad_norm": 6.467163562774658, + "learning_rate": 2.313869820286257e-05, + "loss": 0.4234, + "step": 1845 + }, + { + "epoch": 1.8817533129459734, + "grad_norm": 12.85617733001709, + "learning_rate": 2.2979968497685924e-05, + "loss": 0.8432, + "step": 1846 + }, + { + "epoch": 1.8827726809378187, + "grad_norm": 5.897470474243164, + "learning_rate": 2.2821622499938926e-05, + "loss": 0.3522, + "step": 1847 + }, + { + "epoch": 1.8837920489296636, + "grad_norm": 4.003026485443115, + "learning_rate": 2.266366245826947e-05, + "loss": 0.1858, + "step": 1848 + }, + { + "epoch": 1.8848114169215087, + "grad_norm": 5.525468826293945, + "learning_rate": 2.2506090615844477e-05, + "loss": 0.2835, + "step": 1849 + }, + { + "epoch": 1.8858307849133538, + "grad_norm": 9.481707572937012, + "learning_rate": 2.2348909210318064e-05, + "loss": 0.7738, + "step": 
1850 + }, + { + "epoch": 1.8868501529051986, + "grad_norm": 2.575413942337036, + "learning_rate": 2.2192120473800014e-05, + "loss": 0.1524, + "step": 1851 + }, + { + "epoch": 1.887869520897044, + "grad_norm": 9.639274597167969, + "learning_rate": 2.203572663282362e-05, + "loss": 0.7539, + "step": 1852 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 5.821088790893555, + "learning_rate": 2.1879729908314544e-05, + "loss": 0.5166, + "step": 1853 + }, + { + "epoch": 1.889908256880734, + "grad_norm": 3.0853917598724365, + "learning_rate": 2.1724132515558887e-05, + "loss": 0.2346, + "step": 1854 + }, + { + "epoch": 1.890927624872579, + "grad_norm": 5.704559803009033, + "learning_rate": 2.156893666417204e-05, + "loss": 0.2003, + "step": 1855 + }, + { + "epoch": 1.891946992864424, + "grad_norm": 9.251178741455078, + "learning_rate": 2.1414144558067023e-05, + "loss": 0.7549, + "step": 1856 + }, + { + "epoch": 1.8929663608562692, + "grad_norm": 3.4819564819335938, + "learning_rate": 2.1259758395423512e-05, + "loss": 0.1809, + "step": 1857 + }, + { + "epoch": 1.8939857288481141, + "grad_norm": 9.680218696594238, + "learning_rate": 2.1105780368656215e-05, + "loss": 0.6533, + "step": 1858 + }, + { + "epoch": 1.8950050968399592, + "grad_norm": 3.142969846725464, + "learning_rate": 2.0952212664384124e-05, + "loss": 0.1845, + "step": 1859 + }, + { + "epoch": 1.8960244648318043, + "grad_norm": 5.323211193084717, + "learning_rate": 2.079905746339927e-05, + "loss": 0.3156, + "step": 1860 + }, + { + "epoch": 1.8970438328236492, + "grad_norm": 9.717694282531738, + "learning_rate": 2.0646316940635763e-05, + "loss": 0.5856, + "step": 1861 + }, + { + "epoch": 1.8980632008154945, + "grad_norm": 2.1464314460754395, + "learning_rate": 2.049399326513895e-05, + "loss": 0.1409, + "step": 1862 + }, + { + "epoch": 1.8980632008154945, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7604166666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7837380170822144, + 
"eval_Qnli-dev-1024_cosine_ap": 0.7655287400697098, + "eval_Qnli-dev-1024_cosine_f1": 0.7358490566037736, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7243161797523499, + "eval_Qnli-dev-1024_cosine_mcc": 0.45131025668485714, + "eval_Qnli-dev-1024_cosine_precision": 0.639344262295082, + "eval_Qnli-dev-1024_cosine_recall": 0.8666666666666667, + "eval_Qnli-dev_cosine_accuracy": 0.75, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6975128650665283, + "eval_Qnli-dev_cosine_ap": 0.7629854627366377, + "eval_Qnli-dev_cosine_f1": 0.7476635514018692, + "eval_Qnli-dev_cosine_f1_threshold": 0.6258813738822937, + "eval_Qnli-dev_cosine_mcc": 0.47737827504723207, + "eval_Qnli-dev_cosine_precision": 0.6451612903225806, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9479166865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.3163749873638153, + "eval_global_dataset_runtime": 103.9557, + "eval_global_dataset_samples_per_second": 7.724, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9479166865348816, + "eval_sts-test-1024_pearson_cosine": 0.8841306743942718, + "eval_sts-test-1024_spearman_cosine": 0.9094778502209384, + "eval_sts-test_pearson_cosine": 0.9062162835110004, + "eval_sts-test_spearman_cosine": 0.9204680794914208, + "step": 1862 + }, + { + "epoch": 1.8990825688073394, + "grad_norm": 7.201123237609863, + "learning_rate": 2.0342088600034582e-05, + "loss": 0.3773, + "step": 1863 + }, + { + "epoch": 1.9001019367991845, + "grad_norm": 8.974194526672363, + "learning_rate": 2.0190605102498105e-05, + "loss": 0.8936, + "step": 1864 + }, + { + "epoch": 1.9011213047910296, + "grad_norm": 10.563526153564453, + "learning_rate": 2.003954492372404e-05, + "loss": 0.3592, + "step": 1865 + }, + { + "epoch": 1.9021406727828745, + "grad_norm": 7.776918888092041, + "learning_rate": 1.9888910208895407e-05, + "loss": 0.2913, + "step": 1866 + }, + { + "epoch": 
1.9031600407747198, + "grad_norm": 7.598758220672607, + "learning_rate": 1.9738703097153316e-05, + "loss": 0.436, + "step": 1867 + }, + { + "epoch": 1.9041794087665647, + "grad_norm": 8.523947715759277, + "learning_rate": 1.958892572156648e-05, + "loss": 0.4502, + "step": 1868 + }, + { + "epoch": 1.9051987767584098, + "grad_norm": 4.650737762451172, + "learning_rate": 1.943958020910096e-05, + "loss": 0.2545, + "step": 1869 + }, + { + "epoch": 1.9062181447502549, + "grad_norm": 3.7644124031066895, + "learning_rate": 1.92906686805902e-05, + "loss": 0.1539, + "step": 1870 + }, + { + "epoch": 1.9072375127420997, + "grad_norm": 8.597049713134766, + "learning_rate": 1.914219325070442e-05, + "loss": 0.4734, + "step": 1871 + }, + { + "epoch": 1.908256880733945, + "grad_norm": 3.616166830062866, + "learning_rate": 1.8994156027921162e-05, + "loss": 0.1828, + "step": 1872 + }, + { + "epoch": 1.90927624872579, + "grad_norm": 6.930202960968018, + "learning_rate": 1.8846559114494756e-05, + "loss": 0.3154, + "step": 1873 + }, + { + "epoch": 1.910295616717635, + "grad_norm": 6.539050102233887, + "learning_rate": 1.8699404606427052e-05, + "loss": 0.2951, + "step": 1874 + }, + { + "epoch": 1.9113149847094801, + "grad_norm": 5.528707504272461, + "learning_rate": 1.8552694593437097e-05, + "loss": 0.3063, + "step": 1875 + }, + { + "epoch": 1.912334352701325, + "grad_norm": 5.866353511810303, + "learning_rate": 1.8406431158931996e-05, + "loss": 0.4556, + "step": 1876 + }, + { + "epoch": 1.9133537206931703, + "grad_norm": 8.167952537536621, + "learning_rate": 1.826061637997685e-05, + "loss": 0.4327, + "step": 1877 + }, + { + "epoch": 1.9143730886850152, + "grad_norm": 11.24150276184082, + "learning_rate": 1.8115252327265543e-05, + "loss": 0.5985, + "step": 1878 + }, + { + "epoch": 1.9153924566768603, + "grad_norm": 5.483310222625732, + "learning_rate": 1.797034106509127e-05, + "loss": 0.1853, + "step": 1879 + }, + { + "epoch": 1.9164118246687054, + "grad_norm": 5.693748474121094, + 
"learning_rate": 1.7825884651317204e-05, + "loss": 0.2905, + "step": 1880 + }, + { + "epoch": 1.9174311926605505, + "grad_norm": 4.149360656738281, + "learning_rate": 1.768188513734731e-05, + "loss": 0.19, + "step": 1881 + }, + { + "epoch": 1.9184505606523956, + "grad_norm": 3.8560631275177, + "learning_rate": 1.753834456809716e-05, + "loss": 0.1926, + "step": 1882 + }, + { + "epoch": 1.9194699286442405, + "grad_norm": 7.31168270111084, + "learning_rate": 1.7395264981964927e-05, + "loss": 0.291, + "step": 1883 + }, + { + "epoch": 1.9204892966360856, + "grad_norm": 6.7560715675354, + "learning_rate": 1.7252648410802434e-05, + "loss": 0.6064, + "step": 1884 + }, + { + "epoch": 1.9215086646279307, + "grad_norm": 6.752120494842529, + "learning_rate": 1.71104968798863e-05, + "loss": 0.6472, + "step": 1885 + }, + { + "epoch": 1.9225280326197758, + "grad_norm": 3.197389602661133, + "learning_rate": 1.696881240788917e-05, + "loss": 0.2575, + "step": 1886 + }, + { + "epoch": 1.9235474006116209, + "grad_norm": 10.3342866897583, + "learning_rate": 1.6827597006851104e-05, + "loss": 0.4679, + "step": 1887 + }, + { + "epoch": 1.9245667686034658, + "grad_norm": 3.474792003631592, + "learning_rate": 1.6686852682150794e-05, + "loss": 0.1591, + "step": 1888 + }, + { + "epoch": 1.9255861365953109, + "grad_norm": 9.85100269317627, + "learning_rate": 1.654658143247747e-05, + "loss": 0.4278, + "step": 1889 + }, + { + "epoch": 1.926605504587156, + "grad_norm": 2.223907947540283, + "learning_rate": 1.640678524980212e-05, + "loss": 0.1457, + "step": 1890 + }, + { + "epoch": 1.927624872579001, + "grad_norm": 4.660088539123535, + "learning_rate": 1.6267466119349507e-05, + "loss": 0.3107, + "step": 1891 + }, + { + "epoch": 1.9286442405708462, + "grad_norm": 7.299057483673096, + "learning_rate": 1.6128626019569715e-05, + "loss": 0.3369, + "step": 1892 + }, + { + "epoch": 1.929663608562691, + "grad_norm": 5.282012462615967, + "learning_rate": 1.5990266922110324e-05, + "loss": 0.4062, + "step": 
1893 + }, + { + "epoch": 1.9306829765545361, + "grad_norm": 4.812608242034912, + "learning_rate": 1.5852390791788134e-05, + "loss": 0.3265, + "step": 1894 + }, + { + "epoch": 1.9317023445463812, + "grad_norm": 10.287457466125488, + "learning_rate": 1.5714999586561536e-05, + "loss": 0.6488, + "step": 1895 + }, + { + "epoch": 1.9327217125382263, + "grad_norm": 8.709385871887207, + "learning_rate": 1.5578095257502433e-05, + "loss": 0.5861, + "step": 1896 + }, + { + "epoch": 1.9337410805300714, + "grad_norm": 6.345649242401123, + "learning_rate": 1.544167974876885e-05, + "loss": 0.2017, + "step": 1897 + }, + { + "epoch": 1.9347604485219163, + "grad_norm": 4.160301208496094, + "learning_rate": 1.5305754997576922e-05, + "loss": 0.275, + "step": 1898 + }, + { + "epoch": 1.9357798165137616, + "grad_norm": 6.832029342651367, + "learning_rate": 1.5170322934173775e-05, + "loss": 0.2694, + "step": 1899 + }, + { + "epoch": 1.9367991845056065, + "grad_norm": 4.3108296394348145, + "learning_rate": 1.503538548180991e-05, + "loss": 0.1678, + "step": 1900 + }, + { + "epoch": 1.9378185524974516, + "grad_norm": 6.166708946228027, + "learning_rate": 1.4900944556711927e-05, + "loss": 0.2412, + "step": 1901 + }, + { + "epoch": 1.9388379204892967, + "grad_norm": 7.308569431304932, + "learning_rate": 1.4767002068055297e-05, + "loss": 0.6276, + "step": 1902 + }, + { + "epoch": 1.9398572884811416, + "grad_norm": 2.976868152618408, + "learning_rate": 1.4633559917937306e-05, + "loss": 0.1367, + "step": 1903 + }, + { + "epoch": 1.940876656472987, + "grad_norm": 6.624225616455078, + "learning_rate": 1.4500620001349968e-05, + "loss": 0.551, + "step": 1904 + }, + { + "epoch": 1.9418960244648318, + "grad_norm": 4.78963041305542, + "learning_rate": 1.436818420615319e-05, + "loss": 0.3431, + "step": 1905 + }, + { + "epoch": 1.9429153924566769, + "grad_norm": 5.378144264221191, + "learning_rate": 1.4236254413047896e-05, + "loss": 0.2006, + "step": 1906 + }, + { + "epoch": 1.943934760448522, + 
"grad_norm": 6.3299760818481445, + "learning_rate": 1.4104832495549402e-05, + "loss": 0.4514, + "step": 1907 + }, + { + "epoch": 1.9449541284403669, + "grad_norm": 3.8007140159606934, + "learning_rate": 1.3973920319960682e-05, + "loss": 0.232, + "step": 1908 + }, + { + "epoch": 1.9459734964322122, + "grad_norm": 8.437200546264648, + "learning_rate": 1.3843519745345923e-05, + "loss": 0.7087, + "step": 1909 + }, + { + "epoch": 1.946992864424057, + "grad_norm": 9.044495582580566, + "learning_rate": 1.3713632623504318e-05, + "loss": 0.3232, + "step": 1910 + }, + { + "epoch": 1.9480122324159022, + "grad_norm": 3.7584590911865234, + "learning_rate": 1.358426079894336e-05, + "loss": 0.142, + "step": 1911 + }, + { + "epoch": 1.9490316004077473, + "grad_norm": 3.0307207107543945, + "learning_rate": 1.3455406108853108e-05, + "loss": 0.1597, + "step": 1912 + }, + { + "epoch": 1.9500509683995921, + "grad_norm": 5.186105728149414, + "learning_rate": 1.3327070383079649e-05, + "loss": 0.2134, + "step": 1913 + }, + { + "epoch": 1.9510703363914375, + "grad_norm": 11.696715354919434, + "learning_rate": 1.3199255444099557e-05, + "loss": 0.5582, + "step": 1914 + }, + { + "epoch": 1.9520897043832823, + "grad_norm": 6.090602874755859, + "learning_rate": 1.3071963106993573e-05, + "loss": 0.184, + "step": 1915 + }, + { + "epoch": 1.9531090723751274, + "grad_norm": 7.272611618041992, + "learning_rate": 1.2945195179421266e-05, + "loss": 0.5319, + "step": 1916 + }, + { + "epoch": 1.9541284403669725, + "grad_norm": 6.290035724639893, + "learning_rate": 1.2818953461594969e-05, + "loss": 0.2426, + "step": 1917 + }, + { + "epoch": 1.9551478083588174, + "grad_norm": 6.6716156005859375, + "learning_rate": 1.2693239746254432e-05, + "loss": 0.3149, + "step": 1918 + }, + { + "epoch": 1.9561671763506627, + "grad_norm": 6.8174896240234375, + "learning_rate": 1.2568055818641366e-05, + "loss": 0.281, + "step": 1919 + }, + { + "epoch": 1.9571865443425076, + "grad_norm": 5.842833518981934, + 
"learning_rate": 1.2443403456474017e-05, + "loss": 0.2779, + "step": 1920 + }, + { + "epoch": 1.9582059123343527, + "grad_norm": 8.29177474975586, + "learning_rate": 1.2319284429921957e-05, + "loss": 0.3821, + "step": 1921 + }, + { + "epoch": 1.9592252803261978, + "grad_norm": 7.320400714874268, + "learning_rate": 1.2195700501580937e-05, + "loss": 0.5121, + "step": 1922 + }, + { + "epoch": 1.9602446483180427, + "grad_norm": 6.3962321281433105, + "learning_rate": 1.207265342644785e-05, + "loss": 0.2637, + "step": 1923 + }, + { + "epoch": 1.961264016309888, + "grad_norm": 9.778839111328125, + "learning_rate": 1.1950144951895819e-05, + "loss": 0.8175, + "step": 1924 + }, + { + "epoch": 1.9622833843017329, + "grad_norm": 7.268181800842285, + "learning_rate": 1.18281768176494e-05, + "loss": 0.3914, + "step": 1925 + }, + { + "epoch": 1.963302752293578, + "grad_norm": 3.256939172744751, + "learning_rate": 1.1706750755759854e-05, + "loss": 0.1444, + "step": 1926 + }, + { + "epoch": 1.964322120285423, + "grad_norm": 3.6003193855285645, + "learning_rate": 1.1585868490580503e-05, + "loss": 0.141, + "step": 1927 + }, + { + "epoch": 1.965341488277268, + "grad_norm": 7.784787654876709, + "learning_rate": 1.146553173874232e-05, + "loss": 0.291, + "step": 1928 + }, + { + "epoch": 1.9663608562691133, + "grad_norm": 4.663620471954346, + "learning_rate": 1.1345742209129589e-05, + "loss": 0.211, + "step": 1929 + }, + { + "epoch": 1.9673802242609582, + "grad_norm": 8.380531311035156, + "learning_rate": 1.1226501602855466e-05, + "loss": 0.3455, + "step": 1930 + }, + { + "epoch": 1.9683995922528033, + "grad_norm": 3.0026469230651855, + "learning_rate": 1.1107811613238034e-05, + "loss": 0.1692, + "step": 1931 + }, + { + "epoch": 1.9694189602446484, + "grad_norm": 8.605925559997559, + "learning_rate": 1.0989673925776039e-05, + "loss": 0.8896, + "step": 1932 + }, + { + "epoch": 1.9704383282364932, + "grad_norm": 6.9157233238220215, + "learning_rate": 1.0872090218125197e-05, + "loss": 
0.5577, + "step": 1933 + }, + { + "epoch": 1.9714576962283386, + "grad_norm": 5.527960777282715, + "learning_rate": 1.0755062160074103e-05, + "loss": 0.4554, + "step": 1934 + }, + { + "epoch": 1.9724770642201834, + "grad_norm": 4.29521369934082, + "learning_rate": 1.0638591413520782e-05, + "loss": 0.1784, + "step": 1935 + }, + { + "epoch": 1.9734964322120285, + "grad_norm": 6.10489559173584, + "learning_rate": 1.0522679632448879e-05, + "loss": 0.2751, + "step": 1936 + }, + { + "epoch": 1.9745158002038736, + "grad_norm": 5.862469673156738, + "learning_rate": 1.0407328462904247e-05, + "loss": 0.3531, + "step": 1937 + }, + { + "epoch": 1.9755351681957185, + "grad_norm": 9.201395988464355, + "learning_rate": 1.0292539542971625e-05, + "loss": 0.6773, + "step": 1938 + }, + { + "epoch": 1.9765545361875638, + "grad_norm": 5.540637969970703, + "learning_rate": 1.0178314502751312e-05, + "loss": 0.2948, + "step": 1939 + }, + { + "epoch": 1.9775739041794087, + "grad_norm": 8.971341133117676, + "learning_rate": 1.006465496433604e-05, + "loss": 0.4181, + "step": 1940 + }, + { + "epoch": 1.9785932721712538, + "grad_norm": 8.356114387512207, + "learning_rate": 9.951562541787929e-06, + "loss": 0.7386, + "step": 1941 + }, + { + "epoch": 1.979612640163099, + "grad_norm": 6.853330135345459, + "learning_rate": 9.839038841115566e-06, + "loss": 0.2926, + "step": 1942 + }, + { + "epoch": 1.9806320081549438, + "grad_norm": 5.473418712615967, + "learning_rate": 9.727085460251218e-06, + "loss": 0.2158, + "step": 1943 + }, + { + "epoch": 1.981651376146789, + "grad_norm": 7.808025360107422, + "learning_rate": 9.615703989028112e-06, + "loss": 0.6198, + "step": 1944 + }, + { + "epoch": 1.982670744138634, + "grad_norm": 4.961716651916504, + "learning_rate": 9.504896009157876e-06, + "loss": 0.2425, + "step": 1945 + }, + { + "epoch": 1.983690112130479, + "grad_norm": 8.417882919311523, + "learning_rate": 9.394663094208128e-06, + "loss": 0.4055, + "step": 1946 + }, + { + "epoch": 1.9847094801223242, 
+ "grad_norm": 5.619123935699463, + "learning_rate": 9.285006809579888e-06, + "loss": 0.4608, + "step": 1947 + }, + { + "epoch": 1.985728848114169, + "grad_norm": 6.5537567138671875, + "learning_rate": 9.175928712485798e-06, + "loss": 0.3028, + "step": 1948 + }, + { + "epoch": 1.9867482161060144, + "grad_norm": 6.92478084564209, + "learning_rate": 9.067430351927513e-06, + "loss": 0.7157, + "step": 1949 + }, + { + "epoch": 1.9877675840978593, + "grad_norm": 6.300648212432861, + "learning_rate": 8.959513268674141e-06, + "loss": 0.3149, + "step": 1950 + }, + { + "epoch": 1.9887869520897044, + "grad_norm": 3.5753180980682373, + "learning_rate": 8.852178995239952e-06, + "loss": 0.0783, + "step": 1951 + }, + { + "epoch": 1.9898063200815495, + "grad_norm": 2.463205337524414, + "learning_rate": 8.745429055863024e-06, + "loss": 0.1055, + "step": 1952 + }, + { + "epoch": 1.9908256880733946, + "grad_norm": 7.284326553344727, + "learning_rate": 8.639264966483196e-06, + "loss": 0.4019, + "step": 1953 + }, + { + "epoch": 1.9918450560652396, + "grad_norm": 7.7004852294921875, + "learning_rate": 8.533688234720937e-06, + "loss": 0.4939, + "step": 1954 + }, + { + "epoch": 1.9928644240570845, + "grad_norm": 6.602788925170898, + "learning_rate": 8.428700359855535e-06, + "loss": 0.2458, + "step": 1955 + }, + { + "epoch": 1.9938837920489296, + "grad_norm": 3.019469738006592, + "learning_rate": 8.324302832804237e-06, + "loss": 0.1383, + "step": 1956 + }, + { + "epoch": 1.9949031600407747, + "grad_norm": 3.9522721767425537, + "learning_rate": 8.220497136100602e-06, + "loss": 0.2046, + "step": 1957 + }, + { + "epoch": 1.9959225280326198, + "grad_norm": 8.255660057067871, + "learning_rate": 8.117284743873859e-06, + "loss": 0.254, + "step": 1958 + }, + { + "epoch": 1.996941896024465, + "grad_norm": 3.8426706790924072, + "learning_rate": 8.014667121827784e-06, + "loss": 0.1302, + "step": 1959 + }, + { + "epoch": 1.9979612640163098, + "grad_norm": 6.920229911804199, + "learning_rate": 
7.912645727219875e-06, + "loss": 0.595, + "step": 1960 + }, + { + "epoch": 1.9979612640163098, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7604166666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7934629917144775, + "eval_Qnli-dev-1024_cosine_ap": 0.7691003178954985, + "eval_Qnli-dev-1024_cosine_f1": 0.7450980392156862, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.734605610370636, + "eval_Qnli-dev-1024_cosine_mcc": 0.4794765594627558, + "eval_Qnli-dev-1024_cosine_precision": 0.6666666666666666, + "eval_Qnli-dev-1024_cosine_recall": 0.8444444444444444, + "eval_Qnli-dev_cosine_accuracy": 0.75, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6731052994728088, + "eval_Qnli-dev_cosine_ap": 0.7666441534536714, + "eval_Qnli-dev_cosine_f1": 0.7476635514018692, + "eval_Qnli-dev_cosine_f1_threshold": 0.6249356865882874, + "eval_Qnli-dev_cosine_mcc": 0.47737827504723207, + "eval_Qnli-dev_cosine_precision": 0.6451612903225806, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.3007451593875885, + "eval_global_dataset_runtime": 103.9733, + "eval_global_dataset_samples_per_second": 7.723, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8876934677640924, + "eval_sts-test-1024_spearman_cosine": 0.9111361153088742, + "eval_sts-test_pearson_cosine": 0.9070198458509664, + "eval_sts-test_spearman_cosine": 0.9207956588148845, + "step": 1960 + }, + { + "epoch": 1.998980632008155, + "grad_norm": 7.969030857086182, + "learning_rate": 7.81122200884072e-06, + "loss": 0.3089, + "step": 1961 + }, + { + "epoch": 2.0, + "grad_norm": 4.521041393280029, + "learning_rate": 7.710397406993387e-06, + "loss": 0.2925, + "step": 1962 + }, + { + "epoch": 2.001019367991845, + "grad_norm": 1.096656322479248, + "learning_rate": 7.610173353472977e-06, + 
"loss": 0.0737, + "step": 1963 + }, + { + "epoch": 2.00203873598369, + "grad_norm": 5.494912624359131, + "learning_rate": 7.510551271546301e-06, + "loss": 0.3361, + "step": 1964 + }, + { + "epoch": 2.003058103975535, + "grad_norm": 7.918971538543701, + "learning_rate": 7.411532575931657e-06, + "loss": 0.738, + "step": 1965 + }, + { + "epoch": 2.0040774719673804, + "grad_norm": 4.213651657104492, + "learning_rate": 7.313118672778768e-06, + "loss": 0.2693, + "step": 1966 + }, + { + "epoch": 2.0050968399592253, + "grad_norm": 10.979836463928223, + "learning_rate": 7.21531095964873e-06, + "loss": 0.9024, + "step": 1967 + }, + { + "epoch": 2.00611620795107, + "grad_norm": 8.45730209350586, + "learning_rate": 7.118110825494251e-06, + "loss": 0.3686, + "step": 1968 + }, + { + "epoch": 2.0071355759429155, + "grad_norm": 3.898918628692627, + "learning_rate": 7.0215196506399515e-06, + "loss": 0.1791, + "step": 1969 + }, + { + "epoch": 2.0081549439347604, + "grad_norm": 5.748415946960449, + "learning_rate": 6.925538806762638e-06, + "loss": 0.341, + "step": 1970 + }, + { + "epoch": 2.0091743119266057, + "grad_norm": 6.287943363189697, + "learning_rate": 6.830169656871966e-06, + "loss": 0.4936, + "step": 1971 + }, + { + "epoch": 2.0101936799184505, + "grad_norm": 6.493780136108398, + "learning_rate": 6.735413555290937e-06, + "loss": 0.5617, + "step": 1972 + }, + { + "epoch": 2.0112130479102954, + "grad_norm": 2.098747968673706, + "learning_rate": 6.641271847636854e-06, + "loss": 0.1672, + "step": 1973 + }, + { + "epoch": 2.0122324159021407, + "grad_norm": 7.320080757141113, + "learning_rate": 6.547745870801958e-06, + "loss": 0.4374, + "step": 1974 + }, + { + "epoch": 2.0132517838939856, + "grad_norm": 8.45764446258545, + "learning_rate": 6.4548369529347566e-06, + "loss": 0.5038, + "step": 1975 + }, + { + "epoch": 2.014271151885831, + "grad_norm": 1.3194546699523926, + "learning_rate": 6.36254641342085e-06, + "loss": 0.0422, + "step": 1976 + }, + { + "epoch": 2.015290519877676, 
+ "grad_norm": 7.725487232208252, + "learning_rate": 6.270875562864409e-06, + "loss": 0.3417, + "step": 1977 + }, + { + "epoch": 2.0163098878695207, + "grad_norm": 3.446214437484741, + "learning_rate": 6.179825703069486e-06, + "loss": 0.1576, + "step": 1978 + }, + { + "epoch": 2.017329255861366, + "grad_norm": 2.145524024963379, + "learning_rate": 6.089398127021534e-06, + "loss": 0.0915, + "step": 1979 + }, + { + "epoch": 2.018348623853211, + "grad_norm": 2.5441269874572754, + "learning_rate": 5.999594118869051e-06, + "loss": 0.1103, + "step": 1980 + }, + { + "epoch": 2.019367991845056, + "grad_norm": 6.149587154388428, + "learning_rate": 5.910414953905341e-06, + "loss": 0.3266, + "step": 1981 + }, + { + "epoch": 2.020387359836901, + "grad_norm": 6.195409297943115, + "learning_rate": 5.82186189855039e-06, + "loss": 0.1879, + "step": 1982 + }, + { + "epoch": 2.021406727828746, + "grad_norm": 6.459590435028076, + "learning_rate": 5.733936210332919e-06, + "loss": 0.5804, + "step": 1983 + }, + { + "epoch": 2.0224260958205913, + "grad_norm": 7.298165798187256, + "learning_rate": 5.646639137872467e-06, + "loss": 0.4159, + "step": 1984 + }, + { + "epoch": 2.023445463812436, + "grad_norm": 3.1634249687194824, + "learning_rate": 5.559971920861734e-06, + "loss": 0.1358, + "step": 1985 + }, + { + "epoch": 2.0244648318042815, + "grad_norm": 2.3591601848602295, + "learning_rate": 5.473935790048923e-06, + "loss": 0.0971, + "step": 1986 + }, + { + "epoch": 2.0254841997961264, + "grad_norm": 4.338249683380127, + "learning_rate": 5.388531967220211e-06, + "loss": 0.1704, + "step": 1987 + }, + { + "epoch": 2.0265035677879712, + "grad_norm": 6.169326305389404, + "learning_rate": 5.30376166518261e-06, + "loss": 0.221, + "step": 1988 + }, + { + "epoch": 2.0275229357798166, + "grad_norm": 3.112377882003784, + "learning_rate": 5.219626087746432e-06, + "loss": 0.1377, + "step": 1989 + }, + { + "epoch": 2.0285423037716614, + "grad_norm": 4.449329853057861, + "learning_rate": 
5.136126429708521e-06, + "loss": 0.2792, + "step": 1990 + }, + { + "epoch": 2.0295616717635068, + "grad_norm": 7.014669895172119, + "learning_rate": 5.053263876834957e-06, + "loss": 0.5533, + "step": 1991 + }, + { + "epoch": 2.0305810397553516, + "grad_norm": 5.11379861831665, + "learning_rate": 4.971039605844558e-06, + "loss": 0.243, + "step": 1992 + }, + { + "epoch": 2.0316004077471965, + "grad_norm": 4.427427768707275, + "learning_rate": 4.889454784391823e-06, + "loss": 0.2642, + "step": 1993 + }, + { + "epoch": 2.032619775739042, + "grad_norm": 2.5310215950012207, + "learning_rate": 4.808510571050695e-06, + "loss": 0.1542, + "step": 1994 + }, + { + "epoch": 2.0336391437308867, + "grad_norm": 4.451014518737793, + "learning_rate": 4.7282081152978056e-06, + "loss": 0.3514, + "step": 1995 + }, + { + "epoch": 2.034658511722732, + "grad_norm": 6.827959060668945, + "learning_rate": 4.6485485574963125e-06, + "loss": 0.192, + "step": 1996 + }, + { + "epoch": 2.035677879714577, + "grad_norm": 4.101830005645752, + "learning_rate": 4.569533028879719e-06, + "loss": 0.1563, + "step": 1997 + }, + { + "epoch": 2.036697247706422, + "grad_norm": 6.259415626525879, + "learning_rate": 4.491162651535729e-06, + "loss": 0.2977, + "step": 1998 + }, + { + "epoch": 2.037716615698267, + "grad_norm": 4.608965873718262, + "learning_rate": 4.413438538390363e-06, + "loss": 0.3345, + "step": 1999 + }, + { + "epoch": 2.038735983690112, + "grad_norm": 3.5626466274261475, + "learning_rate": 4.3363617931921396e-06, + "loss": 0.2128, + "step": 2000 + }, + { + "epoch": 2.0397553516819573, + "grad_norm": 3.7776856422424316, + "learning_rate": 4.2599335104964e-06, + "loss": 0.178, + "step": 2001 + }, + { + "epoch": 2.040774719673802, + "grad_norm": 4.22892951965332, + "learning_rate": 4.184154775649762e-06, + "loss": 0.2355, + "step": 2002 + }, + { + "epoch": 2.0417940876656475, + "grad_norm": 7.214908123016357, + "learning_rate": 4.109026664774718e-06, + "loss": 0.2723, + "step": 2003 + }, + { + 
"epoch": 2.0428134556574924, + "grad_norm": 4.9243645668029785, + "learning_rate": 4.034550244754337e-06, + "loss": 0.3108, + "step": 2004 + }, + { + "epoch": 2.0438328236493373, + "grad_norm": 2.550334930419922, + "learning_rate": 3.960726573217171e-06, + "loss": 0.1128, + "step": 2005 + }, + { + "epoch": 2.0448521916411826, + "grad_norm": 6.916386604309082, + "learning_rate": 3.887556698522071e-06, + "loss": 0.4542, + "step": 2006 + }, + { + "epoch": 2.0458715596330275, + "grad_norm": 11.782197952270508, + "learning_rate": 3.815041659743556e-06, + "loss": 1.0039, + "step": 2007 + }, + { + "epoch": 2.046890927624873, + "grad_norm": 6.364333152770996, + "learning_rate": 3.743182486656821e-06, + "loss": 0.4065, + "step": 2008 + }, + { + "epoch": 2.0479102956167177, + "grad_norm": 8.159380912780762, + "learning_rate": 3.671980199723274e-06, + "loss": 0.6474, + "step": 2009 + }, + { + "epoch": 2.0489296636085625, + "grad_norm": 9.521140098571777, + "learning_rate": 3.6014358100759204e-06, + "loss": 0.6126, + "step": 2010 + }, + { + "epoch": 2.049949031600408, + "grad_norm": 5.315720558166504, + "learning_rate": 3.5315503195051337e-06, + "loss": 0.2899, + "step": 2011 + }, + { + "epoch": 2.0509683995922527, + "grad_norm": 3.33170747756958, + "learning_rate": 3.462324720444271e-06, + "loss": 0.2236, + "step": 2012 + }, + { + "epoch": 2.051987767584098, + "grad_norm": 2.4479920864105225, + "learning_rate": 3.393759995955781e-06, + "loss": 0.1018, + "step": 2013 + }, + { + "epoch": 2.053007135575943, + "grad_norm": 5.11319637298584, + "learning_rate": 3.3258571197170017e-06, + "loss": 0.4546, + "step": 2014 + }, + { + "epoch": 2.054026503567788, + "grad_norm": 5.450096607208252, + "learning_rate": 3.2586170560066133e-06, + "loss": 0.1877, + "step": 2015 + }, + { + "epoch": 2.055045871559633, + "grad_norm": 3.155742645263672, + "learning_rate": 3.1920407596906455e-06, + "loss": 0.203, + "step": 2016 + }, + { + "epoch": 2.056065239551478, + "grad_norm": 3.579530715942383, + 
"learning_rate": 3.1261291762091527e-06, + "loss": 0.2151, + "step": 2017 + }, + { + "epoch": 2.0570846075433233, + "grad_norm": 6.1576032638549805, + "learning_rate": 3.0608832415626898e-06, + "loss": 0.4264, + "step": 2018 + }, + { + "epoch": 2.058103975535168, + "grad_norm": 6.620378017425537, + "learning_rate": 2.9963038822990174e-06, + "loss": 0.305, + "step": 2019 + }, + { + "epoch": 2.059123343527013, + "grad_norm": 3.6878819465637207, + "learning_rate": 2.932392015499974e-06, + "loss": 0.1407, + "step": 2020 + }, + { + "epoch": 2.0601427115188584, + "grad_norm": 5.848405838012695, + "learning_rate": 2.8691485487684246e-06, + "loss": 0.1914, + "step": 2021 + }, + { + "epoch": 2.0611620795107033, + "grad_norm": 3.685908317565918, + "learning_rate": 2.8065743802153875e-06, + "loss": 0.1305, + "step": 2022 + }, + { + "epoch": 2.0621814475025486, + "grad_norm": 6.540226936340332, + "learning_rate": 2.7446703984472797e-06, + "loss": 0.3055, + "step": 2023 + }, + { + "epoch": 2.0632008154943935, + "grad_norm": 2.917329788208008, + "learning_rate": 2.6834374825533025e-06, + "loss": 0.1309, + "step": 2024 + }, + { + "epoch": 2.0642201834862384, + "grad_norm": 8.328691482543945, + "learning_rate": 2.6228765020929415e-06, + "loss": 0.8764, + "step": 2025 + }, + { + "epoch": 2.0652395514780837, + "grad_norm": 3.314098358154297, + "learning_rate": 2.5629883170836366e-06, + "loss": 0.1458, + "step": 2026 + }, + { + "epoch": 2.0662589194699286, + "grad_norm": 3.6642093658447266, + "learning_rate": 2.503773777988522e-06, + "loss": 0.1433, + "step": 2027 + }, + { + "epoch": 2.067278287461774, + "grad_norm": 5.074317932128906, + "learning_rate": 2.4452337257044656e-06, + "loss": 0.3791, + "step": 2028 + }, + { + "epoch": 2.0682976554536188, + "grad_norm": 2.789017677307129, + "learning_rate": 2.387368991549954e-06, + "loss": 0.2152, + "step": 2029 + }, + { + "epoch": 2.0693170234454636, + "grad_norm": 7.425163269042969, + "learning_rate": 2.3301803972534785e-06, + "loss": 
0.4335, + "step": 2030 + }, + { + "epoch": 2.070336391437309, + "grad_norm": 4.4709062576293945, + "learning_rate": 2.273668754941677e-06, + "loss": 0.2469, + "step": 2031 + }, + { + "epoch": 2.071355759429154, + "grad_norm": 5.429105281829834, + "learning_rate": 2.217834867127977e-06, + "loss": 0.3997, + "step": 2032 + }, + { + "epoch": 2.072375127420999, + "grad_norm": 9.30593204498291, + "learning_rate": 2.1626795267010393e-06, + "loss": 0.4345, + "step": 2033 + }, + { + "epoch": 2.073394495412844, + "grad_norm": 5.841343402862549, + "learning_rate": 2.1082035169136373e-06, + "loss": 0.2812, + "step": 2034 + }, + { + "epoch": 2.074413863404689, + "grad_norm": 8.149571418762207, + "learning_rate": 2.054407611371445e-06, + "loss": 0.3564, + "step": 2035 + }, + { + "epoch": 2.0754332313965342, + "grad_norm": 4.423587799072266, + "learning_rate": 2.0012925740220624e-06, + "loss": 0.1496, + "step": 2036 + }, + { + "epoch": 2.076452599388379, + "grad_norm": 2.966921806335449, + "learning_rate": 1.9488591591441954e-06, + "loss": 0.1768, + "step": 2037 + }, + { + "epoch": 2.0774719673802244, + "grad_norm": 1.870186686515808, + "learning_rate": 1.8971081113369481e-06, + "loss": 0.1257, + "step": 2038 + }, + { + "epoch": 2.0784913353720693, + "grad_norm": 5.432903289794922, + "learning_rate": 1.8460401655092107e-06, + "loss": 0.3168, + "step": 2039 + }, + { + "epoch": 2.079510703363914, + "grad_norm": 2.7983829975128174, + "learning_rate": 1.795656046869254e-06, + "loss": 0.21, + "step": 2040 + }, + { + "epoch": 2.0805300713557595, + "grad_norm": 1.5770361423492432, + "learning_rate": 1.7459564709144116e-06, + "loss": 0.0651, + "step": 2041 + }, + { + "epoch": 2.0815494393476044, + "grad_norm": 2.986161947250366, + "learning_rate": 1.6969421434209376e-06, + "loss": 0.1315, + "step": 2042 + }, + { + "epoch": 2.0825688073394497, + "grad_norm": 8.22683048248291, + "learning_rate": 1.6486137604339758e-06, + "loss": 0.7834, + "step": 2043 + }, + { + "epoch": 
2.0835881753312946, + "grad_norm": 1.8245917558670044, + "learning_rate": 1.6009720082576728e-06, + "loss": 0.0943, + "step": 2044 + }, + { + "epoch": 2.0846075433231395, + "grad_norm": 4.468945503234863, + "learning_rate": 1.5540175634454368e-06, + "loss": 0.1908, + "step": 2045 + }, + { + "epoch": 2.085626911314985, + "grad_norm": 4.662402629852295, + "learning_rate": 1.5077510927902938e-06, + "loss": 0.1952, + "step": 2046 + }, + { + "epoch": 2.0866462793068297, + "grad_norm": 6.890171051025391, + "learning_rate": 1.4621732533155075e-06, + "loss": 0.6385, + "step": 2047 + }, + { + "epoch": 2.087665647298675, + "grad_norm": 2.867135524749756, + "learning_rate": 1.4172846922651528e-06, + "loss": 0.203, + "step": 2048 + }, + { + "epoch": 2.08868501529052, + "grad_norm": 6.184851169586182, + "learning_rate": 1.3730860470949902e-06, + "loss": 0.1727, + "step": 2049 + }, + { + "epoch": 2.0897043832823647, + "grad_norm": 3.2211556434631348, + "learning_rate": 1.3295779454633451e-06, + "loss": 0.1612, + "step": 2050 + }, + { + "epoch": 2.09072375127421, + "grad_norm": 5.037843704223633, + "learning_rate": 1.2867610052223144e-06, + "loss": 0.3213, + "step": 2051 + }, + { + "epoch": 2.091743119266055, + "grad_norm": 10.782960891723633, + "learning_rate": 1.2446358344088193e-06, + "loss": 0.4739, + "step": 2052 + }, + { + "epoch": 2.0927624872579003, + "grad_norm": 9.444459915161133, + "learning_rate": 1.2032030312361554e-06, + "loss": 0.4755, + "step": 2053 + }, + { + "epoch": 2.093781855249745, + "grad_norm": 6.151844024658203, + "learning_rate": 1.1624631840853495e-06, + "loss": 0.6616, + "step": 2054 + }, + { + "epoch": 2.09480122324159, + "grad_norm": 6.036664009094238, + "learning_rate": 1.1224168714968786e-06, + "loss": 0.3557, + "step": 2055 + }, + { + "epoch": 2.0958205912334353, + "grad_norm": 7.177772521972656, + "learning_rate": 1.0830646621624529e-06, + "loss": 0.4678, + "step": 2056 + }, + { + "epoch": 2.09683995922528, + "grad_norm": 3.4542040824890137, + 
"learning_rate": 1.0444071149169122e-06, + "loss": 0.2447, + "step": 2057 + }, + { + "epoch": 2.0978593272171255, + "grad_norm": 1.9717903137207031, + "learning_rate": 1.0064447787303144e-06, + "loss": 0.1077, + "step": 2058 + }, + { + "epoch": 2.0978593272171255, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7708333333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7858775854110718, + "eval_Qnli-dev-1024_cosine_ap": 0.7717931866855599, + "eval_Qnli-dev-1024_cosine_f1": 0.7450980392156862, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7231606245040894, + "eval_Qnli-dev-1024_cosine_mcc": 0.4794765594627558, + "eval_Qnli-dev-1024_cosine_precision": 0.6666666666666666, + "eval_Qnli-dev-1024_cosine_recall": 0.8444444444444444, + "eval_Qnli-dev_cosine_accuracy": 0.75, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6960161924362183, + "eval_Qnli-dev_cosine_ap": 0.7668004254073291, + "eval_Qnli-dev_cosine_f1": 0.7524752475247526, + "eval_Qnli-dev_cosine_f1_threshold": 0.643132209777832, + "eval_Qnli-dev_cosine_mcc": 0.4975007565834654, + "eval_Qnli-dev_cosine_precision": 0.6785714285714286, + "eval_Qnli-dev_cosine_recall": 0.8444444444444444, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.28185296058654785, + "eval_global_dataset_runtime": 103.9195, + "eval_global_dataset_samples_per_second": 7.727, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8880021668150686, + "eval_sts-test-1024_spearman_cosine": 0.9116855751722728, + "eval_sts-test_pearson_cosine": 0.9070605432857464, + "eval_sts-test_spearman_cosine": 0.9208122251709873, + "step": 2058 + }, + { + "epoch": 2.0988786952089704, + "grad_norm": 4.328063011169434, + "learning_rate": 9.691781927001154e-07, + "loss": 0.4307, + "step": 2059 + }, + { + "epoch": 2.0998980632008153, + "grad_norm": 3.5777828693389893, + 
"learning_rate": 9.326078860435349e-07, + "loss": 0.1913, + "step": 2060 + }, + { + "epoch": 2.1009174311926606, + "grad_norm": 7.239850044250488, + "learning_rate": 8.967343780900361e-07, + "loss": 0.4133, + "step": 2061 + }, + { + "epoch": 2.1019367991845055, + "grad_norm": 2.4862873554229736, + "learning_rate": 8.615581782739468e-07, + "loss": 0.1214, + "step": 2062 + }, + { + "epoch": 2.102956167176351, + "grad_norm": 4.599758625030518, + "learning_rate": 8.270797861272217e-07, + "loss": 0.2374, + "step": 2063 + }, + { + "epoch": 2.1039755351681957, + "grad_norm": 3.0807836055755615, + "learning_rate": 7.932996912723644e-07, + "loss": 0.1138, + "step": 2064 + }, + { + "epoch": 2.1049949031600406, + "grad_norm": 4.003174304962158, + "learning_rate": 7.602183734154278e-07, + "loss": 0.1756, + "step": 2065 + }, + { + "epoch": 2.106014271151886, + "grad_norm": 6.551426887512207, + "learning_rate": 7.278363023392964e-07, + "loss": 0.3942, + "step": 2066 + }, + { + "epoch": 2.1070336391437308, + "grad_norm": 8.321428298950195, + "learning_rate": 6.961539378968929e-07, + "loss": 0.3343, + "step": 2067 + }, + { + "epoch": 2.108053007135576, + "grad_norm": 2.4432618618011475, + "learning_rate": 6.651717300047656e-07, + "loss": 0.1298, + "step": 2068 + }, + { + "epoch": 2.109072375127421, + "grad_norm": 4.8150954246521, + "learning_rate": 6.348901186365941e-07, + "loss": 0.1862, + "step": 2069 + }, + { + "epoch": 2.1100917431192663, + "grad_norm": 10.590511322021484, + "learning_rate": 6.053095338170389e-07, + "loss": 0.8467, + "step": 2070 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 4.546511650085449, + "learning_rate": 5.764303956155515e-07, + "loss": 0.2192, + "step": 2071 + }, + { + "epoch": 2.112130479102956, + "grad_norm": 4.88209342956543, + "learning_rate": 5.482531141404679e-07, + "loss": 0.2541, + "step": 2072 + }, + { + "epoch": 2.1131498470948014, + "grad_norm": 4.770115375518799, + "learning_rate": 5.20778089533136e-07, + "loss": 0.4093, + "step": 
2073 + }, + { + "epoch": 2.1141692150866462, + "grad_norm": 7.157455921173096, + "learning_rate": 4.940057119622976e-07, + "loss": 0.5352, + "step": 2074 + }, + { + "epoch": 2.115188583078491, + "grad_norm": 1.989957571029663, + "learning_rate": 4.679363616184651e-07, + "loss": 0.1393, + "step": 2075 + }, + { + "epoch": 2.1162079510703364, + "grad_norm": 5.129319190979004, + "learning_rate": 4.425704087085925e-07, + "loss": 0.1769, + "step": 2076 + }, + { + "epoch": 2.1172273190621813, + "grad_norm": 3.754472255706787, + "learning_rate": 4.1790821345079055e-07, + "loss": 0.1795, + "step": 2077 + }, + { + "epoch": 2.1182466870540266, + "grad_norm": 8.386497497558594, + "learning_rate": 3.939501260692036e-07, + "loss": 0.3381, + "step": 2078 + }, + { + "epoch": 2.1192660550458715, + "grad_norm": 5.126825332641602, + "learning_rate": 3.706964867890572e-07, + "loss": 0.3702, + "step": 2079 + }, + { + "epoch": 2.120285423037717, + "grad_norm": 5.695154666900635, + "learning_rate": 3.481476258318017e-07, + "loss": 0.3147, + "step": 2080 + }, + { + "epoch": 2.1213047910295617, + "grad_norm": 8.309889793395996, + "learning_rate": 3.263038634104487e-07, + "loss": 0.3419, + "step": 2081 + }, + { + "epoch": 2.1223241590214066, + "grad_norm": 6.508575439453125, + "learning_rate": 3.051655097249917e-07, + "loss": 0.2264, + "step": 2082 + }, + { + "epoch": 2.123343527013252, + "grad_norm": 5.034822940826416, + "learning_rate": 2.847328649580483e-07, + "loss": 0.2863, + "step": 2083 + }, + { + "epoch": 2.124362895005097, + "grad_norm": 5.786725997924805, + "learning_rate": 2.6500621927054715e-07, + "loss": 0.3757, + "step": 2084 + }, + { + "epoch": 2.1253822629969417, + "grad_norm": 3.350344181060791, + "learning_rate": 2.459858527976366e-07, + "loss": 0.1941, + "step": 2085 + }, + { + "epoch": 2.126401630988787, + "grad_norm": 4.888509273529053, + "learning_rate": 2.276720356446882e-07, + "loss": 0.3484, + "step": 2086 + }, + { + "epoch": 2.127420998980632, + "grad_norm": 
2.2694365978240967, + "learning_rate": 2.1006502788349924e-07, + "loss": 0.1174, + "step": 2087 + }, + { + "epoch": 2.128440366972477, + "grad_norm": 6.6389594078063965, + "learning_rate": 1.9316507954854067e-07, + "loss": 0.3486, + "step": 2088 + }, + { + "epoch": 2.129459734964322, + "grad_norm": 7.36968469619751, + "learning_rate": 1.7697243063346524e-07, + "loss": 0.2587, + "step": 2089 + }, + { + "epoch": 2.1304791029561674, + "grad_norm": 2.8399484157562256, + "learning_rate": 1.6148731108764913e-07, + "loss": 0.166, + "step": 2090 + }, + { + "epoch": 2.1314984709480123, + "grad_norm": 6.889630317687988, + "learning_rate": 1.4670994081297795e-07, + "loss": 0.5485, + "step": 2091 + }, + { + "epoch": 2.132517838939857, + "grad_norm": 10.132883071899414, + "learning_rate": 1.3264052966066033e-07, + "loss": 0.5755, + "step": 2092 + }, + { + "epoch": 2.1335372069317025, + "grad_norm": 7.8372602462768555, + "learning_rate": 1.1927927742831358e-07, + "loss": 0.2181, + "step": 2093 + }, + { + "epoch": 2.1345565749235473, + "grad_norm": 3.8295810222625732, + "learning_rate": 1.0662637385708274e-07, + "loss": 0.2995, + "step": 2094 + }, + { + "epoch": 2.1355759429153927, + "grad_norm": 2.522563934326172, + "learning_rate": 9.468199862895377e-08, + "loss": 0.1704, + "step": 2095 + }, + { + "epoch": 2.1365953109072375, + "grad_norm": 6.671058654785156, + "learning_rate": 8.344632136422225e-08, + "loss": 0.3037, + "step": 2096 + }, + { + "epoch": 2.1376146788990824, + "grad_norm": 2.7199294567108154, + "learning_rate": 7.291950161905092e-08, + "loss": 0.1496, + "step": 2097 + }, + { + "epoch": 2.1386340468909277, + "grad_norm": 6.42370080947876, + "learning_rate": 6.310168888324919e-08, + "loss": 0.3936, + "step": 2098 + }, + { + "epoch": 2.1396534148827726, + "grad_norm": 6.266992092132568, + "learning_rate": 5.399302257809713e-08, + "loss": 0.4842, + "step": 2099 + }, + { + "epoch": 2.140672782874618, + "grad_norm": 6.234472274780273, + "learning_rate": 
4.559363205440814e-08, + "loss": 0.4433, + "step": 2100 + }, + { + "epoch": 2.141692150866463, + "grad_norm": 1.4575471878051758, + "learning_rate": 3.790363659066931e-08, + "loss": 0.0814, + "step": 2101 + }, + { + "epoch": 2.1427115188583077, + "grad_norm": 5.8357367515563965, + "learning_rate": 3.0923145391364984e-08, + "loss": 0.308, + "step": 2102 + }, + { + "epoch": 2.143730886850153, + "grad_norm": 4.658002853393555, + "learning_rate": 2.4652257585394688e-08, + "loss": 0.2528, + "step": 2103 + }, + { + "epoch": 2.144750254841998, + "grad_norm": 6.6633219718933105, + "learning_rate": 1.909106222471313e-08, + "loss": 0.4332, + "step": 2104 + }, + { + "epoch": 2.145769622833843, + "grad_norm": 9.741914749145508, + "learning_rate": 1.4239638283014555e-08, + "loss": 1.0274, + "step": 2105 + }, + { + "epoch": 2.146788990825688, + "grad_norm": 3.0744028091430664, + "learning_rate": 1.009805465464475e-08, + "loss": 0.248, + "step": 2106 + }, + { + "epoch": 2.147808358817533, + "grad_norm": 3.3139028549194336, + "learning_rate": 6.666370153624035e-09, + "loss": 0.2274, + "step": 2107 + }, + { + "epoch": 2.1488277268093783, + "grad_norm": 3.4172706604003906, + "learning_rate": 3.9446335127757414e-09, + "loss": 0.1735, + "step": 2108 + }, + { + "epoch": 2.149847094801223, + "grad_norm": 5.286227226257324, + "learning_rate": 1.932883383093387e-09, + "loss": 0.3032, + "step": 2109 + }, + { + "epoch": 2.1508664627930685, + "grad_norm": 4.259544372558594, + "learning_rate": 6.311483331244983e-10, + "loss": 0.1403, + "step": 2110 + }, + { + "epoch": 2.1518858307849134, + "grad_norm": 7.1230292320251465, + "learning_rate": 3.944684862089432e-11, + "loss": 0.2983, + "step": 2111 + }, + { + "epoch": 2.1529051987767582, + "grad_norm": 5.71543025970459, + "learning_rate": 9.999984221266776e-05, + "loss": 0.4975, + "step": 2112 + }, + { + "epoch": 2.1539245667686036, + "grad_norm": 8.9960355758667, + "learning_rate": 9.999901383189654e-05, + "loss": 0.3564, + "step": 2113 + }, + 
{ + "epoch": 2.1549439347604484, + "grad_norm": 5.388774394989014, + "learning_rate": 9.999747542260143e-05, + "loss": 0.2478, + "step": 2114 + }, + { + "epoch": 2.1559633027522938, + "grad_norm": 4.084414482116699, + "learning_rate": 9.999522700662917e-05, + "loss": 0.2289, + "step": 2115 + }, + { + "epoch": 2.1569826707441386, + "grad_norm": 4.7534332275390625, + "learning_rate": 9.999226861590915e-05, + "loss": 0.3633, + "step": 2116 + }, + { + "epoch": 2.1580020387359835, + "grad_norm": 5.211404323577881, + "learning_rate": 9.998860029245308e-05, + "loss": 0.3248, + "step": 2117 + }, + { + "epoch": 2.159021406727829, + "grad_norm": 4.009915351867676, + "learning_rate": 9.998422208835423e-05, + "loss": 0.2411, + "step": 2118 + }, + { + "epoch": 2.1600407747196737, + "grad_norm": 4.3494181632995605, + "learning_rate": 9.997913406578685e-05, + "loss": 0.2621, + "step": 2119 + }, + { + "epoch": 2.161060142711519, + "grad_norm": 5.335855960845947, + "learning_rate": 9.997333629700516e-05, + "loss": 0.2409, + "step": 2120 + }, + { + "epoch": 2.162079510703364, + "grad_norm": 3.7476727962493896, + "learning_rate": 9.996682886434243e-05, + "loss": 0.1745, + "step": 2121 + }, + { + "epoch": 2.163098878695209, + "grad_norm": 9.698570251464844, + "learning_rate": 9.995961186020974e-05, + "loss": 0.7162, + "step": 2122 + }, + { + "epoch": 2.164118246687054, + "grad_norm": 4.056085109710693, + "learning_rate": 9.995168538709467e-05, + "loss": 0.2209, + "step": 2123 + }, + { + "epoch": 2.165137614678899, + "grad_norm": 6.227935791015625, + "learning_rate": 9.994304955755988e-05, + "loss": 0.3706, + "step": 2124 + }, + { + "epoch": 2.1661569826707443, + "grad_norm": 3.197852373123169, + "learning_rate": 9.993370449424153e-05, + "loss": 0.108, + "step": 2125 + }, + { + "epoch": 2.167176350662589, + "grad_norm": 4.697956085205078, + "learning_rate": 9.992365032984743e-05, + "loss": 0.2343, + "step": 2126 + }, + { + "epoch": 2.168195718654434, + "grad_norm": 8.649667739868164, + 
"learning_rate": 9.991288720715528e-05, + "loss": 0.7377, + "step": 2127 + }, + { + "epoch": 2.1692150866462794, + "grad_norm": 3.0363082885742188, + "learning_rate": 9.990141527901058e-05, + "loss": 0.1477, + "step": 2128 + }, + { + "epoch": 2.1702344546381243, + "grad_norm": 8.80601692199707, + "learning_rate": 9.988923470832445e-05, + "loss": 0.4212, + "step": 2129 + }, + { + "epoch": 2.1712538226299696, + "grad_norm": 5.953189373016357, + "learning_rate": 9.987634566807139e-05, + "loss": 0.4576, + "step": 2130 + }, + { + "epoch": 2.1722731906218145, + "grad_norm": 2.836698055267334, + "learning_rate": 9.98627483412867e-05, + "loss": 0.1307, + "step": 2131 + }, + { + "epoch": 2.1732925586136593, + "grad_norm": 10.231998443603516, + "learning_rate": 9.984844292106399e-05, + "loss": 0.7796, + "step": 2132 + }, + { + "epoch": 2.1743119266055047, + "grad_norm": 5.002100467681885, + "learning_rate": 9.98334296105524e-05, + "loss": 0.2185, + "step": 2133 + }, + { + "epoch": 2.1753312945973495, + "grad_norm": 5.0506696701049805, + "learning_rate": 9.981770862295373e-05, + "loss": 0.4168, + "step": 2134 + }, + { + "epoch": 2.176350662589195, + "grad_norm": 5.320004463195801, + "learning_rate": 9.980128018151936e-05, + "loss": 0.2968, + "step": 2135 + }, + { + "epoch": 2.1773700305810397, + "grad_norm": 4.058662414550781, + "learning_rate": 9.978414451954709e-05, + "loss": 0.2187, + "step": 2136 + }, + { + "epoch": 2.1783893985728846, + "grad_norm": 4.8133134841918945, + "learning_rate": 9.976630188037796e-05, + "loss": 0.4316, + "step": 2137 + }, + { + "epoch": 2.17940876656473, + "grad_norm": 2.8492605686187744, + "learning_rate": 9.974775251739262e-05, + "loss": 0.1237, + "step": 2138 + }, + { + "epoch": 2.180428134556575, + "grad_norm": 3.489551544189453, + "learning_rate": 9.972849669400775e-05, + "loss": 0.2164, + "step": 2139 + }, + { + "epoch": 2.18144750254842, + "grad_norm": 6.3751702308654785, + "learning_rate": 9.970853468367245e-05, + "loss": 0.4144, + 
"step": 2140 + }, + { + "epoch": 2.182466870540265, + "grad_norm": 5.416356563568115, + "learning_rate": 9.968786676986424e-05, + "loss": 0.3848, + "step": 2141 + }, + { + "epoch": 2.18348623853211, + "grad_norm": 3.666043281555176, + "learning_rate": 9.966649324608511e-05, + "loss": 0.1683, + "step": 2142 + }, + { + "epoch": 2.184505606523955, + "grad_norm": 2.8194453716278076, + "learning_rate": 9.964441441585722e-05, + "loss": 0.0857, + "step": 2143 + }, + { + "epoch": 2.1855249745158, + "grad_norm": 12.224459648132324, + "learning_rate": 9.962163059271878e-05, + "loss": 0.6639, + "step": 2144 + }, + { + "epoch": 2.1865443425076454, + "grad_norm": 7.330053806304932, + "learning_rate": 9.959814210021943e-05, + "loss": 0.243, + "step": 2145 + }, + { + "epoch": 2.1875637104994903, + "grad_norm": 7.457223892211914, + "learning_rate": 9.957394927191577e-05, + "loss": 0.4547, + "step": 2146 + }, + { + "epoch": 2.1885830784913356, + "grad_norm": 7.0147223472595215, + "learning_rate": 9.95490524513665e-05, + "loss": 0.5578, + "step": 2147 + }, + { + "epoch": 2.1896024464831805, + "grad_norm": 5.137038707733154, + "learning_rate": 9.952345199212769e-05, + "loss": 0.2503, + "step": 2148 + }, + { + "epoch": 2.1906218144750254, + "grad_norm": 6.037380218505859, + "learning_rate": 9.949714825774763e-05, + "loss": 0.2688, + "step": 2149 + }, + { + "epoch": 2.1916411824668707, + "grad_norm": 5.003573894500732, + "learning_rate": 9.94701416217617e-05, + "loss": 0.3163, + "step": 2150 + }, + { + "epoch": 2.1926605504587156, + "grad_norm": 9.454754829406738, + "learning_rate": 9.944243246768712e-05, + "loss": 0.4275, + "step": 2151 + }, + { + "epoch": 2.1936799184505604, + "grad_norm": 3.1916465759277344, + "learning_rate": 9.941402118901744e-05, + "loss": 0.2407, + "step": 2152 + }, + { + "epoch": 2.1946992864424058, + "grad_norm": 6.89423131942749, + "learning_rate": 9.938490818921697e-05, + "loss": 0.1208, + "step": 2153 + }, + { + "epoch": 2.1957186544342506, + "grad_norm": 
8.483088493347168, + "learning_rate": 9.935509388171509e-05, + "loss": 0.3453, + "step": 2154 + }, + { + "epoch": 2.196738022426096, + "grad_norm": 4.580751895904541, + "learning_rate": 9.93245786899003e-05, + "loss": 0.2303, + "step": 2155 + }, + { + "epoch": 2.197757390417941, + "grad_norm": 5.408090114593506, + "learning_rate": 9.929336304711432e-05, + "loss": 0.2171, + "step": 2156 + }, + { + "epoch": 2.197757390417941, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8115701675415039, + "eval_Qnli-dev-1024_cosine_ap": 0.7199766664494036, + "eval_Qnli-dev-1024_cosine_f1": 0.7254901960784313, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7445250749588013, + "eval_Qnli-dev-1024_cosine_mcc": 0.43697448216965834, + "eval_Qnli-dev-1024_cosine_precision": 0.6491228070175439, + "eval_Qnli-dev-1024_cosine_recall": 0.8222222222222222, + "eval_Qnli-dev_cosine_accuracy": 0.75, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6777580976486206, + "eval_Qnli-dev_cosine_ap": 0.7484611306073241, + "eval_Qnli-dev_cosine_f1": 0.7476635514018692, + "eval_Qnli-dev_cosine_f1_threshold": 0.6229462027549744, + "eval_Qnli-dev_cosine_mcc": 0.47737827504723207, + "eval_Qnli-dev_cosine_precision": 0.6451612903225806, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.2789512574672699, + "eval_global_dataset_runtime": 103.8218, + "eval_global_dataset_samples_per_second": 7.734, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8786118022629635, + "eval_sts-test-1024_spearman_cosine": 0.904928185675876, + "eval_sts-test_pearson_cosine": 0.9066404858754178, + "eval_sts-test_spearman_cosine": 0.9204510786079447, + "step": 2156 + }, + { + "epoch": 2.198776758409786, + "grad_norm": 
5.818151950836182, + "learning_rate": 9.92614473966458e-05, + "loss": 0.3691, + "step": 2157 + }, + { + "epoch": 2.199796126401631, + "grad_norm": 7.688577175140381, + "learning_rate": 9.922883219172413e-05, + "loss": 0.283, + "step": 2158 + }, + { + "epoch": 2.200815494393476, + "grad_norm": 9.936375617980957, + "learning_rate": 9.919551789551295e-05, + "loss": 0.8282, + "step": 2159 + }, + { + "epoch": 2.2018348623853212, + "grad_norm": 5.860223770141602, + "learning_rate": 9.91615049811036e-05, + "loss": 0.365, + "step": 2160 + }, + { + "epoch": 2.202854230377166, + "grad_norm": 1.4134682416915894, + "learning_rate": 9.912679393150843e-05, + "loss": 0.0961, + "step": 2161 + }, + { + "epoch": 2.203873598369011, + "grad_norm": 8.950481414794922, + "learning_rate": 9.909138523965385e-05, + "loss": 0.4634, + "step": 2162 + }, + { + "epoch": 2.2048929663608563, + "grad_norm": 6.5395331382751465, + "learning_rate": 9.905527940837338e-05, + "loss": 0.376, + "step": 2163 + }, + { + "epoch": 2.205912334352701, + "grad_norm": 7.763592720031738, + "learning_rate": 9.901847695040054e-05, + "loss": 0.3719, + "step": 2164 + }, + { + "epoch": 2.2069317023445465, + "grad_norm": 12.144659042358398, + "learning_rate": 9.898097838836156e-05, + "loss": 0.578, + "step": 2165 + }, + { + "epoch": 2.2079510703363914, + "grad_norm": 2.760550022125244, + "learning_rate": 9.89427842547679e-05, + "loss": 0.0921, + "step": 2166 + }, + { + "epoch": 2.2089704383282367, + "grad_norm": 7.8105363845825195, + "learning_rate": 9.890389509200874e-05, + "loss": 0.293, + "step": 2167 + }, + { + "epoch": 2.2099898063200816, + "grad_norm": 5.359696388244629, + "learning_rate": 9.886431145234328e-05, + "loss": 0.3036, + "step": 2168 + }, + { + "epoch": 2.2110091743119265, + "grad_norm": 6.331681251525879, + "learning_rate": 9.882403389789288e-05, + "loss": 0.2374, + "step": 2169 + }, + { + "epoch": 2.2120285423037718, + "grad_norm": 4.270999908447266, + "learning_rate": 9.878306300063305e-05, + "loss": 
0.2242, + "step": 2170 + }, + { + "epoch": 2.2130479102956166, + "grad_norm": 7.299144268035889, + "learning_rate": 9.874139934238538e-05, + "loss": 0.4379, + "step": 2171 + }, + { + "epoch": 2.214067278287462, + "grad_norm": 6.275256156921387, + "learning_rate": 9.869904351480928e-05, + "loss": 0.2515, + "step": 2172 + }, + { + "epoch": 2.215086646279307, + "grad_norm": 3.000127077102661, + "learning_rate": 9.865599611939351e-05, + "loss": 0.1617, + "step": 2173 + }, + { + "epoch": 2.2161060142711517, + "grad_norm": 8.104231834411621, + "learning_rate": 9.86122577674477e-05, + "loss": 0.4121, + "step": 2174 + }, + { + "epoch": 2.217125382262997, + "grad_norm": 2.574758291244507, + "learning_rate": 9.856782908009363e-05, + "loss": 0.2169, + "step": 2175 + }, + { + "epoch": 2.218144750254842, + "grad_norm": 5.668185234069824, + "learning_rate": 9.85227106882565e-05, + "loss": 0.4749, + "step": 2176 + }, + { + "epoch": 2.2191641182466872, + "grad_norm": 9.118616104125977, + "learning_rate": 9.847690323265581e-05, + "loss": 0.8064, + "step": 2177 + }, + { + "epoch": 2.220183486238532, + "grad_norm": 4.4046831130981445, + "learning_rate": 9.843040736379639e-05, + "loss": 0.1712, + "step": 2178 + }, + { + "epoch": 2.221202854230377, + "grad_norm": 4.443243503570557, + "learning_rate": 9.838322374195915e-05, + "loss": 0.2074, + "step": 2179 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 6.054386615753174, + "learning_rate": 9.833535303719163e-05, + "loss": 0.3482, + "step": 2180 + }, + { + "epoch": 2.223241590214067, + "grad_norm": 4.216024398803711, + "learning_rate": 9.82867959292986e-05, + "loss": 0.2334, + "step": 2181 + }, + { + "epoch": 2.2242609582059125, + "grad_norm": 7.953604221343994, + "learning_rate": 9.823755310783224e-05, + "loss": 0.3724, + "step": 2182 + }, + { + "epoch": 2.2252803261977574, + "grad_norm": 7.373310089111328, + "learning_rate": 9.818762527208251e-05, + "loss": 0.2847, + "step": 2183 + }, + { + "epoch": 2.2262996941896023, + 
"grad_norm": 6.493857383728027, + "learning_rate": 9.813701313106716e-05, + "loss": 0.3983, + "step": 2184 + }, + { + "epoch": 2.2273190621814476, + "grad_norm": 7.406630992889404, + "learning_rate": 9.808571740352163e-05, + "loss": 0.2759, + "step": 2185 + }, + { + "epoch": 2.2283384301732925, + "grad_norm": 8.359546661376953, + "learning_rate": 9.803373881788888e-05, + "loss": 0.5751, + "step": 2186 + }, + { + "epoch": 2.229357798165138, + "grad_norm": 4.752198219299316, + "learning_rate": 9.798107811230906e-05, + "loss": 0.1683, + "step": 2187 + }, + { + "epoch": 2.2303771661569827, + "grad_norm": 6.510313034057617, + "learning_rate": 9.792773603460897e-05, + "loss": 0.2357, + "step": 2188 + }, + { + "epoch": 2.2313965341488275, + "grad_norm": 6.2854743003845215, + "learning_rate": 9.787371334229145e-05, + "loss": 0.3081, + "step": 2189 + }, + { + "epoch": 2.232415902140673, + "grad_norm": 4.730719566345215, + "learning_rate": 9.781901080252473e-05, + "loss": 0.3411, + "step": 2190 + }, + { + "epoch": 2.2334352701325177, + "grad_norm": 9.087937355041504, + "learning_rate": 9.776362919213137e-05, + "loss": 0.4046, + "step": 2191 + }, + { + "epoch": 2.234454638124363, + "grad_norm": 6.309107780456543, + "learning_rate": 9.77075692975774e-05, + "loss": 0.2311, + "step": 2192 + }, + { + "epoch": 2.235474006116208, + "grad_norm": 5.710136413574219, + "learning_rate": 9.76508319149609e-05, + "loss": 0.2457, + "step": 2193 + }, + { + "epoch": 2.236493374108053, + "grad_norm": 9.256056785583496, + "learning_rate": 9.759341785000105e-05, + "loss": 0.4165, + "step": 2194 + }, + { + "epoch": 2.237512742099898, + "grad_norm": 3.0695488452911377, + "learning_rate": 9.753532791802637e-05, + "loss": 0.1263, + "step": 2195 + }, + { + "epoch": 2.238532110091743, + "grad_norm": 7.062183856964111, + "learning_rate": 9.747656294396334e-05, + "loss": 0.3795, + "step": 2196 + }, + { + "epoch": 2.2395514780835883, + "grad_norm": 9.748225212097168, + "learning_rate": 
9.74171237623245e-05, + "loss": 0.5094, + "step": 2197 + }, + { + "epoch": 2.240570846075433, + "grad_norm": 4.3179826736450195, + "learning_rate": 9.735701121719686e-05, + "loss": 0.1671, + "step": 2198 + }, + { + "epoch": 2.241590214067278, + "grad_norm": 3.598177671432495, + "learning_rate": 9.729622616222966e-05, + "loss": 0.1327, + "step": 2199 + }, + { + "epoch": 2.2426095820591234, + "grad_norm": 4.8626813888549805, + "learning_rate": 9.723476946062243e-05, + "loss": 0.4628, + "step": 2200 + }, + { + "epoch": 2.2436289500509683, + "grad_norm": 7.495136260986328, + "learning_rate": 9.71726419851125e-05, + "loss": 0.3733, + "step": 2201 + }, + { + "epoch": 2.2446483180428136, + "grad_norm": 3.3497166633605957, + "learning_rate": 9.710984461796297e-05, + "loss": 0.1483, + "step": 2202 + }, + { + "epoch": 2.2456676860346585, + "grad_norm": 10.693584442138672, + "learning_rate": 9.704637825094983e-05, + "loss": 0.5756, + "step": 2203 + }, + { + "epoch": 2.2466870540265034, + "grad_norm": 3.6354808807373047, + "learning_rate": 9.698224378534943e-05, + "loss": 0.2273, + "step": 2204 + }, + { + "epoch": 2.2477064220183487, + "grad_norm": 5.719125747680664, + "learning_rate": 9.691744213192579e-05, + "loss": 0.2651, + "step": 2205 + }, + { + "epoch": 2.2487257900101936, + "grad_norm": 2.961449384689331, + "learning_rate": 9.685197421091747e-05, + "loss": 0.1644, + "step": 2206 + }, + { + "epoch": 2.249745158002039, + "grad_norm": 7.821652412414551, + "learning_rate": 9.67858409520247e-05, + "loss": 0.8976, + "step": 2207 + }, + { + "epoch": 2.2507645259938838, + "grad_norm": 8.642051696777344, + "learning_rate": 9.671904329439592e-05, + "loss": 0.4614, + "step": 2208 + }, + { + "epoch": 2.2517838939857286, + "grad_norm": 3.5357613563537598, + "learning_rate": 9.665158218661473e-05, + "loss": 0.1483, + "step": 2209 + }, + { + "epoch": 2.252803261977574, + "grad_norm": 4.354801177978516, + "learning_rate": 9.658345858668622e-05, + "loss": 0.22, + "step": 2210 + }, + { 
+ "epoch": 2.253822629969419, + "grad_norm": 5.700054168701172, + "learning_rate": 9.65146734620235e-05, + "loss": 0.2612, + "step": 2211 + }, + { + "epoch": 2.254841997961264, + "grad_norm": 10.76186752319336, + "learning_rate": 9.64452277894338e-05, + "loss": 1.1265, + "step": 2212 + }, + { + "epoch": 2.255861365953109, + "grad_norm": 6.577109336853027, + "learning_rate": 9.637512255510472e-05, + "loss": 0.5792, + "step": 2213 + }, + { + "epoch": 2.2568807339449544, + "grad_norm": 3.7338356971740723, + "learning_rate": 9.630435875459029e-05, + "loss": 0.2643, + "step": 2214 + }, + { + "epoch": 2.2579001019367992, + "grad_norm": 6.350942611694336, + "learning_rate": 9.623293739279661e-05, + "loss": 0.3964, + "step": 2215 + }, + { + "epoch": 2.258919469928644, + "grad_norm": 5.956366539001465, + "learning_rate": 9.616085948396778e-05, + "loss": 0.5892, + "step": 2216 + }, + { + "epoch": 2.2599388379204894, + "grad_norm": 7.00551700592041, + "learning_rate": 9.608812605167139e-05, + "loss": 0.357, + "step": 2217 + }, + { + "epoch": 2.2609582059123343, + "grad_norm": 8.318918228149414, + "learning_rate": 9.60147381287841e-05, + "loss": 0.5915, + "step": 2218 + }, + { + "epoch": 2.261977573904179, + "grad_norm": 3.9956605434417725, + "learning_rate": 9.594069675747681e-05, + "loss": 0.2284, + "step": 2219 + }, + { + "epoch": 2.2629969418960245, + "grad_norm": 6.184433937072754, + "learning_rate": 9.586600298919992e-05, + "loss": 0.2741, + "step": 2220 + }, + { + "epoch": 2.2640163098878694, + "grad_norm": 6.979036331176758, + "learning_rate": 9.579065788466853e-05, + "loss": 0.3373, + "step": 2221 + }, + { + "epoch": 2.2650356778797147, + "grad_norm": 3.9439306259155273, + "learning_rate": 9.571466251384722e-05, + "loss": 0.1557, + "step": 2222 + }, + { + "epoch": 2.2660550458715596, + "grad_norm": 5.736883640289307, + "learning_rate": 9.563801795593483e-05, + "loss": 0.4192, + "step": 2223 + }, + { + "epoch": 2.267074413863405, + "grad_norm": 9.300738334655762, + 
"learning_rate": 9.556072529934935e-05, + "loss": 0.6405, + "step": 2224 + }, + { + "epoch": 2.26809378185525, + "grad_norm": 6.677163600921631, + "learning_rate": 9.548278564171219e-05, + "loss": 0.2207, + "step": 2225 + }, + { + "epoch": 2.2691131498470947, + "grad_norm": 1.9049301147460938, + "learning_rate": 9.54042000898328e-05, + "loss": 0.1769, + "step": 2226 + }, + { + "epoch": 2.27013251783894, + "grad_norm": 2.9124698638916016, + "learning_rate": 9.532496975969283e-05, + "loss": 0.1876, + "step": 2227 + }, + { + "epoch": 2.271151885830785, + "grad_norm": 6.220628261566162, + "learning_rate": 9.524509577643043e-05, + "loss": 0.4072, + "step": 2228 + }, + { + "epoch": 2.2721712538226297, + "grad_norm": 4.811798572540283, + "learning_rate": 9.516457927432402e-05, + "loss": 0.3481, + "step": 2229 + }, + { + "epoch": 2.273190621814475, + "grad_norm": 5.673060417175293, + "learning_rate": 9.508342139677648e-05, + "loss": 0.4266, + "step": 2230 + }, + { + "epoch": 2.27420998980632, + "grad_norm": 5.483099460601807, + "learning_rate": 9.500162329629866e-05, + "loss": 0.4333, + "step": 2231 + }, + { + "epoch": 2.2752293577981653, + "grad_norm": 5.340078353881836, + "learning_rate": 9.49191861344932e-05, + "loss": 0.2413, + "step": 2232 + }, + { + "epoch": 2.27624872579001, + "grad_norm": 9.624187469482422, + "learning_rate": 9.483611108203788e-05, + "loss": 0.5317, + "step": 2233 + }, + { + "epoch": 2.2772680937818555, + "grad_norm": 4.5000786781311035, + "learning_rate": 9.475239931866913e-05, + "loss": 0.2643, + "step": 2234 + }, + { + "epoch": 2.2782874617737003, + "grad_norm": 7.982000827789307, + "learning_rate": 9.466805203316514e-05, + "loss": 0.7617, + "step": 2235 + }, + { + "epoch": 2.279306829765545, + "grad_norm": 5.092596054077148, + "learning_rate": 9.458307042332914e-05, + "loss": 0.3159, + "step": 2236 + }, + { + "epoch": 2.2803261977573905, + "grad_norm": 7.607308387756348, + "learning_rate": 9.449745569597232e-05, + "loss": 0.3722, + "step": 2237 
+ }, + { + "epoch": 2.2813455657492354, + "grad_norm": 9.206737518310547, + "learning_rate": 9.441120906689658e-05, + "loss": 0.7328, + "step": 2238 + }, + { + "epoch": 2.2823649337410803, + "grad_norm": 8.724227905273438, + "learning_rate": 9.432433176087738e-05, + "loss": 0.4747, + "step": 2239 + }, + { + "epoch": 2.2833843017329256, + "grad_norm": 7.079002380371094, + "learning_rate": 9.423682501164641e-05, + "loss": 0.3334, + "step": 2240 + }, + { + "epoch": 2.2844036697247705, + "grad_norm": 5.796501636505127, + "learning_rate": 9.4148690061874e-05, + "loss": 0.3518, + "step": 2241 + }, + { + "epoch": 2.285423037716616, + "grad_norm": 7.118459701538086, + "learning_rate": 9.405992816315125e-05, + "loss": 0.242, + "step": 2242 + }, + { + "epoch": 2.2864424057084607, + "grad_norm": 6.54943323135376, + "learning_rate": 9.397054057597275e-05, + "loss": 0.4397, + "step": 2243 + }, + { + "epoch": 2.287461773700306, + "grad_norm": 10.270355224609375, + "learning_rate": 9.388052856971816e-05, + "loss": 0.7398, + "step": 2244 + }, + { + "epoch": 2.288481141692151, + "grad_norm": 8.216813087463379, + "learning_rate": 9.378989342263464e-05, + "loss": 0.7289, + "step": 2245 + }, + { + "epoch": 2.2895005096839958, + "grad_norm": 6.499934196472168, + "learning_rate": 9.369863642181828e-05, + "loss": 0.2978, + "step": 2246 + }, + { + "epoch": 2.290519877675841, + "grad_norm": 3.5899710655212402, + "learning_rate": 9.360675886319617e-05, + "loss": 0.2361, + "step": 2247 + }, + { + "epoch": 2.291539245667686, + "grad_norm": 3.1094970703125, + "learning_rate": 9.351426205150774e-05, + "loss": 0.2009, + "step": 2248 + }, + { + "epoch": 2.292558613659531, + "grad_norm": 5.628443717956543, + "learning_rate": 9.342114730028647e-05, + "loss": 0.5583, + "step": 2249 + }, + { + "epoch": 2.293577981651376, + "grad_norm": 3.848273277282715, + "learning_rate": 9.332741593184094e-05, + "loss": 0.151, + "step": 2250 + }, + { + "epoch": 2.294597349643221, + "grad_norm": 7.327558517456055, + 
"learning_rate": 9.323306927723637e-05, + "loss": 0.3008, + "step": 2251 + }, + { + "epoch": 2.2956167176350664, + "grad_norm": 8.393162727355957, + "learning_rate": 9.313810867627549e-05, + "loss": 0.4939, + "step": 2252 + }, + { + "epoch": 2.2966360856269112, + "grad_norm": 6.88380241394043, + "learning_rate": 9.304253547747956e-05, + "loss": 0.2644, + "step": 2253 + }, + { + "epoch": 2.2976554536187566, + "grad_norm": 4.7066850662231445, + "learning_rate": 9.294635103806933e-05, + "loss": 0.1587, + "step": 2254 + }, + { + "epoch": 2.2976554536187566, + "eval_Qnli-dev-1024_cosine_accuracy": 0.71875, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8054693937301636, + "eval_Qnli-dev-1024_cosine_ap": 0.7510897581326281, + "eval_Qnli-dev-1024_cosine_f1": 0.7256637168141592, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.6971149444580078, + "eval_Qnli-dev-1024_cosine_mcc": 0.419062972501429, + "eval_Qnli-dev-1024_cosine_precision": 0.6029411764705882, + "eval_Qnli-dev-1024_cosine_recall": 0.9111111111111111, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7125545740127563, + "eval_Qnli-dev_cosine_ap": 0.7513469031575852, + "eval_Qnli-dev_cosine_f1": 0.7339449541284404, + "eval_Qnli-dev_cosine_f1_threshold": 0.6222972869873047, + "eval_Qnli-dev_cosine_mcc": 0.4428074427700477, + "eval_Qnli-dev_cosine_precision": 0.625, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9791666865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.35465556383132935, + "eval_global_dataset_runtime": 103.8621, + "eval_global_dataset_samples_per_second": 7.731, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9791666865348816, + "eval_sts-test-1024_pearson_cosine": 0.884708804231128, + "eval_sts-test-1024_spearman_cosine": 0.9101560388186445, + "eval_sts-test_pearson_cosine": 0.9064669487568003, + 
"eval_sts-test_spearman_cosine": 0.9197963544883909, + "step": 2254 + }, + { + "epoch": 2.2986748216106014, + "grad_norm": 4.420230865478516, + "learning_rate": 9.284955672394562e-05, + "loss": 0.1962, + "step": 2255 + }, + { + "epoch": 2.2996941896024463, + "grad_norm": 2.1525185108184814, + "learning_rate": 9.275215390967009e-05, + "loss": 0.1461, + "step": 2256 + }, + { + "epoch": 2.3007135575942916, + "grad_norm": 6.365402698516846, + "learning_rate": 9.265414397844552e-05, + "loss": 0.1951, + "step": 2257 + }, + { + "epoch": 2.3017329255861365, + "grad_norm": 6.767193794250488, + "learning_rate": 9.255552832209623e-05, + "loss": 0.284, + "step": 2258 + }, + { + "epoch": 2.302752293577982, + "grad_norm": 4.424633502960205, + "learning_rate": 9.245630834104848e-05, + "loss": 0.3417, + "step": 2259 + }, + { + "epoch": 2.3037716615698267, + "grad_norm": 4.7739152908325195, + "learning_rate": 9.235648544431044e-05, + "loss": 0.2067, + "step": 2260 + }, + { + "epoch": 2.3047910295616716, + "grad_norm": 3.6877939701080322, + "learning_rate": 9.225606104945208e-05, + "loss": 0.2394, + "step": 2261 + }, + { + "epoch": 2.305810397553517, + "grad_norm": 7.127565383911133, + "learning_rate": 9.215503658258524e-05, + "loss": 0.4127, + "step": 2262 + }, + { + "epoch": 2.306829765545362, + "grad_norm": 4.339338302612305, + "learning_rate": 9.205341347834325e-05, + "loss": 0.295, + "step": 2263 + }, + { + "epoch": 2.307849133537207, + "grad_norm": 4.622450351715088, + "learning_rate": 9.19511931798607e-05, + "loss": 0.3318, + "step": 2264 + }, + { + "epoch": 2.308868501529052, + "grad_norm": 6.046075820922852, + "learning_rate": 9.18483771387527e-05, + "loss": 0.3391, + "step": 2265 + }, + { + "epoch": 2.309887869520897, + "grad_norm": 7.373628616333008, + "learning_rate": 9.174496681509453e-05, + "loss": 0.6988, + "step": 2266 + }, + { + "epoch": 2.310907237512742, + "grad_norm": 5.423571586608887, + "learning_rate": 9.164096367740072e-05, + "loss": 0.2089, + "step": 2267 + 
}, + { + "epoch": 2.311926605504587, + "grad_norm": 10.064982414245605, + "learning_rate": 9.15363692026043e-05, + "loss": 0.7888, + "step": 2268 + }, + { + "epoch": 2.3129459734964324, + "grad_norm": 6.538580417633057, + "learning_rate": 9.143118487603576e-05, + "loss": 0.2692, + "step": 2269 + }, + { + "epoch": 2.3139653414882773, + "grad_norm": 5.713818550109863, + "learning_rate": 9.132541219140205e-05, + "loss": 0.3081, + "step": 2270 + }, + { + "epoch": 2.314984709480122, + "grad_norm": 6.522933006286621, + "learning_rate": 9.121905265076523e-05, + "loss": 0.5452, + "step": 2271 + }, + { + "epoch": 2.3160040774719675, + "grad_norm": 7.0937275886535645, + "learning_rate": 9.111210776452124e-05, + "loss": 0.561, + "step": 2272 + }, + { + "epoch": 2.3170234454638123, + "grad_norm": 6.514686584472656, + "learning_rate": 9.100457905137836e-05, + "loss": 0.4489, + "step": 2273 + }, + { + "epoch": 2.3180428134556577, + "grad_norm": 5.026601791381836, + "learning_rate": 9.089646803833586e-05, + "loss": 0.2322, + "step": 2274 + }, + { + "epoch": 2.3190621814475025, + "grad_norm": 3.829639196395874, + "learning_rate": 9.078777626066212e-05, + "loss": 0.3165, + "step": 2275 + }, + { + "epoch": 2.3200815494393474, + "grad_norm": 6.508183479309082, + "learning_rate": 9.067850526187276e-05, + "loss": 0.3822, + "step": 2276 + }, + { + "epoch": 2.3211009174311927, + "grad_norm": 8.799606323242188, + "learning_rate": 9.056865659370889e-05, + "loss": 0.6136, + "step": 2277 + }, + { + "epoch": 2.3221202854230376, + "grad_norm": 2.762552499771118, + "learning_rate": 9.045823181611506e-05, + "loss": 0.1497, + "step": 2278 + }, + { + "epoch": 2.323139653414883, + "grad_norm": 6.088753700256348, + "learning_rate": 9.034723249721708e-05, + "loss": 0.2277, + "step": 2279 + }, + { + "epoch": 2.324159021406728, + "grad_norm": 10.172626495361328, + "learning_rate": 9.023566021329963e-05, + "loss": 0.7456, + "step": 2280 + }, + { + "epoch": 2.325178389398573, + "grad_norm": 
8.9097900390625, + "learning_rate": 9.012351654878408e-05, + "loss": 0.7613, + "step": 2281 + }, + { + "epoch": 2.326197757390418, + "grad_norm": 4.431034564971924, + "learning_rate": 9.00108030962058e-05, + "loss": 0.3183, + "step": 2282 + }, + { + "epoch": 2.327217125382263, + "grad_norm": 4.566897392272949, + "learning_rate": 8.989752145619174e-05, + "loss": 0.2711, + "step": 2283 + }, + { + "epoch": 2.328236493374108, + "grad_norm": 5.168494701385498, + "learning_rate": 8.978367323743748e-05, + "loss": 0.3639, + "step": 2284 + }, + { + "epoch": 2.329255861365953, + "grad_norm": 2.9187684059143066, + "learning_rate": 8.966926005668465e-05, + "loss": 0.1569, + "step": 2285 + }, + { + "epoch": 2.330275229357798, + "grad_norm": 10.322551727294922, + "learning_rate": 8.955428353869766e-05, + "loss": 0.6622, + "step": 2286 + }, + { + "epoch": 2.3312945973496433, + "grad_norm": 2.910454750061035, + "learning_rate": 8.94387453162409e-05, + "loss": 0.1854, + "step": 2287 + }, + { + "epoch": 2.332313965341488, + "grad_norm": 5.9969892501831055, + "learning_rate": 8.932264703005537e-05, + "loss": 0.2545, + "step": 2288 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 3.5394508838653564, + "learning_rate": 8.920599032883552e-05, + "loss": 0.2507, + "step": 2289 + }, + { + "epoch": 2.3343527013251784, + "grad_norm": 3.04937744140625, + "learning_rate": 8.90887768692057e-05, + "loss": 0.1284, + "step": 2290 + }, + { + "epoch": 2.3353720693170237, + "grad_norm": 1.6997092962265015, + "learning_rate": 8.89710083156968e-05, + "loss": 0.0984, + "step": 2291 + }, + { + "epoch": 2.3363914373088686, + "grad_norm": 4.270977973937988, + "learning_rate": 8.885268634072233e-05, + "loss": 0.2855, + "step": 2292 + }, + { + "epoch": 2.3374108053007134, + "grad_norm": 4.65965461730957, + "learning_rate": 8.873381262455503e-05, + "loss": 0.2992, + "step": 2293 + }, + { + "epoch": 2.3384301732925588, + "grad_norm": 8.1211519241333, + "learning_rate": 8.861438885530283e-05, + "loss": 
0.3614, + "step": 2294 + }, + { + "epoch": 2.3394495412844036, + "grad_norm": 7.206984519958496, + "learning_rate": 8.849441672888481e-05, + "loss": 0.3101, + "step": 2295 + }, + { + "epoch": 2.3404689092762485, + "grad_norm": 7.5257110595703125, + "learning_rate": 8.837389794900713e-05, + "loss": 0.443, + "step": 2296 + }, + { + "epoch": 2.341488277268094, + "grad_norm": 6.125361919403076, + "learning_rate": 8.825283422713905e-05, + "loss": 0.4293, + "step": 2297 + }, + { + "epoch": 2.3425076452599387, + "grad_norm": 4.915850639343262, + "learning_rate": 8.813122728248842e-05, + "loss": 0.2135, + "step": 2298 + }, + { + "epoch": 2.343527013251784, + "grad_norm": 8.4254789352417, + "learning_rate": 8.800907884197725e-05, + "loss": 0.4949, + "step": 2299 + }, + { + "epoch": 2.344546381243629, + "grad_norm": 2.7700583934783936, + "learning_rate": 8.788639064021721e-05, + "loss": 0.1009, + "step": 2300 + }, + { + "epoch": 2.3455657492354742, + "grad_norm": 2.5243680477142334, + "learning_rate": 8.776316441948529e-05, + "loss": 0.0916, + "step": 2301 + }, + { + "epoch": 2.346585117227319, + "grad_norm": 2.952864408493042, + "learning_rate": 8.763940192969853e-05, + "loss": 0.2077, + "step": 2302 + }, + { + "epoch": 2.347604485219164, + "grad_norm": 5.396402359008789, + "learning_rate": 8.75151049283895e-05, + "loss": 0.2194, + "step": 2303 + }, + { + "epoch": 2.3486238532110093, + "grad_norm": 6.208902359008789, + "learning_rate": 8.739027518068148e-05, + "loss": 0.4228, + "step": 2304 + }, + { + "epoch": 2.349643221202854, + "grad_norm": 4.609702110290527, + "learning_rate": 8.726491445926292e-05, + "loss": 0.1663, + "step": 2305 + }, + { + "epoch": 2.350662589194699, + "grad_norm": 2.649308681488037, + "learning_rate": 8.713902454436285e-05, + "loss": 0.1789, + "step": 2306 + }, + { + "epoch": 2.3516819571865444, + "grad_norm": 8.164649963378906, + "learning_rate": 8.701260722372497e-05, + "loss": 0.2843, + "step": 2307 + }, + { + "epoch": 2.3527013251783893, + 
"grad_norm": 7.712100505828857, + "learning_rate": 8.68856642925829e-05, + "loss": 0.6079, + "step": 2308 + }, + { + "epoch": 2.3537206931702346, + "grad_norm": 4.999669551849365, + "learning_rate": 8.675819755363412e-05, + "loss": 0.2216, + "step": 2309 + }, + { + "epoch": 2.3547400611620795, + "grad_norm": 6.541714668273926, + "learning_rate": 8.663020881701491e-05, + "loss": 0.3121, + "step": 2310 + }, + { + "epoch": 2.3557594291539248, + "grad_norm": 6.196269512176514, + "learning_rate": 8.650169990027399e-05, + "loss": 0.4209, + "step": 2311 + }, + { + "epoch": 2.3567787971457697, + "grad_norm": 4.6961894035339355, + "learning_rate": 8.637267262834737e-05, + "loss": 0.1548, + "step": 2312 + }, + { + "epoch": 2.3577981651376145, + "grad_norm": 7.67713737487793, + "learning_rate": 8.624312883353211e-05, + "loss": 0.2983, + "step": 2313 + }, + { + "epoch": 2.35881753312946, + "grad_norm": 5.699832439422607, + "learning_rate": 8.611307035546023e-05, + "loss": 0.2876, + "step": 2314 + }, + { + "epoch": 2.3598369011213047, + "grad_norm": 3.3547933101654053, + "learning_rate": 8.59824990410727e-05, + "loss": 0.1027, + "step": 2315 + }, + { + "epoch": 2.3608562691131496, + "grad_norm": 7.3823323249816895, + "learning_rate": 8.585141674459329e-05, + "loss": 0.5218, + "step": 2316 + }, + { + "epoch": 2.361875637104995, + "grad_norm": 2.786806583404541, + "learning_rate": 8.571982532750217e-05, + "loss": 0.1536, + "step": 2317 + }, + { + "epoch": 2.36289500509684, + "grad_norm": 5.496379375457764, + "learning_rate": 8.558772665850932e-05, + "loss": 0.4112, + "step": 2318 + }, + { + "epoch": 2.363914373088685, + "grad_norm": 8.679461479187012, + "learning_rate": 8.545512261352812e-05, + "loss": 0.418, + "step": 2319 + }, + { + "epoch": 2.36493374108053, + "grad_norm": 8.090753555297852, + "learning_rate": 8.532201507564898e-05, + "loss": 0.3728, + "step": 2320 + }, + { + "epoch": 2.3659531090723753, + "grad_norm": 5.165255069732666, + "learning_rate": 
8.518840593511202e-05, + "loss": 0.2554, + "step": 2321 + }, + { + "epoch": 2.36697247706422, + "grad_norm": 6.425000190734863, + "learning_rate": 8.505429708928068e-05, + "loss": 0.4931, + "step": 2322 + }, + { + "epoch": 2.367991845056065, + "grad_norm": 2.3066000938415527, + "learning_rate": 8.491969044261472e-05, + "loss": 0.1478, + "step": 2323 + }, + { + "epoch": 2.3690112130479104, + "grad_norm": 3.497620105743408, + "learning_rate": 8.478458790664292e-05, + "loss": 0.2185, + "step": 2324 + }, + { + "epoch": 2.3700305810397553, + "grad_norm": 7.057107448577881, + "learning_rate": 8.46489913999363e-05, + "loss": 0.6166, + "step": 2325 + }, + { + "epoch": 2.3710499490316006, + "grad_norm": 4.212674140930176, + "learning_rate": 8.451290284808048e-05, + "loss": 0.2404, + "step": 2326 + }, + { + "epoch": 2.3720693170234455, + "grad_norm": 5.787489891052246, + "learning_rate": 8.437632418364878e-05, + "loss": 0.371, + "step": 2327 + }, + { + "epoch": 2.3730886850152904, + "grad_norm": 8.768516540527344, + "learning_rate": 8.423925734617428e-05, + "loss": 0.3587, + "step": 2328 + }, + { + "epoch": 2.3741080530071357, + "grad_norm": 5.419494152069092, + "learning_rate": 8.410170428212276e-05, + "loss": 0.1999, + "step": 2329 + }, + { + "epoch": 2.3751274209989806, + "grad_norm": 3.336252450942993, + "learning_rate": 8.396366694486469e-05, + "loss": 0.1607, + "step": 2330 + }, + { + "epoch": 2.376146788990826, + "grad_norm": 5.8776535987854, + "learning_rate": 8.38251472946476e-05, + "loss": 0.4547, + "step": 2331 + }, + { + "epoch": 2.3771661569826708, + "grad_norm": 3.843593120574951, + "learning_rate": 8.368614729856843e-05, + "loss": 0.1942, + "step": 2332 + }, + { + "epoch": 2.3781855249745156, + "grad_norm": 6.0981574058532715, + "learning_rate": 8.354666893054533e-05, + "loss": 0.4157, + "step": 2333 + }, + { + "epoch": 2.379204892966361, + "grad_norm": 3.4683003425598145, + "learning_rate": 8.340671417128971e-05, + "loss": 0.2771, + "step": 2334 + }, + { + 
"epoch": 2.380224260958206, + "grad_norm": 9.179664611816406, + "learning_rate": 8.326628500827825e-05, + "loss": 0.6997, + "step": 2335 + }, + { + "epoch": 2.381243628950051, + "grad_norm": 2.7431480884552, + "learning_rate": 8.312538343572454e-05, + "loss": 0.1812, + "step": 2336 + }, + { + "epoch": 2.382262996941896, + "grad_norm": 8.77397632598877, + "learning_rate": 8.29840114545507e-05, + "loss": 0.6393, + "step": 2337 + }, + { + "epoch": 2.383282364933741, + "grad_norm": 3.6477086544036865, + "learning_rate": 8.284217107235908e-05, + "loss": 0.2243, + "step": 2338 + }, + { + "epoch": 2.3843017329255862, + "grad_norm": 9.626862525939941, + "learning_rate": 8.269986430340379e-05, + "loss": 0.6306, + "step": 2339 + }, + { + "epoch": 2.385321100917431, + "grad_norm": 6.5339131355285645, + "learning_rate": 8.25570931685621e-05, + "loss": 0.2627, + "step": 2340 + }, + { + "epoch": 2.3863404689092764, + "grad_norm": 4.381488800048828, + "learning_rate": 8.241385969530535e-05, + "loss": 0.1925, + "step": 2341 + }, + { + "epoch": 2.3873598369011213, + "grad_norm": 5.464411735534668, + "learning_rate": 8.227016591767085e-05, + "loss": 0.5205, + "step": 2342 + }, + { + "epoch": 2.388379204892966, + "grad_norm": 7.5830841064453125, + "learning_rate": 8.212601387623235e-05, + "loss": 0.3821, + "step": 2343 + }, + { + "epoch": 2.3893985728848115, + "grad_norm": 6.62937068939209, + "learning_rate": 8.198140561807157e-05, + "loss": 0.3973, + "step": 2344 + }, + { + "epoch": 2.3904179408766564, + "grad_norm": 5.732163906097412, + "learning_rate": 8.183634319674867e-05, + "loss": 0.4019, + "step": 2345 + }, + { + "epoch": 2.3914373088685017, + "grad_norm": 5.45266580581665, + "learning_rate": 8.169082867227349e-05, + "loss": 0.2915, + "step": 2346 + }, + { + "epoch": 2.3924566768603466, + "grad_norm": 5.806284427642822, + "learning_rate": 8.154486411107596e-05, + "loss": 0.3495, + "step": 2347 + }, + { + "epoch": 2.3934760448521915, + "grad_norm": 3.3837454319000244, + 
"learning_rate": 8.139845158597712e-05, + "loss": 0.1303, + "step": 2348 + }, + { + "epoch": 2.3944954128440368, + "grad_norm": 6.189777374267578, + "learning_rate": 8.125159317615926e-05, + "loss": 0.316, + "step": 2349 + }, + { + "epoch": 2.3955147808358817, + "grad_norm": 6.784691333770752, + "learning_rate": 8.110429096713679e-05, + "loss": 0.428, + "step": 2350 + }, + { + "epoch": 2.396534148827727, + "grad_norm": 7.712477684020996, + "learning_rate": 8.095654705072632e-05, + "loss": 0.78, + "step": 2351 + }, + { + "epoch": 2.397553516819572, + "grad_norm": 6.170988082885742, + "learning_rate": 8.080836352501717e-05, + "loss": 0.2817, + "step": 2352 + }, + { + "epoch": 2.397553516819572, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7604166666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8025434017181396, + "eval_Qnli-dev-1024_cosine_ap": 0.7565294376349275, + "eval_Qnli-dev-1024_cosine_f1": 0.7294117647058822, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.786746621131897, + "eval_Qnli-dev-1024_cosine_mcc": 0.5186710015444639, + "eval_Qnli-dev-1024_cosine_precision": 0.775, + "eval_Qnli-dev-1024_cosine_recall": 0.6888888888888889, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6690382957458496, + "eval_Qnli-dev_cosine_ap": 0.7519786295263348, + "eval_Qnli-dev_cosine_f1": 0.7547169811320755, + "eval_Qnli-dev_cosine_f1_threshold": 0.6388322114944458, + "eval_Qnli-dev_cosine_mcc": 0.494679410480399, + "eval_Qnli-dev_cosine_precision": 0.6557377049180327, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.32961907982826233, + "eval_global_dataset_runtime": 104.2744, + "eval_global_dataset_samples_per_second": 7.701, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 
0.8876446213573181, + "eval_sts-test-1024_spearman_cosine": 0.9120654064123624, + "eval_sts-test_pearson_cosine": 0.9064454982972331, + "eval_sts-test_spearman_cosine": 0.9216604226034537, + "step": 2352 + }, + { + "epoch": 2.3985728848114167, + "grad_norm": 4.633171081542969, + "learning_rate": 8.065974249434133e-05, + "loss": 0.4222, + "step": 2353 + }, + { + "epoch": 2.399592252803262, + "grad_norm": 4.3140997886657715, + "learning_rate": 8.051068606924395e-05, + "loss": 0.273, + "step": 2354 + }, + { + "epoch": 2.400611620795107, + "grad_norm": 6.594303131103516, + "learning_rate": 8.036119636645307e-05, + "loss": 0.4256, + "step": 2355 + }, + { + "epoch": 2.4016309887869522, + "grad_norm": 2.8384170532226562, + "learning_rate": 8.021127550884959e-05, + "loss": 0.1358, + "step": 2356 + }, + { + "epoch": 2.402650356778797, + "grad_norm": 8.013402938842773, + "learning_rate": 8.006092562543714e-05, + "loss": 0.6191, + "step": 2357 + }, + { + "epoch": 2.4036697247706424, + "grad_norm": 7.213308334350586, + "learning_rate": 7.9910148851312e-05, + "loss": 0.4026, + "step": 2358 + }, + { + "epoch": 2.4046890927624873, + "grad_norm": 4.690682888031006, + "learning_rate": 7.975894732763267e-05, + "loss": 0.3215, + "step": 2359 + }, + { + "epoch": 2.405708460754332, + "grad_norm": 10.285234451293945, + "learning_rate": 7.960732320158932e-05, + "loss": 0.5026, + "step": 2360 + }, + { + "epoch": 2.4067278287461775, + "grad_norm": 1.6485399007797241, + "learning_rate": 7.945527862637354e-05, + "loss": 0.0854, + "step": 2361 + }, + { + "epoch": 2.4077471967380224, + "grad_norm": 5.8788371086120605, + "learning_rate": 7.930281576114754e-05, + "loss": 0.2336, + "step": 2362 + }, + { + "epoch": 2.4087665647298673, + "grad_norm": 3.2773468494415283, + "learning_rate": 7.91499367710138e-05, + "loss": 0.1266, + "step": 2363 + }, + { + "epoch": 2.4097859327217126, + "grad_norm": 2.446866273880005, + "learning_rate": 7.89966438269839e-05, + "loss": 0.1203, + "step": 2364 + }, + { + 
"epoch": 2.4108053007135575, + "grad_norm": 7.6452765464782715, + "learning_rate": 7.884293910594816e-05, + "loss": 0.4739, + "step": 2365 + }, + { + "epoch": 2.411824668705403, + "grad_norm": 4.1281352043151855, + "learning_rate": 7.868882479064423e-05, + "loss": 0.1563, + "step": 2366 + }, + { + "epoch": 2.4128440366972477, + "grad_norm": 6.580128192901611, + "learning_rate": 7.85343030696266e-05, + "loss": 0.4343, + "step": 2367 + }, + { + "epoch": 2.413863404689093, + "grad_norm": 4.8013105392456055, + "learning_rate": 7.837937613723498e-05, + "loss": 0.2669, + "step": 2368 + }, + { + "epoch": 2.414882772680938, + "grad_norm": 4.374575138092041, + "learning_rate": 7.822404619356376e-05, + "loss": 0.319, + "step": 2369 + }, + { + "epoch": 2.4159021406727827, + "grad_norm": 4.736109733581543, + "learning_rate": 7.806831544443015e-05, + "loss": 0.2109, + "step": 2370 + }, + { + "epoch": 2.416921508664628, + "grad_norm": 5.618893623352051, + "learning_rate": 7.791218610134329e-05, + "loss": 0.4047, + "step": 2371 + }, + { + "epoch": 2.417940876656473, + "grad_norm": 6.425780773162842, + "learning_rate": 7.775566038147256e-05, + "loss": 0.2339, + "step": 2372 + }, + { + "epoch": 2.418960244648318, + "grad_norm": 5.232985973358154, + "learning_rate": 7.759874050761639e-05, + "loss": 0.2116, + "step": 2373 + }, + { + "epoch": 2.419979612640163, + "grad_norm": 5.943445205688477, + "learning_rate": 7.744142870817052e-05, + "loss": 0.3895, + "step": 2374 + }, + { + "epoch": 2.420998980632008, + "grad_norm": 4.810280799865723, + "learning_rate": 7.728372721709623e-05, + "loss": 0.1327, + "step": 2375 + }, + { + "epoch": 2.4220183486238533, + "grad_norm": 6.564337253570557, + "learning_rate": 7.71256382738888e-05, + "loss": 0.3463, + "step": 2376 + }, + { + "epoch": 2.4230377166156982, + "grad_norm": 4.958079814910889, + "learning_rate": 7.696716412354574e-05, + "loss": 0.2962, + "step": 2377 + }, + { + "epoch": 2.4240570846075435, + "grad_norm": 8.169729232788086, + 
"learning_rate": 7.680830701653481e-05, + "loss": 0.7273, + "step": 2378 + }, + { + "epoch": 2.4250764525993884, + "grad_norm": 6.614120006561279, + "learning_rate": 7.6649069208762e-05, + "loss": 0.3655, + "step": 2379 + }, + { + "epoch": 2.4260958205912333, + "grad_norm": 2.4092414379119873, + "learning_rate": 7.648945296153963e-05, + "loss": 0.1066, + "step": 2380 + }, + { + "epoch": 2.4271151885830786, + "grad_norm": 5.589473724365234, + "learning_rate": 7.632946054155412e-05, + "loss": 0.2677, + "step": 2381 + }, + { + "epoch": 2.4281345565749235, + "grad_norm": 4.0893096923828125, + "learning_rate": 7.616909422083405e-05, + "loss": 0.2942, + "step": 2382 + }, + { + "epoch": 2.4291539245667684, + "grad_norm": 6.262392520904541, + "learning_rate": 7.60083562767174e-05, + "loss": 0.2955, + "step": 2383 + }, + { + "epoch": 2.4301732925586137, + "grad_norm": 4.966406345367432, + "learning_rate": 7.58472489918199e-05, + "loss": 0.3573, + "step": 2384 + }, + { + "epoch": 2.4311926605504586, + "grad_norm": 5.376223564147949, + "learning_rate": 7.568577465400184e-05, + "loss": 0.1925, + "step": 2385 + }, + { + "epoch": 2.432212028542304, + "grad_norm": 2.352720260620117, + "learning_rate": 7.55239355563363e-05, + "loss": 0.2332, + "step": 2386 + }, + { + "epoch": 2.4332313965341488, + "grad_norm": 5.364041805267334, + "learning_rate": 7.5361733997076e-05, + "loss": 0.2864, + "step": 2387 + }, + { + "epoch": 2.434250764525994, + "grad_norm": 9.162808418273926, + "learning_rate": 7.519917227962116e-05, + "loss": 0.3454, + "step": 2388 + }, + { + "epoch": 2.435270132517839, + "grad_norm": 5.606325149536133, + "learning_rate": 7.50362527124864e-05, + "loss": 0.2977, + "step": 2389 + }, + { + "epoch": 2.436289500509684, + "grad_norm": 9.166282653808594, + "learning_rate": 7.487297760926814e-05, + "loss": 0.5608, + "step": 2390 + }, + { + "epoch": 2.437308868501529, + "grad_norm": 6.161780834197998, + "learning_rate": 7.470934928861164e-05, + "loss": 0.2588, + "step": 2391 
+ }, + { + "epoch": 2.438328236493374, + "grad_norm": 5.777623176574707, + "learning_rate": 7.454537007417832e-05, + "loss": 0.3611, + "step": 2392 + }, + { + "epoch": 2.439347604485219, + "grad_norm": 6.068408012390137, + "learning_rate": 7.438104229461255e-05, + "loss": 0.3381, + "step": 2393 + }, + { + "epoch": 2.4403669724770642, + "grad_norm": 3.068044900894165, + "learning_rate": 7.421636828350849e-05, + "loss": 0.1305, + "step": 2394 + }, + { + "epoch": 2.441386340468909, + "grad_norm": 5.745426654815674, + "learning_rate": 7.405135037937712e-05, + "loss": 0.259, + "step": 2395 + }, + { + "epoch": 2.4424057084607544, + "grad_norm": 9.846254348754883, + "learning_rate": 7.388599092561312e-05, + "loss": 0.9878, + "step": 2396 + }, + { + "epoch": 2.4434250764525993, + "grad_norm": 2.3299012184143066, + "learning_rate": 7.37202922704614e-05, + "loss": 0.0945, + "step": 2397 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 5.867788314819336, + "learning_rate": 7.355425676698377e-05, + "loss": 0.2487, + "step": 2398 + }, + { + "epoch": 2.4454638124362895, + "grad_norm": 6.639028549194336, + "learning_rate": 7.338788677302558e-05, + "loss": 0.2808, + "step": 2399 + }, + { + "epoch": 2.4464831804281344, + "grad_norm": 7.313209533691406, + "learning_rate": 7.322118465118223e-05, + "loss": 0.3393, + "step": 2400 + }, + { + "epoch": 2.4475025484199797, + "grad_norm": 6.632044315338135, + "learning_rate": 7.305415276876573e-05, + "loss": 0.3317, + "step": 2401 + }, + { + "epoch": 2.4485219164118246, + "grad_norm": 8.070530891418457, + "learning_rate": 7.288679349777077e-05, + "loss": 0.37, + "step": 2402 + }, + { + "epoch": 2.44954128440367, + "grad_norm": 5.373542785644531, + "learning_rate": 7.271910921484148e-05, + "loss": 0.3059, + "step": 2403 + }, + { + "epoch": 2.450560652395515, + "grad_norm": 4.393563270568848, + "learning_rate": 7.255110230123716e-05, + "loss": 0.2791, + "step": 2404 + }, + { + "epoch": 2.4515800203873597, + "grad_norm": 
3.5517146587371826, + "learning_rate": 7.238277514279903e-05, + "loss": 0.2652, + "step": 2405 + }, + { + "epoch": 2.452599388379205, + "grad_norm": 7.802023410797119, + "learning_rate": 7.221413012991576e-05, + "loss": 0.4359, + "step": 2406 + }, + { + "epoch": 2.45361875637105, + "grad_norm": 8.12000846862793, + "learning_rate": 7.204516965749014e-05, + "loss": 0.3624, + "step": 2407 + }, + { + "epoch": 2.454638124362895, + "grad_norm": 4.847606182098389, + "learning_rate": 7.187589612490444e-05, + "loss": 0.2125, + "step": 2408 + }, + { + "epoch": 2.45565749235474, + "grad_norm": 9.615645408630371, + "learning_rate": 7.1706311935987e-05, + "loss": 0.44, + "step": 2409 + }, + { + "epoch": 2.456676860346585, + "grad_norm": 6.675761699676514, + "learning_rate": 7.153641949897728e-05, + "loss": 0.4531, + "step": 2410 + }, + { + "epoch": 2.4576962283384303, + "grad_norm": 4.328212738037109, + "learning_rate": 7.136622122649252e-05, + "loss": 0.3082, + "step": 2411 + }, + { + "epoch": 2.458715596330275, + "grad_norm": 4.534638404846191, + "learning_rate": 7.119571953549305e-05, + "loss": 0.2764, + "step": 2412 + }, + { + "epoch": 2.4597349643221205, + "grad_norm": 5.104310989379883, + "learning_rate": 7.10249168472478e-05, + "loss": 0.338, + "step": 2413 + }, + { + "epoch": 2.4607543323139653, + "grad_norm": 7.326198101043701, + "learning_rate": 7.085381558730016e-05, + "loss": 0.3387, + "step": 2414 + }, + { + "epoch": 2.46177370030581, + "grad_norm": 6.451673984527588, + "learning_rate": 7.068241818543364e-05, + "loss": 0.3514, + "step": 2415 + }, + { + "epoch": 2.4627930682976555, + "grad_norm": 8.178759574890137, + "learning_rate": 7.051072707563718e-05, + "loss": 0.481, + "step": 2416 + }, + { + "epoch": 2.4638124362895004, + "grad_norm": 6.602081298828125, + "learning_rate": 7.033874469607052e-05, + "loss": 0.4517, + "step": 2417 + }, + { + "epoch": 2.4648318042813457, + "grad_norm": 6.172163009643555, + "learning_rate": 7.016647348902967e-05, + "loss": 0.3072, 
+ "step": 2418 + }, + { + "epoch": 2.4658511722731906, + "grad_norm": 3.3300230503082275, + "learning_rate": 6.999391590091241e-05, + "loss": 0.1182, + "step": 2419 + }, + { + "epoch": 2.4668705402650355, + "grad_norm": 4.476836681365967, + "learning_rate": 6.982107438218323e-05, + "loss": 0.2143, + "step": 2420 + }, + { + "epoch": 2.467889908256881, + "grad_norm": 4.774931907653809, + "learning_rate": 6.96479513873386e-05, + "loss": 0.1692, + "step": 2421 + }, + { + "epoch": 2.4689092762487257, + "grad_norm": 8.344072341918945, + "learning_rate": 6.947454937487245e-05, + "loss": 0.4225, + "step": 2422 + }, + { + "epoch": 2.469928644240571, + "grad_norm": 4.035219669342041, + "learning_rate": 6.930087080724073e-05, + "loss": 0.1875, + "step": 2423 + }, + { + "epoch": 2.470948012232416, + "grad_norm": 7.9445624351501465, + "learning_rate": 6.912691815082695e-05, + "loss": 0.3638, + "step": 2424 + }, + { + "epoch": 2.4719673802242608, + "grad_norm": 5.097878456115723, + "learning_rate": 6.895269387590664e-05, + "loss": 0.3165, + "step": 2425 + }, + { + "epoch": 2.472986748216106, + "grad_norm": 3.0666706562042236, + "learning_rate": 6.877820045661285e-05, + "loss": 0.1063, + "step": 2426 + }, + { + "epoch": 2.474006116207951, + "grad_norm": 4.933103084564209, + "learning_rate": 6.860344037090041e-05, + "loss": 0.235, + "step": 2427 + }, + { + "epoch": 2.4750254841997963, + "grad_norm": 6.793315887451172, + "learning_rate": 6.84284161005113e-05, + "loss": 0.2965, + "step": 2428 + }, + { + "epoch": 2.476044852191641, + "grad_norm": 3.302661657333374, + "learning_rate": 6.825313013093898e-05, + "loss": 0.1366, + "step": 2429 + }, + { + "epoch": 2.477064220183486, + "grad_norm": 4.984063148498535, + "learning_rate": 6.807758495139325e-05, + "loss": 0.1843, + "step": 2430 + }, + { + "epoch": 2.4780835881753314, + "grad_norm": 5.525577545166016, + "learning_rate": 6.790178305476509e-05, + "loss": 0.2572, + "step": 2431 + }, + { + "epoch": 2.4791029561671762, + "grad_norm": 
3.890428066253662, + "learning_rate": 6.77257269375909e-05, + "loss": 0.1414, + "step": 2432 + }, + { + "epoch": 2.4801223241590216, + "grad_norm": 8.498802185058594, + "learning_rate": 6.754941910001722e-05, + "loss": 0.4618, + "step": 2433 + }, + { + "epoch": 2.4811416921508664, + "grad_norm": 5.803354740142822, + "learning_rate": 6.737286204576538e-05, + "loss": 0.2221, + "step": 2434 + }, + { + "epoch": 2.4821610601427118, + "grad_norm": 8.255953788757324, + "learning_rate": 6.719605828209578e-05, + "loss": 0.5251, + "step": 2435 + }, + { + "epoch": 2.4831804281345566, + "grad_norm": 5.898708343505859, + "learning_rate": 6.701901031977221e-05, + "loss": 0.4367, + "step": 2436 + }, + { + "epoch": 2.4841997961264015, + "grad_norm": 6.51034688949585, + "learning_rate": 6.684172067302623e-05, + "loss": 0.2569, + "step": 2437 + }, + { + "epoch": 2.485219164118247, + "grad_norm": 7.042967319488525, + "learning_rate": 6.666419185952176e-05, + "loss": 0.3693, + "step": 2438 + }, + { + "epoch": 2.4862385321100917, + "grad_norm": 3.433729648590088, + "learning_rate": 6.648642640031888e-05, + "loss": 0.2292, + "step": 2439 + }, + { + "epoch": 2.4872579001019366, + "grad_norm": 5.5609049797058105, + "learning_rate": 6.630842681983825e-05, + "loss": 0.2464, + "step": 2440 + }, + { + "epoch": 2.488277268093782, + "grad_norm": 4.253046989440918, + "learning_rate": 6.613019564582546e-05, + "loss": 0.3644, + "step": 2441 + }, + { + "epoch": 2.489296636085627, + "grad_norm": 5.878583908081055, + "learning_rate": 6.595173540931464e-05, + "loss": 0.2055, + "step": 2442 + }, + { + "epoch": 2.490316004077472, + "grad_norm": 9.551033973693848, + "learning_rate": 6.577304864459306e-05, + "loss": 0.4718, + "step": 2443 + }, + { + "epoch": 2.491335372069317, + "grad_norm": 5.011621475219727, + "learning_rate": 6.559413788916464e-05, + "loss": 0.2129, + "step": 2444 + }, + { + "epoch": 2.4923547400611623, + "grad_norm": 5.127644062042236, + "learning_rate": 6.541500568371441e-05, + 
"loss": 0.2876, + "step": 2445 + }, + { + "epoch": 2.493374108053007, + "grad_norm": 6.037209510803223, + "learning_rate": 6.523565457207193e-05, + "loss": 0.2636, + "step": 2446 + }, + { + "epoch": 2.494393476044852, + "grad_norm": 8.762588500976562, + "learning_rate": 6.505608710117566e-05, + "loss": 0.4882, + "step": 2447 + }, + { + "epoch": 2.4954128440366974, + "grad_norm": 5.17008113861084, + "learning_rate": 6.487630582103635e-05, + "loss": 0.1813, + "step": 2448 + }, + { + "epoch": 2.4964322120285423, + "grad_norm": 7.731829643249512, + "learning_rate": 6.469631328470103e-05, + "loss": 0.2958, + "step": 2449 + }, + { + "epoch": 2.497451580020387, + "grad_norm": 6.016170501708984, + "learning_rate": 6.451611204821695e-05, + "loss": 0.2231, + "step": 2450 + }, + { + "epoch": 2.497451580020387, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7708333333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7951542139053345, + "eval_Qnli-dev-1024_cosine_ap": 0.7338367052537506, + "eval_Qnli-dev-1024_cosine_f1": 0.738095238095238, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7809998989105225, + "eval_Qnli-dev-1024_cosine_mcc": 0.5405732955715834, + "eval_Qnli-dev-1024_cosine_precision": 0.7948717948717948, + "eval_Qnli-dev-1024_cosine_recall": 0.6888888888888889, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7180579900741577, + "eval_Qnli-dev_cosine_ap": 0.7378262269664827, + "eval_Qnli-dev_cosine_f1": 0.7339449541284404, + "eval_Qnli-dev_cosine_f1_threshold": 0.624002993106842, + "eval_Qnli-dev_cosine_mcc": 0.4428074427700477, + "eval_Qnli-dev_cosine_precision": 0.625, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9479166865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.9583333134651184, + "eval_global_dataset_loss": 0.37020039558410645, + "eval_global_dataset_runtime": 104.1585, + "eval_global_dataset_samples_per_second": 7.709, + 
"eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.9479166865348816, + "eval_sts-test-1024_pearson_cosine": 0.8783180735157112, + "eval_sts-test-1024_spearman_cosine": 0.9080893179999104, + "eval_sts-test_pearson_cosine": 0.9058391907599144, + "eval_sts-test_spearman_cosine": 0.9220448706968835, + "step": 2450 + }, + { + "epoch": 2.4984709480122325, + "grad_norm": 5.705495834350586, + "learning_rate": 6.433570467059489e-05, + "loss": 0.3331, + "step": 2451 + }, + { + "epoch": 2.4994903160040773, + "grad_norm": 2.863286256790161, + "learning_rate": 6.415509371377295e-05, + "loss": 0.2186, + "step": 2452 + }, + { + "epoch": 2.5005096839959227, + "grad_norm": 5.570882797241211, + "learning_rate": 6.397428174258045e-05, + "loss": 0.2169, + "step": 2453 + }, + { + "epoch": 2.5015290519877675, + "grad_norm": 9.619322776794434, + "learning_rate": 6.379327132470121e-05, + "loss": 0.8065, + "step": 2454 + }, + { + "epoch": 2.502548419979613, + "grad_norm": 5.321788311004639, + "learning_rate": 6.361206503063706e-05, + "loss": 0.2582, + "step": 2455 + }, + { + "epoch": 2.5035677879714577, + "grad_norm": 6.590585708618164, + "learning_rate": 6.343066543367147e-05, + "loss": 0.2145, + "step": 2456 + }, + { + "epoch": 2.5045871559633026, + "grad_norm": 3.9598350524902344, + "learning_rate": 6.324907510983303e-05, + "loss": 0.1023, + "step": 2457 + }, + { + "epoch": 2.505606523955148, + "grad_norm": 6.960710048675537, + "learning_rate": 6.306729663785897e-05, + "loss": 0.302, + "step": 2458 + }, + { + "epoch": 2.506625891946993, + "grad_norm": 6.428700923919678, + "learning_rate": 6.288533259915791e-05, + "loss": 0.444, + "step": 2459 + }, + { + "epoch": 2.5076452599388377, + "grad_norm": 3.422715425491333, + "learning_rate": 6.270318557777418e-05, + "loss": 0.2323, + "step": 2460 + }, + { + "epoch": 2.508664627930683, + "grad_norm": 6.3273186683654785, + "learning_rate": 6.252085816035027e-05, + "loss": 0.2695, + "step": 2461 + }, + { + "epoch": 
2.509683995922528, + "grad_norm": 6.5879340171813965, + "learning_rate": 6.233835293609074e-05, + "loss": 0.3088, + "step": 2462 + }, + { + "epoch": 2.510703363914373, + "grad_norm": 6.615034103393555, + "learning_rate": 6.215567249672486e-05, + "loss": 0.4545, + "step": 2463 + }, + { + "epoch": 2.511722731906218, + "grad_norm": 6.601266860961914, + "learning_rate": 6.19728194364704e-05, + "loss": 0.2826, + "step": 2464 + }, + { + "epoch": 2.5127420998980634, + "grad_norm": 3.1535212993621826, + "learning_rate": 6.178979635199619e-05, + "loss": 0.1859, + "step": 2465 + }, + { + "epoch": 2.5137614678899083, + "grad_norm": 3.975011110305786, + "learning_rate": 6.160660584238584e-05, + "loss": 0.2104, + "step": 2466 + }, + { + "epoch": 2.514780835881753, + "grad_norm": 4.474131107330322, + "learning_rate": 6.142325050910029e-05, + "loss": 0.2007, + "step": 2467 + }, + { + "epoch": 2.5158002038735985, + "grad_norm": 3.4603450298309326, + "learning_rate": 6.123973295594134e-05, + "loss": 0.1731, + "step": 2468 + }, + { + "epoch": 2.5168195718654434, + "grad_norm": 7.2192912101745605, + "learning_rate": 6.10560557890143e-05, + "loss": 0.6639, + "step": 2469 + }, + { + "epoch": 2.5178389398572882, + "grad_norm": 10.097108840942383, + "learning_rate": 6.0872221616691127e-05, + "loss": 0.4735, + "step": 2470 + }, + { + "epoch": 2.5188583078491336, + "grad_norm": 5.151859283447266, + "learning_rate": 6.068823304957339e-05, + "loss": 0.277, + "step": 2471 + }, + { + "epoch": 2.5198776758409784, + "grad_norm": 5.761507987976074, + "learning_rate": 6.0504092700455306e-05, + "loss": 0.3984, + "step": 2472 + }, + { + "epoch": 2.5208970438328238, + "grad_norm": 8.459686279296875, + "learning_rate": 6.031980318428652e-05, + "loss": 0.5498, + "step": 2473 + }, + { + "epoch": 2.5219164118246686, + "grad_norm": 4.518146991729736, + "learning_rate": 6.013536711813482e-05, + "loss": 0.318, + "step": 2474 + }, + { + "epoch": 2.522935779816514, + "grad_norm": 6.556336879730225, + 
"learning_rate": 5.995078712114919e-05, + "loss": 0.334, + "step": 2475 + }, + { + "epoch": 2.523955147808359, + "grad_norm": 6.316927909851074, + "learning_rate": 5.9766065814522645e-05, + "loss": 0.2502, + "step": 2476 + }, + { + "epoch": 2.5249745158002037, + "grad_norm": 6.545000076293945, + "learning_rate": 5.95812058214549e-05, + "loss": 0.2655, + "step": 2477 + }, + { + "epoch": 2.525993883792049, + "grad_norm": 9.777205467224121, + "learning_rate": 5.9396209767115053e-05, + "loss": 0.5449, + "step": 2478 + }, + { + "epoch": 2.527013251783894, + "grad_norm": 4.878248691558838, + "learning_rate": 5.9211080278604415e-05, + "loss": 0.2503, + "step": 2479 + }, + { + "epoch": 2.528032619775739, + "grad_norm": 6.339738368988037, + "learning_rate": 5.9025819984919115e-05, + "loss": 0.5456, + "step": 2480 + }, + { + "epoch": 2.529051987767584, + "grad_norm": 6.251143932342529, + "learning_rate": 5.884043151691303e-05, + "loss": 0.3832, + "step": 2481 + }, + { + "epoch": 2.5300713557594294, + "grad_norm": 6.100076198577881, + "learning_rate": 5.865491750725998e-05, + "loss": 0.452, + "step": 2482 + }, + { + "epoch": 2.5310907237512743, + "grad_norm": 4.228699207305908, + "learning_rate": 5.8469280590416806e-05, + "loss": 0.399, + "step": 2483 + }, + { + "epoch": 2.532110091743119, + "grad_norm": 6.980855464935303, + "learning_rate": 5.8283523402585505e-05, + "loss": 0.3104, + "step": 2484 + }, + { + "epoch": 2.5331294597349645, + "grad_norm": 10.003399848937988, + "learning_rate": 5.809764858167627e-05, + "loss": 0.5134, + "step": 2485 + }, + { + "epoch": 2.5341488277268094, + "grad_norm": 9.853922843933105, + "learning_rate": 5.7911658767269516e-05, + "loss": 0.4744, + "step": 2486 + }, + { + "epoch": 2.5351681957186543, + "grad_norm": 5.091624736785889, + "learning_rate": 5.772555660057895e-05, + "loss": 0.2087, + "step": 2487 + }, + { + "epoch": 2.5361875637104996, + "grad_norm": 6.095240116119385, + "learning_rate": 5.753934472441356e-05, + "loss": 0.2806, + 
"step": 2488 + }, + { + "epoch": 2.5372069317023445, + "grad_norm": 7.038525104522705, + "learning_rate": 5.735302578314036e-05, + "loss": 0.3187, + "step": 2489 + }, + { + "epoch": 2.5382262996941893, + "grad_norm": 6.65778923034668, + "learning_rate": 5.716660242264674e-05, + "loss": 0.2877, + "step": 2490 + }, + { + "epoch": 2.5392456676860347, + "grad_norm": 8.296245574951172, + "learning_rate": 5.698007729030306e-05, + "loss": 0.273, + "step": 2491 + }, + { + "epoch": 2.54026503567788, + "grad_norm": 8.01514720916748, + "learning_rate": 5.6793453034924906e-05, + "loss": 0.3605, + "step": 2492 + }, + { + "epoch": 2.541284403669725, + "grad_norm": 7.5846476554870605, + "learning_rate": 5.6606732306735366e-05, + "loss": 0.3579, + "step": 2493 + }, + { + "epoch": 2.5423037716615697, + "grad_norm": 12.329811096191406, + "learning_rate": 5.641991775732754e-05, + "loss": 0.8508, + "step": 2494 + }, + { + "epoch": 2.543323139653415, + "grad_norm": 4.689081192016602, + "learning_rate": 5.6233012039626994e-05, + "loss": 0.3545, + "step": 2495 + }, + { + "epoch": 2.54434250764526, + "grad_norm": 6.278460502624512, + "learning_rate": 5.6046017807853965e-05, + "loss": 0.5836, + "step": 2496 + }, + { + "epoch": 2.545361875637105, + "grad_norm": 7.2878313064575195, + "learning_rate": 5.585893771748555e-05, + "loss": 0.2704, + "step": 2497 + }, + { + "epoch": 2.54638124362895, + "grad_norm": 6.415917873382568, + "learning_rate": 5.5671774425218115e-05, + "loss": 0.1861, + "step": 2498 + }, + { + "epoch": 2.547400611620795, + "grad_norm": 4.367844581604004, + "learning_rate": 5.548453058892955e-05, + "loss": 0.1821, + "step": 2499 + }, + { + "epoch": 2.5484199796126403, + "grad_norm": 6.253251552581787, + "learning_rate": 5.529720886764174e-05, + "loss": 0.2393, + "step": 2500 + }, + { + "epoch": 2.549439347604485, + "grad_norm": 5.571644306182861, + "learning_rate": 5.51098119214823e-05, + "loss": 0.2192, + "step": 2501 + }, + { + "epoch": 2.5504587155963305, + "grad_norm": 
5.735546588897705, + "learning_rate": 5.4922342411647424e-05, + "loss": 0.5834, + "step": 2502 + }, + { + "epoch": 2.5514780835881754, + "grad_norm": 3.2489004135131836, + "learning_rate": 5.4734803000363456e-05, + "loss": 0.2575, + "step": 2503 + }, + { + "epoch": 2.5524974515800203, + "grad_norm": 2.887796401977539, + "learning_rate": 5.454719635084968e-05, + "loss": 0.1475, + "step": 2504 + }, + { + "epoch": 2.5535168195718656, + "grad_norm": 2.2065820693969727, + "learning_rate": 5.435952512727998e-05, + "loss": 0.1323, + "step": 2505 + }, + { + "epoch": 2.5545361875637105, + "grad_norm": 4.697101593017578, + "learning_rate": 5.4171791994745455e-05, + "loss": 0.4186, + "step": 2506 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 7.107315540313721, + "learning_rate": 5.398399961921624e-05, + "loss": 0.4217, + "step": 2507 + }, + { + "epoch": 2.5565749235474007, + "grad_norm": 4.283377647399902, + "learning_rate": 5.379615066750377e-05, + "loss": 0.1777, + "step": 2508 + }, + { + "epoch": 2.5575942915392456, + "grad_norm": 2.9004464149475098, + "learning_rate": 5.360824780722287e-05, + "loss": 0.1921, + "step": 2509 + }, + { + "epoch": 2.558613659531091, + "grad_norm": 8.493511199951172, + "learning_rate": 5.3420293706754054e-05, + "loss": 0.4628, + "step": 2510 + }, + { + "epoch": 2.5596330275229358, + "grad_norm": 3.586085796356201, + "learning_rate": 5.3232291035205485e-05, + "loss": 0.2653, + "step": 2511 + }, + { + "epoch": 2.560652395514781, + "grad_norm": 8.821948051452637, + "learning_rate": 5.304424246237494e-05, + "loss": 0.4299, + "step": 2512 + }, + { + "epoch": 2.561671763506626, + "grad_norm": 8.818429946899414, + "learning_rate": 5.285615065871203e-05, + "loss": 0.4889, + "step": 2513 + }, + { + "epoch": 2.562691131498471, + "grad_norm": 2.939493417739868, + "learning_rate": 5.2668018295280416e-05, + "loss": 0.1218, + "step": 2514 + }, + { + "epoch": 2.563710499490316, + "grad_norm": 7.347713947296143, + "learning_rate": 
5.247984804371976e-05, + "loss": 0.4561, + "step": 2515 + }, + { + "epoch": 2.564729867482161, + "grad_norm": 5.744930744171143, + "learning_rate": 5.229164257620762e-05, + "loss": 0.3118, + "step": 2516 + }, + { + "epoch": 2.565749235474006, + "grad_norm": 3.0342025756835938, + "learning_rate": 5.210340456542169e-05, + "loss": 0.1742, + "step": 2517 + }, + { + "epoch": 2.5667686034658512, + "grad_norm": 5.914112567901611, + "learning_rate": 5.191513668450178e-05, + "loss": 0.2229, + "step": 2518 + }, + { + "epoch": 2.567787971457696, + "grad_norm": 5.135507583618164, + "learning_rate": 5.172684160701207e-05, + "loss": 0.2406, + "step": 2519 + }, + { + "epoch": 2.5688073394495414, + "grad_norm": 8.28679084777832, + "learning_rate": 5.153852200690267e-05, + "loss": 0.5932, + "step": 2520 + }, + { + "epoch": 2.5698267074413863, + "grad_norm": 7.528500080108643, + "learning_rate": 5.135018055847223e-05, + "loss": 0.2706, + "step": 2521 + }, + { + "epoch": 2.5708460754332316, + "grad_norm": 3.1454074382781982, + "learning_rate": 5.116181993632937e-05, + "loss": 0.1532, + "step": 2522 + }, + { + "epoch": 2.5718654434250765, + "grad_norm": 2.4760966300964355, + "learning_rate": 5.097344281535529e-05, + "loss": 0.1913, + "step": 2523 + }, + { + "epoch": 2.5728848114169214, + "grad_norm": 4.306360244750977, + "learning_rate": 5.078505187066517e-05, + "loss": 0.3509, + "step": 2524 + }, + { + "epoch": 2.5739041794087667, + "grad_norm": 10.413430213928223, + "learning_rate": 5.059664977757083e-05, + "loss": 0.4713, + "step": 2525 + }, + { + "epoch": 2.5749235474006116, + "grad_norm": 5.3981781005859375, + "learning_rate": 5.0408239211542084e-05, + "loss": 0.2841, + "step": 2526 + }, + { + "epoch": 2.5759429153924565, + "grad_norm": 2.901654005050659, + "learning_rate": 5.021982284816944e-05, + "loss": 0.1348, + "step": 2527 + }, + { + "epoch": 2.5769622833843018, + "grad_norm": 7.1827545166015625, + "learning_rate": 5.003140336312524e-05, + "loss": 0.408, + "step": 2528 + }, 
+ { + "epoch": 2.5779816513761467, + "grad_norm": 5.4475932121276855, + "learning_rate": 4.9842983432126574e-05, + "loss": 0.1852, + "step": 2529 + }, + { + "epoch": 2.579001019367992, + "grad_norm": 8.925348281860352, + "learning_rate": 4.965456573089678e-05, + "loss": 0.681, + "step": 2530 + }, + { + "epoch": 2.580020387359837, + "grad_norm": 4.100930213928223, + "learning_rate": 4.946615293512744e-05, + "loss": 0.1261, + "step": 2531 + }, + { + "epoch": 2.581039755351682, + "grad_norm": 5.197756290435791, + "learning_rate": 4.927774772044042e-05, + "loss": 0.2175, + "step": 2532 + }, + { + "epoch": 2.582059123343527, + "grad_norm": 3.854301691055298, + "learning_rate": 4.908935276235009e-05, + "loss": 0.146, + "step": 2533 + }, + { + "epoch": 2.583078491335372, + "grad_norm": 6.240309238433838, + "learning_rate": 4.8900970736225164e-05, + "loss": 0.4023, + "step": 2534 + }, + { + "epoch": 2.5840978593272173, + "grad_norm": 5.006434440612793, + "learning_rate": 4.8712604317250596e-05, + "loss": 0.1632, + "step": 2535 + }, + { + "epoch": 2.585117227319062, + "grad_norm": 7.134576797485352, + "learning_rate": 4.852425618038966e-05, + "loss": 0.278, + "step": 2536 + }, + { + "epoch": 2.586136595310907, + "grad_norm": 4.0280327796936035, + "learning_rate": 4.8335929000346245e-05, + "loss": 0.1896, + "step": 2537 + }, + { + "epoch": 2.5871559633027523, + "grad_norm": 5.938254356384277, + "learning_rate": 4.814762545152643e-05, + "loss": 0.2148, + "step": 2538 + }, + { + "epoch": 2.588175331294597, + "grad_norm": 13.921966552734375, + "learning_rate": 4.795934820800071e-05, + "loss": 1.4833, + "step": 2539 + }, + { + "epoch": 2.5891946992864425, + "grad_norm": 5.229700565338135, + "learning_rate": 4.77710999434662e-05, + "loss": 0.5066, + "step": 2540 + }, + { + "epoch": 2.5902140672782874, + "grad_norm": 5.119759559631348, + "learning_rate": 4.758288333120826e-05, + "loss": 0.2406, + "step": 2541 + }, + { + "epoch": 2.5912334352701327, + "grad_norm": 4.06699275970459, 
+ "learning_rate": 4.7394701044063004e-05, + "loss": 0.2206, + "step": 2542 + }, + { + "epoch": 2.5922528032619776, + "grad_norm": 3.3897323608398438, + "learning_rate": 4.7206555754378825e-05, + "loss": 0.1581, + "step": 2543 + }, + { + "epoch": 2.5932721712538225, + "grad_norm": 8.2315673828125, + "learning_rate": 4.701845013397903e-05, + "loss": 0.3168, + "step": 2544 + }, + { + "epoch": 2.594291539245668, + "grad_norm": 3.2306790351867676, + "learning_rate": 4.683038685412325e-05, + "loss": 0.1425, + "step": 2545 + }, + { + "epoch": 2.5953109072375127, + "grad_norm": 2.8688368797302246, + "learning_rate": 4.664236858547019e-05, + "loss": 0.1639, + "step": 2546 + }, + { + "epoch": 2.5963302752293576, + "grad_norm": 4.288220405578613, + "learning_rate": 4.645439799803909e-05, + "loss": 0.1915, + "step": 2547 + }, + { + "epoch": 2.597349643221203, + "grad_norm": 9.484834671020508, + "learning_rate": 4.626647776117213e-05, + "loss": 0.4821, + "step": 2548 + }, + { + "epoch": 2.597349643221203, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8034257888793945, + "eval_Qnli-dev-1024_cosine_ap": 0.744787681075809, + "eval_Qnli-dev-1024_cosine_f1": 0.7256637168141592, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.687965989112854, + "eval_Qnli-dev-1024_cosine_mcc": 0.419062972501429, + "eval_Qnli-dev-1024_cosine_precision": 0.6029411764705882, + "eval_Qnli-dev-1024_cosine_recall": 0.9111111111111111, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6738884449005127, + "eval_Qnli-dev_cosine_ap": 0.7378629719325543, + "eval_Qnli-dev_cosine_f1": 0.7407407407407407, + "eval_Qnli-dev_cosine_f1_threshold": 0.617904782295227, + "eval_Qnli-dev_cosine_mcc": 0.4600949560146401, + "eval_Qnli-dev_cosine_precision": 0.6349206349206349, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.96875, + 
"eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.3022010624408722, + "eval_global_dataset_runtime": 104.1737, + "eval_global_dataset_samples_per_second": 7.708, + "eval_global_dataset_steps_per_second": 0.163, + "eval_sequential_score": 0.96875, + "eval_sts-test-1024_pearson_cosine": 0.880993649364402, + "eval_sts-test-1024_spearman_cosine": 0.9061897815880109, + "eval_sts-test_pearson_cosine": 0.9026524055260908, + "eval_sts-test_spearman_cosine": 0.9186055322221641, + "step": 2548 + }, + { + "epoch": 2.5983690112130478, + "grad_norm": 3.646552801132202, + "learning_rate": 4.607861054349663e-05, + "loss": 0.2793, + "step": 2549 + }, + { + "epoch": 2.599388379204893, + "grad_norm": 6.311948299407959, + "learning_rate": 4.589079901288681e-05, + "loss": 0.4315, + "step": 2550 + }, + { + "epoch": 2.600407747196738, + "grad_norm": 5.987321376800537, + "learning_rate": 4.57030458364261e-05, + "loss": 0.4048, + "step": 2551 + }, + { + "epoch": 2.6014271151885833, + "grad_norm": 4.98535680770874, + "learning_rate": 4.551535368036934e-05, + "loss": 0.2146, + "step": 2552 + }, + { + "epoch": 2.602446483180428, + "grad_norm": 6.634832382202148, + "learning_rate": 4.532772521010488e-05, + "loss": 0.4875, + "step": 2553 + }, + { + "epoch": 2.603465851172273, + "grad_norm": 7.706894874572754, + "learning_rate": 4.514016309011653e-05, + "loss": 0.239, + "step": 2554 + }, + { + "epoch": 2.6044852191641183, + "grad_norm": 7.011508941650391, + "learning_rate": 4.495266998394584e-05, + "loss": 0.2756, + "step": 2555 + }, + { + "epoch": 2.6055045871559632, + "grad_norm": 8.816591262817383, + "learning_rate": 4.4765248554154454e-05, + "loss": 0.2547, + "step": 2556 + }, + { + "epoch": 2.606523955147808, + "grad_norm": 6.2531585693359375, + "learning_rate": 4.4577901462286244e-05, + "loss": 0.2222, + "step": 2557 + }, + { + "epoch": 2.6075433231396534, + "grad_norm": 6.575552463531494, + "learning_rate": 4.4390631368828984e-05, + "loss": 0.2396, + 
"step": 2558 + }, + { + "epoch": 2.6085626911314987, + "grad_norm": 7.20054817199707, + "learning_rate": 4.420344093317749e-05, + "loss": 0.2958, + "step": 2559 + }, + { + "epoch": 2.6095820591233436, + "grad_norm": 4.621170520782471, + "learning_rate": 4.401633281359504e-05, + "loss": 0.2492, + "step": 2560 + }, + { + "epoch": 2.6106014271151885, + "grad_norm": 6.393376350402832, + "learning_rate": 4.382930966717621e-05, + "loss": 0.3191, + "step": 2561 + }, + { + "epoch": 2.611620795107034, + "grad_norm": 2.958164691925049, + "learning_rate": 4.3642374149808615e-05, + "loss": 0.2152, + "step": 2562 + }, + { + "epoch": 2.6126401630988787, + "grad_norm": 4.713347434997559, + "learning_rate": 4.345552891613576e-05, + "loss": 0.2988, + "step": 2563 + }, + { + "epoch": 2.6136595310907236, + "grad_norm": 6.476726531982422, + "learning_rate": 4.326877661951871e-05, + "loss": 0.4129, + "step": 2564 + }, + { + "epoch": 2.614678899082569, + "grad_norm": 4.05019998550415, + "learning_rate": 4.30821199119991e-05, + "loss": 0.2333, + "step": 2565 + }, + { + "epoch": 2.6156982670744138, + "grad_norm": 5.9746012687683105, + "learning_rate": 4.289556144426084e-05, + "loss": 0.3483, + "step": 2566 + }, + { + "epoch": 2.6167176350662587, + "grad_norm": 6.902339935302734, + "learning_rate": 4.2709103865592803e-05, + "loss": 0.2648, + "step": 2567 + }, + { + "epoch": 2.617737003058104, + "grad_norm": 8.029814720153809, + "learning_rate": 4.2522749823851335e-05, + "loss": 0.2654, + "step": 2568 + }, + { + "epoch": 2.6187563710499493, + "grad_norm": 8.265084266662598, + "learning_rate": 4.2336501965422254e-05, + "loss": 0.2348, + "step": 2569 + }, + { + "epoch": 2.619775739041794, + "grad_norm": 5.334643840789795, + "learning_rate": 4.2150362935183515e-05, + "loss": 0.3404, + "step": 2570 + }, + { + "epoch": 2.620795107033639, + "grad_norm": 2.954676389694214, + "learning_rate": 4.1964335376467734e-05, + "loss": 0.202, + "step": 2571 + }, + { + "epoch": 2.6218144750254844, + 
"grad_norm": 3.2372565269470215, + "learning_rate": 4.1778421931024535e-05, + "loss": 0.2473, + "step": 2572 + }, + { + "epoch": 2.6228338430173292, + "grad_norm": 7.0147480964660645, + "learning_rate": 4.159262523898293e-05, + "loss": 0.4864, + "step": 2573 + }, + { + "epoch": 2.623853211009174, + "grad_norm": 5.646935939788818, + "learning_rate": 4.140694793881387e-05, + "loss": 0.2384, + "step": 2574 + }, + { + "epoch": 2.6248725790010194, + "grad_norm": 2.3776071071624756, + "learning_rate": 4.122139266729305e-05, + "loss": 0.0747, + "step": 2575 + }, + { + "epoch": 2.6258919469928643, + "grad_norm": 7.911314010620117, + "learning_rate": 4.103596205946326e-05, + "loss": 0.2378, + "step": 2576 + }, + { + "epoch": 2.6269113149847096, + "grad_norm": 8.810286521911621, + "learning_rate": 4.085065874859661e-05, + "loss": 0.4169, + "step": 2577 + }, + { + "epoch": 2.6279306829765545, + "grad_norm": 5.295368194580078, + "learning_rate": 4.066548536615792e-05, + "loss": 0.2578, + "step": 2578 + }, + { + "epoch": 2.6289500509684, + "grad_norm": 3.0903420448303223, + "learning_rate": 4.0480444541766576e-05, + "loss": 0.2464, + "step": 2579 + }, + { + "epoch": 2.6299694189602447, + "grad_norm": 6.635354518890381, + "learning_rate": 4.029553890315982e-05, + "loss": 0.5019, + "step": 2580 + }, + { + "epoch": 2.6309887869520896, + "grad_norm": 5.016622066497803, + "learning_rate": 4.0110771076154865e-05, + "loss": 0.5358, + "step": 2581 + }, + { + "epoch": 2.632008154943935, + "grad_norm": 9.547528266906738, + "learning_rate": 3.9926143684612145e-05, + "loss": 0.9614, + "step": 2582 + }, + { + "epoch": 2.63302752293578, + "grad_norm": 9.94871997833252, + "learning_rate": 3.97416593503975e-05, + "loss": 0.7375, + "step": 2583 + }, + { + "epoch": 2.6340468909276247, + "grad_norm": 5.576138973236084, + "learning_rate": 3.955732069334556e-05, + "loss": 0.2736, + "step": 2584 + }, + { + "epoch": 2.63506625891947, + "grad_norm": 6.261617183685303, + "learning_rate": 
3.9373130331221886e-05, + "loss": 0.3175, + "step": 2585 + }, + { + "epoch": 2.636085626911315, + "grad_norm": 3.3927159309387207, + "learning_rate": 3.9189090879686426e-05, + "loss": 0.1689, + "step": 2586 + }, + { + "epoch": 2.63710499490316, + "grad_norm": 9.100037574768066, + "learning_rate": 3.900520495225588e-05, + "loss": 0.5786, + "step": 2587 + }, + { + "epoch": 2.638124362895005, + "grad_norm": 4.424569129943848, + "learning_rate": 3.8821475160266805e-05, + "loss": 0.2639, + "step": 2588 + }, + { + "epoch": 2.6391437308868504, + "grad_norm": 5.2483811378479, + "learning_rate": 3.8637904112838466e-05, + "loss": 0.2476, + "step": 2589 + }, + { + "epoch": 2.6401630988786953, + "grad_norm": 2.6344261169433594, + "learning_rate": 3.845449441683594e-05, + "loss": 0.1445, + "step": 2590 + }, + { + "epoch": 2.64118246687054, + "grad_norm": 7.211129665374756, + "learning_rate": 3.827124867683297e-05, + "loss": 0.3231, + "step": 2591 + }, + { + "epoch": 2.6422018348623855, + "grad_norm": 3.855678081512451, + "learning_rate": 3.808816949507489e-05, + "loss": 0.2304, + "step": 2592 + }, + { + "epoch": 2.6432212028542303, + "grad_norm": 5.783268928527832, + "learning_rate": 3.79052594714417e-05, + "loss": 0.2328, + "step": 2593 + }, + { + "epoch": 2.6442405708460752, + "grad_norm": 6.157005786895752, + "learning_rate": 3.7722521203411385e-05, + "loss": 0.5027, + "step": 2594 + }, + { + "epoch": 2.6452599388379205, + "grad_norm": 8.950606346130371, + "learning_rate": 3.753995728602286e-05, + "loss": 0.4601, + "step": 2595 + }, + { + "epoch": 2.6462793068297654, + "grad_norm": 1.801347017288208, + "learning_rate": 3.735757031183896e-05, + "loss": 0.0599, + "step": 2596 + }, + { + "epoch": 2.6472986748216107, + "grad_norm": 1.2584683895111084, + "learning_rate": 3.7175362870909857e-05, + "loss": 0.0803, + "step": 2597 + }, + { + "epoch": 2.6483180428134556, + "grad_norm": 2.8321571350097656, + "learning_rate": 3.699333755073613e-05, + "loss": 0.1362, + "step": 2598 + }, 
+ { + "epoch": 2.649337410805301, + "grad_norm": 5.989924430847168, + "learning_rate": 3.681149693623227e-05, + "loss": 0.4228, + "step": 2599 + }, + { + "epoch": 2.650356778797146, + "grad_norm": 7.6800665855407715, + "learning_rate": 3.662984360968954e-05, + "loss": 0.3464, + "step": 2600 + }, + { + "epoch": 2.6513761467889907, + "grad_norm": 7.051743507385254, + "learning_rate": 3.644838015073983e-05, + "loss": 0.2288, + "step": 2601 + }, + { + "epoch": 2.652395514780836, + "grad_norm": 11.621438026428223, + "learning_rate": 3.626710913631847e-05, + "loss": 0.8281, + "step": 2602 + }, + { + "epoch": 2.653414882772681, + "grad_norm": 11.922146797180176, + "learning_rate": 3.6086033140628154e-05, + "loss": 0.5109, + "step": 2603 + }, + { + "epoch": 2.6544342507645258, + "grad_norm": 5.72918701171875, + "learning_rate": 3.590515473510193e-05, + "loss": 0.353, + "step": 2604 + }, + { + "epoch": 2.655453618756371, + "grad_norm": 4.346190452575684, + "learning_rate": 3.572447648836714e-05, + "loss": 0.2096, + "step": 2605 + }, + { + "epoch": 2.656472986748216, + "grad_norm": 5.70217227935791, + "learning_rate": 3.554400096620848e-05, + "loss": 0.2133, + "step": 2606 + }, + { + "epoch": 2.6574923547400613, + "grad_norm": 7.087839126586914, + "learning_rate": 3.5363730731531884e-05, + "loss": 0.417, + "step": 2607 + }, + { + "epoch": 2.658511722731906, + "grad_norm": 6.4641547203063965, + "learning_rate": 3.518366834432796e-05, + "loss": 0.2561, + "step": 2608 + }, + { + "epoch": 2.6595310907237515, + "grad_norm": 5.385159492492676, + "learning_rate": 3.500381636163581e-05, + "loss": 0.311, + "step": 2609 + }, + { + "epoch": 2.6605504587155964, + "grad_norm": 4.892179489135742, + "learning_rate": 3.482417733750665e-05, + "loss": 0.304, + "step": 2610 + }, + { + "epoch": 2.6615698267074412, + "grad_norm": 8.19067668914795, + "learning_rate": 3.464475382296733e-05, + "loss": 0.3286, + "step": 2611 + }, + { + "epoch": 2.6625891946992866, + "grad_norm": 3.3572440147399902, 
+ "learning_rate": 3.4465548365984304e-05, + "loss": 0.242, + "step": 2612 + }, + { + "epoch": 2.6636085626911314, + "grad_norm": 7.174647331237793, + "learning_rate": 3.428656351142756e-05, + "loss": 0.2746, + "step": 2613 + }, + { + "epoch": 2.6646279306829763, + "grad_norm": 1.8434126377105713, + "learning_rate": 3.410780180103434e-05, + "loss": 0.0884, + "step": 2614 + }, + { + "epoch": 2.6656472986748216, + "grad_norm": 8.234567642211914, + "learning_rate": 3.392926577337291e-05, + "loss": 0.564, + "step": 2615 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 7.409701347351074, + "learning_rate": 3.375095796380672e-05, + "loss": 0.4143, + "step": 2616 + }, + { + "epoch": 2.667686034658512, + "grad_norm": 8.04309368133545, + "learning_rate": 3.357288090445827e-05, + "loss": 0.3845, + "step": 2617 + }, + { + "epoch": 2.6687054026503567, + "grad_norm": 3.051427125930786, + "learning_rate": 3.339503712417338e-05, + "loss": 0.1215, + "step": 2618 + }, + { + "epoch": 2.669724770642202, + "grad_norm": 8.909161567687988, + "learning_rate": 3.3217429148484893e-05, + "loss": 0.4757, + "step": 2619 + }, + { + "epoch": 2.670744138634047, + "grad_norm": 6.251672744750977, + "learning_rate": 3.304005949957726e-05, + "loss": 0.2547, + "step": 2620 + }, + { + "epoch": 2.671763506625892, + "grad_norm": 4.752338409423828, + "learning_rate": 3.28629306962502e-05, + "loss": 0.1851, + "step": 2621 + }, + { + "epoch": 2.672782874617737, + "grad_norm": 6.223941802978516, + "learning_rate": 3.268604525388354e-05, + "loss": 0.2438, + "step": 2622 + }, + { + "epoch": 2.673802242609582, + "grad_norm": 5.9268293380737305, + "learning_rate": 3.2509405684400864e-05, + "loss": 0.437, + "step": 2623 + }, + { + "epoch": 2.674821610601427, + "grad_norm": 5.047306060791016, + "learning_rate": 3.233301449623445e-05, + "loss": 0.3586, + "step": 2624 + }, + { + "epoch": 2.675840978593272, + "grad_norm": 4.698922634124756, + "learning_rate": 3.2156874194289123e-05, + "loss": 0.208, + "step": 
2625 + }, + { + "epoch": 2.676860346585117, + "grad_norm": 7.528968811035156, + "learning_rate": 3.198098727990699e-05, + "loss": 0.3605, + "step": 2626 + }, + { + "epoch": 2.6778797145769624, + "grad_norm": 3.599567413330078, + "learning_rate": 3.18053562508318e-05, + "loss": 0.2689, + "step": 2627 + }, + { + "epoch": 2.6788990825688073, + "grad_norm": 8.71760082244873, + "learning_rate": 3.1629983601173585e-05, + "loss": 0.6955, + "step": 2628 + }, + { + "epoch": 2.6799184505606526, + "grad_norm": 3.29335880279541, + "learning_rate": 3.145487182137322e-05, + "loss": 0.154, + "step": 2629 + }, + { + "epoch": 2.6809378185524975, + "grad_norm": 6.2927937507629395, + "learning_rate": 3.128002339816683e-05, + "loss": 0.3374, + "step": 2630 + }, + { + "epoch": 2.6819571865443423, + "grad_norm": 5.186264991760254, + "learning_rate": 3.110544081455072e-05, + "loss": 0.2798, + "step": 2631 + }, + { + "epoch": 2.6829765545361877, + "grad_norm": 6.7110466957092285, + "learning_rate": 3.093112654974611e-05, + "loss": 0.3138, + "step": 2632 + }, + { + "epoch": 2.6839959225280325, + "grad_norm": 4.012898921966553, + "learning_rate": 3.075708307916389e-05, + "loss": 0.2881, + "step": 2633 + }, + { + "epoch": 2.6850152905198774, + "grad_norm": 6.191000938415527, + "learning_rate": 3.058331287436933e-05, + "loss": 0.2858, + "step": 2634 + }, + { + "epoch": 2.6860346585117227, + "grad_norm": 4.7844085693359375, + "learning_rate": 3.040981840304712e-05, + "loss": 0.2717, + "step": 2635 + }, + { + "epoch": 2.687054026503568, + "grad_norm": 7.037001609802246, + "learning_rate": 3.0236602128966275e-05, + "loss": 0.6454, + "step": 2636 + }, + { + "epoch": 2.688073394495413, + "grad_norm": 4.534252166748047, + "learning_rate": 3.0063666511945336e-05, + "loss": 0.3898, + "step": 2637 + }, + { + "epoch": 2.689092762487258, + "grad_norm": 6.041018962860107, + "learning_rate": 2.989101400781704e-05, + "loss": 0.4842, + "step": 2638 + }, + { + "epoch": 2.690112130479103, + "grad_norm": 
7.821871280670166, + "learning_rate": 2.9718647068393925e-05, + "loss": 0.2664, + "step": 2639 + }, + { + "epoch": 2.691131498470948, + "grad_norm": 3.5887277126312256, + "learning_rate": 2.9546568141432996e-05, + "loss": 0.1344, + "step": 2640 + }, + { + "epoch": 2.692150866462793, + "grad_norm": 8.541189193725586, + "learning_rate": 2.9374779670601522e-05, + "loss": 0.2727, + "step": 2641 + }, + { + "epoch": 2.693170234454638, + "grad_norm": 4.584117889404297, + "learning_rate": 2.9203284095441773e-05, + "loss": 0.2096, + "step": 2642 + }, + { + "epoch": 2.694189602446483, + "grad_norm": 4.40488862991333, + "learning_rate": 2.903208385133692e-05, + "loss": 0.2204, + "step": 2643 + }, + { + "epoch": 2.695208970438328, + "grad_norm": 8.537614822387695, + "learning_rate": 2.8861181369475902e-05, + "loss": 0.5485, + "step": 2644 + }, + { + "epoch": 2.6962283384301733, + "grad_norm": 5.3955769538879395, + "learning_rate": 2.8690579076819544e-05, + "loss": 0.2895, + "step": 2645 + }, + { + "epoch": 2.6972477064220186, + "grad_norm": 5.815518379211426, + "learning_rate": 2.852027939606525e-05, + "loss": 0.1881, + "step": 2646 + }, + { + "epoch": 2.6972477064220186, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8087142109870911, + "eval_Qnli-dev-1024_cosine_ap": 0.7557593703190841, + "eval_Qnli-dev-1024_cosine_f1": 0.7179487179487181, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.6551511287689209, + "eval_Qnli-dev-1024_cosine_mcc": 0.397705839334203, + "eval_Qnli-dev-1024_cosine_precision": 0.5833333333333334, + "eval_Qnli-dev-1024_cosine_recall": 0.9333333333333333, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7260158061981201, + "eval_Qnli-dev_cosine_ap": 0.7399947312668047, + "eval_Qnli-dev_cosine_f1": 0.7339449541284404, + "eval_Qnli-dev_cosine_f1_threshold": 0.6138423681259155, + "eval_Qnli-dev_cosine_mcc": 0.4428074427700477, + 
"eval_Qnli-dev_cosine_precision": 0.625, + "eval_Qnli-dev_cosine_recall": 0.8888888888888888, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.28514590859413147, + "eval_global_dataset_runtime": 103.8721, + "eval_global_dataset_samples_per_second": 7.731, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8911116976425595, + "eval_sts-test-1024_spearman_cosine": 0.9133837081466986, + "eval_sts-test_pearson_cosine": 0.906324200031563, + "eval_sts-test_spearman_cosine": 0.9208139632804799, + "step": 2646 + }, + { + "epoch": 2.6982670744138635, + "grad_norm": 4.640608787536621, + "learning_rate": 2.8350284745613432e-05, + "loss": 0.2747, + "step": 2647 + }, + { + "epoch": 2.6992864424057084, + "grad_norm": 4.971389293670654, + "learning_rate": 2.8180597539532816e-05, + "loss": 0.2051, + "step": 2648 + }, + { + "epoch": 2.7003058103975537, + "grad_norm": 9.61092758178711, + "learning_rate": 2.8011220187525968e-05, + "loss": 0.6696, + "step": 2649 + }, + { + "epoch": 2.7013251783893986, + "grad_norm": 6.7433319091796875, + "learning_rate": 2.7842155094895326e-05, + "loss": 0.2742, + "step": 2650 + }, + { + "epoch": 2.7023445463812434, + "grad_norm": 3.069979667663574, + "learning_rate": 2.7673404662509038e-05, + "loss": 0.1871, + "step": 2651 + }, + { + "epoch": 2.7033639143730888, + "grad_norm": 7.87565803527832, + "learning_rate": 2.7504971286766866e-05, + "loss": 0.2713, + "step": 2652 + }, + { + "epoch": 2.7043832823649336, + "grad_norm": 4.452849388122559, + "learning_rate": 2.7336857359565925e-05, + "loss": 0.1782, + "step": 2653 + }, + { + "epoch": 2.705402650356779, + "grad_norm": 9.874443054199219, + "learning_rate": 2.7169065268266906e-05, + "loss": 0.8587, + "step": 2654 + }, + { + "epoch": 2.706422018348624, + "grad_norm": 8.942673683166504, + "learning_rate": 
2.7001597395660376e-05, + "loss": 0.3289, + "step": 2655 + }, + { + "epoch": 2.707441386340469, + "grad_norm": 3.1680140495300293, + "learning_rate": 2.683445611993247e-05, + "loss": 0.161, + "step": 2656 + }, + { + "epoch": 2.708460754332314, + "grad_norm": 1.2902299165725708, + "learning_rate": 2.6667643814631453e-05, + "loss": 0.0794, + "step": 2657 + }, + { + "epoch": 2.709480122324159, + "grad_norm": 3.362070083618164, + "learning_rate": 2.650116284863402e-05, + "loss": 0.1901, + "step": 2658 + }, + { + "epoch": 2.7104994903160042, + "grad_norm": 1.860801100730896, + "learning_rate": 2.6335015586111413e-05, + "loss": 0.155, + "step": 2659 + }, + { + "epoch": 2.711518858307849, + "grad_norm": 9.599827766418457, + "learning_rate": 2.6169204386496148e-05, + "loss": 0.3647, + "step": 2660 + }, + { + "epoch": 2.712538226299694, + "grad_norm": 2.472621440887451, + "learning_rate": 2.6003731604448235e-05, + "loss": 0.2017, + "step": 2661 + }, + { + "epoch": 2.7135575942915393, + "grad_norm": 6.350067615509033, + "learning_rate": 2.5838599589822e-05, + "loss": 0.4322, + "step": 2662 + }, + { + "epoch": 2.714576962283384, + "grad_norm": 9.118274688720703, + "learning_rate": 2.5673810687632394e-05, + "loss": 0.4111, + "step": 2663 + }, + { + "epoch": 2.7155963302752295, + "grad_norm": 6.452308654785156, + "learning_rate": 2.5509367238022126e-05, + "loss": 0.3989, + "step": 2664 + }, + { + "epoch": 2.7166156982670744, + "grad_norm": 6.926027297973633, + "learning_rate": 2.5345271576227962e-05, + "loss": 0.2821, + "step": 2665 + }, + { + "epoch": 2.7176350662589197, + "grad_norm": 6.381979942321777, + "learning_rate": 2.518152603254785e-05, + "loss": 0.2476, + "step": 2666 + }, + { + "epoch": 2.7186544342507646, + "grad_norm": 7.060910701751709, + "learning_rate": 2.5018132932307882e-05, + "loss": 0.515, + "step": 2667 + }, + { + "epoch": 2.7196738022426095, + "grad_norm": 3.8371355533599854, + "learning_rate": 2.4855094595829015e-05, + "loss": 0.1412, + "step": 2668 + }, 
+ { + "epoch": 2.720693170234455, + "grad_norm": 10.531432151794434, + "learning_rate": 2.4692413338394223e-05, + "loss": 0.572, + "step": 2669 + }, + { + "epoch": 2.7217125382262997, + "grad_norm": 9.766855239868164, + "learning_rate": 2.4530091470215815e-05, + "loss": 0.588, + "step": 2670 + }, + { + "epoch": 2.7227319062181445, + "grad_norm": 4.339486598968506, + "learning_rate": 2.4368131296402415e-05, + "loss": 0.319, + "step": 2671 + }, + { + "epoch": 2.72375127420999, + "grad_norm": 4.394313335418701, + "learning_rate": 2.4206535116926222e-05, + "loss": 0.1713, + "step": 2672 + }, + { + "epoch": 2.7247706422018347, + "grad_norm": 6.469154357910156, + "learning_rate": 2.404530522659036e-05, + "loss": 0.4146, + "step": 2673 + }, + { + "epoch": 2.72579001019368, + "grad_norm": 8.661172866821289, + "learning_rate": 2.3884443914996447e-05, + "loss": 0.5111, + "step": 2674 + }, + { + "epoch": 2.726809378185525, + "grad_norm": 7.845223426818848, + "learning_rate": 2.3723953466512083e-05, + "loss": 0.3798, + "step": 2675 + }, + { + "epoch": 2.7278287461773703, + "grad_norm": 1.795512318611145, + "learning_rate": 2.3563836160237873e-05, + "loss": 0.1102, + "step": 2676 + }, + { + "epoch": 2.728848114169215, + "grad_norm": 7.233203411102295, + "learning_rate": 2.3404094269975972e-05, + "loss": 0.2584, + "step": 2677 + }, + { + "epoch": 2.72986748216106, + "grad_norm": 3.4168431758880615, + "learning_rate": 2.3244730064196946e-05, + "loss": 0.243, + "step": 2678 + }, + { + "epoch": 2.7308868501529053, + "grad_norm": 5.412033557891846, + "learning_rate": 2.3085745806008202e-05, + "loss": 0.3049, + "step": 2679 + }, + { + "epoch": 2.73190621814475, + "grad_norm": 4.973258972167969, + "learning_rate": 2.2927143753121293e-05, + "loss": 0.3685, + "step": 2680 + }, + { + "epoch": 2.732925586136595, + "grad_norm": 5.000575542449951, + "learning_rate": 2.2768926157820425e-05, + "loss": 0.2967, + "step": 2681 + }, + { + "epoch": 2.7339449541284404, + "grad_norm": 
8.042799949645996, + "learning_rate": 2.261109526692988e-05, + "loss": 0.6673, + "step": 2682 + }, + { + "epoch": 2.7349643221202853, + "grad_norm": 9.245428085327148, + "learning_rate": 2.245365332178267e-05, + "loss": 0.5758, + "step": 2683 + }, + { + "epoch": 2.7359836901121306, + "grad_norm": 7.430521488189697, + "learning_rate": 2.2296602558188236e-05, + "loss": 0.2995, + "step": 2684 + }, + { + "epoch": 2.7370030581039755, + "grad_norm": 4.441887855529785, + "learning_rate": 2.2139945206401086e-05, + "loss": 0.2548, + "step": 2685 + }, + { + "epoch": 2.738022426095821, + "grad_norm": 4.856107234954834, + "learning_rate": 2.198368349108884e-05, + "loss": 0.3433, + "step": 2686 + }, + { + "epoch": 2.7390417940876657, + "grad_norm": 3.318927526473999, + "learning_rate": 2.182781963130074e-05, + "loss": 0.1641, + "step": 2687 + }, + { + "epoch": 2.7400611620795106, + "grad_norm": 6.292228698730469, + "learning_rate": 2.1672355840436136e-05, + "loss": 0.2822, + "step": 2688 + }, + { + "epoch": 2.741080530071356, + "grad_norm": 5.510320663452148, + "learning_rate": 2.1517294326213115e-05, + "loss": 0.1818, + "step": 2689 + }, + { + "epoch": 2.7420998980632008, + "grad_norm": 3.1131033897399902, + "learning_rate": 2.136263729063716e-05, + "loss": 0.2151, + "step": 2690 + }, + { + "epoch": 2.7431192660550456, + "grad_norm": 9.573681831359863, + "learning_rate": 2.1208386929969653e-05, + "loss": 0.5548, + "step": 2691 + }, + { + "epoch": 2.744138634046891, + "grad_norm": 3.7194857597351074, + "learning_rate": 2.1054545434696837e-05, + "loss": 0.1391, + "step": 2692 + }, + { + "epoch": 2.745158002038736, + "grad_norm": 7.959753513336182, + "learning_rate": 2.0901114989498892e-05, + "loss": 0.4368, + "step": 2693 + }, + { + "epoch": 2.746177370030581, + "grad_norm": 3.638978958129883, + "learning_rate": 2.0748097773218712e-05, + "loss": 0.1451, + "step": 2694 + }, + { + "epoch": 2.747196738022426, + "grad_norm": 2.528212785720825, + "learning_rate": 
2.059549595883074e-05, + "loss": 0.1124, + "step": 2695 + }, + { + "epoch": 2.7482161060142714, + "grad_norm": 6.308834552764893, + "learning_rate": 2.044331171341067e-05, + "loss": 0.2342, + "step": 2696 + }, + { + "epoch": 2.7492354740061162, + "grad_norm": 4.431784629821777, + "learning_rate": 2.0291547198104143e-05, + "loss": 0.2525, + "step": 2697 + }, + { + "epoch": 2.750254841997961, + "grad_norm": 10.3881196975708, + "learning_rate": 2.0140204568096448e-05, + "loss": 0.6524, + "step": 2698 + }, + { + "epoch": 2.7512742099898064, + "grad_norm": 9.04806900024414, + "learning_rate": 1.9989285972581578e-05, + "loss": 0.3521, + "step": 2699 + }, + { + "epoch": 2.7522935779816513, + "grad_norm": 6.471098899841309, + "learning_rate": 1.9838793554732053e-05, + "loss": 0.5034, + "step": 2700 + }, + { + "epoch": 2.753312945973496, + "grad_norm": 5.406115531921387, + "learning_rate": 1.968872945166808e-05, + "loss": 0.2161, + "step": 2701 + }, + { + "epoch": 2.7543323139653415, + "grad_norm": 3.8615589141845703, + "learning_rate": 1.9539095794427702e-05, + "loss": 0.1294, + "step": 2702 + }, + { + "epoch": 2.7553516819571864, + "grad_norm": 4.188022136688232, + "learning_rate": 1.938989470793599e-05, + "loss": 0.1653, + "step": 2703 + }, + { + "epoch": 2.7563710499490317, + "grad_norm": 3.1788315773010254, + "learning_rate": 1.9241128310975415e-05, + "loss": 0.213, + "step": 2704 + }, + { + "epoch": 2.7573904179408766, + "grad_norm": 3.548327684402466, + "learning_rate": 1.9092798716155263e-05, + "loss": 0.0979, + "step": 2705 + }, + { + "epoch": 2.758409785932722, + "grad_norm": 8.507022857666016, + "learning_rate": 1.8944908029881975e-05, + "loss": 0.6243, + "step": 2706 + }, + { + "epoch": 2.759429153924567, + "grad_norm": 4.468634605407715, + "learning_rate": 1.8797458352329005e-05, + "loss": 0.1636, + "step": 2707 + }, + { + "epoch": 2.7604485219164117, + "grad_norm": 10.539908409118652, + "learning_rate": 1.8650451777407272e-05, + "loss": 0.6725, + "step": 2708 
+ }, + { + "epoch": 2.761467889908257, + "grad_norm": 9.40577507019043, + "learning_rate": 1.850389039273521e-05, + "loss": 0.6119, + "step": 2709 + }, + { + "epoch": 2.762487257900102, + "grad_norm": 5.786467552185059, + "learning_rate": 1.8357776279609103e-05, + "loss": 0.4547, + "step": 2710 + }, + { + "epoch": 2.7635066258919467, + "grad_norm": 3.0541884899139404, + "learning_rate": 1.821211151297358e-05, + "loss": 0.1164, + "step": 2711 + }, + { + "epoch": 2.764525993883792, + "grad_norm": 4.641866683959961, + "learning_rate": 1.8066898161392258e-05, + "loss": 0.2381, + "step": 2712 + }, + { + "epoch": 2.7655453618756374, + "grad_norm": 4.353042125701904, + "learning_rate": 1.792213828701833e-05, + "loss": 0.3101, + "step": 2713 + }, + { + "epoch": 2.7665647298674823, + "grad_norm": 7.727181911468506, + "learning_rate": 1.7777833945565052e-05, + "loss": 0.2404, + "step": 2714 + }, + { + "epoch": 2.767584097859327, + "grad_norm": 4.7307233810424805, + "learning_rate": 1.7633987186276845e-05, + "loss": 0.1955, + "step": 2715 + }, + { + "epoch": 2.7686034658511725, + "grad_norm": 8.080448150634766, + "learning_rate": 1.7490600051899963e-05, + "loss": 0.4751, + "step": 2716 + }, + { + "epoch": 2.7696228338430173, + "grad_norm": 5.427289009094238, + "learning_rate": 1.7347674578653806e-05, + "loss": 0.1427, + "step": 2717 + }, + { + "epoch": 2.770642201834862, + "grad_norm": 4.197767734527588, + "learning_rate": 1.720521279620153e-05, + "loss": 0.2715, + "step": 2718 + }, + { + "epoch": 2.7716615698267075, + "grad_norm": 4.137823581695557, + "learning_rate": 1.706321672762175e-05, + "loss": 0.2044, + "step": 2719 + }, + { + "epoch": 2.7726809378185524, + "grad_norm": 4.52974271774292, + "learning_rate": 1.69216883893793e-05, + "loss": 0.3344, + "step": 2720 + }, + { + "epoch": 2.7737003058103973, + "grad_norm": 5.985829830169678, + "learning_rate": 1.6780629791297044e-05, + "loss": 0.2797, + "step": 2721 + }, + { + "epoch": 2.7747196738022426, + "grad_norm": 
2.3331515789031982, + "learning_rate": 1.6640042936526994e-05, + "loss": 0.0801, + "step": 2722 + }, + { + "epoch": 2.775739041794088, + "grad_norm": 6.142969131469727, + "learning_rate": 1.6499929821522125e-05, + "loss": 0.2107, + "step": 2723 + }, + { + "epoch": 2.776758409785933, + "grad_norm": 3.632021427154541, + "learning_rate": 1.6360292436007836e-05, + "loss": 0.161, + "step": 2724 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 8.594420433044434, + "learning_rate": 1.6221132762953733e-05, + "loss": 0.5933, + "step": 2725 + }, + { + "epoch": 2.778797145769623, + "grad_norm": 7.609989166259766, + "learning_rate": 1.6082452778545532e-05, + "loss": 0.2562, + "step": 2726 + }, + { + "epoch": 2.779816513761468, + "grad_norm": 5.107419013977051, + "learning_rate": 1.594425445215697e-05, + "loss": 0.1928, + "step": 2727 + }, + { + "epoch": 2.7808358817533128, + "grad_norm": 8.12761116027832, + "learning_rate": 1.5806539746321918e-05, + "loss": 0.5271, + "step": 2728 + }, + { + "epoch": 2.781855249745158, + "grad_norm": 11.674429893493652, + "learning_rate": 1.5669310616706268e-05, + "loss": 0.5303, + "step": 2729 + }, + { + "epoch": 2.782874617737003, + "grad_norm": 6.5717926025390625, + "learning_rate": 1.5532569012080322e-05, + "loss": 0.4117, + "step": 2730 + }, + { + "epoch": 2.7838939857288483, + "grad_norm": 5.438577651977539, + "learning_rate": 1.5396316874291244e-05, + "loss": 0.1852, + "step": 2731 + }, + { + "epoch": 2.784913353720693, + "grad_norm": 7.296583652496338, + "learning_rate": 1.526055613823531e-05, + "loss": 0.2319, + "step": 2732 + }, + { + "epoch": 2.7859327217125385, + "grad_norm": 7.109930992126465, + "learning_rate": 1.5125288731830428e-05, + "loss": 0.3579, + "step": 2733 + }, + { + "epoch": 2.7869520897043834, + "grad_norm": 7.2482991218566895, + "learning_rate": 1.4990516575988778e-05, + "loss": 0.36, + "step": 2734 + }, + { + "epoch": 2.7879714576962282, + "grad_norm": 8.622855186462402, + "learning_rate": 
1.4856241584589603e-05, + "loss": 0.4538, + "step": 2735 + }, + { + "epoch": 2.7889908256880735, + "grad_norm": 5.97171688079834, + "learning_rate": 1.472246566445205e-05, + "loss": 0.218, + "step": 2736 + }, + { + "epoch": 2.7900101936799184, + "grad_norm": 4.555757522583008, + "learning_rate": 1.458919071530791e-05, + "loss": 0.1233, + "step": 2737 + }, + { + "epoch": 2.7910295616717633, + "grad_norm": 3.099499225616455, + "learning_rate": 1.4456418629774892e-05, + "loss": 0.1817, + "step": 2738 + }, + { + "epoch": 2.7920489296636086, + "grad_norm": 5.9811248779296875, + "learning_rate": 1.4324151293329436e-05, + "loss": 0.2584, + "step": 2739 + }, + { + "epoch": 2.7930682976554535, + "grad_norm": 7.639220237731934, + "learning_rate": 1.4192390584280346e-05, + "loss": 0.5893, + "step": 2740 + }, + { + "epoch": 2.794087665647299, + "grad_norm": 5.624851226806641, + "learning_rate": 1.4061138373741638e-05, + "loss": 0.2184, + "step": 2741 + }, + { + "epoch": 2.7951070336391437, + "grad_norm": 9.424039840698242, + "learning_rate": 1.393039652560647e-05, + "loss": 0.6381, + "step": 2742 + }, + { + "epoch": 2.796126401630989, + "grad_norm": 6.021475315093994, + "learning_rate": 1.3800166896520155e-05, + "loss": 0.3252, + "step": 2743 + }, + { + "epoch": 2.797145769622834, + "grad_norm": 6.239711284637451, + "learning_rate": 1.3670451335854372e-05, + "loss": 0.2966, + "step": 2744 + }, + { + "epoch": 2.797145769622834, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7571010589599609, + "eval_Qnli-dev-1024_cosine_ap": 0.7504622260499265, + "eval_Qnli-dev-1024_cosine_f1": 0.723404255319149, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7475010752677917, + "eval_Qnli-dev-1024_cosine_mcc": 0.46063575594147665, + "eval_Qnli-dev-1024_cosine_precision": 0.6938775510204082, + "eval_Qnli-dev-1024_cosine_recall": 0.7555555555555555, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + 
"eval_Qnli-dev_cosine_accuracy_threshold": 0.6713130474090576, + "eval_Qnli-dev_cosine_ap": 0.7399770165919752, + "eval_Qnli-dev_cosine_f1": 0.7256637168141592, + "eval_Qnli-dev_cosine_f1_threshold": 0.5859470963478088, + "eval_Qnli-dev_cosine_mcc": 0.419062972501429, + "eval_Qnli-dev_cosine_precision": 0.6029411764705882, + "eval_Qnli-dev_cosine_recall": 0.9111111111111111, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.3241865932941437, + "eval_global_dataset_runtime": 103.8083, + "eval_global_dataset_samples_per_second": 7.735, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8939745372812724, + "eval_sts-test-1024_spearman_cosine": 0.9148932562411601, + "eval_sts-test_pearson_cosine": 0.9070974288417767, + "eval_sts-test_spearman_cosine": 0.9217502611378607, + "step": 2744 + }, + { + "epoch": 2.7981651376146788, + "grad_norm": 6.5206685066223145, + "learning_rate": 1.3541251685680196e-05, + "loss": 0.6773, + "step": 2745 + }, + { + "epoch": 2.799184505606524, + "grad_norm": 6.555695533752441, + "learning_rate": 1.3412569780742673e-05, + "loss": 0.4735, + "step": 2746 + }, + { + "epoch": 2.800203873598369, + "grad_norm": 3.916958808898926, + "learning_rate": 1.3284407448434343e-05, + "loss": 0.2945, + "step": 2747 + }, + { + "epoch": 2.801223241590214, + "grad_norm": 5.9219231605529785, + "learning_rate": 1.3156766508769269e-05, + "loss": 0.3866, + "step": 2748 + }, + { + "epoch": 2.802242609582059, + "grad_norm": 6.4752278327941895, + "learning_rate": 1.3029648774357345e-05, + "loss": 0.2936, + "step": 2749 + }, + { + "epoch": 2.803261977573904, + "grad_norm": 5.425044536590576, + "learning_rate": 1.2903056050378543e-05, + "loss": 0.3548, + "step": 2750 + }, + { + "epoch": 2.8042813455657494, + "grad_norm": 4.103187561035156, + "learning_rate": 1.2776990134557293e-05, + 
"loss": 0.401, + "step": 2751 + }, + { + "epoch": 2.8053007135575942, + "grad_norm": 2.648897171020508, + "learning_rate": 1.2651452817136744e-05, + "loss": 0.1286, + "step": 2752 + }, + { + "epoch": 2.8063200815494396, + "grad_norm": 2.8857932090759277, + "learning_rate": 1.2526445880853622e-05, + "loss": 0.1662, + "step": 2753 + }, + { + "epoch": 2.8073394495412844, + "grad_norm": 7.301206588745117, + "learning_rate": 1.2401971100912663e-05, + "loss": 0.3503, + "step": 2754 + }, + { + "epoch": 2.8083588175331293, + "grad_norm": 2.2922003269195557, + "learning_rate": 1.2278030244961747e-05, + "loss": 0.0835, + "step": 2755 + }, + { + "epoch": 2.8093781855249746, + "grad_norm": 4.0575666427612305, + "learning_rate": 1.2154625073066323e-05, + "loss": 0.164, + "step": 2756 + }, + { + "epoch": 2.8103975535168195, + "grad_norm": 6.752877235412598, + "learning_rate": 1.2031757337684912e-05, + "loss": 0.3127, + "step": 2757 + }, + { + "epoch": 2.8114169215086644, + "grad_norm": 7.1814446449279785, + "learning_rate": 1.1909428783643766e-05, + "loss": 0.479, + "step": 2758 + }, + { + "epoch": 2.8124362895005097, + "grad_norm": 2.4549272060394287, + "learning_rate": 1.1787641148112472e-05, + "loss": 0.1292, + "step": 2759 + }, + { + "epoch": 2.8134556574923546, + "grad_norm": 9.54740047454834, + "learning_rate": 1.1666396160578985e-05, + "loss": 0.5329, + "step": 2760 + }, + { + "epoch": 2.8144750254842, + "grad_norm": 8.301751136779785, + "learning_rate": 1.1545695542825313e-05, + "loss": 0.2671, + "step": 2761 + }, + { + "epoch": 2.815494393476045, + "grad_norm": 11.171977043151855, + "learning_rate": 1.1425541008902834e-05, + "loss": 0.4597, + "step": 2762 + }, + { + "epoch": 2.81651376146789, + "grad_norm": 5.594420433044434, + "learning_rate": 1.1305934265108232e-05, + "loss": 0.3011, + "step": 2763 + }, + { + "epoch": 2.817533129459735, + "grad_norm": 5.141785144805908, + "learning_rate": 1.1186877009958851e-05, + "loss": 0.2159, + "step": 2764 + }, + { + "epoch": 
2.81855249745158, + "grad_norm": 3.065958261489868, + "learning_rate": 1.1068370934169048e-05, + "loss": 0.1038, + "step": 2765 + }, + { + "epoch": 2.819571865443425, + "grad_norm": 8.466320991516113, + "learning_rate": 1.0950417720625961e-05, + "loss": 0.3436, + "step": 2766 + }, + { + "epoch": 2.82059123343527, + "grad_norm": 6.828653335571289, + "learning_rate": 1.0833019044365495e-05, + "loss": 0.6016, + "step": 2767 + }, + { + "epoch": 2.821610601427115, + "grad_norm": 11.670631408691406, + "learning_rate": 1.0716176572548648e-05, + "loss": 0.7132, + "step": 2768 + }, + { + "epoch": 2.8226299694189603, + "grad_norm": 4.495013236999512, + "learning_rate": 1.059989196443798e-05, + "loss": 0.2802, + "step": 2769 + }, + { + "epoch": 2.823649337410805, + "grad_norm": 5.054066181182861, + "learning_rate": 1.048416687137384e-05, + "loss": 0.2158, + "step": 2770 + }, + { + "epoch": 2.8246687054026505, + "grad_norm": 6.254016399383545, + "learning_rate": 1.036900293675097e-05, + "loss": 0.4267, + "step": 2771 + }, + { + "epoch": 2.8256880733944953, + "grad_norm": 7.1414947509765625, + "learning_rate": 1.0254401795995134e-05, + "loss": 0.2813, + "step": 2772 + }, + { + "epoch": 2.8267074413863407, + "grad_norm": 5.415386199951172, + "learning_rate": 1.0140365076540104e-05, + "loss": 0.3579, + "step": 2773 + }, + { + "epoch": 2.8277268093781855, + "grad_norm": 4.965087413787842, + "learning_rate": 1.0026894397804242e-05, + "loss": 0.3354, + "step": 2774 + }, + { + "epoch": 2.8287461773700304, + "grad_norm": 6.15515661239624, + "learning_rate": 9.913991371167653e-06, + "loss": 0.3876, + "step": 2775 + }, + { + "epoch": 2.8297655453618757, + "grad_norm": 4.003582000732422, + "learning_rate": 9.801657599949449e-06, + "loss": 0.2908, + "step": 2776 + }, + { + "epoch": 2.8307849133537206, + "grad_norm": 3.4854180812835693, + "learning_rate": 9.689894679384614e-06, + "loss": 0.2131, + "step": 2777 + }, + { + "epoch": 2.8318042813455655, + "grad_norm": 6.192078113555908, + 
"learning_rate": 9.578704196601807e-06, + "loss": 0.3381, + "step": 2778 + }, + { + "epoch": 2.832823649337411, + "grad_norm": 3.7153677940368652, + "learning_rate": 9.468087730600333e-06, + "loss": 0.2269, + "step": 2779 + }, + { + "epoch": 2.8338430173292557, + "grad_norm": 3.5471065044403076, + "learning_rate": 9.358046852228214e-06, + "loss": 0.1943, + "step": 2780 + }, + { + "epoch": 2.834862385321101, + "grad_norm": 3.409437417984009, + "learning_rate": 9.24858312415941e-06, + "loss": 0.1935, + "step": 2781 + }, + { + "epoch": 2.835881753312946, + "grad_norm": 4.60510778427124, + "learning_rate": 9.139698100872074e-06, + "loss": 0.2, + "step": 2782 + }, + { + "epoch": 2.836901121304791, + "grad_norm": 3.126591444015503, + "learning_rate": 9.031393328626082e-06, + "loss": 0.2202, + "step": 2783 + }, + { + "epoch": 2.837920489296636, + "grad_norm": 4.4597930908203125, + "learning_rate": 8.923670345441303e-06, + "loss": 0.2008, + "step": 2784 + }, + { + "epoch": 2.838939857288481, + "grad_norm": 5.132372856140137, + "learning_rate": 8.816530681075796e-06, + "loss": 0.1634, + "step": 2785 + }, + { + "epoch": 2.8399592252803263, + "grad_norm": 7.019527912139893, + "learning_rate": 8.709975857003866e-06, + "loss": 0.1994, + "step": 2786 + }, + { + "epoch": 2.840978593272171, + "grad_norm": 1.802855134010315, + "learning_rate": 8.604007386394647e-06, + "loss": 0.0783, + "step": 2787 + }, + { + "epoch": 2.841997961264016, + "grad_norm": 8.37390422821045, + "learning_rate": 8.49862677409055e-06, + "loss": 0.2402, + "step": 2788 + }, + { + "epoch": 2.8430173292558614, + "grad_norm": 5.241154670715332, + "learning_rate": 8.393835516585979e-06, + "loss": 0.2826, + "step": 2789 + }, + { + "epoch": 2.8440366972477067, + "grad_norm": 8.102668762207031, + "learning_rate": 8.289635102005855e-06, + "loss": 0.3095, + "step": 2790 + }, + { + "epoch": 2.8450560652395516, + "grad_norm": 7.070498943328857, + "learning_rate": 8.186027010084684e-06, + "loss": 0.394, + "step": 2791 + 
}, + { + "epoch": 2.8460754332313964, + "grad_norm": 6.332981586456299, + "learning_rate": 8.083012712145505e-06, + "loss": 0.2141, + "step": 2792 + }, + { + "epoch": 2.8470948012232418, + "grad_norm": 2.69991135597229, + "learning_rate": 7.980593671079068e-06, + "loss": 0.1438, + "step": 2793 + }, + { + "epoch": 2.8481141692150866, + "grad_norm": 6.667513847351074, + "learning_rate": 7.878771341322716e-06, + "loss": 0.4134, + "step": 2794 + }, + { + "epoch": 2.8491335372069315, + "grad_norm": 11.117960929870605, + "learning_rate": 7.777547168840233e-06, + "loss": 1.0622, + "step": 2795 + }, + { + "epoch": 2.850152905198777, + "grad_norm": 8.538312911987305, + "learning_rate": 7.676922591100922e-06, + "loss": 0.3276, + "step": 2796 + }, + { + "epoch": 2.8511722731906217, + "grad_norm": 9.44412612915039, + "learning_rate": 7.576899037059409e-06, + "loss": 0.285, + "step": 2797 + }, + { + "epoch": 2.8521916411824666, + "grad_norm": 3.452684164047241, + "learning_rate": 7.477477927135207e-06, + "loss": 0.1505, + "step": 2798 + }, + { + "epoch": 2.853211009174312, + "grad_norm": 4.277743816375732, + "learning_rate": 7.378660673192683e-06, + "loss": 0.1384, + "step": 2799 + }, + { + "epoch": 2.8542303771661572, + "grad_norm": 4.803715705871582, + "learning_rate": 7.28044867852084e-06, + "loss": 0.2855, + "step": 2800 + }, + { + "epoch": 2.855249745158002, + "grad_norm": 3.5064492225646973, + "learning_rate": 7.182843337813589e-06, + "loss": 0.1841, + "step": 2801 + }, + { + "epoch": 2.856269113149847, + "grad_norm": 3.3467700481414795, + "learning_rate": 7.085846037149746e-06, + "loss": 0.1406, + "step": 2802 + }, + { + "epoch": 2.8572884811416923, + "grad_norm": 6.449463844299316, + "learning_rate": 6.989458153973522e-06, + "loss": 0.4628, + "step": 2803 + }, + { + "epoch": 2.858307849133537, + "grad_norm": 2.180995464324951, + "learning_rate": 6.893681057074835e-06, + "loss": 0.1789, + "step": 2804 + }, + { + "epoch": 2.859327217125382, + "grad_norm": 
4.380829334259033, + "learning_rate": 6.7985161065699185e-06, + "loss": 0.1803, + "step": 2805 + }, + { + "epoch": 2.8603465851172274, + "grad_norm": 2.5380773544311523, + "learning_rate": 6.703964653881955e-06, + "loss": 0.1188, + "step": 2806 + }, + { + "epoch": 2.8613659531090723, + "grad_norm": 5.769050121307373, + "learning_rate": 6.610028041722066e-06, + "loss": 0.1484, + "step": 2807 + }, + { + "epoch": 2.8623853211009176, + "grad_norm": 8.550139427185059, + "learning_rate": 6.5167076040700495e-06, + "loss": 0.4521, + "step": 2808 + }, + { + "epoch": 2.8634046890927625, + "grad_norm": 13.322113037109375, + "learning_rate": 6.424004666155481e-06, + "loss": 0.8927, + "step": 2809 + }, + { + "epoch": 2.864424057084608, + "grad_norm": 6.6663007736206055, + "learning_rate": 6.331920544438874e-06, + "loss": 0.2667, + "step": 2810 + }, + { + "epoch": 2.8654434250764527, + "grad_norm": 7.3615522384643555, + "learning_rate": 6.240456546593138e-06, + "loss": 0.5552, + "step": 2811 + }, + { + "epoch": 2.8664627930682975, + "grad_norm": 4.977193832397461, + "learning_rate": 6.149613971484852e-06, + "loss": 0.246, + "step": 2812 + }, + { + "epoch": 2.867482161060143, + "grad_norm": 3.421010971069336, + "learning_rate": 6.05939410915583e-06, + "loss": 0.1816, + "step": 2813 + }, + { + "epoch": 2.8685015290519877, + "grad_norm": 3.8643534183502197, + "learning_rate": 5.969798240804853e-06, + "loss": 0.179, + "step": 2814 + }, + { + "epoch": 2.8695208970438326, + "grad_norm": 5.163575172424316, + "learning_rate": 5.880827638769415e-06, + "loss": 0.1567, + "step": 2815 + }, + { + "epoch": 2.870540265035678, + "grad_norm": 10.057819366455078, + "learning_rate": 5.792483566507822e-06, + "loss": 0.8622, + "step": 2816 + }, + { + "epoch": 2.871559633027523, + "grad_norm": 6.665347099304199, + "learning_rate": 5.704767278580958e-06, + "loss": 0.3246, + "step": 2817 + }, + { + "epoch": 2.872579001019368, + "grad_norm": 4.60490083694458, + "learning_rate": 5.6176800206348075e-06, + 
"loss": 0.0939, + "step": 2818 + }, + { + "epoch": 2.873598369011213, + "grad_norm": 10.552454948425293, + "learning_rate": 5.531223029382426e-06, + "loss": 0.4211, + "step": 2819 + }, + { + "epoch": 2.8746177370030583, + "grad_norm": 6.045144557952881, + "learning_rate": 5.445397532586699e-06, + "loss": 0.3004, + "step": 2820 + }, + { + "epoch": 2.875637104994903, + "grad_norm": 7.7133026123046875, + "learning_rate": 5.3602047490426076e-06, + "loss": 0.5342, + "step": 2821 + }, + { + "epoch": 2.876656472986748, + "grad_norm": 3.270509958267212, + "learning_rate": 5.275645888560221e-06, + "loss": 0.2811, + "step": 2822 + }, + { + "epoch": 2.8776758409785934, + "grad_norm": 5.2947893142700195, + "learning_rate": 5.191722151947237e-06, + "loss": 0.2122, + "step": 2823 + }, + { + "epoch": 2.8786952089704383, + "grad_norm": 4.58398962020874, + "learning_rate": 5.1084347309920895e-06, + "loss": 0.2948, + "step": 2824 + }, + { + "epoch": 2.879714576962283, + "grad_norm": 3.8912346363067627, + "learning_rate": 5.025784808446987e-06, + "loss": 0.1354, + "step": 2825 + }, + { + "epoch": 2.8807339449541285, + "grad_norm": 2.3357369899749756, + "learning_rate": 4.9437735580111385e-06, + "loss": 0.1591, + "step": 2826 + }, + { + "epoch": 2.8817533129459734, + "grad_norm": 7.461204528808594, + "learning_rate": 4.86240214431411e-06, + "loss": 0.3842, + "step": 2827 + }, + { + "epoch": 2.8827726809378187, + "grad_norm": 12.113036155700684, + "learning_rate": 4.781671722899139e-06, + "loss": 0.6479, + "step": 2828 + }, + { + "epoch": 2.8837920489296636, + "grad_norm": 8.575617790222168, + "learning_rate": 4.701583440206858e-06, + "loss": 0.402, + "step": 2829 + }, + { + "epoch": 2.884811416921509, + "grad_norm": 8.859447479248047, + "learning_rate": 4.622138433559015e-06, + "loss": 0.4825, + "step": 2830 + }, + { + "epoch": 2.8858307849133538, + "grad_norm": 4.101320266723633, + "learning_rate": 4.54333783114233e-06, + "loss": 0.1895, + "step": 2831 + }, + { + "epoch": 
2.8868501529051986, + "grad_norm": 7.142346382141113, + "learning_rate": 4.465182751992342e-06, + "loss": 0.5035, + "step": 2832 + }, + { + "epoch": 2.887869520897044, + "grad_norm": 5.056869983673096, + "learning_rate": 4.38767430597764e-06, + "loss": 0.2528, + "step": 2833 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 4.816026210784912, + "learning_rate": 4.310813593784075e-06, + "loss": 0.1977, + "step": 2834 + }, + { + "epoch": 2.8899082568807337, + "grad_norm": 6.842755317687988, + "learning_rate": 4.234601706899166e-06, + "loss": 0.3115, + "step": 2835 + }, + { + "epoch": 2.890927624872579, + "grad_norm": 7.81460428237915, + "learning_rate": 4.159039727596509e-06, + "loss": 0.3466, + "step": 2836 + }, + { + "epoch": 2.891946992864424, + "grad_norm": 6.274500846862793, + "learning_rate": 4.0841287289205e-06, + "loss": 0.4209, + "step": 2837 + }, + { + "epoch": 2.8929663608562692, + "grad_norm": 5.631227493286133, + "learning_rate": 4.0098697746710155e-06, + "loss": 0.5445, + "step": 2838 + }, + { + "epoch": 2.893985728848114, + "grad_norm": 7.602738857269287, + "learning_rate": 3.936263919388394e-06, + "loss": 0.6857, + "step": 2839 + }, + { + "epoch": 2.8950050968399594, + "grad_norm": 3.171926498413086, + "learning_rate": 3.863312208338354e-06, + "loss": 0.1408, + "step": 2840 + }, + { + "epoch": 2.8960244648318043, + "grad_norm": 3.9830353260040283, + "learning_rate": 3.7910156774972784e-06, + "loss": 0.18, + "step": 2841 + }, + { + "epoch": 2.897043832823649, + "grad_norm": 4.877954959869385, + "learning_rate": 3.7193753535373854e-06, + "loss": 0.2425, + "step": 2842 + }, + { + "epoch": 2.897043832823649, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.7575306296348572, + "eval_Qnli-dev-1024_cosine_ap": 0.7439778731668312, + "eval_Qnli-dev-1024_cosine_f1": 0.7142857142857142, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7346500158309937, + "eval_Qnli-dev-1024_cosine_mcc": 
0.4263253018001963, + "eval_Qnli-dev-1024_cosine_precision": 0.660377358490566, + "eval_Qnli-dev-1024_cosine_recall": 0.7777777777777778, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.6697263717651367, + "eval_Qnli-dev_cosine_ap": 0.7410436810871419, + "eval_Qnli-dev_cosine_f1": 0.7256637168141592, + "eval_Qnli-dev_cosine_f1_threshold": 0.5878369808197021, + "eval_Qnli-dev_cosine_mcc": 0.419062972501429, + "eval_Qnli-dev_cosine_precision": 0.6029411764705882, + "eval_Qnli-dev_cosine_recall": 0.9111111111111111, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.316562294960022, + "eval_global_dataset_runtime": 103.7708, + "eval_global_dataset_samples_per_second": 7.738, + "eval_global_dataset_steps_per_second": 0.164, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8948788156897003, + "eval_sts-test-1024_spearman_cosine": 0.9161429026505324, + "eval_sts-test_pearson_cosine": 0.9074720804711505, + "eval_sts-test_spearman_cosine": 0.922284621175041, + "step": 2842 + }, + { + "epoch": 2.8980632008154945, + "grad_norm": 6.035852909088135, + "learning_rate": 3.648392253812205e-06, + "loss": 0.1627, + "step": 2843 + }, + { + "epoch": 2.8990825688073394, + "grad_norm": 8.02778148651123, + "learning_rate": 3.578067386342099e-06, + "loss": 0.523, + "step": 2844 + }, + { + "epoch": 2.9001019367991843, + "grad_norm": 2.975386142730713, + "learning_rate": 3.5084017498000443e-06, + "loss": 0.2438, + "step": 2845 + }, + { + "epoch": 2.9011213047910296, + "grad_norm": 5.486757278442383, + "learning_rate": 3.43939633349733e-06, + "loss": 0.2044, + "step": 2846 + }, + { + "epoch": 2.9021406727828745, + "grad_norm": 5.736207008361816, + "learning_rate": 3.3710521173695665e-06, + "loss": 0.1509, + "step": 2847 + }, + { + "epoch": 2.90316004077472, + "grad_norm": 4.051654815673828, + 
"learning_rate": 3.303370071962708e-06, + "loss": 0.0885, + "step": 2848 + }, + { + "epoch": 2.9041794087665647, + "grad_norm": 3.427027940750122, + "learning_rate": 3.2363511584194093e-06, + "loss": 0.1362, + "step": 2849 + }, + { + "epoch": 2.90519877675841, + "grad_norm": 9.068493843078613, + "learning_rate": 3.1699963284652523e-06, + "loss": 0.351, + "step": 2850 + }, + { + "epoch": 2.906218144750255, + "grad_norm": 7.973621368408203, + "learning_rate": 3.104306524395256e-06, + "loss": 0.2569, + "step": 2851 + }, + { + "epoch": 2.9072375127420997, + "grad_norm": 1.665149450302124, + "learning_rate": 3.0392826790605068e-06, + "loss": 0.0932, + "step": 2852 + }, + { + "epoch": 2.908256880733945, + "grad_norm": 7.884661674499512, + "learning_rate": 2.974925715854876e-06, + "loss": 0.2864, + "step": 2853 + }, + { + "epoch": 2.90927624872579, + "grad_norm": 3.071857452392578, + "learning_rate": 2.911236548702051e-06, + "loss": 0.1591, + "step": 2854 + }, + { + "epoch": 2.910295616717635, + "grad_norm": 6.92976188659668, + "learning_rate": 2.84821608204231e-06, + "loss": 0.2428, + "step": 2855 + }, + { + "epoch": 2.91131498470948, + "grad_norm": 2.4845893383026123, + "learning_rate": 2.7858652108199437e-06, + "loss": 0.1249, + "step": 2856 + }, + { + "epoch": 2.912334352701325, + "grad_norm": 5.34236478805542, + "learning_rate": 2.724184820470299e-06, + "loss": 0.1352, + "step": 2857 + }, + { + "epoch": 2.9133537206931703, + "grad_norm": 4.656986713409424, + "learning_rate": 2.6631757869074457e-06, + "loss": 0.2245, + "step": 2858 + }, + { + "epoch": 2.914373088685015, + "grad_norm": 8.07036304473877, + "learning_rate": 2.6028389765114845e-06, + "loss": 0.3924, + "step": 2859 + }, + { + "epoch": 2.9153924566768605, + "grad_norm": 4.92465353012085, + "learning_rate": 2.543175246116514e-06, + "loss": 0.2301, + "step": 2860 + }, + { + "epoch": 2.9164118246687054, + "grad_norm": 3.6421494483947754, + "learning_rate": 2.4841854429981824e-06, + "loss": 0.1289, + "step": 
2861 + }, + { + "epoch": 2.9174311926605503, + "grad_norm": 1.8478538990020752, + "learning_rate": 2.4258704048619574e-06, + "loss": 0.1209, + "step": 2862 + }, + { + "epoch": 2.9184505606523956, + "grad_norm": 6.145997047424316, + "learning_rate": 2.3682309598308807e-06, + "loss": 0.3491, + "step": 2863 + }, + { + "epoch": 2.9194699286442405, + "grad_norm": 5.5693359375, + "learning_rate": 2.311267926434141e-06, + "loss": 0.2015, + "step": 2864 + }, + { + "epoch": 2.9204892966360854, + "grad_norm": 2.2183048725128174, + "learning_rate": 2.254982113595294e-06, + "loss": 0.0936, + "step": 2865 + }, + { + "epoch": 2.9215086646279307, + "grad_norm": 8.294523239135742, + "learning_rate": 2.1993743206207283e-06, + "loss": 0.4695, + "step": 2866 + }, + { + "epoch": 2.922528032619776, + "grad_norm": 7.269809722900391, + "learning_rate": 2.1444453371883833e-06, + "loss": 0.235, + "step": 2867 + }, + { + "epoch": 2.923547400611621, + "grad_norm": 4.965348720550537, + "learning_rate": 2.090195943336565e-06, + "loss": 0.2326, + "step": 2868 + }, + { + "epoch": 2.9245667686034658, + "grad_norm": 7.198317527770996, + "learning_rate": 2.0366269094528325e-06, + "loss": 0.4356, + "step": 2869 + }, + { + "epoch": 2.925586136595311, + "grad_norm": 3.5571677684783936, + "learning_rate": 1.983738996263007e-06, + "loss": 0.1375, + "step": 2870 + }, + { + "epoch": 2.926605504587156, + "grad_norm": 8.104644775390625, + "learning_rate": 1.9315329548204195e-06, + "loss": 0.4536, + "step": 2871 + }, + { + "epoch": 2.927624872579001, + "grad_norm": 7.931567668914795, + "learning_rate": 1.8800095264953021e-06, + "loss": 0.371, + "step": 2872 + }, + { + "epoch": 2.928644240570846, + "grad_norm": 5.3822503089904785, + "learning_rate": 1.829169442964146e-06, + "loss": 0.3237, + "step": 2873 + }, + { + "epoch": 2.929663608562691, + "grad_norm": 7.507733345031738, + "learning_rate": 1.7790134261993607e-06, + "loss": 0.1961, + "step": 2874 + }, + { + "epoch": 2.930682976554536, + "grad_norm": 
7.484766483306885, + "learning_rate": 1.7295421884590769e-06, + "loss": 0.4343, + "step": 2875 + }, + { + "epoch": 2.9317023445463812, + "grad_norm": 3.543367385864258, + "learning_rate": 1.6807564322769198e-06, + "loss": 0.1418, + "step": 2876 + }, + { + "epoch": 2.9327217125382266, + "grad_norm": 6.311351776123047, + "learning_rate": 1.6326568504521521e-06, + "loss": 0.2141, + "step": 2877 + }, + { + "epoch": 2.9337410805300714, + "grad_norm": 7.337040424346924, + "learning_rate": 1.58524412603972e-06, + "loss": 0.4081, + "step": 2878 + }, + { + "epoch": 2.9347604485219163, + "grad_norm": 3.225853443145752, + "learning_rate": 1.5385189323406657e-06, + "loss": 0.0816, + "step": 2879 + }, + { + "epoch": 2.9357798165137616, + "grad_norm": 6.153621673583984, + "learning_rate": 1.4924819328924477e-06, + "loss": 0.331, + "step": 2880 + }, + { + "epoch": 2.9367991845056065, + "grad_norm": 5.877473831176758, + "learning_rate": 1.4471337814596752e-06, + "loss": 0.1582, + "step": 2881 + }, + { + "epoch": 2.9378185524974514, + "grad_norm": 5.566036701202393, + "learning_rate": 1.4024751220245935e-06, + "loss": 0.4107, + "step": 2882 + }, + { + "epoch": 2.9388379204892967, + "grad_norm": 6.447503089904785, + "learning_rate": 1.3585065887781912e-06, + "loss": 0.3968, + "step": 2883 + }, + { + "epoch": 2.9398572884811416, + "grad_norm": 7.995684623718262, + "learning_rate": 1.3152288061110518e-06, + "loss": 0.2108, + "step": 2884 + }, + { + "epoch": 2.940876656472987, + "grad_norm": 5.465237140655518, + "learning_rate": 1.2726423886044835e-06, + "loss": 0.2353, + "step": 2885 + }, + { + "epoch": 2.941896024464832, + "grad_norm": 2.0872719287872314, + "learning_rate": 1.2307479410218203e-06, + "loss": 0.1005, + "step": 2886 + }, + { + "epoch": 2.942915392456677, + "grad_norm": 3.393397331237793, + "learning_rate": 1.189546058299873e-06, + "loss": 0.2052, + "step": 2887 + }, + { + "epoch": 2.943934760448522, + "grad_norm": 3.042525291442871, + "learning_rate": 
1.1490373255404309e-06, + "loss": 0.1, + "step": 2888 + }, + { + "epoch": 2.944954128440367, + "grad_norm": 7.329123497009277, + "learning_rate": 1.1092223180019456e-06, + "loss": 0.4187, + "step": 2889 + }, + { + "epoch": 2.945973496432212, + "grad_norm": 9.219369888305664, + "learning_rate": 1.0701016010913723e-06, + "loss": 0.7002, + "step": 2890 + }, + { + "epoch": 2.946992864424057, + "grad_norm": 6.616352558135986, + "learning_rate": 1.0316757303561852e-06, + "loss": 0.3459, + "step": 2891 + }, + { + "epoch": 2.948012232415902, + "grad_norm": 6.501418113708496, + "learning_rate": 9.939452514764303e-07, + "loss": 0.2325, + "step": 2892 + }, + { + "epoch": 2.9490316004077473, + "grad_norm": 2.8526558876037598, + "learning_rate": 9.5691070025698e-07, + "loss": 0.1916, + "step": 2893 + }, + { + "epoch": 2.950050968399592, + "grad_norm": 4.048646926879883, + "learning_rate": 9.205726026199957e-07, + "loss": 0.1248, + "step": 2894 + }, + { + "epoch": 2.9510703363914375, + "grad_norm": 6.920399188995361, + "learning_rate": 8.849314745973392e-07, + "loss": 0.5866, + "step": 2895 + }, + { + "epoch": 2.9520897043832823, + "grad_norm": 12.091155052185059, + "learning_rate": 8.499878223233726e-07, + "loss": 0.7633, + "step": 2896 + }, + { + "epoch": 2.9531090723751277, + "grad_norm": 4.252422332763672, + "learning_rate": 8.157421420276479e-07, + "loss": 0.2877, + "step": 2897 + }, + { + "epoch": 2.9541284403669725, + "grad_norm": 12.343497276306152, + "learning_rate": 7.821949200279899e-07, + "loss": 0.5611, + "step": 2898 + }, + { + "epoch": 2.9551478083588174, + "grad_norm": 6.737751007080078, + "learning_rate": 7.493466327234521e-07, + "loss": 0.2056, + "step": 2899 + }, + { + "epoch": 2.9561671763506627, + "grad_norm": 7.8665947914123535, + "learning_rate": 7.171977465876834e-07, + "loss": 0.3976, + "step": 2900 + }, + { + "epoch": 2.9571865443425076, + "grad_norm": 6.2594709396362305, + "learning_rate": 6.857487181621935e-07, + "loss": 0.2407, + "step": 2901 + }, + 
{ + "epoch": 2.9582059123343525, + "grad_norm": 8.915580749511719, + "learning_rate": 6.549999940499263e-07, + "loss": 0.6218, + "step": 2902 + }, + { + "epoch": 2.959225280326198, + "grad_norm": 6.0038743019104, + "learning_rate": 6.249520109089469e-07, + "loss": 0.3254, + "step": 2903 + }, + { + "epoch": 2.9602446483180427, + "grad_norm": 3.5023767948150635, + "learning_rate": 5.956051954461472e-07, + "loss": 0.2097, + "step": 2904 + }, + { + "epoch": 2.961264016309888, + "grad_norm": 5.1068034172058105, + "learning_rate": 5.669599644112788e-07, + "loss": 0.3198, + "step": 2905 + }, + { + "epoch": 2.962283384301733, + "grad_norm": 6.734827995300293, + "learning_rate": 5.390167245909794e-07, + "loss": 0.365, + "step": 2906 + }, + { + "epoch": 2.963302752293578, + "grad_norm": 6.597443580627441, + "learning_rate": 5.117758728030441e-07, + "loss": 0.3549, + "step": 2907 + }, + { + "epoch": 2.964322120285423, + "grad_norm": 4.171931266784668, + "learning_rate": 4.852377958907195e-07, + "loss": 0.2156, + "step": 2908 + }, + { + "epoch": 2.965341488277268, + "grad_norm": 3.3261940479278564, + "learning_rate": 4.594028707172626e-07, + "loss": 0.1436, + "step": 2909 + }, + { + "epoch": 2.9663608562691133, + "grad_norm": 6.220198631286621, + "learning_rate": 4.3427146416060163e-07, + "loss": 0.3647, + "step": 2910 + }, + { + "epoch": 2.967380224260958, + "grad_norm": 7.359838962554932, + "learning_rate": 4.09843933108095e-07, + "loss": 0.29, + "step": 2911 + }, + { + "epoch": 2.968399592252803, + "grad_norm": 5.08355712890625, + "learning_rate": 3.8612062445143596e-07, + "loss": 0.2596, + "step": 2912 + }, + { + "epoch": 2.9694189602446484, + "grad_norm": 2.2986602783203125, + "learning_rate": 3.6310187508179494e-07, + "loss": 0.076, + "step": 2913 + }, + { + "epoch": 2.9704383282364932, + "grad_norm": 3.739488124847412, + "learning_rate": 3.4078801188499597e-07, + "loss": 0.1321, + "step": 2914 + }, + { + "epoch": 2.9714576962283386, + "grad_norm": 10.203991889953613, + 
"learning_rate": 3.191793517368702e-07, + "loss": 0.5034, + "step": 2915 + }, + { + "epoch": 2.9724770642201834, + "grad_norm": 3.171394109725952, + "learning_rate": 2.982762014987761e-07, + "loss": 0.1067, + "step": 2916 + } + ], + "logging_steps": 1, + "max_steps": 2943, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 972, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}