diff --git "a/checkpoint-1145/trainer_state.json" "b/checkpoint-1145/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1145/trainer_state.json" @@ -0,0 +1,8328 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9901989045834535, + "eval_steps": 115, + "global_step": 1145, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008648025367541078, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 8.1889, + "step": 1 + }, + { + "epoch": 0.0017296050735082155, + "grad_norm": 25.332334518432617, + "learning_rate": 0.0, + "loss": 9.3962, + "step": 2 + }, + { + "epoch": 0.0025944076102623233, + "grad_norm": 21.9357967376709, + "learning_rate": 1.9193857965451055e-07, + "loss": 9.4132, + "step": 3 + }, + { + "epoch": 0.003459210147016431, + "grad_norm": 26.18132781982422, + "learning_rate": 3.838771593090211e-07, + "loss": 9.1807, + "step": 4 + }, + { + "epoch": 0.004324012683770539, + "grad_norm": 24.46787452697754, + "learning_rate": 5.758157389635317e-07, + "loss": 8.1559, + "step": 5 + }, + { + "epoch": 0.0051888152205246466, + "grad_norm": 25.57164764404297, + "learning_rate": 7.677543186180422e-07, + "loss": 9.9007, + "step": 6 + }, + { + "epoch": 0.006053617757278754, + "grad_norm": 19.92409324645996, + "learning_rate": 9.596928982725527e-07, + "loss": 9.634, + "step": 7 + }, + { + "epoch": 0.006918420294032862, + "grad_norm": 23.451889038085938, + "learning_rate": 1.1516314779270634e-06, + "loss": 10.8722, + "step": 8 + }, + { + "epoch": 0.007783222830786971, + "grad_norm": 22.753061294555664, + "learning_rate": 1.343570057581574e-06, + "loss": 8.6969, + "step": 9 + }, + { + "epoch": 0.008648025367541078, + "grad_norm": 23.012781143188477, + "learning_rate": 1.5355086372360844e-06, + "loss": 8.9552, + "step": 10 + }, + { + "epoch": 0.009512827904295185, + "grad_norm": 20.173051834106445, + "learning_rate": 1.7274472168905951e-06, + "loss": 7.5245, + "step": 11 + }, + { + "epoch": 0.010377630441049293, + "grad_norm": 24.979217529296875, + "learning_rate": 1.9193857965451054e-06, + "loss": 9.1573, + "step": 12 + }, + { + "epoch": 0.011242432977803401, + "grad_norm": 24.23455810546875, + "learning_rate": 2.1113243761996164e-06, + "loss": 9.2615, + "step": 13 + }, + { + "epoch": 0.012107235514557509, + "grad_norm": 25.147851943969727, + "learning_rate": 2.303262955854127e-06, + "loss": 9.1465, + "step": 14 + }, + { + "epoch": 0.012972038051311616, + "grad_norm": 21.937841415405273, + "learning_rate": 2.4952015355086374e-06, + "loss": 9.3845, + "step": 15 + }, + { + "epoch": 0.013836840588065724, + "grad_norm": 24.25821304321289, + "learning_rate": 2.687140115163148e-06, + "loss": 9.3638, + "step": 16 + }, + { + "epoch": 0.014701643124819834, + "grad_norm": 22.018434524536133, + "learning_rate": 2.879078694817659e-06, + "loss": 9.3365, + "step": 17 + }, + { + "epoch": 0.015566445661573941, + "grad_norm": 19.021236419677734, + "learning_rate": 3.071017274472169e-06, + "loss": 8.267, + "step": 18 + }, + { + "epoch": 0.016431248198328047, + "grad_norm": 24.68037986755371, + "learning_rate": 3.2629558541266794e-06, + "loss": 9.8727, + "step": 19 + }, + { + "epoch": 0.017296050735082155, + "grad_norm": 18.95473861694336, + "learning_rate": 3.4548944337811903e-06, + "loss": 9.6234, + "step": 20 + }, + { + "epoch": 0.018160853271836263, + "grad_norm": 20.910001754760742, + "learning_rate": 3.646833013435701e-06, + "loss": 9.0452, + "step": 21 + }, + { + "epoch": 0.01902565580859037, + "grad_norm": 23.27020835876465, + "learning_rate": 3.838771593090211e-06, + "loss": 8.5295, + "step": 22 + }, + { + "epoch": 0.01989045834534448, + "grad_norm": 20.173105239868164, + "learning_rate": 4.030710172744722e-06, + "loss": 7.8237, + "step": 23 + }, + { + "epoch": 0.020755260882098586, + "grad_norm": 21.863664627075195, + "learning_rate": 4.222648752399233e-06, + "loss": 8.5162, + "step": 24 + }, + { + "epoch": 0.021620063418852694, + "grad_norm": 19.267335891723633, + "learning_rate": 4.414587332053743e-06, + "loss": 8.312, + "step": 25 + }, + { + "epoch": 0.022484865955606802, + "grad_norm": 18.204317092895508, + "learning_rate": 4.606525911708254e-06, + "loss": 9.2816, + "step": 26 + }, + { + "epoch": 0.02334966849236091, + "grad_norm": 18.67437744140625, + "learning_rate": 4.798464491362764e-06, + "loss": 7.9128, + "step": 27 + }, + { + "epoch": 0.024214471029115017, + "grad_norm": 18.21224594116211, + "learning_rate": 4.990403071017275e-06, + "loss": 7.6773, + "step": 28 + }, + { + "epoch": 0.025079273565869125, + "grad_norm": 18.150562286376953, + "learning_rate": 5.182341650671786e-06, + "loss": 7.4793, + "step": 29 + }, + { + "epoch": 0.025944076102623233, + "grad_norm": 17.31494140625, + "learning_rate": 5.374280230326296e-06, + "loss": 8.2437, + "step": 30 + }, + { + "epoch": 0.02680887863937734, + "grad_norm": 16.86028289794922, + "learning_rate": 5.566218809980806e-06, + "loss": 8.6053, + "step": 31 + }, + { + "epoch": 0.02767368117613145, + "grad_norm": 15.361474990844727, + "learning_rate": 5.758157389635318e-06, + "loss": 7.4762, + "step": 32 + }, + { + "epoch": 0.02853848371288556, + "grad_norm": 18.730810165405273, + "learning_rate": 5.950095969289828e-06, + "loss": 10.5289, + "step": 33 + }, + { + "epoch": 0.029403286249639667, + "grad_norm": 15.356877326965332, + "learning_rate": 6.142034548944338e-06, + "loss": 7.8911, + "step": 34 + }, + { + "epoch": 0.030268088786393775, + "grad_norm": 17.622791290283203, + "learning_rate": 6.333973128598848e-06, + "loss": 7.9708, + "step": 35 + }, + { + "epoch": 0.031132891323147883, + "grad_norm": 17.50615882873535, + "learning_rate": 6.525911708253359e-06, + "loss": 7.2581, + "step": 36 + }, + { + "epoch": 0.03199769385990199, + "grad_norm": 16.068561553955078, + "learning_rate": 6.7178502879078705e-06, + "loss": 8.6747, + "step": 37 + }, + { + "epoch": 0.032862496396656095, + "grad_norm": 13.518677711486816, + "learning_rate": 6.909788867562381e-06, + "loss": 7.308, + "step": 38 + }, + { + "epoch": 0.033727298933410206, + "grad_norm": 18.713558197021484, + "learning_rate": 7.101727447216891e-06, + "loss": 8.6224, + "step": 39 + }, + { + "epoch": 0.03459210147016431, + "grad_norm": 20.201255798339844, + "learning_rate": 7.293666026871402e-06, + "loss": 12.3613, + "step": 40 + }, + { + "epoch": 0.03545690400691842, + "grad_norm": 13.44450855255127, + "learning_rate": 7.485604606525912e-06, + "loss": 7.2108, + "step": 41 + }, + { + "epoch": 0.036321706543672526, + "grad_norm": 15.1000394821167, + "learning_rate": 7.677543186180422e-06, + "loss": 7.831, + "step": 42 + }, + { + "epoch": 0.03718650908042664, + "grad_norm": 14.707894325256348, + "learning_rate": 7.869481765834934e-06, + "loss": 6.9282, + "step": 43 + }, + { + "epoch": 0.03805131161718074, + "grad_norm": 13.331870079040527, + "learning_rate": 8.061420345489444e-06, + "loss": 6.9092, + "step": 44 + }, + { + "epoch": 0.03891611415393485, + "grad_norm": 14.49152660369873, + "learning_rate": 8.253358925143954e-06, + "loss": 8.9053, + "step": 45 + }, + { + "epoch": 0.03978091669068896, + "grad_norm": 13.79437255859375, + "learning_rate": 8.445297504798465e-06, + "loss": 7.5276, + "step": 46 + }, + { + "epoch": 0.04064571922744307, + "grad_norm": 15.470795631408691, + "learning_rate": 8.637236084452976e-06, + "loss": 7.4793, + "step": 47 + }, + { + "epoch": 0.04151052176419717, + "grad_norm": 13.469670295715332, + "learning_rate": 8.829174664107486e-06, + "loss": 7.4401, + "step": 48 + }, + { + "epoch": 0.042375324300951284, + "grad_norm": 12.38973617553711, + "learning_rate": 9.021113243761997e-06, + "loss": 6.6742, + "step": 49 + }, + { + "epoch": 0.04324012683770539, + "grad_norm": 14.353404998779297, + "learning_rate": 9.213051823416507e-06, + "loss": 8.89, + "step": 50 + }, + { + "epoch": 0.0441049293744595, + "grad_norm": 12.149626731872559, + "learning_rate": 9.404990403071018e-06, + "loss": 8.6311, + "step": 51 + }, + { + "epoch": 0.044969731911213603, + "grad_norm": 12.504135131835938, + "learning_rate": 9.596928982725528e-06, + "loss": 6.9648, + "step": 52 + }, + { + "epoch": 0.045834534447967715, + "grad_norm": 12.439926147460938, + "learning_rate": 9.78886756238004e-06, + "loss": 7.0633, + "step": 53 + }, + { + "epoch": 0.04669933698472182, + "grad_norm": 13.445518493652344, + "learning_rate": 9.98080614203455e-06, + "loss": 8.1331, + "step": 54 + }, + { + "epoch": 0.04756413952147593, + "grad_norm": 12.668989181518555, + "learning_rate": 1.0172744721689061e-05, + "loss": 8.4931, + "step": 55 + }, + { + "epoch": 0.048428942058230035, + "grad_norm": 11.86841869354248, + "learning_rate": 1.0364683301343571e-05, + "loss": 6.9534, + "step": 56 + }, + { + "epoch": 0.049293744594984146, + "grad_norm": 12.336670875549316, + "learning_rate": 1.0556621880998081e-05, + "loss": 6.9585, + "step": 57 + }, + { + "epoch": 0.05015854713173825, + "grad_norm": 12.496221542358398, + "learning_rate": 1.0748560460652591e-05, + "loss": 7.6699, + "step": 58 + }, + { + "epoch": 0.05102334966849236, + "grad_norm": 11.765594482421875, + "learning_rate": 1.0940499040307102e-05, + "loss": 6.5076, + "step": 59 + }, + { + "epoch": 0.051888152205246466, + "grad_norm": 13.426615715026855, + "learning_rate": 1.1132437619961612e-05, + "loss": 9.5443, + "step": 60 + }, + { + "epoch": 0.05275295474200058, + "grad_norm": 12.127195358276367, + "learning_rate": 1.1324376199616123e-05, + "loss": 6.7481, + "step": 61 + }, + { + "epoch": 0.05361775727875468, + "grad_norm": 10.69729232788086, + "learning_rate": 1.1516314779270635e-05, + "loss": 6.4521, + "step": 62 + }, + { + "epoch": 0.05448255981550879, + "grad_norm": 12.042082786560059, + "learning_rate": 1.1708253358925145e-05, + "loss": 8.1839, + "step": 63 + }, + { + "epoch": 0.0553473623522629, + "grad_norm": 13.164307594299316, + "learning_rate": 1.1900191938579655e-05, + "loss": 7.1924, + "step": 64 + }, + { + "epoch": 0.05621216488901701, + "grad_norm": 10.799245834350586, + "learning_rate": 1.2092130518234165e-05, + "loss": 7.5767, + "step": 65 + }, + { + "epoch": 0.05707696742577112, + "grad_norm": 10.165273666381836, + "learning_rate": 1.2284069097888675e-05, + "loss": 7.2645, + "step": 66 + }, + { + "epoch": 0.05794176996252522, + "grad_norm": 12.342886924743652, + "learning_rate": 1.2476007677543186e-05, + "loss": 6.175, + "step": 67 + }, + { + "epoch": 0.058806572499279335, + "grad_norm": 10.652329444885254, + "learning_rate": 1.2667946257197696e-05, + "loss": 6.5491, + "step": 68 + }, + { + "epoch": 0.05967137503603344, + "grad_norm": 10.688251495361328, + "learning_rate": 1.2859884836852207e-05, + "loss": 6.7543, + "step": 69 + }, + { + "epoch": 0.06053617757278755, + "grad_norm": 11.341581344604492, + "learning_rate": 1.3051823416506717e-05, + "loss": 6.98, + "step": 70 + }, + { + "epoch": 0.061400980109541654, + "grad_norm": 10.539051055908203, + "learning_rate": 1.3243761996161231e-05, + "loss": 6.76, + "step": 71 + }, + { + "epoch": 0.062265782646295766, + "grad_norm": 10.746752738952637, + "learning_rate": 1.3435700575815741e-05, + "loss": 7.2167, + "step": 72 + }, + { + "epoch": 0.06313058518304987, + "grad_norm": 12.96174144744873, + "learning_rate": 1.3627639155470251e-05, + "loss": 9.219, + "step": 73 + }, + { + "epoch": 0.06399538771980398, + "grad_norm": 10.668299674987793, + "learning_rate": 1.3819577735124761e-05, + "loss": 7.3113, + "step": 74 + }, + { + "epoch": 0.06486019025655809, + "grad_norm": 10.878615379333496, + "learning_rate": 1.4011516314779271e-05, + "loss": 6.4098, + "step": 75 + }, + { + "epoch": 0.06572499279331219, + "grad_norm": 12.29603099822998, + "learning_rate": 1.4203454894433781e-05, + "loss": 8.3399, + "step": 76 + }, + { + "epoch": 0.0665897953300663, + "grad_norm": 13.01440143585205, + "learning_rate": 1.4395393474088293e-05, + "loss": 8.6991, + "step": 77 + }, + { + "epoch": 0.06745459786682041, + "grad_norm": 10.999458312988281, + "learning_rate": 1.4587332053742803e-05, + "loss": 9.5087, + "step": 78 + }, + { + "epoch": 0.06831940040357452, + "grad_norm": 11.303417205810547, + "learning_rate": 1.4779270633397313e-05, + "loss": 7.3491, + "step": 79 + }, + { + "epoch": 0.06918420294032862, + "grad_norm": 10.507055282592773, + "learning_rate": 1.4971209213051823e-05, + "loss": 6.8214, + "step": 80 + }, + { + "epoch": 0.07004900547708273, + "grad_norm": 11.467567443847656, + "learning_rate": 1.5163147792706333e-05, + "loss": 6.5489, + "step": 81 + }, + { + "epoch": 0.07091380801383684, + "grad_norm": 10.555798530578613, + "learning_rate": 1.5355086372360844e-05, + "loss": 6.7692, + "step": 82 + }, + { + "epoch": 0.07177861055059095, + "grad_norm": 12.266429901123047, + "learning_rate": 1.5547024952015357e-05, + "loss": 8.8059, + "step": 83 + }, + { + "epoch": 0.07264341308734505, + "grad_norm": 9.898346900939941, + "learning_rate": 1.5738963531669867e-05, + "loss": 6.4811, + "step": 84 + }, + { + "epoch": 0.07350821562409916, + "grad_norm": 11.04404067993164, + "learning_rate": 1.5930902111324377e-05, + "loss": 7.0495, + "step": 85 + }, + { + "epoch": 0.07437301816085327, + "grad_norm": 11.240497589111328, + "learning_rate": 1.6122840690978887e-05, + "loss": 5.8256, + "step": 86 + }, + { + "epoch": 0.07523782069760739, + "grad_norm": 10.409235000610352, + "learning_rate": 1.6314779270633397e-05, + "loss": 5.7203, + "step": 87 + }, + { + "epoch": 0.07610262323436148, + "grad_norm": 11.557363510131836, + "learning_rate": 1.6506717850287907e-05, + "loss": 6.5094, + "step": 88 + }, + { + "epoch": 0.0769674257711156, + "grad_norm": 9.760974884033203, + "learning_rate": 1.669865642994242e-05, + "loss": 5.7523, + "step": 89 + }, + { + "epoch": 0.0778322283078697, + "grad_norm": 9.31316089630127, + "learning_rate": 1.689059500959693e-05, + "loss": 6.0464, + "step": 90 + }, + { + "epoch": 0.07869703084462382, + "grad_norm": 11.943814277648926, + "learning_rate": 1.708253358925144e-05, + "loss": 6.5233, + "step": 91 + }, + { + "epoch": 0.07956183338137791, + "grad_norm": 9.126127243041992, + "learning_rate": 1.727447216890595e-05, + "loss": 6.8966, + "step": 92 + }, + { + "epoch": 0.08042663591813203, + "grad_norm": 9.386579513549805, + "learning_rate": 1.746641074856046e-05, + "loss": 6.3621, + "step": 93 + }, + { + "epoch": 0.08129143845488614, + "grad_norm": 10.63054370880127, + "learning_rate": 1.765834932821497e-05, + "loss": 6.0194, + "step": 94 + }, + { + "epoch": 0.08215624099164025, + "grad_norm": 10.119132995605469, + "learning_rate": 1.785028790786948e-05, + "loss": 6.6797, + "step": 95 + }, + { + "epoch": 0.08302104352839434, + "grad_norm": 10.746257781982422, + "learning_rate": 1.8042226487523995e-05, + "loss": 5.6214, + "step": 96 + }, + { + "epoch": 0.08388584606514846, + "grad_norm": 10.64887809753418, + "learning_rate": 1.8234165067178505e-05, + "loss": 6.4946, + "step": 97 + }, + { + "epoch": 0.08475064860190257, + "grad_norm": 11.115398406982422, + "learning_rate": 1.8426103646833015e-05, + "loss": 5.9069, + "step": 98 + }, + { + "epoch": 0.08561545113865668, + "grad_norm": 11.452004432678223, + "learning_rate": 1.8618042226487525e-05, + "loss": 6.8848, + "step": 99 + }, + { + "epoch": 0.08648025367541078, + "grad_norm": 12.722066879272461, + "learning_rate": 1.8809980806142035e-05, + "loss": 7.7248, + "step": 100 + }, + { + "epoch": 0.08734505621216489, + "grad_norm": 10.500570297241211, + "learning_rate": 1.9001919385796545e-05, + "loss": 6.9069, + "step": 101 + }, + { + "epoch": 0.088209858748919, + "grad_norm": 10.750312805175781, + "learning_rate": 1.9193857965451055e-05, + "loss": 6.3612, + "step": 102 + }, + { + "epoch": 0.08907466128567311, + "grad_norm": 12.96158218383789, + "learning_rate": 1.9385796545105565e-05, + "loss": 7.6664, + "step": 103 + }, + { + "epoch": 0.08993946382242721, + "grad_norm": 11.477307319641113, + "learning_rate": 1.957773512476008e-05, + "loss": 5.4654, + "step": 104 + }, + { + "epoch": 0.09080426635918132, + "grad_norm": 13.458792686462402, + "learning_rate": 1.976967370441459e-05, + "loss": 6.7583, + "step": 105 + }, + { + "epoch": 0.09166906889593543, + "grad_norm": 11.862403869628906, + "learning_rate": 1.99616122840691e-05, + "loss": 6.354, + "step": 106 + }, + { + "epoch": 0.09253387143268954, + "grad_norm": 15.43807601928711, + "learning_rate": 2.015355086372361e-05, + "loss": 5.0476, + "step": 107 + }, + { + "epoch": 0.09339867396944364, + "grad_norm": 15.703176498413086, + "learning_rate": 2.0345489443378122e-05, + "loss": 5.535, + "step": 108 + }, + { + "epoch": 0.09426347650619775, + "grad_norm": 15.830728530883789, + "learning_rate": 2.0537428023032633e-05, + "loss": 5.125, + "step": 109 + }, + { + "epoch": 0.09512827904295186, + "grad_norm": 18.535364151000977, + "learning_rate": 2.0729366602687143e-05, + "loss": 5.3941, + "step": 110 + }, + { + "epoch": 0.09599308157970597, + "grad_norm": 20.664087295532227, + "learning_rate": 2.0921305182341653e-05, + "loss": 7.6313, + "step": 111 + }, + { + "epoch": 0.09685788411646007, + "grad_norm": 26.702512741088867, + "learning_rate": 2.1113243761996163e-05, + "loss": 5.584, + "step": 112 + }, + { + "epoch": 0.09772268665321418, + "grad_norm": 24.893169403076172, + "learning_rate": 2.1305182341650673e-05, + "loss": 6.7148, + "step": 113 + }, + { + "epoch": 0.09858748918996829, + "grad_norm": 23.61020278930664, + "learning_rate": 2.1497120921305183e-05, + "loss": 4.3739, + "step": 114 + }, + { + "epoch": 0.0994522917267224, + "grad_norm": 30.567276000976562, + "learning_rate": 2.1689059500959693e-05, + "loss": 7.8202, + "step": 115 + }, + { + "epoch": 0.0994522917267224, + "eval_Qnli-dev-1024_cosine_accuracy": 0.6979166666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.9675614833831787, + "eval_Qnli-dev-1024_cosine_ap": 0.688956658941829, + "eval_Qnli-dev-1024_cosine_f1": 0.6881720430107527, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.9675614833831787, + "eval_Qnli-dev-1024_cosine_mcc": 0.3966087176872613, + "eval_Qnli-dev-1024_cosine_precision": 0.6666666666666666, + "eval_Qnli-dev-1024_cosine_recall": 0.7111111111111111, + "eval_Qnli-dev_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8765031099319458, + "eval_Qnli-dev_cosine_ap": 0.760920950345153, + "eval_Qnli-dev_cosine_f1": 0.7272727272727272, + "eval_Qnli-dev_cosine_f1_threshold": 0.8635396957397461, + "eval_Qnli-dev_cosine_mcc": 0.4497120149145933, + "eval_Qnli-dev_cosine_precision": 0.6666666666666666, + "eval_Qnli-dev_cosine_recall": 0.8, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.8020833134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9791666865348816, + "eval_global_dataset_loss": 1.940317153930664, + "eval_global_dataset_runtime": 67.8871, + "eval_global_dataset_samples_per_second": 14.318, + "eval_global_dataset_steps_per_second": 0.309, + "eval_sequential_score": 0.8020833134651184, + "eval_sts-test-1024_pearson_cosine": 0.6710405361187501, + "eval_sts-test-1024_spearman_cosine": 0.8309834676298848, + "eval_sts-test_pearson_cosine": 0.9054066453363472, + "eval_sts-test_spearman_cosine": 0.9155824166550393, + "step": 115 + }, + { + "epoch": 0.1003170942634765, + "grad_norm": 31.343202590942383, + "learning_rate": 2.1880998080614203e-05, + "loss": 4.42, + "step": 116 + }, + { + "epoch": 0.10118189680023061, + "grad_norm": 28.399757385253906, + "learning_rate": 2.2072936660268713e-05, + "loss": 4.5047, + "step": 117 + }, + { + "epoch": 0.10204669933698472, + "grad_norm": 32.25544357299805, + "learning_rate": 2.2264875239923223e-05, + "loss": 4.427, + "step": 118 + }, + { + "epoch": 0.10291150187373883, + "grad_norm": 27.07774543762207, + "learning_rate": 2.2456813819577733e-05, + "loss": 3.1201, + "step": 119 + }, + { + "epoch": 0.10377630441049293, + "grad_norm": 31.4462833404541, + "learning_rate": 2.2648752399232247e-05, + "loss": 4.3632, + "step": 120 + }, + { + "epoch": 0.10464110694724704, + "grad_norm": 27.67288589477539, + "learning_rate": 2.2840690978886757e-05, + "loss": 3.5101, + "step": 121 + }, + { + "epoch": 0.10550590948400115, + "grad_norm": 29.23362922668457, + "learning_rate": 2.303262955854127e-05, + "loss": 4.7499, + "step": 122 + }, + { + "epoch": 0.10637071202075526, + "grad_norm": 27.85274887084961, + "learning_rate": 2.322456813819578e-05, + "loss": 4.5242, + "step": 123 + }, + { + "epoch": 0.10723551455750936, + "grad_norm": 21.893939971923828, + "learning_rate": 2.341650671785029e-05, + "loss": 3.379, + "step": 124 + }, + { + "epoch": 0.10810031709426347, + "grad_norm": 18.63385772705078, + "learning_rate": 2.36084452975048e-05, + "loss": 2.8004, + "step": 125 + }, + { + "epoch": 0.10896511963101758, + "grad_norm": 16.17616844177246, + "learning_rate": 2.380038387715931e-05, + "loss": 2.8855, + "step": 126 + }, + { + "epoch": 0.1098299221677717, + "grad_norm": 17.123281478881836, + "learning_rate": 2.399232245681382e-05, + "loss": 3.937, + "step": 127 + }, + { + "epoch": 0.1106947247045258, + "grad_norm": 14.539612770080566, + "learning_rate": 2.418426103646833e-05, + "loss": 3.5914, + "step": 128 + }, + { + "epoch": 0.1115595272412799, + "grad_norm": 12.644956588745117, + "learning_rate": 2.437619961612284e-05, + "loss": 2.6459, + "step": 129 + }, + { + "epoch": 0.11242432977803402, + "grad_norm": 10.95170783996582, + "learning_rate": 2.456813819577735e-05, + "loss": 2.3887, + "step": 130 + }, + { + "epoch": 0.11328913231478813, + "grad_norm": 12.561387062072754, + "learning_rate": 2.476007677543186e-05, + "loss": 4.1043, + "step": 131 + }, + { + "epoch": 0.11415393485154224, + "grad_norm": 9.273588180541992, + "learning_rate": 2.495201535508637e-05, + "loss": 2.2758, + "step": 132 + }, + { + "epoch": 0.11501873738829634, + "grad_norm": 9.219544410705566, + "learning_rate": 2.514395393474088e-05, + "loss": 2.859, + "step": 133 + }, + { + "epoch": 0.11588353992505045, + "grad_norm": 8.443903923034668, + "learning_rate": 2.533589251439539e-05, + "loss": 2.0162, + "step": 134 + }, + { + "epoch": 0.11674834246180456, + "grad_norm": 9.522578239440918, + "learning_rate": 2.5527831094049905e-05, + "loss": 2.7069, + "step": 135 + }, + { + "epoch": 0.11761314499855867, + "grad_norm": 8.184837341308594, + "learning_rate": 2.5719769673704415e-05, + "loss": 1.9536, + "step": 136 + }, + { + "epoch": 0.11847794753531277, + "grad_norm": 9.079197883605957, + "learning_rate": 2.5911708253358925e-05, + "loss": 2.3063, + "step": 137 + }, + { + "epoch": 0.11934275007206688, + "grad_norm": 9.438823699951172, + "learning_rate": 2.6103646833013435e-05, + "loss": 3.3783, + "step": 138 + }, + { + "epoch": 0.12020755260882099, + "grad_norm": 8.003981590270996, + "learning_rate": 2.6295585412667952e-05, + "loss": 1.9538, + "step": 139 + }, + { + "epoch": 0.1210723551455751, + "grad_norm": 8.199268341064453, + "learning_rate": 2.6487523992322462e-05, + "loss": 2.8959, + "step": 140 + }, + { + "epoch": 0.1219371576823292, + "grad_norm": 9.071074485778809, + "learning_rate": 2.6679462571976972e-05, + "loss": 2.3064, + "step": 141 + }, + { + "epoch": 0.12280196021908331, + "grad_norm": 10.237217903137207, + "learning_rate": 2.6871401151631482e-05, + "loss": 2.4625, + "step": 142 + }, + { + "epoch": 0.12366676275583742, + "grad_norm": 7.96627950668335, + "learning_rate": 2.7063339731285992e-05, + "loss": 2.4083, + "step": 143 + }, + { + "epoch": 0.12453156529259153, + "grad_norm": 8.751070022583008, + "learning_rate": 2.7255278310940502e-05, + "loss": 1.5914, + "step": 144 + }, + { + "epoch": 0.12539636782934563, + "grad_norm": 6.843534469604492, + "learning_rate": 2.7447216890595012e-05, + "loss": 1.5798, + "step": 145 + }, + { + "epoch": 0.12626117036609974, + "grad_norm": 7.700779438018799, + "learning_rate": 2.7639155470249522e-05, + "loss": 1.5194, + "step": 146 + }, + { + "epoch": 0.12712597290285385, + "grad_norm": 8.954259872436523, + "learning_rate": 2.7831094049904032e-05, + "loss": 1.5924, + "step": 147 + }, + { + "epoch": 0.12799077543960796, + "grad_norm": 10.815597534179688, + "learning_rate": 2.8023032629558543e-05, + "loss": 3.1143, + "step": 148 + }, + { + "epoch": 0.12885557797636207, + "grad_norm": 9.539572715759277, + "learning_rate": 2.8214971209213053e-05, + "loss": 1.8632, + "step": 149 + }, + { + "epoch": 0.12972038051311618, + "grad_norm": 6.322872638702393, + "learning_rate": 2.8406909788867563e-05, + "loss": 2.0489, + "step": 150 + }, + { + "epoch": 0.13058518304987027, + "grad_norm": 6.538212776184082, + "learning_rate": 2.8598848368522073e-05, + "loss": 1.5573, + "step": 151 + }, + { + "epoch": 0.13144998558662438, + "grad_norm": 6.798872470855713, + "learning_rate": 2.8790786948176586e-05, + "loss": 2.8024, + "step": 152 + }, + { + "epoch": 0.1323147881233785, + "grad_norm": 8.393974304199219, + "learning_rate": 2.8982725527831096e-05, + "loss": 1.9423, + "step": 153 + }, + { + "epoch": 0.1331795906601326, + "grad_norm": 8.043729782104492, + "learning_rate": 2.9174664107485606e-05, + "loss": 3.1444, + "step": 154 + }, + { + "epoch": 0.1340443931968867, + "grad_norm": 9.158576965332031, + "learning_rate": 2.9366602687140116e-05, + "loss": 2.5482, + "step": 155 + }, + { + "epoch": 0.13490919573364082, + "grad_norm": 6.786825180053711, + "learning_rate": 2.9558541266794627e-05, + "loss": 1.0428, + "step": 156 + }, + { + "epoch": 0.13577399827039494, + "grad_norm": 12.157453536987305, + "learning_rate": 2.9750479846449137e-05, + "loss": 5.8267, + "step": 157 + }, + { + "epoch": 0.13663880080714905, + "grad_norm": 10.719176292419434, + "learning_rate": 2.9942418426103647e-05, + "loss": 1.9785, + "step": 158 + }, + { + "epoch": 0.13750360334390313, + "grad_norm": 8.25823974609375, + "learning_rate": 3.0134357005758157e-05, + "loss": 2.5306, + "step": 159 + }, + { + "epoch": 0.13836840588065724, + "grad_norm": 8.451217651367188, + "learning_rate": 3.0326295585412667e-05, + "loss": 1.8271, + "step": 160 + }, + { + "epoch": 0.13923320841741135, + "grad_norm": 9.387060165405273, + "learning_rate": 3.051823416506718e-05, + "loss": 2.6579, + "step": 161 + }, + { + "epoch": 0.14009801095416546, + "grad_norm": 8.968480110168457, + "learning_rate": 3.071017274472169e-05, + "loss": 3.0193, + "step": 162 + }, + { + "epoch": 0.14096281349091958, + "grad_norm": 8.816688537597656, + "learning_rate": 3.09021113243762e-05, + "loss": 1.5596, + "step": 163 + }, + { + "epoch": 0.1418276160276737, + "grad_norm": 5.402006149291992, + "learning_rate": 3.1094049904030714e-05, + "loss": 1.4505, + "step": 164 + }, + { + "epoch": 0.1426924185644278, + "grad_norm": 7.654393196105957, + "learning_rate": 3.128598848368523e-05, + "loss": 2.5331, + "step": 165 + }, + { + "epoch": 0.1435572211011819, + "grad_norm": 6.393066883087158, + "learning_rate": 3.1477927063339734e-05, + "loss": 1.384, + "step": 166 + }, + { + "epoch": 0.144422023637936, + "grad_norm": 8.975717544555664, + "learning_rate": 3.166986564299425e-05, + "loss": 3.3553, + "step": 167 + }, + { + "epoch": 0.1452868261746901, + "grad_norm": 8.812336921691895, + "learning_rate": 3.1861804222648754e-05, + "loss": 2.2541, + "step": 168 + }, + { + "epoch": 0.14615162871144421, + "grad_norm": 7.189652919769287, + "learning_rate": 3.205374280230327e-05, + "loss": 1.1827, + "step": 169 + }, + { + "epoch": 0.14701643124819833, + "grad_norm": 7.888529300689697, + "learning_rate": 3.2245681381957774e-05, + "loss": 1.3643, + "step": 170 + }, + { + "epoch": 0.14788123378495244, + "grad_norm": 6.611407279968262, + "learning_rate": 3.243761996161229e-05, + "loss": 1.9817, + "step": 171 + }, + { + "epoch": 0.14874603632170655, + "grad_norm": 6.734430313110352, + "learning_rate": 3.2629558541266795e-05, + "loss": 2.3332, + "step": 172 + }, + { + "epoch": 0.14961083885846066, + "grad_norm": 6.5995306968688965, + "learning_rate": 3.282149712092131e-05, + "loss": 1.4638, + "step": 173 + }, + { + "epoch": 0.15047564139521477, + "grad_norm": 7.57749605178833, + "learning_rate": 3.3013435700575815e-05, + "loss": 1.7929, + "step": 174 + }, + { + "epoch": 0.15134044393196885, + "grad_norm": 4.956903457641602, + "learning_rate": 3.320537428023033e-05, + "loss": 1.0457, + "step": 175 + }, + { + "epoch": 0.15220524646872297, + "grad_norm": 9.929686546325684, + "learning_rate": 3.339731285988484e-05, + "loss": 1.4866, + "step": 176 + }, + { + "epoch": 0.15307004900547708, + "grad_norm": 7.194726467132568, + "learning_rate": 3.358925143953935e-05, + "loss": 1.7834, + "step": 177 + }, + { + "epoch": 0.1539348515422312, + "grad_norm": 6.916417598724365, + "learning_rate": 3.378119001919386e-05, + "loss": 1.1396, + "step": 178 + }, + { + "epoch": 0.1547996540789853, + "grad_norm": 9.47856330871582, + "learning_rate": 3.397312859884837e-05, + "loss": 1.9811, + "step": 179 + }, + { + "epoch": 0.1556644566157394, + "grad_norm": 7.894885540008545, + "learning_rate": 3.416506717850288e-05, + "loss": 1.1859, + "step": 180 + }, + { + "epoch": 0.15652925915249352, + "grad_norm": 7.631194114685059, + "learning_rate": 3.435700575815739e-05, + "loss": 1.5481, + "step": 181 + }, + { + "epoch": 0.15739406168924763, + "grad_norm": 5.6157073974609375, + "learning_rate": 3.45489443378119e-05, + "loss": 1.5954, + "step": 182 + }, + { + "epoch": 0.15825886422600172, + "grad_norm": 9.201720237731934, + "learning_rate": 3.474088291746641e-05, + "loss": 2.2163, + "step": 183 + }, + { + "epoch": 0.15912366676275583, + "grad_norm": 5.702026844024658, + "learning_rate": 3.493282149712092e-05, + "loss": 1.475, + "step": 184 + }, + { + "epoch": 0.15998846929950994, + "grad_norm": 5.93116569519043, + "learning_rate": 3.512476007677543e-05, + "loss": 1.2394, + "step": 185 + }, + { + "epoch": 0.16085327183626405, + "grad_norm": 3.9884233474731445, + "learning_rate": 3.531669865642994e-05, + "loss": 1.2713, + "step": 186 + }, + { + "epoch": 0.16171807437301816, + "grad_norm": 7.569946765899658, + "learning_rate": 3.550863723608445e-05, + "loss": 1.435, + "step": 187 + }, + { + "epoch": 0.16258287690977227, + "grad_norm": 7.594637393951416, + "learning_rate": 3.570057581573896e-05, + "loss": 1.1762, + "step": 188 + }, + { + "epoch": 0.16344767944652638, + "grad_norm": 7.092876434326172, + "learning_rate": 3.5892514395393476e-05, + "loss": 2.3349, + "step": 189 + }, + { + "epoch": 0.1643124819832805, + "grad_norm": 6.997330188751221, + "learning_rate": 3.608445297504799e-05, + "loss": 1.1459, + "step": 190 + }, + { + "epoch": 0.16517728452003458, + "grad_norm": 9.205595016479492, + "learning_rate": 3.6276391554702496e-05, + "loss": 1.313, + "step": 191 + }, + { + "epoch": 0.1660420870567887, + "grad_norm": 6.776134014129639, + "learning_rate": 3.646833013435701e-05, + "loss": 1.1422, + "step": 192 + }, + { + "epoch": 0.1669068895935428, + "grad_norm": 9.902478218078613, + "learning_rate": 3.6660268714011516e-05, + "loss": 1.4937, + "step": 193 + }, + { + "epoch": 0.1677716921302969, + "grad_norm": 8.630653381347656, + "learning_rate": 3.685220729366603e-05, + "loss": 1.351, + "step": 194 + }, + { + "epoch": 0.16863649466705102, + "grad_norm": 8.957950592041016, + "learning_rate": 3.704414587332054e-05, + "loss": 1.1581, + "step": 195 + }, + { + "epoch": 0.16950129720380513, + "grad_norm": 8.303983688354492, + "learning_rate": 3.723608445297505e-05, + "loss": 2.1473, + "step": 196 + }, + { + "epoch": 0.17036609974055925, + "grad_norm": 8.272674560546875, + "learning_rate": 3.7428023032629563e-05, + "loss": 0.8801, + "step": 197 + }, + { + "epoch": 0.17123090227731336, + "grad_norm": 7.904557228088379, + "learning_rate": 3.761996161228407e-05, + "loss": 1.3985, + "step": 198 + }, + { + "epoch": 0.17209570481406747, + "grad_norm": 5.652804851531982, + "learning_rate": 3.7811900191938584e-05, + "loss": 0.8468, + "step": 199 + }, + { + "epoch": 0.17296050735082155, + "grad_norm": 5.771730422973633, + "learning_rate": 3.800383877159309e-05, + "loss": 1.0563, + "step": 200 + }, + { + "epoch": 0.17382530988757566, + "grad_norm": 6.634278297424316, + "learning_rate": 3.8195777351247604e-05, + "loss": 0.9612, + "step": 201 + }, + { + "epoch": 0.17469011242432977, + "grad_norm": 8.659712791442871, + "learning_rate": 3.838771593090211e-05, + "loss": 1.665, + "step": 202 + }, + { + "epoch": 0.17555491496108389, + "grad_norm": 6.617002487182617, + "learning_rate": 3.8579654510556624e-05, + "loss": 1.1505, + "step": 203 + }, + { + "epoch": 0.176419717497838, + "grad_norm": 10.3783597946167, + "learning_rate": 3.877159309021113e-05, + "loss": 1.7958, + "step": 204 + }, + { + "epoch": 0.1772845200345921, + "grad_norm": 9.473942756652832, + "learning_rate": 3.8963531669865644e-05, + "loss": 1.3115, + "step": 205 + }, + { + "epoch": 0.17814932257134622, + "grad_norm": 7.500204563140869, + "learning_rate": 3.915547024952016e-05, + "loss": 1.0855, + "step": 206 + }, + { + "epoch": 0.17901412510810033, + "grad_norm": 6.897130012512207, + "learning_rate": 3.9347408829174664e-05, + "loss": 1.1051, + "step": 207 + }, + { + "epoch": 0.17987892764485441, + "grad_norm": 9.034842491149902, + "learning_rate": 3.953934740882918e-05, + "loss": 2.5371, + "step": 208 + }, + { + "epoch": 0.18074373018160853, + "grad_norm": 9.812570571899414, + "learning_rate": 3.9731285988483684e-05, + "loss": 1.7992, + "step": 209 + }, + { + "epoch": 0.18160853271836264, + "grad_norm": 7.528004169464111, + "learning_rate": 3.99232245681382e-05, + "loss": 1.7798, + "step": 210 + }, + { + "epoch": 0.18247333525511675, + "grad_norm": 7.52139139175415, + "learning_rate": 4.0115163147792705e-05, + "loss": 0.7093, + "step": 211 + }, + { + "epoch": 0.18333813779187086, + "grad_norm": 9.2921142578125, + "learning_rate": 4.030710172744722e-05, + "loss": 1.2681, + "step": 212 + }, + { + "epoch": 0.18420294032862497, + "grad_norm": 4.883711814880371, + "learning_rate": 4.049904030710173e-05, + "loss": 0.911, + "step": 213 + }, + { + "epoch": 0.18506774286537908, + "grad_norm": 8.103593826293945, + "learning_rate": 4.0690978886756245e-05, + "loss": 1.1144, + "step": 214 + }, + { + "epoch": 0.1859325454021332, + "grad_norm": 6.5846381187438965, + "learning_rate": 4.088291746641075e-05, + "loss": 0.8362, + "step": 215 + }, + { + "epoch": 0.18679734793888728, + "grad_norm": 5.238864421844482, + "learning_rate": 4.1074856046065265e-05, + "loss": 0.838, + "step": 216 + }, + { + "epoch": 0.1876621504756414, + "grad_norm": 7.091164588928223, + "learning_rate": 4.126679462571977e-05, + "loss": 1.3143, + "step": 217 + }, + { + "epoch": 0.1885269530123955, + "grad_norm": 4.529580116271973, + "learning_rate": 4.1458733205374285e-05, + "loss": 0.8799, + "step": 218 + }, + { + "epoch": 0.1893917555491496, + "grad_norm": 5.912927627563477, + "learning_rate": 4.165067178502879e-05, + "loss": 0.7928, + "step": 219 + }, + { + "epoch": 0.19025655808590372, + "grad_norm": 7.802720069885254, + "learning_rate": 4.1842610364683305e-05, + "loss": 0.7077, + "step": 220 + }, + { + "epoch": 0.19112136062265783, + "grad_norm": 7.49670934677124, + "learning_rate": 4.203454894433781e-05, + "loss": 1.7815, + "step": 221 + }, + { + "epoch": 0.19198616315941194, + "grad_norm": 5.978695392608643, + "learning_rate": 4.2226487523992326e-05, + "loss": 1.3599, + "step": 222 + }, + { + "epoch": 0.19285096569616605, + "grad_norm": 8.289727210998535, + "learning_rate": 4.241842610364683e-05, + "loss": 0.7413, + "step": 223 + }, + { + "epoch": 0.19371576823292014, + "grad_norm": 7.663917541503906, + "learning_rate": 4.2610364683301346e-05, + "loss": 1.9959, + "step": 224 + }, + { + "epoch": 0.19458057076967425, + "grad_norm": 9.845619201660156, + "learning_rate": 4.280230326295586e-05, + "loss": 1.9112, + "step": 225 + }, + { + "epoch": 0.19544537330642836, + "grad_norm": 5.703056812286377, + "learning_rate": 4.2994241842610366e-05, + "loss": 0.5033, + "step": 226 + }, + { + "epoch": 0.19631017584318247, + "grad_norm": 9.209814071655273, + "learning_rate": 4.318618042226488e-05, + "loss": 1.1669, + "step": 227 + }, + { + "epoch": 0.19717497837993658, + "grad_norm": 8.577181816101074, + "learning_rate": 4.3378119001919386e-05, + "loss": 1.2109, + "step": 228 + }, + { + "epoch": 0.1980397809166907, + "grad_norm": 7.078784942626953, + "learning_rate": 4.35700575815739e-05, + "loss": 0.781, + "step": 229 + }, + { + "epoch": 0.1989045834534448, + "grad_norm": 9.162598609924316, + "learning_rate": 4.3761996161228406e-05, + "loss": 1.5895, + "step": 230 + }, + { + "epoch": 0.1989045834534448, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7395833333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.867725133895874, + "eval_Qnli-dev-1024_cosine_ap": 0.713229712410124, + "eval_Qnli-dev-1024_cosine_f1": 0.7291666666666667, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.8537728786468506, + "eval_Qnli-dev-1024_cosine_mcc": 0.46405228758169936, + "eval_Qnli-dev-1024_cosine_precision": 0.6862745098039216, + "eval_Qnli-dev-1024_cosine_recall": 0.7777777777777778, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8137844204902649, + "eval_Qnli-dev_cosine_ap": 0.7513782450871136, + "eval_Qnli-dev_cosine_f1": 0.7222222222222222, + "eval_Qnli-dev_cosine_f1_threshold": 0.7686975002288818, + "eval_Qnli-dev_cosine_mcc": 0.41614558708189836, + "eval_Qnli-dev_cosine_precision": 0.6190476190476191, + "eval_Qnli-dev_cosine_recall": 0.8666666666666667, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9270833134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9791666865348816, + "eval_global_dataset_loss": 0.7282267808914185, + "eval_global_dataset_runtime": 67.7277, + "eval_global_dataset_samples_per_second": 14.352, + "eval_global_dataset_steps_per_second": 0.31, + "eval_sequential_score": 0.9270833134651184, + "eval_sts-test-1024_pearson_cosine": 0.8484219629681994, + "eval_sts-test-1024_spearman_cosine": 0.8984444397927454, + "eval_sts-test_pearson_cosine": 0.9066337545995211, + "eval_sts-test_spearman_cosine": 0.9170443296862147, + "step": 230 + }, + { + "epoch": 0.19976938599019892, + "grad_norm": 6.589021682739258, + "learning_rate": 4.395393474088292e-05, + "loss": 1.2439, + "step": 231 + }, + { + "epoch": 0.200634188526953, + "grad_norm": 4.9722089767456055, + "learning_rate": 4.4145873320537426e-05, + "loss": 0.5947, + "step": 232 + }, + { + "epoch": 0.2014989910637071, + "grad_norm": 6.424257278442383, + "learning_rate": 4.433781190019194e-05, + "loss": 1.1687, + "step": 233 + }, + { + "epoch": 0.20236379360046122, + "grad_norm": 10.21776008605957, + "learning_rate": 4.4529750479846447e-05, + "loss": 2.082, + "step": 234 + }, + { + "epoch": 0.20322859613721533, + "grad_norm": 6.3251633644104, + "learning_rate": 4.472168905950096e-05, + "loss": 0.521, + "step": 235 + }, + { + "epoch": 0.20409339867396945, + "grad_norm": 6.459076881408691, + "learning_rate": 4.491362763915547e-05, + "loss": 1.2406, + "step": 236 + }, + { + "epoch": 0.20495820121072356, + "grad_norm": 6.254432201385498, + "learning_rate": 4.510556621880998e-05, + "loss": 0.6586, + "step": 237 + }, + { + "epoch": 0.20582300374747767, + "grad_norm": 6.352238655090332, + "learning_rate": 4.5297504798464494e-05, + "loss": 0.6746, + "step": 238 + }, + { + "epoch": 0.20668780628423178, + "grad_norm": 4.247053146362305, + "learning_rate": 4.548944337811901e-05, + "loss": 0.3925, + "step": 239 + }, + { + "epoch": 0.20755260882098586, + "grad_norm": 6.61681604385376, + "learning_rate": 4.5681381957773514e-05, + "loss": 0.8654, + "step": 240 + }, + { + "epoch": 0.20841741135773997, + "grad_norm": 7.9061408042907715, + "learning_rate": 4.587332053742803e-05, + "loss": 0.6723, + "step": 241 + }, + { + "epoch": 0.20928221389449408, + "grad_norm": 3.9183671474456787, + "learning_rate": 4.606525911708254e-05, + "loss": 0.4345, + "step": 242 + }, + { + "epoch": 0.2101470164312482, + "grad_norm": 8.863993644714355, + "learning_rate": 4.625719769673705e-05, + "loss": 1.0822, + "step": 243 + }, + { + "epoch": 0.2110118189680023, + "grad_norm": 8.070558547973633, + "learning_rate": 4.644913627639156e-05, + "loss": 1.5697, + "step": 244 + }, + { + "epoch": 0.21187662150475642, + "grad_norm": 3.8370699882507324, + "learning_rate": 4.664107485604607e-05, + "loss": 0.3771, + "step": 245 + }, + { + "epoch": 0.21274142404151053, + "grad_norm": 4.8743486404418945, + "learning_rate": 4.683301343570058e-05, + "loss": 0.484, + "step": 246 + }, + { + "epoch": 0.21360622657826464, + "grad_norm": 6.827274322509766, + "learning_rate": 4.702495201535509e-05, + "loss": 1.0994, + "step": 247 + }, + { + "epoch": 0.21447102911501872, + "grad_norm": 6.400326251983643, + "learning_rate": 4.72168905950096e-05, + "loss": 1.0543, + "step": 248 + }, + { + "epoch": 0.21533583165177284, + "grad_norm": 9.760299682617188, + "learning_rate": 4.740882917466411e-05, + "loss": 1.1132, + "step": 249 + }, + { + "epoch": 0.21620063418852695, + "grad_norm": 9.413398742675781, + "learning_rate": 4.760076775431862e-05, + "loss": 1.7051, + "step": 250 + }, + { + "epoch": 0.21706543672528106, + "grad_norm": 6.986111164093018, + "learning_rate": 4.779270633397313e-05, + "loss": 0.5029, + "step": 251 + }, + { + "epoch": 0.21793023926203517, + "grad_norm": 11.26386547088623, + "learning_rate": 4.798464491362764e-05, + "loss": 1.5927, + "step": 252 + }, + { + "epoch": 0.21879504179878928, + "grad_norm": 5.758693695068359, + "learning_rate": 4.817658349328215e-05, + "loss": 0.9221, + "step": 253 + }, + { + "epoch": 0.2196598443355434, + "grad_norm": 6.061553478240967, + "learning_rate": 4.836852207293666e-05, + "loss": 0.989, + "step": 254 + }, + { + "epoch": 0.2205246468722975, + "grad_norm": 7.509443759918213, + "learning_rate": 4.8560460652591175e-05, + "loss": 1.9468, + "step": 255 + }, + { + "epoch": 0.2213894494090516, + "grad_norm": 7.857194900512695, + "learning_rate": 4.875239923224568e-05, + "loss": 0.7299, + "step": 256 + }, + { + "epoch": 0.2222542519458057, + "grad_norm": 9.96574592590332, + "learning_rate": 4.8944337811900195e-05, + "loss": 1.0199, + "step": 257 + }, + { + "epoch": 0.2231190544825598, + "grad_norm": 8.403667449951172, + "learning_rate": 4.91362763915547e-05, + "loss": 1.0238, + "step": 258 + }, + { + "epoch": 0.22398385701931392, + "grad_norm": 8.612835884094238, + "learning_rate": 4.9328214971209215e-05, + "loss": 1.8386, + "step": 259 + }, + { + "epoch": 0.22484865955606803, + "grad_norm": 7.690261363983154, + "learning_rate": 4.952015355086372e-05, + "loss": 0.7887, + "step": 260 + }, + { + "epoch": 0.22571346209282214, + "grad_norm": 9.24271011352539, + "learning_rate": 4.9712092130518236e-05, + "loss": 1.0248, + "step": 261 + }, + { + "epoch": 0.22657826462957625, + "grad_norm": 6.5738525390625, + "learning_rate": 4.990403071017274e-05, + "loss": 0.9891, + "step": 262 + }, + { + "epoch": 0.22744306716633036, + "grad_norm": 10.909134864807129, + "learning_rate": 5.009596928982726e-05, + "loss": 2.007, + "step": 263 + }, + { + "epoch": 0.22830786970308448, + "grad_norm": 7.512816905975342, + "learning_rate": 5.028790786948176e-05, + "loss": 1.6522, + "step": 264 + }, + { + "epoch": 0.22917267223983856, + "grad_norm": 4.3134446144104, + "learning_rate": 5.047984644913628e-05, + "loss": 0.8482, + "step": 265 + }, + { + "epoch": 0.23003747477659267, + "grad_norm": 6.679250240325928, + "learning_rate": 5.067178502879078e-05, + "loss": 0.7231, + "step": 266 + }, + { + "epoch": 0.23090227731334678, + "grad_norm": 8.060896873474121, + "learning_rate": 5.08637236084453e-05, + "loss": 0.9017, + "step": 267 + }, + { + "epoch": 0.2317670798501009, + "grad_norm": 10.473666191101074, + "learning_rate": 5.105566218809981e-05, + "loss": 1.2073, + "step": 268 + }, + { + "epoch": 0.232631882386855, + "grad_norm": 5.640207290649414, + "learning_rate": 5.124760076775432e-05, + "loss": 0.3825, + "step": 269 + }, + { + "epoch": 0.23349668492360912, + "grad_norm": 7.310571193695068, + "learning_rate": 5.143953934740883e-05, + "loss": 0.6634, + "step": 270 + }, + { + "epoch": 0.23436148746036323, + "grad_norm": 10.224222183227539, + "learning_rate": 5.163147792706334e-05, + "loss": 1.3564, + "step": 271 + }, + { + "epoch": 0.23522628999711734, + "grad_norm": 4.993323802947998, + "learning_rate": 5.182341650671785e-05, + "loss": 1.1294, + "step": 272 + }, + { + "epoch": 0.23609109253387142, + "grad_norm": 6.149577617645264, + "learning_rate": 5.201535508637236e-05, + "loss": 0.5599, + "step": 273 + }, + { + "epoch": 0.23695589507062553, + "grad_norm": 6.756112098693848, + "learning_rate": 5.220729366602687e-05, + "loss": 0.6844, + "step": 274 + }, + { + "epoch": 0.23782069760737964, + "grad_norm": 8.450921058654785, + "learning_rate": 5.2399232245681383e-05, + "loss": 0.7783, + "step": 275 + }, + { + "epoch": 0.23868550014413376, + "grad_norm": 7.2079267501831055, + "learning_rate": 5.2591170825335904e-05, + "loss": 1.101, + "step": 276 + }, + { + "epoch": 0.23955030268088787, + "grad_norm": 6.447202205657959, + "learning_rate": 5.2783109404990404e-05, + "loss": 0.9447, + "step": 277 + }, + { + "epoch": 0.24041510521764198, + "grad_norm": 10.80993366241455, + "learning_rate": 5.2975047984644924e-05, + "loss": 2.4452, + "step": 278 + }, + { + "epoch": 0.2412799077543961, + "grad_norm": 7.458428859710693, + "learning_rate": 5.3166986564299424e-05, + "loss": 1.2032, + "step": 279 + }, + { + "epoch": 0.2421447102911502, + "grad_norm": 11.762413024902344, + "learning_rate": 5.3358925143953944e-05, + "loss": 1.9775, + "step": 280 + }, + { + "epoch": 0.24300951282790428, + "grad_norm": 6.029952049255371, + "learning_rate": 5.3550863723608444e-05, + "loss": 0.523, + "step": 281 + }, + { + "epoch": 0.2438743153646584, + "grad_norm": 7.083131313323975, + "learning_rate": 5.3742802303262964e-05, + "loss": 0.6166, + "step": 282 + }, + { + "epoch": 0.2447391179014125, + "grad_norm": 8.343469619750977, + "learning_rate": 5.3934740882917464e-05, + "loss": 0.7902, + "step": 283 + }, + { + "epoch": 0.24560392043816662, + "grad_norm": 11.58956241607666, + "learning_rate": 5.4126679462571984e-05, + "loss": 1.1019, + "step": 284 + }, + { + "epoch": 0.24646872297492073, + "grad_norm": 6.451682090759277, + "learning_rate": 5.431861804222649e-05, + "loss": 1.1185, + "step": 285 + }, + { + "epoch": 0.24733352551167484, + "grad_norm": 8.293807983398438, + "learning_rate": 5.4510556621881004e-05, + "loss": 0.7051, + "step": 286 + }, + { + "epoch": 0.24819832804842895, + "grad_norm": 6.799464702606201, + "learning_rate": 5.470249520153551e-05, + "loss": 1.076, + "step": 287 + }, + { + "epoch": 0.24906313058518306, + "grad_norm": 6.457718849182129, + "learning_rate": 5.4894433781190025e-05, + "loss": 1.5065, + "step": 288 + }, + { + "epoch": 0.24992793312193715, + "grad_norm": 8.503544807434082, + "learning_rate": 5.508637236084453e-05, + "loss": 0.9986, + "step": 289 + }, + { + "epoch": 0.25079273565869126, + "grad_norm": 8.062347412109375, + "learning_rate": 5.5278310940499045e-05, + "loss": 1.1196, + "step": 290 + }, + { + "epoch": 0.2516575381954454, + "grad_norm": 5.3419508934021, + "learning_rate": 5.547024952015355e-05, + "loss": 0.7055, + "step": 291 + }, + { + "epoch": 0.2525223407321995, + "grad_norm": 3.2817585468292236, + "learning_rate": 5.5662188099808065e-05, + "loss": 0.2865, + "step": 292 + }, + { + "epoch": 0.25338714326895356, + "grad_norm": 8.452672004699707, + "learning_rate": 5.585412667946257e-05, + "loss": 0.6973, + "step": 293 + }, + { + "epoch": 0.2542519458057077, + "grad_norm": 9.172618865966797, + "learning_rate": 5.6046065259117085e-05, + "loss": 1.0347, + "step": 294 + }, + { + "epoch": 0.2551167483424618, + "grad_norm": 7.101957321166992, + "learning_rate": 5.623800383877159e-05, + "loss": 0.5065, + "step": 295 + }, + { + "epoch": 0.2559815508792159, + "grad_norm": 8.655692100524902, + "learning_rate": 5.6429942418426105e-05, + "loss": 0.7479, + "step": 296 + }, + { + "epoch": 0.25684635341597, + "grad_norm": 6.224137306213379, + "learning_rate": 5.662188099808061e-05, + "loss": 0.5214, + "step": 297 + }, + { + "epoch": 0.25771115595272415, + "grad_norm": 5.057961463928223, + "learning_rate": 5.6813819577735125e-05, + "loss": 0.4925, + "step": 298 + }, + { + "epoch": 0.25857595848947823, + "grad_norm": 5.989309787750244, + "learning_rate": 5.700575815738963e-05, + "loss": 0.9331, + "step": 299 + }, + { + "epoch": 0.25944076102623237, + "grad_norm": 5.4001336097717285, + "learning_rate": 5.7197696737044146e-05, + "loss": 0.4239, + "step": 300 + }, + { + "epoch": 0.26030556356298645, + "grad_norm": 8.392406463623047, + "learning_rate": 5.7389635316698666e-05, + "loss": 0.7426, + "step": 301 + }, + { + "epoch": 0.26117036609974054, + "grad_norm": 9.140869140625, + "learning_rate": 5.758157389635317e-05, + "loss": 1.292, + "step": 302 + }, + { + "epoch": 0.2620351686364947, + "grad_norm": 5.900636196136475, + "learning_rate": 5.7773512476007686e-05, + "loss": 1.1471, + "step": 303 + }, + { + "epoch": 0.26289997117324876, + "grad_norm": 2.76983904838562, + "learning_rate": 5.796545105566219e-05, + "loss": 0.5639, + "step": 304 + }, + { + "epoch": 0.2637647737100029, + "grad_norm": 8.212996482849121, + "learning_rate": 5.8157389635316706e-05, + "loss": 1.592, + "step": 305 + }, + { + "epoch": 0.264629576246757, + "grad_norm": 6.7358174324035645, + "learning_rate": 5.834932821497121e-05, + "loss": 0.6063, + "step": 306 + }, + { + "epoch": 0.2654943787835111, + "grad_norm": 9.422693252563477, + "learning_rate": 5.8541266794625726e-05, + "loss": 0.665, + "step": 307 + }, + { + "epoch": 0.2663591813202652, + "grad_norm": 10.346942901611328, + "learning_rate": 5.873320537428023e-05, + "loss": 0.7966, + "step": 308 + }, + { + "epoch": 0.2672239838570193, + "grad_norm": 8.950202941894531, + "learning_rate": 5.8925143953934746e-05, + "loss": 0.6255, + "step": 309 + }, + { + "epoch": 0.2680887863937734, + "grad_norm": 6.519852638244629, + "learning_rate": 5.911708253358925e-05, + "loss": 0.7197, + "step": 310 + }, + { + "epoch": 0.2689535889305275, + "grad_norm": 12.285760879516602, + "learning_rate": 5.9309021113243767e-05, + "loss": 2.22, + "step": 311 + }, + { + "epoch": 0.26981839146728165, + "grad_norm": 9.598986625671387, + "learning_rate": 5.950095969289827e-05, + "loss": 0.7472, + "step": 312 + }, + { + "epoch": 0.27068319400403573, + "grad_norm": 13.030138969421387, + "learning_rate": 5.969289827255279e-05, + "loss": 1.0278, + "step": 313 + }, + { + "epoch": 0.27154799654078987, + "grad_norm": 9.371500015258789, + "learning_rate": 5.9884836852207293e-05, + "loss": 0.6434, + "step": 314 + }, + { + "epoch": 0.27241279907754395, + "grad_norm": 7.387608528137207, + "learning_rate": 6.007677543186181e-05, + "loss": 0.4596, + "step": 315 + }, + { + "epoch": 0.2732776016142981, + "grad_norm": 6.994756698608398, + "learning_rate": 6.0268714011516314e-05, + "loss": 0.5547, + "step": 316 + }, + { + "epoch": 0.2741424041510522, + "grad_norm": 7.713170528411865, + "learning_rate": 6.046065259117083e-05, + "loss": 1.2906, + "step": 317 + }, + { + "epoch": 0.27500720668780626, + "grad_norm": 12.936992645263672, + "learning_rate": 6.0652591170825334e-05, + "loss": 2.2893, + "step": 318 + }, + { + "epoch": 0.2758720092245604, + "grad_norm": 12.210866928100586, + "learning_rate": 6.084452975047985e-05, + "loss": 2.0067, + "step": 319 + }, + { + "epoch": 0.2767368117613145, + "grad_norm": 9.767999649047852, + "learning_rate": 6.103646833013436e-05, + "loss": 1.0523, + "step": 320 + }, + { + "epoch": 0.2776016142980686, + "grad_norm": 10.349803924560547, + "learning_rate": 6.122840690978887e-05, + "loss": 1.425, + "step": 321 + }, + { + "epoch": 0.2784664168348227, + "grad_norm": 8.848223686218262, + "learning_rate": 6.142034548944337e-05, + "loss": 1.0846, + "step": 322 + }, + { + "epoch": 0.27933121937157684, + "grad_norm": 12.004369735717773, + "learning_rate": 6.16122840690979e-05, + "loss": 1.614, + "step": 323 + }, + { + "epoch": 0.28019602190833093, + "grad_norm": 4.841424465179443, + "learning_rate": 6.18042226487524e-05, + "loss": 0.831, + "step": 324 + }, + { + "epoch": 0.281060824445085, + "grad_norm": 10.002786636352539, + "learning_rate": 6.199616122840691e-05, + "loss": 0.8297, + "step": 325 + }, + { + "epoch": 0.28192562698183915, + "grad_norm": 6.301035404205322, + "learning_rate": 6.218809980806143e-05, + "loss": 0.5425, + "step": 326 + }, + { + "epoch": 0.28279042951859323, + "grad_norm": 5.8098626136779785, + "learning_rate": 6.238003838771593e-05, + "loss": 0.6583, + "step": 327 + }, + { + "epoch": 0.2836552320553474, + "grad_norm": 5.272045135498047, + "learning_rate": 6.257197696737045e-05, + "loss": 1.0148, + "step": 328 + }, + { + "epoch": 0.28452003459210146, + "grad_norm": 8.22673511505127, + "learning_rate": 6.276391554702495e-05, + "loss": 1.4798, + "step": 329 + }, + { + "epoch": 0.2853848371288556, + "grad_norm": 3.6933820247650146, + "learning_rate": 6.295585412667947e-05, + "loss": 0.3907, + "step": 330 + }, + { + "epoch": 0.2862496396656097, + "grad_norm": 9.97194766998291, + "learning_rate": 6.314779270633397e-05, + "loss": 1.2206, + "step": 331 + }, + { + "epoch": 0.2871144422023638, + "grad_norm": 3.41243577003479, + "learning_rate": 6.33397312859885e-05, + "loss": 0.6509, + "step": 332 + }, + { + "epoch": 0.2879792447391179, + "grad_norm": 5.184510231018066, + "learning_rate": 6.3531669865643e-05, + "loss": 0.5982, + "step": 333 + }, + { + "epoch": 0.288844047275872, + "grad_norm": 6.894106864929199, + "learning_rate": 6.372360844529751e-05, + "loss": 1.066, + "step": 334 + }, + { + "epoch": 0.2897088498126261, + "grad_norm": 6.806879997253418, + "learning_rate": 6.391554702495202e-05, + "loss": 0.6874, + "step": 335 + }, + { + "epoch": 0.2905736523493802, + "grad_norm": 4.7376933097839355, + "learning_rate": 6.410748560460654e-05, + "loss": 0.2232, + "step": 336 + }, + { + "epoch": 0.29143845488613435, + "grad_norm": 7.3895745277404785, + "learning_rate": 6.429942418426104e-05, + "loss": 0.8978, + "step": 337 + }, + { + "epoch": 0.29230325742288843, + "grad_norm": 4.52320671081543, + "learning_rate": 6.449136276391555e-05, + "loss": 0.5689, + "step": 338 + }, + { + "epoch": 0.29316805995964257, + "grad_norm": 10.309342384338379, + "learning_rate": 6.468330134357006e-05, + "loss": 1.1131, + "step": 339 + }, + { + "epoch": 0.29403286249639665, + "grad_norm": 7.698537826538086, + "learning_rate": 6.487523992322458e-05, + "loss": 0.4493, + "step": 340 + }, + { + "epoch": 0.2948976650331508, + "grad_norm": 9.31425952911377, + "learning_rate": 6.506717850287908e-05, + "loss": 0.5409, + "step": 341 + }, + { + "epoch": 0.2957624675699049, + "grad_norm": 3.6749117374420166, + "learning_rate": 6.525911708253359e-05, + "loss": 0.5921, + "step": 342 + }, + { + "epoch": 0.29662727010665896, + "grad_norm": 8.300640106201172, + "learning_rate": 6.54510556621881e-05, + "loss": 0.6657, + "step": 343 + }, + { + "epoch": 0.2974920726434131, + "grad_norm": 7.509027481079102, + "learning_rate": 6.564299424184262e-05, + "loss": 0.8345, + "step": 344 + }, + { + "epoch": 0.2983568751801672, + "grad_norm": 6.161888122558594, + "learning_rate": 6.583493282149712e-05, + "loss": 0.9418, + "step": 345 + }, + { + "epoch": 0.2983568751801672, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8395562767982483, + "eval_Qnli-dev-1024_cosine_ap": 0.753054394869091, + "eval_Qnli-dev-1024_cosine_f1": 0.7216494845360825, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.8208398818969727, + "eval_Qnli-dev-1024_cosine_mcc": 0.44512380090846426, + "eval_Qnli-dev-1024_cosine_precision": 0.6730769230769231, + "eval_Qnli-dev-1024_cosine_recall": 0.7777777777777778, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7856455445289612, + "eval_Qnli-dev_cosine_ap": 0.7529763141762885, + "eval_Qnli-dev_cosine_f1": 0.7169811320754719, + "eval_Qnli-dev_cosine_f1_threshold": 0.7426920533180237, + "eval_Qnli-dev_cosine_mcc": 0.4079411028893153, + "eval_Qnli-dev_cosine_precision": 0.6229508196721312, + "eval_Qnli-dev_cosine_recall": 0.8444444444444444, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9270833134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9791666865348816, + "eval_global_dataset_loss": 0.6650346517562866, + "eval_global_dataset_runtime": 67.8938, + "eval_global_dataset_samples_per_second": 14.316, + "eval_global_dataset_steps_per_second": 0.309, + "eval_sequential_score": 0.9270833134651184, + "eval_sts-test-1024_pearson_cosine": 0.8520971782224942, + "eval_sts-test-1024_spearman_cosine": 0.894236476710775, + "eval_sts-test_pearson_cosine": 0.9080363785366253, + "eval_sts-test_spearman_cosine": 0.9193020252854658, + "step": 345 + }, + { + "epoch": 0.2992216777169213, + "grad_norm": 3.2981910705566406, + "learning_rate": 6.602687140115163e-05, + "loss": 0.3752, + "step": 346 + }, + { + "epoch": 0.3000864802536754, + "grad_norm": 5.650154113769531, + "learning_rate": 6.621880998080614e-05, + "loss": 0.362, + "step": 347 + }, + { + "epoch": 0.30095128279042954, + "grad_norm": 7.166718482971191, + "learning_rate": 6.641074856046066e-05, + "loss": 0.99, + "step": 348 + }, + { + "epoch": 0.3018160853271836, + "grad_norm": 5.6671295166015625, + "learning_rate": 6.660268714011516e-05, + "loss": 0.4309, + "step": 349 + }, + { + "epoch": 0.3026808878639377, + "grad_norm": 7.15688943862915, + "learning_rate": 6.679462571976968e-05, + "loss": 0.4101, + "step": 350 + }, + { + "epoch": 0.30354569040069185, + "grad_norm": 4.845415115356445, + "learning_rate": 6.698656429942419e-05, + "loss": 0.357, + "step": 351 + }, + { + "epoch": 0.30441049293744593, + "grad_norm": 6.766101360321045, + "learning_rate": 6.71785028790787e-05, + "loss": 0.4257, + "step": 352 + }, + { + "epoch": 0.30527529547420007, + "grad_norm": 9.900660514831543, + "learning_rate": 6.737044145873322e-05, + "loss": 0.6665, + "step": 353 + }, + { + "epoch": 0.30614009801095415, + "grad_norm": 4.632408142089844, + "learning_rate": 6.756238003838772e-05, + "loss": 0.4523, + "step": 354 + }, + { + "epoch": 0.3070049005477083, + "grad_norm": 9.697669982910156, + "learning_rate": 6.775431861804223e-05, + "loss": 1.4959, + "step": 355 + }, + { + "epoch": 0.3078697030844624, + "grad_norm": 9.970297813415527, + "learning_rate": 6.794625719769674e-05, + "loss": 0.833, + "step": 356 + }, + { + "epoch": 0.3087345056212165, + "grad_norm": 9.964993476867676, + "learning_rate": 6.813819577735126e-05, + "loss": 0.7928, + "step": 357 + }, + { + "epoch": 0.3095993081579706, + "grad_norm": 3.866377353668213, + "learning_rate": 6.833013435700576e-05, + "loss": 0.3277, + "step": 358 + }, + { + "epoch": 0.3104641106947247, + "grad_norm": 6.179882526397705, + "learning_rate": 6.852207293666027e-05, + "loss": 0.5336, + "step": 359 + }, + { + "epoch": 0.3113289132314788, + "grad_norm": 5.517486095428467, + "learning_rate": 6.871401151631478e-05, + "loss": 0.4663, + "step": 360 + }, + { + "epoch": 0.3121937157682329, + "grad_norm": 6.7118306159973145, + "learning_rate": 6.89059500959693e-05, + "loss": 0.5869, + "step": 361 + }, + { + "epoch": 0.31305851830498704, + "grad_norm": 8.203336715698242, + "learning_rate": 6.90978886756238e-05, + "loss": 0.6056, + "step": 362 + }, + { + "epoch": 0.3139233208417411, + "grad_norm": 4.762539863586426, + "learning_rate": 6.928982725527831e-05, + "loss": 0.5402, + "step": 363 + }, + { + "epoch": 0.31478812337849527, + "grad_norm": 5.39819860458374, + "learning_rate": 6.948176583493282e-05, + "loss": 1.046, + "step": 364 + }, + { + "epoch": 0.31565292591524935, + "grad_norm": 4.130873680114746, + "learning_rate": 6.967370441458734e-05, + "loss": 0.3102, + "step": 365 + }, + { + "epoch": 0.31651772845200343, + "grad_norm": 7.361220359802246, + "learning_rate": 6.986564299424184e-05, + "loss": 0.412, + "step": 366 + }, + { + "epoch": 0.31738253098875757, + "grad_norm": 7.686898708343506, + "learning_rate": 7.005758157389636e-05, + "loss": 0.5703, + "step": 367 + }, + { + "epoch": 0.31824733352551166, + "grad_norm": 10.829538345336914, + "learning_rate": 7.024952015355086e-05, + "loss": 1.6531, + "step": 368 + }, + { + "epoch": 0.3191121360622658, + "grad_norm": 5.71692419052124, + "learning_rate": 7.044145873320538e-05, + "loss": 0.4314, + "step": 369 + }, + { + "epoch": 0.3199769385990199, + "grad_norm": 8.669037818908691, + "learning_rate": 7.063339731285988e-05, + "loss": 0.7062, + "step": 370 + }, + { + "epoch": 0.320841741135774, + "grad_norm": 5.996104717254639, + "learning_rate": 7.08253358925144e-05, + "loss": 0.5788, + "step": 371 + }, + { + "epoch": 0.3217065436725281, + "grad_norm": 12.612412452697754, + "learning_rate": 7.10172744721689e-05, + "loss": 1.8529, + "step": 372 + }, + { + "epoch": 0.32257134620928224, + "grad_norm": 8.934858322143555, + "learning_rate": 7.120921305182342e-05, + "loss": 0.6606, + "step": 373 + }, + { + "epoch": 0.3234361487460363, + "grad_norm": 10.218025207519531, + "learning_rate": 7.140115163147793e-05, + "loss": 0.8089, + "step": 374 + }, + { + "epoch": 0.3243009512827904, + "grad_norm": 5.20566987991333, + "learning_rate": 7.159309021113245e-05, + "loss": 0.3905, + "step": 375 + }, + { + "epoch": 0.32516575381954455, + "grad_norm": 10.471417427062988, + "learning_rate": 7.178502879078695e-05, + "loss": 1.2417, + "step": 376 + }, + { + "epoch": 0.32603055635629863, + "grad_norm": 7.703388690948486, + "learning_rate": 7.197696737044146e-05, + "loss": 0.8738, + "step": 377 + }, + { + "epoch": 0.32689535889305277, + "grad_norm": 8.099038124084473, + "learning_rate": 7.216890595009598e-05, + "loss": 0.8544, + "step": 378 + }, + { + "epoch": 0.32776016142980685, + "grad_norm": 6.550043106079102, + "learning_rate": 7.236084452975049e-05, + "loss": 0.4667, + "step": 379 + }, + { + "epoch": 0.328624963966561, + "grad_norm": 10.672149658203125, + "learning_rate": 7.255278310940499e-05, + "loss": 0.8825, + "step": 380 + }, + { + "epoch": 0.3294897665033151, + "grad_norm": 7.584779262542725, + "learning_rate": 7.27447216890595e-05, + "loss": 0.6003, + "step": 381 + }, + { + "epoch": 0.33035456904006916, + "grad_norm": 5.818914890289307, + "learning_rate": 7.293666026871402e-05, + "loss": 0.4643, + "step": 382 + }, + { + "epoch": 0.3312193715768233, + "grad_norm": 6.871515274047852, + "learning_rate": 7.312859884836853e-05, + "loss": 0.5097, + "step": 383 + }, + { + "epoch": 0.3320841741135774, + "grad_norm": 3.9484200477600098, + "learning_rate": 7.332053742802303e-05, + "loss": 0.4679, + "step": 384 + }, + { + "epoch": 0.3329489766503315, + "grad_norm": 3.8606741428375244, + "learning_rate": 7.351247600767754e-05, + "loss": 0.3732, + "step": 385 + }, + { + "epoch": 0.3338137791870856, + "grad_norm": 10.65389347076416, + "learning_rate": 7.370441458733206e-05, + "loss": 0.9031, + "step": 386 + }, + { + "epoch": 0.33467858172383974, + "grad_norm": 10.56472396850586, + "learning_rate": 7.389635316698657e-05, + "loss": 0.6668, + "step": 387 + }, + { + "epoch": 0.3355433842605938, + "grad_norm": 9.798723220825195, + "learning_rate": 7.408829174664109e-05, + "loss": 0.7715, + "step": 388 + }, + { + "epoch": 0.33640818679734796, + "grad_norm": 8.35350227355957, + "learning_rate": 7.428023032629558e-05, + "loss": 0.8536, + "step": 389 + }, + { + "epoch": 0.33727298933410205, + "grad_norm": 7.99412727355957, + "learning_rate": 7.44721689059501e-05, + "loss": 0.9303, + "step": 390 + }, + { + "epoch": 0.33813779187085613, + "grad_norm": 8.098565101623535, + "learning_rate": 7.46641074856046e-05, + "loss": 0.3704, + "step": 391 + }, + { + "epoch": 0.33900259440761027, + "grad_norm": 7.83499002456665, + "learning_rate": 7.485604606525913e-05, + "loss": 0.3678, + "step": 392 + }, + { + "epoch": 0.33986739694436435, + "grad_norm": 9.846261978149414, + "learning_rate": 7.504798464491363e-05, + "loss": 1.6854, + "step": 393 + }, + { + "epoch": 0.3407321994811185, + "grad_norm": 10.261216163635254, + "learning_rate": 7.523992322456814e-05, + "loss": 0.7636, + "step": 394 + }, + { + "epoch": 0.3415970020178726, + "grad_norm": 5.547618389129639, + "learning_rate": 7.543186180422265e-05, + "loss": 0.3462, + "step": 395 + }, + { + "epoch": 0.3424618045546267, + "grad_norm": 6.500753402709961, + "learning_rate": 7.562380038387717e-05, + "loss": 0.644, + "step": 396 + }, + { + "epoch": 0.3433266070913808, + "grad_norm": 8.669839859008789, + "learning_rate": 7.581573896353167e-05, + "loss": 0.7317, + "step": 397 + }, + { + "epoch": 0.34419140962813494, + "grad_norm": 6.280559062957764, + "learning_rate": 7.600767754318618e-05, + "loss": 0.7023, + "step": 398 + }, + { + "epoch": 0.345056212164889, + "grad_norm": 7.725942611694336, + "learning_rate": 7.61996161228407e-05, + "loss": 0.7164, + "step": 399 + }, + { + "epoch": 0.3459210147016431, + "grad_norm": 7.478891849517822, + "learning_rate": 7.639155470249521e-05, + "loss": 0.4271, + "step": 400 + }, + { + "epoch": 0.34678581723839724, + "grad_norm": 4.877331256866455, + "learning_rate": 7.658349328214971e-05, + "loss": 0.7332, + "step": 401 + }, + { + "epoch": 0.3476506197751513, + "grad_norm": 8.025667190551758, + "learning_rate": 7.677543186180422e-05, + "loss": 0.3978, + "step": 402 + }, + { + "epoch": 0.34851542231190547, + "grad_norm": 7.804194450378418, + "learning_rate": 7.696737044145874e-05, + "loss": 0.5208, + "step": 403 + }, + { + "epoch": 0.34938022484865955, + "grad_norm": 5.8793230056762695, + "learning_rate": 7.715930902111325e-05, + "loss": 0.4889, + "step": 404 + }, + { + "epoch": 0.3502450273854137, + "grad_norm": 8.609319686889648, + "learning_rate": 7.735124760076777e-05, + "loss": 0.769, + "step": 405 + }, + { + "epoch": 0.35110982992216777, + "grad_norm": 6.56134033203125, + "learning_rate": 7.754318618042226e-05, + "loss": 0.3932, + "step": 406 + }, + { + "epoch": 0.35197463245892185, + "grad_norm": 8.588756561279297, + "learning_rate": 7.773512476007678e-05, + "loss": 0.5919, + "step": 407 + }, + { + "epoch": 0.352839434995676, + "grad_norm": 7.530106067657471, + "learning_rate": 7.792706333973129e-05, + "loss": 0.6037, + "step": 408 + }, + { + "epoch": 0.3537042375324301, + "grad_norm": 7.5281853675842285, + "learning_rate": 7.811900191938581e-05, + "loss": 0.4321, + "step": 409 + }, + { + "epoch": 0.3545690400691842, + "grad_norm": 8.16552448272705, + "learning_rate": 7.831094049904032e-05, + "loss": 1.1022, + "step": 410 + }, + { + "epoch": 0.3554338426059383, + "grad_norm": 8.752754211425781, + "learning_rate": 7.850287907869482e-05, + "loss": 0.5996, + "step": 411 + }, + { + "epoch": 0.35629864514269244, + "grad_norm": 7.659090995788574, + "learning_rate": 7.869481765834933e-05, + "loss": 0.5673, + "step": 412 + }, + { + "epoch": 0.3571634476794465, + "grad_norm": 6.884600639343262, + "learning_rate": 7.888675623800385e-05, + "loss": 0.3437, + "step": 413 + }, + { + "epoch": 0.35802825021620066, + "grad_norm": 5.328488349914551, + "learning_rate": 7.907869481765836e-05, + "loss": 0.519, + "step": 414 + }, + { + "epoch": 0.35889305275295474, + "grad_norm": 10.308977127075195, + "learning_rate": 7.927063339731286e-05, + "loss": 1.5373, + "step": 415 + }, + { + "epoch": 0.35975785528970883, + "grad_norm": 7.618837356567383, + "learning_rate": 7.946257197696737e-05, + "loss": 0.764, + "step": 416 + }, + { + "epoch": 0.36062265782646297, + "grad_norm": 8.787110328674316, + "learning_rate": 7.965451055662189e-05, + "loss": 0.6131, + "step": 417 + }, + { + "epoch": 0.36148746036321705, + "grad_norm": 6.432898998260498, + "learning_rate": 7.98464491362764e-05, + "loss": 0.6826, + "step": 418 + }, + { + "epoch": 0.3623522628999712, + "grad_norm": 8.762993812561035, + "learning_rate": 8.00383877159309e-05, + "loss": 0.9631, + "step": 419 + }, + { + "epoch": 0.3632170654367253, + "grad_norm": 5.939430236816406, + "learning_rate": 8.023032629558541e-05, + "loss": 0.4283, + "step": 420 + }, + { + "epoch": 0.3640818679734794, + "grad_norm": 8.092362403869629, + "learning_rate": 8.042226487523993e-05, + "loss": 1.2001, + "step": 421 + }, + { + "epoch": 0.3649466705102335, + "grad_norm": 7.594040870666504, + "learning_rate": 8.061420345489444e-05, + "loss": 0.4499, + "step": 422 + }, + { + "epoch": 0.3658114730469876, + "grad_norm": 12.614463806152344, + "learning_rate": 8.080614203454894e-05, + "loss": 1.4073, + "step": 423 + }, + { + "epoch": 0.3666762755837417, + "grad_norm": 6.807295322418213, + "learning_rate": 8.099808061420346e-05, + "loss": 0.8035, + "step": 424 + }, + { + "epoch": 0.3675410781204958, + "grad_norm": 3.6670141220092773, + "learning_rate": 8.119001919385797e-05, + "loss": 0.3207, + "step": 425 + }, + { + "epoch": 0.36840588065724994, + "grad_norm": 7.3801445960998535, + "learning_rate": 8.138195777351249e-05, + "loss": 0.4752, + "step": 426 + }, + { + "epoch": 0.369270683194004, + "grad_norm": 9.895638465881348, + "learning_rate": 8.157389635316698e-05, + "loss": 1.1256, + "step": 427 + }, + { + "epoch": 0.37013548573075816, + "grad_norm": 6.200985431671143, + "learning_rate": 8.17658349328215e-05, + "loss": 0.4226, + "step": 428 + }, + { + "epoch": 0.37100028826751225, + "grad_norm": 9.858406066894531, + "learning_rate": 8.195777351247601e-05, + "loss": 1.123, + "step": 429 + }, + { + "epoch": 0.3718650908042664, + "grad_norm": 7.274184703826904, + "learning_rate": 8.214971209213053e-05, + "loss": 0.4425, + "step": 430 + }, + { + "epoch": 0.37272989334102047, + "grad_norm": 4.712157249450684, + "learning_rate": 8.234165067178504e-05, + "loss": 0.4242, + "step": 431 + }, + { + "epoch": 0.37359469587777455, + "grad_norm": 7.515327453613281, + "learning_rate": 8.253358925143954e-05, + "loss": 1.0072, + "step": 432 + }, + { + "epoch": 0.3744594984145287, + "grad_norm": 3.97876238822937, + "learning_rate": 8.272552783109405e-05, + "loss": 0.241, + "step": 433 + }, + { + "epoch": 0.3753243009512828, + "grad_norm": 7.888240337371826, + "learning_rate": 8.291746641074857e-05, + "loss": 0.7359, + "step": 434 + }, + { + "epoch": 0.3761891034880369, + "grad_norm": 6.10671329498291, + "learning_rate": 8.310940499040308e-05, + "loss": 0.4583, + "step": 435 + }, + { + "epoch": 0.377053906024791, + "grad_norm": 6.102023601531982, + "learning_rate": 8.330134357005758e-05, + "loss": 0.9001, + "step": 436 + }, + { + "epoch": 0.37791870856154514, + "grad_norm": 7.122408390045166, + "learning_rate": 8.349328214971209e-05, + "loss": 0.4614, + "step": 437 + }, + { + "epoch": 0.3787835110982992, + "grad_norm": 9.432422637939453, + "learning_rate": 8.368522072936661e-05, + "loss": 1.238, + "step": 438 + }, + { + "epoch": 0.3796483136350533, + "grad_norm": 9.530061721801758, + "learning_rate": 8.387715930902112e-05, + "loss": 1.5289, + "step": 439 + }, + { + "epoch": 0.38051311617180744, + "grad_norm": 7.045010566711426, + "learning_rate": 8.406909788867562e-05, + "loss": 0.3283, + "step": 440 + }, + { + "epoch": 0.3813779187085615, + "grad_norm": 6.275206089019775, + "learning_rate": 8.426103646833013e-05, + "loss": 0.4147, + "step": 441 + }, + { + "epoch": 0.38224272124531566, + "grad_norm": 4.124218940734863, + "learning_rate": 8.445297504798465e-05, + "loss": 0.4956, + "step": 442 + }, + { + "epoch": 0.38310752378206975, + "grad_norm": 5.8184895515441895, + "learning_rate": 8.464491362763916e-05, + "loss": 0.5166, + "step": 443 + }, + { + "epoch": 0.3839723263188239, + "grad_norm": 2.6442999839782715, + "learning_rate": 8.483685220729366e-05, + "loss": 0.2486, + "step": 444 + }, + { + "epoch": 0.38483712885557797, + "grad_norm": 3.8425562381744385, + "learning_rate": 8.502879078694817e-05, + "loss": 0.4493, + "step": 445 + }, + { + "epoch": 0.3857019313923321, + "grad_norm": 9.125511169433594, + "learning_rate": 8.522072936660269e-05, + "loss": 1.0439, + "step": 446 + }, + { + "epoch": 0.3865667339290862, + "grad_norm": 9.67273998260498, + "learning_rate": 8.54126679462572e-05, + "loss": 1.249, + "step": 447 + }, + { + "epoch": 0.3874315364658403, + "grad_norm": 7.822050094604492, + "learning_rate": 8.560460652591172e-05, + "loss": 0.8329, + "step": 448 + }, + { + "epoch": 0.3882963390025944, + "grad_norm": 5.747166633605957, + "learning_rate": 8.579654510556623e-05, + "loss": 0.3256, + "step": 449 + }, + { + "epoch": 0.3891611415393485, + "grad_norm": 7.257145404815674, + "learning_rate": 8.598848368522073e-05, + "loss": 1.0333, + "step": 450 + }, + { + "epoch": 0.39002594407610264, + "grad_norm": 7.6516642570495605, + "learning_rate": 8.618042226487525e-05, + "loss": 0.3821, + "step": 451 + }, + { + "epoch": 0.3908907466128567, + "grad_norm": 6.943114757537842, + "learning_rate": 8.637236084452976e-05, + "loss": 0.4578, + "step": 452 + }, + { + "epoch": 0.39175554914961086, + "grad_norm": 6.90556526184082, + "learning_rate": 8.656429942418427e-05, + "loss": 0.6716, + "step": 453 + }, + { + "epoch": 0.39262035168636494, + "grad_norm": 5.005017280578613, + "learning_rate": 8.675623800383877e-05, + "loss": 0.2694, + "step": 454 + }, + { + "epoch": 0.393485154223119, + "grad_norm": 9.84821605682373, + "learning_rate": 8.694817658349329e-05, + "loss": 1.7739, + "step": 455 + }, + { + "epoch": 0.39434995675987317, + "grad_norm": 7.2032647132873535, + "learning_rate": 8.71401151631478e-05, + "loss": 0.7109, + "step": 456 + }, + { + "epoch": 0.39521475929662725, + "grad_norm": 10.030957221984863, + "learning_rate": 8.73320537428023e-05, + "loss": 0.5733, + "step": 457 + }, + { + "epoch": 0.3960795618333814, + "grad_norm": 3.6352131366729736, + "learning_rate": 8.752399232245681e-05, + "loss": 0.283, + "step": 458 + }, + { + "epoch": 0.39694436437013547, + "grad_norm": 3.4260525703430176, + "learning_rate": 8.771593090211133e-05, + "loss": 0.4214, + "step": 459 + }, + { + "epoch": 0.3978091669068896, + "grad_norm": 4.595706462860107, + "learning_rate": 8.790786948176584e-05, + "loss": 0.6332, + "step": 460 + }, + { + "epoch": 0.3978091669068896, + "eval_Qnli-dev-1024_cosine_accuracy": 0.75, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.879416823387146, + "eval_Qnli-dev-1024_cosine_ap": 0.781841863547347, + "eval_Qnli-dev-1024_cosine_f1": 0.7346938775510203, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.8326764106750488, + "eval_Qnli-dev-1024_cosine_mcc": 0.4683019469005233, + "eval_Qnli-dev-1024_cosine_precision": 0.6792452830188679, + "eval_Qnli-dev-1024_cosine_recall": 0.8, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8109143972396851, + "eval_Qnli-dev_cosine_ap": 0.7568700790587495, + "eval_Qnli-dev_cosine_f1": 0.7346938775510203, + "eval_Qnli-dev_cosine_f1_threshold": 0.742037296295166, + "eval_Qnli-dev_cosine_mcc": 0.4683019469005233, + "eval_Qnli-dev_cosine_precision": 0.6792452830188679, + "eval_Qnli-dev_cosine_recall": 0.8, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9375, + "eval_allNLI-triplets_cosine_accuracy": 0.9791666865348816, + "eval_global_dataset_loss": 0.7834931015968323, + "eval_global_dataset_runtime": 68.0114, + "eval_global_dataset_samples_per_second": 14.292, + "eval_global_dataset_steps_per_second": 0.309, + "eval_sequential_score": 0.9375, + "eval_sts-test-1024_pearson_cosine": 0.8881975899886265, + "eval_sts-test-1024_spearman_cosine": 0.9114913957523785, + "eval_sts-test_pearson_cosine": 0.9096311897411768, + "eval_sts-test_spearman_cosine": 0.9200152476526354, + "step": 460 + }, + { + "epoch": 0.3986739694436437, + "grad_norm": 10.936285018920898, + "learning_rate": 8.809980806142035e-05, + "loss": 1.2458, + "step": 461 + }, + { + "epoch": 0.39953877198039783, + "grad_norm": 5.99333381652832, + "learning_rate": 8.829174664107485e-05, + "loss": 0.3104, + "step": 462 + }, + { + "epoch": 0.4004035745171519, + "grad_norm": 6.260789394378662, + "learning_rate": 8.848368522072937e-05, + "loss": 0.4634, + "step": 463 + }, + { + "epoch": 0.401268377053906, + "grad_norm": 4.397698879241943, + "learning_rate": 8.867562380038388e-05, + "loss": 0.5893, + "step": 464 + }, + { + "epoch": 0.40213317959066014, + "grad_norm": 4.650321960449219, + "learning_rate": 8.88675623800384e-05, + "loss": 0.377, + "step": 465 + }, + { + "epoch": 0.4029979821274142, + "grad_norm": 9.407013893127441, + "learning_rate": 8.905950095969289e-05, + "loss": 0.5403, + "step": 466 + }, + { + "epoch": 0.40386278466416836, + "grad_norm": 10.255672454833984, + "learning_rate": 8.925143953934741e-05, + "loss": 0.9454, + "step": 467 + }, + { + "epoch": 0.40472758720092245, + "grad_norm": 11.186202049255371, + "learning_rate": 8.944337811900192e-05, + "loss": 0.9292, + "step": 468 + }, + { + "epoch": 0.4055923897376766, + "grad_norm": 6.13421630859375, + "learning_rate": 8.963531669865644e-05, + "loss": 0.4132, + "step": 469 + }, + { + "epoch": 0.40645719227443067, + "grad_norm": 12.439327239990234, + "learning_rate": 8.982725527831093e-05, + "loss": 0.9805, + "step": 470 + }, + { + "epoch": 0.4073219948111848, + "grad_norm": 10.574874877929688, + "learning_rate": 9.001919385796545e-05, + "loss": 0.7784, + "step": 471 + }, + { + "epoch": 0.4081867973479389, + "grad_norm": 5.993617057800293, + "learning_rate": 9.021113243761996e-05, + "loss": 0.369, + "step": 472 + }, + { + "epoch": 0.409051599884693, + "grad_norm": 10.213888168334961, + "learning_rate": 9.040307101727448e-05, + "loss": 1.4911, + "step": 473 + }, + { + "epoch": 0.4099164024214471, + "grad_norm": 7.043622016906738, + "learning_rate": 9.059500959692899e-05, + "loss": 0.4223, + "step": 474 + }, + { + "epoch": 0.4107812049582012, + "grad_norm": 8.350674629211426, + "learning_rate": 9.07869481765835e-05, + "loss": 1.2959, + "step": 475 + }, + { + "epoch": 0.41164600749495533, + "grad_norm": 8.64110279083252, + "learning_rate": 9.097888675623801e-05, + "loss": 0.4928, + "step": 476 + }, + { + "epoch": 0.4125108100317094, + "grad_norm": 6.388255596160889, + "learning_rate": 9.117082533589252e-05, + "loss": 1.02, + "step": 477 + }, + { + "epoch": 0.41337561256846356, + "grad_norm": 4.513554096221924, + "learning_rate": 9.136276391554703e-05, + "loss": 0.5034, + "step": 478 + }, + { + "epoch": 0.41424041510521764, + "grad_norm": 10.509414672851562, + "learning_rate": 9.155470249520153e-05, + "loss": 1.6119, + "step": 479 + }, + { + "epoch": 0.4151052176419717, + "grad_norm": 5.085805416107178, + "learning_rate": 9.174664107485605e-05, + "loss": 0.59, + "step": 480 + }, + { + "epoch": 0.41597002017872586, + "grad_norm": 8.275995254516602, + "learning_rate": 9.193857965451056e-05, + "loss": 0.5589, + "step": 481 + }, + { + "epoch": 0.41683482271547995, + "grad_norm": 9.266075134277344, + "learning_rate": 9.213051823416508e-05, + "loss": 0.8402, + "step": 482 + }, + { + "epoch": 0.4176996252522341, + "grad_norm": 9.998162269592285, + "learning_rate": 9.232245681381957e-05, + "loss": 0.5293, + "step": 483 + }, + { + "epoch": 0.41856442778898817, + "grad_norm": 6.49897575378418, + "learning_rate": 9.25143953934741e-05, + "loss": 0.5911, + "step": 484 + }, + { + "epoch": 0.4194292303257423, + "grad_norm": 2.814267158508301, + "learning_rate": 9.27063339731286e-05, + "loss": 0.1819, + "step": 485 + }, + { + "epoch": 0.4202940328624964, + "grad_norm": 6.657732009887695, + "learning_rate": 9.289827255278312e-05, + "loss": 0.6963, + "step": 486 + }, + { + "epoch": 0.42115883539925053, + "grad_norm": 10.121885299682617, + "learning_rate": 9.309021113243761e-05, + "loss": 0.6004, + "step": 487 + }, + { + "epoch": 0.4220236379360046, + "grad_norm": 15.206952095031738, + "learning_rate": 9.328214971209214e-05, + "loss": 2.0825, + "step": 488 + }, + { + "epoch": 0.4228884404727587, + "grad_norm": 11.911534309387207, + "learning_rate": 9.347408829174664e-05, + "loss": 1.6674, + "step": 489 + }, + { + "epoch": 0.42375324300951284, + "grad_norm": 6.184067726135254, + "learning_rate": 9.366602687140116e-05, + "loss": 0.5685, + "step": 490 + }, + { + "epoch": 0.4246180455462669, + "grad_norm": 7.771515846252441, + "learning_rate": 9.385796545105567e-05, + "loss": 0.7835, + "step": 491 + }, + { + "epoch": 0.42548284808302106, + "grad_norm": 8.338656425476074, + "learning_rate": 9.404990403071018e-05, + "loss": 0.4613, + "step": 492 + }, + { + "epoch": 0.42634765061977514, + "grad_norm": 9.678628921508789, + "learning_rate": 9.424184261036468e-05, + "loss": 1.0492, + "step": 493 + }, + { + "epoch": 0.4272124531565293, + "grad_norm": 5.315983772277832, + "learning_rate": 9.44337811900192e-05, + "loss": 0.3512, + "step": 494 + }, + { + "epoch": 0.42807725569328337, + "grad_norm": 7.20918607711792, + "learning_rate": 9.462571976967371e-05, + "loss": 0.8227, + "step": 495 + }, + { + "epoch": 0.42894205823003745, + "grad_norm": 8.044875144958496, + "learning_rate": 9.481765834932822e-05, + "loss": 0.7849, + "step": 496 + }, + { + "epoch": 0.4298068607667916, + "grad_norm": 8.14607048034668, + "learning_rate": 9.500959692898272e-05, + "loss": 1.2823, + "step": 497 + }, + { + "epoch": 0.43067166330354567, + "grad_norm": 9.731268882751465, + "learning_rate": 9.520153550863724e-05, + "loss": 1.2799, + "step": 498 + }, + { + "epoch": 0.4315364658402998, + "grad_norm": 9.654071807861328, + "learning_rate": 9.539347408829176e-05, + "loss": 0.596, + "step": 499 + }, + { + "epoch": 0.4324012683770539, + "grad_norm": 9.026534080505371, + "learning_rate": 9.558541266794626e-05, + "loss": 0.6793, + "step": 500 + }, + { + "epoch": 0.43326607091380803, + "grad_norm": 7.325682163238525, + "learning_rate": 9.577735124760078e-05, + "loss": 0.575, + "step": 501 + }, + { + "epoch": 0.4341308734505621, + "grad_norm": 4.846238136291504, + "learning_rate": 9.596928982725528e-05, + "loss": 0.2631, + "step": 502 + }, + { + "epoch": 0.43499567598731625, + "grad_norm": 8.93980598449707, + "learning_rate": 9.61612284069098e-05, + "loss": 0.5173, + "step": 503 + }, + { + "epoch": 0.43586047852407034, + "grad_norm": 11.70151138305664, + "learning_rate": 9.63531669865643e-05, + "loss": 0.9963, + "step": 504 + }, + { + "epoch": 0.4367252810608244, + "grad_norm": 6.328804016113281, + "learning_rate": 9.654510556621882e-05, + "loss": 0.9315, + "step": 505 + }, + { + "epoch": 0.43759008359757856, + "grad_norm": 9.678471565246582, + "learning_rate": 9.673704414587332e-05, + "loss": 0.7878, + "step": 506 + }, + { + "epoch": 0.43845488613433264, + "grad_norm": 6.569301128387451, + "learning_rate": 9.692898272552784e-05, + "loss": 0.4346, + "step": 507 + }, + { + "epoch": 0.4393196886710868, + "grad_norm": 6.5204596519470215, + "learning_rate": 9.712092130518235e-05, + "loss": 0.7662, + "step": 508 + }, + { + "epoch": 0.44018449120784087, + "grad_norm": 8.459349632263184, + "learning_rate": 9.731285988483686e-05, + "loss": 0.5221, + "step": 509 + }, + { + "epoch": 0.441049293744595, + "grad_norm": 8.08749008178711, + "learning_rate": 9.750479846449136e-05, + "loss": 0.9803, + "step": 510 + }, + { + "epoch": 0.4419140962813491, + "grad_norm": 8.031821250915527, + "learning_rate": 9.769673704414588e-05, + "loss": 1.1605, + "step": 511 + }, + { + "epoch": 0.4427788988181032, + "grad_norm": 9.393692016601562, + "learning_rate": 9.788867562380039e-05, + "loss": 0.6801, + "step": 512 + }, + { + "epoch": 0.4436437013548573, + "grad_norm": 5.011040687561035, + "learning_rate": 9.80806142034549e-05, + "loss": 0.3935, + "step": 513 + }, + { + "epoch": 0.4445085038916114, + "grad_norm": 2.235301971435547, + "learning_rate": 9.82725527831094e-05, + "loss": 0.1377, + "step": 514 + }, + { + "epoch": 0.44537330642836553, + "grad_norm": 5.642356872558594, + "learning_rate": 9.846449136276392e-05, + "loss": 0.985, + "step": 515 + }, + { + "epoch": 0.4462381089651196, + "grad_norm": 9.386540412902832, + "learning_rate": 9.865642994241843e-05, + "loss": 0.7949, + "step": 516 + }, + { + "epoch": 0.44710291150187376, + "grad_norm": 5.782979965209961, + "learning_rate": 9.884836852207294e-05, + "loss": 0.5974, + "step": 517 + }, + { + "epoch": 0.44796771403862784, + "grad_norm": 5.323793888092041, + "learning_rate": 9.904030710172744e-05, + "loss": 0.6797, + "step": 518 + }, + { + "epoch": 0.448832516575382, + "grad_norm": 8.012255668640137, + "learning_rate": 9.923224568138196e-05, + "loss": 0.6953, + "step": 519 + }, + { + "epoch": 0.44969731911213606, + "grad_norm": 6.930400371551514, + "learning_rate": 9.942418426103647e-05, + "loss": 0.4729, + "step": 520 + }, + { + "epoch": 0.45056212164889015, + "grad_norm": 10.408514976501465, + "learning_rate": 9.961612284069098e-05, + "loss": 0.5509, + "step": 521 + }, + { + "epoch": 0.4514269241856443, + "grad_norm": 5.082659721374512, + "learning_rate": 9.980806142034548e-05, + "loss": 0.4549, + "step": 522 + }, + { + "epoch": 0.45229172672239837, + "grad_norm": 10.625167846679688, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 523 + }, + { + "epoch": 0.4531565292591525, + "grad_norm": 7.423165798187256, + "learning_rate": 9.999974430536151e-05, + "loss": 0.4466, + "step": 524 + }, + { + "epoch": 0.4540213317959066, + "grad_norm": 10.48806095123291, + "learning_rate": 9.999897722406126e-05, + "loss": 1.3489, + "step": 525 + }, + { + "epoch": 0.45488613433266073, + "grad_norm": 8.526479721069336, + "learning_rate": 9.999769876394478e-05, + "loss": 0.6699, + "step": 526 + }, + { + "epoch": 0.4557509368694148, + "grad_norm": 7.596718788146973, + "learning_rate": 9.999590893808788e-05, + "loss": 0.5189, + "step": 527 + }, + { + "epoch": 0.45661573940616895, + "grad_norm": 10.089831352233887, + "learning_rate": 9.999360776479651e-05, + "loss": 0.7617, + "step": 528 + }, + { + "epoch": 0.45748054194292304, + "grad_norm": 7.766354560852051, + "learning_rate": 9.999079526760659e-05, + "loss": 0.5148, + "step": 529 + }, + { + "epoch": 0.4583453444796771, + "grad_norm": 6.268951892852783, + "learning_rate": 9.998747147528374e-05, + "loss": 0.5564, + "step": 530 + }, + { + "epoch": 0.45921014701643126, + "grad_norm": 5.794777870178223, + "learning_rate": 9.9983636421823e-05, + "loss": 0.4038, + "step": 531 + }, + { + "epoch": 0.46007494955318534, + "grad_norm": 8.995209693908691, + "learning_rate": 9.997929014644845e-05, + "loss": 0.6968, + "step": 532 + }, + { + "epoch": 0.4609397520899395, + "grad_norm": 6.833916187286377, + "learning_rate": 9.997443269361289e-05, + "loss": 0.4393, + "step": 533 + }, + { + "epoch": 0.46180455462669356, + "grad_norm": 9.531277656555176, + "learning_rate": 9.996906411299726e-05, + "loss": 0.7228, + "step": 534 + }, + { + "epoch": 0.4626693571634477, + "grad_norm": 11.1766939163208, + "learning_rate": 9.996318445951032e-05, + "loss": 0.898, + "step": 535 + }, + { + "epoch": 0.4635341597002018, + "grad_norm": 4.982804298400879, + "learning_rate": 9.995679379328785e-05, + "loss": 0.3461, + "step": 536 + }, + { + "epoch": 0.46439896223695587, + "grad_norm": 3.0458362102508545, + "learning_rate": 9.994989217969224e-05, + "loss": 0.4753, + "step": 537 + }, + { + "epoch": 0.46526376477371, + "grad_norm": 7.552469253540039, + "learning_rate": 9.99424796893117e-05, + "loss": 0.4446, + "step": 538 + }, + { + "epoch": 0.4661285673104641, + "grad_norm": 10.52206039428711, + "learning_rate": 9.99345563979596e-05, + "loss": 0.8696, + "step": 539 + }, + { + "epoch": 0.46699336984721823, + "grad_norm": 9.044191360473633, + "learning_rate": 9.992612238667368e-05, + "loss": 1.0505, + "step": 540 + }, + { + "epoch": 0.4678581723839723, + "grad_norm": 7.528494834899902, + "learning_rate": 9.991717774171514e-05, + "loss": 0.3523, + "step": 541 + }, + { + "epoch": 0.46872297492072645, + "grad_norm": 8.00634765625, + "learning_rate": 9.990772255456797e-05, + "loss": 0.6452, + "step": 542 + }, + { + "epoch": 0.46958777745748054, + "grad_norm": 6.528989315032959, + "learning_rate": 9.989775692193773e-05, + "loss": 0.5005, + "step": 543 + }, + { + "epoch": 0.4704525799942347, + "grad_norm": 7.66871452331543, + "learning_rate": 9.988728094575082e-05, + "loss": 0.5364, + "step": 544 + }, + { + "epoch": 0.47131738253098876, + "grad_norm": 6.3178558349609375, + "learning_rate": 9.987629473315325e-05, + "loss": 0.6121, + "step": 545 + }, + { + "epoch": 0.47218218506774284, + "grad_norm": 3.713564872741699, + "learning_rate": 9.986479839650966e-05, + "loss": 0.2326, + "step": 546 + }, + { + "epoch": 0.473046987604497, + "grad_norm": 11.291918754577637, + "learning_rate": 9.98527920534021e-05, + "loss": 1.2339, + "step": 547 + }, + { + "epoch": 0.47391179014125107, + "grad_norm": 8.482532501220703, + "learning_rate": 9.984027582662892e-05, + "loss": 0.8196, + "step": 548 + }, + { + "epoch": 0.4747765926780052, + "grad_norm": 2.9724512100219727, + "learning_rate": 9.982724984420333e-05, + "loss": 0.2354, + "step": 549 + }, + { + "epoch": 0.4756413952147593, + "grad_norm": 9.461052894592285, + "learning_rate": 9.981371423935233e-05, + "loss": 0.6666, + "step": 550 + }, + { + "epoch": 0.4765061977515134, + "grad_norm": 5.076896667480469, + "learning_rate": 9.979966915051517e-05, + "loss": 0.3125, + "step": 551 + }, + { + "epoch": 0.4773710002882675, + "grad_norm": 8.995684623718262, + "learning_rate": 9.978511472134203e-05, + "loss": 0.7455, + "step": 552 + }, + { + "epoch": 0.4782358028250216, + "grad_norm": 2.971757173538208, + "learning_rate": 9.977005110069245e-05, + "loss": 0.32, + "step": 553 + }, + { + "epoch": 0.47910060536177573, + "grad_norm": 7.4964399337768555, + "learning_rate": 9.975447844263395e-05, + "loss": 0.9793, + "step": 554 + }, + { + "epoch": 0.4799654078985298, + "grad_norm": 6.13850736618042, + "learning_rate": 9.973839690644032e-05, + "loss": 0.7821, + "step": 555 + }, + { + "epoch": 0.48083021043528396, + "grad_norm": 8.951305389404297, + "learning_rate": 9.972180665659004e-05, + "loss": 0.6022, + "step": 556 + }, + { + "epoch": 0.48169501297203804, + "grad_norm": 6.228058338165283, + "learning_rate": 9.970470786276467e-05, + "loss": 0.8369, + "step": 557 + }, + { + "epoch": 0.4825598155087922, + "grad_norm": 10.346866607666016, + "learning_rate": 9.968710069984698e-05, + "loss": 0.8025, + "step": 558 + }, + { + "epoch": 0.48342461804554626, + "grad_norm": 2.9348461627960205, + "learning_rate": 9.966898534791926e-05, + "loss": 0.1631, + "step": 559 + }, + { + "epoch": 0.4842894205823004, + "grad_norm": 8.404128074645996, + "learning_rate": 9.965036199226147e-05, + "loss": 0.7858, + "step": 560 + }, + { + "epoch": 0.4851542231190545, + "grad_norm": 3.0906944274902344, + "learning_rate": 9.963123082334925e-05, + "loss": 0.3223, + "step": 561 + }, + { + "epoch": 0.48601902565580857, + "grad_norm": 4.46307373046875, + "learning_rate": 9.961159203685212e-05, + "loss": 0.2361, + "step": 562 + }, + { + "epoch": 0.4868838281925627, + "grad_norm": 7.367444038391113, + "learning_rate": 9.959144583363141e-05, + "loss": 1.2893, + "step": 563 + }, + { + "epoch": 0.4877486307293168, + "grad_norm": 4.720983505249023, + "learning_rate": 9.957079241973809e-05, + "loss": 0.5666, + "step": 564 + }, + { + "epoch": 0.48861343326607093, + "grad_norm": 5.1994829177856445, + "learning_rate": 9.95496320064109e-05, + "loss": 0.2794, + "step": 565 + }, + { + "epoch": 0.489478235802825, + "grad_norm": 8.899139404296875, + "learning_rate": 9.952796481007401e-05, + "loss": 0.6303, + "step": 566 + }, + { + "epoch": 0.49034303833957915, + "grad_norm": 4.118505477905273, + "learning_rate": 9.950579105233483e-05, + "loss": 0.1724, + "step": 567 + }, + { + "epoch": 0.49120784087633323, + "grad_norm": 6.728652477264404, + "learning_rate": 9.948311095998181e-05, + "loss": 0.662, + "step": 568 + }, + { + "epoch": 0.4920726434130873, + "grad_norm": 7.761811256408691, + "learning_rate": 9.945992476498209e-05, + "loss": 0.4051, + "step": 569 + }, + { + "epoch": 0.49293744594984146, + "grad_norm": 10.437024116516113, + "learning_rate": 9.943623270447909e-05, + "loss": 0.7596, + "step": 570 + }, + { + "epoch": 0.49380224848659554, + "grad_norm": 8.579437255859375, + "learning_rate": 9.94120350207901e-05, + "loss": 0.4666, + "step": 571 + }, + { + "epoch": 0.4946670510233497, + "grad_norm": 11.050808906555176, + "learning_rate": 9.938733196140386e-05, + "loss": 0.8923, + "step": 572 + }, + { + "epoch": 0.49553185356010376, + "grad_norm": 6.367518901824951, + "learning_rate": 9.936212377897798e-05, + "loss": 0.3065, + "step": 573 + }, + { + "epoch": 0.4963966560968579, + "grad_norm": 5.786684036254883, + "learning_rate": 9.933641073133631e-05, + "loss": 0.6386, + "step": 574 + }, + { + "epoch": 0.497261458633612, + "grad_norm": 3.814639091491699, + "learning_rate": 9.93101930814664e-05, + "loss": 0.2868, + "step": 575 + }, + { + "epoch": 0.497261458633612, + "eval_Qnli-dev-1024_cosine_accuracy": 0.6979166666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8905854225158691, + "eval_Qnli-dev-1024_cosine_ap": 0.7246322873104885, + "eval_Qnli-dev-1024_cosine_f1": 0.6909090909090909, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.805380642414093, + "eval_Qnli-dev-1024_cosine_mcc": 0.33620907137955974, + "eval_Qnli-dev-1024_cosine_precision": 0.5846153846153846, + "eval_Qnli-dev-1024_cosine_recall": 0.8444444444444444, + "eval_Qnli-dev_cosine_accuracy": 0.6979166666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.8145653009414673, + "eval_Qnli-dev_cosine_ap": 0.7254668033788828, + "eval_Qnli-dev_cosine_f1": 0.7289719626168225, + "eval_Qnli-dev_cosine_f1_threshold": 0.7076575756072998, + "eval_Qnli-dev_cosine_mcc": 0.43373226132862797, + "eval_Qnli-dev_cosine_precision": 0.6290322580645161, + "eval_Qnli-dev_cosine_recall": 0.8666666666666667, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9479166865348816, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.5409160852432251, + "eval_global_dataset_runtime": 68.0813, + "eval_global_dataset_samples_per_second": 14.277, + "eval_global_dataset_steps_per_second": 0.308, + "eval_sequential_score": 0.9479166865348816, + "eval_sts-test-1024_pearson_cosine": 0.8845045352861245, + "eval_sts-test-1024_spearman_cosine": 0.9123160743907711, + "eval_sts-test_pearson_cosine": 0.9122846955191348, + "eval_sts-test_spearman_cosine": 0.920479051307594, + "step": 575 + }, + { + "epoch": 0.4981262611703661, + "grad_norm": 9.594101905822754, + "learning_rate": 9.928347109751677e-05, + "loss": 0.6007, + "step": 576 + }, + { + "epoch": 0.4989910637071202, + "grad_norm": 5.121261119842529, + "learning_rate": 9.925624505279411e-05, + "loss": 0.2853, + "step": 577 + }, + { + "epoch": 0.4998558662438743, + "grad_norm": 4.0415215492248535, + "learning_rate": 9.922851522576058e-05, + "loss": 0.3982, + "step": 578 + }, + { + "epoch": 0.5007206687806284, + "grad_norm": 11.199448585510254, + "learning_rate": 9.92002819000309e-05, + "loss": 0.7686, + "step": 579 + }, + { + "epoch": 0.5015854713173825, + "grad_norm": 5.71658992767334, + "learning_rate": 9.917154536436948e-05, + "loss": 0.3809, + "step": 580 + }, + { + "epoch": 0.5024502738541367, + "grad_norm": 10.616915702819824, + "learning_rate": 9.914230591268743e-05, + "loss": 1.1228, + "step": 581 + }, + { + "epoch": 0.5033150763908908, + "grad_norm": 4.5049028396606445, + "learning_rate": 9.911256384403961e-05, + "loss": 0.3737, + "step": 582 + }, + { + "epoch": 0.5041798789276448, + "grad_norm": 5.022185325622559, + "learning_rate": 9.90823194626215e-05, + "loss": 0.2141, + "step": 583 + }, + { + "epoch": 0.505044681464399, + "grad_norm": 10.781139373779297, + "learning_rate": 9.905157307776616e-05, + "loss": 1.2942, + "step": 584 + }, + { + "epoch": 0.5059094840011531, + "grad_norm": 10.425268173217773, + "learning_rate": 9.902032500394103e-05, + "loss": 1.4177, + "step": 585 + }, + { + "epoch": 0.5067742865379071, + "grad_norm": 6.989367961883545, + "learning_rate": 9.898857556074468e-05, + "loss": 0.5832, + "step": 586 + }, + { + "epoch": 0.5076390890746613, + "grad_norm": 6.156850814819336, + "learning_rate": 9.895632507290362e-05, + "loss": 0.4419, + "step": 587 + }, + { + "epoch": 0.5085038916114154, + "grad_norm": 6.66822624206543, + "learning_rate": 9.892357387026892e-05, + "loss": 0.5903, + "step": 588 + }, + { + "epoch": 0.5093686941481695, + "grad_norm": 8.24500560760498, + "learning_rate": 9.889032228781285e-05, + "loss": 0.44, + "step": 589 + }, + { + "epoch": 0.5102334966849236, + "grad_norm": 6.062635898590088, + "learning_rate": 9.88565706656255e-05, + "loss": 0.3002, + "step": 590 + }, + { + "epoch": 0.5110982992216777, + "grad_norm": 8.822070121765137, + "learning_rate": 9.882231934891119e-05, + "loss": 0.6883, + "step": 591 + }, + { + "epoch": 0.5119631017584318, + "grad_norm": 6.581031322479248, + "learning_rate": 9.878756868798504e-05, + "loss": 0.7068, + "step": 592 + }, + { + "epoch": 0.512827904295186, + "grad_norm": 6.801186561584473, + "learning_rate": 9.875231903826936e-05, + "loss": 0.5245, + "step": 593 + }, + { + "epoch": 0.51369270683194, + "grad_norm": 8.146296501159668, + "learning_rate": 9.871657076029003e-05, + "loss": 0.7089, + "step": 594 + }, + { + "epoch": 0.5145575093686942, + "grad_norm": 12.6628999710083, + "learning_rate": 9.868032421967275e-05, + "loss": 1.8026, + "step": 595 + }, + { + "epoch": 0.5154223119054483, + "grad_norm": 3.0164332389831543, + "learning_rate": 9.864357978713936e-05, + "loss": 0.2736, + "step": 596 + }, + { + "epoch": 0.5162871144422023, + "grad_norm": 3.916259527206421, + "learning_rate": 9.860633783850406e-05, + "loss": 0.3196, + "step": 597 + }, + { + "epoch": 0.5171519169789565, + "grad_norm": 8.493870735168457, + "learning_rate": 9.856859875466948e-05, + "loss": 0.7005, + "step": 598 + }, + { + "epoch": 0.5180167195157106, + "grad_norm": 8.802308082580566, + "learning_rate": 9.853036292162291e-05, + "loss": 0.4239, + "step": 599 + }, + { + "epoch": 0.5188815220524647, + "grad_norm": 10.11483383178711, + "learning_rate": 9.849163073043223e-05, + "loss": 0.5686, + "step": 600 + }, + { + "epoch": 0.5197463245892188, + "grad_norm": 7.787915229797363, + "learning_rate": 9.845240257724198e-05, + "loss": 0.6015, + "step": 601 + }, + { + "epoch": 0.5206111271259729, + "grad_norm": 3.49916410446167, + "learning_rate": 9.841267886326932e-05, + "loss": 0.1611, + "step": 602 + }, + { + "epoch": 0.521475929662727, + "grad_norm": 8.411331176757812, + "learning_rate": 9.837245999479985e-05, + "loss": 0.6458, + "step": 603 + }, + { + "epoch": 0.5223407321994811, + "grad_norm": 7.405316352844238, + "learning_rate": 9.833174638318356e-05, + "loss": 0.7173, + "step": 604 + }, + { + "epoch": 0.5232055347362352, + "grad_norm": 8.42251968383789, + "learning_rate": 9.829053844483052e-05, + "loss": 0.8808, + "step": 605 + }, + { + "epoch": 0.5240703372729894, + "grad_norm": 6.8583269119262695, + "learning_rate": 9.824883660120667e-05, + "loss": 0.625, + "step": 606 + }, + { + "epoch": 0.5249351398097435, + "grad_norm": 6.834749698638916, + "learning_rate": 9.820664127882957e-05, + "loss": 0.4378, + "step": 607 + }, + { + "epoch": 0.5257999423464975, + "grad_norm": 5.739812850952148, + "learning_rate": 9.81639529092639e-05, + "loss": 0.7798, + "step": 608 + }, + { + "epoch": 0.5266647448832517, + "grad_norm": 7.9455084800720215, + "learning_rate": 9.812077192911713e-05, + "loss": 0.6586, + "step": 609 + }, + { + "epoch": 0.5275295474200058, + "grad_norm": 7.959743499755859, + "learning_rate": 9.80770987800351e-05, + "loss": 0.8475, + "step": 610 + }, + { + "epoch": 0.5283943499567598, + "grad_norm": 5.485658168792725, + "learning_rate": 9.803293390869739e-05, + "loss": 0.4095, + "step": 611 + }, + { + "epoch": 0.529259152493514, + "grad_norm": 7.284278392791748, + "learning_rate": 9.798827776681286e-05, + "loss": 0.4946, + "step": 612 + }, + { + "epoch": 0.5301239550302681, + "grad_norm": 8.508416175842285, + "learning_rate": 9.79431308111149e-05, + "loss": 0.3962, + "step": 613 + }, + { + "epoch": 0.5309887575670222, + "grad_norm": 5.56104850769043, + "learning_rate": 9.789749350335693e-05, + "loss": 0.7191, + "step": 614 + }, + { + "epoch": 0.5318535601037763, + "grad_norm": 11.444177627563477, + "learning_rate": 9.785136631030755e-05, + "loss": 0.6589, + "step": 615 + }, + { + "epoch": 0.5327183626405304, + "grad_norm": 8.934037208557129, + "learning_rate": 9.780474970374578e-05, + "loss": 0.5603, + "step": 616 + }, + { + "epoch": 0.5335831651772845, + "grad_norm": 12.182479858398438, + "learning_rate": 9.775764416045628e-05, + "loss": 1.3667, + "step": 617 + }, + { + "epoch": 0.5344479677140386, + "grad_norm": 6.506429195404053, + "learning_rate": 9.771005016222446e-05, + "loss": 0.5623, + "step": 618 + }, + { + "epoch": 0.5353127702507927, + "grad_norm": 8.439187049865723, + "learning_rate": 9.766196819583149e-05, + "loss": 0.6174, + "step": 619 + }, + { + "epoch": 0.5361775727875469, + "grad_norm": 9.493589401245117, + "learning_rate": 9.761339875304945e-05, + "loss": 0.6462, + "step": 620 + }, + { + "epoch": 0.537042375324301, + "grad_norm": 2.347870111465454, + "learning_rate": 9.756434233063616e-05, + "loss": 0.1693, + "step": 621 + }, + { + "epoch": 0.537907177861055, + "grad_norm": 8.565069198608398, + "learning_rate": 9.751479943033019e-05, + "loss": 0.4887, + "step": 622 + }, + { + "epoch": 0.5387719803978092, + "grad_norm": 8.762991905212402, + "learning_rate": 9.746477055884571e-05, + "loss": 0.9039, + "step": 623 + }, + { + "epoch": 0.5396367829345633, + "grad_norm": 5.132269382476807, + "learning_rate": 9.741425622786728e-05, + "loss": 0.3159, + "step": 624 + }, + { + "epoch": 0.5405015854713174, + "grad_norm": 6.715843677520752, + "learning_rate": 9.736325695404464e-05, + "loss": 0.6409, + "step": 625 + }, + { + "epoch": 0.5413663880080715, + "grad_norm": 2.351118803024292, + "learning_rate": 9.731177325898746e-05, + "loss": 0.1413, + "step": 626 + }, + { + "epoch": 0.5422311905448256, + "grad_norm": 5.473691940307617, + "learning_rate": 9.725980566925989e-05, + "loss": 0.3963, + "step": 627 + }, + { + "epoch": 0.5430959930815797, + "grad_norm": 6.525996685028076, + "learning_rate": 9.72073547163753e-05, + "loss": 0.4283, + "step": 628 + }, + { + "epoch": 0.5439607956183338, + "grad_norm": 9.671774864196777, + "learning_rate": 9.71544209367908e-05, + "loss": 0.8147, + "step": 629 + }, + { + "epoch": 0.5448255981550879, + "grad_norm": 7.720305919647217, + "learning_rate": 9.710100487190173e-05, + "loss": 0.7238, + "step": 630 + }, + { + "epoch": 0.545690400691842, + "grad_norm": 6.962470531463623, + "learning_rate": 9.704710706803613e-05, + "loss": 0.3583, + "step": 631 + }, + { + "epoch": 0.5465552032285962, + "grad_norm": 7.1871819496154785, + "learning_rate": 9.699272807644921e-05, + "loss": 0.5934, + "step": 632 + }, + { + "epoch": 0.5474200057653502, + "grad_norm": 8.43585205078125, + "learning_rate": 9.693786845331761e-05, + "loss": 0.3339, + "step": 633 + }, + { + "epoch": 0.5482848083021044, + "grad_norm": 8.839116096496582, + "learning_rate": 9.68825287597338e-05, + "loss": 0.5551, + "step": 634 + }, + { + "epoch": 0.5491496108388585, + "grad_norm": 7.399514675140381, + "learning_rate": 9.68267095617003e-05, + "loss": 0.7277, + "step": 635 + }, + { + "epoch": 0.5500144133756125, + "grad_norm": 3.7421650886535645, + "learning_rate": 9.677041143012391e-05, + "loss": 0.3276, + "step": 636 + }, + { + "epoch": 0.5508792159123667, + "grad_norm": 6.863941669464111, + "learning_rate": 9.67136349408098e-05, + "loss": 0.3983, + "step": 637 + }, + { + "epoch": 0.5517440184491208, + "grad_norm": 8.192028999328613, + "learning_rate": 9.665638067445577e-05, + "loss": 0.5536, + "step": 638 + }, + { + "epoch": 0.5526088209858749, + "grad_norm": 6.802035331726074, + "learning_rate": 9.659864921664617e-05, + "loss": 0.4256, + "step": 639 + }, + { + "epoch": 0.553473623522629, + "grad_norm": 8.902397155761719, + "learning_rate": 9.654044115784594e-05, + "loss": 0.6132, + "step": 640 + }, + { + "epoch": 0.5543384260593831, + "grad_norm": 3.023282289505005, + "learning_rate": 9.648175709339465e-05, + "loss": 0.1601, + "step": 641 + }, + { + "epoch": 0.5552032285961372, + "grad_norm": 6.913763523101807, + "learning_rate": 9.642259762350032e-05, + "loss": 0.8637, + "step": 642 + }, + { + "epoch": 0.5560680311328913, + "grad_norm": 5.186830043792725, + "learning_rate": 9.636296335323334e-05, + "loss": 0.2678, + "step": 643 + }, + { + "epoch": 0.5569328336696454, + "grad_norm": 8.123047828674316, + "learning_rate": 9.63028548925202e-05, + "loss": 0.4715, + "step": 644 + }, + { + "epoch": 0.5577976362063995, + "grad_norm": 8.248505592346191, + "learning_rate": 9.624227285613736e-05, + "loss": 0.4066, + "step": 645 + }, + { + "epoch": 0.5586624387431537, + "grad_norm": 7.174196243286133, + "learning_rate": 9.618121786370491e-05, + "loss": 0.2985, + "step": 646 + }, + { + "epoch": 0.5595272412799077, + "grad_norm": 9.055746078491211, + "learning_rate": 9.61196905396802e-05, + "loss": 0.4818, + "step": 647 + }, + { + "epoch": 0.5603920438166619, + "grad_norm": 5.331139087677002, + "learning_rate": 9.605769151335151e-05, + "loss": 0.3297, + "step": 648 + }, + { + "epoch": 0.561256846353416, + "grad_norm": 4.492726802825928, + "learning_rate": 9.59952214188316e-05, + "loss": 0.2309, + "step": 649 + }, + { + "epoch": 0.56212164889017, + "grad_norm": 7.451852798461914, + "learning_rate": 9.593228089505117e-05, + "loss": 0.3733, + "step": 650 + }, + { + "epoch": 0.5629864514269242, + "grad_norm": 9.455964088439941, + "learning_rate": 9.586887058575243e-05, + "loss": 0.471, + "step": 651 + }, + { + "epoch": 0.5638512539636783, + "grad_norm": 4.70458984375, + "learning_rate": 9.58049911394824e-05, + "loss": 0.1841, + "step": 652 + }, + { + "epoch": 0.5647160565004324, + "grad_norm": 3.027376413345337, + "learning_rate": 9.574064320958637e-05, + "loss": 0.1042, + "step": 653 + }, + { + "epoch": 0.5655808590371865, + "grad_norm": 13.047475814819336, + "learning_rate": 9.567582745420117e-05, + "loss": 1.7486, + "step": 654 + }, + { + "epoch": 0.5664456615739406, + "grad_norm": 5.038949489593506, + "learning_rate": 9.561054453624842e-05, + "loss": 0.7092, + "step": 655 + }, + { + "epoch": 0.5673104641106947, + "grad_norm": 6.817296981811523, + "learning_rate": 9.554479512342784e-05, + "loss": 0.4515, + "step": 656 + }, + { + "epoch": 0.5681752666474489, + "grad_norm": 6.715672969818115, + "learning_rate": 9.54785798882103e-05, + "loss": 0.5267, + "step": 657 + }, + { + "epoch": 0.5690400691842029, + "grad_norm": 12.338273048400879, + "learning_rate": 9.541189950783104e-05, + "loss": 0.8779, + "step": 658 + }, + { + "epoch": 0.569904871720957, + "grad_norm": 6.969177722930908, + "learning_rate": 9.534475466428267e-05, + "loss": 0.3105, + "step": 659 + }, + { + "epoch": 0.5707696742577112, + "grad_norm": 4.153381824493408, + "learning_rate": 9.527714604430827e-05, + "loss": 0.2972, + "step": 660 + }, + { + "epoch": 0.5716344767944652, + "grad_norm": 9.585479736328125, + "learning_rate": 9.52090743393943e-05, + "loss": 0.7349, + "step": 661 + }, + { + "epoch": 0.5724992793312194, + "grad_norm": 8.285649299621582, + "learning_rate": 9.514054024576356e-05, + "loss": 0.3054, + "step": 662 + }, + { + "epoch": 0.5733640818679735, + "grad_norm": 8.23316764831543, + "learning_rate": 9.507154446436805e-05, + "loss": 0.3722, + "step": 663 + }, + { + "epoch": 0.5742288844047276, + "grad_norm": 3.4087507724761963, + "learning_rate": 9.500208770088183e-05, + "loss": 0.3515, + "step": 664 + }, + { + "epoch": 0.5750936869414817, + "grad_norm": 11.583375930786133, + "learning_rate": 9.49321706656938e-05, + "loss": 1.0321, + "step": 665 + }, + { + "epoch": 0.5759584894782358, + "grad_norm": 9.680198669433594, + "learning_rate": 9.48617940739004e-05, + "loss": 0.6996, + "step": 666 + }, + { + "epoch": 0.5768232920149899, + "grad_norm": 5.860654354095459, + "learning_rate": 9.479095864529828e-05, + "loss": 0.584, + "step": 667 + }, + { + "epoch": 0.577688094551744, + "grad_norm": 8.714286804199219, + "learning_rate": 9.471966510437704e-05, + "loss": 0.8377, + "step": 668 + }, + { + "epoch": 0.5785528970884981, + "grad_norm": 5.863884925842285, + "learning_rate": 9.464791418031172e-05, + "loss": 0.3194, + "step": 669 + }, + { + "epoch": 0.5794176996252522, + "grad_norm": 3.8105716705322266, + "learning_rate": 9.457570660695541e-05, + "loss": 0.2197, + "step": 670 + }, + { + "epoch": 0.5802825021620064, + "grad_norm": 7.818668842315674, + "learning_rate": 9.450304312283164e-05, + "loss": 0.5296, + "step": 671 + }, + { + "epoch": 0.5811473046987604, + "grad_norm": 3.5748655796051025, + "learning_rate": 9.442992447112697e-05, + "loss": 0.2199, + "step": 672 + }, + { + "epoch": 0.5820121072355146, + "grad_norm": 9.74962043762207, + "learning_rate": 9.435635139968328e-05, + "loss": 0.7576, + "step": 673 + }, + { + "epoch": 0.5828769097722687, + "grad_norm": 5.957652568817139, + "learning_rate": 9.428232466099018e-05, + "loss": 0.4388, + "step": 674 + }, + { + "epoch": 0.5837417123090227, + "grad_norm": 1.4129705429077148, + "learning_rate": 9.420784501217726e-05, + "loss": 0.0997, + "step": 675 + }, + { + "epoch": 0.5846065148457769, + "grad_norm": 6.296298503875732, + "learning_rate": 9.41329132150064e-05, + "loss": 0.4806, + "step": 676 + }, + { + "epoch": 0.585471317382531, + "grad_norm": 8.789826393127441, + "learning_rate": 9.405753003586395e-05, + "loss": 0.7328, + "step": 677 + }, + { + "epoch": 0.5863361199192851, + "grad_norm": 9.228763580322266, + "learning_rate": 9.39816962457529e-05, + "loss": 0.4772, + "step": 678 + }, + { + "epoch": 0.5872009224560392, + "grad_norm": 5.72409725189209, + "learning_rate": 9.3905412620285e-05, + "loss": 0.3285, + "step": 679 + }, + { + "epoch": 0.5880657249927933, + "grad_norm": 10.633530616760254, + "learning_rate": 9.382867993967281e-05, + "loss": 0.9213, + "step": 680 + }, + { + "epoch": 0.5889305275295474, + "grad_norm": 10.06709098815918, + "learning_rate": 9.375149898872172e-05, + "loss": 0.5335, + "step": 681 + }, + { + "epoch": 0.5897953300663016, + "grad_norm": 5.641694068908691, + "learning_rate": 9.367387055682197e-05, + "loss": 0.3178, + "step": 682 + }, + { + "epoch": 0.5906601326030556, + "grad_norm": 8.637955665588379, + "learning_rate": 9.359579543794048e-05, + "loss": 0.7194, + "step": 683 + }, + { + "epoch": 0.5915249351398097, + "grad_norm": 5.672209739685059, + "learning_rate": 9.351727443061283e-05, + "loss": 0.5559, + "step": 684 + }, + { + "epoch": 0.5923897376765639, + "grad_norm": 6.293837547302246, + "learning_rate": 9.343830833793505e-05, + "loss": 0.489, + "step": 685 + }, + { + "epoch": 0.5932545402133179, + "grad_norm": 5.788215160369873, + "learning_rate": 9.335889796755541e-05, + "loss": 0.2563, + "step": 686 + }, + { + "epoch": 0.594119342750072, + "grad_norm": 8.539923667907715, + "learning_rate": 9.327904413166615e-05, + "loss": 0.8217, + "step": 687 + }, + { + "epoch": 0.5949841452868262, + "grad_norm": 4.539181709289551, + "learning_rate": 9.319874764699515e-05, + "loss": 0.371, + "step": 688 + }, + { + "epoch": 0.5958489478235803, + "grad_norm": 4.926830291748047, + "learning_rate": 9.311800933479764e-05, + "loss": 0.3217, + "step": 689 + }, + { + "epoch": 0.5967137503603344, + "grad_norm": 8.856836318969727, + "learning_rate": 9.30368300208478e-05, + "loss": 0.5505, + "step": 690 + }, + { + "epoch": 0.5967137503603344, + "eval_Qnli-dev-1024_cosine_accuracy": 0.71875, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.868374228477478, + "eval_Qnli-dev-1024_cosine_ap": 0.7082660274050915, + "eval_Qnli-dev-1024_cosine_f1": 0.6938775510204082, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.8440404534339905, + "eval_Qnli-dev-1024_cosine_mcc": 0.3843486566998693, + "eval_Qnli-dev-1024_cosine_precision": 0.6415094339622641, + "eval_Qnli-dev-1024_cosine_recall": 0.7555555555555555, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7942297458648682, + "eval_Qnli-dev_cosine_ap": 0.7468079642036429, + "eval_Qnli-dev_cosine_f1": 0.7222222222222222, + "eval_Qnli-dev_cosine_f1_threshold": 0.6964967250823975, + "eval_Qnli-dev_cosine_mcc": 0.41614558708189836, + "eval_Qnli-dev_cosine_precision": 0.6190476190476191, + "eval_Qnli-dev_cosine_recall": 0.8666666666666667, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9375, + "eval_allNLI-triplets_cosine_accuracy": 0.96875, + "eval_global_dataset_loss": 0.6163961887359619, + "eval_global_dataset_runtime": 68.0763, + "eval_global_dataset_samples_per_second": 14.278, + "eval_global_dataset_steps_per_second": 0.308, + "eval_sequential_score": 0.9375, + "eval_sts-test-1024_pearson_cosine": 0.8687143218667199, + "eval_sts-test-1024_spearman_cosine": 0.9060527968336128, + "eval_sts-test_pearson_cosine": 0.9081393663808583, + "eval_sts-test_spearman_cosine": 0.920736019932914, + "step": 690 + }, + { + "epoch": 0.5975785528970885, + "grad_norm": 7.396856784820557, + "learning_rate": 9.295521053543019e-05, + "loss": 0.5105, + "step": 691 + }, + { + "epoch": 0.5984433554338426, + "grad_norm": 8.075766563415527, + "learning_rate": 9.287315171333144e-05, + "loss": 0.5877, + "step": 692 + }, + { + "epoch": 0.5993081579705967, + "grad_norm": 8.688477516174316, + "learning_rate": 9.279065439383157e-05, + "loss": 0.7346, + "step": 693 + }, + { + "epoch": 0.6001729605073508, + "grad_norm": 5.448639869689941, + "learning_rate": 9.27077194206955e-05, + "loss": 0.3644, + "step": 694 + }, + { + "epoch": 0.601037763044105, + "grad_norm": 5.420974254608154, + "learning_rate": 9.262434764216428e-05, + "loss": 0.2205, + "step": 695 + }, + { + "epoch": 0.6019025655808591, + "grad_norm": 6.542895793914795, + "learning_rate": 9.254053991094666e-05, + "loss": 0.2832, + "step": 696 + }, + { + "epoch": 0.6027673681176131, + "grad_norm": 3.4809961318969727, + "learning_rate": 9.245629708421008e-05, + "loss": 0.145, + "step": 697 + }, + { + "epoch": 0.6036321706543673, + "grad_norm": 10.80398941040039, + "learning_rate": 9.237162002357214e-05, + "loss": 0.8998, + "step": 698 + }, + { + "epoch": 0.6044969731911214, + "grad_norm": 2.6544158458709717, + "learning_rate": 9.228650959509166e-05, + "loss": 0.1194, + "step": 699 + }, + { + "epoch": 0.6053617757278754, + "grad_norm": 4.051424980163574, + "learning_rate": 9.220096666925982e-05, + "loss": 0.1845, + "step": 700 + }, + { + "epoch": 0.6062265782646296, + "grad_norm": 10.206416130065918, + "learning_rate": 9.211499212099135e-05, + "loss": 0.6004, + "step": 701 + }, + { + "epoch": 0.6070913808013837, + "grad_norm": 11.007821083068848, + "learning_rate": 9.202858682961545e-05, + "loss": 0.5262, + "step": 702 + }, + { + "epoch": 0.6079561833381378, + "grad_norm": 9.616263389587402, + "learning_rate": 9.194175167886698e-05, + "loss": 1.3073, + "step": 703 + }, + { + "epoch": 0.6088209858748919, + "grad_norm": 7.583075523376465, + "learning_rate": 9.185448755687717e-05, + "loss": 0.4977, + "step": 704 + }, + { + "epoch": 0.609685788411646, + "grad_norm": 3.0454254150390625, + "learning_rate": 9.176679535616477e-05, + "loss": 0.1434, + "step": 705 + }, + { + "epoch": 0.6105505909484001, + "grad_norm": 5.383974075317383, + "learning_rate": 9.167867597362682e-05, + "loss": 0.1923, + "step": 706 + }, + { + "epoch": 0.6114153934851542, + "grad_norm": 10.157812118530273, + "learning_rate": 9.159013031052943e-05, + "loss": 0.5597, + "step": 707 + }, + { + "epoch": 0.6122801960219083, + "grad_norm": 12.371292114257812, + "learning_rate": 9.150115927249869e-05, + "loss": 0.8295, + "step": 708 + }, + { + "epoch": 0.6131449985586624, + "grad_norm": 9.299467086791992, + "learning_rate": 9.141176376951128e-05, + "loss": 0.5907, + "step": 709 + }, + { + "epoch": 0.6140098010954166, + "grad_norm": 7.16170597076416, + "learning_rate": 9.132194471588522e-05, + "loss": 0.5436, + "step": 710 + }, + { + "epoch": 0.6148746036321706, + "grad_norm": 6.266456127166748, + "learning_rate": 9.123170303027055e-05, + "loss": 0.309, + "step": 711 + }, + { + "epoch": 0.6157394061689248, + "grad_norm": 10.73092269897461, + "learning_rate": 9.114103963563985e-05, + "loss": 0.7257, + "step": 712 + }, + { + "epoch": 0.6166042087056789, + "grad_norm": 8.290569305419922, + "learning_rate": 9.104995545927893e-05, + "loss": 0.6665, + "step": 713 + }, + { + "epoch": 0.617469011242433, + "grad_norm": 6.256021499633789, + "learning_rate": 9.095845143277714e-05, + "loss": 0.281, + "step": 714 + }, + { + "epoch": 0.6183338137791871, + "grad_norm": 3.134965419769287, + "learning_rate": 9.086652849201807e-05, + "loss": 0.2241, + "step": 715 + }, + { + "epoch": 0.6191986163159412, + "grad_norm": 8.62253475189209, + "learning_rate": 9.077418757716988e-05, + "loss": 0.7341, + "step": 716 + }, + { + "epoch": 0.6200634188526953, + "grad_norm": 8.627028465270996, + "learning_rate": 9.06814296326756e-05, + "loss": 0.3946, + "step": 717 + }, + { + "epoch": 0.6209282213894494, + "grad_norm": 5.673067092895508, + "learning_rate": 9.05882556072436e-05, + "loss": 0.4353, + "step": 718 + }, + { + "epoch": 0.6217930239262035, + "grad_norm": 5.314984321594238, + "learning_rate": 9.049466645383784e-05, + "loss": 0.3919, + "step": 719 + }, + { + "epoch": 0.6226578264629576, + "grad_norm": 8.689918518066406, + "learning_rate": 9.040066312966811e-05, + "loss": 0.5087, + "step": 720 + }, + { + "epoch": 0.6235226289997118, + "grad_norm": 5.046836853027344, + "learning_rate": 9.030624659618023e-05, + "loss": 0.2345, + "step": 721 + }, + { + "epoch": 0.6243874315364658, + "grad_norm": 12.160417556762695, + "learning_rate": 9.021141781904627e-05, + "loss": 0.8855, + "step": 722 + }, + { + "epoch": 0.62525223407322, + "grad_norm": 9.182302474975586, + "learning_rate": 9.011617776815464e-05, + "loss": 0.7187, + "step": 723 + }, + { + "epoch": 0.6261170366099741, + "grad_norm": 6.717326641082764, + "learning_rate": 9.002052741760015e-05, + "loss": 0.5225, + "step": 724 + }, + { + "epoch": 0.6269818391467281, + "grad_norm": 11.271307945251465, + "learning_rate": 8.992446774567405e-05, + "loss": 0.9725, + "step": 725 + }, + { + "epoch": 0.6278466416834823, + "grad_norm": 12.319371223449707, + "learning_rate": 8.982799973485407e-05, + "loss": 0.6209, + "step": 726 + }, + { + "epoch": 0.6287114442202364, + "grad_norm": 7.424941062927246, + "learning_rate": 8.973112437179436e-05, + "loss": 0.478, + "step": 727 + }, + { + "epoch": 0.6295762467569905, + "grad_norm": 6.208258628845215, + "learning_rate": 8.963384264731533e-05, + "loss": 0.2833, + "step": 728 + }, + { + "epoch": 0.6304410492937446, + "grad_norm": 4.718559265136719, + "learning_rate": 8.95361555563936e-05, + "loss": 0.2356, + "step": 729 + }, + { + "epoch": 0.6313058518304987, + "grad_norm": 9.238673210144043, + "learning_rate": 8.943806409815181e-05, + "loss": 0.6937, + "step": 730 + }, + { + "epoch": 0.6321706543672528, + "grad_norm": 11.935426712036133, + "learning_rate": 8.933956927584832e-05, + "loss": 0.8793, + "step": 731 + }, + { + "epoch": 0.6330354569040069, + "grad_norm": 8.183321952819824, + "learning_rate": 8.924067209686709e-05, + "loss": 0.6845, + "step": 732 + }, + { + "epoch": 0.633900259440761, + "grad_norm": 4.494237422943115, + "learning_rate": 8.914137357270723e-05, + "loss": 0.2744, + "step": 733 + }, + { + "epoch": 0.6347650619775151, + "grad_norm": 10.111383438110352, + "learning_rate": 8.904167471897274e-05, + "loss": 0.8681, + "step": 734 + }, + { + "epoch": 0.6356298645142693, + "grad_norm": 10.407071113586426, + "learning_rate": 8.894157655536216e-05, + "loss": 1.0385, + "step": 735 + }, + { + "epoch": 0.6364946670510233, + "grad_norm": 6.472255706787109, + "learning_rate": 8.884108010565797e-05, + "loss": 0.2331, + "step": 736 + }, + { + "epoch": 0.6373594695877774, + "grad_norm": 4.348916530609131, + "learning_rate": 8.874018639771637e-05, + "loss": 0.3183, + "step": 737 + }, + { + "epoch": 0.6382242721245316, + "grad_norm": 3.087089776992798, + "learning_rate": 8.863889646345653e-05, + "loss": 0.1691, + "step": 738 + }, + { + "epoch": 0.6390890746612857, + "grad_norm": 5.743144512176514, + "learning_rate": 8.85372113388502e-05, + "loss": 0.4625, + "step": 739 + }, + { + "epoch": 0.6399538771980398, + "grad_norm": 4.561880111694336, + "learning_rate": 8.843513206391101e-05, + "loss": 0.2338, + "step": 740 + }, + { + "epoch": 0.6408186797347939, + "grad_norm": 10.266475677490234, + "learning_rate": 8.83326596826839e-05, + "loss": 1.1701, + "step": 741 + }, + { + "epoch": 0.641683482271548, + "grad_norm": 8.521928787231445, + "learning_rate": 8.822979524323441e-05, + "loss": 0.7673, + "step": 742 + }, + { + "epoch": 0.6425482848083021, + "grad_norm": 8.54457950592041, + "learning_rate": 8.812653979763795e-05, + "loss": 0.5481, + "step": 743 + }, + { + "epoch": 0.6434130873450562, + "grad_norm": 5.748913288116455, + "learning_rate": 8.802289440196908e-05, + "loss": 0.3357, + "step": 744 + }, + { + "epoch": 0.6442778898818103, + "grad_norm": 4.804452896118164, + "learning_rate": 8.791886011629068e-05, + "loss": 0.263, + "step": 745 + }, + { + "epoch": 0.6451426924185645, + "grad_norm": 3.707672119140625, + "learning_rate": 8.781443800464316e-05, + "loss": 0.1461, + "step": 746 + }, + { + "epoch": 0.6460074949553185, + "grad_norm": 7.357616901397705, + "learning_rate": 8.77096291350334e-05, + "loss": 0.3193, + "step": 747 + }, + { + "epoch": 0.6468722974920726, + "grad_norm": 4.722273349761963, + "learning_rate": 8.760443457942408e-05, + "loss": 0.2647, + "step": 748 + }, + { + "epoch": 0.6477371000288268, + "grad_norm": 5.43215799331665, + "learning_rate": 8.749885541372257e-05, + "loss": 0.2494, + "step": 749 + }, + { + "epoch": 0.6486019025655808, + "grad_norm": 4.395086765289307, + "learning_rate": 8.739289271776991e-05, + "loss": 0.1905, + "step": 750 + }, + { + "epoch": 0.649466705102335, + "grad_norm": 6.617416858673096, + "learning_rate": 8.728654757532984e-05, + "loss": 0.6302, + "step": 751 + }, + { + "epoch": 0.6503315076390891, + "grad_norm": 3.7228050231933594, + "learning_rate": 8.717982107407768e-05, + "loss": 0.3397, + "step": 752 + }, + { + "epoch": 0.6511963101758432, + "grad_norm": 9.654953002929688, + "learning_rate": 8.707271430558919e-05, + "loss": 0.6679, + "step": 753 + }, + { + "epoch": 0.6520611127125973, + "grad_norm": 4.019669532775879, + "learning_rate": 8.69652283653294e-05, + "loss": 0.3372, + "step": 754 + }, + { + "epoch": 0.6529259152493514, + "grad_norm": 7.510921478271484, + "learning_rate": 8.68573643526415e-05, + "loss": 0.6676, + "step": 755 + }, + { + "epoch": 0.6537907177861055, + "grad_norm": 13.126535415649414, + "learning_rate": 8.674912337073544e-05, + "loss": 1.2867, + "step": 756 + }, + { + "epoch": 0.6546555203228596, + "grad_norm": 9.412704467773438, + "learning_rate": 8.66405065266768e-05, + "loss": 0.8248, + "step": 757 + }, + { + "epoch": 0.6555203228596137, + "grad_norm": 6.785587787628174, + "learning_rate": 8.653151493137536e-05, + "loss": 0.4971, + "step": 758 + }, + { + "epoch": 0.6563851253963678, + "grad_norm": 12.77095890045166, + "learning_rate": 8.642214969957376e-05, + "loss": 1.4049, + "step": 759 + }, + { + "epoch": 0.657249927933122, + "grad_norm": 6.501046180725098, + "learning_rate": 8.631241194983616e-05, + "loss": 0.3086, + "step": 760 + }, + { + "epoch": 0.658114730469876, + "grad_norm": 6.871536731719971, + "learning_rate": 8.620230280453673e-05, + "loss": 0.6796, + "step": 761 + }, + { + "epoch": 0.6589795330066301, + "grad_norm": 6.746383190155029, + "learning_rate": 8.609182338984818e-05, + "loss": 0.4314, + "step": 762 + }, + { + "epoch": 0.6598443355433843, + "grad_norm": 4.454339504241943, + "learning_rate": 8.598097483573029e-05, + "loss": 0.2843, + "step": 763 + }, + { + "epoch": 0.6607091380801383, + "grad_norm": 5.15504789352417, + "learning_rate": 8.586975827591825e-05, + "loss": 0.4569, + "step": 764 + }, + { + "epoch": 0.6615739406168925, + "grad_norm": 6.545773506164551, + "learning_rate": 8.575817484791127e-05, + "loss": 0.3931, + "step": 765 + }, + { + "epoch": 0.6624387431536466, + "grad_norm": 4.9794511795043945, + "learning_rate": 8.564622569296063e-05, + "loss": 0.2155, + "step": 766 + }, + { + "epoch": 0.6633035456904007, + "grad_norm": 8.013479232788086, + "learning_rate": 8.553391195605833e-05, + "loss": 0.3245, + "step": 767 + }, + { + "epoch": 0.6641683482271548, + "grad_norm": 9.687097549438477, + "learning_rate": 8.542123478592518e-05, + "loss": 0.7824, + "step": 768 + }, + { + "epoch": 0.6650331507639089, + "grad_norm": 5.516420364379883, + "learning_rate": 8.530819533499909e-05, + "loss": 0.3537, + "step": 769 + }, + { + "epoch": 0.665897953300663, + "grad_norm": 6.398399353027344, + "learning_rate": 8.519479475942334e-05, + "loss": 0.2212, + "step": 770 + }, + { + "epoch": 0.6667627558374172, + "grad_norm": 6.814426898956299, + "learning_rate": 8.508103421903468e-05, + "loss": 0.5911, + "step": 771 + }, + { + "epoch": 0.6676275583741712, + "grad_norm": 6.5453410148620605, + "learning_rate": 8.496691487735156e-05, + "loss": 0.4524, + "step": 772 + }, + { + "epoch": 0.6684923609109253, + "grad_norm": 3.5740625858306885, + "learning_rate": 8.485243790156208e-05, + "loss": 0.2604, + "step": 773 + }, + { + "epoch": 0.6693571634476795, + "grad_norm": 12.454208374023438, + "learning_rate": 8.473760446251221e-05, + "loss": 0.8186, + "step": 774 + }, + { + "epoch": 0.6702219659844335, + "grad_norm": 5.322040557861328, + "learning_rate": 8.462241573469379e-05, + "loss": 0.4612, + "step": 775 + }, + { + "epoch": 0.6710867685211876, + "grad_norm": 7.373685359954834, + "learning_rate": 8.450687289623235e-05, + "loss": 0.5306, + "step": 776 + }, + { + "epoch": 0.6719515710579418, + "grad_norm": 11.016031265258789, + "learning_rate": 8.439097712887531e-05, + "loss": 1.0424, + "step": 777 + }, + { + "epoch": 0.6728163735946959, + "grad_norm": 8.017274856567383, + "learning_rate": 8.427472961797971e-05, + "loss": 0.473, + "step": 778 + }, + { + "epoch": 0.67368117613145, + "grad_norm": 5.788976669311523, + "learning_rate": 8.415813155250017e-05, + "loss": 0.2846, + "step": 779 + }, + { + "epoch": 0.6745459786682041, + "grad_norm": 4.2314558029174805, + "learning_rate": 8.404118412497666e-05, + "loss": 0.4083, + "step": 780 + }, + { + "epoch": 0.6754107812049582, + "grad_norm": 3.476349115371704, + "learning_rate": 8.392388853152245e-05, + "loss": 0.236, + "step": 781 + }, + { + "epoch": 0.6762755837417123, + "grad_norm": 10.38036823272705, + "learning_rate": 8.380624597181165e-05, + "loss": 0.6732, + "step": 782 + }, + { + "epoch": 0.6771403862784664, + "grad_norm": 7.326548099517822, + "learning_rate": 8.368825764906716e-05, + "loss": 0.6798, + "step": 783 + }, + { + "epoch": 0.6780051888152205, + "grad_norm": 8.5910062789917, + "learning_rate": 8.356992477004828e-05, + "loss": 0.75, + "step": 784 + }, + { + "epoch": 0.6788699913519747, + "grad_norm": 4.450828552246094, + "learning_rate": 8.345124854503825e-05, + "loss": 0.2198, + "step": 785 + }, + { + "epoch": 0.6797347938887287, + "grad_norm": 3.15915584564209, + "learning_rate": 8.33322301878321e-05, + "loss": 0.1629, + "step": 786 + }, + { + "epoch": 0.6805995964254828, + "grad_norm": 3.2538440227508545, + "learning_rate": 8.321287091572403e-05, + "loss": 0.1949, + "step": 787 + }, + { + "epoch": 0.681464398962237, + "grad_norm": 8.031615257263184, + "learning_rate": 8.309317194949509e-05, + "loss": 0.3901, + "step": 788 + }, + { + "epoch": 0.682329201498991, + "grad_norm": 2.7871859073638916, + "learning_rate": 8.297313451340064e-05, + "loss": 0.2184, + "step": 789 + }, + { + "epoch": 0.6831940040357452, + "grad_norm": 6.6741204261779785, + "learning_rate": 8.285275983515783e-05, + "loss": 0.3516, + "step": 790 + }, + { + "epoch": 0.6840588065724993, + "grad_norm": 9.924346923828125, + "learning_rate": 8.273204914593304e-05, + "loss": 0.9001, + "step": 791 + }, + { + "epoch": 0.6849236091092534, + "grad_norm": 2.0380783081054688, + "learning_rate": 8.261100368032934e-05, + "loss": 0.0729, + "step": 792 + }, + { + "epoch": 0.6857884116460075, + "grad_norm": 4.190455913543701, + "learning_rate": 8.248962467637378e-05, + "loss": 0.1484, + "step": 793 + }, + { + "epoch": 0.6866532141827616, + "grad_norm": 10.513288497924805, + "learning_rate": 8.236791337550478e-05, + "loss": 0.8013, + "step": 794 + }, + { + "epoch": 0.6875180167195157, + "grad_norm": 5.367727279663086, + "learning_rate": 8.22458710225594e-05, + "loss": 0.2315, + "step": 795 + }, + { + "epoch": 0.6883828192562699, + "grad_norm": 4.737613201141357, + "learning_rate": 8.21234988657607e-05, + "loss": 0.2135, + "step": 796 + }, + { + "epoch": 0.6892476217930239, + "grad_norm": 7.230178356170654, + "learning_rate": 8.20007981567048e-05, + "loss": 0.6123, + "step": 797 + }, + { + "epoch": 0.690112424329778, + "grad_norm": 5.188995361328125, + "learning_rate": 8.18777701503483e-05, + "loss": 0.2533, + "step": 798 + }, + { + "epoch": 0.6909772268665322, + "grad_norm": 9.257750511169434, + "learning_rate": 8.175441610499522e-05, + "loss": 0.6212, + "step": 799 + }, + { + "epoch": 0.6918420294032862, + "grad_norm": 1.5883065462112427, + "learning_rate": 8.163073728228427e-05, + "loss": 0.0883, + "step": 800 + }, + { + "epoch": 0.6927068319400403, + "grad_norm": 8.530162811279297, + "learning_rate": 8.150673494717597e-05, + "loss": 0.3946, + "step": 801 + }, + { + "epoch": 0.6935716344767945, + "grad_norm": 7.668551445007324, + "learning_rate": 8.138241036793958e-05, + "loss": 0.4277, + "step": 802 + }, + { + "epoch": 0.6944364370135486, + "grad_norm": 8.265761375427246, + "learning_rate": 8.125776481614024e-05, + "loss": 0.5575, + "step": 803 + }, + { + "epoch": 0.6953012395503027, + "grad_norm": 7.973784446716309, + "learning_rate": 8.113279956662594e-05, + "loss": 0.4164, + "step": 804 + }, + { + "epoch": 0.6961660420870568, + "grad_norm": 4.912955284118652, + "learning_rate": 8.100751589751442e-05, + "loss": 0.1826, + "step": 805 + }, + { + "epoch": 0.6961660420870568, + "eval_Qnli-dev-1024_cosine_accuracy": 0.71875, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8731638193130493, + "eval_Qnli-dev-1024_cosine_ap": 0.724535579920194, + "eval_Qnli-dev-1024_cosine_f1": 0.7037037037037037, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.781539261341095, + "eval_Qnli-dev-1024_cosine_mcc": 0.3721962181491566, + "eval_Qnli-dev-1024_cosine_precision": 0.6031746031746031, + "eval_Qnli-dev-1024_cosine_recall": 0.8444444444444444, + "eval_Qnli-dev_cosine_accuracy": 0.7291666666666666, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.778366208076477, + "eval_Qnli-dev_cosine_ap": 0.7396468214578549, + "eval_Qnli-dev_cosine_f1": 0.7142857142857142, + "eval_Qnli-dev_cosine_f1_threshold": 0.7083452939987183, + "eval_Qnli-dev_cosine_mcc": 0.4263253018001963, + "eval_Qnli-dev_cosine_precision": 0.660377358490566, + "eval_Qnli-dev_cosine_recall": 0.7777777777777778, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9583333134651184, + "eval_allNLI-triplets_cosine_accuracy": 0.9791666865348816, + "eval_global_dataset_loss": 0.6504772901535034, + "eval_global_dataset_runtime": 67.87, + "eval_global_dataset_samples_per_second": 14.321, + "eval_global_dataset_steps_per_second": 0.309, + "eval_sequential_score": 0.9583333134651184, + "eval_sts-test-1024_pearson_cosine": 0.8671751973788917, + "eval_sts-test-1024_spearman_cosine": 0.9100830925358703, + "eval_sts-test_pearson_cosine": 0.9084514358185803, + "eval_sts-test_spearman_cosine": 0.9218648677325396, + "step": 805 + }, + { + "epoch": 0.6970308446238109, + "grad_norm": 6.679697513580322, + "learning_rate": 8.08819150901802e-05, + "loss": 0.1903, + "step": 806 + }, + { + "epoch": 0.697895647160565, + "grad_norm": 5.046963214874268, + "learning_rate": 8.075599842924139e-05, + "loss": 0.2428, + "step": 807 + }, + { + "epoch": 0.6987604496973191, + "grad_norm": 2.3785789012908936, + "learning_rate": 8.06297672025466e-05, + "loss": 0.0856, + "step": 808 + }, + { + "epoch": 0.6996252522340732, + "grad_norm": 3.804054021835327, + "learning_rate": 8.050322270116174e-05, + "loss": 0.1363, + "step": 809 + }, + { + "epoch": 0.7004900547708274, + "grad_norm": 7.834051132202148, + "learning_rate": 8.037636621935685e-05, + "loss": 0.3135, + "step": 810 + }, + { + "epoch": 0.7013548573075814, + "grad_norm": 9.44914722442627, + "learning_rate": 8.02491990545928e-05, + "loss": 0.758, + "step": 811 + }, + { + "epoch": 0.7022196598443355, + "grad_norm": 4.096058368682861, + "learning_rate": 8.012172250750807e-05, + "loss": 0.3242, + "step": 812 + }, + { + "epoch": 0.7030844623810897, + "grad_norm": 9.295559883117676, + "learning_rate": 7.999393788190548e-05, + "loss": 0.5513, + "step": 813 + }, + { + "epoch": 0.7039492649178437, + "grad_norm": 8.359066009521484, + "learning_rate": 7.986584648473874e-05, + "loss": 0.8495, + "step": 814 + }, + { + "epoch": 0.7048140674545978, + "grad_norm": 11.649872779846191, + "learning_rate": 7.973744962609921e-05, + "loss": 0.6741, + "step": 815 + }, + { + "epoch": 0.705678869991352, + "grad_norm": 7.094292163848877, + "learning_rate": 7.960874861920242e-05, + "loss": 0.3723, + "step": 816 + }, + { + "epoch": 0.7065436725281061, + "grad_norm": 7.240077495574951, + "learning_rate": 7.947974478037468e-05, + "loss": 0.5434, + "step": 817 + }, + { + "epoch": 0.7074084750648602, + "grad_norm": 2.250290632247925, + "learning_rate": 7.935043942903955e-05, + "loss": 0.1101, + "step": 818 + }, + { + "epoch": 0.7082732776016143, + "grad_norm": 4.252150058746338, + "learning_rate": 7.922083388770447e-05, + "loss": 0.1888, + "step": 819 + }, + { + "epoch": 0.7091380801383684, + "grad_norm": 4.072676181793213, + "learning_rate": 7.90909294819471e-05, + "loss": 0.2415, + "step": 820 + }, + { + "epoch": 0.7100028826751225, + "grad_norm": 9.136191368103027, + "learning_rate": 7.896072754040186e-05, + "loss": 0.4492, + "step": 821 + }, + { + "epoch": 0.7108676852118766, + "grad_norm": 10.30457878112793, + "learning_rate": 7.883022939474626e-05, + "loss": 0.9007, + "step": 822 + }, + { + "epoch": 0.7117324877486307, + "grad_norm": 5.737984657287598, + "learning_rate": 7.869943637968738e-05, + "loss": 0.5574, + "step": 823 + }, + { + "epoch": 0.7125972902853849, + "grad_norm": 9.1240234375, + "learning_rate": 7.85683498329481e-05, + "loss": 0.9687, + "step": 824 + }, + { + "epoch": 0.7134620928221389, + "grad_norm": 8.149517059326172, + "learning_rate": 7.843697109525352e-05, + "loss": 0.8161, + "step": 825 + }, + { + "epoch": 0.714326895358893, + "grad_norm": 10.62049674987793, + "learning_rate": 7.830530151031719e-05, + "loss": 0.6275, + "step": 826 + }, + { + "epoch": 0.7151916978956472, + "grad_norm": 4.933554172515869, + "learning_rate": 7.817334242482738e-05, + "loss": 0.361, + "step": 827 + }, + { + "epoch": 0.7160565004324013, + "grad_norm": 4.892520427703857, + "learning_rate": 7.804109518843334e-05, + "loss": 0.2424, + "step": 828 + }, + { + "epoch": 0.7169213029691553, + "grad_norm": 8.320906639099121, + "learning_rate": 7.790856115373142e-05, + "loss": 0.7132, + "step": 829 + }, + { + "epoch": 0.7177861055059095, + "grad_norm": 8.682563781738281, + "learning_rate": 7.77757416762513e-05, + "loss": 0.5795, + "step": 830 + }, + { + "epoch": 0.7186509080426636, + "grad_norm": 1.6696056127548218, + "learning_rate": 7.764263811444215e-05, + "loss": 0.0414, + "step": 831 + }, + { + "epoch": 0.7195157105794177, + "grad_norm": 10.681838989257812, + "learning_rate": 7.75092518296586e-05, + "loss": 0.8467, + "step": 832 + }, + { + "epoch": 0.7203805131161718, + "grad_norm": 5.933515548706055, + "learning_rate": 7.737558418614699e-05, + "loss": 0.3639, + "step": 833 + }, + { + "epoch": 0.7212453156529259, + "grad_norm": 3.935758352279663, + "learning_rate": 7.724163655103131e-05, + "loss": 0.3737, + "step": 834 + }, + { + "epoch": 0.7221101181896801, + "grad_norm": 9.584526062011719, + "learning_rate": 7.710741029429926e-05, + "loss": 0.6802, + "step": 835 + }, + { + "epoch": 0.7229749207264341, + "grad_norm": 13.70799446105957, + "learning_rate": 7.697290678878819e-05, + "loss": 1.0565, + "step": 836 + }, + { + "epoch": 0.7238397232631882, + "grad_norm": 8.689953804016113, + "learning_rate": 7.683812741017112e-05, + "loss": 0.5841, + "step": 837 + }, + { + "epoch": 0.7247045257999424, + "grad_norm": 13.601666450500488, + "learning_rate": 7.670307353694262e-05, + "loss": 1.0172, + "step": 838 + }, + { + "epoch": 0.7255693283366964, + "grad_norm": 3.734889507293701, + "learning_rate": 7.656774655040472e-05, + "loss": 0.3109, + "step": 839 + }, + { + "epoch": 0.7264341308734505, + "grad_norm": 10.951227188110352, + "learning_rate": 7.643214783465286e-05, + "loss": 0.6902, + "step": 840 + }, + { + "epoch": 0.7272989334102047, + "grad_norm": 2.3853259086608887, + "learning_rate": 7.62962787765616e-05, + "loss": 0.1287, + "step": 841 + }, + { + "epoch": 0.7281637359469588, + "grad_norm": 10.43367862701416, + "learning_rate": 7.616014076577054e-05, + "loss": 0.6679, + "step": 842 + }, + { + "epoch": 0.7290285384837129, + "grad_norm": 5.164660453796387, + "learning_rate": 7.602373519467005e-05, + "loss": 0.3239, + "step": 843 + }, + { + "epoch": 0.729893341020467, + "grad_norm": 6.129587650299072, + "learning_rate": 7.588706345838705e-05, + "loss": 0.1646, + "step": 844 + }, + { + "epoch": 0.7307581435572211, + "grad_norm": 5.64245080947876, + "learning_rate": 7.575012695477076e-05, + "loss": 0.3594, + "step": 845 + }, + { + "epoch": 0.7316229460939752, + "grad_norm": 5.245384216308594, + "learning_rate": 7.561292708437838e-05, + "loss": 0.2795, + "step": 846 + }, + { + "epoch": 0.7324877486307293, + "grad_norm": 6.762210369110107, + "learning_rate": 7.547546525046073e-05, + "loss": 0.3268, + "step": 847 + }, + { + "epoch": 0.7333525511674834, + "grad_norm": 9.87009048461914, + "learning_rate": 7.533774285894798e-05, + "loss": 0.8067, + "step": 848 + }, + { + "epoch": 0.7342173537042376, + "grad_norm": 4.286474704742432, + "learning_rate": 7.519976131843522e-05, + "loss": 0.3708, + "step": 849 + }, + { + "epoch": 0.7350821562409916, + "grad_norm": 9.59669303894043, + "learning_rate": 7.506152204016807e-05, + "loss": 0.5467, + "step": 850 + }, + { + "epoch": 0.7359469587777457, + "grad_norm": 3.928433895111084, + "learning_rate": 7.492302643802821e-05, + "loss": 0.15, + "step": 851 + }, + { + "epoch": 0.7368117613144999, + "grad_norm": 7.317601203918457, + "learning_rate": 7.478427592851893e-05, + "loss": 0.4525, + "step": 852 + }, + { + "epoch": 0.737676563851254, + "grad_norm": 6.541726589202881, + "learning_rate": 7.464527193075073e-05, + "loss": 0.3871, + "step": 853 + }, + { + "epoch": 0.738541366388008, + "grad_norm": 12.070144653320312, + "learning_rate": 7.450601586642664e-05, + "loss": 0.8351, + "step": 854 + }, + { + "epoch": 0.7394061689247622, + "grad_norm": 8.084358215332031, + "learning_rate": 7.436650915982785e-05, + "loss": 0.3939, + "step": 855 + }, + { + "epoch": 0.7402709714615163, + "grad_norm": 6.941904067993164, + "learning_rate": 7.422675323779907e-05, + "loss": 0.4311, + "step": 856 + }, + { + "epoch": 0.7411357739982704, + "grad_norm": 8.018699645996094, + "learning_rate": 7.408674952973382e-05, + "loss": 0.4675, + "step": 857 + }, + { + "epoch": 0.7420005765350245, + "grad_norm": 7.949825763702393, + "learning_rate": 7.394649946756004e-05, + "loss": 0.5963, + "step": 858 + }, + { + "epoch": 0.7428653790717786, + "grad_norm": 6.355823040008545, + "learning_rate": 7.38060044857253e-05, + "loss": 0.3415, + "step": 859 + }, + { + "epoch": 0.7437301816085328, + "grad_norm": 7.31845760345459, + "learning_rate": 7.366526602118214e-05, + "loss": 0.3599, + "step": 860 + }, + { + "epoch": 0.7445949841452868, + "grad_norm": 4.008370876312256, + "learning_rate": 7.352428551337338e-05, + "loss": 0.3354, + "step": 861 + }, + { + "epoch": 0.7454597866820409, + "grad_norm": 6.440021991729736, + "learning_rate": 7.338306440421743e-05, + "loss": 0.2971, + "step": 862 + }, + { + "epoch": 0.7463245892187951, + "grad_norm": 11.389256477355957, + "learning_rate": 7.32416041380935e-05, + "loss": 0.6679, + "step": 863 + }, + { + "epoch": 0.7471893917555491, + "grad_norm": 2.519818067550659, + "learning_rate": 7.309990616182685e-05, + "loss": 0.1211, + "step": 864 + }, + { + "epoch": 0.7480541942923032, + "grad_norm": 7.607640743255615, + "learning_rate": 7.2957971924674e-05, + "loss": 0.2407, + "step": 865 + }, + { + "epoch": 0.7489189968290574, + "grad_norm": 7.118372917175293, + "learning_rate": 7.28158028783079e-05, + "loss": 0.3254, + "step": 866 + }, + { + "epoch": 0.7497837993658115, + "grad_norm": 2.883557081222534, + "learning_rate": 7.267340047680305e-05, + "loss": 0.1074, + "step": 867 + }, + { + "epoch": 0.7506486019025655, + "grad_norm": 4.721225738525391, + "learning_rate": 7.253076617662065e-05, + "loss": 0.1904, + "step": 868 + }, + { + "epoch": 0.7515134044393197, + "grad_norm": 2.654787302017212, + "learning_rate": 7.23879014365938e-05, + "loss": 0.182, + "step": 869 + }, + { + "epoch": 0.7523782069760738, + "grad_norm": 7.568452835083008, + "learning_rate": 7.224480771791235e-05, + "loss": 0.4094, + "step": 870 + }, + { + "epoch": 0.7532430095128279, + "grad_norm": 8.068111419677734, + "learning_rate": 7.210148648410821e-05, + "loss": 0.8455, + "step": 871 + }, + { + "epoch": 0.754107812049582, + "grad_norm": 6.598762512207031, + "learning_rate": 7.195793920104023e-05, + "loss": 0.4085, + "step": 872 + }, + { + "epoch": 0.7549726145863361, + "grad_norm": 6.5393829345703125, + "learning_rate": 7.18141673368792e-05, + "loss": 0.4978, + "step": 873 + }, + { + "epoch": 0.7558374171230903, + "grad_norm": 4.241705894470215, + "learning_rate": 7.167017236209292e-05, + "loss": 0.2777, + "step": 874 + }, + { + "epoch": 0.7567022196598443, + "grad_norm": 5.239429950714111, + "learning_rate": 7.152595574943113e-05, + "loss": 0.3822, + "step": 875 + }, + { + "epoch": 0.7575670221965984, + "grad_norm": 10.576812744140625, + "learning_rate": 7.138151897391041e-05, + "loss": 0.5127, + "step": 876 + }, + { + "epoch": 0.7584318247333526, + "grad_norm": 4.40622615814209, + "learning_rate": 7.123686351279914e-05, + "loss": 0.2795, + "step": 877 + }, + { + "epoch": 0.7592966272701066, + "grad_norm": 8.214874267578125, + "learning_rate": 7.10919908456023e-05, + "loss": 0.4, + "step": 878 + }, + { + "epoch": 0.7601614298068607, + "grad_norm": 5.674429893493652, + "learning_rate": 7.094690245404652e-05, + "loss": 0.3919, + "step": 879 + }, + { + "epoch": 0.7610262323436149, + "grad_norm": 7.315159797668457, + "learning_rate": 7.080159982206471e-05, + "loss": 0.3323, + "step": 880 + }, + { + "epoch": 0.761891034880369, + "grad_norm": 5.864488124847412, + "learning_rate": 7.065608443578105e-05, + "loss": 0.5407, + "step": 881 + }, + { + "epoch": 0.762755837417123, + "grad_norm": 9.524258613586426, + "learning_rate": 7.05103577834957e-05, + "loss": 0.8925, + "step": 882 + }, + { + "epoch": 0.7636206399538772, + "grad_norm": 2.4174962043762207, + "learning_rate": 7.036442135566961e-05, + "loss": 0.116, + "step": 883 + }, + { + "epoch": 0.7644854424906313, + "grad_norm": 5.054670810699463, + "learning_rate": 7.021827664490928e-05, + "loss": 0.382, + "step": 884 + }, + { + "epoch": 0.7653502450273855, + "grad_norm": 4.311699867248535, + "learning_rate": 7.007192514595141e-05, + "loss": 0.2573, + "step": 885 + }, + { + "epoch": 0.7662150475641395, + "grad_norm": 5.006008625030518, + "learning_rate": 6.992536835564782e-05, + "loss": 0.2442, + "step": 886 + }, + { + "epoch": 0.7670798501008936, + "grad_norm": 4.521592140197754, + "learning_rate": 6.977860777294988e-05, + "loss": 0.2122, + "step": 887 + }, + { + "epoch": 0.7679446526376478, + "grad_norm": 7.981561183929443, + "learning_rate": 6.963164489889337e-05, + "loss": 0.3405, + "step": 888 + }, + { + "epoch": 0.7688094551744018, + "grad_norm": 10.011691093444824, + "learning_rate": 6.948448123658308e-05, + "loss": 0.4895, + "step": 889 + }, + { + "epoch": 0.7696742577111559, + "grad_norm": 6.9324517250061035, + "learning_rate": 6.933711829117733e-05, + "loss": 0.4046, + "step": 890 + }, + { + "epoch": 0.7705390602479101, + "grad_norm": 5.044534683227539, + "learning_rate": 6.918955756987275e-05, + "loss": 0.3365, + "step": 891 + }, + { + "epoch": 0.7714038627846642, + "grad_norm": 6.062309265136719, + "learning_rate": 6.904180058188877e-05, + "loss": 0.3073, + "step": 892 + }, + { + "epoch": 0.7722686653214182, + "grad_norm": 9.762418746948242, + "learning_rate": 6.889384883845214e-05, + "loss": 0.7621, + "step": 893 + }, + { + "epoch": 0.7731334678581724, + "grad_norm": 8.496923446655273, + "learning_rate": 6.874570385278158e-05, + "loss": 0.4088, + "step": 894 + }, + { + "epoch": 0.7739982703949265, + "grad_norm": 9.173744201660156, + "learning_rate": 6.859736714007226e-05, + "loss": 0.6372, + "step": 895 + }, + { + "epoch": 0.7748630729316806, + "grad_norm": 8.595545768737793, + "learning_rate": 6.844884021748019e-05, + "loss": 0.7089, + "step": 896 + }, + { + "epoch": 0.7757278754684347, + "grad_norm": 7.156553268432617, + "learning_rate": 6.830012460410697e-05, + "loss": 0.5503, + "step": 897 + }, + { + "epoch": 0.7765926780051888, + "grad_norm": 5.894566059112549, + "learning_rate": 6.815122182098394e-05, + "loss": 0.5239, + "step": 898 + }, + { + "epoch": 0.777457480541943, + "grad_norm": 5.80053186416626, + "learning_rate": 6.800213339105683e-05, + "loss": 0.1838, + "step": 899 + }, + { + "epoch": 0.778322283078697, + "grad_norm": 2.8142247200012207, + "learning_rate": 6.785286083917017e-05, + "loss": 0.1141, + "step": 900 + }, + { + "epoch": 0.7791870856154511, + "grad_norm": 5.2369537353515625, + "learning_rate": 6.770340569205157e-05, + "loss": 0.4552, + "step": 901 + }, + { + "epoch": 0.7800518881522053, + "grad_norm": 7.276421070098877, + "learning_rate": 6.755376947829625e-05, + "loss": 0.4267, + "step": 902 + }, + { + "epoch": 0.7809166906889593, + "grad_norm": 10.988953590393066, + "learning_rate": 6.74039537283513e-05, + "loss": 1.0252, + "step": 903 + }, + { + "epoch": 0.7817814932257134, + "grad_norm": 10.337282180786133, + "learning_rate": 6.725395997450008e-05, + "loss": 0.6281, + "step": 904 + }, + { + "epoch": 0.7826462957624676, + "grad_norm": 10.337082862854004, + "learning_rate": 6.710378975084652e-05, + "loss": 0.6716, + "step": 905 + }, + { + "epoch": 0.7835110982992217, + "grad_norm": 3.361793279647827, + "learning_rate": 6.695344459329948e-05, + "loss": 0.1769, + "step": 906 + }, + { + "epoch": 0.7843759008359757, + "grad_norm": 8.392909049987793, + "learning_rate": 6.6802926039557e-05, + "loss": 0.428, + "step": 907 + }, + { + "epoch": 0.7852407033727299, + "grad_norm": 5.3866729736328125, + "learning_rate": 6.665223562909058e-05, + "loss": 0.335, + "step": 908 + }, + { + "epoch": 0.786105505909484, + "grad_norm": 8.97474479675293, + "learning_rate": 6.650137490312935e-05, + "loss": 0.6272, + "step": 909 + }, + { + "epoch": 0.786970308446238, + "grad_norm": 9.634217262268066, + "learning_rate": 6.635034540464456e-05, + "loss": 0.6253, + "step": 910 + }, + { + "epoch": 0.7878351109829922, + "grad_norm": 3.891382932662964, + "learning_rate": 6.619914867833343e-05, + "loss": 0.2603, + "step": 911 + }, + { + "epoch": 0.7886999135197463, + "grad_norm": 6.183927059173584, + "learning_rate": 6.60477862706037e-05, + "loss": 0.5737, + "step": 912 + }, + { + "epoch": 0.7895647160565005, + "grad_norm": 7.62052583694458, + "learning_rate": 6.589625972955764e-05, + "loss": 0.3792, + "step": 913 + }, + { + "epoch": 0.7904295185932545, + "grad_norm": 8.527345657348633, + "learning_rate": 6.574457060497618e-05, + "loss": 0.308, + "step": 914 + }, + { + "epoch": 0.7912943211300086, + "grad_norm": 4.892148494720459, + "learning_rate": 6.559272044830317e-05, + "loss": 0.2018, + "step": 915 + }, + { + "epoch": 0.7921591236667628, + "grad_norm": 3.214404582977295, + "learning_rate": 6.544071081262943e-05, + "loss": 0.1299, + "step": 916 + }, + { + "epoch": 0.7930239262035169, + "grad_norm": 7.314729690551758, + "learning_rate": 6.528854325267692e-05, + "loss": 0.4338, + "step": 917 + }, + { + "epoch": 0.7938887287402709, + "grad_norm": 6.503054618835449, + "learning_rate": 6.513621932478282e-05, + "loss": 0.2775, + "step": 918 + }, + { + "epoch": 0.7947535312770251, + "grad_norm": 3.8166730403900146, + "learning_rate": 6.498374058688359e-05, + "loss": 0.2077, + "step": 919 + }, + { + "epoch": 0.7956183338137792, + "grad_norm": 3.5877130031585693, + "learning_rate": 6.483110859849907e-05, + "loss": 0.2204, + "step": 920 + }, + { + "epoch": 0.7956183338137792, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7604166666666666, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8248084783554077, + "eval_Qnli-dev-1024_cosine_ap": 0.7343586316206616, + "eval_Qnli-dev-1024_cosine_f1": 0.7628865979381444, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.8248084783554077, + "eval_Qnli-dev-1024_cosine_mcc": 0.528911810491234, + "eval_Qnli-dev-1024_cosine_precision": 0.7115384615384616, + "eval_Qnli-dev-1024_cosine_recall": 0.8222222222222222, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7927051782608032, + "eval_Qnli-dev_cosine_ap": 0.7346717053497452, + "eval_Qnli-dev_cosine_f1": 0.7254901960784313, + "eval_Qnli-dev_cosine_f1_threshold": 0.7089404463768005, + "eval_Qnli-dev_cosine_mcc": 0.43697448216965834, + "eval_Qnli-dev_cosine_precision": 0.6491228070175439, + "eval_Qnli-dev_cosine_recall": 0.8222222222222222, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9375, + "eval_allNLI-triplets_cosine_accuracy": 0.9583333134651184, + "eval_global_dataset_loss": 0.5776817798614502, + "eval_global_dataset_runtime": 67.9151, + "eval_global_dataset_samples_per_second": 14.312, + "eval_global_dataset_steps_per_second": 0.309, + "eval_sequential_score": 0.9375, + "eval_sts-test-1024_pearson_cosine": 0.8700957313588291, + "eval_sts-test-1024_spearman_cosine": 0.9050692984403192, + "eval_sts-test_pearson_cosine": 0.9076253073025099, + "eval_sts-test_spearman_cosine": 0.9206752404165891, + "step": 920 + }, + { + "epoch": 0.7964831363505332, + "grad_norm": 8.318743705749512, + "learning_rate": 6.467832492071649e-05, + "loss": 0.6926, + "step": 921 + }, + { + "epoch": 0.7973479388872874, + "grad_norm": 3.8544118404388428, + "learning_rate": 6.452539111617453e-05, + "loss": 0.287, + "step": 922 + }, + { + "epoch": 0.7982127414240415, + "grad_norm": 5.1298441886901855, + "learning_rate": 6.437230874904739e-05, + "loss": 0.119, + "step": 923 + }, + { + "epoch": 0.7990775439607957, + "grad_norm": 6.161678791046143, + "learning_rate": 6.421907938502865e-05, + "loss": 0.2366, + "step": 924 + }, + { + "epoch": 0.7999423464975497, + "grad_norm": 6.632068634033203, + "learning_rate": 6.406570459131538e-05, + "loss": 0.2398, + "step": 925 + }, + { + "epoch": 0.8008071490343038, + "grad_norm": 7.237907886505127, + "learning_rate": 6.39121859365921e-05, + "loss": 0.5197, + "step": 926 + }, + { + "epoch": 0.801671951571058, + "grad_norm": 4.103116035461426, + "learning_rate": 6.375852499101467e-05, + "loss": 0.2894, + "step": 927 + }, + { + "epoch": 0.802536754107812, + "grad_norm": 4.985617160797119, + "learning_rate": 6.36047233261943e-05, + "loss": 0.3327, + "step": 928 + }, + { + "epoch": 0.8034015566445661, + "grad_norm": 3.32094144821167, + "learning_rate": 6.345078251518143e-05, + "loss": 0.1615, + "step": 929 + }, + { + "epoch": 0.8042663591813203, + "grad_norm": 2.774306297302246, + "learning_rate": 6.329670413244967e-05, + "loss": 0.181, + "step": 930 + }, + { + "epoch": 0.8051311617180744, + "grad_norm": 5.269750595092773, + "learning_rate": 6.314248975387965e-05, + "loss": 0.2791, + "step": 931 + }, + { + "epoch": 0.8059959642548284, + "grad_norm": 6.187272548675537, + "learning_rate": 6.298814095674297e-05, + "loss": 0.4581, + "step": 932 + }, + { + "epoch": 0.8068607667915826, + "grad_norm": 5.671879291534424, + "learning_rate": 6.283365931968603e-05, + "loss": 0.2483, + "step": 933 + }, + { + "epoch": 0.8077255693283367, + "grad_norm": 12.671977043151855, + "learning_rate": 6.26790464227139e-05, + "loss": 1.2848, + "step": 934 + }, + { + "epoch": 0.8085903718650908, + "grad_norm": 4.673594951629639, + "learning_rate": 6.252430384717412e-05, + "loss": 0.2594, + "step": 935 + }, + { + "epoch": 0.8094551744018449, + "grad_norm": 6.6575117111206055, + "learning_rate": 6.236943317574056e-05, + "loss": 0.3473, + "step": 936 + }, + { + "epoch": 0.810319976938599, + "grad_norm": 3.377204656600952, + "learning_rate": 6.221443599239721e-05, + "loss": 0.2606, + "step": 937 + }, + { + "epoch": 0.8111847794753532, + "grad_norm": 7.629633903503418, + "learning_rate": 6.205931388242207e-05, + "loss": 0.3551, + "step": 938 + }, + { + "epoch": 0.8120495820121072, + "grad_norm": 7.514203071594238, + "learning_rate": 6.190406843237078e-05, + "loss": 0.3383, + "step": 939 + }, + { + "epoch": 0.8129143845488613, + "grad_norm": 6.37880277633667, + "learning_rate": 6.174870123006051e-05, + "loss": 0.2142, + "step": 940 + }, + { + "epoch": 0.8137791870856155, + "grad_norm": 5.619572639465332, + "learning_rate": 6.159321386455372e-05, + "loss": 0.2147, + "step": 941 + }, + { + "epoch": 0.8146439896223696, + "grad_norm": 4.535106658935547, + "learning_rate": 6.143760792614179e-05, + "loss": 0.2273, + "step": 942 + }, + { + "epoch": 0.8155087921591236, + "grad_norm": 9.820999145507812, + "learning_rate": 6.128188500632892e-05, + "loss": 0.9269, + "step": 943 + }, + { + "epoch": 0.8163735946958778, + "grad_norm": 13.05849838256836, + "learning_rate": 6.112604669781572e-05, + "loss": 1.5161, + "step": 944 + }, + { + "epoch": 0.8172383972326319, + "grad_norm": 5.841894626617432, + "learning_rate": 6.0970094594483004e-05, + "loss": 0.1962, + "step": 945 + }, + { + "epoch": 0.818103199769386, + "grad_norm": 7.9914069175720215, + "learning_rate": 6.0814030291375424e-05, + "loss": 0.5516, + "step": 946 + }, + { + "epoch": 0.8189680023061401, + "grad_norm": 4.961643695831299, + "learning_rate": 6.0657855384685215e-05, + "loss": 0.3498, + "step": 947 + }, + { + "epoch": 0.8198328048428942, + "grad_norm": 5.379317283630371, + "learning_rate": 6.050157147173581e-05, + "loss": 0.3962, + "step": 948 + }, + { + "epoch": 0.8206976073796484, + "grad_norm": 4.794488430023193, + "learning_rate": 6.0345180150965576e-05, + "loss": 0.2953, + "step": 949 + }, + { + "epoch": 0.8215624099164024, + "grad_norm": 3.9415969848632812, + "learning_rate": 6.0188683021911396e-05, + "loss": 0.1737, + "step": 950 + }, + { + "epoch": 0.8224272124531565, + "grad_norm": 5.720635890960693, + "learning_rate": 6.003208168519233e-05, + "loss": 0.1876, + "step": 951 + }, + { + "epoch": 0.8232920149899107, + "grad_norm": 5.777576923370361, + "learning_rate": 5.9875377742493276e-05, + "loss": 0.2037, + "step": 952 + }, + { + "epoch": 0.8241568175266647, + "grad_norm": 7.2347798347473145, + "learning_rate": 5.971857279654854e-05, + "loss": 0.6859, + "step": 953 + }, + { + "epoch": 0.8250216200634188, + "grad_norm": 9.168425559997559, + "learning_rate": 5.956166845112552e-05, + "loss": 0.5139, + "step": 954 + }, + { + "epoch": 0.825886422600173, + "grad_norm": 8.480242729187012, + "learning_rate": 5.9404666311008175e-05, + "loss": 0.4557, + "step": 955 + }, + { + "epoch": 0.8267512251369271, + "grad_norm": 7.415064811706543, + "learning_rate": 5.924756798198075e-05, + "loss": 0.418, + "step": 956 + }, + { + "epoch": 0.8276160276736811, + "grad_norm": 5.769486427307129, + "learning_rate": 5.909037507081121e-05, + "loss": 0.3326, + "step": 957 + }, + { + "epoch": 0.8284808302104353, + "grad_norm": 9.98505687713623, + "learning_rate": 5.893308918523498e-05, + "loss": 0.6773, + "step": 958 + }, + { + "epoch": 0.8293456327471894, + "grad_norm": 6.003732681274414, + "learning_rate": 5.877571193393837e-05, + "loss": 0.1938, + "step": 959 + }, + { + "epoch": 0.8302104352839434, + "grad_norm": 6.989200115203857, + "learning_rate": 5.8618244926542156e-05, + "loss": 0.2502, + "step": 960 + }, + { + "epoch": 0.8310752378206976, + "grad_norm": 5.944050312042236, + "learning_rate": 5.84606897735851e-05, + "loss": 0.1686, + "step": 961 + }, + { + "epoch": 0.8319400403574517, + "grad_norm": 2.568422794342041, + "learning_rate": 5.830304808650753e-05, + "loss": 0.1174, + "step": 962 + }, + { + "epoch": 0.8328048428942059, + "grad_norm": 6.347965717315674, + "learning_rate": 5.814532147763478e-05, + "loss": 0.3688, + "step": 963 + }, + { + "epoch": 0.8336696454309599, + "grad_norm": 9.391959190368652, + "learning_rate": 5.798751156016085e-05, + "loss": 0.4529, + "step": 964 + }, + { + "epoch": 0.834534447967714, + "grad_norm": 7.86402702331543, + "learning_rate": 5.7829619948131654e-05, + "loss": 0.5973, + "step": 965 + }, + { + "epoch": 0.8353992505044682, + "grad_norm": 9.44655990600586, + "learning_rate": 5.767164825642879e-05, + "loss": 0.7635, + "step": 966 + }, + { + "epoch": 0.8362640530412222, + "grad_norm": 7.177609920501709, + "learning_rate": 5.751359810075284e-05, + "loss": 0.5631, + "step": 967 + }, + { + "epoch": 0.8371288555779763, + "grad_norm": 5.718000411987305, + "learning_rate": 5.735547109760686e-05, + "loss": 0.313, + "step": 968 + }, + { + "epoch": 0.8379936581147305, + "grad_norm": 6.908907413482666, + "learning_rate": 5.719726886427998e-05, + "loss": 0.4425, + "step": 969 + }, + { + "epoch": 0.8388584606514846, + "grad_norm": 6.510931491851807, + "learning_rate": 5.7038993018830675e-05, + "loss": 0.3581, + "step": 970 + }, + { + "epoch": 0.8397232631882386, + "grad_norm": 6.836475372314453, + "learning_rate": 5.688064518007036e-05, + "loss": 0.2542, + "step": 971 + }, + { + "epoch": 0.8405880657249928, + "grad_norm": 6.765063762664795, + "learning_rate": 5.6722226967546764e-05, + "loss": 0.2576, + "step": 972 + }, + { + "epoch": 0.8414528682617469, + "grad_norm": 2.587757110595703, + "learning_rate": 5.65637400015274e-05, + "loss": 0.1156, + "step": 973 + }, + { + "epoch": 0.8423176707985011, + "grad_norm": 11.528030395507812, + "learning_rate": 5.640518590298298e-05, + "loss": 0.8184, + "step": 974 + }, + { + "epoch": 0.8431824733352551, + "grad_norm": 7.4515790939331055, + "learning_rate": 5.624656629357081e-05, + "loss": 0.3536, + "step": 975 + }, + { + "epoch": 0.8440472758720092, + "grad_norm": 6.2617082595825195, + "learning_rate": 5.6087882795618216e-05, + "loss": 0.3023, + "step": 976 + }, + { + "epoch": 0.8449120784087634, + "grad_norm": 4.997031211853027, + "learning_rate": 5.5929137032106005e-05, + "loss": 0.418, + "step": 977 + }, + { + "epoch": 0.8457768809455174, + "grad_norm": 7.6783671379089355, + "learning_rate": 5.577033062665179e-05, + "loss": 0.3036, + "step": 978 + }, + { + "epoch": 0.8466416834822715, + "grad_norm": 7.0620436668396, + "learning_rate": 5.561146520349343e-05, + "loss": 0.55, + "step": 979 + }, + { + "epoch": 0.8475064860190257, + "grad_norm": 8.351699829101562, + "learning_rate": 5.5452542387472416e-05, + "loss": 0.6477, + "step": 980 + }, + { + "epoch": 0.8483712885557798, + "grad_norm": 7.685431480407715, + "learning_rate": 5.529356380401722e-05, + "loss": 0.3518, + "step": 981 + }, + { + "epoch": 0.8492360910925338, + "grad_norm": 9.351055145263672, + "learning_rate": 5.5134531079126704e-05, + "loss": 0.7033, + "step": 982 + }, + { + "epoch": 0.850100893629288, + "grad_norm": 9.499361038208008, + "learning_rate": 5.497544583935347e-05, + "loss": 0.6931, + "step": 983 + }, + { + "epoch": 0.8509656961660421, + "grad_norm": 10.090303421020508, + "learning_rate": 5.481630971178721e-05, + "loss": 0.9278, + "step": 984 + }, + { + "epoch": 0.8518304987027961, + "grad_norm": 4.208652019500732, + "learning_rate": 5.465712432403812e-05, + "loss": 0.3061, + "step": 985 + }, + { + "epoch": 0.8526953012395503, + "grad_norm": 9.341512680053711, + "learning_rate": 5.4497891304220225e-05, + "loss": 0.8352, + "step": 986 + }, + { + "epoch": 0.8535601037763044, + "grad_norm": 1.4906487464904785, + "learning_rate": 5.433861228093471e-05, + "loss": 0.125, + "step": 987 + }, + { + "epoch": 0.8544249063130586, + "grad_norm": 2.660661458969116, + "learning_rate": 5.417928888325324e-05, + "loss": 0.2284, + "step": 988 + }, + { + "epoch": 0.8552897088498126, + "grad_norm": 10.015325546264648, + "learning_rate": 5.401992274070136e-05, + "loss": 0.838, + "step": 989 + }, + { + "epoch": 0.8561545113865667, + "grad_norm": 8.29864501953125, + "learning_rate": 5.386051548324179e-05, + "loss": 0.5318, + "step": 990 + }, + { + "epoch": 0.8570193139233209, + "grad_norm": 4.587142467498779, + "learning_rate": 5.3701068741257796e-05, + "loss": 0.1618, + "step": 991 + }, + { + "epoch": 0.8578841164600749, + "grad_norm": 1.8213179111480713, + "learning_rate": 5.354158414553646e-05, + "loss": 0.0871, + "step": 992 + }, + { + "epoch": 0.858748918996829, + "grad_norm": 8.93700122833252, + "learning_rate": 5.3382063327252017e-05, + "loss": 0.6915, + "step": 993 + }, + { + "epoch": 0.8596137215335832, + "grad_norm": 4.793188095092773, + "learning_rate": 5.322250791794916e-05, + "loss": 0.3728, + "step": 994 + }, + { + "epoch": 0.8604785240703373, + "grad_norm": 4.624011516571045, + "learning_rate": 5.3062919549526436e-05, + "loss": 0.2403, + "step": 995 + }, + { + "epoch": 0.8613433266070913, + "grad_norm": 1.8955051898956299, + "learning_rate": 5.2903299854219435e-05, + "loss": 0.0651, + "step": 996 + }, + { + "epoch": 0.8622081291438455, + "grad_norm": 10.889961242675781, + "learning_rate": 5.274365046458416e-05, + "loss": 0.5783, + "step": 997 + }, + { + "epoch": 0.8630729316805996, + "grad_norm": 4.15156888961792, + "learning_rate": 5.258397301348035e-05, + "loss": 0.2061, + "step": 998 + }, + { + "epoch": 0.8639377342173538, + "grad_norm": 3.9485700130462646, + "learning_rate": 5.2424269134054694e-05, + "loss": 0.154, + "step": 999 + }, + { + "epoch": 0.8648025367541078, + "grad_norm": 9.996199607849121, + "learning_rate": 5.2264540459724276e-05, + "loss": 0.4689, + "step": 1000 + }, + { + "epoch": 0.8656673392908619, + "grad_norm": 7.154214382171631, + "learning_rate": 5.21047886241597e-05, + "loss": 0.2088, + "step": 1001 + }, + { + "epoch": 0.8665321418276161, + "grad_norm": 8.80577564239502, + "learning_rate": 5.194501526126842e-05, + "loss": 0.5299, + "step": 1002 + }, + { + "epoch": 0.8673969443643701, + "grad_norm": 5.227262020111084, + "learning_rate": 5.1785222005178224e-05, + "loss": 0.2689, + "step": 1003 + }, + { + "epoch": 0.8682617469011242, + "grad_norm": 6.6007843017578125, + "learning_rate": 5.162541049022019e-05, + "loss": 0.3098, + "step": 1004 + }, + { + "epoch": 0.8691265494378784, + "grad_norm": 6.239222526550293, + "learning_rate": 5.146558235091225e-05, + "loss": 0.3478, + "step": 1005 + }, + { + "epoch": 0.8699913519746325, + "grad_norm": 2.814821243286133, + "learning_rate": 5.1305739221942364e-05, + "loss": 0.1841, + "step": 1006 + }, + { + "epoch": 0.8708561545113865, + "grad_norm": 1.4831047058105469, + "learning_rate": 5.114588273815173e-05, + "loss": 0.0862, + "step": 1007 + }, + { + "epoch": 0.8717209570481407, + "grad_norm": 8.568103790283203, + "learning_rate": 5.09860145345182e-05, + "loss": 0.5991, + "step": 1008 + }, + { + "epoch": 0.8725857595848948, + "grad_norm": 9.560081481933594, + "learning_rate": 5.082613624613946e-05, + "loss": 0.4026, + "step": 1009 + }, + { + "epoch": 0.8734505621216488, + "grad_norm": 7.88618803024292, + "learning_rate": 5.066624950821637e-05, + "loss": 0.4991, + "step": 1010 + }, + { + "epoch": 0.874315364658403, + "grad_norm": 5.938468933105469, + "learning_rate": 5.05063559560362e-05, + "loss": 0.2786, + "step": 1011 + }, + { + "epoch": 0.8751801671951571, + "grad_norm": 9.075552940368652, + "learning_rate": 5.0346457224955903e-05, + "loss": 0.4708, + "step": 1012 + }, + { + "epoch": 0.8760449697319113, + "grad_norm": 8.848043441772461, + "learning_rate": 5.018655495038541e-05, + "loss": 0.4201, + "step": 1013 + }, + { + "epoch": 0.8769097722686653, + "grad_norm": 5.168188095092773, + "learning_rate": 5.002665076777091e-05, + "loss": 0.2089, + "step": 1014 + }, + { + "epoch": 0.8777745748054194, + "grad_norm": 4.413999557495117, + "learning_rate": 4.986674631257804e-05, + "loss": 0.3158, + "step": 1015 + }, + { + "epoch": 0.8786393773421736, + "grad_norm": 9.610701560974121, + "learning_rate": 4.970684322027534e-05, + "loss": 0.7363, + "step": 1016 + }, + { + "epoch": 0.8795041798789276, + "grad_norm": 6.793404579162598, + "learning_rate": 4.9546943126317274e-05, + "loss": 0.2885, + "step": 1017 + }, + { + "epoch": 0.8803689824156817, + "grad_norm": 9.434625625610352, + "learning_rate": 4.9387047666127786e-05, + "loss": 0.4937, + "step": 1018 + }, + { + "epoch": 0.8812337849524359, + "grad_norm": 6.130424499511719, + "learning_rate": 4.9227158475083304e-05, + "loss": 0.1684, + "step": 1019 + }, + { + "epoch": 0.88209858748919, + "grad_norm": 4.250467777252197, + "learning_rate": 4.9067277188496185e-05, + "loss": 0.2749, + "step": 1020 + }, + { + "epoch": 0.882963390025944, + "grad_norm": 3.2336244583129883, + "learning_rate": 4.890740544159796e-05, + "loss": 0.2789, + "step": 1021 + }, + { + "epoch": 0.8838281925626982, + "grad_norm": 7.7692084312438965, + "learning_rate": 4.874754486952255e-05, + "loss": 0.5868, + "step": 1022 + }, + { + "epoch": 0.8846929950994523, + "grad_norm": 7.071033954620361, + "learning_rate": 4.8587697107289626e-05, + "loss": 0.5894, + "step": 1023 + }, + { + "epoch": 0.8855577976362063, + "grad_norm": 6.448328971862793, + "learning_rate": 4.84278637897878e-05, + "loss": 0.1635, + "step": 1024 + }, + { + "epoch": 0.8864226001729605, + "grad_norm": 11.375746726989746, + "learning_rate": 4.826804655175795e-05, + "loss": 0.6829, + "step": 1025 + }, + { + "epoch": 0.8872874027097146, + "grad_norm": 1.4379364252090454, + "learning_rate": 4.8108247027776565e-05, + "loss": 0.1124, + "step": 1026 + }, + { + "epoch": 0.8881522052464688, + "grad_norm": 11.497692108154297, + "learning_rate": 4.794846685223886e-05, + "loss": 1.2642, + "step": 1027 + }, + { + "epoch": 0.8890170077832228, + "grad_norm": 6.77423620223999, + "learning_rate": 4.778870765934221e-05, + "loss": 0.6585, + "step": 1028 + }, + { + "epoch": 0.8898818103199769, + "grad_norm": 6.651241779327393, + "learning_rate": 4.762897108306939e-05, + "loss": 0.2622, + "step": 1029 + }, + { + "epoch": 0.8907466128567311, + "grad_norm": 7.476505279541016, + "learning_rate": 4.7469258757171854e-05, + "loss": 0.2797, + "step": 1030 + }, + { + "epoch": 0.8916114153934852, + "grad_norm": 7.375949382781982, + "learning_rate": 4.7309572315152976e-05, + "loss": 0.2747, + "step": 1031 + }, + { + "epoch": 0.8924762179302392, + "grad_norm": 4.907548427581787, + "learning_rate": 4.7149913390251494e-05, + "loss": 0.3748, + "step": 1032 + }, + { + "epoch": 0.8933410204669934, + "grad_norm": 7.232724189758301, + "learning_rate": 4.6990283615424605e-05, + "loss": 0.1792, + "step": 1033 + }, + { + "epoch": 0.8942058230037475, + "grad_norm": 6.12727165222168, + "learning_rate": 4.6830684623331446e-05, + "loss": 0.342, + "step": 1034 + }, + { + "epoch": 0.8950706255405015, + "grad_norm": 4.968775272369385, + "learning_rate": 4.667111804631626e-05, + "loss": 0.3287, + "step": 1035 + }, + { + "epoch": 0.8950706255405015, + "eval_Qnli-dev-1024_cosine_accuracy": 0.7083333333333334, + "eval_Qnli-dev-1024_cosine_accuracy_threshold": 0.8668882846832275, + "eval_Qnli-dev-1024_cosine_ap": 0.6999822477767415, + "eval_Qnli-dev-1024_cosine_f1": 0.7090909090909091, + "eval_Qnli-dev-1024_cosine_f1_threshold": 0.7944933772087097, + "eval_Qnli-dev-1024_cosine_mcc": 0.3808509397785054, + "eval_Qnli-dev-1024_cosine_precision": 0.6, + "eval_Qnli-dev-1024_cosine_recall": 0.8666666666666667, + "eval_Qnli-dev_cosine_accuracy": 0.71875, + "eval_Qnli-dev_cosine_accuracy_threshold": 0.7880457639694214, + "eval_Qnli-dev_cosine_ap": 0.7340095878922616, + "eval_Qnli-dev_cosine_f1": 0.7238095238095237, + "eval_Qnli-dev_cosine_f1_threshold": 0.703315019607544, + "eval_Qnli-dev_cosine_mcc": 0.42578476395267345, + "eval_Qnli-dev_cosine_precision": 0.6333333333333333, + "eval_Qnli-dev_cosine_recall": 0.8444444444444444, + "eval_allNLI--triplets-1024_cosine_accuracy": 0.9375, + "eval_allNLI-triplets_cosine_accuracy": 0.9791666865348816, + "eval_global_dataset_loss": 0.5188027620315552, + "eval_global_dataset_runtime": 67.9093, + "eval_global_dataset_samples_per_second": 14.313, + "eval_global_dataset_steps_per_second": 0.309, + "eval_sequential_score": 0.9375, + "eval_sts-test-1024_pearson_cosine": 0.8797952712975998, + "eval_sts-test-1024_spearman_cosine": 0.9135158587840699, + "eval_sts-test_pearson_cosine": 0.9084511864603124, + "eval_sts-test_spearman_cosine": 0.9222536610997011, + "step": 1035 + }, + { + "epoch": 0.8959354280772557, + "grad_norm": 8.467510223388672, + "learning_rate": 4.651158551639177e-05, + "loss": 0.5348, + "step": 1036 + }, + { + "epoch": 0.8968002306140098, + "grad_norm": 6.454378604888916, + "learning_rate": 4.635208866522251e-05, + "loss": 0.6588, + "step": 1037 + }, + { + "epoch": 0.897665033150764, + "grad_norm": 6.776357650756836, + "learning_rate": 4.619262912410804e-05, + "loss": 0.5132, + "step": 1038 + }, + { + "epoch": 0.898529835687518, + "grad_norm": 10.85428237915039, + "learning_rate": 4.603320852396637e-05, + "loss": 1.1707, + "step": 1039 + }, + { + "epoch": 0.8993946382242721, + "grad_norm": 3.449406862258911, + "learning_rate": 4.587382849531717e-05, + "loss": 0.1442, + "step": 1040 + }, + { + "epoch": 0.9002594407610263, + "grad_norm": 2.9549670219421387, + "learning_rate": 4.5714490668265245e-05, + "loss": 0.2243, + "step": 1041 + }, + { + "epoch": 0.9011242432977803, + "grad_norm": 4.6617817878723145, + "learning_rate": 4.5555196672483685e-05, + "loss": 0.3099, + "step": 1042 + }, + { + "epoch": 0.9019890458345344, + "grad_norm": 6.141875267028809, + "learning_rate": 4.5395948137197296e-05, + "loss": 0.1839, + "step": 1043 + }, + { + "epoch": 0.9028538483712886, + "grad_norm": 12.232782363891602, + "learning_rate": 4.5236746691166e-05, + "loss": 0.6248, + "step": 1044 + }, + { + "epoch": 0.9037186509080427, + "grad_norm": 5.728059768676758, + "learning_rate": 4.507759396266802e-05, + "loss": 0.4605, + "step": 1045 + }, + { + "epoch": 0.9045834534447967, + "grad_norm": 8.688108444213867, + "learning_rate": 4.49184915794833e-05, + "loss": 0.4857, + "step": 1046 + }, + { + "epoch": 0.9054482559815509, + "grad_norm": 8.695257186889648, + "learning_rate": 4.475944116887695e-05, + "loss": 0.3966, + "step": 1047 + }, + { + "epoch": 0.906313058518305, + "grad_norm": 5.200995922088623, + "learning_rate": 4.460044435758241e-05, + "loss": 0.4439, + "step": 1048 + }, + { + "epoch": 0.907177861055059, + "grad_norm": 12.601680755615234, + "learning_rate": 4.4441502771785003e-05, + "loss": 0.6051, + "step": 1049 + }, + { + "epoch": 0.9080426635918132, + "grad_norm": 9.575990676879883, + "learning_rate": 4.428261803710516e-05, + "loss": 0.3982, + "step": 1050 + }, + { + "epoch": 0.9089074661285673, + "grad_norm": 2.344109058380127, + "learning_rate": 4.4123791778581865e-05, + "loss": 0.1718, + "step": 1051 + }, + { + "epoch": 0.9097722686653215, + "grad_norm": 7.567986488342285, + "learning_rate": 4.3965025620656065e-05, + "loss": 0.2641, + "step": 1052 + }, + { + "epoch": 0.9106370712020755, + "grad_norm": 8.634700775146484, + "learning_rate": 4.3806321187153934e-05, + "loss": 0.3788, + "step": 1053 + }, + { + "epoch": 0.9115018737388296, + "grad_norm": 8.53459644317627, + "learning_rate": 4.3647680101270416e-05, + "loss": 0.4456, + "step": 1054 + }, + { + "epoch": 0.9123666762755838, + "grad_norm": 10.249025344848633, + "learning_rate": 4.348910398555249e-05, + "loss": 1.0234, + "step": 1055 + }, + { + "epoch": 0.9132314788123379, + "grad_norm": 10.008344650268555, + "learning_rate": 4.333059446188269e-05, + "loss": 0.6228, + "step": 1056 + }, + { + "epoch": 0.9140962813490919, + "grad_norm": 8.067853927612305, + "learning_rate": 4.317215315146238e-05, + "loss": 0.4588, + "step": 1057 + }, + { + "epoch": 0.9149610838858461, + "grad_norm": 10.182132720947266, + "learning_rate": 4.301378167479532e-05, + "loss": 0.8651, + "step": 1058 + }, + { + "epoch": 0.9158258864226002, + "grad_norm": 11.363606452941895, + "learning_rate": 4.285548165167105e-05, + "loss": 0.8571, + "step": 1059 + }, + { + "epoch": 0.9166906889593542, + "grad_norm": 10.103208541870117, + "learning_rate": 4.2697254701148235e-05, + "loss": 0.6446, + "step": 1060 + }, + { + "epoch": 0.9175554914961084, + "grad_norm": 6.2334418296813965, + "learning_rate": 4.253910244153817e-05, + "loss": 0.2193, + "step": 1061 + }, + { + "epoch": 0.9184202940328625, + "grad_norm": 5.234436511993408, + "learning_rate": 4.2381026490388245e-05, + "loss": 0.258, + "step": 1062 + }, + { + "epoch": 0.9192850965696167, + "grad_norm": 8.499395370483398, + "learning_rate": 4.222302846446544e-05, + "loss": 0.4164, + "step": 1063 + }, + { + "epoch": 0.9201498991063707, + "grad_norm": 5.450392723083496, + "learning_rate": 4.206510997973963e-05, + "loss": 0.4783, + "step": 1064 + }, + { + "epoch": 0.9210147016431248, + "grad_norm": 5.65176248550415, + "learning_rate": 4.1907272651367226e-05, + "loss": 0.246, + "step": 1065 + }, + { + "epoch": 0.921879504179879, + "grad_norm": 8.317374229431152, + "learning_rate": 4.1749518093674566e-05, + "loss": 0.3821, + "step": 1066 + }, + { + "epoch": 0.922744306716633, + "grad_norm": 4.983073711395264, + "learning_rate": 4.159184792014145e-05, + "loss": 0.182, + "step": 1067 + }, + { + "epoch": 0.9236091092533871, + "grad_norm": 10.939299583435059, + "learning_rate": 4.143426374338459e-05, + "loss": 0.6648, + "step": 1068 + }, + { + "epoch": 0.9244739117901413, + "grad_norm": 5.333117485046387, + "learning_rate": 4.1276767175141125e-05, + "loss": 0.5405, + "step": 1069 + }, + { + "epoch": 0.9253387143268954, + "grad_norm": 6.263637542724609, + "learning_rate": 4.1119359826252226e-05, + "loss": 0.3681, + "step": 1070 + }, + { + "epoch": 0.9262035168636494, + "grad_norm": 5.16562032699585, + "learning_rate": 4.0962043306646455e-05, + "loss": 0.2323, + "step": 1071 + }, + { + "epoch": 0.9270683194004036, + "grad_norm": 6.132068634033203, + "learning_rate": 4.080481922532348e-05, + "loss": 0.4676, + "step": 1072 + }, + { + "epoch": 0.9279331219371577, + "grad_norm": 8.957972526550293, + "learning_rate": 4.064768919033746e-05, + "loss": 0.5141, + "step": 1073 + }, + { + "epoch": 0.9287979244739117, + "grad_norm": 7.958962440490723, + "learning_rate": 4.0490654808780685e-05, + "loss": 0.3067, + "step": 1074 + }, + { + "epoch": 0.9296627270106659, + "grad_norm": 6.653066158294678, + "learning_rate": 4.033371768676716e-05, + "loss": 0.4638, + "step": 1075 + }, + { + "epoch": 0.93052752954742, + "grad_norm": 5.897211074829102, + "learning_rate": 4.0176879429416086e-05, + "loss": 0.3082, + "step": 1076 + }, + { + "epoch": 0.9313923320841742, + "grad_norm": 8.102348327636719, + "learning_rate": 4.002014164083552e-05, + "loss": 0.4003, + "step": 1077 + }, + { + "epoch": 0.9322571346209282, + "grad_norm": 7.730281829833984, + "learning_rate": 3.9863505924105995e-05, + "loss": 0.3053, + "step": 1078 + }, + { + "epoch": 0.9331219371576823, + "grad_norm": 5.675047397613525, + "learning_rate": 3.970697388126397e-05, + "loss": 0.1876, + "step": 1079 + }, + { + "epoch": 0.9339867396944365, + "grad_norm": 9.553377151489258, + "learning_rate": 3.9550547113285665e-05, + "loss": 0.569, + "step": 1080 + }, + { + "epoch": 0.9348515422311905, + "grad_norm": 10.86451244354248, + "learning_rate": 3.9394227220070466e-05, + "loss": 0.8728, + "step": 1081 + }, + { + "epoch": 0.9357163447679446, + "grad_norm": 9.33718204498291, + "learning_rate": 3.923801580042476e-05, + "loss": 0.4347, + "step": 1082 + }, + { + "epoch": 0.9365811473046988, + "grad_norm": 8.696025848388672, + "learning_rate": 3.90819144520454e-05, + "loss": 0.8919, + "step": 1083 + }, + { + "epoch": 0.9374459498414529, + "grad_norm": 7.635885238647461, + "learning_rate": 3.892592477150352e-05, + "loss": 0.4828, + "step": 1084 + }, + { + "epoch": 0.9383107523782069, + "grad_norm": 7.686861038208008, + "learning_rate": 3.877004835422815e-05, + "loss": 0.4338, + "step": 1085 + }, + { + "epoch": 0.9391755549149611, + "grad_norm": 6.8635029792785645, + "learning_rate": 3.861428679448983e-05, + "loss": 0.359, + "step": 1086 + }, + { + "epoch": 0.9400403574517152, + "grad_norm": 4.335479736328125, + "learning_rate": 3.845864168538437e-05, + "loss": 0.3828, + "step": 1087 + }, + { + "epoch": 0.9409051599884694, + "grad_norm": 7.711667537689209, + "learning_rate": 3.8303114618816577e-05, + "loss": 0.5294, + "step": 1088 + }, + { + "epoch": 0.9417699625252234, + "grad_norm": 6.784587383270264, + "learning_rate": 3.814770718548396e-05, + "loss": 0.4212, + "step": 1089 + }, + { + "epoch": 0.9426347650619775, + "grad_norm": 8.687413215637207, + "learning_rate": 3.7992420974860384e-05, + "loss": 0.5723, + "step": 1090 + }, + { + "epoch": 0.9434995675987317, + "grad_norm": 3.785308361053467, + "learning_rate": 3.783725757517994e-05, + "loss": 0.2047, + "step": 1091 + }, + { + "epoch": 0.9443643701354857, + "grad_norm": 8.60908031463623, + "learning_rate": 3.7682218573420576e-05, + "loss": 0.4359, + "step": 1092 + }, + { + "epoch": 0.9452291726722398, + "grad_norm": 3.608921527862549, + "learning_rate": 3.7527305555287976e-05, + "loss": 0.2121, + "step": 1093 + }, + { + "epoch": 0.946093975208994, + "grad_norm": 7.160829544067383, + "learning_rate": 3.737252010519925e-05, + "loss": 0.461, + "step": 1094 + }, + { + "epoch": 0.9469587777457481, + "grad_norm": 2.5925629138946533, + "learning_rate": 3.721786380626675e-05, + "loss": 0.1127, + "step": 1095 + }, + { + "epoch": 0.9478235802825021, + "grad_norm": 9.759129524230957, + "learning_rate": 3.706333824028201e-05, + "loss": 0.5365, + "step": 1096 + }, + { + "epoch": 0.9486883828192563, + "grad_norm": 9.999465942382812, + "learning_rate": 3.690894498769933e-05, + "loss": 1.0112, + "step": 1097 + }, + { + "epoch": 0.9495531853560104, + "grad_norm": 9.034364700317383, + "learning_rate": 3.675468562761982e-05, + "loss": 0.6563, + "step": 1098 + }, + { + "epoch": 0.9504179878927644, + "grad_norm": 2.152198076248169, + "learning_rate": 3.6600561737775106e-05, + "loss": 0.0732, + "step": 1099 + }, + { + "epoch": 0.9512827904295186, + "grad_norm": 4.004874229431152, + "learning_rate": 3.6446574894511265e-05, + "loss": 0.1631, + "step": 1100 + }, + { + "epoch": 0.9521475929662727, + "grad_norm": 7.518155097961426, + "learning_rate": 3.629272667277274e-05, + "loss": 0.4512, + "step": 1101 + }, + { + "epoch": 0.9530123955030269, + "grad_norm": 5.864679336547852, + "learning_rate": 3.613901864608611e-05, + "loss": 0.2731, + "step": 1102 + }, + { + "epoch": 0.9538771980397809, + "grad_norm": 7.249544620513916, + "learning_rate": 3.598545238654416e-05, + "loss": 0.4866, + "step": 1103 + }, + { + "epoch": 0.954742000576535, + "grad_norm": 2.4601848125457764, + "learning_rate": 3.583202946478963e-05, + "loss": 0.2007, + "step": 1104 + }, + { + "epoch": 0.9556068031132892, + "grad_norm": 7.753067970275879, + "learning_rate": 3.567875144999925e-05, + "loss": 0.501, + "step": 1105 + }, + { + "epoch": 0.9564716056500432, + "grad_norm": 11.398188591003418, + "learning_rate": 3.5525619909867704e-05, + "loss": 0.8343, + "step": 1106 + }, + { + "epoch": 0.9573364081867973, + "grad_norm": 3.151561975479126, + "learning_rate": 3.537263641059152e-05, + "loss": 0.1781, + "step": 1107 + }, + { + "epoch": 0.9582012107235515, + "grad_norm": 5.797046184539795, + "learning_rate": 3.521980251685315e-05, + "loss": 0.3011, + "step": 1108 + }, + { + "epoch": 0.9590660132603056, + "grad_norm": 8.037071228027344, + "learning_rate": 3.506711979180485e-05, + "loss": 0.423, + "step": 1109 + }, + { + "epoch": 0.9599308157970596, + "grad_norm": 8.320140838623047, + "learning_rate": 3.49145897970528e-05, + "loss": 0.6317, + "step": 1110 + }, + { + "epoch": 0.9607956183338138, + "grad_norm": 7.24954080581665, + "learning_rate": 3.47622140926411e-05, + "loss": 0.3058, + "step": 1111 + }, + { + "epoch": 0.9616604208705679, + "grad_norm": 2.021778106689453, + "learning_rate": 3.4609994237035746e-05, + "loss": 0.1734, + "step": 1112 + }, + { + "epoch": 0.962525223407322, + "grad_norm": 4.613988876342773, + "learning_rate": 3.4457931787108774e-05, + "loss": 0.1935, + "step": 1113 + }, + { + "epoch": 0.9633900259440761, + "grad_norm": 4.552547454833984, + "learning_rate": 3.4306028298122316e-05, + "loss": 0.1533, + "step": 1114 + }, + { + "epoch": 0.9642548284808302, + "grad_norm": 10.501197814941406, + "learning_rate": 3.415428532371271e-05, + "loss": 0.9337, + "step": 1115 + }, + { + "epoch": 0.9651196310175844, + "grad_norm": 7.168083190917969, + "learning_rate": 3.40027044158745e-05, + "loss": 0.3695, + "step": 1116 + }, + { + "epoch": 0.9659844335543384, + "grad_norm": 10.598306655883789, + "learning_rate": 3.3851287124944756e-05, + "loss": 0.7095, + "step": 1117 + }, + { + "epoch": 0.9668492360910925, + "grad_norm": 5.203083038330078, + "learning_rate": 3.370003499958703e-05, + "loss": 0.4206, + "step": 1118 + }, + { + "epoch": 0.9677140386278467, + "grad_norm": 5.217127323150635, + "learning_rate": 3.3548949586775624e-05, + "loss": 0.235, + "step": 1119 + }, + { + "epoch": 0.9685788411646008, + "grad_norm": 4.155709266662598, + "learning_rate": 3.339803243177972e-05, + "loss": 0.1233, + "step": 1120 + }, + { + "epoch": 0.9694436437013548, + "grad_norm": 2.8669726848602295, + "learning_rate": 3.324728507814764e-05, + "loss": 0.1605, + "step": 1121 + }, + { + "epoch": 0.970308446238109, + "grad_norm": 3.5733962059020996, + "learning_rate": 3.3096709067691006e-05, + "loss": 0.1095, + "step": 1122 + }, + { + "epoch": 0.9711732487748631, + "grad_norm": 4.109647274017334, + "learning_rate": 3.294630594046892e-05, + "loss": 0.2737, + "step": 1123 + }, + { + "epoch": 0.9720380513116171, + "grad_norm": 7.015890121459961, + "learning_rate": 3.279607723477234e-05, + "loss": 0.3482, + "step": 1124 + }, + { + "epoch": 0.9729028538483713, + "grad_norm": 6.006662368774414, + "learning_rate": 3.2646024487108236e-05, + "loss": 0.4144, + "step": 1125 + }, + { + "epoch": 0.9737676563851254, + "grad_norm": 8.346697807312012, + "learning_rate": 3.249614923218391e-05, + "loss": 0.4055, + "step": 1126 + }, + { + "epoch": 0.9746324589218796, + "grad_norm": 6.663881778717041, + "learning_rate": 3.234645300289137e-05, + "loss": 0.5001, + "step": 1127 + }, + { + "epoch": 0.9754972614586336, + "grad_norm": 7.918451309204102, + "learning_rate": 3.21969373302915e-05, + "loss": 0.6129, + "step": 1128 + }, + { + "epoch": 0.9763620639953877, + "grad_norm": 5.889848709106445, + "learning_rate": 3.204760374359857e-05, + "loss": 0.3793, + "step": 1129 + }, + { + "epoch": 0.9772268665321419, + "grad_norm": 13.368314743041992, + "learning_rate": 3.189845377016448e-05, + "loss": 1.0901, + "step": 1130 + }, + { + "epoch": 0.9780916690688959, + "grad_norm": 2.977189064025879, + "learning_rate": 3.1749488935463145e-05, + "loss": 0.1219, + "step": 1131 + }, + { + "epoch": 0.97895647160565, + "grad_norm": 9.539501190185547, + "learning_rate": 3.160071076307497e-05, + "loss": 0.5126, + "step": 1132 + }, + { + "epoch": 0.9798212741424042, + "grad_norm": 2.8723487854003906, + "learning_rate": 3.145212077467118e-05, + "loss": 0.2261, + "step": 1133 + }, + { + "epoch": 0.9806860766791583, + "grad_norm": 4.65241813659668, + "learning_rate": 3.1303720489998326e-05, + "loss": 0.1636, + "step": 1134 + }, + { + "epoch": 0.9815508792159123, + "grad_norm": 6.660006999969482, + "learning_rate": 3.1155511426862654e-05, + "loss": 0.2711, + "step": 1135 + }, + { + "epoch": 0.9824156817526665, + "grad_norm": 4.477895259857178, + "learning_rate": 3.100749510111471e-05, + "loss": 0.3117, + "step": 1136 + }, + { + "epoch": 0.9832804842894206, + "grad_norm": 13.759649276733398, + "learning_rate": 3.085967302663375e-05, + "loss": 0.8633, + "step": 1137 + }, + { + "epoch": 0.9841452868261746, + "grad_norm": 5.8578948974609375, + "learning_rate": 3.071204671531221e-05, + "loss": 0.3619, + "step": 1138 + }, + { + "epoch": 0.9850100893629288, + "grad_norm": 2.4084582328796387, + "learning_rate": 3.056461767704037e-05, + "loss": 0.1079, + "step": 1139 + }, + { + "epoch": 0.9858748918996829, + "grad_norm": 7.170529842376709, + "learning_rate": 3.041738741969078e-05, + "loss": 0.4303, + "step": 1140 + }, + { + "epoch": 0.986739694436437, + "grad_norm": 4.021960735321045, + "learning_rate": 3.027035744910298e-05, + "loss": 0.1799, + "step": 1141 + }, + { + "epoch": 0.9876044969731911, + "grad_norm": 4.080975532531738, + "learning_rate": 3.012352926906794e-05, + "loss": 0.3902, + "step": 1142 + }, + { + "epoch": 0.9884692995099452, + "grad_norm": 2.9526562690734863, + "learning_rate": 2.9976904381312835e-05, + "loss": 0.1557, + "step": 1143 + }, + { + "epoch": 0.9893341020466994, + "grad_norm": 5.068524360656738, + "learning_rate": 2.9830484285485544e-05, + "loss": 0.1057, + "step": 1144 + }, + { + "epoch": 0.9901989045834535, + "grad_norm": 4.044787883758545, + "learning_rate": 2.968427047913942e-05, + "loss": 0.1142, + "step": 1145 + } + ], + "logging_steps": 1, + "max_steps": 3468, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1145, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}