Robot2050 commited on
Commit
0bfddb9
·
verified ·
1 Parent(s): abb443b

Upload 7 files

Browse files
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9985985985985986,
3
+ "total_flos": 222868335149056.0,
4
+ "train_loss": 0.15247767985376537,
5
+ "train_runtime": 21883.2415,
6
+ "train_samples_per_second": 8.217,
7
+ "train_steps_per_second": 0.086
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 1872, "loss": 1.2929, "lr": 5.319148936170213e-07, "epoch": 0.016016016016016016, "percentage": 0.53, "elapsed_time": "0:02:01", "remaining_time": "6:17:30"}
2
+ {"current_steps": 20, "total_steps": 1872, "loss": 1.2223, "lr": 1.0638297872340427e-06, "epoch": 0.03203203203203203, "percentage": 1.07, "elapsed_time": "0:03:56", "remaining_time": "6:04:40"}
3
+ {"current_steps": 30, "total_steps": 1872, "loss": 0.943, "lr": 1.595744680851064e-06, "epoch": 0.04804804804804805, "percentage": 1.6, "elapsed_time": "0:05:51", "remaining_time": "5:59:16"}
4
+ {"current_steps": 40, "total_steps": 1872, "loss": 0.7396, "lr": 2.1276595744680853e-06, "epoch": 0.06406406406406406, "percentage": 2.14, "elapsed_time": "0:07:46", "remaining_time": "5:55:44"}
5
+ {"current_steps": 50, "total_steps": 1872, "loss": 0.6245, "lr": 2.6595744680851065e-06, "epoch": 0.08008008008008008, "percentage": 2.67, "elapsed_time": "0:09:40", "remaining_time": "5:52:30"}
6
+ {"current_steps": 60, "total_steps": 1872, "loss": 0.5384, "lr": 3.191489361702128e-06, "epoch": 0.0960960960960961, "percentage": 3.21, "elapsed_time": "0:11:34", "remaining_time": "5:49:38"}
7
+ {"current_steps": 70, "total_steps": 1872, "loss": 0.4779, "lr": 3.723404255319149e-06, "epoch": 0.11211211211211211, "percentage": 3.74, "elapsed_time": "0:13:32", "remaining_time": "5:48:44"}
8
+ {"current_steps": 80, "total_steps": 1872, "loss": 0.4384, "lr": 4.255319148936171e-06, "epoch": 0.12812812812812813, "percentage": 4.27, "elapsed_time": "0:15:27", "remaining_time": "5:46:15"}
9
+ {"current_steps": 90, "total_steps": 1872, "loss": 0.4194, "lr": 4.787234042553192e-06, "epoch": 0.14414414414414414, "percentage": 4.81, "elapsed_time": "0:17:21", "remaining_time": "5:43:44"}
10
+ {"current_steps": 100, "total_steps": 1872, "loss": 0.3881, "lr": 5.319148936170213e-06, "epoch": 0.16016016016016016, "percentage": 5.34, "elapsed_time": "0:19:16", "remaining_time": "5:41:26"}
11
+ {"current_steps": 100, "total_steps": 1872, "eval_loss": 0.38211190700531006, "epoch": 0.16016016016016016, "percentage": 5.34, "elapsed_time": "0:19:21", "remaining_time": "5:43:00"}
12
+ {"current_steps": 110, "total_steps": 1872, "loss": 0.3735, "lr": 5.851063829787235e-06, "epoch": 0.17617617617617617, "percentage": 5.88, "elapsed_time": "0:21:37", "remaining_time": "5:46:19"}
13
+ {"current_steps": 120, "total_steps": 1872, "loss": 0.3597, "lr": 6.382978723404256e-06, "epoch": 0.1921921921921922, "percentage": 6.41, "elapsed_time": "0:23:32", "remaining_time": "5:43:35"}
14
+ {"current_steps": 130, "total_steps": 1872, "loss": 0.3451, "lr": 6.914893617021278e-06, "epoch": 0.2082082082082082, "percentage": 6.94, "elapsed_time": "0:25:26", "remaining_time": "5:40:57"}
15
+ {"current_steps": 140, "total_steps": 1872, "loss": 0.3299, "lr": 7.446808510638298e-06, "epoch": 0.22422422422422422, "percentage": 7.48, "elapsed_time": "0:27:21", "remaining_time": "5:38:26"}
16
+ {"current_steps": 150, "total_steps": 1872, "loss": 0.3224, "lr": 7.97872340425532e-06, "epoch": 0.24024024024024024, "percentage": 8.01, "elapsed_time": "0:29:15", "remaining_time": "5:35:55"}
17
+ {"current_steps": 160, "total_steps": 1872, "loss": 0.3077, "lr": 8.510638297872341e-06, "epoch": 0.25625625625625625, "percentage": 8.55, "elapsed_time": "0:31:09", "remaining_time": "5:33:28"}
18
+ {"current_steps": 170, "total_steps": 1872, "loss": 0.3017, "lr": 9.042553191489362e-06, "epoch": 0.2722722722722723, "percentage": 9.08, "elapsed_time": "0:33:04", "remaining_time": "5:31:07"}
19
+ {"current_steps": 180, "total_steps": 1872, "loss": 0.2859, "lr": 9.574468085106385e-06, "epoch": 0.2882882882882883, "percentage": 9.62, "elapsed_time": "0:34:58", "remaining_time": "5:28:50"}
20
+ {"current_steps": 190, "total_steps": 1872, "loss": 0.2877, "lr": 9.999965197129365e-06, "epoch": 0.30430430430430433, "percentage": 10.15, "elapsed_time": "0:36:52", "remaining_time": "5:26:30"}
21
+ {"current_steps": 200, "total_steps": 1872, "loss": 0.2773, "lr": 9.998747147528375e-06, "epoch": 0.3203203203203203, "percentage": 10.68, "elapsed_time": "0:38:47", "remaining_time": "5:24:15"}
22
+ {"current_steps": 200, "total_steps": 1872, "eval_loss": 0.2963399589061737, "epoch": 0.3203203203203203, "percentage": 10.68, "elapsed_time": "0:38:51", "remaining_time": "5:24:55"}
23
+ {"current_steps": 210, "total_steps": 1872, "loss": 0.2793, "lr": 9.995789438861128e-06, "epoch": 0.33633633633633636, "percentage": 11.22, "elapsed_time": "0:41:08", "remaining_time": "5:25:32"}
24
+ {"current_steps": 220, "total_steps": 1872, "loss": 0.2678, "lr": 9.991093100466482e-06, "epoch": 0.35235235235235235, "percentage": 11.75, "elapsed_time": "0:43:02", "remaining_time": "5:23:12"}
25
+ {"current_steps": 230, "total_steps": 1872, "loss": 0.2633, "lr": 9.98465976675951e-06, "epoch": 0.3683683683683684, "percentage": 12.29, "elapsed_time": "0:44:58", "remaining_time": "5:21:04"}
26
+ {"current_steps": 240, "total_steps": 1872, "loss": 0.2556, "lr": 9.976491676662679e-06, "epoch": 0.3843843843843844, "percentage": 12.82, "elapsed_time": "0:46:53", "remaining_time": "5:18:48"}
27
+ {"current_steps": 250, "total_steps": 1872, "loss": 0.2501, "lr": 9.966591672826674e-06, "epoch": 0.4004004004004004, "percentage": 13.35, "elapsed_time": "0:48:47", "remaining_time": "5:16:34"}
28
+ {"current_steps": 260, "total_steps": 1872, "loss": 0.239, "lr": 9.95496320064109e-06, "epoch": 0.4164164164164164, "percentage": 13.89, "elapsed_time": "0:50:42", "remaining_time": "5:14:25"}
29
+ {"current_steps": 270, "total_steps": 1872, "loss": 0.2352, "lr": 9.941610307035385e-06, "epoch": 0.43243243243243246, "percentage": 14.42, "elapsed_time": "0:52:38", "remaining_time": "5:12:19"}
30
+ {"current_steps": 280, "total_steps": 1872, "loss": 0.2343, "lr": 9.926537639070457e-06, "epoch": 0.44844844844844844, "percentage": 14.96, "elapsed_time": "0:54:34", "remaining_time": "5:10:16"}
31
+ {"current_steps": 290, "total_steps": 1872, "loss": 0.2246, "lr": 9.90975044232139e-06, "epoch": 0.4644644644644645, "percentage": 15.49, "elapsed_time": "0:56:29", "remaining_time": "5:08:11"}
32
+ {"current_steps": 300, "total_steps": 1872, "loss": 0.2209, "lr": 9.891254559051886e-06, "epoch": 0.4804804804804805, "percentage": 16.03, "elapsed_time": "0:58:24", "remaining_time": "5:06:04"}
33
+ {"current_steps": 300, "total_steps": 1872, "eval_loss": 0.22802673280239105, "epoch": 0.4804804804804805, "percentage": 16.03, "elapsed_time": "0:58:29", "remaining_time": "5:06:29"}
34
+ {"current_steps": 310, "total_steps": 1872, "loss": 0.2206, "lr": 9.871056426181052e-06, "epoch": 0.4964964964964965, "percentage": 16.56, "elapsed_time": "1:00:53", "remaining_time": "5:06:50"}
35
+ {"current_steps": 320, "total_steps": 1872, "loss": 0.2229, "lr": 9.849163073043223e-06, "epoch": 0.5125125125125125, "percentage": 17.09, "elapsed_time": "1:02:48", "remaining_time": "5:04:36"}
36
+ {"current_steps": 330, "total_steps": 1872, "loss": 0.206, "lr": 9.82558211894163e-06, "epoch": 0.5285285285285285, "percentage": 17.63, "elapsed_time": "1:04:43", "remaining_time": "5:02:26"}
37
+ {"current_steps": 340, "total_steps": 1872, "loss": 0.2106, "lr": 9.800321770496726e-06, "epoch": 0.5445445445445446, "percentage": 18.16, "elapsed_time": "1:06:56", "remaining_time": "5:01:36"}
38
+ {"current_steps": 350, "total_steps": 1872, "loss": 0.2049, "lr": 9.773390818790136e-06, "epoch": 0.5605605605605606, "percentage": 18.7, "elapsed_time": "1:08:54", "remaining_time": "4:59:39"}
39
+ {"current_steps": 360, "total_steps": 1872, "loss": 0.2045, "lr": 9.744798636305189e-06, "epoch": 0.5765765765765766, "percentage": 19.23, "elapsed_time": "1:10:49", "remaining_time": "4:57:28"}
40
+ {"current_steps": 370, "total_steps": 1872, "loss": 0.1948, "lr": 9.714555173665112e-06, "epoch": 0.5925925925925926, "percentage": 19.76, "elapsed_time": "1:12:45", "remaining_time": "4:55:23"}
41
+ {"current_steps": 380, "total_steps": 1872, "loss": 0.1947, "lr": 9.68267095617003e-06, "epoch": 0.6086086086086087, "percentage": 20.3, "elapsed_time": "1:14:41", "remaining_time": "4:53:17"}
42
+ {"current_steps": 390, "total_steps": 1872, "loss": 0.1923, "lr": 9.649157080133962e-06, "epoch": 0.6246246246246246, "percentage": 20.83, "elapsed_time": "1:16:40", "remaining_time": "4:51:22"}
43
+ {"current_steps": 400, "total_steps": 1872, "loss": 0.1945, "lr": 9.614025209023084e-06, "epoch": 0.6406406406406406, "percentage": 21.37, "elapsed_time": "1:18:37", "remaining_time": "4:49:19"}
44
+ {"current_steps": 400, "total_steps": 1872, "eval_loss": 0.19540859758853912, "epoch": 0.6406406406406406, "percentage": 21.37, "elapsed_time": "1:18:42", "remaining_time": "4:49:37"}
45
+ {"current_steps": 410, "total_steps": 1872, "loss": 0.1894, "lr": 9.577287569396632e-06, "epoch": 0.6566566566566566, "percentage": 21.9, "elapsed_time": "1:21:05", "remaining_time": "4:49:08"}
46
+ {"current_steps": 420, "total_steps": 1872, "loss": 0.1782, "lr": 9.538956946651816e-06, "epoch": 0.6726726726726727, "percentage": 22.44, "elapsed_time": "1:23:00", "remaining_time": "4:46:57"}
47
+ {"current_steps": 430, "total_steps": 1872, "loss": 0.1723, "lr": 9.499046680574267e-06, "epoch": 0.6886886886886887, "percentage": 22.97, "elapsed_time": "1:24:56", "remaining_time": "4:44:52"}
48
+ {"current_steps": 440, "total_steps": 1872, "loss": 0.1785, "lr": 9.457570660695542e-06, "epoch": 0.7047047047047047, "percentage": 23.5, "elapsed_time": "1:26:52", "remaining_time": "4:42:45"}
49
+ {"current_steps": 450, "total_steps": 1872, "loss": 0.1767, "lr": 9.41454332145928e-06, "epoch": 0.7207207207207207, "percentage": 24.04, "elapsed_time": "1:28:49", "remaining_time": "4:40:40"}
50
+ {"current_steps": 460, "total_steps": 1872, "loss": 0.1738, "lr": 9.369979637197774e-06, "epoch": 0.7367367367367368, "percentage": 24.57, "elapsed_time": "1:30:44", "remaining_time": "4:38:32"}
51
+ {"current_steps": 470, "total_steps": 1872, "loss": 0.1688, "lr": 9.323895116920591e-06, "epoch": 0.7527527527527528, "percentage": 25.11, "elapsed_time": "1:32:39", "remaining_time": "4:36:24"}
52
+ {"current_steps": 480, "total_steps": 1872, "loss": 0.1628, "lr": 9.27630579891716e-06, "epoch": 0.7687687687687688, "percentage": 25.64, "elapsed_time": "1:34:34", "remaining_time": "4:34:16"}
53
+ {"current_steps": 490, "total_steps": 1872, "loss": 0.1676, "lr": 9.227228245175127e-06, "epoch": 0.7847847847847848, "percentage": 26.18, "elapsed_time": "1:36:30", "remaining_time": "4:32:11"}
54
+ {"current_steps": 500, "total_steps": 1872, "loss": 0.1592, "lr": 9.176679535616477e-06, "epoch": 0.8008008008008008, "percentage": 26.71, "elapsed_time": "1:38:25", "remaining_time": "4:30:03"}
55
+ {"current_steps": 500, "total_steps": 1872, "eval_loss": 0.1710364669561386, "epoch": 0.8008008008008008, "percentage": 26.71, "elapsed_time": "1:38:29", "remaining_time": "4:30:16"}
56
+ {"current_steps": 510, "total_steps": 1872, "loss": 0.1586, "lr": 9.124677262153405e-06, "epoch": 0.8168168168168168, "percentage": 27.24, "elapsed_time": "1:40:49", "remaining_time": "4:29:15"}
57
+ {"current_steps": 520, "total_steps": 1872, "loss": 0.1556, "lr": 9.071239522565978e-06, "epoch": 0.8328328328328328, "percentage": 27.78, "elapsed_time": "1:42:45", "remaining_time": "4:27:09"}
58
+ {"current_steps": 530, "total_steps": 1872, "loss": 0.1592, "lr": 9.016384914203771e-06, "epoch": 0.8488488488488488, "percentage": 28.31, "elapsed_time": "1:44:40", "remaining_time": "4:25:02"}
59
+ {"current_steps": 540, "total_steps": 1872, "loss": 0.1616, "lr": 8.960132527513642e-06, "epoch": 0.8648648648648649, "percentage": 28.85, "elapsed_time": "1:46:36", "remaining_time": "4:22:58"}
60
+ {"current_steps": 550, "total_steps": 1872, "loss": 0.155, "lr": 8.902501939395887e-06, "epoch": 0.8808808808808809, "percentage": 29.38, "elapsed_time": "1:48:31", "remaining_time": "4:20:51"}
61
+ {"current_steps": 560, "total_steps": 1872, "loss": 0.1514, "lr": 8.8435132063911e-06, "epoch": 0.8968968968968969, "percentage": 29.91, "elapsed_time": "1:50:26", "remaining_time": "4:18:45"}
62
+ {"current_steps": 570, "total_steps": 1872, "loss": 0.1455, "lr": 8.783186857700137e-06, "epoch": 0.9129129129129129, "percentage": 30.45, "elapsed_time": "1:52:20", "remaining_time": "4:16:37"}
63
+ {"current_steps": 580, "total_steps": 1872, "loss": 0.1417, "lr": 8.721543888039534e-06, "epoch": 0.928928928928929, "percentage": 30.98, "elapsed_time": "1:54:15", "remaining_time": "4:14:32"}
64
+ {"current_steps": 590, "total_steps": 1872, "loss": 0.155, "lr": 8.658605750334972e-06, "epoch": 0.944944944944945, "percentage": 31.52, "elapsed_time": "1:56:10", "remaining_time": "4:12:26"}
65
+ {"current_steps": 600, "total_steps": 1872, "loss": 0.1393, "lr": 8.594394348255239e-06, "epoch": 0.960960960960961, "percentage": 32.05, "elapsed_time": "1:58:04", "remaining_time": "4:10:19"}
66
+ {"current_steps": 600, "total_steps": 1872, "eval_loss": 0.1448938250541687, "epoch": 0.960960960960961, "percentage": 32.05, "elapsed_time": "1:58:09", "remaining_time": "4:10:28"}
67
+ {"current_steps": 610, "total_steps": 1872, "loss": 0.142, "lr": 8.528932028589337e-06, "epoch": 0.9769769769769769, "percentage": 32.59, "elapsed_time": "2:00:27", "remaining_time": "4:09:12"}
68
+ {"current_steps": 620, "total_steps": 1872, "loss": 0.1426, "lr": 8.462241573469378e-06, "epoch": 0.992992992992993, "percentage": 33.12, "elapsed_time": "2:02:22", "remaining_time": "4:07:06"}
69
+ {"current_steps": 630, "total_steps": 1872, "loss": 0.1415, "lr": 8.394346192441967e-06, "epoch": 1.0092092092092093, "percentage": 33.65, "elapsed_time": "2:04:16", "remaining_time": "4:04:59"}
70
+ {"current_steps": 640, "total_steps": 1872, "loss": 0.1176, "lr": 8.325269514390835e-06, "epoch": 1.0252252252252252, "percentage": 34.19, "elapsed_time": "2:06:11", "remaining_time": "4:02:54"}
71
+ {"current_steps": 650, "total_steps": 1872, "loss": 0.1223, "lr": 8.255035579313545e-06, "epoch": 1.0412412412412413, "percentage": 34.72, "elapsed_time": "2:08:06", "remaining_time": "4:00:50"}
72
+ {"current_steps": 660, "total_steps": 1872, "loss": 0.1117, "lr": 8.183668829955111e-06, "epoch": 1.0572572572572572, "percentage": 35.26, "elapsed_time": "2:10:02", "remaining_time": "3:58:48"}
73
+ {"current_steps": 670, "total_steps": 1872, "loss": 0.1176, "lr": 8.111194103301461e-06, "epoch": 1.0732732732732733, "percentage": 35.79, "elapsed_time": "2:11:56", "remaining_time": "3:56:42"}
74
+ {"current_steps": 680, "total_steps": 1872, "loss": 0.1193, "lr": 8.037636621935686e-06, "epoch": 1.0892892892892894, "percentage": 36.32, "elapsed_time": "2:13:50", "remaining_time": "3:54:36"}
75
+ {"current_steps": 690, "total_steps": 1872, "loss": 0.1182, "lr": 7.96302198526011e-06, "epoch": 1.1053053053053052, "percentage": 36.86, "elapsed_time": "2:15:45", "remaining_time": "3:52:32"}
76
+ {"current_steps": 700, "total_steps": 1872, "loss": 0.1138, "lr": 7.887376160587214e-06, "epoch": 1.1213213213213213, "percentage": 37.39, "elapsed_time": "2:17:40", "remaining_time": "3:50:29"}
77
+ {"current_steps": 700, "total_steps": 1872, "eval_loss": 0.15331269800662994, "epoch": 1.1213213213213213, "percentage": 37.39, "elapsed_time": "2:17:44", "remaining_time": "3:50:37"}
78
+ {"current_steps": 710, "total_steps": 1872, "loss": 0.1116, "lr": 7.810725474102504e-06, "epoch": 1.1373373373373372, "percentage": 37.93, "elapsed_time": "2:20:03", "remaining_time": "3:49:13"}
79
+ {"current_steps": 720, "total_steps": 1872, "loss": 0.1123, "lr": 7.733096601702508e-06, "epoch": 1.1533533533533533, "percentage": 38.46, "elapsed_time": "2:21:58", "remaining_time": "3:47:09"}
80
+ {"current_steps": 730, "total_steps": 1872, "loss": 0.1142, "lr": 7.654516559711053e-06, "epoch": 1.1693693693693694, "percentage": 39.0, "elapsed_time": "2:23:53", "remaining_time": "3:45:06"}
81
+ {"current_steps": 740, "total_steps": 1872, "loss": 0.1095, "lr": 7.575012695477076e-06, "epoch": 1.1853853853853853, "percentage": 39.53, "elapsed_time": "2:25:48", "remaining_time": "3:43:03"}
82
+ {"current_steps": 750, "total_steps": 1872, "loss": 0.1091, "lr": 7.494612677857218e-06, "epoch": 1.2014014014014014, "percentage": 40.06, "elapsed_time": "2:27:43", "remaining_time": "3:40:59"}
83
+ {"current_steps": 760, "total_steps": 1872, "loss": 0.1099, "lr": 7.413344487586542e-06, "epoch": 1.2174174174174175, "percentage": 40.6, "elapsed_time": "2:29:38", "remaining_time": "3:38:56"}
84
+ {"current_steps": 770, "total_steps": 1872, "loss": 0.1137, "lr": 7.331236407540704e-06, "epoch": 1.2334334334334334, "percentage": 41.13, "elapsed_time": "2:31:32", "remaining_time": "3:36:53"}
85
+ {"current_steps": 780, "total_steps": 1872, "loss": 0.1128, "lr": 7.248317012892969e-06, "epoch": 1.2494494494494495, "percentage": 41.67, "elapsed_time": "2:33:27", "remaining_time": "3:34:51"}
86
+ {"current_steps": 790, "total_steps": 1872, "loss": 0.1087, "lr": 7.164615161169518e-06, "epoch": 1.2654654654654656, "percentage": 42.2, "elapsed_time": "2:35:22", "remaining_time": "3:32:47"}
87
+ {"current_steps": 800, "total_steps": 1872, "loss": 0.1096, "lr": 7.080159982206471e-06, "epoch": 1.2814814814814814, "percentage": 42.74, "elapsed_time": "2:37:18", "remaining_time": "3:30:47"}
88
+ {"current_steps": 800, "total_steps": 1872, "eval_loss": 0.140634223818779, "epoch": 1.2814814814814814, "percentage": 42.74, "elapsed_time": "2:37:23", "remaining_time": "3:30:53"}
89
+ {"current_steps": 810, "total_steps": 1872, "loss": 0.1109, "lr": 6.994980868012151e-06, "epoch": 1.2974974974974975, "percentage": 43.27, "elapsed_time": "2:39:47", "remaining_time": "3:29:30"}
90
+ {"current_steps": 820, "total_steps": 1872, "loss": 0.1104, "lr": 6.909107462538113e-06, "epoch": 1.3135135135135134, "percentage": 43.8, "elapsed_time": "2:41:42", "remaining_time": "3:27:27"}
91
+ {"current_steps": 830, "total_steps": 1872, "loss": 0.1091, "lr": 6.822569651362475e-06, "epoch": 1.3295295295295295, "percentage": 44.34, "elapsed_time": "2:43:37", "remaining_time": "3:25:25"}
92
+ {"current_steps": 840, "total_steps": 1872, "loss": 0.1072, "lr": 6.735397551289179e-06, "epoch": 1.3455455455455456, "percentage": 44.87, "elapsed_time": "2:45:32", "remaining_time": "3:23:22"}
93
+ {"current_steps": 850, "total_steps": 1872, "loss": 0.1065, "lr": 6.647621499866762e-06, "epoch": 1.3615615615615615, "percentage": 45.41, "elapsed_time": "2:47:29", "remaining_time": "3:21:22"}
94
+ {"current_steps": 860, "total_steps": 1872, "loss": 0.1049, "lr": 6.5592720448303174e-06, "epoch": 1.3775775775775776, "percentage": 45.94, "elapsed_time": "2:49:24", "remaining_time": "3:19:20"}
95
+ {"current_steps": 870, "total_steps": 1872, "loss": 0.1018, "lr": 6.470379933470296e-06, "epoch": 1.3935935935935935, "percentage": 46.47, "elapsed_time": "2:51:19", "remaining_time": "3:17:18"}
96
+ {"current_steps": 880, "total_steps": 1872, "loss": 0.1015, "lr": 6.380976101931879e-06, "epoch": 1.4096096096096096, "percentage": 47.01, "elapsed_time": "2:53:14", "remaining_time": "3:15:17"}
97
+ {"current_steps": 890, "total_steps": 1872, "loss": 0.1076, "lr": 6.291091664448589e-06, "epoch": 1.4256256256256257, "percentage": 47.54, "elapsed_time": "2:55:09", "remaining_time": "3:13:16"}
98
+ {"current_steps": 900, "total_steps": 1872, "loss": 0.1028, "lr": 6.200757902513962e-06, "epoch": 1.4416416416416418, "percentage": 48.08, "elapsed_time": "2:57:05", "remaining_time": "3:11:15"}
99
+ {"current_steps": 900, "total_steps": 1872, "eval_loss": 0.12726753950119019, "epoch": 1.4416416416416418, "percentage": 48.08, "elapsed_time": "2:57:10", "remaining_time": "3:11:20"}
100
+ {"current_steps": 910, "total_steps": 1872, "loss": 0.1058, "lr": 6.11000625399499e-06, "epoch": 1.4576576576576576, "percentage": 48.61, "elapsed_time": "2:59:30", "remaining_time": "3:09:45"}
101
+ {"current_steps": 920, "total_steps": 1872, "loss": 0.1008, "lr": 6.0188683021911394e-06, "epoch": 1.4736736736736737, "percentage": 49.15, "elapsed_time": "3:01:24", "remaining_time": "3:07:43"}
102
+ {"current_steps": 930, "total_steps": 1872, "loss": 0.0986, "lr": 5.927375764842766e-06, "epoch": 1.4896896896896896, "percentage": 49.68, "elapsed_time": "3:03:19", "remaining_time": "3:05:41"}
103
+ {"current_steps": 940, "total_steps": 1872, "loss": 0.1045, "lr": 5.835560483092743e-06, "epoch": 1.5057057057057057, "percentage": 50.21, "elapsed_time": "3:05:14", "remaining_time": "3:03:39"}
104
+ {"current_steps": 950, "total_steps": 1872, "loss": 0.1008, "lr": 5.743454410405126e-06, "epoch": 1.5217217217217218, "percentage": 50.75, "elapsed_time": "3:07:08", "remaining_time": "3:01:37"}
105
+ {"current_steps": 960, "total_steps": 1872, "loss": 0.0975, "lr": 5.651089601444752e-06, "epoch": 1.5377377377377377, "percentage": 51.28, "elapsed_time": "3:09:04", "remaining_time": "2:59:37"}
106
+ {"current_steps": 970, "total_steps": 1872, "loss": 0.103, "lr": 5.558498200921597e-06, "epoch": 1.5537537537537538, "percentage": 51.82, "elapsed_time": "3:10:58", "remaining_time": "2:57:35"}
107
+ {"current_steps": 980, "total_steps": 1872, "loss": 0.1009, "lr": 5.465712432403812e-06, "epoch": 1.5697697697697697, "percentage": 52.35, "elapsed_time": "3:12:53", "remaining_time": "2:55:34"}
108
+ {"current_steps": 990, "total_steps": 1872, "loss": 0.1026, "lr": 5.372764587103309e-06, "epoch": 1.5857857857857858, "percentage": 52.88, "elapsed_time": "3:14:48", "remaining_time": "2:53:33"}
109
+ {"current_steps": 1000, "total_steps": 1872, "loss": 0.0998, "lr": 5.279687012637798e-06, "epoch": 1.6018018018018019, "percentage": 53.42, "elapsed_time": "3:16:42", "remaining_time": "2:51:31"}
110
+ {"current_steps": 1000, "total_steps": 1872, "eval_loss": 0.12056411057710648, "epoch": 1.6018018018018019, "percentage": 53.42, "elapsed_time": "3:16:46", "remaining_time": "2:51:35"}
111
+ {"current_steps": 1010, "total_steps": 1872, "loss": 0.0987, "lr": 5.186512101773206e-06, "epoch": 1.617817817817818, "percentage": 53.95, "elapsed_time": "3:19:06", "remaining_time": "2:49:56"}
112
+ {"current_steps": 1020, "total_steps": 1872, "loss": 0.0967, "lr": 5.093272281150383e-06, "epoch": 1.6338338338338338, "percentage": 54.49, "elapsed_time": "3:21:01", "remaining_time": "2:47:54"}
113
+ {"current_steps": 1030, "total_steps": 1872, "loss": 0.102, "lr": 5e-06, "epoch": 1.6498498498498497, "percentage": 55.02, "elapsed_time": "3:22:55", "remaining_time": "2:45:52"}
114
+ {"current_steps": 1040, "total_steps": 1872, "loss": 0.0918, "lr": 4.906727718849619e-06, "epoch": 1.6658658658658658, "percentage": 55.56, "elapsed_time": "3:24:49", "remaining_time": "2:43:51"}
115
+ {"current_steps": 1050, "total_steps": 1872, "loss": 0.0972, "lr": 4.813487898226794e-06, "epoch": 1.681881881881882, "percentage": 56.09, "elapsed_time": "3:26:43", "remaining_time": "2:41:50"}
116
+ {"current_steps": 1060, "total_steps": 1872, "loss": 0.0963, "lr": 4.720312987362204e-06, "epoch": 1.697897897897898, "percentage": 56.62, "elapsed_time": "3:28:39", "remaining_time": "2:39:50"}
117
+ {"current_steps": 1070, "total_steps": 1872, "loss": 0.0947, "lr": 4.6272354128966924e-06, "epoch": 1.713913913913914, "percentage": 57.16, "elapsed_time": "3:30:33", "remaining_time": "2:37:49"}
118
+ {"current_steps": 1080, "total_steps": 1872, "loss": 0.1, "lr": 4.534287567596189e-06, "epoch": 1.7299299299299298, "percentage": 57.69, "elapsed_time": "3:32:28", "remaining_time": "2:35:48"}
119
+ {"current_steps": 1090, "total_steps": 1872, "loss": 0.0939, "lr": 4.441501799078405e-06, "epoch": 1.7459459459459459, "percentage": 58.23, "elapsed_time": "3:34:23", "remaining_time": "2:33:48"}
120
+ {"current_steps": 1100, "total_steps": 1872, "loss": 0.0952, "lr": 4.348910398555249e-06, "epoch": 1.761961961961962, "percentage": 58.76, "elapsed_time": "3:36:18", "remaining_time": "2:31:48"}
121
+ {"current_steps": 1100, "total_steps": 1872, "eval_loss": 0.12079311162233353, "epoch": 1.761961961961962, "percentage": 58.76, "elapsed_time": "3:36:22", "remaining_time": "2:31:51"}
122
+ {"current_steps": 1110, "total_steps": 1872, "loss": 0.095, "lr": 4.2565455895948745e-06, "epoch": 1.777977977977978, "percentage": 59.29, "elapsed_time": "3:38:41", "remaining_time": "2:30:07"}
123
+ {"current_steps": 1120, "total_steps": 1872, "loss": 0.096, "lr": 4.164439516907258e-06, "epoch": 1.793993993993994, "percentage": 59.83, "elapsed_time": "3:40:35", "remaining_time": "2:28:06"}
124
+ {"current_steps": 1130, "total_steps": 1872, "loss": 0.0932, "lr": 4.072624235157234e-06, "epoch": 1.81001001001001, "percentage": 60.36, "elapsed_time": "3:42:30", "remaining_time": "2:26:06"}
125
+ {"current_steps": 1140, "total_steps": 1872, "loss": 0.0961, "lr": 3.981131697808862e-06, "epoch": 1.826026026026026, "percentage": 60.9, "elapsed_time": "3:44:24", "remaining_time": "2:24:05"}
126
+ {"current_steps": 1150, "total_steps": 1872, "loss": 0.0938, "lr": 3.889993746005011e-06, "epoch": 1.842042042042042, "percentage": 61.43, "elapsed_time": "3:46:19", "remaining_time": "2:22:05"}
127
+ {"current_steps": 1160, "total_steps": 1872, "loss": 0.0958, "lr": 3.799242097486038e-06, "epoch": 1.8580580580580581, "percentage": 61.97, "elapsed_time": "3:48:13", "remaining_time": "2:20:05"}
128
+ {"current_steps": 1170, "total_steps": 1872, "loss": 0.0972, "lr": 3.708908335551412e-06, "epoch": 1.8740740740740742, "percentage": 62.5, "elapsed_time": "3:50:07", "remaining_time": "2:18:04"}
129
+ {"current_steps": 1180, "total_steps": 1872, "loss": 0.0931, "lr": 3.6190238980681235e-06, "epoch": 1.89009009009009, "percentage": 63.03, "elapsed_time": "3:52:01", "remaining_time": "2:16:03"}
130
+ {"current_steps": 1190, "total_steps": 1872, "loss": 0.0925, "lr": 3.529620066529704e-06, "epoch": 1.906106106106106, "percentage": 63.57, "elapsed_time": "3:53:54", "remaining_time": "2:14:03"}
131
+ {"current_steps": 1200, "total_steps": 1872, "loss": 0.092, "lr": 3.4407279551696846e-06, "epoch": 1.922122122122122, "percentage": 64.1, "elapsed_time": "3:55:46", "remaining_time": "2:12:02"}
132
+ {"current_steps": 1200, "total_steps": 1872, "eval_loss": 0.12118110805749893, "epoch": 1.922122122122122, "percentage": 64.1, "elapsed_time": "3:55:51", "remaining_time": "2:12:04"}
133
+ {"current_steps": 1210, "total_steps": 1872, "loss": 0.0912, "lr": 3.352378500133239e-06, "epoch": 1.9381381381381382, "percentage": 64.64, "elapsed_time": "3:58:07", "remaining_time": "2:10:16"}
134
+ {"current_steps": 1220, "total_steps": 1872, "loss": 0.092, "lr": 3.264602448710822e-06, "epoch": 1.9541541541541543, "percentage": 65.17, "elapsed_time": "3:59:59", "remaining_time": "2:08:15"}
135
+ {"current_steps": 1230, "total_steps": 1872, "loss": 0.0913, "lr": 3.177430348637527e-06, "epoch": 1.9701701701701702, "percentage": 65.71, "elapsed_time": "4:01:51", "remaining_time": "2:06:14"}
136
+ {"current_steps": 1240, "total_steps": 1872, "loss": 0.0874, "lr": 3.090892537461889e-06, "epoch": 1.986186186186186, "percentage": 66.24, "elapsed_time": "4:03:42", "remaining_time": "2:04:12"}
137
+ {"current_steps": 1250, "total_steps": 1872, "loss": 0.0869, "lr": 3.00501913198785e-06, "epoch": 2.0024024024024025, "percentage": 66.77, "elapsed_time": "4:05:33", "remaining_time": "2:02:11"}
138
+ {"current_steps": 1260, "total_steps": 1872, "loss": 0.0629, "lr": 2.9198400177935303e-06, "epoch": 2.0184184184184186, "percentage": 67.31, "elapsed_time": "4:07:24", "remaining_time": "2:00:10"}
139
+ {"current_steps": 1270, "total_steps": 1872, "loss": 0.0638, "lr": 2.835384838830481e-06, "epoch": 2.0344344344344343, "percentage": 67.84, "elapsed_time": "4:09:15", "remaining_time": "1:58:09"}
140
+ {"current_steps": 1280, "total_steps": 1872, "loss": 0.0588, "lr": 2.7516829871070295e-06, "epoch": 2.0504504504504504, "percentage": 68.38, "elapsed_time": "4:11:08", "remaining_time": "1:56:09"}
141
+ {"current_steps": 1290, "total_steps": 1872, "loss": 0.0595, "lr": 2.668763592459297e-06, "epoch": 2.0664664664664665, "percentage": 68.91, "elapsed_time": "4:13:00", "remaining_time": "1:54:08"}
142
+ {"current_steps": 1300, "total_steps": 1872, "loss": 0.061, "lr": 2.586655512413458e-06, "epoch": 2.0824824824824826, "percentage": 69.44, "elapsed_time": "4:14:51", "remaining_time": "1:52:08"}
143
+ {"current_steps": 1300, "total_steps": 1872, "eval_loss": 0.1126304417848587, "epoch": 2.0824824824824826, "percentage": 69.44, "elapsed_time": "4:14:55", "remaining_time": "1:52:10"}
144
+ {"current_steps": 1310, "total_steps": 1872, "loss": 0.0612, "lr": 2.505387322142782e-06, "epoch": 2.0984984984984987, "percentage": 69.98, "elapsed_time": "4:17:24", "remaining_time": "1:50:25"}
145
+ {"current_steps": 1320, "total_steps": 1872, "loss": 0.0607, "lr": 2.4249873045229244e-06, "epoch": 2.1145145145145143, "percentage": 70.51, "elapsed_time": "4:19:15", "remaining_time": "1:48:25"}
146
+ {"current_steps": 1330, "total_steps": 1872, "loss": 0.06, "lr": 2.345483440288947e-06, "epoch": 2.1305305305305304, "percentage": 71.05, "elapsed_time": "4:21:07", "remaining_time": "1:46:24"}
147
+ {"current_steps": 1340, "total_steps": 1872, "loss": 0.0588, "lr": 2.2669033982974946e-06, "epoch": 2.1465465465465465, "percentage": 71.58, "elapsed_time": "4:22:58", "remaining_time": "1:44:24"}
148
+ {"current_steps": 1350, "total_steps": 1872, "loss": 0.0636, "lr": 2.189274525897498e-06, "epoch": 2.1625625625625626, "percentage": 72.12, "elapsed_time": "4:24:48", "remaining_time": "1:42:23"}
149
+ {"current_steps": 1360, "total_steps": 1872, "loss": 0.0607, "lr": 2.1126238394127868e-06, "epoch": 2.1785785785785787, "percentage": 72.65, "elapsed_time": "4:26:39", "remaining_time": "1:40:23"}
150
+ {"current_steps": 1370, "total_steps": 1872, "loss": 0.062, "lr": 2.03697801473989e-06, "epoch": 2.1945945945945944, "percentage": 73.18, "elapsed_time": "4:28:30", "remaining_time": "1:38:23"}
151
+ {"current_steps": 1380, "total_steps": 1872, "loss": 0.0633, "lr": 1.962363378064316e-06, "epoch": 2.2106106106106105, "percentage": 73.72, "elapsed_time": "4:30:20", "remaining_time": "1:36:23"}
152
+ {"current_steps": 1390, "total_steps": 1872, "loss": 0.0633, "lr": 1.8888058966985407e-06, "epoch": 2.2266266266266266, "percentage": 74.25, "elapsed_time": "4:32:11", "remaining_time": "1:34:23"}
153
+ {"current_steps": 1400, "total_steps": 1872, "loss": 0.0624, "lr": 1.8163311700448899e-06, "epoch": 2.2426426426426427, "percentage": 74.79, "elapsed_time": "4:34:03", "remaining_time": "1:32:23"}
154
+ {"current_steps": 1400, "total_steps": 1872, "eval_loss": 0.1149037629365921, "epoch": 2.2426426426426427, "percentage": 74.79, "elapsed_time": "4:34:07", "remaining_time": "1:32:25"}
155
+ {"current_steps": 1410, "total_steps": 1872, "loss": 0.0623, "lr": 1.7449644206864564e-06, "epoch": 2.258658658658659, "percentage": 75.32, "elapsed_time": "4:36:18", "remaining_time": "1:30:32"}
156
+ {"current_steps": 1420, "total_steps": 1872, "loss": 0.059, "lr": 1.6747304856091662e-06, "epoch": 2.2746746746746744, "percentage": 75.85, "elapsed_time": "4:38:09", "remaining_time": "1:28:32"}
157
+ {"current_steps": 1430, "total_steps": 1872, "loss": 0.0627, "lr": 1.6056538075580342e-06, "epoch": 2.2906906906906905, "percentage": 76.39, "elapsed_time": "4:40:02", "remaining_time": "1:26:33"}
158
+ {"current_steps": 1440, "total_steps": 1872, "loss": 0.0593, "lr": 1.5377584265306222e-06, "epoch": 2.3067067067067066, "percentage": 76.92, "elapsed_time": "4:41:53", "remaining_time": "1:24:34"}
159
+ {"current_steps": 1450, "total_steps": 1872, "loss": 0.0583, "lr": 1.4710679714106635e-06, "epoch": 2.3227227227227227, "percentage": 77.46, "elapsed_time": "4:43:44", "remaining_time": "1:22:34"}
160
+ {"current_steps": 1460, "total_steps": 1872, "loss": 0.0626, "lr": 1.4056056517447637e-06, "epoch": 2.338738738738739, "percentage": 77.99, "elapsed_time": "4:45:34", "remaining_time": "1:20:35"}
161
+ {"current_steps": 1470, "total_steps": 1872, "loss": 0.0598, "lr": 1.3413942496650301e-06, "epoch": 2.354754754754755, "percentage": 78.53, "elapsed_time": "4:47:25", "remaining_time": "1:18:36"}
162
+ {"current_steps": 1480, "total_steps": 1872, "loss": 0.0609, "lr": 1.2784561119604683e-06, "epoch": 2.3707707707707706, "percentage": 79.06, "elapsed_time": "4:49:17", "remaining_time": "1:16:37"}
163
+ {"current_steps": 1490, "total_steps": 1872, "loss": 0.0618, "lr": 1.2168131422998653e-06, "epoch": 2.3867867867867867, "percentage": 79.59, "elapsed_time": "4:51:09", "remaining_time": "1:14:38"}
164
+ {"current_steps": 1500, "total_steps": 1872, "loss": 0.0607, "lr": 1.156486793608899e-06, "epoch": 2.402802802802803, "percentage": 80.13, "elapsed_time": "4:52:59", "remaining_time": "1:12:39"}
165
+ {"current_steps": 1500, "total_steps": 1872, "eval_loss": 0.10994044691324234, "epoch": 2.402802802802803, "percentage": 80.13, "elapsed_time": "4:53:03", "remaining_time": "1:12:40"}
166
+ {"current_steps": 1510, "total_steps": 1872, "loss": 0.0594, "lr": 1.0974980606041152e-06, "epoch": 2.418818818818819, "percentage": 80.66, "elapsed_time": "4:55:22", "remaining_time": "1:10:48"}
167
+ {"current_steps": 1520, "total_steps": 1872, "loss": 0.059, "lr": 1.0398674724863584e-06, "epoch": 2.434834834834835, "percentage": 81.2, "elapsed_time": "4:57:13", "remaining_time": "1:08:49"}
168
+ {"current_steps": 1530, "total_steps": 1872, "loss": 0.0596, "lr": 9.836150857962296e-07, "epoch": 2.450850850850851, "percentage": 81.73, "elapsed_time": "4:59:04", "remaining_time": "1:06:51"}
169
+ {"current_steps": 1540, "total_steps": 1872, "loss": 0.0581, "lr": 9.287604774340236e-07, "epoch": 2.4668668668668667, "percentage": 82.26, "elapsed_time": "5:00:55", "remaining_time": "1:04:52"}
170
+ {"current_steps": 1550, "total_steps": 1872, "loss": 0.0589, "lr": 8.753227378465956e-07, "epoch": 2.482882882882883, "percentage": 82.8, "elapsed_time": "5:02:47", "remaining_time": "1:02:54"}
171
+ {"current_steps": 1560, "total_steps": 1872, "loss": 0.0585, "lr": 8.233204643835235e-07, "epoch": 2.498898898898899, "percentage": 83.33, "elapsed_time": "5:04:37", "remaining_time": "1:00:55"}
172
+ {"current_steps": 1570, "total_steps": 1872, "loss": 0.0627, "lr": 7.72771754824877e-07, "epoch": 2.514914914914915, "percentage": 83.87, "elapsed_time": "5:06:28", "remaining_time": "0:58:57"}
173
+ {"current_steps": 1580, "total_steps": 1872, "loss": 0.0594, "lr": 7.23694201082843e-07, "epoch": 2.530930930930931, "percentage": 84.4, "elapsed_time": "5:08:20", "remaining_time": "0:56:59"}
174
+ {"current_steps": 1590, "total_steps": 1872, "loss": 0.0583, "lr": 6.761048830794098e-07, "epoch": 2.546946946946947, "percentage": 84.94, "elapsed_time": "5:10:19", "remaining_time": "0:55:02"}
175
+ {"current_steps": 1600, "total_steps": 1872, "loss": 0.0584, "lr": 6.300203628022272e-07, "epoch": 2.562962962962963, "percentage": 85.47, "elapsed_time": "5:12:15", "remaining_time": "0:53:05"}
176
+ {"current_steps": 1600, "total_steps": 1872, "eval_loss": 0.1075584813952446, "epoch": 2.562962962962963, "percentage": 85.47, "elapsed_time": "5:12:20", "remaining_time": "0:53:05"}
177
+ {"current_steps": 1610, "total_steps": 1872, "loss": 0.0615, "lr": 5.854566785407212e-07, "epoch": 2.578978978978979, "percentage": 86.0, "elapsed_time": "5:14:34", "remaining_time": "0:51:11"}
178
+ {"current_steps": 1620, "total_steps": 1872, "loss": 0.0589, "lr": 5.42429339304461e-07, "epoch": 2.594994994994995, "percentage": 86.54, "elapsed_time": "5:16:26", "remaining_time": "0:49:13"}
179
+ {"current_steps": 1630, "total_steps": 1872, "loss": 0.0576, "lr": 5.009533194257332e-07, "epoch": 2.611011011011011, "percentage": 87.07, "elapsed_time": "5:18:19", "remaining_time": "0:47:15"}
180
+ {"current_steps": 1640, "total_steps": 1872, "loss": 0.0581, "lr": 4.6104305334818577e-07, "epoch": 2.627027027027027, "percentage": 87.61, "elapsed_time": "5:20:10", "remaining_time": "0:45:17"}
181
+ {"current_steps": 1650, "total_steps": 1872, "loss": 0.0565, "lr": 4.2271243060336976e-07, "epoch": 2.643043043043043, "percentage": 88.14, "elapsed_time": "5:22:02", "remaining_time": "0:43:19"}
182
+ {"current_steps": 1660, "total_steps": 1872, "loss": 0.06, "lr": 3.8597479097691626e-07, "epoch": 2.659059059059059, "percentage": 88.68, "elapsed_time": "5:23:53", "remaining_time": "0:41:21"}
183
+ {"current_steps": 1670, "total_steps": 1872, "loss": 0.0592, "lr": 3.508429198660379e-07, "epoch": 2.675075075075075, "percentage": 89.21, "elapsed_time": "5:25:45", "remaining_time": "0:39:24"}
184
+ {"current_steps": 1680, "total_steps": 1872, "loss": 0.0531, "lr": 3.1732904382996975e-07, "epoch": 2.6910910910910912, "percentage": 89.74, "elapsed_time": "5:27:37", "remaining_time": "0:37:26"}
185
+ {"current_steps": 1690, "total_steps": 1872, "loss": 0.0592, "lr": 2.854448263348891e-07, "epoch": 2.707107107107107, "percentage": 90.28, "elapsed_time": "5:29:29", "remaining_time": "0:35:29"}
186
+ {"current_steps": 1700, "total_steps": 1872, "loss": 0.0574, "lr": 2.5520136369481194e-07, "epoch": 2.723123123123123, "percentage": 90.81, "elapsed_time": "5:31:20", "remaining_time": "0:33:31"}
187
+ {"current_steps": 1700, "total_steps": 1872, "eval_loss": 0.10619153082370758, "epoch": 2.723123123123123, "percentage": 90.81, "elapsed_time": "5:31:25", "remaining_time": "0:33:31"}
188
+ {"current_steps": 1710, "total_steps": 1872, "loss": 0.0581, "lr": 2.266091812098642e-07, "epoch": 2.739139139139139, "percentage": 91.35, "elapsed_time": "5:33:43", "remaining_time": "0:31:36"}
189
+ {"current_steps": 1720, "total_steps": 1872, "loss": 0.0609, "lr": 1.9967822950327453e-07, "epoch": 2.755155155155155, "percentage": 91.88, "elapsed_time": "5:35:36", "remaining_time": "0:29:39"}
190
+ {"current_steps": 1730, "total_steps": 1872, "loss": 0.0593, "lr": 1.7441788105837133e-07, "epoch": 2.7711711711711713, "percentage": 92.41, "elapsed_time": "5:37:27", "remaining_time": "0:27:41"}
191
+ {"current_steps": 1740, "total_steps": 1872, "loss": 0.0606, "lr": 1.508369269567783e-07, "epoch": 2.787187187187187, "percentage": 92.95, "elapsed_time": "5:39:19", "remaining_time": "0:25:44"}
192
+ {"current_steps": 1750, "total_steps": 1872, "loss": 0.0564, "lr": 1.2894357381894984e-07, "epoch": 2.803203203203203, "percentage": 93.48, "elapsed_time": "5:41:12", "remaining_time": "0:23:47"}
193
+ {"current_steps": 1760, "total_steps": 1872, "loss": 0.0586, "lr": 1.0874544094811424e-07, "epoch": 2.819219219219219, "percentage": 94.02, "elapsed_time": "5:43:04", "remaining_time": "0:21:49"}
194
+ {"current_steps": 1770, "total_steps": 1872, "loss": 0.0629, "lr": 9.024955767861054e-08, "epoch": 2.8352352352352352, "percentage": 94.55, "elapsed_time": "5:44:57", "remaining_time": "0:19:52"}
195
+ {"current_steps": 1780, "total_steps": 1872, "loss": 0.0562, "lr": 7.346236092954318e-08, "epoch": 2.8512512512512513, "percentage": 95.09, "elapsed_time": "5:46:49", "remaining_time": "0:17:55"}
196
+ {"current_steps": 1790, "total_steps": 1872, "loss": 0.0583, "lr": 5.838969296461605e-08, "epoch": 2.867267267267267, "percentage": 95.62, "elapsed_time": "5:48:40", "remaining_time": "0:15:58"}
197
+ {"current_steps": 1800, "total_steps": 1872, "loss": 0.0571, "lr": 4.50367993589107e-08, "epoch": 2.8832832832832835, "percentage": 96.15, "elapsed_time": "5:50:31", "remaining_time": "0:14:01"}
198
+ {"current_steps": 1800, "total_steps": 1872, "eval_loss": 0.10513312369585037, "epoch": 2.8832832832832835, "percentage": 96.15, "elapsed_time": "5:50:36", "remaining_time": "0:14:01"}
199
+ {"current_steps": 1810, "total_steps": 1872, "loss": 0.0605, "lr": 3.340832717332765e-08, "epoch": 2.899299299299299, "percentage": 96.69, "elapsed_time": "5:52:48", "remaining_time": "0:12:05"}
200
+ {"current_steps": 1820, "total_steps": 1872, "loss": 0.0567, "lr": 2.3508323337321225e-08, "epoch": 2.9153153153153153, "percentage": 97.22, "elapsed_time": "5:54:39", "remaining_time": "0:10:07"}
201
+ {"current_steps": 1830, "total_steps": 1872, "loss": 0.0564, "lr": 1.534023324049061e-08, "epoch": 2.9313313313313314, "percentage": 97.76, "elapsed_time": "5:56:31", "remaining_time": "0:08:10"}
202
+ {"current_steps": 1840, "total_steps": 1872, "loss": 0.0589, "lr": 8.906899533517866e-09, "epoch": 2.9473473473473475, "percentage": 98.29, "elapsed_time": "5:58:22", "remaining_time": "0:06:13"}
203
+ {"current_steps": 1850, "total_steps": 1872, "loss": 0.0576, "lr": 4.210561138873193e-09, "epoch": 2.9633633633633636, "percentage": 98.82, "elapsed_time": "6:00:14", "remaining_time": "0:04:17"}
204
+ {"current_steps": 1860, "total_steps": 1872, "loss": 0.0548, "lr": 1.2528524716259872e-09, "epoch": 2.9793793793793792, "percentage": 99.36, "elapsed_time": "6:02:06", "remaining_time": "0:02:20"}
205
+ {"current_steps": 1870, "total_steps": 1872, "loss": 0.0587, "lr": 3.480287063706289e-11, "epoch": 2.9953953953953953, "percentage": 99.89, "elapsed_time": "6:03:57", "remaining_time": "0:00:23"}
206
+ {"current_steps": 1872, "total_steps": 1872, "epoch": 2.9985985985985986, "percentage": 100.0, "elapsed_time": "6:04:43", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,1495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.10513312369585037,
3
+ "best_model_checkpoint": "/users/u2023000898/train_moe/pretrain_data_chunk/60000_3/checkpoint-1800",
4
+ "epoch": 2.9985985985985986,
5
+ "eval_steps": 100,
6
+ "global_step": 1872,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.016016016016016016,
13
+ "grad_norm": 13.409518938793026,
14
+ "learning_rate": 5.319148936170213e-07,
15
+ "loss": 1.2929,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.03203203203203203,
20
+ "grad_norm": 10.369912201219414,
21
+ "learning_rate": 1.0638297872340427e-06,
22
+ "loss": 1.2223,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.04804804804804805,
27
+ "grad_norm": 5.44261186688239,
28
+ "learning_rate": 1.595744680851064e-06,
29
+ "loss": 0.943,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.06406406406406406,
34
+ "grad_norm": 3.605857199785854,
35
+ "learning_rate": 2.1276595744680853e-06,
36
+ "loss": 0.7396,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.08008008008008008,
41
+ "grad_norm": 1.9924246604906095,
42
+ "learning_rate": 2.6595744680851065e-06,
43
+ "loss": 0.6245,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.0960960960960961,
48
+ "grad_norm": 2.3307973177641075,
49
+ "learning_rate": 3.191489361702128e-06,
50
+ "loss": 0.5384,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.11211211211211211,
55
+ "grad_norm": 2.417305413795213,
56
+ "learning_rate": 3.723404255319149e-06,
57
+ "loss": 0.4779,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.12812812812812813,
62
+ "grad_norm": 2.4395863243528044,
63
+ "learning_rate": 4.255319148936171e-06,
64
+ "loss": 0.4384,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.14414414414414414,
69
+ "grad_norm": 2.967607994777358,
70
+ "learning_rate": 4.787234042553192e-06,
71
+ "loss": 0.4194,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.16016016016016016,
76
+ "grad_norm": 1.8226498622217497,
77
+ "learning_rate": 5.319148936170213e-06,
78
+ "loss": 0.3881,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.16016016016016016,
83
+ "eval_loss": 0.38211190700531006,
84
+ "eval_runtime": 5.3031,
85
+ "eval_samples_per_second": 11.314,
86
+ "eval_steps_per_second": 5.657,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.17617617617617617,
91
+ "grad_norm": 2.0417129648638404,
92
+ "learning_rate": 5.851063829787235e-06,
93
+ "loss": 0.3735,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 0.1921921921921922,
98
+ "grad_norm": 1.8910151101905892,
99
+ "learning_rate": 6.382978723404256e-06,
100
+ "loss": 0.3597,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 0.2082082082082082,
105
+ "grad_norm": 2.356033958506011,
106
+ "learning_rate": 6.914893617021278e-06,
107
+ "loss": 0.3451,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 0.22422422422422422,
112
+ "grad_norm": 1.6182433590940526,
113
+ "learning_rate": 7.446808510638298e-06,
114
+ "loss": 0.3299,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 0.24024024024024024,
119
+ "grad_norm": 1.5854757979381888,
120
+ "learning_rate": 7.97872340425532e-06,
121
+ "loss": 0.3224,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 0.25625625625625625,
126
+ "grad_norm": 1.6564206751789106,
127
+ "learning_rate": 8.510638297872341e-06,
128
+ "loss": 0.3077,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.2722722722722723,
133
+ "grad_norm": 1.647017991315564,
134
+ "learning_rate": 9.042553191489362e-06,
135
+ "loss": 0.3017,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 0.2882882882882883,
140
+ "grad_norm": 2.0446337917200434,
141
+ "learning_rate": 9.574468085106385e-06,
142
+ "loss": 0.2859,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 0.30430430430430433,
147
+ "grad_norm": 1.434904669145637,
148
+ "learning_rate": 9.999965197129365e-06,
149
+ "loss": 0.2877,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 0.3203203203203203,
154
+ "grad_norm": 2.9106697114797573,
155
+ "learning_rate": 9.998747147528375e-06,
156
+ "loss": 0.2773,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 0.3203203203203203,
161
+ "eval_loss": 0.2963399589061737,
162
+ "eval_runtime": 4.7335,
163
+ "eval_samples_per_second": 12.676,
164
+ "eval_steps_per_second": 6.338,
165
+ "step": 200
166
+ },
167
+ {
168
+ "epoch": 0.33633633633633636,
169
+ "grad_norm": 2.310160659638816,
170
+ "learning_rate": 9.995789438861128e-06,
171
+ "loss": 0.2793,
172
+ "step": 210
173
+ },
174
+ {
175
+ "epoch": 0.35235235235235235,
176
+ "grad_norm": 1.6140139693791007,
177
+ "learning_rate": 9.991093100466482e-06,
178
+ "loss": 0.2678,
179
+ "step": 220
180
+ },
181
+ {
182
+ "epoch": 0.3683683683683684,
183
+ "grad_norm": 1.6727704090132902,
184
+ "learning_rate": 9.98465976675951e-06,
185
+ "loss": 0.2633,
186
+ "step": 230
187
+ },
188
+ {
189
+ "epoch": 0.3843843843843844,
190
+ "grad_norm": 1.6386702854417348,
191
+ "learning_rate": 9.976491676662679e-06,
192
+ "loss": 0.2556,
193
+ "step": 240
194
+ },
195
+ {
196
+ "epoch": 0.4004004004004004,
197
+ "grad_norm": 1.5318495172198847,
198
+ "learning_rate": 9.966591672826674e-06,
199
+ "loss": 0.2501,
200
+ "step": 250
201
+ },
202
+ {
203
+ "epoch": 0.4164164164164164,
204
+ "grad_norm": 1.4131958267494484,
205
+ "learning_rate": 9.95496320064109e-06,
206
+ "loss": 0.239,
207
+ "step": 260
208
+ },
209
+ {
210
+ "epoch": 0.43243243243243246,
211
+ "grad_norm": 2.116850571157825,
212
+ "learning_rate": 9.941610307035385e-06,
213
+ "loss": 0.2352,
214
+ "step": 270
215
+ },
216
+ {
217
+ "epoch": 0.44844844844844844,
218
+ "grad_norm": 1.4833918481789057,
219
+ "learning_rate": 9.926537639070457e-06,
220
+ "loss": 0.2343,
221
+ "step": 280
222
+ },
223
+ {
224
+ "epoch": 0.4644644644644645,
225
+ "grad_norm": 1.4499258247835383,
226
+ "learning_rate": 9.90975044232139e-06,
227
+ "loss": 0.2246,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 0.4804804804804805,
232
+ "grad_norm": 1.5412749043910834,
233
+ "learning_rate": 9.891254559051886e-06,
234
+ "loss": 0.2209,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 0.4804804804804805,
239
+ "eval_loss": 0.22802673280239105,
240
+ "eval_runtime": 4.7226,
241
+ "eval_samples_per_second": 12.705,
242
+ "eval_steps_per_second": 6.352,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 0.4964964964964965,
247
+ "grad_norm": 1.6016396794420027,
248
+ "learning_rate": 9.871056426181052e-06,
249
+ "loss": 0.2206,
250
+ "step": 310
251
+ },
252
+ {
253
+ "epoch": 0.5125125125125125,
254
+ "grad_norm": 1.4910134315726407,
255
+ "learning_rate": 9.849163073043223e-06,
256
+ "loss": 0.2229,
257
+ "step": 320
258
+ },
259
+ {
260
+ "epoch": 0.5285285285285285,
261
+ "grad_norm": 1.5947649628687839,
262
+ "learning_rate": 9.82558211894163e-06,
263
+ "loss": 0.206,
264
+ "step": 330
265
+ },
266
+ {
267
+ "epoch": 0.5445445445445446,
268
+ "grad_norm": 2.281846286082721,
269
+ "learning_rate": 9.800321770496726e-06,
270
+ "loss": 0.2106,
271
+ "step": 340
272
+ },
273
+ {
274
+ "epoch": 0.5605605605605606,
275
+ "grad_norm": 1.628823704341528,
276
+ "learning_rate": 9.773390818790136e-06,
277
+ "loss": 0.2049,
278
+ "step": 350
279
+ },
280
+ {
281
+ "epoch": 0.5765765765765766,
282
+ "grad_norm": 1.8232097235612017,
283
+ "learning_rate": 9.744798636305189e-06,
284
+ "loss": 0.2045,
285
+ "step": 360
286
+ },
287
+ {
288
+ "epoch": 0.5925925925925926,
289
+ "grad_norm": 1.880889712421178,
290
+ "learning_rate": 9.714555173665112e-06,
291
+ "loss": 0.1948,
292
+ "step": 370
293
+ },
294
+ {
295
+ "epoch": 0.6086086086086087,
296
+ "grad_norm": 1.8941186642260008,
297
+ "learning_rate": 9.68267095617003e-06,
298
+ "loss": 0.1947,
299
+ "step": 380
300
+ },
301
+ {
302
+ "epoch": 0.6246246246246246,
303
+ "grad_norm": 1.6000201393992286,
304
+ "learning_rate": 9.649157080133962e-06,
305
+ "loss": 0.1923,
306
+ "step": 390
307
+ },
308
+ {
309
+ "epoch": 0.6406406406406406,
310
+ "grad_norm": 3.414988222682504,
311
+ "learning_rate": 9.614025209023084e-06,
312
+ "loss": 0.1945,
313
+ "step": 400
314
+ },
315
+ {
316
+ "epoch": 0.6406406406406406,
317
+ "eval_loss": 0.19540859758853912,
318
+ "eval_runtime": 4.7637,
319
+ "eval_samples_per_second": 12.595,
320
+ "eval_steps_per_second": 6.298,
321
+ "step": 400
322
+ },
323
+ {
324
+ "epoch": 0.6566566566566566,
325
+ "grad_norm": 1.9420429807772246,
326
+ "learning_rate": 9.577287569396632e-06,
327
+ "loss": 0.1894,
328
+ "step": 410
329
+ },
330
+ {
331
+ "epoch": 0.6726726726726727,
332
+ "grad_norm": 1.3642644226107268,
333
+ "learning_rate": 9.538956946651816e-06,
334
+ "loss": 0.1782,
335
+ "step": 420
336
+ },
337
+ {
338
+ "epoch": 0.6886886886886887,
339
+ "grad_norm": 1.6342771748206477,
340
+ "learning_rate": 9.499046680574267e-06,
341
+ "loss": 0.1723,
342
+ "step": 430
343
+ },
344
+ {
345
+ "epoch": 0.7047047047047047,
346
+ "grad_norm": 1.487958949075534,
347
+ "learning_rate": 9.457570660695542e-06,
348
+ "loss": 0.1785,
349
+ "step": 440
350
+ },
351
+ {
352
+ "epoch": 0.7207207207207207,
353
+ "grad_norm": 1.7299656441999525,
354
+ "learning_rate": 9.41454332145928e-06,
355
+ "loss": 0.1767,
356
+ "step": 450
357
+ },
358
+ {
359
+ "epoch": 0.7367367367367368,
360
+ "grad_norm": 1.4998181975998246,
361
+ "learning_rate": 9.369979637197774e-06,
362
+ "loss": 0.1738,
363
+ "step": 460
364
+ },
365
+ {
366
+ "epoch": 0.7527527527527528,
367
+ "grad_norm": 1.5911390545676716,
368
+ "learning_rate": 9.323895116920591e-06,
369
+ "loss": 0.1688,
370
+ "step": 470
371
+ },
372
+ {
373
+ "epoch": 0.7687687687687688,
374
+ "grad_norm": 2.394239823663753,
375
+ "learning_rate": 9.27630579891716e-06,
376
+ "loss": 0.1628,
377
+ "step": 480
378
+ },
379
+ {
380
+ "epoch": 0.7847847847847848,
381
+ "grad_norm": 1.542509136620149,
382
+ "learning_rate": 9.227228245175127e-06,
383
+ "loss": 0.1676,
384
+ "step": 490
385
+ },
386
+ {
387
+ "epoch": 0.8008008008008008,
388
+ "grad_norm": 1.5343832382043492,
389
+ "learning_rate": 9.176679535616477e-06,
390
+ "loss": 0.1592,
391
+ "step": 500
392
+ },
393
+ {
394
+ "epoch": 0.8008008008008008,
395
+ "eval_loss": 0.1710364669561386,
396
+ "eval_runtime": 4.7555,
397
+ "eval_samples_per_second": 12.617,
398
+ "eval_steps_per_second": 6.309,
399
+ "step": 500
400
+ },
401
+ {
402
+ "epoch": 0.8168168168168168,
403
+ "grad_norm": 1.8010583431481462,
404
+ "learning_rate": 9.124677262153405e-06,
405
+ "loss": 0.1586,
406
+ "step": 510
407
+ },
408
+ {
409
+ "epoch": 0.8328328328328328,
410
+ "grad_norm": 1.6686707098067743,
411
+ "learning_rate": 9.071239522565978e-06,
412
+ "loss": 0.1556,
413
+ "step": 520
414
+ },
415
+ {
416
+ "epoch": 0.8488488488488488,
417
+ "grad_norm": 1.7504894762611782,
418
+ "learning_rate": 9.016384914203771e-06,
419
+ "loss": 0.1592,
420
+ "step": 530
421
+ },
422
+ {
423
+ "epoch": 0.8648648648648649,
424
+ "grad_norm": 1.4989615763117103,
425
+ "learning_rate": 8.960132527513642e-06,
426
+ "loss": 0.1616,
427
+ "step": 540
428
+ },
429
+ {
430
+ "epoch": 0.8808808808808809,
431
+ "grad_norm": 1.5542001700716142,
432
+ "learning_rate": 8.902501939395887e-06,
433
+ "loss": 0.155,
434
+ "step": 550
435
+ },
436
+ {
437
+ "epoch": 0.8968968968968969,
438
+ "grad_norm": 2.502100279988363,
439
+ "learning_rate": 8.8435132063911e-06,
440
+ "loss": 0.1514,
441
+ "step": 560
442
+ },
443
+ {
444
+ "epoch": 0.9129129129129129,
445
+ "grad_norm": 1.5418563195158612,
446
+ "learning_rate": 8.783186857700137e-06,
447
+ "loss": 0.1455,
448
+ "step": 570
449
+ },
450
+ {
451
+ "epoch": 0.928928928928929,
452
+ "grad_norm": 1.6683319786348245,
453
+ "learning_rate": 8.721543888039534e-06,
454
+ "loss": 0.1417,
455
+ "step": 580
456
+ },
457
+ {
458
+ "epoch": 0.944944944944945,
459
+ "grad_norm": 1.762643457697612,
460
+ "learning_rate": 8.658605750334972e-06,
461
+ "loss": 0.155,
462
+ "step": 590
463
+ },
464
+ {
465
+ "epoch": 0.960960960960961,
466
+ "grad_norm": 1.9599503664864883,
467
+ "learning_rate": 8.594394348255239e-06,
468
+ "loss": 0.1393,
469
+ "step": 600
470
+ },
471
+ {
472
+ "epoch": 0.960960960960961,
473
+ "eval_loss": 0.1448938250541687,
474
+ "eval_runtime": 4.5683,
475
+ "eval_samples_per_second": 13.134,
476
+ "eval_steps_per_second": 6.567,
477
+ "step": 600
478
+ },
479
+ {
480
+ "epoch": 0.9769769769769769,
481
+ "grad_norm": 1.9697402213513955,
482
+ "learning_rate": 8.528932028589337e-06,
483
+ "loss": 0.142,
484
+ "step": 610
485
+ },
486
+ {
487
+ "epoch": 0.992992992992993,
488
+ "grad_norm": 1.6768033303014018,
489
+ "learning_rate": 8.462241573469378e-06,
490
+ "loss": 0.1426,
491
+ "step": 620
492
+ },
493
+ {
494
+ "epoch": 1.0092092092092093,
495
+ "grad_norm": 1.5267241545230772,
496
+ "learning_rate": 8.394346192441967e-06,
497
+ "loss": 0.1415,
498
+ "step": 630
499
+ },
500
+ {
501
+ "epoch": 1.0252252252252252,
502
+ "grad_norm": 1.476716477059175,
503
+ "learning_rate": 8.325269514390835e-06,
504
+ "loss": 0.1176,
505
+ "step": 640
506
+ },
507
+ {
508
+ "epoch": 1.0412412412412413,
509
+ "grad_norm": 1.4540927119806955,
510
+ "learning_rate": 8.255035579313545e-06,
511
+ "loss": 0.1223,
512
+ "step": 650
513
+ },
514
+ {
515
+ "epoch": 1.0572572572572572,
516
+ "grad_norm": 1.5802819169958848,
517
+ "learning_rate": 8.183668829955111e-06,
518
+ "loss": 0.1117,
519
+ "step": 660
520
+ },
521
+ {
522
+ "epoch": 1.0732732732732733,
523
+ "grad_norm": 2.1509571510914203,
524
+ "learning_rate": 8.111194103301461e-06,
525
+ "loss": 0.1176,
526
+ "step": 670
527
+ },
528
+ {
529
+ "epoch": 1.0892892892892894,
530
+ "grad_norm": 1.4577026982979966,
531
+ "learning_rate": 8.037636621935686e-06,
532
+ "loss": 0.1193,
533
+ "step": 680
534
+ },
535
+ {
536
+ "epoch": 1.1053053053053052,
537
+ "grad_norm": 1.5430361342124643,
538
+ "learning_rate": 7.96302198526011e-06,
539
+ "loss": 0.1182,
540
+ "step": 690
541
+ },
542
+ {
543
+ "epoch": 1.1213213213213213,
544
+ "grad_norm": 1.5919162208902233,
545
+ "learning_rate": 7.887376160587214e-06,
546
+ "loss": 0.1138,
547
+ "step": 700
548
+ },
549
+ {
550
+ "epoch": 1.1213213213213213,
551
+ "eval_loss": 0.15331269800662994,
552
+ "eval_runtime": 4.5367,
553
+ "eval_samples_per_second": 13.225,
554
+ "eval_steps_per_second": 6.613,
555
+ "step": 700
556
+ },
557
+ {
558
+ "epoch": 1.1373373373373372,
559
+ "grad_norm": 1.2773733225900368,
560
+ "learning_rate": 7.810725474102504e-06,
561
+ "loss": 0.1116,
562
+ "step": 710
563
+ },
564
+ {
565
+ "epoch": 1.1533533533533533,
566
+ "grad_norm": 1.3037345062586103,
567
+ "learning_rate": 7.733096601702508e-06,
568
+ "loss": 0.1123,
569
+ "step": 720
570
+ },
571
+ {
572
+ "epoch": 1.1693693693693694,
573
+ "grad_norm": 1.5584266021091908,
574
+ "learning_rate": 7.654516559711053e-06,
575
+ "loss": 0.1142,
576
+ "step": 730
577
+ },
578
+ {
579
+ "epoch": 1.1853853853853853,
580
+ "grad_norm": 1.5547636549777273,
581
+ "learning_rate": 7.575012695477076e-06,
582
+ "loss": 0.1095,
583
+ "step": 740
584
+ },
585
+ {
586
+ "epoch": 1.2014014014014014,
587
+ "grad_norm": 1.473482889988994,
588
+ "learning_rate": 7.494612677857218e-06,
589
+ "loss": 0.1091,
590
+ "step": 750
591
+ },
592
+ {
593
+ "epoch": 1.2174174174174175,
594
+ "grad_norm": 1.3570410291065769,
595
+ "learning_rate": 7.413344487586542e-06,
596
+ "loss": 0.1099,
597
+ "step": 760
598
+ },
599
+ {
600
+ "epoch": 1.2334334334334334,
601
+ "grad_norm": 1.5362278958719333,
602
+ "learning_rate": 7.331236407540704e-06,
603
+ "loss": 0.1137,
604
+ "step": 770
605
+ },
606
+ {
607
+ "epoch": 1.2494494494494495,
608
+ "grad_norm": 1.9191542465617197,
609
+ "learning_rate": 7.248317012892969e-06,
610
+ "loss": 0.1128,
611
+ "step": 780
612
+ },
613
+ {
614
+ "epoch": 1.2654654654654656,
615
+ "grad_norm": 1.3514066304735823,
616
+ "learning_rate": 7.164615161169518e-06,
617
+ "loss": 0.1087,
618
+ "step": 790
619
+ },
620
+ {
621
+ "epoch": 1.2814814814814814,
622
+ "grad_norm": 1.6846669149108675,
623
+ "learning_rate": 7.080159982206471e-06,
624
+ "loss": 0.1096,
625
+ "step": 800
626
+ },
627
+ {
628
+ "epoch": 1.2814814814814814,
629
+ "eval_loss": 0.140634223818779,
630
+ "eval_runtime": 4.5862,
631
+ "eval_samples_per_second": 13.083,
632
+ "eval_steps_per_second": 6.541,
633
+ "step": 800
634
+ },
635
+ {
636
+ "epoch": 1.2974974974974975,
637
+ "grad_norm": 1.1919650111365638,
638
+ "learning_rate": 6.994980868012151e-06,
639
+ "loss": 0.1109,
640
+ "step": 810
641
+ },
642
+ {
643
+ "epoch": 1.3135135135135134,
644
+ "grad_norm": 1.6072997465464531,
645
+ "learning_rate": 6.909107462538113e-06,
646
+ "loss": 0.1104,
647
+ "step": 820
648
+ },
649
+ {
650
+ "epoch": 1.3295295295295295,
651
+ "grad_norm": 1.5714907447670126,
652
+ "learning_rate": 6.822569651362475e-06,
653
+ "loss": 0.1091,
654
+ "step": 830
655
+ },
656
+ {
657
+ "epoch": 1.3455455455455456,
658
+ "grad_norm": 1.3715345334587152,
659
+ "learning_rate": 6.735397551289179e-06,
660
+ "loss": 0.1072,
661
+ "step": 840
662
+ },
663
+ {
664
+ "epoch": 1.3615615615615615,
665
+ "grad_norm": 1.5656149539968518,
666
+ "learning_rate": 6.647621499866762e-06,
667
+ "loss": 0.1065,
668
+ "step": 850
669
+ },
670
+ {
671
+ "epoch": 1.3775775775775776,
672
+ "grad_norm": 1.747219292587951,
673
+ "learning_rate": 6.5592720448303174e-06,
674
+ "loss": 0.1049,
675
+ "step": 860
676
+ },
677
+ {
678
+ "epoch": 1.3935935935935935,
679
+ "grad_norm": 1.4629586813910707,
680
+ "learning_rate": 6.470379933470296e-06,
681
+ "loss": 0.1018,
682
+ "step": 870
683
+ },
684
+ {
685
+ "epoch": 1.4096096096096096,
686
+ "grad_norm": 2.030872539804225,
687
+ "learning_rate": 6.380976101931879e-06,
688
+ "loss": 0.1015,
689
+ "step": 880
690
+ },
691
+ {
692
+ "epoch": 1.4256256256256257,
693
+ "grad_norm": 1.229739648882705,
694
+ "learning_rate": 6.291091664448589e-06,
695
+ "loss": 0.1076,
696
+ "step": 890
697
+ },
698
+ {
699
+ "epoch": 1.4416416416416418,
700
+ "grad_norm": 1.584198326139697,
701
+ "learning_rate": 6.200757902513962e-06,
702
+ "loss": 0.1028,
703
+ "step": 900
704
+ },
705
+ {
706
+ "epoch": 1.4416416416416418,
707
+ "eval_loss": 0.12726753950119019,
708
+ "eval_runtime": 4.6398,
709
+ "eval_samples_per_second": 12.932,
710
+ "eval_steps_per_second": 6.466,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 1.4576576576576576,
715
+ "grad_norm": 1.053882152208791,
716
+ "learning_rate": 6.11000625399499e-06,
717
+ "loss": 0.1058,
718
+ "step": 910
719
+ },
720
+ {
721
+ "epoch": 1.4736736736736737,
722
+ "grad_norm": 1.5483751760051925,
723
+ "learning_rate": 6.0188683021911394e-06,
724
+ "loss": 0.1008,
725
+ "step": 920
726
+ },
727
+ {
728
+ "epoch": 1.4896896896896896,
729
+ "grad_norm": 1.5020771306479124,
730
+ "learning_rate": 5.927375764842766e-06,
731
+ "loss": 0.0986,
732
+ "step": 930
733
+ },
734
+ {
735
+ "epoch": 1.5057057057057057,
736
+ "grad_norm": 1.2818645291771074,
737
+ "learning_rate": 5.835560483092743e-06,
738
+ "loss": 0.1045,
739
+ "step": 940
740
+ },
741
+ {
742
+ "epoch": 1.5217217217217218,
743
+ "grad_norm": 1.5262879682098887,
744
+ "learning_rate": 5.743454410405126e-06,
745
+ "loss": 0.1008,
746
+ "step": 950
747
+ },
748
+ {
749
+ "epoch": 1.5377377377377377,
750
+ "grad_norm": 1.9445218478299358,
751
+ "learning_rate": 5.651089601444752e-06,
752
+ "loss": 0.0975,
753
+ "step": 960
754
+ },
755
+ {
756
+ "epoch": 1.5537537537537538,
757
+ "grad_norm": 1.153429430002744,
758
+ "learning_rate": 5.558498200921597e-06,
759
+ "loss": 0.103,
760
+ "step": 970
761
+ },
762
+ {
763
+ "epoch": 1.5697697697697697,
764
+ "grad_norm": 1.7142266460197206,
765
+ "learning_rate": 5.465712432403812e-06,
766
+ "loss": 0.1009,
767
+ "step": 980
768
+ },
769
+ {
770
+ "epoch": 1.5857857857857858,
771
+ "grad_norm": 1.0420472106372876,
772
+ "learning_rate": 5.372764587103309e-06,
773
+ "loss": 0.1026,
774
+ "step": 990
775
+ },
776
+ {
777
+ "epoch": 1.6018018018018019,
778
+ "grad_norm": 1.0933198718882033,
779
+ "learning_rate": 5.279687012637798e-06,
780
+ "loss": 0.0998,
781
+ "step": 1000
782
+ },
783
+ {
784
+ "epoch": 1.6018018018018019,
785
+ "eval_loss": 0.12056411057710648,
786
+ "eval_runtime": 4.6363,
787
+ "eval_samples_per_second": 12.941,
788
+ "eval_steps_per_second": 6.471,
789
+ "step": 1000
790
+ },
791
+ {
792
+ "epoch": 1.617817817817818,
793
+ "grad_norm": 1.5249725941966104,
794
+ "learning_rate": 5.186512101773206e-06,
795
+ "loss": 0.0987,
796
+ "step": 1010
797
+ },
798
+ {
799
+ "epoch": 1.6338338338338338,
800
+ "grad_norm": 1.1551410324839504,
801
+ "learning_rate": 5.093272281150383e-06,
802
+ "loss": 0.0967,
803
+ "step": 1020
804
+ },
805
+ {
806
+ "epoch": 1.6498498498498497,
807
+ "grad_norm": 1.3764150156451525,
808
+ "learning_rate": 5e-06,
809
+ "loss": 0.102,
810
+ "step": 1030
811
+ },
812
+ {
813
+ "epoch": 1.6658658658658658,
814
+ "grad_norm": 1.4315332194265298,
815
+ "learning_rate": 4.906727718849619e-06,
816
+ "loss": 0.0918,
817
+ "step": 1040
818
+ },
819
+ {
820
+ "epoch": 1.681881881881882,
821
+ "grad_norm": 1.5641425623387613,
822
+ "learning_rate": 4.813487898226794e-06,
823
+ "loss": 0.0972,
824
+ "step": 1050
825
+ },
826
+ {
827
+ "epoch": 1.697897897897898,
828
+ "grad_norm": 1.364206236390582,
829
+ "learning_rate": 4.720312987362204e-06,
830
+ "loss": 0.0963,
831
+ "step": 1060
832
+ },
833
+ {
834
+ "epoch": 1.713913913913914,
835
+ "grad_norm": 1.2084858559030025,
836
+ "learning_rate": 4.6272354128966924e-06,
837
+ "loss": 0.0947,
838
+ "step": 1070
839
+ },
840
+ {
841
+ "epoch": 1.7299299299299298,
842
+ "grad_norm": 1.369095339213906,
843
+ "learning_rate": 4.534287567596189e-06,
844
+ "loss": 0.1,
845
+ "step": 1080
846
+ },
847
+ {
848
+ "epoch": 1.7459459459459459,
849
+ "grad_norm": 1.7200163939268696,
850
+ "learning_rate": 4.441501799078405e-06,
851
+ "loss": 0.0939,
852
+ "step": 1090
853
+ },
854
+ {
855
+ "epoch": 1.761961961961962,
856
+ "grad_norm": 1.0601289469307587,
857
+ "learning_rate": 4.348910398555249e-06,
858
+ "loss": 0.0952,
859
+ "step": 1100
860
+ },
861
+ {
862
+ "epoch": 1.761961961961962,
863
+ "eval_loss": 0.12079311162233353,
864
+ "eval_runtime": 4.5849,
865
+ "eval_samples_per_second": 13.087,
866
+ "eval_steps_per_second": 6.543,
867
+ "step": 1100
868
+ },
869
+ {
870
+ "epoch": 1.777977977977978,
871
+ "grad_norm": 1.3096671495162804,
872
+ "learning_rate": 4.2565455895948745e-06,
873
+ "loss": 0.095,
874
+ "step": 1110
875
+ },
876
+ {
877
+ "epoch": 1.793993993993994,
878
+ "grad_norm": 1.6474183248144982,
879
+ "learning_rate": 4.164439516907258e-06,
880
+ "loss": 0.096,
881
+ "step": 1120
882
+ },
883
+ {
884
+ "epoch": 1.81001001001001,
885
+ "grad_norm": 1.2638496884017785,
886
+ "learning_rate": 4.072624235157234e-06,
887
+ "loss": 0.0932,
888
+ "step": 1130
889
+ },
890
+ {
891
+ "epoch": 1.826026026026026,
892
+ "grad_norm": 1.1666611117167804,
893
+ "learning_rate": 3.981131697808862e-06,
894
+ "loss": 0.0961,
895
+ "step": 1140
896
+ },
897
+ {
898
+ "epoch": 1.842042042042042,
899
+ "grad_norm": 1.4101778997279006,
900
+ "learning_rate": 3.889993746005011e-06,
901
+ "loss": 0.0938,
902
+ "step": 1150
903
+ },
904
+ {
905
+ "epoch": 1.8580580580580581,
906
+ "grad_norm": 1.4720670252418966,
907
+ "learning_rate": 3.799242097486038e-06,
908
+ "loss": 0.0958,
909
+ "step": 1160
910
+ },
911
+ {
912
+ "epoch": 1.8740740740740742,
913
+ "grad_norm": 1.9229886608107283,
914
+ "learning_rate": 3.708908335551412e-06,
915
+ "loss": 0.0972,
916
+ "step": 1170
917
+ },
918
+ {
919
+ "epoch": 1.89009009009009,
920
+ "grad_norm": 1.5097045011947579,
921
+ "learning_rate": 3.6190238980681235e-06,
922
+ "loss": 0.0931,
923
+ "step": 1180
924
+ },
925
+ {
926
+ "epoch": 1.906106106106106,
927
+ "grad_norm": 1.3032785079127815,
928
+ "learning_rate": 3.529620066529704e-06,
929
+ "loss": 0.0925,
930
+ "step": 1190
931
+ },
932
+ {
933
+ "epoch": 1.922122122122122,
934
+ "grad_norm": 1.158235321697393,
935
+ "learning_rate": 3.4407279551696846e-06,
936
+ "loss": 0.092,
937
+ "step": 1200
938
+ },
939
+ {
940
+ "epoch": 1.922122122122122,
941
+ "eval_loss": 0.12118110805749893,
942
+ "eval_runtime": 4.5355,
943
+ "eval_samples_per_second": 13.229,
944
+ "eval_steps_per_second": 6.614,
945
+ "step": 1200
946
+ },
947
+ {
948
+ "epoch": 1.9381381381381382,
949
+ "grad_norm": 1.3273343135981375,
950
+ "learning_rate": 3.352378500133239e-06,
951
+ "loss": 0.0912,
952
+ "step": 1210
953
+ },
954
+ {
955
+ "epoch": 1.9541541541541543,
956
+ "grad_norm": 1.2638328817270799,
957
+ "learning_rate": 3.264602448710822e-06,
958
+ "loss": 0.092,
959
+ "step": 1220
960
+ },
961
+ {
962
+ "epoch": 1.9701701701701702,
963
+ "grad_norm": 1.0061664501466365,
964
+ "learning_rate": 3.177430348637527e-06,
965
+ "loss": 0.0913,
966
+ "step": 1230
967
+ },
968
+ {
969
+ "epoch": 1.986186186186186,
970
+ "grad_norm": 1.3814485492365947,
971
+ "learning_rate": 3.090892537461889e-06,
972
+ "loss": 0.0874,
973
+ "step": 1240
974
+ },
975
+ {
976
+ "epoch": 2.0024024024024025,
977
+ "grad_norm": 1.4890736474425244,
978
+ "learning_rate": 3.00501913198785e-06,
979
+ "loss": 0.0869,
980
+ "step": 1250
981
+ },
982
+ {
983
+ "epoch": 2.0184184184184186,
984
+ "grad_norm": 1.143647977026878,
985
+ "learning_rate": 2.9198400177935303e-06,
986
+ "loss": 0.0629,
987
+ "step": 1260
988
+ },
989
+ {
990
+ "epoch": 2.0344344344344343,
991
+ "grad_norm": 1.0231219995939023,
992
+ "learning_rate": 2.835384838830481e-06,
993
+ "loss": 0.0638,
994
+ "step": 1270
995
+ },
996
+ {
997
+ "epoch": 2.0504504504504504,
998
+ "grad_norm": 1.09838986733996,
999
+ "learning_rate": 2.7516829871070295e-06,
1000
+ "loss": 0.0588,
1001
+ "step": 1280
1002
+ },
1003
+ {
1004
+ "epoch": 2.0664664664664665,
1005
+ "grad_norm": 1.1570777805297492,
1006
+ "learning_rate": 2.668763592459297e-06,
1007
+ "loss": 0.0595,
1008
+ "step": 1290
1009
+ },
1010
+ {
1011
+ "epoch": 2.0824824824824826,
1012
+ "grad_norm": 0.9848777368332091,
1013
+ "learning_rate": 2.586655512413458e-06,
1014
+ "loss": 0.061,
1015
+ "step": 1300
1016
+ },
1017
+ {
1018
+ "epoch": 2.0824824824824826,
1019
+ "eval_loss": 0.1126304417848587,
1020
+ "eval_runtime": 4.4139,
1021
+ "eval_samples_per_second": 13.594,
1022
+ "eval_steps_per_second": 6.797,
1023
+ "step": 1300
1024
+ },
1025
+ {
1026
+ "epoch": 2.0984984984984987,
1027
+ "grad_norm": 1.0676656917105798,
1028
+ "learning_rate": 2.505387322142782e-06,
1029
+ "loss": 0.0612,
1030
+ "step": 1310
1031
+ },
1032
+ {
1033
+ "epoch": 2.1145145145145143,
1034
+ "grad_norm": 1.05569010042999,
1035
+ "learning_rate": 2.4249873045229244e-06,
1036
+ "loss": 0.0607,
1037
+ "step": 1320
1038
+ },
1039
+ {
1040
+ "epoch": 2.1305305305305304,
1041
+ "grad_norm": 0.9996457001160142,
1042
+ "learning_rate": 2.345483440288947e-06,
1043
+ "loss": 0.06,
1044
+ "step": 1330
1045
+ },
1046
+ {
1047
+ "epoch": 2.1465465465465465,
1048
+ "grad_norm": 1.07628545223274,
1049
+ "learning_rate": 2.2669033982974946e-06,
1050
+ "loss": 0.0588,
1051
+ "step": 1340
1052
+ },
1053
+ {
1054
+ "epoch": 2.1625625625625626,
1055
+ "grad_norm": 1.07905549545333,
1056
+ "learning_rate": 2.189274525897498e-06,
1057
+ "loss": 0.0636,
1058
+ "step": 1350
1059
+ },
1060
+ {
1061
+ "epoch": 2.1785785785785787,
1062
+ "grad_norm": 1.2549844294011818,
1063
+ "learning_rate": 2.1126238394127868e-06,
1064
+ "loss": 0.0607,
1065
+ "step": 1360
1066
+ },
1067
+ {
1068
+ "epoch": 2.1945945945945944,
1069
+ "grad_norm": 1.0059915885008073,
1070
+ "learning_rate": 2.03697801473989e-06,
1071
+ "loss": 0.062,
1072
+ "step": 1370
1073
+ },
1074
+ {
1075
+ "epoch": 2.2106106106106105,
1076
+ "grad_norm": 1.274382465169598,
1077
+ "learning_rate": 1.962363378064316e-06,
1078
+ "loss": 0.0633,
1079
+ "step": 1380
1080
+ },
1081
+ {
1082
+ "epoch": 2.2266266266266266,
1083
+ "grad_norm": 1.281532287957869,
1084
+ "learning_rate": 1.8888058966985407e-06,
1085
+ "loss": 0.0633,
1086
+ "step": 1390
1087
+ },
1088
+ {
1089
+ "epoch": 2.2426426426426427,
1090
+ "grad_norm": 1.1922439699801886,
1091
+ "learning_rate": 1.8163311700448899e-06,
1092
+ "loss": 0.0624,
1093
+ "step": 1400
1094
+ },
1095
+ {
1096
+ "epoch": 2.2426426426426427,
1097
+ "eval_loss": 0.1149037629365921,
1098
+ "eval_runtime": 4.4069,
1099
+ "eval_samples_per_second": 13.615,
1100
+ "eval_steps_per_second": 6.808,
1101
+ "step": 1400
1102
+ },
1103
+ {
1104
+ "epoch": 2.258658658658659,
1105
+ "grad_norm": 1.7050676523518535,
1106
+ "learning_rate": 1.7449644206864564e-06,
1107
+ "loss": 0.0623,
1108
+ "step": 1410
1109
+ },
1110
+ {
1111
+ "epoch": 2.2746746746746744,
1112
+ "grad_norm": 0.9435102102997306,
1113
+ "learning_rate": 1.6747304856091662e-06,
1114
+ "loss": 0.059,
1115
+ "step": 1420
1116
+ },
1117
+ {
1118
+ "epoch": 2.2906906906906905,
1119
+ "grad_norm": 1.0831267008587788,
1120
+ "learning_rate": 1.6056538075580342e-06,
1121
+ "loss": 0.0627,
1122
+ "step": 1430
1123
+ },
1124
+ {
1125
+ "epoch": 2.3067067067067066,
1126
+ "grad_norm": 1.08268476394283,
1127
+ "learning_rate": 1.5377584265306222e-06,
1128
+ "loss": 0.0593,
1129
+ "step": 1440
1130
+ },
1131
+ {
1132
+ "epoch": 2.3227227227227227,
1133
+ "grad_norm": 0.9770773292942597,
1134
+ "learning_rate": 1.4710679714106635e-06,
1135
+ "loss": 0.0583,
1136
+ "step": 1450
1137
+ },
1138
+ {
1139
+ "epoch": 2.338738738738739,
1140
+ "grad_norm": 1.3751312648199294,
1141
+ "learning_rate": 1.4056056517447637e-06,
1142
+ "loss": 0.0626,
1143
+ "step": 1460
1144
+ },
1145
+ {
1146
+ "epoch": 2.354754754754755,
1147
+ "grad_norm": 1.071282141200762,
1148
+ "learning_rate": 1.3413942496650301e-06,
1149
+ "loss": 0.0598,
1150
+ "step": 1470
1151
+ },
1152
+ {
1153
+ "epoch": 2.3707707707707706,
1154
+ "grad_norm": 0.9441682147431711,
1155
+ "learning_rate": 1.2784561119604683e-06,
1156
+ "loss": 0.0609,
1157
+ "step": 1480
1158
+ },
1159
+ {
1160
+ "epoch": 2.3867867867867867,
1161
+ "grad_norm": 1.2531258809500883,
1162
+ "learning_rate": 1.2168131422998653e-06,
1163
+ "loss": 0.0618,
1164
+ "step": 1490
1165
+ },
1166
+ {
1167
+ "epoch": 2.402802802802803,
1168
+ "grad_norm": 0.988649947120785,
1169
+ "learning_rate": 1.156486793608899e-06,
1170
+ "loss": 0.0607,
1171
+ "step": 1500
1172
+ },
1173
+ {
1174
+ "epoch": 2.402802802802803,
1175
+ "eval_loss": 0.10994044691324234,
1176
+ "eval_runtime": 4.4237,
1177
+ "eval_samples_per_second": 13.563,
1178
+ "eval_steps_per_second": 6.782,
1179
+ "step": 1500
1180
+ },
1181
+ {
1182
+ "epoch": 2.418818818818819,
1183
+ "grad_norm": 1.1998096160223135,
1184
+ "learning_rate": 1.0974980606041152e-06,
1185
+ "loss": 0.0594,
1186
+ "step": 1510
1187
+ },
1188
+ {
1189
+ "epoch": 2.434834834834835,
1190
+ "grad_norm": 0.9677720356738567,
1191
+ "learning_rate": 1.0398674724863584e-06,
1192
+ "loss": 0.059,
1193
+ "step": 1520
1194
+ },
1195
+ {
1196
+ "epoch": 2.450850850850851,
1197
+ "grad_norm": 1.3141425510127478,
1198
+ "learning_rate": 9.836150857962296e-07,
1199
+ "loss": 0.0596,
1200
+ "step": 1530
1201
+ },
1202
+ {
1203
+ "epoch": 2.4668668668668667,
1204
+ "grad_norm": 0.8985622286196838,
1205
+ "learning_rate": 9.287604774340236e-07,
1206
+ "loss": 0.0581,
1207
+ "step": 1540
1208
+ },
1209
+ {
1210
+ "epoch": 2.482882882882883,
1211
+ "grad_norm": 1.2915272829435755,
1212
+ "learning_rate": 8.753227378465956e-07,
1213
+ "loss": 0.0589,
1214
+ "step": 1550
1215
+ },
1216
+ {
1217
+ "epoch": 2.498898898898899,
1218
+ "grad_norm": 1.051170086009324,
1219
+ "learning_rate": 8.233204643835235e-07,
1220
+ "loss": 0.0585,
1221
+ "step": 1560
1222
+ },
1223
+ {
1224
+ "epoch": 2.514914914914915,
1225
+ "grad_norm": 1.1697265988828673,
1226
+ "learning_rate": 7.72771754824877e-07,
1227
+ "loss": 0.0627,
1228
+ "step": 1570
1229
+ },
1230
+ {
1231
+ "epoch": 2.530930930930931,
1232
+ "grad_norm": 1.045582836173671,
1233
+ "learning_rate": 7.23694201082843e-07,
1234
+ "loss": 0.0594,
1235
+ "step": 1580
1236
+ },
1237
+ {
1238
+ "epoch": 2.546946946946947,
1239
+ "grad_norm": 1.0460247904446258,
1240
+ "learning_rate": 6.761048830794098e-07,
1241
+ "loss": 0.0583,
1242
+ "step": 1590
1243
+ },
1244
+ {
1245
+ "epoch": 2.562962962962963,
1246
+ "grad_norm": 1.0766438599871688,
1247
+ "learning_rate": 6.300203628022272e-07,
1248
+ "loss": 0.0584,
1249
+ "step": 1600
1250
+ },
1251
+ {
1252
+ "epoch": 2.562962962962963,
1253
+ "eval_loss": 0.1075584813952446,
1254
+ "eval_runtime": 4.7311,
1255
+ "eval_samples_per_second": 12.682,
1256
+ "eval_steps_per_second": 6.341,
1257
+ "step": 1600
1258
+ },
1259
+ {
1260
+ "epoch": 2.578978978978979,
1261
+ "grad_norm": 1.0644137083005472,
1262
+ "learning_rate": 5.854566785407212e-07,
1263
+ "loss": 0.0615,
1264
+ "step": 1610
1265
+ },
1266
+ {
1267
+ "epoch": 2.594994994994995,
1268
+ "grad_norm": 0.8862800692246755,
1269
+ "learning_rate": 5.42429339304461e-07,
1270
+ "loss": 0.0589,
1271
+ "step": 1620
1272
+ },
1273
+ {
1274
+ "epoch": 2.611011011011011,
1275
+ "grad_norm": 1.0632023267290576,
1276
+ "learning_rate": 5.009533194257332e-07,
1277
+ "loss": 0.0576,
1278
+ "step": 1630
1279
+ },
1280
+ {
1281
+ "epoch": 2.627027027027027,
1282
+ "grad_norm": 1.0612601719189565,
1283
+ "learning_rate": 4.6104305334818577e-07,
1284
+ "loss": 0.0581,
1285
+ "step": 1640
1286
+ },
1287
+ {
1288
+ "epoch": 2.643043043043043,
1289
+ "grad_norm": 1.1363089897778307,
1290
+ "learning_rate": 4.2271243060336976e-07,
1291
+ "loss": 0.0565,
1292
+ "step": 1650
1293
+ },
1294
+ {
1295
+ "epoch": 2.659059059059059,
1296
+ "grad_norm": 1.0581328451419771,
1297
+ "learning_rate": 3.8597479097691626e-07,
1298
+ "loss": 0.06,
1299
+ "step": 1660
1300
+ },
1301
+ {
1302
+ "epoch": 2.675075075075075,
1303
+ "grad_norm": 1.0292294546952114,
1304
+ "learning_rate": 3.508429198660379e-07,
1305
+ "loss": 0.0592,
1306
+ "step": 1670
1307
+ },
1308
+ {
1309
+ "epoch": 2.6910910910910912,
1310
+ "grad_norm": 1.1041696576832245,
1311
+ "learning_rate": 3.1732904382996975e-07,
1312
+ "loss": 0.0531,
1313
+ "step": 1680
1314
+ },
1315
+ {
1316
+ "epoch": 2.707107107107107,
1317
+ "grad_norm": 1.1017298827099373,
1318
+ "learning_rate": 2.854448263348891e-07,
1319
+ "loss": 0.0592,
1320
+ "step": 1690
1321
+ },
1322
+ {
1323
+ "epoch": 2.723123123123123,
1324
+ "grad_norm": 1.2373658193384414,
1325
+ "learning_rate": 2.5520136369481194e-07,
1326
+ "loss": 0.0574,
1327
+ "step": 1700
1328
+ },
1329
+ {
1330
+ "epoch": 2.723123123123123,
1331
+ "eval_loss": 0.10619153082370758,
1332
+ "eval_runtime": 4.4379,
1333
+ "eval_samples_per_second": 13.52,
1334
+ "eval_steps_per_second": 6.76,
1335
+ "step": 1700
1336
+ },
1337
+ {
1338
+ "epoch": 2.739139139139139,
1339
+ "grad_norm": 1.0150510884990294,
1340
+ "learning_rate": 2.266091812098642e-07,
1341
+ "loss": 0.0581,
1342
+ "step": 1710
1343
+ },
1344
+ {
1345
+ "epoch": 2.755155155155155,
1346
+ "grad_norm": 1.4492779833818328,
1347
+ "learning_rate": 1.9967822950327453e-07,
1348
+ "loss": 0.0609,
1349
+ "step": 1720
1350
+ },
1351
+ {
1352
+ "epoch": 2.7711711711711713,
1353
+ "grad_norm": 1.104287468688097,
1354
+ "learning_rate": 1.7441788105837133e-07,
1355
+ "loss": 0.0593,
1356
+ "step": 1730
1357
+ },
1358
+ {
1359
+ "epoch": 2.787187187187187,
1360
+ "grad_norm": 1.0784218287885254,
1361
+ "learning_rate": 1.508369269567783e-07,
1362
+ "loss": 0.0606,
1363
+ "step": 1740
1364
+ },
1365
+ {
1366
+ "epoch": 2.803203203203203,
1367
+ "grad_norm": 1.2207885496699904,
1368
+ "learning_rate": 1.2894357381894984e-07,
1369
+ "loss": 0.0564,
1370
+ "step": 1750
1371
+ },
1372
+ {
1373
+ "epoch": 2.819219219219219,
1374
+ "grad_norm": 0.8638978123535029,
1375
+ "learning_rate": 1.0874544094811424e-07,
1376
+ "loss": 0.0586,
1377
+ "step": 1760
1378
+ },
1379
+ {
1380
+ "epoch": 2.8352352352352352,
1381
+ "grad_norm": 1.1498833578746668,
1382
+ "learning_rate": 9.024955767861054e-08,
1383
+ "loss": 0.0629,
1384
+ "step": 1770
1385
+ },
1386
+ {
1387
+ "epoch": 2.8512512512512513,
1388
+ "grad_norm": 0.9945976550163285,
1389
+ "learning_rate": 7.346236092954318e-08,
1390
+ "loss": 0.0562,
1391
+ "step": 1780
1392
+ },
1393
+ {
1394
+ "epoch": 2.867267267267267,
1395
+ "grad_norm": 0.9949658942902132,
1396
+ "learning_rate": 5.838969296461605e-08,
1397
+ "loss": 0.0583,
1398
+ "step": 1790
1399
+ },
1400
+ {
1401
+ "epoch": 2.8832832832832835,
1402
+ "grad_norm": 1.0428607480343814,
1403
+ "learning_rate": 4.50367993589107e-08,
1404
+ "loss": 0.0571,
1405
+ "step": 1800
1406
+ },
1407
+ {
1408
+ "epoch": 2.8832832832832835,
1409
+ "eval_loss": 0.10513312369585037,
1410
+ "eval_runtime": 4.4044,
1411
+ "eval_samples_per_second": 13.623,
1412
+ "eval_steps_per_second": 6.811,
1413
+ "step": 1800
1414
+ },
1415
+ {
1416
+ "epoch": 2.899299299299299,
1417
+ "grad_norm": 1.0170732585763345,
1418
+ "learning_rate": 3.340832717332765e-08,
1419
+ "loss": 0.0605,
1420
+ "step": 1810
1421
+ },
1422
+ {
1423
+ "epoch": 2.9153153153153153,
1424
+ "grad_norm": 0.9518652403398978,
1425
+ "learning_rate": 2.3508323337321225e-08,
1426
+ "loss": 0.0567,
1427
+ "step": 1820
1428
+ },
1429
+ {
1430
+ "epoch": 2.9313313313313314,
1431
+ "grad_norm": 1.0033269558751705,
1432
+ "learning_rate": 1.534023324049061e-08,
1433
+ "loss": 0.0564,
1434
+ "step": 1830
1435
+ },
1436
+ {
1437
+ "epoch": 2.9473473473473475,
1438
+ "grad_norm": 1.1693211443264353,
1439
+ "learning_rate": 8.906899533517866e-09,
1440
+ "loss": 0.0589,
1441
+ "step": 1840
1442
+ },
1443
+ {
1444
+ "epoch": 2.9633633633633636,
1445
+ "grad_norm": 1.2335037547499492,
1446
+ "learning_rate": 4.210561138873193e-09,
1447
+ "loss": 0.0576,
1448
+ "step": 1850
1449
+ },
1450
+ {
1451
+ "epoch": 2.9793793793793792,
1452
+ "grad_norm": 1.1025808173861562,
1453
+ "learning_rate": 1.2528524716259872e-09,
1454
+ "loss": 0.0548,
1455
+ "step": 1860
1456
+ },
1457
+ {
1458
+ "epoch": 2.9953953953953953,
1459
+ "grad_norm": 0.9062323167927434,
1460
+ "learning_rate": 3.480287063706289e-11,
1461
+ "loss": 0.0587,
1462
+ "step": 1870
1463
+ },
1464
+ {
1465
+ "epoch": 2.9985985985985986,
1466
+ "step": 1872,
1467
+ "total_flos": 222868335149056.0,
1468
+ "train_loss": 0.15247767985376537,
1469
+ "train_runtime": 21883.2415,
1470
+ "train_samples_per_second": 8.217,
1471
+ "train_steps_per_second": 0.086
1472
+ }
1473
+ ],
1474
+ "logging_steps": 10,
1475
+ "max_steps": 1872,
1476
+ "num_input_tokens_seen": 0,
1477
+ "num_train_epochs": 3,
1478
+ "save_steps": 100,
1479
+ "stateful_callbacks": {
1480
+ "TrainerControl": {
1481
+ "args": {
1482
+ "should_epoch_stop": false,
1483
+ "should_evaluate": false,
1484
+ "should_log": false,
1485
+ "should_save": true,
1486
+ "should_training_stop": true
1487
+ },
1488
+ "attributes": {}
1489
+ }
1490
+ },
1491
+ "total_flos": 222868335149056.0,
1492
+ "train_batch_size": 3,
1493
+ "trial_name": null,
1494
+ "trial_params": null
1495
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c02617fa7c810825deaede1c261589fc35c2f77f435ecc07a1268e9665ef61c5
3
+ size 7224
training_eval_loss.png ADDED
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff