EIFY committed
Commit 31f6626 · verified · 1 Parent(s): a58b868

Grafted experiment: PyTorch model trained with tfds imagenet2012 data

Files changed (45)
  1. .gitattributes +1 -0
  2. grafted/big_vision_metrics.txt +0 -0
  3. grafted/checkpoint.pth.tar +3 -0
  4. grafted/config.json +1 -0
  5. grafted/wandb/run-20241127_213238-torch-grafted-redux/files/config.yaml +95 -0
  6. grafted/wandb/run-20241127_213238-torch-grafted-redux/files/output.log +31 -0
  7. grafted/wandb/run-20241127_213238-torch-grafted-redux/files/requirements.txt +133 -0
  8. grafted/wandb/run-20241127_213238-torch-grafted-redux/files/wandb-metadata.json +49 -0
  9. grafted/wandb/run-20241127_213238-torch-grafted-redux/files/wandb-summary.json +1 -0
  10. grafted/wandb/run-20241127_213238-torch-grafted-redux/logs/debug-internal.log +18 -0
  11. grafted/wandb/run-20241127_213238-torch-grafted-redux/logs/debug.log +54 -0
  12. grafted/wandb/run-20241127_213238-torch-grafted-redux/run-torch-grafted-redux.wandb +0 -0
  13. grafted/wandb/run-20241127_213407-torch-grafted-redux/files/config.yaml +97 -0
  14. grafted/wandb/run-20241127_213407-torch-grafted-redux/files/output.log +31 -0
  15. grafted/wandb/run-20241127_213407-torch-grafted-redux/files/requirements.txt +133 -0
  16. grafted/wandb/run-20241127_213407-torch-grafted-redux/files/wandb-metadata.json +49 -0
  17. grafted/wandb/run-20241127_213407-torch-grafted-redux/files/wandb-summary.json +1 -0
  18. grafted/wandb/run-20241127_213407-torch-grafted-redux/logs/debug-internal.log +18 -0
  19. grafted/wandb/run-20241127_213407-torch-grafted-redux/logs/debug.log +55 -0
  20. grafted/wandb/run-20241127_213407-torch-grafted-redux/run-torch-grafted-redux.wandb +0 -0
  21. grafted/wandb/run-20241127_213838-torch-grafted-redux/files/config.yaml +97 -0
  22. grafted/wandb/run-20241127_213838-torch-grafted-redux/files/output.log +152 -0
  23. grafted/wandb/run-20241127_213838-torch-grafted-redux/files/requirements.txt +133 -0
  24. grafted/wandb/run-20241127_213838-torch-grafted-redux/files/wandb-metadata.json +49 -0
  25. grafted/wandb/run-20241127_213838-torch-grafted-redux/files/wandb-summary.json +1 -0
  26. grafted/wandb/run-20241127_213838-torch-grafted-redux/logs/debug-internal.log +18 -0
  27. grafted/wandb/run-20241127_213838-torch-grafted-redux/logs/debug.log +55 -0
  28. grafted/wandb/run-20241127_213838-torch-grafted-redux/run-torch-grafted-redux.wandb +0 -0
  29. grafted/wandb/run-20241127_214327-torch-grafted-redux/files/config.yaml +97 -0
  30. grafted/wandb/run-20241127_214327-torch-grafted-redux/files/output.log +211 -0
  31. grafted/wandb/run-20241127_214327-torch-grafted-redux/files/requirements.txt +134 -0
  32. grafted/wandb/run-20241127_214327-torch-grafted-redux/files/wandb-metadata.json +49 -0
  33. grafted/wandb/run-20241127_214327-torch-grafted-redux/files/wandb-summary.json +1 -0
  34. grafted/wandb/run-20241127_214327-torch-grafted-redux/logs/debug-internal.log +18 -0
  35. grafted/wandb/run-20241127_214327-torch-grafted-redux/logs/debug.log +55 -0
  36. grafted/wandb/run-20241127_214327-torch-grafted-redux/run-torch-grafted-redux.wandb +0 -0
  37. grafted/wandb/run-20241127_215015-torch-grafted-redux/files/config.yaml +98 -0
  38. grafted/wandb/run-20241127_215015-torch-grafted-redux/files/output.log +0 -0
  39. grafted/wandb/run-20241127_215015-torch-grafted-redux/files/requirements.txt +134 -0
  40. grafted/wandb/run-20241127_215015-torch-grafted-redux/files/wandb-metadata.json +49 -0
  41. grafted/wandb/run-20241127_215015-torch-grafted-redux/files/wandb-summary.json +1 -0
  42. grafted/wandb/run-20241127_215015-torch-grafted-redux/logs/debug-internal.log +247 -0
  43. grafted/wandb/run-20241127_215015-torch-grafted-redux/logs/debug.log +55 -0
  44. grafted/wandb/run-20241127_215015-torch-grafted-redux/run-torch-grafted-redux.wandb +3 -0
  45. grafted/wandb/wandb-resume.json +1 -0
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  05-15_2003/wandb/run-20240515_200355-torch-on-big-vision-bfloat16/logs/debug-internal.log filter=lfs diff=lfs merge=lfs -text
  05-15_2003/wandb/run-20240515_200355-torch-on-big-vision-bfloat16/run-torch-on-big-vision-bfloat16.wandb filter=lfs diff=lfs merge=lfs -text
+ grafted/wandb/run-20241127_215015-torch-grafted-redux/run-torch-grafted-redux.wandb filter=lfs diff=lfs merge=lfs -text
grafted/big_vision_metrics.txt ADDED
The diff for this file is too large to render. See raw diff
 
grafted/checkpoint.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1a8535185a902993c4e798b63171f29063335d6073303d6f357609c25b6aea7
+ size 264187339
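A minimal loading sketch for this checkpoint (an assumption, not code from the commit: the file is presumed to be a regular torch.save payload, and the "model" key is hypothetical):

    import torch

    ckpt = torch.load("grafted/checkpoint.pth.tar", map_location="cpu")  # ~264 MB LFS object (size above)
    # Fall back to treating the whole payload as a bare state_dict if there is no "model" key.
    state_dict = ckpt.get("model", ckpt) if isinstance(ckpt, dict) else ckpt
    print(sorted(state_dict)[:5])  # peek at the first few parameter names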
grafted/config.json ADDED
@@ -0,0 +1 @@
+ {"seed": 0, "total_epochs": 90, "num_classes": 1000, "loss": "softmax_xent", "input": {"data": {"name": "imagenet2012", "split": "train"}, "batch_size": 1024, "accum_freq": 8, "cache_raw": false, "shuffle_buffer_size": 150000, "pp": "decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000, key=\"label\", key_result=\"labels\")|keep(\"image\", \"labels\")"}, "pp_modules": ["ops_general", "ops_image", "ops_text", "archive.randaug"], "log_training_steps": 50, "ckpt_steps": 1000, "model_name": "vit", "model": {"variant": "S/16", "rep_size": false, "pool_type": "gap", "posemb": "sincos2d"}, "grad_clip_norm": 1.0, "optax_name": "scale_by_adam", "optax": {"mu_dtype": "bfloat16"}, "lr": 0.001, "wd": 0.0001, "schedule": {"warmup_steps": 10000, "decay_type": "cosine"}, "mixup": {"p": 0.2, "fold_in": null}, "evals": {"val": {"type": "classification", "data": {"name": "imagenet2012", "split": "validation"}, "pp_fn": "decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000, key=\"label\", key_result=\"labels\")|keep(\"image\", \"labels\")", "loss_name": "softmax_xent", "log_steps": 2500}}}
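As a quick cross-check of the config above (a sketch, not part of the commit; the imagenet2012 train-split size of 1,281,167 images is assumed), the logged step count follows directly from it:

    import json, math

    cfg = json.load(open("grafted/config.json"))
    micro_batch = cfg["input"]["batch_size"] // cfg["input"]["accum_freq"]  # 1024 // 8 = 128 images per backward pass
    total_steps = math.ceil(cfg["total_epochs"] * 1_281_167 / cfg["input"]["batch_size"])  # 112,603, matching the training log below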
grafted/wandb/run-20241127_213238-torch-grafted-redux/files/config.yaml ADDED
@@ -0,0 +1,95 @@
1
+ _allow_dotted_keys:
2
+ value: false
3
+ _convert_dict:
4
+ value: true
5
+ _fields:
6
+ value:
7
+ ckpt_steps: 1000
8
+ evals: |
9
+ val:
10
+ data:
11
+ name: imagenet2012
12
+ split: validation
13
+ log_steps: 2500
14
+ loss_name: softmax_xent
15
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
16
+ key="label", key_result="labels")|keep("image", "labels")
17
+ type: classification
18
+ grad_clip_norm: 1
19
+ input: |
20
+ accum_freq: 8
21
+ batch_size: 1024
22
+ cache_raw: false
23
+ data:
24
+ name: imagenet2012
25
+ split: train
26
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
27
+ key="label", key_result="labels")|keep("image", "labels")
28
+ shuffle_buffer_size: 150000
29
+ log_training_steps: 50
30
+ loss: softmax_xent
31
+ lr: 0.001
32
+ mixup: |
33
+ fold_in: null
34
+ p: 0.2
35
+ model: |
36
+ pool_type: gap
37
+ posemb: sincos2d
38
+ rep_size: false
39
+ variant: S/16
40
+ model_name: vit
41
+ num_classes: 1000
42
+ optax: |
43
+ mu_dtype: bfloat16
44
+ optax_name: scale_by_adam
45
+ pp_modules:
46
+ - ops_general
47
+ - ops_image
48
+ - ops_text
49
+ - archive.randaug
50
+ schedule: |
51
+ decay_type: cosine
52
+ warmup_steps: 10000
53
+ seed: 0
54
+ total_epochs: 90
55
+ wd: 0.0001
56
+ _locked:
57
+ value: true
58
+ _sort_keys:
59
+ value: true
60
+ _type_safe:
61
+ value: true
62
+ _wandb:
63
+ value:
64
+ cli_version: 0.18.7
65
+ m: []
66
+ python_version: 3.11.10
67
+ t:
68
+ "1":
69
+ - 1
70
+ - 2
71
+ - 3
72
+ - 12
73
+ - 41
74
+ - 45
75
+ - 55
76
+ "2":
77
+ - 1
78
+ - 2
79
+ - 3
80
+ - 12
81
+ - 41
82
+ - 45
83
+ - 55
84
+ "3":
85
+ - 13
86
+ - 14
87
+ - 16
88
+ - 23
89
+ - 55
90
+ "4": 3.11.10
91
+ "5": 0.18.7
92
+ "8":
93
+ - 5
94
+ "12": 0.18.7
95
+ "13": linux-x86_64
grafted/wandb/run-20241127_213238-torch-grafted-redux/files/output.log ADDED
@@ -0,0 +1,31 @@
+ I1127 21:32:38.985260 139930170870656 train.py:125] NOTE: Initializing train dataset...
+ I1127 21:32:38.985419 139930170870656 train.py:125] NOTE: Global batch size 1024 on 1 hosts results in 1024 local batch size. With 1 dev per host (1 dev total), that's a 1024 per-device batch size.
+ I1127 21:32:39.139223 139930170870656 logging_logger.py:49] Constructing tf.data.Dataset imagenet2012 for split _EvenSplit(split='train', index=0, count=1, drop_remainder=False), from /home/jason-chou/tensorflow_datasets/imagenet2012/5.1.0
+ Traceback (most recent call last):
+ File "<frozen runpy>", line 198, in _run_module_as_main
+ File "<frozen runpy>", line 88, in _run_code
+ File "/home/jason-chou/Downloads/big_vision/big_vision/train.py", line 396, in <module>
+ app.run(main)
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/absl/app.py", line 308, in run
+ _run_main(main, args)
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/absl/app.py", line 254, in _run_main
+ sys.exit(main(argv))
+ ^^^^^^^^^^
+ File "/home/jason-chou/Downloads/big_vision/big_vision/train.py", line 189, in main
+ train_ds, ntrain_img = input_pipeline.training(config.input)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/jason-chou/Downloads/big_vision/big_vision/input_pipeline.py", line 101, in training
+ data=train_data.get_tfdata(ordered=False),
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/jason-chou/Downloads/big_vision/big_vision/datasets/tfds.py", line 39, in get_tfdata
+ return _get_dataset_from_builder(
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/jason-chou/Downloads/big_vision/big_vision/datasets/tfds.py", line 70, in _get_dataset_from_builder
+ ds = builder.as_dataset(
+ ^^^^^^^^^^^^^^^^^^^
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow_datasets/core/logging/__init__.py", line 176, in __call__
+ return function(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow_datasets/core/dataset_builder.py", line 1008, in as_dataset
+ raise AssertionError(
+ AssertionError: Dataset imagenet2012: could not find data in /home/jason-chou/tensorflow_datasets. Please make sure to call dataset_builder.download_and_prepare(), or pass download=True to tfds.load() before trying to access the tf.data.Dataset object.
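This first run died on the stock TFDS "could not find data" error because imagenet2012 had not been prepared under the default ~/tensorflow_datasets directory. A preparation sketch (assumptions: the ILSVRC2012 tars were downloaded manually, since TFDS cannot fetch ImageNet itself, and the manual_dir path is hypothetical; /data/tensorflow_datasets matches the later runs):

    import tensorflow_datasets as tfds

    builder = tfds.builder("imagenet2012", data_dir="/data/tensorflow_datasets")
    builder.download_and_prepare(
        download_config=tfds.download.DownloadConfig(manual_dir="/data/imagenet_tars")  # hypothetical location of the ILSVRC2012 tar files
    )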
grafted/wandb/run-20241127_213238-torch-grafted-redux/files/requirements.txt ADDED
@@ -0,0 +1,133 @@
1
+ array_record==0.5.1
2
+ wandb==0.18.7
3
+ nvidia-curand-cu12==10.3.2.106
4
+ requests-oauthlib==2.0.0
5
+ zipp==3.21.0
6
+ Werkzeug==3.1.3
7
+ simple-parsing==0.1.6
8
+ mdurl==0.1.2
9
+ keras==2.15.0
10
+ nvidia-cuda-nvcc-cu12==12.6.85
11
+ google-auth-oauthlib==1.2.1
12
+ jaxlib==0.4.34
13
+ tf_keras==2.15.1
14
+ oauthlib==3.2.2
15
+ tensorflow-probability==0.25.0
16
+ cachetools==5.5.0
17
+ Jinja2==3.1.3
18
+ rich==13.9.4
19
+ filelock==3.13.1
20
+ google-pasta==0.2.0
21
+ optax==0.2.4
22
+ toolz==1.0.0
23
+ gast==0.6.0
24
+ tensorboard==2.15.2
25
+ pyasn1_modules==0.4.1
26
+ nvidia-cudnn-cu12==9.1.0.70
27
+ opt_einsum==3.4.0
28
+ nvidia-nvjitlink-cu12==12.6.85
29
+ chex==0.1.87
30
+ namex==0.0.8
31
+ termcolor==2.5.0
32
+ flax==0.10.2
33
+ cloudpickle==3.1.0
34
+ numpy==1.26.4
35
+ nvidia-nccl-cu12==2.21.5
36
+ tensorflow-cpu==2.15.0
37
+ nvidia-cusolver-cu12==11.4.5.107
38
+ typing_extensions==4.12.2
39
+ tensorflow-addons==0.23.0
40
+ typeguard==2.13.3
41
+ absl-py==2.1.0
42
+ flatbuffers==24.3.25
43
+ dlpack==0.1
44
+ setuptools==65.5.0
45
+ protobuf==4.25.5
46
+ jax-cuda12-plugin==0.4.35
47
+ tensorflow==2.15.0
48
+ msgpack==1.1.0
49
+ networkx==3.2.1
50
+ docker-pycreds==0.4.0
51
+ nvidia-cuda-cupti-cu12==12.1.105
52
+ pillow==11.0.0
53
+ libclang==18.1.1
54
+ nvidia-cuda-nvrtc-cu12==12.1.105
55
+ distrax==0.1.5
56
+ orbax-checkpoint==0.10.1
57
+ PyYAML==6.0.2
58
+ urllib3==2.2.3
59
+ aqtp==0.8.2
60
+ tensorflow-metadata==1.16.1
61
+ etils==1.11.0
62
+ smmap==5.0.1
63
+ pyasn1==0.6.1
64
+ docstring_parser==0.16
65
+ google-auth==2.36.0
66
+ simplejson==3.19.3
67
+ mpmath==1.3.0
68
+ h5py==3.12.1
69
+ jax-cuda12-pjrt==0.4.35
70
+ tensorflow-io-gcs-filesystem==0.37.1
71
+ tensorflow-estimator==2.15.0
72
+ triton==3.1.0
73
+ rsa==4.9
74
+ panopticapi==0.1
75
+ tensorflow-hub==0.16.1
76
+ requests==2.32.3
77
+ scipy==1.14.1
78
+ ml-dtypes==0.2.0
79
+ markdown-it-py==3.0.0
80
+ tensorflow-text==2.15.0
81
+ wrapt==1.14.1
82
+ immutabledict==4.2.1
83
+ MarkupSafe==3.0.2
84
+ jax==0.4.35
85
+ torch==2.5.1+cu121
86
+ wheel==0.45.1
87
+ einops==0.8.0
88
+ sentry-sdk==2.19.0
89
+ torchvision==0.20.1+cu121
90
+ humanize==4.11.0
91
+ toml==0.10.2
92
+ tensorstore==0.1.69
93
+ six==1.16.0
94
+ promise==2.3
95
+ certifi==2024.8.30
96
+ nvidia-cuda-runtime-cu12==12.1.105
97
+ flaxformer==0.8.8
98
+ nvidia-cufft-cu12==11.0.2.54
99
+ psutil==6.1.0
100
+ GitPython==3.1.43
101
+ platformdirs==4.3.6
102
+ importlib_resources==6.4.5
103
+ tfds-nightly==4.9.7.dev202411280044
104
+ tensorflow-gan==2.1.0
105
+ googleapis-common-protos==1.66.0
106
+ overrides==7.7.0
107
+ optree==0.13.1
108
+ Pygments==2.18.0
109
+ astunparse==1.6.3
110
+ ml_collections==1.0.0
111
+ setproctitle==1.3.4
112
+ tensorboard-data-server==0.7.2
113
+ sympy==1.13.1
114
+ packaging==24.2
115
+ nest-asyncio==1.6.0
116
+ nvidia-cublas-cu12==12.1.3.1
117
+ gitdb==4.0.11
118
+ click==8.1.7
119
+ idna==3.10
120
+ tqdm==4.67.1
121
+ grpcio==1.68.0
122
+ decorator==5.1.1
123
+ pyarrow==18.1.0
124
+ clu==0.0.12
125
+ charset-normalizer==3.4.0
126
+ fsspec==2024.10.0
127
+ dm-tree==0.1.8
128
+ sentencepiece==0.2.0
129
+ nvidia-cusparse-cu12==12.1.0.106
130
+ torchaudio==2.5.1+cu121
131
+ pip==24.3.1
132
+ Markdown==3.7
133
+ nvidia-nvtx-cu12==12.1.105
grafted/wandb/run-20241127_213238-torch-grafted-redux/files/wandb-metadata.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "os": "Linux-6.8.0-49-generic-x86_64-with-glibc2.39",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-11-28T05:32:38.332723Z",
5
+ "args": [
6
+ "--config",
7
+ "/home/jason-chou/Downloads/big_vision/big_vision/configs/vit_s16_i1k_single_gpu_test.py",
8
+ "--workdir",
9
+ "/data/imagenet/grafted",
10
+ "--name",
11
+ "torch-grafted-redux"
12
+ ],
13
+ "program": "-m big_vision.train",
14
+ "git": {
15
+ "remote": "https://github.com/EIFY/big_vision.git",
16
+ "commit": "44649a64ff67e709f55cdb5e3adcf52064b17de5"
17
+ },
18
+ "email": "[email protected]",
19
+ "root": "/home/jason-chou/Downloads/big_vision",
20
+ "host": "jasonchou-TensorBook-late-2021",
21
+ "username": "jason-chou",
22
+ "executable": "/home/jason-chou/.pyenv/versions/3.11.10/bin/python",
23
+ "cpu_count": 8,
24
+ "cpu_count_logical": 16,
25
+ "gpu": "NVIDIA GeForce RTX 3080 Laptop GPU",
26
+ "gpu_count": 1,
27
+ "disk": {
28
+ "/": {
29
+ "total": "1006450962432",
30
+ "used": "584312053760"
31
+ }
32
+ },
33
+ "memory": {
34
+ "total": "67162914816"
35
+ },
36
+ "cpu": {
37
+ "count": 8,
38
+ "countLogical": 16
39
+ },
40
+ "gpu_nvidia": [
41
+ {
42
+ "name": "NVIDIA GeForce RTX 3080 Laptop GPU",
43
+ "memoryTotal": "17179869184",
44
+ "cudaCores": 6144,
45
+ "architecture": "Ampere"
46
+ }
47
+ ],
48
+ "cudaVersion": "12.2"
49
+ }
grafted/wandb/run-20241127_213238-torch-grafted-redux/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":0}}
grafted/wandb/run-20241127_213238-torch-grafted-redux/logs/debug-internal.log ADDED
@@ -0,0 +1,18 @@
1
+ {"time":"2024-11-27T21:32:38.333839182-08:00","level":"INFO","msg":"using version","core version":"0.18.7"}
2
+ {"time":"2024-11-27T21:32:38.333850342-08:00","level":"INFO","msg":"created symlink","path":"/home/jason-chou/Downloads/big_vision/wandb/run-20241127_213238-torch-grafted-redux/logs/debug-core.log"}
3
+ {"time":"2024-11-27T21:32:38.437006136-08:00","level":"INFO","msg":"created new stream","id":"torch-grafted-redux"}
4
+ {"time":"2024-11-27T21:32:38.437103435-08:00","level":"INFO","msg":"stream: started","id":"torch-grafted-redux"}
5
+ {"time":"2024-11-27T21:32:38.437187603-08:00","level":"INFO","msg":"writer: Do: started","stream_id":"torch-grafted-redux"}
6
+ {"time":"2024-11-27T21:32:38.437273441-08:00","level":"INFO","msg":"handler: started","stream_id":"torch-grafted-redux"}
7
+ {"time":"2024-11-27T21:32:38.437384909-08:00","level":"INFO","msg":"sender: started","stream_id":"torch-grafted-redux"}
8
+ {"time":"2024-11-27T21:32:38.935119097-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-27T21:32:39.180591766-08:00","level":"INFO","msg":"stream: closing","id":"torch-grafted-redux"}
10
+ {"time":"2024-11-27T21:32:39.180690557-08:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2024-11-27T21:32:39.181672971-08:00","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2024-11-27T21:32:39.361912852-08:00","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
13
+ {"time":"2024-11-27T21:32:39.361944716-08:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
14
+ {"time":"2024-11-27T21:32:39.739296662-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
15
+ {"time":"2024-11-27T21:32:39.941553056-08:00","level":"INFO","msg":"handler: closed","stream_id":"torch-grafted-redux"}
16
+ {"time":"2024-11-27T21:32:39.941619024-08:00","level":"INFO","msg":"sender: closed","stream_id":"torch-grafted-redux"}
17
+ {"time":"2024-11-27T21:32:39.941589779-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"torch-grafted-redux"}
18
+ {"time":"2024-11-27T21:32:39.941798781-08:00","level":"INFO","msg":"stream: closed","id":"torch-grafted-redux"}
grafted/wandb/run-20241127_213238-torch-grafted-redux/logs/debug.log ADDED
@@ -0,0 +1,54 @@
1
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7
2
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_setup.py:_flush():79] Configure stats pid to 68601
3
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/.config/wandb/settings
4
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/Downloads/big_vision/wandb/settings
5
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-27 21:32:38,330 WARNING MainThread:68601 [wandb_setup.py:_flush():79] Could not find program at -m big_vision.train
8
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m big_vision.train'}
9
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_init.py:_log_setup():533] Logging user logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_213238-torch-grafted-redux/logs/debug.log
11
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_init.py:_log_setup():534] Logging internal logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_213238-torch-grafted-redux/logs/debug-internal.log
12
+ 2024-11-27 21:32:38,330 INFO MainThread:68601 [wandb_init.py:init():619] calling init triggers
13
+ 2024-11-27 21:32:38,331 INFO MainThread:68601 [wandb_init.py:init():626] wandb.init called with sweep_config: {}
14
+ config: {'_fields': {'seed': 0, 'total_epochs': 90, 'num_classes': 1000, 'loss': 'softmax_xent', 'input': accum_freq: 8
15
+ batch_size: 1024
16
+ cache_raw: false
17
+ data:
18
+ name: imagenet2012
19
+ split: train
20
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
21
+ key="label", key_result="labels")|keep("image", "labels")
22
+ shuffle_buffer_size: 150000
23
+ , 'pp_modules': ['ops_general', 'ops_image', 'ops_text', 'archive.randaug'], 'log_training_steps': 50, 'ckpt_steps': 1000, 'model_name': 'vit', 'model': pool_type: gap
24
+ posemb: sincos2d
25
+ rep_size: false
26
+ variant: S/16
27
+ , 'grad_clip_norm': 1.0, 'optax_name': 'scale_by_adam', 'optax': mu_dtype: bfloat16
28
+ , 'lr': 0.001, 'wd': 0.0001, 'schedule': decay_type: cosine
29
+ warmup_steps: 10000
30
+ , 'mixup': fold_in: null
31
+ p: 0.2
32
+ , 'evals': val:
33
+ data:
34
+ name: imagenet2012
35
+ split: validation
36
+ log_steps: 2500
37
+ loss_name: softmax_xent
38
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
39
+ key="label", key_result="labels")|keep("image", "labels")
40
+ type: classification
41
+ }, '_locked': True, '_type_safe': True, '_convert_dict': True, '_allow_dotted_keys': False, '_sort_keys': True}
42
+ 2024-11-27 21:32:38,331 INFO MainThread:68601 [wandb_init.py:init():669] starting backend
43
+ 2024-11-27 21:32:38,331 INFO MainThread:68601 [wandb_init.py:init():673] sending inform_init request
44
+ 2024-11-27 21:32:38,332 INFO MainThread:68601 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
45
+ 2024-11-27 21:32:38,332 INFO MainThread:68601 [wandb_init.py:init():686] backend started and connected
46
+ 2024-11-27 21:32:38,336 INFO MainThread:68601 [wandb_init.py:init():781] updated telemetry
47
+ 2024-11-27 21:32:38,339 INFO MainThread:68601 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout
48
+ 2024-11-27 21:32:38,933 INFO MainThread:68601 [wandb_init.py:init():867] starting run threads in backend
49
+ 2024-11-27 21:32:38,983 INFO MainThread:68601 [wandb_run.py:_console_start():2456] atexit reg
50
+ 2024-11-27 21:32:38,983 INFO MainThread:68601 [wandb_run.py:_redirect():2305] redirect: wrap_raw
51
+ 2024-11-27 21:32:38,984 INFO MainThread:68601 [wandb_run.py:_redirect():2370] Wrapping output streams.
52
+ 2024-11-27 21:32:38,984 INFO MainThread:68601 [wandb_run.py:_redirect():2395] Redirects installed.
53
+ 2024-11-27 21:32:38,984 INFO MainThread:68601 [wandb_init.py:init():911] run started, returning control to user process
54
+ 2024-11-27 21:32:39,180 WARNING MsgRouterThr:68601 [router.py:message_loop():75] message_loop has been closed
grafted/wandb/run-20241127_213238-torch-grafted-redux/run-torch-grafted-redux.wandb ADDED
Binary file (6.53 kB).
 
grafted/wandb/run-20241127_213407-torch-grafted-redux/files/config.yaml ADDED
@@ -0,0 +1,97 @@
1
+ _allow_dotted_keys:
2
+ value: false
3
+ _convert_dict:
4
+ value: true
5
+ _fields:
6
+ value:
7
+ ckpt_steps: 1000
8
+ evals: |
9
+ val:
10
+ data:
11
+ name: imagenet2012
12
+ split: validation
13
+ log_steps: 2500
14
+ loss_name: softmax_xent
15
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
16
+ key="label", key_result="labels")|keep("image", "labels")
17
+ type: classification
18
+ grad_clip_norm: 1
19
+ input: |
20
+ accum_freq: 8
21
+ batch_size: 1024
22
+ cache_raw: false
23
+ data:
24
+ name: imagenet2012
25
+ split: train
26
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
27
+ key="label", key_result="labels")|keep("image", "labels")
28
+ shuffle_buffer_size: 150000
29
+ log_training_steps: 50
30
+ loss: softmax_xent
31
+ lr: 0.001
32
+ mixup: |
33
+ fold_in: null
34
+ p: 0.2
35
+ model: |
36
+ pool_type: gap
37
+ posemb: sincos2d
38
+ rep_size: false
39
+ variant: S/16
40
+ model_name: vit
41
+ num_classes: 1000
42
+ optax: |
43
+ mu_dtype: bfloat16
44
+ optax_name: scale_by_adam
45
+ pp_modules:
46
+ - ops_general
47
+ - ops_image
48
+ - ops_text
49
+ - archive.randaug
50
+ schedule: |
51
+ decay_type: cosine
52
+ warmup_steps: 10000
53
+ seed: 0
54
+ total_epochs: 90
55
+ wd: 0.0001
56
+ _locked:
57
+ value: true
58
+ _sort_keys:
59
+ value: true
60
+ _type_safe:
61
+ value: true
62
+ _wandb:
63
+ value:
64
+ cli_version: 0.18.7
65
+ m: []
66
+ python_version: 3.11.10
67
+ t:
68
+ "1":
69
+ - 1
70
+ - 2
71
+ - 3
72
+ - 12
73
+ - 41
74
+ - 45
75
+ - 55
76
+ "2":
77
+ - 1
78
+ - 2
79
+ - 3
80
+ - 12
81
+ - 41
82
+ - 45
83
+ - 55
84
+ "3":
85
+ - 5
86
+ - 13
87
+ - 14
88
+ - 16
89
+ - 23
90
+ - 55
91
+ - 62
92
+ "4": 3.11.10
93
+ "5": 0.18.7
94
+ "8":
95
+ - 5
96
+ "12": 0.18.7
97
+ "13": linux-x86_64
grafted/wandb/run-20241127_213407-torch-grafted-redux/files/output.log ADDED
@@ -0,0 +1,31 @@
1
+ I1127 21:34:08.317430 123604392237952 train.py:125] NOTE: Initializing train dataset...
2
+ I1127 21:34:08.317584 123604392237952 train.py:125] NOTE: Global batch size 1024 on 1 hosts results in 1024 local batch size. With 1 dev per host (1 dev total), that's a 1024 per-device batch size.
3
+ I1127 21:34:08.468933 123604392237952 logging_logger.py:49] Constructing tf.data.Dataset imagenet2012 for split _EvenSplit(split='train', index=0, count=1, drop_remainder=False), from /home/jason-chou/tensorflow_datasets/imagenet2012/5.1.0
4
+ Traceback (most recent call last):
5
+ File "<frozen runpy>", line 198, in _run_module_as_main
6
+ File "<frozen runpy>", line 88, in _run_code
7
+ File "/home/jason-chou/Downloads/big_vision/big_vision/train.py", line 396, in <module>
8
+ app.run(main)
9
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/absl/app.py", line 308, in run
10
+ _run_main(main, args)
11
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/absl/app.py", line 254, in _run_main
12
+ sys.exit(main(argv))
13
+ ^^^^^^^^^^
14
+ File "/home/jason-chou/Downloads/big_vision/big_vision/train.py", line 189, in main
15
+ train_ds, ntrain_img = input_pipeline.training(config.input)
16
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
17
+ File "/home/jason-chou/Downloads/big_vision/big_vision/input_pipeline.py", line 101, in training
18
+ data=train_data.get_tfdata(ordered=False),
19
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20
+ File "/home/jason-chou/Downloads/big_vision/big_vision/datasets/tfds.py", line 39, in get_tfdata
21
+ return _get_dataset_from_builder(
22
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
23
+ File "/home/jason-chou/Downloads/big_vision/big_vision/datasets/tfds.py", line 70, in _get_dataset_from_builder
24
+ ds = builder.as_dataset(
25
+ ^^^^^^^^^^^^^^^^^^^
26
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow_datasets/core/logging/__init__.py", line 176, in __call__
27
+ return function(*args, **kwargs)
28
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
29
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow_datasets/core/dataset_builder.py", line 1008, in as_dataset
30
+ raise AssertionError(
31
+ AssertionError: Dataset imagenet2012: could not find data in /home/jason-chou/tensorflow_datasets. Please make sure to call dataset_builder.download_and_prepare(), or pass download=True to tfds.load() before trying to access the tf.data.Dataset object.
grafted/wandb/run-20241127_213407-torch-grafted-redux/files/requirements.txt ADDED
@@ -0,0 +1,133 @@
1
+ array_record==0.5.1
2
+ wandb==0.18.7
3
+ nvidia-curand-cu12==10.3.2.106
4
+ requests-oauthlib==2.0.0
5
+ zipp==3.21.0
6
+ Werkzeug==3.1.3
7
+ simple-parsing==0.1.6
8
+ mdurl==0.1.2
9
+ keras==2.15.0
10
+ nvidia-cuda-nvcc-cu12==12.6.85
11
+ google-auth-oauthlib==1.2.1
12
+ jaxlib==0.4.34
13
+ tf_keras==2.15.1
14
+ oauthlib==3.2.2
15
+ tensorflow-probability==0.25.0
16
+ cachetools==5.5.0
17
+ Jinja2==3.1.3
18
+ rich==13.9.4
19
+ filelock==3.13.1
20
+ google-pasta==0.2.0
21
+ optax==0.2.4
22
+ toolz==1.0.0
23
+ gast==0.6.0
24
+ tensorboard==2.15.2
25
+ pyasn1_modules==0.4.1
26
+ nvidia-cudnn-cu12==9.1.0.70
27
+ opt_einsum==3.4.0
28
+ nvidia-nvjitlink-cu12==12.6.85
29
+ chex==0.1.87
30
+ namex==0.0.8
31
+ termcolor==2.5.0
32
+ flax==0.10.2
33
+ cloudpickle==3.1.0
34
+ numpy==1.26.4
35
+ nvidia-nccl-cu12==2.21.5
36
+ tensorflow-cpu==2.15.0
37
+ nvidia-cusolver-cu12==11.4.5.107
38
+ typing_extensions==4.12.2
39
+ tensorflow-addons==0.23.0
40
+ typeguard==2.13.3
41
+ absl-py==2.1.0
42
+ flatbuffers==24.3.25
43
+ dlpack==0.1
44
+ setuptools==65.5.0
45
+ protobuf==4.25.5
46
+ jax-cuda12-plugin==0.4.35
47
+ tensorflow==2.15.0
48
+ msgpack==1.1.0
49
+ networkx==3.2.1
50
+ docker-pycreds==0.4.0
51
+ nvidia-cuda-cupti-cu12==12.1.105
52
+ pillow==11.0.0
53
+ libclang==18.1.1
54
+ nvidia-cuda-nvrtc-cu12==12.1.105
55
+ distrax==0.1.5
56
+ orbax-checkpoint==0.10.1
57
+ PyYAML==6.0.2
58
+ urllib3==2.2.3
59
+ aqtp==0.8.2
60
+ tensorflow-metadata==1.16.1
61
+ etils==1.11.0
62
+ smmap==5.0.1
63
+ pyasn1==0.6.1
64
+ docstring_parser==0.16
65
+ google-auth==2.36.0
66
+ simplejson==3.19.3
67
+ mpmath==1.3.0
68
+ h5py==3.12.1
69
+ jax-cuda12-pjrt==0.4.35
70
+ tensorflow-io-gcs-filesystem==0.37.1
71
+ tensorflow-estimator==2.15.0
72
+ triton==3.1.0
73
+ rsa==4.9
74
+ panopticapi==0.1
75
+ tensorflow-hub==0.16.1
76
+ requests==2.32.3
77
+ scipy==1.14.1
78
+ ml-dtypes==0.2.0
79
+ markdown-it-py==3.0.0
80
+ tensorflow-text==2.15.0
81
+ wrapt==1.14.1
82
+ immutabledict==4.2.1
83
+ MarkupSafe==3.0.2
84
+ jax==0.4.35
85
+ torch==2.5.1+cu121
86
+ wheel==0.45.1
87
+ einops==0.8.0
88
+ sentry-sdk==2.19.0
89
+ torchvision==0.20.1+cu121
90
+ humanize==4.11.0
91
+ toml==0.10.2
92
+ tensorstore==0.1.69
93
+ six==1.16.0
94
+ promise==2.3
95
+ certifi==2024.8.30
96
+ nvidia-cuda-runtime-cu12==12.1.105
97
+ flaxformer==0.8.8
98
+ nvidia-cufft-cu12==11.0.2.54
99
+ psutil==6.1.0
100
+ GitPython==3.1.43
101
+ platformdirs==4.3.6
102
+ importlib_resources==6.4.5
103
+ tfds-nightly==4.9.7.dev202411280044
104
+ tensorflow-gan==2.1.0
105
+ googleapis-common-protos==1.66.0
106
+ overrides==7.7.0
107
+ optree==0.13.1
108
+ Pygments==2.18.0
109
+ astunparse==1.6.3
110
+ ml_collections==1.0.0
111
+ setproctitle==1.3.4
112
+ tensorboard-data-server==0.7.2
113
+ sympy==1.13.1
114
+ packaging==24.2
115
+ nest-asyncio==1.6.0
116
+ nvidia-cublas-cu12==12.1.3.1
117
+ gitdb==4.0.11
118
+ click==8.1.7
119
+ idna==3.10
120
+ tqdm==4.67.1
121
+ grpcio==1.68.0
122
+ decorator==5.1.1
123
+ pyarrow==18.1.0
124
+ clu==0.0.12
125
+ charset-normalizer==3.4.0
126
+ fsspec==2024.10.0
127
+ dm-tree==0.1.8
128
+ sentencepiece==0.2.0
129
+ nvidia-cusparse-cu12==12.1.0.106
130
+ torchaudio==2.5.1+cu121
131
+ pip==24.3.1
132
+ Markdown==3.7
133
+ nvidia-nvtx-cu12==12.1.105
grafted/wandb/run-20241127_213407-torch-grafted-redux/files/wandb-metadata.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "os": "Linux-6.8.0-49-generic-x86_64-with-glibc2.39",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-11-28T05:34:07.870557Z",
5
+ "args": [
6
+ "--config",
7
+ "/home/jason-chou/Downloads/big_vision/big_vision/configs/vit_s16_i1k_single_gpu_test.py",
8
+ "--workdir",
9
+ "/data/imagenet/grafted",
10
+ "--name",
11
+ "torch-grafted-redux"
12
+ ],
13
+ "program": "-m big_vision.train",
14
+ "git": {
15
+ "remote": "https://github.com/EIFY/big_vision.git",
16
+ "commit": "44649a64ff67e709f55cdb5e3adcf52064b17de5"
17
+ },
18
+ "email": "[email protected]",
19
+ "root": "/home/jason-chou/Downloads/big_vision",
20
+ "host": "jasonchou-TensorBook-late-2021",
21
+ "username": "jason-chou",
22
+ "executable": "/home/jason-chou/.pyenv/versions/3.11.10/bin/python",
23
+ "cpu_count": 8,
24
+ "cpu_count_logical": 16,
25
+ "gpu": "NVIDIA GeForce RTX 3080 Laptop GPU",
26
+ "gpu_count": 1,
27
+ "disk": {
28
+ "/": {
29
+ "total": "1006450962432",
30
+ "used": "584312168448"
31
+ }
32
+ },
33
+ "memory": {
34
+ "total": "67162914816"
35
+ },
36
+ "cpu": {
37
+ "count": 8,
38
+ "countLogical": 16
39
+ },
40
+ "gpu_nvidia": [
41
+ {
42
+ "name": "NVIDIA GeForce RTX 3080 Laptop GPU",
43
+ "memoryTotal": "17179869184",
44
+ "cudaCores": 6144,
45
+ "architecture": "Ampere"
46
+ }
47
+ ],
48
+ "cudaVersion": "12.2"
49
+ }
grafted/wandb/run-20241127_213407-torch-grafted-redux/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":0}}
grafted/wandb/run-20241127_213407-torch-grafted-redux/logs/debug-internal.log ADDED
@@ -0,0 +1,18 @@
1
+ {"time":"2024-11-27T21:34:07.871305008-08:00","level":"INFO","msg":"using version","core version":"0.18.7"}
2
+ {"time":"2024-11-27T21:34:07.871312795-08:00","level":"INFO","msg":"created symlink","path":"/home/jason-chou/Downloads/big_vision/wandb/run-20241127_213407-torch-grafted-redux/logs/debug-core.log"}
3
+ {"time":"2024-11-27T21:34:07.974433557-08:00","level":"INFO","msg":"created new stream","id":"torch-grafted-redux"}
4
+ {"time":"2024-11-27T21:34:07.974483345-08:00","level":"INFO","msg":"stream: started","id":"torch-grafted-redux"}
5
+ {"time":"2024-11-27T21:34:07.974637769-08:00","level":"INFO","msg":"writer: Do: started","stream_id":"torch-grafted-redux"}
6
+ {"time":"2024-11-27T21:34:07.974694546-08:00","level":"INFO","msg":"handler: started","stream_id":"torch-grafted-redux"}
7
+ {"time":"2024-11-27T21:34:07.974824798-08:00","level":"INFO","msg":"sender: started","stream_id":"torch-grafted-redux"}
8
+ {"time":"2024-11-27T21:34:08.265740433-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-27T21:34:08.510645063-08:00","level":"INFO","msg":"stream: closing","id":"torch-grafted-redux"}
10
+ {"time":"2024-11-27T21:34:08.510725098-08:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2024-11-27T21:34:08.511601534-08:00","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2024-11-27T21:34:08.671262357-08:00","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
13
+ {"time":"2024-11-27T21:34:08.671295778-08:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
14
+ {"time":"2024-11-27T21:34:09.08563661-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
15
+ {"time":"2024-11-27T21:34:09.270472343-08:00","level":"INFO","msg":"handler: closed","stream_id":"torch-grafted-redux"}
16
+ {"time":"2024-11-27T21:34:09.270565975-08:00","level":"INFO","msg":"sender: closed","stream_id":"torch-grafted-redux"}
17
+ {"time":"2024-11-27T21:34:09.270568744-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"torch-grafted-redux"}
18
+ {"time":"2024-11-27T21:34:09.270733853-08:00","level":"INFO","msg":"stream: closed","id":"torch-grafted-redux"}
grafted/wandb/run-20241127_213407-torch-grafted-redux/logs/debug.log ADDED
@@ -0,0 +1,55 @@
1
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7
2
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_setup.py:_flush():79] Configure stats pid to 68916
3
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/.config/wandb/settings
4
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/Downloads/big_vision/wandb/settings
5
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-27 21:34:07,867 WARNING MainThread:68916 [wandb_setup.py:_flush():79] Could not find program at -m big_vision.train
8
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m big_vision.train'}
9
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_init.py:_log_setup():533] Logging user logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_213407-torch-grafted-redux/logs/debug.log
11
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_init.py:_log_setup():534] Logging internal logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_213407-torch-grafted-redux/logs/debug-internal.log
12
+ 2024-11-27 21:34:07,867 INFO MainThread:68916 [wandb_init.py:init():619] calling init triggers
13
+ 2024-11-27 21:34:07,869 INFO MainThread:68916 [wandb_init.py:init():626] wandb.init called with sweep_config: {}
14
+ config: {'_fields': {'seed': 0, 'total_epochs': 90, 'num_classes': 1000, 'loss': 'softmax_xent', 'input': accum_freq: 8
15
+ batch_size: 1024
16
+ cache_raw: false
17
+ data:
18
+ name: imagenet2012
19
+ split: train
20
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
21
+ key="label", key_result="labels")|keep("image", "labels")
22
+ shuffle_buffer_size: 150000
23
+ , 'pp_modules': ['ops_general', 'ops_image', 'ops_text', 'archive.randaug'], 'log_training_steps': 50, 'ckpt_steps': 1000, 'model_name': 'vit', 'model': pool_type: gap
24
+ posemb: sincos2d
25
+ rep_size: false
26
+ variant: S/16
27
+ , 'grad_clip_norm': 1.0, 'optax_name': 'scale_by_adam', 'optax': mu_dtype: bfloat16
28
+ , 'lr': 0.001, 'wd': 0.0001, 'schedule': decay_type: cosine
29
+ warmup_steps: 10000
30
+ , 'mixup': fold_in: null
31
+ p: 0.2
32
+ , 'evals': val:
33
+ data:
34
+ name: imagenet2012
35
+ split: validation
36
+ log_steps: 2500
37
+ loss_name: softmax_xent
38
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
39
+ key="label", key_result="labels")|keep("image", "labels")
40
+ type: classification
41
+ }, '_locked': True, '_type_safe': True, '_convert_dict': True, '_allow_dotted_keys': False, '_sort_keys': True}
42
+ 2024-11-27 21:34:07,869 INFO MainThread:68916 [wandb_init.py:init():669] starting backend
43
+ 2024-11-27 21:34:07,869 INFO MainThread:68916 [wandb_init.py:init():673] sending inform_init request
44
+ 2024-11-27 21:34:07,870 INFO MainThread:68916 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
45
+ 2024-11-27 21:34:07,870 INFO MainThread:68916 [wandb_init.py:init():686] backend started and connected
46
+ 2024-11-27 21:34:07,873 INFO MainThread:68916 [wandb_init.py:init():781] updated telemetry
47
+ 2024-11-27 21:34:07,875 INFO MainThread:68916 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout
48
+ 2024-11-27 21:34:08,241 INFO MainThread:68916 [wandb_init.py:init():859] run resumed
49
+ 2024-11-27 21:34:08,261 INFO MainThread:68916 [wandb_init.py:init():867] starting run threads in backend
50
+ 2024-11-27 21:34:08,315 INFO MainThread:68916 [wandb_run.py:_console_start():2456] atexit reg
51
+ 2024-11-27 21:34:08,315 INFO MainThread:68916 [wandb_run.py:_redirect():2305] redirect: wrap_raw
52
+ 2024-11-27 21:34:08,316 INFO MainThread:68916 [wandb_run.py:_redirect():2370] Wrapping output streams.
53
+ 2024-11-27 21:34:08,316 INFO MainThread:68916 [wandb_run.py:_redirect():2395] Redirects installed.
54
+ 2024-11-27 21:34:08,317 INFO MainThread:68916 [wandb_init.py:init():911] run started, returning control to user process
55
+ 2024-11-27 21:34:08,511 WARNING MsgRouterThr:68916 [router.py:message_loop():75] message_loop has been closed
grafted/wandb/run-20241127_213407-torch-grafted-redux/run-torch-grafted-redux.wandb ADDED
Binary file (6.64 kB).
 
grafted/wandb/run-20241127_213838-torch-grafted-redux/files/config.yaml ADDED
@@ -0,0 +1,97 @@
1
+ _allow_dotted_keys:
2
+ value: false
3
+ _convert_dict:
4
+ value: true
5
+ _fields:
6
+ value:
7
+ ckpt_steps: 1000
8
+ evals: |
9
+ val:
10
+ data:
11
+ name: imagenet2012
12
+ split: validation
13
+ log_steps: 2500
14
+ loss_name: softmax_xent
15
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
16
+ key="label", key_result="labels")|keep("image", "labels")
17
+ type: classification
18
+ grad_clip_norm: 1
19
+ input: |
20
+ accum_freq: 8
21
+ batch_size: 1024
22
+ cache_raw: false
23
+ data:
24
+ name: imagenet2012
25
+ split: train
26
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
27
+ key="label", key_result="labels")|keep("image", "labels")
28
+ shuffle_buffer_size: 150000
29
+ log_training_steps: 50
30
+ loss: softmax_xent
31
+ lr: 0.001
32
+ mixup: |
33
+ fold_in: null
34
+ p: 0.2
35
+ model: |
36
+ pool_type: gap
37
+ posemb: sincos2d
38
+ rep_size: false
39
+ variant: S/16
40
+ model_name: vit
41
+ num_classes: 1000
42
+ optax: |
43
+ mu_dtype: bfloat16
44
+ optax_name: scale_by_adam
45
+ pp_modules:
46
+ - ops_general
47
+ - ops_image
48
+ - ops_text
49
+ - archive.randaug
50
+ schedule: |
51
+ decay_type: cosine
52
+ warmup_steps: 10000
53
+ seed: 0
54
+ total_epochs: 90
55
+ wd: 0.0001
56
+ _locked:
57
+ value: true
58
+ _sort_keys:
59
+ value: true
60
+ _type_safe:
61
+ value: true
62
+ _wandb:
63
+ value:
64
+ cli_version: 0.18.7
65
+ m: []
66
+ python_version: 3.11.10
67
+ t:
68
+ "1":
69
+ - 1
70
+ - 2
71
+ - 3
72
+ - 12
73
+ - 41
74
+ - 45
75
+ - 55
76
+ "2":
77
+ - 1
78
+ - 2
79
+ - 3
80
+ - 12
81
+ - 41
82
+ - 45
83
+ - 55
84
+ "3":
85
+ - 5
86
+ - 13
87
+ - 14
88
+ - 16
89
+ - 23
90
+ - 55
91
+ - 62
92
+ "4": 3.11.10
93
+ "5": 0.18.7
94
+ "8":
95
+ - 5
96
+ "12": 0.18.7
97
+ "13": linux-x86_64
grafted/wandb/run-20241127_213838-torch-grafted-redux/files/output.log ADDED
@@ -0,0 +1,152 @@
1
+ I1127 21:38:39.183649 134109837835136 train.py:125] NOTE: Initializing train dataset...
2
+ I1127 21:38:39.183780 134109837835136 train.py:125] NOTE: Global batch size 1024 on 1 hosts results in 1024 local batch size. With 1 dev per host (1 dev total), that's a 1024 per-device batch size.
3
+ I1127 21:38:39.334601 134109837835136 dataset_info.py:707] Load dataset info from /data/tensorflow_datasets/imagenet2012/5.1.0
4
+ I1127 21:38:39.359155 134109837835136 reader.py:261] Creating a tf.data.Dataset reading 1024 files located in folders: /data/tensorflow_datasets/imagenet2012/5.1.0.
5
+ WARNING:tensorflow:From /home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow_datasets/core/reader.py:101: CounterV2 (from tensorflow.python.data.experimental.ops.counter) is deprecated and will be removed in a future version.
6
+ Instructions for updating:
7
+ Use `tf.data.Dataset.counter(...)` instead.
8
+ W1127 21:38:39.392815 134109837835136 deprecation.py:50] From /home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow_datasets/core/reader.py:101: CounterV2 (from tensorflow.python.data.experimental.ops.counter) is deprecated and will be removed in a future version.
9
+ Instructions for updating:
10
+ Use `tf.data.Dataset.counter(...)` instead.
11
+ I1127 21:38:39.415093 134109837835136 logging_logger.py:49] Constructing tf.data.Dataset imagenet2012 for split _EvenSplit(split='train', index=0, count=1, drop_remainder=False), from /data/tensorflow_datasets/imagenet2012/5.1.0
12
+ I1127 21:38:39.481757 134109837835136 api.py:460] Data before pre-processing:
13
+ {'file_name': <tf.Tensor 'args_1:0' shape=() dtype=string>, 'image': <tf.Tensor 'args_2:0' shape=() dtype=string>, 'label': <tf.Tensor 'args_3:0' shape=() dtype=int64>, 'tfds_id': <tf.Tensor 'args_4:0' shape=() dtype=string>, '_id': <tf.Tensor 'args_0:0' shape=() dtype=int32>}
14
+ INFO:tensorflow:Using RandAug.
15
+ I1127 21:38:39.744788 134109837835136 api.py:460] Using RandAug.
16
+ WARNING:tensorflow:From /home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow/python/util/dispatch.py:1260: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
17
+ Instructions for updating:
18
+ Use `tf.cast` instead.
19
+ W1127 21:38:39.922924 134109837835136 deprecation.py:50] From /home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow/python/util/dispatch.py:1260: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
20
+ Instructions for updating:
21
+ Use `tf.cast` instead.
22
+ I1127 21:38:42.147573 134109837835136 api.py:460] Data after pre-processing:
23
+ {'image': <tf.Tensor 'add:0' shape=(224, 224, 3) dtype=float32>, 'labels': <tf.Tensor 'one_hot:0' shape=(1000,) dtype=float32>}
24
+ I1127 21:38:42.218924 134109837835136 train.py:125] NOTE: Running for 112603 steps, that means 90.000345 epochs
25
+ I1127 21:38:42.850795 134109837835136 train.py:125] NOTE: Creating model...
26
+ Weight decay for: conv_proj.weight
27
+ Weight decay for: encoder.layers.encoder_layer_0.self_attention.in_proj_weight
28
+ Weight decay for: encoder.layers.encoder_layer_0.self_attention.out_proj.weight
29
+ Weight decay for: encoder.layers.encoder_layer_0.mlp.0.weight
30
+ Weight decay for: encoder.layers.encoder_layer_0.mlp.3.weight
31
+ Weight decay for: encoder.layers.encoder_layer_1.self_attention.in_proj_weight
32
+ Weight decay for: encoder.layers.encoder_layer_1.self_attention.out_proj.weight
33
+ Weight decay for: encoder.layers.encoder_layer_1.mlp.0.weight
34
+ Weight decay for: encoder.layers.encoder_layer_1.mlp.3.weight
35
+ Weight decay for: encoder.layers.encoder_layer_2.self_attention.in_proj_weight
36
+ Weight decay for: encoder.layers.encoder_layer_2.self_attention.out_proj.weight
37
+ Weight decay for: encoder.layers.encoder_layer_2.mlp.0.weight
38
+ Weight decay for: encoder.layers.encoder_layer_2.mlp.3.weight
39
+ Weight decay for: encoder.layers.encoder_layer_3.self_attention.in_proj_weight
40
+ Weight decay for: encoder.layers.encoder_layer_3.self_attention.out_proj.weight
41
+ Weight decay for: encoder.layers.encoder_layer_3.mlp.0.weight
42
+ Weight decay for: encoder.layers.encoder_layer_3.mlp.3.weight
43
+ Weight decay for: encoder.layers.encoder_layer_4.self_attention.in_proj_weight
44
+ Weight decay for: encoder.layers.encoder_layer_4.self_attention.out_proj.weight
45
+ Weight decay for: encoder.layers.encoder_layer_4.mlp.0.weight
46
+ Weight decay for: encoder.layers.encoder_layer_4.mlp.3.weight
47
+ Weight decay for: encoder.layers.encoder_layer_5.self_attention.in_proj_weight
48
+ Weight decay for: encoder.layers.encoder_layer_5.self_attention.out_proj.weight
49
+ Weight decay for: encoder.layers.encoder_layer_5.mlp.0.weight
50
+ Weight decay for: encoder.layers.encoder_layer_5.mlp.3.weight
51
+ Weight decay for: encoder.layers.encoder_layer_6.self_attention.in_proj_weight
52
+ Weight decay for: encoder.layers.encoder_layer_6.self_attention.out_proj.weight
53
+ Weight decay for: encoder.layers.encoder_layer_6.mlp.0.weight
54
+ Weight decay for: encoder.layers.encoder_layer_6.mlp.3.weight
55
+ Weight decay for: encoder.layers.encoder_layer_7.self_attention.in_proj_weight
56
+ Weight decay for: encoder.layers.encoder_layer_7.self_attention.out_proj.weight
57
+ Weight decay for: encoder.layers.encoder_layer_7.mlp.0.weight
58
+ Weight decay for: encoder.layers.encoder_layer_7.mlp.3.weight
59
+ Weight decay for: encoder.layers.encoder_layer_8.self_attention.in_proj_weight
60
+ Weight decay for: encoder.layers.encoder_layer_8.self_attention.out_proj.weight
61
+ Weight decay for: encoder.layers.encoder_layer_8.mlp.0.weight
62
+ Weight decay for: encoder.layers.encoder_layer_8.mlp.3.weight
63
+ Weight decay for: encoder.layers.encoder_layer_9.self_attention.in_proj_weight
64
+ Weight decay for: encoder.layers.encoder_layer_9.self_attention.out_proj.weight
65
+ Weight decay for: encoder.layers.encoder_layer_9.mlp.0.weight
66
+ Weight decay for: encoder.layers.encoder_layer_9.mlp.3.weight
67
+ Weight decay for: encoder.layers.encoder_layer_10.self_attention.in_proj_weight
68
+ Weight decay for: encoder.layers.encoder_layer_10.self_attention.out_proj.weight
69
+ Weight decay for: encoder.layers.encoder_layer_10.mlp.0.weight
70
+ Weight decay for: encoder.layers.encoder_layer_10.mlp.3.weight
71
+ Weight decay for: encoder.layers.encoder_layer_11.self_attention.in_proj_weight
72
+ Weight decay for: encoder.layers.encoder_layer_11.self_attention.out_proj.weight
73
+ Weight decay for: encoder.layers.encoder_layer_11.mlp.0.weight
74
+ Weight decay for: encoder.layers.encoder_layer_11.mlp.3.weight
75
+ Weight decay for: heads.head.weight
76
+ Weight decay for: conv_proj.weight
77
+ Weight decay for: encoder.layers.encoder_layer_0.self_attention.in_proj_weight
78
+ Weight decay for: encoder.layers.encoder_layer_0.self_attention.out_proj.weight
79
+ Weight decay for: encoder.layers.encoder_layer_0.mlp.0.weight
80
+ Weight decay for: encoder.layers.encoder_layer_0.mlp.3.weight
81
+ Weight decay for: encoder.layers.encoder_layer_1.self_attention.in_proj_weight
82
+ Weight decay for: encoder.layers.encoder_layer_1.self_attention.out_proj.weight
83
+ Weight decay for: encoder.layers.encoder_layer_1.mlp.0.weight
84
+ Weight decay for: encoder.layers.encoder_layer_1.mlp.3.weight
85
+ Weight decay for: encoder.layers.encoder_layer_2.self_attention.in_proj_weight
86
+ Weight decay for: encoder.layers.encoder_layer_2.self_attention.out_proj.weight
87
+ Weight decay for: encoder.layers.encoder_layer_2.mlp.0.weight
88
+ Weight decay for: encoder.layers.encoder_layer_2.mlp.3.weight
89
+ Weight decay for: encoder.layers.encoder_layer_3.self_attention.in_proj_weight
90
+ Weight decay for: encoder.layers.encoder_layer_3.self_attention.out_proj.weight
91
+ Weight decay for: encoder.layers.encoder_layer_3.mlp.0.weight
92
+ Weight decay for: encoder.layers.encoder_layer_3.mlp.3.weight
93
+ Weight decay for: encoder.layers.encoder_layer_4.self_attention.in_proj_weight
94
+ Weight decay for: encoder.layers.encoder_layer_4.self_attention.out_proj.weight
95
+ Weight decay for: encoder.layers.encoder_layer_4.mlp.0.weight
96
+ Weight decay for: encoder.layers.encoder_layer_4.mlp.3.weight
97
+ Weight decay for: encoder.layers.encoder_layer_5.self_attention.in_proj_weight
98
+ Weight decay for: encoder.layers.encoder_layer_5.self_attention.out_proj.weight
99
+ Weight decay for: encoder.layers.encoder_layer_5.mlp.0.weight
100
+ Weight decay for: encoder.layers.encoder_layer_5.mlp.3.weight
101
+ Weight decay for: encoder.layers.encoder_layer_6.self_attention.in_proj_weight
102
+ Weight decay for: encoder.layers.encoder_layer_6.self_attention.out_proj.weight
103
+ Weight decay for: encoder.layers.encoder_layer_6.mlp.0.weight
104
+ Weight decay for: encoder.layers.encoder_layer_6.mlp.3.weight
105
+ Weight decay for: encoder.layers.encoder_layer_7.self_attention.in_proj_weight
106
+ Weight decay for: encoder.layers.encoder_layer_7.self_attention.out_proj.weight
107
+ Weight decay for: encoder.layers.encoder_layer_7.mlp.0.weight
108
+ Weight decay for: encoder.layers.encoder_layer_7.mlp.3.weight
109
+ Weight decay for: encoder.layers.encoder_layer_8.self_attention.in_proj_weight
110
+ Weight decay for: encoder.layers.encoder_layer_8.self_attention.out_proj.weight
111
+ Weight decay for: encoder.layers.encoder_layer_8.mlp.0.weight
112
+ Weight decay for: encoder.layers.encoder_layer_8.mlp.3.weight
113
+ Weight decay for: encoder.layers.encoder_layer_9.self_attention.in_proj_weight
114
+ Weight decay for: encoder.layers.encoder_layer_9.self_attention.out_proj.weight
115
+ Weight decay for: encoder.layers.encoder_layer_9.mlp.0.weight
116
+ Weight decay for: encoder.layers.encoder_layer_9.mlp.3.weight
117
+ Weight decay for: encoder.layers.encoder_layer_10.self_attention.in_proj_weight
118
+ Weight decay for: encoder.layers.encoder_layer_10.self_attention.out_proj.weight
119
+ Weight decay for: encoder.layers.encoder_layer_10.mlp.0.weight
120
+ Weight decay for: encoder.layers.encoder_layer_10.mlp.3.weight
121
+ Weight decay for: encoder.layers.encoder_layer_11.self_attention.in_proj_weight
122
+ Weight decay for: encoder.layers.encoder_layer_11.self_attention.out_proj.weight
123
+ Weight decay for: encoder.layers.encoder_layer_11.mlp.0.weight
124
+ Weight decay for: encoder.layers.encoder_layer_11.mlp.3.weight
125
+ Weight decay for: heads.head.weight
126
+ I1127 21:38:43.733023 134109837835136 train.py:125] NOTE: Running initial or final evals...
127
+ I1127 21:38:43.733439 134109837835136 train.py:125] NOTE: Init evaluator: val…
128
+ Steps:0/112603 [0.0%]
129
+ I1127 21:38:43.735805 134109837835136 reader.py:261] Creating a tf.data.Dataset reading 64 files located in folders: /data/tensorflow_datasets/imagenet2012/5.1.0.
130
+ I1127 21:38:43.766425 134109837835136 logging_logger.py:49] Constructing tf.data.Dataset imagenet2012 for split _EvenSplit(split='validation', index=0, count=1, drop_remainder=False), from /data/tensorflow_datasets/imagenet2012/5.1.0
131
+ I1127 21:38:43.800241 134109837835136 api.py:460] Data before pre-processing:
132
+ {'file_name': <tf.Tensor 'args_1:0' shape=() dtype=string>, 'image': <tf.Tensor 'args_2:0' shape=() dtype=string>, 'label': <tf.Tensor 'args_3:0' shape=() dtype=int64>, 'tfds_id': <tf.Tensor 'args_4:0' shape=() dtype=string>, '_id': <tf.Tensor 'args_0:0' shape=() dtype=int32>}
133
+ I1127 21:38:43.969007 134109837835136 api.py:460] Data after pre-processing:
134
+ {'image': <tf.Tensor 'add:0' shape=(224, 224, 3) dtype=float32>, 'labels': <tf.Tensor 'one_hot:0' shape=(1000,) dtype=float32>}
135
+ I1127 21:38:44.062357 134109837835136 train.py:125] NOTE: val evaluation...
136
+ Steps:0/112603 [0.0%]
+ Traceback (most recent call last):
+ File "<frozen runpy>", line 198, in _run_module_as_main
+ File "<frozen runpy>", line 88, in _run_code
+ File "/home/jason-chou/Downloads/big_vision/big_vision/train.py", line 396, in <module>
+ app.run(main)
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/absl/app.py", line 308, in run
+ _run_main(main, args)
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/absl/app.py", line 254, in _run_main
+ sys.exit(main(argv))
+ ^^^^^^^^^^
+ File "/home/jason-chou/Downloads/big_vision/big_vision/train.py", line 296, in main
+ for key, value in evaluator.run(model, criterion, config.input.accum_freq):
+ File "/home/jason-chou/Downloads/big_vision/big_vision/evaluators/classification.py", line 74, in run
+ images, target = torch.from_dlpack(dlpack.asdlpack(images)), torch.from_dlpack(dlpack.asdlpack(target))
+ ^^^^^^^^^^^^^^^
+ AttributeError: module 'dlpack' has no attribute 'asdlpack'
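This run crashed because the installed dlpack==0.1 module (see requirements.txt) has no asdlpack attribute. One way to hand the TF eager tensors to PyTorch without that shim (a sketch, not the fix recorded in this commit) is TensorFlow's own DLPack bridge:

    import tensorflow as tf
    import torch
    from torch.utils.dlpack import from_dlpack

    def tf_to_torch(x):
        # Convert a TF eager tensor to a torch.Tensor, preferring a zero-copy DLPack handoff.
        try:
            return from_dlpack(tf.experimental.dlpack.to_dlpack(x))
        except Exception:
            return torch.from_numpy(x.numpy())  # safe fallback: copies through NumPy

    # e.g. images, target = tf_to_torch(images), tf_to_torch(target)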
grafted/wandb/run-20241127_213838-torch-grafted-redux/files/requirements.txt ADDED
@@ -0,0 +1,133 @@
1
+ array_record==0.5.1
2
+ wandb==0.18.7
3
+ nvidia-curand-cu12==10.3.2.106
4
+ requests-oauthlib==2.0.0
5
+ zipp==3.21.0
6
+ Werkzeug==3.1.3
7
+ simple-parsing==0.1.6
8
+ mdurl==0.1.2
9
+ keras==2.15.0
10
+ nvidia-cuda-nvcc-cu12==12.6.85
11
+ google-auth-oauthlib==1.2.1
12
+ jaxlib==0.4.34
13
+ tf_keras==2.15.1
14
+ oauthlib==3.2.2
15
+ tensorflow-probability==0.25.0
16
+ cachetools==5.5.0
17
+ Jinja2==3.1.3
18
+ rich==13.9.4
19
+ filelock==3.13.1
20
+ google-pasta==0.2.0
21
+ optax==0.2.4
22
+ toolz==1.0.0
23
+ gast==0.6.0
24
+ tensorboard==2.15.2
25
+ pyasn1_modules==0.4.1
26
+ nvidia-cudnn-cu12==9.1.0.70
27
+ opt_einsum==3.4.0
28
+ nvidia-nvjitlink-cu12==12.6.85
29
+ chex==0.1.87
30
+ namex==0.0.8
31
+ termcolor==2.5.0
32
+ flax==0.10.2
33
+ cloudpickle==3.1.0
34
+ numpy==1.26.4
35
+ nvidia-nccl-cu12==2.21.5
36
+ tensorflow-cpu==2.15.0
37
+ nvidia-cusolver-cu12==11.4.5.107
38
+ typing_extensions==4.12.2
39
+ tensorflow-addons==0.23.0
40
+ typeguard==2.13.3
41
+ absl-py==2.1.0
42
+ flatbuffers==24.3.25
43
+ dlpack==0.1
44
+ setuptools==65.5.0
45
+ protobuf==4.25.5
46
+ jax-cuda12-plugin==0.4.35
47
+ tensorflow==2.15.0
48
+ msgpack==1.1.0
49
+ networkx==3.2.1
50
+ docker-pycreds==0.4.0
51
+ nvidia-cuda-cupti-cu12==12.1.105
52
+ pillow==11.0.0
53
+ libclang==18.1.1
54
+ nvidia-cuda-nvrtc-cu12==12.1.105
55
+ distrax==0.1.5
56
+ orbax-checkpoint==0.10.1
57
+ PyYAML==6.0.2
58
+ urllib3==2.2.3
59
+ aqtp==0.8.2
60
+ tensorflow-metadata==1.16.1
61
+ etils==1.11.0
62
+ smmap==5.0.1
63
+ pyasn1==0.6.1
64
+ docstring_parser==0.16
65
+ google-auth==2.36.0
66
+ simplejson==3.19.3
67
+ mpmath==1.3.0
68
+ h5py==3.12.1
69
+ jax-cuda12-pjrt==0.4.35
70
+ tensorflow-io-gcs-filesystem==0.37.1
71
+ tensorflow-estimator==2.15.0
72
+ triton==3.1.0
73
+ rsa==4.9
74
+ panopticapi==0.1
75
+ tensorflow-hub==0.16.1
76
+ requests==2.32.3
77
+ scipy==1.14.1
78
+ ml-dtypes==0.2.0
79
+ markdown-it-py==3.0.0
80
+ tensorflow-text==2.15.0
81
+ wrapt==1.14.1
82
+ immutabledict==4.2.1
83
+ MarkupSafe==3.0.2
84
+ jax==0.4.35
85
+ torch==2.5.1+cu121
86
+ wheel==0.45.1
87
+ einops==0.8.0
88
+ sentry-sdk==2.19.0
89
+ torchvision==0.20.1+cu121
90
+ humanize==4.11.0
91
+ toml==0.10.2
92
+ tensorstore==0.1.69
93
+ six==1.16.0
94
+ promise==2.3
95
+ certifi==2024.8.30
96
+ nvidia-cuda-runtime-cu12==12.1.105
97
+ flaxformer==0.8.8
98
+ nvidia-cufft-cu12==11.0.2.54
99
+ psutil==6.1.0
100
+ GitPython==3.1.43
101
+ platformdirs==4.3.6
102
+ importlib_resources==6.4.5
103
+ tfds-nightly==4.9.7.dev202411280044
104
+ tensorflow-gan==2.1.0
105
+ googleapis-common-protos==1.66.0
106
+ overrides==7.7.0
107
+ optree==0.13.1
108
+ Pygments==2.18.0
109
+ astunparse==1.6.3
110
+ ml_collections==1.0.0
111
+ setproctitle==1.3.4
112
+ tensorboard-data-server==0.7.2
113
+ sympy==1.13.1
114
+ packaging==24.2
115
+ nest-asyncio==1.6.0
116
+ nvidia-cublas-cu12==12.1.3.1
117
+ gitdb==4.0.11
118
+ click==8.1.7
119
+ idna==3.10
120
+ tqdm==4.67.1
121
+ grpcio==1.68.0
122
+ decorator==5.1.1
123
+ pyarrow==18.1.0
124
+ clu==0.0.12
125
+ charset-normalizer==3.4.0
126
+ fsspec==2024.10.0
127
+ dm-tree==0.1.8
128
+ sentencepiece==0.2.0
129
+ nvidia-cusparse-cu12==12.1.0.106
130
+ torchaudio==2.5.1+cu121
131
+ pip==24.3.1
132
+ Markdown==3.7
133
+ nvidia-nvtx-cu12==12.1.105
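Note: this environment pins dlpack==0.1, a different PyPI project from pydlpack; the dlpack.asdlpack helper imported by the evaluator is presumably provided by pydlpack, which is pinned as pydlpack==0.2.1 in the next run's requirements. A quick sanity check, under the assumption that pydlpack is the distribution that installs the importable dlpack module:

# Does the importable 'dlpack' module expose asdlpack?
import importlib
dlpack = importlib.import_module("dlpack")
print(hasattr(dlpack, "asdlpack"))  # False with dlpack==0.1; expected True once pydlpack is installed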
grafted/wandb/run-20241127_213838-torch-grafted-redux/files/wandb-metadata.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "os": "Linux-6.8.0-49-generic-x86_64-with-glibc2.39",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-11-28T05:38:38.667951Z",
5
+ "args": [
6
+ "--config",
7
+ "/home/jason-chou/Downloads/big_vision/big_vision/configs/vit_s16_i1k_single_gpu_test.py",
8
+ "--workdir",
9
+ "/data/imagenet/grafted",
10
+ "--name",
11
+ "torch-grafted-redux"
12
+ ],
13
+ "program": "-m big_vision.train",
14
+ "git": {
15
+ "remote": "https://github.com/EIFY/big_vision.git",
16
+ "commit": "44649a64ff67e709f55cdb5e3adcf52064b17de5"
17
+ },
18
+ "email": "[email protected]",
19
+ "root": "/home/jason-chou/Downloads/big_vision",
20
+ "host": "jasonchou-TensorBook-late-2021",
21
+ "username": "jason-chou",
22
+ "executable": "/home/jason-chou/.pyenv/versions/3.11.10/bin/python",
23
+ "cpu_count": 8,
24
+ "cpu_count_logical": 16,
25
+ "gpu": "NVIDIA GeForce RTX 3080 Laptop GPU",
26
+ "gpu_count": 1,
27
+ "disk": {
28
+ "/": {
29
+ "total": "1006450962432",
30
+ "used": "584311005184"
31
+ }
32
+ },
33
+ "memory": {
34
+ "total": "67162914816"
35
+ },
36
+ "cpu": {
37
+ "count": 8,
38
+ "countLogical": 16
39
+ },
40
+ "gpu_nvidia": [
41
+ {
42
+ "name": "NVIDIA GeForce RTX 3080 Laptop GPU",
43
+ "memoryTotal": "17179869184",
44
+ "cudaCores": 6144,
45
+ "architecture": "Ampere"
46
+ }
47
+ ],
48
+ "cudaVersion": "12.2"
49
+ }
grafted/wandb/run-20241127_213838-torch-grafted-redux/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb":{"runtime":6}}
grafted/wandb/run-20241127_213838-torch-grafted-redux/logs/debug-internal.log ADDED
@@ -0,0 +1,18 @@
1
+ {"time":"2024-11-27T21:38:38.66921709-08:00","level":"INFO","msg":"using version","core version":"0.18.7"}
2
+ {"time":"2024-11-27T21:38:38.669224917-08:00","level":"INFO","msg":"created symlink","path":"/home/jason-chou/Downloads/big_vision/wandb/run-20241127_213838-torch-grafted-redux/logs/debug-core.log"}
3
+ {"time":"2024-11-27T21:38:38.772438836-08:00","level":"INFO","msg":"created new stream","id":"torch-grafted-redux"}
4
+ {"time":"2024-11-27T21:38:38.772485797-08:00","level":"INFO","msg":"stream: started","id":"torch-grafted-redux"}
5
+ {"time":"2024-11-27T21:38:38.772617036-08:00","level":"INFO","msg":"writer: Do: started","stream_id":"torch-grafted-redux"}
6
+ {"time":"2024-11-27T21:38:38.77266667-08:00","level":"INFO","msg":"sender: started","stream_id":"torch-grafted-redux"}
7
+ {"time":"2024-11-27T21:38:38.772762681-08:00","level":"INFO","msg":"handler: started","stream_id":"torch-grafted-redux"}
8
+ {"time":"2024-11-27T21:38:39.13369457-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-27T21:38:45.284216421-08:00","level":"INFO","msg":"stream: closing","id":"torch-grafted-redux"}
10
+ {"time":"2024-11-27T21:38:45.284267357-08:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2024-11-27T21:38:45.285410301-08:00","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2024-11-27T21:38:45.379673564-08:00","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
13
+ {"time":"2024-11-27T21:38:45.37971701-08:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
14
+ {"time":"2024-11-27T21:38:45.839243387-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
15
+ {"time":"2024-11-27T21:38:45.960027486-08:00","level":"INFO","msg":"handler: closed","stream_id":"torch-grafted-redux"}
16
+ {"time":"2024-11-27T21:38:45.960124482-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"torch-grafted-redux"}
17
+ {"time":"2024-11-27T21:38:45.96015466-08:00","level":"INFO","msg":"sender: closed","stream_id":"torch-grafted-redux"}
18
+ {"time":"2024-11-27T21:38:45.960287984-08:00","level":"INFO","msg":"stream: closed","id":"torch-grafted-redux"}
grafted/wandb/run-20241127_213838-torch-grafted-redux/logs/debug.log ADDED
@@ -0,0 +1,55 @@
1
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7
2
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_setup.py:_flush():79] Configure stats pid to 69602
3
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/.config/wandb/settings
4
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/Downloads/big_vision/wandb/settings
5
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-27 21:38:38,665 WARNING MainThread:69602 [wandb_setup.py:_flush():79] Could not find program at -m big_vision.train
8
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m big_vision.train'}
9
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_init.py:_log_setup():533] Logging user logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_213838-torch-grafted-redux/logs/debug.log
11
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_init.py:_log_setup():534] Logging internal logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_213838-torch-grafted-redux/logs/debug-internal.log
12
+ 2024-11-27 21:38:38,665 INFO MainThread:69602 [wandb_init.py:init():619] calling init triggers
13
+ 2024-11-27 21:38:38,667 INFO MainThread:69602 [wandb_init.py:init():626] wandb.init called with sweep_config: {}
14
+ config: {'_fields': {'seed': 0, 'total_epochs': 90, 'num_classes': 1000, 'loss': 'softmax_xent', 'input': accum_freq: 8
15
+ batch_size: 1024
16
+ cache_raw: false
17
+ data:
18
+ name: imagenet2012
19
+ split: train
20
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
21
+ key="label", key_result="labels")|keep("image", "labels")
22
+ shuffle_buffer_size: 150000
23
+ , 'pp_modules': ['ops_general', 'ops_image', 'ops_text', 'archive.randaug'], 'log_training_steps': 50, 'ckpt_steps': 1000, 'model_name': 'vit', 'model': pool_type: gap
24
+ posemb: sincos2d
25
+ rep_size: false
26
+ variant: S/16
27
+ , 'grad_clip_norm': 1.0, 'optax_name': 'scale_by_adam', 'optax': mu_dtype: bfloat16
28
+ , 'lr': 0.001, 'wd': 0.0001, 'schedule': decay_type: cosine
29
+ warmup_steps: 10000
30
+ , 'mixup': fold_in: null
31
+ p: 0.2
32
+ , 'evals': val:
33
+ data:
34
+ name: imagenet2012
35
+ split: validation
36
+ log_steps: 2500
37
+ loss_name: softmax_xent
38
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
39
+ key="label", key_result="labels")|keep("image", "labels")
40
+ type: classification
41
+ }, '_locked': True, '_type_safe': True, '_convert_dict': True, '_allow_dotted_keys': False, '_sort_keys': True}
42
+ 2024-11-27 21:38:38,667 INFO MainThread:69602 [wandb_init.py:init():669] starting backend
43
+ 2024-11-27 21:38:38,667 INFO MainThread:69602 [wandb_init.py:init():673] sending inform_init request
44
+ 2024-11-27 21:38:38,667 INFO MainThread:69602 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
45
+ 2024-11-27 21:38:38,667 INFO MainThread:69602 [wandb_init.py:init():686] backend started and connected
46
+ 2024-11-27 21:38:38,670 INFO MainThread:69602 [wandb_init.py:init():781] updated telemetry
47
+ 2024-11-27 21:38:38,672 INFO MainThread:69602 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout
48
+ 2024-11-27 21:38:39,116 INFO MainThread:69602 [wandb_init.py:init():859] run resumed
49
+ 2024-11-27 21:38:39,131 INFO MainThread:69602 [wandb_init.py:init():867] starting run threads in backend
50
+ 2024-11-27 21:38:39,182 INFO MainThread:69602 [wandb_run.py:_console_start():2456] atexit reg
51
+ 2024-11-27 21:38:39,182 INFO MainThread:69602 [wandb_run.py:_redirect():2305] redirect: wrap_raw
52
+ 2024-11-27 21:38:39,182 INFO MainThread:69602 [wandb_run.py:_redirect():2370] Wrapping output streams.
53
+ 2024-11-27 21:38:39,182 INFO MainThread:69602 [wandb_run.py:_redirect():2395] Redirects installed.
54
+ 2024-11-27 21:38:39,183 INFO MainThread:69602 [wandb_init.py:init():911] run started, returning control to user process
55
+ 2024-11-27 21:38:45,287 WARNING MsgRouterThr:69602 [router.py:message_loop():75] message_loop has been closed
grafted/wandb/run-20241127_213838-torch-grafted-redux/run-torch-grafted-redux.wandb ADDED
Binary file (46.3 kB). View file
 
grafted/wandb/run-20241127_214327-torch-grafted-redux/files/config.yaml ADDED
@@ -0,0 +1,97 @@
1
+ _allow_dotted_keys:
2
+ value: false
3
+ _convert_dict:
4
+ value: true
5
+ _fields:
6
+ value:
7
+ ckpt_steps: 1000
8
+ evals: |
9
+ val:
10
+ data:
11
+ name: imagenet2012
12
+ split: validation
13
+ log_steps: 2500
14
+ loss_name: softmax_xent
15
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
16
+ key="label", key_result="labels")|keep("image", "labels")
17
+ type: classification
18
+ grad_clip_norm: 1
19
+ input: |
20
+ accum_freq: 8
21
+ batch_size: 1024
22
+ cache_raw: false
23
+ data:
24
+ name: imagenet2012
25
+ split: train
26
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
27
+ key="label", key_result="labels")|keep("image", "labels")
28
+ shuffle_buffer_size: 150000
29
+ log_training_steps: 50
30
+ loss: softmax_xent
31
+ lr: 0.001
32
+ mixup: |
33
+ fold_in: null
34
+ p: 0.2
35
+ model: |
36
+ pool_type: gap
37
+ posemb: sincos2d
38
+ rep_size: false
39
+ variant: S/16
40
+ model_name: vit
41
+ num_classes: 1000
42
+ optax: |
43
+ mu_dtype: bfloat16
44
+ optax_name: scale_by_adam
45
+ pp_modules:
46
+ - ops_general
47
+ - ops_image
48
+ - ops_text
49
+ - archive.randaug
50
+ schedule: |
51
+ decay_type: cosine
52
+ warmup_steps: 10000
53
+ seed: 0
54
+ total_epochs: 90
55
+ wd: 0.0001
56
+ _locked:
57
+ value: true
58
+ _sort_keys:
59
+ value: true
60
+ _type_safe:
61
+ value: true
62
+ _wandb:
63
+ value:
64
+ cli_version: 0.18.7
65
+ m: []
66
+ python_version: 3.11.10
67
+ t:
68
+ "1":
69
+ - 1
70
+ - 2
71
+ - 3
72
+ - 12
73
+ - 41
74
+ - 45
75
+ - 55
76
+ "2":
77
+ - 1
78
+ - 2
79
+ - 3
80
+ - 12
81
+ - 41
82
+ - 45
83
+ - 55
84
+ "3":
85
+ - 5
86
+ - 13
87
+ - 14
88
+ - 16
89
+ - 23
90
+ - 55
91
+ - 62
92
+ "4": 3.11.10
93
+ "5": 0.18.7
94
+ "8":
95
+ - 5
96
+ "12": 0.18.7
97
+ "13": linux-x86_64
grafted/wandb/run-20241127_214327-torch-grafted-redux/files/output.log ADDED
@@ -0,0 +1,211 @@
1
+ I1127 21:43:28.307721 123439371660160 train.py:125] NOTE: Initializing train dataset...
2
+ I1127 21:43:28.307887 123439371660160 train.py:125] NOTE: Global batch size 1024 on 1 hosts results in 1024 local batch size. With 1 dev per host (1 dev total), that's a 1024 per-device batch size.
3
+ I1127 21:43:28.459977 123439371660160 dataset_info.py:707] Load dataset info from /data/tensorflow_datasets/imagenet2012/5.1.0
4
+ I1127 21:43:28.478083 123439371660160 reader.py:261] Creating a tf.data.Dataset reading 1024 files located in folders: /data/tensorflow_datasets/imagenet2012/5.1.0.
5
+ WARNING:tensorflow:From /home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow_datasets/core/reader.py:101: CounterV2 (from tensorflow.python.data.experimental.ops.counter) is deprecated and will be removed in a future version.
6
+ Instructions for updating:
7
+ Use `tf.data.Dataset.counter(...)` instead.
8
+ W1127 21:43:28.511688 123439371660160 deprecation.py:50] From /home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow_datasets/core/reader.py:101: CounterV2 (from tensorflow.python.data.experimental.ops.counter) is deprecated and will be removed in a future version.
9
+ Instructions for updating:
10
+ Use `tf.data.Dataset.counter(...)` instead.
11
+ I1127 21:43:28.534458 123439371660160 logging_logger.py:49] Constructing tf.data.Dataset imagenet2012 for split _EvenSplit(split='train', index=0, count=1, drop_remainder=False), from /data/tensorflow_datasets/imagenet2012/5.1.0
12
+ I1127 21:43:28.601723 123439371660160 api.py:460] Data before pre-processing:
13
+ {'file_name': <tf.Tensor 'args_1:0' shape=() dtype=string>, 'image': <tf.Tensor 'args_2:0' shape=() dtype=string>, 'label': <tf.Tensor 'args_3:0' shape=() dtype=int64>, 'tfds_id': <tf.Tensor 'args_4:0' shape=() dtype=string>, '_id': <tf.Tensor 'args_0:0' shape=() dtype=int32>}
14
+ INFO:tensorflow:Using RandAug.
15
+ I1127 21:43:28.866792 123439371660160 api.py:460] Using RandAug.
16
+ WARNING:tensorflow:From /home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow/python/util/dispatch.py:1260: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
17
+ Instructions for updating:
18
+ Use `tf.cast` instead.
19
+ W1127 21:43:29.046336 123439371660160 deprecation.py:50] From /home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/tensorflow/python/util/dispatch.py:1260: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
20
+ Instructions for updating:
21
+ Use `tf.cast` instead.
22
+ I1127 21:43:31.278083 123439371660160 api.py:460] Data after pre-processing:
23
+ {'image': <tf.Tensor 'add:0' shape=(224, 224, 3) dtype=float32>, 'labels': <tf.Tensor 'one_hot:0' shape=(1000,) dtype=float32>}
24
+ I1127 21:43:31.349534 123439371660160 train.py:125] NOTE: Running for 112603 steps, that means 90.000345 epochs
25
+ I1127 21:43:31.991644 123439371660160 train.py:125] NOTE: Creating model...
26
+ Weight decay for: conv_proj.weight
27
+ Weight decay for: encoder.layers.encoder_layer_0.self_attention.in_proj_weight
28
+ Weight decay for: encoder.layers.encoder_layer_0.self_attention.out_proj.weight
29
+ Weight decay for: encoder.layers.encoder_layer_0.mlp.0.weight
30
+ Weight decay for: encoder.layers.encoder_layer_0.mlp.3.weight
31
+ Weight decay for: encoder.layers.encoder_layer_1.self_attention.in_proj_weight
32
+ Weight decay for: encoder.layers.encoder_layer_1.self_attention.out_proj.weight
33
+ Weight decay for: encoder.layers.encoder_layer_1.mlp.0.weight
34
+ Weight decay for: encoder.layers.encoder_layer_1.mlp.3.weight
35
+ Weight decay for: encoder.layers.encoder_layer_2.self_attention.in_proj_weight
36
+ Weight decay for: encoder.layers.encoder_layer_2.self_attention.out_proj.weight
37
+ Weight decay for: encoder.layers.encoder_layer_2.mlp.0.weight
38
+ Weight decay for: encoder.layers.encoder_layer_2.mlp.3.weight
39
+ Weight decay for: encoder.layers.encoder_layer_3.self_attention.in_proj_weight
40
+ Weight decay for: encoder.layers.encoder_layer_3.self_attention.out_proj.weight
41
+ Weight decay for: encoder.layers.encoder_layer_3.mlp.0.weight
42
+ Weight decay for: encoder.layers.encoder_layer_3.mlp.3.weight
43
+ Weight decay for: encoder.layers.encoder_layer_4.self_attention.in_proj_weight
44
+ Weight decay for: encoder.layers.encoder_layer_4.self_attention.out_proj.weight
45
+ Weight decay for: encoder.layers.encoder_layer_4.mlp.0.weight
46
+ Weight decay for: encoder.layers.encoder_layer_4.mlp.3.weight
47
+ Weight decay for: encoder.layers.encoder_layer_5.self_attention.in_proj_weight
48
+ Weight decay for: encoder.layers.encoder_layer_5.self_attention.out_proj.weight
49
+ Weight decay for: encoder.layers.encoder_layer_5.mlp.0.weight
50
+ Weight decay for: encoder.layers.encoder_layer_5.mlp.3.weight
51
+ Weight decay for: encoder.layers.encoder_layer_6.self_attention.in_proj_weight
52
+ Weight decay for: encoder.layers.encoder_layer_6.self_attention.out_proj.weight
53
+ Weight decay for: encoder.layers.encoder_layer_6.mlp.0.weight
54
+ Weight decay for: encoder.layers.encoder_layer_6.mlp.3.weight
55
+ Weight decay for: encoder.layers.encoder_layer_7.self_attention.in_proj_weight
56
+ Weight decay for: encoder.layers.encoder_layer_7.self_attention.out_proj.weight
57
+ Weight decay for: encoder.layers.encoder_layer_7.mlp.0.weight
58
+ Weight decay for: encoder.layers.encoder_layer_7.mlp.3.weight
59
+ Weight decay for: encoder.layers.encoder_layer_8.self_attention.in_proj_weight
60
+ Weight decay for: encoder.layers.encoder_layer_8.self_attention.out_proj.weight
61
+ Weight decay for: encoder.layers.encoder_layer_8.mlp.0.weight
62
+ Weight decay for: encoder.layers.encoder_layer_8.mlp.3.weight
63
+ Weight decay for: encoder.layers.encoder_layer_9.self_attention.in_proj_weight
64
+ Weight decay for: encoder.layers.encoder_layer_9.self_attention.out_proj.weight
65
+ Weight decay for: encoder.layers.encoder_layer_9.mlp.0.weight
66
+ Weight decay for: encoder.layers.encoder_layer_9.mlp.3.weight
67
+ Weight decay for: encoder.layers.encoder_layer_10.self_attention.in_proj_weight
68
+ Weight decay for: encoder.layers.encoder_layer_10.self_attention.out_proj.weight
69
+ Weight decay for: encoder.layers.encoder_layer_10.mlp.0.weight
70
+ Weight decay for: encoder.layers.encoder_layer_10.mlp.3.weight
71
+ Weight decay for: encoder.layers.encoder_layer_11.self_attention.in_proj_weight
72
+ Weight decay for: encoder.layers.encoder_layer_11.self_attention.out_proj.weight
73
+ Weight decay for: encoder.layers.encoder_layer_11.mlp.0.weight
74
+ Weight decay for: encoder.layers.encoder_layer_11.mlp.3.weight
75
+ Weight decay for: heads.head.weight
76
+ Weight decay for: conv_proj.weight
77
+ Weight decay for: encoder.layers.encoder_layer_0.self_attention.in_proj_weight
78
+ Weight decay for: encoder.layers.encoder_layer_0.self_attention.out_proj.weight
79
+ Weight decay for: encoder.layers.encoder_layer_0.mlp.0.weight
80
+ Weight decay for: encoder.layers.encoder_layer_0.mlp.3.weight
81
+ Weight decay for: encoder.layers.encoder_layer_1.self_attention.in_proj_weight
82
+ Weight decay for: encoder.layers.encoder_layer_1.self_attention.out_proj.weight
83
+ Weight decay for: encoder.layers.encoder_layer_1.mlp.0.weight
84
+ Weight decay for: encoder.layers.encoder_layer_1.mlp.3.weight
85
+ Weight decay for: encoder.layers.encoder_layer_2.self_attention.in_proj_weight
86
+ Weight decay for: encoder.layers.encoder_layer_2.self_attention.out_proj.weight
87
+ Weight decay for: encoder.layers.encoder_layer_2.mlp.0.weight
88
+ Weight decay for: encoder.layers.encoder_layer_2.mlp.3.weight
89
+ Weight decay for: encoder.layers.encoder_layer_3.self_attention.in_proj_weight
90
+ Weight decay for: encoder.layers.encoder_layer_3.self_attention.out_proj.weight
91
+ Weight decay for: encoder.layers.encoder_layer_3.mlp.0.weight
92
+ Weight decay for: encoder.layers.encoder_layer_3.mlp.3.weight
93
+ Weight decay for: encoder.layers.encoder_layer_4.self_attention.in_proj_weight
94
+ Weight decay for: encoder.layers.encoder_layer_4.self_attention.out_proj.weight
95
+ Weight decay for: encoder.layers.encoder_layer_4.mlp.0.weight
96
+ Weight decay for: encoder.layers.encoder_layer_4.mlp.3.weight
97
+ Weight decay for: encoder.layers.encoder_layer_5.self_attention.in_proj_weight
98
+ Weight decay for: encoder.layers.encoder_layer_5.self_attention.out_proj.weight
99
+ Weight decay for: encoder.layers.encoder_layer_5.mlp.0.weight
100
+ Weight decay for: encoder.layers.encoder_layer_5.mlp.3.weight
101
+ Weight decay for: encoder.layers.encoder_layer_6.self_attention.in_proj_weight
102
+ Weight decay for: encoder.layers.encoder_layer_6.self_attention.out_proj.weight
103
+ Weight decay for: encoder.layers.encoder_layer_6.mlp.0.weight
104
+ Weight decay for: encoder.layers.encoder_layer_6.mlp.3.weight
105
+ Weight decay for: encoder.layers.encoder_layer_7.self_attention.in_proj_weight
106
+ Weight decay for: encoder.layers.encoder_layer_7.self_attention.out_proj.weight
107
+ Weight decay for: encoder.layers.encoder_layer_7.mlp.0.weight
108
+ Weight decay for: encoder.layers.encoder_layer_7.mlp.3.weight
109
+ Weight decay for: encoder.layers.encoder_layer_8.self_attention.in_proj_weight
110
+ Weight decay for: encoder.layers.encoder_layer_8.self_attention.out_proj.weight
111
+ Weight decay for: encoder.layers.encoder_layer_8.mlp.0.weight
112
+ Weight decay for: encoder.layers.encoder_layer_8.mlp.3.weight
113
+ Weight decay for: encoder.layers.encoder_layer_9.self_attention.in_proj_weight
114
+ Weight decay for: encoder.layers.encoder_layer_9.self_attention.out_proj.weight
115
+ Weight decay for: encoder.layers.encoder_layer_9.mlp.0.weight
116
+ Weight decay for: encoder.layers.encoder_layer_9.mlp.3.weight
117
+ Weight decay for: encoder.layers.encoder_layer_10.self_attention.in_proj_weight
118
+ Weight decay for: encoder.layers.encoder_layer_10.self_attention.out_proj.weight
119
+ Weight decay for: encoder.layers.encoder_layer_10.mlp.0.weight
120
+ Weight decay for: encoder.layers.encoder_layer_10.mlp.3.weight
121
+ Weight decay for: encoder.layers.encoder_layer_11.self_attention.in_proj_weight
122
+ Weight decay for: encoder.layers.encoder_layer_11.self_attention.out_proj.weight
123
+ Weight decay for: encoder.layers.encoder_layer_11.mlp.0.weight
124
+ Weight decay for: encoder.layers.encoder_layer_11.mlp.3.weight
125
+ Weight decay for: heads.head.weight
126
+ I1127 21:43:32.855304 123439371660160 train.py:125] NOTE: Running initial or final evals...
127
+ I1127 21:43:32.855672 123439371660160 train.py:125] NOTE: Init evaluator: val…
128
+ Steps:0/112603 [0.0%]
129
+ I1127 21:43:32.857478 123439371660160 reader.py:261] Creating a tf.data.Dataset reading 64 files located in folders: /data/tensorflow_datasets/imagenet2012/5.1.0.
130
+ I1127 21:43:32.889285 123439371660160 logging_logger.py:49] Constructing tf.data.Dataset imagenet2012 for split _EvenSplit(split='validation', index=0, count=1, drop_remainder=False), from /data/tensorflow_datasets/imagenet2012/5.1.0
131
+ I1127 21:43:32.923067 123439371660160 api.py:460] Data before pre-processing:
132
+ {'file_name': <tf.Tensor 'args_1:0' shape=() dtype=string>, 'image': <tf.Tensor 'args_2:0' shape=() dtype=string>, 'label': <tf.Tensor 'args_3:0' shape=() dtype=int64>, 'tfds_id': <tf.Tensor 'args_4:0' shape=() dtype=string>, '_id': <tf.Tensor 'args_0:0' shape=() dtype=int32>}
133
+ I1127 21:43:33.088486 123439371660160 api.py:460] Data after pre-processing:
134
+ {'image': <tf.Tensor 'add:0' shape=(224, 224, 3) dtype=float32>, 'labels': <tf.Tensor 'one_hot:0' shape=(1000,) dtype=float32>}
135
+ I1127 21:43:33.183008 123439371660160 train.py:125] NOTE: val evaluation...
136
+ Steps:0/112603 [0.0%]
137
+ I1127 21:44:58.298490 123439371660160 utils.py:1231] [0] val/acc@1 = 0.0045041454081632655
138
+ I1127 21:44:58.298775 123439371660160 utils.py:1231] [0] val/loss = 6.883525214633163
139
+ I1127 21:44:58.298943 123439371660160 utils.py:1231] [0] z/secs/eval/val = 85.11567897199711
140
+ I1127 21:44:58.299036 123439371660160 utils.py:560] TIMING[z/secs/eval/val]: 85.11567897199711
141
+ I1127 21:44:58.299128 123439371660160 train.py:125] NOTE: Starting training loop, compiling the first step...
142
+ Traceback (most recent call last):
143
+ File "<frozen runpy>", line 198, in _run_module_as_main
144
+ File "<frozen runpy>", line 88, in _run_code
145
+ File "/home/jason-chou/Downloads/big_vision/big_vision/train.py", line 396, in <module>
146
+ app.run(main)
147
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/absl/app.py", line 308, in run
148
+ _run_main(main, args)
149
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/absl/app.py", line 254, in _run_main
150
+ sys.exit(main(argv))
151
+ ^^^^^^^^^^
152
+ File "/home/jason-chou/Downloads/big_vision/big_vision/train.py", line 328, in main
153
+ output = model(img)
154
+ ^^^^^^^^^^
155
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
156
+ return self._call_impl(*args, **kwargs)
157
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
158
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
159
+ return forward_call(*args, **kwargs)
160
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
161
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 465, in _fn
162
+ return fn(*args, **kwargs)
163
+ ^^^^^^^^^^^^^^^^^^^
164
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
165
+ return self._call_impl(*args, **kwargs)
166
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
167
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
168
+ return forward_call(*args, **kwargs)
169
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
170
+ File "/home/jason-chou/Downloads/big_vision/big_vision/simple_vit.py", line 216, in forward
171
+ def forward(self, x: torch.Tensor):
172
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn
173
+ return fn(*args, **kwargs)
174
+ ^^^^^^^^^^^^^^^^^^^
175
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_functorch/aot_autograd.py", line 1100, in forward
176
+ return compiled_fn(full_args)
177
+ ^^^^^^^^^^^^^^^^^^^^^^
178
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 308, in runtime_wrapper
179
+ all_outs = call_func_at_runtime_with_args(
180
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
181
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/utils.py", line 124, in call_func_at_runtime_with_args
182
+ out = normalize_as_list(f(args))
183
+ ^^^^^^^
184
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/utils.py", line 98, in g
185
+ return f(*args)
186
+ ^^^^^^^^
187
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/autograd/function.py", line 575, in apply
188
+ return super().apply(*args, **kwargs) # type: ignore[misc]
189
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
190
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 1525, in forward
191
+ fw_outs = call_func_at_runtime_with_args(
192
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
193
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/utils.py", line 124, in call_func_at_runtime_with_args
194
+ out = normalize_as_list(f(args))
195
+ ^^^^^^^
196
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 488, in wrapper
197
+ return compiled_fn(runtime_args)
198
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
199
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 667, in inner_fn
200
+ outs = compiled_fn(args)
201
+ ^^^^^^^^^^^^^^^^^
202
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_inductor/codecache.py", line 1478, in __call__
203
+ return self.current_callable(inputs)
204
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
205
+ File "/home/jason-chou/.pyenv/versions/3.11.10/lib/python3.11/site-packages/torch/_inductor/utils.py", line 1977, in run
206
+ return model(new_inputs)
207
+ ^^^^^^^^^^^^^^^^^
208
+ File "/tmp/torchinductor_jason-chou/ka/ckacaj7mldyv4qgozpmsomr7zpv44qchrg7rtykk4mbmnm67wz36.py", line 1386, in call
209
+ buf141 = empty_strided_cuda((128, 6, 196, 64), (384, 64, 49152, 1), torch.float32)
210
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
211
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB. GPU 0 has a total capacity of 15.74 GiB of which 30.62 MiB is free. Including non-PyTorch memory, this process has 15.69 GiB memory in use. Of the allocated memory 3.51 GiB is allocated by PyTorch, and 176.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
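Note on this second run (21:43): it gets past the evaluator but OOMs while running the first compiled training step. With batch_size 1024 and accum_freq 8, each forward pass sees a 128-image micro-batch (visible in the failing (128, 6, 196, 64) attention buffer: 128 images, 6 heads, 196 patches, 64 dims per head), which does not fit alongside the compiled graph on the 16 GiB laptop GPU. Raising accum_freq (e.g. to 16) is one way to shrink the per-step activation footprint while keeping the effective batch at 1024. A sketch of the accumulation pattern implied by accum_freq (names are illustrative, not the repository's train.py):

import torch

def accumulated_step(model, criterion, opt, images, labels, accum_freq):
    opt.zero_grad(set_to_none=True)
    for img, lab in zip(images.chunk(accum_freq), labels.chunk(accum_freq)):
        loss = criterion(model(img), lab) / accum_freq        # average over micro-batches
        loss.backward()                                       # gradients accumulate in .grad
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # grad_clip_norm: 1 from the config
    opt.step()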
grafted/wandb/run-20241127_214327-torch-grafted-redux/files/requirements.txt ADDED
@@ -0,0 +1,134 @@
1
+ array_record==0.5.1
2
+ wandb==0.18.7
3
+ nvidia-curand-cu12==10.3.2.106
4
+ requests-oauthlib==2.0.0
5
+ zipp==3.21.0
6
+ Werkzeug==3.1.3
7
+ simple-parsing==0.1.6
8
+ mdurl==0.1.2
9
+ keras==2.15.0
10
+ nvidia-cuda-nvcc-cu12==12.6.85
11
+ google-auth-oauthlib==1.2.1
12
+ jaxlib==0.4.34
13
+ tf_keras==2.15.1
14
+ oauthlib==3.2.2
15
+ tensorflow-probability==0.25.0
16
+ cachetools==5.5.0
17
+ Jinja2==3.1.3
18
+ rich==13.9.4
19
+ filelock==3.13.1
20
+ google-pasta==0.2.0
21
+ optax==0.2.4
22
+ toolz==1.0.0
23
+ gast==0.6.0
24
+ tensorboard==2.15.2
25
+ pyasn1_modules==0.4.1
26
+ nvidia-cudnn-cu12==9.1.0.70
27
+ opt_einsum==3.4.0
28
+ nvidia-nvjitlink-cu12==12.6.85
29
+ chex==0.1.87
30
+ namex==0.0.8
31
+ termcolor==2.5.0
32
+ flax==0.10.2
33
+ cloudpickle==3.1.0
34
+ numpy==1.26.4
35
+ nvidia-nccl-cu12==2.21.5
36
+ tensorflow-cpu==2.15.0
37
+ nvidia-cusolver-cu12==11.4.5.107
38
+ typing_extensions==4.12.2
39
+ tensorflow-addons==0.23.0
40
+ typeguard==2.13.3
41
+ absl-py==2.1.0
42
+ flatbuffers==24.3.25
43
+ dlpack==0.1
44
+ setuptools==65.5.0
45
+ protobuf==4.25.5
46
+ jax-cuda12-plugin==0.4.35
47
+ tensorflow==2.15.0
48
+ msgpack==1.1.0
49
+ networkx==3.2.1
50
+ docker-pycreds==0.4.0
51
+ nvidia-cuda-cupti-cu12==12.1.105
52
+ pillow==11.0.0
53
+ libclang==18.1.1
54
+ nvidia-cuda-nvrtc-cu12==12.1.105
55
+ distrax==0.1.5
56
+ orbax-checkpoint==0.10.1
57
+ PyYAML==6.0.2
58
+ urllib3==2.2.3
59
+ aqtp==0.8.2
60
+ tensorflow-metadata==1.16.1
61
+ etils==1.11.0
62
+ smmap==5.0.1
63
+ pyasn1==0.6.1
64
+ docstring_parser==0.16
65
+ google-auth==2.36.0
66
+ simplejson==3.19.3
67
+ mpmath==1.3.0
68
+ h5py==3.12.1
69
+ jax-cuda12-pjrt==0.4.35
70
+ tensorflow-io-gcs-filesystem==0.37.1
71
+ tensorflow-estimator==2.15.0
72
+ pydlpack==0.2.1
73
+ triton==3.1.0
74
+ rsa==4.9
75
+ panopticapi==0.1
76
+ tensorflow-hub==0.16.1
77
+ requests==2.32.3
78
+ scipy==1.14.1
79
+ ml-dtypes==0.2.0
80
+ markdown-it-py==3.0.0
81
+ tensorflow-text==2.15.0
82
+ wrapt==1.14.1
83
+ immutabledict==4.2.1
84
+ MarkupSafe==3.0.2
85
+ jax==0.4.35
86
+ torch==2.5.1+cu121
87
+ wheel==0.45.1
88
+ einops==0.8.0
89
+ sentry-sdk==2.19.0
90
+ torchvision==0.20.1+cu121
91
+ humanize==4.11.0
92
+ toml==0.10.2
93
+ tensorstore==0.1.69
94
+ six==1.16.0
95
+ promise==2.3
96
+ certifi==2024.8.30
97
+ nvidia-cuda-runtime-cu12==12.1.105
98
+ flaxformer==0.8.8
99
+ nvidia-cufft-cu12==11.0.2.54
100
+ psutil==6.1.0
101
+ GitPython==3.1.43
102
+ platformdirs==4.3.6
103
+ importlib_resources==6.4.5
104
+ tfds-nightly==4.9.7.dev202411280044
105
+ tensorflow-gan==2.1.0
106
+ googleapis-common-protos==1.66.0
107
+ overrides==7.7.0
108
+ optree==0.13.1
109
+ Pygments==2.18.0
110
+ astunparse==1.6.3
111
+ ml_collections==1.0.0
112
+ setproctitle==1.3.4
113
+ tensorboard-data-server==0.7.2
114
+ sympy==1.13.1
115
+ packaging==24.2
116
+ nest-asyncio==1.6.0
117
+ nvidia-cublas-cu12==12.1.3.1
118
+ gitdb==4.0.11
119
+ click==8.1.7
120
+ idna==3.10
121
+ tqdm==4.67.1
122
+ grpcio==1.68.0
123
+ decorator==5.1.1
124
+ pyarrow==18.1.0
125
+ clu==0.0.12
126
+ charset-normalizer==3.4.0
127
+ fsspec==2024.10.0
128
+ dm-tree==0.1.8
129
+ sentencepiece==0.2.0
130
+ nvidia-cusparse-cu12==12.1.0.106
131
+ torchaudio==2.5.1+cu121
132
+ pip==24.3.1
133
+ Markdown==3.7
134
+ nvidia-nvtx-cu12==12.1.105
grafted/wandb/run-20241127_214327-torch-grafted-redux/files/wandb-metadata.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "os": "Linux-6.8.0-49-generic-x86_64-with-glibc2.39",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-11-28T05:43:27.740135Z",
5
+ "args": [
6
+ "--config",
7
+ "/home/jason-chou/Downloads/big_vision/big_vision/configs/vit_s16_i1k_single_gpu_test.py",
8
+ "--workdir",
9
+ "/data/imagenet/grafted",
10
+ "--name",
11
+ "torch-grafted-redux"
12
+ ],
13
+ "program": "-m big_vision.train",
14
+ "git": {
15
+ "remote": "https://github.com/EIFY/big_vision.git",
16
+ "commit": "44649a64ff67e709f55cdb5e3adcf52064b17de5"
17
+ },
18
+ "email": "[email protected]",
19
+ "root": "/home/jason-chou/Downloads/big_vision",
20
+ "host": "jasonchou-TensorBook-late-2021",
21
+ "username": "jason-chou",
22
+ "executable": "/home/jason-chou/.pyenv/versions/3.11.10/bin/python",
23
+ "cpu_count": 8,
24
+ "cpu_count_logical": 16,
25
+ "gpu": "NVIDIA GeForce RTX 3080 Laptop GPU",
26
+ "gpu_count": 1,
27
+ "disk": {
28
+ "/": {
29
+ "total": "1006450962432",
30
+ "used": "584312172544"
31
+ }
32
+ },
33
+ "memory": {
34
+ "total": "67162914816"
35
+ },
36
+ "cpu": {
37
+ "count": 8,
38
+ "countLogical": 16
39
+ },
40
+ "gpu_nvidia": [
41
+ {
42
+ "name": "NVIDIA GeForce RTX 3080 Laptop GPU",
43
+ "memoryTotal": "17179869184",
44
+ "cudaCores": 6144,
45
+ "architecture": "Ampere"
46
+ }
47
+ ],
48
+ "cudaVersion": "12.2"
49
+ }
grafted/wandb/run-20241127_214327-torch-grafted-redux/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb":{"runtime":118}}
grafted/wandb/run-20241127_214327-torch-grafted-redux/logs/debug-internal.log ADDED
@@ -0,0 +1,18 @@
1
+ {"time":"2024-11-27T21:43:27.742580416-08:00","level":"INFO","msg":"using version","core version":"0.18.7"}
2
+ {"time":"2024-11-27T21:43:27.742604921-08:00","level":"INFO","msg":"created symlink","path":"/home/jason-chou/Downloads/big_vision/wandb/run-20241127_214327-torch-grafted-redux/logs/debug-core.log"}
3
+ {"time":"2024-11-27T21:43:27.845793864-08:00","level":"INFO","msg":"created new stream","id":"torch-grafted-redux"}
4
+ {"time":"2024-11-27T21:43:27.845820975-08:00","level":"INFO","msg":"stream: started","id":"torch-grafted-redux"}
5
+ {"time":"2024-11-27T21:43:27.845933768-08:00","level":"INFO","msg":"writer: Do: started","stream_id":"torch-grafted-redux"}
6
+ {"time":"2024-11-27T21:43:27.846046124-08:00","level":"INFO","msg":"handler: started","stream_id":"torch-grafted-redux"}
7
+ {"time":"2024-11-27T21:43:27.846062359-08:00","level":"INFO","msg":"sender: started","stream_id":"torch-grafted-redux"}
8
+ {"time":"2024-11-27T21:43:28.25683485-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-27T21:45:20.049310897-08:00","level":"INFO","msg":"stream: closing","id":"torch-grafted-redux"}
10
+ {"time":"2024-11-27T21:45:20.0498735-08:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2024-11-27T21:45:20.052125471-08:00","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2024-11-27T21:45:20.144258999-08:00","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
13
+ {"time":"2024-11-27T21:45:20.144292029-08:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
14
+ {"time":"2024-11-27T21:45:20.611424696-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
15
+ {"time":"2024-11-27T21:45:20.75808152-08:00","level":"INFO","msg":"handler: closed","stream_id":"torch-grafted-redux"}
16
+ {"time":"2024-11-27T21:45:20.758156552-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"torch-grafted-redux"}
17
+ {"time":"2024-11-27T21:45:20.75839778-08:00","level":"INFO","msg":"sender: closed","stream_id":"torch-grafted-redux"}
18
+ {"time":"2024-11-27T21:45:20.758445214-08:00","level":"INFO","msg":"stream: closed","id":"torch-grafted-redux"}
grafted/wandb/run-20241127_214327-torch-grafted-redux/logs/debug.log ADDED
@@ -0,0 +1,55 @@
1
+ 2024-11-27 21:43:27,731 INFO MainThread:70768 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7
2
+ 2024-11-27 21:43:27,731 INFO MainThread:70768 [wandb_setup.py:_flush():79] Configure stats pid to 70768
3
+ 2024-11-27 21:43:27,731 INFO MainThread:70768 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/.config/wandb/settings
4
+ 2024-11-27 21:43:27,731 INFO MainThread:70768 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/Downloads/big_vision/wandb/settings
5
+ 2024-11-27 21:43:27,731 INFO MainThread:70768 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-11-27 21:43:27,731 INFO MainThread:70768 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-27 21:43:27,731 WARNING MainThread:70768 [wandb_setup.py:_flush():79] Could not find program at -m big_vision.train
8
+ 2024-11-27 21:43:27,731 INFO MainThread:70768 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m big_vision.train'}
9
+ 2024-11-27 21:43:27,732 INFO MainThread:70768 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-11-27 21:43:27,732 INFO MainThread:70768 [wandb_init.py:_log_setup():533] Logging user logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_214327-torch-grafted-redux/logs/debug.log
11
+ 2024-11-27 21:43:27,732 INFO MainThread:70768 [wandb_init.py:_log_setup():534] Logging internal logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_214327-torch-grafted-redux/logs/debug-internal.log
12
+ 2024-11-27 21:43:27,732 INFO MainThread:70768 [wandb_init.py:init():619] calling init triggers
13
+ 2024-11-27 21:43:27,737 INFO MainThread:70768 [wandb_init.py:init():626] wandb.init called with sweep_config: {}
14
+ config: {'_fields': {'seed': 0, 'total_epochs': 90, 'num_classes': 1000, 'loss': 'softmax_xent', 'input': accum_freq: 8
15
+ batch_size: 1024
16
+ cache_raw: false
17
+ data:
18
+ name: imagenet2012
19
+ split: train
20
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
21
+ key="label", key_result="labels")|keep("image", "labels")
22
+ shuffle_buffer_size: 150000
23
+ , 'pp_modules': ['ops_general', 'ops_image', 'ops_text', 'archive.randaug'], 'log_training_steps': 50, 'ckpt_steps': 1000, 'model_name': 'vit', 'model': pool_type: gap
24
+ posemb: sincos2d
25
+ rep_size: false
26
+ variant: S/16
27
+ , 'grad_clip_norm': 1.0, 'optax_name': 'scale_by_adam', 'optax': mu_dtype: bfloat16
28
+ , 'lr': 0.001, 'wd': 0.0001, 'schedule': decay_type: cosine
29
+ warmup_steps: 10000
30
+ , 'mixup': fold_in: null
31
+ p: 0.2
32
+ , 'evals': val:
33
+ data:
34
+ name: imagenet2012
35
+ split: validation
36
+ log_steps: 2500
37
+ loss_name: softmax_xent
38
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
39
+ key="label", key_result="labels")|keep("image", "labels")
40
+ type: classification
41
+ }, '_locked': True, '_type_safe': True, '_convert_dict': True, '_allow_dotted_keys': False, '_sort_keys': True}
42
+ 2024-11-27 21:43:27,737 INFO MainThread:70768 [wandb_init.py:init():669] starting backend
43
+ 2024-11-27 21:43:27,737 INFO MainThread:70768 [wandb_init.py:init():673] sending inform_init request
44
+ 2024-11-27 21:43:27,739 INFO MainThread:70768 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
45
+ 2024-11-27 21:43:27,739 INFO MainThread:70768 [wandb_init.py:init():686] backend started and connected
46
+ 2024-11-27 21:43:27,745 INFO MainThread:70768 [wandb_init.py:init():781] updated telemetry
47
+ 2024-11-27 21:43:27,748 INFO MainThread:70768 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout
48
+ 2024-11-27 21:43:28,243 INFO MainThread:70768 [wandb_init.py:init():859] run resumed
49
+ 2024-11-27 21:43:28,254 INFO MainThread:70768 [wandb_init.py:init():867] starting run threads in backend
50
+ 2024-11-27 21:43:28,306 INFO MainThread:70768 [wandb_run.py:_console_start():2456] atexit reg
51
+ 2024-11-27 21:43:28,306 INFO MainThread:70768 [wandb_run.py:_redirect():2305] redirect: wrap_raw
52
+ 2024-11-27 21:43:28,306 INFO MainThread:70768 [wandb_run.py:_redirect():2370] Wrapping output streams.
53
+ 2024-11-27 21:43:28,306 INFO MainThread:70768 [wandb_run.py:_redirect():2395] Redirects installed.
54
+ 2024-11-27 21:43:28,307 INFO MainThread:70768 [wandb_init.py:init():911] run started, returning control to user process
55
+ 2024-11-27 21:45:20,049 WARNING MsgRouterThr:70768 [router.py:message_loop():75] message_loop has been closed
grafted/wandb/run-20241127_214327-torch-grafted-redux/run-torch-grafted-redux.wandb ADDED
Binary file (70.8 kB). View file
 
grafted/wandb/run-20241127_215015-torch-grafted-redux/files/config.yaml ADDED
@@ -0,0 +1,98 @@
1
+ _allow_dotted_keys:
2
+ value: false
3
+ _convert_dict:
4
+ value: true
5
+ _fields:
6
+ value:
7
+ ckpt_steps: 1000
8
+ evals: |
9
+ val:
10
+ data:
11
+ name: imagenet2012
12
+ split: validation
13
+ log_steps: 2500
14
+ loss_name: softmax_xent
15
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
16
+ key="label", key_result="labels")|keep("image", "labels")
17
+ type: classification
18
+ grad_clip_norm: 1
19
+ input: |
20
+ accum_freq: 8
21
+ batch_size: 1024
22
+ cache_raw: false
23
+ data:
24
+ name: imagenet2012
25
+ split: train
26
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
27
+ key="label", key_result="labels")|keep("image", "labels")
28
+ shuffle_buffer_size: 150000
29
+ log_training_steps: 50
30
+ loss: softmax_xent
31
+ lr: 0.001
32
+ mixup: |
33
+ fold_in: null
34
+ p: 0.2
35
+ model: |
36
+ pool_type: gap
37
+ posemb: sincos2d
38
+ rep_size: false
39
+ variant: S/16
40
+ model_name: vit
41
+ num_classes: 1000
42
+ optax: |
43
+ mu_dtype: bfloat16
44
+ optax_name: scale_by_adam
45
+ pp_modules:
46
+ - ops_general
47
+ - ops_image
48
+ - ops_text
49
+ - archive.randaug
50
+ schedule: |
51
+ decay_type: cosine
52
+ warmup_steps: 10000
53
+ seed: 0
54
+ total_epochs: 90
55
+ wd: 0.0001
56
+ _locked:
57
+ value: true
58
+ _sort_keys:
59
+ value: true
60
+ _type_safe:
61
+ value: true
62
+ _wandb:
63
+ value:
64
+ cli_version: 0.18.7
65
+ m: []
66
+ python_version: 3.11.10
67
+ t:
68
+ "1":
69
+ - 1
70
+ - 2
71
+ - 3
72
+ - 12
73
+ - 41
74
+ - 45
75
+ - 55
76
+ "2":
77
+ - 1
78
+ - 2
79
+ - 3
80
+ - 12
81
+ - 41
82
+ - 45
83
+ - 55
84
+ "3":
85
+ - 5
86
+ - 13
87
+ - 14
88
+ - 16
89
+ - 23
90
+ - 55
91
+ - 61
92
+ - 62
93
+ "4": 3.11.10
94
+ "5": 0.18.7
95
+ "8":
96
+ - 5
97
+ "12": 0.18.7
98
+ "13": linux-x86_64
grafted/wandb/run-20241127_215015-torch-grafted-redux/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
grafted/wandb/run-20241127_215015-torch-grafted-redux/files/requirements.txt ADDED
@@ -0,0 +1,134 @@
1
+ array_record==0.5.1
2
+ wandb==0.18.7
3
+ nvidia-curand-cu12==10.3.2.106
4
+ requests-oauthlib==2.0.0
5
+ zipp==3.21.0
6
+ Werkzeug==3.1.3
7
+ simple-parsing==0.1.6
8
+ mdurl==0.1.2
9
+ keras==2.15.0
10
+ nvidia-cuda-nvcc-cu12==12.6.85
11
+ google-auth-oauthlib==1.2.1
12
+ jaxlib==0.4.34
13
+ tf_keras==2.15.1
14
+ oauthlib==3.2.2
15
+ tensorflow-probability==0.25.0
16
+ cachetools==5.5.0
17
+ Jinja2==3.1.3
18
+ rich==13.9.4
19
+ filelock==3.13.1
20
+ google-pasta==0.2.0
21
+ optax==0.2.4
22
+ toolz==1.0.0
23
+ gast==0.6.0
24
+ tensorboard==2.15.2
25
+ pyasn1_modules==0.4.1
26
+ nvidia-cudnn-cu12==9.1.0.70
27
+ opt_einsum==3.4.0
28
+ nvidia-nvjitlink-cu12==12.6.85
29
+ chex==0.1.87
30
+ namex==0.0.8
31
+ termcolor==2.5.0
32
+ flax==0.10.2
33
+ cloudpickle==3.1.0
34
+ numpy==1.26.4
35
+ nvidia-nccl-cu12==2.21.5
36
+ tensorflow-cpu==2.15.0
37
+ nvidia-cusolver-cu12==11.4.5.107
38
+ typing_extensions==4.12.2
39
+ tensorflow-addons==0.23.0
40
+ typeguard==2.13.3
41
+ absl-py==2.1.0
42
+ flatbuffers==24.3.25
43
+ dlpack==0.1
44
+ setuptools==65.5.0
45
+ protobuf==4.25.5
46
+ jax-cuda12-plugin==0.4.35
47
+ tensorflow==2.15.0
48
+ msgpack==1.1.0
49
+ networkx==3.2.1
50
+ docker-pycreds==0.4.0
51
+ nvidia-cuda-cupti-cu12==12.1.105
52
+ pillow==11.0.0
53
+ libclang==18.1.1
54
+ nvidia-cuda-nvrtc-cu12==12.1.105
55
+ distrax==0.1.5
56
+ orbax-checkpoint==0.10.1
57
+ PyYAML==6.0.2
58
+ urllib3==2.2.3
59
+ aqtp==0.8.2
60
+ tensorflow-metadata==1.16.1
61
+ etils==1.11.0
62
+ smmap==5.0.1
63
+ pyasn1==0.6.1
64
+ docstring_parser==0.16
65
+ google-auth==2.36.0
66
+ simplejson==3.19.3
67
+ mpmath==1.3.0
68
+ h5py==3.12.1
69
+ jax-cuda12-pjrt==0.4.35
70
+ tensorflow-io-gcs-filesystem==0.37.1
71
+ tensorflow-estimator==2.15.0
72
+ pydlpack==0.2.1
73
+ triton==3.1.0
74
+ rsa==4.9
75
+ panopticapi==0.1
76
+ tensorflow-hub==0.16.1
77
+ requests==2.32.3
78
+ scipy==1.14.1
79
+ ml-dtypes==0.2.0
80
+ markdown-it-py==3.0.0
81
+ tensorflow-text==2.15.0
82
+ wrapt==1.14.1
83
+ immutabledict==4.2.1
84
+ MarkupSafe==3.0.2
85
+ jax==0.4.35
86
+ torch==2.5.1+cu121
87
+ wheel==0.45.1
88
+ einops==0.8.0
89
+ sentry-sdk==2.19.0
90
+ torchvision==0.20.1+cu121
91
+ humanize==4.11.0
92
+ toml==0.10.2
93
+ tensorstore==0.1.69
94
+ six==1.16.0
95
+ promise==2.3
96
+ certifi==2024.8.30
97
+ nvidia-cuda-runtime-cu12==12.1.105
98
+ flaxformer==0.8.8
99
+ nvidia-cufft-cu12==11.0.2.54
100
+ psutil==6.1.0
101
+ GitPython==3.1.43
102
+ platformdirs==4.3.6
103
+ importlib_resources==6.4.5
104
+ tfds-nightly==4.9.7.dev202411280044
105
+ tensorflow-gan==2.1.0
106
+ googleapis-common-protos==1.66.0
107
+ overrides==7.7.0
108
+ optree==0.13.1
109
+ Pygments==2.18.0
110
+ astunparse==1.6.3
111
+ ml_collections==1.0.0
112
+ setproctitle==1.3.4
113
+ tensorboard-data-server==0.7.2
114
+ sympy==1.13.1
115
+ packaging==24.2
116
+ nest-asyncio==1.6.0
117
+ nvidia-cublas-cu12==12.1.3.1
118
+ gitdb==4.0.11
119
+ click==8.1.7
120
+ idna==3.10
121
+ tqdm==4.67.1
122
+ grpcio==1.68.0
123
+ decorator==5.1.1
124
+ pyarrow==18.1.0
125
+ clu==0.0.12
126
+ charset-normalizer==3.4.0
127
+ fsspec==2024.10.0
128
+ dm-tree==0.1.8
129
+ sentencepiece==0.2.0
130
+ nvidia-cusparse-cu12==12.1.0.106
131
+ torchaudio==2.5.1+cu121
132
+ pip==24.3.1
133
+ Markdown==3.7
134
+ nvidia-nvtx-cu12==12.1.105
grafted/wandb/run-20241127_215015-torch-grafted-redux/files/wandb-metadata.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "os": "Linux-6.8.0-49-generic-x86_64-with-glibc2.39",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-11-28T05:50:15.071664Z",
5
+ "args": [
6
+ "--config",
7
+ "/home/jason-chou/Downloads/big_vision/big_vision/configs/vit_s16_i1k_single_gpu_test.py",
8
+ "--workdir",
9
+ "/data/imagenet/grafted",
10
+ "--name",
11
+ "torch-grafted-redux"
12
+ ],
13
+ "program": "-m big_vision.train",
14
+ "git": {
15
+ "remote": "https://github.com/EIFY/big_vision.git",
16
+ "commit": "44649a64ff67e709f55cdb5e3adcf52064b17de5"
17
+ },
18
+ "email": "[email protected]",
19
+ "root": "/home/jason-chou/Downloads/big_vision",
20
+ "host": "jasonchou-TensorBook-late-2021",
21
+ "username": "jason-chou",
22
+ "executable": "/home/jason-chou/.pyenv/versions/3.11.10/bin/python",
23
+ "cpu_count": 8,
24
+ "cpu_count_logical": 16,
25
+ "gpu": "NVIDIA GeForce RTX 3080 Laptop GPU",
26
+ "gpu_count": 1,
27
+ "disk": {
28
+ "/": {
29
+ "total": "1006450962432",
30
+ "used": "584324419584"
31
+ }
32
+ },
33
+ "memory": {
34
+ "total": "67162914816"
35
+ },
36
+ "cpu": {
37
+ "count": 8,
38
+ "countLogical": 16
39
+ },
40
+ "gpu_nvidia": [
41
+ {
42
+ "name": "NVIDIA GeForce RTX 3080 Laptop GPU",
43
+ "memoryTotal": "17179869184",
44
+ "cudaCores": 6144,
45
+ "architecture": "Ampere"
46
+ }
47
+ ],
48
+ "cudaVersion": "12.2"
49
+ }
grafted/wandb/run-20241127_215015-torch-grafted-redux/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"lr":2.3437951579552636e-13,"progress":1,"l2_grads":2.6440370082855225,"val/loss":0.919307008081553,"val/acc@1":0.7654257015306123,"l2_params":237.9905797356025,"uptime":705834.820052186,"_wandb":{"runtime":706047},"_step":112603,"core_hours_NVIDIA GeForce RTX 3080 Laptop GPU":196.03091018991725,"train/loss":1.7738897949457169,"img/sec/core":164.21697038941156,"examples_seen":1.15305472e+08,"_timestamp":1.7334789428918557e+09,"_runtime":706047.114314126,"epoch":90.00034499795889,"core_hours":196.03091018991725,"z/secs/eval/val":97.07507979194634}
grafted/wandb/run-20241127_215015-torch-grafted-redux/logs/debug-internal.log ADDED
@@ -0,0 +1,247 @@
1
+ {"time":"2024-11-27T21:50:15.074175754-08:00","level":"INFO","msg":"using version","core version":"0.18.7"}
2
+ {"time":"2024-11-27T21:50:15.074199926-08:00","level":"INFO","msg":"created symlink","path":"/home/jason-chou/Downloads/big_vision/wandb/run-20241127_215015-torch-grafted-redux/logs/debug-core.log"}
3
+ {"time":"2024-11-27T21:50:15.178795589-08:00","level":"INFO","msg":"created new stream","id":"torch-grafted-redux"}
4
+ {"time":"2024-11-27T21:50:15.178841831-08:00","level":"INFO","msg":"stream: started","id":"torch-grafted-redux"}
5
+ {"time":"2024-11-27T21:50:15.179017283-08:00","level":"INFO","msg":"writer: Do: started","stream_id":"torch-grafted-redux"}
6
+ {"time":"2024-11-27T21:50:15.179013899-08:00","level":"INFO","msg":"sender: started","stream_id":"torch-grafted-redux"}
7
+ {"time":"2024-11-27T21:50:15.179140053-08:00","level":"INFO","msg":"handler: started","stream_id":"torch-grafted-redux"}
8
+ {"time":"2024-11-27T21:50:15.742647044-08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-28T02:34:16.031157097-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2024-11-28T02:34:48.365741462-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
11
+ {"time":"2024-11-28T02:35:23.360466143-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2024-11-28T02:36:02.179163429-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
13
+ {"time":"2024-11-28T02:36:48.506458125-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
14
+ {"time":"2024-11-28T02:37:26.943002842-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.0.0.84:35898->35.186.228.49:443: read: connection reset by peer"}
15
+ {"time":"2024-11-28T03:42:18.129115838-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp 35.186.228.49:443: connect: no route to host"}
16
+ {"time":"2024-11-28T03:42:20.139827897-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
17
+ {"time":"2024-11-28T03:42:24.372179347-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
18
+ {"time":"2024-11-28T03:42:32.586084468-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
19
+ {"time":"2024-11-28T03:42:46.132654012-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
20
+ {"time":"2024-11-28T03:42:52.120982292-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
21
+ {"time":"2024-11-28T03:43:18.285293593-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
22
+ {"time":"2024-11-28T03:43:28.907782037-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
23
+ {"time":"2024-11-28T03:43:53.098087627-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
24
+ {"time":"2024-11-28T03:44:28.909285538-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
25
+ {"time":"2024-11-28T03:44:31.614526811-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
26
+ {"time":"2024-11-28T03:45:17.871096255-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
27
+ {"time":"2024-11-28T03:45:28.911117824-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
28
+ {"time":"2024-11-28T04:27:01.204698067-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
29
+ {"time":"2024-11-28T04:27:33.238366122-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
30
+ {"time":"2024-11-28T04:28:07.420519765-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
31
+ {"time":"2024-11-28T04:28:47.085988867-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
32
+ {"time":"2024-11-28T04:29:35.767860694-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
33
+ {"time":"2024-11-28T08:14:46.411826832-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
34
+ {"time":"2024-11-29T07:48:56.401765469-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp 35.186.228.49:443: connect: no route to host"}
35
+ {"time":"2024-11-29T15:48:03.006388984-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
36
+ {"time":"2024-11-29T15:48:35.3494534-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
37
+ {"time":"2024-11-29T15:49:10.275739645-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
38
+ {"time":"2024-11-30T03:43:35.70265325-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": read tcp 10.0.0.84:53632->35.186.228.49:443: read: connection reset by peer"}
39
+ {"time":"2024-11-30T04:33:41.905126209-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp 35.186.228.49:443: connect: no route to host"}
40
+ {"time":"2024-11-30T12:32:33.434061272-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp 35.186.228.49:443: connect: no route to host"}
41
+ {"time":"2024-11-30T16:47:09.087172552-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": read tcp 10.0.0.84:57124->35.186.228.49:443: read: connection reset by peer"}
42
+ {"time":"2024-11-30T20:42:04.468697581-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
43
+ {"time":"2024-11-30T20:42:36.856408234-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
44
+ {"time":"2024-11-30T20:43:11.213955553-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
45
+ {"time":"2024-11-30T20:43:51.102603998-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
46
+ {"time":"2024-11-30T20:44:37.626630185-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
47
+ {"time":"2024-11-30T20:45:14.657693577-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.0.0.84:52436->35.186.228.49:443: read: connection reset by peer"}
48
+ {"time":"2024-12-01T00:45:34.74621148-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
49
+ {"time":"2024-12-01T00:46:06.894294578-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
50
+ {"time":"2024-12-01T00:46:41.891784748-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
51
+ {"time":"2024-12-01T00:47:20.240406935-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
52
+ {"time":"2024-12-01T00:48:08.857417923-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
53
+ {"time":"2024-12-01T03:18:49.908220918-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
54
+ {"time":"2024-12-01T03:19:22.177996379-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
55
+ {"time":"2024-12-01T03:19:57.167870716-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
56
+ {"time":"2024-12-01T03:20:35.786919209-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
57
+ {"time":"2024-12-01T03:21:23.817288934-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
58
+ {"time":"2024-12-01T04:35:50.003784946-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
59
+ {"time":"2024-12-01T04:36:22.26631399-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
60
+ {"time":"2024-12-01T04:36:56.640870294-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
61
+ {"time":"2024-12-01T04:37:35.443320555-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
62
+ {"time":"2024-12-01T04:38:23.987753899-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
63
+ {"time":"2024-12-01T04:46:05.057961007-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
64
+ {"time":"2024-12-01T04:46:37.546859547-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
65
+ {"time":"2024-12-01T04:47:11.581392685-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
66
+ {"time":"2024-12-01T04:47:50.252999457-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
67
+ {"time":"2024-12-01T04:48:37.650736566-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
68
+ {"time":"2024-12-01T04:49:11.191378623-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.0.0.84:38402->35.186.228.49:443: read: connection reset by peer"}
69
+ {"time":"2024-12-01T16:38:05.68648332-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
70
+ {"time":"2024-12-01T16:38:37.887436779-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
71
+ {"time":"2024-12-01T16:39:12.133333955-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
72
+ {"time":"2024-12-01T16:39:52.130430618-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
73
+ {"time":"2024-12-01T16:40:39.445088692-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
74
+ {"time":"2024-12-01T16:43:20.726602894-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
75
+ {"time":"2024-12-01T16:43:52.91904545-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
76
+ {"time":"2024-12-01T16:44:27.667498554-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
77
+ {"time":"2024-12-01T16:45:07.347776589-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
78
+ {"time":"2024-12-01T16:45:54.2028342-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
79
+ {"time":"2024-12-01T18:32:20.873980572-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
80
+ {"time":"2024-12-01T18:32:53.205902113-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
81
+ {"time":"2024-12-01T18:33:27.963313543-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
82
+ {"time":"2024-12-01T18:34:07.633098053-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
83
+ {"time":"2024-12-01T18:34:57.576633533-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
84
+ {"time":"2024-12-01T18:37:35.91680404-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
85
+ {"time":"2024-12-01T18:38:08.405404154-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
86
+ {"time":"2024-12-01T18:38:42.587426715-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
87
+ {"time":"2024-12-01T18:39:20.751273113-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
88
+ {"time":"2024-12-01T18:40:07.44210804-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
89
+ {"time":"2024-12-02T02:35:51.391509841-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
90
+ {"time":"2024-12-02T02:36:23.802341886-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
91
+ {"time":"2024-12-02T02:36:57.877540099-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
92
+ {"time":"2024-12-02T02:37:37.844207985-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
93
+ {"time":"2024-12-02T02:38:24.231618803-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
94
+ {"time":"2024-12-02T12:37:52.036374796-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
95
+ {"time":"2024-12-02T12:38:24.246737479-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
96
+ {"time":"2024-12-02T12:38:58.86214325-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
97
+ {"time":"2024-12-02T12:39:37.93956232-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
98
+ {"time":"2024-12-02T12:40:27.832359297-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
99
+ {"time":"2024-12-02T12:43:22.078082806-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
100
+ {"time":"2024-12-02T12:43:54.260401543-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
101
+ {"time":"2024-12-02T12:44:28.422029172-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
102
+ {"time":"2024-12-02T12:45:07.133435109-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
103
+ {"time":"2024-12-02T12:45:55.595030082-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
104
+ {"time":"2024-12-02T12:46:39.639355887-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.0.0.84:53050->35.186.228.49:443: read: connection reset by peer"}
105
+ {"time":"2024-12-02T14:32:22.213600631-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
106
+ {"time":"2024-12-02T14:32:54.409672429-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
107
+ {"time":"2024-12-02T14:33:29.313237828-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
108
+ {"time":"2024-12-02T14:34:08.561175398-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
109
+ {"time":"2024-12-02T14:34:58.280659826-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
110
+ {"time":"2024-12-02T14:35:40.950530237-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.0.0.84:53982->35.186.228.49:443: read: connection reset by peer"}
111
+ {"time":"2024-12-02T22:38:52.669417282-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
112
+ {"time":"2024-12-02T22:39:24.808021475-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
113
+ {"time":"2024-12-02T22:39:59.412976592-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
114
+ {"time":"2024-12-02T22:40:38.099901646-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
115
+ {"time":"2024-12-02T22:41:27.308506641-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
116
+ {"time":"2024-12-03T00:27:22.905950569-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
117
+ {"time":"2024-12-03T00:27:55.246674688-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
118
+ {"time":"2024-12-03T00:28:29.944219256-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
119
+ {"time":"2024-12-03T00:29:08.68500746-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
120
+ {"time":"2024-12-03T00:29:56.678315339-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
121
+ {"time":"2024-12-03T08:51:00.224082728-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": read tcp 10.0.0.84:54348->35.186.228.49:443: read: connection reset by peer"}
122
+ {"time":"2024-12-03T08:55:53.399562922-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
123
+ {"time":"2024-12-03T08:56:25.491144652-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
124
+ {"time":"2024-12-03T08:57:00.241113785-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
125
+ {"time":"2024-12-03T08:57:40.21004883-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
126
+ {"time":"2024-12-03T08:58:26.526472631-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
127
+ {"time":"2024-12-03T10:36:19.661719074-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
128
+ {"time":"2024-12-03T10:36:38.530396963-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
129
+ {"time":"2024-12-03T10:37:10.818041328-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
130
+ {"time":"2024-12-03T10:37:44.924643065-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
131
+ {"time":"2024-12-03T10:38:24.006775695-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
132
+ {"time":"2024-12-03T10:39:10.789085577-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
133
+ {"time":"2024-12-03T10:47:08.577691365-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
134
+ {"time":"2024-12-03T10:47:40.945024167-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
135
+ {"time":"2024-12-03T10:48:15.894144562-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
136
+ {"time":"2024-12-03T10:48:54.876775304-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
137
+ {"time":"2024-12-03T10:49:41.094974693-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
138
+ {"time":"2024-12-03T10:50:21.078869466-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.0.0.84:48148->35.186.228.49:443: read: connection reset by peer"}
139
+ {"time":"2024-12-03T12:23:51.403070328-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
140
+ {"time":"2024-12-03T12:40:53.731145089-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
141
+ {"time":"2024-12-03T23:32:54.290577171-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
142
+ {"time":"2024-12-03T23:33:26.617679262-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
143
+ {"time":"2024-12-03T23:34:00.757344429-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
144
+ {"time":"2024-12-03T23:34:39.966159301-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
145
+ {"time":"2024-12-03T23:35:26.529833061-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
146
+ {"time":"2024-12-04T14:19:55.049105743-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
147
+ {"time":"2024-12-04T15:28:55.116890309-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
148
+ {"time":"2024-12-04T15:29:27.135172446-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
149
+ {"time":"2024-12-04T15:30:02.078524164-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
150
+ {"time":"2024-12-04T15:30:40.31468816-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
151
+ {"time":"2024-12-04T15:31:29.901859494-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
152
+ {"time":"2024-12-04T17:18:55.237241846-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
153
+ {"time":"2024-12-04T17:19:27.334026079-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
154
+ {"time":"2024-12-04T17:20:02.022011033-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
155
+ {"time":"2024-12-04T17:20:41.434273822-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
156
+ {"time":"2024-12-04T17:21:29.254101761-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
157
+ {"time":"2024-12-04T18:04:10.322394576-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
158
+ {"time":"2024-12-04T18:04:42.484364529-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
159
+ {"time":"2024-12-04T18:05:16.646794563-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
160
+ {"time":"2024-12-04T18:05:55.608027075-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
161
+ {"time":"2024-12-04T18:06:44.553620127-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
162
+ {"time":"2024-12-04T22:27:40.607937922-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
163
+ {"time":"2024-12-04T22:28:12.836516946-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
164
+ {"time":"2024-12-04T22:28:47.459223154-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
165
+ {"time":"2024-12-04T22:29:26.147960147-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
166
+ {"time":"2024-12-04T22:30:15.062106005-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
167
+ {"time":"2024-12-05T01:47:39.133391386-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
168
+ {"time":"2024-12-05T01:47:55.82293439-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
169
+ {"time":"2024-12-05T01:47:58.69274009-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
170
+ {"time":"2024-12-05T01:48:14.974980479-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
171
+ {"time":"2024-12-05T01:48:20.402384234-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
172
+ {"time":"2024-12-05T01:48:35.966316067-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
173
+ {"time":"2024-12-05T01:48:45.592640286-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
174
+ {"time":"2024-12-05T01:49:02.593889106-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
175
+ {"time":"2024-12-05T01:49:19.131511604-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
176
+ {"time":"2024-12-05T01:49:36.282249089-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
177
+ {"time":"2024-12-05T01:50:15.299754947-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
178
+ {"time":"2024-12-05T01:50:27.504309484-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
179
+ {"time":"2024-12-05T01:51:32.820040328-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
180
+ {"time":"2024-12-05T01:51:45.124161387-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
181
+ {"time":"2024-12-05T01:52:32.91054814-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
182
+ {"time":"2024-12-05T01:53:01.802742601-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
183
+ {"time":"2024-12-05T01:53:49.829573071-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
184
+ {"time":"2024-12-05T01:54:01.810922094-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
185
+ {"time":"2024-12-05T01:55:01.864377192-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
186
+ {"time":"2024-12-05T01:55:07.143197064-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
187
+ {"time":"2024-12-05T01:56:07.173072531-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
188
+ {"time":"2024-12-05T01:56:18.825182849-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
189
+ {"time":"2024-12-05T01:57:24.055080888-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
190
+ {"time":"2024-12-05T01:57:25.832815739-08:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000041136,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"a5uf5b0ketip\" connection_id:\"127.0.0.1:40934\")"}
191
+ {"time":"2024-12-05T01:57:36.240628481-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
192
+ {"time":"2024-12-05T01:58:32.869123973-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
193
+ {"time":"2024-12-05T01:58:36.256158272-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
194
+ {"time":"2024-12-05T01:59:33.099733752-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
195
+ {"time":"2024-12-05T01:59:53.664227715-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
196
+ {"time":"2024-12-05T01:59:59.516848736-08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.00106625,"work":"WorkRecord(*service_go_proto.Request_Keepalive); Control(local:true connection_id:\"127.0.0.1:40934\")"}
197
+ {"time":"2024-12-05T02:00:05.744419675-08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.000352825,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
198
+ {"time":"2024-12-05T02:00:05.744480013-08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.000396572,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
199
+ {"time":"2024-12-05T02:00:05.745541341-08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.000591547,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
200
+ {"time":"2024-12-05T02:00:05.745550259-08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.000894539,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
201
+ {"time":"2024-12-05T02:00:05.747662841-08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.000444392,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
202
+ {"time":"2024-12-05T02:00:33.136975098-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
203
+ {"time":"2024-12-05T02:01:11.07999853-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
204
+ {"time":"2024-12-05T02:01:41.902148444-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
205
+ {"time":"2024-12-05T02:02:27.673478132-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
206
+ {"time":"2024-12-05T02:02:50.303630914-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
207
+ {"time":"2024-12-05T02:03:36.076486567-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
208
+ {"time":"2024-12-05T02:04:07.620541718-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
209
+ {"time":"2024-12-05T02:04:52.67722497-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
210
+ {"time":"2024-12-05T02:05:07.755640903-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
211
+ {"time":"2024-12-05T02:05:52.699769224-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql"}
212
+ {"time":"2024-12-05T02:06:56.575240743-08:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":1170.752439014,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"a5uf5b0ketip\" connection_id:\"127.0.0.1:40934\")"}
213
+ {"time":"2024-12-05T02:06:56.575254227-08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":1010.828027475,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
214
+ {"time":"2024-12-05T02:06:56.575270113-08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":1010.830331617,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
215
+ {"time":"2024-12-05T02:06:56.57528404-08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":1010.830637182,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
216
+ {"time":"2024-12-05T02:06:56.575306681-08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":1010.831224773,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
217
+ {"time":"2024-12-05T02:06:56.575330718-08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":1017.05961612,"work":"WorkRecord(*service_go_proto.Request_Keepalive); Control(local:true connection_id:\"127.0.0.1:40934\")"}
218
+ {"time":"2024-12-05T02:06:56.5753365-08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":1010.831326961,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
219
+ {"time":"2024-12-05T09:15:26.449381152-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
220
+ {"time":"2024-12-05T09:15:58.57629318-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
221
+ {"time":"2024-12-05T09:16:33.095093096-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
222
+ {"time":"2024-12-05T11:13:14.836120253-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
223
+ {"time":"2024-12-05T11:13:16.949701235-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp: lookup api.wandb.ai on 127.0.0.53:53: server misbehaving"}
224
+ {"time":"2024-12-05T15:47:11.925458079-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
225
+ {"time":"2024-12-05T18:23:27.087099548-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.0.0.84:41532->35.186.228.49:443: read: connection reset by peer"}
226
+ {"time":"2024-12-05T19:12:12.239890815-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
227
+ {"time":"2024-12-05T19:12:44.526201007-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
228
+ {"time":"2024-12-05T19:13:18.940020619-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
229
+ {"time":"2024-12-05T19:13:56.999079567-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
230
+ {"time":"2024-12-05T19:14:43.370678467-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
231
+ {"time":"2024-12-05T19:27:27.291303123-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
232
+ {"time":"2024-12-05T19:27:59.337388458-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
233
+ {"time":"2024-12-05T19:28:33.843756768-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
234
+ {"time":"2024-12-05T19:29:13.488021118-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
235
+ {"time":"2024-12-05T19:29:59.640302841-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
236
+ {"time":"2024-12-05T21:23:37.131648811-08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream"}
237
+ {"time":"2024-12-06T01:06:34.449145502-08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/eify/mup-vit/torch-grafted-redux/file_stream\": dial tcp 35.186.228.49:443: connect: no route to host"}
238
+ {"time":"2024-12-06T01:55:44.185807115-08:00","level":"INFO","msg":"stream: closing","id":"torch-grafted-redux"}
239
+ {"time":"2024-12-06T01:55:44.186163552-08:00","level":"INFO","msg":"Stopping system monitor"}
240
+ {"time":"2024-12-06T01:55:44.188365369-08:00","level":"INFO","msg":"Stopped system monitor"}
241
+ {"time":"2024-12-06T01:55:44.281103284-08:00","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
242
+ {"time":"2024-12-06T01:55:44.281124225-08:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
243
+ {"time":"2024-12-06T01:55:45.642796519-08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
244
+ {"time":"2024-12-06T01:55:45.770734316-08:00","level":"INFO","msg":"handler: closed","stream_id":"torch-grafted-redux"}
245
+ {"time":"2024-12-06T01:55:45.77080105-08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"torch-grafted-redux"}
246
+ {"time":"2024-12-06T01:55:45.770825146-08:00","level":"INFO","msg":"sender: closed","stream_id":"torch-grafted-redux"}
247
+ {"time":"2024-12-06T01:55:45.77092765-08:00","level":"INFO","msg":"stream: closed","id":"torch-grafted-redux"}
grafted/wandb/run-20241127_215015-torch-grafted-redux/logs/debug.log ADDED
@@ -0,0 +1,55 @@
1
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7
2
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_setup.py:_flush():79] Configure stats pid to 72239
3
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/.config/wandb/settings
4
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_setup.py:_flush():79] Loading settings from /home/jason-chou/Downloads/big_vision/wandb/settings
5
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-27 21:50:15,067 WARNING MainThread:72239 [wandb_setup.py:_flush():79] Could not find program at -m big_vision.train
8
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m big_vision.train'}
9
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_init.py:_log_setup():533] Logging user logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_215015-torch-grafted-redux/logs/debug.log
11
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_init.py:_log_setup():534] Logging internal logs to /home/jason-chou/Downloads/big_vision/wandb/run-20241127_215015-torch-grafted-redux/logs/debug-internal.log
12
+ 2024-11-27 21:50:15,067 INFO MainThread:72239 [wandb_init.py:init():619] calling init triggers
13
+ 2024-11-27 21:50:15,068 INFO MainThread:72239 [wandb_init.py:init():626] wandb.init called with sweep_config: {}
14
+ config: {'_fields': {'seed': 0, 'total_epochs': 90, 'num_classes': 1000, 'loss': 'softmax_xent', 'input': accum_freq: 8
15
+ batch_size: 1024
16
+ cache_raw: false
17
+ data:
18
+ name: imagenet2012
19
+ split: train
20
+ pp: decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)|value_range(-1, 1)|onehot(1000,
21
+ key="label", key_result="labels")|keep("image", "labels")
22
+ shuffle_buffer_size: 150000
23
+ , 'pp_modules': ['ops_general', 'ops_image', 'ops_text', 'archive.randaug'], 'log_training_steps': 50, 'ckpt_steps': 1000, 'model_name': 'vit', 'model': pool_type: gap
24
+ posemb: sincos2d
25
+ rep_size: false
26
+ variant: S/16
27
+ , 'grad_clip_norm': 1.0, 'optax_name': 'scale_by_adam', 'optax': mu_dtype: bfloat16
28
+ , 'lr': 0.001, 'wd': 0.0001, 'schedule': decay_type: cosine
29
+ warmup_steps: 10000
30
+ , 'mixup': fold_in: null
31
+ p: 0.2
32
+ , 'evals': val:
33
+ data:
34
+ name: imagenet2012
35
+ split: validation
36
+ log_steps: 2500
37
+ loss_name: softmax_xent
38
+ pp_fn: decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000,
39
+ key="label", key_result="labels")|keep("image", "labels")
40
+ type: classification
41
+ }, '_locked': True, '_type_safe': True, '_convert_dict': True, '_allow_dotted_keys': False, '_sort_keys': True}
42
+ 2024-11-27 21:50:15,068 INFO MainThread:72239 [wandb_init.py:init():669] starting backend
43
+ 2024-11-27 21:50:15,068 INFO MainThread:72239 [wandb_init.py:init():673] sending inform_init request
44
+ 2024-11-27 21:50:15,070 INFO MainThread:72239 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
45
+ 2024-11-27 21:50:15,071 INFO MainThread:72239 [wandb_init.py:init():686] backend started and connected
46
+ 2024-11-27 21:50:15,076 INFO MainThread:72239 [wandb_init.py:init():781] updated telemetry
47
+ 2024-11-27 21:50:15,080 INFO MainThread:72239 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout
48
+ 2024-11-27 21:50:15,728 INFO MainThread:72239 [wandb_init.py:init():859] run resumed
49
+ 2024-11-27 21:50:15,740 INFO MainThread:72239 [wandb_init.py:init():867] starting run threads in backend
50
+ 2024-11-27 21:50:15,808 INFO MainThread:72239 [wandb_run.py:_console_start():2456] atexit reg
51
+ 2024-11-27 21:50:15,808 INFO MainThread:72239 [wandb_run.py:_redirect():2305] redirect: wrap_raw
52
+ 2024-11-27 21:50:15,809 INFO MainThread:72239 [wandb_run.py:_redirect():2370] Wrapping output streams.
53
+ 2024-11-27 21:50:15,809 INFO MainThread:72239 [wandb_run.py:_redirect():2395] Redirects installed.
54
+ 2024-11-27 21:50:15,810 INFO MainThread:72239 [wandb_init.py:init():911] run started, returning control to user process
55
+ 2024-12-06 01:55:44,185 WARNING MsgRouterThr:72239 [router.py:message_loop():75] message_loop has been closed
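For readability, the run configuration that wandb captured in the debug.log above can be rewritten as a big_vision-style `ml_collections.ConfigDict`. This is a hedged reconstruction from the logged fields only; the `get_config()` wrapper and the exact nesting are illustrative assumptions, not the repository's actual config file.

```python
# Hedged reconstruction of the run config recorded in debug.log above.
# All field names and values are copied from the logged config; wrapping them
# in a get_config() function is an assumption made purely for illustration.
import ml_collections


def get_config():
    config = ml_collections.ConfigDict()

    config.seed = 0
    config.total_epochs = 90
    config.num_classes = 1000
    config.loss = 'softmax_xent'

    # Input pipeline: tfds imagenet2012 train split with Inception crop,
    # horizontal flip, RandAugment(2, 10), and one-hot labels.
    config.input = dict(
        accum_freq=8,  # gradient accumulation factor
        batch_size=1024,
        cache_raw=False,
        shuffle_buffer_size=150_000,
        data=dict(name='imagenet2012', split='train'),
        pp=('decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)'
            '|value_range(-1, 1)'
            '|onehot(1000, key="label", key_result="labels")'
            '|keep("image", "labels")'),
    )
    config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']

    config.log_training_steps = 50
    config.ckpt_steps = 1000

    # ViT-S/16 with global average pooling and fixed sincos2d position embeddings.
    config.model_name = 'vit'
    config.model = dict(variant='S/16', pool_type='gap',
                        posemb='sincos2d', rep_size=False)

    # Adam with bfloat16 first moment, cosine decay, 10k warmup steps.
    config.grad_clip_norm = 1.0
    config.optax_name = 'scale_by_adam'
    config.optax = dict(mu_dtype='bfloat16')
    config.lr = 0.001
    config.wd = 0.0001
    config.schedule = dict(decay_type='cosine', warmup_steps=10_000)

    config.mixup = dict(p=0.2, fold_in=None)

    # Evaluation on the imagenet2012 validation split every 2500 steps.
    config.evals = dict(
        val=dict(
            type='classification',
            data=dict(name='imagenet2012', split='validation'),
            loss_name='softmax_xent',
            log_steps=2500,
            pp_fn=('decode|resize_small(256)|central_crop(224)|value_range(-1, 1)'
                   '|onehot(1000, key="label", key_result="labels")'
                   '|keep("image", "labels")'),
        ),
    )
    return config
```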
grafted/wandb/run-20241127_215015-torch-grafted-redux/run-torch-grafted-redux.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57dfc6760ef360b69a2d3a3f55620d815f8518b1f2ddc461bcb21aaf665973a9
3
+ size 115832137
grafted/wandb/wandb-resume.json ADDED
@@ -0,0 +1 @@
1
+ {"run_id": "torch-grafted-redux"}
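The wandb-resume.json above stores only the run id, which is how the "run resumed" line in debug.log picks up the existing `torch-grafted-redux` run. A minimal sketch of resuming against that id, assuming the entity/project names (`eify/mup-vit`) seen in the log URLs; the training script's actual call site is not part of this commit.

```python
# Minimal sketch (assumption, not the training script's actual code) of resuming
# the run whose id is stored in wandb-resume.json.
import json
import wandb

with open("grafted/wandb/wandb-resume.json") as f:
    run_id = json.load(f)["run_id"]  # "torch-grafted-redux"

run = wandb.init(
    entity="eify",
    project="mup-vit",
    id=run_id,
    resume="must",  # fail unless a run with this id already exists
)
```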