dayone3nder committed on
Commit 8e26f61 · verified · 1 Parent(s): 9148f38

Upload folder using huggingface_hub

Files changed (50)
  1. .gitattributes +5 -0
  2. arguments.yaml +66 -0
  3. environ.txt +235 -0
  4. script.sh +46 -0
  5. slice_end/added_tokens.json +24 -0
  6. slice_end/config.json +29 -0
  7. slice_end/merges.txt +0 -0
  8. slice_end/pytorch_model.bin +3 -0
  9. slice_end/special_tokens_map.json +31 -0
  10. slice_end/tokenizer.json +3 -0
  11. slice_end/tokenizer_config.json +209 -0
  12. slice_end/vocab.json +0 -0
  13. wandb/debug-internal.log +17 -0
  14. wandb/debug.log +29 -0
  15. wandb/run-20250404_234514-h2gynfll/files/config.yaml +109 -0
  16. wandb/run-20250404_234514-h2gynfll/files/output.log +221 -0
  17. wandb/run-20250404_234514-h2gynfll/files/requirements.txt +253 -0
  18. wandb/run-20250404_234514-h2gynfll/files/wandb-metadata.json +112 -0
  19. wandb/run-20250404_234514-h2gynfll/files/wandb-summary.json +1 -0
  20. wandb/run-20250404_234514-h2gynfll/logs/debug-core.log +16 -0
  21. wandb/run-20250404_234514-h2gynfll/logs/debug-internal.log +17 -0
  22. wandb/run-20250404_234514-h2gynfll/logs/debug.log +29 -0
  23. wandb/run-20250404_234514-h2gynfll/run-h2gynfll.wandb +3 -0
  24. wandb/run-20250405_124142-wdmxf5un/files/config.yaml +109 -0
  25. wandb/run-20250405_124142-wdmxf5un/files/output.log +115 -0
  26. wandb/run-20250405_124142-wdmxf5un/files/requirements.txt +253 -0
  27. wandb/run-20250405_124142-wdmxf5un/files/wandb-metadata.json +112 -0
  28. wandb/run-20250405_124142-wdmxf5un/files/wandb-summary.json +1 -0
  29. wandb/run-20250405_124142-wdmxf5un/logs/debug-core.log +16 -0
  30. wandb/run-20250405_124142-wdmxf5un/logs/debug-internal.log +17 -0
  31. wandb/run-20250405_124142-wdmxf5un/logs/debug.log +29 -0
  32. wandb/run-20250405_124142-wdmxf5un/run-wdmxf5un.wandb +3 -0
  33. wandb/run-20250405_153219-puqja889/files/config.yaml +109 -0
  34. wandb/run-20250405_153219-puqja889/files/output.log +63 -0
  35. wandb/run-20250405_153219-puqja889/files/requirements.txt +253 -0
  36. wandb/run-20250405_153219-puqja889/files/wandb-metadata.json +112 -0
  37. wandb/run-20250405_153219-puqja889/files/wandb-summary.json +1 -0
  38. wandb/run-20250405_153219-puqja889/logs/debug-core.log +16 -0
  39. wandb/run-20250405_153219-puqja889/logs/debug-internal.log +17 -0
  40. wandb/run-20250405_153219-puqja889/logs/debug.log +29 -0
  41. wandb/run-20250405_153219-puqja889/run-puqja889.wandb +3 -0
  42. wandb/run-20250405_203209-jla7fqqr/files/config.yaml +109 -0
  43. wandb/run-20250405_203209-jla7fqqr/files/output.log +63 -0
  44. wandb/run-20250405_203209-jla7fqqr/files/requirements.txt +253 -0
  45. wandb/run-20250405_203209-jla7fqqr/files/wandb-metadata.json +112 -0
  46. wandb/run-20250405_203209-jla7fqqr/files/wandb-summary.json +1 -0
  47. wandb/run-20250405_203209-jla7fqqr/logs/debug-core.log +16 -0
  48. wandb/run-20250405_203209-jla7fqqr/logs/debug-internal.log +17 -0
  49. wandb/run-20250405_203209-jla7fqqr/logs/debug.log +29 -0
  50. wandb/run-20250405_203209-jla7fqqr/run-jla7fqqr.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ slice_end/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20250404_234514-h2gynfll/run-h2gynfll.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20250405_124142-wdmxf5un/run-wdmxf5un.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20250405_153219-puqja889/run-puqja889.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20250405_203209-jla7fqqr/run-jla7fqqr.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml ADDED
@@ -0,0 +1,66 @@
+ bnb_cfgs:
+   bnb_4bit_compute_dtype: float16
+   bnb_4bit_quant_type: nf4
+   bnb_4bit_use_double_quant: true
+   load_in_4bit: true
+   load_in_8bit: false
+   use_bnb: false
+ data_cfgs:
+   eval_data_files: {}
+   eval_datasets: {}
+   eval_optional_args: []
+   eval_size: {}
+   eval_split: {}
+   eval_subset: {}
+   eval_template: {}
+   train_data_files: {}
+   train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
+   train_name: {}
+   train_optional_args: []
+   train_size: {}
+   train_split: train
+   train_template: Safe_thinking
+ logger_cfgs:
+   cache_dir: {}
+   log_project: safe-o1
+   log_run_name: sft
+   log_type: wandb
+   output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
+   save_interval: 100000
+ lora_cfgs:
+   inference_mode: false
+   lora_alpha: 16
+   lora_dropout: 0.1
+   r: 16
+   save_full_model: true
+   target_modules:
+   - q_proj
+   - v_proj
+   task_type: TaskType.CAUSAL_LM
+   use_lora: false
+ model_cfgs:
+   model_max_length: 16384
+   model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
+   trust_remote_code: true
+ special_tokens: {}
+ train_cfgs:
+   adam_betas:
+   - 0.9
+   - 0.95
+   adam_epsilon: 1.0e-08
+   bf16: true
+   ds_cfgs: ds_z3_config.json
+   epochs: 3
+   eval_interval: 10
+   eval_strategy: steps
+   fp16: false
+   gradient_accumulation_steps: 2
+   gradient_checkpointing: true
+   learning_rate: 2.0e-05
+   lr_scheduler_type: constant
+   lr_warmup_ratio: 0.03
+   max_grad_norm: 1.0
+   per_device_eval_batch_size: 4
+   per_device_train_batch_size: 4
+   seed: 42
+   weight_decay: 0.0
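For orientation, this YAML is the resolved training configuration for the run. A minimal sketch of inspecting it, assuming PyYAML is installed and arguments.yaml is read from a local checkout of this repo (the snippet is illustrative, not part of the original commit):

# Hedged sketch: load arguments.yaml and read a few of the fields shown above.
# Assumes PyYAML is installed and the file sits in the current directory.
import yaml

with open("arguments.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model_cfgs"]["model_name_or_path"])
print(cfg["train_cfgs"]["learning_rate"], cfg["train_cfgs"]["epochs"])  # 2e-05 3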
environ.txt ADDED
@@ -0,0 +1,235 @@
1
+ ADDR2LINE=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-addr2line
2
+ AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ar
3
+ AS=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-as
4
+ BASH_FUNC__module_raw%%=() { unset _mlshdbg;
5
+ if [ "${MODULES_SILENT_SHELL_DEBUG:-0}" = '1' ]; then
6
+ case "$-" in
7
+ *v*x*)
8
+ set +vx;
9
+ _mlshdbg='vx'
10
+ ;;
11
+ *v*)
12
+ set +v;
13
+ _mlshdbg='v'
14
+ ;;
15
+ *x*)
16
+ set +x;
17
+ _mlshdbg='x'
18
+ ;;
19
+ *)
20
+ _mlshdbg=''
21
+ ;;
22
+ esac;
23
+ fi;
24
+ unset _mlre _mlIFS;
25
+ if [ -n "${IFS+x}" ]; then
26
+ _mlIFS=$IFS;
27
+ fi;
28
+ IFS=' ';
29
+ for _mlv in ${MODULES_RUN_QUARANTINE:-};
30
+ do
31
+ if [ "${_mlv}" = "${_mlv##*[!A-Za-z0-9_]}" -a "${_mlv}" = "${_mlv#[0-9]}" ]; then
32
+ if [ -n "`eval 'echo ${'$_mlv'+x}'`" ]; then
33
+ _mlre="${_mlre:-}${_mlv}_modquar='`eval 'echo ${'$_mlv'}'`' ";
34
+ fi;
35
+ _mlrv="MODULES_RUNENV_${_mlv}";
36
+ _mlre="${_mlre:-}${_mlv}='`eval 'echo ${'$_mlrv':-}'`' ";
37
+ fi;
38
+ done;
39
+ if [ -n "${_mlre:-}" ]; then
40
+ eval `eval ${_mlre} /usr/bin/tclsh /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl bash '"$@"'`;
41
+ else
42
+ eval `/usr/bin/tclsh /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl bash "$@"`;
43
+ fi;
44
+ _mlstatus=$?;
45
+ if [ -n "${_mlIFS+x}" ]; then
46
+ IFS=$_mlIFS;
47
+ else
48
+ unset IFS;
49
+ fi;
50
+ unset _mlre _mlv _mlrv _mlIFS;
51
+ if [ -n "${_mlshdbg:-}" ]; then
52
+ set -$_mlshdbg;
53
+ fi;
54
+ unset _mlshdbg;
55
+ return $_mlstatus
56
+ }
57
+ BASH_FUNC_ml%%=() { module ml "$@"
58
+ }
59
+ BASH_FUNC_module%%=() { _module_raw "$@" 2>&1
60
+ }
61
+ BASH_FUNC_switchml%%=() { typeset swfound=1;
62
+ if [ "${MODULES_USE_COMPAT_VERSION:-0}" = '1' ]; then
63
+ typeset swname='main';
64
+ if [ -e /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl ]; then
65
+ typeset swfound=0;
66
+ unset MODULES_USE_COMPAT_VERSION;
67
+ fi;
68
+ else
69
+ typeset swname='compatibility';
70
+ if [ -e /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd-compat ]; then
71
+ typeset swfound=0;
72
+ MODULES_USE_COMPAT_VERSION=1;
73
+ export MODULES_USE_COMPAT_VERSION;
74
+ fi;
75
+ fi;
76
+ if [ $swfound -eq 0 ]; then
77
+ echo "Switching to Modules $swname version";
78
+ source /cm/local/apps/environment-modules/4.5.3/init/bash;
79
+ else
80
+ echo "Cannot switch to Modules $swname version, command not found";
81
+ return 1;
82
+ fi
83
+ }
84
+ BUILD=x86_64-conda-linux-gnu
85
+ CC=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-cc
86
+ CC_FOR_BUILD=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-cc
87
+ CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include -I/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/include -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib/stubs
88
+ CMAKE_ARGS=-DCMAKE_AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ar -DCMAKE_CXX_COMPILER_AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ar -DCMAKE_C_COMPILER_AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ar -DCMAKE_RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ranlib -DCMAKE_CXX_COMPILER_RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ranlib -DCMAKE_C_COMPILER_RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ranlib -DCMAKE_LINKER=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ld -DCMAKE_STRIP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-strip -DCMAKE_BUILD_TYPE=Release
89
+ CMAKE_PREFIX_PATH=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl:/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/x86_64-conda-linux-gnu/sysroot/usr
90
+ CMD_WLM_CLUSTER_NAME=slurm
91
+ CONDA_BUILD_SYSROOT=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/x86_64-conda-linux-gnu/sysroot
92
+ CONDA_DEFAULT_ENV=wenqi_qwen2vl
93
+ CONDA_EXE=/aifs4su/yaodong/miniconda3/bin/conda
94
+ CONDA_PREFIX=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
95
+ CONDA_PREFIX_1=/aifs4su/yaodong/miniconda3
96
+ CONDA_PREFIX_10=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
97
+ CONDA_PREFIX_11=/aifs4su/yaodong/miniconda3
98
+ CONDA_PREFIX_12=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
99
+ CONDA_PREFIX_13=/aifs4su/yaodong/miniconda3
100
+ CONDA_PREFIX_14=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
101
+ CONDA_PREFIX_15=/aifs4su/yaodong/miniconda3
102
+ CONDA_PREFIX_16=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
103
+ CONDA_PREFIX_17=/aifs4su/yaodong/miniconda3
104
+ CONDA_PREFIX_18=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
105
+ CONDA_PREFIX_19=/aifs4su/yaodong/miniconda3
106
+ CONDA_PREFIX_2=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
107
+ CONDA_PREFIX_3=/aifs4su/yaodong/miniconda3
108
+ CONDA_PREFIX_4=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
109
+ CONDA_PREFIX_5=/aifs4su/yaodong/miniconda3
110
+ CONDA_PREFIX_6=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
111
+ CONDA_PREFIX_7=/aifs4su/yaodong/miniconda3
112
+ CONDA_PREFIX_8=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
113
+ CONDA_PREFIX_9=/aifs4su/yaodong/miniconda3
114
+ CONDA_PROMPT_MODIFIER=(wenqi_qwen2vl)
115
+ CONDA_PYTHON_EXE=/aifs4su/yaodong/miniconda3/bin/python
116
+ CONDA_SHLVL=20
117
+ CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
118
+ CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
119
+ CPATH=/cm/shared/apps/slurm/current/include
120
+ CPATH_modshare=/cm/shared/apps/slurm/current/include:1
121
+ CPP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-cpp
122
+ CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include -I/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/include -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib/stubs
123
+ CROSS_RANK=0
124
+ CROSS_SIZE=1
125
+ CUDA_MODULE_LOADING=LAZY
126
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
127
+ CXX=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-c++
128
+ CXXFILT=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-c++filt
129
+ CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include -I/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/include -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib/stubs
130
+ CXX_FOR_BUILD=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-c++
131
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1028/bus
132
+ DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include
133
+ DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include
134
+ DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include
135
+ DISABLE_VERSION_CHECK=1
136
+ DWP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-dwp
137
+ ELFEDIT=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-elfedit
138
+ ENABLE_LMOD=0
139
+ GCC=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc
140
+ GCC_AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ar
141
+ GCC_NM=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-nm
142
+ GCC_RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ranlib
143
+ GPROF=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gprof
144
+ GSETTINGS_SCHEMA_DIR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/share/glib-2.0/schemas
145
+ GSETTINGS_SCHEMA_DIR_CONDA_BACKUP=
146
+ GXX=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-g++
147
+ HF_DATASETS_CACHE=/aifs4su/yaodong/.cache/huggingface/datasets
148
+ HF_HOME=/aifs4su/yaodong/.cache/huggingface
149
+ HISTTIMEFORMAT=%y/%m/%d %T
150
+ HOME=/home/yangyaodong
151
+ HOST=x86_64-conda-linux-gnu
152
+ KMP_DUPLICATE_LIB_OK=True
153
+ KMP_INIT_AT_FORK=FALSE
154
+ LANG=C.UTF-8
155
+ LD=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ld
156
+ LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/lib -Wl,-rpath-link,/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib/stubs
157
+ LD_GOLD=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ld.gold
158
+ LD_LIBRARY_PATH=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/lib/python3.11/site-packages/cv2/../../lib64:/usr/mpi/gcc/openmpi-4.1.7a1/lib:/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
159
+ LD_LIBRARY_PATH_modshare=/cm/shared/apps/slurm/current/lib64:1:/usr/mpi/gcc/openmpi-4.1.7a1/lib:1:/cm/shared/apps/slurm/current/lib64/slurm:1
160
+ LD_RUN_PATH=/usr/mpi/gcc/openmpi-4.1.7a1/lib
161
+ LD_RUN_PATH_modshare=/usr/mpi/gcc/openmpi-4.1.7a1/lib:1
162
+ LESSCLOSE=/usr/bin/lesspipe %s %s
163
+ LESSOPEN=| /usr/bin/lesspipe %s
164
+ LIBRARY_PATH=/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
165
+ LIBRARY_PATH_modshare=/cm/shared/apps/slurm/current/lib64:1:/cm/shared/apps/slurm/current/lib64/slurm:1
166
+ LOADEDMODULES=slurm/slurm/23.02.6:gcc/64/4.1.7a1
167
+ LOADEDMODULES_modshare=slurm/slurm/23.02.6:1:gcc/64/4.1.7a1:1
168
+ LOCAL_RANK=0
169
+ LOCAL_SIZE=8
170
+ LOGLEVEL=WARNING
171
+ LOGNAME=yangyaodong
172
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
173
+ MANPATH=/usr/mpi/gcc/openmpi-4.1.7a1/share/man:/cm/shared/apps/slurm/current/man:/cm/local/apps/environment-modules/4.5.3/share/man:/usr/local/man:/usr/local/share/man:/usr/share/man:/cm/local/apps/environment-modules/current/share/man:/cm/local/apps/environment-modules/current/share/man
174
+ MANPATH_modshare=/usr/local/share/man:1:/usr/mpi/gcc/openmpi-4.1.7a1/share/man:1:/cm/local/apps/environment-modules/current/share/man:1:/cm/local/apps/environment-modules/4.5.3/share/man:1:/usr/local/man:1:/usr/share/man:1:/cm/shared/apps/slurm/current/man:1
175
+ MASTER_ADDR=127.0.0.1
176
+ MASTER_PORT=47506
177
+ MESON_ARGS=-Dbuildtype=release
178
+ MIG_PARTED_CHECKPOINT_FILE=/var/lib/nvidia-mig-manager/checkpoint.json
179
+ MIG_PARTED_CONFIG_FILE=/etc/nvidia-mig-manager/config.yaml
180
+ MIG_PARTED_HOOKS_FILE=/etc/nvidia-mig-manager/hooks.yaml
181
+ MODULEPATH=/cm/local/modulefiles:/cm/shared/modulefiles
182
+ MODULESHOME=/cm/local/apps/environment-modules/4.5.3
183
+ MODULES_CMD=/cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl
184
+ MODULES_SET_SHELL_STARTUP=0
185
+ MOTD_SHOWN=pam
186
+ MPI_HOME=/usr/mpi/gcc/openmpi-4.1.7a1
187
+ MPI_RUN=/usr/mpi/gcc/openmpi-4.1.7a1/bin/mpirun
188
+ NM=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-nm
189
+ NVCC_PREPEND_FLAGS= -ccbin=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3/bin/x86_64-conda-linux-gnu-c++ -ccbin=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-c++
190
+ NVCC_PREPEND_FLAGS_BACKUP= -ccbin=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3/bin/x86_64-conda-linux-gnu-c++
191
+ NVITOP_MONITOR_MODE=colorful
192
+ OBJCOPY=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-objcopy
193
+ OBJDUMP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-objdump
194
+ OLDPWD=/aifs4su/yaodong/wenqi/projects/safe_o1_evaluation/deception/LLaMA-Factory
195
+ PATH=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin:/aifs4su/yaodong/miniconda3/condabin:/usr/lpp/mmfs/bin:/usr/local/cuda/bin:/opt/bin:/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3/bin:/aifs4su/yaodong/miniconda3/condabin:/usr/mpi/gcc/openmpi-4.1.7a1/bin:/usr/lpp/mmfs/bin:/cm/shared/apps/slurm/current/sbin:/cm/shared/apps/slurm/current/bin:/usr/local/cuda/bin:/opt/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/sbin:/usr/sbin:/cm/local/apps/environment-modules/4.5.3/bin
196
+ PATH_modshare=/usr/mpi/gcc/openmpi-4.1.7a1/bin:2:/usr/bin:1:/opt/bin/:1:/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3/bin:1:/opt/bin:1:/usr/local/bin:1:/cm/shared/apps/slurm/current/bin:1:/cm/shared/apps/slurm/current/sbin:1:/bin:1:/snap/bin:1:/sbin:1:/usr/sbin:1:/usr/games:1:/cm/local/apps/environment-modules/4.5.3/bin:1:/usr/local/sbin:1:/usr/lpp/mmfs/bin:1:/aifs4su/yaodong/miniconda3/condabin:1:/usr/local/cuda/bin:1:/usr/local/games:1
197
+ PWD=/aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts
198
+ PYTHONHASHSEED=42
199
+ PYTHONPATH=/aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts
200
+ RANK=0
201
+ RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ranlib
202
+ READELF=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-readelf
203
+ SHELL=/bin/bash
204
+ SHLVL=13
205
+ SIZE=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-size
206
+ SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf
207
+ SSH_CLIENT=10.33.4.232 49200 22
208
+ SSH_CONNECTION=10.33.4.76 36746 10.33.4.229 22
209
+ SSH_TTY=/dev/pts/0
210
+ STRINGS=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-strings
211
+ STRIP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-strip
212
+ TERM=screen
213
+ TERM_PROGRAM=tmux
214
+ TERM_PROGRAM_VERSION=3.2a
215
+ TMUX=/tmp/tmux-1028/default,2884537,5
216
+ TMUX_PANE=%5
217
+ USER=yangyaodong
218
+ WANDB_API_KEY=62c57a07add7cf80060d09b29e313990bc2fada2
219
+ WANDB_SERVICE=2-2888806-tcp-localhost-44473
220
+ WORLD_SIZE=8
221
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
222
+ XDG_RUNTIME_DIR=/run/user/1028
223
+ XDG_SESSION_CLASS=user
224
+ XDG_SESSION_ID=60916
225
+ XDG_SESSION_TYPE=tty
226
+ XML_CATALOG_FILES=file:///aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/etc/xml/catalog file:///etc/xml/catalog
227
+ ZERO_STAGE=3
228
+ _=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/deepspeed
229
+ _CE_CONDA=
230
+ _CE_M=
231
+ _CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
232
+ _LMFILES_=/cm/local/modulefiles/slurm/slurm/23.02.6:/cm/local/modulefiles/gcc/64/4.1.7a1
233
+ _LMFILES__modshare=/cm/local/modulefiles/slurm/slurm/23.02.6:1:/cm/local/modulefiles/gcc/64/4.1.7a1:1
234
+ build_alias=x86_64-conda-linux-gnu
235
+ host_alias=x86_64-conda-linux-gnu
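Among the variables above, MASTER_ADDR, MASTER_PORT, RANK, LOCAL_RANK and WORLD_SIZE are the ones the deepspeed launcher exports to each worker. A minimal sketch of how a worker typically consumes them via the standard env:// initialization (illustrative only, not the project's actual startup code):

# Hedged sketch: initialize torch.distributed from the launcher-provided
# environment (env:// reads MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE).
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ.get("LOCAL_RANK", "0"))
torch.cuda.set_device(local_rank)
dist.init_process_group(backend="nccl", init_method="env://")
print(f"rank {dist.get_rank()}/{dist.get_world_size()} -> cuda:{local_rank}")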
script.sh ADDED
@@ -0,0 +1,46 @@
+ #!/bin/bash
+
+ MODELS_TO_TRAIN=(
+     "Qwen2.5-7B-Instruct"
+     # "Llama-3.2-3B"
+     # "Llama-3.2-1B"
+
+ )
+
+ export WANDB_API_KEY="62c57a07add7cf80060d09b29e313990bc2fada2"
+
+ for MODEL in "${MODELS_TO_TRAIN[@]}"; do
+     echo "Starting training for model: ${MODEL}"
+
+     # MODEL_NAME_OR_PATH="/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/${MODEL}-base/slice_end"
+     # Second training phase
+     MODEL_NAME_OR_PATH="/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct"
+     TRAIN_DATASETS="/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset"
+     OUTPUT_DIR="/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/${MODEL}_safe_thinking"
+     TRAIN_TEMPLATE="Safe_thinking"
+     LOG_PROJECT="safe-o1"
+
+     source ./setup.sh
+
+     deepspeed \
+         --master_port ${MASTER_PORT} \
+         --module align_anything.trainers.text_to_text.sft \
+         --model_name_or_path ${MODEL_NAME_OR_PATH} \
+         --train_datasets ${TRAIN_DATASETS} \
+         --train_split train \
+         --train_template ${TRAIN_TEMPLATE} \
+         --output_dir ${OUTPUT_DIR} \
+         --log_project ${LOG_PROJECT} \
+         --per_device_train_batch_size 4 \
+         --per_device_eval_batch_size 4 \
+         --gradient_accumulation_steps 2 \
+         --learning_rate 2e-5 \
+         --epochs 3 \
+         --model_max_length 16384
+
+     echo "Completed second phase training for ${MODEL}"
+ done
+
+ echo "All model training completed!"
slice_end/added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
slice_end/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_attn_implementation_autoset": true,
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen2",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "pad_token_id": 151643,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": 131072,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.50.0.dev0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 152064
+ }
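This is a standard Qwen2 causal-LM config, so the checkpoint directory can be opened with plain transformers. A minimal sketch, assuming the slice_end/ folder from this commit is available locally (not part of the original upload):

# Hedged sketch: load the slice_end/ checkpoint with Hugging Face transformers.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("slice_end")
print(config.model_type, config.hidden_size, config.num_hidden_layers)  # qwen2 3584 28

model = AutoModelForCausalLM.from_pretrained("slice_end", torch_dtype=torch.bfloat16)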
slice_end/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
slice_end/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30e1baecb16745b34dfc93ec163c21e0ef68e4af1237bcbf75727cf6a133d5fd
+ size 15231345338
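Note that these three lines are only a Git LFS pointer; the roughly 15 GB weight file itself lives in LFS storage. A minimal sketch of fetching it with huggingface_hub, where "user/repo" is a placeholder repo_id and not taken from this commit:

# Hedged sketch: download the LFS-backed weights via huggingface_hub.
# "user/repo" is a placeholder repo_id, not part of this commit.
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="user/repo", filename="slice_end/pytorch_model.bin")
print(path)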
slice_end/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
slice_end/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb8138e837fbbd50932cdb31eddc0832738f665fd265cb87ab5e5628b5eebe30
+ size 11421996
slice_end/tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 16384,
204
+ "pad_token": "<|endoftext|>",
205
+ "padding_side": "right",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null
209
+ }
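The chat_template above is the ChatML-style Qwen2 template (<|im_start|>/<|im_end|> markers plus optional tool-call blocks). A minimal sketch of rendering a prompt with it through transformers, assuming the slice_end/ directory is available locally:

# Hedged sketch: load the tokenizer saved in slice_end/ and apply the
# chat_template shown above to a single user message.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("slice_end")
messages = [{"role": "user", "content": "Hello!"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # ends with "<|im_start|>assistant\n"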
slice_end/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug-internal.log ADDED
@@ -0,0 +1,17 @@
1
+ {"time":"2025-04-05T20:32:09.145104832+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
2
+ {"time":"2025-04-05T20:32:09.145234633+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug-core.log"}
3
+ {"time":"2025-04-05T20:32:09.359661673+08:00","level":"INFO","msg":"created new stream","id":"jla7fqqr"}
4
+ {"time":"2025-04-05T20:32:09.359700555+08:00","level":"INFO","msg":"stream: started","id":"jla7fqqr"}
5
+ {"time":"2025-04-05T20:32:09.35975566+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"jla7fqqr"}
6
+ {"time":"2025-04-05T20:32:09.359831663+08:00","level":"INFO","msg":"handler: started","stream_id":"jla7fqqr"}
7
+ {"time":"2025-04-05T20:32:09.35975831+08:00","level":"INFO","msg":"sender: started","stream_id":"jla7fqqr"}
8
+ {"time":"2025-04-05T20:32:09.688023993+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-04-05T20:45:27.076637312+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-04-05T20:45:27.077489476+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-04-05T20:45:28.038487853+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading config.yaml","runtime_seconds":0.283111243,"progress":"2.7KB/2.7KB"},{"desc":"uploading output.log","runtime_seconds":0.283100079,"progress":"10.8KB/10.8KB"}],"total_operations":2}}
12
+ {"time":"2025-04-05T20:45:28.204441985+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-04-05T20:45:29.440112056+08:00","level":"INFO","msg":"stream: closing","id":"jla7fqqr"}
14
+ {"time":"2025-04-05T20:45:29.440138846+08:00","level":"INFO","msg":"handler: closed","stream_id":"jla7fqqr"}
15
+ {"time":"2025-04-05T20:45:29.440146259+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"jla7fqqr"}
16
+ {"time":"2025-04-05T20:45:29.440285075+08:00","level":"INFO","msg":"sender: closed","stream_id":"jla7fqqr"}
17
+ {"time":"2025-04-05T20:45:29.440378039+08:00","level":"INFO","msg":"stream: closed","id":"jla7fqqr"}
wandb/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2025-04-05 20:32:09,135 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
2
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Configure stats pid to 2888806
3
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
5
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from environment variables
6
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug.log
7
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug-internal.log
8
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():644] calling init triggers
9
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_thinking', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
11
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():680] starting backend
12
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():684] sending inform_init request
13
+ 2025-04-05 20:32:09,141 INFO MainThread:2888806 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-05 20:32:09,142 INFO MainThread:2888806 [wandb_init.py:init():697] backend started and connected
15
+ 2025-04-05 20:32:09,143 INFO MainThread:2888806 [wandb_init.py:init():790] updated telemetry
16
+ 2025-04-05 20:32:09,162 INFO MainThread:2888806 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
17
+ 2025-04-05 20:32:09,682 INFO MainThread:2888806 [wandb_init.py:init():874] starting run threads in backend
18
+ 2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_console_start():2374] atexit reg
19
+ 2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2224] redirect: wrap_raw
20
+ 2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2289] Wrapping output streams.
21
+ 2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2314] Redirects installed.
22
+ 2025-04-05 20:32:10,112 INFO MainThread:2888806 [wandb_init.py:init():916] run started, returning control to user process
23
+ 2025-04-05 20:45:27,036 INFO MainThread:2888806 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/jla7fqqr
24
+ 2025-04-05 20:45:27,036 INFO MainThread:2888806 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
25
+ 2025-04-05 20:45:27,037 INFO MainThread:2888806 [wandb_run.py:_restore():2321] restore
26
+ 2025-04-05 20:45:27,037 INFO MainThread:2888806 [wandb_run.py:_restore():2327] restore done
27
+ 2025-04-05 20:45:29,432 INFO MainThread:2888806 [wandb_run.py:_footer_history_summary_info():3892] rendering history
28
+ 2025-04-05 20:45:29,433 INFO MainThread:2888806 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
29
+ 2025-04-05 20:45:29,439 INFO MainThread:2888806 [wandb_run.py:_footer_sync_info():3853] logging synced files
wandb/run-20250404_234514-h2gynfll/files/config.yaml ADDED
@@ -0,0 +1,109 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.1
4
+ m: []
5
+ python_version: 3.11.0
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 41
12
+ - 49
13
+ - 51
14
+ - 53
15
+ - 55
16
+ - 63
17
+ - 71
18
+ - 83
19
+ - 98
20
+ - 105
21
+ "2":
22
+ - 1
23
+ - 5
24
+ - 11
25
+ - 41
26
+ - 49
27
+ - 51
28
+ - 53
29
+ - 55
30
+ - 63
31
+ - 71
32
+ - 83
33
+ - 98
34
+ - 105
35
+ "3":
36
+ - 2
37
+ - 13
38
+ - 16
39
+ - 23
40
+ - 55
41
+ - 61
42
+ "4": 3.11.0
43
+ "5": 0.19.1
44
+ "6": 4.50.0.dev0
45
+ "8":
46
+ - 5
47
+ "12": 0.19.1
48
+ "13": linux-x86_64
49
+ bnb_cfgs:
50
+ value:
51
+ bnb_4bit_compute_dtype: float16
52
+ bnb_4bit_quant_type: nf4
53
+ bnb_4bit_use_double_quant: true
54
+ load_in_4bit: true
55
+ load_in_8bit: false
56
+ use_bnb: false
57
+ data_cfgs:
58
+ value:
59
+ eval_optional_args: []
60
+ train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
61
+ train_optional_args: []
62
+ train_split: train
63
+ train_template: Safe_o1
64
+ logger_cfgs:
65
+ value:
66
+ log_project: safe-o1
67
+ log_run_name: sft
68
+ log_type: wandb
69
+ output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
70
+ save_interval: 100000
71
+ lora_cfgs:
72
+ value:
73
+ inference_mode: false
74
+ lora_alpha: 16
75
+ lora_dropout: 0.1
76
+ r: 16
77
+ save_full_model: true
78
+ target_modules:
79
+ - q_proj
80
+ - v_proj
81
+ task_type: TaskType.CAUSAL_LM
82
+ use_lora: false
83
+ model_cfgs:
84
+ value:
85
+ model_max_length: 16384
86
+ model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
87
+ trust_remote_code: true
88
+ train_cfgs:
89
+ value:
90
+ adam_betas:
91
+ - 0.9
92
+ - 0.95
93
+ adam_epsilon: 1e-08
94
+ bf16: true
95
+ ds_cfgs: ds_z3_config.json
96
+ epochs: 6
97
+ eval_interval: 10
98
+ eval_strategy: steps
99
+ fp16: false
100
+ gradient_accumulation_steps: 2
101
+ gradient_checkpointing: true
102
+ learning_rate: 2e-05
103
+ lr_scheduler_type: constant
104
+ lr_warmup_ratio: 0.03
105
+ max_grad_norm: 1
106
+ per_device_eval_batch_size: 4
107
+ per_device_train_batch_size: 4
108
+ seed: 42
109
+ weight_decay: 0
wandb/run-20250404_234514-h2gynfll/files/output.log ADDED
@@ -0,0 +1,221 @@
1
+ ***** Running training *****
2
+ Training 1/6 epoch: 0%| | 0/2112 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 1/6 epoch (loss 0.9676): 15%|███████████████████████▊ | 319/2112 [08:11<46:47, 1.57s/it]
4
+ [2025-04-04 23:45:49,795] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
5
+ [2025-04-04 23:45:49,796] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=10.722716430728992, CurrSamplesPerSec=9.564486815227898, MemAllocated=29.37GB, MaxMemAllocated=51.13GB
6
+ [2025-04-04 23:46:21,396] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
7
+ [2025-04-04 23:46:21,397] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=10.61486173550022, CurrSamplesPerSec=11.387013476257202, MemAllocated=29.37GB, MaxMemAllocated=51.13GB
8
+ [2025-04-04 23:46:52,750] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
9
+ [2025-04-04 23:46:52,750] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=10.607628001438231, CurrSamplesPerSec=13.11193874390518, MemAllocated=29.37GB, MaxMemAllocated=52.29GB
10
+ [2025-04-04 23:47:20,658] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
11
+ [2025-04-04 23:47:20,659] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=10.929159623513739, CurrSamplesPerSec=9.881152216939991, MemAllocated=29.37GB, MaxMemAllocated=52.29GB
12
+ [2025-04-04 23:47:53,176] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
13
+ [2025-04-04 23:47:53,177] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=10.769232617254287, CurrSamplesPerSec=10.506886069158746, MemAllocated=29.37GB, MaxMemAllocated=52.29GB
14
+ [2025-04-04 23:48:25,187] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
15
+ [2025-04-04 23:48:25,188] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=10.706402500792887, CurrSamplesPerSec=13.256381944370643, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
16
+ [2025-04-04 23:48:54,900] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
17
+ [2025-04-04 23:48:54,901] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=10.771832584979517, CurrSamplesPerSec=11.234739539724892, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
18
+ [2025-04-04 23:49:25,278] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
19
+ [2025-04-04 23:49:25,279] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=10.79860773669107, CurrSamplesPerSec=11.809016383898683, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
20
+ [2025-04-04 23:49:56,100] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
21
+ [2025-04-04 23:49:56,100] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=10.796911835100996, CurrSamplesPerSec=11.795726899427471, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
22
+ [2025-04-04 23:50:26,690] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
23
+ [2025-04-04 23:50:26,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=10.81413404235532, CurrSamplesPerSec=12.486791572711763, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
24
+ [2025-04-04 23:50:56,559] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
25
+ [2025-04-04 23:50:56,560] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=10.845791469144872, CurrSamplesPerSec=11.503468048822956, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
26
+ [2025-04-04 23:51:26,233] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
27
+ [2025-04-04 23:51:26,234] [INFO] [timer.py:264:stop] epoch=0/micro_step=240/global_step=120, RunningAvgSamplesPerSec=10.871702151329506, CurrSamplesPerSec=12.054733265665433, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
28
+ [2025-04-04 23:51:56,110] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
29
+ [2025-04-04 23:51:56,110] [INFO] [timer.py:264:stop] epoch=0/micro_step=260/global_step=130, RunningAvgSamplesPerSec=10.890944356652495, CurrSamplesPerSec=11.35227930719023, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
30
+ [2025-04-04 23:52:26,208] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
31
+ [2025-04-04 23:52:26,208] [INFO] [timer.py:264:stop] epoch=0/micro_step=280/global_step=140, RunningAvgSamplesPerSec=10.899459967950149, CurrSamplesPerSec=9.079894291769792, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
32
+ [2025-04-04 23:52:57,298] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
33
+ [2025-04-04 23:52:57,298] [INFO] [timer.py:264:stop] epoch=0/micro_step=300/global_step=150, RunningAvgSamplesPerSec=10.886001551743023, CurrSamplesPerSec=10.84443091570689, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
34
+ [2025-04-04 23:53:28,207] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
35
+ [2025-04-04 23:53:28,208] [INFO] [timer.py:264:stop] epoch=0/micro_step=320/global_step=160, RunningAvgSamplesPerSec=10.8748149413261, CurrSamplesPerSec=11.527765341507381, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
36
+ [2025-04-04 23:53:58,689] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
37
+ [2025-04-04 23:53:58,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=340/global_step=170, RunningAvgSamplesPerSec=10.88374807162155, CurrSamplesPerSec=10.619738627247132, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
38
+ [2025-04-04 23:54:29,570] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
39
+ [2025-04-04 23:54:29,571] [INFO] [timer.py:264:stop] epoch=1/micro_step=8/global_step=180, RunningAvgSamplesPerSec=10.875487421119185, CurrSamplesPerSec=12.660335656450862, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
40
+ [2025-04-04 23:55:00,249] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
41
+ [2025-04-04 23:55:00,250] [INFO] [timer.py:264:stop] epoch=1/micro_step=28/global_step=190, RunningAvgSamplesPerSec=10.872545347288018, CurrSamplesPerSec=11.075220259929674, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
42
+ [2025-04-04 23:55:30,213] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
43
+ [2025-04-04 23:55:30,213] [INFO] [timer.py:264:stop] epoch=1/micro_step=48/global_step=200, RunningAvgSamplesPerSec=10.882162387552667, CurrSamplesPerSec=11.73534073720176, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
44
+ [2025-04-04 23:56:00,688] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
45
+ [2025-04-04 23:56:00,689] [INFO] [timer.py:264:stop] epoch=1/micro_step=68/global_step=210, RunningAvgSamplesPerSec=10.882240702304028, CurrSamplesPerSec=13.440744149207571, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
46
+ [2025-04-04 23:56:31,300] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
47
+ [2025-04-04 23:56:31,300] [INFO] [timer.py:264:stop] epoch=1/micro_step=88/global_step=220, RunningAvgSamplesPerSec=10.878092074029748, CurrSamplesPerSec=10.428130570438391, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
48
+ [2025-04-04 23:57:04,592] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
49
+ [2025-04-04 23:57:04,592] [INFO] [timer.py:264:stop] epoch=1/micro_step=108/global_step=230, RunningAvgSamplesPerSec=10.833171978118708, CurrSamplesPerSec=10.846180971098232, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
50
+ [2025-04-04 23:57:32,933] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
51
+ [2025-04-04 23:57:32,933] [INFO] [timer.py:264:stop] epoch=1/micro_step=128/global_step=240, RunningAvgSamplesPerSec=10.867158335720406, CurrSamplesPerSec=9.61852783515831, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
52
+ [2025-04-04 23:58:03,491] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
53
+ [2025-04-04 23:58:03,492] [INFO] [timer.py:264:stop] epoch=1/micro_step=148/global_step=250, RunningAvgSamplesPerSec=10.866661161970612, CurrSamplesPerSec=12.195057132062441, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
54
+ [2025-04-04 23:58:33,526] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
55
+ [2025-04-04 23:58:33,526] [INFO] [timer.py:264:stop] epoch=1/micro_step=168/global_step=260, RunningAvgSamplesPerSec=10.873141761713997, CurrSamplesPerSec=10.176243052918215, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
56
+ [2025-04-04 23:59:05,507] [INFO] [logging.py:128:log_dist] [Rank 0] step=270, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
57
+ [2025-04-04 23:59:05,508] [INFO] [timer.py:264:stop] epoch=1/micro_step=188/global_step=270, RunningAvgSamplesPerSec=10.854559682524012, CurrSamplesPerSec=10.616882484742007, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
58
+ [2025-04-04 23:59:34,347] [INFO] [logging.py:128:log_dist] [Rank 0] step=280, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
59
+ [2025-04-04 23:59:34,347] [INFO] [timer.py:264:stop] epoch=1/micro_step=208/global_step=280, RunningAvgSamplesPerSec=10.878267585037369, CurrSamplesPerSec=9.115241933490342, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
60
+ [2025-04-05 00:00:05,154] [INFO] [logging.py:128:log_dist] [Rank 0] step=290, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
61
+ [2025-04-05 00:00:05,154] [INFO] [timer.py:264:stop] epoch=1/micro_step=228/global_step=290, RunningAvgSamplesPerSec=10.8751133915393, CurrSamplesPerSec=10.612563389272802, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
62
+ [2025-04-05 00:00:33,555] [INFO] [logging.py:128:log_dist] [Rank 0] step=300, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
63
+ [2025-04-05 00:00:33,555] [INFO] [timer.py:264:stop] epoch=1/micro_step=248/global_step=300, RunningAvgSamplesPerSec=10.901752001823205, CurrSamplesPerSec=11.59274166918304, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
64
+ [2025-04-05 00:01:03,565] [INFO] [logging.py:128:log_dist] [Rank 0] step=310, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
65
+ [2025-04-05 00:01:03,566] [INFO] [timer.py:264:stop] epoch=1/micro_step=268/global_step=310, RunningAvgSamplesPerSec=10.907293269923844, CurrSamplesPerSec=9.995996622494804, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
66
+ [2025-04-05 00:01:34,344] [INFO] [logging.py:128:log_dist] [Rank 0] step=320, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
67
+ [2025-04-05 00:01:34,345] [INFO] [timer.py:264:stop] epoch=1/micro_step=288/global_step=320, RunningAvgSamplesPerSec=10.907504943822554, CurrSamplesPerSec=11.022197256520487, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
68
+ [2025-04-05 00:02:07,406] [INFO] [logging.py:128:log_dist] [Rank 0] step=330, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
69
+ [2025-04-05 00:02:07,407] [INFO] [timer.py:264:stop] epoch=1/micro_step=308/global_step=330, RunningAvgSamplesPerSec=10.881103213238672, CurrSamplesPerSec=10.663874874443216, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
70
+ [2025-04-05 00:02:35,659] [INFO] [logging.py:128:log_dist] [Rank 0] step=340, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
71
+ [2025-04-05 00:02:35,659] [INFO] [timer.py:264:stop] epoch=1/micro_step=328/global_step=340, RunningAvgSamplesPerSec=10.907334658762007, CurrSamplesPerSec=13.854504112243248, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
72
+ [2025-04-05 00:03:07,329] [INFO] [logging.py:128:log_dist] [Rank 0] step=350, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
73
+ [2025-04-05 00:03:07,330] [INFO] [timer.py:264:stop] epoch=1/micro_step=348/global_step=350, RunningAvgSamplesPerSec=10.902276426516282, CurrSamplesPerSec=10.083614280472377, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
74
+ [2025-04-05 00:03:37,776] [INFO] [logging.py:128:log_dist] [Rank 0] step=360, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
75
+ [2025-04-05 00:03:37,776] [INFO] [timer.py:264:stop] epoch=2/micro_step=16/global_step=360, RunningAvgSamplesPerSec=10.904611092726073, CurrSamplesPerSec=11.53194905435867, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
76
+ [2025-04-05 00:04:09,459] [INFO] [logging.py:128:log_dist] [Rank 0] step=370, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
77
+ [2025-04-05 00:04:09,459] [INFO] [timer.py:264:stop] epoch=2/micro_step=36/global_step=370, RunningAvgSamplesPerSec=10.89630043473511, CurrSamplesPerSec=11.008557068097579, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
78
+ [2025-04-05 00:04:41,383] [INFO] [logging.py:128:log_dist] [Rank 0] step=380, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
79
+ [2025-04-05 00:04:41,383] [INFO] [timer.py:264:stop] epoch=2/micro_step=56/global_step=380, RunningAvgSamplesPerSec=10.884882705424278, CurrSamplesPerSec=9.811316896915017, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
80
+ [2025-04-05 00:05:08,841] [INFO] [logging.py:128:log_dist] [Rank 0] step=390, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
81
+ [2025-04-05 00:05:08,841] [INFO] [timer.py:264:stop] epoch=2/micro_step=76/global_step=390, RunningAvgSamplesPerSec=10.915433970019686, CurrSamplesPerSec=12.441151077420326, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
82
+ [2025-04-05 00:05:40,337] [INFO] [logging.py:128:log_dist] [Rank 0] step=400, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
83
+ [2025-04-05 00:05:40,338] [INFO] [timer.py:264:stop] epoch=2/micro_step=96/global_step=400, RunningAvgSamplesPerSec=10.905658171388385, CurrSamplesPerSec=12.03147003105065, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
84
+ [2025-04-05 00:06:13,617] [INFO] [logging.py:128:log_dist] [Rank 0] step=410, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
85
+ [2025-04-05 00:06:13,617] [INFO] [timer.py:264:stop] epoch=2/micro_step=116/global_step=410, RunningAvgSamplesPerSec=10.882479697107344, CurrSamplesPerSec=12.944935493618878, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
86
+ [2025-04-05 00:06:42,210] [INFO] [logging.py:128:log_dist] [Rank 0] step=420, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
87
+ [2025-04-05 00:06:42,210] [INFO] [timer.py:264:stop] epoch=2/micro_step=136/global_step=420, RunningAvgSamplesPerSec=10.902476276457532, CurrSamplesPerSec=11.251411338667834, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
88
+ [2025-04-05 00:07:13,632] [INFO] [logging.py:128:log_dist] [Rank 0] step=430, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
89
+ [2025-04-05 00:07:13,633] [INFO] [timer.py:264:stop] epoch=2/micro_step=156/global_step=430, RunningAvgSamplesPerSec=10.900374041873794, CurrSamplesPerSec=12.121469773864192, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
90
+ [2025-04-05 00:07:44,981] [INFO] [logging.py:128:log_dist] [Rank 0] step=440, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
91
+ [2025-04-05 00:07:44,981] [INFO] [timer.py:264:stop] epoch=2/micro_step=176/global_step=440, RunningAvgSamplesPerSec=10.896044041249382, CurrSamplesPerSec=10.324748615101747, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
92
+ [2025-04-05 00:08:17,623] [INFO] [logging.py:128:log_dist] [Rank 0] step=450, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
93
+ [2025-04-05 00:08:17,624] [INFO] [timer.py:264:stop] epoch=2/micro_step=196/global_step=450, RunningAvgSamplesPerSec=10.885513745657075, CurrSamplesPerSec=10.364115890586287, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
94
+ [2025-04-05 00:08:48,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=460, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
95
+ [2025-04-05 00:08:48,421] [INFO] [timer.py:264:stop] epoch=2/micro_step=216/global_step=460, RunningAvgSamplesPerSec=10.890322858413576, CurrSamplesPerSec=9.913507288254982, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
96
+ [2025-04-05 00:09:18,859] [INFO] [logging.py:128:log_dist] [Rank 0] step=470, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
97
+ [2025-04-05 00:09:18,860] [INFO] [timer.py:264:stop] epoch=2/micro_step=236/global_step=470, RunningAvgSamplesPerSec=10.897011731086295, CurrSamplesPerSec=11.582722357771235, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
98
+ [2025-04-05 00:09:48,991] [INFO] [logging.py:128:log_dist] [Rank 0] step=480, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
99
+ [2025-04-05 00:09:48,992] [INFO] [timer.py:264:stop] epoch=2/micro_step=256/global_step=480, RunningAvgSamplesPerSec=10.901656808546964, CurrSamplesPerSec=10.341647371016279, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
100
+ [2025-04-05 00:10:18,579] [INFO] [logging.py:128:log_dist] [Rank 0] step=490, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
101
+ [2025-04-05 00:10:18,580] [INFO] [timer.py:264:stop] epoch=2/micro_step=276/global_step=490, RunningAvgSamplesPerSec=10.90998996993192, CurrSamplesPerSec=12.032918655477928, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
102
+ [2025-04-05 00:10:50,691] [INFO] [logging.py:128:log_dist] [Rank 0] step=500, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
103
+ [2025-04-05 00:10:50,692] [INFO] [timer.py:264:stop] epoch=2/micro_step=296/global_step=500, RunningAvgSamplesPerSec=10.899989270395931, CurrSamplesPerSec=10.292938527296563, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
104
+ [2025-04-05 00:11:22,168] [INFO] [logging.py:128:log_dist] [Rank 0] step=510, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
105
+ [2025-04-05 00:11:22,169] [INFO] [timer.py:264:stop] epoch=2/micro_step=316/global_step=510, RunningAvgSamplesPerSec=10.89629369912652, CurrSamplesPerSec=10.290541044636166, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
106
+ [2025-04-05 00:11:51,929] [INFO] [logging.py:128:log_dist] [Rank 0] step=520, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
107
+ [2025-04-05 00:11:51,929] [INFO] [timer.py:264:stop] epoch=2/micro_step=336/global_step=520, RunningAvgSamplesPerSec=10.90288718797129, CurrSamplesPerSec=11.410733496183932, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
108
+ [2025-04-05 00:12:22,106] [INFO] [logging.py:128:log_dist] [Rank 0] step=530, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
109
+ [2025-04-05 00:12:22,107] [INFO] [timer.py:264:stop] epoch=3/micro_step=4/global_step=530, RunningAvgSamplesPerSec=10.906725677726158, CurrSamplesPerSec=9.671531033968664, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
110
+ [2025-04-05 00:12:52,977] [INFO] [logging.py:128:log_dist] [Rank 0] step=540, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
111
+ [2025-04-05 00:12:52,978] [INFO] [timer.py:264:stop] epoch=3/micro_step=24/global_step=540, RunningAvgSamplesPerSec=10.906210663844542, CurrSamplesPerSec=10.504433111716455, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
112
+ [2025-04-05 00:13:24,041] [INFO] [logging.py:128:log_dist] [Rank 0] step=550, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
113
+ [2025-04-05 00:13:24,041] [INFO] [timer.py:264:stop] epoch=3/micro_step=44/global_step=550, RunningAvgSamplesPerSec=10.902891825560632, CurrSamplesPerSec=8.900087581387572, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
114
+ [2025-04-05 00:13:55,658] [INFO] [logging.py:128:log_dist] [Rank 0] step=560, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
115
+ [2025-04-05 00:13:55,658] [INFO] [timer.py:264:stop] epoch=3/micro_step=64/global_step=560, RunningAvgSamplesPerSec=10.898118424070445, CurrSamplesPerSec=10.754342827399102, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
116
+ [2025-04-05 00:14:23,748] [INFO] [logging.py:128:log_dist] [Rank 0] step=570, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
117
+ [2025-04-05 00:14:23,748] [INFO] [timer.py:264:stop] epoch=3/micro_step=84/global_step=570, RunningAvgSamplesPerSec=10.916669469902244, CurrSamplesPerSec=9.810931771693125, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
118
+ [2025-04-05 00:14:58,721] [INFO] [logging.py:128:log_dist] [Rank 0] step=580, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
119
+ [2025-04-05 00:14:58,722] [INFO] [timer.py:264:stop] epoch=3/micro_step=104/global_step=580, RunningAvgSamplesPerSec=10.88957716223878, CurrSamplesPerSec=9.16694888207876, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
120
+ [2025-04-05 00:15:28,031] [INFO] [logging.py:128:log_dist] [Rank 0] step=590, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
121
+ [2025-04-05 00:15:28,031] [INFO] [timer.py:264:stop] epoch=3/micro_step=124/global_step=590, RunningAvgSamplesPerSec=10.899943327977292, CurrSamplesPerSec=12.39936093438136, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
122
+ [2025-04-05 00:15:58,906] [INFO] [logging.py:128:log_dist] [Rank 0] step=600, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
123
+ [2025-04-05 00:15:58,907] [INFO] [timer.py:264:stop] epoch=3/micro_step=144/global_step=600, RunningAvgSamplesPerSec=10.8988989794667, CurrSamplesPerSec=10.836493968610279, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
124
+ [2025-04-05 00:16:28,668] [INFO] [logging.py:128:log_dist] [Rank 0] step=610, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
125
+ [2025-04-05 00:16:28,668] [INFO] [timer.py:264:stop] epoch=3/micro_step=164/global_step=610, RunningAvgSamplesPerSec=10.903900774650129, CurrSamplesPerSec=11.612509861641884, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
126
+ [2025-04-05 00:17:00,382] [INFO] [logging.py:128:log_dist] [Rank 0] step=620, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
127
+ [2025-04-05 00:17:00,382] [INFO] [timer.py:264:stop] epoch=3/micro_step=184/global_step=620, RunningAvgSamplesPerSec=10.897520261368054, CurrSamplesPerSec=9.668412642725407, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
128
+ [2025-04-05 00:17:29,289] [INFO] [logging.py:128:log_dist] [Rank 0] step=630, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
129
+ [2025-04-05 00:17:29,290] [INFO] [timer.py:264:stop] epoch=3/micro_step=204/global_step=630, RunningAvgSamplesPerSec=10.908485349036773, CurrSamplesPerSec=11.284010097390484, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
130
+ [2025-04-05 00:18:00,269] [INFO] [logging.py:128:log_dist] [Rank 0] step=640, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
131
+ [2025-04-05 00:18:00,270] [INFO] [timer.py:264:stop] epoch=3/micro_step=224/global_step=640, RunningAvgSamplesPerSec=10.907192494678394, CurrSamplesPerSec=9.731917183842711, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
132
+ [2025-04-05 00:18:29,192] [INFO] [logging.py:128:log_dist] [Rank 0] step=650, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
133
+ [2025-04-05 00:18:29,192] [INFO] [timer.py:264:stop] epoch=3/micro_step=244/global_step=650, RunningAvgSamplesPerSec=10.91708252161445, CurrSamplesPerSec=10.408032976984966, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
134
+ [2025-04-05 00:18:58,578] [INFO] [logging.py:128:log_dist] [Rank 0] step=660, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
135
+ [2025-04-05 00:18:58,579] [INFO] [timer.py:264:stop] epoch=3/micro_step=264/global_step=660, RunningAvgSamplesPerSec=10.92412428365038, CurrSamplesPerSec=12.284003581583471, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
136
+ [2025-04-05 00:19:30,459] [INFO] [logging.py:128:log_dist] [Rank 0] step=670, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
137
+ [2025-04-05 00:19:30,460] [INFO] [timer.py:264:stop] epoch=3/micro_step=284/global_step=670, RunningAvgSamplesPerSec=10.916816649165908, CurrSamplesPerSec=9.322337913360691, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
138
+ [2025-04-05 00:20:02,133] [INFO] [logging.py:128:log_dist] [Rank 0] step=680, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
139
+ [2025-04-05 00:20:02,134] [INFO] [timer.py:264:stop] epoch=3/micro_step=304/global_step=680, RunningAvgSamplesPerSec=10.911318041413768, CurrSamplesPerSec=10.483330656587459, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
140
+ [2025-04-05 00:20:31,713] [INFO] [logging.py:128:log_dist] [Rank 0] step=690, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
141
+ [2025-04-05 00:20:31,714] [INFO] [timer.py:264:stop] epoch=3/micro_step=324/global_step=690, RunningAvgSamplesPerSec=10.917770882722586, CurrSamplesPerSec=11.774822290085138, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
142
+ [2025-04-05 00:21:02,282] [INFO] [logging.py:128:log_dist] [Rank 0] step=700, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
143
+ [2025-04-05 00:21:02,282] [INFO] [timer.py:264:stop] epoch=3/micro_step=344/global_step=700, RunningAvgSamplesPerSec=10.917551948544697, CurrSamplesPerSec=13.614658205589432, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
144
+ [2025-04-05 00:21:32,740] [INFO] [logging.py:128:log_dist] [Rank 0] step=710, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
145
+ [2025-04-05 00:21:32,740] [INFO] [timer.py:264:stop] epoch=4/micro_step=12/global_step=710, RunningAvgSamplesPerSec=10.9181337689355, CurrSamplesPerSec=12.613885955557196, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
146
+ [2025-04-05 00:22:04,544] [INFO] [logging.py:128:log_dist] [Rank 0] step=720, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
147
+ [2025-04-05 00:22:04,544] [INFO] [timer.py:264:stop] epoch=4/micro_step=32/global_step=720, RunningAvgSamplesPerSec=10.91412724964451, CurrSamplesPerSec=9.51134217019512, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
148
+ [2025-04-05 00:22:35,183] [INFO] [logging.py:128:log_dist] [Rank 0] step=730, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
149
+ [2025-04-05 00:22:35,184] [INFO] [timer.py:264:stop] epoch=4/micro_step=52/global_step=730, RunningAvgSamplesPerSec=10.91414884787545, CurrSamplesPerSec=10.138197103888123, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
150
+ [2025-04-05 00:23:04,475] [INFO] [logging.py:128:log_dist] [Rank 0] step=740, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
151
+ [2025-04-05 00:23:04,475] [INFO] [timer.py:264:stop] epoch=4/micro_step=72/global_step=740, RunningAvgSamplesPerSec=10.920729378294386, CurrSamplesPerSec=12.715673733635349, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
152
+ [2025-04-05 00:23:35,986] [INFO] [logging.py:128:log_dist] [Rank 0] step=750, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
153
+ [2025-04-05 00:23:35,986] [INFO] [timer.py:264:stop] epoch=4/micro_step=92/global_step=750, RunningAvgSamplesPerSec=10.915074902404724, CurrSamplesPerSec=10.352435836177008, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
154
+ [2025-04-05 00:24:09,924] [INFO] [logging.py:128:log_dist] [Rank 0] step=760, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
155
+ [2025-04-05 00:24:09,925] [INFO] [timer.py:264:stop] epoch=4/micro_step=112/global_step=760, RunningAvgSamplesPerSec=10.899539039695087, CurrSamplesPerSec=9.854495496659503, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
156
+ [2025-04-05 00:24:38,255] [INFO] [logging.py:128:log_dist] [Rank 0] step=770, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
157
+ [2025-04-05 00:24:38,255] [INFO] [timer.py:264:stop] epoch=4/micro_step=132/global_step=770, RunningAvgSamplesPerSec=10.911327821307937, CurrSamplesPerSec=9.34124368043991, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
158
+ [2025-04-05 00:25:08,832] [INFO] [logging.py:128:log_dist] [Rank 0] step=780, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
159
+ [2025-04-05 00:25:08,833] [INFO] [timer.py:264:stop] epoch=4/micro_step=152/global_step=780, RunningAvgSamplesPerSec=10.91183210830314, CurrSamplesPerSec=10.903172459516146, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
160
+ [2025-04-05 00:25:39,741] [INFO] [logging.py:128:log_dist] [Rank 0] step=790, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
161
+ [2025-04-05 00:25:39,741] [INFO] [timer.py:264:stop] epoch=4/micro_step=172/global_step=790, RunningAvgSamplesPerSec=10.91031485235279, CurrSamplesPerSec=9.60001863492875, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
162
+ [2025-04-05 00:26:10,132] [INFO] [logging.py:128:log_dist] [Rank 0] step=800, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
163
+ [2025-04-05 00:26:10,133] [INFO] [timer.py:264:stop] epoch=4/micro_step=192/global_step=800, RunningAvgSamplesPerSec=10.911486959949936, CurrSamplesPerSec=13.04129409549576, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
164
+ [2025-04-05 00:26:39,720] [INFO] [logging.py:128:log_dist] [Rank 0] step=810, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
165
+ [2025-04-05 00:26:39,720] [INFO] [timer.py:264:stop] epoch=4/micro_step=212/global_step=810, RunningAvgSamplesPerSec=10.916327699656112, CurrSamplesPerSec=10.58162272777036, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
166
+ [2025-04-05 00:27:09,446] [INFO] [logging.py:128:log_dist] [Rank 0] step=820, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
167
+ [2025-04-05 00:27:09,447] [INFO] [timer.py:264:stop] epoch=4/micro_step=232/global_step=820, RunningAvgSamplesPerSec=10.920561311076566, CurrSamplesPerSec=14.155525610526686, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
168
+ [2025-04-05 00:27:39,685] [INFO] [logging.py:128:log_dist] [Rank 0] step=830, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
169
+ [2025-04-05 00:27:39,686] [INFO] [timer.py:264:stop] epoch=4/micro_step=252/global_step=830, RunningAvgSamplesPerSec=10.923770509825303, CurrSamplesPerSec=11.454142227015383, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
170
+ [2025-04-05 00:28:09,402] [INFO] [logging.py:128:log_dist] [Rank 0] step=840, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
171
+ [2025-04-05 00:28:09,402] [INFO] [timer.py:264:stop] epoch=4/micro_step=272/global_step=840, RunningAvgSamplesPerSec=10.92759224936308, CurrSamplesPerSec=10.562603606097442, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
172
+ [2025-04-05 00:28:39,865] [INFO] [logging.py:128:log_dist] [Rank 0] step=850, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
173
+ [2025-04-05 00:28:39,865] [INFO] [timer.py:264:stop] epoch=4/micro_step=292/global_step=850, RunningAvgSamplesPerSec=10.929102932944096, CurrSamplesPerSec=10.551309022814346, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
174
+ [2025-04-05 00:29:12,678] [INFO] [logging.py:128:log_dist] [Rank 0] step=860, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
175
+ [2025-04-05 00:29:12,678] [INFO] [timer.py:264:stop] epoch=4/micro_step=312/global_step=860, RunningAvgSamplesPerSec=10.919878183184371, CurrSamplesPerSec=12.868895213136868, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
176
+ [2025-04-05 00:29:42,776] [INFO] [logging.py:128:log_dist] [Rank 0] step=870, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
177
+ [2025-04-05 00:29:42,777] [INFO] [timer.py:264:stop] epoch=4/micro_step=332/global_step=870, RunningAvgSamplesPerSec=10.923420364343865, CurrSamplesPerSec=10.234367011093408, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
178
+ [2025-04-05 00:30:12,278] [INFO] [logging.py:128:log_dist] [Rank 0] step=880, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
179
+ [2025-04-05 00:30:12,279] [INFO] [timer.py:264:stop] epoch=4/micro_step=352/global_step=880, RunningAvgSamplesPerSec=10.928288969968747, CurrSamplesPerSec=15.232539061735583, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
180
+ [2025-04-05 00:30:43,713] [INFO] [logging.py:128:log_dist] [Rank 0] step=890, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
181
+ [2025-04-05 00:30:43,713] [INFO] [timer.py:264:stop] epoch=5/micro_step=20/global_step=890, RunningAvgSamplesPerSec=10.925241705024948, CurrSamplesPerSec=9.550210083587169, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
182
+ [2025-04-05 00:31:14,999] [INFO] [logging.py:128:log_dist] [Rank 0] step=900, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
183
+ [2025-04-05 00:31:14,999] [INFO] [timer.py:264:stop] epoch=5/micro_step=40/global_step=900, RunningAvgSamplesPerSec=10.922448695267986, CurrSamplesPerSec=11.707657892737233, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
184
+ [2025-04-05 00:31:46,772] [INFO] [logging.py:128:log_dist] [Rank 0] step=910, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
185
+ [2025-04-05 00:31:46,772] [INFO] [timer.py:264:stop] epoch=5/micro_step=60/global_step=910, RunningAvgSamplesPerSec=10.918367358130276, CurrSamplesPerSec=12.14226994099549, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
186
+ [2025-04-05 00:32:14,543] [INFO] [logging.py:128:log_dist] [Rank 0] step=920, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
187
+ [2025-04-05 00:32:14,543] [INFO] [timer.py:264:stop] epoch=5/micro_step=80/global_step=920, RunningAvgSamplesPerSec=10.929225932240328, CurrSamplesPerSec=9.593174888357035, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
188
+ [2025-04-05 00:32:47,323] [INFO] [logging.py:128:log_dist] [Rank 0] step=930, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
189
+ [2025-04-05 00:32:47,324] [INFO] [timer.py:264:stop] epoch=5/micro_step=100/global_step=930, RunningAvgSamplesPerSec=10.920947685322428, CurrSamplesPerSec=10.432728165219467, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
190
+ [2025-04-05 00:33:25,391] [INFO] [logging.py:128:log_dist] [Rank 0] step=940, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
191
+ [2025-04-05 00:33:25,393] [INFO] [timer.py:264:stop] epoch=5/micro_step=120/global_step=940, RunningAvgSamplesPerSec=10.899166352905562, CurrSamplesPerSec=9.46259530076961, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
192
+ [2025-04-05 00:34:05,124] [INFO] [logging.py:128:log_dist] [Rank 0] step=950, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
193
+ [2025-04-05 00:34:05,125] [INFO] [timer.py:264:stop] epoch=5/micro_step=140/global_step=950, RunningAvgSamplesPerSec=10.868709472254283, CurrSamplesPerSec=8.157583107922177, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
194
+ [2025-04-05 00:34:41,083] [INFO] [logging.py:128:log_dist] [Rank 0] step=960, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
195
+ [2025-04-05 00:34:41,084] [INFO] [timer.py:264:stop] epoch=5/micro_step=160/global_step=960, RunningAvgSamplesPerSec=10.85179390673428, CurrSamplesPerSec=11.129984184357816, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
196
+ [2025-04-05 00:35:18,183] [INFO] [logging.py:128:log_dist] [Rank 0] step=970, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
197
+ [2025-04-05 00:35:18,184] [INFO] [timer.py:264:stop] epoch=5/micro_step=180/global_step=970, RunningAvgSamplesPerSec=10.831398090103376, CurrSamplesPerSec=8.407739182923459, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
198
+ [2025-04-05 00:35:50,030] [INFO] [logging.py:128:log_dist] [Rank 0] step=980, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
199
+ [2025-04-05 00:35:50,031] [INFO] [timer.py:264:stop] epoch=5/micro_step=200/global_step=980, RunningAvgSamplesPerSec=10.830610124260065, CurrSamplesPerSec=12.386895729971858, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
200
+ [2025-04-05 00:36:19,966] [INFO] [logging.py:128:log_dist] [Rank 0] step=990, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
201
+ [2025-04-05 00:36:19,967] [INFO] [timer.py:264:stop] epoch=5/micro_step=220/global_step=990, RunningAvgSamplesPerSec=10.83462980390363, CurrSamplesPerSec=11.45762219440472, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
202
+ [2025-04-05 00:36:49,736] [INFO] [logging.py:128:log_dist] [Rank 0] step=1000, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
203
+ [2025-04-05 00:36:49,737] [INFO] [timer.py:264:stop] epoch=5/micro_step=240/global_step=1000, RunningAvgSamplesPerSec=10.838216915293264, CurrSamplesPerSec=12.083053599469995, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
204
+ [2025-04-05 00:37:19,659] [INFO] [logging.py:128:log_dist] [Rank 0] step=1010, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
205
+ [2025-04-05 00:37:19,659] [INFO] [timer.py:264:stop] epoch=5/micro_step=260/global_step=1010, RunningAvgSamplesPerSec=10.84138178767096, CurrSamplesPerSec=11.283992072611156, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
206
+ [2025-04-05 00:37:50,047] [INFO] [logging.py:128:log_dist] [Rank 0] step=1020, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
207
+ [2025-04-05 00:37:50,047] [INFO] [timer.py:264:stop] epoch=5/micro_step=280/global_step=1020, RunningAvgSamplesPerSec=10.843045795756481, CurrSamplesPerSec=9.382163129026653, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
208
+ [2025-04-05 00:38:21,442] [INFO] [logging.py:128:log_dist] [Rank 0] step=1030, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
209
+ [2025-04-05 00:38:21,443] [INFO] [timer.py:264:stop] epoch=5/micro_step=300/global_step=1030, RunningAvgSamplesPerSec=10.841309399376517, CurrSamplesPerSec=10.88982155903671, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
210
+ [2025-04-05 00:38:53,361] [INFO] [logging.py:128:log_dist] [Rank 0] step=1040, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
211
+ [2025-04-05 00:38:53,362] [INFO] [timer.py:264:stop] epoch=5/micro_step=320/global_step=1040, RunningAvgSamplesPerSec=10.837961689000009, CurrSamplesPerSec=11.827810675137716, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
212
+ [2025-04-05 00:39:23,651] [INFO] [logging.py:128:log_dist] [Rank 0] step=1050, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
213
+ [2025-04-05 00:39:23,652] [INFO] [timer.py:264:stop] epoch=5/micro_step=340/global_step=1050, RunningAvgSamplesPerSec=10.839644603106759, CurrSamplesPerSec=10.710433778235352, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
214
+ Saving model to "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end" ...
215
+ Saving 16-bit model...
216
+ [2025-04-05 00:39:48,972] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step1056 is about to be saved!
217
+ [2025-04-05 00:39:48,973] [INFO] [engine.py:3680:save_16bit_model] Saving model weights to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin, tag: global_step1056
218
+ [2025-04-05 00:39:48,973] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin...
219
+ [2025-04-05 00:40:02,761] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin.
220
+ [2025-04-05 00:40:02,761] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1056 is ready now!
221
+ Model saved!
wandb/run-20250404_234514-h2gynfll/files/requirements.txt ADDED
@@ -0,0 +1,253 @@
1
+ maskrcnn_benchmark==0.0.0
2
+ deepspeed==0.16.1
3
+ uritemplate==4.1.1
4
+ pyairports==2.1.1
5
+ partial-json-parser==0.2.1.1.post4
6
+ tensorboard-data-server==0.7.2
7
+ pydantic==2.10.3
8
+ Werkzeug==3.1.3
9
+ attrs==24.3.0
10
+ Jinja2==3.1.4
11
+ email_validator==2.2.0
12
+ mdit-py-plugins==0.4.2
13
+ google-api-python-client==2.160.0
14
+ pandas==2.2.3
15
+ safehttpx==0.1.6
16
+ setproctitle==1.3.4
17
+ dill==0.3.8
18
+ torchaudio==2.5.1
19
+ frechet-audio-distance==0.1.2
20
+ blessed==1.20.0
21
+ llvmlite==0.43.0
22
+ litellm==1.60.8
23
+ nvidia-nvtx-cu12==12.4.127
24
+ nvidia-cusolver-cu12==11.6.1.9
25
+ einops==0.8.0
26
+ datasets==3.2.0
27
+ pycountry==24.6.1
28
+ airportsdata==20250224
29
+ idna==3.10
30
+ urllib3==2.2.3
31
+ mpmath==1.3.0
32
+ wandb==0.19.1
33
+ certifi==2024.12.14
34
+ markdown-it-py==3.0.0
35
+ align-anything==0.0.1.dev0
36
+ aiohttp==3.11.10
37
+ fsspec==2024.9.0
38
+ aiohappyeyeballs==2.4.4
39
+ httplib2==0.22.0
40
+ hjson==3.1.0
41
+ yarl==1.18.3
42
+ decorator==5.1.1
43
+ distlib==0.3.9
44
+ absl-py==2.1.0
45
+ huggingface-hub==0.27.0
46
+ memray==1.15.0
47
+ Pygments==2.18.0
48
+ soupsieve==2.6
49
+ shellingham==1.5.4
50
+ tokenizers==0.21.0
51
+ uvloop==0.21.0
52
+ numpy==1.26.4
53
+ linkify-it-py==2.0.3
54
+ sympy==1.13.1
55
+ python-dotenv==1.0.1
56
+ nvidia-cuda-runtime-cu12==12.4.127
57
+ tensorboard==2.18.0
58
+ fastrlock==0.8.3
59
+ rsa==4.9
60
+ lm-format-enforcer==0.10.9
61
+ openai==1.61.1
62
+ gpustat==1.1.1
63
+ librosa==0.10.2.post1
64
+ grpcio-status==1.70.0
65
+ nvidia-cudnn-cu12==9.1.0.70
66
+ zipp==3.21.0
67
+ nvidia-nvjitlink-cu12==12.4.127
68
+ cupy-cuda12x==13.3.0
69
+ Markdown==3.7
70
+ nvidia-cuda-cupti-cu12==12.4.127
71
+ nvidia-curand-cu12==10.3.5.147
72
+ rpds-py==0.22.3
73
+ outlines==0.1.11
74
+ docker-pycreds==0.4.0
75
+ distro==1.9.0
76
+ httpcore==1.0.7
77
+ gradio==5.9.0
78
+ google-auth-httplib2==0.2.0
79
+ iniconfig==2.0.0
80
+ gitdb==4.0.11
81
+ jsonschema==4.23.0
82
+ click==8.1.7
83
+ ninja==1.11.1.3
84
+ setuptools==75.6.0
85
+ audioread==3.0.1
86
+ frozenlist==1.5.0
87
+ transformers-stream-generator==0.0.5
88
+ nvidia-cublas-cu12==12.4.5.8
89
+ pycparser==2.22
90
+ GitPython==3.1.43
91
+ tqdm==4.67.1
92
+ importlib_metadata==8.5.0
93
+ patsy==1.0.1
94
+ networkx==3.4.2
95
+ semantic-version==2.10.0
96
+ alpaca_eval==0.6.6
97
+ google-cloud-core==2.4.1
98
+ prometheus_client==0.21.1
99
+ jiter==0.8.2
100
+ scipy==1.14.1
101
+ starlette==0.41.3
102
+ jq==1.8.0
103
+ opencensus-context==0.1.3
104
+ cachetools==5.5.1
105
+ cffi==1.17.1
106
+ opencv-python-headless==4.10.0.84
107
+ joblib==1.4.2
108
+ yt-dlp==2025.1.26
109
+ python-dateutil==2.9.0.post0
110
+ httpx==0.28.1
111
+ msgpack==1.1.0
112
+ pydub==0.25.1
113
+ tomlkit==0.13.2
114
+ nvitop==1.4.2
115
+ nvidia-cusparse-cu12==12.3.1.170
116
+ msgspec==0.18.6
117
+ aiosignal==1.3.2
118
+ wheel==0.45.1
119
+ filelock==3.16.1
120
+ pillow==10.4.0
121
+ typer==0.15.1
122
+ websockets==14.1
123
+ resampy==0.4.3
124
+ aiofiles==23.2.1
125
+ aiohttp-cors==0.7.0
126
+ platformdirs==4.3.6
127
+ gguf==0.10.0
128
+ diskcache==5.6.3
129
+ cloudpickle==3.1.0
130
+ multidict==6.1.0
131
+ py-cpuinfo==9.0.0
132
+ scikit-learn==1.6.0
133
+ smart-open==7.1.0
134
+ tiktoken==0.7.0
135
+ grpcio==1.70.0
136
+ charset-normalizer==3.4.0
137
+ nest-asyncio==1.6.0
138
+ lark==1.2.2
139
+ beautifulsoup4==4.13.3
140
+ pip==24.3.1
141
+ six==1.17.0
142
+ prometheus-fastapi-instrumentator==7.0.0
143
+ ruff==0.8.3
144
+ rich-toolkit==0.13.2
145
+ lazy_loader==0.4
146
+ grpc-google-iam-v1==0.14.0
147
+ psutil==6.1.0
148
+ mdurl==0.1.2
149
+ nvidia-nccl-cu12==2.21.5
150
+ triton==3.1.0
151
+ torchvision==0.20.1
152
+ fastapi==0.115.6
153
+ referencing==0.35.1
154
+ xxhash==3.5.0
155
+ pyzmq==26.2.0
156
+ torchlibrosa==0.1.0
157
+ googleapis-common-protos==1.66.0
158
+ pyasn1==0.6.1
159
+ soundfile==0.12.1
160
+ pyparsing==3.2.1
161
+ xgrammar==0.1.11
162
+ gradio_client==1.5.2
163
+ watchfiles==1.0.3
164
+ pluggy==1.5.0
165
+ py-spy==0.4.0
166
+ pybind11==2.13.6
167
+ diffusers==0.31.0
168
+ sentencepiece==0.2.0
169
+ flash_attn==2.7.4.post1
170
+ annotated-types==0.7.0
171
+ interegular==0.3.3
172
+ requests==2.32.3
173
+ opencensus==0.11.4
174
+ colorful==0.5.6
175
+ google-api-core==2.24.1
176
+ pytest==8.3.4
177
+ dnspython==2.7.0
178
+ pydantic_core==2.27.1
179
+ pytz==2024.2
180
+ pyasn1_modules==0.4.1
181
+ propcache==0.2.1
182
+ accelerate==1.2.1
183
+ fire==0.7.0
184
+ textual==1.0.0
185
+ sniffio==1.3.1
186
+ pyarrow==18.1.0
187
+ protobuf==5.29.1
188
+ wcwidth==0.2.13
189
+ packaging==24.2
190
+ uvicorn==0.34.0
191
+ sentry-sdk==2.19.2
192
+ google-auth==2.38.0
193
+ typing_extensions==4.12.2
194
+ peft==0.14.0
195
+ depyf==0.18.0
196
+ multiprocess==0.70.16
197
+ google-cloud-translate==3.19.0
198
+ nvidia-cuda-nvrtc-cu12==12.4.127
199
+ jsonschema-specifications==2024.10.1
200
+ vllm==0.7.3
201
+ nvidia-cufft-cu12==11.2.1.3
202
+ timm==1.0.12
203
+ rich==13.9.4
204
+ ffmpy==0.4.0
205
+ virtualenv==20.29.1
206
+ tzdata==2024.2
207
+ smmap==5.0.1
208
+ uc-micro-py==1.0.3
209
+ proto-plus==1.26.0
210
+ soxr==0.5.0.post1
211
+ h11==0.14.0
212
+ outlines_core==0.1.26
213
+ compressed-tensors==0.9.1
214
+ blake3==1.0.4
215
+ xformers==0.0.28.post3
216
+ orjson==3.10.12
217
+ ray==2.40.0
218
+ PyYAML==6.0.2
219
+ nvidia-ml-py==12.560.30
220
+ python-multipart==0.0.19
221
+ PySocks==1.7.1
222
+ regex==2024.11.6
223
+ pooch==1.8.2
224
+ termcolor==2.5.0
225
+ MarkupSafe==2.1.5
226
+ torch==2.5.1
227
+ fastapi-cli==0.0.7
228
+ gdown==5.2.0
229
+ numba==0.60.0
230
+ httptools==0.6.4
231
+ transformers==4.50.0.dev0
232
+ mistral_common==1.5.1
233
+ astor==0.8.1
234
+ anyio==4.7.0
235
+ safetensors==0.4.5
236
+ threadpoolctl==3.5.0
237
+ wrapt==1.17.2
238
+ wheel==0.43.0
239
+ jaraco.functools==4.0.1
240
+ inflect==7.3.1
241
+ jaraco.text==3.12.1
242
+ typeguard==4.3.0
243
+ jaraco.collections==5.1.0
244
+ importlib_metadata==8.0.0
245
+ backports.tarfile==1.2.0
246
+ tomli==2.0.1
247
+ autocommand==2.2.2
248
+ platformdirs==4.2.2
249
+ more-itertools==10.3.0
250
+ zipp==3.19.2
251
+ packaging==24.2
252
+ typing_extensions==4.12.2
253
+ jaraco.context==5.3.0
wandb/run-20250404_234514-h2gynfll/files/wandb-metadata.json ADDED
@@ -0,0 +1,112 @@
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.0",
4
+ "startedAt": "2025-04-04T15:45:14.487401Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset",
11
+ "--train_split",
12
+ "train",
13
+ "--train_template",
14
+ "Safe_o1",
15
+ "--output_dir",
16
+ "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
17
+ "--log_project",
18
+ "safe-o1",
19
+ "--per_device_train_batch_size",
20
+ "4",
21
+ "--per_device_eval_batch_size",
22
+ "4",
23
+ "--gradient_accumulation_steps",
24
+ "2",
25
+ "--learning_rate",
26
+ "2e-5",
27
+ "--epochs",
28
+ "6",
29
+ "--model_max_length",
30
+ "16384"
31
+ ],
32
+ "program": "-m align_anything.trainers.text_to_text.sft",
33
+ "git": {
34
+ "remote": "[email protected]:PKU-Alignment/align-anything.git",
35
+ "commit": "7ee46f54200d18e3c6c37568ba688d3be5ae7619"
36
+ },
37
+ "email": "[email protected]",
38
+ "root": "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
39
+ "host": "dgx-092",
40
+ "executable": "/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/python3.11",
41
+ "cpu_count": 112,
42
+ "cpu_count_logical": 224,
43
+ "gpu": "NVIDIA H800",
44
+ "gpu_count": 8,
45
+ "disk": {
46
+ "/": {
47
+ "total": "1888556142592",
48
+ "used": "149815398400"
49
+ }
50
+ },
51
+ "memory": {
52
+ "total": "2164195454976"
53
+ },
54
+ "cpu": {
55
+ "count": 112,
56
+ "countLogical": 224
57
+ },
58
+ "gpu_nvidia": [
59
+ {
60
+ "name": "NVIDIA H800",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper"
64
+ },
65
+ {
66
+ "name": "NVIDIA H800",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper"
70
+ },
71
+ {
72
+ "name": "NVIDIA H800",
73
+ "memoryTotal": "85520809984",
74
+ "cudaCores": 16896,
75
+ "architecture": "Hopper"
76
+ },
77
+ {
78
+ "name": "NVIDIA H800",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper"
88
+ },
89
+ {
90
+ "name": "NVIDIA H800",
91
+ "memoryTotal": "85520809984",
92
+ "cudaCores": 16896,
93
+ "architecture": "Hopper"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper"
100
+ },
101
+ {
102
+ "name": "NVIDIA H800",
103
+ "memoryTotal": "85520809984",
104
+ "cudaCores": 16896,
105
+ "architecture": "Hopper"
106
+ }
107
+ ],
108
+ "slurm": {
109
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
110
+ },
111
+ "cudaVersion": "12.2"
112
+ }
wandb/run-20250404_234514-h2gynfll/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_runtime":3288.280998361,"_step":2112,"train/step":2112,"train/loss":0.044164832681417465,"train/lr":2e-05,"train/epoch":6,"_wandb":{"runtime":3288},"_timestamp":1.7437847807259746e+09}
wandb/run-20250404_234514-h2gynfll/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2025-04-04T23:45:13.890415708+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpznxahgwg/port-2490920.txt","pid":2490920,"debug":false,"disable-analytics":false}
2
+ {"time":"2025-04-04T23:45:13.890455897+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2025-04-04T23:45:13.891244261+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2490920}
4
+ {"time":"2025-04-04T23:45:13.891234959+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44823,"Zone":""}}
5
+ {"time":"2025-04-04T23:45:14.073532455+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:55784"}
6
+ {"time":"2025-04-04T23:45:14.4882962+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"h2gynfll","id":"127.0.0.1:55784"}
7
+ {"time":"2025-04-04T23:45:14.804634542+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"h2gynfll","id":"127.0.0.1:55784"}
8
+ {"time":"2025-04-05T00:40:05.400228004+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"h2gynfll","id":"127.0.0.1:55784"}
9
+ {"time":"2025-04-05T00:40:05.400766541+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"h2gynfll","id":"127.0.0.1:55784"}
10
+ {"time":"2025-04-05T00:40:05.445957116+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:55784"}
11
+ {"time":"2025-04-05T00:40:05.445971641+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:55784"}
12
+ {"time":"2025-04-05T00:40:05.445983307+08:00","level":"INFO","msg":"server is shutting down"}
13
+ {"time":"2025-04-05T00:40:05.446007985+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:55784"}
14
+ {"time":"2025-04-05T00:40:05.446042966+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:55784"}
15
+ {"time":"2025-04-05T00:40:05.446045342+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:55784"}
16
+ {"time":"2025-04-05T00:40:05.446048272+08:00","level":"INFO","msg":"server is closed"}
wandb/run-20250404_234514-h2gynfll/logs/debug-internal.log ADDED
@@ -0,0 +1,17 @@
1
+ {"time":"2025-04-04T23:45:14.489825202+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
2
+ {"time":"2025-04-04T23:45:14.489982968+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250404_234514-h2gynfll/logs/debug-core.log"}
3
+ {"time":"2025-04-04T23:45:14.804571426+08:00","level":"INFO","msg":"created new stream","id":"h2gynfll"}
4
+ {"time":"2025-04-04T23:45:14.804627802+08:00","level":"INFO","msg":"stream: started","id":"h2gynfll"}
5
+ {"time":"2025-04-04T23:45:14.804640659+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"h2gynfll"}
6
+ {"time":"2025-04-04T23:45:14.804650005+08:00","level":"INFO","msg":"sender: started","stream_id":"h2gynfll"}
7
+ {"time":"2025-04-04T23:45:14.804666518+08:00","level":"INFO","msg":"handler: started","stream_id":"h2gynfll"}
8
+ {"time":"2025-04-04T23:45:15.109983443+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-04-05T00:40:02.76843027+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-04-05T00:40:02.769177866+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-04-05T00:40:03.76917444+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading output.log","runtime_seconds":0.106249371,"progress":"38.5KB/38.5KB"},{"desc":"uploading config.yaml","runtime_seconds":0.106241631,"progress":"2.7KB/2.7KB"}],"total_operations":2}}
12
+ {"time":"2025-04-05T00:40:04.14382192+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-04-05T00:40:05.400349491+08:00","level":"INFO","msg":"stream: closing","id":"h2gynfll"}
14
+ {"time":"2025-04-05T00:40:05.400395814+08:00","level":"INFO","msg":"handler: closed","stream_id":"h2gynfll"}
15
+ {"time":"2025-04-05T00:40:05.400407886+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"h2gynfll"}
16
+ {"time":"2025-04-05T00:40:05.400511465+08:00","level":"INFO","msg":"sender: closed","stream_id":"h2gynfll"}
17
+ {"time":"2025-04-05T00:40:05.400755931+08:00","level":"INFO","msg":"stream: closed","id":"h2gynfll"}
wandb/run-20250404_234514-h2gynfll/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
2
+ 2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Configure stats pid to 2490920
3
+ 2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
5
+ 2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Loading settings from environment variables
6
+ 2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250404_234514-h2gynfll/logs/debug.log
7
+ 2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250404_234514-h2gynfll/logs/debug-internal.log
8
+ 2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:init():644] calling init triggers
9
+ 2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 6, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_o1', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
11
+ 2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:init():680] starting backend
12
+ 2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:init():684] sending inform_init request
13
+ 2025-04-04 23:45:14,486 INFO MainThread:2490920 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-04 23:45:14,487 INFO MainThread:2490920 [wandb_init.py:init():697] backend started and connected
15
+ 2025-04-04 23:45:14,488 INFO MainThread:2490920 [wandb_init.py:init():790] updated telemetry
16
+ 2025-04-04 23:45:14,548 INFO MainThread:2490920 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
17
+ 2025-04-04 23:45:15,103 INFO MainThread:2490920 [wandb_init.py:init():874] starting run threads in backend
18
+ 2025-04-04 23:45:15,485 INFO MainThread:2490920 [wandb_run.py:_console_start():2374] atexit reg
19
+ 2025-04-04 23:45:15,485 INFO MainThread:2490920 [wandb_run.py:_redirect():2224] redirect: wrap_raw
20
+ 2025-04-04 23:45:15,485 INFO MainThread:2490920 [wandb_run.py:_redirect():2289] Wrapping output streams.
21
+ 2025-04-04 23:45:15,485 INFO MainThread:2490920 [wandb_run.py:_redirect():2314] Redirects installed.
22
+ 2025-04-04 23:45:15,490 INFO MainThread:2490920 [wandb_init.py:init():916] run started, returning control to user process
23
+ 2025-04-05 00:40:02,766 INFO MainThread:2490920 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/h2gynfll
24
+ 2025-04-05 00:40:02,767 INFO MainThread:2490920 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
25
+ 2025-04-05 00:40:02,767 INFO MainThread:2490920 [wandb_run.py:_restore():2321] restore
26
+ 2025-04-05 00:40:02,768 INFO MainThread:2490920 [wandb_run.py:_restore():2327] restore done
27
+ 2025-04-05 00:40:05,392 INFO MainThread:2490920 [wandb_run.py:_footer_history_summary_info():3892] rendering history
28
+ 2025-04-05 00:40:05,393 INFO MainThread:2490920 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
29
+ 2025-04-05 00:40:05,399 INFO MainThread:2490920 [wandb_run.py:_footer_sync_info():3853] logging synced files
wandb/run-20250404_234514-h2gynfll/run-h2gynfll.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e920529c8e186b25824a9c4ad1050a452aed4e52269e7b7ba6c5fcdd77c8095f
3
+ size 6833219
wandb/run-20250405_124142-wdmxf5un/files/config.yaml ADDED
@@ -0,0 +1,109 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.1
4
+ m: []
5
+ python_version: 3.11.0
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 41
12
+ - 49
13
+ - 51
14
+ - 53
15
+ - 55
16
+ - 63
17
+ - 71
18
+ - 83
19
+ - 98
20
+ - 105
21
+ "2":
22
+ - 1
23
+ - 5
24
+ - 11
25
+ - 41
26
+ - 49
27
+ - 51
28
+ - 53
29
+ - 55
30
+ - 63
31
+ - 71
32
+ - 83
33
+ - 98
34
+ - 105
35
+ "3":
36
+ - 2
37
+ - 13
38
+ - 16
39
+ - 23
40
+ - 55
41
+ - 61
42
+ "4": 3.11.0
43
+ "5": 0.19.1
44
+ "6": 4.50.0.dev0
45
+ "8":
46
+ - 5
47
+ "12": 0.19.1
48
+ "13": linux-x86_64
49
+ bnb_cfgs:
50
+ value:
51
+ bnb_4bit_compute_dtype: float16
52
+ bnb_4bit_quant_type: nf4
53
+ bnb_4bit_use_double_quant: true
54
+ load_in_4bit: true
55
+ load_in_8bit: false
56
+ use_bnb: false
57
+ data_cfgs:
58
+ value:
59
+ eval_optional_args: []
60
+ train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
61
+ train_optional_args: []
62
+ train_split: train
63
+ train_template: Safe_o1
64
+ logger_cfgs:
65
+ value:
66
+ log_project: safe-o1
67
+ log_run_name: sft
68
+ log_type: wandb
69
+ output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
70
+ save_interval: 100000
71
+ lora_cfgs:
72
+ value:
73
+ inference_mode: false
74
+ lora_alpha: 16
75
+ lora_dropout: 0.1
76
+ r: 16
77
+ save_full_model: true
78
+ target_modules:
79
+ - q_proj
80
+ - v_proj
81
+ task_type: TaskType.CAUSAL_LM
82
+ use_lora: false
83
+ model_cfgs:
84
+ value:
85
+ model_max_length: 16384
86
+ model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
87
+ trust_remote_code: true
88
+ train_cfgs:
89
+ value:
90
+ adam_betas:
91
+ - 0.9
92
+ - 0.95
93
+ adam_epsilon: 1e-08
94
+ bf16: true
95
+ ds_cfgs: ds_z3_config.json
96
+ epochs: 6
97
+ eval_interval: 10
98
+ eval_strategy: steps
99
+ fp16: false
100
+ gradient_accumulation_steps: 2
101
+ gradient_checkpointing: true
102
+ learning_rate: 2e-05
103
+ lr_scheduler_type: constant
104
+ lr_warmup_ratio: 0.03
105
+ max_grad_norm: 1
106
+ per_device_eval_batch_size: 4
107
+ per_device_train_batch_size: 4
108
+ seed: 42
109
+ weight_decay: 0
wandb/run-20250405_124142-wdmxf5un/files/output.log ADDED
@@ -0,0 +1,115 @@
1
+ ***** Running training *****
2
+ Training 1/6 epoch: 0%| | 0/1056 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 2/6 epoch (loss 0.6039): 30%|█████████████████████████▋ | 319/1056 [09:05<19:54, 1.62s/it]
4
+ [2025-04-05 12:42:22,487] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
5
+ [2025-04-05 12:42:22,487] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=20.430520612254707, CurrSamplesPerSec=19.73994310767666, MemAllocated=15.18GB, MaxMemAllocated=36.69GB
6
+ [2025-04-05 12:42:55,483] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
7
+ [2025-04-05 12:42:55,483] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=20.330484556424526, CurrSamplesPerSec=19.749929472671457, MemAllocated=15.18GB, MaxMemAllocated=36.69GB
8
+ [2025-04-05 12:43:32,390] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
9
+ [2025-04-05 12:43:32,391] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=19.490752396407714, CurrSamplesPerSec=22.631995577606887, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
10
+ [2025-04-05 12:44:04,939] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
11
+ [2025-04-05 12:44:04,940] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=19.787231072811785, CurrSamplesPerSec=21.355656210314816, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
12
+ [2025-04-05 12:44:38,138] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
13
+ [2025-04-05 12:44:38,139] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=19.87305509037746, CurrSamplesPerSec=23.09551491786551, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
14
+ [2025-04-05 12:45:11,418] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
15
+ [2025-04-05 12:45:11,419] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=19.91152284007933, CurrSamplesPerSec=17.560894866383286, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
16
+ [2025-04-05 12:45:44,654] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
17
+ [2025-04-05 12:45:44,654] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=19.944567941119995, CurrSamplesPerSec=17.906646151944845, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
18
+ [2025-04-05 12:46:19,225] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
19
+ [2025-04-05 12:46:19,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=19.873932115504267, CurrSamplesPerSec=20.010007567016135, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
20
+ [2025-04-05 12:46:53,851] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
21
+ [2025-04-05 12:46:53,852] [INFO] [timer.py:264:stop] epoch=1/micro_step=4/global_step=90, RunningAvgSamplesPerSec=19.806830734808837, CurrSamplesPerSec=14.898176757575056, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
22
+ [2025-04-05 12:47:26,840] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
23
+ [2025-04-05 12:47:26,840] [INFO] [timer.py:264:stop] epoch=1/micro_step=24/global_step=100, RunningAvgSamplesPerSec=19.86084409306565, CurrSamplesPerSec=20.25306438527985, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
24
+ [2025-04-05 12:48:00,963] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
25
+ [2025-04-05 12:48:00,964] [INFO] [timer.py:264:stop] epoch=1/micro_step=44/global_step=110, RunningAvgSamplesPerSec=19.83197755374019, CurrSamplesPerSec=14.796357199340614, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
26
+ [2025-04-05 12:48:36,263] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
27
+ [2025-04-05 12:48:36,263] [INFO] [timer.py:264:stop] epoch=1/micro_step=64/global_step=120, RunningAvgSamplesPerSec=19.755458014995966, CurrSamplesPerSec=18.805691707671684, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
28
+ [2025-04-05 12:49:09,681] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
29
+ [2025-04-05 12:49:09,681] [INFO] [timer.py:264:stop] epoch=1/micro_step=84/global_step=130, RunningAvgSamplesPerSec=19.780419146519836, CurrSamplesPerSec=19.690861454132612, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
30
+ [2025-04-05 12:49:43,000] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
31
+ [2025-04-05 12:49:43,001] [INFO] [timer.py:264:stop] epoch=1/micro_step=104/global_step=140, RunningAvgSamplesPerSec=19.798657335646936, CurrSamplesPerSec=18.638907091791424, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
32
+ [2025-04-05 12:50:16,076] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
33
+ [2025-04-05 12:50:16,077] [INFO] [timer.py:264:stop] epoch=1/micro_step=124/global_step=150, RunningAvgSamplesPerSec=19.831999352959354, CurrSamplesPerSec=21.487376178731257, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
34
+ [2025-04-05 12:50:49,560] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
35
+ [2025-04-05 12:50:49,561] [INFO] [timer.py:264:stop] epoch=1/micro_step=144/global_step=160, RunningAvgSamplesPerSec=19.84491781415368, CurrSamplesPerSec=25.33945343262647, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
36
+ [2025-04-05 12:51:23,805] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
37
+ [2025-04-05 12:51:23,805] [INFO] [timer.py:264:stop] epoch=1/micro_step=164/global_step=170, RunningAvgSamplesPerSec=19.826904278206033, CurrSamplesPerSec=22.45413646824439, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
38
+ [2025-04-05 12:51:58,336] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
39
+ [2025-04-05 12:51:58,337] [INFO] [timer.py:264:stop] epoch=2/micro_step=8/global_step=180, RunningAvgSamplesPerSec=19.798618570342683, CurrSamplesPerSec=21.417902222668886, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
40
+ [2025-04-05 12:52:33,550] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
41
+ [2025-04-05 12:52:33,550] [INFO] [timer.py:264:stop] epoch=2/micro_step=28/global_step=190, RunningAvgSamplesPerSec=19.756965411737134, CurrSamplesPerSec=14.200132325260137, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
42
+ [2025-04-05 12:53:05,558] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
43
+ [2025-04-05 12:53:05,559] [INFO] [timer.py:264:stop] epoch=2/micro_step=48/global_step=200, RunningAvgSamplesPerSec=19.81483548351977, CurrSamplesPerSec=23.8050812324985, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
44
+ [2025-04-05 12:53:41,206] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
45
+ [2025-04-05 12:53:41,207] [INFO] [timer.py:264:stop] epoch=2/micro_step=68/global_step=210, RunningAvgSamplesPerSec=19.76297283536241, CurrSamplesPerSec=21.033590133844925, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
46
+ [2025-04-05 12:54:15,555] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
47
+ [2025-04-05 12:54:15,556] [INFO] [timer.py:264:stop] epoch=2/micro_step=88/global_step=220, RunningAvgSamplesPerSec=19.74868849123472, CurrSamplesPerSec=19.775005201497734, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
48
+ [2025-04-05 12:54:48,256] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
49
+ [2025-04-05 12:54:48,256] [INFO] [timer.py:264:stop] epoch=2/micro_step=108/global_step=230, RunningAvgSamplesPerSec=19.782728073976745, CurrSamplesPerSec=19.69564649230523, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
50
+ [2025-04-05 12:55:20,678] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
51
+ [2025-04-05 12:55:20,679] [INFO] [timer.py:264:stop] epoch=2/micro_step=128/global_step=240, RunningAvgSamplesPerSec=19.81915055199136, CurrSamplesPerSec=20.713842718681395, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
52
+ [2025-04-05 12:55:54,358] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
53
+ [2025-04-05 12:55:54,359] [INFO] [timer.py:264:stop] epoch=2/micro_step=148/global_step=250, RunningAvgSamplesPerSec=19.82191350770802, CurrSamplesPerSec=16.07218913804267, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
54
+ [2025-04-05 12:56:28,583] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
55
+ [2025-04-05 12:56:28,584] [INFO] [timer.py:264:stop] epoch=2/micro_step=168/global_step=260, RunningAvgSamplesPerSec=19.81595565426552, CurrSamplesPerSec=21.231867154990216, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
56
+ [2025-04-05 12:57:02,786] [INFO] [logging.py:128:log_dist] [Rank 0] step=270, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
57
+ [2025-04-05 12:57:02,786] [INFO] [timer.py:264:stop] epoch=3/micro_step=12/global_step=270, RunningAvgSamplesPerSec=19.810062754202114, CurrSamplesPerSec=19.989395411519066, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
58
+ [2025-04-05 12:57:37,851] [INFO] [logging.py:128:log_dist] [Rank 0] step=280, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
59
+ [2025-04-05 12:57:37,851] [INFO] [timer.py:264:stop] epoch=3/micro_step=32/global_step=280, RunningAvgSamplesPerSec=19.782856718965995, CurrSamplesPerSec=21.03165872299883, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
60
+ [2025-04-05 12:58:12,693] [INFO] [logging.py:128:log_dist] [Rank 0] step=290, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
61
+ [2025-04-05 12:58:12,693] [INFO] [timer.py:264:stop] epoch=3/micro_step=52/global_step=290, RunningAvgSamplesPerSec=19.758748532769488, CurrSamplesPerSec=13.064895896239026, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
62
+ [2025-04-05 12:58:46,457] [INFO] [logging.py:128:log_dist] [Rank 0] step=300, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
63
+ [2025-04-05 12:58:46,457] [INFO] [timer.py:264:stop] epoch=3/micro_step=72/global_step=300, RunningAvgSamplesPerSec=19.76271123968894, CurrSamplesPerSec=18.71946301781583, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
64
+ [2025-04-05 12:59:20,298] [INFO] [logging.py:128:log_dist] [Rank 0] step=310, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
65
+ [2025-04-05 12:59:20,298] [INFO] [timer.py:264:stop] epoch=3/micro_step=92/global_step=310, RunningAvgSamplesPerSec=19.765042125767838, CurrSamplesPerSec=18.12731313727023, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
66
+ [2025-04-05 12:59:53,130] [INFO] [logging.py:128:log_dist] [Rank 0] step=320, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
67
+ [2025-04-05 12:59:53,130] [INFO] [timer.py:264:stop] epoch=3/micro_step=112/global_step=320, RunningAvgSamplesPerSec=19.785059722720725, CurrSamplesPerSec=18.162258241524817, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
68
+ [2025-04-05 13:00:25,014] [INFO] [logging.py:128:log_dist] [Rank 0] step=330, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
69
+ [2025-04-05 13:00:25,015] [INFO] [timer.py:264:stop] epoch=3/micro_step=132/global_step=330, RunningAvgSamplesPerSec=19.822856898324304, CurrSamplesPerSec=23.68279231225561, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
70
+ [2025-04-05 13:01:00,162] [INFO] [logging.py:128:log_dist] [Rank 0] step=340, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
71
+ [2025-04-05 13:01:00,163] [INFO] [timer.py:264:stop] epoch=3/micro_step=152/global_step=340, RunningAvgSamplesPerSec=19.79772228473608, CurrSamplesPerSec=16.10940253591632, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
72
+ [2025-04-05 13:01:33,503] [INFO] [logging.py:128:log_dist] [Rank 0] step=350, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
73
+ [2025-04-05 13:01:33,503] [INFO] [timer.py:264:stop] epoch=3/micro_step=172/global_step=350, RunningAvgSamplesPerSec=19.80785855020464, CurrSamplesPerSec=21.302828589697363, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
74
+ [2025-04-05 13:02:07,911] [INFO] [logging.py:128:log_dist] [Rank 0] step=360, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
75
+ [2025-04-05 13:02:07,912] [INFO] [timer.py:264:stop] epoch=4/micro_step=16/global_step=360, RunningAvgSamplesPerSec=19.79618308624211, CurrSamplesPerSec=19.263544613759976, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
76
+ [2025-04-05 13:02:41,115] [INFO] [logging.py:128:log_dist] [Rank 0] step=370, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
77
+ [2025-04-05 13:02:41,115] [INFO] [timer.py:264:stop] epoch=4/micro_step=36/global_step=370, RunningAvgSamplesPerSec=19.80650332458013, CurrSamplesPerSec=23.701802484868033, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
78
+ [2025-04-05 13:03:18,843] [INFO] [logging.py:128:log_dist] [Rank 0] step=380, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
79
+ [2025-04-05 13:03:18,843] [INFO] [timer.py:264:stop] epoch=4/micro_step=56/global_step=380, RunningAvgSamplesPerSec=19.743233696848424, CurrSamplesPerSec=16.022804105468715, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
80
+ [2025-04-05 13:03:50,717] [INFO] [logging.py:128:log_dist] [Rank 0] step=390, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
81
+ [2025-04-05 13:03:50,717] [INFO] [timer.py:264:stop] epoch=4/micro_step=76/global_step=390, RunningAvgSamplesPerSec=19.777954611746445, CurrSamplesPerSec=21.092219908245035, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
82
+ [2025-04-05 13:04:24,195] [INFO] [logging.py:128:log_dist] [Rank 0] step=400, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
83
+ [2025-04-05 13:04:24,195] [INFO] [timer.py:264:stop] epoch=4/micro_step=96/global_step=400, RunningAvgSamplesPerSec=19.782590618091508, CurrSamplesPerSec=22.82894504523341, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
84
+ [2025-04-05 13:04:56,685] [INFO] [logging.py:128:log_dist] [Rank 0] step=410, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
85
+ [2025-04-05 13:04:56,686] [INFO] [timer.py:264:stop] epoch=4/micro_step=116/global_step=410, RunningAvgSamplesPerSec=19.802987040271688, CurrSamplesPerSec=25.181649421025643, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
86
+ [2025-04-05 13:05:29,407] [INFO] [logging.py:128:log_dist] [Rank 0] step=420, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
87
+ [2025-04-05 13:05:29,407] [INFO] [timer.py:264:stop] epoch=4/micro_step=136/global_step=420, RunningAvgSamplesPerSec=19.820600060602313, CurrSamplesPerSec=19.750884196105687, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
88
+ [2025-04-05 13:06:04,461] [INFO] [logging.py:128:log_dist] [Rank 0] step=430, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
89
+ [2025-04-05 13:06:04,461] [INFO] [timer.py:264:stop] epoch=4/micro_step=156/global_step=430, RunningAvgSamplesPerSec=19.804042411655413, CurrSamplesPerSec=22.677130413597084, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
90
+ [2025-04-05 13:06:37,387] [INFO] [logging.py:128:log_dist] [Rank 0] step=440, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
91
+ [2025-04-05 13:06:37,388] [INFO] [timer.py:264:stop] epoch=4/micro_step=176/global_step=440, RunningAvgSamplesPerSec=19.818469015439167, CurrSamplesPerSec=22.638250216890107, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
92
+ [2025-04-05 13:07:12,029] [INFO] [logging.py:128:log_dist] [Rank 0] step=450, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
93
+ [2025-04-05 13:07:12,030] [INFO] [timer.py:264:stop] epoch=5/micro_step=20/global_step=450, RunningAvgSamplesPerSec=19.80636742741741, CurrSamplesPerSec=19.86476517052075, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
94
+ [2025-04-05 13:07:44,787] [INFO] [logging.py:128:log_dist] [Rank 0] step=460, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
95
+ [2025-04-05 13:07:44,787] [INFO] [timer.py:264:stop] epoch=5/micro_step=40/global_step=460, RunningAvgSamplesPerSec=19.820052969987852, CurrSamplesPerSec=19.756540318386058, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
96
+ [2025-04-05 13:08:21,559] [INFO] [logging.py:128:log_dist] [Rank 0] step=470, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
97
+ [2025-04-05 13:08:21,560] [INFO] [timer.py:264:stop] epoch=5/micro_step=60/global_step=470, RunningAvgSamplesPerSec=19.781581527254854, CurrSamplesPerSec=22.70716870525464, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
98
+ [2025-04-05 13:08:54,264] [INFO] [logging.py:128:log_dist] [Rank 0] step=480, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
99
+ [2025-04-05 13:08:54,264] [INFO] [timer.py:264:stop] epoch=5/micro_step=80/global_step=480, RunningAvgSamplesPerSec=19.79817588971848, CurrSamplesPerSec=21.579297206481634, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
100
+ [2025-04-05 13:09:27,633] [INFO] [logging.py:128:log_dist] [Rank 0] step=490, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
101
+ [2025-04-05 13:09:27,633] [INFO] [timer.py:264:stop] epoch=5/micro_step=100/global_step=490, RunningAvgSamplesPerSec=19.80181812039545, CurrSamplesPerSec=23.258142067239728, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
102
+ [2025-04-05 13:10:00,647] [INFO] [logging.py:128:log_dist] [Rank 0] step=500, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
103
+ [2025-04-05 13:10:00,648] [INFO] [timer.py:264:stop] epoch=5/micro_step=120/global_step=500, RunningAvgSamplesPerSec=19.811462542719266, CurrSamplesPerSec=18.30783674776777, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
104
+ [2025-04-05 13:10:33,720] [INFO] [logging.py:128:log_dist] [Rank 0] step=510, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
105
+ [2025-04-05 13:10:33,721] [INFO] [timer.py:264:stop] epoch=5/micro_step=140/global_step=510, RunningAvgSamplesPerSec=19.821325678536503, CurrSamplesPerSec=18.060156227360213, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
106
+ [2025-04-05 13:11:08,188] [INFO] [logging.py:128:log_dist] [Rank 0] step=520, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
107
+ [2025-04-05 13:11:08,188] [INFO] [timer.py:264:stop] epoch=5/micro_step=160/global_step=520, RunningAvgSamplesPerSec=19.81436114478211, CurrSamplesPerSec=19.79953860428667, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
108
+ Saving model to "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end" ...
109
+ Saving 16-bit model...
110
+ [2025-04-05 13:11:42,501] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step528 is about to be saved!
111
+ [2025-04-05 13:11:42,503] [INFO] [engine.py:3680:save_16bit_model] Saving model weights to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin, tag: global_step528
112
+ [2025-04-05 13:11:42,503] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin...
113
+ [2025-04-05 13:11:55,172] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin.
114
+ [2025-04-05 13:11:55,172] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step528 is ready now!
115
+ Model saved!
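Note: the log above ends with DeepSpeed writing a 16-bit checkpoint (pytorch_model.bin) into the slice_end directory included in this upload. Below is a minimal, hypothetical sketch of reloading such a checkpoint for inference; it assumes slice_end also holds the config and tokenizer files saved alongside the weights, and it uses only the standard transformers and torch APIs pinned in the requirements list that follows. It is not the project's own inference code; the align-anything repository referenced in the wandb metadata may provide its own loading utilities.

# Hypothetical usage sketch; the path is taken from the log above and will differ on other machines.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt_dir = "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end"
tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)
model = AutoModelForCausalLM.from_pretrained(
    ckpt_dir,
    torch_dtype=torch.bfloat16,   # training ran in bf16 per the config files in this folder
    device_map="auto",            # requires accelerate, which is pinned in the requirements below
)

prompt = "Hello!"  # placeholder prompt for illustration only
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))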
wandb/run-20250405_124142-wdmxf5un/files/requirements.txt ADDED
@@ -0,0 +1,253 @@
1
+ maskrcnn_benchmark==0.0.0
2
+ deepspeed==0.16.1
3
+ uritemplate==4.1.1
4
+ pyairports==2.1.1
5
+ partial-json-parser==0.2.1.1.post4
6
+ tensorboard-data-server==0.7.2
7
+ pydantic==2.10.3
8
+ Werkzeug==3.1.3
9
+ attrs==24.3.0
10
+ Jinja2==3.1.4
11
+ email_validator==2.2.0
12
+ mdit-py-plugins==0.4.2
13
+ google-api-python-client==2.160.0
14
+ pandas==2.2.3
15
+ safehttpx==0.1.6
16
+ setproctitle==1.3.4
17
+ dill==0.3.8
18
+ torchaudio==2.5.1
19
+ frechet-audio-distance==0.1.2
20
+ blessed==1.20.0
21
+ llvmlite==0.43.0
22
+ litellm==1.60.8
23
+ nvidia-nvtx-cu12==12.4.127
24
+ nvidia-cusolver-cu12==11.6.1.9
25
+ einops==0.8.0
26
+ datasets==3.2.0
27
+ pycountry==24.6.1
28
+ airportsdata==20250224
29
+ idna==3.10
30
+ urllib3==2.2.3
31
+ mpmath==1.3.0
32
+ wandb==0.19.1
33
+ certifi==2024.12.14
34
+ markdown-it-py==3.0.0
35
+ align-anything==0.0.1.dev0
36
+ aiohttp==3.11.10
37
+ fsspec==2024.9.0
38
+ aiohappyeyeballs==2.4.4
39
+ httplib2==0.22.0
40
+ hjson==3.1.0
41
+ yarl==1.18.3
42
+ decorator==5.1.1
43
+ distlib==0.3.9
44
+ absl-py==2.1.0
45
+ huggingface-hub==0.27.0
46
+ memray==1.15.0
47
+ Pygments==2.18.0
48
+ soupsieve==2.6
49
+ shellingham==1.5.4
50
+ tokenizers==0.21.0
51
+ uvloop==0.21.0
52
+ numpy==1.26.4
53
+ linkify-it-py==2.0.3
54
+ sympy==1.13.1
55
+ python-dotenv==1.0.1
56
+ nvidia-cuda-runtime-cu12==12.4.127
57
+ tensorboard==2.18.0
58
+ fastrlock==0.8.3
59
+ rsa==4.9
60
+ lm-format-enforcer==0.10.9
61
+ openai==1.61.1
62
+ gpustat==1.1.1
63
+ librosa==0.10.2.post1
64
+ grpcio-status==1.70.0
65
+ nvidia-cudnn-cu12==9.1.0.70
66
+ zipp==3.21.0
67
+ nvidia-nvjitlink-cu12==12.4.127
68
+ cupy-cuda12x==13.3.0
69
+ Markdown==3.7
70
+ nvidia-cuda-cupti-cu12==12.4.127
71
+ nvidia-curand-cu12==10.3.5.147
72
+ rpds-py==0.22.3
73
+ outlines==0.1.11
74
+ docker-pycreds==0.4.0
75
+ distro==1.9.0
76
+ httpcore==1.0.7
77
+ gradio==5.9.0
78
+ google-auth-httplib2==0.2.0
79
+ iniconfig==2.0.0
80
+ gitdb==4.0.11
81
+ jsonschema==4.23.0
82
+ click==8.1.7
83
+ ninja==1.11.1.3
84
+ setuptools==75.6.0
85
+ audioread==3.0.1
86
+ frozenlist==1.5.0
87
+ transformers-stream-generator==0.0.5
88
+ nvidia-cublas-cu12==12.4.5.8
89
+ pycparser==2.22
90
+ GitPython==3.1.43
91
+ tqdm==4.67.1
92
+ importlib_metadata==8.5.0
93
+ patsy==1.0.1
94
+ networkx==3.4.2
95
+ semantic-version==2.10.0
96
+ alpaca_eval==0.6.6
97
+ google-cloud-core==2.4.1
98
+ prometheus_client==0.21.1
99
+ jiter==0.8.2
100
+ scipy==1.14.1
101
+ starlette==0.41.3
102
+ jq==1.8.0
103
+ opencensus-context==0.1.3
104
+ cachetools==5.5.1
105
+ cffi==1.17.1
106
+ opencv-python-headless==4.10.0.84
107
+ joblib==1.4.2
108
+ yt-dlp==2025.1.26
109
+ python-dateutil==2.9.0.post0
110
+ httpx==0.28.1
111
+ msgpack==1.1.0
112
+ pydub==0.25.1
113
+ tomlkit==0.13.2
114
+ nvitop==1.4.2
115
+ nvidia-cusparse-cu12==12.3.1.170
116
+ msgspec==0.18.6
117
+ aiosignal==1.3.2
118
+ wheel==0.45.1
119
+ filelock==3.16.1
120
+ pillow==10.4.0
121
+ typer==0.15.1
122
+ websockets==14.1
123
+ resampy==0.4.3
124
+ aiofiles==23.2.1
125
+ aiohttp-cors==0.7.0
126
+ platformdirs==4.3.6
127
+ gguf==0.10.0
128
+ diskcache==5.6.3
129
+ cloudpickle==3.1.0
130
+ multidict==6.1.0
131
+ py-cpuinfo==9.0.0
132
+ scikit-learn==1.6.0
133
+ smart-open==7.1.0
134
+ tiktoken==0.7.0
135
+ grpcio==1.70.0
136
+ charset-normalizer==3.4.0
137
+ nest-asyncio==1.6.0
138
+ lark==1.2.2
139
+ beautifulsoup4==4.13.3
140
+ pip==24.3.1
141
+ six==1.17.0
142
+ prometheus-fastapi-instrumentator==7.0.0
143
+ ruff==0.8.3
144
+ rich-toolkit==0.13.2
145
+ lazy_loader==0.4
146
+ grpc-google-iam-v1==0.14.0
147
+ psutil==6.1.0
148
+ mdurl==0.1.2
149
+ nvidia-nccl-cu12==2.21.5
150
+ triton==3.1.0
151
+ torchvision==0.20.1
152
+ fastapi==0.115.6
153
+ referencing==0.35.1
154
+ xxhash==3.5.0
155
+ pyzmq==26.2.0
156
+ torchlibrosa==0.1.0
157
+ googleapis-common-protos==1.66.0
158
+ pyasn1==0.6.1
159
+ soundfile==0.12.1
160
+ pyparsing==3.2.1
161
+ xgrammar==0.1.11
162
+ gradio_client==1.5.2
163
+ watchfiles==1.0.3
164
+ pluggy==1.5.0
165
+ py-spy==0.4.0
166
+ pybind11==2.13.6
167
+ diffusers==0.31.0
168
+ sentencepiece==0.2.0
169
+ flash_attn==2.7.4.post1
170
+ annotated-types==0.7.0
171
+ interegular==0.3.3
172
+ requests==2.32.3
173
+ opencensus==0.11.4
174
+ colorful==0.5.6
175
+ google-api-core==2.24.1
176
+ pytest==8.3.4
177
+ dnspython==2.7.0
178
+ pydantic_core==2.27.1
179
+ pytz==2024.2
180
+ pyasn1_modules==0.4.1
181
+ propcache==0.2.1
182
+ accelerate==1.2.1
183
+ fire==0.7.0
184
+ textual==1.0.0
185
+ sniffio==1.3.1
186
+ pyarrow==18.1.0
187
+ protobuf==5.29.1
188
+ wcwidth==0.2.13
189
+ packaging==24.2
190
+ uvicorn==0.34.0
191
+ sentry-sdk==2.19.2
192
+ google-auth==2.38.0
193
+ typing_extensions==4.12.2
194
+ peft==0.14.0
195
+ depyf==0.18.0
196
+ multiprocess==0.70.16
197
+ google-cloud-translate==3.19.0
198
+ nvidia-cuda-nvrtc-cu12==12.4.127
199
+ jsonschema-specifications==2024.10.1
200
+ vllm==0.7.3
201
+ nvidia-cufft-cu12==11.2.1.3
202
+ timm==1.0.12
203
+ rich==13.9.4
204
+ ffmpy==0.4.0
205
+ virtualenv==20.29.1
206
+ tzdata==2024.2
207
+ smmap==5.0.1
208
+ uc-micro-py==1.0.3
209
+ proto-plus==1.26.0
210
+ soxr==0.5.0.post1
211
+ h11==0.14.0
212
+ outlines_core==0.1.26
213
+ compressed-tensors==0.9.1
214
+ blake3==1.0.4
215
+ xformers==0.0.28.post3
216
+ orjson==3.10.12
217
+ ray==2.40.0
218
+ PyYAML==6.0.2
219
+ nvidia-ml-py==12.560.30
220
+ python-multipart==0.0.19
221
+ PySocks==1.7.1
222
+ regex==2024.11.6
223
+ pooch==1.8.2
224
+ termcolor==2.5.0
225
+ MarkupSafe==2.1.5
226
+ torch==2.5.1
227
+ fastapi-cli==0.0.7
228
+ gdown==5.2.0
229
+ numba==0.60.0
230
+ httptools==0.6.4
231
+ transformers==4.50.0.dev0
232
+ mistral_common==1.5.1
233
+ astor==0.8.1
234
+ anyio==4.7.0
235
+ safetensors==0.4.5
236
+ threadpoolctl==3.5.0
237
+ wrapt==1.17.2
238
+ wheel==0.43.0
239
+ jaraco.functools==4.0.1
240
+ inflect==7.3.1
241
+ jaraco.text==3.12.1
242
+ typeguard==4.3.0
243
+ jaraco.collections==5.1.0
244
+ importlib_metadata==8.0.0
245
+ backports.tarfile==1.2.0
246
+ tomli==2.0.1
247
+ autocommand==2.2.2
248
+ platformdirs==4.2.2
249
+ more-itertools==10.3.0
250
+ zipp==3.19.2
251
+ packaging==24.2
252
+ typing_extensions==4.12.2
253
+ jaraco.context==5.3.0
wandb/run-20250405_124142-wdmxf5un/files/wandb-metadata.json ADDED
@@ -0,0 +1,112 @@
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.0",
4
+ "startedAt": "2025-04-05T04:41:42.080694Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset",
11
+ "--train_split",
12
+ "train",
13
+ "--train_template",
14
+ "Safe_o1",
15
+ "--output_dir",
16
+ "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
17
+ "--log_project",
18
+ "safe-o1",
19
+ "--per_device_train_batch_size",
20
+ "4",
21
+ "--per_device_eval_batch_size",
22
+ "4",
23
+ "--gradient_accumulation_steps",
24
+ "2",
25
+ "--learning_rate",
26
+ "2e-5",
27
+ "--epochs",
28
+ "6",
29
+ "--model_max_length",
30
+ "16384"
31
+ ],
32
+ "program": "-m align_anything.trainers.text_to_text.sft",
33
+ "git": {
34
+ "remote": "[email protected]:PKU-Alignment/align-anything.git",
35
+ "commit": "7ee46f54200d18e3c6c37568ba688d3be5ae7619"
36
+ },
37
+ "email": "[email protected]",
38
+ "root": "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
39
+ "host": "dgx-092",
40
+ "executable": "/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/python3.11",
41
+ "cpu_count": 112,
42
+ "cpu_count_logical": 224,
43
+ "gpu": "NVIDIA H800",
44
+ "gpu_count": 8,
45
+ "disk": {
46
+ "/": {
47
+ "total": "1888556142592",
48
+ "used": "149928067072"
49
+ }
50
+ },
51
+ "memory": {
52
+ "total": "2164195454976"
53
+ },
54
+ "cpu": {
55
+ "count": 112,
56
+ "countLogical": 224
57
+ },
58
+ "gpu_nvidia": [
59
+ {
60
+ "name": "NVIDIA H800",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper"
64
+ },
65
+ {
66
+ "name": "NVIDIA H800",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper"
70
+ },
71
+ {
72
+ "name": "NVIDIA H800",
73
+ "memoryTotal": "85520809984",
74
+ "cudaCores": 16896,
75
+ "architecture": "Hopper"
76
+ },
77
+ {
78
+ "name": "NVIDIA H800",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper"
88
+ },
89
+ {
90
+ "name": "NVIDIA H800",
91
+ "memoryTotal": "85520809984",
92
+ "cudaCores": 16896,
93
+ "architecture": "Hopper"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper"
100
+ },
101
+ {
102
+ "name": "NVIDIA H800",
103
+ "memoryTotal": "85520809984",
104
+ "cudaCores": 16896,
105
+ "architecture": "Hopper"
106
+ }
107
+ ],
108
+ "slurm": {
109
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
110
+ },
111
+ "cudaVersion": "12.2"
112
+ }
wandb/run-20250405_124142-wdmxf5un/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_runtime":1813.133208883,"_step":1056,"_wandb":{"runtime":1813},"train/step":1056,"train/loss":0.07423145323991776,"train/lr":2e-05,"train/epoch":6,"_timestamp":1.7438298946027555e+09}
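For reference, the one-line summary above records the final state of this run: 1056 logged train steps, a final train/loss of about 0.074, and roughly 1813 seconds of tracked runtime. A small, hypothetical sketch of reading those fields back out of the file (keys exactly as shown above; the relative path follows this folder's layout):

# Minimal sketch: read the run summary written by wandb for this run.
import json

summary_path = "wandb/run-20250405_124142-wdmxf5un/files/wandb-summary.json"
with open(summary_path) as f:
    summary = json.load(f)

print("final step   :", summary["train/step"])         # 1056
print("final loss   :", summary["train/loss"])         # ~0.074
print("epochs logged:", summary["train/epoch"])        # 6
print("runtime (s)  :", summary["_wandb"]["runtime"])  # 1813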
wandb/run-20250405_124142-wdmxf5un/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2025-04-05T12:41:41.503771102+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpo0_9yrfw/port-3499761.txt","pid":3499761,"debug":false,"disable-analytics":false}
2
+ {"time":"2025-04-05T12:41:41.50383036+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2025-04-05T12:41:41.504662712+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3499761}
4
+ {"time":"2025-04-05T12:41:41.504649334+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41733,"Zone":""}}
5
+ {"time":"2025-04-05T12:41:41.685582021+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57286"}
6
+ {"time":"2025-04-05T12:41:42.081810281+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"wdmxf5un","id":"127.0.0.1:57286"}
7
+ {"time":"2025-04-05T12:41:42.297218189+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"wdmxf5un","id":"127.0.0.1:57286"}
8
+ {"time":"2025-04-05T13:11:58.021979029+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"wdmxf5un","id":"127.0.0.1:57286"}
9
+ {"time":"2025-04-05T13:11:58.02250833+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"wdmxf5un","id":"127.0.0.1:57286"}
10
+ {"time":"2025-04-05T13:11:58.069941299+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:57286"}
11
+ {"time":"2025-04-05T13:11:58.069957091+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:57286"}
12
+ {"time":"2025-04-05T13:11:58.069970949+08:00","level":"INFO","msg":"server is shutting down"}
13
+ {"time":"2025-04-05T13:11:58.069994407+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:57286"}
14
+ {"time":"2025-04-05T13:11:58.07003219+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:57286"}
15
+ {"time":"2025-04-05T13:11:58.070034704+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:57286"}
16
+ {"time":"2025-04-05T13:11:58.070037746+08:00","level":"INFO","msg":"server is closed"}
wandb/run-20250405_124142-wdmxf5un/logs/debug-internal.log ADDED
@@ -0,0 +1,17 @@
1
+ {"time":"2025-04-05T12:41:42.083502459+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
2
+ {"time":"2025-04-05T12:41:42.083646225+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_124142-wdmxf5un/logs/debug-core.log"}
3
+ {"time":"2025-04-05T12:41:42.297160304+08:00","level":"INFO","msg":"created new stream","id":"wdmxf5un"}
4
+ {"time":"2025-04-05T12:41:42.297211119+08:00","level":"INFO","msg":"stream: started","id":"wdmxf5un"}
5
+ {"time":"2025-04-05T12:41:42.297225618+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"wdmxf5un"}
6
+ {"time":"2025-04-05T12:41:42.297245103+08:00","level":"INFO","msg":"sender: started","stream_id":"wdmxf5un"}
7
+ {"time":"2025-04-05T12:41:42.297240136+08:00","level":"INFO","msg":"handler: started","stream_id":"wdmxf5un"}
8
+ {"time":"2025-04-05T12:41:42.608849544+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-04-05T13:11:55.213924936+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-04-05T13:11:55.214734471+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-04-05T13:11:56.17645318+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading config.yaml","runtime_seconds":0.303202353,"progress":"2.7KB/2.7KB"},{"desc":"uploading output.log","runtime_seconds":0.303187719,"progress":"19.7KB/19.7KB"}],"total_operations":2}}
12
+ {"time":"2025-04-05T13:11:56.7588989+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-04-05T13:11:58.022118371+08:00","level":"INFO","msg":"stream: closing","id":"wdmxf5un"}
14
+ {"time":"2025-04-05T13:11:58.022162409+08:00","level":"INFO","msg":"handler: closed","stream_id":"wdmxf5un"}
15
+ {"time":"2025-04-05T13:11:58.022170713+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"wdmxf5un"}
16
+ {"time":"2025-04-05T13:11:58.022286446+08:00","level":"INFO","msg":"sender: closed","stream_id":"wdmxf5un"}
17
+ {"time":"2025-04-05T13:11:58.022499189+08:00","level":"INFO","msg":"stream: closed","id":"wdmxf5un"}
wandb/run-20250405_124142-wdmxf5un/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
2
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Configure stats pid to 3499761
3
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
5
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Loading settings from environment variables
6
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_124142-wdmxf5un/logs/debug.log
7
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_124142-wdmxf5un/logs/debug-internal.log
8
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:init():644] calling init triggers
9
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 6, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_o1', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
11
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:init():680] starting backend
12
+ 2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:init():684] sending inform_init request
13
+ 2025-04-05 12:41:42,080 INFO MainThread:3499761 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-05 12:41:42,080 INFO MainThread:3499761 [wandb_init.py:init():697] backend started and connected
15
+ 2025-04-05 12:41:42,081 INFO MainThread:3499761 [wandb_init.py:init():790] updated telemetry
16
+ 2025-04-05 12:41:42,093 INFO MainThread:3499761 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
17
+ 2025-04-05 12:41:42,603 INFO MainThread:3499761 [wandb_init.py:init():874] starting run threads in backend
18
+ 2025-04-05 12:41:42,840 INFO MainThread:3499761 [wandb_run.py:_console_start():2374] atexit reg
19
+ 2025-04-05 12:41:42,840 INFO MainThread:3499761 [wandb_run.py:_redirect():2224] redirect: wrap_raw
20
+ 2025-04-05 12:41:42,840 INFO MainThread:3499761 [wandb_run.py:_redirect():2289] Wrapping output streams.
21
+ 2025-04-05 12:41:42,841 INFO MainThread:3499761 [wandb_run.py:_redirect():2314] Redirects installed.
22
+ 2025-04-05 12:41:42,843 INFO MainThread:3499761 [wandb_init.py:init():916] run started, returning control to user process
23
+ 2025-04-05 13:11:55,174 INFO MainThread:3499761 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/wdmxf5un
24
+ 2025-04-05 13:11:55,175 INFO MainThread:3499761 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
25
+ 2025-04-05 13:11:55,175 INFO MainThread:3499761 [wandb_run.py:_restore():2321] restore
26
+ 2025-04-05 13:11:55,175 INFO MainThread:3499761 [wandb_run.py:_restore():2327] restore done
27
+ 2025-04-05 13:11:58,016 INFO MainThread:3499761 [wandb_run.py:_footer_history_summary_info():3892] rendering history
28
+ 2025-04-05 13:11:58,016 INFO MainThread:3499761 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
29
+ 2025-04-05 13:11:58,021 INFO MainThread:3499761 [wandb_run.py:_footer_sync_info():3853] logging synced files
wandb/run-20250405_124142-wdmxf5un/run-wdmxf5un.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:745d51aa8a60822ffdf463265dc9f734f3350d16bfa1d80cebc8c6babd74d586
3
+ size 3245024
wandb/run-20250405_153219-puqja889/files/config.yaml ADDED
@@ -0,0 +1,109 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.1
4
+ m: []
5
+ python_version: 3.11.0
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 41
12
+ - 49
13
+ - 51
14
+ - 53
15
+ - 55
16
+ - 63
17
+ - 71
18
+ - 83
19
+ - 98
20
+ - 105
21
+ "2":
22
+ - 1
23
+ - 5
24
+ - 11
25
+ - 41
26
+ - 49
27
+ - 51
28
+ - 53
29
+ - 55
30
+ - 63
31
+ - 71
32
+ - 83
33
+ - 98
34
+ - 105
35
+ "3":
36
+ - 2
37
+ - 13
38
+ - 16
39
+ - 23
40
+ - 55
41
+ - 61
42
+ "4": 3.11.0
43
+ "5": 0.19.1
44
+ "6": 4.50.0.dev0
45
+ "8":
46
+ - 5
47
+ "12": 0.19.1
48
+ "13": linux-x86_64
49
+ bnb_cfgs:
50
+ value:
51
+ bnb_4bit_compute_dtype: float16
52
+ bnb_4bit_quant_type: nf4
53
+ bnb_4bit_use_double_quant: true
54
+ load_in_4bit: true
55
+ load_in_8bit: false
56
+ use_bnb: false
57
+ data_cfgs:
58
+ value:
59
+ eval_optional_args: []
60
+ train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
61
+ train_optional_args: []
62
+ train_split: train
63
+ train_template: Safe_o1
64
+ logger_cfgs:
65
+ value:
66
+ log_project: safe-o1
67
+ log_run_name: sft
68
+ log_type: wandb
69
+ output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
70
+ save_interval: 100000
71
+ lora_cfgs:
72
+ value:
73
+ inference_mode: false
74
+ lora_alpha: 16
75
+ lora_dropout: 0.1
76
+ r: 16
77
+ save_full_model: true
78
+ target_modules:
79
+ - q_proj
80
+ - v_proj
81
+ task_type: TaskType.CAUSAL_LM
82
+ use_lora: false
83
+ model_cfgs:
84
+ value:
85
+ model_max_length: 16384
86
+ model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
87
+ trust_remote_code: true
88
+ train_cfgs:
89
+ value:
90
+ adam_betas:
91
+ - 0.9
92
+ - 0.95
93
+ adam_epsilon: 1e-08
94
+ bf16: true
95
+ ds_cfgs: ds_z3_config.json
96
+ epochs: 3
97
+ eval_interval: 10
98
+ eval_strategy: steps
99
+ fp16: false
100
+ gradient_accumulation_steps: 2
101
+ gradient_checkpointing: true
102
+ learning_rate: 2e-05
103
+ lr_scheduler_type: constant
104
+ lr_warmup_ratio: 0.03
105
+ max_grad_norm: 1
106
+ per_device_eval_batch_size: 4
107
+ per_device_train_batch_size: 4
108
+ seed: 42
109
+ weight_decay: 0
wandb/run-20250405_153219-puqja889/files/output.log ADDED
@@ -0,0 +1,63 @@
1
+ ***** Running training *****
2
+ Training 1/3 epoch: 0%| | 0/528 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 2/3 epoch (loss 0.5996): 60%|███████████████████████████████████████████████████████████████████████████████▏ | 319/528 [09:05<05:41, 1.64s/it]
4
+ [2025-04-05 15:32:59,853] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
5
+ [2025-04-05 15:32:59,853] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=20.30692434041, CurrSamplesPerSec=19.741172703491532, MemAllocated=15.18GB, MaxMemAllocated=36.69GB
6
+ [2025-04-05 15:33:32,887] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
7
+ [2025-04-05 15:33:32,887] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=20.267265680615793, CurrSamplesPerSec=19.119188499422773, MemAllocated=15.18GB, MaxMemAllocated=36.69GB
8
+ [2025-04-05 15:34:09,740] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
9
+ [2025-04-05 15:34:09,740] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=19.441145165201213, CurrSamplesPerSec=22.89643941293659, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
10
+ [2025-04-05 15:34:42,269] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
11
+ [2025-04-05 15:34:42,269] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=19.71075418937523, CurrSamplesPerSec=21.574415014581692, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
12
+ [2025-04-05 15:35:15,596] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
13
+ [2025-04-05 15:35:15,596] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=19.786513682310243, CurrSamplesPerSec=23.265023869025228, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
14
+ [2025-04-05 15:35:48,802] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
15
+ [2025-04-05 15:35:48,803] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=19.84854756919738, CurrSamplesPerSec=18.438848766680724, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
16
+ [2025-04-05 15:36:21,949] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
17
+ [2025-04-05 15:36:21,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=19.89555189033183, CurrSamplesPerSec=18.18743542017038, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
18
+ [2025-04-05 15:36:56,642] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
19
+ [2025-04-05 15:36:56,642] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=19.808610765037212, CurrSamplesPerSec=19.790898299980608, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
20
+ [2025-04-05 15:37:31,295] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
21
+ [2025-04-05 15:37:31,296] [INFO] [timer.py:264:stop] epoch=1/micro_step=4/global_step=90, RunningAvgSamplesPerSec=19.75046050883055, CurrSamplesPerSec=15.143802444921475, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
22
+ [2025-04-05 15:38:04,353] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
23
+ [2025-04-05 15:38:04,354] [INFO] [timer.py:264:stop] epoch=1/micro_step=24/global_step=100, RunningAvgSamplesPerSec=19.799562880276337, CurrSamplesPerSec=20.365822624126473, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
24
+ [2025-04-05 15:38:38,700] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
25
+ [2025-04-05 15:38:38,701] [INFO] [timer.py:264:stop] epoch=1/micro_step=44/global_step=110, RunningAvgSamplesPerSec=19.76926764728122, CurrSamplesPerSec=14.58470845145212, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
26
+ [2025-04-05 15:39:13,809] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
27
+ [2025-04-05 15:39:13,810] [INFO] [timer.py:264:stop] epoch=1/micro_step=64/global_step=120, RunningAvgSamplesPerSec=19.702187119604087, CurrSamplesPerSec=18.840601998397894, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
28
+ [2025-04-05 15:39:47,221] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
29
+ [2025-04-05 15:39:47,222] [INFO] [timer.py:264:stop] epoch=1/micro_step=84/global_step=130, RunningAvgSamplesPerSec=19.734580817376962, CurrSamplesPerSec=19.789930950426523, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
30
+ [2025-04-05 15:40:20,485] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
31
+ [2025-04-05 15:40:20,486] [INFO] [timer.py:264:stop] epoch=1/micro_step=104/global_step=140, RunningAvgSamplesPerSec=19.761128557333294, CurrSamplesPerSec=18.234083740928202, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
32
+ [2025-04-05 15:40:53,748] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
33
+ [2025-04-05 15:40:53,748] [INFO] [timer.py:264:stop] epoch=1/micro_step=124/global_step=150, RunningAvgSamplesPerSec=19.78862789065117, CurrSamplesPerSec=21.41741861785655, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
34
+ [2025-04-05 15:41:26,856] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
35
+ [2025-04-05 15:41:26,856] [INFO] [timer.py:264:stop] epoch=1/micro_step=144/global_step=160, RunningAvgSamplesPerSec=19.81502168719465, CurrSamplesPerSec=24.666726261517713, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
36
+ [2025-04-05 15:42:01,110] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
37
+ [2025-04-05 15:42:01,110] [INFO] [timer.py:264:stop] epoch=1/micro_step=164/global_step=170, RunningAvgSamplesPerSec=19.792251767052175, CurrSamplesPerSec=22.756839975456966, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
38
+ [2025-04-05 15:42:35,842] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
39
+ [2025-04-05 15:42:35,842] [INFO] [timer.py:264:stop] epoch=2/micro_step=8/global_step=180, RunningAvgSamplesPerSec=19.7654807990148, CurrSamplesPerSec=21.057054092109006, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
40
+ [2025-04-05 15:43:11,122] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
41
+ [2025-04-05 15:43:11,122] [INFO] [timer.py:264:stop] epoch=2/micro_step=28/global_step=190, RunningAvgSamplesPerSec=19.72017429160466, CurrSamplesPerSec=14.290293089769191, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
42
+ [2025-04-05 15:43:43,794] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
43
+ [2025-04-05 15:43:43,795] [INFO] [timer.py:264:stop] epoch=2/micro_step=48/global_step=200, RunningAvgSamplesPerSec=19.76614861625657, CurrSamplesPerSec=23.281404168636236, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
44
+ [2025-04-05 15:44:19,358] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
45
+ [2025-04-05 15:44:19,359] [INFO] [timer.py:264:stop] epoch=2/micro_step=68/global_step=210, RunningAvgSamplesPerSec=19.71474683772797, CurrSamplesPerSec=22.07258105575799, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
46
+ [2025-04-05 15:44:53,709] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
47
+ [2025-04-05 15:44:53,709] [INFO] [timer.py:264:stop] epoch=2/micro_step=88/global_step=220, RunningAvgSamplesPerSec=19.705082012780586, CurrSamplesPerSec=19.593044905429547, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
48
+ [2025-04-05 15:45:26,365] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
49
+ [2025-04-05 15:45:26,366] [INFO] [timer.py:264:stop] epoch=2/micro_step=108/global_step=230, RunningAvgSamplesPerSec=19.739918406677425, CurrSamplesPerSec=19.835104573202546, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
50
+ [2025-04-05 15:45:59,052] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
51
+ [2025-04-05 15:45:59,053] [INFO] [timer.py:264:stop] epoch=2/micro_step=128/global_step=240, RunningAvgSamplesPerSec=19.772618786957285, CurrSamplesPerSec=20.382635106879853, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
52
+ [2025-04-05 15:46:32,875] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
53
+ [2025-04-05 15:46:32,876] [INFO] [timer.py:264:stop] epoch=2/micro_step=148/global_step=250, RunningAvgSamplesPerSec=19.769924873342227, CurrSamplesPerSec=15.725991281451387, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
54
+ [2025-04-05 15:47:06,948] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
55
+ [2025-04-05 15:47:06,948] [INFO] [timer.py:264:stop] epoch=2/micro_step=168/global_step=260, RunningAvgSamplesPerSec=19.769435747796898, CurrSamplesPerSec=21.595577884084456, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
56
+ Saving model to "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end" ...
57
+ Saving 16-bit model...
58
+ [2025-04-05 15:47:28,005] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step264 is about to be saved!
59
+ [2025-04-05 15:47:28,007] [INFO] [engine.py:3680:save_16bit_model] Saving model weights to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin, tag: global_step264
60
+ [2025-04-05 15:47:28,007] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin...
61
+ [2025-04-05 15:47:41,242] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin.
62
+ [2025-04-05 15:47:41,242] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step264 is ready now!
63
+ Model saved!
wandb/run-20250405_153219-puqja889/files/requirements.txt ADDED
@@ -0,0 +1,253 @@
1
+ maskrcnn_benchmark==0.0.0
2
+ deepspeed==0.16.1
3
+ uritemplate==4.1.1
4
+ pyairports==2.1.1
5
+ partial-json-parser==0.2.1.1.post4
6
+ tensorboard-data-server==0.7.2
7
+ pydantic==2.10.3
8
+ Werkzeug==3.1.3
9
+ attrs==24.3.0
10
+ Jinja2==3.1.4
11
+ email_validator==2.2.0
12
+ mdit-py-plugins==0.4.2
13
+ google-api-python-client==2.160.0
14
+ pandas==2.2.3
15
+ safehttpx==0.1.6
16
+ setproctitle==1.3.4
17
+ dill==0.3.8
18
+ torchaudio==2.5.1
19
+ frechet-audio-distance==0.1.2
20
+ blessed==1.20.0
21
+ llvmlite==0.43.0
22
+ litellm==1.60.8
23
+ nvidia-nvtx-cu12==12.4.127
24
+ nvidia-cusolver-cu12==11.6.1.9
25
+ einops==0.8.0
26
+ datasets==3.2.0
27
+ pycountry==24.6.1
28
+ airportsdata==20250224
29
+ idna==3.10
30
+ urllib3==2.2.3
31
+ mpmath==1.3.0
32
+ wandb==0.19.1
33
+ certifi==2024.12.14
34
+ markdown-it-py==3.0.0
35
+ align-anything==0.0.1.dev0
36
+ aiohttp==3.11.10
37
+ fsspec==2024.9.0
38
+ aiohappyeyeballs==2.4.4
39
+ httplib2==0.22.0
40
+ hjson==3.1.0
41
+ yarl==1.18.3
42
+ decorator==5.1.1
43
+ distlib==0.3.9
44
+ absl-py==2.1.0
45
+ huggingface-hub==0.27.0
46
+ memray==1.15.0
47
+ Pygments==2.18.0
48
+ soupsieve==2.6
49
+ shellingham==1.5.4
50
+ tokenizers==0.21.0
51
+ uvloop==0.21.0
52
+ numpy==1.26.4
53
+ linkify-it-py==2.0.3
54
+ sympy==1.13.1
55
+ python-dotenv==1.0.1
56
+ nvidia-cuda-runtime-cu12==12.4.127
57
+ tensorboard==2.18.0
58
+ fastrlock==0.8.3
59
+ rsa==4.9
60
+ lm-format-enforcer==0.10.9
61
+ openai==1.61.1
62
+ gpustat==1.1.1
63
+ librosa==0.10.2.post1
64
+ grpcio-status==1.70.0
65
+ nvidia-cudnn-cu12==9.1.0.70
66
+ zipp==3.21.0
67
+ nvidia-nvjitlink-cu12==12.4.127
68
+ cupy-cuda12x==13.3.0
69
+ Markdown==3.7
70
+ nvidia-cuda-cupti-cu12==12.4.127
71
+ nvidia-curand-cu12==10.3.5.147
72
+ rpds-py==0.22.3
73
+ outlines==0.1.11
74
+ docker-pycreds==0.4.0
75
+ distro==1.9.0
76
+ httpcore==1.0.7
77
+ gradio==5.9.0
78
+ google-auth-httplib2==0.2.0
79
+ iniconfig==2.0.0
80
+ gitdb==4.0.11
81
+ jsonschema==4.23.0
82
+ click==8.1.7
83
+ ninja==1.11.1.3
84
+ setuptools==75.6.0
85
+ audioread==3.0.1
86
+ frozenlist==1.5.0
87
+ transformers-stream-generator==0.0.5
88
+ nvidia-cublas-cu12==12.4.5.8
89
+ pycparser==2.22
90
+ GitPython==3.1.43
91
+ tqdm==4.67.1
92
+ importlib_metadata==8.5.0
93
+ patsy==1.0.1
94
+ networkx==3.4.2
95
+ semantic-version==2.10.0
96
+ alpaca_eval==0.6.6
97
+ google-cloud-core==2.4.1
98
+ prometheus_client==0.21.1
99
+ jiter==0.8.2
100
+ scipy==1.14.1
101
+ starlette==0.41.3
102
+ jq==1.8.0
103
+ opencensus-context==0.1.3
104
+ cachetools==5.5.1
105
+ cffi==1.17.1
106
+ opencv-python-headless==4.10.0.84
107
+ joblib==1.4.2
108
+ yt-dlp==2025.1.26
109
+ python-dateutil==2.9.0.post0
110
+ httpx==0.28.1
111
+ msgpack==1.1.0
112
+ pydub==0.25.1
113
+ tomlkit==0.13.2
114
+ nvitop==1.4.2
115
+ nvidia-cusparse-cu12==12.3.1.170
116
+ msgspec==0.18.6
117
+ aiosignal==1.3.2
118
+ wheel==0.45.1
119
+ filelock==3.16.1
120
+ pillow==10.4.0
121
+ typer==0.15.1
122
+ websockets==14.1
123
+ resampy==0.4.3
124
+ aiofiles==23.2.1
125
+ aiohttp-cors==0.7.0
126
+ platformdirs==4.3.6
127
+ gguf==0.10.0
128
+ diskcache==5.6.3
129
+ cloudpickle==3.1.0
130
+ multidict==6.1.0
131
+ py-cpuinfo==9.0.0
132
+ scikit-learn==1.6.0
133
+ smart-open==7.1.0
134
+ tiktoken==0.7.0
135
+ grpcio==1.70.0
136
+ charset-normalizer==3.4.0
137
+ nest-asyncio==1.6.0
138
+ lark==1.2.2
139
+ beautifulsoup4==4.13.3
140
+ pip==24.3.1
141
+ six==1.17.0
142
+ prometheus-fastapi-instrumentator==7.0.0
143
+ ruff==0.8.3
144
+ rich-toolkit==0.13.2
145
+ lazy_loader==0.4
146
+ grpc-google-iam-v1==0.14.0
147
+ psutil==6.1.0
148
+ mdurl==0.1.2
149
+ nvidia-nccl-cu12==2.21.5
150
+ triton==3.1.0
151
+ torchvision==0.20.1
152
+ fastapi==0.115.6
153
+ referencing==0.35.1
154
+ xxhash==3.5.0
155
+ pyzmq==26.2.0
156
+ torchlibrosa==0.1.0
157
+ googleapis-common-protos==1.66.0
158
+ pyasn1==0.6.1
159
+ soundfile==0.12.1
160
+ pyparsing==3.2.1
161
+ xgrammar==0.1.11
162
+ gradio_client==1.5.2
163
+ watchfiles==1.0.3
164
+ pluggy==1.5.0
165
+ py-spy==0.4.0
166
+ pybind11==2.13.6
167
+ diffusers==0.31.0
168
+ sentencepiece==0.2.0
169
+ flash_attn==2.7.4.post1
170
+ annotated-types==0.7.0
171
+ interegular==0.3.3
172
+ requests==2.32.3
173
+ opencensus==0.11.4
174
+ colorful==0.5.6
175
+ google-api-core==2.24.1
176
+ pytest==8.3.4
177
+ dnspython==2.7.0
178
+ pydantic_core==2.27.1
179
+ pytz==2024.2
180
+ pyasn1_modules==0.4.1
181
+ propcache==0.2.1
182
+ accelerate==1.2.1
183
+ fire==0.7.0
184
+ textual==1.0.0
185
+ sniffio==1.3.1
186
+ pyarrow==18.1.0
187
+ protobuf==5.29.1
188
+ wcwidth==0.2.13
189
+ packaging==24.2
190
+ uvicorn==0.34.0
191
+ sentry-sdk==2.19.2
192
+ google-auth==2.38.0
193
+ typing_extensions==4.12.2
194
+ peft==0.14.0
195
+ depyf==0.18.0
196
+ multiprocess==0.70.16
197
+ google-cloud-translate==3.19.0
198
+ nvidia-cuda-nvrtc-cu12==12.4.127
199
+ jsonschema-specifications==2024.10.1
200
+ vllm==0.7.3
201
+ nvidia-cufft-cu12==11.2.1.3
202
+ timm==1.0.12
203
+ rich==13.9.4
204
+ ffmpy==0.4.0
205
+ virtualenv==20.29.1
206
+ tzdata==2024.2
207
+ smmap==5.0.1
208
+ uc-micro-py==1.0.3
209
+ proto-plus==1.26.0
210
+ soxr==0.5.0.post1
211
+ h11==0.14.0
212
+ outlines_core==0.1.26
213
+ compressed-tensors==0.9.1
214
+ blake3==1.0.4
215
+ xformers==0.0.28.post3
216
+ orjson==3.10.12
217
+ ray==2.40.0
218
+ PyYAML==6.0.2
219
+ nvidia-ml-py==12.560.30
220
+ python-multipart==0.0.19
221
+ PySocks==1.7.1
222
+ regex==2024.11.6
223
+ pooch==1.8.2
224
+ termcolor==2.5.0
225
+ MarkupSafe==2.1.5
226
+ torch==2.5.1
227
+ fastapi-cli==0.0.7
228
+ gdown==5.2.0
229
+ numba==0.60.0
230
+ httptools==0.6.4
231
+ transformers==4.50.0.dev0
232
+ mistral_common==1.5.1
233
+ astor==0.8.1
234
+ anyio==4.7.0
235
+ safetensors==0.4.5
236
+ threadpoolctl==3.5.0
237
+ wrapt==1.17.2
238
+ wheel==0.43.0
239
+ jaraco.functools==4.0.1
240
+ inflect==7.3.1
241
+ jaraco.text==3.12.1
242
+ typeguard==4.3.0
243
+ jaraco.collections==5.1.0
244
+ importlib_metadata==8.0.0
245
+ backports.tarfile==1.2.0
246
+ tomli==2.0.1
247
+ autocommand==2.2.2
248
+ platformdirs==4.2.2
249
+ more-itertools==10.3.0
250
+ zipp==3.19.2
251
+ packaging==24.2
252
+ typing_extensions==4.12.2
253
+ jaraco.context==5.3.0
wandb/run-20250405_153219-puqja889/files/wandb-metadata.json ADDED
@@ -0,0 +1,112 @@
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.0",
4
+ "startedAt": "2025-04-05T07:32:19.230644Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset",
11
+ "--train_split",
12
+ "train",
13
+ "--train_template",
14
+ "Safe_o1",
15
+ "--output_dir",
16
+ "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
17
+ "--log_project",
18
+ "safe-o1",
19
+ "--per_device_train_batch_size",
20
+ "4",
21
+ "--per_device_eval_batch_size",
22
+ "4",
23
+ "--gradient_accumulation_steps",
24
+ "2",
25
+ "--learning_rate",
26
+ "2e-5",
27
+ "--epochs",
28
+ "3",
29
+ "--model_max_length",
30
+ "16384"
31
+ ],
32
+ "program": "-m align_anything.trainers.text_to_text.sft",
33
+ "git": {
34
+ "remote": "[email protected]:PKU-Alignment/align-anything.git",
35
+ "commit": "7ee46f54200d18e3c6c37568ba688d3be5ae7619"
36
+ },
37
+ "email": "[email protected]",
38
+ "root": "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
39
+ "host": "dgx-092",
40
+ "executable": "/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/python3.11",
41
+ "cpu_count": 112,
42
+ "cpu_count_logical": 224,
43
+ "gpu": "NVIDIA H800",
44
+ "gpu_count": 8,
45
+ "disk": {
46
+ "/": {
47
+ "total": "1888556142592",
48
+ "used": "149958434816"
49
+ }
50
+ },
51
+ "memory": {
52
+ "total": "2164195454976"
53
+ },
54
+ "cpu": {
55
+ "count": 112,
56
+ "countLogical": 224
57
+ },
58
+ "gpu_nvidia": [
59
+ {
60
+ "name": "NVIDIA H800",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper"
64
+ },
65
+ {
66
+ "name": "NVIDIA H800",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper"
70
+ },
71
+ {
72
+ "name": "NVIDIA H800",
73
+ "memoryTotal": "85520809984",
74
+ "cudaCores": 16896,
75
+ "architecture": "Hopper"
76
+ },
77
+ {
78
+ "name": "NVIDIA H800",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper"
88
+ },
89
+ {
90
+ "name": "NVIDIA H800",
91
+ "memoryTotal": "85520809984",
92
+ "cudaCores": 16896,
93
+ "architecture": "Hopper"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper"
100
+ },
101
+ {
102
+ "name": "NVIDIA H800",
103
+ "memoryTotal": "85520809984",
104
+ "cudaCores": 16896,
105
+ "architecture": "Hopper"
106
+ }
107
+ ],
108
+ "slurm": {
109
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
110
+ },
111
+ "cudaVersion": "12.2"
112
+ }
wandb/run-20250405_153219-puqja889/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"train/epoch":3,"_timestamp":1.743839240297635e+09,"_runtime":922.055261897,"_step":528,"train/step":528,"_wandb":{"runtime":922},"train/loss":0.3990817368030548,"train/lr":2e-05}
wandb/run-20250405_153219-puqja889/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2025-04-05T15:32:18.65561336+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpxq7sqgm9/port-3746936.txt","pid":3746936,"debug":false,"disable-analytics":false}
2
+ {"time":"2025-04-05T15:32:18.655658183+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2025-04-05T15:32:18.656636391+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3746936}
4
+ {"time":"2025-04-05T15:32:18.656636606+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":37857,"Zone":""}}
5
+ {"time":"2025-04-05T15:32:18.837886858+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34114"}
6
+ {"time":"2025-04-05T15:32:19.231604357+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"puqja889","id":"127.0.0.1:34114"}
7
+ {"time":"2025-04-05T15:32:19.446153043+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"puqja889","id":"127.0.0.1:34114"}
8
+ {"time":"2025-04-05T15:47:44.197928261+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"puqja889","id":"127.0.0.1:34114"}
9
+ {"time":"2025-04-05T15:47:44.198467502+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"puqja889","id":"127.0.0.1:34114"}
10
+ {"time":"2025-04-05T15:47:44.245957677+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:34114"}
11
+ {"time":"2025-04-05T15:47:44.24597843+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:34114"}
12
+ {"time":"2025-04-05T15:47:44.245991486+08:00","level":"INFO","msg":"server is shutting down"}
13
+ {"time":"2025-04-05T15:47:44.246014289+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:34114"}
14
+ {"time":"2025-04-05T15:47:44.246050481+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:34114"}
15
+ {"time":"2025-04-05T15:47:44.246052923+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:34114"}
16
+ {"time":"2025-04-05T15:47:44.246055739+08:00","level":"INFO","msg":"server is closed"}
wandb/run-20250405_153219-puqja889/logs/debug-internal.log ADDED
@@ -0,0 +1,17 @@
1
+ {"time":"2025-04-05T15:32:19.233194259+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
2
+ {"time":"2025-04-05T15:32:19.233418539+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_153219-puqja889/logs/debug-core.log"}
3
+ {"time":"2025-04-05T15:32:19.446097202+08:00","level":"INFO","msg":"created new stream","id":"puqja889"}
4
+ {"time":"2025-04-05T15:32:19.4461465+08:00","level":"INFO","msg":"stream: started","id":"puqja889"}
5
+ {"time":"2025-04-05T15:32:19.446170192+08:00","level":"INFO","msg":"sender: started","stream_id":"puqja889"}
6
+ {"time":"2025-04-05T15:32:19.44617482+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"puqja889"}
7
+ {"time":"2025-04-05T15:32:19.446195338+08:00","level":"INFO","msg":"handler: started","stream_id":"puqja889"}
8
+ {"time":"2025-04-05T15:32:19.762936247+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-04-05T15:47:41.285928309+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-04-05T15:47:41.287819762+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-04-05T15:47:42.247311433+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading config.yaml","runtime_seconds":0.312718096,"progress":"2.7KB/2.7KB"},{"desc":"uploading output.log","runtime_seconds":0.312711599,"progress":"10.8KB/10.8KB"}],"total_operations":2}}
12
+ {"time":"2025-04-05T15:47:42.941747027+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-04-05T15:47:44.198051243+08:00","level":"INFO","msg":"stream: closing","id":"puqja889"}
14
+ {"time":"2025-04-05T15:47:44.198103033+08:00","level":"INFO","msg":"handler: closed","stream_id":"puqja889"}
15
+ {"time":"2025-04-05T15:47:44.198111233+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"puqja889"}
16
+ {"time":"2025-04-05T15:47:44.198219676+08:00","level":"INFO","msg":"sender: closed","stream_id":"puqja889"}
17
+ {"time":"2025-04-05T15:47:44.198457535+08:00","level":"INFO","msg":"stream: closed","id":"puqja889"}
wandb/run-20250405_153219-puqja889/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
2
+ 2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Configure stats pid to 3746936
3
+ 2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
5
+ 2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Loading settings from environment variables
6
+ 2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_153219-puqja889/logs/debug.log
7
+ 2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_153219-puqja889/logs/debug-internal.log
8
+ 2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_init.py:init():644] calling init triggers
9
+ 2025-04-05 15:32:19,227 INFO MainThread:3746936 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_o1', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
11
+ 2025-04-05 15:32:19,227 INFO MainThread:3746936 [wandb_init.py:init():680] starting backend
12
+ 2025-04-05 15:32:19,227 INFO MainThread:3746936 [wandb_init.py:init():684] sending inform_init request
13
+ 2025-04-05 15:32:19,230 INFO MainThread:3746936 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-05 15:32:19,230 INFO MainThread:3746936 [wandb_init.py:init():697] backend started and connected
15
+ 2025-04-05 15:32:19,232 INFO MainThread:3746936 [wandb_init.py:init():790] updated telemetry
16
+ 2025-04-05 15:32:19,245 INFO MainThread:3746936 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
17
+ 2025-04-05 15:32:19,756 INFO MainThread:3746936 [wandb_init.py:init():874] starting run threads in backend
18
+ 2025-04-05 15:32:20,000 INFO MainThread:3746936 [wandb_run.py:_console_start():2374] atexit reg
19
+ 2025-04-05 15:32:20,000 INFO MainThread:3746936 [wandb_run.py:_redirect():2224] redirect: wrap_raw
20
+ 2025-04-05 15:32:20,000 INFO MainThread:3746936 [wandb_run.py:_redirect():2289] Wrapping output streams.
21
+ 2025-04-05 15:32:20,000 INFO MainThread:3746936 [wandb_run.py:_redirect():2314] Redirects installed.
22
+ 2025-04-05 15:32:20,003 INFO MainThread:3746936 [wandb_init.py:init():916] run started, returning control to user process
23
+ 2025-04-05 15:47:41,245 INFO MainThread:3746936 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/puqja889
24
+ 2025-04-05 15:47:41,245 INFO MainThread:3746936 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
25
+ 2025-04-05 15:47:41,246 INFO MainThread:3746936 [wandb_run.py:_restore():2321] restore
26
+ 2025-04-05 15:47:41,246 INFO MainThread:3746936 [wandb_run.py:_restore():2327] restore done
27
+ 2025-04-05 15:47:44,192 INFO MainThread:3746936 [wandb_run.py:_footer_history_summary_info():3892] rendering history
28
+ 2025-04-05 15:47:44,192 INFO MainThread:3746936 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
29
+ 2025-04-05 15:47:44,197 INFO MainThread:3746936 [wandb_run.py:_footer_sync_info():3853] logging synced files
wandb/run-20250405_153219-puqja889/run-puqja889.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e906c048d4517fa596f83a7df5e6325decfd9173e4b6199a4e4897a6c964488f
3
+ size 1744561
wandb/run-20250405_203209-jla7fqqr/files/config.yaml ADDED
@@ -0,0 +1,109 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.1
4
+ m: []
5
+ python_version: 3.11.0
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 41
12
+ - 49
13
+ - 51
14
+ - 53
15
+ - 55
16
+ - 63
17
+ - 71
18
+ - 83
19
+ - 98
20
+ - 105
21
+ "2":
22
+ - 1
23
+ - 5
24
+ - 11
25
+ - 41
26
+ - 49
27
+ - 51
28
+ - 53
29
+ - 55
30
+ - 63
31
+ - 71
32
+ - 83
33
+ - 98
34
+ - 105
35
+ "3":
36
+ - 2
37
+ - 13
38
+ - 16
39
+ - 23
40
+ - 55
41
+ - 61
42
+ "4": 3.11.0
43
+ "5": 0.19.1
44
+ "6": 4.50.0.dev0
45
+ "8":
46
+ - 5
47
+ "12": 0.19.1
48
+ "13": linux-x86_64
49
+ bnb_cfgs:
50
+ value:
51
+ bnb_4bit_compute_dtype: float16
52
+ bnb_4bit_quant_type: nf4
53
+ bnb_4bit_use_double_quant: true
54
+ load_in_4bit: true
55
+ load_in_8bit: false
56
+ use_bnb: false
57
+ data_cfgs:
58
+ value:
59
+ eval_optional_args: []
60
+ train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
61
+ train_optional_args: []
62
+ train_split: train
63
+ train_template: Safe_thinking
64
+ logger_cfgs:
65
+ value:
66
+ log_project: safe-o1
67
+ log_run_name: sft
68
+ log_type: wandb
69
+ output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
70
+ save_interval: 100000
71
+ lora_cfgs:
72
+ value:
73
+ inference_mode: false
74
+ lora_alpha: 16
75
+ lora_dropout: 0.1
76
+ r: 16
77
+ save_full_model: true
78
+ target_modules:
79
+ - q_proj
80
+ - v_proj
81
+ task_type: TaskType.CAUSAL_LM
82
+ use_lora: false
83
+ model_cfgs:
84
+ value:
85
+ model_max_length: 16384
86
+ model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
87
+ trust_remote_code: true
88
+ train_cfgs:
89
+ value:
90
+ adam_betas:
91
+ - 0.9
92
+ - 0.95
93
+ adam_epsilon: 1e-08
94
+ bf16: true
95
+ ds_cfgs: ds_z3_config.json
96
+ epochs: 3
97
+ eval_interval: 10
98
+ eval_strategy: steps
99
+ fp16: false
100
+ gradient_accumulation_steps: 2
101
+ gradient_checkpointing: true
102
+ learning_rate: 2e-05
103
+ lr_scheduler_type: constant
104
+ lr_warmup_ratio: 0.03
105
+ max_grad_norm: 1
106
+ per_device_eval_batch_size: 4
107
+ per_device_train_batch_size: 4
108
+ seed: 42
109
+ weight_decay: 0
wandb/run-20250405_203209-jla7fqqr/files/output.log ADDED
@@ -0,0 +1,63 @@
1
+ ***** Running training *****
2
+ Training 1/3 epoch: 0%| | 0/528 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 2/3 epoch (loss 0.6368): 60%|██████████████████████████████████████████████████████████████████████████████▌ | 319/528 [07:48<04:48, 1.38s/it]
4
+ [2025-04-05 20:32:45,658] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
5
+ [2025-04-05 20:32:45,659] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=23.605714664413604, CurrSamplesPerSec=23.666011535180804, MemAllocated=15.18GB, MaxMemAllocated=33.78GB
6
+ [2025-04-05 20:33:13,955] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
7
+ [2025-04-05 20:33:13,956] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=23.754469865096492, CurrSamplesPerSec=23.631327040548573, MemAllocated=15.18GB, MaxMemAllocated=33.78GB
8
+ [2025-04-05 20:33:46,142] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
9
+ [2025-04-05 20:33:46,143] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=22.604152126481146, CurrSamplesPerSec=25.109336167565615, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
10
+ [2025-04-05 20:34:13,747] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
11
+ [2025-04-05 20:34:13,748] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=23.017619879751617, CurrSamplesPerSec=25.723433617374972, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
12
+ [2025-04-05 20:34:42,610] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
13
+ [2025-04-05 20:34:42,610] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=23.049331926082044, CurrSamplesPerSec=27.117450303376405, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
14
+ [2025-04-05 20:35:11,048] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
15
+ [2025-04-05 20:35:11,049] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=23.14298012524782, CurrSamplesPerSec=21.882912187585045, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
16
+ [2025-04-05 20:35:39,236] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
17
+ [2025-04-05 20:35:39,237] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=23.24975082248447, CurrSamplesPerSec=20.374822325722487, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
18
+ [2025-04-05 20:36:08,690] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
19
+ [2025-04-05 20:36:08,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=23.176213278752765, CurrSamplesPerSec=21.797783380622224, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
20
+ [2025-04-05 20:36:38,107] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
21
+ [2025-04-05 20:36:38,107] [INFO] [timer.py:264:stop] epoch=1/micro_step=4/global_step=90, RunningAvgSamplesPerSec=23.126311848009383, CurrSamplesPerSec=16.471299501610225, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
22
+ [2025-04-05 20:37:06,881] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
23
+ [2025-04-05 20:37:06,881] [INFO] [timer.py:264:stop] epoch=1/micro_step=24/global_step=100, RunningAvgSamplesPerSec=23.14780207678257, CurrSamplesPerSec=24.49579467500081, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
24
+ [2025-04-05 20:37:36,188] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
25
+ [2025-04-05 20:37:36,188] [INFO] [timer.py:264:stop] epoch=1/micro_step=44/global_step=110, RunningAvgSamplesPerSec=23.11597305149581, CurrSamplesPerSec=16.43245669677465, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
26
+ [2025-04-05 20:38:06,693] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
27
+ [2025-04-05 20:38:06,694] [INFO] [timer.py:264:stop] epoch=1/micro_step=64/global_step=120, RunningAvgSamplesPerSec=22.998939428830514, CurrSamplesPerSec=22.171908523048636, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
28
+ [2025-04-05 20:38:34,755] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
29
+ [2025-04-05 20:38:34,756] [INFO] [timer.py:264:stop] epoch=1/micro_step=84/global_step=130, RunningAvgSamplesPerSec=23.06196743535597, CurrSamplesPerSec=22.22566165861562, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
30
+ [2025-04-05 20:39:03,683] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
31
+ [2025-04-05 20:39:03,683] [INFO] [timer.py:264:stop] epoch=1/micro_step=104/global_step=140, RunningAvgSamplesPerSec=23.06195015699282, CurrSamplesPerSec=20.920198186863757, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
32
+ [2025-04-05 20:39:31,775] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
33
+ [2025-04-05 20:39:31,775] [INFO] [timer.py:264:stop] epoch=1/micro_step=124/global_step=150, RunningAvgSamplesPerSec=23.115183552624345, CurrSamplesPerSec=25.03072256562265, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
34
+ [2025-04-05 20:39:59,910] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
35
+ [2025-04-05 20:39:59,910] [INFO] [timer.py:264:stop] epoch=1/micro_step=144/global_step=160, RunningAvgSamplesPerSec=23.153977209977544, CurrSamplesPerSec=29.017234800239933, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
36
+ [2025-04-05 20:40:28,955] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
37
+ [2025-04-05 20:40:28,955] [INFO] [timer.py:264:stop] epoch=1/micro_step=164/global_step=170, RunningAvgSamplesPerSec=23.14329866214052, CurrSamplesPerSec=28.340181534100314, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
38
+ [2025-04-05 20:40:58,435] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
39
+ [2025-04-05 20:40:58,435] [INFO] [timer.py:264:stop] epoch=2/micro_step=8/global_step=180, RunningAvgSamplesPerSec=23.115090713886406, CurrSamplesPerSec=23.97360709248484, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
40
+ [2025-04-05 20:41:29,026] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
41
+ [2025-04-05 20:41:29,026] [INFO] [timer.py:264:stop] epoch=2/micro_step=28/global_step=190, RunningAvgSamplesPerSec=23.04166812821496, CurrSamplesPerSec=16.449150851444518, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
42
+ [2025-04-05 20:41:56,470] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
43
+ [2025-04-05 20:41:56,470] [INFO] [timer.py:264:stop] epoch=2/micro_step=48/global_step=200, RunningAvgSamplesPerSec=23.107848960875014, CurrSamplesPerSec=28.456930650131092, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
44
+ [2025-04-05 20:42:27,319] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
45
+ [2025-04-05 20:42:27,320] [INFO] [timer.py:264:stop] epoch=2/micro_step=68/global_step=210, RunningAvgSamplesPerSec=23.03557307846801, CurrSamplesPerSec=26.542707384497938, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
46
+ [2025-04-05 20:42:56,548] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
47
+ [2025-04-05 20:42:56,548] [INFO] [timer.py:264:stop] epoch=2/micro_step=88/global_step=220, RunningAvgSamplesPerSec=23.029435842150235, CurrSamplesPerSec=22.045011851042542, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
48
+ [2025-04-05 20:43:24,989] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
49
+ [2025-04-05 20:43:24,990] [INFO] [timer.py:264:stop] epoch=2/micro_step=108/global_step=230, RunningAvgSamplesPerSec=23.049331659158383, CurrSamplesPerSec=23.932027339346398, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
50
+ [2025-04-05 20:43:52,817] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
51
+ [2025-04-05 20:43:52,818] [INFO] [timer.py:264:stop] epoch=2/micro_step=128/global_step=240, RunningAvgSamplesPerSec=23.087664989846726, CurrSamplesPerSec=23.486799951214525, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
52
+ [2025-04-05 20:44:21,469] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
53
+ [2025-04-05 20:44:21,470] [INFO] [timer.py:264:stop] epoch=2/micro_step=148/global_step=250, RunningAvgSamplesPerSec=23.09636651082851, CurrSamplesPerSec=19.006478505688033, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
54
+ [2025-04-05 20:44:50,257] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
55
+ [2025-04-05 20:44:50,257] [INFO] [timer.py:264:stop] epoch=2/micro_step=168/global_step=260, RunningAvgSamplesPerSec=23.10289314788288, CurrSamplesPerSec=26.13127449094928, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
56
+ Saving model to "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end" ...
57
+ Saving 16-bit model...
58
+ [2025-04-05 20:45:09,932] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step264 is about to be saved!
59
+ [2025-04-05 20:45:09,933] [INFO] [engine.py:3680:save_16bit_model] Saving model weights to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin, tag: global_step264
60
+ [2025-04-05 20:45:09,933] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin...
61
+ [2025-04-05 20:45:27,032] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin.
62
+ [2025-04-05 20:45:27,032] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step264 is ready now!
63
+ Model saved!
wandb/run-20250405_203209-jla7fqqr/files/requirements.txt ADDED
@@ -0,0 +1,253 @@
1
+ maskrcnn_benchmark==0.0.0
2
+ deepspeed==0.16.1
3
+ uritemplate==4.1.1
4
+ pyairports==2.1.1
5
+ partial-json-parser==0.2.1.1.post4
6
+ tensorboard-data-server==0.7.2
7
+ pydantic==2.10.3
8
+ Werkzeug==3.1.3
9
+ attrs==24.3.0
10
+ Jinja2==3.1.4
11
+ email_validator==2.2.0
12
+ mdit-py-plugins==0.4.2
13
+ google-api-python-client==2.160.0
14
+ pandas==2.2.3
15
+ safehttpx==0.1.6
16
+ setproctitle==1.3.4
17
+ dill==0.3.8
18
+ torchaudio==2.5.1
19
+ frechet-audio-distance==0.1.2
20
+ blessed==1.20.0
21
+ llvmlite==0.43.0
22
+ litellm==1.60.8
23
+ nvidia-nvtx-cu12==12.4.127
24
+ nvidia-cusolver-cu12==11.6.1.9
25
+ einops==0.8.0
26
+ datasets==3.2.0
27
+ pycountry==24.6.1
28
+ airportsdata==20250224
29
+ idna==3.10
30
+ urllib3==2.2.3
31
+ mpmath==1.3.0
32
+ wandb==0.19.1
33
+ certifi==2024.12.14
34
+ markdown-it-py==3.0.0
35
+ align-anything==0.0.1.dev0
36
+ aiohttp==3.11.10
37
+ fsspec==2024.9.0
38
+ aiohappyeyeballs==2.4.4
39
+ httplib2==0.22.0
40
+ hjson==3.1.0
41
+ yarl==1.18.3
42
+ decorator==5.1.1
43
+ distlib==0.3.9
44
+ absl-py==2.1.0
45
+ huggingface-hub==0.27.0
46
+ memray==1.15.0
47
+ Pygments==2.18.0
48
+ soupsieve==2.6
49
+ shellingham==1.5.4
50
+ tokenizers==0.21.0
51
+ uvloop==0.21.0
52
+ numpy==1.26.4
53
+ linkify-it-py==2.0.3
54
+ sympy==1.13.1
55
+ python-dotenv==1.0.1
56
+ nvidia-cuda-runtime-cu12==12.4.127
57
+ tensorboard==2.18.0
58
+ fastrlock==0.8.3
59
+ rsa==4.9
60
+ lm-format-enforcer==0.10.9
61
+ openai==1.61.1
62
+ gpustat==1.1.1
63
+ librosa==0.10.2.post1
64
+ grpcio-status==1.70.0
65
+ nvidia-cudnn-cu12==9.1.0.70
66
+ zipp==3.21.0
67
+ nvidia-nvjitlink-cu12==12.4.127
68
+ cupy-cuda12x==13.3.0
69
+ Markdown==3.7
70
+ nvidia-cuda-cupti-cu12==12.4.127
71
+ nvidia-curand-cu12==10.3.5.147
72
+ rpds-py==0.22.3
73
+ outlines==0.1.11
74
+ docker-pycreds==0.4.0
75
+ distro==1.9.0
76
+ httpcore==1.0.7
77
+ gradio==5.9.0
78
+ google-auth-httplib2==0.2.0
79
+ iniconfig==2.0.0
80
+ gitdb==4.0.11
81
+ jsonschema==4.23.0
82
+ click==8.1.7
83
+ ninja==1.11.1.3
84
+ setuptools==75.6.0
85
+ audioread==3.0.1
86
+ frozenlist==1.5.0
87
+ transformers-stream-generator==0.0.5
88
+ nvidia-cublas-cu12==12.4.5.8
89
+ pycparser==2.22
90
+ GitPython==3.1.43
91
+ tqdm==4.67.1
92
+ importlib_metadata==8.5.0
93
+ patsy==1.0.1
94
+ networkx==3.4.2
95
+ semantic-version==2.10.0
96
+ alpaca_eval==0.6.6
97
+ google-cloud-core==2.4.1
98
+ prometheus_client==0.21.1
99
+ jiter==0.8.2
100
+ scipy==1.14.1
101
+ starlette==0.41.3
102
+ jq==1.8.0
103
+ opencensus-context==0.1.3
104
+ cachetools==5.5.1
105
+ cffi==1.17.1
106
+ opencv-python-headless==4.10.0.84
107
+ joblib==1.4.2
108
+ yt-dlp==2025.1.26
109
+ python-dateutil==2.9.0.post0
110
+ httpx==0.28.1
111
+ msgpack==1.1.0
112
+ pydub==0.25.1
113
+ tomlkit==0.13.2
114
+ nvitop==1.4.2
115
+ nvidia-cusparse-cu12==12.3.1.170
116
+ msgspec==0.18.6
117
+ aiosignal==1.3.2
118
+ wheel==0.45.1
119
+ filelock==3.16.1
120
+ pillow==10.4.0
121
+ typer==0.15.1
122
+ websockets==14.1
123
+ resampy==0.4.3
124
+ aiofiles==23.2.1
125
+ aiohttp-cors==0.7.0
126
+ platformdirs==4.3.6
127
+ gguf==0.10.0
128
+ diskcache==5.6.3
129
+ cloudpickle==3.1.0
130
+ multidict==6.1.0
131
+ py-cpuinfo==9.0.0
132
+ scikit-learn==1.6.0
133
+ smart-open==7.1.0
134
+ tiktoken==0.7.0
135
+ grpcio==1.70.0
136
+ charset-normalizer==3.4.0
137
+ nest-asyncio==1.6.0
138
+ lark==1.2.2
139
+ beautifulsoup4==4.13.3
140
+ pip==24.3.1
141
+ six==1.17.0
142
+ prometheus-fastapi-instrumentator==7.0.0
143
+ ruff==0.8.3
144
+ rich-toolkit==0.13.2
145
+ lazy_loader==0.4
146
+ grpc-google-iam-v1==0.14.0
147
+ psutil==6.1.0
148
+ mdurl==0.1.2
149
+ nvidia-nccl-cu12==2.21.5
150
+ triton==3.1.0
151
+ torchvision==0.20.1
152
+ fastapi==0.115.6
153
+ referencing==0.35.1
154
+ xxhash==3.5.0
155
+ pyzmq==26.2.0
156
+ torchlibrosa==0.1.0
157
+ googleapis-common-protos==1.66.0
158
+ pyasn1==0.6.1
159
+ soundfile==0.12.1
160
+ pyparsing==3.2.1
161
+ xgrammar==0.1.11
162
+ gradio_client==1.5.2
163
+ watchfiles==1.0.3
164
+ pluggy==1.5.0
165
+ py-spy==0.4.0
166
+ pybind11==2.13.6
167
+ diffusers==0.31.0
168
+ sentencepiece==0.2.0
169
+ flash_attn==2.7.4.post1
170
+ annotated-types==0.7.0
171
+ interegular==0.3.3
172
+ requests==2.32.3
173
+ opencensus==0.11.4
174
+ colorful==0.5.6
175
+ google-api-core==2.24.1
176
+ pytest==8.3.4
177
+ dnspython==2.7.0
178
+ pydantic_core==2.27.1
179
+ pytz==2024.2
180
+ pyasn1_modules==0.4.1
181
+ propcache==0.2.1
182
+ accelerate==1.2.1
183
+ fire==0.7.0
184
+ textual==1.0.0
185
+ sniffio==1.3.1
186
+ pyarrow==18.1.0
187
+ protobuf==5.29.1
188
+ wcwidth==0.2.13
189
+ packaging==24.2
190
+ uvicorn==0.34.0
191
+ sentry-sdk==2.19.2
192
+ google-auth==2.38.0
193
+ typing_extensions==4.12.2
194
+ peft==0.14.0
195
+ depyf==0.18.0
196
+ multiprocess==0.70.16
197
+ google-cloud-translate==3.19.0
198
+ nvidia-cuda-nvrtc-cu12==12.4.127
199
+ jsonschema-specifications==2024.10.1
200
+ vllm==0.7.3
201
+ nvidia-cufft-cu12==11.2.1.3
202
+ timm==1.0.12
203
+ rich==13.9.4
204
+ ffmpy==0.4.0
205
+ virtualenv==20.29.1
206
+ tzdata==2024.2
207
+ smmap==5.0.1
208
+ uc-micro-py==1.0.3
209
+ proto-plus==1.26.0
210
+ soxr==0.5.0.post1
211
+ h11==0.14.0
212
+ outlines_core==0.1.26
213
+ compressed-tensors==0.9.1
214
+ blake3==1.0.4
215
+ xformers==0.0.28.post3
216
+ orjson==3.10.12
217
+ ray==2.40.0
218
+ PyYAML==6.0.2
219
+ nvidia-ml-py==12.560.30
220
+ python-multipart==0.0.19
221
+ PySocks==1.7.1
222
+ regex==2024.11.6
223
+ pooch==1.8.2
224
+ termcolor==2.5.0
225
+ MarkupSafe==2.1.5
226
+ torch==2.5.1
227
+ fastapi-cli==0.0.7
228
+ gdown==5.2.0
229
+ numba==0.60.0
230
+ httptools==0.6.4
231
+ transformers==4.50.0.dev0
232
+ mistral_common==1.5.1
233
+ astor==0.8.1
234
+ anyio==4.7.0
235
+ safetensors==0.4.5
236
+ threadpoolctl==3.5.0
237
+ wrapt==1.17.2
238
+ wheel==0.43.0
239
+ jaraco.functools==4.0.1
240
+ inflect==7.3.1
241
+ jaraco.text==3.12.1
242
+ typeguard==4.3.0
243
+ jaraco.collections==5.1.0
244
+ importlib_metadata==8.0.0
245
+ backports.tarfile==1.2.0
246
+ tomli==2.0.1
247
+ autocommand==2.2.2
248
+ platformdirs==4.2.2
249
+ more-itertools==10.3.0
250
+ zipp==3.19.2
251
+ packaging==24.2
252
+ typing_extensions==4.12.2
253
+ jaraco.context==5.3.0
wandb/run-20250405_203209-jla7fqqr/files/wandb-metadata.json ADDED
@@ -0,0 +1,112 @@
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.0",
4
+ "startedAt": "2025-04-05T12:32:09.142317Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset",
11
+ "--train_split",
12
+ "train",
13
+ "--train_template",
14
+ "Safe_thinking",
15
+ "--output_dir",
16
+ "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
17
+ "--log_project",
18
+ "safe-o1",
19
+ "--per_device_train_batch_size",
20
+ "4",
21
+ "--per_device_eval_batch_size",
22
+ "4",
23
+ "--gradient_accumulation_steps",
24
+ "2",
25
+ "--learning_rate",
26
+ "2e-5",
27
+ "--epochs",
28
+ "3",
29
+ "--model_max_length",
30
+ "16384"
31
+ ],
32
+ "program": "-m align_anything.trainers.text_to_text.sft",
33
+ "git": {
34
+ "remote": "[email protected]:PKU-Alignment/align-anything.git",
35
+ "commit": "7ee46f54200d18e3c6c37568ba688d3be5ae7619"
36
+ },
37
+ "email": "[email protected]",
38
+ "root": "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
39
+ "host": "dgx-091",
40
+ "executable": "/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/python3.11",
41
+ "cpu_count": 112,
42
+ "cpu_count_logical": 224,
43
+ "gpu": "NVIDIA H800",
44
+ "gpu_count": 8,
45
+ "disk": {
46
+ "/": {
47
+ "total": "1888556142592",
48
+ "used": "1056715956224"
49
+ }
50
+ },
51
+ "memory": {
52
+ "total": "2164195573760"
53
+ },
54
+ "cpu": {
55
+ "count": 112,
56
+ "countLogical": 224
57
+ },
58
+ "gpu_nvidia": [
59
+ {
60
+ "name": "NVIDIA H800",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper"
64
+ },
65
+ {
66
+ "name": "NVIDIA H800",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper"
70
+ },
71
+ {
72
+ "name": "NVIDIA H800",
73
+ "memoryTotal": "85520809984",
74
+ "cudaCores": 16896,
75
+ "architecture": "Hopper"
76
+ },
77
+ {
78
+ "name": "NVIDIA H800",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper"
88
+ },
89
+ {
90
+ "name": "NVIDIA H800",
91
+ "memoryTotal": "85520809984",
92
+ "cudaCores": 16896,
93
+ "architecture": "Hopper"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper"
100
+ },
101
+ {
102
+ "name": "NVIDIA H800",
103
+ "memoryTotal": "85520809984",
104
+ "cudaCores": 16896,
105
+ "architecture": "Hopper"
106
+ }
107
+ ],
108
+ "slurm": {
109
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
110
+ },
111
+ "cudaVersion": "12.2"
112
+ }
wandb/run-20250405_203209-jla7fqqr/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"train/epoch":3,"_wandb":{"runtime":797},"_timestamp":1.7438571013796113e+09,"_runtime":797.934275661,"_step":528,"train/step":528,"train/loss":0.45351502299308777,"train/lr":2e-05}
wandb/run-20250405_203209-jla7fqqr/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2025-04-05T20:32:08.551820206+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp_xy7jfsn/port-2888806.txt","pid":2888806,"debug":false,"disable-analytics":false}
2
+ {"time":"2025-04-05T20:32:08.551894655+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2025-04-05T20:32:08.552966504+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2888806}
4
+ {"time":"2025-04-05T20:32:08.552967717+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44473,"Zone":""}}
5
+ {"time":"2025-04-05T20:32:08.707940509+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34916"}
6
+ {"time":"2025-04-05T20:32:09.143458736+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"jla7fqqr","id":"127.0.0.1:34916"}
7
+ {"time":"2025-04-05T20:32:09.359706562+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"jla7fqqr","id":"127.0.0.1:34916"}
8
+ {"time":"2025-04-05T20:45:29.439953745+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"jla7fqqr","id":"127.0.0.1:34916"}
9
+ {"time":"2025-04-05T20:45:29.440386793+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"jla7fqqr","id":"127.0.0.1:34916"}
10
+ {"time":"2025-04-05T20:45:29.484536234+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:34916"}
11
+ {"time":"2025-04-05T20:45:29.484552529+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:34916"}
12
+ {"time":"2025-04-05T20:45:29.484586228+08:00","level":"INFO","msg":"server is shutting down"}
13
+ {"time":"2025-04-05T20:45:29.484607146+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:34916"}
14
+ {"time":"2025-04-05T20:45:29.48465299+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:34916"}
15
+ {"time":"2025-04-05T20:45:29.48466913+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:34916"}
16
+ {"time":"2025-04-05T20:45:29.484674792+08:00","level":"INFO","msg":"server is closed"}
wandb/run-20250405_203209-jla7fqqr/logs/debug-internal.log ADDED
@@ -0,0 +1,17 @@
1
+ {"time":"2025-04-05T20:32:09.145104832+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
2
+ {"time":"2025-04-05T20:32:09.145234633+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug-core.log"}
3
+ {"time":"2025-04-05T20:32:09.359661673+08:00","level":"INFO","msg":"created new stream","id":"jla7fqqr"}
4
+ {"time":"2025-04-05T20:32:09.359700555+08:00","level":"INFO","msg":"stream: started","id":"jla7fqqr"}
5
+ {"time":"2025-04-05T20:32:09.35975566+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"jla7fqqr"}
6
+ {"time":"2025-04-05T20:32:09.359831663+08:00","level":"INFO","msg":"handler: started","stream_id":"jla7fqqr"}
7
+ {"time":"2025-04-05T20:32:09.35975831+08:00","level":"INFO","msg":"sender: started","stream_id":"jla7fqqr"}
8
+ {"time":"2025-04-05T20:32:09.688023993+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-04-05T20:45:27.076637312+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-04-05T20:45:27.077489476+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-04-05T20:45:28.038487853+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading config.yaml","runtime_seconds":0.283111243,"progress":"2.7KB/2.7KB"},{"desc":"uploading output.log","runtime_seconds":0.283100079,"progress":"10.8KB/10.8KB"}],"total_operations":2}}
12
+ {"time":"2025-04-05T20:45:28.204441985+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-04-05T20:45:29.440112056+08:00","level":"INFO","msg":"stream: closing","id":"jla7fqqr"}
14
+ {"time":"2025-04-05T20:45:29.440138846+08:00","level":"INFO","msg":"handler: closed","stream_id":"jla7fqqr"}
15
+ {"time":"2025-04-05T20:45:29.440146259+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"jla7fqqr"}
16
+ {"time":"2025-04-05T20:45:29.440285075+08:00","level":"INFO","msg":"sender: closed","stream_id":"jla7fqqr"}
17
+ {"time":"2025-04-05T20:45:29.440378039+08:00","level":"INFO","msg":"stream: closed","id":"jla7fqqr"}
wandb/run-20250405_203209-jla7fqqr/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2025-04-05 20:32:09,135 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
2
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Configure stats pid to 2888806
3
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
5
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from environment variables
6
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug.log
7
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug-internal.log
8
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():644] calling init triggers
9
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_thinking', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
11
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():680] starting backend
12
+ 2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():684] sending inform_init request
13
+ 2025-04-05 20:32:09,141 INFO MainThread:2888806 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-05 20:32:09,142 INFO MainThread:2888806 [wandb_init.py:init():697] backend started and connected
15
+ 2025-04-05 20:32:09,143 INFO MainThread:2888806 [wandb_init.py:init():790] updated telemetry
16
+ 2025-04-05 20:32:09,162 INFO MainThread:2888806 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
17
+ 2025-04-05 20:32:09,682 INFO MainThread:2888806 [wandb_init.py:init():874] starting run threads in backend
18
+ 2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_console_start():2374] atexit reg
19
+ 2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2224] redirect: wrap_raw
20
+ 2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2289] Wrapping output streams.
21
+ 2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2314] Redirects installed.
22
+ 2025-04-05 20:32:10,112 INFO MainThread:2888806 [wandb_init.py:init():916] run started, returning control to user process
23
+ 2025-04-05 20:45:27,036 INFO MainThread:2888806 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/jla7fqqr
24
+ 2025-04-05 20:45:27,036 INFO MainThread:2888806 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
25
+ 2025-04-05 20:45:27,037 INFO MainThread:2888806 [wandb_run.py:_restore():2321] restore
26
+ 2025-04-05 20:45:27,037 INFO MainThread:2888806 [wandb_run.py:_restore():2327] restore done
27
+ 2025-04-05 20:45:29,432 INFO MainThread:2888806 [wandb_run.py:_footer_history_summary_info():3892] rendering history
28
+ 2025-04-05 20:45:29,433 INFO MainThread:2888806 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
29
+ 2025-04-05 20:45:29,439 INFO MainThread:2888806 [wandb_run.py:_footer_sync_info():3853] logging synced files
wandb/run-20250405_203209-jla7fqqr/run-jla7fqqr.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0312a8142b984ed4f9135492bf56d58d8037b91a040f35ff38db29091f30341f
3
+ size 1580734