Upload folder using huggingface_hub
Browse files- .gitattributes +5 -0
- arguments.yaml +66 -0
- environ.txt +235 -0
- script.sh +46 -0
- slice_end/added_tokens.json +24 -0
- slice_end/config.json +29 -0
- slice_end/merges.txt +0 -0
- slice_end/pytorch_model.bin +3 -0
- slice_end/special_tokens_map.json +31 -0
- slice_end/tokenizer.json +3 -0
- slice_end/tokenizer_config.json +209 -0
- slice_end/vocab.json +0 -0
- wandb/debug-internal.log +17 -0
- wandb/debug.log +29 -0
- wandb/run-20250404_234514-h2gynfll/files/config.yaml +109 -0
- wandb/run-20250404_234514-h2gynfll/files/output.log +221 -0
- wandb/run-20250404_234514-h2gynfll/files/requirements.txt +253 -0
- wandb/run-20250404_234514-h2gynfll/files/wandb-metadata.json +112 -0
- wandb/run-20250404_234514-h2gynfll/files/wandb-summary.json +1 -0
- wandb/run-20250404_234514-h2gynfll/logs/debug-core.log +16 -0
- wandb/run-20250404_234514-h2gynfll/logs/debug-internal.log +17 -0
- wandb/run-20250404_234514-h2gynfll/logs/debug.log +29 -0
- wandb/run-20250404_234514-h2gynfll/run-h2gynfll.wandb +3 -0
- wandb/run-20250405_124142-wdmxf5un/files/config.yaml +109 -0
- wandb/run-20250405_124142-wdmxf5un/files/output.log +115 -0
- wandb/run-20250405_124142-wdmxf5un/files/requirements.txt +253 -0
- wandb/run-20250405_124142-wdmxf5un/files/wandb-metadata.json +112 -0
- wandb/run-20250405_124142-wdmxf5un/files/wandb-summary.json +1 -0
- wandb/run-20250405_124142-wdmxf5un/logs/debug-core.log +16 -0
- wandb/run-20250405_124142-wdmxf5un/logs/debug-internal.log +17 -0
- wandb/run-20250405_124142-wdmxf5un/logs/debug.log +29 -0
- wandb/run-20250405_124142-wdmxf5un/run-wdmxf5un.wandb +3 -0
- wandb/run-20250405_153219-puqja889/files/config.yaml +109 -0
- wandb/run-20250405_153219-puqja889/files/output.log +63 -0
- wandb/run-20250405_153219-puqja889/files/requirements.txt +253 -0
- wandb/run-20250405_153219-puqja889/files/wandb-metadata.json +112 -0
- wandb/run-20250405_153219-puqja889/files/wandb-summary.json +1 -0
- wandb/run-20250405_153219-puqja889/logs/debug-core.log +16 -0
- wandb/run-20250405_153219-puqja889/logs/debug-internal.log +17 -0
- wandb/run-20250405_153219-puqja889/logs/debug.log +29 -0
- wandb/run-20250405_153219-puqja889/run-puqja889.wandb +3 -0
- wandb/run-20250405_203209-jla7fqqr/files/config.yaml +109 -0
- wandb/run-20250405_203209-jla7fqqr/files/output.log +63 -0
- wandb/run-20250405_203209-jla7fqqr/files/requirements.txt +253 -0
- wandb/run-20250405_203209-jla7fqqr/files/wandb-metadata.json +112 -0
- wandb/run-20250405_203209-jla7fqqr/files/wandb-summary.json +1 -0
- wandb/run-20250405_203209-jla7fqqr/logs/debug-core.log +16 -0
- wandb/run-20250405_203209-jla7fqqr/logs/debug-internal.log +17 -0
- wandb/run-20250405_203209-jla7fqqr/logs/debug.log +29 -0
- wandb/run-20250405_203209-jla7fqqr/run-jla7fqqr.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
slice_end/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
wandb/run-20250404_234514-h2gynfll/run-h2gynfll.wandb filter=lfs diff=lfs merge=lfs -text
|
38 |
+
wandb/run-20250405_124142-wdmxf5un/run-wdmxf5un.wandb filter=lfs diff=lfs merge=lfs -text
|
39 |
+
wandb/run-20250405_153219-puqja889/run-puqja889.wandb filter=lfs diff=lfs merge=lfs -text
|
40 |
+
wandb/run-20250405_203209-jla7fqqr/run-jla7fqqr.wandb filter=lfs diff=lfs merge=lfs -text
|
arguments.yaml
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bnb_cfgs:
|
2 |
+
bnb_4bit_compute_dtype: float16
|
3 |
+
bnb_4bit_quant_type: nf4
|
4 |
+
bnb_4bit_use_double_quant: true
|
5 |
+
load_in_4bit: true
|
6 |
+
load_in_8bit: false
|
7 |
+
use_bnb: false
|
8 |
+
data_cfgs:
|
9 |
+
eval_data_files: {}
|
10 |
+
eval_datasets: {}
|
11 |
+
eval_optional_args: []
|
12 |
+
eval_size: {}
|
13 |
+
eval_split: {}
|
14 |
+
eval_subset: {}
|
15 |
+
eval_template: {}
|
16 |
+
train_data_files: {}
|
17 |
+
train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
|
18 |
+
train_name: {}
|
19 |
+
train_optional_args: []
|
20 |
+
train_size: {}
|
21 |
+
train_split: train
|
22 |
+
train_template: Safe_thinking
|
23 |
+
logger_cfgs:
|
24 |
+
cache_dir: {}
|
25 |
+
log_project: safe-o1
|
26 |
+
log_run_name: sft
|
27 |
+
log_type: wandb
|
28 |
+
output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
|
29 |
+
save_interval: 100000
|
30 |
+
lora_cfgs:
|
31 |
+
inference_mode: false
|
32 |
+
lora_alpha: 16
|
33 |
+
lora_dropout: 0.1
|
34 |
+
r: 16
|
35 |
+
save_full_model: true
|
36 |
+
target_modules:
|
37 |
+
- q_proj
|
38 |
+
- v_proj
|
39 |
+
task_type: TaskType.CAUSAL_LM
|
40 |
+
use_lora: false
|
41 |
+
model_cfgs:
|
42 |
+
model_max_length: 16384
|
43 |
+
model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
|
44 |
+
trust_remote_code: true
|
45 |
+
special_tokens: {}
|
46 |
+
train_cfgs:
|
47 |
+
adam_betas:
|
48 |
+
- 0.9
|
49 |
+
- 0.95
|
50 |
+
adam_epsilon: 1.0e-08
|
51 |
+
bf16: true
|
52 |
+
ds_cfgs: ds_z3_config.json
|
53 |
+
epochs: 3
|
54 |
+
eval_interval: 10
|
55 |
+
eval_strategy: steps
|
56 |
+
fp16: false
|
57 |
+
gradient_accumulation_steps: 2
|
58 |
+
gradient_checkpointing: true
|
59 |
+
learning_rate: 2.0e-05
|
60 |
+
lr_scheduler_type: constant
|
61 |
+
lr_warmup_ratio: 0.03
|
62 |
+
max_grad_norm: 1.0
|
63 |
+
per_device_eval_batch_size: 4
|
64 |
+
per_device_train_batch_size: 4
|
65 |
+
seed: 42
|
66 |
+
weight_decay: 0.0
|
environ.txt
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ADDR2LINE=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-addr2line
|
2 |
+
AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ar
|
3 |
+
AS=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-as
|
4 |
+
BASH_FUNC__module_raw%%=() { unset _mlshdbg;
|
5 |
+
if [ "${MODULES_SILENT_SHELL_DEBUG:-0}" = '1' ]; then
|
6 |
+
case "$-" in
|
7 |
+
*v*x*)
|
8 |
+
set +vx;
|
9 |
+
_mlshdbg='vx'
|
10 |
+
;;
|
11 |
+
*v*)
|
12 |
+
set +v;
|
13 |
+
_mlshdbg='v'
|
14 |
+
;;
|
15 |
+
*x*)
|
16 |
+
set +x;
|
17 |
+
_mlshdbg='x'
|
18 |
+
;;
|
19 |
+
*)
|
20 |
+
_mlshdbg=''
|
21 |
+
;;
|
22 |
+
esac;
|
23 |
+
fi;
|
24 |
+
unset _mlre _mlIFS;
|
25 |
+
if [ -n "${IFS+x}" ]; then
|
26 |
+
_mlIFS=$IFS;
|
27 |
+
fi;
|
28 |
+
IFS=' ';
|
29 |
+
for _mlv in ${MODULES_RUN_QUARANTINE:-};
|
30 |
+
do
|
31 |
+
if [ "${_mlv}" = "${_mlv##*[!A-Za-z0-9_]}" -a "${_mlv}" = "${_mlv#[0-9]}" ]; then
|
32 |
+
if [ -n "`eval 'echo ${'$_mlv'+x}'`" ]; then
|
33 |
+
_mlre="${_mlre:-}${_mlv}_modquar='`eval 'echo ${'$_mlv'}'`' ";
|
34 |
+
fi;
|
35 |
+
_mlrv="MODULES_RUNENV_${_mlv}";
|
36 |
+
_mlre="${_mlre:-}${_mlv}='`eval 'echo ${'$_mlrv':-}'`' ";
|
37 |
+
fi;
|
38 |
+
done;
|
39 |
+
if [ -n "${_mlre:-}" ]; then
|
40 |
+
eval `eval ${_mlre} /usr/bin/tclsh /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl bash '"$@"'`;
|
41 |
+
else
|
42 |
+
eval `/usr/bin/tclsh /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl bash "$@"`;
|
43 |
+
fi;
|
44 |
+
_mlstatus=$?;
|
45 |
+
if [ -n "${_mlIFS+x}" ]; then
|
46 |
+
IFS=$_mlIFS;
|
47 |
+
else
|
48 |
+
unset IFS;
|
49 |
+
fi;
|
50 |
+
unset _mlre _mlv _mlrv _mlIFS;
|
51 |
+
if [ -n "${_mlshdbg:-}" ]; then
|
52 |
+
set -$_mlshdbg;
|
53 |
+
fi;
|
54 |
+
unset _mlshdbg;
|
55 |
+
return $_mlstatus
|
56 |
+
}
|
57 |
+
BASH_FUNC_ml%%=() { module ml "$@"
|
58 |
+
}
|
59 |
+
BASH_FUNC_module%%=() { _module_raw "$@" 2>&1
|
60 |
+
}
|
61 |
+
BASH_FUNC_switchml%%=() { typeset swfound=1;
|
62 |
+
if [ "${MODULES_USE_COMPAT_VERSION:-0}" = '1' ]; then
|
63 |
+
typeset swname='main';
|
64 |
+
if [ -e /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl ]; then
|
65 |
+
typeset swfound=0;
|
66 |
+
unset MODULES_USE_COMPAT_VERSION;
|
67 |
+
fi;
|
68 |
+
else
|
69 |
+
typeset swname='compatibility';
|
70 |
+
if [ -e /cm/local/apps/environment-modules/4.5.3/libexec/modulecmd-compat ]; then
|
71 |
+
typeset swfound=0;
|
72 |
+
MODULES_USE_COMPAT_VERSION=1;
|
73 |
+
export MODULES_USE_COMPAT_VERSION;
|
74 |
+
fi;
|
75 |
+
fi;
|
76 |
+
if [ $swfound -eq 0 ]; then
|
77 |
+
echo "Switching to Modules $swname version";
|
78 |
+
source /cm/local/apps/environment-modules/4.5.3/init/bash;
|
79 |
+
else
|
80 |
+
echo "Cannot switch to Modules $swname version, command not found";
|
81 |
+
return 1;
|
82 |
+
fi
|
83 |
+
}
|
84 |
+
BUILD=x86_64-conda-linux-gnu
|
85 |
+
CC=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-cc
|
86 |
+
CC_FOR_BUILD=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-cc
|
87 |
+
CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include -I/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/include -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib/stubs
|
88 |
+
CMAKE_ARGS=-DCMAKE_AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ar -DCMAKE_CXX_COMPILER_AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ar -DCMAKE_C_COMPILER_AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ar -DCMAKE_RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ranlib -DCMAKE_CXX_COMPILER_RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ranlib -DCMAKE_C_COMPILER_RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ranlib -DCMAKE_LINKER=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ld -DCMAKE_STRIP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-strip -DCMAKE_BUILD_TYPE=Release
|
89 |
+
CMAKE_PREFIX_PATH=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl:/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/x86_64-conda-linux-gnu/sysroot/usr
|
90 |
+
CMD_WLM_CLUSTER_NAME=slurm
|
91 |
+
CONDA_BUILD_SYSROOT=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/x86_64-conda-linux-gnu/sysroot
|
92 |
+
CONDA_DEFAULT_ENV=wenqi_qwen2vl
|
93 |
+
CONDA_EXE=/aifs4su/yaodong/miniconda3/bin/conda
|
94 |
+
CONDA_PREFIX=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
|
95 |
+
CONDA_PREFIX_1=/aifs4su/yaodong/miniconda3
|
96 |
+
CONDA_PREFIX_10=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
|
97 |
+
CONDA_PREFIX_11=/aifs4su/yaodong/miniconda3
|
98 |
+
CONDA_PREFIX_12=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
|
99 |
+
CONDA_PREFIX_13=/aifs4su/yaodong/miniconda3
|
100 |
+
CONDA_PREFIX_14=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
|
101 |
+
CONDA_PREFIX_15=/aifs4su/yaodong/miniconda3
|
102 |
+
CONDA_PREFIX_16=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
|
103 |
+
CONDA_PREFIX_17=/aifs4su/yaodong/miniconda3
|
104 |
+
CONDA_PREFIX_18=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
|
105 |
+
CONDA_PREFIX_19=/aifs4su/yaodong/miniconda3
|
106 |
+
CONDA_PREFIX_2=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
|
107 |
+
CONDA_PREFIX_3=/aifs4su/yaodong/miniconda3
|
108 |
+
CONDA_PREFIX_4=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
|
109 |
+
CONDA_PREFIX_5=/aifs4su/yaodong/miniconda3
|
110 |
+
CONDA_PREFIX_6=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3
|
111 |
+
CONDA_PREFIX_7=/aifs4su/yaodong/miniconda3
|
112 |
+
CONDA_PREFIX_8=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl
|
113 |
+
CONDA_PREFIX_9=/aifs4su/yaodong/miniconda3
|
114 |
+
CONDA_PROMPT_MODIFIER=(wenqi_qwen2vl)
|
115 |
+
CONDA_PYTHON_EXE=/aifs4su/yaodong/miniconda3/bin/python
|
116 |
+
CONDA_SHLVL=20
|
117 |
+
CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
|
118 |
+
CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
|
119 |
+
CPATH=/cm/shared/apps/slurm/current/include
|
120 |
+
CPATH_modshare=/cm/shared/apps/slurm/current/include:1
|
121 |
+
CPP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-cpp
|
122 |
+
CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include -I/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/include -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib/stubs
|
123 |
+
CROSS_RANK=0
|
124 |
+
CROSS_SIZE=1
|
125 |
+
CUDA_MODULE_LOADING=LAZY
|
126 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
127 |
+
CXX=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-c++
|
128 |
+
CXXFILT=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-c++filt
|
129 |
+
CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include -I/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/include -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib/stubs
|
130 |
+
CXX_FOR_BUILD=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-c++
|
131 |
+
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1028/bus
|
132 |
+
DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include
|
133 |
+
DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include
|
134 |
+
DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/include
|
135 |
+
DISABLE_VERSION_CHECK=1
|
136 |
+
DWP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-dwp
|
137 |
+
ELFEDIT=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-elfedit
|
138 |
+
ENABLE_LMOD=0
|
139 |
+
GCC=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc
|
140 |
+
GCC_AR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ar
|
141 |
+
GCC_NM=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-nm
|
142 |
+
GCC_RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gcc-ranlib
|
143 |
+
GPROF=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-gprof
|
144 |
+
GSETTINGS_SCHEMA_DIR=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/share/glib-2.0/schemas
|
145 |
+
GSETTINGS_SCHEMA_DIR_CONDA_BACKUP=
|
146 |
+
GXX=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-g++
|
147 |
+
HF_DATASETS_CACHE=/aifs4su/yaodong/.cache/huggingface/datasets
|
148 |
+
HF_HOME=/aifs4su/yaodong/.cache/huggingface
|
149 |
+
HISTTIMEFORMAT=%y/%m/%d %T
|
150 |
+
HOME=/home/yangyaodong
|
151 |
+
HOST=x86_64-conda-linux-gnu
|
152 |
+
KMP_DUPLICATE_LIB_OK=True
|
153 |
+
KMP_INIT_AT_FORK=FALSE
|
154 |
+
LANG=C.UTF-8
|
155 |
+
LD=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ld
|
156 |
+
LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/lib -Wl,-rpath-link,/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib -L/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/targets/x86_64-linux/lib/stubs
|
157 |
+
LD_GOLD=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ld.gold
|
158 |
+
LD_LIBRARY_PATH=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/lib/python3.11/site-packages/cv2/../../lib64:/usr/mpi/gcc/openmpi-4.1.7a1/lib:/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
|
159 |
+
LD_LIBRARY_PATH_modshare=/cm/shared/apps/slurm/current/lib64:1:/usr/mpi/gcc/openmpi-4.1.7a1/lib:1:/cm/shared/apps/slurm/current/lib64/slurm:1
|
160 |
+
LD_RUN_PATH=/usr/mpi/gcc/openmpi-4.1.7a1/lib
|
161 |
+
LD_RUN_PATH_modshare=/usr/mpi/gcc/openmpi-4.1.7a1/lib:1
|
162 |
+
LESSCLOSE=/usr/bin/lesspipe %s %s
|
163 |
+
LESSOPEN=| /usr/bin/lesspipe %s
|
164 |
+
LIBRARY_PATH=/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
|
165 |
+
LIBRARY_PATH_modshare=/cm/shared/apps/slurm/current/lib64:1:/cm/shared/apps/slurm/current/lib64/slurm:1
|
166 |
+
LOADEDMODULES=slurm/slurm/23.02.6:gcc/64/4.1.7a1
|
167 |
+
LOADEDMODULES_modshare=slurm/slurm/23.02.6:1:gcc/64/4.1.7a1:1
|
168 |
+
LOCAL_RANK=0
|
169 |
+
LOCAL_SIZE=8
|
170 |
+
LOGLEVEL=WARNING
|
171 |
+
LOGNAME=yangyaodong
|
172 |
+
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
|
173 |
+
MANPATH=/usr/mpi/gcc/openmpi-4.1.7a1/share/man:/cm/shared/apps/slurm/current/man:/cm/local/apps/environment-modules/4.5.3/share/man:/usr/local/man:/usr/local/share/man:/usr/share/man:/cm/local/apps/environment-modules/current/share/man:/cm/local/apps/environment-modules/current/share/man
|
174 |
+
MANPATH_modshare=/usr/local/share/man:1:/usr/mpi/gcc/openmpi-4.1.7a1/share/man:1:/cm/local/apps/environment-modules/current/share/man:1:/cm/local/apps/environment-modules/4.5.3/share/man:1:/usr/local/man:1:/usr/share/man:1:/cm/shared/apps/slurm/current/man:1
|
175 |
+
MASTER_ADDR=127.0.0.1
|
176 |
+
MASTER_PORT=47506
|
177 |
+
MESON_ARGS=-Dbuildtype=release
|
178 |
+
MIG_PARTED_CHECKPOINT_FILE=/var/lib/nvidia-mig-manager/checkpoint.json
|
179 |
+
MIG_PARTED_CONFIG_FILE=/etc/nvidia-mig-manager/config.yaml
|
180 |
+
MIG_PARTED_HOOKS_FILE=/etc/nvidia-mig-manager/hooks.yaml
|
181 |
+
MODULEPATH=/cm/local/modulefiles:/cm/shared/modulefiles
|
182 |
+
MODULESHOME=/cm/local/apps/environment-modules/4.5.3
|
183 |
+
MODULES_CMD=/cm/local/apps/environment-modules/4.5.3/libexec/modulecmd.tcl
|
184 |
+
MODULES_SET_SHELL_STARTUP=0
|
185 |
+
MOTD_SHOWN=pam
|
186 |
+
MPI_HOME=/usr/mpi/gcc/openmpi-4.1.7a1
|
187 |
+
MPI_RUN=/usr/mpi/gcc/openmpi-4.1.7a1/bin/mpirun
|
188 |
+
NM=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-nm
|
189 |
+
NVCC_PREPEND_FLAGS= -ccbin=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3/bin/x86_64-conda-linux-gnu-c++ -ccbin=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-c++
|
190 |
+
NVCC_PREPEND_FLAGS_BACKUP= -ccbin=/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3/bin/x86_64-conda-linux-gnu-c++
|
191 |
+
NVITOP_MONITOR_MODE=colorful
|
192 |
+
OBJCOPY=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-objcopy
|
193 |
+
OBJDUMP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-objdump
|
194 |
+
OLDPWD=/aifs4su/yaodong/wenqi/projects/safe_o1_evaluation/deception/LLaMA-Factory
|
195 |
+
PATH=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin:/aifs4su/yaodong/miniconda3/condabin:/usr/lpp/mmfs/bin:/usr/local/cuda/bin:/opt/bin:/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3/bin:/aifs4su/yaodong/miniconda3/condabin:/usr/mpi/gcc/openmpi-4.1.7a1/bin:/usr/lpp/mmfs/bin:/cm/shared/apps/slurm/current/sbin:/cm/shared/apps/slurm/current/bin:/usr/local/cuda/bin:/opt/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/sbin:/usr/sbin:/cm/local/apps/environment-modules/4.5.3/bin
|
196 |
+
PATH_modshare=/usr/mpi/gcc/openmpi-4.1.7a1/bin:2:/usr/bin:1:/opt/bin/:1:/aifs4su/yaodong/miniconda3/envs/wenqi_gemma3/bin:1:/opt/bin:1:/usr/local/bin:1:/cm/shared/apps/slurm/current/bin:1:/cm/shared/apps/slurm/current/sbin:1:/bin:1:/snap/bin:1:/sbin:1:/usr/sbin:1:/usr/games:1:/cm/local/apps/environment-modules/4.5.3/bin:1:/usr/local/sbin:1:/usr/lpp/mmfs/bin:1:/aifs4su/yaodong/miniconda3/condabin:1:/usr/local/cuda/bin:1:/usr/local/games:1
|
197 |
+
PWD=/aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts
|
198 |
+
PYTHONHASHSEED=42
|
199 |
+
PYTHONPATH=/aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts
|
200 |
+
RANK=0
|
201 |
+
RANLIB=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-ranlib
|
202 |
+
READELF=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-readelf
|
203 |
+
SHELL=/bin/bash
|
204 |
+
SHLVL=13
|
205 |
+
SIZE=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-size
|
206 |
+
SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf
|
207 |
+
SSH_CLIENT=10.33.4.232 49200 22
|
208 |
+
SSH_CONNECTION=10.33.4.76 36746 10.33.4.229 22
|
209 |
+
SSH_TTY=/dev/pts/0
|
210 |
+
STRINGS=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-strings
|
211 |
+
STRIP=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/x86_64-conda-linux-gnu-strip
|
212 |
+
TERM=screen
|
213 |
+
TERM_PROGRAM=tmux
|
214 |
+
TERM_PROGRAM_VERSION=3.2a
|
215 |
+
TMUX=/tmp/tmux-1028/default,2884537,5
|
216 |
+
TMUX_PANE=%5
|
217 |
+
USER=yangyaodong
|
218 |
+
WANDB_API_KEY=62c57a07add7cf80060d09b29e313990bc2fada2
|
219 |
+
WANDB_SERVICE=2-2888806-tcp-localhost-44473
|
220 |
+
WORLD_SIZE=8
|
221 |
+
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
|
222 |
+
XDG_RUNTIME_DIR=/run/user/1028
|
223 |
+
XDG_SESSION_CLASS=user
|
224 |
+
XDG_SESSION_ID=60916
|
225 |
+
XDG_SESSION_TYPE=tty
|
226 |
+
XML_CATALOG_FILES=file:///aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/etc/xml/catalog file:///etc/xml/catalog
|
227 |
+
ZERO_STAGE=3
|
228 |
+
_=/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/deepspeed
|
229 |
+
_CE_CONDA=
|
230 |
+
_CE_M=
|
231 |
+
_CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
|
232 |
+
_LMFILES_=/cm/local/modulefiles/slurm/slurm/23.02.6:/cm/local/modulefiles/gcc/64/4.1.7a1
|
233 |
+
_LMFILES__modshare=/cm/local/modulefiles/slurm/slurm/23.02.6:1:/cm/local/modulefiles/gcc/64/4.1.7a1:1
|
234 |
+
build_alias=x86_64-conda-linux-gnu
|
235 |
+
host_alias=x86_64-conda-linux-gnu
|
script.sh
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
MODELS_TO_TRAIN=(
|
4 |
+
"Qwen2.5-7B-Instruct"
|
5 |
+
# "Llama-3.2-3B"
|
6 |
+
# "Llama-3.2-1B"
|
7 |
+
|
8 |
+
)
|
9 |
+
|
10 |
+
export WANDB_API_KEY="62c57a07add7cf80060d09b29e313990bc2fada2"
|
11 |
+
|
12 |
+
for MODEL in "${MODELS_TO_TRAIN[@]}"; do
|
13 |
+
echo "Starting training for model: ${MODEL}"
|
14 |
+
|
15 |
+
|
16 |
+
# MODEL_NAME_OR_PATH="/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/${MODEL}-base/slice_end"
|
17 |
+
# Second training phase
|
18 |
+
MODEL_NAME_OR_PATH="/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct"
|
19 |
+
TRAIN_DATASETS="/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset"
|
20 |
+
OUTPUT_DIR="/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/${MODEL}_safe_thinking"
|
21 |
+
TRAIN_TEMPLATE="Safe_thinking"
|
22 |
+
LOG_PROJECT="safe-o1"
|
23 |
+
|
24 |
+
source ./setup.sh
|
25 |
+
|
26 |
+
deepspeed \
|
27 |
+
--master_port ${MASTER_PORT} \
|
28 |
+
--module align_anything.trainers.text_to_text.sft \
|
29 |
+
--model_name_or_path ${MODEL_NAME_OR_PATH} \
|
30 |
+
--train_datasets ${TRAIN_DATASETS} \
|
31 |
+
--train_split train \
|
32 |
+
--train_template ${TRAIN_TEMPLATE} \
|
33 |
+
--output_dir ${OUTPUT_DIR} \
|
34 |
+
--log_project ${LOG_PROJECT} \
|
35 |
+
--per_device_train_batch_size 4 \
|
36 |
+
--per_device_eval_batch_size 4 \
|
37 |
+
--gradient_accumulation_steps 2 \
|
38 |
+
--learning_rate 2e-5 \
|
39 |
+
--epochs 3 \
|
40 |
+
--model_max_length 16384 \
|
41 |
+
|
42 |
+
|
43 |
+
echo "Completed second phase training for ${MODEL}"
|
44 |
+
done
|
45 |
+
|
46 |
+
echo "All model training completed!"
|
slice_end/added_tokens.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"</tool_call>": 151658,
|
3 |
+
"<tool_call>": 151657,
|
4 |
+
"<|box_end|>": 151649,
|
5 |
+
"<|box_start|>": 151648,
|
6 |
+
"<|endoftext|>": 151643,
|
7 |
+
"<|file_sep|>": 151664,
|
8 |
+
"<|fim_middle|>": 151660,
|
9 |
+
"<|fim_pad|>": 151662,
|
10 |
+
"<|fim_prefix|>": 151659,
|
11 |
+
"<|fim_suffix|>": 151661,
|
12 |
+
"<|im_end|>": 151645,
|
13 |
+
"<|im_start|>": 151644,
|
14 |
+
"<|image_pad|>": 151655,
|
15 |
+
"<|object_ref_end|>": 151647,
|
16 |
+
"<|object_ref_start|>": 151646,
|
17 |
+
"<|quad_end|>": 151651,
|
18 |
+
"<|quad_start|>": 151650,
|
19 |
+
"<|repo_name|>": 151663,
|
20 |
+
"<|video_pad|>": 151656,
|
21 |
+
"<|vision_end|>": 151653,
|
22 |
+
"<|vision_pad|>": 151654,
|
23 |
+
"<|vision_start|>": 151652
|
24 |
+
}
|
slice_end/config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_attn_implementation_autoset": true,
|
3 |
+
"architectures": [
|
4 |
+
"Qwen2ForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"eos_token_id": 151645,
|
8 |
+
"hidden_act": "silu",
|
9 |
+
"hidden_size": 3584,
|
10 |
+
"initializer_range": 0.02,
|
11 |
+
"intermediate_size": 18944,
|
12 |
+
"max_position_embeddings": 32768,
|
13 |
+
"max_window_layers": 28,
|
14 |
+
"model_type": "qwen2",
|
15 |
+
"num_attention_heads": 28,
|
16 |
+
"num_hidden_layers": 28,
|
17 |
+
"num_key_value_heads": 4,
|
18 |
+
"pad_token_id": 151643,
|
19 |
+
"rms_norm_eps": 1e-06,
|
20 |
+
"rope_scaling": null,
|
21 |
+
"rope_theta": 1000000.0,
|
22 |
+
"sliding_window": 131072,
|
23 |
+
"tie_word_embeddings": false,
|
24 |
+
"torch_dtype": "bfloat16",
|
25 |
+
"transformers_version": "4.50.0.dev0",
|
26 |
+
"use_cache": true,
|
27 |
+
"use_sliding_window": false,
|
28 |
+
"vocab_size": 152064
|
29 |
+
}
|
slice_end/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
slice_end/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:30e1baecb16745b34dfc93ec163c21e0ef68e4af1237bcbf75727cf6a133d5fd
|
3 |
+
size 15231345338
|
slice_end/special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>",
|
5 |
+
"<|object_ref_start|>",
|
6 |
+
"<|object_ref_end|>",
|
7 |
+
"<|box_start|>",
|
8 |
+
"<|box_end|>",
|
9 |
+
"<|quad_start|>",
|
10 |
+
"<|quad_end|>",
|
11 |
+
"<|vision_start|>",
|
12 |
+
"<|vision_end|>",
|
13 |
+
"<|vision_pad|>",
|
14 |
+
"<|image_pad|>",
|
15 |
+
"<|video_pad|>"
|
16 |
+
],
|
17 |
+
"eos_token": {
|
18 |
+
"content": "<|im_end|>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
},
|
24 |
+
"pad_token": {
|
25 |
+
"content": "<|endoftext|>",
|
26 |
+
"lstrip": false,
|
27 |
+
"normalized": false,
|
28 |
+
"rstrip": false,
|
29 |
+
"single_word": false
|
30 |
+
}
|
31 |
+
}
|
slice_end/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eb8138e837fbbd50932cdb31eddc0832738f665fd265cb87ab5e5628b5eebe30
|
3 |
+
size 11421996
|
slice_end/tokenizer_config.json
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": false,
|
3 |
+
"add_prefix_space": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"151643": {
|
6 |
+
"content": "<|endoftext|>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"151644": {
|
14 |
+
"content": "<|im_start|>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"151645": {
|
22 |
+
"content": "<|im_end|>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
},
|
29 |
+
"151646": {
|
30 |
+
"content": "<|object_ref_start|>",
|
31 |
+
"lstrip": false,
|
32 |
+
"normalized": false,
|
33 |
+
"rstrip": false,
|
34 |
+
"single_word": false,
|
35 |
+
"special": true
|
36 |
+
},
|
37 |
+
"151647": {
|
38 |
+
"content": "<|object_ref_end|>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false,
|
43 |
+
"special": true
|
44 |
+
},
|
45 |
+
"151648": {
|
46 |
+
"content": "<|box_start|>",
|
47 |
+
"lstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"rstrip": false,
|
50 |
+
"single_word": false,
|
51 |
+
"special": true
|
52 |
+
},
|
53 |
+
"151649": {
|
54 |
+
"content": "<|box_end|>",
|
55 |
+
"lstrip": false,
|
56 |
+
"normalized": false,
|
57 |
+
"rstrip": false,
|
58 |
+
"single_word": false,
|
59 |
+
"special": true
|
60 |
+
},
|
61 |
+
"151650": {
|
62 |
+
"content": "<|quad_start|>",
|
63 |
+
"lstrip": false,
|
64 |
+
"normalized": false,
|
65 |
+
"rstrip": false,
|
66 |
+
"single_word": false,
|
67 |
+
"special": true
|
68 |
+
},
|
69 |
+
"151651": {
|
70 |
+
"content": "<|quad_end|>",
|
71 |
+
"lstrip": false,
|
72 |
+
"normalized": false,
|
73 |
+
"rstrip": false,
|
74 |
+
"single_word": false,
|
75 |
+
"special": true
|
76 |
+
},
|
77 |
+
"151652": {
|
78 |
+
"content": "<|vision_start|>",
|
79 |
+
"lstrip": false,
|
80 |
+
"normalized": false,
|
81 |
+
"rstrip": false,
|
82 |
+
"single_word": false,
|
83 |
+
"special": true
|
84 |
+
},
|
85 |
+
"151653": {
|
86 |
+
"content": "<|vision_end|>",
|
87 |
+
"lstrip": false,
|
88 |
+
"normalized": false,
|
89 |
+
"rstrip": false,
|
90 |
+
"single_word": false,
|
91 |
+
"special": true
|
92 |
+
},
|
93 |
+
"151654": {
|
94 |
+
"content": "<|vision_pad|>",
|
95 |
+
"lstrip": false,
|
96 |
+
"normalized": false,
|
97 |
+
"rstrip": false,
|
98 |
+
"single_word": false,
|
99 |
+
"special": true
|
100 |
+
},
|
101 |
+
"151655": {
|
102 |
+
"content": "<|image_pad|>",
|
103 |
+
"lstrip": false,
|
104 |
+
"normalized": false,
|
105 |
+
"rstrip": false,
|
106 |
+
"single_word": false,
|
107 |
+
"special": true
|
108 |
+
},
|
109 |
+
"151656": {
|
110 |
+
"content": "<|video_pad|>",
|
111 |
+
"lstrip": false,
|
112 |
+
"normalized": false,
|
113 |
+
"rstrip": false,
|
114 |
+
"single_word": false,
|
115 |
+
"special": true
|
116 |
+
},
|
117 |
+
"151657": {
|
118 |
+
"content": "<tool_call>",
|
119 |
+
"lstrip": false,
|
120 |
+
"normalized": false,
|
121 |
+
"rstrip": false,
|
122 |
+
"single_word": false,
|
123 |
+
"special": false
|
124 |
+
},
|
125 |
+
"151658": {
|
126 |
+
"content": "</tool_call>",
|
127 |
+
"lstrip": false,
|
128 |
+
"normalized": false,
|
129 |
+
"rstrip": false,
|
130 |
+
"single_word": false,
|
131 |
+
"special": false
|
132 |
+
},
|
133 |
+
"151659": {
|
134 |
+
"content": "<|fim_prefix|>",
|
135 |
+
"lstrip": false,
|
136 |
+
"normalized": false,
|
137 |
+
"rstrip": false,
|
138 |
+
"single_word": false,
|
139 |
+
"special": false
|
140 |
+
},
|
141 |
+
"151660": {
|
142 |
+
"content": "<|fim_middle|>",
|
143 |
+
"lstrip": false,
|
144 |
+
"normalized": false,
|
145 |
+
"rstrip": false,
|
146 |
+
"single_word": false,
|
147 |
+
"special": false
|
148 |
+
},
|
149 |
+
"151661": {
|
150 |
+
"content": "<|fim_suffix|>",
|
151 |
+
"lstrip": false,
|
152 |
+
"normalized": false,
|
153 |
+
"rstrip": false,
|
154 |
+
"single_word": false,
|
155 |
+
"special": false
|
156 |
+
},
|
157 |
+
"151662": {
|
158 |
+
"content": "<|fim_pad|>",
|
159 |
+
"lstrip": false,
|
160 |
+
"normalized": false,
|
161 |
+
"rstrip": false,
|
162 |
+
"single_word": false,
|
163 |
+
"special": false
|
164 |
+
},
|
165 |
+
"151663": {
|
166 |
+
"content": "<|repo_name|>",
|
167 |
+
"lstrip": false,
|
168 |
+
"normalized": false,
|
169 |
+
"rstrip": false,
|
170 |
+
"single_word": false,
|
171 |
+
"special": false
|
172 |
+
},
|
173 |
+
"151664": {
|
174 |
+
"content": "<|file_sep|>",
|
175 |
+
"lstrip": false,
|
176 |
+
"normalized": false,
|
177 |
+
"rstrip": false,
|
178 |
+
"single_word": false,
|
179 |
+
"special": false
|
180 |
+
}
|
181 |
+
},
|
182 |
+
"additional_special_tokens": [
|
183 |
+
"<|im_start|>",
|
184 |
+
"<|im_end|>",
|
185 |
+
"<|object_ref_start|>",
|
186 |
+
"<|object_ref_end|>",
|
187 |
+
"<|box_start|>",
|
188 |
+
"<|box_end|>",
|
189 |
+
"<|quad_start|>",
|
190 |
+
"<|quad_end|>",
|
191 |
+
"<|vision_start|>",
|
192 |
+
"<|vision_end|>",
|
193 |
+
"<|vision_pad|>",
|
194 |
+
"<|image_pad|>",
|
195 |
+
"<|video_pad|>"
|
196 |
+
],
|
197 |
+
"bos_token": null,
|
198 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
|
199 |
+
"clean_up_tokenization_spaces": false,
|
200 |
+
"eos_token": "<|im_end|>",
|
201 |
+
"errors": "replace",
|
202 |
+
"extra_special_tokens": {},
|
203 |
+
"model_max_length": 16384,
|
204 |
+
"pad_token": "<|endoftext|>",
|
205 |
+
"padding_side": "right",
|
206 |
+
"split_special_tokens": false,
|
207 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
208 |
+
"unk_token": null
|
209 |
+
}
|
slice_end/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
wandb/debug-internal.log
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-05T20:32:09.145104832+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
|
2 |
+
{"time":"2025-04-05T20:32:09.145234633+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug-core.log"}
|
3 |
+
{"time":"2025-04-05T20:32:09.359661673+08:00","level":"INFO","msg":"created new stream","id":"jla7fqqr"}
|
4 |
+
{"time":"2025-04-05T20:32:09.359700555+08:00","level":"INFO","msg":"stream: started","id":"jla7fqqr"}
|
5 |
+
{"time":"2025-04-05T20:32:09.35975566+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"jla7fqqr"}
|
6 |
+
{"time":"2025-04-05T20:32:09.359831663+08:00","level":"INFO","msg":"handler: started","stream_id":"jla7fqqr"}
|
7 |
+
{"time":"2025-04-05T20:32:09.35975831+08:00","level":"INFO","msg":"sender: started","stream_id":"jla7fqqr"}
|
8 |
+
{"time":"2025-04-05T20:32:09.688023993+08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2025-04-05T20:45:27.076637312+08:00","level":"INFO","msg":"Stopping system monitor"}
|
10 |
+
{"time":"2025-04-05T20:45:27.077489476+08:00","level":"INFO","msg":"Stopped system monitor"}
|
11 |
+
{"time":"2025-04-05T20:45:28.038487853+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading config.yaml","runtime_seconds":0.283111243,"progress":"2.7KB/2.7KB"},{"desc":"uploading output.log","runtime_seconds":0.283100079,"progress":"10.8KB/10.8KB"}],"total_operations":2}}
|
12 |
+
{"time":"2025-04-05T20:45:28.204441985+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2025-04-05T20:45:29.440112056+08:00","level":"INFO","msg":"stream: closing","id":"jla7fqqr"}
|
14 |
+
{"time":"2025-04-05T20:45:29.440138846+08:00","level":"INFO","msg":"handler: closed","stream_id":"jla7fqqr"}
|
15 |
+
{"time":"2025-04-05T20:45:29.440146259+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"jla7fqqr"}
|
16 |
+
{"time":"2025-04-05T20:45:29.440285075+08:00","level":"INFO","msg":"sender: closed","stream_id":"jla7fqqr"}
|
17 |
+
{"time":"2025-04-05T20:45:29.440378039+08:00","level":"INFO","msg":"stream: closed","id":"jla7fqqr"}
|
wandb/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-04-05 20:32:09,135 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
|
2 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Configure stats pid to 2888806
|
3 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
|
4 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
|
5 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from environment variables
|
6 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug.log
|
7 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug-internal.log
|
8 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():644] calling init triggers
|
9 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
|
10 |
+
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_thinking', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
|
11 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():680] starting backend
|
12 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():684] sending inform_init request
|
13 |
+
2025-04-05 20:32:09,141 INFO MainThread:2888806 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
14 |
+
2025-04-05 20:32:09,142 INFO MainThread:2888806 [wandb_init.py:init():697] backend started and connected
|
15 |
+
2025-04-05 20:32:09,143 INFO MainThread:2888806 [wandb_init.py:init():790] updated telemetry
|
16 |
+
2025-04-05 20:32:09,162 INFO MainThread:2888806 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
|
17 |
+
2025-04-05 20:32:09,682 INFO MainThread:2888806 [wandb_init.py:init():874] starting run threads in backend
|
18 |
+
2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_console_start():2374] atexit reg
|
19 |
+
2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2224] redirect: wrap_raw
|
20 |
+
2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2289] Wrapping output streams.
|
21 |
+
2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2314] Redirects installed.
|
22 |
+
2025-04-05 20:32:10,112 INFO MainThread:2888806 [wandb_init.py:init():916] run started, returning control to user process
|
23 |
+
2025-04-05 20:45:27,036 INFO MainThread:2888806 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/jla7fqqr
|
24 |
+
2025-04-05 20:45:27,036 INFO MainThread:2888806 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
|
25 |
+
2025-04-05 20:45:27,037 INFO MainThread:2888806 [wandb_run.py:_restore():2321] restore
|
26 |
+
2025-04-05 20:45:27,037 INFO MainThread:2888806 [wandb_run.py:_restore():2327] restore done
|
27 |
+
2025-04-05 20:45:29,432 INFO MainThread:2888806 [wandb_run.py:_footer_history_summary_info():3892] rendering history
|
28 |
+
2025-04-05 20:45:29,433 INFO MainThread:2888806 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
|
29 |
+
2025-04-05 20:45:29,439 INFO MainThread:2888806 [wandb_run.py:_footer_sync_info():3853] logging synced files
|
wandb/run-20250404_234514-h2gynfll/files/config.yaml
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.19.1
|
4 |
+
m: []
|
5 |
+
python_version: 3.11.0
|
6 |
+
t:
|
7 |
+
"1":
|
8 |
+
- 1
|
9 |
+
- 5
|
10 |
+
- 11
|
11 |
+
- 41
|
12 |
+
- 49
|
13 |
+
- 51
|
14 |
+
- 53
|
15 |
+
- 55
|
16 |
+
- 63
|
17 |
+
- 71
|
18 |
+
- 83
|
19 |
+
- 98
|
20 |
+
- 105
|
21 |
+
"2":
|
22 |
+
- 1
|
23 |
+
- 5
|
24 |
+
- 11
|
25 |
+
- 41
|
26 |
+
- 49
|
27 |
+
- 51
|
28 |
+
- 53
|
29 |
+
- 55
|
30 |
+
- 63
|
31 |
+
- 71
|
32 |
+
- 83
|
33 |
+
- 98
|
34 |
+
- 105
|
35 |
+
"3":
|
36 |
+
- 2
|
37 |
+
- 13
|
38 |
+
- 16
|
39 |
+
- 23
|
40 |
+
- 55
|
41 |
+
- 61
|
42 |
+
"4": 3.11.0
|
43 |
+
"5": 0.19.1
|
44 |
+
"6": 4.50.0.dev0
|
45 |
+
"8":
|
46 |
+
- 5
|
47 |
+
"12": 0.19.1
|
48 |
+
"13": linux-x86_64
|
49 |
+
bnb_cfgs:
|
50 |
+
value:
|
51 |
+
bnb_4bit_compute_dtype: float16
|
52 |
+
bnb_4bit_quant_type: nf4
|
53 |
+
bnb_4bit_use_double_quant: true
|
54 |
+
load_in_4bit: true
|
55 |
+
load_in_8bit: false
|
56 |
+
use_bnb: false
|
57 |
+
data_cfgs:
|
58 |
+
value:
|
59 |
+
eval_optional_args: []
|
60 |
+
train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
|
61 |
+
train_optional_args: []
|
62 |
+
train_split: train
|
63 |
+
train_template: Safe_o1
|
64 |
+
logger_cfgs:
|
65 |
+
value:
|
66 |
+
log_project: safe-o1
|
67 |
+
log_run_name: sft
|
68 |
+
log_type: wandb
|
69 |
+
output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
|
70 |
+
save_interval: 100000
|
71 |
+
lora_cfgs:
|
72 |
+
value:
|
73 |
+
inference_mode: false
|
74 |
+
lora_alpha: 16
|
75 |
+
lora_dropout: 0.1
|
76 |
+
r: 16
|
77 |
+
save_full_model: true
|
78 |
+
target_modules:
|
79 |
+
- q_proj
|
80 |
+
- v_proj
|
81 |
+
task_type: TaskType.CAUSAL_LM
|
82 |
+
use_lora: false
|
83 |
+
model_cfgs:
|
84 |
+
value:
|
85 |
+
model_max_length: 16384
|
86 |
+
model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
|
87 |
+
trust_remote_code: true
|
88 |
+
train_cfgs:
|
89 |
+
value:
|
90 |
+
adam_betas:
|
91 |
+
- 0.9
|
92 |
+
- 0.95
|
93 |
+
adam_epsilon: 1e-08
|
94 |
+
bf16: true
|
95 |
+
ds_cfgs: ds_z3_config.json
|
96 |
+
epochs: 6
|
97 |
+
eval_interval: 10
|
98 |
+
eval_strategy: steps
|
99 |
+
fp16: false
|
100 |
+
gradient_accumulation_steps: 2
|
101 |
+
gradient_checkpointing: true
|
102 |
+
learning_rate: 2e-05
|
103 |
+
lr_scheduler_type: constant
|
104 |
+
lr_warmup_ratio: 0.03
|
105 |
+
max_grad_norm: 1
|
106 |
+
per_device_eval_batch_size: 4
|
107 |
+
per_device_train_batch_size: 4
|
108 |
+
seed: 42
|
109 |
+
weight_decay: 0
|
wandb/run-20250404_234514-h2gynfll/files/output.log
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
***** Running training *****
|
2 |
+
Training 1/6 epoch: 0%| | 0/2112 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
3 |
+
Training 1/6 epoch (loss 0.9676): 15%|███████████████████████▊ | 319/2112 [08:11<46:47, 1.57s/it]
|
4 |
+
[2025-04-04 23:45:49,795] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
5 |
+
[2025-04-04 23:45:49,796] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=10.722716430728992, CurrSamplesPerSec=9.564486815227898, MemAllocated=29.37GB, MaxMemAllocated=51.13GB
|
6 |
+
[2025-04-04 23:46:21,396] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
7 |
+
[2025-04-04 23:46:21,397] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=10.61486173550022, CurrSamplesPerSec=11.387013476257202, MemAllocated=29.37GB, MaxMemAllocated=51.13GB
|
8 |
+
[2025-04-04 23:46:52,750] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
9 |
+
[2025-04-04 23:46:52,750] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=10.607628001438231, CurrSamplesPerSec=13.11193874390518, MemAllocated=29.37GB, MaxMemAllocated=52.29GB
|
10 |
+
[2025-04-04 23:47:20,658] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
11 |
+
[2025-04-04 23:47:20,659] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=10.929159623513739, CurrSamplesPerSec=9.881152216939991, MemAllocated=29.37GB, MaxMemAllocated=52.29GB
|
12 |
+
[2025-04-04 23:47:53,176] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
13 |
+
[2025-04-04 23:47:53,177] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=10.769232617254287, CurrSamplesPerSec=10.506886069158746, MemAllocated=29.37GB, MaxMemAllocated=52.29GB
|
14 |
+
[2025-04-04 23:48:25,187] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
15 |
+
[2025-04-04 23:48:25,188] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=10.706402500792887, CurrSamplesPerSec=13.256381944370643, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
16 |
+
[2025-04-04 23:48:54,900] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
17 |
+
[2025-04-04 23:48:54,901] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=10.771832584979517, CurrSamplesPerSec=11.234739539724892, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
18 |
+
[2025-04-04 23:49:25,278] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
19 |
+
[2025-04-04 23:49:25,279] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=10.79860773669107, CurrSamplesPerSec=11.809016383898683, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
20 |
+
[2025-04-04 23:49:56,100] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
21 |
+
[2025-04-04 23:49:56,100] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=10.796911835100996, CurrSamplesPerSec=11.795726899427471, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
22 |
+
[2025-04-04 23:50:26,690] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
23 |
+
[2025-04-04 23:50:26,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=10.81413404235532, CurrSamplesPerSec=12.486791572711763, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
24 |
+
[2025-04-04 23:50:56,559] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
25 |
+
[2025-04-04 23:50:56,560] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=10.845791469144872, CurrSamplesPerSec=11.503468048822956, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
26 |
+
[2025-04-04 23:51:26,233] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
27 |
+
[2025-04-04 23:51:26,234] [INFO] [timer.py:264:stop] epoch=0/micro_step=240/global_step=120, RunningAvgSamplesPerSec=10.871702151329506, CurrSamplesPerSec=12.054733265665433, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
28 |
+
[2025-04-04 23:51:56,110] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
29 |
+
[2025-04-04 23:51:56,110] [INFO] [timer.py:264:stop] epoch=0/micro_step=260/global_step=130, RunningAvgSamplesPerSec=10.890944356652495, CurrSamplesPerSec=11.35227930719023, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
30 |
+
[2025-04-04 23:52:26,208] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
31 |
+
[2025-04-04 23:52:26,208] [INFO] [timer.py:264:stop] epoch=0/micro_step=280/global_step=140, RunningAvgSamplesPerSec=10.899459967950149, CurrSamplesPerSec=9.079894291769792, MemAllocated=29.37GB, MaxMemAllocated=54.61GB
|
32 |
+
[2025-04-04 23:52:57,298] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
33 |
+
[2025-04-04 23:52:57,298] [INFO] [timer.py:264:stop] epoch=0/micro_step=300/global_step=150, RunningAvgSamplesPerSec=10.886001551743023, CurrSamplesPerSec=10.84443091570689, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
34 |
+
[2025-04-04 23:53:28,207] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
35 |
+
[2025-04-04 23:53:28,208] [INFO] [timer.py:264:stop] epoch=0/micro_step=320/global_step=160, RunningAvgSamplesPerSec=10.8748149413261, CurrSamplesPerSec=11.527765341507381, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
36 |
+
[2025-04-04 23:53:58,689] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
37 |
+
[2025-04-04 23:53:58,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=340/global_step=170, RunningAvgSamplesPerSec=10.88374807162155, CurrSamplesPerSec=10.619738627247132, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
38 |
+
[2025-04-04 23:54:29,570] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
39 |
+
[2025-04-04 23:54:29,571] [INFO] [timer.py:264:stop] epoch=1/micro_step=8/global_step=180, RunningAvgSamplesPerSec=10.875487421119185, CurrSamplesPerSec=12.660335656450862, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
40 |
+
[2025-04-04 23:55:00,249] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
41 |
+
[2025-04-04 23:55:00,250] [INFO] [timer.py:264:stop] epoch=1/micro_step=28/global_step=190, RunningAvgSamplesPerSec=10.872545347288018, CurrSamplesPerSec=11.075220259929674, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
42 |
+
[2025-04-04 23:55:30,213] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
43 |
+
[2025-04-04 23:55:30,213] [INFO] [timer.py:264:stop] epoch=1/micro_step=48/global_step=200, RunningAvgSamplesPerSec=10.882162387552667, CurrSamplesPerSec=11.73534073720176, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
44 |
+
[2025-04-04 23:56:00,688] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
45 |
+
[2025-04-04 23:56:00,689] [INFO] [timer.py:264:stop] epoch=1/micro_step=68/global_step=210, RunningAvgSamplesPerSec=10.882240702304028, CurrSamplesPerSec=13.440744149207571, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
46 |
+
[2025-04-04 23:56:31,300] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
47 |
+
[2025-04-04 23:56:31,300] [INFO] [timer.py:264:stop] epoch=1/micro_step=88/global_step=220, RunningAvgSamplesPerSec=10.878092074029748, CurrSamplesPerSec=10.428130570438391, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
48 |
+
[2025-04-04 23:57:04,592] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
49 |
+
[2025-04-04 23:57:04,592] [INFO] [timer.py:264:stop] epoch=1/micro_step=108/global_step=230, RunningAvgSamplesPerSec=10.833171978118708, CurrSamplesPerSec=10.846180971098232, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
50 |
+
[2025-04-04 23:57:32,933] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
51 |
+
[2025-04-04 23:57:32,933] [INFO] [timer.py:264:stop] epoch=1/micro_step=128/global_step=240, RunningAvgSamplesPerSec=10.867158335720406, CurrSamplesPerSec=9.61852783515831, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
52 |
+
[2025-04-04 23:58:03,491] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
53 |
+
[2025-04-04 23:58:03,492] [INFO] [timer.py:264:stop] epoch=1/micro_step=148/global_step=250, RunningAvgSamplesPerSec=10.866661161970612, CurrSamplesPerSec=12.195057132062441, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
54 |
+
[2025-04-04 23:58:33,526] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
55 |
+
[2025-04-04 23:58:33,526] [INFO] [timer.py:264:stop] epoch=1/micro_step=168/global_step=260, RunningAvgSamplesPerSec=10.873141761713997, CurrSamplesPerSec=10.176243052918215, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
56 |
+
[2025-04-04 23:59:05,507] [INFO] [logging.py:128:log_dist] [Rank 0] step=270, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
57 |
+
[2025-04-04 23:59:05,508] [INFO] [timer.py:264:stop] epoch=1/micro_step=188/global_step=270, RunningAvgSamplesPerSec=10.854559682524012, CurrSamplesPerSec=10.616882484742007, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
58 |
+
[2025-04-04 23:59:34,347] [INFO] [logging.py:128:log_dist] [Rank 0] step=280, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
59 |
+
[2025-04-04 23:59:34,347] [INFO] [timer.py:264:stop] epoch=1/micro_step=208/global_step=280, RunningAvgSamplesPerSec=10.878267585037369, CurrSamplesPerSec=9.115241933490342, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
60 |
+
[2025-04-05 00:00:05,154] [INFO] [logging.py:128:log_dist] [Rank 0] step=290, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
61 |
+
[2025-04-05 00:00:05,154] [INFO] [timer.py:264:stop] epoch=1/micro_step=228/global_step=290, RunningAvgSamplesPerSec=10.8751133915393, CurrSamplesPerSec=10.612563389272802, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
62 |
+
[2025-04-05 00:00:33,555] [INFO] [logging.py:128:log_dist] [Rank 0] step=300, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
63 |
+
[2025-04-05 00:00:33,555] [INFO] [timer.py:264:stop] epoch=1/micro_step=248/global_step=300, RunningAvgSamplesPerSec=10.901752001823205, CurrSamplesPerSec=11.59274166918304, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
64 |
+
[2025-04-05 00:01:03,565] [INFO] [logging.py:128:log_dist] [Rank 0] step=310, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
65 |
+
[2025-04-05 00:01:03,566] [INFO] [timer.py:264:stop] epoch=1/micro_step=268/global_step=310, RunningAvgSamplesPerSec=10.907293269923844, CurrSamplesPerSec=9.995996622494804, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
66 |
+
[2025-04-05 00:01:34,344] [INFO] [logging.py:128:log_dist] [Rank 0] step=320, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
67 |
+
[2025-04-05 00:01:34,345] [INFO] [timer.py:264:stop] epoch=1/micro_step=288/global_step=320, RunningAvgSamplesPerSec=10.907504943822554, CurrSamplesPerSec=11.022197256520487, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
68 |
+
[2025-04-05 00:02:07,406] [INFO] [logging.py:128:log_dist] [Rank 0] step=330, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
69 |
+
[2025-04-05 00:02:07,407] [INFO] [timer.py:264:stop] epoch=1/micro_step=308/global_step=330, RunningAvgSamplesPerSec=10.881103213238672, CurrSamplesPerSec=10.663874874443216, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
70 |
+
[2025-04-05 00:02:35,659] [INFO] [logging.py:128:log_dist] [Rank 0] step=340, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
71 |
+
[2025-04-05 00:02:35,659] [INFO] [timer.py:264:stop] epoch=1/micro_step=328/global_step=340, RunningAvgSamplesPerSec=10.907334658762007, CurrSamplesPerSec=13.854504112243248, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
72 |
+
[2025-04-05 00:03:07,329] [INFO] [logging.py:128:log_dist] [Rank 0] step=350, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
73 |
+
[2025-04-05 00:03:07,330] [INFO] [timer.py:264:stop] epoch=1/micro_step=348/global_step=350, RunningAvgSamplesPerSec=10.902276426516282, CurrSamplesPerSec=10.083614280472377, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
74 |
+
[2025-04-05 00:03:37,776] [INFO] [logging.py:128:log_dist] [Rank 0] step=360, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
75 |
+
[2025-04-05 00:03:37,776] [INFO] [timer.py:264:stop] epoch=2/micro_step=16/global_step=360, RunningAvgSamplesPerSec=10.904611092726073, CurrSamplesPerSec=11.53194905435867, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
76 |
+
[2025-04-05 00:04:09,459] [INFO] [logging.py:128:log_dist] [Rank 0] step=370, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
77 |
+
[2025-04-05 00:04:09,459] [INFO] [timer.py:264:stop] epoch=2/micro_step=36/global_step=370, RunningAvgSamplesPerSec=10.89630043473511, CurrSamplesPerSec=11.008557068097579, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
78 |
+
[2025-04-05 00:04:41,383] [INFO] [logging.py:128:log_dist] [Rank 0] step=380, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
79 |
+
[2025-04-05 00:04:41,383] [INFO] [timer.py:264:stop] epoch=2/micro_step=56/global_step=380, RunningAvgSamplesPerSec=10.884882705424278, CurrSamplesPerSec=9.811316896915017, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
80 |
+
[2025-04-05 00:05:08,841] [INFO] [logging.py:128:log_dist] [Rank 0] step=390, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
81 |
+
[2025-04-05 00:05:08,841] [INFO] [timer.py:264:stop] epoch=2/micro_step=76/global_step=390, RunningAvgSamplesPerSec=10.915433970019686, CurrSamplesPerSec=12.441151077420326, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
82 |
+
[2025-04-05 00:05:40,337] [INFO] [logging.py:128:log_dist] [Rank 0] step=400, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
83 |
+
[2025-04-05 00:05:40,338] [INFO] [timer.py:264:stop] epoch=2/micro_step=96/global_step=400, RunningAvgSamplesPerSec=10.905658171388385, CurrSamplesPerSec=12.03147003105065, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
84 |
+
[2025-04-05 00:06:13,617] [INFO] [logging.py:128:log_dist] [Rank 0] step=410, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
85 |
+
[2025-04-05 00:06:13,617] [INFO] [timer.py:264:stop] epoch=2/micro_step=116/global_step=410, RunningAvgSamplesPerSec=10.882479697107344, CurrSamplesPerSec=12.944935493618878, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
86 |
+
[2025-04-05 00:06:42,210] [INFO] [logging.py:128:log_dist] [Rank 0] step=420, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
87 |
+
[2025-04-05 00:06:42,210] [INFO] [timer.py:264:stop] epoch=2/micro_step=136/global_step=420, RunningAvgSamplesPerSec=10.902476276457532, CurrSamplesPerSec=11.251411338667834, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
88 |
+
[2025-04-05 00:07:13,632] [INFO] [logging.py:128:log_dist] [Rank 0] step=430, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
89 |
+
[2025-04-05 00:07:13,633] [INFO] [timer.py:264:stop] epoch=2/micro_step=156/global_step=430, RunningAvgSamplesPerSec=10.900374041873794, CurrSamplesPerSec=12.121469773864192, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
90 |
+
[2025-04-05 00:07:44,981] [INFO] [logging.py:128:log_dist] [Rank 0] step=440, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
91 |
+
[2025-04-05 00:07:44,981] [INFO] [timer.py:264:stop] epoch=2/micro_step=176/global_step=440, RunningAvgSamplesPerSec=10.896044041249382, CurrSamplesPerSec=10.324748615101747, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
92 |
+
[2025-04-05 00:08:17,623] [INFO] [logging.py:128:log_dist] [Rank 0] step=450, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
93 |
+
[2025-04-05 00:08:17,624] [INFO] [timer.py:264:stop] epoch=2/micro_step=196/global_step=450, RunningAvgSamplesPerSec=10.885513745657075, CurrSamplesPerSec=10.364115890586287, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
94 |
+
[2025-04-05 00:08:48,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=460, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
95 |
+
[2025-04-05 00:08:48,421] [INFO] [timer.py:264:stop] epoch=2/micro_step=216/global_step=460, RunningAvgSamplesPerSec=10.890322858413576, CurrSamplesPerSec=9.913507288254982, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
96 |
+
[2025-04-05 00:09:18,859] [INFO] [logging.py:128:log_dist] [Rank 0] step=470, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
97 |
+
[2025-04-05 00:09:18,860] [INFO] [timer.py:264:stop] epoch=2/micro_step=236/global_step=470, RunningAvgSamplesPerSec=10.897011731086295, CurrSamplesPerSec=11.582722357771235, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
98 |
+
[2025-04-05 00:09:48,991] [INFO] [logging.py:128:log_dist] [Rank 0] step=480, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
99 |
+
[2025-04-05 00:09:48,992] [INFO] [timer.py:264:stop] epoch=2/micro_step=256/global_step=480, RunningAvgSamplesPerSec=10.901656808546964, CurrSamplesPerSec=10.341647371016279, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
100 |
+
[2025-04-05 00:10:18,579] [INFO] [logging.py:128:log_dist] [Rank 0] step=490, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
101 |
+
[2025-04-05 00:10:18,580] [INFO] [timer.py:264:stop] epoch=2/micro_step=276/global_step=490, RunningAvgSamplesPerSec=10.90998996993192, CurrSamplesPerSec=12.032918655477928, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
102 |
+
[2025-04-05 00:10:50,691] [INFO] [logging.py:128:log_dist] [Rank 0] step=500, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
103 |
+
[2025-04-05 00:10:50,692] [INFO] [timer.py:264:stop] epoch=2/micro_step=296/global_step=500, RunningAvgSamplesPerSec=10.899989270395931, CurrSamplesPerSec=10.292938527296563, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
104 |
+
[2025-04-05 00:11:22,168] [INFO] [logging.py:128:log_dist] [Rank 0] step=510, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
105 |
+
[2025-04-05 00:11:22,169] [INFO] [timer.py:264:stop] epoch=2/micro_step=316/global_step=510, RunningAvgSamplesPerSec=10.89629369912652, CurrSamplesPerSec=10.290541044636166, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
106 |
+
[2025-04-05 00:11:51,929] [INFO] [logging.py:128:log_dist] [Rank 0] step=520, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
107 |
+
[2025-04-05 00:11:51,929] [INFO] [timer.py:264:stop] epoch=2/micro_step=336/global_step=520, RunningAvgSamplesPerSec=10.90288718797129, CurrSamplesPerSec=11.410733496183932, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
108 |
+
[2025-04-05 00:12:22,106] [INFO] [logging.py:128:log_dist] [Rank 0] step=530, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
109 |
+
[2025-04-05 00:12:22,107] [INFO] [timer.py:264:stop] epoch=3/micro_step=4/global_step=530, RunningAvgSamplesPerSec=10.906725677726158, CurrSamplesPerSec=9.671531033968664, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
110 |
+
[2025-04-05 00:12:52,977] [INFO] [logging.py:128:log_dist] [Rank 0] step=540, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
111 |
+
[2025-04-05 00:12:52,978] [INFO] [timer.py:264:stop] epoch=3/micro_step=24/global_step=540, RunningAvgSamplesPerSec=10.906210663844542, CurrSamplesPerSec=10.504433111716455, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
112 |
+
[2025-04-05 00:13:24,041] [INFO] [logging.py:128:log_dist] [Rank 0] step=550, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
113 |
+
[2025-04-05 00:13:24,041] [INFO] [timer.py:264:stop] epoch=3/micro_step=44/global_step=550, RunningAvgSamplesPerSec=10.902891825560632, CurrSamplesPerSec=8.900087581387572, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
114 |
+
[2025-04-05 00:13:55,658] [INFO] [logging.py:128:log_dist] [Rank 0] step=560, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
115 |
+
[2025-04-05 00:13:55,658] [INFO] [timer.py:264:stop] epoch=3/micro_step=64/global_step=560, RunningAvgSamplesPerSec=10.898118424070445, CurrSamplesPerSec=10.754342827399102, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
116 |
+
[2025-04-05 00:14:23,748] [INFO] [logging.py:128:log_dist] [Rank 0] step=570, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
117 |
+
[2025-04-05 00:14:23,748] [INFO] [timer.py:264:stop] epoch=3/micro_step=84/global_step=570, RunningAvgSamplesPerSec=10.916669469902244, CurrSamplesPerSec=9.810931771693125, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
118 |
+
[2025-04-05 00:14:58,721] [INFO] [logging.py:128:log_dist] [Rank 0] step=580, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
119 |
+
[2025-04-05 00:14:58,722] [INFO] [timer.py:264:stop] epoch=3/micro_step=104/global_step=580, RunningAvgSamplesPerSec=10.88957716223878, CurrSamplesPerSec=9.16694888207876, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
120 |
+
[2025-04-05 00:15:28,031] [INFO] [logging.py:128:log_dist] [Rank 0] step=590, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
121 |
+
[2025-04-05 00:15:28,031] [INFO] [timer.py:264:stop] epoch=3/micro_step=124/global_step=590, RunningAvgSamplesPerSec=10.899943327977292, CurrSamplesPerSec=12.39936093438136, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
122 |
+
[2025-04-05 00:15:58,906] [INFO] [logging.py:128:log_dist] [Rank 0] step=600, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
123 |
+
[2025-04-05 00:15:58,907] [INFO] [timer.py:264:stop] epoch=3/micro_step=144/global_step=600, RunningAvgSamplesPerSec=10.8988989794667, CurrSamplesPerSec=10.836493968610279, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
124 |
+
[2025-04-05 00:16:28,668] [INFO] [logging.py:128:log_dist] [Rank 0] step=610, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
125 |
+
[2025-04-05 00:16:28,668] [INFO] [timer.py:264:stop] epoch=3/micro_step=164/global_step=610, RunningAvgSamplesPerSec=10.903900774650129, CurrSamplesPerSec=11.612509861641884, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
126 |
+
[2025-04-05 00:17:00,382] [INFO] [logging.py:128:log_dist] [Rank 0] step=620, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
127 |
+
[2025-04-05 00:17:00,382] [INFO] [timer.py:264:stop] epoch=3/micro_step=184/global_step=620, RunningAvgSamplesPerSec=10.897520261368054, CurrSamplesPerSec=9.668412642725407, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
128 |
+
[2025-04-05 00:17:29,289] [INFO] [logging.py:128:log_dist] [Rank 0] step=630, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
129 |
+
[2025-04-05 00:17:29,290] [INFO] [timer.py:264:stop] epoch=3/micro_step=204/global_step=630, RunningAvgSamplesPerSec=10.908485349036773, CurrSamplesPerSec=11.284010097390484, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
130 |
+
[2025-04-05 00:18:00,269] [INFO] [logging.py:128:log_dist] [Rank 0] step=640, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
131 |
+
[2025-04-05 00:18:00,270] [INFO] [timer.py:264:stop] epoch=3/micro_step=224/global_step=640, RunningAvgSamplesPerSec=10.907192494678394, CurrSamplesPerSec=9.731917183842711, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
132 |
+
[2025-04-05 00:18:29,192] [INFO] [logging.py:128:log_dist] [Rank 0] step=650, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
133 |
+
[2025-04-05 00:18:29,192] [INFO] [timer.py:264:stop] epoch=3/micro_step=244/global_step=650, RunningAvgSamplesPerSec=10.91708252161445, CurrSamplesPerSec=10.408032976984966, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
134 |
+
[2025-04-05 00:18:58,578] [INFO] [logging.py:128:log_dist] [Rank 0] step=660, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
135 |
+
[2025-04-05 00:18:58,579] [INFO] [timer.py:264:stop] epoch=3/micro_step=264/global_step=660, RunningAvgSamplesPerSec=10.92412428365038, CurrSamplesPerSec=12.284003581583471, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
136 |
+
[2025-04-05 00:19:30,459] [INFO] [logging.py:128:log_dist] [Rank 0] step=670, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
137 |
+
[2025-04-05 00:19:30,460] [INFO] [timer.py:264:stop] epoch=3/micro_step=284/global_step=670, RunningAvgSamplesPerSec=10.916816649165908, CurrSamplesPerSec=9.322337913360691, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
138 |
+
[2025-04-05 00:20:02,133] [INFO] [logging.py:128:log_dist] [Rank 0] step=680, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
139 |
+
[2025-04-05 00:20:02,134] [INFO] [timer.py:264:stop] epoch=3/micro_step=304/global_step=680, RunningAvgSamplesPerSec=10.911318041413768, CurrSamplesPerSec=10.483330656587459, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
140 |
+
[2025-04-05 00:20:31,713] [INFO] [logging.py:128:log_dist] [Rank 0] step=690, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
141 |
+
[2025-04-05 00:20:31,714] [INFO] [timer.py:264:stop] epoch=3/micro_step=324/global_step=690, RunningAvgSamplesPerSec=10.917770882722586, CurrSamplesPerSec=11.774822290085138, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
142 |
+
[2025-04-05 00:21:02,282] [INFO] [logging.py:128:log_dist] [Rank 0] step=700, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
143 |
+
[2025-04-05 00:21:02,282] [INFO] [timer.py:264:stop] epoch=3/micro_step=344/global_step=700, RunningAvgSamplesPerSec=10.917551948544697, CurrSamplesPerSec=13.614658205589432, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
144 |
+
[2025-04-05 00:21:32,740] [INFO] [logging.py:128:log_dist] [Rank 0] step=710, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
145 |
+
[2025-04-05 00:21:32,740] [INFO] [timer.py:264:stop] epoch=4/micro_step=12/global_step=710, RunningAvgSamplesPerSec=10.9181337689355, CurrSamplesPerSec=12.613885955557196, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
146 |
+
[2025-04-05 00:22:04,544] [INFO] [logging.py:128:log_dist] [Rank 0] step=720, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
147 |
+
[2025-04-05 00:22:04,544] [INFO] [timer.py:264:stop] epoch=4/micro_step=32/global_step=720, RunningAvgSamplesPerSec=10.91412724964451, CurrSamplesPerSec=9.51134217019512, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
148 |
+
[2025-04-05 00:22:35,183] [INFO] [logging.py:128:log_dist] [Rank 0] step=730, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
149 |
+
[2025-04-05 00:22:35,184] [INFO] [timer.py:264:stop] epoch=4/micro_step=52/global_step=730, RunningAvgSamplesPerSec=10.91414884787545, CurrSamplesPerSec=10.138197103888123, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
150 |
+
[2025-04-05 00:23:04,475] [INFO] [logging.py:128:log_dist] [Rank 0] step=740, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
151 |
+
[2025-04-05 00:23:04,475] [INFO] [timer.py:264:stop] epoch=4/micro_step=72/global_step=740, RunningAvgSamplesPerSec=10.920729378294386, CurrSamplesPerSec=12.715673733635349, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
152 |
+
[2025-04-05 00:23:35,986] [INFO] [logging.py:128:log_dist] [Rank 0] step=750, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
153 |
+
[2025-04-05 00:23:35,986] [INFO] [timer.py:264:stop] epoch=4/micro_step=92/global_step=750, RunningAvgSamplesPerSec=10.915074902404724, CurrSamplesPerSec=10.352435836177008, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
154 |
+
[2025-04-05 00:24:09,924] [INFO] [logging.py:128:log_dist] [Rank 0] step=760, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
155 |
+
[2025-04-05 00:24:09,925] [INFO] [timer.py:264:stop] epoch=4/micro_step=112/global_step=760, RunningAvgSamplesPerSec=10.899539039695087, CurrSamplesPerSec=9.854495496659503, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
156 |
+
[2025-04-05 00:24:38,255] [INFO] [logging.py:128:log_dist] [Rank 0] step=770, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
157 |
+
[2025-04-05 00:24:38,255] [INFO] [timer.py:264:stop] epoch=4/micro_step=132/global_step=770, RunningAvgSamplesPerSec=10.911327821307937, CurrSamplesPerSec=9.34124368043991, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
158 |
+
[2025-04-05 00:25:08,832] [INFO] [logging.py:128:log_dist] [Rank 0] step=780, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
159 |
+
[2025-04-05 00:25:08,833] [INFO] [timer.py:264:stop] epoch=4/micro_step=152/global_step=780, RunningAvgSamplesPerSec=10.91183210830314, CurrSamplesPerSec=10.903172459516146, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
160 |
+
[2025-04-05 00:25:39,741] [INFO] [logging.py:128:log_dist] [Rank 0] step=790, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
161 |
+
[2025-04-05 00:25:39,741] [INFO] [timer.py:264:stop] epoch=4/micro_step=172/global_step=790, RunningAvgSamplesPerSec=10.91031485235279, CurrSamplesPerSec=9.60001863492875, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
162 |
+
[2025-04-05 00:26:10,132] [INFO] [logging.py:128:log_dist] [Rank 0] step=800, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
163 |
+
[2025-04-05 00:26:10,133] [INFO] [timer.py:264:stop] epoch=4/micro_step=192/global_step=800, RunningAvgSamplesPerSec=10.911486959949936, CurrSamplesPerSec=13.04129409549576, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
164 |
+
[2025-04-05 00:26:39,720] [INFO] [logging.py:128:log_dist] [Rank 0] step=810, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
165 |
+
[2025-04-05 00:26:39,720] [INFO] [timer.py:264:stop] epoch=4/micro_step=212/global_step=810, RunningAvgSamplesPerSec=10.916327699656112, CurrSamplesPerSec=10.58162272777036, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
166 |
+
[2025-04-05 00:27:09,446] [INFO] [logging.py:128:log_dist] [Rank 0] step=820, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
167 |
+
[2025-04-05 00:27:09,447] [INFO] [timer.py:264:stop] epoch=4/micro_step=232/global_step=820, RunningAvgSamplesPerSec=10.920561311076566, CurrSamplesPerSec=14.155525610526686, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
168 |
+
[2025-04-05 00:27:39,685] [INFO] [logging.py:128:log_dist] [Rank 0] step=830, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
169 |
+
[2025-04-05 00:27:39,686] [INFO] [timer.py:264:stop] epoch=4/micro_step=252/global_step=830, RunningAvgSamplesPerSec=10.923770509825303, CurrSamplesPerSec=11.454142227015383, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
170 |
+
[2025-04-05 00:28:09,402] [INFO] [logging.py:128:log_dist] [Rank 0] step=840, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
171 |
+
[2025-04-05 00:28:09,402] [INFO] [timer.py:264:stop] epoch=4/micro_step=272/global_step=840, RunningAvgSamplesPerSec=10.92759224936308, CurrSamplesPerSec=10.562603606097442, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
172 |
+
[2025-04-05 00:28:39,865] [INFO] [logging.py:128:log_dist] [Rank 0] step=850, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
173 |
+
[2025-04-05 00:28:39,865] [INFO] [timer.py:264:stop] epoch=4/micro_step=292/global_step=850, RunningAvgSamplesPerSec=10.929102932944096, CurrSamplesPerSec=10.551309022814346, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
174 |
+
[2025-04-05 00:29:12,678] [INFO] [logging.py:128:log_dist] [Rank 0] step=860, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
175 |
+
[2025-04-05 00:29:12,678] [INFO] [timer.py:264:stop] epoch=4/micro_step=312/global_step=860, RunningAvgSamplesPerSec=10.919878183184371, CurrSamplesPerSec=12.868895213136868, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
176 |
+
[2025-04-05 00:29:42,776] [INFO] [logging.py:128:log_dist] [Rank 0] step=870, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
177 |
+
[2025-04-05 00:29:42,777] [INFO] [timer.py:264:stop] epoch=4/micro_step=332/global_step=870, RunningAvgSamplesPerSec=10.923420364343865, CurrSamplesPerSec=10.234367011093408, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
178 |
+
[2025-04-05 00:30:12,278] [INFO] [logging.py:128:log_dist] [Rank 0] step=880, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
179 |
+
[2025-04-05 00:30:12,279] [INFO] [timer.py:264:stop] epoch=4/micro_step=352/global_step=880, RunningAvgSamplesPerSec=10.928288969968747, CurrSamplesPerSec=15.232539061735583, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
180 |
+
[2025-04-05 00:30:43,713] [INFO] [logging.py:128:log_dist] [Rank 0] step=890, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
181 |
+
[2025-04-05 00:30:43,713] [INFO] [timer.py:264:stop] epoch=5/micro_step=20/global_step=890, RunningAvgSamplesPerSec=10.925241705024948, CurrSamplesPerSec=9.550210083587169, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
182 |
+
[2025-04-05 00:31:14,999] [INFO] [logging.py:128:log_dist] [Rank 0] step=900, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
183 |
+
[2025-04-05 00:31:14,999] [INFO] [timer.py:264:stop] epoch=5/micro_step=40/global_step=900, RunningAvgSamplesPerSec=10.922448695267986, CurrSamplesPerSec=11.707657892737233, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
184 |
+
[2025-04-05 00:31:46,772] [INFO] [logging.py:128:log_dist] [Rank 0] step=910, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
185 |
+
[2025-04-05 00:31:46,772] [INFO] [timer.py:264:stop] epoch=5/micro_step=60/global_step=910, RunningAvgSamplesPerSec=10.918367358130276, CurrSamplesPerSec=12.14226994099549, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
186 |
+
[2025-04-05 00:32:14,543] [INFO] [logging.py:128:log_dist] [Rank 0] step=920, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
187 |
+
[2025-04-05 00:32:14,543] [INFO] [timer.py:264:stop] epoch=5/micro_step=80/global_step=920, RunningAvgSamplesPerSec=10.929225932240328, CurrSamplesPerSec=9.593174888357035, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
188 |
+
[2025-04-05 00:32:47,323] [INFO] [logging.py:128:log_dist] [Rank 0] step=930, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
189 |
+
[2025-04-05 00:32:47,324] [INFO] [timer.py:264:stop] epoch=5/micro_step=100/global_step=930, RunningAvgSamplesPerSec=10.920947685322428, CurrSamplesPerSec=10.432728165219467, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
190 |
+
[2025-04-05 00:33:25,391] [INFO] [logging.py:128:log_dist] [Rank 0] step=940, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
191 |
+
[2025-04-05 00:33:25,393] [INFO] [timer.py:264:stop] epoch=5/micro_step=120/global_step=940, RunningAvgSamplesPerSec=10.899166352905562, CurrSamplesPerSec=9.46259530076961, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
192 |
+
[2025-04-05 00:34:05,124] [INFO] [logging.py:128:log_dist] [Rank 0] step=950, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
193 |
+
[2025-04-05 00:34:05,125] [INFO] [timer.py:264:stop] epoch=5/micro_step=140/global_step=950, RunningAvgSamplesPerSec=10.868709472254283, CurrSamplesPerSec=8.157583107922177, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
194 |
+
[2025-04-05 00:34:41,083] [INFO] [logging.py:128:log_dist] [Rank 0] step=960, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
195 |
+
[2025-04-05 00:34:41,084] [INFO] [timer.py:264:stop] epoch=5/micro_step=160/global_step=960, RunningAvgSamplesPerSec=10.85179390673428, CurrSamplesPerSec=11.129984184357816, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
196 |
+
[2025-04-05 00:35:18,183] [INFO] [logging.py:128:log_dist] [Rank 0] step=970, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
197 |
+
[2025-04-05 00:35:18,184] [INFO] [timer.py:264:stop] epoch=5/micro_step=180/global_step=970, RunningAvgSamplesPerSec=10.831398090103376, CurrSamplesPerSec=8.407739182923459, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
198 |
+
[2025-04-05 00:35:50,030] [INFO] [logging.py:128:log_dist] [Rank 0] step=980, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
199 |
+
[2025-04-05 00:35:50,031] [INFO] [timer.py:264:stop] epoch=5/micro_step=200/global_step=980, RunningAvgSamplesPerSec=10.830610124260065, CurrSamplesPerSec=12.386895729971858, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
200 |
+
[2025-04-05 00:36:19,966] [INFO] [logging.py:128:log_dist] [Rank 0] step=990, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
201 |
+
[2025-04-05 00:36:19,967] [INFO] [timer.py:264:stop] epoch=5/micro_step=220/global_step=990, RunningAvgSamplesPerSec=10.83462980390363, CurrSamplesPerSec=11.45762219440472, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
202 |
+
[2025-04-05 00:36:49,736] [INFO] [logging.py:128:log_dist] [Rank 0] step=1000, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
203 |
+
[2025-04-05 00:36:49,737] [INFO] [timer.py:264:stop] epoch=5/micro_step=240/global_step=1000, RunningAvgSamplesPerSec=10.838216915293264, CurrSamplesPerSec=12.083053599469995, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
204 |
+
[2025-04-05 00:37:19,659] [INFO] [logging.py:128:log_dist] [Rank 0] step=1010, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
205 |
+
[2025-04-05 00:37:19,659] [INFO] [timer.py:264:stop] epoch=5/micro_step=260/global_step=1010, RunningAvgSamplesPerSec=10.84138178767096, CurrSamplesPerSec=11.283992072611156, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
206 |
+
[2025-04-05 00:37:50,047] [INFO] [logging.py:128:log_dist] [Rank 0] step=1020, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
207 |
+
[2025-04-05 00:37:50,047] [INFO] [timer.py:264:stop] epoch=5/micro_step=280/global_step=1020, RunningAvgSamplesPerSec=10.843045795756481, CurrSamplesPerSec=9.382163129026653, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
208 |
+
[2025-04-05 00:38:21,442] [INFO] [logging.py:128:log_dist] [Rank 0] step=1030, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
209 |
+
[2025-04-05 00:38:21,443] [INFO] [timer.py:264:stop] epoch=5/micro_step=300/global_step=1030, RunningAvgSamplesPerSec=10.841309399376517, CurrSamplesPerSec=10.88982155903671, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
210 |
+
[2025-04-05 00:38:53,361] [INFO] [logging.py:128:log_dist] [Rank 0] step=1040, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
211 |
+
[2025-04-05 00:38:53,362] [INFO] [timer.py:264:stop] epoch=5/micro_step=320/global_step=1040, RunningAvgSamplesPerSec=10.837961689000009, CurrSamplesPerSec=11.827810675137716, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
212 |
+
[2025-04-05 00:39:23,651] [INFO] [logging.py:128:log_dist] [Rank 0] step=1050, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
213 |
+
[2025-04-05 00:39:23,652] [INFO] [timer.py:264:stop] epoch=5/micro_step=340/global_step=1050, RunningAvgSamplesPerSec=10.839644603106759, CurrSamplesPerSec=10.710433778235352, MemAllocated=29.37GB, MaxMemAllocated=56.85GB
|
214 |
+
Saving model to "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end" ...
|
215 |
+
Saving 16-bit model...
|
216 |
+
[2025-04-05 00:39:48,972] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step1056 is about to be saved!
|
217 |
+
[2025-04-05 00:39:48,973] [INFO] [engine.py:3680:save_16bit_model] Saving model weights to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin, tag: global_step1056
|
218 |
+
[2025-04-05 00:39:48,973] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin...
|
219 |
+
[2025-04-05 00:40:02,761] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin.
|
220 |
+
[2025-04-05 00:40:02,761] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1056 is ready now!
|
221 |
+
Model saved!
|
wandb/run-20250404_234514-h2gynfll/files/requirements.txt
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
maskrcnn_benchmark==0.0.0
|
2 |
+
deepspeed==0.16.1
|
3 |
+
uritemplate==4.1.1
|
4 |
+
pyairports==2.1.1
|
5 |
+
partial-json-parser==0.2.1.1.post4
|
6 |
+
tensorboard-data-server==0.7.2
|
7 |
+
pydantic==2.10.3
|
8 |
+
Werkzeug==3.1.3
|
9 |
+
attrs==24.3.0
|
10 |
+
Jinja2==3.1.4
|
11 |
+
email_validator==2.2.0
|
12 |
+
mdit-py-plugins==0.4.2
|
13 |
+
google-api-python-client==2.160.0
|
14 |
+
pandas==2.2.3
|
15 |
+
safehttpx==0.1.6
|
16 |
+
setproctitle==1.3.4
|
17 |
+
dill==0.3.8
|
18 |
+
torchaudio==2.5.1
|
19 |
+
frechet-audio-distance==0.1.2
|
20 |
+
blessed==1.20.0
|
21 |
+
llvmlite==0.43.0
|
22 |
+
litellm==1.60.8
|
23 |
+
nvidia-nvtx-cu12==12.4.127
|
24 |
+
nvidia-cusolver-cu12==11.6.1.9
|
25 |
+
einops==0.8.0
|
26 |
+
datasets==3.2.0
|
27 |
+
pycountry==24.6.1
|
28 |
+
airportsdata==20250224
|
29 |
+
idna==3.10
|
30 |
+
urllib3==2.2.3
|
31 |
+
mpmath==1.3.0
|
32 |
+
wandb==0.19.1
|
33 |
+
certifi==2024.12.14
|
34 |
+
markdown-it-py==3.0.0
|
35 |
+
align-anything==0.0.1.dev0
|
36 |
+
aiohttp==3.11.10
|
37 |
+
fsspec==2024.9.0
|
38 |
+
aiohappyeyeballs==2.4.4
|
39 |
+
httplib2==0.22.0
|
40 |
+
hjson==3.1.0
|
41 |
+
yarl==1.18.3
|
42 |
+
decorator==5.1.1
|
43 |
+
distlib==0.3.9
|
44 |
+
absl-py==2.1.0
|
45 |
+
huggingface-hub==0.27.0
|
46 |
+
memray==1.15.0
|
47 |
+
Pygments==2.18.0
|
48 |
+
soupsieve==2.6
|
49 |
+
shellingham==1.5.4
|
50 |
+
tokenizers==0.21.0
|
51 |
+
uvloop==0.21.0
|
52 |
+
numpy==1.26.4
|
53 |
+
linkify-it-py==2.0.3
|
54 |
+
sympy==1.13.1
|
55 |
+
python-dotenv==1.0.1
|
56 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
57 |
+
tensorboard==2.18.0
|
58 |
+
fastrlock==0.8.3
|
59 |
+
rsa==4.9
|
60 |
+
lm-format-enforcer==0.10.9
|
61 |
+
openai==1.61.1
|
62 |
+
gpustat==1.1.1
|
63 |
+
librosa==0.10.2.post1
|
64 |
+
grpcio-status==1.70.0
|
65 |
+
nvidia-cudnn-cu12==9.1.0.70
|
66 |
+
zipp==3.21.0
|
67 |
+
nvidia-nvjitlink-cu12==12.4.127
|
68 |
+
cupy-cuda12x==13.3.0
|
69 |
+
Markdown==3.7
|
70 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
71 |
+
nvidia-curand-cu12==10.3.5.147
|
72 |
+
rpds-py==0.22.3
|
73 |
+
outlines==0.1.11
|
74 |
+
docker-pycreds==0.4.0
|
75 |
+
distro==1.9.0
|
76 |
+
httpcore==1.0.7
|
77 |
+
gradio==5.9.0
|
78 |
+
google-auth-httplib2==0.2.0
|
79 |
+
iniconfig==2.0.0
|
80 |
+
gitdb==4.0.11
|
81 |
+
jsonschema==4.23.0
|
82 |
+
click==8.1.7
|
83 |
+
ninja==1.11.1.3
|
84 |
+
setuptools==75.6.0
|
85 |
+
audioread==3.0.1
|
86 |
+
frozenlist==1.5.0
|
87 |
+
transformers-stream-generator==0.0.5
|
88 |
+
nvidia-cublas-cu12==12.4.5.8
|
89 |
+
pycparser==2.22
|
90 |
+
GitPython==3.1.43
|
91 |
+
tqdm==4.67.1
|
92 |
+
importlib_metadata==8.5.0
|
93 |
+
patsy==1.0.1
|
94 |
+
networkx==3.4.2
|
95 |
+
semantic-version==2.10.0
|
96 |
+
alpaca_eval==0.6.6
|
97 |
+
google-cloud-core==2.4.1
|
98 |
+
prometheus_client==0.21.1
|
99 |
+
jiter==0.8.2
|
100 |
+
scipy==1.14.1
|
101 |
+
starlette==0.41.3
|
102 |
+
jq==1.8.0
|
103 |
+
opencensus-context==0.1.3
|
104 |
+
cachetools==5.5.1
|
105 |
+
cffi==1.17.1
|
106 |
+
opencv-python-headless==4.10.0.84
|
107 |
+
joblib==1.4.2
|
108 |
+
yt-dlp==2025.1.26
|
109 |
+
python-dateutil==2.9.0.post0
|
110 |
+
httpx==0.28.1
|
111 |
+
msgpack==1.1.0
|
112 |
+
pydub==0.25.1
|
113 |
+
tomlkit==0.13.2
|
114 |
+
nvitop==1.4.2
|
115 |
+
nvidia-cusparse-cu12==12.3.1.170
|
116 |
+
msgspec==0.18.6
|
117 |
+
aiosignal==1.3.2
|
118 |
+
wheel==0.45.1
|
119 |
+
filelock==3.16.1
|
120 |
+
pillow==10.4.0
|
121 |
+
typer==0.15.1
|
122 |
+
websockets==14.1
|
123 |
+
resampy==0.4.3
|
124 |
+
aiofiles==23.2.1
|
125 |
+
aiohttp-cors==0.7.0
|
126 |
+
platformdirs==4.3.6
|
127 |
+
gguf==0.10.0
|
128 |
+
diskcache==5.6.3
|
129 |
+
cloudpickle==3.1.0
|
130 |
+
multidict==6.1.0
|
131 |
+
py-cpuinfo==9.0.0
|
132 |
+
scikit-learn==1.6.0
|
133 |
+
smart-open==7.1.0
|
134 |
+
tiktoken==0.7.0
|
135 |
+
grpcio==1.70.0
|
136 |
+
charset-normalizer==3.4.0
|
137 |
+
nest-asyncio==1.6.0
|
138 |
+
lark==1.2.2
|
139 |
+
beautifulsoup4==4.13.3
|
140 |
+
pip==24.3.1
|
141 |
+
six==1.17.0
|
142 |
+
prometheus-fastapi-instrumentator==7.0.0
|
143 |
+
ruff==0.8.3
|
144 |
+
rich-toolkit==0.13.2
|
145 |
+
lazy_loader==0.4
|
146 |
+
grpc-google-iam-v1==0.14.0
|
147 |
+
psutil==6.1.0
|
148 |
+
mdurl==0.1.2
|
149 |
+
nvidia-nccl-cu12==2.21.5
|
150 |
+
triton==3.1.0
|
151 |
+
torchvision==0.20.1
|
152 |
+
fastapi==0.115.6
|
153 |
+
referencing==0.35.1
|
154 |
+
xxhash==3.5.0
|
155 |
+
pyzmq==26.2.0
|
156 |
+
torchlibrosa==0.1.0
|
157 |
+
googleapis-common-protos==1.66.0
|
158 |
+
pyasn1==0.6.1
|
159 |
+
soundfile==0.12.1
|
160 |
+
pyparsing==3.2.1
|
161 |
+
xgrammar==0.1.11
|
162 |
+
gradio_client==1.5.2
|
163 |
+
watchfiles==1.0.3
|
164 |
+
pluggy==1.5.0
|
165 |
+
py-spy==0.4.0
|
166 |
+
pybind11==2.13.6
|
167 |
+
diffusers==0.31.0
|
168 |
+
sentencepiece==0.2.0
|
169 |
+
flash_attn==2.7.4.post1
|
170 |
+
annotated-types==0.7.0
|
171 |
+
interegular==0.3.3
|
172 |
+
requests==2.32.3
|
173 |
+
opencensus==0.11.4
|
174 |
+
colorful==0.5.6
|
175 |
+
google-api-core==2.24.1
|
176 |
+
pytest==8.3.4
|
177 |
+
dnspython==2.7.0
|
178 |
+
pydantic_core==2.27.1
|
179 |
+
pytz==2024.2
|
180 |
+
pyasn1_modules==0.4.1
|
181 |
+
propcache==0.2.1
|
182 |
+
accelerate==1.2.1
|
183 |
+
fire==0.7.0
|
184 |
+
textual==1.0.0
|
185 |
+
sniffio==1.3.1
|
186 |
+
pyarrow==18.1.0
|
187 |
+
protobuf==5.29.1
|
188 |
+
wcwidth==0.2.13
|
189 |
+
packaging==24.2
|
190 |
+
uvicorn==0.34.0
|
191 |
+
sentry-sdk==2.19.2
|
192 |
+
google-auth==2.38.0
|
193 |
+
typing_extensions==4.12.2
|
194 |
+
peft==0.14.0
|
195 |
+
depyf==0.18.0
|
196 |
+
multiprocess==0.70.16
|
197 |
+
google-cloud-translate==3.19.0
|
198 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
199 |
+
jsonschema-specifications==2024.10.1
|
200 |
+
vllm==0.7.3
|
201 |
+
nvidia-cufft-cu12==11.2.1.3
|
202 |
+
timm==1.0.12
|
203 |
+
rich==13.9.4
|
204 |
+
ffmpy==0.4.0
|
205 |
+
virtualenv==20.29.1
|
206 |
+
tzdata==2024.2
|
207 |
+
smmap==5.0.1
|
208 |
+
uc-micro-py==1.0.3
|
209 |
+
proto-plus==1.26.0
|
210 |
+
soxr==0.5.0.post1
|
211 |
+
h11==0.14.0
|
212 |
+
outlines_core==0.1.26
|
213 |
+
compressed-tensors==0.9.1
|
214 |
+
blake3==1.0.4
|
215 |
+
xformers==0.0.28.post3
|
216 |
+
orjson==3.10.12
|
217 |
+
ray==2.40.0
|
218 |
+
PyYAML==6.0.2
|
219 |
+
nvidia-ml-py==12.560.30
|
220 |
+
python-multipart==0.0.19
|
221 |
+
PySocks==1.7.1
|
222 |
+
regex==2024.11.6
|
223 |
+
pooch==1.8.2
|
224 |
+
termcolor==2.5.0
|
225 |
+
MarkupSafe==2.1.5
|
226 |
+
torch==2.5.1
|
227 |
+
fastapi-cli==0.0.7
|
228 |
+
gdown==5.2.0
|
229 |
+
numba==0.60.0
|
230 |
+
httptools==0.6.4
|
231 |
+
transformers==4.50.0.dev0
|
232 |
+
mistral_common==1.5.1
|
233 |
+
astor==0.8.1
|
234 |
+
anyio==4.7.0
|
235 |
+
safetensors==0.4.5
|
236 |
+
threadpoolctl==3.5.0
|
237 |
+
wrapt==1.17.2
|
238 |
+
wheel==0.43.0
|
239 |
+
jaraco.functools==4.0.1
|
240 |
+
inflect==7.3.1
|
241 |
+
jaraco.text==3.12.1
|
242 |
+
typeguard==4.3.0
|
243 |
+
jaraco.collections==5.1.0
|
244 |
+
importlib_metadata==8.0.0
|
245 |
+
backports.tarfile==1.2.0
|
246 |
+
tomli==2.0.1
|
247 |
+
autocommand==2.2.2
|
248 |
+
platformdirs==4.2.2
|
249 |
+
more-itertools==10.3.0
|
250 |
+
zipp==3.19.2
|
251 |
+
packaging==24.2
|
252 |
+
typing_extensions==4.12.2
|
253 |
+
jaraco.context==5.3.0
|
wandb/run-20250404_234514-h2gynfll/files/wandb-metadata.json
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
3 |
+
"python": "CPython 3.11.0",
|
4 |
+
"startedAt": "2025-04-04T15:45:14.487401Z",
|
5 |
+
"args": [
|
6 |
+
"--local_rank=0",
|
7 |
+
"--model_name_or_path",
|
8 |
+
"/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct",
|
9 |
+
"--train_datasets",
|
10 |
+
"/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset",
|
11 |
+
"--train_split",
|
12 |
+
"train",
|
13 |
+
"--train_template",
|
14 |
+
"Safe_o1",
|
15 |
+
"--output_dir",
|
16 |
+
"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
|
17 |
+
"--log_project",
|
18 |
+
"safe-o1",
|
19 |
+
"--per_device_train_batch_size",
|
20 |
+
"4",
|
21 |
+
"--per_device_eval_batch_size",
|
22 |
+
"4",
|
23 |
+
"--gradient_accumulation_steps",
|
24 |
+
"2",
|
25 |
+
"--learning_rate",
|
26 |
+
"2e-5",
|
27 |
+
"--epochs",
|
28 |
+
"6",
|
29 |
+
"--model_max_length",
|
30 |
+
"16384"
|
31 |
+
],
|
32 |
+
"program": "-m align_anything.trainers.text_to_text.sft",
|
33 |
+
"git": {
|
34 |
+
"remote": "[email protected]:PKU-Alignment/align-anything.git",
|
35 |
+
"commit": "7ee46f54200d18e3c6c37568ba688d3be5ae7619"
|
36 |
+
},
|
37 |
+
"email": "[email protected]",
|
38 |
+
"root": "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
|
39 |
+
"host": "dgx-092",
|
40 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/python3.11",
|
41 |
+
"cpu_count": 112,
|
42 |
+
"cpu_count_logical": 224,
|
43 |
+
"gpu": "NVIDIA H800",
|
44 |
+
"gpu_count": 8,
|
45 |
+
"disk": {
|
46 |
+
"/": {
|
47 |
+
"total": "1888556142592",
|
48 |
+
"used": "149815398400"
|
49 |
+
}
|
50 |
+
},
|
51 |
+
"memory": {
|
52 |
+
"total": "2164195454976"
|
53 |
+
},
|
54 |
+
"cpu": {
|
55 |
+
"count": 112,
|
56 |
+
"countLogical": 224
|
57 |
+
},
|
58 |
+
"gpu_nvidia": [
|
59 |
+
{
|
60 |
+
"name": "NVIDIA H800",
|
61 |
+
"memoryTotal": "85520809984",
|
62 |
+
"cudaCores": 16896,
|
63 |
+
"architecture": "Hopper"
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"name": "NVIDIA H800",
|
67 |
+
"memoryTotal": "85520809984",
|
68 |
+
"cudaCores": 16896,
|
69 |
+
"architecture": "Hopper"
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"name": "NVIDIA H800",
|
73 |
+
"memoryTotal": "85520809984",
|
74 |
+
"cudaCores": 16896,
|
75 |
+
"architecture": "Hopper"
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"name": "NVIDIA H800",
|
79 |
+
"memoryTotal": "85520809984",
|
80 |
+
"cudaCores": 16896,
|
81 |
+
"architecture": "Hopper"
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"name": "NVIDIA H800",
|
85 |
+
"memoryTotal": "85520809984",
|
86 |
+
"cudaCores": 16896,
|
87 |
+
"architecture": "Hopper"
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"name": "NVIDIA H800",
|
91 |
+
"memoryTotal": "85520809984",
|
92 |
+
"cudaCores": 16896,
|
93 |
+
"architecture": "Hopper"
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"name": "NVIDIA H800",
|
97 |
+
"memoryTotal": "85520809984",
|
98 |
+
"cudaCores": 16896,
|
99 |
+
"architecture": "Hopper"
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"name": "NVIDIA H800",
|
103 |
+
"memoryTotal": "85520809984",
|
104 |
+
"cudaCores": 16896,
|
105 |
+
"architecture": "Hopper"
|
106 |
+
}
|
107 |
+
],
|
108 |
+
"slurm": {
|
109 |
+
"conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
|
110 |
+
},
|
111 |
+
"cudaVersion": "12.2"
|
112 |
+
}
|
wandb/run-20250404_234514-h2gynfll/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_runtime":3288.280998361,"_step":2112,"train/step":2112,"train/loss":0.044164832681417465,"train/lr":2e-05,"train/epoch":6,"_wandb":{"runtime":3288},"_timestamp":1.7437847807259746e+09}
|
wandb/run-20250404_234514-h2gynfll/logs/debug-core.log
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-04T23:45:13.890415708+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpznxahgwg/port-2490920.txt","pid":2490920,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2025-04-04T23:45:13.890455897+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2025-04-04T23:45:13.891244261+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2490920}
|
4 |
+
{"time":"2025-04-04T23:45:13.891234959+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44823,"Zone":""}}
|
5 |
+
{"time":"2025-04-04T23:45:14.073532455+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:55784"}
|
6 |
+
{"time":"2025-04-04T23:45:14.4882962+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"h2gynfll","id":"127.0.0.1:55784"}
|
7 |
+
{"time":"2025-04-04T23:45:14.804634542+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"h2gynfll","id":"127.0.0.1:55784"}
|
8 |
+
{"time":"2025-04-05T00:40:05.400228004+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"h2gynfll","id":"127.0.0.1:55784"}
|
9 |
+
{"time":"2025-04-05T00:40:05.400766541+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"h2gynfll","id":"127.0.0.1:55784"}
|
10 |
+
{"time":"2025-04-05T00:40:05.445957116+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:55784"}
|
11 |
+
{"time":"2025-04-05T00:40:05.445971641+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:55784"}
|
12 |
+
{"time":"2025-04-05T00:40:05.445983307+08:00","level":"INFO","msg":"server is shutting down"}
|
13 |
+
{"time":"2025-04-05T00:40:05.446007985+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:55784"}
|
14 |
+
{"time":"2025-04-05T00:40:05.446042966+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:55784"}
|
15 |
+
{"time":"2025-04-05T00:40:05.446045342+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:55784"}
|
16 |
+
{"time":"2025-04-05T00:40:05.446048272+08:00","level":"INFO","msg":"server is closed"}
|
wandb/run-20250404_234514-h2gynfll/logs/debug-internal.log
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-04T23:45:14.489825202+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
|
2 |
+
{"time":"2025-04-04T23:45:14.489982968+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250404_234514-h2gynfll/logs/debug-core.log"}
|
3 |
+
{"time":"2025-04-04T23:45:14.804571426+08:00","level":"INFO","msg":"created new stream","id":"h2gynfll"}
|
4 |
+
{"time":"2025-04-04T23:45:14.804627802+08:00","level":"INFO","msg":"stream: started","id":"h2gynfll"}
|
5 |
+
{"time":"2025-04-04T23:45:14.804640659+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"h2gynfll"}
|
6 |
+
{"time":"2025-04-04T23:45:14.804650005+08:00","level":"INFO","msg":"sender: started","stream_id":"h2gynfll"}
|
7 |
+
{"time":"2025-04-04T23:45:14.804666518+08:00","level":"INFO","msg":"handler: started","stream_id":"h2gynfll"}
|
8 |
+
{"time":"2025-04-04T23:45:15.109983443+08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2025-04-05T00:40:02.76843027+08:00","level":"INFO","msg":"Stopping system monitor"}
|
10 |
+
{"time":"2025-04-05T00:40:02.769177866+08:00","level":"INFO","msg":"Stopped system monitor"}
|
11 |
+
{"time":"2025-04-05T00:40:03.76917444+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading output.log","runtime_seconds":0.106249371,"progress":"38.5KB/38.5KB"},{"desc":"uploading config.yaml","runtime_seconds":0.106241631,"progress":"2.7KB/2.7KB"}],"total_operations":2}}
|
12 |
+
{"time":"2025-04-05T00:40:04.14382192+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2025-04-05T00:40:05.400349491+08:00","level":"INFO","msg":"stream: closing","id":"h2gynfll"}
|
14 |
+
{"time":"2025-04-05T00:40:05.400395814+08:00","level":"INFO","msg":"handler: closed","stream_id":"h2gynfll"}
|
15 |
+
{"time":"2025-04-05T00:40:05.400407886+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"h2gynfll"}
|
16 |
+
{"time":"2025-04-05T00:40:05.400511465+08:00","level":"INFO","msg":"sender: closed","stream_id":"h2gynfll"}
|
17 |
+
{"time":"2025-04-05T00:40:05.400755931+08:00","level":"INFO","msg":"stream: closed","id":"h2gynfll"}
|
wandb/run-20250404_234514-h2gynfll/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
|
2 |
+
2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Configure stats pid to 2490920
|
3 |
+
2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
|
4 |
+
2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
|
5 |
+
2025-04-04 23:45:14,479 INFO MainThread:2490920 [wandb_setup.py:_flush():68] Loading settings from environment variables
|
6 |
+
2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250404_234514-h2gynfll/logs/debug.log
|
7 |
+
2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250404_234514-h2gynfll/logs/debug-internal.log
|
8 |
+
2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:init():644] calling init triggers
|
9 |
+
2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
|
10 |
+
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 6, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_o1', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
|
11 |
+
2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:init():680] starting backend
|
12 |
+
2025-04-04 23:45:14,480 INFO MainThread:2490920 [wandb_init.py:init():684] sending inform_init request
|
13 |
+
2025-04-04 23:45:14,486 INFO MainThread:2490920 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
14 |
+
2025-04-04 23:45:14,487 INFO MainThread:2490920 [wandb_init.py:init():697] backend started and connected
|
15 |
+
2025-04-04 23:45:14,488 INFO MainThread:2490920 [wandb_init.py:init():790] updated telemetry
|
16 |
+
2025-04-04 23:45:14,548 INFO MainThread:2490920 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
|
17 |
+
2025-04-04 23:45:15,103 INFO MainThread:2490920 [wandb_init.py:init():874] starting run threads in backend
|
18 |
+
2025-04-04 23:45:15,485 INFO MainThread:2490920 [wandb_run.py:_console_start():2374] atexit reg
|
19 |
+
2025-04-04 23:45:15,485 INFO MainThread:2490920 [wandb_run.py:_redirect():2224] redirect: wrap_raw
|
20 |
+
2025-04-04 23:45:15,485 INFO MainThread:2490920 [wandb_run.py:_redirect():2289] Wrapping output streams.
|
21 |
+
2025-04-04 23:45:15,485 INFO MainThread:2490920 [wandb_run.py:_redirect():2314] Redirects installed.
|
22 |
+
2025-04-04 23:45:15,490 INFO MainThread:2490920 [wandb_init.py:init():916] run started, returning control to user process
|
23 |
+
2025-04-05 00:40:02,766 INFO MainThread:2490920 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/h2gynfll
|
24 |
+
2025-04-05 00:40:02,767 INFO MainThread:2490920 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
|
25 |
+
2025-04-05 00:40:02,767 INFO MainThread:2490920 [wandb_run.py:_restore():2321] restore
|
26 |
+
2025-04-05 00:40:02,768 INFO MainThread:2490920 [wandb_run.py:_restore():2327] restore done
|
27 |
+
2025-04-05 00:40:05,392 INFO MainThread:2490920 [wandb_run.py:_footer_history_summary_info():3892] rendering history
|
28 |
+
2025-04-05 00:40:05,393 INFO MainThread:2490920 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
|
29 |
+
2025-04-05 00:40:05,399 INFO MainThread:2490920 [wandb_run.py:_footer_sync_info():3853] logging synced files
|
wandb/run-20250404_234514-h2gynfll/run-h2gynfll.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e920529c8e186b25824a9c4ad1050a452aed4e52269e7b7ba6c5fcdd77c8095f
|
3 |
+
size 6833219
|
wandb/run-20250405_124142-wdmxf5un/files/config.yaml
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.19.1
|
4 |
+
m: []
|
5 |
+
python_version: 3.11.0
|
6 |
+
t:
|
7 |
+
"1":
|
8 |
+
- 1
|
9 |
+
- 5
|
10 |
+
- 11
|
11 |
+
- 41
|
12 |
+
- 49
|
13 |
+
- 51
|
14 |
+
- 53
|
15 |
+
- 55
|
16 |
+
- 63
|
17 |
+
- 71
|
18 |
+
- 83
|
19 |
+
- 98
|
20 |
+
- 105
|
21 |
+
"2":
|
22 |
+
- 1
|
23 |
+
- 5
|
24 |
+
- 11
|
25 |
+
- 41
|
26 |
+
- 49
|
27 |
+
- 51
|
28 |
+
- 53
|
29 |
+
- 55
|
30 |
+
- 63
|
31 |
+
- 71
|
32 |
+
- 83
|
33 |
+
- 98
|
34 |
+
- 105
|
35 |
+
"3":
|
36 |
+
- 2
|
37 |
+
- 13
|
38 |
+
- 16
|
39 |
+
- 23
|
40 |
+
- 55
|
41 |
+
- 61
|
42 |
+
"4": 3.11.0
|
43 |
+
"5": 0.19.1
|
44 |
+
"6": 4.50.0.dev0
|
45 |
+
"8":
|
46 |
+
- 5
|
47 |
+
"12": 0.19.1
|
48 |
+
"13": linux-x86_64
|
49 |
+
bnb_cfgs:
|
50 |
+
value:
|
51 |
+
bnb_4bit_compute_dtype: float16
|
52 |
+
bnb_4bit_quant_type: nf4
|
53 |
+
bnb_4bit_use_double_quant: true
|
54 |
+
load_in_4bit: true
|
55 |
+
load_in_8bit: false
|
56 |
+
use_bnb: false
|
57 |
+
data_cfgs:
|
58 |
+
value:
|
59 |
+
eval_optional_args: []
|
60 |
+
train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
|
61 |
+
train_optional_args: []
|
62 |
+
train_split: train
|
63 |
+
train_template: Safe_o1
|
64 |
+
logger_cfgs:
|
65 |
+
value:
|
66 |
+
log_project: safe-o1
|
67 |
+
log_run_name: sft
|
68 |
+
log_type: wandb
|
69 |
+
output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
|
70 |
+
save_interval: 100000
|
71 |
+
lora_cfgs:
|
72 |
+
value:
|
73 |
+
inference_mode: false
|
74 |
+
lora_alpha: 16
|
75 |
+
lora_dropout: 0.1
|
76 |
+
r: 16
|
77 |
+
save_full_model: true
|
78 |
+
target_modules:
|
79 |
+
- q_proj
|
80 |
+
- v_proj
|
81 |
+
task_type: TaskType.CAUSAL_LM
|
82 |
+
use_lora: false
|
83 |
+
model_cfgs:
|
84 |
+
value:
|
85 |
+
model_max_length: 16384
|
86 |
+
model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
|
87 |
+
trust_remote_code: true
|
88 |
+
train_cfgs:
|
89 |
+
value:
|
90 |
+
adam_betas:
|
91 |
+
- 0.9
|
92 |
+
- 0.95
|
93 |
+
adam_epsilon: 1e-08
|
94 |
+
bf16: true
|
95 |
+
ds_cfgs: ds_z3_config.json
|
96 |
+
epochs: 6
|
97 |
+
eval_interval: 10
|
98 |
+
eval_strategy: steps
|
99 |
+
fp16: false
|
100 |
+
gradient_accumulation_steps: 2
|
101 |
+
gradient_checkpointing: true
|
102 |
+
learning_rate: 2e-05
|
103 |
+
lr_scheduler_type: constant
|
104 |
+
lr_warmup_ratio: 0.03
|
105 |
+
max_grad_norm: 1
|
106 |
+
per_device_eval_batch_size: 4
|
107 |
+
per_device_train_batch_size: 4
|
108 |
+
seed: 42
|
109 |
+
weight_decay: 0
|
wandb/run-20250405_124142-wdmxf5un/files/output.log
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
***** Running training *****
|
2 |
+
Training 1/6 epoch: 0%| | 0/1056 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
3 |
+
Training 2/6 epoch (loss 0.6039): 30%|█████████████████████████▋ | 319/1056 [09:05<19:54, 1.62s/it]
|
4 |
+
[2025-04-05 12:42:22,487] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
5 |
+
[2025-04-05 12:42:22,487] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=20.430520612254707, CurrSamplesPerSec=19.73994310767666, MemAllocated=15.18GB, MaxMemAllocated=36.69GB
|
6 |
+
[2025-04-05 12:42:55,483] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
7 |
+
[2025-04-05 12:42:55,483] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=20.330484556424526, CurrSamplesPerSec=19.749929472671457, MemAllocated=15.18GB, MaxMemAllocated=36.69GB
|
8 |
+
[2025-04-05 12:43:32,390] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
9 |
+
[2025-04-05 12:43:32,391] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=19.490752396407714, CurrSamplesPerSec=22.631995577606887, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
10 |
+
[2025-04-05 12:44:04,939] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
11 |
+
[2025-04-05 12:44:04,940] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=19.787231072811785, CurrSamplesPerSec=21.355656210314816, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
12 |
+
[2025-04-05 12:44:38,138] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
13 |
+
[2025-04-05 12:44:38,139] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=19.87305509037746, CurrSamplesPerSec=23.09551491786551, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
14 |
+
[2025-04-05 12:45:11,418] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
15 |
+
[2025-04-05 12:45:11,419] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=19.91152284007933, CurrSamplesPerSec=17.560894866383286, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
16 |
+
[2025-04-05 12:45:44,654] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
17 |
+
[2025-04-05 12:45:44,654] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=19.944567941119995, CurrSamplesPerSec=17.906646151944845, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
18 |
+
[2025-04-05 12:46:19,225] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
19 |
+
[2025-04-05 12:46:19,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=19.873932115504267, CurrSamplesPerSec=20.010007567016135, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
20 |
+
[2025-04-05 12:46:53,851] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
21 |
+
[2025-04-05 12:46:53,852] [INFO] [timer.py:264:stop] epoch=1/micro_step=4/global_step=90, RunningAvgSamplesPerSec=19.806830734808837, CurrSamplesPerSec=14.898176757575056, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
22 |
+
[2025-04-05 12:47:26,840] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
23 |
+
[2025-04-05 12:47:26,840] [INFO] [timer.py:264:stop] epoch=1/micro_step=24/global_step=100, RunningAvgSamplesPerSec=19.86084409306565, CurrSamplesPerSec=20.25306438527985, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
24 |
+
[2025-04-05 12:48:00,963] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
25 |
+
[2025-04-05 12:48:00,964] [INFO] [timer.py:264:stop] epoch=1/micro_step=44/global_step=110, RunningAvgSamplesPerSec=19.83197755374019, CurrSamplesPerSec=14.796357199340614, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
26 |
+
[2025-04-05 12:48:36,263] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
27 |
+
[2025-04-05 12:48:36,263] [INFO] [timer.py:264:stop] epoch=1/micro_step=64/global_step=120, RunningAvgSamplesPerSec=19.755458014995966, CurrSamplesPerSec=18.805691707671684, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
28 |
+
[2025-04-05 12:49:09,681] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
29 |
+
[2025-04-05 12:49:09,681] [INFO] [timer.py:264:stop] epoch=1/micro_step=84/global_step=130, RunningAvgSamplesPerSec=19.780419146519836, CurrSamplesPerSec=19.690861454132612, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
30 |
+
[2025-04-05 12:49:43,000] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
31 |
+
[2025-04-05 12:49:43,001] [INFO] [timer.py:264:stop] epoch=1/micro_step=104/global_step=140, RunningAvgSamplesPerSec=19.798657335646936, CurrSamplesPerSec=18.638907091791424, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
32 |
+
[2025-04-05 12:50:16,076] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
33 |
+
[2025-04-05 12:50:16,077] [INFO] [timer.py:264:stop] epoch=1/micro_step=124/global_step=150, RunningAvgSamplesPerSec=19.831999352959354, CurrSamplesPerSec=21.487376178731257, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
34 |
+
[2025-04-05 12:50:49,560] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
35 |
+
[2025-04-05 12:50:49,561] [INFO] [timer.py:264:stop] epoch=1/micro_step=144/global_step=160, RunningAvgSamplesPerSec=19.84491781415368, CurrSamplesPerSec=25.33945343262647, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
36 |
+
[2025-04-05 12:51:23,805] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
37 |
+
[2025-04-05 12:51:23,805] [INFO] [timer.py:264:stop] epoch=1/micro_step=164/global_step=170, RunningAvgSamplesPerSec=19.826904278206033, CurrSamplesPerSec=22.45413646824439, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
38 |
+
[2025-04-05 12:51:58,336] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
39 |
+
[2025-04-05 12:51:58,337] [INFO] [timer.py:264:stop] epoch=2/micro_step=8/global_step=180, RunningAvgSamplesPerSec=19.798618570342683, CurrSamplesPerSec=21.417902222668886, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
40 |
+
[2025-04-05 12:52:33,550] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
41 |
+
[2025-04-05 12:52:33,550] [INFO] [timer.py:264:stop] epoch=2/micro_step=28/global_step=190, RunningAvgSamplesPerSec=19.756965411737134, CurrSamplesPerSec=14.200132325260137, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
42 |
+
[2025-04-05 12:53:05,558] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
43 |
+
[2025-04-05 12:53:05,559] [INFO] [timer.py:264:stop] epoch=2/micro_step=48/global_step=200, RunningAvgSamplesPerSec=19.81483548351977, CurrSamplesPerSec=23.8050812324985, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
44 |
+
[2025-04-05 12:53:41,206] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
45 |
+
[2025-04-05 12:53:41,207] [INFO] [timer.py:264:stop] epoch=2/micro_step=68/global_step=210, RunningAvgSamplesPerSec=19.76297283536241, CurrSamplesPerSec=21.033590133844925, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
46 |
+
[2025-04-05 12:54:15,555] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
47 |
+
[2025-04-05 12:54:15,556] [INFO] [timer.py:264:stop] epoch=2/micro_step=88/global_step=220, RunningAvgSamplesPerSec=19.74868849123472, CurrSamplesPerSec=19.775005201497734, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
48 |
+
[2025-04-05 12:54:48,256] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
49 |
+
[2025-04-05 12:54:48,256] [INFO] [timer.py:264:stop] epoch=2/micro_step=108/global_step=230, RunningAvgSamplesPerSec=19.782728073976745, CurrSamplesPerSec=19.69564649230523, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
50 |
+
[2025-04-05 12:55:20,678] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
51 |
+
[2025-04-05 12:55:20,679] [INFO] [timer.py:264:stop] epoch=2/micro_step=128/global_step=240, RunningAvgSamplesPerSec=19.81915055199136, CurrSamplesPerSec=20.713842718681395, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
52 |
+
[2025-04-05 12:55:54,358] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
53 |
+
[2025-04-05 12:55:54,359] [INFO] [timer.py:264:stop] epoch=2/micro_step=148/global_step=250, RunningAvgSamplesPerSec=19.82191350770802, CurrSamplesPerSec=16.07218913804267, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
54 |
+
[2025-04-05 12:56:28,583] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
55 |
+
[2025-04-05 12:56:28,584] [INFO] [timer.py:264:stop] epoch=2/micro_step=168/global_step=260, RunningAvgSamplesPerSec=19.81595565426552, CurrSamplesPerSec=21.231867154990216, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
56 |
+
[2025-04-05 12:57:02,786] [INFO] [logging.py:128:log_dist] [Rank 0] step=270, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
57 |
+
[2025-04-05 12:57:02,786] [INFO] [timer.py:264:stop] epoch=3/micro_step=12/global_step=270, RunningAvgSamplesPerSec=19.810062754202114, CurrSamplesPerSec=19.989395411519066, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
58 |
+
[2025-04-05 12:57:37,851] [INFO] [logging.py:128:log_dist] [Rank 0] step=280, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
59 |
+
[2025-04-05 12:57:37,851] [INFO] [timer.py:264:stop] epoch=3/micro_step=32/global_step=280, RunningAvgSamplesPerSec=19.782856718965995, CurrSamplesPerSec=21.03165872299883, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
60 |
+
[2025-04-05 12:58:12,693] [INFO] [logging.py:128:log_dist] [Rank 0] step=290, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
61 |
+
[2025-04-05 12:58:12,693] [INFO] [timer.py:264:stop] epoch=3/micro_step=52/global_step=290, RunningAvgSamplesPerSec=19.758748532769488, CurrSamplesPerSec=13.064895896239026, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
62 |
+
[2025-04-05 12:58:46,457] [INFO] [logging.py:128:log_dist] [Rank 0] step=300, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
63 |
+
[2025-04-05 12:58:46,457] [INFO] [timer.py:264:stop] epoch=3/micro_step=72/global_step=300, RunningAvgSamplesPerSec=19.76271123968894, CurrSamplesPerSec=18.71946301781583, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
64 |
+
[2025-04-05 12:59:20,298] [INFO] [logging.py:128:log_dist] [Rank 0] step=310, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
65 |
+
[2025-04-05 12:59:20,298] [INFO] [timer.py:264:stop] epoch=3/micro_step=92/global_step=310, RunningAvgSamplesPerSec=19.765042125767838, CurrSamplesPerSec=18.12731313727023, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
66 |
+
[2025-04-05 12:59:53,130] [INFO] [logging.py:128:log_dist] [Rank 0] step=320, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
67 |
+
[2025-04-05 12:59:53,130] [INFO] [timer.py:264:stop] epoch=3/micro_step=112/global_step=320, RunningAvgSamplesPerSec=19.785059722720725, CurrSamplesPerSec=18.162258241524817, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
68 |
+
[2025-04-05 13:00:25,014] [INFO] [logging.py:128:log_dist] [Rank 0] step=330, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
69 |
+
[2025-04-05 13:00:25,015] [INFO] [timer.py:264:stop] epoch=3/micro_step=132/global_step=330, RunningAvgSamplesPerSec=19.822856898324304, CurrSamplesPerSec=23.68279231225561, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
70 |
+
[2025-04-05 13:01:00,162] [INFO] [logging.py:128:log_dist] [Rank 0] step=340, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
71 |
+
[2025-04-05 13:01:00,163] [INFO] [timer.py:264:stop] epoch=3/micro_step=152/global_step=340, RunningAvgSamplesPerSec=19.79772228473608, CurrSamplesPerSec=16.10940253591632, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
72 |
+
[2025-04-05 13:01:33,503] [INFO] [logging.py:128:log_dist] [Rank 0] step=350, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
73 |
+
[2025-04-05 13:01:33,503] [INFO] [timer.py:264:stop] epoch=3/micro_step=172/global_step=350, RunningAvgSamplesPerSec=19.80785855020464, CurrSamplesPerSec=21.302828589697363, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
74 |
+
[2025-04-05 13:02:07,911] [INFO] [logging.py:128:log_dist] [Rank 0] step=360, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
75 |
+
[2025-04-05 13:02:07,912] [INFO] [timer.py:264:stop] epoch=4/micro_step=16/global_step=360, RunningAvgSamplesPerSec=19.79618308624211, CurrSamplesPerSec=19.263544613759976, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
76 |
+
[2025-04-05 13:02:41,115] [INFO] [logging.py:128:log_dist] [Rank 0] step=370, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
77 |
+
[2025-04-05 13:02:41,115] [INFO] [timer.py:264:stop] epoch=4/micro_step=36/global_step=370, RunningAvgSamplesPerSec=19.80650332458013, CurrSamplesPerSec=23.701802484868033, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
78 |
+
[2025-04-05 13:03:18,843] [INFO] [logging.py:128:log_dist] [Rank 0] step=380, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
79 |
+
[2025-04-05 13:03:18,843] [INFO] [timer.py:264:stop] epoch=4/micro_step=56/global_step=380, RunningAvgSamplesPerSec=19.743233696848424, CurrSamplesPerSec=16.022804105468715, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
80 |
+
[2025-04-05 13:03:50,717] [INFO] [logging.py:128:log_dist] [Rank 0] step=390, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
81 |
+
[2025-04-05 13:03:50,717] [INFO] [timer.py:264:stop] epoch=4/micro_step=76/global_step=390, RunningAvgSamplesPerSec=19.777954611746445, CurrSamplesPerSec=21.092219908245035, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
82 |
+
[2025-04-05 13:04:24,195] [INFO] [logging.py:128:log_dist] [Rank 0] step=400, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
83 |
+
[2025-04-05 13:04:24,195] [INFO] [timer.py:264:stop] epoch=4/micro_step=96/global_step=400, RunningAvgSamplesPerSec=19.782590618091508, CurrSamplesPerSec=22.82894504523341, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
84 |
+
[2025-04-05 13:04:56,685] [INFO] [logging.py:128:log_dist] [Rank 0] step=410, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
85 |
+
[2025-04-05 13:04:56,686] [INFO] [timer.py:264:stop] epoch=4/micro_step=116/global_step=410, RunningAvgSamplesPerSec=19.802987040271688, CurrSamplesPerSec=25.181649421025643, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
86 |
+
[2025-04-05 13:05:29,407] [INFO] [logging.py:128:log_dist] [Rank 0] step=420, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
87 |
+
[2025-04-05 13:05:29,407] [INFO] [timer.py:264:stop] epoch=4/micro_step=136/global_step=420, RunningAvgSamplesPerSec=19.820600060602313, CurrSamplesPerSec=19.750884196105687, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
88 |
+
[2025-04-05 13:06:04,461] [INFO] [logging.py:128:log_dist] [Rank 0] step=430, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
89 |
+
[2025-04-05 13:06:04,461] [INFO] [timer.py:264:stop] epoch=4/micro_step=156/global_step=430, RunningAvgSamplesPerSec=19.804042411655413, CurrSamplesPerSec=22.677130413597084, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
90 |
+
[2025-04-05 13:06:37,387] [INFO] [logging.py:128:log_dist] [Rank 0] step=440, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
91 |
+
[2025-04-05 13:06:37,388] [INFO] [timer.py:264:stop] epoch=4/micro_step=176/global_step=440, RunningAvgSamplesPerSec=19.818469015439167, CurrSamplesPerSec=22.638250216890107, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
92 |
+
[2025-04-05 13:07:12,029] [INFO] [logging.py:128:log_dist] [Rank 0] step=450, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
93 |
+
[2025-04-05 13:07:12,030] [INFO] [timer.py:264:stop] epoch=5/micro_step=20/global_step=450, RunningAvgSamplesPerSec=19.80636742741741, CurrSamplesPerSec=19.86476517052075, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
94 |
+
[2025-04-05 13:07:44,787] [INFO] [logging.py:128:log_dist] [Rank 0] step=460, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
95 |
+
[2025-04-05 13:07:44,787] [INFO] [timer.py:264:stop] epoch=5/micro_step=40/global_step=460, RunningAvgSamplesPerSec=19.820052969987852, CurrSamplesPerSec=19.756540318386058, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
96 |
+
[2025-04-05 13:08:21,559] [INFO] [logging.py:128:log_dist] [Rank 0] step=470, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
97 |
+
[2025-04-05 13:08:21,560] [INFO] [timer.py:264:stop] epoch=5/micro_step=60/global_step=470, RunningAvgSamplesPerSec=19.781581527254854, CurrSamplesPerSec=22.70716870525464, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
98 |
+
[2025-04-05 13:08:54,264] [INFO] [logging.py:128:log_dist] [Rank 0] step=480, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
99 |
+
[2025-04-05 13:08:54,264] [INFO] [timer.py:264:stop] epoch=5/micro_step=80/global_step=480, RunningAvgSamplesPerSec=19.79817588971848, CurrSamplesPerSec=21.579297206481634, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
100 |
+
[2025-04-05 13:09:27,633] [INFO] [logging.py:128:log_dist] [Rank 0] step=490, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
101 |
+
[2025-04-05 13:09:27,633] [INFO] [timer.py:264:stop] epoch=5/micro_step=100/global_step=490, RunningAvgSamplesPerSec=19.80181812039545, CurrSamplesPerSec=23.258142067239728, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
102 |
+
[2025-04-05 13:10:00,647] [INFO] [logging.py:128:log_dist] [Rank 0] step=500, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
103 |
+
[2025-04-05 13:10:00,648] [INFO] [timer.py:264:stop] epoch=5/micro_step=120/global_step=500, RunningAvgSamplesPerSec=19.811462542719266, CurrSamplesPerSec=18.30783674776777, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
104 |
+
[2025-04-05 13:10:33,720] [INFO] [logging.py:128:log_dist] [Rank 0] step=510, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
105 |
+
[2025-04-05 13:10:33,721] [INFO] [timer.py:264:stop] epoch=5/micro_step=140/global_step=510, RunningAvgSamplesPerSec=19.821325678536503, CurrSamplesPerSec=18.060156227360213, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
106 |
+
[2025-04-05 13:11:08,188] [INFO] [logging.py:128:log_dist] [Rank 0] step=520, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
107 |
+
[2025-04-05 13:11:08,188] [INFO] [timer.py:264:stop] epoch=5/micro_step=160/global_step=520, RunningAvgSamplesPerSec=19.81436114478211, CurrSamplesPerSec=19.79953860428667, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
108 |
+
Saving model to "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end" ...
|
109 |
+
Saving 16-bit model...
|
110 |
+
[2025-04-05 13:11:42,501] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step528 is about to be saved!
|
111 |
+
[2025-04-05 13:11:42,503] [INFO] [engine.py:3680:save_16bit_model] Saving model weights to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin, tag: global_step528
|
112 |
+
[2025-04-05 13:11:42,503] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin...
|
113 |
+
[2025-04-05 13:11:55,172] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin.
|
114 |
+
[2025-04-05 13:11:55,172] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step528 is ready now!
|
115 |
+
Model saved!
|
wandb/run-20250405_124142-wdmxf5un/files/requirements.txt
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
maskrcnn_benchmark==0.0.0
|
2 |
+
deepspeed==0.16.1
|
3 |
+
uritemplate==4.1.1
|
4 |
+
pyairports==2.1.1
|
5 |
+
partial-json-parser==0.2.1.1.post4
|
6 |
+
tensorboard-data-server==0.7.2
|
7 |
+
pydantic==2.10.3
|
8 |
+
Werkzeug==3.1.3
|
9 |
+
attrs==24.3.0
|
10 |
+
Jinja2==3.1.4
|
11 |
+
email_validator==2.2.0
|
12 |
+
mdit-py-plugins==0.4.2
|
13 |
+
google-api-python-client==2.160.0
|
14 |
+
pandas==2.2.3
|
15 |
+
safehttpx==0.1.6
|
16 |
+
setproctitle==1.3.4
|
17 |
+
dill==0.3.8
|
18 |
+
torchaudio==2.5.1
|
19 |
+
frechet-audio-distance==0.1.2
|
20 |
+
blessed==1.20.0
|
21 |
+
llvmlite==0.43.0
|
22 |
+
litellm==1.60.8
|
23 |
+
nvidia-nvtx-cu12==12.4.127
|
24 |
+
nvidia-cusolver-cu12==11.6.1.9
|
25 |
+
einops==0.8.0
|
26 |
+
datasets==3.2.0
|
27 |
+
pycountry==24.6.1
|
28 |
+
airportsdata==20250224
|
29 |
+
idna==3.10
|
30 |
+
urllib3==2.2.3
|
31 |
+
mpmath==1.3.0
|
32 |
+
wandb==0.19.1
|
33 |
+
certifi==2024.12.14
|
34 |
+
markdown-it-py==3.0.0
|
35 |
+
align-anything==0.0.1.dev0
|
36 |
+
aiohttp==3.11.10
|
37 |
+
fsspec==2024.9.0
|
38 |
+
aiohappyeyeballs==2.4.4
|
39 |
+
httplib2==0.22.0
|
40 |
+
hjson==3.1.0
|
41 |
+
yarl==1.18.3
|
42 |
+
decorator==5.1.1
|
43 |
+
distlib==0.3.9
|
44 |
+
absl-py==2.1.0
|
45 |
+
huggingface-hub==0.27.0
|
46 |
+
memray==1.15.0
|
47 |
+
Pygments==2.18.0
|
48 |
+
soupsieve==2.6
|
49 |
+
shellingham==1.5.4
|
50 |
+
tokenizers==0.21.0
|
51 |
+
uvloop==0.21.0
|
52 |
+
numpy==1.26.4
|
53 |
+
linkify-it-py==2.0.3
|
54 |
+
sympy==1.13.1
|
55 |
+
python-dotenv==1.0.1
|
56 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
57 |
+
tensorboard==2.18.0
|
58 |
+
fastrlock==0.8.3
|
59 |
+
rsa==4.9
|
60 |
+
lm-format-enforcer==0.10.9
|
61 |
+
openai==1.61.1
|
62 |
+
gpustat==1.1.1
|
63 |
+
librosa==0.10.2.post1
|
64 |
+
grpcio-status==1.70.0
|
65 |
+
nvidia-cudnn-cu12==9.1.0.70
|
66 |
+
zipp==3.21.0
|
67 |
+
nvidia-nvjitlink-cu12==12.4.127
|
68 |
+
cupy-cuda12x==13.3.0
|
69 |
+
Markdown==3.7
|
70 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
71 |
+
nvidia-curand-cu12==10.3.5.147
|
72 |
+
rpds-py==0.22.3
|
73 |
+
outlines==0.1.11
|
74 |
+
docker-pycreds==0.4.0
|
75 |
+
distro==1.9.0
|
76 |
+
httpcore==1.0.7
|
77 |
+
gradio==5.9.0
|
78 |
+
google-auth-httplib2==0.2.0
|
79 |
+
iniconfig==2.0.0
|
80 |
+
gitdb==4.0.11
|
81 |
+
jsonschema==4.23.0
|
82 |
+
click==8.1.7
|
83 |
+
ninja==1.11.1.3
|
84 |
+
setuptools==75.6.0
|
85 |
+
audioread==3.0.1
|
86 |
+
frozenlist==1.5.0
|
87 |
+
transformers-stream-generator==0.0.5
|
88 |
+
nvidia-cublas-cu12==12.4.5.8
|
89 |
+
pycparser==2.22
|
90 |
+
GitPython==3.1.43
|
91 |
+
tqdm==4.67.1
|
92 |
+
importlib_metadata==8.5.0
|
93 |
+
patsy==1.0.1
|
94 |
+
networkx==3.4.2
|
95 |
+
semantic-version==2.10.0
|
96 |
+
alpaca_eval==0.6.6
|
97 |
+
google-cloud-core==2.4.1
|
98 |
+
prometheus_client==0.21.1
|
99 |
+
jiter==0.8.2
|
100 |
+
scipy==1.14.1
|
101 |
+
starlette==0.41.3
|
102 |
+
jq==1.8.0
|
103 |
+
opencensus-context==0.1.3
|
104 |
+
cachetools==5.5.1
|
105 |
+
cffi==1.17.1
|
106 |
+
opencv-python-headless==4.10.0.84
|
107 |
+
joblib==1.4.2
|
108 |
+
yt-dlp==2025.1.26
|
109 |
+
python-dateutil==2.9.0.post0
|
110 |
+
httpx==0.28.1
|
111 |
+
msgpack==1.1.0
|
112 |
+
pydub==0.25.1
|
113 |
+
tomlkit==0.13.2
|
114 |
+
nvitop==1.4.2
|
115 |
+
nvidia-cusparse-cu12==12.3.1.170
|
116 |
+
msgspec==0.18.6
|
117 |
+
aiosignal==1.3.2
|
118 |
+
wheel==0.45.1
|
119 |
+
filelock==3.16.1
|
120 |
+
pillow==10.4.0
|
121 |
+
typer==0.15.1
|
122 |
+
websockets==14.1
|
123 |
+
resampy==0.4.3
|
124 |
+
aiofiles==23.2.1
|
125 |
+
aiohttp-cors==0.7.0
|
126 |
+
platformdirs==4.3.6
|
127 |
+
gguf==0.10.0
|
128 |
+
diskcache==5.6.3
|
129 |
+
cloudpickle==3.1.0
|
130 |
+
multidict==6.1.0
|
131 |
+
py-cpuinfo==9.0.0
|
132 |
+
scikit-learn==1.6.0
|
133 |
+
smart-open==7.1.0
|
134 |
+
tiktoken==0.7.0
|
135 |
+
grpcio==1.70.0
|
136 |
+
charset-normalizer==3.4.0
|
137 |
+
nest-asyncio==1.6.0
|
138 |
+
lark==1.2.2
|
139 |
+
beautifulsoup4==4.13.3
|
140 |
+
pip==24.3.1
|
141 |
+
six==1.17.0
|
142 |
+
prometheus-fastapi-instrumentator==7.0.0
|
143 |
+
ruff==0.8.3
|
144 |
+
rich-toolkit==0.13.2
|
145 |
+
lazy_loader==0.4
|
146 |
+
grpc-google-iam-v1==0.14.0
|
147 |
+
psutil==6.1.0
|
148 |
+
mdurl==0.1.2
|
149 |
+
nvidia-nccl-cu12==2.21.5
|
150 |
+
triton==3.1.0
|
151 |
+
torchvision==0.20.1
|
152 |
+
fastapi==0.115.6
|
153 |
+
referencing==0.35.1
|
154 |
+
xxhash==3.5.0
|
155 |
+
pyzmq==26.2.0
|
156 |
+
torchlibrosa==0.1.0
|
157 |
+
googleapis-common-protos==1.66.0
|
158 |
+
pyasn1==0.6.1
|
159 |
+
soundfile==0.12.1
|
160 |
+
pyparsing==3.2.1
|
161 |
+
xgrammar==0.1.11
|
162 |
+
gradio_client==1.5.2
|
163 |
+
watchfiles==1.0.3
|
164 |
+
pluggy==1.5.0
|
165 |
+
py-spy==0.4.0
|
166 |
+
pybind11==2.13.6
|
167 |
+
diffusers==0.31.0
|
168 |
+
sentencepiece==0.2.0
|
169 |
+
flash_attn==2.7.4.post1
|
170 |
+
annotated-types==0.7.0
|
171 |
+
interegular==0.3.3
|
172 |
+
requests==2.32.3
|
173 |
+
opencensus==0.11.4
|
174 |
+
colorful==0.5.6
|
175 |
+
google-api-core==2.24.1
|
176 |
+
pytest==8.3.4
|
177 |
+
dnspython==2.7.0
|
178 |
+
pydantic_core==2.27.1
|
179 |
+
pytz==2024.2
|
180 |
+
pyasn1_modules==0.4.1
|
181 |
+
propcache==0.2.1
|
182 |
+
accelerate==1.2.1
|
183 |
+
fire==0.7.0
|
184 |
+
textual==1.0.0
|
185 |
+
sniffio==1.3.1
|
186 |
+
pyarrow==18.1.0
|
187 |
+
protobuf==5.29.1
|
188 |
+
wcwidth==0.2.13
|
189 |
+
packaging==24.2
|
190 |
+
uvicorn==0.34.0
|
191 |
+
sentry-sdk==2.19.2
|
192 |
+
google-auth==2.38.0
|
193 |
+
typing_extensions==4.12.2
|
194 |
+
peft==0.14.0
|
195 |
+
depyf==0.18.0
|
196 |
+
multiprocess==0.70.16
|
197 |
+
google-cloud-translate==3.19.0
|
198 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
199 |
+
jsonschema-specifications==2024.10.1
|
200 |
+
vllm==0.7.3
|
201 |
+
nvidia-cufft-cu12==11.2.1.3
|
202 |
+
timm==1.0.12
|
203 |
+
rich==13.9.4
|
204 |
+
ffmpy==0.4.0
|
205 |
+
virtualenv==20.29.1
|
206 |
+
tzdata==2024.2
|
207 |
+
smmap==5.0.1
|
208 |
+
uc-micro-py==1.0.3
|
209 |
+
proto-plus==1.26.0
|
210 |
+
soxr==0.5.0.post1
|
211 |
+
h11==0.14.0
|
212 |
+
outlines_core==0.1.26
|
213 |
+
compressed-tensors==0.9.1
|
214 |
+
blake3==1.0.4
|
215 |
+
xformers==0.0.28.post3
|
216 |
+
orjson==3.10.12
|
217 |
+
ray==2.40.0
|
218 |
+
PyYAML==6.0.2
|
219 |
+
nvidia-ml-py==12.560.30
|
220 |
+
python-multipart==0.0.19
|
221 |
+
PySocks==1.7.1
|
222 |
+
regex==2024.11.6
|
223 |
+
pooch==1.8.2
|
224 |
+
termcolor==2.5.0
|
225 |
+
MarkupSafe==2.1.5
|
226 |
+
torch==2.5.1
|
227 |
+
fastapi-cli==0.0.7
|
228 |
+
gdown==5.2.0
|
229 |
+
numba==0.60.0
|
230 |
+
httptools==0.6.4
|
231 |
+
transformers==4.50.0.dev0
|
232 |
+
mistral_common==1.5.1
|
233 |
+
astor==0.8.1
|
234 |
+
anyio==4.7.0
|
235 |
+
safetensors==0.4.5
|
236 |
+
threadpoolctl==3.5.0
|
237 |
+
wrapt==1.17.2
|
238 |
+
wheel==0.43.0
|
239 |
+
jaraco.functools==4.0.1
|
240 |
+
inflect==7.3.1
|
241 |
+
jaraco.text==3.12.1
|
242 |
+
typeguard==4.3.0
|
243 |
+
jaraco.collections==5.1.0
|
244 |
+
importlib_metadata==8.0.0
|
245 |
+
backports.tarfile==1.2.0
|
246 |
+
tomli==2.0.1
|
247 |
+
autocommand==2.2.2
|
248 |
+
platformdirs==4.2.2
|
249 |
+
more-itertools==10.3.0
|
250 |
+
zipp==3.19.2
|
251 |
+
packaging==24.2
|
252 |
+
typing_extensions==4.12.2
|
253 |
+
jaraco.context==5.3.0
|
wandb/run-20250405_124142-wdmxf5un/files/wandb-metadata.json
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
3 |
+
"python": "CPython 3.11.0",
|
4 |
+
"startedAt": "2025-04-05T04:41:42.080694Z",
|
5 |
+
"args": [
|
6 |
+
"--local_rank=0",
|
7 |
+
"--model_name_or_path",
|
8 |
+
"/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct",
|
9 |
+
"--train_datasets",
|
10 |
+
"/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset",
|
11 |
+
"--train_split",
|
12 |
+
"train",
|
13 |
+
"--train_template",
|
14 |
+
"Safe_o1",
|
15 |
+
"--output_dir",
|
16 |
+
"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
|
17 |
+
"--log_project",
|
18 |
+
"safe-o1",
|
19 |
+
"--per_device_train_batch_size",
|
20 |
+
"4",
|
21 |
+
"--per_device_eval_batch_size",
|
22 |
+
"4",
|
23 |
+
"--gradient_accumulation_steps",
|
24 |
+
"2",
|
25 |
+
"--learning_rate",
|
26 |
+
"2e-5",
|
27 |
+
"--epochs",
|
28 |
+
"6",
|
29 |
+
"--model_max_length",
|
30 |
+
"16384"
|
31 |
+
],
|
32 |
+
"program": "-m align_anything.trainers.text_to_text.sft",
|
33 |
+
"git": {
|
34 |
+
"remote": "[email protected]:PKU-Alignment/align-anything.git",
|
35 |
+
"commit": "7ee46f54200d18e3c6c37568ba688d3be5ae7619"
|
36 |
+
},
|
37 |
+
"email": "[email protected]",
|
38 |
+
"root": "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
|
39 |
+
"host": "dgx-092",
|
40 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/python3.11",
|
41 |
+
"cpu_count": 112,
|
42 |
+
"cpu_count_logical": 224,
|
43 |
+
"gpu": "NVIDIA H800",
|
44 |
+
"gpu_count": 8,
|
45 |
+
"disk": {
|
46 |
+
"/": {
|
47 |
+
"total": "1888556142592",
|
48 |
+
"used": "149928067072"
|
49 |
+
}
|
50 |
+
},
|
51 |
+
"memory": {
|
52 |
+
"total": "2164195454976"
|
53 |
+
},
|
54 |
+
"cpu": {
|
55 |
+
"count": 112,
|
56 |
+
"countLogical": 224
|
57 |
+
},
|
58 |
+
"gpu_nvidia": [
|
59 |
+
{
|
60 |
+
"name": "NVIDIA H800",
|
61 |
+
"memoryTotal": "85520809984",
|
62 |
+
"cudaCores": 16896,
|
63 |
+
"architecture": "Hopper"
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"name": "NVIDIA H800",
|
67 |
+
"memoryTotal": "85520809984",
|
68 |
+
"cudaCores": 16896,
|
69 |
+
"architecture": "Hopper"
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"name": "NVIDIA H800",
|
73 |
+
"memoryTotal": "85520809984",
|
74 |
+
"cudaCores": 16896,
|
75 |
+
"architecture": "Hopper"
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"name": "NVIDIA H800",
|
79 |
+
"memoryTotal": "85520809984",
|
80 |
+
"cudaCores": 16896,
|
81 |
+
"architecture": "Hopper"
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"name": "NVIDIA H800",
|
85 |
+
"memoryTotal": "85520809984",
|
86 |
+
"cudaCores": 16896,
|
87 |
+
"architecture": "Hopper"
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"name": "NVIDIA H800",
|
91 |
+
"memoryTotal": "85520809984",
|
92 |
+
"cudaCores": 16896,
|
93 |
+
"architecture": "Hopper"
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"name": "NVIDIA H800",
|
97 |
+
"memoryTotal": "85520809984",
|
98 |
+
"cudaCores": 16896,
|
99 |
+
"architecture": "Hopper"
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"name": "NVIDIA H800",
|
103 |
+
"memoryTotal": "85520809984",
|
104 |
+
"cudaCores": 16896,
|
105 |
+
"architecture": "Hopper"
|
106 |
+
}
|
107 |
+
],
|
108 |
+
"slurm": {
|
109 |
+
"conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
|
110 |
+
},
|
111 |
+
"cudaVersion": "12.2"
|
112 |
+
}
|
wandb/run-20250405_124142-wdmxf5un/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"_runtime":1813.133208883,"_step":1056,"_wandb":{"runtime":1813},"train/step":1056,"train/loss":0.07423145323991776,"train/lr":2e-05,"train/epoch":6,"_timestamp":1.7438298946027555e+09}
|
wandb/run-20250405_124142-wdmxf5un/logs/debug-core.log
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-05T12:41:41.503771102+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpo0_9yrfw/port-3499761.txt","pid":3499761,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2025-04-05T12:41:41.50383036+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2025-04-05T12:41:41.504662712+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3499761}
|
4 |
+
{"time":"2025-04-05T12:41:41.504649334+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41733,"Zone":""}}
|
5 |
+
{"time":"2025-04-05T12:41:41.685582021+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57286"}
|
6 |
+
{"time":"2025-04-05T12:41:42.081810281+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"wdmxf5un","id":"127.0.0.1:57286"}
|
7 |
+
{"time":"2025-04-05T12:41:42.297218189+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"wdmxf5un","id":"127.0.0.1:57286"}
|
8 |
+
{"time":"2025-04-05T13:11:58.021979029+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"wdmxf5un","id":"127.0.0.1:57286"}
|
9 |
+
{"time":"2025-04-05T13:11:58.02250833+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"wdmxf5un","id":"127.0.0.1:57286"}
|
10 |
+
{"time":"2025-04-05T13:11:58.069941299+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:57286"}
|
11 |
+
{"time":"2025-04-05T13:11:58.069957091+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:57286"}
|
12 |
+
{"time":"2025-04-05T13:11:58.069970949+08:00","level":"INFO","msg":"server is shutting down"}
|
13 |
+
{"time":"2025-04-05T13:11:58.069994407+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:57286"}
|
14 |
+
{"time":"2025-04-05T13:11:58.07003219+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:57286"}
|
15 |
+
{"time":"2025-04-05T13:11:58.070034704+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:57286"}
|
16 |
+
{"time":"2025-04-05T13:11:58.070037746+08:00","level":"INFO","msg":"server is closed"}
|
wandb/run-20250405_124142-wdmxf5un/logs/debug-internal.log
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-05T12:41:42.083502459+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
|
2 |
+
{"time":"2025-04-05T12:41:42.083646225+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_124142-wdmxf5un/logs/debug-core.log"}
|
3 |
+
{"time":"2025-04-05T12:41:42.297160304+08:00","level":"INFO","msg":"created new stream","id":"wdmxf5un"}
|
4 |
+
{"time":"2025-04-05T12:41:42.297211119+08:00","level":"INFO","msg":"stream: started","id":"wdmxf5un"}
|
5 |
+
{"time":"2025-04-05T12:41:42.297225618+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"wdmxf5un"}
|
6 |
+
{"time":"2025-04-05T12:41:42.297245103+08:00","level":"INFO","msg":"sender: started","stream_id":"wdmxf5un"}
|
7 |
+
{"time":"2025-04-05T12:41:42.297240136+08:00","level":"INFO","msg":"handler: started","stream_id":"wdmxf5un"}
|
8 |
+
{"time":"2025-04-05T12:41:42.608849544+08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2025-04-05T13:11:55.213924936+08:00","level":"INFO","msg":"Stopping system monitor"}
|
10 |
+
{"time":"2025-04-05T13:11:55.214734471+08:00","level":"INFO","msg":"Stopped system monitor"}
|
11 |
+
{"time":"2025-04-05T13:11:56.17645318+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading config.yaml","runtime_seconds":0.303202353,"progress":"2.7KB/2.7KB"},{"desc":"uploading output.log","runtime_seconds":0.303187719,"progress":"19.7KB/19.7KB"}],"total_operations":2}}
|
12 |
+
{"time":"2025-04-05T13:11:56.7588989+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2025-04-05T13:11:58.022118371+08:00","level":"INFO","msg":"stream: closing","id":"wdmxf5un"}
|
14 |
+
{"time":"2025-04-05T13:11:58.022162409+08:00","level":"INFO","msg":"handler: closed","stream_id":"wdmxf5un"}
|
15 |
+
{"time":"2025-04-05T13:11:58.022170713+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"wdmxf5un"}
|
16 |
+
{"time":"2025-04-05T13:11:58.022286446+08:00","level":"INFO","msg":"sender: closed","stream_id":"wdmxf5un"}
|
17 |
+
{"time":"2025-04-05T13:11:58.022499189+08:00","level":"INFO","msg":"stream: closed","id":"wdmxf5un"}
|
wandb/run-20250405_124142-wdmxf5un/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
|
2 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Configure stats pid to 3499761
|
3 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
|
4 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
|
5 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_setup.py:_flush():68] Loading settings from environment variables
|
6 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_124142-wdmxf5un/logs/debug.log
|
7 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_124142-wdmxf5un/logs/debug-internal.log
|
8 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:init():644] calling init triggers
|
9 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
|
10 |
+
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 6, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_o1', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
|
11 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:init():680] starting backend
|
12 |
+
2025-04-05 12:41:42,077 INFO MainThread:3499761 [wandb_init.py:init():684] sending inform_init request
|
13 |
+
2025-04-05 12:41:42,080 INFO MainThread:3499761 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
14 |
+
2025-04-05 12:41:42,080 INFO MainThread:3499761 [wandb_init.py:init():697] backend started and connected
|
15 |
+
2025-04-05 12:41:42,081 INFO MainThread:3499761 [wandb_init.py:init():790] updated telemetry
|
16 |
+
2025-04-05 12:41:42,093 INFO MainThread:3499761 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
|
17 |
+
2025-04-05 12:41:42,603 INFO MainThread:3499761 [wandb_init.py:init():874] starting run threads in backend
|
18 |
+
2025-04-05 12:41:42,840 INFO MainThread:3499761 [wandb_run.py:_console_start():2374] atexit reg
|
19 |
+
2025-04-05 12:41:42,840 INFO MainThread:3499761 [wandb_run.py:_redirect():2224] redirect: wrap_raw
|
20 |
+
2025-04-05 12:41:42,840 INFO MainThread:3499761 [wandb_run.py:_redirect():2289] Wrapping output streams.
|
21 |
+
2025-04-05 12:41:42,841 INFO MainThread:3499761 [wandb_run.py:_redirect():2314] Redirects installed.
|
22 |
+
2025-04-05 12:41:42,843 INFO MainThread:3499761 [wandb_init.py:init():916] run started, returning control to user process
|
23 |
+
2025-04-05 13:11:55,174 INFO MainThread:3499761 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/wdmxf5un
|
24 |
+
2025-04-05 13:11:55,175 INFO MainThread:3499761 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
|
25 |
+
2025-04-05 13:11:55,175 INFO MainThread:3499761 [wandb_run.py:_restore():2321] restore
|
26 |
+
2025-04-05 13:11:55,175 INFO MainThread:3499761 [wandb_run.py:_restore():2327] restore done
|
27 |
+
2025-04-05 13:11:58,016 INFO MainThread:3499761 [wandb_run.py:_footer_history_summary_info():3892] rendering history
|
28 |
+
2025-04-05 13:11:58,016 INFO MainThread:3499761 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
|
29 |
+
2025-04-05 13:11:58,021 INFO MainThread:3499761 [wandb_run.py:_footer_sync_info():3853] logging synced files
|
wandb/run-20250405_124142-wdmxf5un/run-wdmxf5un.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:745d51aa8a60822ffdf463265dc9f734f3350d16bfa1d80cebc8c6babd74d586
|
3 |
+
size 3245024
|
wandb/run-20250405_153219-puqja889/files/config.yaml
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.19.1
|
4 |
+
m: []
|
5 |
+
python_version: 3.11.0
|
6 |
+
t:
|
7 |
+
"1":
|
8 |
+
- 1
|
9 |
+
- 5
|
10 |
+
- 11
|
11 |
+
- 41
|
12 |
+
- 49
|
13 |
+
- 51
|
14 |
+
- 53
|
15 |
+
- 55
|
16 |
+
- 63
|
17 |
+
- 71
|
18 |
+
- 83
|
19 |
+
- 98
|
20 |
+
- 105
|
21 |
+
"2":
|
22 |
+
- 1
|
23 |
+
- 5
|
24 |
+
- 11
|
25 |
+
- 41
|
26 |
+
- 49
|
27 |
+
- 51
|
28 |
+
- 53
|
29 |
+
- 55
|
30 |
+
- 63
|
31 |
+
- 71
|
32 |
+
- 83
|
33 |
+
- 98
|
34 |
+
- 105
|
35 |
+
"3":
|
36 |
+
- 2
|
37 |
+
- 13
|
38 |
+
- 16
|
39 |
+
- 23
|
40 |
+
- 55
|
41 |
+
- 61
|
42 |
+
"4": 3.11.0
|
43 |
+
"5": 0.19.1
|
44 |
+
"6": 4.50.0.dev0
|
45 |
+
"8":
|
46 |
+
- 5
|
47 |
+
"12": 0.19.1
|
48 |
+
"13": linux-x86_64
|
49 |
+
bnb_cfgs:
|
50 |
+
value:
|
51 |
+
bnb_4bit_compute_dtype: float16
|
52 |
+
bnb_4bit_quant_type: nf4
|
53 |
+
bnb_4bit_use_double_quant: true
|
54 |
+
load_in_4bit: true
|
55 |
+
load_in_8bit: false
|
56 |
+
use_bnb: false
|
57 |
+
data_cfgs:
|
58 |
+
value:
|
59 |
+
eval_optional_args: []
|
60 |
+
train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
|
61 |
+
train_optional_args: []
|
62 |
+
train_split: train
|
63 |
+
train_template: Safe_o1
|
64 |
+
logger_cfgs:
|
65 |
+
value:
|
66 |
+
log_project: safe-o1
|
67 |
+
log_run_name: sft
|
68 |
+
log_type: wandb
|
69 |
+
output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
|
70 |
+
save_interval: 100000
|
71 |
+
lora_cfgs:
|
72 |
+
value:
|
73 |
+
inference_mode: false
|
74 |
+
lora_alpha: 16
|
75 |
+
lora_dropout: 0.1
|
76 |
+
r: 16
|
77 |
+
save_full_model: true
|
78 |
+
target_modules:
|
79 |
+
- q_proj
|
80 |
+
- v_proj
|
81 |
+
task_type: TaskType.CAUSAL_LM
|
82 |
+
use_lora: false
|
83 |
+
model_cfgs:
|
84 |
+
value:
|
85 |
+
model_max_length: 16384
|
86 |
+
model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
|
87 |
+
trust_remote_code: true
|
88 |
+
train_cfgs:
|
89 |
+
value:
|
90 |
+
adam_betas:
|
91 |
+
- 0.9
|
92 |
+
- 0.95
|
93 |
+
adam_epsilon: 1e-08
|
94 |
+
bf16: true
|
95 |
+
ds_cfgs: ds_z3_config.json
|
96 |
+
epochs: 3
|
97 |
+
eval_interval: 10
|
98 |
+
eval_strategy: steps
|
99 |
+
fp16: false
|
100 |
+
gradient_accumulation_steps: 2
|
101 |
+
gradient_checkpointing: true
|
102 |
+
learning_rate: 2e-05
|
103 |
+
lr_scheduler_type: constant
|
104 |
+
lr_warmup_ratio: 0.03
|
105 |
+
max_grad_norm: 1
|
106 |
+
per_device_eval_batch_size: 4
|
107 |
+
per_device_train_batch_size: 4
|
108 |
+
seed: 42
|
109 |
+
weight_decay: 0
|
wandb/run-20250405_153219-puqja889/files/output.log
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
***** Running training *****
|
2 |
+
Training 1/3 epoch: 0%| | 0/528 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
3 |
+
Training 2/3 epoch (loss 0.5996): 60%|███████████████████████████████████████████████████████████████████████████████▏ | 319/528 [09:05<05:41, 1.64s/it]
|
4 |
+
[2025-04-05 15:32:59,853] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
5 |
+
[2025-04-05 15:32:59,853] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=20.30692434041, CurrSamplesPerSec=19.741172703491532, MemAllocated=15.18GB, MaxMemAllocated=36.69GB
|
6 |
+
[2025-04-05 15:33:32,887] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
7 |
+
[2025-04-05 15:33:32,887] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=20.267265680615793, CurrSamplesPerSec=19.119188499422773, MemAllocated=15.18GB, MaxMemAllocated=36.69GB
|
8 |
+
[2025-04-05 15:34:09,740] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
9 |
+
[2025-04-05 15:34:09,740] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=19.441145165201213, CurrSamplesPerSec=22.89643941293659, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
10 |
+
[2025-04-05 15:34:42,269] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
11 |
+
[2025-04-05 15:34:42,269] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=19.71075418937523, CurrSamplesPerSec=21.574415014581692, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
12 |
+
[2025-04-05 15:35:15,596] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
13 |
+
[2025-04-05 15:35:15,596] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=19.786513682310243, CurrSamplesPerSec=23.265023869025228, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
14 |
+
[2025-04-05 15:35:48,802] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
15 |
+
[2025-04-05 15:35:48,803] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=19.84854756919738, CurrSamplesPerSec=18.438848766680724, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
16 |
+
[2025-04-05 15:36:21,949] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
17 |
+
[2025-04-05 15:36:21,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=19.89555189033183, CurrSamplesPerSec=18.18743542017038, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
18 |
+
[2025-04-05 15:36:56,642] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
19 |
+
[2025-04-05 15:36:56,642] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=19.808610765037212, CurrSamplesPerSec=19.790898299980608, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
20 |
+
[2025-04-05 15:37:31,295] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
21 |
+
[2025-04-05 15:37:31,296] [INFO] [timer.py:264:stop] epoch=1/micro_step=4/global_step=90, RunningAvgSamplesPerSec=19.75046050883055, CurrSamplesPerSec=15.143802444921475, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
22 |
+
[2025-04-05 15:38:04,353] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
23 |
+
[2025-04-05 15:38:04,354] [INFO] [timer.py:264:stop] epoch=1/micro_step=24/global_step=100, RunningAvgSamplesPerSec=19.799562880276337, CurrSamplesPerSec=20.365822624126473, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
24 |
+
[2025-04-05 15:38:38,700] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
25 |
+
[2025-04-05 15:38:38,701] [INFO] [timer.py:264:stop] epoch=1/micro_step=44/global_step=110, RunningAvgSamplesPerSec=19.76926764728122, CurrSamplesPerSec=14.58470845145212, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
26 |
+
[2025-04-05 15:39:13,809] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
27 |
+
[2025-04-05 15:39:13,810] [INFO] [timer.py:264:stop] epoch=1/micro_step=64/global_step=120, RunningAvgSamplesPerSec=19.702187119604087, CurrSamplesPerSec=18.840601998397894, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
28 |
+
[2025-04-05 15:39:47,221] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
29 |
+
[2025-04-05 15:39:47,222] [INFO] [timer.py:264:stop] epoch=1/micro_step=84/global_step=130, RunningAvgSamplesPerSec=19.734580817376962, CurrSamplesPerSec=19.789930950426523, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
30 |
+
[2025-04-05 15:40:20,485] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
31 |
+
[2025-04-05 15:40:20,486] [INFO] [timer.py:264:stop] epoch=1/micro_step=104/global_step=140, RunningAvgSamplesPerSec=19.761128557333294, CurrSamplesPerSec=18.234083740928202, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
32 |
+
[2025-04-05 15:40:53,748] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
33 |
+
[2025-04-05 15:40:53,748] [INFO] [timer.py:264:stop] epoch=1/micro_step=124/global_step=150, RunningAvgSamplesPerSec=19.78862789065117, CurrSamplesPerSec=21.41741861785655, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
34 |
+
[2025-04-05 15:41:26,856] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
35 |
+
[2025-04-05 15:41:26,856] [INFO] [timer.py:264:stop] epoch=1/micro_step=144/global_step=160, RunningAvgSamplesPerSec=19.81502168719465, CurrSamplesPerSec=24.666726261517713, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
36 |
+
[2025-04-05 15:42:01,110] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
37 |
+
[2025-04-05 15:42:01,110] [INFO] [timer.py:264:stop] epoch=1/micro_step=164/global_step=170, RunningAvgSamplesPerSec=19.792251767052175, CurrSamplesPerSec=22.756839975456966, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
38 |
+
[2025-04-05 15:42:35,842] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
39 |
+
[2025-04-05 15:42:35,842] [INFO] [timer.py:264:stop] epoch=2/micro_step=8/global_step=180, RunningAvgSamplesPerSec=19.7654807990148, CurrSamplesPerSec=21.057054092109006, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
40 |
+
[2025-04-05 15:43:11,122] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
41 |
+
[2025-04-05 15:43:11,122] [INFO] [timer.py:264:stop] epoch=2/micro_step=28/global_step=190, RunningAvgSamplesPerSec=19.72017429160466, CurrSamplesPerSec=14.290293089769191, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
42 |
+
[2025-04-05 15:43:43,794] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
43 |
+
[2025-04-05 15:43:43,795] [INFO] [timer.py:264:stop] epoch=2/micro_step=48/global_step=200, RunningAvgSamplesPerSec=19.76614861625657, CurrSamplesPerSec=23.281404168636236, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
44 |
+
[2025-04-05 15:44:19,358] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
45 |
+
[2025-04-05 15:44:19,359] [INFO] [timer.py:264:stop] epoch=2/micro_step=68/global_step=210, RunningAvgSamplesPerSec=19.71474683772797, CurrSamplesPerSec=22.07258105575799, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
46 |
+
[2025-04-05 15:44:53,709] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
47 |
+
[2025-04-05 15:44:53,709] [INFO] [timer.py:264:stop] epoch=2/micro_step=88/global_step=220, RunningAvgSamplesPerSec=19.705082012780586, CurrSamplesPerSec=19.593044905429547, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
48 |
+
[2025-04-05 15:45:26,365] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
49 |
+
[2025-04-05 15:45:26,366] [INFO] [timer.py:264:stop] epoch=2/micro_step=108/global_step=230, RunningAvgSamplesPerSec=19.739918406677425, CurrSamplesPerSec=19.835104573202546, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
50 |
+
[2025-04-05 15:45:59,052] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
51 |
+
[2025-04-05 15:45:59,053] [INFO] [timer.py:264:stop] epoch=2/micro_step=128/global_step=240, RunningAvgSamplesPerSec=19.772618786957285, CurrSamplesPerSec=20.382635106879853, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
52 |
+
[2025-04-05 15:46:32,875] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
53 |
+
[2025-04-05 15:46:32,876] [INFO] [timer.py:264:stop] epoch=2/micro_step=148/global_step=250, RunningAvgSamplesPerSec=19.769924873342227, CurrSamplesPerSec=15.725991281451387, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
54 |
+
[2025-04-05 15:47:06,948] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
55 |
+
[2025-04-05 15:47:06,948] [INFO] [timer.py:264:stop] epoch=2/micro_step=168/global_step=260, RunningAvgSamplesPerSec=19.769435747796898, CurrSamplesPerSec=21.595577884084456, MemAllocated=15.18GB, MaxMemAllocated=40.42GB
|
56 |
+
Saving model to "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end" ...
|
57 |
+
Saving 16-bit model...
|
58 |
+
[2025-04-05 15:47:28,005] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step264 is about to be saved!
|
59 |
+
[2025-04-05 15:47:28,007] [INFO] [engine.py:3680:save_16bit_model] Saving model weights to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin, tag: global_step264
|
60 |
+
[2025-04-05 15:47:28,007] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin...
|
61 |
+
[2025-04-05 15:47:41,242] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin.
|
62 |
+
[2025-04-05 15:47:41,242] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step264 is ready now!
|
63 |
+
Model saved!
|
wandb/run-20250405_153219-puqja889/files/requirements.txt
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
maskrcnn_benchmark==0.0.0
|
2 |
+
deepspeed==0.16.1
|
3 |
+
uritemplate==4.1.1
|
4 |
+
pyairports==2.1.1
|
5 |
+
partial-json-parser==0.2.1.1.post4
|
6 |
+
tensorboard-data-server==0.7.2
|
7 |
+
pydantic==2.10.3
|
8 |
+
Werkzeug==3.1.3
|
9 |
+
attrs==24.3.0
|
10 |
+
Jinja2==3.1.4
|
11 |
+
email_validator==2.2.0
|
12 |
+
mdit-py-plugins==0.4.2
|
13 |
+
google-api-python-client==2.160.0
|
14 |
+
pandas==2.2.3
|
15 |
+
safehttpx==0.1.6
|
16 |
+
setproctitle==1.3.4
|
17 |
+
dill==0.3.8
|
18 |
+
torchaudio==2.5.1
|
19 |
+
frechet-audio-distance==0.1.2
|
20 |
+
blessed==1.20.0
|
21 |
+
llvmlite==0.43.0
|
22 |
+
litellm==1.60.8
|
23 |
+
nvidia-nvtx-cu12==12.4.127
|
24 |
+
nvidia-cusolver-cu12==11.6.1.9
|
25 |
+
einops==0.8.0
|
26 |
+
datasets==3.2.0
|
27 |
+
pycountry==24.6.1
|
28 |
+
airportsdata==20250224
|
29 |
+
idna==3.10
|
30 |
+
urllib3==2.2.3
|
31 |
+
mpmath==1.3.0
|
32 |
+
wandb==0.19.1
|
33 |
+
certifi==2024.12.14
|
34 |
+
markdown-it-py==3.0.0
|
35 |
+
align-anything==0.0.1.dev0
|
36 |
+
aiohttp==3.11.10
|
37 |
+
fsspec==2024.9.0
|
38 |
+
aiohappyeyeballs==2.4.4
|
39 |
+
httplib2==0.22.0
|
40 |
+
hjson==3.1.0
|
41 |
+
yarl==1.18.3
|
42 |
+
decorator==5.1.1
|
43 |
+
distlib==0.3.9
|
44 |
+
absl-py==2.1.0
|
45 |
+
huggingface-hub==0.27.0
|
46 |
+
memray==1.15.0
|
47 |
+
Pygments==2.18.0
|
48 |
+
soupsieve==2.6
|
49 |
+
shellingham==1.5.4
|
50 |
+
tokenizers==0.21.0
|
51 |
+
uvloop==0.21.0
|
52 |
+
numpy==1.26.4
|
53 |
+
linkify-it-py==2.0.3
|
54 |
+
sympy==1.13.1
|
55 |
+
python-dotenv==1.0.1
|
56 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
57 |
+
tensorboard==2.18.0
|
58 |
+
fastrlock==0.8.3
|
59 |
+
rsa==4.9
|
60 |
+
lm-format-enforcer==0.10.9
|
61 |
+
openai==1.61.1
|
62 |
+
gpustat==1.1.1
|
63 |
+
librosa==0.10.2.post1
|
64 |
+
grpcio-status==1.70.0
|
65 |
+
nvidia-cudnn-cu12==9.1.0.70
|
66 |
+
zipp==3.21.0
|
67 |
+
nvidia-nvjitlink-cu12==12.4.127
|
68 |
+
cupy-cuda12x==13.3.0
|
69 |
+
Markdown==3.7
|
70 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
71 |
+
nvidia-curand-cu12==10.3.5.147
|
72 |
+
rpds-py==0.22.3
|
73 |
+
outlines==0.1.11
|
74 |
+
docker-pycreds==0.4.0
|
75 |
+
distro==1.9.0
|
76 |
+
httpcore==1.0.7
|
77 |
+
gradio==5.9.0
|
78 |
+
google-auth-httplib2==0.2.0
|
79 |
+
iniconfig==2.0.0
|
80 |
+
gitdb==4.0.11
|
81 |
+
jsonschema==4.23.0
|
82 |
+
click==8.1.7
|
83 |
+
ninja==1.11.1.3
|
84 |
+
setuptools==75.6.0
|
85 |
+
audioread==3.0.1
|
86 |
+
frozenlist==1.5.0
|
87 |
+
transformers-stream-generator==0.0.5
|
88 |
+
nvidia-cublas-cu12==12.4.5.8
|
89 |
+
pycparser==2.22
|
90 |
+
GitPython==3.1.43
|
91 |
+
tqdm==4.67.1
|
92 |
+
importlib_metadata==8.5.0
|
93 |
+
patsy==1.0.1
|
94 |
+
networkx==3.4.2
|
95 |
+
semantic-version==2.10.0
|
96 |
+
alpaca_eval==0.6.6
|
97 |
+
google-cloud-core==2.4.1
|
98 |
+
prometheus_client==0.21.1
|
99 |
+
jiter==0.8.2
|
100 |
+
scipy==1.14.1
|
101 |
+
starlette==0.41.3
|
102 |
+
jq==1.8.0
|
103 |
+
opencensus-context==0.1.3
|
104 |
+
cachetools==5.5.1
|
105 |
+
cffi==1.17.1
|
106 |
+
opencv-python-headless==4.10.0.84
|
107 |
+
joblib==1.4.2
|
108 |
+
yt-dlp==2025.1.26
|
109 |
+
python-dateutil==2.9.0.post0
|
110 |
+
httpx==0.28.1
|
111 |
+
msgpack==1.1.0
|
112 |
+
pydub==0.25.1
|
113 |
+
tomlkit==0.13.2
|
114 |
+
nvitop==1.4.2
|
115 |
+
nvidia-cusparse-cu12==12.3.1.170
|
116 |
+
msgspec==0.18.6
|
117 |
+
aiosignal==1.3.2
|
118 |
+
wheel==0.45.1
|
119 |
+
filelock==3.16.1
|
120 |
+
pillow==10.4.0
|
121 |
+
typer==0.15.1
|
122 |
+
websockets==14.1
|
123 |
+
resampy==0.4.3
|
124 |
+
aiofiles==23.2.1
|
125 |
+
aiohttp-cors==0.7.0
|
126 |
+
platformdirs==4.3.6
|
127 |
+
gguf==0.10.0
|
128 |
+
diskcache==5.6.3
|
129 |
+
cloudpickle==3.1.0
|
130 |
+
multidict==6.1.0
|
131 |
+
py-cpuinfo==9.0.0
|
132 |
+
scikit-learn==1.6.0
|
133 |
+
smart-open==7.1.0
|
134 |
+
tiktoken==0.7.0
|
135 |
+
grpcio==1.70.0
|
136 |
+
charset-normalizer==3.4.0
|
137 |
+
nest-asyncio==1.6.0
|
138 |
+
lark==1.2.2
|
139 |
+
beautifulsoup4==4.13.3
|
140 |
+
pip==24.3.1
|
141 |
+
six==1.17.0
|
142 |
+
prometheus-fastapi-instrumentator==7.0.0
|
143 |
+
ruff==0.8.3
|
144 |
+
rich-toolkit==0.13.2
|
145 |
+
lazy_loader==0.4
|
146 |
+
grpc-google-iam-v1==0.14.0
|
147 |
+
psutil==6.1.0
|
148 |
+
mdurl==0.1.2
|
149 |
+
nvidia-nccl-cu12==2.21.5
|
150 |
+
triton==3.1.0
|
151 |
+
torchvision==0.20.1
|
152 |
+
fastapi==0.115.6
|
153 |
+
referencing==0.35.1
|
154 |
+
xxhash==3.5.0
|
155 |
+
pyzmq==26.2.0
|
156 |
+
torchlibrosa==0.1.0
|
157 |
+
googleapis-common-protos==1.66.0
|
158 |
+
pyasn1==0.6.1
|
159 |
+
soundfile==0.12.1
|
160 |
+
pyparsing==3.2.1
|
161 |
+
xgrammar==0.1.11
|
162 |
+
gradio_client==1.5.2
|
163 |
+
watchfiles==1.0.3
|
164 |
+
pluggy==1.5.0
|
165 |
+
py-spy==0.4.0
|
166 |
+
pybind11==2.13.6
|
167 |
+
diffusers==0.31.0
|
168 |
+
sentencepiece==0.2.0
|
169 |
+
flash_attn==2.7.4.post1
|
170 |
+
annotated-types==0.7.0
|
171 |
+
interegular==0.3.3
|
172 |
+
requests==2.32.3
|
173 |
+
opencensus==0.11.4
|
174 |
+
colorful==0.5.6
|
175 |
+
google-api-core==2.24.1
|
176 |
+
pytest==8.3.4
|
177 |
+
dnspython==2.7.0
|
178 |
+
pydantic_core==2.27.1
|
179 |
+
pytz==2024.2
|
180 |
+
pyasn1_modules==0.4.1
|
181 |
+
propcache==0.2.1
|
182 |
+
accelerate==1.2.1
|
183 |
+
fire==0.7.0
|
184 |
+
textual==1.0.0
|
185 |
+
sniffio==1.3.1
|
186 |
+
pyarrow==18.1.0
|
187 |
+
protobuf==5.29.1
|
188 |
+
wcwidth==0.2.13
|
189 |
+
packaging==24.2
|
190 |
+
uvicorn==0.34.0
|
191 |
+
sentry-sdk==2.19.2
|
192 |
+
google-auth==2.38.0
|
193 |
+
typing_extensions==4.12.2
|
194 |
+
peft==0.14.0
|
195 |
+
depyf==0.18.0
|
196 |
+
multiprocess==0.70.16
|
197 |
+
google-cloud-translate==3.19.0
|
198 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
199 |
+
jsonschema-specifications==2024.10.1
|
200 |
+
vllm==0.7.3
|
201 |
+
nvidia-cufft-cu12==11.2.1.3
|
202 |
+
timm==1.0.12
|
203 |
+
rich==13.9.4
|
204 |
+
ffmpy==0.4.0
|
205 |
+
virtualenv==20.29.1
|
206 |
+
tzdata==2024.2
|
207 |
+
smmap==5.0.1
|
208 |
+
uc-micro-py==1.0.3
|
209 |
+
proto-plus==1.26.0
|
210 |
+
soxr==0.5.0.post1
|
211 |
+
h11==0.14.0
|
212 |
+
outlines_core==0.1.26
|
213 |
+
compressed-tensors==0.9.1
|
214 |
+
blake3==1.0.4
|
215 |
+
xformers==0.0.28.post3
|
216 |
+
orjson==3.10.12
|
217 |
+
ray==2.40.0
|
218 |
+
PyYAML==6.0.2
|
219 |
+
nvidia-ml-py==12.560.30
|
220 |
+
python-multipart==0.0.19
|
221 |
+
PySocks==1.7.1
|
222 |
+
regex==2024.11.6
|
223 |
+
pooch==1.8.2
|
224 |
+
termcolor==2.5.0
|
225 |
+
MarkupSafe==2.1.5
|
226 |
+
torch==2.5.1
|
227 |
+
fastapi-cli==0.0.7
|
228 |
+
gdown==5.2.0
|
229 |
+
numba==0.60.0
|
230 |
+
httptools==0.6.4
|
231 |
+
transformers==4.50.0.dev0
|
232 |
+
mistral_common==1.5.1
|
233 |
+
astor==0.8.1
|
234 |
+
anyio==4.7.0
|
235 |
+
safetensors==0.4.5
|
236 |
+
threadpoolctl==3.5.0
|
237 |
+
wrapt==1.17.2
|
238 |
+
wheel==0.43.0
|
239 |
+
jaraco.functools==4.0.1
|
240 |
+
inflect==7.3.1
|
241 |
+
jaraco.text==3.12.1
|
242 |
+
typeguard==4.3.0
|
243 |
+
jaraco.collections==5.1.0
|
244 |
+
importlib_metadata==8.0.0
|
245 |
+
backports.tarfile==1.2.0
|
246 |
+
tomli==2.0.1
|
247 |
+
autocommand==2.2.2
|
248 |
+
platformdirs==4.2.2
|
249 |
+
more-itertools==10.3.0
|
250 |
+
zipp==3.19.2
|
251 |
+
packaging==24.2
|
252 |
+
typing_extensions==4.12.2
|
253 |
+
jaraco.context==5.3.0
|
wandb/run-20250405_153219-puqja889/files/wandb-metadata.json
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
3 |
+
"python": "CPython 3.11.0",
|
4 |
+
"startedAt": "2025-04-05T07:32:19.230644Z",
|
5 |
+
"args": [
|
6 |
+
"--local_rank=0",
|
7 |
+
"--model_name_or_path",
|
8 |
+
"/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct",
|
9 |
+
"--train_datasets",
|
10 |
+
"/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset",
|
11 |
+
"--train_split",
|
12 |
+
"train",
|
13 |
+
"--train_template",
|
14 |
+
"Safe_o1",
|
15 |
+
"--output_dir",
|
16 |
+
"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
|
17 |
+
"--log_project",
|
18 |
+
"safe-o1",
|
19 |
+
"--per_device_train_batch_size",
|
20 |
+
"4",
|
21 |
+
"--per_device_eval_batch_size",
|
22 |
+
"4",
|
23 |
+
"--gradient_accumulation_steps",
|
24 |
+
"2",
|
25 |
+
"--learning_rate",
|
26 |
+
"2e-5",
|
27 |
+
"--epochs",
|
28 |
+
"3",
|
29 |
+
"--model_max_length",
|
30 |
+
"16384"
|
31 |
+
],
|
32 |
+
"program": "-m align_anything.trainers.text_to_text.sft",
|
33 |
+
"git": {
|
34 |
+
"remote": "[email protected]:PKU-Alignment/align-anything.git",
|
35 |
+
"commit": "7ee46f54200d18e3c6c37568ba688d3be5ae7619"
|
36 |
+
},
|
37 |
+
"email": "[email protected]",
|
38 |
+
"root": "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
|
39 |
+
"host": "dgx-092",
|
40 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/python3.11",
|
41 |
+
"cpu_count": 112,
|
42 |
+
"cpu_count_logical": 224,
|
43 |
+
"gpu": "NVIDIA H800",
|
44 |
+
"gpu_count": 8,
|
45 |
+
"disk": {
|
46 |
+
"/": {
|
47 |
+
"total": "1888556142592",
|
48 |
+
"used": "149958434816"
|
49 |
+
}
|
50 |
+
},
|
51 |
+
"memory": {
|
52 |
+
"total": "2164195454976"
|
53 |
+
},
|
54 |
+
"cpu": {
|
55 |
+
"count": 112,
|
56 |
+
"countLogical": 224
|
57 |
+
},
|
58 |
+
"gpu_nvidia": [
|
59 |
+
{
|
60 |
+
"name": "NVIDIA H800",
|
61 |
+
"memoryTotal": "85520809984",
|
62 |
+
"cudaCores": 16896,
|
63 |
+
"architecture": "Hopper"
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"name": "NVIDIA H800",
|
67 |
+
"memoryTotal": "85520809984",
|
68 |
+
"cudaCores": 16896,
|
69 |
+
"architecture": "Hopper"
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"name": "NVIDIA H800",
|
73 |
+
"memoryTotal": "85520809984",
|
74 |
+
"cudaCores": 16896,
|
75 |
+
"architecture": "Hopper"
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"name": "NVIDIA H800",
|
79 |
+
"memoryTotal": "85520809984",
|
80 |
+
"cudaCores": 16896,
|
81 |
+
"architecture": "Hopper"
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"name": "NVIDIA H800",
|
85 |
+
"memoryTotal": "85520809984",
|
86 |
+
"cudaCores": 16896,
|
87 |
+
"architecture": "Hopper"
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"name": "NVIDIA H800",
|
91 |
+
"memoryTotal": "85520809984",
|
92 |
+
"cudaCores": 16896,
|
93 |
+
"architecture": "Hopper"
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"name": "NVIDIA H800",
|
97 |
+
"memoryTotal": "85520809984",
|
98 |
+
"cudaCores": 16896,
|
99 |
+
"architecture": "Hopper"
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"name": "NVIDIA H800",
|
103 |
+
"memoryTotal": "85520809984",
|
104 |
+
"cudaCores": 16896,
|
105 |
+
"architecture": "Hopper"
|
106 |
+
}
|
107 |
+
],
|
108 |
+
"slurm": {
|
109 |
+
"conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
|
110 |
+
},
|
111 |
+
"cudaVersion": "12.2"
|
112 |
+
}
|
wandb/run-20250405_153219-puqja889/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"train/epoch":3,"_timestamp":1.743839240297635e+09,"_runtime":922.055261897,"_step":528,"train/step":528,"_wandb":{"runtime":922},"train/loss":0.3990817368030548,"train/lr":2e-05}
|
wandb/run-20250405_153219-puqja889/logs/debug-core.log
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-05T15:32:18.65561336+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpxq7sqgm9/port-3746936.txt","pid":3746936,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2025-04-05T15:32:18.655658183+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2025-04-05T15:32:18.656636391+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3746936}
|
4 |
+
{"time":"2025-04-05T15:32:18.656636606+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":37857,"Zone":""}}
|
5 |
+
{"time":"2025-04-05T15:32:18.837886858+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34114"}
|
6 |
+
{"time":"2025-04-05T15:32:19.231604357+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"puqja889","id":"127.0.0.1:34114"}
|
7 |
+
{"time":"2025-04-05T15:32:19.446153043+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"puqja889","id":"127.0.0.1:34114"}
|
8 |
+
{"time":"2025-04-05T15:47:44.197928261+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"puqja889","id":"127.0.0.1:34114"}
|
9 |
+
{"time":"2025-04-05T15:47:44.198467502+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"puqja889","id":"127.0.0.1:34114"}
|
10 |
+
{"time":"2025-04-05T15:47:44.245957677+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:34114"}
|
11 |
+
{"time":"2025-04-05T15:47:44.24597843+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:34114"}
|
12 |
+
{"time":"2025-04-05T15:47:44.245991486+08:00","level":"INFO","msg":"server is shutting down"}
|
13 |
+
{"time":"2025-04-05T15:47:44.246014289+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:34114"}
|
14 |
+
{"time":"2025-04-05T15:47:44.246050481+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:34114"}
|
15 |
+
{"time":"2025-04-05T15:47:44.246052923+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:34114"}
|
16 |
+
{"time":"2025-04-05T15:47:44.246055739+08:00","level":"INFO","msg":"server is closed"}
|
wandb/run-20250405_153219-puqja889/logs/debug-internal.log
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-05T15:32:19.233194259+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
|
2 |
+
{"time":"2025-04-05T15:32:19.233418539+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_153219-puqja889/logs/debug-core.log"}
|
3 |
+
{"time":"2025-04-05T15:32:19.446097202+08:00","level":"INFO","msg":"created new stream","id":"puqja889"}
|
4 |
+
{"time":"2025-04-05T15:32:19.4461465+08:00","level":"INFO","msg":"stream: started","id":"puqja889"}
|
5 |
+
{"time":"2025-04-05T15:32:19.446170192+08:00","level":"INFO","msg":"sender: started","stream_id":"puqja889"}
|
6 |
+
{"time":"2025-04-05T15:32:19.44617482+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"puqja889"}
|
7 |
+
{"time":"2025-04-05T15:32:19.446195338+08:00","level":"INFO","msg":"handler: started","stream_id":"puqja889"}
|
8 |
+
{"time":"2025-04-05T15:32:19.762936247+08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2025-04-05T15:47:41.285928309+08:00","level":"INFO","msg":"Stopping system monitor"}
|
10 |
+
{"time":"2025-04-05T15:47:41.287819762+08:00","level":"INFO","msg":"Stopped system monitor"}
|
11 |
+
{"time":"2025-04-05T15:47:42.247311433+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading config.yaml","runtime_seconds":0.312718096,"progress":"2.7KB/2.7KB"},{"desc":"uploading output.log","runtime_seconds":0.312711599,"progress":"10.8KB/10.8KB"}],"total_operations":2}}
|
12 |
+
{"time":"2025-04-05T15:47:42.941747027+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2025-04-05T15:47:44.198051243+08:00","level":"INFO","msg":"stream: closing","id":"puqja889"}
|
14 |
+
{"time":"2025-04-05T15:47:44.198103033+08:00","level":"INFO","msg":"handler: closed","stream_id":"puqja889"}
|
15 |
+
{"time":"2025-04-05T15:47:44.198111233+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"puqja889"}
|
16 |
+
{"time":"2025-04-05T15:47:44.198219676+08:00","level":"INFO","msg":"sender: closed","stream_id":"puqja889"}
|
17 |
+
{"time":"2025-04-05T15:47:44.198457535+08:00","level":"INFO","msg":"stream: closed","id":"puqja889"}
|
wandb/run-20250405_153219-puqja889/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
|
2 |
+
2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Configure stats pid to 3746936
|
3 |
+
2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
|
4 |
+
2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
|
5 |
+
2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_setup.py:_flush():68] Loading settings from environment variables
|
6 |
+
2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_153219-puqja889/logs/debug.log
|
7 |
+
2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_153219-puqja889/logs/debug-internal.log
|
8 |
+
2025-04-05 15:32:19,226 INFO MainThread:3746936 [wandb_init.py:init():644] calling init triggers
|
9 |
+
2025-04-05 15:32:19,227 INFO MainThread:3746936 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
|
10 |
+
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_o1', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
|
11 |
+
2025-04-05 15:32:19,227 INFO MainThread:3746936 [wandb_init.py:init():680] starting backend
|
12 |
+
2025-04-05 15:32:19,227 INFO MainThread:3746936 [wandb_init.py:init():684] sending inform_init request
|
13 |
+
2025-04-05 15:32:19,230 INFO MainThread:3746936 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
14 |
+
2025-04-05 15:32:19,230 INFO MainThread:3746936 [wandb_init.py:init():697] backend started and connected
|
15 |
+
2025-04-05 15:32:19,232 INFO MainThread:3746936 [wandb_init.py:init():790] updated telemetry
|
16 |
+
2025-04-05 15:32:19,245 INFO MainThread:3746936 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
|
17 |
+
2025-04-05 15:32:19,756 INFO MainThread:3746936 [wandb_init.py:init():874] starting run threads in backend
|
18 |
+
2025-04-05 15:32:20,000 INFO MainThread:3746936 [wandb_run.py:_console_start():2374] atexit reg
|
19 |
+
2025-04-05 15:32:20,000 INFO MainThread:3746936 [wandb_run.py:_redirect():2224] redirect: wrap_raw
|
20 |
+
2025-04-05 15:32:20,000 INFO MainThread:3746936 [wandb_run.py:_redirect():2289] Wrapping output streams.
|
21 |
+
2025-04-05 15:32:20,000 INFO MainThread:3746936 [wandb_run.py:_redirect():2314] Redirects installed.
|
22 |
+
2025-04-05 15:32:20,003 INFO MainThread:3746936 [wandb_init.py:init():916] run started, returning control to user process
|
23 |
+
2025-04-05 15:47:41,245 INFO MainThread:3746936 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/puqja889
|
24 |
+
2025-04-05 15:47:41,245 INFO MainThread:3746936 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
|
25 |
+
2025-04-05 15:47:41,246 INFO MainThread:3746936 [wandb_run.py:_restore():2321] restore
|
26 |
+
2025-04-05 15:47:41,246 INFO MainThread:3746936 [wandb_run.py:_restore():2327] restore done
|
27 |
+
2025-04-05 15:47:44,192 INFO MainThread:3746936 [wandb_run.py:_footer_history_summary_info():3892] rendering history
|
28 |
+
2025-04-05 15:47:44,192 INFO MainThread:3746936 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
|
29 |
+
2025-04-05 15:47:44,197 INFO MainThread:3746936 [wandb_run.py:_footer_sync_info():3853] logging synced files
|
wandb/run-20250405_153219-puqja889/run-puqja889.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e906c048d4517fa596f83a7df5e6325decfd9173e4b6199a4e4897a6c964488f
|
3 |
+
size 1744561
|
wandb/run-20250405_203209-jla7fqqr/files/config.yaml
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.19.1
|
4 |
+
m: []
|
5 |
+
python_version: 3.11.0
|
6 |
+
t:
|
7 |
+
"1":
|
8 |
+
- 1
|
9 |
+
- 5
|
10 |
+
- 11
|
11 |
+
- 41
|
12 |
+
- 49
|
13 |
+
- 51
|
14 |
+
- 53
|
15 |
+
- 55
|
16 |
+
- 63
|
17 |
+
- 71
|
18 |
+
- 83
|
19 |
+
- 98
|
20 |
+
- 105
|
21 |
+
"2":
|
22 |
+
- 1
|
23 |
+
- 5
|
24 |
+
- 11
|
25 |
+
- 41
|
26 |
+
- 49
|
27 |
+
- 51
|
28 |
+
- 53
|
29 |
+
- 55
|
30 |
+
- 63
|
31 |
+
- 71
|
32 |
+
- 83
|
33 |
+
- 98
|
34 |
+
- 105
|
35 |
+
"3":
|
36 |
+
- 2
|
37 |
+
- 13
|
38 |
+
- 16
|
39 |
+
- 23
|
40 |
+
- 55
|
41 |
+
- 61
|
42 |
+
"4": 3.11.0
|
43 |
+
"5": 0.19.1
|
44 |
+
"6": 4.50.0.dev0
|
45 |
+
"8":
|
46 |
+
- 5
|
47 |
+
"12": 0.19.1
|
48 |
+
"13": linux-x86_64
|
49 |
+
bnb_cfgs:
|
50 |
+
value:
|
51 |
+
bnb_4bit_compute_dtype: float16
|
52 |
+
bnb_4bit_quant_type: nf4
|
53 |
+
bnb_4bit_use_double_quant: true
|
54 |
+
load_in_4bit: true
|
55 |
+
load_in_8bit: false
|
56 |
+
use_bnb: false
|
57 |
+
data_cfgs:
|
58 |
+
value:
|
59 |
+
eval_optional_args: []
|
60 |
+
train_datasets: /aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset
|
61 |
+
train_optional_args: []
|
62 |
+
train_split: train
|
63 |
+
train_template: Safe_thinking
|
64 |
+
logger_cfgs:
|
65 |
+
value:
|
66 |
+
log_project: safe-o1
|
67 |
+
log_run_name: sft
|
68 |
+
log_type: wandb
|
69 |
+
output_dir: /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking
|
70 |
+
save_interval: 100000
|
71 |
+
lora_cfgs:
|
72 |
+
value:
|
73 |
+
inference_mode: false
|
74 |
+
lora_alpha: 16
|
75 |
+
lora_dropout: 0.1
|
76 |
+
r: 16
|
77 |
+
save_full_model: true
|
78 |
+
target_modules:
|
79 |
+
- q_proj
|
80 |
+
- v_proj
|
81 |
+
task_type: TaskType.CAUSAL_LM
|
82 |
+
use_lora: false
|
83 |
+
model_cfgs:
|
84 |
+
value:
|
85 |
+
model_max_length: 16384
|
86 |
+
model_name_or_path: /aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct
|
87 |
+
trust_remote_code: true
|
88 |
+
train_cfgs:
|
89 |
+
value:
|
90 |
+
adam_betas:
|
91 |
+
- 0.9
|
92 |
+
- 0.95
|
93 |
+
adam_epsilon: 1e-08
|
94 |
+
bf16: true
|
95 |
+
ds_cfgs: ds_z3_config.json
|
96 |
+
epochs: 3
|
97 |
+
eval_interval: 10
|
98 |
+
eval_strategy: steps
|
99 |
+
fp16: false
|
100 |
+
gradient_accumulation_steps: 2
|
101 |
+
gradient_checkpointing: true
|
102 |
+
learning_rate: 2e-05
|
103 |
+
lr_scheduler_type: constant
|
104 |
+
lr_warmup_ratio: 0.03
|
105 |
+
max_grad_norm: 1
|
106 |
+
per_device_eval_batch_size: 4
|
107 |
+
per_device_train_batch_size: 4
|
108 |
+
seed: 42
|
109 |
+
weight_decay: 0
|
wandb/run-20250405_203209-jla7fqqr/files/output.log
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
***** Running training *****
|
2 |
+
Training 1/3 epoch: 0%| | 0/528 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
3 |
+
Training 2/3 epoch (loss 0.6368): 60%|██████████████████████████████████████████████████████████████████████████████▌ | 319/528 [07:48<04:48, 1.38s/it]
|
4 |
+
[2025-04-05 20:32:45,658] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
5 |
+
[2025-04-05 20:32:45,659] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=23.605714664413604, CurrSamplesPerSec=23.666011535180804, MemAllocated=15.18GB, MaxMemAllocated=33.78GB
|
6 |
+
[2025-04-05 20:33:13,955] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
7 |
+
[2025-04-05 20:33:13,956] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=23.754469865096492, CurrSamplesPerSec=23.631327040548573, MemAllocated=15.18GB, MaxMemAllocated=33.78GB
|
8 |
+
[2025-04-05 20:33:46,142] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
9 |
+
[2025-04-05 20:33:46,143] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=22.604152126481146, CurrSamplesPerSec=25.109336167565615, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
10 |
+
[2025-04-05 20:34:13,747] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
11 |
+
[2025-04-05 20:34:13,748] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=23.017619879751617, CurrSamplesPerSec=25.723433617374972, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
12 |
+
[2025-04-05 20:34:42,610] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
13 |
+
[2025-04-05 20:34:42,610] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=23.049331926082044, CurrSamplesPerSec=27.117450303376405, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
14 |
+
[2025-04-05 20:35:11,048] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
15 |
+
[2025-04-05 20:35:11,049] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=23.14298012524782, CurrSamplesPerSec=21.882912187585045, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
16 |
+
[2025-04-05 20:35:39,236] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
17 |
+
[2025-04-05 20:35:39,237] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=23.24975082248447, CurrSamplesPerSec=20.374822325722487, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
18 |
+
[2025-04-05 20:36:08,690] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
19 |
+
[2025-04-05 20:36:08,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=23.176213278752765, CurrSamplesPerSec=21.797783380622224, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
20 |
+
[2025-04-05 20:36:38,107] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
21 |
+
[2025-04-05 20:36:38,107] [INFO] [timer.py:264:stop] epoch=1/micro_step=4/global_step=90, RunningAvgSamplesPerSec=23.126311848009383, CurrSamplesPerSec=16.471299501610225, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
22 |
+
[2025-04-05 20:37:06,881] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
23 |
+
[2025-04-05 20:37:06,881] [INFO] [timer.py:264:stop] epoch=1/micro_step=24/global_step=100, RunningAvgSamplesPerSec=23.14780207678257, CurrSamplesPerSec=24.49579467500081, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
24 |
+
[2025-04-05 20:37:36,188] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
25 |
+
[2025-04-05 20:37:36,188] [INFO] [timer.py:264:stop] epoch=1/micro_step=44/global_step=110, RunningAvgSamplesPerSec=23.11597305149581, CurrSamplesPerSec=16.43245669677465, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
26 |
+
[2025-04-05 20:38:06,693] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
27 |
+
[2025-04-05 20:38:06,694] [INFO] [timer.py:264:stop] epoch=1/micro_step=64/global_step=120, RunningAvgSamplesPerSec=22.998939428830514, CurrSamplesPerSec=22.171908523048636, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
28 |
+
[2025-04-05 20:38:34,755] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
29 |
+
[2025-04-05 20:38:34,756] [INFO] [timer.py:264:stop] epoch=1/micro_step=84/global_step=130, RunningAvgSamplesPerSec=23.06196743535597, CurrSamplesPerSec=22.22566165861562, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
30 |
+
[2025-04-05 20:39:03,683] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
31 |
+
[2025-04-05 20:39:03,683] [INFO] [timer.py:264:stop] epoch=1/micro_step=104/global_step=140, RunningAvgSamplesPerSec=23.06195015699282, CurrSamplesPerSec=20.920198186863757, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
32 |
+
[2025-04-05 20:39:31,775] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
33 |
+
[2025-04-05 20:39:31,775] [INFO] [timer.py:264:stop] epoch=1/micro_step=124/global_step=150, RunningAvgSamplesPerSec=23.115183552624345, CurrSamplesPerSec=25.03072256562265, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
34 |
+
[2025-04-05 20:39:59,910] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
35 |
+
[2025-04-05 20:39:59,910] [INFO] [timer.py:264:stop] epoch=1/micro_step=144/global_step=160, RunningAvgSamplesPerSec=23.153977209977544, CurrSamplesPerSec=29.017234800239933, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
36 |
+
[2025-04-05 20:40:28,955] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
37 |
+
[2025-04-05 20:40:28,955] [INFO] [timer.py:264:stop] epoch=1/micro_step=164/global_step=170, RunningAvgSamplesPerSec=23.14329866214052, CurrSamplesPerSec=28.340181534100314, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
38 |
+
[2025-04-05 20:40:58,435] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
39 |
+
[2025-04-05 20:40:58,435] [INFO] [timer.py:264:stop] epoch=2/micro_step=8/global_step=180, RunningAvgSamplesPerSec=23.115090713886406, CurrSamplesPerSec=23.97360709248484, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
40 |
+
[2025-04-05 20:41:29,026] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
41 |
+
[2025-04-05 20:41:29,026] [INFO] [timer.py:264:stop] epoch=2/micro_step=28/global_step=190, RunningAvgSamplesPerSec=23.04166812821496, CurrSamplesPerSec=16.449150851444518, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
42 |
+
[2025-04-05 20:41:56,470] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
43 |
+
[2025-04-05 20:41:56,470] [INFO] [timer.py:264:stop] epoch=2/micro_step=48/global_step=200, RunningAvgSamplesPerSec=23.107848960875014, CurrSamplesPerSec=28.456930650131092, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
44 |
+
[2025-04-05 20:42:27,319] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
45 |
+
[2025-04-05 20:42:27,320] [INFO] [timer.py:264:stop] epoch=2/micro_step=68/global_step=210, RunningAvgSamplesPerSec=23.03557307846801, CurrSamplesPerSec=26.542707384497938, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
46 |
+
[2025-04-05 20:42:56,548] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
47 |
+
[2025-04-05 20:42:56,548] [INFO] [timer.py:264:stop] epoch=2/micro_step=88/global_step=220, RunningAvgSamplesPerSec=23.029435842150235, CurrSamplesPerSec=22.045011851042542, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
48 |
+
[2025-04-05 20:43:24,989] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
49 |
+
[2025-04-05 20:43:24,990] [INFO] [timer.py:264:stop] epoch=2/micro_step=108/global_step=230, RunningAvgSamplesPerSec=23.049331659158383, CurrSamplesPerSec=23.932027339346398, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
50 |
+
[2025-04-05 20:43:52,817] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
51 |
+
[2025-04-05 20:43:52,818] [INFO] [timer.py:264:stop] epoch=2/micro_step=128/global_step=240, RunningAvgSamplesPerSec=23.087664989846726, CurrSamplesPerSec=23.486799951214525, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
52 |
+
[2025-04-05 20:44:21,469] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
53 |
+
[2025-04-05 20:44:21,470] [INFO] [timer.py:264:stop] epoch=2/micro_step=148/global_step=250, RunningAvgSamplesPerSec=23.09636651082851, CurrSamplesPerSec=19.006478505688033, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
54 |
+
[2025-04-05 20:44:50,257] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=0, lr=[2e-05, 2e-05], mom=[[0.9, 0.95], [0.9, 0.95]]
|
55 |
+
[2025-04-05 20:44:50,257] [INFO] [timer.py:264:stop] epoch=2/micro_step=168/global_step=260, RunningAvgSamplesPerSec=23.10289314788288, CurrSamplesPerSec=26.13127449094928, MemAllocated=15.18GB, MaxMemAllocated=38.66GB
|
56 |
+
Saving model to "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end" ...
|
57 |
+
Saving 16-bit model...
|
58 |
+
[2025-04-05 20:45:09,932] [INFO] [logging.py:128:log_dist] [Rank 0] [Torch] Checkpoint global_step264 is about to be saved!
|
59 |
+
[2025-04-05 20:45:09,933] [INFO] [engine.py:3680:save_16bit_model] Saving model weights to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin, tag: global_step264
|
60 |
+
[2025-04-05 20:45:09,933] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin...
|
61 |
+
[2025-04-05 20:45:27,032] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/slice_end/pytorch_model.bin.
|
62 |
+
[2025-04-05 20:45:27,032] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step264 is ready now!
|
63 |
+
Model saved!
|
wandb/run-20250405_203209-jla7fqqr/files/requirements.txt
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
maskrcnn_benchmark==0.0.0
|
2 |
+
deepspeed==0.16.1
|
3 |
+
uritemplate==4.1.1
|
4 |
+
pyairports==2.1.1
|
5 |
+
partial-json-parser==0.2.1.1.post4
|
6 |
+
tensorboard-data-server==0.7.2
|
7 |
+
pydantic==2.10.3
|
8 |
+
Werkzeug==3.1.3
|
9 |
+
attrs==24.3.0
|
10 |
+
Jinja2==3.1.4
|
11 |
+
email_validator==2.2.0
|
12 |
+
mdit-py-plugins==0.4.2
|
13 |
+
google-api-python-client==2.160.0
|
14 |
+
pandas==2.2.3
|
15 |
+
safehttpx==0.1.6
|
16 |
+
setproctitle==1.3.4
|
17 |
+
dill==0.3.8
|
18 |
+
torchaudio==2.5.1
|
19 |
+
frechet-audio-distance==0.1.2
|
20 |
+
blessed==1.20.0
|
21 |
+
llvmlite==0.43.0
|
22 |
+
litellm==1.60.8
|
23 |
+
nvidia-nvtx-cu12==12.4.127
|
24 |
+
nvidia-cusolver-cu12==11.6.1.9
|
25 |
+
einops==0.8.0
|
26 |
+
datasets==3.2.0
|
27 |
+
pycountry==24.6.1
|
28 |
+
airportsdata==20250224
|
29 |
+
idna==3.10
|
30 |
+
urllib3==2.2.3
|
31 |
+
mpmath==1.3.0
|
32 |
+
wandb==0.19.1
|
33 |
+
certifi==2024.12.14
|
34 |
+
markdown-it-py==3.0.0
|
35 |
+
align-anything==0.0.1.dev0
|
36 |
+
aiohttp==3.11.10
|
37 |
+
fsspec==2024.9.0
|
38 |
+
aiohappyeyeballs==2.4.4
|
39 |
+
httplib2==0.22.0
|
40 |
+
hjson==3.1.0
|
41 |
+
yarl==1.18.3
|
42 |
+
decorator==5.1.1
|
43 |
+
distlib==0.3.9
|
44 |
+
absl-py==2.1.0
|
45 |
+
huggingface-hub==0.27.0
|
46 |
+
memray==1.15.0
|
47 |
+
Pygments==2.18.0
|
48 |
+
soupsieve==2.6
|
49 |
+
shellingham==1.5.4
|
50 |
+
tokenizers==0.21.0
|
51 |
+
uvloop==0.21.0
|
52 |
+
numpy==1.26.4
|
53 |
+
linkify-it-py==2.0.3
|
54 |
+
sympy==1.13.1
|
55 |
+
python-dotenv==1.0.1
|
56 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
57 |
+
tensorboard==2.18.0
|
58 |
+
fastrlock==0.8.3
|
59 |
+
rsa==4.9
|
60 |
+
lm-format-enforcer==0.10.9
|
61 |
+
openai==1.61.1
|
62 |
+
gpustat==1.1.1
|
63 |
+
librosa==0.10.2.post1
|
64 |
+
grpcio-status==1.70.0
|
65 |
+
nvidia-cudnn-cu12==9.1.0.70
|
66 |
+
zipp==3.21.0
|
67 |
+
nvidia-nvjitlink-cu12==12.4.127
|
68 |
+
cupy-cuda12x==13.3.0
|
69 |
+
Markdown==3.7
|
70 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
71 |
+
nvidia-curand-cu12==10.3.5.147
|
72 |
+
rpds-py==0.22.3
|
73 |
+
outlines==0.1.11
|
74 |
+
docker-pycreds==0.4.0
|
75 |
+
distro==1.9.0
|
76 |
+
httpcore==1.0.7
|
77 |
+
gradio==5.9.0
|
78 |
+
google-auth-httplib2==0.2.0
|
79 |
+
iniconfig==2.0.0
|
80 |
+
gitdb==4.0.11
|
81 |
+
jsonschema==4.23.0
|
82 |
+
click==8.1.7
|
83 |
+
ninja==1.11.1.3
|
84 |
+
setuptools==75.6.0
|
85 |
+
audioread==3.0.1
|
86 |
+
frozenlist==1.5.0
|
87 |
+
transformers-stream-generator==0.0.5
|
88 |
+
nvidia-cublas-cu12==12.4.5.8
|
89 |
+
pycparser==2.22
|
90 |
+
GitPython==3.1.43
|
91 |
+
tqdm==4.67.1
|
92 |
+
importlib_metadata==8.5.0
|
93 |
+
patsy==1.0.1
|
94 |
+
networkx==3.4.2
|
95 |
+
semantic-version==2.10.0
|
96 |
+
alpaca_eval==0.6.6
|
97 |
+
google-cloud-core==2.4.1
|
98 |
+
prometheus_client==0.21.1
|
99 |
+
jiter==0.8.2
|
100 |
+
scipy==1.14.1
|
101 |
+
starlette==0.41.3
|
102 |
+
jq==1.8.0
|
103 |
+
opencensus-context==0.1.3
|
104 |
+
cachetools==5.5.1
|
105 |
+
cffi==1.17.1
|
106 |
+
opencv-python-headless==4.10.0.84
|
107 |
+
joblib==1.4.2
|
108 |
+
yt-dlp==2025.1.26
|
109 |
+
python-dateutil==2.9.0.post0
|
110 |
+
httpx==0.28.1
|
111 |
+
msgpack==1.1.0
|
112 |
+
pydub==0.25.1
|
113 |
+
tomlkit==0.13.2
|
114 |
+
nvitop==1.4.2
|
115 |
+
nvidia-cusparse-cu12==12.3.1.170
|
116 |
+
msgspec==0.18.6
|
117 |
+
aiosignal==1.3.2
|
118 |
+
wheel==0.45.1
|
119 |
+
filelock==3.16.1
|
120 |
+
pillow==10.4.0
|
121 |
+
typer==0.15.1
|
122 |
+
websockets==14.1
|
123 |
+
resampy==0.4.3
|
124 |
+
aiofiles==23.2.1
|
125 |
+
aiohttp-cors==0.7.0
|
126 |
+
platformdirs==4.3.6
|
127 |
+
gguf==0.10.0
|
128 |
+
diskcache==5.6.3
|
129 |
+
cloudpickle==3.1.0
|
130 |
+
multidict==6.1.0
|
131 |
+
py-cpuinfo==9.0.0
|
132 |
+
scikit-learn==1.6.0
|
133 |
+
smart-open==7.1.0
|
134 |
+
tiktoken==0.7.0
|
135 |
+
grpcio==1.70.0
|
136 |
+
charset-normalizer==3.4.0
|
137 |
+
nest-asyncio==1.6.0
|
138 |
+
lark==1.2.2
|
139 |
+
beautifulsoup4==4.13.3
|
140 |
+
pip==24.3.1
|
141 |
+
six==1.17.0
|
142 |
+
prometheus-fastapi-instrumentator==7.0.0
|
143 |
+
ruff==0.8.3
|
144 |
+
rich-toolkit==0.13.2
|
145 |
+
lazy_loader==0.4
|
146 |
+
grpc-google-iam-v1==0.14.0
|
147 |
+
psutil==6.1.0
|
148 |
+
mdurl==0.1.2
|
149 |
+
nvidia-nccl-cu12==2.21.5
|
150 |
+
triton==3.1.0
|
151 |
+
torchvision==0.20.1
|
152 |
+
fastapi==0.115.6
|
153 |
+
referencing==0.35.1
|
154 |
+
xxhash==3.5.0
|
155 |
+
pyzmq==26.2.0
|
156 |
+
torchlibrosa==0.1.0
|
157 |
+
googleapis-common-protos==1.66.0
|
158 |
+
pyasn1==0.6.1
|
159 |
+
soundfile==0.12.1
|
160 |
+
pyparsing==3.2.1
|
161 |
+
xgrammar==0.1.11
|
162 |
+
gradio_client==1.5.2
|
163 |
+
watchfiles==1.0.3
|
164 |
+
pluggy==1.5.0
|
165 |
+
py-spy==0.4.0
|
166 |
+
pybind11==2.13.6
|
167 |
+
diffusers==0.31.0
|
168 |
+
sentencepiece==0.2.0
|
169 |
+
flash_attn==2.7.4.post1
|
170 |
+
annotated-types==0.7.0
|
171 |
+
interegular==0.3.3
|
172 |
+
requests==2.32.3
|
173 |
+
opencensus==0.11.4
|
174 |
+
colorful==0.5.6
|
175 |
+
google-api-core==2.24.1
|
176 |
+
pytest==8.3.4
|
177 |
+
dnspython==2.7.0
|
178 |
+
pydantic_core==2.27.1
|
179 |
+
pytz==2024.2
|
180 |
+
pyasn1_modules==0.4.1
|
181 |
+
propcache==0.2.1
|
182 |
+
accelerate==1.2.1
|
183 |
+
fire==0.7.0
|
184 |
+
textual==1.0.0
|
185 |
+
sniffio==1.3.1
|
186 |
+
pyarrow==18.1.0
|
187 |
+
protobuf==5.29.1
|
188 |
+
wcwidth==0.2.13
|
189 |
+
packaging==24.2
|
190 |
+
uvicorn==0.34.0
|
191 |
+
sentry-sdk==2.19.2
|
192 |
+
google-auth==2.38.0
|
193 |
+
typing_extensions==4.12.2
|
194 |
+
peft==0.14.0
|
195 |
+
depyf==0.18.0
|
196 |
+
multiprocess==0.70.16
|
197 |
+
google-cloud-translate==3.19.0
|
198 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
199 |
+
jsonschema-specifications==2024.10.1
|
200 |
+
vllm==0.7.3
|
201 |
+
nvidia-cufft-cu12==11.2.1.3
|
202 |
+
timm==1.0.12
|
203 |
+
rich==13.9.4
|
204 |
+
ffmpy==0.4.0
|
205 |
+
virtualenv==20.29.1
|
206 |
+
tzdata==2024.2
|
207 |
+
smmap==5.0.1
|
208 |
+
uc-micro-py==1.0.3
|
209 |
+
proto-plus==1.26.0
|
210 |
+
soxr==0.5.0.post1
|
211 |
+
h11==0.14.0
|
212 |
+
outlines_core==0.1.26
|
213 |
+
compressed-tensors==0.9.1
|
214 |
+
blake3==1.0.4
|
215 |
+
xformers==0.0.28.post3
|
216 |
+
orjson==3.10.12
|
217 |
+
ray==2.40.0
|
218 |
+
PyYAML==6.0.2
|
219 |
+
nvidia-ml-py==12.560.30
|
220 |
+
python-multipart==0.0.19
|
221 |
+
PySocks==1.7.1
|
222 |
+
regex==2024.11.6
|
223 |
+
pooch==1.8.2
|
224 |
+
termcolor==2.5.0
|
225 |
+
MarkupSafe==2.1.5
|
226 |
+
torch==2.5.1
|
227 |
+
fastapi-cli==0.0.7
|
228 |
+
gdown==5.2.0
|
229 |
+
numba==0.60.0
|
230 |
+
httptools==0.6.4
|
231 |
+
transformers==4.50.0.dev0
|
232 |
+
mistral_common==1.5.1
|
233 |
+
astor==0.8.1
|
234 |
+
anyio==4.7.0
|
235 |
+
safetensors==0.4.5
|
236 |
+
threadpoolctl==3.5.0
|
237 |
+
wrapt==1.17.2
|
238 |
+
wheel==0.43.0
|
239 |
+
jaraco.functools==4.0.1
|
240 |
+
inflect==7.3.1
|
241 |
+
jaraco.text==3.12.1
|
242 |
+
typeguard==4.3.0
|
243 |
+
jaraco.collections==5.1.0
|
244 |
+
importlib_metadata==8.0.0
|
245 |
+
backports.tarfile==1.2.0
|
246 |
+
tomli==2.0.1
|
247 |
+
autocommand==2.2.2
|
248 |
+
platformdirs==4.2.2
|
249 |
+
more-itertools==10.3.0
|
250 |
+
zipp==3.19.2
|
251 |
+
packaging==24.2
|
252 |
+
typing_extensions==4.12.2
|
253 |
+
jaraco.context==5.3.0
|
wandb/run-20250405_203209-jla7fqqr/files/wandb-metadata.json
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
3 |
+
"python": "CPython 3.11.0",
|
4 |
+
"startedAt": "2025-04-05T12:32:09.142317Z",
|
5 |
+
"args": [
|
6 |
+
"--local_rank=0",
|
7 |
+
"--model_name_or_path",
|
8 |
+
"/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct",
|
9 |
+
"--train_datasets",
|
10 |
+
"/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset",
|
11 |
+
"--train_split",
|
12 |
+
"train",
|
13 |
+
"--train_template",
|
14 |
+
"Safe_thinking",
|
15 |
+
"--output_dir",
|
16 |
+
"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
|
17 |
+
"--log_project",
|
18 |
+
"safe-o1",
|
19 |
+
"--per_device_train_batch_size",
|
20 |
+
"4",
|
21 |
+
"--per_device_eval_batch_size",
|
22 |
+
"4",
|
23 |
+
"--gradient_accumulation_steps",
|
24 |
+
"2",
|
25 |
+
"--learning_rate",
|
26 |
+
"2e-5",
|
27 |
+
"--epochs",
|
28 |
+
"3",
|
29 |
+
"--model_max_length",
|
30 |
+
"16384"
|
31 |
+
],
|
32 |
+
"program": "-m align_anything.trainers.text_to_text.sft",
|
33 |
+
"git": {
|
34 |
+
"remote": "[email protected]:PKU-Alignment/align-anything.git",
|
35 |
+
"commit": "7ee46f54200d18e3c6c37568ba688d3be5ae7619"
|
36 |
+
},
|
37 |
+
"email": "[email protected]",
|
38 |
+
"root": "/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking",
|
39 |
+
"host": "dgx-091",
|
40 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/wenqi_qwen2vl/bin/python3.11",
|
41 |
+
"cpu_count": 112,
|
42 |
+
"cpu_count_logical": 224,
|
43 |
+
"gpu": "NVIDIA H800",
|
44 |
+
"gpu_count": 8,
|
45 |
+
"disk": {
|
46 |
+
"/": {
|
47 |
+
"total": "1888556142592",
|
48 |
+
"used": "1056715956224"
|
49 |
+
}
|
50 |
+
},
|
51 |
+
"memory": {
|
52 |
+
"total": "2164195573760"
|
53 |
+
},
|
54 |
+
"cpu": {
|
55 |
+
"count": 112,
|
56 |
+
"countLogical": 224
|
57 |
+
},
|
58 |
+
"gpu_nvidia": [
|
59 |
+
{
|
60 |
+
"name": "NVIDIA H800",
|
61 |
+
"memoryTotal": "85520809984",
|
62 |
+
"cudaCores": 16896,
|
63 |
+
"architecture": "Hopper"
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"name": "NVIDIA H800",
|
67 |
+
"memoryTotal": "85520809984",
|
68 |
+
"cudaCores": 16896,
|
69 |
+
"architecture": "Hopper"
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"name": "NVIDIA H800",
|
73 |
+
"memoryTotal": "85520809984",
|
74 |
+
"cudaCores": 16896,
|
75 |
+
"architecture": "Hopper"
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"name": "NVIDIA H800",
|
79 |
+
"memoryTotal": "85520809984",
|
80 |
+
"cudaCores": 16896,
|
81 |
+
"architecture": "Hopper"
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"name": "NVIDIA H800",
|
85 |
+
"memoryTotal": "85520809984",
|
86 |
+
"cudaCores": 16896,
|
87 |
+
"architecture": "Hopper"
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"name": "NVIDIA H800",
|
91 |
+
"memoryTotal": "85520809984",
|
92 |
+
"cudaCores": 16896,
|
93 |
+
"architecture": "Hopper"
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"name": "NVIDIA H800",
|
97 |
+
"memoryTotal": "85520809984",
|
98 |
+
"cudaCores": 16896,
|
99 |
+
"architecture": "Hopper"
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"name": "NVIDIA H800",
|
103 |
+
"memoryTotal": "85520809984",
|
104 |
+
"cudaCores": 16896,
|
105 |
+
"architecture": "Hopper"
|
106 |
+
}
|
107 |
+
],
|
108 |
+
"slurm": {
|
109 |
+
"conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
|
110 |
+
},
|
111 |
+
"cudaVersion": "12.2"
|
112 |
+
}
|
wandb/run-20250405_203209-jla7fqqr/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"train/epoch":3,"_wandb":{"runtime":797},"_timestamp":1.7438571013796113e+09,"_runtime":797.934275661,"_step":528,"train/step":528,"train/loss":0.45351502299308777,"train/lr":2e-05}
|
wandb/run-20250405_203209-jla7fqqr/logs/debug-core.log
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-05T20:32:08.551820206+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp_xy7jfsn/port-2888806.txt","pid":2888806,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2025-04-05T20:32:08.551894655+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2025-04-05T20:32:08.552966504+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":2888806}
|
4 |
+
{"time":"2025-04-05T20:32:08.552967717+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44473,"Zone":""}}
|
5 |
+
{"time":"2025-04-05T20:32:08.707940509+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34916"}
|
6 |
+
{"time":"2025-04-05T20:32:09.143458736+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"jla7fqqr","id":"127.0.0.1:34916"}
|
7 |
+
{"time":"2025-04-05T20:32:09.359706562+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"jla7fqqr","id":"127.0.0.1:34916"}
|
8 |
+
{"time":"2025-04-05T20:45:29.439953745+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"jla7fqqr","id":"127.0.0.1:34916"}
|
9 |
+
{"time":"2025-04-05T20:45:29.440386793+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"jla7fqqr","id":"127.0.0.1:34916"}
|
10 |
+
{"time":"2025-04-05T20:45:29.484536234+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:34916"}
|
11 |
+
{"time":"2025-04-05T20:45:29.484552529+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:34916"}
|
12 |
+
{"time":"2025-04-05T20:45:29.484586228+08:00","level":"INFO","msg":"server is shutting down"}
|
13 |
+
{"time":"2025-04-05T20:45:29.484607146+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:34916"}
|
14 |
+
{"time":"2025-04-05T20:45:29.48465299+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:34916"}
|
15 |
+
{"time":"2025-04-05T20:45:29.48466913+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:34916"}
|
16 |
+
{"time":"2025-04-05T20:45:29.484674792+08:00","level":"INFO","msg":"server is closed"}
|
wandb/run-20250405_203209-jla7fqqr/logs/debug-internal.log
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2025-04-05T20:32:09.145104832+08:00","level":"INFO","msg":"using version","core version":"0.19.1"}
|
2 |
+
{"time":"2025-04-05T20:32:09.145234633+08:00","level":"INFO","msg":"created symlink","path":"/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug-core.log"}
|
3 |
+
{"time":"2025-04-05T20:32:09.359661673+08:00","level":"INFO","msg":"created new stream","id":"jla7fqqr"}
|
4 |
+
{"time":"2025-04-05T20:32:09.359700555+08:00","level":"INFO","msg":"stream: started","id":"jla7fqqr"}
|
5 |
+
{"time":"2025-04-05T20:32:09.35975566+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"jla7fqqr"}
|
6 |
+
{"time":"2025-04-05T20:32:09.359831663+08:00","level":"INFO","msg":"handler: started","stream_id":"jla7fqqr"}
|
7 |
+
{"time":"2025-04-05T20:32:09.35975831+08:00","level":"INFO","msg":"sender: started","stream_id":"jla7fqqr"}
|
8 |
+
{"time":"2025-04-05T20:32:09.688023993+08:00","level":"INFO","msg":"Starting system monitor"}
|
9 |
+
{"time":"2025-04-05T20:45:27.076637312+08:00","level":"INFO","msg":"Stopping system monitor"}
|
10 |
+
{"time":"2025-04-05T20:45:27.077489476+08:00","level":"INFO","msg":"Stopped system monitor"}
|
11 |
+
{"time":"2025-04-05T20:45:28.038487853+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading config.yaml","runtime_seconds":0.283111243,"progress":"2.7KB/2.7KB"},{"desc":"uploading output.log","runtime_seconds":0.283100079,"progress":"10.8KB/10.8KB"}],"total_operations":2}}
|
12 |
+
{"time":"2025-04-05T20:45:28.204441985+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
13 |
+
{"time":"2025-04-05T20:45:29.440112056+08:00","level":"INFO","msg":"stream: closing","id":"jla7fqqr"}
|
14 |
+
{"time":"2025-04-05T20:45:29.440138846+08:00","level":"INFO","msg":"handler: closed","stream_id":"jla7fqqr"}
|
15 |
+
{"time":"2025-04-05T20:45:29.440146259+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"jla7fqqr"}
|
16 |
+
{"time":"2025-04-05T20:45:29.440285075+08:00","level":"INFO","msg":"sender: closed","stream_id":"jla7fqqr"}
|
17 |
+
{"time":"2025-04-05T20:45:29.440378039+08:00","level":"INFO","msg":"stream: closed","id":"jla7fqqr"}
|
wandb/run-20250405_203209-jla7fqqr/logs/debug.log
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2025-04-05 20:32:09,135 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Current SDK version is 0.19.1
|
2 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Configure stats pid to 2888806
|
3 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from /home/yangyaodong/.config/wandb/settings
|
4 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from /aifs4su/yaodong/wenqi/projects/align-anything_0218/align-anything/scripts/wandb/settings
|
5 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_setup.py:_flush():68] Loading settings from environment variables
|
6 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:_log_setup():528] Logging user logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug.log
|
7 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:_log_setup():529] Logging internal logs to /aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking/wandb/run-20250405_203209-jla7fqqr/logs/debug-internal.log
|
8 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():644] calling init triggers
|
9 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():650] wandb.init called with sweep_config: {}
|
10 |
+
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 2, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'constant', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'steps', 'eval_interval': 10, 'max_grad_norm': 1.0}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/wenqi/projects/first-time_safety/data_annotation/data_output/safe-o1_0403/baseline_dataset', 'train_template': 'Safe_thinking', 'train_size': {}, 'train_split': 'train', 'train_name': {}, 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'safe-o1', 'log_run_name': 'sft', 'output_dir': '/aifs4su/yaodong/wenqi/projects/first-time_safety/output_models/Qwen2.5-7B-Instruct_safe_thinking', 'cache_dir': {}, 'save_interval': 100000}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/wenqi/models/Qwen2.5-7B-Instruct', 'trust_remote_code': True, 'model_max_length': 16384}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': {}}
|
11 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():680] starting backend
|
12 |
+
2025-04-05 20:32:09,136 INFO MainThread:2888806 [wandb_init.py:init():684] sending inform_init request
|
13 |
+
2025-04-05 20:32:09,141 INFO MainThread:2888806 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
14 |
+
2025-04-05 20:32:09,142 INFO MainThread:2888806 [wandb_init.py:init():697] backend started and connected
|
15 |
+
2025-04-05 20:32:09,143 INFO MainThread:2888806 [wandb_init.py:init():790] updated telemetry
|
16 |
+
2025-04-05 20:32:09,162 INFO MainThread:2888806 [wandb_init.py:init():822] communicating run to backend with 90.0 second timeout
|
17 |
+
2025-04-05 20:32:09,682 INFO MainThread:2888806 [wandb_init.py:init():874] starting run threads in backend
|
18 |
+
2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_console_start():2374] atexit reg
|
19 |
+
2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2224] redirect: wrap_raw
|
20 |
+
2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2289] Wrapping output streams.
|
21 |
+
2025-04-05 20:32:10,106 INFO MainThread:2888806 [wandb_run.py:_redirect():2314] Redirects installed.
|
22 |
+
2025-04-05 20:32:10,112 INFO MainThread:2888806 [wandb_init.py:init():916] run started, returning control to user process
|
23 |
+
2025-04-05 20:45:27,036 INFO MainThread:2888806 [wandb_run.py:_finish():2100] finishing run day-one/safe-o1/jla7fqqr
|
24 |
+
2025-04-05 20:45:27,036 INFO MainThread:2888806 [wandb_run.py:_atexit_cleanup():2339] got exitcode: 0
|
25 |
+
2025-04-05 20:45:27,037 INFO MainThread:2888806 [wandb_run.py:_restore():2321] restore
|
26 |
+
2025-04-05 20:45:27,037 INFO MainThread:2888806 [wandb_run.py:_restore():2327] restore done
|
27 |
+
2025-04-05 20:45:29,432 INFO MainThread:2888806 [wandb_run.py:_footer_history_summary_info():3892] rendering history
|
28 |
+
2025-04-05 20:45:29,433 INFO MainThread:2888806 [wandb_run.py:_footer_history_summary_info():3924] rendering summary
|
29 |
+
2025-04-05 20:45:29,439 INFO MainThread:2888806 [wandb_run.py:_footer_sync_info():3853] logging synced files
|
wandb/run-20250405_203209-jla7fqqr/run-jla7fqqr.wandb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0312a8142b984ed4f9135492bf56d58d8037b91a040f35ff38db29091f30341f
|
3 |
+
size 1580734
|