File size: 17,849 Bytes
113dbd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
v-haipe+   551 16041 99 08:16 pts/2    00:00:17 python LiLa/gsm8k_cluster.py
v-haipe+  9211 10235  3 Sep24 pts/10   00:32:12 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 0 --end 2000
v-haipe+  9288 10459  3 Sep24 pts/11   00:28:30 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 2000 --end 4000
v-haipe+  9310 10667  3 Sep24 pts/12   00:27:45 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 4000 --end 6000
v-haipe+  9341 10865  3 Sep24 pts/13   00:26:50 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 6000 --end 8000
v-haipe+  9379 25248  3 Sep24 pts/16   00:27:01 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 8000 --end 10000
v-haipe+  9410 25467  3 Sep24 pts/17   00:27:17 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 10000 --end 12000
v-haipe+  9438 26561  3 Sep24 pts/19   00:27:17 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 12000 --end 14000
v-haipe+  9469 26761  3 Sep24 pts/20   00:26:55 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 14000 --end 16000
v-haipe+  9500 26968  3 Sep24 pts/21   00:27:09 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 16000 --end 18000
v-haipe+  9531 27172  3 Sep24 pts/22   00:29:29 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 18000 --end 20000
v-haipe+  9775  9560  3 Sep24 pts/29   00:30:29 python LiLa/chatgpt_evol_lila_gsm8k_domain.py --start 20000 --end 22000
v-haipe+ 11262 24577  0 Sep23 pts/8    00:00:06 python app.py
v-haipe+ 11300 11262  0 Sep23 pts/8    00:20:54 /home/v-haipengluo/.conda/envs/wizardweb/bin/python /workspaceblobstore/qins/test/20220316/kai/research/code_repo/wizard_verse/code_repo/server_code/wizard_verse/lm/server_lm/app.py
v-haipe+ 11604 20782 98 Sep23 pts/4    2-00:06:57 python -m vllm.entrypoints.api_server --model /workspaceblobstore/caxu/trained_models/13Bv2_497kcontinueroleplay_dsys_2048_e4_2e_5/checkpoint-75 --host phlrr3006.guest.corp.microsoft.com --port 7991
v-haipe+ 13722 22601  0 Sep24 pts/6    00:09:37 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13830 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13834 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13837 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13839 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13841 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13843 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13845 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13847 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13849 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13851 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13853 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13855 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13857 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13859 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13861 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13863 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13865 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13867 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13869 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13871 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13873 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13875 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13877 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13879 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13881 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13883 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13885 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 13887 13722  0 Sep24 pts/6    00:00:05 /home/v-haipengluo/.conda/envs/sdxl/bin/python /home/v-haipengluo/.conda/envs/sdxl/bin/uvicorn --host=phlrr3006.guest.corp.microsoft.com --port 7999 --workers 1 --backlog 1 --limit-concurrency 4 main_v3:app
v-haipe+ 18319 15852  0 05:34 pts/1    00:00:03 /home/v-haipengluo/.conda/envs/llamax/bin/python /home/v-haipengluo/.conda/envs/llamax/bin/deepspeed --master_port 29500 --hostfile=hostfile --include=localhost:1,3,4,5,6,7 src/train.py --model_name_or_path /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_stackexchange_MATH_12w_sample_5w_score0.5_trainset_2e-5/checkpoint-992 --data_path /workspaceblobstore/qins/test/20220316/haipeng/data/Math_datasets/MATH_the_answer_is_format/hendrycks_math_7500_ori_gpt4_ori_15k.json --output_dir /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_continue_train_stackMATH5w_checkpoint992_hendrycks_math_7500_ori_gpt4_ori_15k --num_train_epochs 3 --model_max_length 1150 --per_device_train_batch_size 17 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 36 --save_total_limit 200 --learning_rate 2e-5 --warmup_steps 10 --logging_steps 2 --lr_scheduler_type cosine --report_to tensorboard --gradient_checkpointing True --deepspeed src/configs/deepspeed_config.json --fp16 True
v-haipe+ 18333 18319  0 05:34 pts/1    00:00:03 /home/v-haipengluo/.conda/envs/llamax/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMSwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None src/train.py --model_name_or_path /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_stackexchange_MATH_12w_sample_5w_score0.5_trainset_2e-5/checkpoint-992 --data_path /workspaceblobstore/qins/test/20220316/haipeng/data/Math_datasets/MATH_the_answer_is_format/hendrycks_math_7500_ori_gpt4_ori_15k.json --output_dir /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_continue_train_stackMATH5w_checkpoint992_hendrycks_math_7500_ori_gpt4_ori_15k --num_train_epochs 3 --model_max_length 1150 --per_device_train_batch_size 17 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 36 --save_total_limit 200 --learning_rate 2e-5 --warmup_steps 10 --logging_steps 2 --lr_scheduler_type cosine --report_to tensorboard --gradient_checkpointing True --deepspeed src/configs/deepspeed_config.json --fp16 True
v-haipe+ 18346 18333 99 05:34 pts/1    03:20:42 /home/v-haipengluo/.conda/envs/llamax/bin/python -u src/train.py --local_rank=0 --model_name_or_path /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_stackexchange_MATH_12w_sample_5w_score0.5_trainset_2e-5/checkpoint-992 --data_path /workspaceblobstore/qins/test/20220316/haipeng/data/Math_datasets/MATH_the_answer_is_format/hendrycks_math_7500_ori_gpt4_ori_15k.json --output_dir /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_continue_train_stackMATH5w_checkpoint992_hendrycks_math_7500_ori_gpt4_ori_15k --num_train_epochs 3 --model_max_length 1150 --per_device_train_batch_size 17 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 36 --save_total_limit 200 --learning_rate 2e-5 --warmup_steps 10 --logging_steps 2 --lr_scheduler_type cosine --report_to tensorboard --gradient_checkpointing True --deepspeed src/configs/deepspeed_config.json --fp16 True
v-haipe+ 18347 18333 99 05:34 pts/1    03:40:59 /home/v-haipengluo/.conda/envs/llamax/bin/python -u src/train.py --local_rank=1 --model_name_or_path /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_stackexchange_MATH_12w_sample_5w_score0.5_trainset_2e-5/checkpoint-992 --data_path /workspaceblobstore/qins/test/20220316/haipeng/data/Math_datasets/MATH_the_answer_is_format/hendrycks_math_7500_ori_gpt4_ori_15k.json --output_dir /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_continue_train_stackMATH5w_checkpoint992_hendrycks_math_7500_ori_gpt4_ori_15k --num_train_epochs 3 --model_max_length 1150 --per_device_train_batch_size 17 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 36 --save_total_limit 200 --learning_rate 2e-5 --warmup_steps 10 --logging_steps 2 --lr_scheduler_type cosine --report_to tensorboard --gradient_checkpointing True --deepspeed src/configs/deepspeed_config.json --fp16 True
v-haipe+ 18348 18333 99 05:34 pts/1    03:44:08 /home/v-haipengluo/.conda/envs/llamax/bin/python -u src/train.py --local_rank=2 --model_name_or_path /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_stackexchange_MATH_12w_sample_5w_score0.5_trainset_2e-5/checkpoint-992 --data_path /workspaceblobstore/qins/test/20220316/haipeng/data/Math_datasets/MATH_the_answer_is_format/hendrycks_math_7500_ori_gpt4_ori_15k.json --output_dir /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_continue_train_stackMATH5w_checkpoint992_hendrycks_math_7500_ori_gpt4_ori_15k --num_train_epochs 3 --model_max_length 1150 --per_device_train_batch_size 17 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 36 --save_total_limit 200 --learning_rate 2e-5 --warmup_steps 10 --logging_steps 2 --lr_scheduler_type cosine --report_to tensorboard --gradient_checkpointing True --deepspeed src/configs/deepspeed_config.json --fp16 True
v-haipe+ 18349 18333 99 05:34 pts/1    03:32:51 /home/v-haipengluo/.conda/envs/llamax/bin/python -u src/train.py --local_rank=3 --model_name_or_path /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_stackexchange_MATH_12w_sample_5w_score0.5_trainset_2e-5/checkpoint-992 --data_path /workspaceblobstore/qins/test/20220316/haipeng/data/Math_datasets/MATH_the_answer_is_format/hendrycks_math_7500_ori_gpt4_ori_15k.json --output_dir /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_continue_train_stackMATH5w_checkpoint992_hendrycks_math_7500_ori_gpt4_ori_15k --num_train_epochs 3 --model_max_length 1150 --per_device_train_batch_size 17 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 36 --save_total_limit 200 --learning_rate 2e-5 --warmup_steps 10 --logging_steps 2 --lr_scheduler_type cosine --report_to tensorboard --gradient_checkpointing True --deepspeed src/configs/deepspeed_config.json --fp16 True
v-haipe+ 18350 18333 99 05:34 pts/1    03:41:16 /home/v-haipengluo/.conda/envs/llamax/bin/python -u src/train.py --local_rank=4 --model_name_or_path /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_stackexchange_MATH_12w_sample_5w_score0.5_trainset_2e-5/checkpoint-992 --data_path /workspaceblobstore/qins/test/20220316/haipeng/data/Math_datasets/MATH_the_answer_is_format/hendrycks_math_7500_ori_gpt4_ori_15k.json --output_dir /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_continue_train_stackMATH5w_checkpoint992_hendrycks_math_7500_ori_gpt4_ori_15k --num_train_epochs 3 --model_max_length 1150 --per_device_train_batch_size 17 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 36 --save_total_limit 200 --learning_rate 2e-5 --warmup_steps 10 --logging_steps 2 --lr_scheduler_type cosine --report_to tensorboard --gradient_checkpointing True --deepspeed src/configs/deepspeed_config.json --fp16 True
v-haipe+ 18351 18333 99 05:34 pts/1    03:42:27 /home/v-haipengluo/.conda/envs/llamax/bin/python -u src/train.py --local_rank=5 --model_name_or_path /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_stackexchange_MATH_12w_sample_5w_score0.5_trainset_2e-5/checkpoint-992 --data_path /workspaceblobstore/qins/test/20220316/haipeng/data/Math_datasets/MATH_the_answer_is_format/hendrycks_math_7500_ori_gpt4_ori_15k.json --output_dir /workspaceblobstore/qins/test/20220316/haipeng/output_weights/llamax_13b_continue_train_stackMATH5w_checkpoint992_hendrycks_math_7500_ori_gpt4_ori_15k --num_train_epochs 3 --model_max_length 1150 --per_device_train_batch_size 17 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 36 --save_total_limit 200 --learning_rate 2e-5 --warmup_steps 10 --logging_steps 2 --lr_scheduler_type cosine --report_to tensorboard --gradient_checkpointing True --deepspeed src/configs/deepspeed_config.json --fp16 True
v-haipe+ 24334 23818  0 Sep23 pts/7    00:00:25 python -m http.server