fix run_benchmark for llama2_70b in auto_parallel (PaddlePaddle#8484)
* remove tsinghua pypi

* modify gpt dataset addr for benchmark

* fix run_benchmark for llama2_70b in auto_parallel
fightfat authored May 24, 2024
1 parent 0cd8fe7 commit a90f163
Showing 1 changed file with 6 additions and 2 deletions.
@@ -74,7 +74,11 @@ function _train(){
         add_options=""
         log_file=${train_log_file}
     fi
 
+    # The 70b model needs this switch disabled, otherwise the run hangs
+    if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
+        unset CUDA_DEVICE_MAX_CONNECTIONS
+    fi
     # Disable for hanging bug
     # if [ "${tensor_parallel_degree}" != "1" ]; then
     #     export CUDA_DEVICE_MAX_CONNECTIONS=1
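For context, the new guard relies on bash's substring regex match: with a quoted right-hand side, [[ "${MODEL_TYPE}" =~ "70b" ]] is true for any model string containing "70b", such as llama2_70b. A minimal standalone sketch of the same pattern (the MODEL_TYPE value here is illustrative; in the benchmark harness it comes from the run config):

#!/usr/bin/env bash
# Illustrative value; the harness sets MODEL_TYPE per benchmark config.
MODEL_TYPE="llama2_70b"

# Commonly set to 1 in tensor-parallel runs to keep kernel launch order deterministic.
export CUDA_DEVICE_MAX_CONNECTIONS=1

# A quoted right-hand side makes =~ a literal substring match, so any "*70b*" model qualifies.
if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
    # Per the commit, the 70b auto-parallel run hangs with the cap in place, so remove it.
    unset CUDA_DEVICE_MAX_CONNECTIONS
fi

echo "CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-unset}"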
@@ -136,7 +140,7 @@ function _train(){
     rm -rf mylog && rm -rf checkpoints
 
     echo "train_cmd: ${train_cmd} log_file: ${log_file}"
-    timeout 15m ${train_cmd} > ${log_file} 2>&1
+    timeout 40m ${train_cmd} > ${log_file} 2>&1
 
     if [ $? -ne 0 ];then
         echo -e "${model_name}, FAIL"
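The second hunk only widens the timeout(1) watchdog from 15 to 40 minutes so the larger 70b run has time to finish. A sketch of the surrounding run-and-check pattern, with hypothetical stand-ins for the harness variables train_cmd, log_file, and model_name:

#!/usr/bin/env bash
# Hypothetical stand-ins; the harness assembles these from its config.
train_cmd="sleep 5"          # placeholder for the real distributed launch command
log_file="./train.log"
model_name="llama2_70b"

echo "train_cmd: ${train_cmd} log_file: ${log_file}"
# timeout(1) kills the command when the limit expires and exits with status 124;
# otherwise it passes through the command's own exit status.
timeout 40m ${train_cmd} > ${log_file} 2>&1
status=$?

if [ ${status} -ne 0 ]; then
    # Status 124 means the watchdog fired; anything else is a training failure.
    echo -e "${model_name}, FAIL (exit ${status})"
else
    echo -e "${model_name}, SUCCESS"
fi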
