fix run_benchmark for llama2_70b in auto_parallel (PaddlePaddle#8484)
* remove tsinghua pypi

* modify gpt dataset addr for benchmark

* fix run_benchmark for llama2_70b in auto_parallel
fightfat authored May 24, 2024
1 parent 0cd8fe7 commit a90f163
Showing 1 changed file with 6 additions and 2 deletions.
@@ -74,7 +74,11 @@ function _train(){
         add_options=""
         log_file=${train_log_file}
     fi
 
+    # The 70b model needs this switch disabled, otherwise the run hangs
+    if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
+        unset CUDA_DEVICE_MAX_CONNECTIONS
+    fi
     # Disable for hanging bug
     # if [ "${tensor_parallel_degree}" != "1" ]; then
     #     export CUDA_DEVICE_MAX_CONNECTIONS=1
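For context, the new guard relies on bash's substring regex match: with a quoted right-hand side, [[ "${MODEL_TYPE}" =~ "70b" ]] is true for any model string containing "70b", such as llama2_70b. A minimal standalone sketch of the same pattern (the MODEL_TYPE value here is illustrative; in the benchmark harness it comes from the run config):

#!/usr/bin/env bash
# Illustrative value; the harness sets MODEL_TYPE per benchmark config.
MODEL_TYPE="llama2_70b"

# Commonly set to 1 in tensor-parallel runs to keep kernel launch order deterministic.
export CUDA_DEVICE_MAX_CONNECTIONS=1

# A quoted right-hand side makes =~ a literal substring match, so any "*70b*" model qualifies.
if [[ "${MODEL_TYPE}" =~ "70b" ]]; then
    # Per the commit, the 70b auto-parallel run hangs with the cap in place, so remove it.
    unset CUDA_DEVICE_MAX_CONNECTIONS
fi

echo "CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-unset}"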
@@ -136,7 +140,7 @@ function _train(){
     rm -rf mylog && rm -rf checkpoints
 
     echo "train_cmd: ${train_cmd} log_file: ${log_file}"
-    timeout 15m ${train_cmd} > ${log_file} 2>&1
+    timeout 40m ${train_cmd} > ${log_file} 2>&1
 
     if [ $? -ne 0 ];then
         echo -e "${model_name}, FAIL"
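The second hunk only widens the timeout(1) watchdog from 15 to 40 minutes so the larger 70b run has time to finish. A sketch of the surrounding run-and-check pattern, with hypothetical stand-ins for the harness variables train_cmd, log_file, and model_name:

#!/usr/bin/env bash
# Hypothetical stand-ins; the harness assembles these from its config.
train_cmd="sleep 5"          # placeholder for the real distributed launch command
log_file="./train.log"
model_name="llama2_70b"

echo "train_cmd: ${train_cmd} log_file: ${log_file}"
# timeout(1) kills the command when the limit expires and exits with status 124;
# otherwise it passes through the command's own exit status.
timeout 40m ${train_cmd} > ${log_file} 2>&1
status=$?

if [ ${status} -ne 0 ]; then
    # Status 124 means the watchdog fired; anything else is a training failure.
    echo -e "${model_name}, FAIL (exit ${status})"
else
    echo -e "${model_name}, SUCCESS"
fi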
