Add more options to inference benchmark (#2325)
mrwyattii authored Sep 16, 2022
1 parent cf638be commit 1592381
Showing 4 changed files with 226 additions and 31 deletions.
135 changes: 135 additions & 0 deletions benchmarks/inference/collect_results.py
@@ -0,0 +1,135 @@
import os
import re
import argparse
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument(
"--results-dir",
"-r",
type=str,
default="./results",
help="directory containing sweep results",
)
parser.add_argument("--version",
"-v",
type=int,
default=0,
help="version to be collected")
parser.add_argument("--gen-text-n",
"-n",
type=int,
default=1,
help="expected number of generated text")
parser.add_argument("--output",
"-o",
type=str,
default="./results.csv",
help="output file")
args = parser.parse_args()


def get_branch(file_path):
match = re.match(r".*\/(.*)\.log", file_path)
if match is None:
return False
else:
return match.groups()[0]


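# Log paths follow the naming produced by run_model.sh, e.g.:
#   ./results/gpt2_fp16_true_false_1gpus_v0/baseline.log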
def get_benchmark_params(root_dir, file_path):
match = re.match(
rf"{root_dir}\/(.+?)_(fp\d+)_(true|false)_(true|false)_(\d+)gpus_v(\d+)\/",
file_path,
)
if match is None:
return False
else:
model, dtype, graphs, kernel, gpus, version = match.groups()
bool_dict = {"true": True, "false": False}
return {
"model": model,
"dtype": dtype,
"graphs": bool_dict[graphs.lower()],
"kernel": bool_dict[kernel.lower()],
"gpus": int(gpus),
"version": int(version),
}


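# Latency lines in the logs are expected to look like "<name> Latency: <value> ms",
# as emitted by print_latency in gpt-bench.py.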
def get_perf_data(file_content):
matches = re.findall(r"\s+(.+?)\sLatency:\s+(\d+\.\d+)\sms", file_content)
    if not matches:
return False
else:
return {f"latency-{key}": float(val) for key, val in matches}


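# Generated text is printed by gpt-bench.py as "RESPONSE <n>:" followed by the
# response wrapped between two lines of 30 dashes.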
def get_generated_text(file_content, gen_text_n):
file_content = file_content.replace("\n", " ")
file_content = file_content.replace("\t", " ")
matches = re.findall(r"RESPONSE\s(\d+):\s+[-]{30}\s+(.+?)\s+[-]{30}", file_content)
if len(matches) != gen_text_n:
return False
else:
return {f"generated-text-{key}": val for key, val in matches}


if __name__ == "__main__":
# List to collect data from all benchmarks
benchmarks_data = []

# Walk through directory of results from sweep.sh
for root, dirs, files in os.walk(args.results_dir):
# Because of how some models are named, the dir structure for results can vary, e.g.:
# "EleutherAI/gpt-neo_*/baseline.log" versus "gpt2_*/baseline.log"
if dirs:
continue

# Get data from baseline and each tested branch
for name in files:
file_path = os.path.join(root, name)

branch = get_branch(file_path)
if not branch:
print(f"WARNING: Could not detect branch for file {file_path}, skipping")
continue

params = get_benchmark_params(args.results_dir, file_path)
if not params:
print(
f"WARNING: Could not detect benchmark settings for file {file_path}, skipping"
)
continue

# Verify that the version matches that which we want to collect
if params["version"] != args.version:
continue

with open(file_path, "r") as f:
file_content = f.read()

perf_data = get_perf_data(file_content)
if not perf_data:
print(
f"WARNING: Could not detect benchmark performance data for file {file_path}, skipping"
)
continue

generated_text = get_generated_text(file_content, args.gen_text_n)
if not generated_text:
print(
f"WARNING: Could not detect generated text for file {file_path}, skipping"
)
continue

benchmarks_data.append({
"branch": branch,
**params,
**perf_data,
**generated_text
})

# Convert to a DataFrame and save
benchmarks_df = pd.DataFrame(benchmarks_data)
benchmarks_df.to_csv(args.output)
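
A hypothetical invocation of the collection script, assuming a sweep has already written logs under ./results (the paths and values shown are placeholders):

    python collect_results.py --results-dir ./results --version 0 --gen-text-n 1 --output ./results.csv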
59 changes: 46 additions & 13 deletions benchmarks/inference/gpt-bench.py
@@ -1,3 +1,4 @@
import os
import torch
import time
import deepspeed
@@ -7,9 +8,26 @@
parser = argparse.ArgumentParser()
parser.add_argument("--model", "-m", type=str, help="hf model name")
parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32")
parser.add_argument("--dtype",
type=str,
default="fp16",
choices=["fp16",
"fp32",
"int8"],
help="int8, fp16, or fp32")
parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
parser.add_argument("--local_rank", type=int, default=0, help="local rank")
parser.add_argument("--local_rank",
type=int,
default=int(os.getenv("LOCAL_RANK",
"0")),
help="local rank")
parser.add_argument("--world_size",
type=int,
default=int(os.getenv("WORLD_SIZE",
"1")),
help="world size")
parser.add_argument("--trials", type=int, default=30, help="number of trials")
args = parser.parse_args()

@@ -44,9 +62,17 @@ def print_latency(latency_set, title, warmup=3):

deepspeed.init_distributed("nccl")

print(args.model, args.max_tokens, args.dtype)
if args.local_rank == 0:
print("BENCHMARK SETTINGS:")
print(f"\tMODEL: {args.model}")
print(f"\tMAX_TOKENS: {args.max_tokens}")
print(f"\tDTYPE: {args.dtype}")
print(f"\tCUDA_GRAPHS: {args.graphs}")
print(f"\tKERNEL_INJECT: {args.kernel_inject}")

if args.dtype.lower() == "fp16":
if args.dtype == "int8":
dtype = torch.int8
elif args.dtype == "fp16":
dtype = torch.float16
else:
dtype = torch.float32
@@ -56,26 +82,33 @@ def print_latency(latency_set, title, warmup=3):
framework="pt",
device=args.local_rank)

if dtype == torch.half:
if dtype == torch.float16:
pipe.model.half()

if args.deepspeed:
pipe.model = deepspeed.init_inference(pipe.model,
dtype=dtype,
replace_with_kernel_inject=True,
replace_method='auto')
pipe.model = deepspeed.init_inference(
pipe.model,
dtype=dtype,
mp_size=args.world_size,
replace_with_kernel_inject=args.kernel_inject,
replace_method="auto",
enable_cuda_graph=args.graphs,
)

responses = []
times = []
for i in range(args.trials):
torch.cuda.synchronize()
start = time.time()
r = pipe("DeepSpeed is", max_new_tokens=args.max_tokens)
r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens)
torch.cuda.synchronize()
end = time.time()
responses.append(r)
times.append((end - start) / (args.max_tokens - 3))

print_latency(times, "token latency")

print(responses[0:3])
if args.local_rank == 0:
print_latency(times, "token latency")
print(f"RESPONSE 0:")
print("-" * 30)
print(responses[0][0]["generated_text"])
print("-" * 30)
20 changes: 16 additions & 4 deletions benchmarks/inference/run_model.sh
@@ -3,22 +3,34 @@ set -ex
model=$1
branch1=$2
branch2=$3
dtype=$4
graphs=$5
kernel=$6
gpus=$7

version=0
log_path=results/${model}_v${version}
log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version}
mkdir -p ${log_path}

params="--dtype $dtype "
if [[ "$graphs" == "true" ]]; then
params+="--graphs "
fi
if [[ "$kernel" == "true" ]]; then
params+="--kernel "
fi

echo "baseline $log_path"
deepspeed --num_gpus 1 gpt-bench.py -m "${model}" &> ${log_path}/baseline.log
deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log

cd ../../
git checkout ${branch1}
cd -
echo "ds ${branch1} $log_path"
deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch1}.log
deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch1}.log

cd ../../
git checkout ${branch2}
cd -
echo "ds ${branch2} $log_path"
deepspeed --num_gpus 1 gpt-bench.py --deepspeed -m "${model}" &> ${log_path}/ds-${branch2}.log
deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch2}.log
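
For reference, a hypothetical comparison run with the new positional arguments (model, branch1, branch2, dtype, graphs, kernel, gpus; the branch names are placeholders):

    bash run_model.sh gpt2 master my-feature-branch fp16 true false 1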
43 changes: 29 additions & 14 deletions benchmarks/inference/sweep.sh
@@ -5,22 +5,37 @@ export TRANSFORMERS_CACHE=/tmp/hf-cache
branch1=$1
branch2=$2

for m in `echo "EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M"`; do
bash run_model.sh $m $branch1 $branch2
done
gptneo_models="EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M"
gpt2_models="gpt2 gpt2-large gpt2-xl"
gptj_models="EleutherAI/gpt-j-6B"
opt_models="facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b"
bloom_models="bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1"

for m in `echo "gpt2 gpt2-large gpt2-xl"`; do
bash run_model.sh $m $branch1 $branch2
done
for gpus in `echo "1 2 4 8"`; do
for dtype in `echo "fp16 fp32"`; do
for graphs in `echo "true false"`; do
for kernel in `echo "true false"`; do
params="$dtype $graphs $kernel $gpus"
for m in `echo "$gptneo_models"`; do
bash run_model.sh $m $branch1 $branch2 $params
done

for m in `echo "EleutherAI/gpt-j-6B"`; do
bash run_model.sh $m $branch1 $branch2
done
for m in `echo "$gpt2_models"`; do
bash run_model.sh $m $branch1 $branch2 $params
done

for m in `echo "facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b"`; do
bash run_model.sh $m $branch1 $branch2
done
for m in `echo "$gptj_models"`; do
bash run_model.sh $m $branch1 $branch2 $params
done

for m in `echo "$opt_models"`; do
bash run_model.sh $m $branch1 $branch2 $params
done

for m in `echo "bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1"`; do
bash run_model.sh $m $branch1 $branch2
for m in `echo "$bloom_models"`; do
bash run_model.sh $m $branch1 $branch2 $params
done
done
done
done
done

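The sweep itself still takes only the two branches to compare, for example (branch names are placeholders):

    bash sweep.sh master my-feature-branch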