Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IBM Power10 (VSX, MMA) support for ppc64le #1748

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
9 changes: 7 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ set(SOURCES
src/cpu/backend.cc
src/cpu/cpu_info.cc
src/cpu/cpu_isa.cc
src/cpu/kernels.cc
#src/cpu/kernels.cc
src/cpu/parallel.cc
src/cpu/primitives.cc
src/decoding.cc
Expand Down Expand Up @@ -236,7 +236,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm64)|(aarch64)"
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)")
add_definitions(-DCT2_X86_BUILD)
set(CT2_BUILD_ARCH "x86_64")

elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(PPC64LE)")
add_definitions(-DCT2_PPC64LE_BUILD)
set(CT2_BUILD_ARCH "ppc64le")

if(BUILD_SHARED_LIBS)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()
Expand All @@ -263,6 +266,8 @@ if(ENABLE_CPU_DISPATCH)
endif()
elseif(CT2_BUILD_ARCH STREQUAL "arm64")
ct2_compile_kernels_for_isa(neon "-DUSE_NEON")
elseif(CT2_BUILD_ARCH STREQUAL "ppc64le")
ct2_compile_kernels_for_isa(ppc64le "-mcpu=power10 -O3 -flto")
endif()
endif()

Expand Down
91 changes: 91 additions & 0 deletions docker/Dockerfile.ppc64le
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
FROM ppc64le/ubuntu:22.04 as builder

RUN apt-get update && \
apt-get install -y --no-install-recommends \
python3-dev \
python3-pip \
wget \
git \
build-essential \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*


WORKDIR /root

RUN python3 -m pip --no-cache-dir install cmake==3.22.*

# Register the IBM Advance Toolchain apt repository (jammy, at17.0) together
# with its signing key, then install the AT 17.0 runtime, devel, perf and
# mcore-libs packages.
# The `cat` calls only print the sources list and the key into the build log.
RUN wget -qO- https://public.dhe.ibm.com/software/server/POWER/Linux/toolchain/at/ubuntu/dists/jammy/615d762f.gpg.key | tee /etc/apt/trusted.gpg.d/615d762f.asc && \
echo "deb [signed-by=/etc/apt/trusted.gpg.d/615d762f.asc] https://public.dhe.ibm.com/software/server/POWER/Linux/toolchain/at/ubuntu jammy at17.0" >> /etc/apt/sources.list && \
cat /etc/apt/sources.list && \
cat /etc/apt/trusted.gpg.d/615d762f.asc && \
apt update && \
cat /etc/apt/sources.list && \
apt install -y advance-toolchain-at17.0-runtime advance-toolchain-at17.0-devel advance-toolchain-at17.0-perf advance-toolchain-at17.0-mcore-libs

# Download the SLEEF release selected by SLEEF_VERSION and build it as a static
# library (inline headers enabled, tests disabled) using the Advance Toolchain
# gcc/g++ under /opt/at17.0 with Power10 compiler flags.
# The configure step runs inside sleef*/build; the `cd ..` afterwards is needed
# because the build and install commands address that directory as "build".
# The result is installed under /usr/.
ENV SLEEF_VERSION=3.6.1
RUN wget -q https://github.com/shibatch/sleef/archive/refs/tags/${SLEEF_VERSION}.tar.gz && \
tar xf *.tar.gz && \
rm *.tar.gz && \
cd sleef* && \
mkdir build && \
cd build && \
cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release .. && \
cd .. && \
cmake --build build -j --clean-first && \
cmake --install build --prefix=/usr/


# Download the oneDNN release selected by ONEDNN_VERSION and build it in-tree
# as a static Release library: inference workload only, CONVOLUTION and REORDER
# primitives only, graph component, examples and tests disabled.
# After `make install` the extracted source tree is removed.
ENV ONEDNN_VERSION=3.1.1
RUN wget -q https://github.com/oneapi-src/oneDNN/archive/refs/tags/v${ONEDNN_VERSION}.tar.gz && \
tar xf *.tar.gz && \
rm *.tar.gz && \
cd oneDNN-* && \
cmake -DCMAKE_BUILD_TYPE=Release -DONEDNN_LIBRARY_TYPE=STATIC -DONEDNN_BUILD_EXAMPLES=OFF -DONEDNN_BUILD_TESTS=OFF -DONEDNN_ENABLE_WORKLOAD=INFERENCE -DONEDNN_ENABLE_PRIMITIVE="CONVOLUTION;REORDER" -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP . && \
make -j$(nproc) install && \
cd .. && \
rm -r oneDNN-*

COPY third_party third_party
COPY cli cli
COPY include include
COPY src src
COPY cmake cmake
COPY python python
COPY CMakeLists.txt .

# CXX_FLAGS can be passed as a build argument; when it is not supplied the
# Power10 flags below are used as the default.
ARG CXX_FLAGS
ENV CXX_FLAGS=${CXX_FLAGS:-"-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off"}

# Install prefix for the CTranslate2 build configured below.
ENV CTRANSLATE2_ROOT=/opt/ctranslate2

# Configure CTranslate2 as a Release build with CUDA, MKL and OpenBLAS
# disabled and oneDNN (WITH_DNNL) enabled, then build and install it into
# CTRANSLATE2_ROOT. VERBOSE=1 makes the compiler command lines visible in the
# build log.
RUN mkdir build && \
cd build && \
cmake -DCMAKE_INSTALL_PREFIX=${CTRANSLATE2_ROOT} \
-DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF \
-DWITH_DNNL=ON -DOPENMP_RUNTIME=COMP \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DCMAKE_BUILD_TYPE=Release \
.. && \
VERBOSE=1 make -j$(nproc) install

ENV LANG=en_US.UTF-8
COPY README.md .

# Install the Python build requirements, then build the wheel with
# setup-ppc64le.py and write it into $CTRANSLATE2_ROOT.
RUN cd python && \
python3 -m pip --no-cache-dir install -r install_requirements.txt && \
python3 setup-ppc64le.py bdist_wheel --dist-dir $CTRANSLATE2_ROOT


# NOTE(review): CTRANSLATE2_ROOT is already set to this same value earlier in
# this file; this second declaration looks redundant — confirm.
ENV CTRANSLATE2_ROOT=/opt/ctranslate2
# Append the installed library directory to the dynamic loader search path.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CTRANSLATE2_ROOT/lib

# NOTE(review): the FROM line names the stage "builder", but the COPY below is
# commented out and no second stage follows, so everything runs in one image.
#COPY --from=builder $CTRANSLATE2_ROOT $CTRANSLATE2_ROOT
RUN pip3 install --force-reinstall ninja


# Install the wheel produced above, then delete the wheel file.
RUN python3 -m pip --no-cache-dir install $CTRANSLATE2_ROOT/*.whl && \
rm $CTRANSLATE2_ROOT/*.whl

# The container runs the ct2-translator binary by default.
ENTRYPOINT ["/opt/ctranslate2/bin/ct2-translator"]
56 changes: 56 additions & 0 deletions docs/ppc64le.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# IBM Power10 - ppc64le

CTranslate2 fully supports the IBM Power10 MMA and VSX extensions. Each Power10 core has 4 Matrix Math Accelerator units. For optimum performance use at least SMT4; in some cases SMT8 seems to perform better, so it is advisable to try out both. A simple way to test this is to use the --intra_threads parameter to control the number of threads CTranslate2 runs. At most, this should be 8*number of physical cores (SMT8).

Based on preliminary testing, a Power10 core offers 27-42% higher tokens/s compared to an Intel Gold core.

It should be possible to build for Power9, but the missing MMA units will have a significant impact on performance.

OneDNN is used for int8 matrix math, which fully utilizes the MMA units. It should be possible to build with OpenBLAS for 16-bit MMA usage.

## Build docker / podman container

This is the easy way:
```
git clone --recursive https://github.com/OpenNMT/CTranslate2/
cd CTranslate2/docker
podman build -t elinar.ai/ct2-ppc64le -f Dockerfile.ppc64le ..

```

Then run the CTranslate2 container (substitute the mount point, MODEL_LOCATION and SRC_FILE):
```podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads 16```

## Install from sources
This build has been tested on RHEL 9 / ppc64le and requires IBM Advance Toolchain 17.0 ( https://www.ibm.com/support/pages/advance-toolchain-linux-power )
```
#sleef:
git clone -b 3.6.1 https://github.com/shibatch/sleef

cd sleef
mkdir build && cd build
cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release ..

cd ..
cmake --build build -j --clean-first
sudo cmake --install build --prefix=/usr/


#OneDNN:
git clone -b v3.2 --recursive https://github.com/oneapi-src/oneDNN
cd oneDNN
mkdir build && cd build
cmake -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP ..
make -j16
sudo make install


git clone --recursive https://github.com/Dagamies/CTranslate2
cd CTranslate2
mkdir build
cd build
cmake -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF -DWITH_DNNL=ON -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off' -DOPENMP_RUNTIME=COMP ..
make -j16
sudo make install
sudo ldconfig -v
export LD_LIBRARY_PATH=/usr/local/lib64/

```
126 changes: 126 additions & 0 deletions python/setup-ppc64le.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import glob
import os
import sys

import pybind11

from pybind11.setup_helpers import ParallelCompile
from setuptools import Extension, find_packages, setup

base_dir = os.path.dirname(os.path.abspath(__file__))
include_dirs = [pybind11.get_include()]
library_dirs = []


def _get_long_description():
    """Return the text of the README.md located next to this script.

    An empty string is returned when the file does not exist.
    """
    readme = os.path.join(base_dir, "README.md")
    if os.path.exists(readme):
        with open(readme, encoding="utf-8") as handle:
            return handle.read()
    return ""


def _get_project_version():
    """Return ``__version__`` as defined in ctranslate2/version.py.

    The version module is executed in an isolated namespace so the package
    itself does not have to be importable at build time.
    """
    namespace = {}
    version_file = os.path.join(base_dir, "ctranslate2", "version.py")
    with open(version_file, encoding="utf-8") as stream:
        source = stream.read()
    exec(source, namespace)
    return namespace["__version__"]


def _maybe_add_library_root(lib_name):
    """Register include/library directories from the ``<lib_name>_ROOT`` variable.

    Does nothing when the environment variable is not set. Otherwise
    ``<root>/include`` is appended to ``include_dirs`` and the first existing
    directory among ``<root>/lib`` and ``<root>/lib64`` is appended to
    ``library_dirs``.
    """
    env_key = "%s_ROOT" % lib_name
    if env_key not in os.environ:
        return

    root = os.environ[env_key]
    include_dirs.append("%s/include" % root)

    candidates = ("%s/%s" % (root, sub_dir) for sub_dir in ("lib", "lib64"))
    found = next((path for path in candidates if os.path.exists(path)), None)
    if found is not None:
        library_dirs.append(found)


# Pick up include/library directories from CTRANSLATE2_ROOT when it is set.
_maybe_add_library_root("CTRANSLATE2")

# Default compiler/linker options; the platform branches below adjust them.
cflags = ["-std=c++17", "-fvisibility=hidden"]
ldflags = []
package_data = {}
if sys.platform == "darwin":
# std::visit requires macOS 10.14
cflags.append("-mmacosx-version-min=10.14")
ldflags.append("-Wl,-rpath,/usr/local/lib")
elif sys.platform == "win32":
# On Windows the compile flags are replaced entirely and DLLs found in the
# package directory are shipped as package data.
cflags = ["/std:c++17", "/d2FH4-"]
package_data["ctranslate2"] = ["*.dll"]

# The extension is built from every .cc file under cpp/ and linked against the
# "ctranslate2" library.
ctranslate2_module = Extension(
"ctranslate2._ext",
sources=glob.glob(os.path.join("cpp", "*.cc")),
extra_compile_args=cflags,
extra_link_args=ldflags,
include_dirs=include_dirs,
library_dirs=library_dirs,
libraries=["ctranslate2"],
)

# Enable pybind11's parallel compilation helper; the argument names the
# environment variable passed to it (presumably the job count — see pybind11 docs).
ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install()

# Package metadata and build configuration for the ctranslate2 wheel.
# NOTE(review): the classifiers advertise NVIDIA CUDA environments, while
# docker/Dockerfile.ppc64le configures this build with -DWITH_CUDA=OFF —
# confirm they are intended for the ppc64le wheel.
# NOTE(review): numpy is pinned to one exact version in install_requires —
# confirm this is deliberate.
setup(
name="ctranslate2",
version=_get_project_version(),
license="MIT",
description="Fast inference engine for Transformer models",
long_description=_get_long_description(),
long_description_content_type="text/markdown",
author="OpenNMT",
url="https://opennmt.net",
classifiers=[
"Development Status :: 5 - Production/Stable",
"Environment :: GPU :: NVIDIA CUDA :: 11.0",
"Environment :: GPU :: NVIDIA CUDA :: 11.1",
"Environment :: GPU :: NVIDIA CUDA :: 11.2",
"Environment :: GPU :: NVIDIA CUDA :: 11.3",
"Environment :: GPU :: NVIDIA CUDA :: 11.4",
"Environment :: GPU :: NVIDIA CUDA :: 11.5",
"Environment :: GPU :: NVIDIA CUDA :: 11.6",
"Environment :: GPU :: NVIDIA CUDA :: 11.7",
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
project_urls={
"Documentation": "https://opennmt.net/CTranslate2",
"Forum": "https://forum.opennmt.net",
"Gitter": "https://gitter.im/OpenNMT/CTranslate2",
"Source": "https://github.com/OpenNMT/CTranslate2",
},
keywords="opennmt nmt neural machine translation cuda mkl inference quantization",
packages=find_packages(exclude=["bin"]),
package_data=package_data,
ext_modules=[ctranslate2_module],
python_requires=">=3.8",
install_requires=[
"setuptools",
"numpy==1.25.2",
"pyyaml>=5.3,<7",
],
# Command-line converters exposed as console scripts.
entry_points={
"console_scripts": [
"ct2-fairseq-converter=ctranslate2.converters.fairseq:main",
"ct2-marian-converter=ctranslate2.converters.marian:main",
"ct2-openai-gpt2-converter=ctranslate2.converters.openai_gpt2:main",
"ct2-opennmt-py-converter=ctranslate2.converters.opennmt_py:main",
"ct2-opennmt-tf-converter=ctranslate2.converters.opennmt_tf:main",
"ct2-opus-mt-converter=ctranslate2.converters.opus_mt:main",
"ct2-transformers-converter=ctranslate2.converters.transformers:main",
],
},
)
16 changes: 16 additions & 0 deletions src/cpu/cpu_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,20 @@ namespace ctranslate2 {
}
}

#elif defined(CT2_PPC64LE_BUILD)

namespace ctranslate2 {
  namespace cpu {

    // Vendor label reported for ppc64le builds.
    const char* cpu_vendor() {
      static const char kVendorName[] = "POWER";
      return kVendorName;
    }

    // Always reports POWER10 as available; no runtime probing is done here.
    bool cpu_supports_power10() {
      constexpr bool kPower10Available = true;
      return kPower10Available;
    }

  }
}

#endif
2 changes: 2 additions & 0 deletions src/cpu/cpu_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ namespace ctranslate2 {
bool cpu_supports_avx512();
#elif defined(CT2_ARM64_BUILD)
bool cpu_supports_neon();
#elif defined(CT2_PPC64LE_BUILD)
bool cpu_supports_power10();
#endif

}
Expand Down
10 changes: 10 additions & 0 deletions src/cpu/cpu_isa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ namespace ctranslate2 {
#elif defined(CT2_ARM64_BUILD)
case CpuIsa::NEON:
return "NEON";
#elif defined(CT2_PPC64LE_BUILD)
case CpuIsa::POWER10:
return "POWER10";
#endif

default:
return "GENERIC";
}
Expand All @@ -54,6 +58,9 @@ namespace ctranslate2 {
#elif defined(CT2_ARM64_BUILD)
if (env_isa == "NEON")
return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon());
#elif defined(CT2_PPC64LE_BUILD)
if (env_isa == "POWER10")
return try_isa(env_isa, CpuIsa::POWER10, cpu_supports_power10());
#endif
if (env_isa == "GENERIC")
return CpuIsa::GENERIC;
Expand All @@ -71,6 +78,9 @@ namespace ctranslate2 {
# elif defined(CT2_ARM64_BUILD)
if (cpu_supports_neon())
return CpuIsa::NEON;
# elif defined(CT2_PPC64LE_BUILD)
if (cpu_supports_power10())
return CpuIsa::POWER10;
# endif
#endif

Expand Down
9 changes: 8 additions & 1 deletion src/cpu/cpu_isa.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ namespace ctranslate2 {
namespace cpu {

// Instruction-set families the CPU backend can dispatch to.
// GENERIC is always present. Every other enumerator exists only when building
// for the matching architecture, so POWER10 is declared under
// CT2_PPC64LE_BUILD like the x86 and ARM entries instead of unconditionally.
enum class CpuIsa {
  GENERIC,
#if defined(CT2_X86_BUILD)
  AVX,
  AVX2,
  AVX512,
#elif defined(CT2_ARM64_BUILD)
  NEON,
#elif defined(CT2_PPC64LE_BUILD)
  POWER10,
#endif
};

Expand Down Expand Up @@ -54,6 +56,11 @@ namespace ctranslate2 {
CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \
CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \
}
#elif defined(CT2_PPC64LE_BUILD)
// ppc64le dispatch table: POWER10 is the only entry and it is registered with
// CPU_ISA_DEFAULT, so no separate GENERIC entry is listed for this
// architecture.
// NOTE(review): CPU_ISA_DEFAULT presumably expands to the switch's default
// label, which would route GENERIC to the POWER10 statements too — confirm.
# define CPU_ISA_DISPATCH(STMTS) \
switch (cpu::get_cpu_isa()) { \
CPU_ISA_DEFAULT(cpu::CpuIsa::POWER10, SINGLE_ARG(STMTS)) \
}
#endif
#elif defined(__AVX512F__)
# define CPU_ISA_DISPATCH(STMTS) \
Expand Down
Loading
Loading