
Commit

merge recent changes from ROCm/xformers (#1182)
qianfengz authored Jan 5, 2025
1 parent 2fdd396 commit 6440945
Showing 1,087 changed files with 17,987 additions and 12,266 deletions.
5 changes: 3 additions & 2 deletions .github/actions/setup-build-cuda/action.yml
@@ -29,8 +29,9 @@ runs:
"124": ("12.4.1", "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run"),
"121": ("12.1.0", "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run"),
"118": ("11.8.0", "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"),
"6.0": ("6.0.2", "https://repo.radeon.com/amdgpu-install/6.0.2/rhel/7.9/amdgpu-install-6.0.60002-1.el7.noarch.rpm"),
"6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.2/el/7/amdgpu-install-6.1.60102-1.el7.noarch.rpm"),
"6.0": ("6.0.2", "https://repo.radeon.com/amdgpu-install/6.0.2/rhel/8.9/amdgpu-install-6.0.60002-1.el8.noarch.rpm"),
"6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.3/rhel/8.9/amdgpu-install-6.1.60103-1.el8.noarch.rpm"),
"6.2": ("6.2.3", "https://repo.radeon.com/amdgpu-install/6.2.3/rhel/8.9/amdgpu-install-6.2.60203-1.el8.noarch.rpm"),
}[cushort]
with open(os.environ['GITHUB_OUTPUT'], "r+") as fp:
fp.write("CUDA_VERSION=" + full_version + "\n")
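For context on the block above: the action maps the short toolkit tag (cushort) to a full version string and an installer URL, then records the version in the step's GITHUB_OUTPUT file; the ROCm rows now point at RHEL 8.9 packages instead of the retired el7 ones. A minimal Python sketch of that lookup — the helper name resolve_toolkit is illustrative, not part of the action:

    # Minimal sketch of the version lookup done by setup-build-cuda (illustrative names).
    import os

    def resolve_toolkit(cushort: str) -> tuple:
        # Short toolkit tag -> (full version, installer URL), mirroring the updated table.
        table = {
            "6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.3/rhel/8.9/amdgpu-install-6.1.60103-1.el8.noarch.rpm"),
            "6.2": ("6.2.3", "https://repo.radeon.com/amdgpu-install/6.2.3/rhel/8.9/amdgpu-install-6.2.60203-1.el8.noarch.rpm"),
        }
        return table[cushort]

    full_version, url = resolve_toolkit("6.2")
    # The real action writes the result into the step outputs file:
    with open(os.environ.get("GITHUB_OUTPUT", os.devnull), "a") as fp:
        fp.write("CUDA_VERSION=" + full_version + "\n")
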
2 changes: 1 addition & 1 deletion .github/workflows/rocm_build.yml
@@ -24,7 +24,7 @@ jobs:
python: ['3.11']
torch_version: ['2.5.1']
toolkit_type: ['rocm']
toolkit_short_version: ['6.0', '6.1']
toolkit_short_version: ['6.1', '6.2']

uses: ./.github/workflows/wheels_build.yml
if: github.repository == 'rocm/xformers'
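The matrix above expands to the cross product of its lists, so after this change the ROCm wheels are built against 6.1 and 6.2 rather than 6.0 and 6.1. A quick sketch of the combinations it produces:

    # Sketch: the wheel-build matrix expands to the cross product of its lists.
    from itertools import product

    python_versions = ["3.11"]
    torch_versions = ["2.5.1"]
    toolkit_types = ["rocm"]
    toolkit_short_versions = ["6.1", "6.2"]

    for combo in product(python_versions, torch_versions, toolkit_types, toolkit_short_versions):
        print(combo)  # e.g. ('3.11', '2.5.1', 'rocm', '6.2')
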
10 changes: 5 additions & 5 deletions .github/workflows/rocm_ci.yml
@@ -12,10 +12,10 @@ on:
jobs:
build:
if: github.repository == 'rocm/xformers'
runs-on: self-hosted
runs-on: self-hosted-rocm-ci
container:
image: 'rocm/pytorch-nightly:latest'
options: ' --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G '
options: ' --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G --memory 32G '
steps:
- uses: actions/checkout@v4
with:
@@ -57,7 +57,7 @@ jobs:
export PATH=/opt/conda/envs/xformers/bin:$PATH
python -VV
python -m pip install -U torch --index-url=https://download.pytorch.org/whl/nightly/rocm6.1
python -m pip install -U torch --index-url=https://download.pytorch.org/whl/rocm6.2
python -c "import torch; print(f'PyTorch version {torch.__version__}')"
python -m pip install ninja scipy pytest pytest-html
@@ -71,7 +71,7 @@ jobs:
- name: Build xformers
run: |
export PATH=/opt/conda/envs/xformers/bin:$PATH
export MAX_JOBS=144
export MAX_JOBS=20
python -m pip install -e ./_xformers --verbose
python -m xformers.info
@@ -97,7 +97,7 @@ jobs:
cd ..
clean:
runs-on: self-hosted
runs-on: self-hosted-rocm-ci
if: ${{ needs.build.result != 'skipped' }}
needs: [build]
steps:
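After the job installs the ROCm 6.2 PyTorch wheel and builds xformers with MAX_JOBS=20, a quick sanity check like the following (a sketch, assuming the build succeeded) confirms the HIP backend is active before the tests run:

    # Sanity-check sketch for the CI environment built above.
    import torch
    print("PyTorch:", torch.__version__)
    print("HIP runtime:", torch.version.hip)  # non-None on a ROCm build of PyTorch

    import xformers
    import xformers.ops  # loads the compiled extension
    print("xformers:", xformers.__version__)
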
27 changes: 27 additions & 0 deletions .github/workflows/rocm_docker.yml
@@ -0,0 +1,27 @@
name: Build and Publish ROCm Docker Image

on:
push:
branches:
- develop

jobs:
build-and-push:
runs-on: rocm
if: github.repository == 'rocm/xformers'
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Build and push
uses: docker/build-push-action@v6
with:
push: true
tags: rocm/xformers:latest
file: Dockerfile.rocm
6 changes: 4 additions & 2 deletions .gitignore
@@ -67,7 +67,9 @@ xformers/csrc/attention/hip_fmha/*.hip
xformers/csrc/attention/hip_fmha/*_hip.h
xformers/csrc/attention/hip_fmha/instances/*.cu
xformers/csrc/attention/hip_fmha/instances/*.hip
xformers/csrc/attention/hip_fmha/instances/*.cu
xformers/csrc/attention/hip_fmha/instances/*.hip
xformers/csrc/attention/hip_fmha/instances/*_hip.h
xformers/csrc/attention/hip_decoder/*.cu
xformers/csrc/attention/hip_decoder/*.hip
xformers/csrc/attention/hip_decoder/*_hip.h


2 changes: 1 addition & 1 deletion .gitmodules
@@ -7,4 +7,4 @@
[submodule "third_party/composable_kernel_tiled"]
path = third_party/composable_kernel_tiled
url = https://github.com/ROCm/composable_kernel.git
branch = develop
branch = develop
13 changes: 10 additions & 3 deletions setup.py
@@ -381,11 +381,12 @@ def get_extensions():
]

source_hip = glob.glob(
os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cpp"),
os.path.join(extensions_dir, "attention", "hip_*", "**", "*.cpp"),
recursive=True,
)

source_hip_generated = glob.glob(
os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cu"),
os.path.join(extensions_dir, "attention", "hip_*", "**", "*.cu"),
recursive=True,
)
# avoid the temporary .cu files generated under xformers/csrc/attention/hip_fmha
@@ -539,7 +540,8 @@ def get_extensions():
extension = CUDAExtension
sources += source_hip_cu
include_dirs += [
Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha"
Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha",
Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_decoder",
]

include_dirs += [
@@ -557,12 +559,17 @@

arch_list = os.getenv("HIP_ARCHITECTURES", "native").split()

offload_compress_flag = []
if hip_version >= "6.2.":
offload_compress_flag = ["--offload-compress"]

extra_compile_args = {
"cxx": ["-O3", "-std=c++17"] + generator_flag,
"nvcc": [
"-O3",
"-std=c++17",
*[f"--offload-arch={arch}" for arch in arch_list],
*offload_compress_flag,
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-DCK_TILE_FMHA_FWD_FAST_EXP2=1",
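Two things change in setup.py: HIP sources are now collected from every hip_* directory (picking up hip_decoder alongside hip_fmha), and --offload-compress is added to the device-compiler flags only when the HIP version is at least 6.2, since older ROCm releases do not understand the flag. A minimal sketch of the flag assembly, with an illustrative hip_version value (the real one comes from the detected toolchain):

    # Sketch of the compile-flag assembly added to setup.py (hip_version value is illustrative).
    import os

    hip_version = "6.2.41133"  # assumed example; setup.py takes this from the HIP toolchain
    arch_list = os.getenv("HIP_ARCHITECTURES", "native").split()

    # Gate --offload-compress on ROCm >= 6.2 using the same string comparison as setup.py.
    offload_compress_flag = ["--offload-compress"] if hip_version >= "6.2." else []

    nvcc_flags = [
        "-O3",
        "-std=c++17",
        *[f"--offload-arch={arch}" for arch in arch_list],
        *offload_compress_flag,
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
    ]
    print(nvcc_flags)
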
19 changes: 17 additions & 2 deletions tests/test_mem_eff_attention.py
@@ -674,6 +674,8 @@ def test_backward(
if op_bw == fmha.ck.BwOp:
op_fw = fmha.ck.FwOp
if dtype == torch.bfloat16:
# bfloat16 testing can be enabled by exporting ENABLE_HIP_FMHA_RTN_BF16_CONVERT=1 when
# building xformers, which gives accurate results
pytest.skip(
"CK Fmha backward for bfloat16 currently is not very accurate for some cases!"
)
@@ -1937,7 +1939,7 @@ def test_forward_gqa(opFW_biasT, Mq: int):
"opBW",
[
fmha.flash.BwOp,
fmha.cutlass.BwOp,
fmha.ck.BwOp if torch.version.hip else fmha.cutlass.BwOp,
],
)
def test_backward_gqa(opBW):
@@ -1949,7 +1951,7 @@ def test_backward_gqa(opBW):
attn_bias_requires_grad=False,
fmt="BMHK",
)
op = (fmha.cutlass.FwOp, opBW)
op = (fmha.ck.FwOp if torch.version.hip else fmha.cutlass.FwOp, opBW)
key = key[:, :, :1].expand(-1, -1, H, -1)
value = value[:, :, :1].expand(-1, -1, H, -1)
key.requires_grad_(True)
@@ -2278,6 +2280,19 @@ def test_paged_attention(
)


@cuda_only
@pytest.mark.parametrize("B", [1, 5, 128])
@pytest.mark.parametrize("MAX_T", [64, 128, 2048, 4096, 8192])
@pytest.mark.parametrize("page_size", [128, 256])
@pytest.mark.parametrize("gappy", [False, True], ids=lambda x: "gappy" if x else "")
def test_paged_attention_ck(B, MAX_T: int, page_size: int, gappy: bool):
op = fmha.ck.FwOp
num_quant_groups = 0
paged_attention_run_inner(
B, MAX_T, num_quant_groups, page_size, op, bench=False, gappy=gappy
)


@sm80_or_better_only
@disable_on_rocm
@pytest.mark.parametrize("B", [1, 5, 128])
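The GQA tests now pick the backend from the runtime instead of hard-coding cutlass, and a CK-specific paged-attention test is added. A small sketch of the selection pattern the tests rely on (assumes an xformers build where fmha.ck is importable; on ROCm, torch.version.hip is a version string, on CUDA it is None):

    # Sketch of the ROCm/CUDA backend selection used by the updated tests.
    import torch
    from xformers.ops import fmha

    op_fw = fmha.ck.FwOp if torch.version.hip else fmha.cutlass.FwOp
    op_bw = fmha.ck.BwOp if torch.version.hip else fmha.cutlass.BwOp
    print(op_fw.NAME, op_bw.NAME)
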
2 changes: 1 addition & 1 deletion third_party/composable_kernel_tiled
2 changes: 1 addition & 1 deletion xformers/csrc/attention/attention.cpp
@@ -26,7 +26,7 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
"xformers::efficient_attention_forward_ck(Tensor query, "
"Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
"Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
"bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size) -> (Tensor, Tensor, int, int)"));
"bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size, Tensor? block_tables, int? page_size) -> (Tensor, Tensor, int, int)"));
m.def(TORCH_SELECTIVE_SCHEMA(
"xformers::efficient_attention_forward_decoder_ck(Tensor query, "
"Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
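The registered schema for efficient_attention_forward_ck gains two optional arguments, block_tables and page_size, which the new paged-attention path in the CK forward op consumes. A sketch of how to confirm the extended schema from Python once the extension is built (assumes a ROCm build that registers the op):

    # Sketch: inspect the registered schema of the CK forward op.
    import torch
    import xformers.ops  # importing the ops package loads the compiled xformers extension

    op = torch.ops.xformers.efficient_attention_forward_ck
    print(op.default._schema)  # should end with "... Tensor? block_tables, int? page_size) -> ..."
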
@@ -96,7 +96,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
int32_t smem_output = K_MAX * sizeof(float) *
threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
const size_t lds_bytes = max(smem_softmax, smem_output);
auto stream = at::cuda::getCurrentHIPStream().stream();
auto stream = at::hip::getCurrentHIPStream().stream();

AT_DISPATCH_SWITCH_3(
at::ScalarType::Half,