
Commit

merge recent changes from ROCm/xformers (#1182)
qianfengz authored Jan 5, 2025
1 parent 2fdd396 commit 6440945
Showing 1,087 changed files with 17,987 additions and 12,266 deletions.
5 changes: 3 additions & 2 deletions .github/actions/setup-build-cuda/action.yml
@@ -29,8 +29,9 @@ runs:
"124": ("12.4.1", "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run"),
"121": ("12.1.0", "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run"),
"118": ("11.8.0", "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"),
"6.0": ("6.0.2", "https://repo.radeon.com/amdgpu-install/6.0.2/rhel/7.9/amdgpu-install-6.0.60002-1.el7.noarch.rpm"),
"6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.2/el/7/amdgpu-install-6.1.60102-1.el7.noarch.rpm"),
"6.0": ("6.0.2", "https://repo.radeon.com/amdgpu-install/6.0.2/rhel/8.9/amdgpu-install-6.0.60002-1.el8.noarch.rpm"),
"6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.3/rhel/8.9/amdgpu-install-6.1.60103-1.el8.noarch.rpm"),
"6.2": ("6.2.3", "https://repo.radeon.com/amdgpu-install/6.2.3/rhel/8.9/amdgpu-install-6.2.60203-1.el8.noarch.rpm"),
}[cushort]
with open(os.environ['GITHUB_OUTPUT'], "r+") as fp:
fp.write("CUDA_VERSION=" + full_version + "\n")
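For context on the block above: the action maps the short toolkit tag (cushort) to a full version string and an installer URL, then records the version in the step's GITHUB_OUTPUT file; the ROCm rows now point at RHEL 8.9 packages instead of the retired el7 ones. A minimal Python sketch of that lookup — the helper name resolve_toolkit is illustrative, not part of the action:

    # Minimal sketch of the version lookup done by setup-build-cuda (illustrative names).
    import os

    def resolve_toolkit(cushort: str) -> tuple:
        # Short toolkit tag -> (full version, installer URL), mirroring the updated table.
        table = {
            "6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.3/rhel/8.9/amdgpu-install-6.1.60103-1.el8.noarch.rpm"),
            "6.2": ("6.2.3", "https://repo.radeon.com/amdgpu-install/6.2.3/rhel/8.9/amdgpu-install-6.2.60203-1.el8.noarch.rpm"),
        }
        return table[cushort]

    full_version, url = resolve_toolkit("6.2")
    # The real action writes the result into the step outputs file:
    with open(os.environ.get("GITHUB_OUTPUT", os.devnull), "a") as fp:
        fp.write("CUDA_VERSION=" + full_version + "\n")
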
2 changes: 1 addition & 1 deletion .github/workflows/rocm_build.yml
@@ -24,7 +24,7 @@ jobs:
python: ['3.11']
torch_version: ['2.5.1']
toolkit_type: ['rocm']
toolkit_short_version: ['6.0', '6.1']
toolkit_short_version: ['6.1', '6.2']

uses: ./.github/workflows/wheels_build.yml
if: github.repository == 'rocm/xformers'
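The matrix above expands to the cross product of its lists, so after this change the ROCm wheels are built against 6.1 and 6.2 rather than 6.0 and 6.1. A quick sketch of the combinations it produces:

    # Sketch: the wheel-build matrix expands to the cross product of its lists.
    from itertools import product

    python_versions = ["3.11"]
    torch_versions = ["2.5.1"]
    toolkit_types = ["rocm"]
    toolkit_short_versions = ["6.1", "6.2"]

    for combo in product(python_versions, torch_versions, toolkit_types, toolkit_short_versions):
        print(combo)  # e.g. ('3.11', '2.5.1', 'rocm', '6.2')
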
10 changes: 5 additions & 5 deletions .github/workflows/rocm_ci.yml
@@ -12,10 +12,10 @@ on:
jobs:
build:
if: github.repository == 'rocm/xformers'
runs-on: self-hosted
runs-on: self-hosted-rocm-ci
container:
image: 'rocm/pytorch-nightly:latest'
options: ' --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G '
options: ' --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G --memory 32G '
steps:
- uses: actions/checkout@v4
with:
@@ -57,7 +57,7 @@ jobs:
export PATH=/opt/conda/envs/xformers/bin:$PATH
python -VV
python -m pip install -U torch --index-url=https://download.pytorch.org/whl/nightly/rocm6.1
python -m pip install -U torch --index-url=https://download.pytorch.org/whl/rocm6.2
python -c "import torch; print(f'PyTorch version {torch.__version__}')"
python -m pip install ninja scipy pytest pytest-html
@@ -71,7 +71,7 @@ jobs:
- name: Build xformers
run: |
export PATH=/opt/conda/envs/xformers/bin:$PATH
export MAX_JOBS=144
export MAX_JOBS=20
python -m pip install -e ./_xformers --verbose
python -m xformers.info
@@ -97,7 +97,7 @@ jobs:
cd ..
clean:
runs-on: self-hosted
runs-on: self-hosted-rocm-ci
if: ${{ needs.build.result != 'skipped' }}
needs: [build]
steps:
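After the job installs the ROCm 6.2 PyTorch wheel and builds xformers with MAX_JOBS=20, a quick sanity check like the following (a sketch, assuming the build succeeded) confirms the HIP backend is active before the tests run:

    # Sanity-check sketch for the CI environment built above.
    import torch
    print("PyTorch:", torch.__version__)
    print("HIP runtime:", torch.version.hip)  # non-None on a ROCm build of PyTorch

    import xformers
    import xformers.ops  # loads the compiled extension
    print("xformers:", xformers.__version__)
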
27 changes: 27 additions & 0 deletions .github/workflows/rocm_docker.yml
@@ -0,0 +1,27 @@
name: Build and Publish ROCm Docker Image

on:
push:
branches:
- develop

jobs:
build-and-push:
runs-on: rocm
if: github.repository == 'rocm/xformers'
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Build and push
uses: docker/build-push-action@v6
with:
push: true
tags: rocm/xformers:latest
file: Dockerfile.rocm
6 changes: 4 additions & 2 deletions .gitignore
@@ -67,7 +67,9 @@ xformers/csrc/attention/hip_fmha/*.hip
xformers/csrc/attention/hip_fmha/*_hip.h
xformers/csrc/attention/hip_fmha/instances/*.cu
xformers/csrc/attention/hip_fmha/instances/*.hip
xformers/csrc/attention/hip_fmha/instances/*.cu
xformers/csrc/attention/hip_fmha/instances/*.hip
xformers/csrc/attention/hip_fmha/instances/*_hip.h
xformers/csrc/attention/hip_decoder/*.cu
xformers/csrc/attention/hip_decoder/*.hip
xformers/csrc/attention/hip_decoder/*_hip.h


2 changes: 1 addition & 1 deletion .gitmodules
@@ -7,4 +7,4 @@
[submodule "third_party/composable_kernel_tiled"]
path = third_party/composable_kernel_tiled
url = https://github.com/ROCm/composable_kernel.git
branch = develop
branch = develop
13 changes: 10 additions & 3 deletions setup.py
@@ -381,11 +381,12 @@ def get_extensions():
]

source_hip = glob.glob(
os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cpp"),
os.path.join(extensions_dir, "attention", "hip_*", "**", "*.cpp"),
recursive=True,
)

source_hip_generated = glob.glob(
os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cu"),
os.path.join(extensions_dir, "attention", "hip_*", "**", "*.cu"),
recursive=True,
)
# avoid the temporary .cu files generated under xformers/csrc/attention/hip_fmha
@@ -539,7 +540,8 @@ def get_extensions():
extension = CUDAExtension
sources += source_hip_cu
include_dirs += [
Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha"
Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha",
Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_decoder",
]

include_dirs += [
@@ -557,12 +559,17 @@

arch_list = os.getenv("HIP_ARCHITECTURES", "native").split()

offload_compress_flag = []
if hip_version >= "6.2.":
offload_compress_flag = ["--offload-compress"]

extra_compile_args = {
"cxx": ["-O3", "-std=c++17"] + generator_flag,
"nvcc": [
"-O3",
"-std=c++17",
*[f"--offload-arch={arch}" for arch in arch_list],
*offload_compress_flag,
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-DCK_TILE_FMHA_FWD_FAST_EXP2=1",
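Two things change in setup.py: HIP sources are now collected from every hip_* directory (picking up hip_decoder alongside hip_fmha), and --offload-compress is added to the device-compiler flags only when the HIP version is at least 6.2, since older ROCm releases do not understand the flag. A minimal sketch of the flag assembly, with an illustrative hip_version value (the real one comes from the detected toolchain):

    # Sketch of the compile-flag assembly added to setup.py (hip_version value is illustrative).
    import os

    hip_version = "6.2.41133"  # assumed example; setup.py takes this from the HIP toolchain
    arch_list = os.getenv("HIP_ARCHITECTURES", "native").split()

    # Gate --offload-compress on ROCm >= 6.2 using the same string comparison as setup.py.
    offload_compress_flag = ["--offload-compress"] if hip_version >= "6.2." else []

    nvcc_flags = [
        "-O3",
        "-std=c++17",
        *[f"--offload-arch={arch}" for arch in arch_list],
        *offload_compress_flag,
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
    ]
    print(nvcc_flags)
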
19 changes: 17 additions & 2 deletions tests/test_mem_eff_attention.py
@@ -674,6 +674,8 @@ def test_backward(
if op_bw == fmha.ck.BwOp:
op_fw = fmha.ck.FwOp
if dtype == torch.bfloat16:
# bfloat16 testing can be enabled by exporting ENABLE_HIP_FMHA_RTN_BF16_CONVERT=1 when
# building xformers, which gives accurate results
pytest.skip(
"CK Fmha backward for bfloat16 currently is not very accurate for some cases!"
)
@@ -1937,7 +1939,7 @@ def test_forward_gqa(opFW_biasT, Mq: int):
"opBW",
[
fmha.flash.BwOp,
fmha.cutlass.BwOp,
fmha.ck.BwOp if torch.version.hip else fmha.cutlass.BwOp,
],
)
def test_backward_gqa(opBW):
@@ -1949,7 +1951,7 @@ def test_backward_gqa(opBW):
attn_bias_requires_grad=False,
fmt="BMHK",
)
op = (fmha.cutlass.FwOp, opBW)
op = (fmha.ck.FwOp if torch.version.hip else fmha.cutlass.FwOp, opBW)
key = key[:, :, :1].expand(-1, -1, H, -1)
value = value[:, :, :1].expand(-1, -1, H, -1)
key.requires_grad_(True)
@@ -2278,6 +2280,19 @@ def test_paged_attention(
)


@cuda_only
@pytest.mark.parametrize("B", [1, 5, 128])
@pytest.mark.parametrize("MAX_T", [64, 128, 2048, 4096, 8192])
@pytest.mark.parametrize("page_size", [128, 256])
@pytest.mark.parametrize("gappy", [False, True], ids=lambda x: "gappy" if x else "")
def test_paged_attention_ck(B, MAX_T: int, page_size: int, gappy: bool):
op = fmha.ck.FwOp
num_quant_groups = 0
paged_attention_run_inner(
B, MAX_T, num_quant_groups, page_size, op, bench=False, gappy=gappy
)


@sm80_or_better_only
@disable_on_rocm
@pytest.mark.parametrize("B", [1, 5, 128])
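The GQA tests now pick the backend from the runtime instead of hard-coding cutlass, and a CK-specific paged-attention test is added. A small sketch of the selection pattern the tests rely on (assumes an xformers build where fmha.ck is importable; on ROCm, torch.version.hip is a version string, on CUDA it is None):

    # Sketch of the ROCm/CUDA backend selection used by the updated tests.
    import torch
    from xformers.ops import fmha

    op_fw = fmha.ck.FwOp if torch.version.hip else fmha.cutlass.FwOp
    op_bw = fmha.ck.BwOp if torch.version.hip else fmha.cutlass.BwOp
    print(op_fw.NAME, op_bw.NAME)
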
2 changes: 1 addition & 1 deletion third_party/composable_kernel_tiled
2 changes: 1 addition & 1 deletion xformers/csrc/attention/attention.cpp
@@ -26,7 +26,7 @@ TORCH_LIBRARY_FRAGMENT(xformers, m) {
"xformers::efficient_attention_forward_ck(Tensor query, "
"Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
"Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
"bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size) -> (Tensor, Tensor, int, int)"));
"bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size, Tensor? block_tables, int? page_size) -> (Tensor, Tensor, int, int)"));
m.def(TORCH_SELECTIVE_SCHEMA(
"xformers::efficient_attention_forward_decoder_ck(Tensor query, "
"Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
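The registered schema for efficient_attention_forward_ck gains two optional arguments, block_tables and page_size, which the new paged-attention path in the CK forward op consumes. A sketch of how to confirm the extended schema from Python once the extension is built (assumes a ROCm build that registers the op):

    # Sketch: inspect the registered schema of the CK forward op.
    import torch
    import xformers.ops  # importing the ops package loads the compiled xformers extension

    op = torch.ops.xformers.efficient_attention_forward_ck
    print(op.default._schema)  # should end with "... Tensor? block_tables, int? page_size) -> ..."
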
@@ -96,7 +96,7 @@ at::Tensor& efficient_attention_forward_decoder_ck_out_impl(
int32_t smem_output = K_MAX * sizeof(float) *
threads.y; // 4 * threadsPerBlock * sizeof(float) == sizeof(O[b][0][h][:])
const size_t lds_bytes = max(smem_softmax, smem_output);
auto stream = at::cuda::getCurrentHIPStream().stream();
auto stream = at::hip::getCurrentHIPStream().stream();

AT_DISPATCH_SWITCH_3(
at::ScalarType::Half,