updates

microsoft · Nov 8, 2023 · e510e90 · e510e90
1 parent 4337e3c
commit e510e90
Show file tree

Hide file tree

Showing 13 changed files with 50 additions and 53 deletions.
diff --git a/include/mscclpp/atomic.hpp b/include/mscclpp/atomic.hpp
@@ -14,24 +14,24 @@ namespace mscclpp {
 
 #if defined(MSCCLPP_CUDA) || defined(MSCCLPP_CUDA_HOST)
 
-constexpr auto memoryOrderRelaxed = cuda::memory_order_relaxed;
-constexpr auto memoryOrderAcquire = cuda::memory_order_acquire;
-constexpr auto memoryOrderRelease = cuda::memory_order_release;
-constexpr auto memoryOrderAcqRel = cuda::memory_order_acq_rel;
-constexpr auto memoryOrderSeqCst = cuda::memory_order_seq_cst;
+constexpr cuda::memory_order memoryOrderRelaxed = cuda::memory_order_relaxed;
+constexpr cuda::memory_order memoryOrderAcquire = cuda::memory_order_acquire;
+constexpr cuda::memory_order memoryOrderRelease = cuda::memory_order_release;
+constexpr cuda::memory_order memoryOrderAcqRel = cuda::memory_order_acq_rel;
+constexpr cuda::memory_order memoryOrderSeqCst = cuda::memory_order_seq_cst;
 
 template <typename T>
-MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(const T* ptr, int memoryOrder) {
+MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(T* ptr, cuda::memory_order memoryOrder) {
   return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.load(memoryOrder);
 }
 
 template <typename T>
-MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, int memoryOrder) {
+MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, cuda::memory_order memoryOrder) {
   cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.store(val, memoryOrder);
 }
 
 template <typename T>
-MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) {
+MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_order memoryOrder) {
   return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.fetch_add(val, memoryOrder);
 }
 

diff --git a/include/mscclpp/device.hpp b/include/mscclpp/device.hpp
@@ -4,7 +4,8 @@
 #ifndef MSCCLPP_DEVICE_HPP_
 #define MSCCLPP_DEVICE_HPP_
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ != 0)) || (defined(__HIP_DEVICE_COMPILE__) && (__HIP_DEVICE_COMPILE__ == 1))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ != 0)) || \
+    (defined(__HIP_DEVICE_COMPILE__) && (__HIP_DEVICE_COMPILE__ == 1))
 /// Device code (compiled by GPU-aware compilers)
 
 #define MSCCLPP_ON_DEVICE
@@ -21,12 +22,12 @@
 
 #define MSCCLPP_HOST_DEVICE_INLINE MSCCLPP_DEVICE_INLINE
 
-#elif defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
+#elif defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__)
 /// Host code but perhaps mixed with device code (compiled by GPU-aware compilers)
 
 #define MSCCLPP_ON_HOST_DEVICE
 
-#if defined(__CUDA_ARCH__)
+#if defined(__NVCC__)
 #include <cuda_runtime.h>
 #define MSCCLPP_CUDA_HOST
 #define MSCCLPP_DEVICE_INLINE __forceinline__ __device__

diff --git a/include/mscclpp/fifo_device.hpp b/include/mscclpp/fifo_device.hpp
@@ -4,11 +4,11 @@
 #ifndef MSCCLPP_FIFO_DEVICE_HPP_
 #define MSCCLPP_FIFO_DEVICE_HPP_
 
+#include <cstdint>
+
 #include "atomic.hpp"
 #include "poll_device.hpp"
 
-#include <cstdint>
-
 namespace mscclpp {
 
 /// A struct representing a pair of 64-bit unsigned integers used as a trigger for the proxy.
@@ -59,16 +59,14 @@ struct FifoDeviceHandle {
     // As atomic access is slow, we first check using the bare pointer and then use the atomic load if the
     // condition is not met.
     if (curFifoHead >= size + *(this->tailReplica)) {
-      OR_POLL_MAYBE_JAILBREAK(
-          (curFifoHead >= size + atomicLoad(this->tailReplica, memoryOrderRelaxed)),
-          (atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
-          maxSpinCount);
+      OR_POLL_MAYBE_JAILBREAK((curFifoHead >= size + atomicLoad(this->tailReplica, memoryOrderRelaxed)),
+                              (atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
+                              maxSpinCount);
     }
 
     longlong2* triggerPtr = (longlong2*)&(this->triggers[curFifoHead % size]);
 
     // store with memory order release so that the while loop does not go pass this.
-    // asm volatile("st.global.release.cta.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd));
     *triggerPtr = trigger.raw_;
 
     return curFifoHead;
@@ -81,10 +79,9 @@ struct FifoDeviceHandle {
   MSCCLPP_DEVICE_INLINE void sync(uint64_t curFifoHead, int64_t maxSpinCount = 1000000) {
     // Same as push but in this case checking the fist condition is probably faster since for tail to be pushed we need
     // to wait for cudaMemcpy to be done.
-    OR_POLL_MAYBE_JAILBREAK(
-        (curFifoHead >= atomicLoad(this->tailReplica, memoryOrderRelaxed)),
-        (atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
-        maxSpinCount);
+    OR_POLL_MAYBE_JAILBREAK((curFifoHead >= atomicLoad(this->tailReplica, memoryOrderRelaxed)),
+                            (atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
+                            maxSpinCount);
   }
 #endif  // defined(MSCCLPP_ON_HOST_DEVICE)
 

diff --git a/include/mscclpp/packet.hpp b/include/mscclpp/packet.hpp
@@ -50,7 +50,7 @@ union alignas(16) LLPacket {
   /// @return True if the flag is not equal to the given flag.
   MSCCLPP_DEVICE_INLINE bool readOnce(uint32_t flag, uint2& data) const {
     ulonglong2 reg = raw_;
-    uint4 *ptr = reinterpret_cast<uint4*>(&reg);
+    uint4* ptr = reinterpret_cast<uint4*>(&reg);
     data.x = ptr->w;
     data.y = ptr->y;
     return (ptr->x != flag) || (ptr->z != flag);
@@ -67,17 +67,15 @@ union alignas(16) LLPacket {
   }
 
   /// Clear the packet.
-  MSCCLPP_DEVICE_INLINE void clear() {
-    raw_ = make_ulonglong2(0, 0);
-  }
+  MSCCLPP_DEVICE_INLINE void clear() { raw_ = make_ulonglong2(0, 0); }
 #endif  // defined(MSCCLPP_ON_HOST_DEVICE)
 };
 
 #if defined(MSCCLPP_ON_HOST_DEVICE)
 /// Read from the origin and write to the target buffer.
 MSCCLPP_DEVICE_INLINE void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr,
-                                           uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
-                                           uint32_t numThreads, uint32_t flag) {
+                                      uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
+                                      uint32_t numThreads, uint32_t flag) {
   // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
   const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset);
   LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset);
@@ -90,8 +88,8 @@ MSCCLPP_DEVICE_INLINE void putPackets(void* targetPtr, uint64_t targetOffset, co
 
 /// Read from the target buffer and write to the origin.
 MSCCLPP_DEVICE_INLINE void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr,
-                                           uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
-                                           uint32_t numThreads, uint32_t flag) {
+                                      uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
+                                      uint32_t numThreads, uint32_t flag) {
   // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
   const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset);
   uint2* originBase = (uint2*)((char*)originPtr + originOffset);

diff --git a/include/mscclpp/poll_device.hpp b/include/mscclpp/poll_device.hpp
@@ -10,8 +10,13 @@
 
 #include <cstdint>
 
+#if defined(MSCCLPP_CUDA) || defined(MSCCLPP_CUDA_HOST)
+extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
+                                         const char *__function) __THROW;
+#else
 extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
                                          const char *__function);
+#endif
 
 // If a spin is stuck, escape from it and set status to 1.
 #define POLL_MAYBE_JAILBREAK_ESCAPE(__cond, __max_spin_cnt, __status) \

diff --git a/include/mscclpp/proxy_channel_device.hpp b/include/mscclpp/proxy_channel_device.hpp
@@ -90,8 +90,7 @@ struct ProxyChannelDeviceHandle {
   /// @param src The source memory region.
   /// @param srcOffset The offset into the source memory region.
   /// @param size The size of the transfer.
-  MSCCLPP_DEVICE_INLINE void put(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset,
-                                      uint64_t size) {
+  MSCCLPP_DEVICE_INLINE void put(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, uint64_t size) {
     fifo_.push(ChannelTrigger(TriggerData, dst, dstOffset, src, srcOffset, size, semaphoreId_).value);
   }
 
@@ -105,9 +104,7 @@ struct ProxyChannelDeviceHandle {
   }
 
   /// Push a @ref TriggerFlag to the FIFO.
-  MSCCLPP_DEVICE_INLINE void signal() {
-    fifo_.push(ChannelTrigger(TriggerFlag, 0, 0, 0, 0, 1, semaphoreId_).value);
-  }
+  MSCCLPP_DEVICE_INLINE void signal() { fifo_.push(ChannelTrigger(TriggerFlag, 0, 0, 0, 0, 1, semaphoreId_).value); }
 
   /// Push a @ref TriggerData and a @ref TriggerFlag at the same time to the FIFO.
   /// @param dst The destination memory region.
@@ -116,7 +113,7 @@ struct ProxyChannelDeviceHandle {
   /// @param srcOffset The offset into the source memory region.
   /// @param size The size of the transfer.
   MSCCLPP_DEVICE_INLINE void putWithSignal(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset,
-                                                uint64_t size) {
+                                           uint64_t size) {
     fifo_.push(ChannelTrigger(TriggerData | TriggerFlag, dst, dstOffset, src, srcOffset, size, semaphoreId_).value);
   }
 
@@ -135,8 +132,8 @@ struct ProxyChannelDeviceHandle {
   /// @param src The source memory region.
   /// @param srcOffset The offset into the source memory region.
   /// @param size The size of the transfer.
-  MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(MemoryId dst, uint64_t dstOffset, MemoryId src,
-                                                        uint64_t srcOffset, uint64_t size) {
+  MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset,
+                                                   uint64_t size) {
     uint64_t curFifoHead = fifo_.push(
         ChannelTrigger(TriggerData | TriggerFlag | TriggerSync, dst, dstOffset, src, srcOffset, size, semaphoreId_)
             .value);

diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp
@@ -69,8 +69,7 @@ struct SmChannelDeviceHandle {
 
   /// this is a helper for copy function
   template <typename T, bool CopyRemainder = true>
-  MSCCLPP_DEVICE_INLINE void copy_helper(void* dst, void* src, uint64_t bytes, uint32_t threadId,
-                                              uint32_t numThreads) {
+  MSCCLPP_DEVICE_INLINE void copy_helper(void* dst, void* src, uint64_t bytes, uint32_t threadId, uint32_t numThreads) {
     int* dstInt = reinterpret_cast<int*>(dst);
     int* srcInt = reinterpret_cast<int*>(src);
     const uintptr_t dstPtr = reinterpret_cast<uintptr_t>(dst);
@@ -136,8 +135,8 @@ struct SmChannelDeviceHandle {
   /// @param numThreads The total number of threads that run this function.
   ///
   template <int Alignment = 16, bool CopyRemainder = true>
-  MSCCLPP_DEVICE_INLINE void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
-                                      uint32_t threadId, uint32_t numThreads) {
+  MSCCLPP_DEVICE_INLINE void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
+                                 uint32_t numThreads) {
     copy<Alignment, CopyRemainder>((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId,
                                    numThreads);
   }
@@ -157,8 +156,8 @@ struct SmChannelDeviceHandle {
   /// @param numThreads The total number of threads that run this function.
   ///
   template <int Alignment = 16, bool CopyRemainder = true>
-  MSCCLPP_DEVICE_INLINE void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
-                                      uint32_t threadId, uint32_t numThreads) {
+  MSCCLPP_DEVICE_INLINE void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
+                                 uint32_t numThreads) {
     // Note that `dst` and `src` are swapped for `get()`.
     copy<Alignment, CopyRemainder>((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId,
                                    numThreads);
@@ -213,7 +212,7 @@ struct SmChannelDeviceHandle {
   /// @param numThreads The total number of threads that run this function.
   ///
   MSCCLPP_DEVICE_INLINE void putPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
-                                             uint32_t threadId, uint32_t numThreads, uint32_t flag) {
+                                        uint32_t threadId, uint32_t numThreads, uint32_t flag) {
     mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
   }
 
@@ -229,7 +228,7 @@ struct SmChannelDeviceHandle {
   /// @param numThreads The total number of threads that run this function.
   ///
   MSCCLPP_DEVICE_INLINE void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
-                                             uint32_t threadId, uint32_t numThreads, uint32_t flag) {
+                                        uint32_t threadId, uint32_t numThreads, uint32_t flag) {
     mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
   }
 

diff --git a/python/test/_cpp/proxy_test.cpp b/python/test/_cpp/proxy_test.cpp
@@ -9,8 +9,8 @@
 #include <iostream>
 #include <memory>
 #include <mscclpp/core.hpp>
-#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/fifo.hpp>
+#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/proxy.hpp>
 #include <mscclpp/semaphore.hpp>

diff --git a/src/fifo.cc b/src/fifo.cc
@@ -2,8 +2,8 @@
 // Licensed under the MIT license.
 
 #include <mscclpp/atomic.hpp>
-#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/fifo.hpp>
+#include <mscclpp/gpu_utils.hpp>
 
 #include "api.h"
 

diff --git a/src/npkit/npkit.cc b/src/npkit/npkit.cc
@@ -3,11 +3,11 @@
 
 #include "npkit.h"
 
-#include <mscclpp/gpu.hpp>
 #include <unistd.h>
 
 #include <chrono>
 #include <fstream>
+#include <mscclpp/gpu.hpp>
 
 uint64_t NpKit::rank_ = 0;
 

diff --git a/src/semaphore.cc b/src/semaphore.cc
@@ -67,8 +67,8 @@ MSCCLPP_API_CPP void Host2HostSemaphore::signal() {
 }
 
 MSCCLPP_API_CPP bool Host2HostSemaphore::poll() {
-
-  bool signaled = (atomicLoad((uint64_t*)localInboundSemaphore_.get(), memoryOrderAcquire) > (*expectedInboundSemaphore_));
+  bool signaled =
+      (atomicLoad((uint64_t*)localInboundSemaphore_.get(), memoryOrderAcquire) > (*expectedInboundSemaphore_));
   if (signaled) (*expectedInboundSemaphore_) += 1;
   return signaled;
 }

diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu
@@ -2,8 +2,8 @@
 // Licensed under the MIT license.
 
 #include <mscclpp/core.hpp>
-#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/fifo.hpp>
+#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/proxy.hpp>
 #include <mscclpp/semaphore.hpp>

diff --git a/test/unit/fifo_tests.cu b/test/unit/fifo_tests.cu
@@ -3,8 +3,8 @@
 
 #include <gtest/gtest.h>
 
-#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/fifo.hpp>
+#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/utils.hpp>