Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang committed Nov 8, 2023
1 parent 4337e3c commit e510e90
Show file tree
Hide file tree
Showing 13 changed files with 50 additions and 53 deletions.
16 changes: 8 additions & 8 deletions include/mscclpp/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,24 @@ namespace mscclpp {

#if defined(MSCCLPP_CUDA) || defined(MSCCLPP_CUDA_HOST)

constexpr auto memoryOrderRelaxed = cuda::memory_order_relaxed;
constexpr auto memoryOrderAcquire = cuda::memory_order_acquire;
constexpr auto memoryOrderRelease = cuda::memory_order_release;
constexpr auto memoryOrderAcqRel = cuda::memory_order_acq_rel;
constexpr auto memoryOrderSeqCst = cuda::memory_order_seq_cst;
constexpr cuda::memory_order memoryOrderRelaxed = cuda::memory_order_relaxed;
constexpr cuda::memory_order memoryOrderAcquire = cuda::memory_order_acquire;
constexpr cuda::memory_order memoryOrderRelease = cuda::memory_order_release;
constexpr cuda::memory_order memoryOrderAcqRel = cuda::memory_order_acq_rel;
constexpr cuda::memory_order memoryOrderSeqCst = cuda::memory_order_seq_cst;

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(const T* ptr, int memoryOrder) {
MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(T* ptr, cuda::memory_order memoryOrder) {
return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.load(memoryOrder);
}

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, int memoryOrder) {
MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, cuda::memory_order memoryOrder) {
cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.store(val, memoryOrder);
}

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) {
MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_order memoryOrder) {
return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.fetch_add(val, memoryOrder);
}

Expand Down
7 changes: 4 additions & 3 deletions include/mscclpp/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
#ifndef MSCCLPP_DEVICE_HPP_
#define MSCCLPP_DEVICE_HPP_

#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ != 0)) || (defined(__HIP_DEVICE_COMPILE__) && (__HIP_DEVICE_COMPILE__ == 1))
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ != 0)) || \
(defined(__HIP_DEVICE_COMPILE__) && (__HIP_DEVICE_COMPILE__ == 1))
/// Device code (compiled by GPU-aware compilers)

#define MSCCLPP_ON_DEVICE
Expand All @@ -21,12 +22,12 @@

#define MSCCLPP_HOST_DEVICE_INLINE MSCCLPP_DEVICE_INLINE

#elif defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
#elif defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__)
/// Host code but perhaps mixed with device code (compiled by GPU-aware compilers)

#define MSCCLPP_ON_HOST_DEVICE

#if defined(__CUDA_ARCH__)
#if defined(__NVCC__)
#include <cuda_runtime.h>
#define MSCCLPP_CUDA_HOST
#define MSCCLPP_DEVICE_INLINE __forceinline__ __device__
Expand Down
19 changes: 8 additions & 11 deletions include/mscclpp/fifo_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
#ifndef MSCCLPP_FIFO_DEVICE_HPP_
#define MSCCLPP_FIFO_DEVICE_HPP_

#include <cstdint>

#include "atomic.hpp"
#include "poll_device.hpp"

#include <cstdint>

namespace mscclpp {

/// A struct representing a pair of 64-bit unsigned integers used as a trigger for the proxy.
Expand Down Expand Up @@ -59,16 +59,14 @@ struct FifoDeviceHandle {
// As atomic access is slow, we first check using the bare pointer and then use the atomic load if the
// condition is not met.
if (curFifoHead >= size + *(this->tailReplica)) {
OR_POLL_MAYBE_JAILBREAK(
(curFifoHead >= size + atomicLoad(this->tailReplica, memoryOrderRelaxed)),
(atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
maxSpinCount);
OR_POLL_MAYBE_JAILBREAK((curFifoHead >= size + atomicLoad(this->tailReplica, memoryOrderRelaxed)),
(atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
maxSpinCount);
}

longlong2* triggerPtr = (longlong2*)&(this->triggers[curFifoHead % size]);

// store with memory order release so that the while loop does not go pass this.
// asm volatile("st.global.release.cta.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd));
*triggerPtr = trigger.raw_;

return curFifoHead;
Expand All @@ -81,10 +79,9 @@ struct FifoDeviceHandle {
MSCCLPP_DEVICE_INLINE void sync(uint64_t curFifoHead, int64_t maxSpinCount = 1000000) {
// Same as push but in this case checking the fist condition is probably faster since for tail to be pushed we need
// to wait for cudaMemcpy to be done.
OR_POLL_MAYBE_JAILBREAK(
(curFifoHead >= atomicLoad(this->tailReplica, memoryOrderRelaxed)),
(atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
maxSpinCount);
OR_POLL_MAYBE_JAILBREAK((curFifoHead >= atomicLoad(this->tailReplica, memoryOrderRelaxed)),
(atomicLoad(&(this->triggers[curFifoHead % size].fst), memoryOrderRelaxed) != 0),
maxSpinCount);
}
#endif // defined(MSCCLPP_ON_HOST_DEVICE)

Expand Down
14 changes: 6 additions & 8 deletions include/mscclpp/packet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ union alignas(16) LLPacket {
/// @return True if the flag is not equal to the given flag.
MSCCLPP_DEVICE_INLINE bool readOnce(uint32_t flag, uint2& data) const {
ulonglong2 reg = raw_;
uint4 *ptr = reinterpret_cast<uint4*>(&reg);
uint4* ptr = reinterpret_cast<uint4*>(&reg);
data.x = ptr->w;
data.y = ptr->y;
return (ptr->x != flag) || (ptr->z != flag);
Expand All @@ -67,17 +67,15 @@ union alignas(16) LLPacket {
}

/// Clear the packet.
MSCCLPP_DEVICE_INLINE void clear() {
raw_ = make_ulonglong2(0, 0);
}
MSCCLPP_DEVICE_INLINE void clear() { raw_ = make_ulonglong2(0, 0); }
#endif // defined(MSCCLPP_ON_HOST_DEVICE)
};

#if defined(MSCCLPP_ON_HOST_DEVICE)
/// Read from the origin and write to the target buffer.
MSCCLPP_DEVICE_INLINE void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr,
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads, uint32_t flag) {
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads, uint32_t flag) {
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset);
LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset);
Expand All @@ -90,8 +88,8 @@ MSCCLPP_DEVICE_INLINE void putPackets(void* targetPtr, uint64_t targetOffset, co

/// Read from the target buffer and write to the origin.
MSCCLPP_DEVICE_INLINE void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr,
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads, uint32_t flag) {
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads, uint32_t flag) {
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset);
uint2* originBase = (uint2*)((char*)originPtr + originOffset);
Expand Down
5 changes: 5 additions & 0 deletions include/mscclpp/poll_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@

#include <cstdint>

#if defined(MSCCLPP_CUDA) || defined(MSCCLPP_CUDA_HOST)
extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
const char *__function) __THROW;
#else
extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
const char *__function);
#endif

// If a spin is stuck, escape from it and set status to 1.
#define POLL_MAYBE_JAILBREAK_ESCAPE(__cond, __max_spin_cnt, __status) \
Expand Down
13 changes: 5 additions & 8 deletions include/mscclpp/proxy_channel_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,7 @@ struct ProxyChannelDeviceHandle {
/// @param src The source memory region.
/// @param srcOffset The offset into the source memory region.
/// @param size The size of the transfer.
MSCCLPP_DEVICE_INLINE void put(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset,
uint64_t size) {
MSCCLPP_DEVICE_INLINE void put(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, uint64_t size) {
fifo_.push(ChannelTrigger(TriggerData, dst, dstOffset, src, srcOffset, size, semaphoreId_).value);
}

Expand All @@ -105,9 +104,7 @@ struct ProxyChannelDeviceHandle {
}

/// Push a @ref TriggerFlag to the FIFO.
MSCCLPP_DEVICE_INLINE void signal() {
fifo_.push(ChannelTrigger(TriggerFlag, 0, 0, 0, 0, 1, semaphoreId_).value);
}
MSCCLPP_DEVICE_INLINE void signal() { fifo_.push(ChannelTrigger(TriggerFlag, 0, 0, 0, 0, 1, semaphoreId_).value); }

/// Push a @ref TriggerData and a @ref TriggerFlag at the same time to the FIFO.
/// @param dst The destination memory region.
Expand All @@ -116,7 +113,7 @@ struct ProxyChannelDeviceHandle {
/// @param srcOffset The offset into the source memory region.
/// @param size The size of the transfer.
MSCCLPP_DEVICE_INLINE void putWithSignal(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset,
uint64_t size) {
uint64_t size) {
fifo_.push(ChannelTrigger(TriggerData | TriggerFlag, dst, dstOffset, src, srcOffset, size, semaphoreId_).value);
}

Expand All @@ -135,8 +132,8 @@ struct ProxyChannelDeviceHandle {
/// @param src The source memory region.
/// @param srcOffset The offset into the source memory region.
/// @param size The size of the transfer.
MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(MemoryId dst, uint64_t dstOffset, MemoryId src,
uint64_t srcOffset, uint64_t size) {
MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset,
uint64_t size) {
uint64_t curFifoHead = fifo_.push(
ChannelTrigger(TriggerData | TriggerFlag | TriggerSync, dst, dstOffset, src, srcOffset, size, semaphoreId_)
.value);
Expand Down
15 changes: 7 additions & 8 deletions include/mscclpp/sm_channel_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ struct SmChannelDeviceHandle {

/// this is a helper for copy function
template <typename T, bool CopyRemainder = true>
MSCCLPP_DEVICE_INLINE void copy_helper(void* dst, void* src, uint64_t bytes, uint32_t threadId,
uint32_t numThreads) {
MSCCLPP_DEVICE_INLINE void copy_helper(void* dst, void* src, uint64_t bytes, uint32_t threadId, uint32_t numThreads) {
int* dstInt = reinterpret_cast<int*>(dst);
int* srcInt = reinterpret_cast<int*>(src);
const uintptr_t dstPtr = reinterpret_cast<uintptr_t>(dst);
Expand Down Expand Up @@ -136,8 +135,8 @@ struct SmChannelDeviceHandle {
/// @param numThreads The total number of threads that run this function.
///
template <int Alignment = 16, bool CopyRemainder = true>
MSCCLPP_DEVICE_INLINE void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
uint32_t threadId, uint32_t numThreads) {
MSCCLPP_DEVICE_INLINE void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads) {
copy<Alignment, CopyRemainder>((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId,
numThreads);
}
Expand All @@ -157,8 +156,8 @@ struct SmChannelDeviceHandle {
/// @param numThreads The total number of threads that run this function.
///
template <int Alignment = 16, bool CopyRemainder = true>
MSCCLPP_DEVICE_INLINE void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
uint32_t threadId, uint32_t numThreads) {
MSCCLPP_DEVICE_INLINE void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads) {
// Note that `dst` and `src` are swapped for `get()`.
copy<Alignment, CopyRemainder>((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId,
numThreads);
Expand Down Expand Up @@ -213,7 +212,7 @@ struct SmChannelDeviceHandle {
/// @param numThreads The total number of threads that run this function.
///
MSCCLPP_DEVICE_INLINE void putPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
uint32_t threadId, uint32_t numThreads, uint32_t flag) {
uint32_t threadId, uint32_t numThreads, uint32_t flag) {
mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
}

Expand All @@ -229,7 +228,7 @@ struct SmChannelDeviceHandle {
/// @param numThreads The total number of threads that run this function.
///
MSCCLPP_DEVICE_INLINE void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
uint32_t threadId, uint32_t numThreads, uint32_t flag) {
uint32_t threadId, uint32_t numThreads, uint32_t flag) {
mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
}

Expand Down
2 changes: 1 addition & 1 deletion python/test/_cpp/proxy_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
#include <iostream>
#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/proxy.hpp>
#include <mscclpp/semaphore.hpp>
Expand Down
2 changes: 1 addition & 1 deletion src/fifo.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
// Licensed under the MIT license.

#include <mscclpp/atomic.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>

#include "api.h"

Expand Down
2 changes: 1 addition & 1 deletion src/npkit/npkit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

#include "npkit.h"

#include <mscclpp/gpu.hpp>
#include <unistd.h>

#include <chrono>
#include <fstream>
#include <mscclpp/gpu.hpp>

uint64_t NpKit::rank_ = 0;

Expand Down
4 changes: 2 additions & 2 deletions src/semaphore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ MSCCLPP_API_CPP void Host2HostSemaphore::signal() {
}

MSCCLPP_API_CPP bool Host2HostSemaphore::poll() {

bool signaled = (atomicLoad((uint64_t*)localInboundSemaphore_.get(), memoryOrderAcquire) > (*expectedInboundSemaphore_));
bool signaled =
(atomicLoad((uint64_t*)localInboundSemaphore_.get(), memoryOrderAcquire) > (*expectedInboundSemaphore_));
if (signaled) (*expectedInboundSemaphore_) += 1;
return signaled;
}
Expand Down
2 changes: 1 addition & 1 deletion test/allgather_test_host_offloading.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
// Licensed under the MIT license.

#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/proxy.hpp>
#include <mscclpp/semaphore.hpp>
Expand Down
2 changes: 1 addition & 1 deletion test/unit/fifo_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

#include <gtest/gtest.h>

#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/utils.hpp>

Expand Down

0 comments on commit e510e90

Please sign in to comment.