Skip to content

Commit

Permalink
Tackle the comment
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang committed Oct 7, 2023
1 parent 8f21198 commit 76f5e28
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 46 deletions.
32 changes: 16 additions & 16 deletions include/mscclpp/packet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,29 +75,29 @@ union LLPacket {
};

#ifdef __CUDACC__
/// Read from the data and write to the packet buffer.
__forceinline__ __device__ void putPackets(void* bufPtr, uint64_t bufOffset, const void* dataPtr, uint64_t dataOffset,
uint64_t dataBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
/// Read from the origin and write to the target buffer.
__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, uint64_t originOffset,
uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
const uint32_t* dataBase = (const uint32_t*)((const char*)dataPtr + dataOffset);
LLPacket* bufBase = (LLPacket*)((char*)bufPtr + bufOffset);
size_t nElem = dataBytes / sizeof(uint64_t);
const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset);
LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset);
size_t nElem = originBytes / sizeof(uint64_t);
for (size_t i = threadId; i < nElem; i += numThreads) {
LLPacket* pkt = &bufBase[i];
pkt->write(dataBase[2 * i], dataBase[2 * i + 1], flag);
LLPacket* pkt = &targetBase[i];
pkt->write(originBase[2 * i], originBase[2 * i + 1], flag);
}
}

/// Read from the packet buffer and write to the data.
__forceinline__ __device__ void getPackets(const void* bufPtr, uint64_t bufOffset, void* dataPtr, uint64_t dataOffset,
uint64_t dataBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
/// Read from the target buffer and write to the origin.
__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr, uint64_t originOffset,
uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
const LLPacket* bufBase = (const LLPacket*)((const char*)bufPtr + bufOffset);
uint2* dataBase = (uint2*)((char*)dataPtr + dataOffset);
size_t nElem = dataBytes / sizeof(uint2);
const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset);
uint2* originBase = (uint2*)((char*)originPtr + originOffset);
size_t nElem = originBytes / sizeof(uint2);
for (size_t i = threadId; i < nElem; i += numThreads) {
const LLPacket* pkt = &bufBase[i];
dataBase[i] = pkt->read(flag);
const LLPacket* pkt = &targetBase[i];
originBase[i] = pkt->read(flag);
}
}
#endif // __CUDACC__
Expand Down
60 changes: 30 additions & 30 deletions include/mscclpp/sm_channel_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,48 +196,48 @@ struct SmChannelDeviceHandle {
}
}

/// Copy data from the local memory to the remote memory.
/// Copy data from the local memory (origin) to the remote memory (target).
///
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
///
/// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16.
/// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p
/// Alignment.
/// @param bufOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
/// @param dataOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
/// @param dataBytes Bytes of the data to be copied. Should be a multiple of @p Alignment.
/// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
/// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
/// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment.
/// @param threadId The index of the current thread among all threads running this function. This is different from
/// the `threadIdx` in CUDA.
/// @param numThreads The total number of threads that run this function.
///
template <int Alignment = 16, bool CopyRemainder = true>
__forceinline__ __device__ void put(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes, uint32_t threadId,
__forceinline__ __device__ void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads) {
copy<Alignment, CopyRemainder>((char*)dst_ + bufOffset, (char*)src_ + dataOffset, dataBytes, threadId, numThreads);
copy<Alignment, CopyRemainder>((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId, numThreads);
}

/// Copy data from the remote memory to the local memory.
/// Copy data from the remote memory (target) to the local memory (origin).
///
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
///
/// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16.
/// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p
/// Alignment.
/// @param bufOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
/// @param dataOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
/// @param dataBytes Bytes of the data to be copied. Should be a multiple of @p Alignment.
/// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
/// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
/// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment.
/// @param threadId The index of the current thread among all threads running this function. This is different from
/// the `threadIdx` in CUDA.
/// @param numThreads The total number of threads that run this function.
///
template <int Alignment = 16, bool CopyRemainder = true>
__forceinline__ __device__ void get(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes, uint32_t threadId,
__forceinline__ __device__ void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads) {
// Note that `dst` and `src` are swapped for `get()`.
copy<Alignment, CopyRemainder>((char*)src_ + dataOffset, (char*)dst_ + bufOffset, dataBytes, threadId, numThreads);
copy<Alignment, CopyRemainder>((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId, numThreads);
}

/// Copy data from the local memory to the remote memory.
/// Copy data from the local memory (origin) to the remote memory (target).
///
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
///
Expand All @@ -251,11 +251,11 @@ struct SmChannelDeviceHandle {
/// @param numThreads The total number of threads that run this function.
///
template <int Alignment = 16, bool CopyRemainder = true>
__forceinline__ __device__ void put(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) {
put<Alignment, CopyRemainder>(offset, offset, size, threadId, numThreads);
__forceinline__ __device__ void put(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) {
put<Alignment, CopyRemainder>(offset, offset, bytes, threadId, numThreads);
}

/// Copy data from the remote memory to the local memory.
/// Copy data from the remote memory (target) to the local memory (origin).
///
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
///
Expand All @@ -269,40 +269,40 @@ struct SmChannelDeviceHandle {
/// @param numThreads The total number of threads that run this function.
///
template <int Alignment = 16, bool CopyRemainder = true>
__forceinline__ __device__ void get(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) {
get<Alignment, CopyRemainder>(offset, offset, size, threadId, numThreads);
__forceinline__ __device__ void get(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) {
get<Alignment, CopyRemainder>(offset, offset, bytes, threadId, numThreads);
}

/// Construct @ref LLPacket from the data in the local memory and write it on the remote packet buffer.
/// Construct @ref LLPacket from the data in the local memory (origin) and write it on the remote packet buffer (target).
///
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of packets.
///
/// @param bufOffset The offset in bytes of the remote packet buffer.
/// @param dataOffset The offset in bytes of the local data.
/// @param dataBytes Bytes of the data to be copied.
/// @param targetOffset The offset in bytes of the remote packet buffer.
/// @param originOffset The offset in bytes of the local data.
/// @param originBytes Bytes of the origin to be copied.
/// @param threadId The index of the current thread among all threads running this function. This is different from
/// the `threadIdx` in CUDA.
/// @param numThreads The total number of threads that run this function.
///
__forceinline__ __device__ void putPackets(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes,
__forceinline__ __device__ void putPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
uint32_t threadId, uint32_t numThreads, uint32_t flag) {
mscclpp::putPackets(dst_, bufOffset, src_, dataOffset, dataBytes, threadId, numThreads, flag);
mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
}

/// Retrieve data from @ref LLPacket in the local packet buffer and write it on the local data.
/// Retrieve data from @ref LLPacket in the local packet buffer (target) and write it on the local data (origin).
///
/// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
///
/// @param bufOffset The offset in bytes of the local packet buffer.
/// @param dataOffset The offset in bytes of the local data.
/// @param dataBytes Bytes of the data to be copied.
/// @param targetOffset The offset in bytes of the local packet buffer.
/// @param originOffset The offset in bytes of the local data.
/// @param originBytes Bytes of the origin to be copied.
/// @param threadId The index of the current thread among all threads running this function. This is different from
/// the `threadIdx` in CUDA.
/// @param numThreads The total number of threads that run this function.
///
__forceinline__ __device__ void getPackets(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes,
__forceinline__ __device__ void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
uint32_t threadId, uint32_t numThreads, uint32_t flag) {
mscclpp::getPackets(getPacketBuffer_, bufOffset, src_, dataOffset, dataBytes, threadId, numThreads, flag);
mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
}

/// Signal the remote semaphore.
Expand Down

0 comments on commit 76f5e28

Please sign in to comment.