From 76f5e28a576d0d2a77ed25087eb627cd5c0520e9 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 7 Oct 2023 12:48:57 +0000 Subject: [PATCH] Tackle the comment --- include/mscclpp/packet.hpp | 32 +++++++------- include/mscclpp/sm_channel_device.hpp | 60 +++++++++++++-------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/include/mscclpp/packet.hpp b/include/mscclpp/packet.hpp index aa8637ee5..23ca93011 100644 --- a/include/mscclpp/packet.hpp +++ b/include/mscclpp/packet.hpp @@ -75,29 +75,29 @@ union LLPacket { }; #ifdef __CUDACC__ -/// Read from the data and write to the packet buffer. -__forceinline__ __device__ void putPackets(void* bufPtr, uint64_t bufOffset, const void* dataPtr, uint64_t dataOffset, - uint64_t dataBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { +/// Read from the origin and write to the target buffer. +__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, uint64_t originOffset, + uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes - const uint32_t* dataBase = (const uint32_t*)((const char*)dataPtr + dataOffset); - LLPacket* bufBase = (LLPacket*)((char*)bufPtr + bufOffset); - size_t nElem = dataBytes / sizeof(uint64_t); + const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset); + LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset); + size_t nElem = originBytes / sizeof(uint64_t); for (size_t i = threadId; i < nElem; i += numThreads) { - LLPacket* pkt = &bufBase[i]; - pkt->write(dataBase[2 * i], dataBase[2 * i + 1], flag); + LLPacket* pkt = &targetBase[i]; + pkt->write(originBase[2 * i], originBase[2 * i + 1], flag); } } -/// Read from the packet buffer and write to the data. -__forceinline__ __device__ void getPackets(const void* bufPtr, uint64_t bufOffset, void* dataPtr, uint64_t dataOffset, - uint64_t dataBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { +/// Read from the target buffer and write to the origin. +__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr, uint64_t originOffset, + uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes - const LLPacket* bufBase = (const LLPacket*)((const char*)bufPtr + bufOffset); - uint2* dataBase = (uint2*)((char*)dataPtr + dataOffset); - size_t nElem = dataBytes / sizeof(uint2); + const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset); + uint2* originBase = (uint2*)((char*)originPtr + originOffset); + size_t nElem = originBytes / sizeof(uint2); for (size_t i = threadId; i < nElem; i += numThreads) { - const LLPacket* pkt = &bufBase[i]; - dataBase[i] = pkt->read(flag); + const LLPacket* pkt = &targetBase[i]; + originBase[i] = pkt->read(flag); } } #endif // __CUDACC__ diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp index 1cbee3394..a30faa932 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/sm_channel_device.hpp @@ -196,48 +196,48 @@ struct SmChannelDeviceHandle { } } - /// Copy data from the local memory to the remote memory. + /// Copy data from the local memory (origin) to the remote memory (target). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// /// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16. /// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p /// Alignment. - /// @param bufOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment. - /// @param dataOffset The offset in bytes of the local address. Should be a multiple of @p Alignment. - /// @param dataBytes Bytes of the data to be copied. Should be a multiple of @p Alignment. + /// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment. + /// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment. + /// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment. /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void put(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes, uint32_t threadId, + __forceinline__ __device__ void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId, uint32_t numThreads) { - copy((char*)dst_ + bufOffset, (char*)src_ + dataOffset, dataBytes, threadId, numThreads); + copy((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId, numThreads); } - /// Copy data from the remote memory to the local memory. + /// Copy data from the remote memory (target) to the local memory (origin). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// /// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16. /// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p /// Alignment. - /// @param bufOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment. - /// @param dataOffset The offset in bytes of the local address. Should be a multiple of @p Alignment. - /// @param dataBytes Bytes of the data to be copied. Should be a multiple of @p Alignment. + /// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment. + /// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment. + /// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment. /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void get(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes, uint32_t threadId, + __forceinline__ __device__ void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId, uint32_t numThreads) { // Note that `dst` and `src` are swapped for `get()`. - copy((char*)src_ + dataOffset, (char*)dst_ + bufOffset, dataBytes, threadId, numThreads); + copy((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId, numThreads); } - /// Copy data from the local memory to the remote memory. + /// Copy data from the local memory (origin) to the remote memory (target). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// @@ -251,11 +251,11 @@ struct SmChannelDeviceHandle { /// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void put(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) { - put(offset, offset, size, threadId, numThreads); + __forceinline__ __device__ void put(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) { + put(offset, offset, bytes, threadId, numThreads); } - /// Copy data from the remote memory to the local memory. + /// Copy data from the remote memory (target) to the local memory (origin). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// @@ -269,40 +269,40 @@ struct SmChannelDeviceHandle { /// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void get(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) { - get(offset, offset, size, threadId, numThreads); + __forceinline__ __device__ void get(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) { + get(offset, offset, bytes, threadId, numThreads); } - /// Construct @ref LLPacket from the data in the local memory and write it on the remote packet buffer. + /// Construct @ref LLPacket from the data in the local memory (origin) and write it on the remote packet buffer (target). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of packets. /// - /// @param bufOffset The offset in bytes of the remote packet buffer. - /// @param dataOffset The offset in bytes of the local data. - /// @param dataBytes Bytes of the data to be copied. + /// @param targetOffset The offset in bytes of the remote packet buffer. + /// @param originOffset The offset in bytes of the local data. + /// @param originBytes Bytes of the origin to be copied. /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. /// - __forceinline__ __device__ void putPackets(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes, + __forceinline__ __device__ void putPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { - mscclpp::putPackets(dst_, bufOffset, src_, dataOffset, dataBytes, threadId, numThreads, flag); + mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } - /// Retrieve data from @ref LLPacket in the local packet buffer and write it on the local data. + /// Retrieve data from @ref LLPacket in the local packet buffer (target) and write it on the local data (origin). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// - /// @param bufOffset The offset in bytes of the local packet buffer. - /// @param dataOffset The offset in bytes of the local data. - /// @param dataBytes Bytes of the data to be copied. + /// @param targetOffset The offset in bytes of the local packet buffer. + /// @param originOffset The offset in bytes of the local data. + /// @param originBytes Bytes of the origin to be copied. /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. /// - __forceinline__ __device__ void getPackets(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes, + __forceinline__ __device__ void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { - mscclpp::getPackets(getPacketBuffer_, bufOffset, src_, dataOffset, dataBytes, threadId, numThreads, flag); + mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } /// Signal the remote semaphore.