Tackle the comment

microsoft · Oct 7, 2023 · 76f5e28 · 76f5e28
1 parent 8f21198
commit 76f5e28
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 46 deletions.
diff --git a/include/mscclpp/packet.hpp b/include/mscclpp/packet.hpp
@@ -75,29 +75,29 @@ union LLPacket {
 };
 
 #ifdef __CUDACC__
-/// Read from the data and write to the packet buffer.
-__forceinline__ __device__ void putPackets(void* bufPtr, uint64_t bufOffset, const void* dataPtr, uint64_t dataOffset,
-                                           uint64_t dataBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
+/// Read from the origin and write to the target buffer.
+__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, uint64_t originOffset,
+                                           uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
   // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
-  const uint32_t* dataBase = (const uint32_t*)((const char*)dataPtr + dataOffset);
-  LLPacket* bufBase = (LLPacket*)((char*)bufPtr + bufOffset);
-  size_t nElem = dataBytes / sizeof(uint64_t);
+  const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset);
+  LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset);
+  size_t nElem = originBytes / sizeof(uint64_t);
   for (size_t i = threadId; i < nElem; i += numThreads) {
-    LLPacket* pkt = &bufBase[i];
-    pkt->write(dataBase[2 * i], dataBase[2 * i + 1], flag);
+    LLPacket* pkt = &targetBase[i];
+    pkt->write(originBase[2 * i], originBase[2 * i + 1], flag);
   }
 }
 
-/// Read from the packet buffer and write to the data.
-__forceinline__ __device__ void getPackets(const void* bufPtr, uint64_t bufOffset, void* dataPtr, uint64_t dataOffset,
-                                           uint64_t dataBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
+/// Read from the target buffer and write to the origin.
+__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr, uint64_t originOffset,
+                                           uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
   // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
-  const LLPacket* bufBase = (const LLPacket*)((const char*)bufPtr + bufOffset);
-  uint2* dataBase = (uint2*)((char*)dataPtr + dataOffset);
-  size_t nElem = dataBytes / sizeof(uint2);
+  const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset);
+  uint2* originBase = (uint2*)((char*)originPtr + originOffset);
+  size_t nElem = originBytes / sizeof(uint2);
   for (size_t i = threadId; i < nElem; i += numThreads) {
-    const LLPacket* pkt = &bufBase[i];
-    dataBase[i] = pkt->read(flag);
+    const LLPacket* pkt = &targetBase[i];
+    originBase[i] = pkt->read(flag);
   }
 }
 #endif  // __CUDACC__

diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp
@@ -196,48 +196,48 @@ struct SmChannelDeviceHandle {
     }
   }
 
-  /// Copy data from the local memory to the remote memory.
+  /// Copy data from the local memory (origin) to the remote memory (target).
   ///
   /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
   ///
   /// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16.
   /// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p
   /// Alignment.
-  /// @param bufOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
-  /// @param dataOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
-  /// @param dataBytes Bytes of the data to be copied. Should be a multiple of @p Alignment.
+  /// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
+  /// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
+  /// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment.
   /// @param threadId The index of the current thread among all threads running this function. This is different from
   /// the `threadIdx` in CUDA.
   /// @param numThreads The total number of threads that run this function.
   ///
   template <int Alignment = 16, bool CopyRemainder = true>
-  __forceinline__ __device__ void put(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes, uint32_t threadId,
+  __forceinline__ __device__ void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
                                       uint32_t numThreads) {
-    copy<Alignment, CopyRemainder>((char*)dst_ + bufOffset, (char*)src_ + dataOffset, dataBytes, threadId, numThreads);
+    copy<Alignment, CopyRemainder>((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId, numThreads);
   }
 
-  /// Copy data from the remote memory to the local memory.
+  /// Copy data from the remote memory (target) to the local memory (origin).
   ///
   /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
   ///
   /// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16.
   /// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p
   /// Alignment.
-  /// @param bufOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
-  /// @param dataOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
-  /// @param dataBytes Bytes of the data to be copied. Should be a multiple of @p Alignment.
+  /// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment.
+  /// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment.
+  /// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment.
   /// @param threadId The index of the current thread among all threads running this function. This is different from
   /// the `threadIdx` in CUDA.
   /// @param numThreads The total number of threads that run this function.
   ///
   template <int Alignment = 16, bool CopyRemainder = true>
-  __forceinline__ __device__ void get(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes, uint32_t threadId,
+  __forceinline__ __device__ void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
                                       uint32_t numThreads) {
     // Note that `dst` and `src` are swapped for `get()`.
-    copy<Alignment, CopyRemainder>((char*)src_ + dataOffset, (char*)dst_ + bufOffset, dataBytes, threadId, numThreads);
+    copy<Alignment, CopyRemainder>((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId, numThreads);
   }
 
-  /// Copy data from the local memory to the remote memory.
+  /// Copy data from the local memory (origin) to the remote memory (target).
   ///
   /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
   ///
@@ -251,11 +251,11 @@ struct SmChannelDeviceHandle {
   /// @param numThreads The total number of threads that run this function.
   ///
   template <int Alignment = 16, bool CopyRemainder = true>
-  __forceinline__ __device__ void put(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) {
-    put<Alignment, CopyRemainder>(offset, offset, size, threadId, numThreads);
+  __forceinline__ __device__ void put(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) {
+    put<Alignment, CopyRemainder>(offset, offset, bytes, threadId, numThreads);
   }
 
-  /// Copy data from the remote memory to the local memory.
+  /// Copy data from the remote memory (target) to the local memory (origin).
   ///
   /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
   ///
@@ -269,40 +269,40 @@ struct SmChannelDeviceHandle {
   /// @param numThreads The total number of threads that run this function.
   ///
   template <int Alignment = 16, bool CopyRemainder = true>
-  __forceinline__ __device__ void get(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) {
-    get<Alignment, CopyRemainder>(offset, offset, size, threadId, numThreads);
+  __forceinline__ __device__ void get(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) {
+    get<Alignment, CopyRemainder>(offset, offset, bytes, threadId, numThreads);
   }
 
-  /// Construct @ref LLPacket from the data in the local memory and write it on the remote packet buffer.
+  /// Construct @ref LLPacket from the data in the local memory (origin) and write it on the remote packet buffer (target).
   ///
   /// This function is intended to be collectively called by multiple threads. Each thread copies a part of packets.
   ///
-  /// @param bufOffset The offset in bytes of the remote packet buffer.
-  /// @param dataOffset The offset in bytes of the local data.
-  /// @param dataBytes Bytes of the data to be copied.
+  /// @param targetOffset The offset in bytes of the remote packet buffer.
+  /// @param originOffset The offset in bytes of the local data.
+  /// @param originBytes Bytes of the origin to be copied.
   /// @param threadId The index of the current thread among all threads running this function. This is different from
   /// the `threadIdx` in CUDA.
   /// @param numThreads The total number of threads that run this function.
   ///
-  __forceinline__ __device__ void putPackets(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes,
+  __forceinline__ __device__ void putPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
                                              uint32_t threadId, uint32_t numThreads, uint32_t flag) {
-    mscclpp::putPackets(dst_, bufOffset, src_, dataOffset, dataBytes, threadId, numThreads, flag);
+    mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
   }
 
-  /// Retrieve data from @ref LLPacket in the local packet buffer and write it on the local data.
+  /// Retrieve data from @ref LLPacket in the local packet buffer (target) and write it on the local data (origin).
   ///
   /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data.
   ///
-  /// @param bufOffset The offset in bytes of the local packet buffer.
-  /// @param dataOffset The offset in bytes of the local data.
-  /// @param dataBytes Bytes of the data to be copied.
+  /// @param targetOffset The offset in bytes of the local packet buffer.
+  /// @param originOffset The offset in bytes of the local data.
+  /// @param originBytes Bytes of the origin to be copied.
   /// @param threadId The index of the current thread among all threads running this function. This is different from
   /// the `threadIdx` in CUDA.
   /// @param numThreads The total number of threads that run this function.
   ///
-  __forceinline__ __device__ void getPackets(uint64_t bufOffset, uint64_t dataOffset, uint64_t dataBytes,
+  __forceinline__ __device__ void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
                                              uint32_t threadId, uint32_t numThreads, uint32_t flag) {
-    mscclpp::getPackets(getPacketBuffer_, bufOffset, src_, dataOffset, dataBytes, threadId, numThreads, flag);
+    mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
   }
 
   /// Signal the remote semaphore.