diff --git a/include/mscclpp/packet.hpp b/include/mscclpp/packet.hpp index 23ca93011..6f95bb092 100644 --- a/include/mscclpp/packet.hpp +++ b/include/mscclpp/packet.hpp @@ -76,8 +76,9 @@ union LLPacket { #ifdef __CUDACC__ /// Read from the origin and write to the target buffer. -__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, uint64_t originOffset, - uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { +__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset); LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset); @@ -89,8 +90,9 @@ __forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffse } /// Read from the target buffer and write to the origin. -__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr, uint64_t originOffset, - uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { +__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset); uint2* originBase = (uint2*)((char*)originPtr + originOffset); diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp index a30faa932..30c69631c 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/sm_channel_device.hpp @@ -211,9 +211,10 @@ struct SmChannelDeviceHandle { /// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads) { - copy((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId, numThreads); + __forceinline__ __device__ void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads) { + copy((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId, + numThreads); } /// Copy data from the remote memory (target) to the local memory (origin). @@ -231,10 +232,11 @@ struct SmChannelDeviceHandle { /// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads) { + __forceinline__ __device__ void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads) { // Note that `dst` and `src` are swapped for `get()`. - copy((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId, numThreads); + copy((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId, + numThreads); } /// Copy data from the local memory (origin) to the remote memory (target). @@ -273,7 +275,8 @@ struct SmChannelDeviceHandle { get(offset, offset, bytes, threadId, numThreads); } - /// Construct @ref LLPacket from the data in the local memory (origin) and write it on the remote packet buffer (target). + /// Construct @ref LLPacket from the data in the local memory (origin) and write it on the remote packet buffer + /// (target). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of packets. ///