diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 53395a785..b478dc5ae 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -12,7 +12,7 @@ jobs:
     name: Analyze
     runs-on: 'ubuntu-latest'
     container:
-      image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }}
+      image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
 
     permissions:
       actions: read
@@ -27,7 +27,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Check disk space
       run: |
@@ -38,12 +38,6 @@ jobs:
       with:
         languages: ${{ matrix.language }}
 
-    - name: Install cmake
-      run: |
-        curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
-        sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake
-
     - name: Dubious ownership exception
       run: |
         git config --global --add safe.directory /__w/mscclpp/mscclpp
diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml
index 13eb10f0f..24dacf9ec 100644
--- a/.github/workflows/integration-test-backup.yml
+++ b/.github/workflows/integration-test-backup.yml
@@ -4,7 +4,7 @@ on: workflow_dispatch
 
 jobs:
   IntegrationTest:
-    runs-on: self-hosted
+    runs-on: [ self-hosted, A100 ]
     defaults:
       run:
         shell: bash
@@ -13,22 +13,17 @@ jobs:
         cuda: [ cuda11.8, cuda12.1 ]
 
     container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
+      image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
       options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
 
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
-      - name: Install CMake
-        run: |
-          curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-          tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
-
       - name: Build
         run: |
           mkdir build && cd build
-          MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+          MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
           make -j
 
       - name: Lock GPU clock frequency
@@ -41,7 +36,6 @@ jobs:
       - name: Run mscclpp AllGather test
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -50,13 +44,11 @@ jobs:
       - name: Run mscclpp SendRecv test
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
 
       - name: Run mscclpp AllReduce test
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -68,7 +60,6 @@ jobs:
       - name: Run mscclpp AllToAll test
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
           mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
 
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 9cdcf443d..aaffe9578 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -11,7 +11,7 @@ jobs:
 
     steps:
     - name: Check out Git repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Install ClangFormat
       run: |
@@ -28,25 +28,25 @@ jobs:
 
     steps:
       - name: Check out Git repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
   
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: '3'  # quoted so YAML loads it as a string, not the integer 3
 
       - name: Install Python dependencies
-        run: python3.8 -m pip install black
+        run: python3 -m pip install black
 
       - name: Run black
-        run: python3.8 -m black --check --config pyproject.toml .
+        run: python3 -m black --check --config pyproject.toml .
 
   spelling:
     runs-on: ubuntu-20.04
 
     steps:
     - name: Check out Git repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Download misspell
       run: |
diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml
index 9bcbf53b2..df8db2cbb 100644
--- a/.github/workflows/ut-backup.yml
+++ b/.github/workflows/ut-backup.yml
@@ -4,7 +4,7 @@ on: workflow_dispatch
 
 jobs:
   UnitTest:
-    runs-on: self-hosted
+    runs-on: [ self-hosted, A100 ]
     defaults:
       run:
         shell: bash
@@ -14,7 +14,7 @@ jobs:
         cuda: [ cuda11.8, cuda12.1 ]
 
     container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
+      image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
       options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
 
     steps:
@@ -23,10 +23,8 @@ jobs:
 
       - name: Build
         run: |
-          curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-          tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
           mkdir build && cd build
-          MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+          MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
           make -j
         working-directory: ${{ github.workspace }}
 
@@ -36,31 +34,20 @@ jobs:
           for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
             sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
           done
-        working-directory: ${{ github.workspace }}
 
       - name: UnitTests
         run: |
           ./build/test/unit_tests
-        working-directory: ${{ github.workspace }}
 
       - name: MpUnitTests
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
           mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
           mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
-        working-directory: ${{ github.workspace }}
 
       - name: PyTests
         run: |
           set -e
-          export PATH=/usr/local/mpi/bin:$PATH
           cd build && make pylib-copy
-          if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
-            python3 -m pip install -r ../python/test/requirements_cu11.txt
-          else
-            python3 -m pip install -r ../python/test/requirements_cu12.txt
-          fi
           mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
-        working-directory: ${{ github.workspace }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c20c4cef6..3b33a6e96 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 set(MSCCLPP_MAJOR "0")
-set(MSCCLPP_MINOR "2")
+set(MSCCLPP_MINOR "3")
 set(MSCCLPP_PATCH "0")
 
 set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
diff --git a/README.md b/README.md
index 56a2fcf1e..7f0112ec1 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,9 @@ MSCCL++ is a development kit for implementing highly optimized distributed GPU a
 
 * **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logics, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.
 
-## Key Features (v0.2)
+## Key Features (v0.3)
 
-MSCCL++ v0.2 supports the following features.
+MSCCL++ v0.3 supports the following features.
 
 ### In-Kernel Communication Interfaces
 
@@ -124,31 +124,15 @@ Customized proxies can be used for conducting a series of pre-defined data trans
 
 Most of key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases.
 
-## Status & Roadmap
+### New in MSCCL++ v0.3 (Latest Release)
+* Update interfaces
+* Add Python bindings and interfaces
+* Add Python unit tests
+* Add more configurable parameters
+* Add a new single-node AllReduce kernel
+* Fix bugs
 
-MSCCL++ is under active development and a part of its features will be added in a future release. The following describes key features of each version.
-
-### MSCCL++ v0.4 (TBU)
-* Automatic task scheduler
-* Dynamic performance tuning
-
-### MSCCL++ v0.3 (TBU)
-* Tile-based communication: efficient transport of 2D data patches (tiles)
-* GPU computation interfaces
-
-### MSCCL++ v0.2 (Latest Release)
-* Basic communication functionalities and new interfaces
-    - GPU-side communication interfaces
-    - Host-side helpers: bootstrap, communicator, and proxy
-    - Supports both NVLink and InfiniBand
-    - Supports both in-SM copy and DMA/RDMA
-* Communication performance optimization
-    - Example code outperforms NCCL/MSCCL AllGather/AllReduce/AllToAll
-* Development pipeline
-* Documentation
-
-### MSCCL++ v0.1
-* Proof-of-concept, preliminary interfaces
+See https://github.com/microsoft/mscclpp/issues/89 for details.
 
 ## Contributing
 
diff --git a/docker/base-cuda12.1.dockerfile b/docker/base-cuda12.1.dockerfile
index b28a1995f..5c5bcd602 100644
--- a/docker/base-cuda12.1.dockerfile
+++ b/docker/base-cuda12.1.dockerfile
@@ -5,7 +5,10 @@ LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
 
 ENV DEBIAN_FRONTEND=noninteractive
 
-RUN apt-get update && \
+RUN rm -rf /opt/nvidia
+
+RUN apt-get clean && \
+    apt-get update && \
     apt-get install -y --no-install-recommends \
         build-essential \
         ca-certificates \
@@ -47,8 +50,10 @@ RUN cd /tmp && \
     cd .. && \
     rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
 
-ENV PATH="${PATH}:/usr/local/mpi/bin" \
-    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
+ENV PATH="/usr/local/mpi/bin:${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}"
 
 RUN echo PATH="${PATH}" > /etc/environment && \
     echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
+
+ENTRYPOINT []
diff --git a/docker/dev-cuda11.8.dockerfile b/docker/dev-cuda11.8.dockerfile
new file mode 100644
index 000000000..094772b06
--- /dev/null
+++ b/docker/dev-cuda11.8.dockerfile
@@ -0,0 +1,33 @@
+# Development image: the MSCCL++ base image plus CMake and the Python test dependencies.
+FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
+
+LABEL maintainer="MSCCL++"
+LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"
+
+ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
+    CMAKE_VERSION="3.26.4"
+
+# The build context is only read below for the pip requirements file and is removed again at the end.
+COPY . ${MSCCLPP_SRC_DIR}
+WORKDIR ${MSCCLPP_SRC_DIR}
+
+# Install CMake ${CMAKE_VERSION} under /usr/local.
+# curl -f turns an HTTP error into a failed build step instead of saving the error page as the tarball.
+ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
+    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
+RUN curl -fL ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
+    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
+    rm -rf ${CMAKE_HOME}.tar.gz
+ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
+
+# Install pytest & dependencies
+RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt
+
+# Publish the updated PATH in /etc/environment. `>` truncates the file, so LD_LIBRARY_PATH is written
+# again too. NOTE(review): assumes the base image stored LD_LIBRARY_PATH there, as base-cuda12.1 does -- confirm.
+RUN echo PATH="${PATH}" > /etc/environment && \
+    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
+
+# Cleanup: removes the sources from the final filesystem (the COPY layer above still contains them).
+WORKDIR /
+RUN rm -rf ${MSCCLPP_SRC_DIR}
diff --git a/docker/dev-cuda12.1.dockerfile b/docker/dev-cuda12.1.dockerfile
new file mode 100644
index 000000000..70fe684c1
--- /dev/null
+++ b/docker/dev-cuda12.1.dockerfile
@@ -0,0 +1,33 @@
+# Development image: the MSCCL++ base image plus CMake and the Python test dependencies.
+FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+
+LABEL maintainer="MSCCL++"
+LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"
+
+ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
+    CMAKE_VERSION="3.26.4"
+
+# The build context is only read below for the pip requirements file and is removed again at the end.
+COPY . ${MSCCLPP_SRC_DIR}
+WORKDIR ${MSCCLPP_SRC_DIR}
+
+# Install CMake ${CMAKE_VERSION} under /usr/local and delete the downloaded tarball in the same layer.
+# curl -f turns an HTTP error into a failed build step instead of saving the error page as the tarball.
+ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
+    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
+RUN curl -fL ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
+    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
+    rm -rf ${CMAKE_HOME}.tar.gz
+ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
+
+# Install pytest & dependencies
+RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt
+
+# Publish the updated PATH in /etc/environment. `>` truncates the file, so the LD_LIBRARY_PATH entry
+# that the base image stored there is written again as well.
+RUN echo PATH="${PATH}" > /etc/environment && \
+    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
+
+# Cleanup: removes the sources from the final filesystem (the COPY layer above still contains them).
+WORKDIR /
+RUN rm -rf ${MSCCLPP_SRC_DIR}
diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index ea7b14602..306398fb0 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -5,7 +5,7 @@
 #define MSCCLPP_CORE_HPP_
 
 #define MSCCLPP_MAJOR 0
-#define MSCCLPP_MINOR 2
+#define MSCCLPP_MINOR 3
 #define MSCCLPP_PATCH 0
 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
 
@@ -24,6 +24,9 @@ namespace mscclpp {
 /// Unique ID for a process. This is a MSCCLPP_UNIQUE_ID_BYTES byte array that uniquely identifies a process.
 using UniqueId = std::array<uint8_t, MSCCLPP_UNIQUE_ID_BYTES>;
 
+/// Return the library version as a "<major>.<minor>.<patch>" string built from the MSCCLPP_* version macros.
+std::string version();
+
 /// Base class for bootstraps.
 class Bootstrap {
  public:
diff --git a/pyproject.toml b/pyproject.toml
index 698754bf6..5902c9464 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "mscclpp"
-version = "0.2.0"
+version = "0.3.0"
 
 [tool.scikit-build]
 cmake.minimum-version = "3.25.0"
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7776be62c..6bb8e2700 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -7,11 +7,10 @@ add_subdirectory(test)
 add_custom_target(pylib-copy)
 add_custom_command(TARGET pylib-copy POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy_if_different
-        ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.cpython-38-x86_64-linux-gnu.so
+        ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so
         ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
     COMMAND ${CMAKE_COMMAND} -E copy_if_different
-        ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.cpython-38-x86_64-linux-gnu.so
+        ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
         ${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
     COMMAND ${CMAKE_COMMAND} -E echo "Copy python libraries"
 )
-
diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py
index 89e889a22..5165e95cb 100644
--- a/python/mscclpp/__init__.py
+++ b/python/mscclpp/__init__.py
@@ -18,8 +18,11 @@
     TcpBootstrap,
     Transport,
     TransportFlags,
+    version,
 )
 
+__version__ = version()
+
 
 def get_include():
     """Return the directory that contains the MSCCL++ headers."""
diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp
index 1d1a064ce..60ceb96cc 100644
--- a/python/mscclpp/core_py.cpp
+++ b/python/mscclpp/core_py.cpp
@@ -29,6 +29,8 @@ void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
 }
 
 void register_core(nb::module_& m) {
+  m.def("version", &version);
+
   nb::class_<Bootstrap>(m, "Bootstrap")
       .def("get_rank", &Bootstrap::getRank)
       .def("get_n_ranks", &Bootstrap::getNranks)
diff --git a/python/test/mscclpp_group.py b/python/test/mscclpp_group.py
index 1b6138467..7a7c7b017 100644
--- a/python/test/mscclpp_group.py
+++ b/python/test/mscclpp_group.py
@@ -127,10 +127,7 @@ def make_sm_channels_with_packet(
         channels = {}
         for rank in connections:
             channels[rank] = SmChannel(
-                semaphores[rank],
-                registered_memories[rank],
-                tensor.data.ptr,
-                packetTensor.data.ptr,
+                semaphores[rank], registered_memories[rank], tensor.data.ptr, packetTensor.data.ptr
             )
         return channels
 
@@ -148,8 +145,6 @@ def make_proxy_channels_with_packet(
         channels = {}
         for rank in semaphores:
             channels[rank] = SimpleProxyChannel(
-                proxy_service.proxy_channel(semaphore_ids[rank]),
-                memory_ids[rank],
-                memory_ids[self.my_rank],
+                proxy_service.proxy_channel(semaphore_ids[rank]), memory_ids[rank], memory_ids[self.my_rank]
             )
         return channels
diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py
index 0be3b2126..6674f4ea0 100644
--- a/python/test/test_mscclpp.py
+++ b/python/test/test_mscclpp.py
@@ -9,14 +9,7 @@
 import netifaces as ni
 import pytest
 
-from mscclpp import (
-    Fifo,
-    Host2DeviceSemaphore,
-    Host2HostSemaphore,
-    ProxyService,
-    SmDevice2DeviceSemaphore,
-    Transport,
-)
+from mscclpp import Fifo, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, SmDevice2DeviceSemaphore, Transport
 from ._cpp import _ext
 from .mscclpp_group import MscclppGroup
 from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
@@ -61,11 +54,7 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str):
     for rank in range(group.nranks):
         if rank == group.my_rank:
             continue
-        group.send(
-            memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))],
-            rank,
-            0,
-        )
+        group.send(memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))], rank, 0)
     for rank in range(group.nranks):
         if rank == group.my_rank:
             continue
@@ -207,43 +196,31 @@ def __init__(
     ):
         if test_name == "h2d_semaphore":
             self._kernel = KernelBuilder(
-                file="h2d_semaphore_test.cu",
-                kernel_name="h2d_semaphore",
+                file="h2d_semaphore_test.cu", kernel_name="h2d_semaphore"
             ).get_compiled_kernel()
             self.nblocks = 1
             self.nthreads = nranks
         elif test_name == "d2d_semaphore":
             self._kernel = KernelBuilder(
-                file="d2d_semaphore_test.cu",
-                kernel_name="d2d_semaphore",
+                file="d2d_semaphore_test.cu", kernel_name="d2d_semaphore"
             ).get_compiled_kernel()
             self.nblocks = 1
             self.nthreads = nranks
         elif test_name == "sm_channel":
-            self._kernel = KernelBuilder(
-                file="sm_channel_test.cu",
-                kernel_name="sm_channel",
-            ).get_compiled_kernel()
+            self._kernel = KernelBuilder(file="sm_channel_test.cu", kernel_name="sm_channel").get_compiled_kernel()
             self.nblocks = nranks
             self.nthreads = 1024
         elif test_name == "fifo":
-            self._kernel = KernelBuilder(
-                file="fifo_test.cu",
-                kernel_name="fifo",
-            ).get_compiled_kernel()
+            self._kernel = KernelBuilder(file="fifo_test.cu", kernel_name="fifo").get_compiled_kernel()
             self.nblocks = 1
             self.nthreads = 1
         elif test_name == "proxy":
-            self._kernel = KernelBuilder(
-                file="proxy_test.cu",
-                kernel_name="proxy",
-            ).get_compiled_kernel()
+            self._kernel = KernelBuilder(file="proxy_test.cu", kernel_name="proxy").get_compiled_kernel()
             self.nblocks = 1
             self.nthreads = nranks
         elif test_name == "simple_proxy_channel":
             self._kernel = KernelBuilder(
-                file="simple_proxy_channel_test.cu",
-                kernel_name="simple_proxy_channel",
+                file="simple_proxy_channel_test.cu", kernel_name="simple_proxy_channel"
             ).get_compiled_kernel()
             self.nblocks = 1
             self.nthreads = 1024
@@ -364,17 +341,10 @@ def test_fifo(
 @parametrize_mpi_groups(2, 4, 8, 16)
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
 @pytest.mark.parametrize("transport", ["IB", "NVLink"])
-def test_proxy(
-    mpi_group: MpiGroup,
-    nelem: int,
-    transport: str,
-):
+def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
     group, connections = create_and_connect(mpi_group, transport)
 
-    memory = cp.zeros(
-        nelem,
-        dtype=cp.int32,
-    )
+    memory = cp.zeros(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
     nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -401,23 +371,12 @@ def test_proxy(
 
         list_reg_mem.append(all_reg_memories[rank])
 
-    proxy = _ext.MyProxyService(
-        group.my_rank,
-        group.nranks,
-        nelem * memory.itemsize,
-        list_conn,
-        list_reg_mem,
-        list_sem,
-    )
+    proxy = _ext.MyProxyService(group.my_rank, group.nranks, nelem * memory.itemsize, list_conn, list_reg_mem, list_sem)
 
     fifo_device_handle = proxy.fifo_device_handle()
 
     kernel = MscclppKernel(
-        "proxy",
-        my_rank=group.my_rank,
-        nranks=group.nranks,
-        semaphore_or_channels=list_sem,
-        fifo=fifo_device_handle,
+        "proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=list_sem, fifo=fifo_device_handle
     )
     proxy.start()
     group.barrier()
@@ -432,12 +391,7 @@ def test_proxy(
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
 @pytest.mark.parametrize("transport", ["NVLink", "IB"])
 @pytest.mark.parametrize("use_packet", [False, True])
-def test_simple_proxy_channel(
-    mpi_group: MpiGroup,
-    nelem: int,
-    transport: str,
-    use_packet: bool,
-):
+def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
     group, connections = create_and_connect(mpi_group, transport)
 
     memory = cp.zeros(nelem, dtype=cp.int32)
diff --git a/src/core.cc b/src/core.cc
index 0282b2e9a..4d89250d0 100644
--- a/src/core.cc
+++ b/src/core.cc
@@ -2,11 +2,18 @@
 // Licensed under the MIT license.
 
 #include <mscclpp/core.hpp>
+#include <sstream>
 
 #include "api.h"
 
 namespace mscclpp {
 
+MSCCLPP_API_CPP std::string version() {
+  std::ostringstream formatted;  // filled with "<major>.<minor>.<patch>" taken from the MSCCLPP_* macros
+  formatted << MSCCLPP_MAJOR << '.' << MSCCLPP_MINOR << '.' << MSCCLPP_PATCH;
+  return formatted.str();
+}
+
 MSCCLPP_API_CPP TransportFlags::TransportFlags(Transport transport)
     : detail::TransportFlagsBase(1 << static_cast<size_t>(transport)) {}
 
diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp
index 0f868b140..55b5d5724 100644
--- a/src/include/communicator.hpp
+++ b/src/include/communicator.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef MSCCL_COMMUNICATOR_HPP_
-#define MSCCL_COMMUNICATOR_HPP_
+#ifndef MSCCLPP_COMMUNICATOR_HPP_
+#define MSCCLPP_COMMUNICATOR_HPP_
 
 #include <memory>
 #include <mscclpp/core.hpp>
@@ -31,4 +31,4 @@ struct Communicator::Impl {
 
 }  // namespace mscclpp
 
-#endif  // MSCCL_COMMUNICATOR_HPP_
+#endif  // MSCCLPP_COMMUNICATOR_HPP_
diff --git a/src/include/context.hpp b/src/include/context.hpp
index 11cc98d7d..6468b1d33 100644
--- a/src/include/context.hpp
+++ b/src/include/context.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef MSCCL_CONTEXT_HPP_
-#define MSCCL_CONTEXT_HPP_
+#ifndef MSCCLPP_CONTEXT_HPP_
+#define MSCCLPP_CONTEXT_HPP_
 
 #include <mscclpp/core.hpp>
 #include <mscclpp/cuda_utils.hpp>
@@ -25,4 +25,4 @@ struct Context::Impl {
 
 }  // namespace mscclpp
 
-#endif  // MSCCL_CONTEXT_HPP_
+#endif  // MSCCLPP_CONTEXT_HPP_
diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp
index f246012c5..311fa9982 100644
--- a/src/include/endpoint.hpp
+++ b/src/include/endpoint.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef MSCCL_ENDPOINT_HPP_
-#define MSCCL_ENDPOINT_HPP_
+#ifndef MSCCLPP_ENDPOINT_HPP_
+#define MSCCLPP_ENDPOINT_HPP_
 
 #include <mscclpp/core.hpp>
 #include <vector>
@@ -26,4 +26,4 @@ struct Endpoint::Impl {
 
 }  // namespace mscclpp
 
-#endif  // MSCCL_ENDPOINT_HPP_
+#endif  // MSCCLPP_ENDPOINT_HPP_
diff --git a/test/mscclpp-test/check_perf_result.py b/test/mscclpp-test/check_perf_result.py
index d5c5469a4..1430526ec 100644
--- a/test/mscclpp-test/check_perf_result.py
+++ b/test/mscclpp-test/check_perf_result.py
@@ -16,17 +16,9 @@ def load_perf_file(perf_fine: str) -> dict:
                 "time": data["time"],
             }
             if "target" in data:
-                res[
-                    (
-                        data["name"],
-                        data["kernel"],
-                        data["ranks"],
-                        data["ranksPerNode"],
-                        data["size"],
-                    )
-                ][
+                res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])]["target"] = data[
                     "target"
-                ] = data["target"]
+                ]
     return res