diff --git a/.circleci/config.yml b/.circleci/config.yml
index a1dbbba7..5141b056 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -37,7 +37,8 @@ jobs:
           command: |
             shopt -s globstar # to activate the ** globbing
             clang-format-11 --version
-            clang-format-11 -i $(find src -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
+            clang-format-11 -i $(find lct -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
+            clang-format-11 -i $(find lci -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
             clang-format-11 -i $(find examples -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
             clang-format-11 -i $(find tests -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
             git diff --exit-code > /tmp/clang_format_results.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e45dd4b1..42493568 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,9 +99,10 @@ endif()
 # LCI Optimization Options
 # ##############################################################################
 option(LCI_DEBUG "LCI Debug Mode" OFF)
-option(LCI_USE_MUTEX_CQ
-       "Use mutex lock to ensure the thread safety of the completion queue."
-       OFF)
+option(
+  LCI_USE_INLINE_CQ
+  "Use the C version of the completion queue so that it could be compiled inline."
+  OFF)
 option(LCI_ENABLE_MULTITHREAD_PROGRESS
        "LCI_progress can be called by multiple threads simultaneously" OFF)
 option(LCI_IBV_ENABLE_TRY_LOCK_QP
@@ -221,6 +222,11 @@ endif()
 # ##############################################################################
 # Add the actual LCI library
 # ##############################################################################
+add_library(LCT)
+set_target_properties(LCT PROPERTIES CXX_VISIBILITY_PRESET hidden CXX_STANDARD
+                                                                  17)
+target_link_libraries(LCT PUBLIC Threads::Threads)
+
 add_library(LCI)
 set_target_properties(
   LCI
@@ -228,7 +234,7 @@ set_target_properties(
              C_STANDARD 11
              C_EXTENSIONS ON)
 target_compile_definitions(LCI PRIVATE _GNU_SOURCE)
-target_link_libraries(LCI PUBLIC Threads::Threads ${FABRIC}::${FABRIC})
+target_link_libraries(LCI PUBLIC Threads::Threads ${FABRIC}::${FABRIC} LCT)
 if(LCI_USE_AVX)
   target_compile_options(LCI PUBLIC -mavx)
 endif()
@@ -237,8 +243,10 @@ if(LCI_USE_PAPI)
 endif()
 
 # set_target_properties(LCI PROPERTIES OUTPUT_NAME lci)
-add_subdirectory(src)
+add_subdirectory(lct)
+add_subdirectory(lci)
 add_subdirectory(dependency)
+target_link_libraries(LCT PRIVATE lci-ucx)
 target_link_libraries(LCI PRIVATE lci-ucx)
 if(LCI_WITH_EXAMPLES)
   add_subdirectory(examples)
@@ -266,12 +274,7 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   configure_file(liblci.pc.in liblci.pc @ONLY)
 
   install(
-    TARGETS LCI
-    EXPORT LCITargets
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
-  install(
-    TARGETS lci-ucx
+    TARGETS LCI lci-ucx LCT
     EXPORT LCITargets
     ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
     LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
@@ -281,12 +284,8 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     NAMESPACE LCI::
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/LCI)
   install(
-    DIRECTORY src/api/
-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-    FILES_MATCHING
-    PATTERN "*.h")
-  install(
-    DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/src/api/
+    DIRECTORY lci/api/ lct/api/ ${CMAKE_CURRENT_BINARY_DIR}/lct/api/
+              ${CMAKE_CURRENT_BINARY_DIR}/lci/api/
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
     FILES_MATCHING
     PATTERN "*.h")
diff --git a/LCIConfig.cmake.in b/LCIConfig.cmake.in
index e1313714..38221607 100644
--- a/LCIConfig.cmake.in
+++ b/LCIConfig.cmake.in
@@ -8,6 +8,16 @@ set(THREADS_PREFER_PTHREAD_FLAG TRUE)
 find_dependency(Threads)
 find_dependency(@FABRIC@)
 
+set_and_check(LCT_INCLUDE_DIRS "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@")
+set_and_check(LCT_SHARED_LIBRARY "@PACKAGE_CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@LCT@CMAKE_SHARED_LIBRARY_SUFFIX@")
+
+add_library(LCI::LCT SHARED IMPORTED)
+set_target_properties(LCI::LCT PROPERTIES
+  IMPORTED_LOCATION ${LCT_SHARED_LIBRARY}
+)
+target_include_directories(LCI::LCT INTERFACE ${LCT_INCLUDE_DIRS})
+target_link_libraries(LCI::LCT INTERFACE Threads::Threads)
+
 set_and_check(LCI_INCLUDE_DIRS "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@")
 set_and_check(LCI_SHARED_LIBRARY "@PACKAGE_CMAKE_INSTALL_LIBDIR@/@CMAKE_SHARED_LIBRARY_PREFIX@LCI@CMAKE_SHARED_LIBRARY_SUFFIX@")
 # set_and_check(LCI_STATIC_LIBRARY "@PACKAGE_CMAKE_INSTALL_LIBDIR@/@CMAKE_STATIC_LIBRARY_PREFIX@LCI@CMAKE_STATIC_LIBRARY_SUFFIX@")
@@ -17,7 +27,7 @@ set_target_properties(LCI::Shared PROPERTIES
   IMPORTED_LOCATION ${LCI_SHARED_LIBRARY}
 )
 target_include_directories(LCI::Shared INTERFACE ${LCI_INCLUDE_DIRS})
-target_link_libraries(LCI::Shared INTERFACE Threads::Threads @FABRIC@::@FABRIC@)
+target_link_libraries(LCI::Shared INTERFACE Threads::Threads @FABRIC@::@FABRIC@ LCI::LCT)
 
 add_library(LCI::LCI ALIAS LCI::Shared)
 
diff --git a/README.md b/README.md
index 88f815be..42100b3d 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ srun -n 2 ./hello_world
 
 See `examples` and `tests` for some example code.
 
-See `src/api/lci.h` for public APIs.
+See `lci/api/lci.h` for public APIs.
 
 `doxygen` for a full [documentation](https://uiuc-hpc.github.io/LC/).
 
diff --git a/cmake_modules/FindPAPI.cmake b/cmake_modules/FindPAPI.cmake
index f3a2c489..d29644c0 100644
--- a/cmake_modules/FindPAPI.cmake
+++ b/cmake_modules/FindPAPI.cmake
@@ -26,7 +26,7 @@ if(NOT TARGET Papi::papi)
   if(NOT PAPI_ROOT AND "$ENV{PAPI_ROOT}")
     set(PAPI_ROOT $ENV{PAPI_ROOT})
   elseif(NOT PAPI_ROOT)
-    string(REPLACE "/src/api" "" PAPI_ROOT "${PAPI_INCLUDE_DIR}")
+    string(REPLACE "/include" "" PAPI_ROOT "${PAPI_INCLUDE_DIR}")
   endif()
 
   # Set PAPI_ROOT in case the other hints are used
@@ -37,7 +37,7 @@ if(NOT TARGET Papi::papi)
     file(TO_CMAKE_PATH $ENV{PAPI_ROOT} PAPI_ROOT)
   else()
     file(TO_CMAKE_PATH "${PAPI_INCLUDE_DIR}" PAPI_INCLUDE_DIR)
-    string(REPLACE "/src/api" "" PAPI_ROOT "${PAPI_INCLUDE_DIR}")
+    string(REPLACE "/include" "" PAPI_ROOT "${PAPI_INCLUDE_DIR}")
   endif()
 
   set(PAPI_LIBRARIES ${PAPI_LIBRARY})
diff --git a/contrib/spack/packages/lci/package.py b/contrib/spack/packages/lci/package.py
index ee76334a..01eabe1a 100644
--- a/contrib/spack/packages/lci/package.py
+++ b/contrib/spack/packages/lci/package.py
@@ -36,8 +36,8 @@ def is_positive_int(val):
             description='Cache line size, in bytes')
     variant('native', default=True, description='Build with -march=native')
 
-    variant('cq', default='aqueue', values=('aqueue', 'mutex-dq'), multi=False,
-            description='Completion queue implementation')
+    variant('inline-cq', default=False,
+            description='Use the inline C completion queue implementation')
     variant('qp-lock', default=True,
             description='Lock queue pairs before access')
 
@@ -96,8 +96,7 @@ def cmake_args(self):
             self.define_from_variant('LCI_USE_AVX', 'vector'),
             self.define_from_variant('LCI_CONFIG_USE_ALIGNED_ALLOC', 'aligned'),
             self.define_from_variant('LCI_OPTIMIZE_FOR_NATIVE', 'native'),
-            self.define('LCI_USE_MUTEX_CQ',
-                        self.spec.variants['cq'].value == 'mutex-dq'),
+            self.define_from_variant('LCI_USE_INLINE_CQ', 'inline-cq'),
             self.define_from_variant('LCI_IBV_ENABLE_TRY_LOCK_QP', 'qp-lock'),
             self.define('LCI_ENABLE_MULTITHREAD_PROGRESS', 'multithread-progress'),
             self.define('LCI_USE_DREG_DEFAULT',
diff --git a/dependency/ucx/CMakeLists.txt b/dependency/ucx/CMakeLists.txt
index 3fb7cb27..501c9c5f 100644
--- a/dependency/ucx/CMakeLists.txt
+++ b/dependency/ucx/CMakeLists.txt
@@ -120,6 +120,13 @@ if(LCI_UCX_ENABLE_TUNING)
   set(ENABLE_TUNING ON)
 endif()
 
+set(LCI_UCX_ENABLE_PROFILING
+    OFF
+    CACHE BOOL "Enable profiling")
+if(LCI_UCX_ENABLE_PROFILING)
+  set(HAVE_PROFILING ON)
+endif()
+
 set(LCI_UCX_MAX_LOG_LEVEL
     debug
     CACHE STRING "Highest log level")
diff --git a/dependency/ucx/lci_ucx_api.h b/dependency/ucx/lci_ucx_api.h
index 3d95b586..31607f51 100644
--- a/dependency/ucx/lci_ucx_api.h
+++ b/dependency/ucx/lci_ucx_api.h
@@ -5,6 +5,10 @@
 
 #define LCII_API __attribute__((visibility("default")))
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 LCII_API void LCII_ucs_init(void);
 LCII_API void LCII_ucs_cleanup(void);
 
@@ -34,4 +38,8 @@ LCII_API double LCII_ucs_time_to_usec(LCII_ucs_time_t t);
 LCII_API double LCII_ucs_time_to_msec(LCII_ucs_time_t t);
 LCII_API double LCII_ucs_time_to_sec(LCII_ucs_time_t t);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  // LCI_LCI_UCX_API_H
diff --git a/format.sh b/format.sh
index 3af6502c..074af077 100755
--- a/format.sh
+++ b/format.sh
@@ -3,7 +3,8 @@
 echo "Formating c/c++ files..."
 shopt -s globstar # to activate the ** globbing
 clang-format-11 --version
-clang-format-11 -i $(find src -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
+clang-format-11 -i $(find lct -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
+clang-format-11 -i $(find lci -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
 clang-format-11 -i $(find examples -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
 clang-format-11 -i $(find tests -type f -name "*.h" -o -name "*.c" -o -name "*.hpp" -o -name "*.cpp")
 
diff --git a/src/CMakeLists.txt b/lci/CMakeLists.txt
similarity index 96%
rename from src/CMakeLists.txt
rename to lci/CMakeLists.txt
index 0f1ed895..ca15feac 100644
--- a/src/CMakeLists.txt
+++ b/lci/CMakeLists.txt
@@ -6,7 +6,7 @@ target_include_directories(LCI PRIVATE .)
 target_sources_relative(
   LCI
   PRIVATE
-  log/lcm_log.c
+  log/logger.c
   profile/papi_wrapper.c
   profile/performance_counter.c
   runtime/1sided_primitive.c
@@ -25,7 +25,6 @@ target_sources_relative(
   runtime/device.c
   runtime/endpoint.c
   runtime/completion/amhandler.c
-  runtime/monitor_thread.c
   runtime/rcache/lcii_rcache.c
   runtime/completion/sync_flag.c)
 
diff --git a/src/api/CMakeLists.txt b/lci/api/CMakeLists.txt
similarity index 100%
rename from src/api/CMakeLists.txt
rename to lci/api/CMakeLists.txt
diff --git a/src/api/lci.h b/lci/api/lci.h
similarity index 99%
rename from src/api/lci.h
rename to lci/api/lci.h
index c7f5b7aa..07dc3017 100644
--- a/src/api/lci.h
+++ b/lci/api/lci.h
@@ -14,6 +14,7 @@
 #include <stdbool.h>
 #include <stdio.h>
 #include "lci_config.h"
+#include "lct.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/api/lci_config.h.in b/lci/api/lci_config.h.in
similarity index 70%
rename from src/api/lci_config.h.in
rename to lci/api/lci_config.h.in
index 6d88ba4e..838ec7a7 100644
--- a/src/api/lci_config.h.in
+++ b/lci/api/lci_config.h.in
@@ -1,7 +1,3 @@
-//
-// Created by jiakunyan on 4/13/22.
-//
-
 #ifndef LCI_LCI_CONFIG_H
 #define LCI_LCI_CONFIG_H
 
diff --git a/src/backend/ibv/lcisi_ibv_detail.c b/lci/backend/ibv/lcisi_ibv_detail.c
similarity index 85%
rename from src/backend/ibv/lcisi_ibv_detail.c
rename to lci/backend/ibv/lcisi_ibv_detail.c
index ecb87fb2..f31d501c 100644
--- a/src/backend/ibv/lcisi_ibv_detail.c
+++ b/lci/backend/ibv/lcisi_ibv_detail.c
@@ -61,7 +61,7 @@ bool LCISI_ibv_select_best_device_port(struct ibv_device** dev_list,
     struct ibv_context* dev_ctx;
     dev_ctx = ibv_open_device(device);
     if (!dev_ctx) {
-      LCM_Log(LCM_LOG_INFO, "ibv", "Couldn't get context for %s.\n",
+      LCI_Log(LCI_LOG_INFO, "ibv", "Couldn't get context for %s.\n",
               ibv_get_device_name(device));
       continue;
     }
@@ -70,7 +70,7 @@ bool LCISI_ibv_select_best_device_port(struct ibv_device** dev_list,
     struct ibv_device_attr dev_attr;
     int ret = ibv_query_device(dev_ctx, &dev_attr);
     if (ret != 0) {
-      LCM_Log(LCM_LOG_INFO, "ibv", "Unable to query device %s.\n",
+      LCI_Log(LCI_LOG_INFO, "ibv", "Unable to query device %s.\n",
               ibv_get_device_name(device));
       goto close_device;
     }
@@ -81,13 +81,13 @@ bool LCISI_ibv_select_best_device_port(struct ibv_device** dev_list,
     for (uint8_t port_num = 1; port_num <= dev_attr.phys_port_cnt; port_num++) {
       ret = ibv_query_port(dev_ctx, port_num, &port_attr);
       if (ret != 0) {
-        LCM_Log(LCM_LOG_INFO, "ibv", "Unable to query port (%s:%d).\n",
+        LCI_Log(LCI_LOG_INFO, "ibv", "Unable to query port (%s:%d).\n",
                 ibv_get_device_name(device), port_num);
         continue;
       }
       // Check whether the port is active
       if (port_attr.state != IBV_PORT_ACTIVE) {
-        LCM_Log(LCM_LOG_INFO, "ibv", "%s:%d is not active (state: %d).\n",
+        LCI_Log(LCI_LOG_INFO, "ibv", "%s:%d is not active (state: %d).\n",
                 ibv_get_device_name(device), port_num, port_attr.state);
         continue;
       }
@@ -99,20 +99,20 @@ bool LCISI_ibv_select_best_device_port(struct ibv_device** dev_list,
       // Calculate its speed
       int width = translate_width(port_attr.active_width);
       if (width <= 0) {
-        LCM_Log(LCM_LOG_INFO, "ibv", "%s:%d invalid width %d (%d).\n",
+        LCI_Log(LCI_LOG_INFO, "ibv", "%s:%d invalid width %d (%d).\n",
                 ibv_get_device_name(device), port_num, width,
                 port_attr.active_width);
         continue;
       }
       double speed = translate_speed(port_attr.active_speed);
       if (speed <= 0) {
-        LCM_Log(LCM_LOG_INFO, "ibv", "%s:%d invalid speed %f (%d).\n",
+        LCI_Log(LCI_LOG_INFO, "ibv", "%s:%d invalid speed %f (%d).\n",
                 ibv_get_device_name(device), port_num, speed,
                 port_attr.active_width);
         continue;
       }
       double total_speed = speed * width;
-      LCM_Log(LCM_LOG_INFO, "ibv", "%s:%d speed is %.f (%d x %f).\n",
+      LCI_Log(LCI_LOG_INFO, "ibv", "%s:%d speed is %.f (%d x %f).\n",
               ibv_get_device_name(device), port_num, total_speed, width, speed);
       // Update the record if it is better.
       if (total_speed > best_speed) {
@@ -129,11 +129,11 @@ bool LCISI_ibv_select_best_device_port(struct ibv_device** dev_list,
   if (best_speed > 0) {
     *device_o = best_device;
     *port_o = best_port;
-    LCM_Log(LCM_LOG_INFO, "ibv", "Select the best device %s:%d.\n",
+    LCI_Log(LCI_LOG_INFO, "ibv", "Select the best device %s:%d.\n",
             ibv_get_device_name(best_device), best_port);
     return true;
   } else {
-    LCM_Log(LCM_LOG_INFO, "ibv", "No device is available!\n");
+    LCI_Log(LCI_LOG_INFO, "ibv", "No device is available!\n");
     return false;
   }
 }
\ No newline at end of file
diff --git a/src/backend/ibv/lcisi_ibv_detail.h b/lci/backend/ibv/lcisi_ibv_detail.h
similarity index 100%
rename from src/backend/ibv/lcisi_ibv_detail.h
rename to lci/backend/ibv/lcisi_ibv_detail.h
diff --git a/src/backend/ibv/server_ibv.c b/lci/backend/ibv/server_ibv.c
similarity index 94%
rename from src/backend/ibv/server_ibv.c
rename to lci/backend/ibv/server_ibv.c
index 67bacbaf..dda9abf5 100644
--- a/src/backend/ibv/server_ibv.c
+++ b/lci/backend/ibv/server_ibv.c
@@ -26,7 +26,7 @@ const char* mtu_str(enum ibv_mtu mtu)
 static inline void* LCISI_event_polling_thread_fn(void* argp)
 {
   LCISI_server_t* server = (LCISI_server_t*)argp;
-  LCM_Log(LCM_LOG_INFO, "event", "Start ibv event polling thread!\n");
+  LCI_Log(LCI_LOG_INFO, "event", "Start ibv event polling thread!\n");
   struct ibv_async_event event;
   while (atomic_load_explicit(&server->event_polling_thread_run,
                               LCIU_memory_order_acquire)) {
@@ -39,16 +39,16 @@ static inline void* LCISI_event_polling_thread_fn(void* argp)
       case IBV_EVENT_PATH_MIG_ERR:
       case IBV_EVENT_DEVICE_FATAL:
       case IBV_EVENT_SRQ_ERR:
-        LCM_Assert(false, "Got ibv async event error %d: %s\n",
+        LCI_Assert(false, "Got ibv async event error %d: %s\n",
                    event.event_type, ibv_event_type_str(event.event_type));
         break;
       default:
-        LCM_Log(LCM_LOG_INFO, "event", "Got ibv async event %d: %s\n",
+        LCI_Log(LCI_LOG_INFO, "event", "Got ibv async event %d: %s\n",
                 event.event_type, ibv_event_type_str(event.event_type));
     }
     ibv_ack_async_event(&event);
   }
-  LCM_Log(LCM_LOG_INFO, "event", "End ibv event polling thread!\n");
+  LCI_Log(LCI_LOG_INFO, "event", "End ibv event polling thread!\n");
   return NULL;
 }
 
@@ -64,7 +64,7 @@ void LCISI_event_polling_thread_init(LCISI_server_t* server)
 void LCISI_event_polling_thread_fina(LCISI_server_t* server)
 {
   if (LCI_IBV_ENABLE_EVENT_POLLING_THREAD) {
-    LCM_Warn(
+    LCI_Warn(
         "IBV event polling thread is enabled! The application may never "
         "finish!\n");
     atomic_store_explicit(&server->event_polling_thread_run, false,
@@ -88,7 +88,7 @@ void LCISD_server_init(LCI_device_t device, LCIS_server_t* s)
 
   bool ret = LCISI_ibv_select_best_device_port(
       server->dev_list, num_devices, &server->ib_dev, &server->dev_port);
-  LCM_Assert(ret, "Cannot find available ibv device/port!\n");
+  LCI_Assert(ret, "Cannot find available ibv device/port!\n");
 
   // ibv_open_device provides the user with a verbs context which is the object
   // that will be used for all other verb operations.
@@ -160,7 +160,7 @@ void LCISD_server_init(LCI_device_t device, LCIS_server_t* s)
     exit(EXIT_FAILURE);
   }
   server->dev_port = dev_port;
-  LCM_Log(LCM_LOG_INFO, "ibv", "Maximum MTU: %s; Active MTU: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ibv", "Maximum MTU: %s; Active MTU: %s\n",
           mtu_str(server->port_attr.max_mtu),
           mtu_str(server->port_attr.active_mtu));
   LCISI_event_polling_thread_init(server);
@@ -223,12 +223,12 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
       endpoint_p->pd =
           ibv_alloc_parent_domain(endpoint_p->server->dev_ctx, &attr);
       if (endpoint_p->pd == NULL) {
-        LCM_Log(LCM_LOG_INFO, "ibv", "ibv_alloc_parent_domain() failed (%s)\n",
+        LCI_Log(LCI_LOG_INFO, "ibv", "ibv_alloc_parent_domain() failed (%s)\n",
                 strerror(errno));
         IBV_SAFECALL(ibv_dealloc_td(endpoint_p->td));
       }
     } else {
-      LCM_Log(LCM_LOG_INFO, "ibv", "ibv_alloc_td() failed (%s)\n",
+      LCI_Log(LCI_LOG_INFO, "ibv", "ibv_alloc_td() failed (%s)\n",
               strerror(errno));
     }
   }
@@ -259,12 +259,12 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
         endpoint_p->qp_extras[i].pd =
             ibv_alloc_parent_domain(endpoint_p->server->dev_ctx, &attr);
         if (endpoint_p->qp_extras[i].pd == NULL) {
-          LCM_Log(LCM_LOG_INFO, "ibv",
+          LCI_Log(LCI_LOG_INFO, "ibv",
                   "ibv_alloc_parent_domain() failed (%s)\n", strerror(errno));
           IBV_SAFECALL(ibv_dealloc_td(endpoint_p->qp_extras[i].td));
         }
       } else {
-        LCM_Log(LCM_LOG_INFO, "ibv", "ibv_alloc_td() failed (%s)\n",
+        LCI_Log(LCI_LOG_INFO, "ibv", "ibv_alloc_td() failed (%s)\n",
                 strerror(errno));
       }
       if (endpoint_p->qp_extras[i].pd == NULL) {
@@ -308,11 +308,11 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
       struct ibv_qp_attr attr;
       memset(&attr, 0, sizeof(attr));
       ibv_query_qp(endpoint_p->qps[i], &attr, IBV_QP_CAP, &init_attr);
-      LCM_Assert(init_attr.cap.max_inline_data >= inline_size,
+      LCI_Assert(init_attr.cap.max_inline_data >= inline_size,
                  "Specified inline size %d is too large (maximum %d)",
                  inline_size, init_attr.cap.max_inline_data);
       if (inline_size < attr.cap.max_inline_data) {
-        LCM_Log(LCM_LOG_INFO, "ibv",
+        LCI_Log(LCI_LOG_INFO, "ibv",
                 "Maximum inline-size(%d) > requested inline-size(%d)\n",
                 attr.cap.max_inline_data, inline_size);
       }
@@ -345,7 +345,7 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
             endpoint_p->server->port_attr.lid);
     lcm_pm_publish(key, value);
   }
-  LCM_Log(LCM_LOG_INFO, "ibv", "Current inline data size is %d\n", inline_size);
+  LCI_Log(LCI_LOG_INFO, "ibv", "Current inline data size is %d\n", inline_size);
   endpoint_p->server->max_inline = inline_size;
   lcm_pm_barrier();
 
@@ -432,14 +432,14 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
     j++;
     free(b);
   }
-  LCM_Assert(j != INT32_MAX,
+  LCI_Assert(j != INT32_MAX,
              "Cannot find a suitable mod to hold qp2rank map\n");
   for (int i = 0; i < LCI_NUM_PROCESSES; i++) {
     b[endpoint_p->qps[i]->qp_num % j] = i;
   }
   endpoint_p->qp2rank_mod = j;
   endpoint_p->qp2rank = b;
-  LCM_Log(LCM_LOG_INFO, "ibv", "qp2rank_mod is %d\n", j);
+  LCI_Log(LCI_LOG_INFO, "ibv", "qp2rank_mod is %d\n", j);
   lcm_pm_barrier();
 }
 
diff --git a/src/backend/ibv/server_ibv.h b/lci/backend/ibv/server_ibv.h
similarity index 96%
rename from src/backend/ibv/server_ibv.h
rename to lci/backend/ibv/server_ibv.h
index bdcaf766..599d1933 100644
--- a/src/backend/ibv/server_ibv.h
+++ b/lci/backend/ibv/server_ibv.h
@@ -10,7 +10,7 @@
   {                                                                      \
     int err = (x);                                                       \
     if (err) {                                                           \
-      LCM_DBG_Assert(false, "err %d : %s (%s:%d)\n", err, strerror(err), \
+      LCI_DBG_Assert(false, "err %d : %s (%s:%d)\n", err, strerror(err), \
                      __FILE__, __LINE__);                                \
     }                                                                    \
   }                                                                      \
@@ -136,12 +136,12 @@ static inline int LCISD_poll_cq(LCIS_endpoint_t endpoint_pp,
   if (!LCIU_try_acquire_spinlock(&endpoint_p->cq_lock)) return 0;
 #endif
   int ne = ibv_poll_cq(endpoint_p->cq, LCI_CQ_MAX_POLL, wc);
-  LCM_DBG_Assert(ne >= 0, "ibv_poll_cq returns error %d\n", ne);
+  LCI_DBG_Assert(ne >= 0, "ibv_poll_cq returns error %d\n", ne);
 #ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
   LCIU_release_spinlock(&endpoint_p->cq_lock);
 #endif
   for (int i = 0; i < ne; i++) {
-    LCM_DBG_Assert(
+    LCI_DBG_Assert(
         wc[i].status == IBV_WC_SUCCESS, "Failed status %s (%d) for wr_id %d\n",
         ibv_wc_status_str(wc[i].status), wc[i].status, (int)wc[i].wr_id);
     if (wc[i].opcode == IBV_WC_RECV) {
@@ -156,7 +156,7 @@ static inline int LCISD_poll_cq(LCIS_endpoint_t endpoint_pp,
       entry[i].ctx = (void*)wc[i].wr_id;
       entry[i].imm_data = wc[i].imm_data;
     } else {
-      LCM_DBG_Assert(
+      LCI_DBG_Assert(
           wc[i].opcode == IBV_WC_SEND || wc[i].opcode == IBV_WC_RDMA_WRITE,
           "Unexpected IBV opcode!\n");
       entry[i].opcode = LCII_OP_SEND;
@@ -190,13 +190,13 @@ static inline LCI_error_t LCISD_post_sends(LCIS_endpoint_t endpoint_pp,
                                            LCIS_meta_t meta)
 {
   LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
-  LCM_DBG_Assert(size <= endpoint_p->server->max_inline,
+  LCI_DBG_Assert(size <= endpoint_p->server->max_inline,
                  "%lu exceed the inline message size\n"
                  "limit! %lu\n",
                  size, endpoint_p->server->max_inline);
   struct ibv_sge list;
   struct ibv_send_wr wr;
-  if (likely(size > 0)) {
+  if (LCT_likely(size > 0)) {
     list.addr = (uint64_t)buf;
     list.length = size;
     list.lkey = 0;
@@ -251,7 +251,7 @@ static inline LCI_error_t LCISD_post_send(LCIS_endpoint_t endpoint_pp, int rank,
 
   struct ibv_sge list;
   struct ibv_send_wr wr;
-  if (likely(size > 0)) {
+  if (LCT_likely(size > 0)) {
     list.addr = (uint64_t)buf;
     list.length = size;
     list.lkey = ibv_rma_lkey(mr);
@@ -300,14 +300,14 @@ static inline LCI_error_t LCISD_post_puts(LCIS_endpoint_t endpoint_pp, int rank,
                                           LCIS_rkey_t rkey)
 {
   LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
-  LCM_DBG_Assert(size <= endpoint_p->server->max_inline,
+  LCI_DBG_Assert(size <= endpoint_p->server->max_inline,
                  "%lu exceed the inline message size\n"
                  "limit! %lu\n",
                  size, endpoint_p->server->max_inline);
 
   struct ibv_sge list;
   struct ibv_send_wr wr;
-  if (likely(size > 0)) {
+  if (LCT_likely(size > 0)) {
     list.addr = (uint64_t)buf;
     list.length = size;
     list.lkey = 0;
@@ -357,7 +357,7 @@ static inline LCI_error_t LCISD_post_put(LCIS_endpoint_t endpoint_pp, int rank,
 
   struct ibv_sge list;
   struct ibv_send_wr wr;
-  if (likely(size > 0)) {
+  if (LCT_likely(size > 0)) {
     list.addr = (uint64_t)buf;
     list.length = size;
     list.lkey = ibv_rma_lkey(mr);
@@ -407,13 +407,13 @@ static inline LCI_error_t LCISD_post_putImms(LCIS_endpoint_t endpoint_pp,
                                              LCIS_rkey_t rkey, uint32_t meta)
 {
   LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
-  LCM_DBG_Assert(size <= endpoint_p->server->max_inline,
+  LCI_DBG_Assert(size <= endpoint_p->server->max_inline,
                  "%lu exceed the inline message size\n"
                  "limit! %lu\n",
                  size, endpoint_p->server->max_inline);
   struct ibv_sge list;
   struct ibv_send_wr wr;
-  if (likely(size > 0)) {
+  if (LCT_likely(size > 0)) {
     list.addr = (uint64_t)buf;
     list.length = size;
     list.lkey = 0;
@@ -466,7 +466,7 @@ static inline LCI_error_t LCISD_post_putImm(LCIS_endpoint_t endpoint_pp,
 
   struct ibv_sge list;
   struct ibv_send_wr wr;
-  if (likely(size > 0)) {
+  if (LCT_likely(size > 0)) {
     list.addr = (uint64_t)buf;
     list.length = size;
     list.lkey = ibv_rma_lkey(mr);
diff --git a/src/backend/ofi/server_ofi.c b/lci/backend/ofi/server_ofi.c
similarity index 85%
rename from src/backend/ofi/server_ofi.c
rename to lci/backend/ofi/server_ofi.c
index 988807dc..df9e6e8b 100644
--- a/src/backend/ofi/server_ofi.c
+++ b/lci/backend/ofi/server_ofi.c
@@ -64,39 +64,39 @@ void LCISD_server_init(LCI_device_t device, LCIS_server_t* s)
     server->info = fi_dupinfo(all_infos);
   }
   fi_freeinfo(all_infos);
-  LCM_Log(LCM_LOG_INFO, "ofi", "Provider name: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Provider name: %s\n",
           server->info->fabric_attr->prov_name);
-  LCM_Log(LCM_LOG_INFO, "ofi", "MR mode hints: [%s]\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "MR mode hints: [%s]\n",
           fi_tostr(&(hints->domain_attr->mr_mode), FI_TYPE_MR_MODE));
-  LCM_Log(LCM_LOG_INFO, "ofi", "MR mode provided: [%s]\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "MR mode provided: [%s]\n",
           fi_tostr(&(server->info->domain_attr->mr_mode), FI_TYPE_MR_MODE));
-  LCM_Log(LCM_LOG_INFO, "ofi", "Thread mode: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Thread mode: %s\n",
           fi_tostr(&(server->info->domain_attr->threading), FI_TYPE_THREADING));
-  LCM_Log(LCM_LOG_INFO, "ofi", "Control progress mode: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Control progress mode: %s\n",
           fi_tostr(&(server->info->domain_attr->control_progress),
                    FI_TYPE_PROGRESS));
-  LCM_Log(
-      LCM_LOG_INFO, "ofi", "Data progress mode: %s\n",
+  LCI_Log(
+      LCI_LOG_INFO, "ofi", "Data progress mode: %s\n",
       fi_tostr(&(server->info->domain_attr->data_progress), FI_TYPE_PROGRESS));
-  LCM_Log(LCM_LOG_INFO, "ofi", "Capacities: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Capacities: %s\n",
           fi_tostr(&(server->info->caps), FI_TYPE_CAPS));
-  LCM_Log(LCM_LOG_INFO, "ofi", "Mode: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Mode: %s\n",
           fi_tostr(&(server->info->mode), FI_TYPE_MODE));
-  LCM_Log(LCM_LOG_MAX, "ofi", "Fi_info provided: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Fi_info provided: %s\n",
           fi_tostr(server->info, FI_TYPE_INFO));
-  LCM_Log(LCM_LOG_MAX, "ofi", "Fabric attributes: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Fabric attributes: %s\n",
           fi_tostr(server->info->fabric_attr, FI_TYPE_FABRIC_ATTR));
-  LCM_Log(LCM_LOG_MAX, "ofi", "Domain attributes: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Domain attributes: %s\n",
           fi_tostr(server->info->domain_attr, FI_TYPE_DOMAIN_ATTR));
-  LCM_Log(LCM_LOG_MAX, "ofi", "Endpoint attributes: %s\n",
+  LCI_Log(LCI_LOG_INFO, "ofi", "Endpoint attributes: %s\n",
           fi_tostr(server->info->ep_attr, FI_TYPE_EP_ATTR));
-  LCM_Assert(server->info->domain_attr->cq_data_size >= 4,
+  LCI_Assert(server->info->domain_attr->cq_data_size >= 4,
              "cq_data_size (%lu) is too small!\n",
              server->info->domain_attr->cq_data_size);
-  LCM_Assert(server->info->domain_attr->mr_key_size <= 8,
+  LCI_Assert(server->info->domain_attr->mr_key_size <= 8,
              "mr_key_size (%lu) is too large!\n",
              server->info->domain_attr->mr_key_size);
-  LCM_Assert(server->info->tx_attr->inject_size >= sizeof(LCI_short_t),
+  LCI_Assert(server->info->tx_attr->inject_size >= sizeof(LCI_short_t),
              "inject_size (%lu) < sizeof(LCI_short_t) (%lu)!\n",
              server->info->tx_attr->inject_size, sizeof(LCI_short_t));
   fi_freeinfo(hints);
@@ -113,7 +113,7 @@ void LCISD_server_init(LCI_device_t device, LCIS_server_t* s)
 void LCISD_server_fina(LCIS_server_t s)
 {
   LCISI_server_t* server = (LCISI_server_t*)s;
-  LCM_Assert(server->endpoint_count == 0, "Endpoint count is not zero (%d)\n",
+  LCI_Assert(server->endpoint_count == 0, "Endpoint count is not zero (%d)\n",
              server->endpoint_count);
   FI_SAFECALL(fi_close((struct fid*)&server->domain->fid));
   FI_SAFECALL(fi_close((struct fid*)&server->fabric->fid));
@@ -161,8 +161,8 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
   const int EP_ADDR_LEN = 6;
   size_t addrlen = 0;
   fi_getname((fid_t)endpoint_p->ep, NULL, &addrlen);
-  LCM_Log(LCM_LOG_INFO, "ofi", "addrlen = %lu\n", addrlen);
-  LCM_Assert(addrlen <= 8 * EP_ADDR_LEN, "addrlen = %lu\n", addrlen);
+  LCI_Log(LCI_LOG_INFO, "ofi", "addrlen = %lu\n", addrlen);
+  LCI_Assert(addrlen <= 8 * EP_ADDR_LEN, "addrlen = %lu\n", addrlen);
   uint64_t my_addr[EP_ADDR_LEN];
   FI_SAFECALL(fi_getname((fid_t)endpoint_p->ep, my_addr, &addrlen));
 
@@ -186,11 +186,11 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
              &peer_addr[3], &peer_addr[4], &peer_addr[5]);
       int ret = fi_av_insert(endpoint_p->av, (void*)peer_addr, 1,
                              &endpoint_p->peer_addrs[i], 0, NULL);
-      LCM_Assert(ret == 1, "fi_av_insert failed! ret = %d\n", ret);
+      LCI_Assert(ret == 1, "fi_av_insert failed! ret = %d\n", ret);
     } else {
       int ret = fi_av_insert(endpoint_p->av, (void*)my_addr, 1,
                              &endpoint_p->peer_addrs[i], 0, NULL);
-      LCM_Assert(ret == 1, "fi_av_insert failed! ret = %d\n", ret);
+      LCI_Assert(ret == 1, "fi_av_insert failed! ret = %d\n", ret);
     }
   }
 
@@ -203,7 +203,7 @@ void LCISD_endpoint_fina(LCIS_endpoint_t endpoint_pp)
   LCISI_endpoint_t* endpoint_p = (LCISI_endpoint_t*)endpoint_pp;
   LCIU_free(endpoint_p->peer_addrs);
   int my_idx = --endpoint_p->server->endpoint_count;
-  LCM_Assert(endpoint_p->server->endpoints[my_idx] == endpoint_p,
+  LCI_Assert(endpoint_p->server->endpoints[my_idx] == endpoint_p,
              "This is not me!\n");
   endpoint_p->server->endpoints[my_idx] = NULL;
   FI_SAFECALL(fi_close((struct fid*)&endpoint_p->ep->fid));
diff --git a/src/backend/ofi/server_ofi.h b/lci/backend/ofi/server_ofi.h
similarity index 97%
rename from src/backend/ofi/server_ofi.h
rename to lci/backend/ofi/server_ofi.h
index a38235ff..a4c0b0e7 100644
--- a/src/backend/ofi/server_ofi.h
+++ b/lci/backend/ofi/server_ofi.h
@@ -17,7 +17,7 @@
     int err = (x);                                                            \
     if (err < 0) err = -err;                                                  \
     if (err) {                                                                \
-      LCM_DBG_Assert(false, "err : %s (%s:%d)\n", fi_strerror(err), __FILE__, \
+      LCI_DBG_Assert(false, "err : %s (%s:%d)\n", fi_strerror(err), __FILE__, \
                      __LINE__);                                               \
     }                                                                         \
   }                                                                           \
@@ -62,7 +62,7 @@ static inline void* LCISI_real_server_reg(LCIS_server_t s, void* buf,
                         FI_READ | FI_WRITE | FI_REMOTE_WRITE, 0, rdma_key, 0,
                         &mr, 0));
   if (server->info->domain_attr->mr_mode & FI_MR_ENDPOINT) {
-    LCM_DBG_Assert(server->endpoint_count >= 1, "No endpoints available!\n");
+    LCI_DBG_Assert(server->endpoint_count >= 1, "No endpoints available!\n");
     if (strcmp(server->info->fabric_attr->prov_name, "cxi") == 0) {
       // A temporary fix for the cxi provider. It appears cxi cannot bind a
       // memory region to more than one endpoint, but other endpoints can still
@@ -133,7 +133,7 @@ static inline int LCISD_poll_cq(LCIS_endpoint_t endpoint_pp,
         entry[i].ctx = NULL;
         entry[i].imm_data = fi_entry[i].data;
       } else {
-        LCM_DBG_Assert(
+        LCI_DBG_Assert(
             fi_entry[i].flags & FI_SEND || fi_entry[i].flags & FI_WRITE,
             "Unexpected OFI opcode!\n");
         entry[i].opcode = LCII_OP_SEND;
@@ -143,10 +143,10 @@ static inline int LCISD_poll_cq(LCIS_endpoint_t endpoint_pp,
   } else if (ne == -FI_EAGAIN) {
     ret = 0;
   } else {
-    LCM_DBG_Assert(ne == -FI_EAVAIL, "unexpected return error: %s\n",
+    LCI_DBG_Assert(ne == -FI_EAVAIL, "unexpected return error: %s\n",
                    fi_strerror(-ne));
     fi_cq_readerr(endpoint_p->cq, &error, 0);
-    LCM_Assert(false, "Err %d: %s\n", error.err, fi_strerror(error.err));
+    LCI_Assert(false, "Err %d: %s\n", error.err, fi_strerror(error.err));
   }
   return ret;
 }
diff --git a/src/backend/server.h b/lci/backend/server.h
similarity index 75%
rename from src/backend/server.h
rename to lci/backend/server.h
index e1e7bc71..daf84b43 100644
--- a/src/backend/server.h
+++ b/lci/backend/server.h
@@ -103,8 +103,10 @@ static inline LCIS_rkey_t LCIS_rma_rkey(LCIS_mr_t mr)
 
 static inline LCIS_mr_t LCIS_rma_reg(LCIS_server_t s, void* buf, size_t size)
 {
+  LCII_PCOUNTER_START(net_mem_reg_timer);
   LCIS_mr_t mr = LCISD_rma_reg(s, buf, size);
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server-reg",
+  LCII_PCOUNTER_END(net_mem_reg_timer);
+  LCI_DBG_Log(LCI_LOG_TRACE, "server-reg",
               "LCIS_rma_reg: mr %p buf %p size %lu rkey %lu\n", mr.mr_p, buf,
               size, LCISD_rma_rkey(mr));
   return mr;
@@ -112,10 +114,12 @@ static inline LCIS_mr_t LCIS_rma_reg(LCIS_server_t s, void* buf, size_t size)
 
 static inline void LCIS_rma_dereg(LCIS_mr_t mr)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server-reg",
+  LCI_DBG_Log(LCI_LOG_TRACE, "server-reg",
               "LCIS_rma_dereg: mr %p buf %p size %lu rkey %lu\n", mr.mr_p,
               mr.address, mr.length, LCISD_rma_rkey(mr));
+  LCII_PCOUNTER_START(net_mem_dereg_timer);
   LCISD_rma_dereg(mr);
+  LCII_PCOUNTER_END(net_mem_dereg_timer);
 }
 
 static inline void LCIS_endpoint_init(LCIS_server_t server_pp,
@@ -140,7 +144,7 @@ static inline LCI_error_t LCIS_post_sends(LCIS_endpoint_t endpoint_pp, int rank,
                                           void* buf, size_t size,
                                           LCIS_meta_t meta)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server",
+  LCI_DBG_Log(LCI_LOG_TRACE, "server",
               "LCIS_post_sends: rank %d buf %p size %lu meta %d\n", rank, buf,
               size, meta);
 #ifdef LCI_ENABLE_SLOWDOWN
@@ -148,18 +152,12 @@ static inline LCI_error_t LCIS_post_sends(LCIS_endpoint_t endpoint_pp, int rank,
 #endif
   LCI_error_t ret = LCISD_post_sends(endpoint_pp, rank, buf, size, meta);
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_tx++);
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_tx +=
-                           size);
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].msgs_2sided_tx++);
+    LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
   } else if (ret == LCI_ERR_RETRY_LOCK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_lock++);
+    LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
     ret = LCI_ERR_RETRY;
   } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_nomem++);
+    LCII_PCOUNTER_ADD(net_send_failed_nomem, 1);
     ret = LCI_ERR_RETRY;
   }
   return ret;
@@ -168,7 +166,7 @@ static inline LCI_error_t LCIS_post_send(LCIS_endpoint_t endpoint_pp, int rank,
                                          void* buf, size_t size, LCIS_mr_t mr,
                                          LCIS_meta_t meta, void* ctx)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server",
+  LCI_DBG_Log(LCI_LOG_TRACE, "server",
               "LCIS_post_send: rank %d buf %p size %lu mr %p meta %d ctx %p\n",
               rank, buf, size, mr.mr_p, meta, ctx);
 #ifdef LCI_ENABLE_SLOWDOWN
@@ -177,18 +175,12 @@ static inline LCI_error_t LCIS_post_send(LCIS_endpoint_t endpoint_pp, int rank,
   LCI_error_t ret =
       LCISD_post_send(endpoint_pp, rank, buf, size, mr, meta, ctx);
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_tx += 1);
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_tx +=
-                           size);
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].msgs_2sided_tx += 1);
+    LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
   } else if (ret == LCI_ERR_RETRY_LOCK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_lock++);
+    LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
     ret = LCI_ERR_RETRY;
   } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_nomem++);
+    LCII_PCOUNTER_ADD(net_send_failed_nomem, 1);
     ret = LCI_ERR_RETRY;
   }
   return ret;
@@ -197,7 +189,7 @@ static inline LCI_error_t LCIS_post_puts(LCIS_endpoint_t endpoint_pp, int rank,
                                          void* buf, size_t size, uintptr_t base,
                                          LCIS_offset_t offset, LCIS_rkey_t rkey)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server",
+  LCI_DBG_Log(LCI_LOG_TRACE, "server",
               "LCIS_post_puts: rank %d buf %p size %lu base %p offset %lu "
               "rkey %lu\n",
               rank, buf, size, (void*)base, offset, rkey);
@@ -207,18 +199,12 @@ static inline LCI_error_t LCIS_post_puts(LCIS_endpoint_t endpoint_pp, int rank,
   LCI_error_t ret =
       LCISD_post_puts(endpoint_pp, rank, buf, size, base, offset, rkey);
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_tx += 1);
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_tx +=
-                           size);
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].msgs_1sided_tx += 1);
+    LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
   } else if (ret == LCI_ERR_RETRY_LOCK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_lock++);
+    LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
     ret = LCI_ERR_RETRY;
   } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_nomem++);
+    LCII_PCOUNTER_ADD(net_send_failed_nomem, 1);
     ret = LCI_ERR_RETRY;
   }
   return ret;
@@ -228,7 +214,7 @@ static inline LCI_error_t LCIS_post_put(LCIS_endpoint_t endpoint_pp, int rank,
                                         uintptr_t base, LCIS_offset_t offset,
                                         LCIS_rkey_t rkey, void* ctx)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server",
+  LCI_DBG_Log(LCI_LOG_TRACE, "server",
               "LCIS_post_put: rank %d buf %p size %lu mr %p base %p "
               "offset %lu rkey %lu ctx %p\n",
               rank, buf, size, mr.mr_p, (void*)base, offset, rkey, ctx);
@@ -238,18 +224,12 @@ static inline LCI_error_t LCIS_post_put(LCIS_endpoint_t endpoint_pp, int rank,
   LCI_error_t ret =
       LCISD_post_put(endpoint_pp, rank, buf, size, mr, base, offset, rkey, ctx);
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_tx += 1);
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_tx +=
-                           size);
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].msgs_1sided_tx += 1);
+    LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
   } else if (ret == LCI_ERR_RETRY_LOCK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_lock++);
+    LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
     ret = LCI_ERR_RETRY;
   } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_nomem++);
+    LCII_PCOUNTER_ADD(net_send_failed_nomem, 1);
     ret = LCI_ERR_RETRY;
   }
   return ret;
@@ -260,7 +240,7 @@ static inline LCI_error_t LCIS_post_putImms(LCIS_endpoint_t endpoint_pp,
                                             LCIS_offset_t offset,
                                             LCIS_rkey_t rkey, uint32_t meta)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server",
+  LCI_DBG_Log(LCI_LOG_TRACE, "server",
               "LCIS_post_putImms: rank %d buf %p size %lu base %p offset %lu "
               "rkey %lu meta %d\n",
               rank, buf, size, (void*)base, offset, rkey, meta);
@@ -270,18 +250,12 @@ static inline LCI_error_t LCIS_post_putImms(LCIS_endpoint_t endpoint_pp,
   LCI_error_t ret = LCISD_post_putImms(endpoint_pp, rank, buf, size, base,
                                        offset, rkey, meta);
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_tx += 1);
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_tx +=
-                           size);
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].msgs_1sided_tx += 1);
+    LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
   } else if (ret == LCI_ERR_RETRY_LOCK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_lock++);
+    LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
     ret = LCI_ERR_RETRY;
   } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_nomem++);
+    LCII_PCOUNTER_ADD(net_send_failed_nomem, 1);
     ret = LCI_ERR_RETRY;
   }
   return ret;
@@ -293,7 +267,7 @@ static inline LCI_error_t LCIS_post_putImm(LCIS_endpoint_t endpoint_pp,
                                            LCIS_rkey_t rkey, LCIS_meta_t meta,
                                            void* ctx)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server",
+  LCI_DBG_Log(LCI_LOG_TRACE, "server",
               "LCIS_post_putImm: rank %d buf %p size %lu mr %p base %p "
               "offset %lu rkey %lu meta %u ctx %p\n",
               rank, buf, size, mr.mr_p, (void*)base, offset, rkey, meta, ctx);
@@ -303,18 +277,12 @@ static inline LCI_error_t LCIS_post_putImm(LCIS_endpoint_t endpoint_pp,
   LCI_error_t ret = LCISD_post_putImm(endpoint_pp, rank, buf, size, mr, base,
                                       offset, rkey, meta, ctx);
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_tx += 1);
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_tx +=
-                           size);
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].msgs_1sided_tx += 1);
+    LCII_PCOUNTER_ADD(net_send_posted, (int64_t)size);
   } else if (ret == LCI_ERR_RETRY_LOCK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_lock++);
+    LCII_PCOUNTER_ADD(net_send_failed_lock, 1);
     ret = LCI_ERR_RETRY;
   } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_backend_failed_nomem++);
+    LCII_PCOUNTER_ADD(net_send_failed_nomem, 1);
     ret = LCI_ERR_RETRY;
   }
   return ret;
@@ -322,10 +290,11 @@ static inline LCI_error_t LCIS_post_putImm(LCIS_endpoint_t endpoint_pp,
 static inline LCI_error_t LCIS_post_recv(LCIS_endpoint_t endpoint_pp, void* buf,
                                          uint32_t size, LCIS_mr_t mr, void* ctx)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "server",
+  LCI_DBG_Log(LCI_LOG_TRACE, "server",
               "LCIS_post_recv: buf %p size %u mr %p user_context %p\n", buf,
               size, mr.mr_p, ctx);
   LCI_error_t ret = LCISD_post_recv(endpoint_pp, buf, size, mr, ctx);
+  if (ret == LCI_OK) LCII_PCOUNTER_ADD(net_recv_posted, 1);
   return ret;
 }
 
diff --git a/lci/datastructure/lcm_aqueue.h b/lci/datastructure/lcm_aqueue.h
new file mode 100644
index 00000000..ec6ce332
--- /dev/null
+++ b/lci/datastructure/lcm_aqueue.h
@@ -0,0 +1,158 @@
+#ifndef LCI_LCM_AQUEUE_H
+#define LCI_LCM_AQUEUE_H
+
+// Bounded ring of void* payloads coordinated through atomic tickets.
+// Producers draw a ticket from "top", consumers draw one from "bot". Ticket t
+// maps to slot t % length; the slot's tag is set to t after its payload has
+// been written, and that is what a consumer holding ticket t checks for.
+// NOTE(review): no header is included here; the atomic, LCI* and LCT names
+// used below presumably come from the including file - confirm.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// One slot of the ring. LCM_aqueue_init asserts that its size equals
+// LCI_CACHE_LINE.
+typedef struct LCM_aqueue_entry_t {
+  // payload written by LCM_aqueue_push
+  void* data;
+  // ticket published by LCM_aqueue_push (initially -1); LCI_DEBUG builds also
+  // bump it in LCM_aqueue_pop once the payload has been read
+  atomic_uint_fast64_t tag;
+  LCIU_CACHE_PADDING(sizeof(void*) + sizeof(atomic_uint_fast64_t));
+} LCM_aqueue_entry_t;
+
+// Queue header: the two ticket counters followed by the slot array.
+typedef struct LCM_aqueue_t {
+  atomic_uint_fast64_t top;  // point to the next entry that is empty
+  LCIU_CACHE_PADDING(sizeof(atomic_uint_fast64_t));
+  atomic_uint_fast64_t bot;  // point to the first entry that is full
+  LCIU_CACHE_PADDING(sizeof(atomic_uint_fast64_t));
+  // number of slots in container (capacity + 1, see LCM_aqueue_init)
+  uint_fast64_t length;
+  // array of "length" slots allocated by LCM_aqueue_init
+  struct LCM_aqueue_entry_t* container;
+} LCM_aqueue_t;
+
+// The following functions are not thread-safe
+static inline void LCM_aqueue_init(LCM_aqueue_t* queue, uint_fast64_t capacity);
+static inline void LCM_aqueue_fina(LCM_aqueue_t* queue);
+// The following functions are thread-safe
+static inline void LCM_aqueue_push(LCM_aqueue_t* queue, void* val);
+static inline void* LCM_aqueue_pop(LCM_aqueue_t* queue);
+
+#ifdef __cplusplus
+}
+#endif
+
+// Allocate capacity + 1 slots with LCIU_memalign, zero both tickets and mark
+// every slot as never written (tag -1).
+static inline void LCM_aqueue_init(LCM_aqueue_t* queue, uint_fast64_t capacity)
+{
+  LCI_Assert(sizeof(LCM_aqueue_entry_t) == LCI_CACHE_LINE,
+             "Unexpected sizeof(LCM_aqueue_entry_t) %lu\n",
+             sizeof(LCM_aqueue_entry_t));
+  queue->container = LCIU_memalign(LCI_CACHE_LINE,
+                                   (capacity + 1) * sizeof(LCM_aqueue_entry_t));
+  atomic_init(&queue->top, 0);
+  atomic_init(&queue->bot, 0);
+  queue->length = capacity + 1;
+  // The index has the type of "length": an int index would be compared across
+  // signedness and could overflow before reaching a very large length.
+  for (uint_fast64_t i = 0; i < queue->length; ++i) {
+    queue->container[i].data = 0;
+    atomic_init(&queue->container[i].tag, -1);
+  }
+  atomic_thread_fence(LCIU_memory_order_seq_cst);
+}
+
+// Release the slot array and reset the queue to an empty, zero-length state.
+static inline void LCM_aqueue_fina(LCM_aqueue_t* queue)
+{
+  atomic_thread_fence(LCIU_memory_order_seq_cst);
+  LCIU_free(queue->container);
+  queue->container = NULL;
+  atomic_init(&queue->top, 0);
+  atomic_init(&queue->bot, 0);
+  queue->length = 0;
+}
+
+// Store val in the slot of the next "top" ticket and publish it to consumers.
+// Nothing here waits for the slot to be free: overwriting an unread payload is
+// only detected by the debug assertion below.
+// NOTE(review): LCT_now() is called four times regardless of how the
+// LCII_PCOUNTER_* macros expand - confirm that this is intended on this path.
+static inline void LCM_aqueue_push(LCM_aqueue_t* queue, void* val)
+{
+  LCT_time_t time0 = LCT_now();
+  LCII_PCOUNTER_STARTT(cq_push_internal, time0);
+  // reserve a slot to write
+  LCII_PCOUNTER_STARTT(cq_push_faa, time0);
+  uint_fast64_t current_top =
+      atomic_fetch_add_explicit(&queue->top, 1, LCIU_memory_order_relaxed);
+  LCT_time_t time1 = LCT_now();
+  LCII_PCOUNTER_ENDT(cq_push_faa, time1);
+  // write to the slot
+  LCII_PCOUNTER_STARTT(cq_push_write, time1);
+  // A slot still holding the previous lap's ticket has not been popped yet.
+  // The first lap is exempt: its slots were never written, and for the last
+  // slot current_top - length wraps to the initial tag (-1), so the comparison
+  // alone would fire on an empty slot.
+  LCI_DBG_Assert(current_top < queue->length ||
+                     atomic_load_explicit(
+                         &queue->container[current_top % queue->length].tag,
+                         LCIU_memory_order_acquire) !=
+                         current_top - queue->length,
+                 "wrote to a nonempty value!\n");
+  queue->container[current_top % queue->length].data = val;
+  LCT_time_t time2 = LCT_now();
+  LCII_PCOUNTER_ENDT(cq_push_write, time2);
+  // publish the ticket in the tag so consumers can safely read this slot.
+  LCII_PCOUNTER_STARTT(cq_push_store, time2);
+  atomic_store_explicit(&queue->container[current_top % queue->length].tag,
+                        current_top, LCIU_memory_order_release);
+  LCT_time_t time3 = LCT_now();
+  LCII_PCOUNTER_ENDT(cq_push_store, time3);
+  LCII_PCOUNTER_ENDT(cq_push_internal, time3);
+}
+
+// Take the payload of the next "bot" ticket, or return NULL when no published
+// payload is available for it.
+static inline void* LCM_aqueue_pop(LCM_aqueue_t* queue)
+{
+  uint_fast64_t current_bot =
+      atomic_load_explicit(&queue->bot, LCIU_memory_order_relaxed);
+  if (atomic_load_explicit(&queue->container[current_bot % queue->length].tag,
+                           LCIU_memory_order_acquire) != current_bot) {
+    // the queue is empty
+    return NULL;
+  }
+  current_bot =
+      atomic_fetch_add_explicit(&queue->bot, 1, LCIU_memory_order_relaxed);
+  while (
+      atomic_load_explicit(&queue->container[current_bot % queue->length].tag,
+                           LCIU_memory_order_acquire) != current_bot) {
+    // some thread is ahead of us. We got a cell that is empty.
+    // Try to hand the ticket back; this only works while bot is still exactly
+    // one past our ticket. Otherwise keep polling the slot.
+    uint_fast64_t expected = current_bot + 1;
+    _Bool succeed = atomic_compare_exchange_weak_explicit(
+        &queue->bot, &expected, current_bot, LCIU_memory_order_relaxed,
+        LCIU_memory_order_relaxed);
+    if (succeed) return NULL;
+  }
+  LCII_PCOUNTER_START(cq_pop_timer);
+  // we have successfully reserved an entry
+  void* result = queue->container[current_bot % queue->length].data;
+#ifdef LCI_DEBUG
+  // Move the tag off our ticket so that the assertion in LCM_aqueue_push can
+  // tell this slot has been consumed.
+  atomic_store_explicit(&queue->container[current_bot % queue->length].tag,
+                        current_bot + 1, LCIU_memory_order_release);
+#endif
+  LCII_PCOUNTER_END(cq_pop_timer);
+  return result;
+}
+
+#endif  // LCI_LCM_AQUEUE_H
diff --git a/src/datastructure/lcm_archive.h b/lci/datastructure/lcm_archive.h
similarity index 92%
rename from src/datastructure/lcm_archive.h
rename to lci/datastructure/lcm_archive.h
index b3c78ad4..a13d255b 100644
--- a/src/datastructure/lcm_archive.h
+++ b/lci/datastructure/lcm_archive.h
@@ -54,7 +54,7 @@ static int LCM_archive_init(LCM_archive_t* archive, int nbits)
   size_t cap = 1 << nbits;
   int ret = posix_memalign((void**)&archive->ptr, LCI_CACHE_LINE,
                            cap * sizeof(struct LCM_archive_entry_t));
-  LCM_Assert(ret == 0, "Memory allocation failed!\n");
+  LCI_Assert(ret == 0, "Memory allocation failed!\n");
   atomic_init(&archive->head, 0);
   archive->nbits = nbits;
 
@@ -82,7 +82,7 @@ static inline int LCM_archive_put(LCM_archive_t* archive,
 {
   uint_fast64_t head =
       atomic_fetch_add_explicit(&archive->head, 1, LCIU_memory_order_relaxed);
-  LCM_DBG_Assert(head != UINT_FAST64_MAX, "head %lu overflow!\n", head);
+  LCI_DBG_Assert(head != UINT_FAST64_MAX, "head %lu overflow!\n", head);
   LCM_archive_key_t key = head & ((1 << archive->nbits) - 1);
   LCM_archive_val_t expected_val = LCM_ARCHIVE_EMPTY;
   _Bool succeed = atomic_compare_exchange_strong_explicit(
@@ -90,11 +90,11 @@ static inline int LCM_archive_put(LCM_archive_t* archive,
       LCIU_memory_order_relaxed);
   if (succeed) {
     *key_ptr = key;
-    LCM_DBG_Log(LCM_LOG_DEBUG, "archive", "Archive (%lu, %p) succeed!\n", key,
+    LCI_DBG_Log(LCI_LOG_TRACE, "archive", "Archive (%lu, %p) succeed!\n", key,
                 (void*)value);
     return LCM_SUCCESS;
   } else {
-    LCM_DBG_Log(LCM_LOG_DEBUG, "archive",
+    LCI_DBG_Log(LCI_LOG_TRACE, "archive",
                 "Archive (%lu, %p) conflicting with %p RETRY!\n", key,
                 (void*)value, (void*)expected_val);
     return LCM_RETRY;
@@ -117,7 +117,7 @@ static inline LCM_archive_val_t LCM_archive_remove(LCM_archive_t* archive,
       atomic_load_explicit(&archive->ptr[key].val, LCIU_memory_order_relaxed);
   atomic_store_explicit(&archive->ptr[key].val, LCM_ARCHIVE_EMPTY,
                         LCIU_memory_order_relaxed);
-  LCM_DBG_Log(LCM_LOG_DEBUG, "archive", "Archive remove (%lu, %p)\n", key,
+  LCI_DBG_Log(LCI_LOG_TRACE, "archive", "Archive remove (%lu, %p)\n", key,
               (void*)val);
   return val;
 }
diff --git a/src/datastructure/lcm_dequeue.h b/lci/datastructure/lcm_dequeue.h
similarity index 95%
rename from src/datastructure/lcm_dequeue.h
rename to lci/datastructure/lcm_dequeue.h
index fbdbc972..d9b4cc81 100644
--- a/src/datastructure/lcm_dequeue.h
+++ b/lci/datastructure/lcm_dequeue.h
@@ -16,9 +16,12 @@ extern "C" {
 
 typedef struct LCM_dequeue_t {
   size_t top;
+  LCIU_CACHE_PADDING(sizeof(size_t));
   size_t bot;
+  LCIU_CACHE_PADDING(sizeof(size_t));
   size_t length;
   void** container;  // a pointer to type void*
+  LCIU_CACHE_PADDING(sizeof(size_t) + sizeof(void**));
 } LCM_dequeue_t;
 
 static inline void LCM_dq_init(LCM_dequeue_t* dq, size_t capacity);
diff --git a/src/experimental/CMakeLists.txt b/lci/experimental/CMakeLists.txt
similarity index 100%
rename from src/experimental/CMakeLists.txt
rename to lci/experimental/CMakeLists.txt
diff --git a/src/experimental/coll/CMakeLists.txt b/lci/experimental/coll/CMakeLists.txt
similarity index 100%
rename from src/experimental/coll/CMakeLists.txt
rename to lci/experimental/coll/CMakeLists.txt
diff --git a/src/experimental/coll/allreduce.c b/lci/experimental/coll/allreduce.c
similarity index 100%
rename from src/experimental/coll/allreduce.c
rename to lci/experimental/coll/allreduce.c
diff --git a/src/experimental/coll/barrier.c b/lci/experimental/coll/barrier.c
similarity index 100%
rename from src/experimental/coll/barrier.c
rename to lci/experimental/coll/barrier.c
diff --git a/src/experimental/coll/bcast.c b/lci/experimental/coll/bcast.c
similarity index 100%
rename from src/experimental/coll/bcast.c
rename to lci/experimental/coll/bcast.c
diff --git a/src/experimental/coll/coll.c b/lci/experimental/coll/coll.c
similarity index 100%
rename from src/experimental/coll/coll.c
rename to lci/experimental/coll/coll.c
diff --git a/src/experimental/coll/coll.h b/lci/experimental/coll/coll.h
similarity index 98%
rename from src/experimental/coll/coll.h
rename to lci/experimental/coll/coll.h
index 076c6a58..a0db50dc 100644
--- a/src/experimental/coll/coll.h
+++ b/lci/experimental/coll/coll.h
@@ -149,7 +149,6 @@ static inline void LCIXC_mcoll_complete(LCI_endpoint_t ep, LCI_mbuffer_t buffer,
   ctx->rank = -1; /* this doesn't make much sense for collectives */
   ctx->tag = tag;
   ctx->completion = completion;
-  LCII_PCOUNTERS_WRAPPER(ctx->timer = LCII_ucs_get_time());
   lc_ce_dispatch(ctx);
 }
 
@@ -167,7 +166,6 @@ static inline void LCIXC_lcoll_complete(LCI_endpoint_t ep, LCI_lbuffer_t buffer,
   ctx->rank = -1; /* this doesn't make much sense for collectives */
   ctx->tag = tag;
   ctx->completion = completion;
-  LCII_PCOUNTERS_WRAPPER(ctx->timer = LCII_ucs_get_time());
   lc_ce_dispatch(ctx);
 }
 
diff --git a/src/lcii_config.h.in b/lci/lcii_config.h.in
similarity index 94%
rename from src/lcii_config.h.in
rename to lci/lcii_config.h.in
index d364f888..6ce5782d 100644
--- a/src/lcii_config.h.in
+++ b/lci/lcii_config.h.in
@@ -17,7 +17,7 @@
 #cmakedefine LCI_PM_BACKEND_ENABLE_MPI
 #cmakedefine LCI_PM_BACKEND_ENABLE_PMIX
 #cmakedefine LCI_OFI_PROVIDER_HINT_DEFAULT "@LCI_OFI_PROVIDER_HINT_DEFAULT@"
-#cmakedefine LCI_USE_MUTEX_CQ
+#cmakedefine LCI_USE_INLINE_CQ
 #cmakedefine LCI_ENABLE_MULTITHREAD_PROGRESS
 #cmakedefine LCI_IBV_ENABLE_TRY_LOCK_QP
 #cmakedefine LCI_OFI_ENABLE_TRY_LOCK_EP
@@ -37,8 +37,4 @@
 #define LCI_SERVER_MAX_ENDPOINTS 8
 #define LCI_PM_BACKEND_DEFAULT "@LCI_PM_BACKEND_DEFAULT@"
 
-#ifdef LCI_DEBUG
-#define LCM_DEBUG
-#endif
-
 #endif // LCII_CONFIG_H_
diff --git a/lci/log/logger.c b/lci/log/logger.c
new file mode 100644
index 00000000..2703f76b
--- /dev/null
+++ b/lci/log/logger.c
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include "logger.h"
+
+// The log context used by the LCI_Log and LCI_Assert macros in logger.h.
+LCT_log_ctx_t LCII_log_ctx;
+
+// Build LCII_log_ctx from the level-name table below (indexed by
+// enum LCI_log_level_t), LCI_LOG_WARN, the string "lci" and the values of the
+// LCI_LOG_OUTFILE, LCI_LOG_LEVEL, LCI_LOG_WHITELIST and LCI_LOG_BLACKLIST
+// environment variables (NULL when a variable is unset).
+// NOTE(review): level_names is an automatic array, so this presumably relies
+// on LCT_log_ctx_alloc not keeping the pointer after it returns - confirm.
+void LCII_log_init()
+{
+  const char* const level_names[] = {
+      [LCI_LOG_ERROR] = "error", [LCI_LOG_WARN] = "warn",
+      [LCI_LOG_DIAG] = "diag",   [LCI_LOG_INFO] = "info",
+      [LCI_LOG_DEBUG] = "debug", [LCI_LOG_TRACE] = "trace"};
+  char* outfile = getenv("LCI_LOG_OUTFILE");
+  char* level = getenv("LCI_LOG_LEVEL");
+  char* whitelist = getenv("LCI_LOG_WHITELIST");
+  char* blacklist = getenv("LCI_LOG_BLACKLIST");
+  LCII_log_ctx = LCT_log_ctx_alloc(
+      level_names, sizeof(level_names) / sizeof(level_names[0]), LCI_LOG_WARN,
+      "lci", outfile, level, whitelist, blacklist);
+}
+
+// Hand LCII_log_ctx to LCT_log_ctx_free.
+void LCII_log_fina() { LCT_log_ctx_free(&LCII_log_ctx); }
diff --git a/lci/log/logger.h b/lci/log/logger.h
new file mode 100644
index 00000000..27df268c
--- /dev/null
+++ b/lci/log/logger.h
@@ -0,0 +1,46 @@
+#ifndef LCII_LOG_H_
+#define LCII_LOG_H_
+
+#include <stdarg.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include "lct.h"
+
+// Log levels, numbered consecutively from LCI_LOG_ERROR (0). LCI_LOG_MAX is
+// not a level itself: its value is the number of levels listed before it.
+enum LCI_log_level_t {
+  LCI_LOG_ERROR = 0,
+  LCI_LOG_WARN,
+  LCI_LOG_DIAG,
+  LCI_LOG_INFO,
+  LCI_LOG_DEBUG,
+  LCI_LOG_TRACE,
+  LCI_LOG_MAX
+};
+
+// Context handed to every LCT logging call made through the macros below.
+extern LCT_log_ctx_t LCII_log_ctx;
+
+void LCII_log_init();
+void LCII_log_fina();
+static inline void LCI_Log_flush() { LCT_Log_flush(LCII_log_ctx); }
+
+// Forward to the LCT primitives with LCII_log_ctx as the context argument.
+#define LCI_Assert(...) LCT_Assert(LCII_log_ctx, __VA_ARGS__)
+#define LCI_Log(...) LCT_Log(LCII_log_ctx, __VA_ARGS__)
+#define LCI_Warn(...) LCT_Log(LCII_log_ctx, LCI_LOG_WARN, "warn", __VA_ARGS__)
+
+// The LCI_DBG_* variants expand to nothing unless LCI_DEBUG is defined, so
+// their arguments are not evaluated in non-debug builds.
+#ifndef LCI_DEBUG
+#define LCI_DBG_Assert(...)
+#define LCI_DBG_Log(...)
+#define LCI_DBG_Warn(...)
+#else
+#define LCI_DBG_Assert(...) LCI_Assert(__VA_ARGS__)
+#define LCI_DBG_Log(...) LCI_Log(__VA_ARGS__)
+#define LCI_DBG_Warn(...) LCI_Warn(__VA_ARGS__)
+#endif
+
+#endif  // LCII_LOG_H_
diff --git a/src/pmi/CMakeLists.txt b/lci/pmi/CMakeLists.txt
similarity index 100%
rename from src/pmi/CMakeLists.txt
rename to lci/pmi/CMakeLists.txt
diff --git a/src/pmi/pmi1/CMakeLists.txt b/lci/pmi/pmi1/CMakeLists.txt
similarity index 100%
rename from src/pmi/pmi1/CMakeLists.txt
rename to lci/pmi/pmi1/CMakeLists.txt
diff --git a/src/pmi/pmi1/include/pmi.h b/lci/pmi/pmi1/include/pmi.h
similarity index 100%
rename from src/pmi/pmi1/include/pmi.h
rename to lci/pmi/pmi1/include/pmi.h
diff --git a/src/pmi/pmi1/simple/Makefile b/lci/pmi/pmi1/simple/Makefile
similarity index 100%
rename from src/pmi/pmi1/simple/Makefile
rename to lci/pmi/pmi1/simple/Makefile
diff --git a/src/pmi/pmi1/simple/simple_pmi.c b/lci/pmi/pmi1/simple/simple_pmi.c
similarity index 100%
rename from src/pmi/pmi1/simple/simple_pmi.c
rename to lci/pmi/pmi1/simple/simple_pmi.c
diff --git a/src/pmi/pmi1/simple/simple_pmiutil.c b/lci/pmi/pmi1/simple/simple_pmiutil.c
similarity index 100%
rename from src/pmi/pmi1/simple/simple_pmiutil.c
rename to lci/pmi/pmi1/simple/simple_pmiutil.c
diff --git a/src/pmi/pmi1/simple/simple_pmiutil.h b/lci/pmi/pmi1/simple/simple_pmiutil.h
similarity index 100%
rename from src/pmi/pmi1/simple/simple_pmiutil.h
rename to lci/pmi/pmi1/simple/simple_pmiutil.h
diff --git a/src/pmi/pmi1/simple/test.c b/lci/pmi/pmi1/simple/test.c
similarity index 100%
rename from src/pmi/pmi1/simple/test.c
rename to lci/pmi/pmi1/simple/test.c
diff --git a/src/pmi/pmi2/CMakeLists.txt b/lci/pmi/pmi2/CMakeLists.txt
similarity index 100%
rename from src/pmi/pmi2/CMakeLists.txt
rename to lci/pmi/pmi2/CMakeLists.txt
diff --git a/src/pmi/pmi2/client/COPYRIGHT b/lci/pmi/pmi2/client/COPYRIGHT
similarity index 100%
rename from src/pmi/pmi2/client/COPYRIGHT
rename to lci/pmi/pmi2/client/COPYRIGHT
diff --git a/src/pmi/pmi2/client/pmi2_api.c b/lci/pmi/pmi2/client/pmi2_api.c
similarity index 100%
rename from src/pmi/pmi2/client/pmi2_api.c
rename to lci/pmi/pmi2/client/pmi2_api.c
diff --git a/src/pmi/pmi2/client/pmi2_util.c b/lci/pmi/pmi2/client/pmi2_util.c
similarity index 100%
rename from src/pmi/pmi2/client/pmi2_util.c
rename to lci/pmi/pmi2/client/pmi2_util.c
diff --git a/src/pmi/pmi2/client/pmi2_util.h b/lci/pmi/pmi2/client/pmi2_util.h
similarity index 100%
rename from src/pmi/pmi2/client/pmi2_util.h
rename to lci/pmi/pmi2/client/pmi2_util.h
diff --git a/src/pmi/pmi2/client/test.c b/lci/pmi/pmi2/client/test.c
similarity index 100%
rename from src/pmi/pmi2/client/test.c
rename to lci/pmi/pmi2/client/test.c
diff --git a/src/pmi/pmi2/include/pmi2.h b/lci/pmi/pmi2/include/pmi2.h
similarity index 100%
rename from src/pmi/pmi2/include/pmi2.h
rename to lci/pmi/pmi2/include/pmi2.h
diff --git a/src/pmi/pmi2/test.c b/lci/pmi/pmi2/test.c
similarity index 100%
rename from src/pmi/pmi2/test.c
rename to lci/pmi/pmi2/test.c
diff --git a/src/pmi/pmi_wrapper.c b/lci/pmi/pmi_wrapper.c
similarity index 91%
rename from src/pmi/pmi_wrapper.c
rename to lci/pmi/pmi_wrapper.c
index eea5baaf..107e70c7 100644
--- a/src/pmi/pmi_wrapper.c
+++ b/lci/pmi/pmi_wrapper.c
@@ -46,7 +46,7 @@ void lcm_pm_initialize()
       continue;
 #endif
     } else
-      LCM_Assert(
+      LCI_Assert(
           false,
           "Unknown env LCM_PM_BACKEND (%s against local|pmi1|pmi2|pmix|mpi).\n",
           word);
@@ -61,7 +61,7 @@ void lcm_pm_initialize()
     }
   }
   free(str);
-  LCM_Assert(found_valid_backend,
+  LCI_Assert(found_valid_backend,
              "Tried [%s]. Did not find valid PMI backend! Give up!\n", p);
   LCM_PM_ops.initialize();
 }
@@ -74,5 +74,10 @@ void lcm_pm_getname(int rank, char* key, char* value)
 {
   LCM_PM_ops.getname(rank, key, value);
 }
-void lcm_pm_barrier() { LCM_PM_ops.barrier(); }
+void lcm_pm_barrier()
+{
+  LCI_DBG_Log(LCI_LOG_TRACE, "pmi", "enter pmi barrier\n");
+  LCM_PM_ops.barrier();
+  LCI_DBG_Log(LCI_LOG_TRACE, "pmi", "leave pmi barrier\n");
+}
 void lcm_pm_finalize() { LCM_PM_ops.finalize(); }
\ No newline at end of file
diff --git a/src/pmi/pmi_wrapper.h b/lci/pmi/pmi_wrapper.h
similarity index 100%
rename from src/pmi/pmi_wrapper.h
rename to lci/pmi/pmi_wrapper.h
diff --git a/src/pmi/pmi_wrapper_local.c b/lci/pmi/pmi_wrapper_local.c
similarity index 99%
rename from src/pmi/pmi_wrapper_local.c
rename to lci/pmi/pmi_wrapper_local.c
index 5210bbcd..352faf75 100644
--- a/src/pmi/pmi_wrapper_local.c
+++ b/lci/pmi/pmi_wrapper_local.c
@@ -11,7 +11,7 @@ int lcm_pm_local_check_availability()
 {
 #ifndef LCI_PM_BACKEND_ENABLE_PMIX
   if (getenv("PMIX_RANK"))
-    LCM_Warn(
+    LCI_Warn(
         "LCI detects the PMIx environment. However, the LCI PMIx support is "
         "not enabled. LCI assumes the number of processes of this job is 1. If "
         "you intended to run more than one processes, please do one of the "
diff --git a/src/pmi/pmi_wrapper_mpi.c b/lci/pmi/pmi_wrapper_mpi.c
similarity index 100%
rename from src/pmi/pmi_wrapper_mpi.c
rename to lci/pmi/pmi_wrapper_mpi.c
diff --git a/src/pmi/pmi_wrapper_pmi1.c b/lci/pmi/pmi_wrapper_pmi1.c
similarity index 100%
rename from src/pmi/pmi_wrapper_pmi1.c
rename to lci/pmi/pmi_wrapper_pmi1.c
diff --git a/src/pmi/pmi_wrapper_pmi2.c b/lci/pmi/pmi_wrapper_pmi2.c
similarity index 100%
rename from src/pmi/pmi_wrapper_pmi2.c
rename to lci/pmi/pmi_wrapper_pmi2.c
diff --git a/src/pmi/pmi_wrapper_pmix.c b/lci/pmi/pmi_wrapper_pmix.c
similarity index 98%
rename from src/pmi/pmi_wrapper_pmix.c
rename to lci/pmi/pmi_wrapper_pmix.c
index ac4adb03..4c53ed41 100644
--- a/src/pmi/pmi_wrapper_pmix.c
+++ b/lci/pmi/pmi_wrapper_pmix.c
@@ -68,7 +68,7 @@ void lcm_pm_pmix_getname(int rank, char* key, char* value)
   PMIX_SAFECALL(PMIx_Get(&proc_wild, key, NULL, 0, &val));
   proc_wild.rank = PMIX_RANK_WILDCARD;
   int n = snprintf(value, LCM_PMI_STRING_LIMIT + 1, "%s", val->data.string);
-  LCM_Assert(0 < n && n <= LCM_PMI_STRING_LIMIT,
+  LCI_Assert(0 < n && n <= LCM_PMI_STRING_LIMIT,
              "snprintf failed (return %d)!\n", n);
   PMIX_VALUE_RELEASE(val);
 }
diff --git a/src/pmi/pmii_archive.h b/lci/pmi/pmii_archive.h
similarity index 100%
rename from src/pmi/pmii_archive.h
rename to lci/pmi/pmii_archive.h
diff --git a/src/profile/papi_wrapper.c b/lci/profile/papi_wrapper.c
similarity index 63%
rename from src/profile/papi_wrapper.c
rename to lci/profile/papi_wrapper.c
index 18ebfe32..837f3e18 100644
--- a/src/profile/papi_wrapper.c
+++ b/lci/profile/papi_wrapper.c
@@ -3,15 +3,15 @@
 #ifdef LCI_USE_PAPI
 #include "papi.h"
 
-#define PAPI_SAFECALL(x)                                                      \
-  {                                                                           \
-    int err = (x);                                                            \
-    if (err != PAPI_OK) {                                                     \
-      LCM_Log(LCM_LOG_WARN, "papi", "err %d: %s\n", err, PAPI_strerror(err)); \
-      return;                                                                 \
-    }                                                                         \
-  }                                                                           \
-  while (0)                                                                   \
+#define PAPI_SAFECALL(x)                                 \
+  {                                                      \
+    int err = (x);                                       \
+    if (err != PAPI_OK) {                                \
+      LCI_Warn("err %d: %s\n", err, PAPI_strerror(err)); \
+      return;                                            \
+    }                                                    \
+  }                                                      \
+  while (0)                                              \
     ;
 
 int event_set = PAPI_NULL;
@@ -26,7 +26,7 @@ void LCII_papi_init()
   int retval;
   retval = PAPI_library_init(PAPI_VER_CURRENT);
   if (retval != PAPI_VER_CURRENT) {
-    LCM_Log(LCM_LOG_WARN, "papi", "PAPI library init error!\n");
+    LCI_Warn("PAPI library init error!\n");
     return;
   }
   PAPI_SAFECALL(PAPI_create_eventset(&event_set));
@@ -46,7 +46,7 @@ void LCII_papi_init()
       q = ++p;
       continue;
     }
-    LCM_Assert(p - q < sizeof(event_code_str),
+    LCI_Assert(p - q < sizeof(event_code_str),
                "Unexpected string length %lu!\n", p - q);
     // the string is between q and p
     memset(event_code_str, 0, sizeof(event_code_str));
@@ -57,12 +57,12 @@ void LCII_papi_init()
     //    int ret = PAPI_event_name_to_code(event_code_str, &event_code);
     if (ret == PAPI_OK) {
       //      PAPI_SAFECALL(PAPI_add_event(event_set, event_code));
-      //      LCM_Log(LCM_LOG_INFO, "papi", "Add event %s(%x)\n",
+      //      LCI_Log(LCI_LOG_INFO, "papi", "Add event %s(%x)\n",
       //      event_code_str, event_code);
-      LCM_Log(LCM_LOG_INFO, "papi", "Add event %s\n", event_code_str);
+      LCI_Log(LCI_LOG_INFO, "papi", "Add event %s\n", event_code_str);
     } else {
-      LCM_Log(LCM_LOG_WARN, "papi", "Cannot figure out event \"%s\" (%s)\n",
-              event_code_str, PAPI_strerror(ret));
+      LCI_Warn("Cannot figure out event \"%s\" (%s)\n", event_code_str,
+               PAPI_strerror(ret));
     }
     // move to the next character
     if (*p == '\0')
@@ -75,7 +75,7 @@ void LCII_papi_init()
     PAPI_SAFECALL(PAPI_start(event_set));
     enabled = true;
   } else {
-    LCM_Log(LCM_LOG_WARN, "papi", "No valid event detected!\n");
+    LCI_Warn("No valid event detected!\n");
   }
 }
 
@@ -91,21 +91,21 @@ void LCII_papi_fina()
 
   int number = num_events;
   PAPI_SAFECALL(PAPI_list_events(event_set, event_codes, &number));
-  LCM_Assert(num_events == number, "Unexpected event count!\n");
+  LCI_Assert(num_events == number, "Unexpected event count!\n");
 
   char event_code_str[PAPI_MAX_STR_LEN];
   static char buf[1024];
   size_t consumed = 0;
   consumed += snprintf(buf + consumed, sizeof(buf) - consumed,
                        "rank,papi_event,count\n");
-  LCM_Assert(sizeof(buf) > consumed, "buffer overflowed!\n");
+  LCI_Assert(sizeof(buf) > consumed, "buffer overflowed!\n");
   for (int i = 0; i < num_events; ++i) {
     PAPI_SAFECALL(PAPI_event_code_to_name(event_codes[i], event_code_str));
     consumed += snprintf(buf + consumed, sizeof(buf) - consumed, "%d,%s,%lld\n",
                          LCI_RANK, event_code_str, event_counters[i]);
-    LCM_Assert(sizeof(buf) > consumed, "buffer overflowed!\n");
+    LCI_Assert(sizeof(buf) > consumed, "buffer overflowed!\n");
   }
-  LCM_Log(LCM_LOG_TRACE, "papi", "\nPAPI counters:\n%s", buf);
+  LCI_Log(LCI_LOG_DIAG, "papi", "\nPAPI counters:\n%s", buf);
 
   LCIU_free(event_codes);
   LCIU_free(event_counters);
diff --git a/src/profile/papi_wrapper.h b/lci/profile/papi_wrapper.h
similarity index 100%
rename from src/profile/papi_wrapper.h
rename to lci/profile/papi_wrapper.h
diff --git a/lci/profile/performance_counter.c b/lci/profile/performance_counter.c
new file mode 100644
index 00000000..dc594a26
--- /dev/null
+++ b/lci/profile/performance_counter.c
@@ -0,0 +1,53 @@
+#include "runtime/lcii.h"
+
+// Performance counter context shared by all LCI counters. It is allocated by
+// LCII_pcounters_init() and freed by LCII_pcounters_fina(), and only when
+// LCI_USE_PERFORMANCE_COUNTER is defined.
+LCT_pcounter_ctx_t LCII_pcounter_ctx;
+
+// Define one global handle for every counter named in the FOR_EACH lists.
+#define LCII_PCOUNTER_HANDLE_DEF(name) \
+  LCT_pcounter_handle_t LCII_pcounter_handle_##name;
+
+LCII_PCOUNTER_NONE_FOR_EACH(LCII_PCOUNTER_HANDLE_DEF)
+LCII_PCOUNTER_TREND_FOR_EACH(LCII_PCOUNTER_HANDLE_DEF)
+LCII_PCOUNTER_TIMER_FOR_EACH(LCII_PCOUNTER_HANDLE_DEF)
+
+#undef LCII_PCOUNTER_HANDLE_DEF
+
+// Allocate the "lci" counter context and register every counter with the type
+// of the list it belongs to (NONE, TREND or TIMER). Does nothing unless
+// LCI_USE_PERFORMANCE_COUNTER is defined.
+void LCII_pcounters_init(void)
+{
+#ifdef LCI_USE_PERFORMANCE_COUNTER
+  LCII_pcounter_ctx = LCT_pcounter_ctx_alloc("lci");
+
+#define LCII_PCOUNTER_NONE_REGISTER(name) \
+  LCII_pcounter_handle_##name =           \
+      LCT_pcounter_register(LCII_pcounter_ctx, #name, LCT_PCOUNTER_NONE);
+  LCII_PCOUNTER_NONE_FOR_EACH(LCII_PCOUNTER_NONE_REGISTER)
+#undef LCII_PCOUNTER_NONE_REGISTER
+
+#define LCII_PCOUNTER_TREND_REGISTER(name) \
+  LCII_pcounter_handle_##name =            \
+      LCT_pcounter_register(LCII_pcounter_ctx, #name, LCT_PCOUNTER_TREND);
+  LCII_PCOUNTER_TREND_FOR_EACH(LCII_PCOUNTER_TREND_REGISTER)
+#undef LCII_PCOUNTER_TREND_REGISTER
+
+#define LCII_PCOUNTER_TIMER_REGISTER(name) \
+  LCII_pcounter_handle_##name =            \
+      LCT_pcounter_register(LCII_pcounter_ctx, #name, LCT_PCOUNTER_TIMER);
+  LCII_PCOUNTER_TIMER_FOR_EACH(LCII_PCOUNTER_TIMER_REGISTER)
+#undef LCII_PCOUNTER_TIMER_REGISTER
+#endif  // LCI_USE_PERFORMANCE_COUNTER
+}
+
+// Free the counter context allocated by LCII_pcounters_init(). Does nothing
+// unless LCI_USE_PERFORMANCE_COUNTER is defined.
+void LCII_pcounters_fina(void)
+{
+#ifdef LCI_USE_PERFORMANCE_COUNTER
+  LCT_pcounter_ctx_free(&LCII_pcounter_ctx);
+#endif
+}
diff --git a/lci/profile/performance_counter.h b/lci/profile/performance_counter.h
new file mode 100644
index 00000000..c6edd329
--- /dev/null
+++ b/lci/profile/performance_counter.h
@@ -0,0 +1,110 @@
+#ifndef LCI_PERFORMANCE_COUNTER_H
+#define LCI_PERFORMANCE_COUNTER_H
+
+// NOTE(review): this header uses LCT_pcounter_* types and functions without
+// including their declarations, so it presumably must be included after the
+// LCT header - confirm against the includers.
+
+// Counter context passed to LCT by every LCII_PCOUNTER_* macro below.
+extern LCT_pcounter_ctx_t LCII_pcounter_ctx;
+
+// The three lists below name every LCI performance counter: each list applies
+// `_macro` to all of its counter names. The list a counter is in decides the
+// type it is registered with in performance_counter.c (LCT_PCOUNTER_NONE,
+// LCT_PCOUNTER_TREND or LCT_PCOUNTER_TIMER).
+// clang-format off
+#define LCII_PCOUNTER_NONE_FOR_EACH(_macro)  \
+    _macro(get_packet_timer)                 \
+    _macro(get_packet_pool_id_timer)         \
+    _macro(get_packet_lock_timer)            \
+    _macro(get_packet_local_timer)           \
+    _macro(get_packet_unlock_timer)          \
+    _macro(cq_push_internal)                 \
+    _macro(cq_push_faa)                      \
+    _macro(cq_push_write)                    \
+    _macro(cq_push_store)
+
+#define LCII_PCOUNTER_TREND_FOR_EACH(_macro) \
+    _macro(send)                             \
+    _macro(put)                              \
+    _macro(recv)                             \
+    _macro(comp_produce)                     \
+    _macro(comp_consume)                     \
+    _macro(net_send_posted)                  \
+    _macro(net_recv_posted)                  \
+    _macro(net_send_comp)                    \
+    _macro(net_recv_comp)                    \
+    _macro(net_send_failed_lock)             \
+    _macro(net_send_failed_nomem)            \
+    _macro(net_recv_failed_nopacket)         \
+    _macro(progress_call)                    \
+    _macro(packet_get)                       \
+    _macro(packet_put)                       \
+    _macro(packet_stealing)                  \
+    _macro(packet_stealing_succeeded)        \
+    _macro(packet_stealing_failed)           \
+    _macro(backlog_queue_push)               \
+    _macro(backlog_queue_pop)                \
+    _macro(expected_msg)                     \
+    _macro(unexpected_msg)
+
+#define LCII_PCOUNTER_TIMER_FOR_EACH(_macro) \
+    _macro(useful_progress_timer)            \
+    _macro(refill_rq_timer)                  \
+    _macro(update_posted_recv)               \
+    _macro(post_recv_timer)                  \
+    _macro(get_recv_packet_timer)            \
+    _macro(cq_push_timer)                    \
+    _macro(cq_pop_timer)                     \
+    _macro(serve_rts_timer)                  \
+    _macro(rts_mem_reg_timer)                \
+    _macro(rts_send_timer)                   \
+    _macro(serve_rtr_timer)                  \
+    _macro(rtr_mem_reg_timer)                \
+    _macro(rtr_putimm_timer)                 \
+    _macro(serve_rdma_timer)                 \
+    _macro(packet_stealing_timer)            \
+    _macro(mem_reg_timer)                    \
+    _macro(mem_dereg_timer)                  \
+    _macro(net_mem_reg_timer)                \
+    _macro(net_mem_dereg_timer)
+// clang-format on
+
+// Declare the global handle of every counter; they are defined in
+// performance_counter.c.
+#define LCII_PCOUNTER_HANDLE_DECL(name) \
+  extern LCT_pcounter_handle_t LCII_pcounter_handle_##name;
+
+LCII_PCOUNTER_NONE_FOR_EACH(LCII_PCOUNTER_HANDLE_DECL)
+LCII_PCOUNTER_TREND_FOR_EACH(LCII_PCOUNTER_HANDLE_DECL)
+LCII_PCOUNTER_TIMER_FOR_EACH(LCII_PCOUNTER_HANDLE_DECL)
+
+// Counter update macros. `name` is a counter name from the lists above. Each
+// macro expands to a complete statement (including the trailing semicolon)
+// when LCI_USE_PERFORMANCE_COUNTER is defined and to nothing otherwise, so do
+// not use one as the unbraced body of an if/else.
+#ifdef LCI_USE_PERFORMANCE_COUNTER
+#define LCII_PCOUNTER_ADD(name, val) \
+  LCT_pcounter_add(LCII_pcounter_ctx, LCII_pcounter_handle_##name, val);
+#define LCII_PCOUNTER_START(name) \
+  LCT_pcounter_start(LCII_pcounter_ctx, LCII_pcounter_handle_##name);
+#define LCII_PCOUNTER_END(name) \
+  LCT_pcounter_end(LCII_pcounter_ctx, LCII_pcounter_handle_##name);
+#define LCII_PCOUNTER_STARTT(name, time) \
+  LCT_pcounter_startt(LCII_pcounter_ctx, LCII_pcounter_handle_##name, time);
+#define LCII_PCOUNTER_ENDT(name, time) \
+  LCT_pcounter_endt(LCII_pcounter_ctx, LCII_pcounter_handle_##name, time);
+#else
+#define LCII_PCOUNTER_ADD(name, val)
+#define LCII_PCOUNTER_START(name)
+#define LCII_PCOUNTER_END(name)
+#define LCII_PCOUNTER_STARTT(name, time)
+#define LCII_PCOUNTER_ENDT(name, time)
+#endif
+
+// Set up (init) and tear down (fina) the performance counters. Both are
+// no-ops unless LCI_USE_PERFORMANCE_COUNTER is defined.
+void LCII_pcounters_init(void);
+void LCII_pcounters_fina(void);
+
+#endif  // LCI_PERFORMANCE_COUNTER_H
diff --git a/src/runtime/1sided_primitive.c b/lci/runtime/1sided_primitive.c
similarity index 79%
rename from src/runtime/1sided_primitive.c
rename to lci/runtime/1sided_primitive.c
index ba971564..16974ea5 100644
--- a/src/runtime/1sided_primitive.c
+++ b/lci/runtime/1sided_primitive.c
@@ -4,9 +4,9 @@
 LCI_error_t LCI_puts(LCI_endpoint_t ep, LCI_short_t src, int rank,
                      LCI_tag_t tag, uintptr_t remote_completion)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
-  LCM_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
+  LCI_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
                  "Only support default remote completion "
                  "(set by LCI_plist_set_default_comp, "
                  "the default value is LCI_UR_CQ)\n");
@@ -14,13 +14,9 @@ LCI_error_t LCI_puts(LCI_endpoint_t ep, LCI_short_t src, int rank,
       ep->device->endpoint_worker.endpoint, rank, &src, sizeof(LCI_short_t),
       LCII_MAKE_PROTO(ep->gid, LCI_MSG_RDMA_SHORT, tag));
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    LCII_PCOUNTER_ADD(put, sizeof(LCI_short_t));
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_puts(ep %p, rank %d, tag %u, remote_completion %p) -> %d\n",
               ep, rank, tag, (void*)remote_completion, ret);
   return ret;
@@ -42,12 +38,12 @@ LCI_error_t LCI_putm(LCI_endpoint_t ep, LCI_mbuffer_t mbuffer, int rank,
 LCI_error_t LCI_putma(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
                       LCI_tag_t tag, uintptr_t remote_completion)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
-  LCM_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
+  LCI_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
                  "buffer is too large %lu (maximum: %d)\n", buffer.length,
                  LCI_MEDIUM_SIZE);
-  LCM_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
+  LCI_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
                  "Only support default remote completion "
                  "(set by LCI_plist_set_default_comp, "
                  "the default value is LCI_UR_CQ)\n");
@@ -62,8 +58,6 @@ LCI_error_t LCI_putma(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
     LCII_packet_t* packet = LCII_alloc_packet_nb(ep->pkpool);
     if (packet == NULL) {
       // no packet is available
-      LCII_PCOUNTERS_WRAPPER(
-          LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_packet++);
       return LCI_ERR_RETRY;
     }
     packet->context.poolid = (buffer.length > LCI_PACKET_RETURN_THRESHOLD)
@@ -73,7 +67,6 @@ LCI_error_t LCI_putma(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
 
     LCII_context_t* ctx = LCIU_malloc(sizeof(LCII_context_t));
     ctx->data.mbuffer.address = (void*)packet->data.address;
-    LCII_PCOUNTERS_WRAPPER(ctx->timer = LCII_ucs_get_time());
     LCII_initilize_comp_attr(ctx->comp_attr);
     LCII_comp_attr_set_msg_type(ctx->comp_attr, LCI_MSG_RDMA_MEDIUM);
     LCII_comp_attr_set_free_packet(ctx->comp_attr, 1);
@@ -88,13 +81,9 @@ LCI_error_t LCI_putma(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
     }
   }
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    LCII_PCOUNTER_ADD(put, (int64_t)buffer.length);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_putm(ep %p, buffer {%p, %lu}, rank %d, tag %u, "
               "remote_completion %p) -> %d\n",
               ep, buffer.address, buffer.length, rank, tag,
@@ -105,12 +94,12 @@ LCI_error_t LCI_putma(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
 LCI_error_t LCI_putmna(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
                        LCI_tag_t tag, uintptr_t remote_completion)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
-  LCM_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
+  LCI_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
                  "buffer is too large %lu (maximum: %d)\n", buffer.length,
                  LCI_MEDIUM_SIZE);
-  LCM_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
+  LCI_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
                  "Only support default remote completion "
                  "(set by LCI_plist_set_default_comp, "
                  "the default value is LCI_UR_CQ)\n");
@@ -120,7 +109,6 @@ LCI_error_t LCI_putmna(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
                                : -1;
 
   LCII_context_t* ctx = LCIU_malloc(sizeof(LCII_context_t));
-  LCII_PCOUNTERS_WRAPPER(ctx->timer = LCII_ucs_get_time());
   ctx->data.mbuffer.address = (void*)packet->data.address;
   LCII_initilize_comp_attr(ctx->comp_attr);
   LCII_comp_attr_set_msg_type(ctx->comp_attr, LCI_MSG_RDMA_MEDIUM);
@@ -134,13 +122,9 @@ LCI_error_t LCI_putmna(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
     LCIU_free(ctx);
   }
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    LCII_PCOUNTER_ADD(put, (int64_t)buffer.length);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_putmna(ep %p, buffer {%p, %lu}, rank %d, tag %u, "
               "remote_completion %p) -> %d\n",
               ep, buffer.address, buffer.length, rank, tag,
@@ -159,22 +143,18 @@ LCI_error_t LCI_putla(LCI_endpoint_t ep, LCI_lbuffer_t buffer,
                       LCI_comp_t completion, int rank, LCI_tag_t tag,
                       uintptr_t remote_completion, void* user_context)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
-  LCM_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
+  LCI_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
                  "Only support default remote completion "
                  "(set by LCI_plist_set_default_comp, "
                  "the default value is LCI_UR_CQ)\n");
   if (!LCII_bq_is_empty(ep->bq_p)) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_bq++);
     return LCI_ERR_RETRY;
   }
   LCII_packet_t* packet = LCII_alloc_packet_nb(ep->pkpool);
   if (packet == NULL) {
     // no packet is available
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_packet++);
     return LCI_ERR_RETRY;
   }
   packet->context.poolid = -1;
@@ -202,7 +182,7 @@ LCI_error_t LCI_putla(LCI_endpoint_t ep, LCI_lbuffer_t buffer,
   packet->data.rts.send_ctx = (uintptr_t)rdv_ctx;
   packet->data.rts.size = buffer.length;
 
-  LCM_DBG_Log(LCM_LOG_DEBUG, "rdv", "send rts: type %d sctx %p size %lu\n",
+  LCI_DBG_Log(LCI_LOG_TRACE, "rdv", "send rts: type %d sctx %p size %lu\n",
               packet->data.rts.msg_type, (void*)packet->data.rts.send_ctx,
               packet->data.rts.size);
   LCI_error_t ret = LCIS_post_send(
@@ -215,13 +195,9 @@ LCI_error_t LCI_putla(LCI_endpoint_t ep, LCI_lbuffer_t buffer,
     LCIU_free(rdv_ctx);
   }
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    LCII_PCOUNTER_ADD(put, (int64_t)buffer.length);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_putla(ep %p, buffer {%p, %lu, %p}, completion %p, rank %d, "
               "tag %u, remote_completion %p, user_context %p) -> %d\n",
               ep, buffer.address, buffer.length, buffer.segment, completion,
@@ -233,41 +209,37 @@ LCI_error_t LCI_putva(LCI_endpoint_t ep, LCI_iovec_t iovec,
                       LCI_comp_t completion, int rank, LCI_tag_t tag,
                       uintptr_t remote_completion, void* user_context)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
-  LCM_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
+  LCI_DBG_Assert(remote_completion == LCI_DEFAULT_COMP_REMOTE,
                  "Only support default remote completion "
                  "(set by LCI_plist_set_default_comp, "
                  "the default value is LCI_UR_CQ)\n");
-  LCM_DBG_Assert(iovec.count > 0, "iovec.count = %d!\n", iovec.count);
-  LCM_DBG_Assert(iovec.count <= LCI_IOVEC_SIZE,
+  LCI_DBG_Assert(iovec.count > 0, "iovec.count = %d!\n", iovec.count);
+  LCI_DBG_Assert(iovec.count <= LCI_IOVEC_SIZE,
                  "iovec.count = %d > "
                  "LCI_IOVEC_SIZE %d!\n",
                  iovec.count, LCI_IOVEC_SIZE);
-  LCM_DBG_Assert(
+  LCI_DBG_Assert(
       iovec.piggy_back.length <= LCI_get_iovec_piggy_back_size(iovec.count),
       "iovec's piggy back is too large! (%lu > %lu)\n", iovec.piggy_back.length,
       LCI_get_iovec_piggy_back_size(iovec.count));
   for (int i = 0; i < iovec.count; ++i) {
-    LCM_DBG_Assert(
+    LCI_DBG_Assert(
         (iovec.lbuffers[0].segment == LCI_SEGMENT_ALL &&
          iovec.lbuffers[i].segment == LCI_SEGMENT_ALL) ||
             (iovec.lbuffers[0].segment != LCI_SEGMENT_ALL &&
              iovec.lbuffers[i].segment != LCI_SEGMENT_ALL),
         "We currently require either all lbuffers to be registers or "
         "all of them are LCI_SEGMENT_ALL\n");
-    LCM_DBG_Assert(iovec.lbuffers[i].length > 0, "Invalid lbuffer length\n");
+    LCI_DBG_Assert(iovec.lbuffers[i].length > 0, "Invalid lbuffer length\n");
   }
   if (!LCII_bq_is_empty(ep->bq_p)) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_bq++);
     return LCI_ERR_RETRY;
   }
   LCII_packet_t* packet = LCII_alloc_packet_nb(ep->pkpool);
   if (packet == NULL) {
     // no packet is available
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_packet++);
     return LCI_ERR_RETRY;
   }
   packet->context.poolid =
@@ -282,7 +254,6 @@ LCI_error_t LCI_putva(LCI_endpoint_t ep, LCI_iovec_t iovec,
   LCII_comp_attr_set_free_packet(rts_ctx->comp_attr, 1);
 
   LCII_context_t* rdv_ctx = LCIU_malloc(sizeof(LCII_context_t));
-  LCII_PCOUNTERS_WRAPPER(rdv_ctx->timer = LCII_ucs_get_time());
   rdv_ctx->data.iovec = iovec;
   rdv_ctx->data_type = LCI_IOVEC;
   rdv_ctx->rank = rank;
@@ -303,7 +274,7 @@ LCI_error_t LCI_putva(LCI_endpoint_t ep, LCI_iovec_t iovec,
   memcpy((void*)&packet->data.rts.size_p[iovec.count], iovec.piggy_back.address,
          iovec.piggy_back.length);
 
-  LCM_DBG_Log(LCM_LOG_DEBUG, "rdv",
+  LCI_DBG_Log(LCI_LOG_TRACE, "rdv",
               "send rts: type %d sctx %p count %d "
               "piggy_back_size %lu\n",
               packet->data.rts.msg_type, (void*)packet->data.rts.send_ctx,
@@ -320,13 +291,13 @@ LCI_error_t LCI_putva(LCI_endpoint_t ep, LCI_iovec_t iovec,
     LCIU_free(rdv_ctx);
   }
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    uint64_t total_length = iovec.piggy_back.length;
+    for (int i = 0; i < iovec.count; ++i) {
+      total_length += iovec.lbuffers[i].length;
+    }
+    LCII_PCOUNTER_ADD(put, (int64_t)total_length);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_putva(ep %p, iovec {{%p, %lu}, %d, %p}, completion %p, rank "
               "%d, tag %u, remote_completion %p, user_context %p) -> %d\n",
               ep, iovec.piggy_back.address, iovec.piggy_back.length,
@@ -337,10 +308,10 @@ LCI_error_t LCI_putva(LCI_endpoint_t ep, LCI_iovec_t iovec,
 
 size_t LCI_get_iovec_piggy_back_size(int count)
 {
-  LCM_DBG_Assert(LCI_MEDIUM_SIZE > 0,
+  LCI_DBG_Assert(LCI_MEDIUM_SIZE > 0,
                  "LCI_MEDIUM_SIZE <=0! You should run "
                  "LCI_initialize() before calling this function\n");
-  LCM_DBG_Assert(LCI_MEDIUM_SIZE - sizeof(struct LCII_packet_rts_t) >=
+  LCI_DBG_Assert(LCI_MEDIUM_SIZE - sizeof(struct LCII_packet_rts_t) >=
                      sizeof(size_t) * count,
                  "Too many lbuffers to send in one iovec!\n");
   return LCI_MEDIUM_SIZE - sizeof(struct LCII_packet_rts_t) -
diff --git a/src/runtime/2sided_primitive.c b/lci/runtime/2sided_primitive.c
similarity index 83%
rename from src/runtime/2sided_primitive.c
rename to lci/runtime/2sided_primitive.c
index a899dc8a..e620dcc7 100644
--- a/src/runtime/2sided_primitive.c
+++ b/lci/runtime/2sided_primitive.c
@@ -4,19 +4,15 @@
 LCI_error_t LCI_sends(LCI_endpoint_t ep, LCI_short_t src, int rank,
                       LCI_tag_t tag)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
   LCI_error_t ret = LCIS_post_sends(
       ep->device->endpoint_worker.endpoint, rank, &src, sizeof(LCI_short_t),
       LCII_MAKE_PROTO(ep->gid, LCI_MSG_SHORT, tag));
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    LCII_PCOUNTER_ADD(send, sizeof(LCI_short_t));
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_sends(ep %p, rank %d, tag %u) -> %d\n", ep, rank, tag, ret);
   return ret;
 }
@@ -24,9 +20,9 @@ LCI_error_t LCI_sends(LCI_endpoint_t ep, LCI_short_t src, int rank,
 LCI_error_t LCI_sendm(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
                       LCI_tag_t tag)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
-  LCM_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
+  LCI_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
                  "buffer is too large %lu (maximum: %d)\n", buffer.length,
                  LCI_MEDIUM_SIZE);
   LCI_error_t ret = LCI_OK;
@@ -40,8 +36,6 @@ LCI_error_t LCI_sendm(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
     LCII_packet_t* packet = LCII_alloc_packet_nb(ep->pkpool);
     if (packet == NULL) {
       // no packet is available
-      LCII_PCOUNTERS_WRAPPER(
-          LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_packet++);
       return LCI_ERR_RETRY;
     }
     packet->context.poolid = (buffer.length > LCI_PACKET_RETURN_THRESHOLD)
@@ -65,13 +59,9 @@ LCI_error_t LCI_sendm(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
     }
   }
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    LCII_PCOUNTER_ADD(send, (int64_t)buffer.length);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_sendm(ep %p, buffer {%p, %lu}, rank %d, tag %u) -> %d\n", ep,
               buffer.address, buffer.length, rank, tag, ret);
   return ret;
@@ -80,9 +70,9 @@ LCI_error_t LCI_sendm(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
 LCI_error_t LCI_sendmn(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
                        LCI_tag_t tag)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
-  LCM_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
+  LCI_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
                  "buffer is too large %lu (maximum: %d)\n", buffer.length,
                  LCI_MEDIUM_SIZE);
   LCII_packet_t* packet = LCII_mbuffer2packet(buffer);
@@ -104,13 +94,9 @@ LCI_error_t LCI_sendmn(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
     LCIU_free(ctx);
   }
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    LCII_PCOUNTER_ADD(send, (int64_t)buffer.length);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_sendmn(ep %p, buffer {%p, %lu}, rank %d, tag %u) -> %d\n",
               ep, buffer.address, buffer.length, rank, tag, ret);
   return ret;
@@ -119,18 +105,14 @@ LCI_error_t LCI_sendmn(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
 LCI_error_t LCI_sendl(LCI_endpoint_t ep, LCI_lbuffer_t buffer, int rank,
                       LCI_tag_t tag, LCI_comp_t completion, void* user_context)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
   if (!LCII_bq_is_empty(ep->bq_p)) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_bq++);
     return LCI_ERR_RETRY;
   }
   LCII_packet_t* packet = LCII_alloc_packet_nb(ep->pkpool);
   if (packet == NULL) {
     // no packet is available
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_packet++);
     return LCI_ERR_RETRY;
   }
   packet->context.poolid = -1;
@@ -168,13 +150,9 @@ LCI_error_t LCI_sendl(LCI_endpoint_t ep, LCI_lbuffer_t buffer, int rank,
     LCIU_free(rdv_ctx);
   }
   if (ret == LCI_OK) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_succeeded++);
-  } else {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].send_lci_failed_backend++);
+    LCII_PCOUNTER_ADD(send, (int64_t)buffer.length);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_sendl(ep %p, buffer {%p, %lu, %p}, rank %d, tag %u, "
               "completion %p, user_context %p) -> %d\n",
               ep, buffer.address, buffer.length, buffer.segment, rank, tag,
@@ -185,7 +163,7 @@ LCI_error_t LCI_sendl(LCI_endpoint_t ep, LCI_lbuffer_t buffer, int rank,
 LCI_error_t LCI_recvs(LCI_endpoint_t ep, int rank, LCI_tag_t tag,
                       LCI_comp_t completion, void* user_context)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
   LCII_context_t* ctx = LCIU_malloc(sizeof(LCII_context_t));
   ctx->data_type = LCI_IMMEDIATE;
@@ -207,7 +185,8 @@ LCI_error_t LCI_recvs(LCI_endpoint_t ep, int rank, LCI_tag_t tag,
     LCII_free_packet(packet);
     lc_ce_dispatch(ctx);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCII_PCOUNTER_ADD(recv, 1);
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_recvs(ep %p, rank %d, tag %u, completion %p, user_context "
               "%p) -> %d\n",
               ep, rank, tag, completion, user_context, LCI_OK);
@@ -217,9 +196,9 @@ LCI_error_t LCI_recvs(LCI_endpoint_t ep, int rank, LCI_tag_t tag,
 LCI_error_t LCI_recvm(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
                       LCI_tag_t tag, LCI_comp_t completion, void* user_context)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
-  LCM_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
+  LCI_DBG_Assert(buffer.length <= LCI_MEDIUM_SIZE,
                  "buffer is too large %lu (maximum: %d)\n", buffer.length,
                  LCI_MEDIUM_SIZE);
   LCII_context_t* ctx = LCIU_malloc(sizeof(LCII_context_t));
@@ -246,7 +225,8 @@ LCI_error_t LCI_recvm(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
     LCII_free_packet(packet);
     lc_ce_dispatch(ctx);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCII_PCOUNTER_ADD(recv, 1);
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_recvm(ep %p, buffer {%p, %lu}, rank %d, tag %u, completion "
               "%p, user_context %p) -> %d\n",
               ep, buffer.address, buffer.length, rank, tag, completion,
@@ -257,7 +237,7 @@ LCI_error_t LCI_recvm(LCI_endpoint_t ep, LCI_mbuffer_t buffer, int rank,
 LCI_error_t LCI_recvmn(LCI_endpoint_t ep, int rank, LCI_tag_t tag,
                        LCI_comp_t completion, void* user_context)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
   LCII_context_t* ctx = LCIU_malloc(sizeof(LCII_context_t));
   ctx->data.mbuffer.address = NULL;
@@ -281,7 +261,8 @@ LCI_error_t LCI_recvmn(LCI_endpoint_t ep, int rank, LCI_tag_t tag,
     ctx->data.mbuffer.address = packet->data.address;
     lc_ce_dispatch(ctx);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCII_PCOUNTER_ADD(recv, 1);
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_recvmn(ep %p, rank %d, tag %u, completion %p, user_context "
               "%p) -> %d\n",
               ep, rank, tag, completion, user_context, LCI_OK);
@@ -291,7 +272,7 @@ LCI_error_t LCI_recvmn(LCI_endpoint_t ep, int rank, LCI_tag_t tag,
 LCI_error_t LCI_recvl(LCI_endpoint_t ep, LCI_lbuffer_t buffer, int rank,
                       LCI_tag_t tag, LCI_comp_t completion, void* user_context)
 {
-  LCM_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
+  LCI_DBG_Assert(tag <= LCI_MAX_TAG, "tag %d is too large (maximum: %d)\n", tag,
                  LCI_MAX_TAG);
   LCII_context_t* rdv_ctx = LCIU_malloc(sizeof(LCII_context_t));
   rdv_ctx->data.lbuffer = buffer;
@@ -314,7 +295,8 @@ LCI_error_t LCI_recvl(LCI_endpoint_t ep, LCI_lbuffer_t buffer, int rank,
     LCII_packet_t* p = (LCII_packet_t*)value;
     LCII_handle_2sided_rts(ep, p, rdv_ctx, false);
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "comm",
+  LCII_PCOUNTER_ADD(recv, 1);
+  LCI_DBG_Log(LCI_LOG_TRACE, "comm",
               "LCI_recvl(ep %p, buffer {%p, %lu, %p}, rank %d, tag %u, "
               "completion %p, user_context %p) -> %d\n",
               ep, buffer.address, buffer.length, buffer.segment, rank, tag,
diff --git a/src/runtime/backlog_queue.h b/lci/runtime/backlog_queue.h
similarity index 87%
rename from src/runtime/backlog_queue.h
rename to lci/runtime/backlog_queue.h
index 2a14b8c4..06a9c58f 100644
--- a/src/runtime/backlog_queue.h
+++ b/lci/runtime/backlog_queue.h
@@ -41,7 +41,7 @@ static inline void LCII_bq_init(LCII_backlog_queue_t* bq_p)
 static inline void LCII_bq_fini(LCII_backlog_queue_t* bq_p)
 {
   if (bq_p->length != 0) {
-    LCM_Warn("There are still %d pending entries in the backlog queue\n",
+    LCI_Warn("There are still %d pending entries in the backlog queue\n",
              bq_p->length);
     LCII_bq_entry_t* p = bq_p->head;
     LCII_bq_entry_t* q;
@@ -53,7 +53,7 @@ static inline void LCII_bq_fini(LCII_backlog_queue_t* bq_p)
     }
     bq_p->head = NULL;
     bq_p->tail = NULL;
-    LCM_Assert(bq_p->length == 0, "backlog queue is in an incosistent state\n");
+    LCI_Assert(bq_p->length == 0, "backlog queue is in an incosistent state\n");
   }
 }
 
@@ -62,11 +62,6 @@ static inline void LCII_bq_push(LCII_backlog_queue_t* bq_p,
                                 LCII_bq_entry_t* entry)
 {
   ++bq_p->length;
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-  LCII_pcounters[LCIU_get_thread_id()].backlog_queue_total_count++;
-  LCIU_MAX_ASSIGN(LCII_pcounters[LCIU_get_thread_id()].backlog_queue_max_len,
-                  bq_p->length);
-#endif
   entry->next = NULL;
   if (bq_p->head == NULL) {
     bq_p->head = entry;
@@ -74,6 +69,7 @@ static inline void LCII_bq_push(LCII_backlog_queue_t* bq_p,
     bq_p->tail->next = entry;
   }
   bq_p->tail = entry;
+  LCII_PCOUNTER_ADD(backlog_queue_push, 1);
 }
 
 // this could be called by all threads
@@ -101,6 +97,7 @@ static inline LCII_bq_entry_t* LCII_bq_pop(LCII_backlog_queue_t* bq_p)
   }
   bq_p->head = bq_p->head->next;
   ret->next = NULL;
+  LCII_PCOUNTER_ADD(backlog_queue_pop, 1);
   return ret;
 }
 
diff --git a/src/runtime/completion/amhandler.c b/lci/runtime/completion/amhandler.c
similarity index 100%
rename from src/runtime/completion/amhandler.c
rename to lci/runtime/completion/amhandler.c
diff --git a/lci/runtime/completion/cq.c b/lci/runtime/completion/cq.c
new file mode 100644
index 00000000..ed711c1f
--- /dev/null
+++ b/lci/runtime/completion/cq.c
@@ -0,0 +1,127 @@
+#include "runtime/lcii.h"
+
+LCT_queue_type_t cq_type;
+
+void LCII_env_init_cq_type()
+{
+  char* requested = getenv("LCI_CQ_TYPE");  // NULL if the variable is unset
+  LCT_dict_str_int_t dict[] = {
+      {NULL, LCT_QUEUE_ARRAY_ATOMIC_FAA},
+      {"array_atomic_faa", LCT_QUEUE_ARRAY_ATOMIC_FAA},
+      {"array_atomic_cas", LCT_QUEUE_ARRAY_ATOMIC_CAS},
+      {"array_atomic_basic", LCT_QUEUE_ARRAY_ATOMIC_BASIC},
+      {"array_mutex", LCT_QUEUE_ARRAY_MUTEX},
+      {"std_mutex", LCT_QUEUE_STD_MUTEX},
+  };
+  bool succeed = LCT_str_int_search(dict, sizeof(dict) / sizeof(dict[0]),
+                                    requested, LCT_QUEUE_ARRAY_ATOMIC_FAA,
+                                    (int*)&cq_type);
+  if (!succeed) {
+    LCI_Warn("Unknown LCI_CQ_TYPE %s. Use the default type: array_atomic_faa\n",
+             requested ? requested : "(unset)");  // %s takes a string
+  }
+  LCI_Log(LCI_LOG_INFO, "comp", "Set LCI_CQ_TYPE to %d\n", cq_type);
+}
+
+LCI_error_t LCI_queue_create(LCI_device_t device, LCI_comp_t* cq)
+{
+#ifdef LCI_USE_INLINE_CQ  // inline build: the LCII_cq_t is allocated here
+  LCII_cq_t* cq_ptr = LCIU_malloc(sizeof(LCII_cq_t));
+  LCM_aqueue_init(cq_ptr, LCI_DEFAULT_QUEUE_LENGTH);
+  *cq = cq_ptr;
+#else  // otherwise: LCT queue of the type stored in cq_type
+  *cq = LCT_queue_alloc(cq_type, LCI_DEFAULT_QUEUE_LENGTH);
+#endif
+  return LCI_OK;  // `device` is not used by either branch
+}
+
+LCI_error_t LCI_queue_free(LCI_comp_t* cq_ptr)
+{
+#ifdef LCI_USE_INLINE_CQ
+  LCM_aqueue_fina(*cq_ptr);
+  LCIU_free(*cq_ptr);
+  *cq_ptr = NULL;  // clear the caller's handle after freeing it
+#else
+  LCT_queue_free((LCT_queue_t*)cq_ptr);  // takes the handle's address
+#endif
+  return LCI_OK;
+}
+
+LCI_error_t LCI_queue_pop(LCI_comp_t cq, LCI_request_t* request)
+{
+#ifdef LCI_USE_INLINE_CQ
+  LCII_context_t* ctx = LCM_aqueue_pop(cq);
+#else
+  LCII_context_t* ctx = LCT_queue_pop(cq);
+#endif
+  if (ctx == NULL) return LCI_ERR_RETRY;  // nothing dequeued; *request untouched
+  *request = LCII_ctx2req(ctx);
+  LCII_PCOUNTER_ADD(comp_consume, 1);  // reached only after a successful pop
+  return LCI_OK;
+}
+
+LCI_error_t LCI_queue_wait(LCI_comp_t cq, LCI_request_t* request)
+{
+  LCII_context_t* ctx = NULL;
+  while (ctx == NULL) {  // spin until a pop returns a non-NULL context
+#ifdef LCI_USE_INLINE_CQ
+    ctx = LCM_aqueue_pop((LCII_cq_t*)cq);
+#else
+    ctx = LCT_queue_pop(cq);
+#endif
+  }
+  *request = LCII_ctx2req(ctx);
+  LCII_PCOUNTER_ADD(comp_consume, 1);
+  return LCI_OK;  // the only exit path; no retry code is ever returned
+}
+
+// Pop up to request_count completed requests from cq without blocking.
+// Stops at the first empty pop; the number of entries stored in requests is
+// written to *return_count. Always returns LCI_OK.
+LCI_error_t LCI_queue_pop_multiple(LCI_comp_t cq, size_t request_count,
+                                   LCI_request_t* requests,
+                                   size_t* return_count)
+{
+  size_t count = 0;
+  LCII_context_t* ctx;
+  while (count < request_count) {
+#ifdef LCI_USE_INLINE_CQ
+    ctx = LCM_aqueue_pop(cq);
+#else
+    ctx = LCT_queue_pop(cq);
+#endif
+    if (ctx == NULL) break;
+    requests[count++] = LCII_ctx2req(ctx);
+  }
+  *return_count = count;
+  // Account only for the completions actually handed to the caller.
+  LCII_PCOUNTER_ADD(comp_consume, (int64_t)count);
+  return LCI_OK;
+}
+
+// Busy-poll cq until request_count completed requests have been popped and
+// stored into requests[0 .. request_count-1]. Never returns early; always
+// returns LCI_OK.
+LCI_error_t LCI_queue_wait_multiple(LCI_comp_t cq, size_t request_count,
+                                    LCI_request_t* requests)
+{
+  size_t count = 0;
+  LCII_context_t* ctx;
+  while (count < request_count) {
+#ifdef LCI_USE_INLINE_CQ
+    ctx = LCM_aqueue_pop(cq);
+#else
+    ctx = LCT_queue_pop(cq);
+#endif
+    // A NULL pop means nothing was dequeued this round; poll again.
+    if (ctx == NULL) continue;
+    requests[count++] = LCII_ctx2req(ctx);
+  }
+  LCII_PCOUNTER_ADD(comp_consume, (int64_t)request_count);
+  return LCI_OK;
+}
+
+LCI_error_t LCI_queue_len(LCI_comp_t cq, size_t* len)
+{
+  return LCI_ERR_FEATURE_NA;  // cq is not read and *len is not written
+}
\ No newline at end of file
diff --git a/lci/runtime/completion/cq.h b/lci/runtime/completion/cq.h
new file mode 100644
index 00000000..e7e434e1
--- /dev/null
+++ b/lci/runtime/completion/cq.h
@@ -0,0 +1,20 @@
+#ifndef LC_CQ_H
+#define LC_CQ_H
+
+#ifdef LCI_USE_INLINE_CQ
+typedef LCM_aqueue_t LCII_cq_t;
+#endif
+
+static inline void LCII_queue_push(LCI_comp_t cq, LCII_context_t* ctx)
+{
+  LCII_PCOUNTER_START(cq_push_timer);  // START/END enclose only the push call
+#ifdef LCI_USE_INLINE_CQ
+  LCM_aqueue_push(cq, ctx);
+#else
+  LCT_queue_push(cq, ctx);
+#endif
+  LCII_PCOUNTER_END(cq_push_timer);
+  LCII_PCOUNTER_ADD(comp_produce, 1);  // one context pushed per call
+}
+
+#endif  // LC_CQ_H
diff --git a/src/runtime/completion/sync_flag.c b/lci/runtime/completion/sync_flag.c
similarity index 85%
rename from src/runtime/completion/sync_flag.c
rename to lci/runtime/completion/sync_flag.c
index 47cd122b..09521657 100644
--- a/src/runtime/completion/sync_flag.c
+++ b/lci/runtime/completion/sync_flag.c
@@ -12,7 +12,7 @@ LCI_error_t LCI_sync_create(LCI_device_t device, int threshold,
 {
   // we don't need device for this simple synchronizer
   (void)device;
-  LCM_DBG_Assert(threshold > 0, "threshold (%d) <= 0!\n", threshold);
+  LCI_DBG_Assert(threshold > 0, "threshold (%d) <= 0!\n", threshold);
   LCII_sync_t* sync = LCIU_malloc(sizeof(LCII_sync_t));
   sync->threshold = threshold;
   atomic_init(&sync->count, 0);
@@ -27,7 +27,7 @@ LCI_error_t LCI_sync_free(LCI_comp_t* completion)
 {
   atomic_thread_fence(LCIU_memory_order_seq_cst);
   LCII_sync_t* sync = *completion;
-  LCM_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n");
+  LCI_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n");
   LCIU_free(sync->ctx);
   LCIU_free(sync);
   *completion = NULL;
@@ -37,16 +37,17 @@ LCI_error_t LCI_sync_free(LCI_comp_t* completion)
 LCI_error_t LCII_sync_signal(LCI_comp_t completion, LCII_context_t* ctx)
 {
   LCII_sync_t* sync = completion;
-  LCM_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n");
+  LCI_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n");
   int pos = 0;
   if (sync->threshold > 1)
     pos = atomic_fetch_add_explicit(&sync->count, 1, LCIU_memory_order_relaxed);
-  LCM_DBG_Assert(pos < sync->threshold, "Receive more signals than expected\n");
+  LCI_DBG_Assert(pos < sync->threshold, "Receive more signals than expected\n");
   sync->ctx[pos] = ctx;
   if (sync->threshold > 1)
     atomic_fetch_add_explicit(&sync->confirm, 1, LCIU_memory_order_release);
   else
     atomic_store_explicit(&sync->confirm, 1, LCIU_memory_order_release);
+  LCII_PCOUNTER_ADD(comp_produce, 1);
   return LCI_OK;
 }
 
@@ -68,7 +69,7 @@ LCI_error_t LCI_sync_signal(LCI_comp_t completion, LCI_request_t request)
 LCI_error_t LCI_sync_wait(LCI_comp_t completion, LCI_request_t request[])
 {
   LCII_sync_t* sync = completion;
-  LCM_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n");
+  LCI_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n");
   while (atomic_load_explicit(&sync->confirm, LCIU_memory_order_acquire) <
          sync->threshold)
     continue;
@@ -83,6 +84,7 @@ LCI_error_t LCI_sync_wait(LCI_comp_t completion, LCI_request_t request[])
   atomic_store_explicit(&sync->confirm, 0, LCIU_memory_order_relaxed);
   if (sync->threshold > 1)
     atomic_store_explicit(&sync->count, 0, LCIU_memory_order_relaxed);
+  LCII_PCOUNTER_ADD(comp_consume, sync->threshold);
   return LCI_OK;
 }
 
@@ -91,12 +93,12 @@ LCI_error_t LCI_sync_wait(LCI_comp_t completion, LCI_request_t request[])
 LCI_error_t LCI_sync_test(LCI_comp_t completion, LCI_request_t request[])
 {
   LCII_sync_t* sync = completion;
-  LCM_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n");
+  LCI_DBG_Assert(sync != NULL, "synchronizer is a NULL pointer!\n");
   if (atomic_load_explicit(&sync->confirm, LCIU_memory_order_acquire) <
       sync->threshold) {
     return LCI_ERR_RETRY;
   } else {
-    LCM_DBG_Assert(sync->confirm == sync->threshold,
+    LCI_DBG_Assert(sync->confirm == sync->threshold,
                    "Receive more signals (%d) than expected (%d)\n",
                    sync->confirm, sync->threshold);
     if (request)
@@ -110,6 +112,7 @@ LCI_error_t LCI_sync_test(LCI_comp_t completion, LCI_request_t request[])
     atomic_store_explicit(&sync->confirm, 0, LCIU_memory_order_relaxed);
     if (sync->threshold > 1)
       atomic_store_explicit(&sync->count, 0, LCIU_memory_order_relaxed);
+    LCII_PCOUNTER_ADD(comp_consume, sync->threshold);
     return LCI_OK;
   }
 }
diff --git a/src/runtime/device.c b/lci/runtime/device.c
similarity index 86%
rename from src/runtime/device.c
rename to lci/runtime/device.c
index 7f5018df..097e8876 100644
--- a/src/runtime/device.c
+++ b/lci/runtime/device.c
@@ -37,20 +37,20 @@ LCI_error_t LCI_device_init(LCI_device_t* device_ptr)
       LCI_CACHE_LINE + (size_t)LCI_SERVER_NUM_PKTS * LCI_PACKET_SIZE;
   LCI_error_t ret =
       LCI_lbuffer_memalign(device, heap_size, LCI_PAGESIZE, &device->heap);
-  LCM_Assert(ret == LCI_OK, "Device heap memory allocation failed\n");
+  LCI_Assert(ret == LCI_OK, "Device heap memory allocation failed\n");
   uintptr_t base_addr = (uintptr_t)device->heap.address;
 
   uintptr_t base_packet;
-  LCM_Assert(sizeof(struct LCII_packet_context) <= LCI_CACHE_LINE,
+  LCI_Assert(sizeof(struct LCII_packet_context) <= LCI_CACHE_LINE,
              "Unexpected packet_context size\n");
   base_packet = base_addr + LCI_CACHE_LINE - sizeof(struct LCII_packet_context);
-  LCM_Assert(LCI_PACKET_SIZE % LCI_CACHE_LINE == 0,
+  LCI_Assert(LCI_PACKET_SIZE % LCI_CACHE_LINE == 0,
              "The size of packets should be a multiple of cache line size\n");
 
   LCII_pool_create(&device->pkpool);
   for (size_t i = 0; i < LCI_SERVER_NUM_PKTS; i++) {
     LCII_packet_t* packet = (LCII_packet_t*)(base_packet + i * LCI_PACKET_SIZE);
-    LCM_Assert(((uint64_t) & (packet->data)) % LCI_CACHE_LINE == 0,
+    LCI_Assert(((uint64_t) & (packet->data)) % LCI_CACHE_LINE == 0,
                "packet.data is not well-aligned\n");
     packet->context.pkpool = device->pkpool;
     packet->context.poolid = 0;
@@ -59,20 +59,19 @@ LCI_error_t LCI_device_init(LCI_device_t* device_ptr)
 #endif
     LCII_pool_put(device->pkpool, packet);
   }
-  device->did_work_consecutive = 0;
-  LCM_Assert(LCI_SERVER_NUM_PKTS > 2 * LCI_SERVER_MAX_RECVS,
+  LCI_Assert(LCI_SERVER_NUM_PKTS > 2 * LCI_SERVER_MAX_RECVS,
              "The packet number is too small!\n");
   LCII_fill_rq(&device->endpoint_progress, true);
   LCII_fill_rq(&device->endpoint_worker, true);
   LCI_barrier();
-  LCM_Log(LCM_LOG_INFO, "device", "device %p initialized\n", device);
+  LCI_Log(LCI_LOG_INFO, "device", "device %p initialized\n", device);
   return LCI_OK;
 }
 
 LCI_error_t LCI_device_free(LCI_device_t* device_ptr)
 {
   LCI_device_t device = *device_ptr;
-  LCM_Log(LCM_LOG_INFO, "device", "free device %p\n", device);
+  LCI_Log(LCI_LOG_INFO, "device", "free device %p\n", device);
   LCI_barrier();
   int total_num = LCII_pool_count(device->pkpool) +
 #ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
@@ -83,7 +82,7 @@ LCI_error_t LCI_device_free(LCI_device_t* device_ptr)
                   device->endpoint_worker.recv_posted;
 #endif
   if (total_num != LCI_SERVER_NUM_PKTS)
-    LCM_Warn("Potentially losing packets %d != %d\n", total_num,
+    LCI_Warn("Potentially losing packets %d != %d\n", total_num,
              LCI_SERVER_NUM_PKTS);
   LCII_matchtable_free(&device->mt);
   LCM_archive_fini(&(device->ctx_archive));
diff --git a/src/runtime/endpoint.c b/lci/runtime/endpoint.c
similarity index 93%
rename from src/runtime/endpoint.c
rename to lci/runtime/endpoint.c
index 85531f2b..cab086a2 100644
--- a/src/runtime/endpoint.c
+++ b/lci/runtime/endpoint.c
@@ -7,7 +7,7 @@ LCI_error_t LCI_endpoint_init(LCI_endpoint_t* ep_ptr, LCI_device_t device,
 {
   static int num_endpoints = 0;
   LCI_endpoint_t ep = LCIU_malloc(sizeof(struct LCI_endpoint_s));
-  LCM_Assert(num_endpoints < LCI_MAX_ENDPOINTS, "Too many endpoints!\n");
+  LCI_Assert(num_endpoints < LCI_MAX_ENDPOINTS, "Too many endpoints!\n");
   ep->gid = num_endpoints++;
   LCI_ENDPOINTS[ep->gid] = ep;
   *ep_ptr = ep;
diff --git a/src/runtime/env.c b/lci/runtime/env.c
similarity index 97%
rename from src/runtime/env.c
rename to lci/runtime/env.c
index 2d17bd34..d8caa67f 100644
--- a/src/runtime/env.c
+++ b/lci/runtime/env.c
@@ -28,6 +28,8 @@ LCI_API LCI_device_t LCI_UR_DEVICE;
 LCI_API LCI_endpoint_t LCI_UR_ENDPOINT;
 LCI_API LCI_comp_t LCI_UR_CQ;
 
+void LCII_env_init_cq_type();
+
 void LCII_env_init(int num_proc, int rank)
 {
   LCI_PAGESIZE = sysconf(_SC_PAGESIZE);
@@ -43,13 +45,13 @@ void LCII_env_init(int num_proc, int rank)
   LCI_TOUCH_LBUFFER = LCIU_getenv_or("LCI_TOUCH_LBUFFER", 0);
   LCI_USE_DREG = LCIU_getenv_or("LCI_USE_DREG", LCI_USE_DREG_DEFAULT);
   if (LCI_USE_DREG && LCI_IBV_USE_ODP == 2) {
-    LCM_Warn(
+    LCI_Warn(
         "It doesn't make too much sense to use registration cache "
         "with implicit on-demand paging\n");
   }
   LCI_IBV_USE_PREFETCH = LCIU_getenv_or("LCI_IBV_USE_PREFETCH", 0);
   if (LCI_IBV_USE_PREFETCH && LCI_IBV_USE_ODP == 0) {
-    LCM_Warn(
+    LCI_Warn(
         "It doesn't make too much sense to use prefetch "
         "without on-demand paging\n");
   }
@@ -82,4 +84,6 @@ void LCII_env_init(int num_proc, int rank)
                (LCI_PACKET_SIZE - sizeof(struct LCII_packet_context) -
                 sizeof(struct LCII_packet_rtr_t)) /
                    sizeof(struct LCII_packet_rtr_iovec_info_t));
+
+  LCII_env_init_cq_type();
 }
\ No newline at end of file
diff --git a/src/runtime/lci.c b/lci/runtime/lci.c
similarity index 84%
rename from src/runtime/lci.c
rename to lci/runtime/lci.c
index 8deeb6aa..b9af1cfa 100644
--- a/src/runtime/lci.c
+++ b/lci/runtime/lci.c
@@ -8,16 +8,17 @@ __thread unsigned int LCIU_rand_seed = 0;
 
 LCI_error_t LCI_initialize()
 {
+  LCT_init();
+  LCII_log_init();
+  LCII_pcounters_init();
+  // Initialize PMI.
   int num_proc, rank;
-  // Initialize processes in this job.
   lcm_pm_initialize();
   rank = lcm_pm_get_rank();
   num_proc = lcm_pm_get_size();
-  LCM_Init(rank);
+  LCT_set_rank(rank);
   // Set some constant from environment variable.
   LCII_env_init(num_proc, rank);
-  LCII_pcounters_init();
-  LCII_monitor_thread_init();
   LCII_papi_init();
   if (LCI_USE_DREG) {
     LCII_ucs_init();
@@ -30,12 +31,12 @@ LCI_error_t LCI_initialize()
   LCI_plist_create(&plist);
   LCI_endpoint_init(&LCI_UR_ENDPOINT, LCI_UR_DEVICE, plist);
   LCI_plist_free(&plist);
-  LCM_DBG_Warn(
+  LCI_DBG_Warn(
       "Macro LCI_DEBUG is defined. Running in low-performance debug mode!\n");
 
   opened = 1;
   LCI_barrier();
-  LCM_Log(LCM_LOG_INFO, "device", "LCI_initialize is called\n");
+  LCI_Log(LCI_LOG_INFO, "device", "LCI_initialize is called\n");
   return LCI_OK;
 }
 
@@ -47,22 +48,19 @@ LCI_error_t LCI_initialized(int* flag)
 
 LCI_error_t LCI_finalize()
 {
-  LCM_Log(LCM_LOG_INFO, "device", "LCI_finalize is called\n");
+  LCI_Log(LCI_LOG_INFO, "device", "LCI_finalize is called\n");
   LCI_barrier();
   LCII_papi_fina();
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-  LCM_Log(LCM_LOG_TRACE, "monitor", "\nPerformance counters:\n%s",
-          LCII_pcounters_to_string(LCII_pcounters_accumulate()));
-#endif
   LCI_endpoint_free(&LCI_UR_ENDPOINT);
   LCI_queue_free(&LCI_UR_CQ);
   LCI_device_free(&LCI_UR_DEVICE);
   if (LCI_USE_DREG) {
     LCII_ucs_cleanup();
   }
-  LCII_monitor_thread_fina();
-  LCM_Fina();
   lcm_pm_finalize();
+  LCII_pcounters_fina();
+  LCII_log_fina();
+  LCT_fina();
 
   opened = 0;
   return LCI_OK;
@@ -84,7 +82,7 @@ LCI_error_t LCII_barrier()
     LCI_plist_free(&plist);
   }
   LCI_tag_t tag = next_tag++;
-  LCM_Log(LCM_LOG_INFO, "coll", "Start barrier (%d, %p).\n", tag, ep);
+  LCI_Log(LCI_LOG_INFO, "coll", "Start barrier (%d, %p).\n", tag, ep);
   LCI_mbuffer_t buffer;
   int nonsense;
   buffer.address = &nonsense;
@@ -122,7 +120,7 @@ LCI_error_t LCII_barrier()
         LCI_progress(LCI_UR_DEVICE);
     }
   }
-  LCM_Log(LCM_LOG_INFO, "coll", "End barrier (%d, %p).\n", tag, ep);
+  LCI_Log(LCI_LOG_INFO, "coll", "End barrier (%d, %p).\n", tag, ep);
   return LCI_OK;
 }
 
diff --git a/src/runtime/lcii.h b/lci/runtime/lcii.h
similarity index 96%
rename from src/runtime/lcii.h
rename to lci/runtime/lcii.h
index 9773d7f3..9d2acbb9 100644
--- a/src/runtime/lcii.h
+++ b/lci/runtime/lcii.h
@@ -4,7 +4,7 @@
 #include "lcii_config.h"
 #include "lci.h"
 #include "lci_ucx_api.h"
-#include "log/lcm_log.h"
+#include "log/logger.h"
 #include "sys/lciu_misc.h"
 #include "sys/lciu_atomic.h"
 #include "sys/lciu_spinlock.h"
@@ -75,9 +75,6 @@ struct __attribute__((aligned(LCI_CACHE_LINE))) LCI_device_s {
   LCIU_CACHE_PADDING(sizeof(LCIS_server_t) + 2 * sizeof(LCIS_endpoint_t) -
                      sizeof(LCII_pool_t*) + sizeof(LCI_matchtable_t) -
                      sizeof(LCII_rcache_t*) + sizeof(LCI_lbuffer_t));
-  // the following will be changed locally by a progress thread
-  uint64_t did_work_consecutive;  // for performance counter
-  LCIU_CACHE_PADDING(sizeof(uint64_t));
   // the following is shared by both progress threads and worker threads
   LCM_archive_t ctx_archive;  // used for long message protocol
   LCIU_CACHE_PADDING(sizeof(LCM_archive_t));
@@ -238,10 +235,6 @@ static inline void LCII_handle_2sided_rtr(LCI_endpoint_t ep,
 static inline void LCII_handle_2sided_writeImm(LCI_endpoint_t ep,
                                                uint64_t ctx_key);
 
-// monitor thread
-void LCII_monitor_thread_init();
-void LCII_monitor_thread_fina();
-
 #include "runtime/completion/cq.h"
 #include "runtime/matchtable/matchtable.h"
 #include "packet_pool.h"
diff --git a/src/runtime/matchtable/matchtable.c b/lci/runtime/matchtable/matchtable.c
similarity index 69%
rename from src/runtime/matchtable/matchtable.c
rename to lci/runtime/matchtable/matchtable.c
index 69ea1814..0a7ac983 100644
--- a/src/runtime/matchtable/matchtable.c
+++ b/lci/runtime/matchtable/matchtable.c
@@ -14,20 +14,20 @@ void initialize_ops()
     p = LCI_MT_BACKEND_DEFAULT;
   }
   if (strcmp(p, "hashqueue") == 0 || strcmp(p, "HASHQUEUE") == 0) {
-    LCM_Log(LCM_LOG_INFO, "mt",
+    LCI_Log(LCI_LOG_INFO, "mt",
             "Use `hash queue` as the matching table backend.\n");
     LCII_matchtable_hashqueue_setup_ops(&LCII_matchtable_ops);
   } else if (strcmp(p, "hash") == 0 || strcmp(p, "HASH") == 0) {
-    LCM_Log(LCM_LOG_INFO, "mt", "Use `hash` as the matching table backend.\n");
+    LCI_Log(LCI_LOG_INFO, "mt", "Use `hash` as the matching table backend.\n");
     LCII_matchtable_hash_setup_ops(&LCII_matchtable_ops);
   } else if (strcmp(p, "queue") == 0 || strcmp(p, "QUEUE") == 0) {
-    LCM_Log(LCM_LOG_INFO, "mt", "Use `queue` as the matching table backend.\n");
+    LCI_Log(LCI_LOG_INFO, "mt", "Use `queue` as the matching table backend.\n");
     LCII_matchtable_queue_setup_ops(&LCII_matchtable_ops);
   } else
-    LCM_Log_default(LCM_LOG_WARN,
-                    "unknown env LCI_MT_BACKEND (%s against "
-                    "hash|queue|hashqueue). use the default hash.\n",
-                    p);
+    LCI_Warn(
+        "unknown env LCI_MT_BACKEND (%s against "
+        "hash|queue|hashqueue). use the default hash.\n",
+        p);
   initialized = true;
 }
 
@@ -41,7 +41,7 @@ void LCII_matchtable_create(LCI_matchtable_t* mt_p)
 
 void LCII_matchtable_free(LCI_matchtable_t* mt_p)
 {
-  LCM_Assert(initialized, "\n");
+  LCI_Assert(initialized, "\n");
   LCII_matchtable_ops.free(mt_p);
 }
 
@@ -49,6 +49,14 @@ LCI_error_t LCII_matchtable_insert(LCI_matchtable_t mt, uint64_t key,
                                    uint64_t* value,
                                    enum LCII_matchtable_insert_type type)
 {
-  LCM_Assert(initialized, "\n");
-  return LCII_matchtable_ops.insert(mt, key, value, type);
+  LCI_Assert(initialized, "\n");
+  LCI_error_t ret = LCII_matchtable_ops.insert(mt, key, value, type);
+  if (ret == LCI_OK) {
+    if (type == LCII_MATCHTABLE_RECV) {
+      LCII_PCOUNTER_ADD(unexpected_msg, 1);
+    } else {
+      LCII_PCOUNTER_ADD(expected_msg, 1);
+    }
+  }
+  return ret;
 }
\ No newline at end of file
diff --git a/src/runtime/matchtable/matchtable.h b/lci/runtime/matchtable/matchtable.h
similarity index 100%
rename from src/runtime/matchtable/matchtable.h
rename to lci/runtime/matchtable/matchtable.h
diff --git a/src/runtime/matchtable/matchtable_hash.c b/lci/runtime/matchtable/matchtable_hash.c
similarity index 93%
rename from src/runtime/matchtable/matchtable_hash.c
rename to lci/runtime/matchtable/matchtable_hash.c
index 6bbd9588..ce6b8075 100644
--- a/src/runtime/matchtable/matchtable_hash.c
+++ b/lci/runtime/matchtable/matchtable_hash.c
@@ -156,21 +156,13 @@ LCI_error_t LCII_matchtable_hash_insert(LCI_matchtable_t mt, uint64_t key,
       }
     }
   }
-  LCII_PCOUNTERS_WRAPPER(
-      LCII_pcounters[LCIU_get_thread_id()].hashtable_insert_num++);
-  LCII_PCOUNTERS_WRAPPER(
-      LCII_pcounters[LCIU_get_thread_id()].hashtable_walk_steps_total +=
-      checked_slot);
-  LCII_PCOUNTERS_WRAPPER(LCIU_MAX_ASSIGN(
-      LCII_pcounters[LCIU_get_thread_id()].hashtable_walk_steps_max,
-      checked_slot));
-  LCM_DBG_Assert(!(ret == LCI_OK && cannot_found),
+  LCI_DBG_Assert(!(ret == LCI_OK && cannot_found),
                  "Unexpected result! Something is wrong!\n");
   if (ret == LCI_ERR_RETRY) {
     empty_hentry->entry.tag = (key << 1) | type;
     empty_hentry->entry.val = *value;
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "matchtable", "insert (%lx, %p, %d), return 1\n",
+  LCI_DBG_Log(LCI_LOG_TRACE, "matchtable", "insert (%lx, %p, %d), return 1\n",
               key, value, type);
   LCIU_release_spinlock(&master->control.lock);
   return ret;
diff --git a/src/runtime/matchtable/matchtable_hashqueue.c b/lci/runtime/matchtable/matchtable_hashqueue.c
similarity index 95%
rename from src/runtime/matchtable/matchtable_hashqueue.c
rename to lci/runtime/matchtable/matchtable_hashqueue.c
index 80c9f296..2c5eafb3 100644
--- a/src/runtime/matchtable/matchtable_hashqueue.c
+++ b/lci/runtime/matchtable/matchtable_hashqueue.c
@@ -180,13 +180,13 @@ static LCI_error_t LCII_matchtable_hashqueue_insert(
         }
       }
     }
-    LCM_DBG_Assert(ret == LCI_OK, "This header node is empty!\n");
+    LCI_DBG_Assert(ret == LCI_OK, "This header node is empty!\n");
     if (!at_least_one_nonempty) {
       // Remove the head node
       node_t* tmp = target_queue->head;
       target_queue->head = target_queue->head->next;
       if (target_queue->head == NULL) {
-        LCM_DBG_Assert(target_queue->tail == tmp, "\n");
+        LCI_DBG_Assert(target_queue->tail == tmp, "\n");
         target_queue->tail = NULL;
         target_queue->tag = TABLE_TAG_EMPTY;
       }
@@ -196,7 +196,7 @@ static LCI_error_t LCII_matchtable_hashqueue_insert(
     // didn't find the target queue
     if (same_type_queue) {
       // Just append an entry to the queue
-      LCM_DBG_Assert(same_type_queue->head && same_type_queue->tail, "\n");
+      LCI_DBG_Assert(same_type_queue->head && same_type_queue->tail, "\n");
       if (same_type_queue->tail->values[NODE_VALUES_WIDTH - 1] !=
           QUEUE_VAL_EMPTY) {
         // push this entry to a new linked node
@@ -214,7 +214,7 @@ static LCI_error_t LCII_matchtable_hashqueue_insert(
             break;
           }
         }
-        LCM_DBG_Assert(
+        LCI_DBG_Assert(
             same_type_queue->tail->values[empty_slot] == QUEUE_VAL_EMPTY, "\n");
         same_type_queue->tail->values[empty_slot] = *val;
       }
@@ -224,7 +224,7 @@ static LCI_error_t LCII_matchtable_hashqueue_insert(
         // Create a new bucket.
         bucket_t* new_bucket = LCIU_malloc(sizeof(bucket_t));
         initialize_bucket(new_bucket);
-        LCM_DBG_Assert(current_bucket == NULL, "\n");
+        LCI_DBG_Assert(current_bucket == NULL, "\n");
         previous_bucket->control.next = new_bucket;
         empty_queue = &new_bucket->queues[0];
       }
@@ -238,10 +238,10 @@ static LCI_error_t LCII_matchtable_hashqueue_insert(
     }
     ret = LCI_ERR_RETRY;
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "matchtable", "insert (%lx, %p, %d) return %d\n",
+  LCI_DBG_Log(LCI_LOG_TRACE, "matchtable", "insert (%lx, %p, %d) return %d\n",
               key, val, type, ret);
   LCIU_release_spinlock(&master->control.lock);
-  LCM_DBG_Assert(ret != LCI_ERR_FATAL, "Unexpected return value!\n");
+  LCI_DBG_Assert(ret != LCI_ERR_FATAL, "Unexpected return value!\n");
   return ret;
 }
 
diff --git a/src/runtime/matchtable/matchtable_queue.c b/lci/runtime/matchtable/matchtable_queue.c
similarity index 96%
rename from src/runtime/matchtable/matchtable_queue.c
rename to lci/runtime/matchtable/matchtable_queue.c
index 603ef447..6bc39436 100644
--- a/src/runtime/matchtable/matchtable_queue.c
+++ b/lci/runtime/matchtable/matchtable_queue.c
@@ -73,7 +73,7 @@ static uint64_t search_queue(queue_t* queue, uint64_t key)
     // Try to compress the list
     if (!previous_node && current_node->count == 0) {
       // remove this node (head node).
-      LCM_DBG_Assert(current_node == queue->head, "%p != %p\n", queue->head,
+      LCI_DBG_Assert(current_node == queue->head, "%p != %p\n", queue->head,
                      current_node);
       queue->head = current_node->next;
       if (queue->head == NULL) queue->tail = NULL;
@@ -146,7 +146,7 @@ static void push_queue(queue_t* queue, uint64_t key, uint64_t val)
           break;
         }
       }
-      LCM_DBG_Assert(queue->tail->entries[empty_slot].key == QUEUE_KEY_EMPTY,
+      LCI_DBG_Assert(queue->tail->entries[empty_slot].key == QUEUE_KEY_EMPTY,
                      "\n");
       queue->tail->entries[empty_slot].key = key;
       queue->tail->entries[empty_slot].val = val;
@@ -211,7 +211,7 @@ static LCI_error_t LCII_matchtable_queue_insert(
       ret = LCI_OK;
     }
   }
-  LCM_DBG_Log(LCM_LOG_DEBUG, "matchtable", "insert (%lx, %p, %d), return %d\n",
+  LCI_DBG_Log(LCI_LOG_TRACE, "matchtable", "insert (%lx, %p, %d), return %d\n",
               key, val, type, ret);
   LCIU_release_spinlock(&table->lock);
   return ret;
diff --git a/src/runtime/memory_registration.c b/lci/runtime/memory_registration.c
similarity index 92%
rename from src/runtime/memory_registration.c
rename to lci/runtime/memory_registration.c
index 8237e31c..41444abc 100644
--- a/src/runtime/memory_registration.c
+++ b/lci/runtime/memory_registration.c
@@ -4,6 +4,7 @@
 LCI_error_t LCI_memory_register(LCI_device_t device, void* address,
                                 size_t length, LCI_segment_t* segment)
 {
+  LCII_PCOUNTER_START(mem_reg_timer);
   LCI_segment_t mr = (LCI_segment_t)LCIU_malloc(sizeof(struct LCII_mr_t));
   if (LCI_USE_DREG)
     LCII_rcache_reg(device, address, length, mr);
@@ -11,12 +12,14 @@ LCI_error_t LCI_memory_register(LCI_device_t device, void* address,
     mr->mr = LCIS_rma_reg(device->server, address, length);
   }
   *segment = mr;
+  LCII_PCOUNTER_END(mem_reg_timer);
   return LCI_OK;
 }
 
 LCI_error_t LCI_memory_deregister(LCI_segment_t* segment)
 {
-  LCM_DBG_Assert(*segment != NULL, "*segment is NULL\n");
+  LCI_DBG_Assert(*segment != NULL, "*segment is NULL\n");
+  LCII_PCOUNTER_START(mem_dereg_timer);
   if (LCI_USE_DREG) {
     LCII_rcache_dereg(*segment);
   } else {
@@ -24,6 +27,7 @@ LCI_error_t LCI_memory_deregister(LCI_segment_t* segment)
   }
   LCIU_free(*segment);
   *segment = NULL;
+  LCII_PCOUNTER_END(mem_dereg_timer);
   return LCI_OK;
 }
 
diff --git a/src/runtime/packet.h b/lci/runtime/packet.h
similarity index 92%
rename from src/runtime/packet.h
rename to lci/runtime/packet.h
index 232b21aa..e8d6dad3 100644
--- a/src/runtime/packet.h
+++ b/lci/runtime/packet.h
@@ -74,27 +74,29 @@ typedef struct __attribute__((packed)) LCII_packet_t {
 static inline LCII_packet_t* LCII_alloc_packet_nb(struct LCII_pool_t* pool)
 {
   LCII_packet_t* packet = LCII_pool_get_nb(pool);
-#ifdef LCI_DEBUG
   if (packet != NULL) {
-    LCM_DBG_Assert(packet->context.isInPool,
+    LCII_PCOUNTER_ADD(packet_get, 1);
+#ifdef LCI_DEBUG
+    LCI_DBG_Assert(packet->context.isInPool,
                    "This packet has already been allocated!\n");
     packet->context.isInPool = false;
-  }
 #endif
+  }
   return packet;
 }
 
 static inline void LCII_free_packet(LCII_packet_t* packet)
 {
-  LCM_DBG_Assert(((uint64_t)packet + sizeof(struct LCII_packet_context)) %
+  LCI_DBG_Assert(((uint64_t)packet + sizeof(struct LCII_packet_context)) %
                          LCI_CACHE_LINE ==
                      0,
                  "Not a packet (address %p)!\n", packet);
 #ifdef LCI_DEBUG
-  LCM_DBG_Assert(!packet->context.isInPool,
+  LCI_DBG_Assert(!packet->context.isInPool,
                  "This packet has already been freed!\n");
   packet->context.isInPool = true;
 #endif
+  LCII_PCOUNTER_ADD(packet_put, 1);
   if (packet->context.poolid != -1)
     LCII_pool_put_to(packet->context.pkpool, packet, packet->context.poolid);
   else
diff --git a/src/runtime/packet_pool.c b/lci/runtime/packet_pool.c
similarity index 96%
rename from src/runtime/packet_pool.c
rename to lci/runtime/packet_pool.c
index 03ac1e43..9da653e7 100644
--- a/src/runtime/packet_pool.c
+++ b/lci/runtime/packet_pool.c
@@ -24,7 +24,7 @@ static inline void LCII_pool_init()
 
 void LCII_pool_create(struct LCII_pool_t** pool)
 {
-  if (unlikely(!initialized)) LCII_pool_init();
+  if (LCT_unlikely(!initialized)) LCII_pool_init();
   struct LCII_pool_t* p = 0;
   posix_memalign((void**)&p, LCI_CACHE_LINE, sizeof(struct LCII_pool_t));
   p->npools = 0;
diff --git a/src/runtime/packet_pool.h b/lci/runtime/packet_pool.h
similarity index 73%
rename from src/runtime/packet_pool.h
rename to lci/runtime/packet_pool.h
index c61c748e..64c78f3c 100644
--- a/src/runtime/packet_pool.h
+++ b/lci/runtime/packet_pool.h
@@ -48,9 +48,9 @@ static inline void* LCII_pool_get_nb(LCII_pool_t* pool);
 
 static inline int32_t lc_pool_get_local(struct LCII_pool_t* pool)
 {
-  int wid = LCIU_get_thread_id();
+  int wid = LCT_get_thread_id();
   int32_t pid = LCII_tls_pool_metadata[wid][pool->key];
-  if (unlikely(pid == POOL_UNINIT)) {
+  if (LCT_unlikely(pid == POOL_UNINIT)) {
     LCIU_acquire_spinlock(&init_lock);
     pid = LCII_tls_pool_metadata[wid][pool->key];
     if (pid == POOL_UNINIT) {
@@ -68,23 +68,27 @@ static inline int32_t lc_pool_get_local(struct LCII_pool_t* pool)
 
 static inline void* lc_pool_get_slow(struct LCII_pool_t* pool, int32_t pid)
 {
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].packet_stealing +=
-                         1);
+  LCII_PCOUNTER_ADD(packet_stealing, 1);
+  LCII_PCOUNTER_START(packet_stealing_timer);
   void* ret = NULL;
   int32_t steal = LCIU_rand() % (pool->npools);
   size_t target_size = LCM_dq_size(pool->lpools[steal].dq);
-  if (steal != pid && target_size > 0) {
+  if (steal != pid && target_size > 1) {
     if (LCIU_try_acquire_spinlock(&pool->lpools[steal].lock)) {
       size_t steal_size =
           LCM_dq_steal(&pool->lpools[pid].dq, &pool->lpools[steal].dq);
       if (steal_size > 0) {
-        LCM_DBG_Log(LCM_LOG_DEBUG, "packet", "Packet steal %d->%d: %lu\n",
+        LCI_DBG_Log(LCI_LOG_TRACE, "packet", "Packet steal %d->%d: %lu\n",
                     steal, pid, steal_size);
         ret = LCM_dq_pop_top(&pool->lpools[pid].dq);
+        LCII_PCOUNTER_ADD(packet_stealing_succeeded, steal_size);
+      } else {
+        LCII_PCOUNTER_ADD(packet_stealing_failed, 1);
       }
       LCIU_release_spinlock(&pool->lpools[steal].lock);
     }
   }
+  LCII_PCOUNTER_END(packet_stealing_timer);
   return ret;
 }
 
@@ -104,16 +108,32 @@ static inline void LCII_pool_put(struct LCII_pool_t* pool, void* elm)
 
 static inline void* LCII_pool_get_nb(struct LCII_pool_t* pool)
 {
+  LCT_time_t time0 = LCT_now();
+  LCII_PCOUNTER_STARTT(get_packet_timer, time0);
+  LCII_PCOUNTER_STARTT(get_packet_pool_id_timer, time0);
   int32_t pid = lc_pool_get_local(pool);
+  LCT_time_t time1 = LCT_now();
+  LCII_PCOUNTER_ENDT(get_packet_pool_id_timer, time1);
+  LCII_PCOUNTER_STARTT(get_packet_lock_timer, time1);
   LCIU_acquire_spinlock(&pool->lpools[pid].lock);
+  LCT_time_t time2 = LCT_now();
+  LCII_PCOUNTER_ENDT(get_packet_lock_timer, time2);
+  LCII_PCOUNTER_STARTT(get_packet_local_timer, time2);
   void* elm = LCM_dq_pop_top(&pool->lpools[pid].dq);
+  LCT_time_t time3 = LCT_now();
+  LCII_PCOUNTER_ENDT(get_packet_local_timer, time3);
   if (elm == NULL) {
-    LCM_DBG_Assert(LCM_dq_size(pool->lpools[pid].dq) == 0,
+    LCI_DBG_Assert(LCM_dq_size(pool->lpools[pid].dq) == 0,
                    "Unexpected pool length! %lu\n",
                    LCM_dq_size(pool->lpools[pid].dq));
     elm = lc_pool_get_slow(pool, pid);
   }
+  LCT_time_t time4 = LCT_now();
+  LCII_PCOUNTER_STARTT(get_packet_unlock_timer, time4);
   LCIU_release_spinlock(&pool->lpools[pid].lock);
+  LCT_time_t time5 = LCT_now();
+  LCII_PCOUNTER_ENDT(get_packet_unlock_timer, time5);
+  LCII_PCOUNTER_ENDT(get_packet_timer, time5);
   return elm;
 }
 
diff --git a/src/runtime/progress.c b/lci/runtime/progress.c
similarity index 81%
rename from src/runtime/progress.c
rename to lci/runtime/progress.c
index bb1dbecc..fa3863f6 100644
--- a/src/runtime/progress.c
+++ b/lci/runtime/progress.c
@@ -7,8 +7,6 @@ static inline LCI_error_t LCII_progress_bq(LCI_device_t device)
   LCI_error_t ret = LCI_ERR_RETRY;
   LCII_bq_entry_t* entry = LCII_bq_top(&device->bq);
   if (entry != NULL) {
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].backlog_queue_send_attempts++);
     if (entry->bqe_type == LCII_BQ_SENDS) {
       ret = LCIS_post_sends(device->endpoint_progress.endpoint, entry->rank,
                             entry->buf, entry->size, entry->meta);
@@ -26,10 +24,10 @@ static inline LCI_error_t LCII_progress_bq(LCI_device_t device)
                            entry->buf, entry->size, entry->mr, entry->base,
                            entry->offset, entry->rkey, entry->meta, entry->ctx);
     } else {
-      LCM_DBG_Assert(false, "Unknown bqe_type (%d)!\n", entry->bqe_type);
+      LCI_DBG_Assert(false, "Unknown bqe_type (%d)!\n", entry->bqe_type);
     }
     if (ret == LCI_OK) {
-      LCM_DBG_Log(LCM_LOG_DEBUG, "bq", "Pop from backlog queue: type %d\n",
+      LCI_DBG_Log(LCI_LOG_TRACE, "bq", "Pop from backlog queue: type %d\n",
                   entry->bqe_type);
       LCII_bq_pop(&device->bq);
       if (entry->bqe_type == LCII_BQ_SENDS) LCIU_free(entry->buf);
@@ -47,8 +45,9 @@ LCI_error_t LCII_poll_cq(LCII_endpoint_t* endpoint)
   int count = LCIS_poll_cq(endpoint->endpoint, entry);
   if (count > 0) {
     ret = LCI_OK;
+    LCII_PCOUNTER_START(useful_progress_timer);
   } else {
-    LCM_DBG_Assert(count >= 0, "ibv_poll_cq returns error %d\n", count);
+    LCI_DBG_Assert(count >= 0, "ibv_poll_cq returns error %d\n", count);
   }
   for (int i = 0; i < count; i++) {
 #ifdef LCI_ENABLE_SLOWDOWN
@@ -56,20 +55,22 @@ LCI_error_t LCII_poll_cq(LCII_endpoint_t* endpoint)
 #endif
     if (entry[i].opcode == LCII_OP_RECV) {
       // two-sided recv.
-      LCM_DBG_Log(LCM_LOG_DEBUG, "device",
+      LCI_DBG_Log(LCI_LOG_TRACE, "device",
                   "complete recv: packet %p rank %d length %lu imm_data %u\n",
                   entry[i].ctx, entry[i].rank, entry[i].length,
                   entry[i].imm_data);
       LCIS_serve_recv((LCII_packet_t*)entry[i].ctx, entry[i].rank,
                       entry[i].length, entry[i].imm_data);
+      LCII_PCOUNTER_START(update_posted_recv);
 #ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
       atomic_fetch_sub_explicit(&endpoint->recv_posted, 1,
                                 LCIU_memory_order_relaxed);
 #else
       --endpoint->recv_posted;
 #endif
+      LCII_PCOUNTER_END(update_posted_recv);
     } else if (entry[i].opcode == LCII_OP_RDMA_WRITE) {
-      LCM_DBG_Log(LCM_LOG_DEBUG, "device", "complete write: imm_data %u\n",
+      LCI_DBG_Log(LCI_LOG_TRACE, "device", "complete write: imm_data %u\n",
                   entry[i].imm_data);
       if (entry[i].ctx != NULL) {
         LCII_free_packet((LCII_packet_t*)entry[i].ctx);
@@ -83,18 +84,25 @@ LCI_error_t LCII_poll_cq(LCII_endpoint_t* endpoint)
       LCIS_serve_rdma(entry[i].imm_data);
     } else {
       // entry[i].opcode == LCII_OP_SEND
-      LCM_DBG_Log(LCM_LOG_DEBUG, "device", "complete send: address %p\n",
+      LCI_DBG_Log(LCI_LOG_TRACE, "device", "complete send: address %p\n",
                   (void*)entry[i].ctx);
-      if (entry[i].ctx == NULL) continue;
+      LCII_PCOUNTER_ADD(net_send_comp, 1);
+      if (entry[i].ctx == NULL) {
+        continue;
+      }
       LCIS_serve_send((void*)entry[i].ctx);
     }
   }
+  if (ret == LCI_OK) {
+    LCII_PCOUNTER_END(useful_progress_timer);
+  }
   return ret;
 }
 
 LCI_error_t LCII_fill_rq(LCII_endpoint_t* endpoint, bool block)
 {
   int ret = LCI_ERR_RETRY;
+  LCII_PCOUNTER_START(refill_rq_timer);
 #ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
   while (atomic_load_explicit(&endpoint->recv_posted, memory_order_relaxed) <
          LCI_SERVER_MAX_RECVS) {
@@ -103,28 +111,35 @@ LCI_error_t LCII_fill_rq(LCII_endpoint_t* endpoint, bool block)
 #endif
     bool post_recv_succeed = false;
 
+    // First, get a packet.
+    LCII_PCOUNTER_START(get_recv_packet_timer);
     LCII_packet_t* packet = LCII_alloc_packet_nb(endpoint->device->pkpool);
+    LCII_PCOUNTER_END(get_recv_packet_timer);
     if (packet == NULL) {
-      LCII_PCOUNTERS_WRAPPER(
-          LCII_pcounters[LCIU_get_thread_id()].recv_backend_no_packet++);
+      LCII_PCOUNTER_ADD(net_recv_failed_nopacket, 1);
     } else {
+      // We got the packet, post a recv
+      LCII_PCOUNTER_START(post_recv_timer);
       // TODO: figure out what is the right poolid to set
       // packet->context.poolid = lc_pool_get_local(endpoint->device->pkpool);
       LCI_error_t rc = LCIS_post_recv(
           endpoint->endpoint, packet->data.address, LCI_MEDIUM_SIZE,
           endpoint->device->heap.segment->mr, packet);
       if (rc == LCI_OK) {
+        LCII_PCOUNTER_START(update_posted_recv);
 #ifdef LCI_ENABLE_MULTITHREAD_PROGRESS
         atomic_fetch_add_explicit(&endpoint->recv_posted, 1,
                                   LCIU_memory_order_relaxed);
 #else
         ++endpoint->recv_posted;
 #endif
+        LCII_PCOUNTER_END(update_posted_recv);
         post_recv_succeed = true;
         ret = LCI_OK;
       } else {
         LCII_free_packet(packet);
       }
+      LCII_PCOUNTER_END(post_recv_timer);
     }
     if (post_recv_succeed || block) {
       continue;
@@ -132,6 +147,7 @@ LCI_error_t LCII_fill_rq(LCII_endpoint_t* endpoint, bool block)
       break;
     }
   }
+  LCII_PCOUNTER_END(refill_rq_timer);
   return ret;
 }
 
@@ -157,20 +173,6 @@ LCI_error_t LCI_progress(LCI_device_t device)
   if (LCII_fill_rq(&device->endpoint_worker, false) == LCI_OK) {
     ret = LCI_OK;
   }
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].progress_call +=
-                         1);
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-  if (ret == LCI_OK) {
-    ++device->did_work_consecutive;
-    LCII_pcounters[LCIU_get_thread_id()].progress_useful_call++;
-  } else {
-    LCIU_MAX_ASSIGN(LCII_pcounters[LCIU_get_thread_id()]
-                        .progress_useful_call_consecutive_max,
-                    device->did_work_consecutive);
-    LCII_pcounters[LCIU_get_thread_id()].progress_useful_call_consecutive_sum +=
-        device->did_work_consecutive;
-    device->did_work_consecutive = 0;
-  }
-#endif
+  LCII_PCOUNTER_ADD(progress_call, 1);
   return ret;
 }
diff --git a/src/runtime/property_list.c b/lci/runtime/property_list.c
similarity index 97%
rename from src/runtime/property_list.c
rename to lci/runtime/property_list.c
index c221d8a9..54fc0a2b 100644
--- a/src/runtime/property_list.c
+++ b/lci/runtime/property_list.c
@@ -60,7 +60,7 @@ LCI_error_t LCI_plist_set_comp_type(LCI_plist_t plist, LCI_port_t port,
       plist->msg_comp_type = comp_type;
       break;
     default:
-      LCM_DBG_Assert(false, "unknown port!\n");
+      LCI_DBG_Assert(false, "unknown port!\n");
   }
   return LCI_OK;
 }
diff --git a/src/runtime/protocol.h b/lci/runtime/protocol.h
similarity index 85%
rename from src/runtime/protocol.h
rename to lci/runtime/protocol.h
index 29081b6f..627b7065 100644
--- a/src/runtime/protocol.h
+++ b/lci/runtime/protocol.h
@@ -11,7 +11,7 @@ static inline uint64_t LCII_make_key(LCI_endpoint_t ep, int rank, LCI_tag_t tag,
     ret = (uint64_t)(rank) << 32 | (uint64_t)(msg_type) << 30 |
           (uint64_t)(ep->gid) << 16 | (uint64_t)(tag);
   } else {
-    LCM_DBG_Assert(ep->match_type == LCI_MATCH_TAG, "Unknown match_type %d\n",
+    LCI_DBG_Assert(ep->match_type == LCI_MATCH_TAG, "Unknown match_type %d\n",
                    ep->match_type);
     ret = (uint64_t)(-1) << 32 | (uint64_t)(msg_type) << 30 |
           (uint64_t)(ep->gid) << 16 | (uint64_t)(tag);
@@ -43,12 +43,14 @@ static inline void lc_ce_dispatch(LCII_context_t* ctx)
 #ifdef LCI_SERVER_HAS_AM
     case LCI_COMPLETION_HANDLER: {
       LCI_handler_t handler = ctx->completion;
+      LCII_PCOUNTER_ADD(comp_produce, 1);
+      LCII_PCOUNTER_ADD(comp_consume, 1);
       (*handler)(LCII_ctx2req(ctx));
       break;
     }
 #endif
     default:
-      LCM_DBG_Assert(false, "Unknown completion type: %d!\n",
+      LCI_DBG_Assert(false, "Unknown completion type: %d!\n",
                      (int)LCII_comp_attr_get_comp_type(ctx->comp_attr));
   }
 }
@@ -56,11 +58,7 @@ static inline void lc_ce_dispatch(LCII_context_t* ctx)
 static inline void LCIS_serve_recv(void* p, int src_rank, size_t length,
                                    uint32_t imm_data)
 {
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_rx += 1);
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_rx +=
-                         length);
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_2sided_rx +=
-                         1);
+  LCII_PCOUNTER_ADD(net_recv_comp, length);
   LCII_packet_t* packet = (LCII_packet_t*)p;
   LCII_proto_t proto = imm_data;
   // NOTE: this should be RGID because it is received from remote.
@@ -71,7 +69,7 @@ static inline void LCIS_serve_recv(void* p, int src_rank, size_t length,
 
   switch (msg_type) {
     case LCI_MSG_SHORT: {
-      LCM_DBG_Assert(length == LCI_SHORT_SIZE,
+      LCI_DBG_Assert(length == LCI_SHORT_SIZE,
                      "Unexpected message length %lu\n", length);
       uint64_t key = LCII_make_key(ep, src_rank, tag, LCI_MSG_SHORT);
       uint64_t value = (uint64_t)packet;
@@ -109,6 +107,7 @@ static inline void LCIS_serve_recv(void* p, int src_rank, size_t length,
       break;
     }
     case LCI_MSG_RTS: {
+      LCII_PCOUNTER_START(serve_rts_timer);
       switch (packet->data.rts.msg_type) {
         case LCI_MSG_LONG: {
           const uint64_t key = LCII_make_key(ep, src_rank, tag, LCI_MSG_LONG);
@@ -128,12 +127,14 @@ static inline void LCIS_serve_recv(void* p, int src_rank, size_t length,
           break;
         }
         default:
-          LCM_Assert(false, "Unknown message type %d!\n",
+          LCI_Assert(false, "Unknown message type %d!\n",
                      packet->data.rts.msg_type);
       }
+      LCII_PCOUNTER_END(serve_rts_timer);
       break;
     }
-    case LCI_MSG_RTR:
+    case LCI_MSG_RTR: {
+      LCII_PCOUNTER_START(serve_rtr_timer);
       switch (packet->data.rts.msg_type) {
         case LCI_MSG_LONG: {
           LCII_handle_2sided_rtr(ep, packet);
@@ -148,13 +149,15 @@ static inline void LCIS_serve_recv(void* p, int src_rank, size_t length,
           break;
         }
         default:
-          LCM_Assert(false, "Unknown message type %d!\n",
+          LCI_Assert(false, "Unknown message type %d!\n",
                      packet->data.rts.msg_type);
       }
+      LCII_PCOUNTER_END(serve_rtr_timer);
       break;
+    }
     case LCI_MSG_RDMA_SHORT: {
       // dynamic put
-      LCM_DBG_Assert(length == LCI_SHORT_SIZE,
+      LCI_DBG_Assert(length == LCI_SHORT_SIZE,
                      "Unexpected message length %lu\n", length);
       LCII_context_t* ctx = LCIU_malloc(sizeof(LCII_context_t));
       memcpy(&(ctx->data.immediate), packet->data.address, LCI_SHORT_SIZE);
@@ -184,18 +187,19 @@ static inline void LCIS_serve_recv(void* p, int src_rank, size_t length,
       break;
     }
     case LCI_MSG_FIN: {
-      LCM_DBG_Assert(length == sizeof(LCII_context_t*),
+      LCI_DBG_Assert(length == sizeof(LCII_context_t*),
                      "Unexpected FIN message length (%lu)!\n", length);
       LCII_handle_iovec_recv_FIN(packet);
       break;
     }
     default:
-      LCM_Assert(false, "Unknown proto!\n");
+      LCI_Assert(false, "Unknown proto!\n");
   }
 }
 
 static inline void LCIS_serve_rdma(uint32_t imm_data)
 {
+  LCII_PCOUNTER_START(serve_rdma_timer);
   LCII_proto_t proto = imm_data;
   LCI_endpoint_t ep = LCI_ENDPOINTS[PROTO_GET_RGID(proto)];
   uint16_t tag = PROTO_GET_TAG(proto);
@@ -211,8 +215,9 @@ static inline void LCIS_serve_rdma(uint32_t imm_data)
       break;
     }
     default:
-      LCM_DBG_Assert(false, "unknown proto!\n");
+      LCI_DBG_Assert(false, "unknown proto!\n");
   }
+  LCII_PCOUNTER_END(serve_rdma_timer);
 }
 
 // local completion
@@ -227,14 +232,6 @@ static inline void LCIS_serve_send(void* raw_ctx)
   if (LCII_comp_attr_get_free_packet(ctx->comp_attr) == 1) {
     LCII_free_packet(LCII_mbuffer2packet(ctx->data.mbuffer));
   }
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-  if (LCII_comp_attr_get_msg_type(ctx->comp_attr) == LCI_MSG_RDMA_MEDIUM) {
-    LCIU_update_average(
-        &LCII_pcounters[LCIU_get_thread_id()].send_eager_latency_nsec_ave,
-        &LCII_pcounters[LCIU_get_thread_id()].send_eager_latency_nsec_count,
-        (int64_t)LCII_ucs_time_to_nsec(LCII_ucs_get_time() - ctx->timer), 1);
-  }
-#endif
   lc_ce_dispatch(ctx);
 }
 
diff --git a/src/runtime/rcache/lcii_rcache.c b/lci/runtime/rcache/lcii_rcache.c
similarity index 94%
rename from src/runtime/rcache/lcii_rcache.c
rename to lci/runtime/rcache/lcii_rcache.c
index e9b2f23c..74dac847 100644
--- a/src/runtime/rcache/lcii_rcache.c
+++ b/lci/runtime/rcache/lcii_rcache.c
@@ -17,8 +17,8 @@ static void LCII_rcache_dump_region_cb(void* context, ucs_rcache_t* rcache,
                                        size_t max);
 
 static ucs_rcache_ops_t LCII_mem_rcache_ops = {
-    .mem_reg = LCII_rcache_mem_reg_cb,
-    .mem_dereg = LCII_rcache_mem_dereg_cb,
+    .mem_reg = LCII_rcache_mem_reg_cb,
+    .mem_dereg = LCII_rcache_mem_dereg_cb,
     .dump_region = LCII_rcache_dump_region_cb};
 
 static ucs_status_t LCII_rcache_mem_reg_cb(void* context, ucs_rcache_t* rcache,
@@ -71,7 +71,7 @@ LCI_error_t LCII_rcache_init(LCI_device_t device)
   ucs_status_t ret = LCII_ucs_rcache_create(&rcache_params, "lci_rcache",
                                             LCII_ucs_stats_get_root(),
                                             (ucs_rcache_t**)&device->rcache);
-  LCM_Assert(ret == UCS_OK, "Unexpected return value %d\n", ret);
+  LCI_Assert(ret == UCS_OK, "Unexpected return value %d\n", ret);
   return LCI_OK;
 }
 
@@ -92,7 +92,7 @@ void LCII_rcache_reg(LCI_device_t device, void* address, size_t length,
       ucs_derived_of(segment->region, LCII_rcache_entry_t);
   segment->device = device;
   segment->mr = region->mr;
-  LCM_Assert(ret == UCS_OK, "ucs_rcache_get failed (%d)!\n", ret);
+  LCI_Assert(ret == UCS_OK, "ucs_rcache_get failed (%d)!\n", ret);
 }
 
 LCI_error_t LCII_rcache_dereg(LCI_segment_t segment)
diff --git a/src/runtime/rcache/lcii_rcache.h b/lci/runtime/rcache/lcii_rcache.h
similarity index 100%
rename from src/runtime/rcache/lcii_rcache.h
rename to lci/runtime/rcache/lcii_rcache.h
diff --git a/src/runtime/rendezvous.h b/lci/runtime/rendezvous.h
similarity index 83%
rename from src/runtime/rendezvous.h
rename to lci/runtime/rendezvous.h
index 31a7a24a..f4ff7b11 100644
--- a/src/runtime/rendezvous.h
+++ b/lci/runtime/rendezvous.h
@@ -12,11 +12,11 @@ static inline void LCIS_post_sends_bq(LCII_backlog_queue_t* bq_p,
     if (ret == LCI_OK)
       return;
     else {
-      LCM_Assert(ret == LCI_ERR_RETRY, "fatal error!\n");
+      LCI_Assert(ret == LCI_ERR_RETRY, "fatal error!\n");
     }
   }
   // push to backlog queue
-  LCM_DBG_Log(LCM_LOG_DEBUG, "bq",
+  LCI_DBG_Log(LCI_LOG_TRACE, "bq",
               "Pushed to backlog queue (sends): "
               "post sends: rank %d buf %p size %lu meta %d\n",
               rank, buf, size, meta);
@@ -43,11 +43,11 @@ static inline void LCIS_post_send_bq(LCII_backlog_queue_t* bq_p,
     if (ret == LCI_OK)
       return;
     else {
-      LCM_Assert(ret == LCI_ERR_RETRY, "fatal error!\n");
+      LCI_Assert(ret == LCI_ERR_RETRY, "fatal error!\n");
     }
   }
   // push to backlog queue
-  LCM_DBG_Log(LCM_LOG_DEBUG, "bq",
+  LCI_DBG_Log(LCI_LOG_TRACE, "bq",
               "Pushed to backlog queue (send): "
               "rank %d buf %p size %lu mr %p meta %d ctx %p\n",
               rank, buf, size, mr.mr_p, meta, ctx);
@@ -77,11 +77,11 @@ static inline void LCIS_post_put_bq(LCII_backlog_queue_t* bq_p,
     if (ret == LCI_OK)
       return;
     else {
-      LCM_Assert(ret == LCI_ERR_RETRY, "fatal error!\n");
+      LCI_Assert(ret == LCI_ERR_RETRY, "fatal error!\n");
     }
   }
   // push to backlog queue
-  LCM_DBG_Log(LCM_LOG_DEBUG, "bq",
+  LCI_DBG_Log(LCI_LOG_TRACE, "bq",
               "Pushed to backlog queue (put): "
               "rank %d buf %p size %lu mr %p base %p "
               "offset %lu rkey %lu ctx %p\n",
@@ -115,11 +115,11 @@ static inline void LCIS_post_putImm_bq(LCII_backlog_queue_t* bq_p,
     if (ret == LCI_OK)
       return;
     else {
-      LCM_Assert(ret == LCI_ERR_RETRY, "fatal error!\n");
+      LCI_Assert(ret == LCI_ERR_RETRY, "fatal error!\n");
     }
   }
   // push to backlog queue
-  LCM_DBG_Log(LCM_LOG_DEBUG, "bq",
+  LCI_DBG_Log(LCI_LOG_TRACE, "bq",
               "Pushed to backlog queue (putImm): "
               "rank %d buf %p size %lu mr %p base %p "
               "offset %lu rkey %lu meta %u ctx %p\n",
@@ -145,7 +145,7 @@ static inline void LCII_handle_2sided_rts(LCI_endpoint_t ep,
                                           LCII_context_t* rdv_ctx,
                                           bool is_progress)
 {
-  LCM_DBG_Assert(rdv_ctx->data.lbuffer.address == NULL ||
+  LCI_DBG_Assert(rdv_ctx->data.lbuffer.address == NULL ||
                      rdv_ctx->data.lbuffer.length >= packet->data.rts.size,
                  "the message sent by sendl (%lu) is larger than the buffer "
                  "posted by recvl (%lu)!\n",
@@ -163,20 +163,22 @@ static inline void LCII_handle_2sided_rts(LCI_endpoint_t ep,
   uint64_t ctx_key;
   int result = LCM_archive_put(ep->ctx_archive_p, (uintptr_t)rdv_ctx, &ctx_key);
   // TODO: be able to pass back pressure to user
-  LCM_Assert(result == LCM_SUCCESS, "Archive is full!\n");
+  LCI_Assert(result == LCM_SUCCESS, "Archive is full!\n");
   packet->data.rtr.recv_ctx_key = ctx_key;
+  LCII_PCOUNTER_START(rts_mem_reg_timer);
   if (rdv_ctx->data.lbuffer.address == NULL) {
     LCI_lbuffer_alloc(ep->device, packet->data.rts.size,
                       &rdv_ctx->data.lbuffer);
   }
   if (rdv_ctx->data.lbuffer.segment == LCI_SEGMENT_ALL) {
-    LCM_DBG_Assert(LCII_comp_attr_get_dereg(rdv_ctx->comp_attr) == 1, "\n");
+    LCI_DBG_Assert(LCII_comp_attr_get_dereg(rdv_ctx->comp_attr) == 1, "\n");
     LCI_memory_register(ep->device, rdv_ctx->data.lbuffer.address,
                         rdv_ctx->data.lbuffer.length,
                         &rdv_ctx->data.lbuffer.segment);
   } else {
-    LCM_DBG_Assert(LCII_comp_attr_get_dereg(rdv_ctx->comp_attr) == 0, "\n");
+    LCI_DBG_Assert(LCII_comp_attr_get_dereg(rdv_ctx->comp_attr) == 0, "\n");
   }
+  LCII_PCOUNTER_END(rts_mem_reg_timer);
   packet->data.rtr.remote_addr_base =
       (uintptr_t)rdv_ctx->data.lbuffer.segment->mr.address;
   packet->data.rtr.remote_addr_offset =
@@ -190,23 +192,28 @@ static inline void LCII_handle_2sided_rts(LCI_endpoint_t ep,
   } else {
     endpoint_to_use = ep->device->endpoint_worker.endpoint;
   }
+  LCII_PCOUNTER_START(rts_send_timer);
   LCIS_post_send_bq(ep->bq_p, ep->bq_spinlock_p, endpoint_to_use, rdv_ctx->rank,
                     packet->data.address, sizeof(struct LCII_packet_rtr_t),
                     ep->device->heap.segment->mr,
                     LCII_MAKE_PROTO(ep->gid, LCI_MSG_RTR, 0), rtr_ctx);
+  LCII_PCOUNTER_END(rts_send_timer);
 }
 
 static inline void LCII_handle_2sided_rtr(LCI_endpoint_t ep,
                                           LCII_packet_t* packet)
 {
   LCII_context_t* ctx = (LCII_context_t*)packet->data.rtr.send_ctx;
+  LCII_PCOUNTER_START(rtr_mem_reg_timer);
   if (ctx->data.lbuffer.segment == LCI_SEGMENT_ALL) {
-    LCM_DBG_Assert(LCII_comp_attr_get_dereg(ctx->comp_attr) == 1, "\n");
+    LCI_DBG_Assert(LCII_comp_attr_get_dereg(ctx->comp_attr) == 1, "\n");
     LCI_memory_register(ep->device, ctx->data.lbuffer.address,
                         ctx->data.lbuffer.length, &ctx->data.lbuffer.segment);
   } else {
-    LCM_DBG_Assert(LCII_comp_attr_get_dereg(ctx->comp_attr) == 0, "\n");
+    LCI_DBG_Assert(LCII_comp_attr_get_dereg(ctx->comp_attr) == 0, "\n");
   }
+  LCII_PCOUNTER_END(rtr_mem_reg_timer);
+  LCII_PCOUNTER_START(rtr_putimm_timer);
   LCIS_post_putImm_bq(
       ep->bq_p, ep->bq_spinlock_p, ep->device->endpoint_progress.endpoint,
       ctx->rank, ctx->data.lbuffer.address, ctx->data.lbuffer.length,
@@ -214,6 +221,7 @@ static inline void LCII_handle_2sided_rtr(LCI_endpoint_t ep,
       packet->data.rtr.remote_addr_offset, packet->data.rtr.rkey,
       LCII_MAKE_PROTO(ep->gid, LCI_MSG_LONG, packet->data.rtr.recv_ctx_key),
       ctx);
+  LCII_PCOUNTER_END(rtr_putimm_timer);
   LCII_free_packet(packet);
 }
 
@@ -222,21 +230,16 @@ static inline void LCII_handle_2sided_writeImm(LCI_endpoint_t ep,
 {
   LCII_context_t* ctx =
       (LCII_context_t*)LCM_archive_remove(ep->ctx_archive_p, ctx_key);
-  LCM_DBG_Assert(ctx->data_type == LCI_LONG,
+  LCI_DBG_Assert(ctx->data_type == LCI_LONG,
                  "Didn't get the right context! This might imply some bugs in "
                  "the LCM_archive_t.\n");
-  LCM_DBG_Log(LCM_LOG_DEBUG, "rdv",
+  LCI_DBG_Log(LCI_LOG_TRACE, "rdv",
               "complete recvl: ctx %p rank %d buf %p size %lu "
               "tag %d user_ctx %p completion attr %x completion %p\n",
               ctx, ctx->rank, ctx->data.lbuffer.address,
               ctx->data.lbuffer.length, ctx->tag, ctx->user_context,
               ctx->comp_attr, ctx->completion);
-
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_rx += 1);
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_rx +=
-                         ctx->data.lbuffer.length);
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_1sided_rx +=
-                         1);
+  LCII_PCOUNTER_ADD(net_recv_comp, ctx->data.lbuffer.length);
   lc_ce_dispatch(ctx);
 }
 
@@ -266,7 +269,7 @@ static inline void LCII_handle_1sided_rts(LCI_endpoint_t ep,
   uint64_t ctx_key;
   int result = LCM_archive_put(ep->ctx_archive_p, (uintptr_t)rdv_ctx, &ctx_key);
   // TODO: be able to pass back pressure to user
-  LCM_Assert(result == LCM_SUCCESS, "Archive is full!\n");
+  LCI_Assert(result == LCM_SUCCESS, "Archive is full!\n");
   packet->data.rtr.recv_ctx_key = ctx_key;
   packet->data.rtr.remote_addr_base =
       (uintptr_t)rdv_ctx->data.lbuffer.segment->mr.address;
@@ -275,7 +278,7 @@ static inline void LCII_handle_1sided_rts(LCI_endpoint_t ep,
       packet->data.rtr.remote_addr_base;
   packet->data.rtr.rkey = LCIS_rma_rkey(rdv_ctx->data.lbuffer.segment->mr);
 
-  LCM_DBG_Log(LCM_LOG_DEBUG, "rdv",
+  LCI_DBG_Log(LCI_LOG_TRACE, "rdv",
               "send rtr: type %d sctx %p base %p offset %lu "
               "rkey %lu rctx_key %u\n",
               packet->data.rtr.msg_type, (void*)packet->data.rtr.send_ctx,
@@ -294,11 +297,11 @@ static inline void LCII_handle_1sided_rtr(LCI_endpoint_t ep,
 {
   LCII_context_t* ctx = (LCII_context_t*)packet->data.rtr.send_ctx;
   if (ctx->data.lbuffer.segment == LCI_SEGMENT_ALL) {
-    LCM_DBG_Assert(LCII_comp_attr_get_dereg(ctx->comp_attr) == 1, "\n");
+    LCI_DBG_Assert(LCII_comp_attr_get_dereg(ctx->comp_attr) == 1, "\n");
     LCI_memory_register(ep->device, ctx->data.lbuffer.address,
                         ctx->data.lbuffer.length, &ctx->data.lbuffer.segment);
   } else {
-    LCM_DBG_Assert(LCII_comp_attr_get_dereg(ctx->comp_attr) == 0, "\n");
+    LCI_DBG_Assert(LCII_comp_attr_get_dereg(ctx->comp_attr) == 0, "\n");
   }
   LCIS_post_putImm_bq(
       ep->bq_p, ep->bq_spinlock_p, ep->device->endpoint_progress.endpoint,
@@ -316,15 +319,11 @@ static inline void LCII_handle_1sided_writeImm(LCI_endpoint_t ep,
 {
   LCII_context_t* ctx =
       (LCII_context_t*)LCM_archive_remove(ep->ctx_archive_p, ctx_key);
-  LCM_DBG_Assert(ctx->data_type == LCI_LONG,
+  LCI_DBG_Assert(ctx->data_type == LCI_LONG,
                  "Didn't get the right context! This might imply some bugs in "
                  "the LCM_archive_t.\n");
   // putl has been completed locally. Need to process completion.
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_rx += 1);
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_rx +=
-                         ctx->data.lbuffer.length);
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_1sided_rx +=
-                         1);
+  LCII_PCOUNTER_ADD(net_recv_comp, ctx->data.lbuffer.length);
   lc_ce_dispatch(ctx);
 }
 
@@ -333,7 +332,6 @@ static inline void LCII_handle_iovec_rts(LCI_endpoint_t ep,
                                          uint32_t src_rank, uint16_t tag)
 {
   LCII_context_t* rdv_ctx = LCIU_malloc(sizeof(LCII_context_t));
-  LCII_PCOUNTERS_WRAPPER(rdv_ctx->timer = LCII_ucs_get_time());
   rdv_ctx->data.iovec.count = packet->data.rts.count;
   rdv_ctx->data.iovec.piggy_back.length = packet->data.rts.piggy_back_size;
   rdv_ctx->data.iovec.piggy_back.address =
@@ -375,7 +373,7 @@ static inline void LCII_handle_iovec_rts(LCI_endpoint_t ep,
         LCIS_rma_rkey(rdv_ctx->data.iovec.lbuffers[i].segment->mr);
   }
 
-  LCM_DBG_Log(LCM_LOG_DEBUG, "rdv",
+  LCI_DBG_Log(LCI_LOG_TRACE, "rdv",
               "send rtr: type %d sctx %p count %d rctx %p\n",
               packet->data.rtr.msg_type, (void*)packet->data.rtr.send_ctx,
               rdv_ctx->data.iovec.count, (void*)packet->data.rtr.recv_ctx);
@@ -386,24 +384,12 @@ static inline void LCII_handle_iovec_rts(LCI_endpoint_t ep,
                     ep->device->endpoint_progress.endpoint, rdv_ctx->rank,
                     packet->data.address, length, ep->device->heap.segment->mr,
                     LCII_MAKE_PROTO(ep->gid, LCI_MSG_RTR, 0), rtr_ctx);
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-  LCIU_update_average(
-      &LCII_pcounters[LCIU_get_thread_id()].recv_iovec_handle_rts_nsec_ave,
-      &LCII_pcounters[LCIU_get_thread_id()].recv_iovec_handle_rts_nsec_count,
-      (int64_t)LCII_ucs_time_to_nsec(LCII_ucs_get_time() - rdv_ctx->timer), 1);
-#endif
 }
 
 static inline void LCII_handle_iovec_rtr(LCI_endpoint_t ep,
                                          LCII_packet_t* packet)
 {
   LCII_context_t* ctx = (LCII_context_t*)packet->data.rtr.send_ctx;
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-  LCIU_update_average(
-      &LCII_pcounters[LCIU_get_thread_id()].send_iovec_handshake_nsec_ave,
-      &LCII_pcounters[LCIU_get_thread_id()].send_iovec_handshake_nsec_count,
-      (int64_t)LCII_ucs_time_to_nsec(LCII_ucs_get_time() - ctx->timer), 1);
-#endif
   LCII_extended_context_t* ectx = LCIU_malloc(sizeof(LCII_extended_context_t));
   LCII_initilize_comp_attr(ectx->comp_attr);
   LCII_comp_attr_set_extended(ectx->comp_attr, 1);
@@ -420,12 +406,12 @@ static inline void LCII_handle_iovec_rtr(LCI_endpoint_t ep,
       ectx->comp_attr, ctx->data.iovec.lbuffers[0].segment == LCI_SEGMENT_ALL);
   for (int i = 0; i < ctx->data.iovec.count; ++i) {
     if (ctx->data.iovec.lbuffers[i].segment == LCI_SEGMENT_ALL) {
-      LCM_DBG_Assert(LCII_comp_attr_get_dereg(ectx->comp_attr) == 1, "\n");
+      LCI_DBG_Assert(LCII_comp_attr_get_dereg(ectx->comp_attr) == 1, "\n");
       LCI_memory_register(ep->device, ctx->data.iovec.lbuffers[i].address,
                           ctx->data.iovec.lbuffers[i].length,
                           &ctx->data.iovec.lbuffers[i].segment);
     } else {
-      LCM_DBG_Assert(LCII_comp_attr_get_dereg(ectx->comp_attr) == 0, "\n");
+      LCI_DBG_Assert(LCII_comp_attr_get_dereg(ectx->comp_attr) == 0, "\n");
     }
     LCIS_post_put_bq(ep->bq_p, ep->bq_spinlock_p,
                      ep->device->endpoint_progress.endpoint, ctx->rank,
@@ -453,9 +439,9 @@ static inline void LCII_handle_iovec_put_comp(LCII_extended_context_t* ectx)
   if (signal_count < ectx->signal_expected) {
     return;
   }
-  LCM_DBG_Assert(signal_count == ectx->signal_expected, "Unexpected signal!\n");
+  LCI_DBG_Assert(signal_count == ectx->signal_expected, "Unexpected signal!\n");
   LCII_context_t* ctx = ectx->context;
-  LCM_DBG_Log(LCM_LOG_DEBUG, "rdv", "send FIN: rctx %p\n",
+  LCI_DBG_Log(LCI_LOG_TRACE, "rdv", "send FIN: rctx %p\n",
               (void*)ectx->recv_ctx);
   LCIS_post_sends_bq(ectx->ep->bq_p, ectx->ep->bq_spinlock_p,
                      ectx->ep->device->endpoint_progress.endpoint, ctx->rank,
@@ -469,36 +455,20 @@ static inline void LCII_handle_iovec_put_comp(LCII_extended_context_t* ectx)
   }
   LCIU_free(ectx);
   lc_ce_dispatch(ctx);
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-  LCIU_update_average(
-      &LCII_pcounters[LCIU_get_thread_id()].send_iovec_latency_nsec_ave,
-      &LCII_pcounters[LCIU_get_thread_id()].send_iovec_latency_nsec_count,
-      (int64_t)LCII_ucs_time_to_nsec(LCII_ucs_get_time() - ctx->timer), 1);
-#endif
 }
 
 static inline void LCII_handle_iovec_recv_FIN(LCII_packet_t* packet)
 {
   LCII_context_t* ctx;
   memcpy(&ctx, packet->data.address, sizeof(ctx));
-  LCM_DBG_Log(LCM_LOG_DEBUG, "rdv", "recv FIN: rctx %p\n", ctx);
-  LCM_DBG_Assert(ctx->data_type == LCI_IOVEC,
+  LCI_DBG_Log(LCI_LOG_TRACE, "rdv", "recv FIN: rctx %p\n", ctx);
+  LCI_DBG_Assert(ctx->data_type == LCI_IOVEC,
                  "Didn't get the right context (%p type=%d)!.\n", ctx,
                  ctx->data_type);
   // putva has been completed locally. Need to process completion.
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-  LCIU_update_average(
-      &LCII_pcounters[LCIU_get_thread_id()].recv_iovec_latency_nsec_ave,
-      &LCII_pcounters[LCIU_get_thread_id()].recv_iovec_latency_nsec_count,
-      (int64_t)LCII_ucs_time_to_nsec(LCII_ucs_get_time() - ctx->timer), 1);
-#endif
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_rx +=
-                         ctx->data.iovec.count);
-  LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].msgs_1sided_rx +=
-                         ctx->data.iovec.count);
+  LCII_PCOUNTER_ADD(net_recv_comp, sizeof(ctx));
   for (int i = 0; i < ctx->data.iovec.count; ++i)
-    LCII_PCOUNTERS_WRAPPER(LCII_pcounters[LCIU_get_thread_id()].bytes_rx +=
-                           ctx->data.iovec.lbuffers[i].length);
+    LCII_PCOUNTER_ADD(net_recv_comp, ctx->data.iovec.lbuffers[i].length);
   LCII_free_packet(packet);
   lc_ce_dispatch(ctx);
 }
diff --git a/src/sys/lciu_atomic.h b/lci/sys/lciu_atomic.h
similarity index 100%
rename from src/sys/lciu_atomic.h
rename to lci/sys/lciu_atomic.h
diff --git a/src/sys/lciu_malloc.h b/lci/sys/lciu_malloc.h
similarity index 100%
rename from src/sys/lciu_malloc.h
rename to lci/sys/lciu_malloc.h
diff --git a/src/sys/lciu_misc.h b/lci/sys/lciu_misc.h
similarity index 85%
rename from src/sys/lciu_misc.h
rename to lci/sys/lciu_misc.h
index 1955bea8..8784cd2b 100644
--- a/src/sys/lciu_misc.h
+++ b/lci/sys/lciu_misc.h
@@ -4,9 +4,6 @@
 #include <assert.h>
 #include <time.h>
 
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
-
 #define LCIU_MIN(x, y) ((x) < (y) ? (x) : (y))
 #define LCIU_MAX(x, y) ((x) > (y) ? (x) : (y))
 #define LCIU_MAX_ASSIGN(x, y) x = LCIU_MAX(x, y)
@@ -85,31 +82,11 @@ static inline uint64_t LCIU_get_bits64(uint64_t flag, int width, int offset)
   return (flag >> offset) & ((1UL << width) - 1);
 }
 
-/*
- * We would like to hide these two global variable,
- * but we cannot do it easily, because:
- * - we want to make LCIU_thread_id an inline function
- */
-extern int LCIU_nthreads;
-extern __thread int LCIU_thread_id;
-
-/* thread id */
-static inline int LCIU_get_thread_id()
-{
-  if (unlikely(LCIU_thread_id == -1)) {
-    //    LCIU_thread_id = sched_getcpu();
-    //    if (LCIU_thread_id == -1) {
-    LCIU_thread_id = __sync_fetch_and_add(&LCIU_nthreads, 1);
-    //    }
-  }
-  return LCIU_thread_id;
-}
-
 extern __thread unsigned int LCIU_rand_seed;
 static inline int LCIU_rand()
 {
   if (LCIU_rand_seed == 0) {
-    LCIU_rand_seed = time(NULL) + LCIU_get_thread_id() + rand();
+    LCIU_rand_seed = time(NULL) + LCT_get_thread_id() + rand();
   }
   return rand_r(&LCIU_rand_seed);
 }
@@ -124,7 +101,7 @@ static inline int LCIU_getenv_or(char* env, int def)
   } else {
     ret = def;
   }
-  LCM_Log(LCM_LOG_INFO, "env", "set %s to be %d\n", env, ret);
+  LCI_Log(LCI_LOG_INFO, "env", "set %s to be %d\n", env, ret);
   return ret;
 }
 
diff --git a/src/sys/lciu_spinlock.h b/lci/sys/lciu_spinlock.h
similarity index 100%
rename from src/sys/lciu_spinlock.h
rename to lci/sys/lciu_spinlock.h
diff --git a/src/unused/experimental/hashtable/hashtbl_cock.h b/lci/unused/experimental/hashtable/hashtbl_cock.h
similarity index 100%
rename from src/unused/experimental/hashtable/hashtbl_cock.h
rename to lci/unused/experimental/hashtable/hashtbl_cock.h
diff --git a/src/unused/experimental/hashtable/hashtbl_lf.h b/lci/unused/experimental/hashtable/hashtbl_lf.h
similarity index 100%
rename from src/unused/experimental/hashtable/hashtbl_lf.h
rename to lci/unused/experimental/hashtable/hashtbl_lf.h
diff --git a/src/unused/experimental/hashtable/hashtbl_tbb.h b/lci/unused/experimental/hashtable/hashtbl_tbb.h
similarity index 100%
rename from src/unused/experimental/hashtable/hashtbl_tbb.h
rename to lci/unused/experimental/hashtable/hashtbl_tbb.h
diff --git a/src/unused/experimental/hashtable/lf_hash.h b/lci/unused/experimental/hashtable/lf_hash.h
similarity index 100%
rename from src/unused/experimental/hashtable/lf_hash.h
rename to lci/unused/experimental/hashtable/lf_hash.h
diff --git a/src/unused/experimental/packet/mpmcqueue-inl.h b/lci/unused/experimental/packet/mpmcqueue-inl.h
similarity index 100%
rename from src/unused/experimental/packet/mpmcqueue-inl.h
rename to lci/unused/experimental/packet/mpmcqueue-inl.h
diff --git a/src/unused/experimental/packet/mpmcqueue.h b/lci/unused/experimental/packet/mpmcqueue.h
similarity index 100%
rename from src/unused/experimental/packet/mpmcqueue.h
rename to lci/unused/experimental/packet/mpmcqueue.h
diff --git a/src/unused/experimental/packet/packet_manager_misc.h b/lci/unused/experimental/packet/packet_manager_misc.h
similarity index 100%
rename from src/unused/experimental/packet/packet_manager_misc.h
rename to lci/unused/experimental/packet/packet_manager_misc.h
diff --git a/src/unused/glob.c b/lci/unused/glob.c
similarity index 100%
rename from src/unused/glob.c
rename to lci/unused/glob.c
diff --git a/src/unused/lcrq.c b/lci/unused/lcrq.c
similarity index 100%
rename from src/unused/lcrq.c
rename to lci/unused/lcrq.c
diff --git a/src/unused/lcrq.h b/lci/unused/lcrq.h
similarity index 100%
rename from src/unused/lcrq.h
rename to lci/unused/lcrq.h
diff --git a/src/unused/profiler.h b/lci/unused/profiler.h
similarity index 100%
rename from src/unused/profiler.h
rename to lci/unused/profiler.h
diff --git a/src/unused/sync_abt.c b/lci/unused/sync_abt.c
similarity index 100%
rename from src/unused/sync_abt.c
rename to lci/unused/sync_abt.c
diff --git a/lct/CMakeLists.txt b/lct/CMakeLists.txt
new file mode 100644
index 00000000..767948c7
--- /dev/null
+++ b/lct/CMakeLists.txt
@@ -0,0 +1,18 @@
+option(LCTI_CONFIG_USE_ALIGNED_ALLOC "Enable memory alignment" ON)
+# api/ exports the public include dirs; data_structure/ adds the queue sources.
+add_subdirectory(api)
+add_subdirectory(data_structure)
+# Private include roots and the core LCT sources.
+target_include_directories(LCT PRIVATE . api)
+target_sources_relative(
+  LCT
+  PRIVATE
+  lct.cpp
+  log/logger.cpp
+  pcounter/pcounter.cpp
+  util/thread.cpp
+  util/time.cpp
+  util/string.cpp)
+# Generate lcti_config.h into the binary dir and put that dir on LCT's path.
+configure_file(lcti_config.hpp.in lcti_config.h @ONLY)
+target_include_directories(LCT PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/lct/api/CMakeLists.txt b/lct/api/CMakeLists.txt
new file mode 100644
index 00000000..e7e02c21
--- /dev/null
+++ b/lct/api/CMakeLists.txt
@@ -0,0 +1,6 @@
+configure_file(lct_config.h.in lct_config.h @ONLY)
+target_include_directories(
+  LCT
+  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+         $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
diff --git a/lct/api/lct.h b/lct/api/lct.h
new file mode 100644
index 00000000..c6975990
--- /dev/null
+++ b/lct/api/lct.h
@@ -0,0 +1,165 @@
+#ifndef LCI_LCT_H
+#define LCI_LCT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "lct_config.h"
+
+#define LCT_API __attribute__((visibility("default")))
+
+#define LCT_likely(x) __builtin_expect(!!(x), 1)
+#define LCT_unlikely(x) __builtin_expect(!!(x), 0)
+
+LCT_API void LCT_init();
+LCT_API void LCT_fina();
+
+// rank
+LCT_API void LCT_set_rank(int rank);
+LCT_API int LCT_get_rank();
+
+// time
+typedef uint64_t LCT_time_t;
+LCT_API LCT_time_t LCT_now();
+LCT_API double LCT_time_to_ns(LCT_time_t time);
+LCT_API double LCT_time_to_us(LCT_time_t time);
+LCT_API double LCT_time_to_ms(LCT_time_t time);
+LCT_API double LCT_time_to_s(LCT_time_t time);
+
+// string
+LCT_API const char* LCT_str_replace_one(const char* in, const char* from,
+                                        const char* to);
+LCT_API const char* LCT_str_replace_all(const char* in, const char* from,
+                                        const char* to);
+typedef struct {
+  const char* key;
+  int val;
+} LCT_dict_str_int_t;
+LCT_API int LCT_str_int_search(LCT_dict_str_int_t dict[], int count,
+                               const char* key, int default_val, int* val);
+
+// thread
+LCT_API int LCT_get_thread_id();
+LCT_API int LCT_get_nthreads();
+
+// log
+typedef void* LCT_log_ctx_t;
+LCT_API LCT_log_ctx_t LCT_log_ctx_alloc(const char* const log_levels_[],
+                                        int count, int default_log_level,
+                                        const char* ctx_name, char* filename,
+                                        char* log_level_str, char* whitelist,
+                                        char* blacklist);
+LCT_API void LCT_log_ctx_free(LCT_log_ctx_t* log_ctx_p);
+LCT_API int LCT_log_get_level(LCT_log_ctx_t log_ctx);
+#define LCT_Assert(log_ctx, Expr, ...) \
+  LCT_Assert_(log_ctx, #Expr, Expr, __FILE__, __func__, __LINE__, __VA_ARGS__)
+#define LCT_Log(log_ctx, log_level, log_tag, ...)                     \
+  LCT_Log_(log_ctx, log_level, log_tag, __FILE__, __func__, __LINE__, \
+           __VA_ARGS__)
+#define LCT_Warn(log_ctx, ...) \
+  LCT_Log(log_ctx, LCT_LOG_WARN, "warn", __VA_ARGS__)
+#define LCT_Logv(log_ctx, log_level, log_tag, format, vargs)                   \
+  LCT_Logv_(log_ctx, log_level, log_tag, __FILE__, __func__, __LINE__, format, \
+            vargs)
+LCT_API void LCT_Assert_(LCT_log_ctx_t log_ctx, const char* expr_str,
+                         uint64_t expr, const char* file, const char* func,
+                         int line, const char* format, ...);
+LCT_API void LCT_Log_(LCT_log_ctx_t log_ctx, int log_level, const char* log_tag,
+                      const char* file, const char* func, int line,
+                      const char* format, ...);
+LCT_API void LCT_Logv_(LCT_log_ctx_t log_ctx, int log_level,
+                       const char* log_tag, const char* file, const char* func,
+                       int line, const char* format, va_list vargs);
+LCT_API void LCT_Log_flush(LCT_log_ctx_t log_ctx);
+
+#ifdef LCT_DEBUG
+#define LCT_DBG_Assert(...) LCT_Assert(__VA_ARGS__)
+#define LCT_DBG_Log(...) LCT_Log(__VA_ARGS__)
+#define LCT_DBG_Warn(...) LCT_Warn(__VA_ARGS__)
+#else
+#define LCT_DBG_Assert(...)
+#define LCT_DBG_Log(...)
+#define LCT_DBG_Warn(...)
+#endif
+
+// default log ctx
+extern LCT_log_ctx_t LCT_log_ctx_default;
+enum LCT_log_level_default_t {
+  LCT_LOG_ERROR,
+  LCT_LOG_WARN,
+  LCT_LOG_DIAG,
+  LCT_LOG_INFO,
+  LCT_LOG_DEBUG,
+  LCT_LOG_TRACE,
+  LCT_LOG_MAX
+};
+
+// cache
+#define LCT_ASSERT_SAME_CACHE_LINE(p1, p2)                                     \
+  LCT_Assert(LCT_log_ctx_default,                                              \
+             (uintptr_t)p1 / LCT_CACHE_LINE == (uintptr_t)p2 / LCT_CACHE_LINE, \
+             "%p and %p is not in the same L1 cache line (%d B)\n", p1, p2,    \
+             LCT_CACHE_LINE)
+#define LCT_ASSERT_DIFF_CACHE_LINE(p1, p2)                                     \
+  LCT_Assert(LCT_log_ctx_default,                                              \
+             (uintptr_t)p1 / LCT_CACHE_LINE != (uintptr_t)p2 / LCT_CACHE_LINE, \
+             "%p and %p is not in different L1 cache lines (%d B)\n", p1, p2,  \
+             LCT_CACHE_LINE)
+
+// Performance Counter
+typedef enum {
+  LCT_PCOUNTER_NONE,
+  LCT_PCOUNTER_COUNTER,
+  LCT_PCOUNTER_TREND,
+  LCT_PCOUNTER_TIMER,
+} LCT_pcounter_type_t;
+typedef void* LCT_pcounter_ctx_t;
+typedef struct {
+  LCT_pcounter_type_t type;
+  int idx;
+} LCT_pcounter_handle_t;
+LCT_API LCT_pcounter_ctx_t LCT_pcounter_ctx_alloc(const char* ctx_name);
+LCT_API LCT_pcounter_handle_t
+LCT_pcounter_register(LCT_pcounter_ctx_t pcounter_ctx, const char* name,
+                      LCT_pcounter_type_t type);
+LCT_API void LCT_pcounter_ctx_free(LCT_pcounter_ctx_t* pcounter_ctx);
+LCT_API void LCT_pcounter_add(LCT_pcounter_ctx_t pcounter_ctx,
+                              LCT_pcounter_handle_t handle, int64_t val);
+LCT_API void LCT_pcounter_start(LCT_pcounter_ctx_t pcounter_ctx,
+                                LCT_pcounter_handle_t handle);
+LCT_API void LCT_pcounter_end(LCT_pcounter_ctx_t pcounter_ctx,
+                              LCT_pcounter_handle_t handle);
+LCT_API void LCT_pcounter_startt(LCT_pcounter_ctx_t pcounter_ctx,
+                                 LCT_pcounter_handle_t handle, LCT_time_t time);
+LCT_API void LCT_pcounter_endt(LCT_pcounter_ctx_t pcounter_ctx,
+                               LCT_pcounter_handle_t handle, LCT_time_t time);
+LCT_API void LCT_pcounter_record(LCT_pcounter_ctx_t pcounter_ctx);
+LCT_API void LCT_pcounter_dump(LCT_pcounter_ctx_t pcounter_ctx, FILE* out);
+
+// Data Structures
+// Queues
+typedef enum {
+  LCT_QUEUE_ARRAY_ATOMIC_FAA,
+  LCT_QUEUE_ARRAY_ATOMIC_CAS,
+  LCT_QUEUE_ARRAY_ATOMIC_BASIC,
+  LCT_QUEUE_ARRAY_ST,
+  LCT_QUEUE_ARRAY_MUTEX,
+  LCT_QUEUE_STD,
+  LCT_QUEUE_STD_MUTEX,
+} LCT_queue_type_t;
+struct LCT_queue_opaque_t;
+typedef struct LCT_queue_opaque_t* LCT_queue_t;
+LCT_API LCT_queue_t LCT_queue_alloc(LCT_queue_type_t type, size_t length);
+LCT_API void LCT_queue_free(LCT_queue_t* queue_p);
+LCT_API void LCT_queue_push(LCT_queue_t queue, void* val);
+LCT_API void* LCT_queue_pop(LCT_queue_t queue);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // LCI_LCT_H
diff --git a/lct/api/lct_config.h.in b/lct/api/lct_config.h.in
new file mode 100644
index 00000000..b114aa11
--- /dev/null
+++ b/lct/api/lct_config.h.in
@@ -0,0 +1,7 @@
+#ifndef LCI_LCT_CONFIG_H
+#define LCI_LCT_CONFIG_H
+
+#define LCT_CACHE_LINE @LCI_CACHE_LINE@
+#cmakedefine LCT_DEBUG
+
+#endif  // LCI_LCT_CONFIG_H
diff --git a/lct/data_structure/CMakeLists.txt b/lct/data_structure/CMakeLists.txt
new file mode 100644
index 00000000..3e444d18
--- /dev/null
+++ b/lct/data_structure/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(queue)
diff --git a/lct/data_structure/queue/CMakeLists.txt b/lct/data_structure/queue/CMakeLists.txt
new file mode 100644
index 00000000..c5e026b4
--- /dev/null
+++ b/lct/data_structure/queue/CMakeLists.txt
@@ -0,0 +1 @@
+target_sources_relative(LCT PRIVATE queue.cpp)
diff --git a/lct/data_structure/queue/queue.cpp b/lct/data_structure/queue/queue.cpp
new file mode 100644
index 00000000..a6da279b
--- /dev/null
+++ b/lct/data_structure/queue/queue.cpp
@@ -0,0 +1,55 @@
+#include <stdexcept>
+#include "lcti.hpp"
+#include "data_structure/queue/queue_base.hpp"
+#include "data_structure/queue/queue_array_atomic_faa.hpp"
+#include "data_structure/queue/queue_array_atomic_cas.hpp"
+#include "data_structure/queue/queue_array_atomic_basic.hpp"
+#include "data_structure/queue/queue_array.hpp"
+#include "data_structure/queue/queue_std.hpp"
+
+LCT_queue_t LCT_queue_alloc(LCT_queue_type_t type, size_t length)
+{
+  lct::queue_base_t* q;  // concrete queue chosen by `type`
+  switch (type) {
+    case LCT_QUEUE_ARRAY_ATOMIC_FAA:
+      q = new lct::queue_array_atomic_faa_t(length);
+      break;
+    case LCT_QUEUE_ARRAY_ATOMIC_CAS:
+      q = new lct::queue_array_atomic_cas_t(length);
+      break;
+    case LCT_QUEUE_ARRAY_ATOMIC_BASIC:
+      q = new lct::queue_array_atomic_basic_t(length);
+      break;
+    case LCT_QUEUE_ARRAY_ST:  // queue_array_t with locking compiled out
+      q = new lct::queue_array_t<false>(length);
+      break;
+    case LCT_QUEUE_ARRAY_MUTEX:  // queue_array_t guarded by its spinlock_t
+      q = new lct::queue_array_t<true>(length);
+      break;
+    case LCT_QUEUE_STD:  // std::queue-backed; `length` is not used
+      q = new lct::queue_std_t<false>();
+      break;
+    case LCT_QUEUE_STD_MUTEX:  // same, guarded by its spinlock_t
+      q = new lct::queue_std_t<true>();
+      break;
+    default:  // not a known LCT_queue_type_t value
+      throw std::runtime_error("unknown queue type " + std::to_string(type));
+  }
+  return reinterpret_cast<LCT_queue_t>(q);  // returned as an opaque C handle
+}
+void LCT_queue_free(LCT_queue_t* queue_p)
+{
+  // Destroy via the virtual base destructor, then clear the caller's handle.
+  delete reinterpret_cast<lct::queue_base_t*>(*queue_p);
+  *queue_p = nullptr;
+}
+void LCT_queue_push(LCT_queue_t queue, void* val)
+{
+  // The opaque handle is really a lct::queue_base_t*; dispatch virtually.
+  reinterpret_cast<lct::queue_base_t*>(queue)->push(val);
+}
+void* LCT_queue_pop(LCT_queue_t queue)
+{
+  // Forward to the concrete queue behind the opaque handle.
+  return reinterpret_cast<lct::queue_base_t*>(queue)->pop();
+}
\ No newline at end of file
diff --git a/lct/data_structure/queue/queue_array.hpp b/lct/data_structure/queue/queue_array.hpp
new file mode 100644
index 00000000..89a4ce48
--- /dev/null
+++ b/lct/data_structure/queue/queue_array.hpp
@@ -0,0 +1,60 @@
+#ifndef LCI_QUEUE_ARRAY_HPP
+#define LCI_QUEUE_ARRAY_HPP
+
+namespace lct
+{
+template <bool THREAD_SAFE = false>
+struct queue_array_t : public queue_base_t {
+  explicit queue_array_t(size_t capacity)
+      : top(0), bot(0), length(capacity + 1), container(capacity + 1)
+  {
+    LCT_ASSERT_DIFF_CACHE_LINE(&top, &bot);
+    LCT_ASSERT_DIFF_CACHE_LINE(&bot, &length);
+    LCT_ASSERT_SAME_CACHE_LINE(&length, &container);
+    LCT_ASSERT_DIFF_CACHE_LINE(&container, &lock);
+  }
+  void push(void* val) override;
+  void* pop() override;
+
+ private:
+  struct entry_t;
+  alignas(LCT_CACHE_LINE) uint_fast64_t top;
+  alignas(LCT_CACHE_LINE) uint_fast64_t bot;
+  alignas(LCT_CACHE_LINE) uint_fast64_t length;
+  std::vector<entry_t> container;
+  alignas(LCT_CACHE_LINE) spinlock_t lock;
+};
+
+template <bool THREAD_SAFE>
+struct alignas(LCT_CACHE_LINE) queue_array_t<THREAD_SAFE>::entry_t {
+  entry_t() : data(nullptr) {}
+  void* data;
+};
+
+template <bool THREAD_SAFE>
+void queue_array_t<THREAD_SAFE>::push(void* val)
+{
+  if constexpr (THREAD_SAFE) lock.lock();
+  size_t new_top = (top + 1) % length;  // one slot stays empty to mark "full"
+  LCT_DBG_Assert(LCT_log_ctx_default, new_top != bot, "the queue is full\n");
+  container[top].data = val;
+  top = new_top;
+  if constexpr (THREAD_SAFE) lock.unlock();
+}
+
+template <bool THREAD_SAFE>
+void* queue_array_t<THREAD_SAFE>::pop()
+{
+  // Non-blocking: a contended lock is reported the same way as an empty queue.
+  if constexpr (THREAD_SAFE)
+    if (!lock.try_lock()) return nullptr;
+  const bool has_item = (bot != top);
+  void* popped = has_item ? container[bot].data : nullptr;
+  // Advance the read index only when an entry was actually taken.
+  if (has_item) bot = (bot + 1) % length;
+  if constexpr (THREAD_SAFE) lock.unlock();
+  return popped;
+}
+}  // namespace lct
+
+#endif  // LCI_QUEUE_ARRAY_HPP
diff --git a/lct/data_structure/queue/queue_array_atomic_basic.hpp b/lct/data_structure/queue/queue_array_atomic_basic.hpp
new file mode 100644
index 00000000..52eb7036
--- /dev/null
+++ b/lct/data_structure/queue/queue_array_atomic_basic.hpp
@@ -0,0 +1,107 @@
+#ifndef LCI_QUEUE_ARRAY_ATOMIC_BASIC_HPP
+#define LCI_QUEUE_ARRAY_ATOMIC_BASIC_HPP
+
+#include <vector>
+
+namespace lct
+{
+struct queue_array_atomic_basic_t : public queue_base_t {
+  explicit queue_array_atomic_basic_t(size_t capacity);
+  void push(void* val) override;
+  void* pop() override;
+
+ private:
+  struct entry_t;
+  // ticket for the next push; top2 publishes filled slots to pop()
+  alignas(LCT_CACHE_LINE) std::atomic<uint_fast64_t> top;
+  std::atomic<uint_fast64_t> top2;
+  // index of the first full entry; bot2 (debug only) marks slots freed by pop
+  alignas(LCT_CACHE_LINE) std::atomic<uint_fast64_t> bot;
+  std::atomic<uint_fast64_t> bot2;
+  // number of slots in container (capacity + 1)
+  alignas(LCT_CACHE_LINE) uint_fast64_t length;
+  // the slots; each entry_t holds one void* payload
+  std::vector<entry_t> container;
+};
+
+struct alignas(LCT_CACHE_LINE) queue_array_atomic_basic_t::entry_t {
+  entry_t() : data(nullptr) {}
+  void* data;
+};
+
+queue_array_atomic_basic_t::queue_array_atomic_basic_t(size_t capacity)
+    : top(0),
+      top2(0),
+      bot(0),
+      bot2(0),
+      length(capacity + 1),
+      container(capacity + 1)
+{
+  // Verify the cache-line layout requested by the alignas declarations.
+  LCT_ASSERT_SAME_CACHE_LINE(&top, &top2);
+  LCT_ASSERT_DIFF_CACHE_LINE(&top2, &bot);
+  LCT_ASSERT_SAME_CACHE_LINE(&bot, &bot2);
+  LCT_ASSERT_DIFF_CACHE_LINE(&bot2, &length);
+  LCT_ASSERT_SAME_CACHE_LINE(&length, &container);
+  static_assert(sizeof(entry_t) == LCT_CACHE_LINE, "unexpected entry_t size");
+}
+
+void queue_array_atomic_basic_t::push(void* val)
+{
+  uint_fast64_t current_top = top.fetch_add(1, std::memory_order_relaxed);
+  // make sure the queue is not full: at most length - 1 entries may be pending
+  LCT_DBG_Assert(
+      LCT_log_ctx_default,
+      current_top - bot2.load(std::memory_order_acquire) < length - 1,
+      "wrote to a nonempty value!\n");
+  // write to the slot reserved by our ticket
+  container[current_top % length].data = val;
+  // update top2 to tell the consumers they can safely read this slot.
+  while (true) {
+    uint_fast64_t expected = current_top;
+    bool succeed = top2.compare_exchange_weak(expected, current_top + 1,
+                                              std::memory_order_release,
+                                              std::memory_order_relaxed);
+    if (succeed) {
+      // top2 reached our ticket, so every earlier push is published as well
+      break;
+    }
+  }
+}
+
+void* queue_array_atomic_basic_t::pop()
+{
+  uint_fast64_t current_bot = bot.load(std::memory_order_relaxed);
+  if (top2.load(std::memory_order_acquire) <= current_bot) {
+    // nothing published at or beyond bot: the queue is empty
+    return nullptr;
+  }
+  uint_fast64_t expected = current_bot;
+  bool succeed = bot.compare_exchange_strong(expected, current_bot + 1,
+                                             std::memory_order_relaxed,
+                                             std::memory_order_relaxed);
+  if (!succeed) {
+    // another consumer claimed this entry first; give up instead of retrying
+    return nullptr;
+  }
+  // we have successfully reserved the entry at current_bot
+  void* result = container[current_bot % length].data;
+#ifdef LCT_DEBUG
+  // bot2 is only maintained in debug builds, where push() asserts against it:
+  // advance it in pop order to mark this entry as safe to overwrite.
+  while (true) {
+    expected = current_bot;
+    succeed = bot2.compare_exchange_weak(expected, current_bot + 1,
+                                         std::memory_order_release,
+                                         std::memory_order_relaxed);
+    if (succeed) {
+      // succeed!
+      break;
+    }
+  }
+#endif
+  return result;
+}
+}  // namespace lct
+
+#endif  // LCI_QUEUE_ARRAY_ATOMIC_BASIC_HPP
diff --git a/lct/data_structure/queue/queue_array_atomic_cas.hpp b/lct/data_structure/queue/queue_array_atomic_cas.hpp
new file mode 100644
index 00000000..7c38efac
--- /dev/null
+++ b/lct/data_structure/queue/queue_array_atomic_cas.hpp
@@ -0,0 +1,82 @@
+#ifndef LCI_QUEUE_ARRAY_ATOMIC_CAS_HPP
+#define LCI_QUEUE_ARRAY_ATOMIC_CAS_HPP
+
+#include <vector>
+
+namespace lct
+{
+struct queue_array_atomic_cas_t : public queue_base_t {
+  explicit queue_array_atomic_cas_t(size_t capacity);
+  void push(void* val) override;
+  void* pop() override;
+
+ private:
+  struct entry_t;
+  // ticket counter for push: the next entry to be filled
+  alignas(LCT_CACHE_LINE) std::atomic<uint_fast64_t> top;
+  // ticket counter for pop: the first entry that may be full
+  alignas(LCT_CACHE_LINE) std::atomic<uint_fast64_t> bot;
+  // number of slots in container (capacity + 1)
+  alignas(LCT_CACHE_LINE) uint_fast64_t length;
+  // the slots; each entry_t holds a void* payload and a tag
+  std::vector<entry_t> container;
+};
+
+struct alignas(LCT_CACHE_LINE) queue_array_atomic_cas_t::entry_t {
+  entry_t() : data(nullptr), tag(-1) {}
+  void* data;
+  std::atomic<uint_fast64_t> tag;
+};
+
+queue_array_atomic_cas_t::queue_array_atomic_cas_t(size_t capacity)
+    : top(0), bot(0), length(capacity + 1), container(capacity + 1)
+{
+  // Verify the cache-line layout requested by the alignas declarations.
+  LCT_ASSERT_DIFF_CACHE_LINE(&top, &bot);
+  LCT_ASSERT_DIFF_CACHE_LINE(&bot, &length);
+  LCT_ASSERT_SAME_CACHE_LINE(&length, &container);
+  static_assert(sizeof(entry_t) == LCT_CACHE_LINE, "unexpected entry_t size");
+}
+
+void queue_array_atomic_cas_t::push(void* val)
+{
+  uint_fast64_t current_top = top.fetch_add(1, std::memory_order_relaxed);
+  // queue-full check: slot must not still hold the push from one lap before
+  LCT_DBG_Assert(LCT_log_ctx_default,
+                 container[current_top % length].tag.load(
+                     std::memory_order_acquire) != current_top - length,
+                 "wrote to a nonempty value!\n");
+  // write to the slot reserved by our ticket
+  container[current_top % length].data = val;
+  // publish: storing our ticket as the tag lets pop() accept this slot
+  container[current_top % length].tag.store(current_top,
+                                            std::memory_order_release);
+}
+
+void* queue_array_atomic_cas_t::pop()
+{
+  uint_fast64_t current_bot = bot.load(std::memory_order_relaxed);
+  if (container[current_bot % length].tag.load(std::memory_order_acquire) !=
+      current_bot) {
+    // no push has published ticket current_bot yet: treat the queue as empty
+    return nullptr;
+  }
+  uint_fast64_t expected = current_bot;
+  bool succeed = bot.compare_exchange_strong(expected, current_bot + 1,
+                                             std::memory_order_relaxed,
+                                             std::memory_order_relaxed);
+  if (!succeed) {
+    // another consumer claimed this entry first; give up instead of retrying
+    return nullptr;
+  }
+  // we have successfully reserved the entry at current_bot
+  void* result = container[current_bot % length].data;
+#ifdef LCT_DEBUG  // retag the slot so push()'s debug assert sees it as free
+  container[current_bot % length].tag.store(current_bot + 1,
+                                            std::memory_order_release);
+#endif
+  return result;
+}
+}  // namespace lct
+
+#endif  // LCI_QUEUE_ARRAY_ATOMIC_CAS_HPP
diff --git a/lct/data_structure/queue/queue_array_atomic_faa.hpp b/lct/data_structure/queue/queue_array_atomic_faa.hpp
new file mode 100644
index 00000000..600f148e
--- /dev/null
+++ b/lct/data_structure/queue/queue_array_atomic_faa.hpp
@@ -0,0 +1,84 @@
+#ifndef LCI_QUEUE_ARRAY_ATOMIC_FAA_HPP
+#define LCI_QUEUE_ARRAY_ATOMIC_FAA_HPP
+
+#include <vector>
+
+namespace lct
+{
+struct queue_array_atomic_faa_t : public queue_base_t {
+  explicit queue_array_atomic_faa_t(size_t capacity);
+  void push(void* val) override;
+  void* pop() override;
+
+ private:
+  struct entry_t;
+  // ticket counter for push: the next entry to be filled
+  alignas(LCT_CACHE_LINE) std::atomic<uint_fast64_t> top;
+  // ticket counter for pop: the first entry that may be full
+  alignas(LCT_CACHE_LINE) std::atomic<uint_fast64_t> bot;
+  // number of slots in container (capacity + 1)
+  alignas(LCT_CACHE_LINE) uint_fast64_t length;
+  // the slots; each entry_t holds a void* payload and a tag
+  std::vector<entry_t> container;
+};
+
+struct alignas(LCT_CACHE_LINE) queue_array_atomic_faa_t::entry_t {
+  entry_t() : data(nullptr), tag(-1) {}
+  void* data;
+  std::atomic<uint_fast64_t> tag;
+};
+
+queue_array_atomic_faa_t::queue_array_atomic_faa_t(size_t capacity)
+    : top(0), bot(0), length(capacity + 1), container(capacity + 1)
+{
+  // Verify the cache-line layout requested by the alignas declarations.
+  LCT_ASSERT_DIFF_CACHE_LINE(&top, &bot);
+  LCT_ASSERT_DIFF_CACHE_LINE(&bot, &length);
+  LCT_ASSERT_SAME_CACHE_LINE(&length, &container);
+  static_assert(sizeof(entry_t) == LCT_CACHE_LINE, "unexpected entry_t size");
+}
+
+void queue_array_atomic_faa_t::push(void* val)
+{
+  uint_fast64_t current_top = top.fetch_add(1, std::memory_order_relaxed);
+  // queue-full check: slot must not still hold the push from one lap before
+  LCT_DBG_Assert(LCT_log_ctx_default,
+                 container[current_top % length].tag.load(
+                     std::memory_order_acquire) != current_top - length,
+                 "wrote to a nonempty value!\n");
+  // write to the slot reserved by our ticket
+  container[current_top % length].data = val;
+  // publish: storing our ticket as the tag lets pop() accept this slot
+  container[current_top % length].tag.store(current_top,
+                                            std::memory_order_release);
+}
+
+void* queue_array_atomic_faa_t::pop()
+{
+  uint_fast64_t current_bot = bot.load(std::memory_order_relaxed);
+  if (container[current_bot % length].tag.load(std::memory_order_acquire) !=
+      current_bot) {
+    // no push has published ticket current_bot yet: treat the queue as empty
+    return nullptr;
+  }
+  current_bot = bot.fetch_add(1, std::memory_order_relaxed);
+  while (container[current_bot % length].tag.load(std::memory_order_acquire) !=
+         current_bot) {
+    // overshot onto an unfilled cell: try to undo our increment, else re-check
+    uint_fast64_t expected = current_bot + 1;
+    bool succeed = bot.compare_exchange_weak(expected, current_bot,
+                                             std::memory_order_relaxed,
+                                             std::memory_order_relaxed);
+    if (succeed) return nullptr;
+  }
+  // we have successfully reserved the entry at current_bot
+  void* result = container[current_bot % length].data;
+#ifdef LCT_DEBUG  // retag the slot so push()'s debug assert sees it as free
+  container[current_bot % length].tag.store(current_bot + 1,
+                                            std::memory_order_release);
+#endif
+  return result;
+}
+}  // namespace lct
+
+#endif  // LCI_QUEUE_ARRAY_ATOMIC_FAA_HPP
diff --git a/lct/data_structure/queue/queue_base.hpp b/lct/data_structure/queue/queue_base.hpp
new file mode 100644
index 00000000..d322f373
--- /dev/null
+++ b/lct/data_structure/queue/queue_base.hpp
@@ -0,0 +1,16 @@
+#ifndef LCI_QUEUE_BASE_HPP
+#define LCI_QUEUE_BASE_HPP
+
+#include <atomic>
+
+namespace lct
+{
+struct queue_base_t {
+  virtual ~queue_base_t() = default;
+  virtual void push(void* val) = 0;
+  virtual void* pop() = 0;
+};
+
+}  // namespace lct
+
+#endif  // LCI_QUEUE_BASE_HPP
diff --git a/lct/data_structure/queue/queue_std.hpp b/lct/data_structure/queue/queue_std.hpp
new file mode 100644
index 00000000..a6234963
--- /dev/null
+++ b/lct/data_structure/queue/queue_std.hpp
@@ -0,0 +1,36 @@
+#ifndef LCI_QUEUE_STD_HPP
+#define LCI_QUEUE_STD_HPP
+
+#include <queue>
+
+namespace lct
+{
+template <bool THREAD_SAFE = false>
+struct queue_std_t : public queue_base_t {
+  queue_std_t() = default;
+  // Append val at the tail; the lock is only touched when THREAD_SAFE is set.
+  void push(void* val) override
+  {
+    if constexpr (THREAD_SAFE) lock.lock();
+    queue.push(val);
+    if constexpr (THREAD_SAFE) lock.unlock();
+  }
+  // Pop the head; nullptr if empty or (THREAD_SAFE) the lock was not acquired.
+  void* pop() override
+  {
+    if constexpr (THREAD_SAFE)
+      if (!lock.try_lock()) return nullptr;
+    const bool has_item = !queue.empty();
+    void* head = has_item ? queue.front() : nullptr;
+    if (has_item) queue.pop();
+    if constexpr (THREAD_SAFE) lock.unlock();
+    return head;
+  }
+
+ private:
+  alignas(LCT_CACHE_LINE) std::queue<void*> queue;
+  alignas(LCT_CACHE_LINE) spinlock_t lock;
+};
+}  // namespace lct
+
+#endif  // LCI_QUEUE_STD_HPP
diff --git a/lct/lct.cpp b/lct/lct.cpp
new file mode 100644
index 00000000..f470398e
--- /dev/null
+++ b/lct/lct.cpp
@@ -0,0 +1,56 @@
+#include "lcti.hpp"
+#include <atomic>
+#include <unistd.h>
+
+LCT_API LCT_log_ctx_t LCT_log_ctx_default = nullptr;
+
+namespace lct
+{
+std::atomic<int> init_count(0);
+
+void init()
+{
+  if (lct::init_count.fetch_add(1) > 0)
+    // already initialized by an earlier call; this call is only counted
+    return;
+
+  // initialize LCT_log_ctx_default from the LCT_LOG_* environment variables
+  const char* const log_levels[] = {
+      "error", "warn", "diag", "info", "debug", "trace",
+  };
+  LCT_log_ctx_default = LCT_log_ctx_alloc(
+      log_levels, sizeof(log_levels) / sizeof(log_levels[0]), LCT_LOG_WARN,
+      "lci", getenv("LCT_LOG_OUTFILE"), getenv("LCT_LOG_LEVEL"),
+      getenv("LCT_LOG_WHITELIST"), getenv("LCT_LOG_BLACKLIST"));
+
+  // check the configured cache line size against what sysconf reports
+  long cache_line_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+  LCT_Assert(LCT_log_ctx_default, LCT_CACHE_LINE == cache_line_size,
+             "LCT_CACHE_LINE is set incorrectly! (%d != %ld)\n", LCT_CACHE_LINE,
+             cache_line_size);
+}
+
+void fina()
+{
+  // fetch_sub yields the count as it was before this call.
+  const int prev = lct::init_count.fetch_sub(1);
+  if (prev <= 0) {
+    fprintf(stderr, "lct::fina has been called too many times (count: %d).\n",
+            prev);
+    return;
+  }
+  // Earlier init calls are still outstanding; keep the default logger alive.
+  if (prev > 1) return;
+  LCT_log_ctx_free(&LCT_log_ctx_default);
+}
+}  // namespace lct
+
+void LCT_init() { lct::init(); }
+
+void LCT_fina() { lct::fina(); }
+
+int LCTI_rank = -1;
+
+void LCT_set_rank(int rank) { LCTI_rank = rank; }
+
+int LCT_get_rank() { return LCTI_rank; }
\ No newline at end of file
diff --git a/lct/lcti.hpp b/lct/lcti.hpp
new file mode 100644
index 00000000..96c3150d
--- /dev/null
+++ b/lct/lcti.hpp
@@ -0,0 +1,10 @@
+#ifndef LCI_LCTI_H
+#define LCI_LCTI_H
+
+#include "lct.h"
+#include "lcti_config.h"
+#include "util/misc.hpp"
+#include "util/spinlock.hpp"
+#include "util/mem.hpp"
+
+#endif  // LCI_LCTI_H
diff --git a/lct/lcti_config.hpp.in b/lct/lcti_config.hpp.in
new file mode 100644
index 00000000..1107956a
--- /dev/null
+++ b/lct/lcti_config.hpp.in
@@ -0,0 +1,6 @@
+#ifndef LCTI_CONFIG_H_
+#define LCTI_CONFIG_H_
+
+#cmakedefine LCTI_CONFIG_USE_ALIGNED_ALLOC
+
+#endif // LCTI_CONFIG_H_
diff --git a/lct/log/logger.cpp b/lct/log/logger.cpp
new file mode 100644
index 00000000..71ca702f
--- /dev/null
+++ b/lct/log/logger.cpp
@@ -0,0 +1,212 @@
+#include <utility>
+#include <vector>
+#include <stdarg.h>
+#include <unistd.h>
+#include "lcti.hpp"
+
+namespace lct
+{
+namespace log
+{
+// One logging context: the ordered level names, the active verbosity
+// threshold, the optional tag filters and the stream messages are written to.
+struct ctx_t {
+  // Build a context.
+  //   log_levels_        level names; a level's numeric value is its index
+  //                      and a larger index is treated as a weaker level.
+  //   ctx_name_          name printed in every message of this context.
+  //   default_log_level  threshold kept when log_level_str is null or does
+  //                      not match any entry of log_levels_.
+  //   filename           null or "stderr" selects stderr, "stdout" selects
+  //                      stdout; any other value is opened in append mode
+  //                      after its first '%' is replaced by LCT_get_rank().
+  //   log_level_str      optional level name overriding default_log_level.
+  //   whitelist,         optional tag filters; only the pointers are stored,
+  //   blacklist          the strings are not copied.
+  ctx_t(const std::vector<std::string>& log_levels_, std::string ctx_name_,
+        int default_log_level, char* filename, char* log_level_str,
+        char* whitelist, char* blacklist)
+      : log_levels(log_levels_),
+        ctx_name(std::move(ctx_name_)),
+        whitelist(whitelist),
+        blacklist(blacklist)
+  {
+    // set log level
+    log_level_setting = default_log_level;
+    if (log_level_str) {
+      bool succeed = false;
+      for (size_t i = 0; i < log_levels.size(); ++i) {
+        if (log_levels[i] == log_level_str) {
+          log_level_setting = static_cast<int>(i);
+          succeed = true;
+          break;
+        }
+      }
+      if (!succeed) {
+        // Report the level that is actually kept (the default), not entry 0.
+        const bool default_in_range =
+            default_log_level >= 0 &&
+            static_cast<size_t>(default_log_level) < log_levels.size();
+        fprintf(stderr, "%s: unknown log_level %s. use the default %s.\n",
+                ctx_name.c_str(), log_level_str,
+                default_in_range ? log_levels[default_log_level].c_str()
+                                 : "(unknown)");
+      }
+    }
+    // set output file
+    if (filename == nullptr || strcmp(filename, "stderr") == 0)
+      outfile = stderr;
+    else if (strcmp(filename, "stdout") == 0)
+      outfile = stdout;
+    else {
+      std::string ofilename =
+          replaceOne(filename, "%", std::to_string(LCT_get_rank()));
+      outfile = fopen(ofilename.c_str(), "a");
+      if (outfile == nullptr) {
+        // Fall back to stderr so that do_log, do_flush and the destructor
+        // never operate on a null stream.
+        fprintf(stderr, "Cannot open the logfile %s!\n", ofilename.c_str());
+        outfile = stderr;
+      }
+    }
+  }
+
+  // Close the output stream unless it is one of the standard streams.
+  ~ctx_t()
+  {
+    if (outfile != nullptr && outfile != stdout && outfile != stderr) {
+      fclose(outfile);
+    }
+  }
+
+  // If expr is zero, print a prefixed message built from format/vargs to
+  // stderr and abort the process; otherwise do nothing.
+  void do_assert(const char* expr_str, uint64_t expr, const char* file,
+                 const char* func, int line, const char* format, va_list vargs)
+  {
+    if (expr) return;
+
+    char buf[1024];
+    int size;
+
+    size =
+        snprintf(buf, sizeof(buf), "%d:%d:%d:%s:%s:%d<%s:Assert failed: %s> ",
+                 LCT_get_rank(), getpid(), LCT_get_thread_id(), file, func,
+                 line, ctx_name.c_str(), expr_str);
+
+    // snprintf returns the length the prefix needs, which can exceed the
+    // buffer (or be negative on error). Append the user message only when
+    // room is left; otherwise buf + size would point outside the array.
+    if (size < 0) {
+      buf[0] = '\0';
+      size = 0;
+    }
+    if (static_cast<size_t>(size) < sizeof(buf))
+      vsnprintf(buf + size, sizeof(buf) - size, format, vargs);
+
+    fprintf(stderr, "%s", buf);
+    fflush(stderr);
+    abort();
+  }
+
+  // Write one message to outfile if it passes the level threshold and the
+  // whitelist/blacklist tag filters.
+  void do_log(int log_level, const char* log_tag, const char* file,
+              const char* func, int line, const char* format, va_list vargs)
+  {
+    char buf[2048];
+    int size;
+    LCT_Assert(LCT_log_ctx_default, log_level < log_levels.size(),
+               "Unexpected log level!\n");
+    // if log_level is weaker than the configured log level, do nothing.
+    if (log_level > log_level_setting) return;
+    // if whitelist is enabled and log_tag is not included in the whitelist,
+    // do nothing.
+    if (whitelist != nullptr && strstr(whitelist, log_tag) == nullptr) return;
+    // if blacklist is enabled and log_tag is included in the blacklist,
+    // do nothing.
+    if (blacklist != nullptr && strstr(blacklist, log_tag) != nullptr) return;
+    // print the log
+    size = snprintf(buf, sizeof(buf), "%d:%d:%d:%s:%s:%d<%s:%s:%s> ",
+                    LCT_get_rank(), getpid(), LCT_get_thread_id(), file, func,
+                    line, ctx_name.c_str(), log_levels[log_level].c_str(),
+                    log_tag);
+
+    // Same truncation/error handling as in do_assert: never form a pointer
+    // past the end of buf and never let the remaining size wrap around.
+    if (size < 0) {
+      buf[0] = '\0';
+      size = 0;
+    }
+    if (static_cast<size_t>(size) < sizeof(buf))
+      vsnprintf(buf + size, sizeof(buf) - size, format, vargs);
+
+    fprintf(outfile, "%s", buf);
+  }
+
+  // Flush the output stream of this context.
+  void do_flush() { fflush(outfile); }
+
+  std::vector<std::string> log_levels;  // level names, indexed by level
+  std::string ctx_name;                 // name printed in every message
+  int log_level_setting;                // weakest level that is still printed
+  char* whitelist;                      // not owned; may be null
+  char* blacklist;                      // not owned; may be null
+  FILE* outfile;                        // stderr, stdout or an opened file
+};
+
+}  // namespace log
+}  // namespace lct
+
+// Create a logging context from a C array of `count` level names and the
+// given settings. The returned handle is released with LCT_log_ctx_free().
+LCT_log_ctx_t LCT_log_ctx_alloc(const char* const log_levels_[], int count,
+                                int default_log_level, const char* ctx_name,
+                                char* filename, char* log_level_str,
+                                char* whitelist, char* blacklist)
+{
+  // Convert the C string array into the vector the context stores.
+  std::vector<std::string> level_names;
+  level_names.reserve(count);
+  for (int idx = 0; idx < count; ++idx) {
+    level_names.emplace_back(log_levels_[idx]);
+  }
+  return new lct::log::ctx_t(level_names, ctx_name, default_log_level,
+                             filename, log_level_str, whitelist, blacklist);
+}
+
+// Destroy the context referenced by *log_ctx_p and reset the handle to null.
+// A null pointer or a null handle is reported and otherwise ignored.
+void LCT_log_ctx_free(LCT_log_ctx_t* log_ctx_p)
+{
+  if (LCT_unlikely(log_ctx_p == nullptr || *log_ctx_p == nullptr)) {
+    if (LCT_log_ctx_default)
+      LCT_Warn(LCT_log_ctx_default, "LCT_log_ctx_free: Invalid log context!\n");
+    else
+      fprintf(stderr, "LCT_log_ctx_free: LCT_log_ctx_default is invalid!\n");
+    // Nothing to free; returning here avoids dereferencing a null log_ctx_p.
+    return;
+  }
+  auto* ctx = static_cast<lct::log::ctx_t*>(*log_ctx_p);
+  delete ctx;
+  *log_ctx_p = nullptr;
+}
+
+// Return the level threshold configured for log_ctx. For a null context a
+// diagnostic is emitted and -1 (below every valid level index) is returned.
+int LCT_log_get_level(LCT_log_ctx_t log_ctx)
+{
+  if (LCT_unlikely(log_ctx == nullptr)) {
+    if (LCT_log_ctx_default)
+      LCT_Warn(LCT_log_ctx_default,
+               "LCT_log_get_level: Invalid log context!\n");
+    else
+      fprintf(stderr, "LCT_log_get_level: LCT_log_ctx_default is invalid!\n");
+    // There is no context to read from; do not dereference the null handle.
+    return -1;
+  }
+  auto* ctx = static_cast<lct::log::ctx_t*>(log_ctx);
+  return ctx->log_level_setting;
+}
+
+// Backend of the assertion macro: when expr is zero, print the formatted
+// message through the context and abort. With a null context the failure is
+// still fatal, but it is reported directly on stderr.
+void LCT_Assert_(LCT_log_ctx_t log_ctx, const char* expr_str, uint64_t expr,
+                 const char* file, const char* func, int line,
+                 const char* format, ...)
+{
+  if (LCT_unlikely(log_ctx == nullptr)) {
+    if (LCT_log_ctx_default)
+      LCT_Warn(LCT_log_ctx_default, "LCT_Assert_: Invalid log context!\n");
+    else
+      fprintf(stderr, "LCT_Assert_: LCT_log_ctx_default is invalid! %s:%s:%d\n",
+              file, func, line);
+    // No context to delegate to. A failed assertion must still stop the
+    // program, so handle it here instead of calling through a null pointer.
+    if (!expr) {
+      fprintf(stderr, "%s:%s:%d: Assert failed: %s\n", file, func, line,
+              expr_str);
+      fflush(stderr);
+      abort();
+    }
+    return;
+  }
+  auto* ctx = static_cast<lct::log::ctx_t*>(log_ctx);
+  va_list vargs;
+  va_start(vargs, format);
+  ctx->do_assert(expr_str, expr, file, func, line, format, vargs);
+  va_end(vargs);
+}
+
+// va_list flavour of the logging backend: hands the message to the context,
+// which applies the level and tag filters. A null context is reported and the
+// message is dropped.
+void LCT_Logv_(LCT_log_ctx_t log_ctx, int log_level, const char* log_tag,
+               const char* file, const char* func, int line, const char* format,
+               va_list vargs)
+{
+  if (LCT_unlikely(log_ctx == nullptr)) {
+    if (LCT_log_ctx_default)
+      LCT_Warn(LCT_log_ctx_default, "LCT_Logv_: Invalid log context!\n");
+    else
+      fprintf(stderr, "LCT_Logv_: LCT_log_ctx_default is invalid! %s:%s:%d\n",
+              file, func, line);
+    // Without a context there is nowhere to log to; avoid the null call.
+    return;
+  }
+  auto* ctx = static_cast<lct::log::ctx_t*>(log_ctx);
+  ctx->do_log(log_level, log_tag, file, func, line, format, vargs);
+}
+
+// Variadic front end of LCT_Logv_: packs the trailing arguments into a
+// va_list and forwards everything unchanged.
+void LCT_Log_(LCT_log_ctx_t log_ctx, int log_level, const char* log_tag,
+              const char* file, const char* func, int line, const char* format,
+              ...)
+{
+  va_list ap;
+  va_start(ap, format);
+  LCT_Logv_(log_ctx, log_level, log_tag, file, func, line, format, ap);
+  va_end(ap);
+}
+
+// Flush the output stream of log_ctx. A null context is reported and ignored.
+void LCT_Log_flush(LCT_log_ctx_t log_ctx)
+{
+  if (LCT_unlikely(log_ctx == nullptr)) {
+    if (LCT_log_ctx_default)
+      LCT_Warn(LCT_log_ctx_default, "LCT_Log_flush: Invalid log context!\n");
+    else
+      fprintf(stderr, "LCT_Log_flush: LCT_log_ctx_default is invalid!\n");
+    // Nothing to flush; do not call through the null handle.
+    return;
+  }
+  auto* ctx = static_cast<lct::log::ctx_t*>(log_ctx);
+  ctx->do_flush();
+}
diff --git a/lct/pcounter/pcounter.cpp b/lct/pcounter/pcounter.cpp
new file mode 100644
index 00000000..1bfad9a3
--- /dev/null
+++ b/lct/pcounter/pcounter.cpp
@@ -0,0 +1,492 @@
+#include <utility>
+#include <vector>
+#include <string>
+#include <atomic>
+#include <mutex>
+#include <thread>
+#include <unistd.h>
+#include "lcti.hpp"
+
+namespace lct::pcounter
+{
+struct record_t;
+struct ctx_t;
+
+// magic number
+const int naccesses_per_record = 1000;
+
+// Running statistics over a stream of int64 samples: their sum, how many
+// there were, and the smallest and largest one.
+struct entry_t {
+  // Start empty; min/max begin at the opposite extremes so that the first
+  // sample replaces both.
+  entry_t() : total(0), count(0), min(INT64_MAX), max(INT64_MIN) {}
+
+  // Account for a single sample.
+  void add(int64_t val)
+  {
+    total += val;
+    count += 1;
+    if (val < min) min = val;
+    if (val > max) max = val;
+  }
+
+  // Fold the statistics gathered in `other` into this entry.
+  void merge(entry_t other)
+  {
+    total += other.total;
+    count += other.count;
+    if (other.min < min) min = other.min;
+    if (other.max > max) max = other.max;
+  }
+
+  int64_t total;
+  int64_t count;
+  int64_t min;
+  int64_t max;
+};
+
+// One snapshot: a timestamp together with a copy of the entries captured at
+// that time.
+struct record_t {
+  // time_: timestamp of the snapshot; entries_: entries to copy into it.
+  explicit record_t(LCT_time_t time_, const std::vector<entry_t>& entries_)
+      : time(time_), entries(entries_)
+  {
+  }
+  LCT_time_t time;
+  std::vector<entry_t> entries;
+};
+
+// Accumulates durations into an entry_t, either from start()/end() pairs or
+// from durations passed directly to add().
+struct timer_t {
+  timer_t() : consecutive_start(false), start_time(0), start_count(0) {}
+  // Record a start timestamp. The timestamp is added to start_time rather
+  // than assigned; a start while another one is still pending (start_count
+  // != 0) sets consecutive_start. Always returns true.
+  bool start(LCT_time_t time)
+  {
+    if (start_count != 0) consecutive_start = true;
+    start_time += time;
+    ++start_count;
+    return true;
+  }
+  // Record an end timestamp: adds (time - start_time) as one sample, then
+  // resets start_time to 0 and decrements start_count.
+  // NOTE(review): there is no check that a start() is pending - confirm that
+  // callers always pair start() and end().
+  void end(LCT_time_t time)
+  {
+    entry.add(static_cast<int64_t>(time - start_time));
+    start_time = 0;
+    --start_count;
+  }
+  // Add an already measured duration as one sample.
+  void add(LCT_time_t time) { entry.add(static_cast<int64_t>(time)); }
+  // Return a copy of the accumulated entry with total passed through
+  // LCT_time_to_ns. When overlapping starts were seen, min and max are
+  // reported as -1 because they are not meaningful.
+  [[nodiscard]] entry_t get() const
+  {
+    entry_t ret = entry;
+    if (consecutive_start) {
+      // min and max is not valid
+      ret.min = -1;
+      ret.max = -1;
+    }
+    ret.total = static_cast<int64_t>(LCT_time_to_ns(ret.total));
+    return ret;
+  }
+  bool consecutive_start;  // set once a start() arrived while one was pending
+  LCT_time_t start_time;   // sum of the pending start timestamps
+  int64_t start_count;     // number of start() calls not yet matched by end()
+  entry_t entry;           // accumulated samples
+};
+
+struct ctx_t;
+
+// The counters, trends and timers accumulated by one thread for one
+// performance-counter context. Every accessor takes `lock`, so another
+// thread can copy the data while the owner keeps updating it.
+struct tls_ctx_t {
+  // name_ is only used in error messages; the *_names_ vectors determine the
+  // initial number of slots of each kind and are copied.
+  // The initializer list follows the member declaration order.
+  explicit tls_ctx_t(std::string name_,
+                     const std::vector<std::string>& counter_names_,
+                     const std::vector<std::string>& trend_names_,
+                     const std::vector<std::string>& timer_names_)
+      : counters(counter_names_.size()),
+        trends(trend_names_.size()),
+        timers(timer_names_.size()),
+        counter_names(counter_names_),
+        trend_names(trend_names_),
+        timer_names(timer_names_),
+        name(std::move(name_))
+  {
+  }
+
+  // Add val to the slot selected by handle, growing the matching vector when
+  // handle.idx is beyond its current size.
+  // Throws std::runtime_error for an unexpected handle type.
+  void add(LCT_pcounter_handle_t handle, int64_t val)
+  {
+    // RAII guard: the lock is released on every exit path, including the
+    // throw below (a manual unlock would be skipped and deadlock later).
+    std::lock_guard<spinlock_t> guard(lock);
+    switch (handle.type) {
+      case LCT_PCOUNTER_COUNTER:
+        if (handle.idx >= counters.size()) {
+          counters.resize(handle.idx + 1);
+        }
+        counters[handle.idx].add(val);
+        break;
+      case LCT_PCOUNTER_TREND:
+        if (handle.idx >= trends.size()) {
+          trends.resize(handle.idx + 1);
+        }
+        trends[handle.idx].add(val);
+        break;
+      case LCT_PCOUNTER_TIMER:
+        if (handle.idx >= timers.size()) {
+          timers.resize(handle.idx + 1);
+        }
+        timers[handle.idx].add(val);
+        break;
+      default:
+        throw std::runtime_error("add: unexpected type! " + name +
+                                 ", type: " + std::to_string(handle.type) +
+                                 ", idx: " + std::to_string(handle.idx));
+    }
+  }
+
+  // Record a start timestamp on the timer selected by handle.
+  // Throws std::runtime_error when handle is not a timer handle.
+  void start(LCT_pcounter_handle_t handle, LCT_time_t time)
+  {
+    std::lock_guard<spinlock_t> guard(lock);
+    if (handle.type != LCT_PCOUNTER_TIMER)
+      throw std::runtime_error("start: unexpected type! " + name +
+                               ", type: " + std::to_string(handle.type) + ", " +
+                               std::to_string(handle.idx));
+    if (handle.idx >= timers.size()) {
+      timers.resize(handle.idx + 1);
+    }
+    timers[handle.idx].start(time);
+  }
+
+  // Record an end timestamp on the timer selected by handle.
+  // Throws std::runtime_error when handle is not a timer handle.
+  void end(LCT_pcounter_handle_t handle, LCT_time_t time)
+  {
+    std::lock_guard<spinlock_t> guard(lock);
+    if (handle.type != LCT_PCOUNTER_TIMER)
+      throw std::runtime_error("end: unexpected type! " + name + " " +
+                               std::to_string(handle.type) + " " +
+                               std::to_string(handle.idx));
+    if (handle.idx >= timers.size()) {
+      timers.resize(handle.idx + 1);
+    }
+    timers[handle.idx].end(time);
+  }
+
+  // Return a copy of the counters taken under the lock.
+  std::vector<entry_t> get_counters()
+  {
+    std::lock_guard<spinlock_t> guard(lock);
+    return counters;
+  }
+
+  // Return a copy of the trends taken under the lock.
+  std::vector<entry_t> get_trends()
+  {
+    std::lock_guard<spinlock_t> guard(lock);
+    return trends;
+  }
+
+  // Return a copy of the timers taken under the lock.
+  std::vector<timer_t> get_timers()
+  {
+    std::lock_guard<spinlock_t> guard(lock);
+    return timers;
+  }
+
+  std::vector<entry_t> counters;
+  std::vector<entry_t> trends;
+  std::vector<timer_t> timers;
+  std::vector<std::string> counter_names;
+  std::vector<std::string> trend_names;
+  std::vector<std::string> timer_names;
+  spinlock_t lock;
+  std::string name;
+};
+
+thread_local std::vector<tls_ctx_t*> tls_ctxs;
+
+void record_thread_fn(ctx_t* ctx, uint64_t record_interval);
+
+// A performance-counter context. It owns the registered counter/trend/timer
+// names, the per-thread data of every thread that used the context and the
+// trend snapshots taken so far.
+struct ctx_t {
+  // Create a context called name_. When the environment variable
+  // LCT_PCOUNTER_RECORD_INTERVAL holds a positive number, a helper thread is
+  // started that requests a new trend snapshot at that interval (the value
+  // is passed to usleep).
+  // The initializer list follows the member declaration order.
+  explicit ctx_t(const char* name_)
+      : total_record_time(0),
+        total_initialize_time(0),
+        name(name_),
+        do_record(false),
+        record_thread(nullptr),
+        id(next_id++)
+  {
+    uint64_t record_interval = 0;
+    char* p = getenv("LCT_PCOUNTER_RECORD_INTERVAL");
+    if (p) {
+      record_interval = std::stoull(p);
+    }
+    if (record_interval > 0) {
+      keep_recording = true;
+      record_thread = new std::thread(record_thread_fn, this, record_interval);
+    }
+    // For now, we just assume there will only be a single thread initializing.
+    if (start_time == -1) {
+      start_time = LCT_now();
+    }
+  }
+
+  // Stop the helper thread, take a final snapshot, dump everything to the
+  // destination named by LCT_PCOUNTER_AUTO_DUMP (if set) and release the
+  // per-thread data.
+  ~ctx_t()
+  {
+    if (record_thread) {
+      keep_recording = false;
+      record_thread->join();
+      // The thread object was allocated with new in the constructor.
+      delete record_thread;
+      record_thread = nullptr;
+    }
+    record();
+    char* result = getenv("LCT_PCOUNTER_AUTO_DUMP");
+    if (result) {
+      FILE* fp;
+      if (strcmp(result, "stderr") == 0)
+        fp = stderr;
+      else if (strcmp(result, "stdout") == 0)
+        fp = stdout;
+      else {
+        std::string ofilename =
+            replaceOne(result, "%", std::to_string(LCT_get_rank()));
+        fp = fopen(ofilename.c_str(), "a");
+        if (fp == nullptr) {
+          fprintf(stderr, "Cannot open the logfile %s!\n", ofilename.c_str());
+        }
+      }
+      // fopen may have failed: never hand a null stream to dump or fclose.
+      if (fp != nullptr) {
+        dump(fp);
+        if (fp != stdout && fp != stderr) fclose(fp);
+      }
+    }
+    for (auto thread_ctx : thread_ctxs) {
+      delete thread_ctx;
+    }
+    thread_ctxs.clear();
+  }
+
+  // Register a new name of the given kind and return its index within that
+  // kind. Returns -1 for a type that is not a counter, trend or timer.
+  int register_counter(const char* name_, LCT_pcounter_type_t type)
+  {
+    // Defined result even when no case below matches.
+    int ret = -1;
+    switch (type) {
+      case LCT_PCOUNTER_COUNTER:
+        ret = static_cast<int>(counter_names.size());
+        counter_names.emplace_back(name_);
+        break;
+      case LCT_PCOUNTER_TREND:
+        ret = static_cast<int>(trend_names.size());
+        trend_names.emplace_back(name_);
+        break;
+      case LCT_PCOUNTER_TIMER:
+        ret = static_cast<int>(timer_names.size());
+        timer_names.emplace_back(name_);
+        break;
+      default:
+        break;
+    }
+    return ret;
+  }
+
+  // Make sure the calling thread has a tls_ctx_t for this context and that
+  // it is reachable both from the thread-local table (indexed by context id)
+  // and from thread_ctxs (indexed by thread id).
+  void initialize_tls_if_necessary()
+  {
+    if (LCT_unlikely(id >= tls_ctxs.size() || tls_ctxs[id] == nullptr)) {
+      auto start = LCT_now();
+      // we need to allocate a new tls_ctx
+      auto* tls_ctx_p =
+          new tls_ctx_t(name, counter_names, trend_names, timer_names);
+      if (id >= tls_ctxs.size()) tls_ctxs.resize(id + 1);
+      tls_ctxs[id] = tls_ctx_p;
+
+      lock.lock();
+      int thread_id = LCT_get_thread_id();
+      if (thread_id >= thread_ctxs.size()) {
+        thread_ctxs.resize(thread_id + 1, nullptr);
+      }
+      thread_ctxs[thread_id] = tls_ctx_p;
+      // Shared statistic: update it while the lock is still held so that
+      // concurrent first-time callers do not race on it.
+      total_initialize_time += LCT_now() - start;
+      lock.unlock();
+    }
+  }
+
+  // Add val to the entry selected by handle for the calling thread. A trend
+  // update also takes a snapshot when the helper thread has requested one.
+  void add(LCT_pcounter_handle_t handle, int64_t val)
+  {
+    if (handle.type == LCT_PCOUNTER_NONE) return;
+    initialize_tls_if_necessary();
+    tls_ctxs[id]->add(handle, val);
+    // check whether to record
+    if (handle.type == LCT_PCOUNTER_TREND && do_record) {
+      // Only the thread that flips the flag back to false takes the snapshot.
+      bool expected = true;
+      if (do_record.compare_exchange_weak(expected, false)) record();
+    }
+  }
+
+  // Record a start timestamp on the calling thread's timer.
+  void start(LCT_pcounter_handle_t handle, LCT_time_t time)
+  {
+    if (handle.type == LCT_PCOUNTER_NONE) return;
+    initialize_tls_if_necessary();
+    tls_ctxs[id]->start(handle, time);
+  }
+
+  // Record an end timestamp on the calling thread's timer.
+  void end(LCT_pcounter_handle_t handle, LCT_time_t time)
+  {
+    if (handle.type == LCT_PCOUNTER_NONE) return;
+    initialize_tls_if_necessary();
+    tls_ctxs[id]->end(handle, time);
+  }
+
+  // Take a snapshot: merge the trends of all threads and append the result,
+  // stamped with the current time, to `records`.
+  void record()
+  {
+    lock.lock();
+    auto start_record = LCT_now();
+    std::vector<entry_t> entries;
+    entries.resize(trend_names.size());
+    for (const auto& thread_ctx : thread_ctxs) {
+      if (thread_ctx == nullptr) continue;
+      auto tls_entries = thread_ctx->get_trends();
+      // Bounded by both sizes so a larger per-thread vector cannot index
+      // past the merged one.
+      for (size_t i = 0; i < tls_entries.size() && i < entries.size(); ++i) {
+        entries[i].merge(tls_entries[i]);
+      }
+    }
+    records.emplace_back(start_record, entries);
+    total_record_time += LCT_now() - start_record;
+    lock.unlock();
+  }
+
+  // Print one CSV line for an entry. Entries without samples are printed
+  // with empty total/average/min/max fields.
+  void print_entry(FILE* out, const std::string& type_name,
+                   LCT_time_t record_time, const std::string& entry_name,
+                   const entry_t& entry) const
+  {
+    if (entry.count > 0) {
+      fprintf(out, "pcounter,%s,%d,%s,%ld,%s,%ld,%ld,%ld,%ld,%ld\n",
+              type_name.c_str(), LCT_get_rank(), name.c_str(),
+              static_cast<int64_t>(LCT_time_to_us(record_time - start_time)),
+              entry_name.c_str(), entry.total, entry.count,
+              entry.total / entry.count, entry.min, entry.max);
+    } else {
+      fprintf(out, "pcounter,%s,%d,%s,%ld,%s,,0,,\n", type_name.c_str(),
+              LCT_get_rank(), name.c_str(),
+              static_cast<int64_t>(LCT_time_to_us(record_time - start_time)),
+              entry_name.c_str());
+    }
+  }
+
+  // Write a summary line, every recorded trend snapshot and the merged
+  // counters and timers of all threads to `out`.
+  void dump(FILE* out)
+  {
+    lock.lock();
+    fprintf(out,
+            "pcounter-summary: rank %d, name %s, "
+            "total records %lu, total record time %ld ns, "
+            "total init time %ld ns\n",
+            LCT_get_rank(), name.c_str(), records.size(),
+            static_cast<int64_t>(LCT_time_to_ns(total_record_time)),
+            static_cast<int64_t>(LCT_time_to_ns(total_initialize_time)));
+    // dump all records
+    for (const auto& record : records) {
+      for (size_t i = 0;
+           i < record.entries.size() && i < trend_names.size(); ++i) {
+        print_entry(out, "trend", record.time, trend_names[i],
+                    record.entries[i]);
+      }
+    }
+    // dump all counters and timers
+    std::vector<entry_t> counters;
+    std::vector<entry_t> timers;
+    counters.resize(counter_names.size());
+    timers.resize(timer_names.size());
+    for (const auto& thread_ctx : thread_ctxs) {
+      if (thread_ctx == nullptr) continue;
+      auto tls_counters = thread_ctx->get_counters();
+      for (size_t i = 0; i < tls_counters.size() && i < counters.size();
+           ++i) {
+        counters[i].merge(tls_counters[i]);
+      }
+      auto tls_timers = thread_ctx->get_timers();
+      for (size_t i = 0; i < tls_timers.size() && i < timers.size(); ++i) {
+        timers[i].merge(tls_timers[i].get());
+      }
+    }
+    LCT_time_t now = LCT_now();
+    for (size_t i = 0; i < counters.size(); ++i) {
+      print_entry(out, "counter", now, counter_names[i], counters[i]);
+    }
+    for (size_t i = 0; i < timers.size(); ++i) {
+      print_entry(out, "timer", now, timer_names[i], timers[i]);
+    }
+    lock.unlock();
+  }
+
+  std::vector<std::string> counter_names;
+  std::vector<std::string> trend_names;
+  std::vector<std::string> timer_names;
+  std::vector<tls_ctx_t*> thread_ctxs;  // indexed by LCT thread id; owned
+  std::vector<record_t> records;        // trend snapshots taken so far
+  LCT_time_t total_record_time;
+  LCT_time_t total_initialize_time;
+  spinlock_t lock;
+  std::string name;
+  char padding0[LCT_CACHE_LINE];
+  std::atomic<bool> do_record;  // set by the helper thread, cleared by add()
+  char padding1[LCT_CACHE_LINE];
+  std::atomic<bool> keep_recording{};  // false tells the helper to exit
+  std::thread* record_thread;
+  int id;  // index of this context in each thread's tls_ctxs
+  static std::atomic<int> next_id;
+  static LCT_time_t start_time;
+};
+
+// Body of the helper thread: until the context clears keep_recording, raise
+// the do_record flag and then sleep for record_interval (passed to usleep).
+void record_thread_fn(ctx_t* ctx, uint64_t record_interval)
+{
+  while (ctx->keep_recording.load()) {
+    ctx->do_record.store(true);
+    usleep(record_interval);
+  }
+}
+std::atomic<int> ctx_t::next_id(0);
+LCT_time_t ctx_t::start_time = -1;
+}  // namespace lct::pcounter
+
+// Create a performance-counter context named ctx_name and return it as an
+// opaque handle; release it with LCT_pcounter_ctx_free().
+LCT_pcounter_ctx_t LCT_pcounter_ctx_alloc(const char* ctx_name)
+{
+  return new lct::pcounter::ctx_t(ctx_name);
+}
+
+// Destroy the context behind *pcounter_ctx and reset the handle to null.
+void LCT_pcounter_ctx_free(LCT_pcounter_ctx_t* pcounter_ctx)
+{
+  delete static_cast<lct::pcounter::ctx_t*>(*pcounter_ctx);
+  *pcounter_ctx = nullptr;
+}
+
+// Register `name` as a counter, trend or timer of the context and return the
+// handle (type plus index) used by the add/start/end calls.
+LCT_pcounter_handle_t LCT_pcounter_register(LCT_pcounter_ctx_t pcounter_ctx,
+                                            const char* name,
+                                            LCT_pcounter_type_t type)
+{
+  auto& pctx = *static_cast<lct::pcounter::ctx_t*>(pcounter_ctx);
+  return {type, pctx.register_counter(name, type)};
+}
+
+// Add val to the entry identified by handle on behalf of the calling thread.
+void LCT_pcounter_add(LCT_pcounter_ctx_t pcounter_ctx,
+                      LCT_pcounter_handle_t handle, int64_t val)
+{
+  static_cast<lct::pcounter::ctx_t*>(pcounter_ctx)->add(handle, val);
+}
+
+// Start the timer identified by handle, using the current time.
+void LCT_pcounter_start(LCT_pcounter_ctx_t pcounter_ctx,
+                        LCT_pcounter_handle_t handle)
+{
+  auto* pctx = static_cast<lct::pcounter::ctx_t*>(pcounter_ctx);
+  const LCT_time_t now = LCT_now();
+  pctx->start(handle, now);
+}
+
+// Stop the timer identified by handle, using the current time.
+void LCT_pcounter_end(LCT_pcounter_ctx_t pcounter_ctx,
+                      LCT_pcounter_handle_t handle)
+{
+  auto* pctx = static_cast<lct::pcounter::ctx_t*>(pcounter_ctx);
+  const LCT_time_t now = LCT_now();
+  pctx->end(handle, now);
+}
+
+// Start the timer identified by handle with a caller-supplied timestamp.
+void LCT_pcounter_startt(LCT_pcounter_ctx_t pcounter_ctx,
+                         LCT_pcounter_handle_t handle, LCT_time_t time)
+{
+  static_cast<lct::pcounter::ctx_t*>(pcounter_ctx)->start(handle, time);
+}
+
+// Stop the timer identified by handle with a caller-supplied timestamp.
+void LCT_pcounter_endt(LCT_pcounter_ctx_t pcounter_ctx,
+                       LCT_pcounter_handle_t handle, LCT_time_t time)
+{
+  static_cast<lct::pcounter::ctx_t*>(pcounter_ctx)->end(handle, time);
+}
+
+// Take a trend snapshot of the context right now.
+void LCT_pcounter_record(LCT_pcounter_ctx_t pcounter_ctx)
+{
+  static_cast<lct::pcounter::ctx_t*>(pcounter_ctx)->record();
+}
+
+// Write the summary, snapshots, counters and timers of the context to `out`.
+void LCT_pcounter_dump(LCT_pcounter_ctx_t pcounter_ctx, FILE* out)
+{
+  static_cast<lct::pcounter::ctx_t*>(pcounter_ctx)->dump(out);
+}
\ No newline at end of file
diff --git a/lct/util/mem.hpp b/lct/util/mem.hpp
new file mode 100644
index 00000000..5915b73c
--- /dev/null
+++ b/lct/util/mem.hpp
@@ -0,0 +1,61 @@
+#ifndef LCT_MEM_HPP
+#define LCT_MEM_HPP
+
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+
+/* Memory Utility Functions */
+
+// Allocate `size` bytes aligned to `alignment` with posix_memalign.
+// Returns the block (release it with LCTI_free), or nullptr if the
+// allocation fails in a build where the assert is compiled out.
+static inline void* LCTI_memalign(size_t alignment, size_t size)
+{
+  void* p_ptr = nullptr;
+  int ret = posix_memalign(&p_ptr, alignment, size);
+  assert(ret == 0);
+  // With NDEBUG the assert above disappears; never return a pointer that
+  // posix_memalign did not set.
+  if (ret != 0) return nullptr;
+  return p_ptr;
+}
+// Release a block with free().
+static inline void LCTI_free(void* ptr) { free(ptr); }
+
+#ifdef LCTI_CONFIG_USE_ALIGNED_ALLOC
+
+// Cache-line aligned allocation. The request is padded to the smallest
+// multiple of LCT_CACHE_LINE that is greater than or equal to `size`, so the
+// block shares no cache line with another one (avoids false sharing).
+static inline void* LCTI_malloc(size_t size)
+{
+  const size_t line_mask = LCT_CACHE_LINE - 1;
+  const size_t padded_size = (size + line_mask) & ~line_mask;
+  return LCTI_memalign(LCT_CACHE_LINE, padded_size);
+}
+
+// Cache-line aligned, zero-initialized allocation of num * size bytes.
+// Returns nullptr when num * size overflows size_t or the allocation fails.
+static inline void* LCTI_calloc(size_t num, size_t size)
+{
+  // Reject a wrapped product instead of allocating a too-small block.
+  if (size != 0 && num > static_cast<size_t>(-1) / size) return nullptr;
+  const size_t nbytes = num * size;
+  void* ptr = LCTI_malloc(nbytes);
+  // memset must not be handed a null pointer.
+  if (ptr != nullptr) memset(ptr, 0, nbytes);
+  return ptr;
+}
+
+// Cache-line aligned replacement for realloc: allocate a new block, copy
+// min(old_size, new_size) bytes and free the old block. A null `ptr` behaves
+// like a plain allocation. If the new allocation fails, nullptr is returned
+// and the old block is left untouched.
+static inline void* LCTI_realloc(void* ptr, size_t old_size, size_t new_size)
+{
+  void* new_ptr = LCTI_malloc(new_size);
+  if (new_ptr == nullptr) return nullptr;
+  if (ptr != nullptr) {
+    // memcpy requires valid pointers even for a length of zero.
+    memcpy(new_ptr, ptr, (old_size < new_size) ? old_size : new_size);
+    LCTI_free(ptr);
+  }
+  return new_ptr;
+}
+
+#else /* LCTI_CONFIG_USE_ALIGNED_ALLOC */
+
+// Unaligned variant: plain malloc().
+static inline void* LCTI_malloc(size_t size) { return malloc(size); }
+
+// Unaligned variant: plain calloc().
+static inline void* LCTI_calloc(size_t num, size_t size)
+{
+  return calloc(num, size);
+}
+
+// Unaligned variant: plain realloc(). old_size is accepted only so that the
+// signature matches the aligned variant; it is not used here.
+static inline void* LCTI_realloc(void* ptr, size_t old_size, size_t new_size)
+{
+  (void)old_size;
+  return realloc(ptr, new_size);
+}
+
+#endif /* !LCTI_CONFIG_USE_ALIGNED_ALLOC */
+#endif /* LCT_MEM_HPP */
diff --git a/lct/util/misc.hpp b/lct/util/misc.hpp
new file mode 100644
index 00000000..64102a39
--- /dev/null
+++ b/lct/util/misc.hpp
@@ -0,0 +1,37 @@
+#ifndef LCI_MISC_HPP
+#define LCI_MISC_HPP
+
+#include <string>
+
+namespace lct
+{
+// string
+// Return a copy of `in` whose first occurrence of `from` is replaced by
+// `to`. The copy is unchanged when `from` is empty or does not occur.
+static inline std::string replaceOne(const std::string& in,
+                                     const std::string& from,
+                                     const std::string& to)
+{
+  std::string result(in);
+  const size_t hit = from.empty() ? std::string::npos : result.find(from);
+  if (hit != std::string::npos) {
+    result.replace(hit, from.length(), to);
+  }
+  return result;
+}
+
+// Return a copy of `in` in which every occurrence of `from` is replaced by
+// `to`. The copy is unchanged when `from` is empty. Text inserted by a
+// replacement is never searched again.
+// `in` is taken by const reference, like replaceOne, so the input is copied
+// once instead of twice.
+static inline std::string replaceAll(const std::string& in,
+                                     const std::string& from,
+                                     const std::string& to)
+{
+  std::string str(in);
+  if (from.empty()) return str;
+  size_t start_pos = 0;
+  while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
+    str.replace(start_pos, from.length(), to);
+    start_pos += to.length();  // In case 'to' contains 'from', like replacing
+                               // 'x' with 'yx'
+  }
+  return str;
+}
+}  // namespace lct
+
+#endif  // LCI_MISC_HPP
diff --git a/lct/util/spinlock.hpp b/lct/util/spinlock.hpp
new file mode 100644
index 00000000..9d0c001b
--- /dev/null
+++ b/lct/util/spinlock.hpp
@@ -0,0 +1,26 @@
+#ifndef LCI_SPINLOCK_HPP
+#define LCI_SPINLOCK_HPP
+
+#include <pthread.h>
+
+namespace lct
+{
+// Thin wrapper around a process-private pthread spinlock. It provides
+// lock()/unlock()/try_lock() and initializes/destroys the underlying lock in
+// its constructor/destructor.
+// NOTE(review): the class is implicitly copyable, which would copy the raw
+// pthread_spinlock_t and destroy it twice - confirm that no user copies it,
+// or delete the copy operations.
+class spinlock_t
+{
+ public:
+  spinlock_t() { pthread_spin_init(&l, PTHREAD_PROCESS_PRIVATE); }
+
+  ~spinlock_t() { pthread_spin_destroy(&l); }
+
+  // Returns true when the lock was acquired without waiting.
+  bool try_lock() { return pthread_spin_trylock(&l) == 0; }
+
+  void lock() { pthread_spin_lock(&l); }
+
+  void unlock() { pthread_spin_unlock(&l); }
+
+ private:
+  pthread_spinlock_t l;
+};
+}  // namespace lct
+
+#endif  // LCI_SPINLOCK_HPP
diff --git a/lct/util/string.cpp b/lct/util/string.cpp
new file mode 100644
index 00000000..8538a32a
--- /dev/null
+++ b/lct/util/string.cpp
@@ -0,0 +1,36 @@
+#include "lcti.hpp"
+
+// C wrapper around lct::replaceOne. The returned pointer refers to a
+// per-thread buffer and stays valid until the same thread calls this
+// function again.
+const char* LCT_str_replace_one(const char* in, const char* from,
+                                const char* to)
+{
+  // Assign on every call: a `static` variable initialized with the result
+  // would be computed only once, and every later call would return the
+  // first call's result.
+  static thread_local std::string str;
+  str = lct::replaceOne(in, from, to);
+  return str.c_str();
+}
+
+// C wrapper around lct::replaceAll. The returned pointer refers to a
+// per-thread buffer and stays valid until the same thread calls this
+// function again.
+const char* LCT_str_replace_all(const char* in, const char* from,
+                                const char* to)
+{
+  // Assign on every call: a `static` variable initialized with the result
+  // would be computed only once, and every later call would return the
+  // first call's result.
+  static thread_local std::string str;
+  str = lct::replaceAll(in, from, to);
+  return str.c_str();
+}
+
+// Look `key` up in the first `count` entries of `dict`. A null key matches
+// only an entry whose key is null as well; other keys are compared with
+// strcmp. On a hit, *val receives the entry's value and true is returned;
+// otherwise *val receives default_val and false is returned.
+int LCT_str_int_search(LCT_dict_str_int_t dict[], int count, const char* key,
+                       int default_val, int* val)
+{
+  for (int idx = 0; idx < count; ++idx) {
+    const char* candidate = dict[idx].key;
+    bool found;
+    if (key != nullptr && candidate != nullptr) {
+      found = strcmp(key, candidate) == 0;
+    } else {
+      // At least one side is null: equal only when both are null.
+      found = key == nullptr && candidate == nullptr;
+    }
+    if (!found) continue;
+    *val = dict[idx].val;
+    return true;
+  }
+  *val = default_val;
+  return false;
+}
\ No newline at end of file
diff --git a/lct/util/thread.cpp b/lct/util/thread.cpp
new file mode 100644
index 00000000..3498257b
--- /dev/null
+++ b/lct/util/thread.cpp
@@ -0,0 +1,16 @@
+#include <atomic>
+#include "lcti.hpp"
+
+std::atomic<int> LCT_nthreads(0);
+__thread int LCT_thread_id = -1;
+
+/* Return the id of the calling thread. An id is handed out from the
+ * LCT_nthreads counter the first time a thread asks for it and is then kept
+ * in the thread-local LCT_thread_id. */
+int LCT_get_thread_id()
+{
+  int& my_id = LCT_thread_id;
+  if (LCT_unlikely(my_id == -1))
+    my_id = LCT_nthreads.fetch_add(1, std::memory_order_relaxed);
+  return my_id;
+}
+
+// Return the current value of LCT_nthreads, i.e. how many thread ids have
+// been handed out by LCT_get_thread_id() so far.
+int LCT_get_nthreads() { return LCT_nthreads.load(std::memory_order_relaxed); }
\ No newline at end of file
diff --git a/lct/util/time.cpp b/lct/util/time.cpp
new file mode 100644
index 00000000..8906ab39
--- /dev/null
+++ b/lct/util/time.cpp
@@ -0,0 +1,35 @@
+#include <unistd.h>
+#include <cstdio>
+#include "lcti.hpp"
+
+// Read CLOCK_MONOTONIC and return it as a single nanosecond count. Aborts
+// the process when the clock cannot be read.
+LCT_time_t LCT_now()
+{
+  struct timespec ts;
+  if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
+    fprintf(stderr, "Cannot get time!\n");
+    abort();
+  }
+  return ts.tv_sec * long(1e9) + ts.tv_nsec;
+}
+
+// Convert an LCT_time_t value to nanoseconds (the value itself, as double).
+double LCT_time_to_ns(LCT_time_t time) { return (double)time; }
+
+// Convert an LCT_time_t value to microseconds (value / 1e3).
+double LCT_time_to_us(LCT_time_t time) { return (double)time / 1e3; }
+
+// Convert an LCT_time_t value to milliseconds (value / 1e6).
+double LCT_time_to_ms(LCT_time_t time) { return (double)time / 1e6; }
+
+// Convert an LCT_time_t value to seconds (value / 1e9).
+double LCT_time_to_s(LCT_time_t time) { return (double)time / 1e9; }
+
+// LCT_time_t LCT_now() { return LCII_ucs_get_time(); }
+//
+// double LCT_time_to_ns(LCT_time_t time) { return LCII_ucs_time_to_nsec(time);
+// }
+//
+// double LCT_time_to_us(LCT_time_t time) { return LCII_ucs_time_to_usec(time);
+// }
+//
+// double LCT_time_to_ms(LCT_time_t time) { return LCII_ucs_time_to_msec(time);
+// }
+//
+// double LCT_time_to_s(LCT_time_t time) { return LCII_ucs_time_to_sec(time); }
\ No newline at end of file
diff --git a/src/datastructure/lcm_aqueue.h b/src/datastructure/lcm_aqueue.h
deleted file mode 100644
index b4c6e9ea..00000000
--- a/src/datastructure/lcm_aqueue.h
+++ /dev/null
@@ -1,146 +0,0 @@
-#ifndef LCI_LCM_AQUEUE_H
-#define LCI_LCM_AQUEUE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct LCM_aqueue_entry_t {
-  void* data;
-  LCIU_CACHE_PADDING(sizeof(void*));
-} LCM_aqueue_entry_t;
-
-typedef struct LCM_aqueue_t {
-  atomic_uint_fast64_t top;   // point to the next entry that is empty
-  atomic_uint_fast64_t top2;  // point to the last entry that is full
-  LCIU_CACHE_PADDING(2 * sizeof(atomic_uint_fast64_t));
-  atomic_uint_fast64_t bot;   // point to the fist entry that is full
-  atomic_uint_fast64_t bot2;  // point to the last entry that is empty
-  LCIU_CACHE_PADDING(2 * sizeof(atomic_uint_fast64_t));
-  uint_fast64_t length;
-  struct LCM_aqueue_entry_t* container;  // a pointer to type void*
-} LCM_aqueue_t;
-
-// The following functions are not thread-safe
-static inline void LCM_aqueue_init(LCM_aqueue_t* queue, uint_fast64_t capacity);
-static inline void LCM_aqueue_fina(LCM_aqueue_t* queue);
-// The following functions are thread-safe
-static inline void LCM_aqueue_push(LCM_aqueue_t* queue, void* val);
-static inline void* LCM_aqueue_pop(LCM_aqueue_t* queue);
-
-#ifdef __cplusplus
-}
-#endif
-
-static inline void LCM_aqueue_init(LCM_aqueue_t* queue, uint_fast64_t capacity)
-{
-  LCM_Assert(sizeof(LCM_aqueue_entry_t) == LCI_CACHE_LINE,
-             "Unexpected sizeof(LCM_aqueue_entry_t) %lu\n",
-             sizeof(LCM_aqueue_entry_t));
-  queue->container = LCIU_memalign(LCI_CACHE_LINE,
-                                   (capacity + 1) * sizeof(LCM_aqueue_entry_t));
-  atomic_init(&queue->top, 0);
-  atomic_init(&queue->top2, 0);
-  atomic_init(&queue->bot, 0);
-  atomic_init(&queue->bot2, 0);
-  queue->length = capacity + 1;
-#ifdef LCI_DEBUG
-  for (int i = 0; i < queue->length; ++i) {
-    queue->container[i].data = NULL;
-  }
-#endif
-  atomic_thread_fence(LCIU_memory_order_seq_cst);
-}
-
-static inline void LCM_aqueue_fina(LCM_aqueue_t* queue)
-{
-  atomic_thread_fence(LCIU_memory_order_seq_cst);
-  LCIU_free(queue->container);
-  queue->container = NULL;
-  atomic_init(&queue->top, 0);
-  atomic_init(&queue->top2, 0);
-  atomic_init(&queue->bot, 0);
-  atomic_init(&queue->bot2, 0);
-  queue->length = 0;
-}
-
-static inline void LCM_aqueue_push(LCM_aqueue_t* queue, void* val)
-{
-  uint_fast64_t current_bot2 =
-      atomic_load_explicit(&queue->bot2, LCIU_memory_order_acquire);
-  // reserve a slot to write
-  uint_fast64_t current_top =
-      atomic_fetch_add_explicit(&queue->top, 1, LCIU_memory_order_relaxed);
-  if (current_top - current_bot2 > queue->length - 1) {
-    LCM_Assert(false, "The atomic queue is full! %lu - %lu > %lu\n",
-               current_top, current_bot2, queue->length - 1);
-  }
-  // write to the slot
-  LCM_DBG_Assert(queue->container[current_top % queue->length].data == NULL,
-                 "wrote to a nonempty value!\n");
-  queue->container[current_top % queue->length].data = val;
-  // update top2 to tell the consumers they can safely read this slot.
-  while (true) {
-    uint_fast64_t expected = current_top;
-    _Bool succeed = atomic_compare_exchange_weak_explicit(
-        &queue->top2, &expected, current_top + 1, LCIU_memory_order_release,
-        LCIU_memory_order_relaxed);
-    if (succeed) {
-      // succeed!
-      break;
-    }
-  }
-}
-
-static inline void* LCM_aqueue_pop(LCM_aqueue_t* queue)
-{
-  uint_fast64_t current_top2 =
-      atomic_load_explicit(&queue->top2, LCIU_memory_order_acquire);
-  uint_fast64_t current_bot =
-      atomic_load_explicit(&queue->bot, LCIU_memory_order_relaxed);
-  if (current_top2 <= current_bot) {
-    // the queue is empty
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].lci_cq_pop_failed_empty++);
-    return NULL;
-  }
-  //  LCM_DBG_Assert(current_top2 > current_bot, "bot %lu is ahead of top2
-  //  %lu!\n", current_bot, current_top2);
-  uint_fast64_t expected = current_bot;
-  _Bool succeed = atomic_compare_exchange_strong_explicit(
-      &queue->bot, &expected, current_bot + 1, LCIU_memory_order_relaxed,
-      LCIU_memory_order_relaxed);
-  if (!succeed) {
-    // other thread is ahead of us
-    LCII_PCOUNTERS_WRAPPER(
-        LCII_pcounters[LCIU_get_thread_id()].lci_cq_pop_failed_contention++);
-    return NULL;
-  }
-  // we have successfully reserve an entry
-  //  __sync_synchronize();
-  void* result = queue->container[current_bot % queue->length].data;
-#ifdef LCI_DEBUG
-  queue->container[current_bot % queue->length].data = NULL;
-#endif
-  //  __sync_synchronize();
-  // now that we got the value, we can update bot2 to tell the producers they
-  // can safely write to this entry.
-  while (true) {
-    expected = current_bot;
-    succeed = atomic_compare_exchange_weak_explicit(
-        &queue->bot2, &expected, current_bot + 1, LCIU_memory_order_release,
-        LCIU_memory_order_relaxed);
-    if (succeed) {
-      // succeed!
-      break;
-    }
-  }
-  LCII_PCOUNTERS_WRAPPER(
-      LCII_pcounters[LCIU_get_thread_id()].lci_cq_pop_len_accumulated +=
-      current_top2 - current_bot);
-  LCII_PCOUNTERS_WRAPPER(
-      LCII_pcounters[LCIU_get_thread_id()].lci_cq_pop_succeeded++);
-  return result;
-}
-
-#endif  // LCI_LCM_AQUEUE_H
diff --git a/src/log/lcm_log.c b/src/log/lcm_log.c
deleted file mode 100644
index 61842137..00000000
--- a/src/log/lcm_log.c
+++ /dev/null
@@ -1,83 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <pthread.h>
-#include "lcm_log.h"
-
-LCM_API const char* const log_levels[] = {[LCM_LOG_WARN] = "warn",
-                                          [LCM_LOG_TRACE] = "trace",
-                                          [LCM_LOG_INFO] = "info",
-                                          [LCM_LOG_DEBUG] = "debug",
-                                          [LCM_LOG_MAX] = NULL};
-LCM_API int LCM_LOG_RANK;
-LCM_API int LCM_LOG_LEVEL = LCM_LOG_WARN;
-LCM_API char* LCM_LOG_whitelist_p = NULL;
-LCM_API char* LCM_LOG_blacklist_p = NULL;
-LCM_API FILE* LCM_LOG_OUTFILE = NULL;
-
-void LCM_Init(int rank)
-{
-  {
-    LCM_LOG_RANK = rank;
-    char* p = getenv("LCM_LOG_LEVEL");
-    if (p == NULL)
-      ;
-    else if (strcmp(p, "none") == 0 || strcmp(p, "NONE") == 0)
-      LCM_LOG_LEVEL = LCM_LOG_NONE;
-    else if (strcmp(p, "warn") == 0 || strcmp(p, "WARN") == 0)
-      LCM_LOG_LEVEL = LCM_LOG_WARN;
-    else if (strcmp(p, "trace") == 0 || strcmp(p, "TRACE") == 0)
-      LCM_LOG_LEVEL = LCM_LOG_TRACE;
-    else if (strcmp(p, "info") == 0 || strcmp(p, "INFO") == 0)
-      LCM_LOG_LEVEL = LCM_LOG_INFO;
-    else if (strcmp(p, "debug") == 0 || strcmp(p, "DEBUG") == 0)
-      LCM_LOG_LEVEL = LCM_LOG_DEBUG;
-    else if (strcmp(p, "max") == 0 || strcmp(p, "MAX") == 0)
-      LCM_LOG_LEVEL = LCM_LOG_MAX;
-    else
-      LCM_Log_default(
-          LCM_LOG_WARN,
-          "unknown env LCM_LOG_LEVEL (%s against "
-          "none|warn|trace|info|debug|max). use the default LCM_LOG_WARN.\n",
-          p);
-  }
-  LCM_LOG_whitelist_p = getenv("LCM_LOG_WHITELIST");
-  LCM_LOG_blacklist_p = getenv("LCM_LOG_BLACKLIST");
-  {
-    char* p = getenv("LCM_LOG_OUTFILE");
-    if (p == NULL || strcmp(p, "stderr") == 0)
-      LCM_LOG_OUTFILE = stderr;
-    else if (strcmp(p, "stdout") == 0)
-      LCM_LOG_OUTFILE = stdout;
-    else {
-      const int filename_max = 256;
-      char filename[filename_max];
-      char* p0_old = p;
-      char* p0_new = strchr(p, '%');
-      char* p1 = filename;
-      while (p0_new) {
-        long nbytes = p0_new - p0_old;
-        LCM_Assert(p1 + nbytes < filename + filename_max,
-                   "Filename is too long!\n");
-        memcpy(p1, p0_old, nbytes);
-        p1 += nbytes;
-        nbytes = snprintf(p1, filename + filename_max - p1, "%d", rank);
-        p1 += nbytes;
-        p0_old = p0_new + 1;
-        p0_new = strchr(p0_old, '%');
-      }
-      strncat(p1, p0_old, filename + filename_max - p1 - 1);
-      LCM_LOG_OUTFILE = fopen(filename, "w+");
-      if (LCM_LOG_OUTFILE == NULL) {
-        fprintf(stderr, "Cannot open the logfile %s!\n", filename);
-      }
-    }
-  }
-}
-
-void LCM_Fina()
-{
-  if (fclose(LCM_LOG_OUTFILE) != 0) {
-    fprintf(stderr, "The log file did not close successfully!\n");
-  }
-}
diff --git a/src/log/lcm_log.h b/src/log/lcm_log.h
deleted file mode 100644
index 3922cb36..00000000
--- a/src/log/lcm_log.h
+++ /dev/null
@@ -1,131 +0,0 @@
-#ifndef LCM_LOG_H_
-#define LCM_LOG_H_
-
-#include <stdarg.h>
-#include <unistd.h>
-#include <string.h>
-#include <stdint.h>
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#define LCM_API __attribute__((visibility("default")))
-
-#define LCM_Assert(Expr, ...) \
-  LCM_Assert_(#Expr, Expr, __FILE__, __func__, __LINE__, __VA_ARGS__)
-#define LCM_Log(log_level, log_type, ...) \
-  LCM_Log_(log_level, log_type, __FILE__, __func__, __LINE__, __VA_ARGS__)
-#define LCM_Log_default(log_level, ...) \
-  LCM_Log(log_level, "default", __VA_ARGS__)
-#define LCM_Warn(...) LCM_Log(LCM_LOG_WARN, "warn", __VA_ARGS__)
-
-#ifdef LCM_DEBUG
-#define LCM_DBG_Assert(...) LCM_Assert(__VA_ARGS__)
-#define LCM_DBG_Log(...) LCM_Log(__VA_ARGS__)
-#define LCM_DBG_Log_default(...) LCM_Log_default(__VA_ARGS__)
-#define LCM_DBG_Warn(...) LCM_Warn(__VA_ARGS__)
-#else
-#define LCM_DBG_Assert(...)
-#define LCM_DBG_Log(...)
-#define LCM_DBG_Log_default(...)
-#define LCM_DBG_Warn(...)
-#endif
-
-enum LCM_log_level_t {
-  LCM_LOG_NONE = 0,
-  LCM_LOG_WARN,
-  LCM_LOG_TRACE,
-  LCM_LOG_INFO,
-  LCM_LOG_DEBUG,
-  LCM_LOG_MAX
-};
-
-extern const char* const log_levels[LCM_LOG_MAX + 1];
-
-void LCM_Init(int rank);
-
-void LCM_Fina();
-
-static inline void LCM_Assert_(const char* expr_str, uint64_t expr,
-                               const char* file, const char* func, int line,
-                               const char* format, ...)
-    __attribute__((__format__(__printf__, 6, 7)));
-
-static inline void LCM_Log_(enum LCM_log_level_t log_level,
-                            const char* log_type, const char* file,
-                            const char* func, int line, const char* format, ...)
-    __attribute__((__format__(__printf__, 6, 7)));
-
-static inline void LCM_Log_flush();
-
-/* =============== Implementation ================*/
-
-extern int LCM_LOG_RANK;
-extern int LCM_LOG_LEVEL;
-extern char* LCM_LOG_whitelist_p;
-extern char* LCM_LOG_blacklist_p;
-extern FILE* LCM_LOG_OUTFILE;
-
-void LCM_Assert_(const char* expr_str, uint64_t expr, const char* file,
-                 const char* func, int line, const char* format, ...)
-{
-  if (expr) return;
-
-  char buf[1024];
-  int size;
-  va_list vargs;
-
-  size = snprintf(buf, sizeof(buf), "%d:%d:%s:%s:%d<Assert failed: %s> ",
-                  LCM_LOG_RANK, getpid(), file, func, line, expr_str);
-
-  va_start(vargs, format);
-  vsnprintf(buf + size, sizeof(buf) - size, format, vargs);
-  va_end(vargs);
-
-  fprintf(stderr, "%s", buf);
-  fflush(stderr);
-  abort();
-}
-
-void LCM_Log_(enum LCM_log_level_t log_level, const char* log_type,
-              const char* file, const char* func, int line, const char* format,
-              ...)
-{
-  char buf[2048];
-  int size;
-  va_list vargs;
-  LCM_Assert(log_level != LCM_LOG_NONE, "You should not use LCM_LOG_NONE!\n");
-  // if log_level is weaker than the configured log level, do nothing.
-  if (log_level > LCM_LOG_LEVEL) return;
-  // always show LCM_LOG_WARN message
-  if (log_level != LCM_LOG_WARN) {
-    // if whitelist is enabled and log_type is not include in the whitelist,
-    // do nothing.
-    if (LCM_LOG_whitelist_p != NULL &&
-        strstr(LCM_LOG_whitelist_p, log_type) == NULL)
-      return;
-    // if blacklist is enabled and log_type is not include in the blacklist,
-    // do nothing.
-    if (LCM_LOG_blacklist_p != NULL &&
-        strstr(LCM_LOG_blacklist_p, log_type) != NULL)
-      return;
-  }
-  // print the log
-  size = snprintf(buf, sizeof(buf), "%d:%d:%s:%s:%d<%s:%s> ", LCM_LOG_RANK,
-                  getpid(), file, func, line, log_levels[log_level], log_type);
-
-  va_start(vargs, format);
-  vsnprintf(buf + size, sizeof(buf) - size, format, vargs);
-  va_end(vargs);
-
-  fprintf(LCM_LOG_OUTFILE, "%s", buf);
-}
-
-static inline void LCM_Log_flush() { fflush(LCM_LOG_OUTFILE); }
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif  // LCM_LOG_H_
diff --git a/src/profile/performance_counter.c b/src/profile/performance_counter.c
deleted file mode 100644
index dc9717d4..00000000
--- a/src/profile/performance_counter.c
+++ /dev/null
@@ -1,203 +0,0 @@
-#include "runtime/lcii.h"
-
-LCII_pcounters_per_thread_t LCII_pcounters[LCI_PCOUNTER_MAX_NTHREADS]
-    __attribute__((aligned(LCI_CACHE_LINE)));
-
-void LCII_pcounters_init()
-{
-  memset(&LCII_pcounters, 0, sizeof(LCII_pcounters));
-}
-
-#define LCII_PCOUNTERS_FIELD_ADD(field) ret.field += LCII_pcounters[i].field
-#define LCII_PCOUNTERS_FIELD_MAX(field) \
-  LCIU_MAX_ASSIGN(ret.field, LCII_pcounters[i].field)
-#define LCII_PCOUNTERS_FIELD_AVE(ave, count)                       \
-  LCIU_update_average(&ret.ave, &ret.count, LCII_pcounters[i].ave, \
-                      LCII_pcounters[i].count)
-LCII_pcounters_per_thread_t LCII_pcounters_accumulate()
-{
-  LCII_pcounters_per_thread_t ret;
-  memset(&ret, 0, sizeof(ret));
-  for (int i = 0; i < LCIU_nthreads; ++i) {
-    LCII_PCOUNTERS_FIELD_ADD(msgs_tx);
-    LCII_PCOUNTERS_FIELD_ADD(bytes_tx);
-    LCII_PCOUNTERS_FIELD_ADD(msgs_rx);
-    LCII_PCOUNTERS_FIELD_ADD(bytes_rx);
-    LCII_PCOUNTERS_FIELD_ADD(msgs_2sided_tx);
-    LCII_PCOUNTERS_FIELD_ADD(msgs_2sided_rx);
-    LCII_PCOUNTERS_FIELD_ADD(msgs_1sided_tx);
-    LCII_PCOUNTERS_FIELD_ADD(msgs_1sided_rx);
-    LCII_PCOUNTERS_FIELD_ADD(packet_stealing);
-    LCII_PCOUNTERS_FIELD_ADD(send_lci_succeeded);
-    LCII_PCOUNTERS_FIELD_ADD(send_lci_failed_packet);
-    LCII_PCOUNTERS_FIELD_ADD(send_lci_failed_bq);
-    LCII_PCOUNTERS_FIELD_ADD(send_lci_failed_backend);
-    LCII_PCOUNTERS_FIELD_ADD(send_backend_failed_lock);
-    LCII_PCOUNTERS_FIELD_ADD(send_backend_failed_nomem);
-    LCII_PCOUNTERS_FIELD_ADD(lci_cq_pop_succeeded);
-    LCII_PCOUNTERS_FIELD_ADD(lci_cq_pop_failed_empty);
-    LCII_PCOUNTERS_FIELD_ADD(lci_cq_pop_failed_contention);
-    LCII_PCOUNTERS_FIELD_ADD(lci_cq_pop_len_accumulated);
-    LCII_PCOUNTERS_FIELD_ADD(progress_call);
-    LCII_PCOUNTERS_FIELD_ADD(progress_useful_call);
-    LCII_PCOUNTERS_FIELD_MAX(progress_useful_call_consecutive_max);
-    LCII_PCOUNTERS_FIELD_ADD(progress_useful_call_consecutive_sum);
-    LCII_PCOUNTERS_FIELD_ADD(recv_backend_no_packet);
-    LCII_PCOUNTERS_FIELD_ADD(backlog_queue_total_count);
-    LCII_PCOUNTERS_FIELD_ADD(backlog_queue_send_attempts);
-    LCII_PCOUNTERS_FIELD_MAX(backlog_queue_max_len);
-    LCII_PCOUNTERS_FIELD_ADD(hashtable_insert_num);
-    LCII_PCOUNTERS_FIELD_ADD(hashtable_walk_steps_total);
-    LCII_PCOUNTERS_FIELD_MAX(hashtable_walk_steps_max);
-    LCII_PCOUNTERS_FIELD_AVE(send_eager_latency_nsec_ave,
-                             send_eager_latency_nsec_count);
-    LCII_PCOUNTERS_FIELD_AVE(send_iovec_handshake_nsec_ave,
-                             send_iovec_handshake_nsec_count);
-    LCII_PCOUNTERS_FIELD_AVE(send_iovec_latency_nsec_ave,
-                             send_iovec_latency_nsec_count);
-    LCII_PCOUNTERS_FIELD_AVE(recv_iovec_handle_rts_nsec_ave,
-                             recv_iovec_handle_rts_nsec_count);
-    LCII_PCOUNTERS_FIELD_AVE(recv_iovec_latency_nsec_ave,
-                             recv_iovec_latency_nsec_count);
-  }
-  return ret;
-}
-
-#define LCII_PCOUNTERS_FIELD_KEEP(field) ret.field = c1.field
-#define LCII_PCOUNTERS_FIELD_DIFF(field) ret.field = (c1.field - c2.field)
-LCII_pcounters_per_thread_t LCII_pcounters_diff(LCII_pcounters_per_thread_t c1,
-                                                LCII_pcounters_per_thread_t c2)
-{
-  LCII_pcounters_per_thread_t ret;
-  memset(&ret, 0, sizeof(ret));
-  for (int i = 0; i < LCIU_nthreads; ++i) {
-    LCII_PCOUNTERS_FIELD_DIFF(msgs_tx);
-    LCII_PCOUNTERS_FIELD_DIFF(bytes_tx);
-    LCII_PCOUNTERS_FIELD_DIFF(msgs_rx);
-    LCII_PCOUNTERS_FIELD_DIFF(bytes_rx);
-    LCII_PCOUNTERS_FIELD_DIFF(msgs_2sided_tx);
-    LCII_PCOUNTERS_FIELD_DIFF(msgs_2sided_rx);
-    LCII_PCOUNTERS_FIELD_DIFF(msgs_1sided_tx);
-    LCII_PCOUNTERS_FIELD_DIFF(msgs_1sided_rx);
-    LCII_PCOUNTERS_FIELD_DIFF(packet_stealing);
-    LCII_PCOUNTERS_FIELD_DIFF(send_lci_succeeded);
-    LCII_PCOUNTERS_FIELD_DIFF(send_lci_failed_packet);
-    LCII_PCOUNTERS_FIELD_DIFF(send_lci_failed_bq);
-    LCII_PCOUNTERS_FIELD_DIFF(send_lci_failed_backend);
-    LCII_PCOUNTERS_FIELD_DIFF(send_backend_failed_lock);
-    LCII_PCOUNTERS_FIELD_DIFF(send_backend_failed_nomem);
-    LCII_PCOUNTERS_FIELD_DIFF(lci_cq_pop_succeeded);
-    LCII_PCOUNTERS_FIELD_DIFF(lci_cq_pop_failed_empty);
-    LCII_PCOUNTERS_FIELD_DIFF(lci_cq_pop_failed_contention);
-    LCII_PCOUNTERS_FIELD_DIFF(lci_cq_pop_len_accumulated);
-    LCII_PCOUNTERS_FIELD_DIFF(progress_call);
-    LCII_PCOUNTERS_FIELD_DIFF(progress_useful_call);
-    LCII_PCOUNTERS_FIELD_KEEP(progress_useful_call_consecutive_max);
-    LCII_PCOUNTERS_FIELD_DIFF(progress_useful_call_consecutive_sum);
-    LCII_PCOUNTERS_FIELD_DIFF(recv_backend_no_packet);
-    LCII_PCOUNTERS_FIELD_DIFF(backlog_queue_total_count);
-    LCII_PCOUNTERS_FIELD_DIFF(backlog_queue_send_attempts);
-    LCII_PCOUNTERS_FIELD_KEEP(backlog_queue_max_len);
-    LCII_PCOUNTERS_FIELD_DIFF(hashtable_insert_num);
-    LCII_PCOUNTERS_FIELD_DIFF(hashtable_walk_steps_total);
-    LCII_PCOUNTERS_FIELD_KEEP(hashtable_walk_steps_max);
-    LCII_PCOUNTERS_FIELD_KEEP(send_eager_latency_nsec_ave);
-    LCII_PCOUNTERS_FIELD_DIFF(send_eager_latency_nsec_count);
-    LCII_PCOUNTERS_FIELD_KEEP(send_iovec_handshake_nsec_ave);
-    LCII_PCOUNTERS_FIELD_DIFF(send_iovec_handshake_nsec_count);
-    LCII_PCOUNTERS_FIELD_KEEP(send_iovec_latency_nsec_ave);
-    LCII_PCOUNTERS_FIELD_DIFF(send_iovec_latency_nsec_count);
-    LCII_PCOUNTERS_FIELD_KEEP(recv_iovec_handle_rts_nsec_ave);
-    LCII_PCOUNTERS_FIELD_DIFF(recv_iovec_handle_rts_nsec_count);
-    LCII_PCOUNTERS_FIELD_KEEP(recv_iovec_latency_nsec_ave);
-    LCII_PCOUNTERS_FIELD_DIFF(recv_iovec_latency_nsec_count);
-  }
-  return ret;
-}
-
-#define LCII_PCOUNTERS_FIELD_TO_STRING(field, annotation)                     \
-  consumed += snprintf(buf + consumed, sizeof(buf) - consumed, "%d,%s,%ld\n", \
-                       LCI_RANK, annotation, pcounter.field);                 \
-  LCM_Assert(sizeof(buf) > consumed, "buffer overflowed!\n");
-char* LCII_pcounters_to_string(LCII_pcounters_per_thread_t pcounter)
-{
-  static char buf[2048];
-  size_t consumed = 0;
-  LCII_PCOUNTERS_FIELD_TO_STRING(msgs_tx, "Total message sent");
-  LCII_PCOUNTERS_FIELD_TO_STRING(msgs_2sided_tx, "2sided message sent");
-  LCII_PCOUNTERS_FIELD_TO_STRING(msgs_1sided_tx, "1sided message sent");
-  LCII_PCOUNTERS_FIELD_TO_STRING(bytes_tx, "Bytes sent");
-  LCII_PCOUNTERS_FIELD_TO_STRING(msgs_rx, "Total message recved");
-  LCII_PCOUNTERS_FIELD_TO_STRING(msgs_2sided_rx, "2sided message recved");
-  LCII_PCOUNTERS_FIELD_TO_STRING(msgs_1sided_rx, "1sided message recved");
-  LCII_PCOUNTERS_FIELD_TO_STRING(bytes_rx, "Bytes recved");
-  LCII_PCOUNTERS_FIELD_TO_STRING(packet_stealing, "Packet stealing attempts");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_lci_succeeded,
-                                 "LCI send attempts succeeded");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_lci_failed_packet,
-                                 "LCI send attempts failed due to no packet");
-  LCII_PCOUNTERS_FIELD_TO_STRING(
-      send_lci_failed_bq,
-      "LCI send attempts failed due to non-empty backlog queue");
-  LCII_PCOUNTERS_FIELD_TO_STRING(
-      send_lci_failed_backend,
-      "LCI send attempts failed due to failed backend send");
-  LCII_PCOUNTERS_FIELD_TO_STRING(msgs_tx, "Backend send attempts succeeded");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_backend_failed_lock,
-                                 "Backend send attempts failed due to lock");
-  LCII_PCOUNTERS_FIELD_TO_STRING(
-      send_backend_failed_nomem,
-      "Backend send attempts failed due to no memory");
-  LCII_PCOUNTERS_FIELD_TO_STRING(lci_cq_pop_succeeded, "LCI cq pop succeeded");
-  LCII_PCOUNTERS_FIELD_TO_STRING(lci_cq_pop_failed_empty,
-                                 "LCI cq pop failed due to empty queue");
-  LCII_PCOUNTERS_FIELD_TO_STRING(lci_cq_pop_failed_contention,
-                                 "LCI cq pop failed due to thread contention");
-  LCII_PCOUNTERS_FIELD_TO_STRING(lci_cq_pop_len_accumulated,
-                                 "LCI cq pop pending counts accumulated");
-  LCII_PCOUNTERS_FIELD_TO_STRING(progress_call, "LCI progress function called");
-  LCII_PCOUNTERS_FIELD_TO_STRING(progress_useful_call,
-                                 "LCI progress function useful calls");
-  LCII_PCOUNTERS_FIELD_TO_STRING(
-      progress_useful_call_consecutive_max,
-      "LCI progress function Consecutive useful calls max");
-  LCII_PCOUNTERS_FIELD_TO_STRING(
-      progress_useful_call_consecutive_sum,
-      "LCI progress function Consecutive useful calls sum");
-  LCII_PCOUNTERS_FIELD_TO_STRING(
-      recv_backend_no_packet,
-      "Backend post recv attempts failed due to no packet");
-  LCII_PCOUNTERS_FIELD_TO_STRING(backlog_queue_total_count,
-                                 "Backlog queue total item count");
-  LCII_PCOUNTERS_FIELD_TO_STRING(backlog_queue_max_len,
-                                 "Backlog queue maximum length");
-  LCII_PCOUNTERS_FIELD_TO_STRING(backlog_queue_send_attempts,
-                                 "Backlog queue send attempts");
-  LCII_PCOUNTERS_FIELD_TO_STRING(hashtable_insert_num,
-                                 "Matching table insert operation number");
-  LCII_PCOUNTERS_FIELD_TO_STRING(hashtable_walk_steps_total,
-                                 "Matching table total walking step count");
-  LCII_PCOUNTERS_FIELD_TO_STRING(hashtable_walk_steps_max,
-                                 "Matching table maximum walking step count");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_eager_latency_nsec_ave,
-                                 "Send eager time average");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_eager_latency_nsec_count,
-                                 "Send eager time count");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_iovec_handshake_nsec_ave,
-                                 "Send iovec handshake time average");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_iovec_handshake_nsec_count,
-                                 "Send iovec handshake time count");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_iovec_latency_nsec_ave,
-                                 "Send iovec time average");
-  LCII_PCOUNTERS_FIELD_TO_STRING(send_iovec_latency_nsec_count,
-                                 "Send iovec time count");
-  LCII_PCOUNTERS_FIELD_TO_STRING(recv_iovec_handle_rts_nsec_ave,
-                                 "Recv iovec handle rts time average");
-  LCII_PCOUNTERS_FIELD_TO_STRING(recv_iovec_handle_rts_nsec_count,
-                                 "Recv iovec handle rts time count");
-  LCII_PCOUNTERS_FIELD_TO_STRING(recv_iovec_latency_nsec_ave,
-                                 "Recv iovec time average");
-  LCII_PCOUNTERS_FIELD_TO_STRING(recv_iovec_latency_nsec_count,
-                                 "Recv iovec time count");
-  return buf;
-}
diff --git a/src/profile/performance_counter.h b/src/profile/performance_counter.h
deleted file mode 100644
index 2548aa09..00000000
--- a/src/profile/performance_counter.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef LCI_PERFORMANCE_COUNTER_H
-#define LCI_PERFORMANCE_COUNTER_H
-
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-#define LCII_PCOUNTERS_WRAPPER(stat) stat
-#else
-#define LCII_PCOUNTERS_WRAPPER(stat)
-#endif
-
-typedef struct {
-  int64_t msgs_tx;
-  int64_t bytes_tx;
-  int64_t msgs_rx;
-  int64_t bytes_rx;
-  int64_t msgs_2sided_tx;
-  int64_t msgs_1sided_tx;
-  int64_t msgs_2sided_rx;
-  int64_t msgs_1sided_rx;
-  // 8x8 bytes
-  int64_t packet_stealing;
-  int64_t send_lci_succeeded;
-  int64_t send_lci_failed_packet;
-  int64_t send_lci_failed_bq;
-  int64_t send_lci_failed_backend;
-  int64_t send_backend_failed_lock;
-  int64_t send_backend_failed_nomem;
-  int64_t lci_cq_pop_succeeded;
-  // 8x8 bytes
-  int64_t lci_cq_pop_failed_empty;
-  int64_t lci_cq_pop_failed_contention;
-  int64_t lci_cq_pop_len_accumulated;
-  int64_t progress_call;
-  int64_t progress_useful_call;
-  int64_t progress_useful_call_consecutive_max;
-  int64_t progress_useful_call_consecutive_sum;
-  int64_t recv_backend_no_packet;
-  // 8x8 bytes
-  int64_t backlog_queue_total_count;
-  int64_t backlog_queue_send_attempts;
-  int64_t backlog_queue_max_len;
-  int64_t hashtable_insert_num;
-  int64_t hashtable_walk_steps_total;
-  int64_t hashtable_walk_steps_max;
-  int64_t send_eager_latency_nsec_ave;    // post send -> send comp
-  int64_t send_eager_latency_nsec_count;  // post send -> send comp
-  // 8x8 bytes
-  int64_t send_iovec_handshake_nsec_ave;     // send rts -> recv rtr
-  int64_t send_iovec_handshake_nsec_count;   // send rts -> recv rtr
-  int64_t send_iovec_latency_nsec_ave;       // send rts -> send fin
-  int64_t send_iovec_latency_nsec_count;     // send rts -> send fin
-  int64_t recv_iovec_handle_rts_nsec_ave;    // recv rts -> send rtr
-  int64_t recv_iovec_handle_rts_nsec_count;  // recv rts -> send rtr
-  int64_t recv_iovec_latency_nsec_ave;       // recv rts -> recv fin
-  int64_t recv_iovec_latency_nsec_count;     // recv rts -> recv fin
-  // 8x8 bytes
-  LCIU_CACHE_PADDING(8 * 40);
-} LCII_pcounters_per_thread_t;
-
-#define LCI_PCOUNTER_MAX_NTHREADS 256
-extern LCII_pcounters_per_thread_t LCII_pcounters[LCI_PCOUNTER_MAX_NTHREADS];
-
-void LCII_pcounters_init();
-LCII_pcounters_per_thread_t LCII_pcounters_accumulate();
-LCII_pcounters_per_thread_t LCII_pcounters_diff(LCII_pcounters_per_thread_t c1,
-                                                LCII_pcounters_per_thread_t c2);
-char* LCII_pcounters_to_string(LCII_pcounters_per_thread_t pcounter);
-
-#endif  // LCI_PERFORMANCE_COUNTER_H
diff --git a/src/runtime/completion/cq.c b/src/runtime/completion/cq.c
deleted file mode 100644
index 6cb4d18f..00000000
--- a/src/runtime/completion/cq.c
+++ /dev/null
@@ -1,121 +0,0 @@
-#include "runtime/lcii.h"
-
-LCI_error_t LCI_queue_create(LCI_device_t device, LCI_comp_t* cq)
-{
-  LCII_cq_t* cq_ptr = LCIU_malloc(sizeof(LCII_cq_t));
-#ifdef LCI_USE_MUTEX_CQ
-  LCM_dq_init(&cq_ptr->dequeue, LCI_DEFAULT_QUEUE_LENGTH);
-  LCIU_spinlock_init(&cq_ptr->spinlock);
-#else
-  LCM_aqueue_init(cq_ptr, LCI_DEFAULT_QUEUE_LENGTH);
-#endif
-  *cq = cq_ptr;
-  return LCI_OK;
-}
-
-LCI_error_t LCI_queue_free(LCI_comp_t* cq)
-{
-  LCII_cq_t* cq_ptr = *cq;
-#ifdef LCI_USE_MUTEX_CQ
-  LCIU_spinlock_fina(&cq_ptr->spinlock);
-  LCM_dq_finalize(&cq_ptr->dequeue);
-#else
-  LCM_aqueue_fina(cq_ptr);
-#endif
-  LCIU_free(cq_ptr);
-  *cq = NULL;
-  return LCI_OK;
-}
-
-LCI_error_t LCI_queue_pop(LCI_comp_t cq, LCI_request_t* request)
-{
-  LCII_cq_t* cq_ptr = cq;
-#ifdef LCI_USE_MUTEX_CQ
-  LCIU_acquire_spinlock(&cq_ptr->spinlock);
-  LCII_context_t* ctx = LCM_dq_pop_bot(&cq_ptr->dequeue);
-  LCIU_release_spinlock(&cq_ptr->spinlock);
-#else
-  LCII_context_t* ctx = LCM_aqueue_pop(cq_ptr);
-#endif
-  if (ctx == NULL) return LCI_ERR_RETRY;
-  *request = LCII_ctx2req(ctx);
-  return LCI_OK;
-}
-
-LCI_error_t LCI_queue_wait(LCI_comp_t cq, LCI_request_t* request)
-{
-  LCII_cq_t* cq_ptr = cq;
-  LCII_context_t* ctx = NULL;
-  while (ctx == NULL) {
-#ifdef LCI_USE_MUTEX_CQ
-    LCIU_acquire_spinlock(&cq_ptr->spinlock);
-    ctx = LCM_dq_pop_bot(&cq_ptr->dequeue);
-    LCIU_release_spinlock(&cq_ptr->spinlock);
-#else
-    ctx = LCM_aqueue_pop(cq_ptr);
-#endif
-  }
-  *request = LCII_ctx2req(ctx);
-  return LCI_OK;
-}
-
-LCI_error_t LCI_queue_pop_multiple(LCI_comp_t cq, size_t request_count,
-                                   LCI_request_t* requests,
-                                   size_t* return_count)
-{
-  LCII_cq_t* cq_ptr = cq;
-  int count = 0;
-  LCII_context_t* ctx;
-  while (count < request_count) {
-#ifdef LCI_USE_MUTEX_CQ
-    LCIU_acquire_spinlock(&cq_ptr->spinlock);
-    ctx = LCM_dq_pop_bot(&cq_ptr->dequeue);
-    LCIU_release_spinlock(&cq_ptr->spinlock);
-#else
-    ctx = LCM_aqueue_pop(cq_ptr);
-#endif
-    if (ctx != NULL) {
-      requests[count] = LCII_ctx2req(ctx);
-      ++count;
-    } else {
-      break;
-    }
-  }
-  *return_count = count;
-  return LCI_OK;
-}
-
-LCI_error_t LCI_queue_wait_multiple(LCI_comp_t cq, size_t request_count,
-                                    LCI_request_t* requests)
-{
-  LCII_cq_t* cq_ptr = cq;
-  int count = 0;
-  LCII_context_t* ctx;
-  while (count < request_count) {
-#ifdef LCI_USE_MUTEX_CQ
-    LCIU_acquire_spinlock(&cq_ptr->spinlock);
-    ctx = LCM_dq_pop_bot(&cq_ptr->dequeue);
-    LCIU_release_spinlock(&cq_ptr->spinlock);
-#else
-    ctx = LCM_aqueue_pop(cq_ptr);
-#endif
-    if (ctx != NULL) {
-      requests[count] = LCII_ctx2req(ctx);
-      ++count;
-    } else {
-      continue;
-    }
-  }
-  return LCI_OK;
-}
-
-LCI_error_t LCI_queue_len(LCI_comp_t cq, size_t* len)
-{
-#ifdef LCI_USE_MUTEX_CQ
-  LCII_cq_t* cq_ptr = cq;
-  *len = LCM_dq_size(cq_ptr->dequeue);
-#else
-  *len = 0;
-#endif
-  return LCI_ERR_FEATURE_NA;
-}
\ No newline at end of file
diff --git a/src/runtime/completion/cq.h b/src/runtime/completion/cq.h
deleted file mode 100644
index 9c094477..00000000
--- a/src/runtime/completion/cq.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef LC_CQ_H
-#define LC_CQ_H
-
-#ifdef LCI_USE_MUTEX_CQ
-struct __attribute__((aligned(LCI_CACHE_LINE))) LCII_cq_t {
-  LCM_dequeue_t dequeue;
-  LCIU_spinlock_t spinlock;
-};
-typedef struct LCII_cq_t LCII_cq_t;
-#else
-typedef LCM_aqueue_t LCII_cq_t;
-#endif
-
-static inline void LCII_queue_push(LCI_comp_t cq, LCII_context_t* ctx)
-{
-  LCII_cq_t* cq_ptr = cq;
-#ifdef LCI_USE_MUTEX_CQ
-  LCIU_acquire_spinlock(&cq_ptr->spinlock);
-  int ret = LCM_dq_push_top(&cq_ptr->dequeue, ctx);
-  LCIU_release_spinlock(&cq_ptr->spinlock);
-  LCM_Assert(ret == LCM_SUCCESS, "The completion queue is full!\n");
-#else
-  LCM_aqueue_push(cq_ptr, ctx);
-#endif
-}
-
-#endif  // LC_CQ_H
diff --git a/src/runtime/monitor_thread.c b/src/runtime/monitor_thread.c
deleted file mode 100644
index 84d440a0..00000000
--- a/src/runtime/monitor_thread.c
+++ /dev/null
@@ -1,78 +0,0 @@
-#include "runtime/lcii.h"
-
-// needed by the monitor threads
-static pthread_t LCII_monitor_thread;
-static atomic_bool LCII_monitor_thread_run;
-static bool LCI_ENABLE_MONITOR_THREAD = false;
-static int LCI_MONITOR_THREAD_INTERVAL;
-
-struct timespec LCIU_timespec_diff(struct timespec new, struct timespec old)
-{
-  struct timespec diff;
-  if (new.tv_nsec >= old.tv_nsec) {
-    diff.tv_sec = new.tv_sec - old.tv_sec;
-    diff.tv_nsec = new.tv_nsec - old.tv_nsec;
-  } else {
-    diff.tv_sec = new.tv_sec - old.tv_sec - 1;
-    diff.tv_nsec = new.tv_nsec - old.tv_nsec + 1000000000;
-  }
-  diff.tv_sec += diff.tv_nsec / 1000000000;
-  diff.tv_nsec %= 1000000000;
-  return diff;
-}
-
-void* LCII_monitor_thread_fn(void* vargp)
-{
-  struct timespec start_time, time_now;
-  LCII_pcounters_per_thread_t pcounter_now, pcounter_old, pcounter_diff;
-  memset(&pcounter_old, 0, sizeof(pcounter_old));
-  clock_gettime(CLOCK_MONOTONIC, &start_time);
-  LCM_Log(LCM_LOG_INFO, "monitor", "Start the monitor thread at %lu.%lu s\n",
-          start_time.tv_sec, start_time.tv_nsec);
-  LCM_Log_flush();
-  while (atomic_load_explicit(&LCII_monitor_thread_run,
-                              LCIU_memory_order_acquire)) {
-    sleep(LCI_MONITOR_THREAD_INTERVAL);
-    pcounter_now = LCII_pcounters_accumulate();
-    pcounter_diff = LCII_pcounters_diff(pcounter_now, pcounter_old);
-    pcounter_old = pcounter_now;
-    clock_gettime(CLOCK_MONOTONIC, &time_now);
-    struct timespec time_diff = LCIU_timespec_diff(time_now, start_time);
-    char* p = NULL;
-#ifdef LCI_USE_PERFORMANCE_COUNTER
-    p = LCII_pcounters_to_string(pcounter_diff);
-#else
-    p = "No performance counter available.\n";
-#endif
-    LCM_Log(LCM_LOG_INFO, "monitor", "Time %lu.%lu s\n%s", time_diff.tv_sec,
-            time_diff.tv_nsec, p);
-    LCM_Log_flush();
-  }
-  clock_gettime(CLOCK_MONOTONIC, &time_now);
-  struct timespec time_diff = LCIU_timespec_diff(time_now, start_time);
-  LCM_Log(LCM_LOG_INFO, "monitor", "Finish the monitor thread at %lu.%lu s\n",
-          time_diff.tv_sec, time_diff.tv_nsec);
-  LCM_Log_flush();
-  return NULL;
-}
-
-void LCII_monitor_thread_init()
-{
-  LCI_ENABLE_MONITOR_THREAD =
-      LCIU_getenv_or("LCI_ENABLE_MONITOR_THREAD", false);
-  if (LCI_ENABLE_MONITOR_THREAD) {
-    LCI_MONITOR_THREAD_INTERVAL =
-        LCIU_getenv_or("LCI_MONITOR_THREAD_INTERVAL", 60);
-    atomic_init(&LCII_monitor_thread_run, true);
-    pthread_create(&LCII_monitor_thread, NULL, LCII_monitor_thread_fn, NULL);
-  }
-}
-
-void LCII_monitor_thread_fina()
-{
-  if (LCI_ENABLE_MONITOR_THREAD) {
-    atomic_store_explicit(&LCII_monitor_thread_run, false,
-                          LCIU_memory_order_release);
-    pthread_join(LCII_monitor_thread, NULL);
-  }
-}
\ No newline at end of file
diff --git a/tests/lcit/lcit.h b/tests/lcit/lcit.h
index fa1795dd..29e3adca 100644
--- a/tests/lcit/lcit.h
+++ b/tests/lcit/lcit.h
@@ -2,7 +2,7 @@
 #define LCI_LCIT_H
 
 #ifndef NDEBUG
-#define LCM_DEBUG
+#define LCI_DEBUG
 #endif
 
 #include <iostream>
@@ -16,15 +16,8 @@
 #include <cstring>
 #include <sys/time.h>
 #include "lci.h"
-#include "../../src/log/lcm_log.h"
 #include "lcit_threadbarrier.h"
 
-#ifndef LCIT_BENCH
-#define LCIT_Assert LCM_Assert
-#else
-#define LCIT_Assert
-#endif
-
 namespace lcit
 {
 const size_t CACHESIZE_L1 = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
@@ -68,7 +61,8 @@ void checkConfig(Config& config)
 {
   if (config.op >= LCIT_OP_1SIDED_S &&
       config.recv_comp_type != LCI_COMPLETION_QUEUE) {
-    LCM_Warn(
+    LCT_Warn(
+        LCT_log_ctx_default,
         "Currently one-sided communication only support "
         "--recv-comp-type=queue. Change the receive completion to queue \n");
     config.recv_comp_type = LCI_COMPLETION_QUEUE;
@@ -76,9 +70,9 @@ void checkConfig(Config& config)
   if (config.op < LCIT_OP_1SIDED_S &&
       (config.send_comp_type == LCI_COMPLETION_QUEUE ||
        config.recv_comp_type == LCI_COMPLETION_QUEUE)) {
-    LCM_Warn(
-        "Completion queue does not work well with 2-sided ping-pong tests."
-        "You may encounter deadlock!\n");
+    LCT_Warn(LCT_log_ctx_default,
+             "Completion queue does not work well with 2-sided ping-pong tests."
+             " You may encounter deadlock!\n");
   }
 }
 
@@ -128,6 +122,10 @@ enum LongFlags {
   NSTEPS,
 };
 
+void init() { LCI_initialize(); }
+
+void fina() { LCI_finalize(); }
+
 Config parseArgs(int argc, char** argv)
 {
   Config config;
@@ -457,7 +455,7 @@ void threadBarrier(Context& ctx)
 
 LCI_comp_t postSend(Context& ctx, int rank, size_t size, LCI_tag_t tag)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "lcit",
+  LCT_DBG_Log(LCT_log_ctx_default, LCT_LOG_DEBUG, "lcit",
               "%d/%d: postSend rank %d size %lu tag %d\n", LCI_RANK,
               TRD_RANK_ME, rank, size, tag);
   LCI_comp_t comp;
@@ -536,8 +534,8 @@ LCI_comp_t postSend(Context& ctx, int rank, size_t size, LCI_tag_t tag)
 
 void waitSend(Context& ctx, LCI_comp_t comp)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "lcit", "%d/%d: waitSend\n", LCI_RANK,
-              TRD_RANK_ME);
+  LCT_DBG_Log(LCT_log_ctx_default, LCT_LOG_DEBUG, "lcit", "%d/%d: waitSend\n",
+              LCI_RANK, TRD_RANK_ME);
   switch (ctx.config.op) {
     case LCIT_OP_2SIDED_S:
     case LCIT_OP_2SIDED_M:
@@ -547,21 +545,26 @@ void waitSend(Context& ctx, LCI_comp_t comp)
     case LCIT_OP_2SIDED_L:
     case LCIT_OP_1SIDED_L:
       LCI_request_t request = waitComp(ctx, comp, ctx.config.send_comp_type);
-      LCIT_Assert(request.flag == LCI_OK, "flag is wrong\n");
-      LCIT_Assert(request.type == LCI_LONG, "type is wrong\n");
-      LCIT_Assert(request.data.lbuffer.address == ctx.send_data.lbuffer.address,
-                  "address is wrong\n");
-      LCIT_Assert(request.data.lbuffer.segment == ctx.send_data.lbuffer.segment,
-                  "segment is wrong\n");
-      LCIT_Assert((uint64_t)request.user_context == USER_CONTEXT,
-                  "user_context is wrong\n");
+      LCT_Assert(LCT_log_ctx_default, request.flag == LCI_OK,
+                 "flag is wrong\n");
+      LCT_Assert(LCT_log_ctx_default, request.type == LCI_LONG,
+                 "type is wrong\n");
+      LCT_Assert(LCT_log_ctx_default,
+                 request.data.lbuffer.address == ctx.send_data.lbuffer.address,
+                 "address is wrong\n");
+      LCT_Assert(LCT_log_ctx_default,
+                 request.data.lbuffer.segment == ctx.send_data.lbuffer.segment,
+                 "segment is wrong\n");
+      LCT_Assert(LCT_log_ctx_default,
+                 (uint64_t)request.user_context == USER_CONTEXT,
+                 "user_context is wrong\n");
       break;
   }
 }
 
 LCI_comp_t postRecv(Context& ctx, int rank, size_t size, LCI_tag_t tag)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "lcit",
+  LCT_DBG_Log(LCT_log_ctx_default, LCT_LOG_DEBUG, "lcit",
               "%d/%d: postRecv rank %d size %lu tag %d\n", LCI_RANK,
               TRD_RANK_ME, rank, size, tag);
   LCI_comp_t comp;
@@ -600,37 +603,43 @@ LCI_comp_t postRecv(Context& ctx, int rank, size_t size, LCI_tag_t tag)
 
 void waitRecv(Context& ctx, LCI_comp_t comp)
 {
-  LCM_DBG_Log(LCM_LOG_DEBUG, "lcit", "%d/%d: waitRecv\n", LCI_RANK,
-              TRD_RANK_ME);
+  LCT_DBG_Log(LCT_log_ctx_default, LCT_LOG_DEBUG, "lcit", "%d/%d: waitRecv\n",
+              LCI_RANK, TRD_RANK_ME);
   LCI_request_t request = waitComp(ctx, comp, ctx.config.recv_comp_type);
-  LCIT_Assert(request.flag == LCI_OK, "flag is wrong\n");
+  LCT_Assert(LCT_log_ctx_default, request.flag == LCI_OK, "flag is wrong\n");
   if (ctx.config.op == LCIT_OP_2SIDED_L || ctx.config.op == LCIT_OP_2SIDED_M ||
       ctx.config.op == LCIT_OP_2SIDED_S)
-    LCIT_Assert((uint64_t)request.user_context == USER_CONTEXT,
-                "user_context is wrong\n");
+    LCT_Assert(LCT_log_ctx_default,
+               (uint64_t)request.user_context == USER_CONTEXT,
+               "user_context is wrong\n");
   switch (ctx.config.op) {
     case LCIT_OP_2SIDED_S:
     case LCIT_OP_1SIDED_S:
-      LCIT_Assert(request.type == LCI_IMMEDIATE, "type is wrong\n");
+      LCT_Assert(LCT_log_ctx_default, request.type == LCI_IMMEDIATE,
+                 "type is wrong\n");
       break;
     case LCIT_OP_2SIDED_M:
-      LCIT_Assert(request.type == LCI_MEDIUM, "type is wrong\n");
+      LCT_Assert(LCT_log_ctx_default, request.type == LCI_MEDIUM,
+                 "type is wrong\n");
       if (ctx.config.recv_dyn) {
         LCI_mbuffer_free(request.data.mbuffer);
       }
       break;
     case LCIT_OP_1SIDED_M:
-      LCIT_Assert(request.type == LCI_MEDIUM, "type is wrong\n");
+      LCT_Assert(LCT_log_ctx_default, request.type == LCI_MEDIUM,
+                 "type is wrong\n");
       LCI_mbuffer_free(request.data.mbuffer);
       break;
     case LCIT_OP_2SIDED_L:
-      LCIT_Assert(request.type == LCI_LONG, "type is wrong\n");
+      LCT_Assert(LCT_log_ctx_default, request.type == LCI_LONG,
+                 "type is wrong\n");
       if (ctx.config.recv_dyn) {
         LCI_lbuffer_free(request.data.lbuffer);
       }
       break;
     case LCIT_OP_1SIDED_L:
-      LCIT_Assert(request.type == LCI_LONG, "type is wrong\n");
+      LCT_Assert(LCT_log_ctx_default, request.type == LCI_LONG,
+                 "type is wrong\n");
       LCI_lbuffer_free(request.data.lbuffer);
       break;
   }
diff --git a/tests/lcit/lcit_many2one.cpp b/tests/lcit/lcit_many2one.cpp
index adeb20b6..71e00189 100644
--- a/tests/lcit/lcit_many2one.cpp
+++ b/tests/lcit/lcit_many2one.cpp
@@ -74,13 +74,13 @@ void test(Context ctx)
 
 int main(int argc, char** args)
 {
-  LCI_initialize();
+  init();
   Config config = parseArgs(argc, args);
   Context ctx = initCtx(config);
 
   run(ctx, test, ctx);
 
   freeCtx(ctx);
-  LCI_finalize();
+  fina();
   return 0;
 }
diff --git a/tests/lcit/lcit_pt2pt.cpp b/tests/lcit/lcit_pt2pt.cpp
index 9ea5b93b..fa133a74 100644
--- a/tests/lcit/lcit_pt2pt.cpp
+++ b/tests/lcit/lcit_pt2pt.cpp
@@ -67,13 +67,13 @@ void test(Context ctx)
 
 int main(int argc, char** args)
 {
-  LCI_initialize();
+  init();
   Config config = parseArgs(argc, args);
   Context ctx = initCtx(config);
 
   run(ctx, test, ctx);
 
   freeCtx(ctx);
-  LCI_finalize();
+  fina();
   return 0;
 }
diff --git a/tests/lcit/lcit_threadbarrier.h b/tests/lcit/lcit_threadbarrier.h
index b5a64e54..ec011ce6 100644
--- a/tests/lcit/lcit_threadbarrier.h
+++ b/tests/lcit/lcit_threadbarrier.h
@@ -1,6 +1,5 @@
 #ifndef LCIT_THREADBARRIER_HPP
 #define LCIT_THREADBARRIER_HPP
-#include "../../src/log/lcm_log.h"
 
 class ThreadBarrier
 {
@@ -8,16 +7,19 @@ class ThreadBarrier
   ThreadBarrier(size_t thread_num)
       : waiting(0), step(0), thread_num_(thread_num)
   {
-    LCM_Assert(thread_num_ > 0, "Error: thread_num cannot be 0.\n");
+    LCT_Assert(LCT_log_ctx_default, thread_num_ > 0,
+               "Error: thread_num cannot be 0.\n");
   }
 
   void set_thread_num(int thread_num)
   {
-    LCM_Assert(thread_num_ > 0, "Error: thread_num cannot be 0.\n");
+    LCT_Assert(LCT_log_ctx_default, thread_num_ > 0,
+               "Error: thread_num cannot be 0.\n");
   }
   void wait()
   {
-    LCM_DBG_Assert(thread_num_ > 0, "Error: call wait() before init().\n");
+    LCT_Assert(LCT_log_ctx_default, thread_num_ > 0,
+               "Error: call wait() before init().\n");
     size_t mstep = step.load();
 
     if (++waiting == thread_num_) {
@@ -31,7 +33,8 @@ class ThreadBarrier
   template <typename Fn, typename... Args>
   void wait(Fn&& fn, Args&&... args)
   {
-    LCM_DBG_Assert(thread_num_ > 0, "Error: call wait() before init().\n");
+    LCT_Assert(LCT_log_ctx_default, thread_num_ > 0,
+               "Error: call wait() before init().\n");
     size_t mstep = step.load();
 
     if (++waiting == thread_num_) {
diff --git a/tools/lct_parse_pcounter.py b/tools/lct_parse_pcounter.py
new file mode 100644
index 00000000..470cd998
--- /dev/null
+++ b/tools/lct_parse_pcounter.py
@@ -0,0 +1,138 @@
+import argparse
+import ast
+import glob
+import math
+import re
+import numpy as np
+import pandas as pd
+from bokeh.layouts import row, column
+from bokeh.models import HoverTool, LegendItem, Legend, RangeSlider, Button
+from bokeh.plotting import figure, show
+from bokeh.palettes import Dark2_5 as palette, Bokeh
+from bokeh.models import ColumnDataSource
+from bokeh.models import CheckboxGroup, CustomJS
+from bokeh.palettes import Viridis3
+import itertools
+
+
+def get_typed_value(value):
+    """Convert a raw text field to a Python literal; text that is not a literal is returned unchanged."""
+    if value == '-nan':  # ast.literal_eval cannot parse this spelling of NaN
+        return np.nan
+    try:
+        return ast.literal_eval(value)
+    except Exception:  # not a literal: keep the raw string (a bare except would also trap Ctrl-C)
+        return value
+
+
+def parse_tag(df, x_key, y_key, tag_key):
+    """Return one {'label', 'x', 'y', 'error'} dict per unique tag: median and std of y_key at each x_key."""
+    lines = []
+    for tag in df[tag_key].unique():
+        df1 = df[df[tag_key] == tag]
+        current_domain = []
+        current_value = []
+        current_error = []
+        for x in df1[x_key].unique():
+            samples = df1[df1[x_key] == x][y_key]  # select once, reuse for both statistics
+            y = samples.median()
+            error = samples.std()
+            # `y is np.nan` is an identity test that a computed NaN never satisfies; use pd.isna.
+            if pd.isna(y) or y == 0:
+                continue  # skip points whose median is missing or zero
+            current_domain.append(float(x))
+            current_value.append(float(y))
+            current_error.append(float(error))
+        series = {'label': str(tag), 'x': current_domain, 'y': current_value, 'error': current_error}
+        lines.append(series)
+    return lines
+
+
+if __name__ == "__main__":
+    # Command line: a glob pattern selecting the log file(s) and the rank whose counters are plotted.
+    parser = argparse.ArgumentParser(
+        prog='lct_parse_pcounter',
+        description='Parse the log generated by pcounter',
+        epilog='Designed for the Lightweight Communication Tool (LCT) Library')
+    parser.add_argument('filename')
+    parser.add_argument('-r', '--rank', type=int, default=0)
+    args = parser.parse_args()
+    filenames = glob.glob(args.filename)
+
+    # Column names, assigned positionally to the comma-separated fields of each record.
+    labels = ["rank", "ctx_name", "time", "counter_name", "total", "count", "ave", "min", "max"]
+    # Raw string: "\S" inside a plain string literal is an invalid escape sequence.
+    trend_pattern = re.compile(r"pcounter,trend,(\S+)")
+    rows = []
+    for filename in filenames:
+        with open(filename) as f:
+            for line in f:  # stream the file instead of loading every line up front
+                m = trend_pattern.match(line.strip())
+                if m:
+                    fields = m.group(1).split(",")  # distinct name: nothing shadows the list being zipped
+                    rows.append({label: get_typed_value(field) for label, field in zip(labels, fields)})
+                    if len(rows) % 10000 == 0:
+                        print("Processing..." + str(len(rows)))
+    if not rows:  # rows[0] below would otherwise fail with an opaque IndexError
+        raise SystemExit("No 'pcounter,trend,' records found in: " + args.filename)
+    df = pd.DataFrame(rows, columns=list(rows[0].keys()))
+
+    print("Apply filter...")
+    # Keep only the requested rank; a vectorised comparison replaces the row-by-row apply().
+    df1 = df[df["rank"] == args.rank].copy()
+    df1["ctx_counter_name"] = df1["ctx_name"] + ":" + df1["counter_name"]
+    line_entries_all = parse_tag(df1, "time", "count", "ctx_counter_name")
+
+
+    def create_plot(line_entries, title):
+        """Plot every series in line_entries as a line; return the figure stacked above a Hide All/Show All button."""
+        p = figure(title=title, x_axis_label="time", y_axis_label="count", width=1200, height=600)
+        p.x_range.only_visible = True
+        p.y_range.only_visible = True
+
+        # One line per series; colours repeat once the palette is exhausted.
+        for entry, color in zip(line_entries, itertools.cycle(palette)):
+            p.line(x=entry["x"], y=entry["y"], legend_label=entry["label"], color=color, name=entry["label"])
+
+        legend = p.legend[0]
+        legend.click_policy = "hide"
+        legend.ncols = 2
+        legend.nrows = math.ceil(len(legend.items) / legend.ncols)
+        p.add_layout(legend, "right")
+
+        # Hover tool: reports the snapped time and the name/value of the hovered line.
+        hover = HoverTool(mode="vline", tooltips=[
+            ("time", "$snap_x{0,0}"),
+            ('counter', '$name: $snap_y'),
+        ])
+        hover.point_policy = 'snap_to_data'
+        hover.line_policy = 'nearest'
+        p.add_tools(hover)
+        # Button whose client-side callback toggles the visibility of every renderer in the figure.
+        btn = Button(label='Hide All')
+        cb = CustomJS(args=dict(fig=p, btn=btn),
+                      code='''
+                      if (btn.label=='Hide All'){
+                          for (var i=0; i<fig.renderers.length; i++){
+                                  fig.renderers[i].visible=false}
+                          btn.label = 'Show All'
+                          }
+                      else {for (var i=0; i<fig.renderers.length; i++){
+                              fig.renderers[i].visible=true}
+                      btn.label = 'Hide All'}
+                      ''')
+
+        btn.js_on_click(cb)
+        p = column([p, btn])
+
+        return p
+
+    print("Creating plot...")
+    fig_cumu = create_plot(line_entries_all, title="Cumulative")
+
+    # Replace each series by its successive differences (the first value is taken relative to 0).
+    for series in line_entries_all:
+        series["y"] = list(np.diff(np.array([0, *series["y"]])))
+    fig_hist = create_plot(line_entries_all, title="Histogram")
+    # Display both figures in one row, the differenced one first.
+    show(row(fig_hist, fig_cumu))