diff --git a/.github/workflows/opensuse.yml b/.github/workflows/opensuse.yml index a4bf30d4d..ebbd9eef2 100644 --- a/.github/workflows/opensuse.yml +++ b/.github/workflows/opensuse.yml @@ -98,6 +98,11 @@ jobs: -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_OMPT=OFF -DROCPROFSYS_USE_PYTHON=ON + -DROCPROFSYS_BUILD_DYNINST=ON + -DROCPROFSYS_BUILD_BOOST=ON + -DROCPROFSYS_BUILD_TBB=ON + -DROCPROFSYS_BUILD_ELFUTILS=ON + -DROCPROFSYS_BUILD_LIBIBERTY=ON -DROCPROFSYS_INSTALL_PERFETTO_TOOLS=OFF -DROCPROFSYS_USE_MPI_HEADERS=ON -DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs @@ -107,7 +112,7 @@ jobs: -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;openmp-target;videodecode;jpegdecode" -DROCPROFSYS_BUILD_NUMBER=${{ github.run_attempt }} -- - -LE "transpose|rccl|videodecode|jpegdecode|network" + -LE "transpose|rccl|videodecode|jpegdecode|network|mpi" - name: Install timeout-minutes: 10 diff --git a/.github/workflows/redhat.yml b/.github/workflows/redhat.yml index fc6e0aec1..77c1e49a6 100644 --- a/.github/workflows/redhat.yml +++ b/.github/workflows/redhat.yml @@ -114,6 +114,11 @@ jobs: -DROCPROFSYS_USE_ROCM=${USE_HIP} -DROCPROFSYS_USE_OMPT=OFF -DROCPROFSYS_USE_PYTHON=ON + -DROCPROFSYS_BUILD_DYNINST=ON + -DROCPROFSYS_BUILD_BOOST=ON + -DROCPROFSYS_BUILD_TBB=ON + -DROCPROFSYS_BUILD_ELFUTILS=ON + -DROCPROFSYS_BUILD_LIBIBERTY=ON -DROCPROFSYS_USE_MPI_HEADERS=ON -DROCPROFSYS_CI_MPI_RUN_AS_ROOT=ON -DROCPROFSYS_MAX_THREADS=64 diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index e0e60452c..28d57f954 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -150,6 +150,11 @@ jobs: -DROCPROFSYS_USE_ROCM=OFF -DROCPROFSYS_USE_OMPT=OFF -DROCPROFSYS_USE_PAPI=OFF + -DROCPROFSYS_BUILD_DYNINST=ON + -DROCPROFSYS_BUILD_BOOST=ON + -DROCPROFSYS_BUILD_TBB=ON + -DROCPROFSYS_BUILD_ELFUTILS=ON + -DROCPROFSYS_BUILD_LIBIBERTY=ON -DROCPROFSYS_USE_PYTHON=${{ matrix.python }} -DROCPROFSYS_USE_MPI_HEADERS=${{ matrix.mpi-headers }} -DROCPROFSYS_STRIP_LIBRARIES=${{ matrix.strip }} @@ -346,6 +351,11 @@ jobs: -DROCPROFSYS_USE_OMPT=OFF -DROCPROFSYS_USE_PYTHON=ON -DROCPROFSYS_USE_MPI_HEADERS=${{ matrix.mpi-headers }} + -DROCPROFSYS_BUILD_DYNINST=ON + -DROCPROFSYS_BUILD_BOOST=ON + -DROCPROFSYS_BUILD_TBB=ON + -DROCPROFSYS_BUILD_ELFUTILS=ON + -DROCPROFSYS_BUILD_LIBIBERTY=ON -DROCPROFSYS_USE_SANITIZER=OFF -DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs -DROCPROFSYS_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10;py3.11" @@ -455,7 +465,11 @@ jobs: -DCMAKE_INSTALL_PREFIX=/opt/rocprofiler-systems -DROCPROFSYS_BUILD_CI=OFF -DROCPROFSYS_BUILD_TESTING=ON - -DROCPROFSYS_BUILD_DYNINST=OFF + -DROCPROFSYS_BUILD_DYNINST=ON + -DROCPROFSYS_BUILD_BOOST=ON + -DROCPROFSYS_BUILD_TBB=ON + -DROCPROFSYS_BUILD_ELFUTILS=ON + -DROCPROFSYS_BUILD_LIBIBERTY=ON -DROCPROFSYS_BUILD_DEBUG=OFF -DROCPROFSYS_BUILD_HIDDEN_VISIBILITY=OFF -DROCPROFSYS_USE_MPI=ON diff --git a/.github/workflows/ubuntu-jammy.yml b/.github/workflows/ubuntu-jammy.yml index a9fbed9c8..aa391520e 100644 --- a/.github/workflows/ubuntu-jammy.yml +++ b/.github/workflows/ubuntu-jammy.yml @@ -57,7 +57,7 @@ jobs: hidden: ['ON', 'OFF'] build-type: ['Release'] mpi-headers: ['ON', 'OFF'] - build-dyninst: ['OFF'] + build-dyninst: ['ON'] rocm-version: ['0.0'] env: @@ -133,6 +133,10 @@ jobs: -DROCPROFSYS_USE_PYTHON=${{ matrix.python }} -DROCPROFSYS_USE_MPI_HEADERS=${{ matrix.mpi-headers }} -DROCPROFSYS_BUILD_DYNINST=${{ matrix.build-dyninst }} + -DROCPROFSYS_BUILD_BOOST=${{ matrix.build-dyninst }} + -DROCPROFSYS_BUILD_TBB=${{ matrix.build-dyninst }} + -DROCPROFSYS_BUILD_ELFUTILS=${{ matrix.build-dyninst }} + -DROCPROFSYS_BUILD_LIBIBERTY=${{ matrix.build-dyninst }} -DROCPROFSYS_BUILD_HIDDEN_VISIBILITY=${{ matrix.hidden }} -DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs -DROCPROFSYS_PYTHON_ENVS="py3.7;py3.8;py3.9;py3.10;py3.11" @@ -210,7 +214,7 @@ jobs: hidden: ['ON'] build-type: ['Release'] mpi-headers: ['OFF'] - build-dyninst: ['OFF'] + build-dyninst: ['ON'] rocm-version: ['6.3', '6.4'] env: @@ -310,6 +314,10 @@ jobs: -DROCPROFSYS_USE_PYTHON=${{ matrix.python }} -DROCPROFSYS_USE_MPI_HEADERS=${{ matrix.mpi-headers }} -DROCPROFSYS_BUILD_DYNINST=${{ matrix.build-dyninst }} + -DROCPROFSYS_BUILD_BOOST=${{ matrix.build-dyninst }} + -DROCPROFSYS_BUILD_TBB=${{ matrix.build-dyninst }} + -DROCPROFSYS_BUILD_ELFUTILS=${{ matrix.build-dyninst }} + -DROCPROFSYS_BUILD_LIBIBERTY=${{ matrix.build-dyninst }} -DROCPROFSYS_BUILD_HIDDEN_VISIBILITY=${{ matrix.hidden }} -DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs -DROCPROFSYS_PYTHON_ENVS="py3.7;py3.8;py3.9;py3.10;py3.11" diff --git a/.github/workflows/ubuntu-noble.yml b/.github/workflows/ubuntu-noble.yml index 9c33d93da..cf00d96e3 100644 --- a/.github/workflows/ubuntu-noble.yml +++ b/.github/workflows/ubuntu-noble.yml @@ -105,6 +105,11 @@ jobs: -DROCPROFSYS_DISABLE_EXAMPLES="transpose;rccl;openmp-target" \ -DROCPROFSYS_USE_ROCM=${USE_ROCM} \ -DRCOPROFSYS_USE_PYTHON=ON \ + -DROCPROFSYS_BUILD_DYNINST=ON \ + -DROCPROFSYS_BUILD_BOOST=ON \ + -DROCPROFSYS_BUILD_TBB=ON \ + -DROCPROFSYS_BUILD_ELFUTILS=ON \ + -DROCPROFSYS_BUILD_LIBIBERTY=ON \ -DROCPROFSYS_STRIP_LIBRARIES=${{ matrix.strip }} \ -DROCPROFSYS_PYTHON_PREFIX=/opt/conda/envs \ -DROCPROFSYS_PYTHON_ENVS="py3.8;py3.9;py3.10;py3.11;py3.12" diff --git a/.gitmodules b/.gitmodules index 2515e865b..4fbeaeebb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,6 +10,7 @@ [submodule "external/dyninst"] path = external/dyninst url = https://github.com/ROCm/dyninst.git + branch = dyninst_13 [submodule "external/PTL"] path = external/PTL url = https://github.com/jrmadsen/PTL.git diff --git a/CMakeLists.txt b/CMakeLists.txt index c8976c3d8..c68ecdeb3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,11 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND CMAKE_CURRENT_SOURCE_DIR STREQ message(AUTHOR_WARNING "In-source build") endif() +# find_package() uses upper-case _ROOT variables. +if(POLICY CMP0144) + cmake_policy(SET CMP0144 NEW) +endif() + if(NOT UNIX OR APPLE) message( AUTHOR_WARNING @@ -179,7 +184,6 @@ rocprofiler_systems_add_option(ROCPROFSYS_USE_MPI "Enable MPI support" OFF) rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCM "Enable ROCm support" ON) rocprofiler_systems_add_option(ROCPROFSYS_USE_PAPI "Enable HW counter support via PAPI" ON) -rocprofiler_systems_add_option(ROCPROFSYS_USE_RCCL "Enable RCCL support" OFF) rocprofiler_systems_add_option( ROCPROFSYS_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" ON) @@ -209,12 +213,6 @@ elseif("$ENV{ROCPROFSYS_CI}") endif() endif() -if(NOT ROCPROFSYS_USE_ROCM) - set(ROCPROFSYS_USE_RCCL - OFF - CACHE BOOL "Disabled via ROCPROFSYS_USE_ROCM=OFF" FORCE) -endif() - if(ROCPROFSYS_BUILD_TESTING) set(ROCPROFSYS_BUILD_EXAMPLES ON diff --git a/cmake/ConfigCPack.cmake b/cmake/ConfigCPack.cmake index 1c69444fb..f0b026bd6 100644 --- a/cmake/ConfigCPack.cmake +++ b/cmake/ConfigCPack.cmake @@ -150,16 +150,16 @@ if(ROCPROFSYS_USE_PAPI AND NOT ROCPROFSYS_BUILD_PAPI) list(APPEND _DEBIAN_PACKAGE_DEPENDS libpapi-dev libpfm4) endif() if(NOT ROCPROFSYS_BUILD_DYNINST) - if(NOT DYNINST_BUILD_BOOST) + if(NOT ROCPROFSYS_BUILD_BOOST) foreach(_BOOST_COMPONENT atomic system thread date-time filesystem timer) list(APPEND _DEBIAN_PACKAGE_DEPENDS "libboost-${_BOOST_COMPONENT}-dev (>= 1.67.0)") endforeach() endif() - if(NOT DYNINST_BUILD_TBB) + if(NOT ROCPROFSYS_BUILD_TBB) list(APPEND _DEBIAN_PACKAGE_DEPENDS "libtbb-dev (>= 2018.6)") endif() - if(NOT DYNINST_BUILD_LIBIBERTY) + if(NOT ROCPROFSYS_BUILD_LIBIBERTY) list(APPEND _DEBIAN_PACKAGE_DEPENDS "libiberty-dev (>= 20170913)") endif() endif() diff --git a/cmake/DyninstBoost.cmake b/cmake/DyninstBoost.cmake new file mode 100644 index 000000000..553b52f56 --- /dev/null +++ b/cmake/DyninstBoost.cmake @@ -0,0 +1,389 @@ +# ======================================================================================================== +# Boost.cmake +# +# Configure Boost for Dyninst +# +# ---------------------------------------- +# +# Accepts the following CMake variables +# +# Boost_ROOT_DIR - Hint directory that contains the Boost installation +# PATH_BOOST - Alias for Boost_ROOT_DIR Boost_MIN_VERSION - Minimum +# acceptable version of Boost Boost_USE_MULTITHREADED - Use the multithreaded version of +# Boost Boost_USE_STATIC_RUNTIME - Use libraries linked statically to the C++ runtime +# +# Options inherited from Modules/FindBoost.cmake that may be useful +# +# BOOST_INCLUDEDIR - Hint directory that contains the Boost headers files +# BOOST_LIBRARYDIR - Hint directory that contains the Boost library files +# +# Advanced options: +# +# Boost_DEBUG - Enable debug output from FindBoost Boost_NO_SYSTEM_PATHS - +# Disable searching in locations not specified by hint variables +# +# Exports the following CMake cache variables +# +# Boost_ROOT_DIR - Computed base directory the of Boost installation +# Boost_INCLUDE_DIRS - Boost include directories Boost_INCLUDE_DIR - Alias for +# Boost_INCLUDE_DIRS Boost_LIBRARY_DIRS - Link directories for Boost libraries +# Boost_DEFINES - Boost compiler definitions Boost_LIBRARIES - Boost +# library files Boost__LIBRARY_RELEASE - Release libraries to link for component +# ( is upper-case) Boost__LIBRARY_DEBUG - Debug libraries to link for component +# Boost_THREAD_LIBRARY - The filename of the Boost thread library +# Boost_USE_MULTITHREADED - Use the multithreaded version of Boost +# Boost_USE_STATIC_RUNTIME - Use libraries linked statically to the C++ runtime +# +# NOTE: The exported Boost_ROOT_DIR can be different from the value provided by the user +# in the case that it is determined to build Boost from source. In such a case, +# Boost_ROOT_DIR will contain the directory of the from-source installation. +# +# See Modules/FindBoost.cmake for additional input and exported variables +# +# ======================================================================================================== + +include_guard(GLOBAL) + +# always provide Dyninst::Boost even if it is empty +rocprofiler_systems_add_interface_library(rocprofiler-systems-boost + "Boost interface library") + +if(NOT BUILD_BOOST) + find_package(Boost) +endif() + +if(Boost_FOUND) + return() +endif() + +# Need at least Boost-1.67 because of deprecated headers +set(_boost_min_version 1.67.0) + +# Provide a default, if the user didn't specify +set(Boost_MIN_VERSION + ${_boost_min_version} + CACHE STRING "Minimum Boost version") + +# Enforce minimum version +if(${Boost_MIN_VERSION} VERSION_LESS ${_boost_min_version}) + rocprofiler_systems_message( + FATAL_ERROR + "Requested Boost-${Boost_MIN_VERSION} is less than minimum supported version (${_boost_min_version})" + ) +endif() + +# -------------- RUNTIME CONFIGURATION ---------------------------------------- + +# Use the multithreaded version of Boost NB: This _must_ be a cache variable as it +# controls the tagged layout of Boost library names +set(Boost_USE_MULTITHREADED + ON + CACHE BOOL "Enable multithreaded Boost libraries") + +# Don't use libraries linked statically to the C++ runtime NB: This _must_ be a cache +# variable as it controls the tagged layout of Boost library names +set(Boost_USE_STATIC_RUNTIME + OFF + CACHE BOOL "Enable usage of libraries statically linked to C++ runtime") + +# If using multithreaded Boost, make sure Threads has been intialized +if(Boost_USE_MULTITHREADED AND NOT DEFINED CMAKE_THREAD_LIBS_INIT) + find_package(Threads) +endif() + +# Enable debug output from FindBoost +set(Boost_DEBUG + OFF + CACHE BOOL "Enable debug output from FindBoost") + +# -------------- PATHS -------------------------------------------------------- + +# By default, search system paths +set(Boost_NO_SYSTEM_PATHS + OFF + CACHE BOOL "Disable searching in locations not specified by hint variables") + +# A sanity check This must be done _before_ the cache variables are set +if(PATH_BOOST AND Boost_ROOT_DIR) + rocprofiler_systems_message( + FATAL_ERROR + "PATH_BOOST AND Boost_ROOT_DIR both specified. Please provide only one") +endif() + +# Provide a default root directory +if(NOT PATH_BOOST AND NOT Boost_ROOT_DIR) + set(PATH_BOOST "/usr") +endif() + +# Set the default location to look for Boost +set(Boost_ROOT_DIR + ${PATH_BOOST} + CACHE PATH "Base directory the of Boost installation") + +# In FindBoost, Boost_ROOT_DIR is spelled BOOST_ROOT +set(BOOST_ROOT ${Boost_ROOT_DIR}) + +# -------------- COMPILER DEFINES --------------------------------------------- + +set(_boost_defines) + +# Disable auto-linking +list(APPEND _boost_defines BOOST_ALL_NO_LIB=1) + +# Disable generating serialization code in boost::multi_index +list(APPEND _boost_defines BOOST_MULTI_INDEX_DISABLE_SERIALIZATION) + +# There are broken versions of MSVC that won't handle variadic templates correctly +# (despite the C++11 test case passing). +if(MSVC) + list(APPEND _boost_defines BOOST_NO_CXX11_VARIADIC_TEMPLATES) +endif() + +set(Boost_DEFINES + ${_boost_defines} + CACHE STRING "Boost compiler defines") +add_compile_definitions(${Boost_DEFINES}) + +# -------------- INTERNALS ---------------------------------------------------- + +# Disable Boost's own CMake as it's known to be buggy NB: This should not be a cache +# variable +set(Boost_NO_BOOST_CMAKE ON) + +# The required Boost library components NB: These are just the ones that require +# compilation/linking This should _not_ be a cache variable +set(_boost_components atomic chrono date_time filesystem system thread timer) + +if(NOT BUILD_BOOST) + find_package(Boost ${Boost_MIN_VERSION} QUIET COMPONENTS ${_boost_components}) +endif() + +# -------------- SOURCE BUILD ------------------------------------------------- + +if(Boost_FOUND AND NOT BUILD_BOOST) + # Force the cache entries to be updated Normally, these would not be exported. + # However, we need them in the Testsuite + set(Boost_INCLUDE_DIRS + ${Boost_INCLUDE_DIRS} + CACHE PATH "Boost include directory" FORCE) + set(Boost_LIBRARY_DIRS + ${Boost_LIBRARY_DIRS} + CACHE PATH "Boost library directory" FORCE) + set(Boost_INCLUDE_DIR + ${Boost_INCLUDE_DIR} + CACHE PATH "Boost include directory" FORCE) +elseif(NOT Boost_FOUND AND STERILE_BUILD) + rocprofiler_systems_message( + FATAL_ERROR "Boost not found and cannot be downloaded because build is sterile.") +elseif(NOT BUILD_BOOST) + rocprofiler_systems_message( + FATAL_ERROR + "Boost was not found. Either configure cmake to find Boost properly or set BUILD_BOOST=ON to download and build" + ) +else() + rocprofiler_systems_add_option(BOOST_LINK_STATIC "Link to boost libraries statically" + ON) + # If we didn't find a suitable version on the system, then download one from the web + rocprofiler_systems_add_cache_option( + ROCPROFSYS_BOOST_DOWNLOAD_VERSION "Version of boost to download and install" + STRING "1.79.0") + + # If the user specifies a version other than ROCPROFSYS_BOOST_DOWNLOAD_VERSION, use + # that version. + if(${ROCPROFSYS_BOOST_DOWNLOAD_VERSION} VERSION_LESS ${Boost_MIN_VERSION}) + rocprofiler_systems_message( + FATAL_ERROR + "Boost download version is set to ${ROCPROFSYS_BOOST_DOWNLOAD_VERSION} but Boost minimum version is set to ${Boost_MIN_VERSION}" + ) + endif() + + rocprofiler_systems_message( + STATUS + "Attempting to build BOOST(${ROCPROFSYS_BOOST_DOWNLOAD_VERSION}) as external project" + ) + + if(Boost_USE_MULTITHREADED) + set(_boost_threading multi) + else() + set(_boost_threading single) + endif() + + if(Boost_USE_STATIC_RUNTIME) + set(_boost_runtime_link static) + else() + set(_boost_runtime_link shared) + endif() + + # Change the base directory + set(Boost_ROOT_DIR + ${TPL_STAGING_PREFIX}/boost + CACHE PATH "Base directory the of Boost installation" FORCE) + + # Update the exported variables + set(Boost_INCLUDE_DIRS + "$;$" + CACHE PATH "Boost include directory" FORCE) + set(Boost_LIBRARY_DIRS + "$;$" + CACHE PATH "Boost library directory" FORCE) + set(Boost_INCLUDE_DIR + ${Boost_INCLUDE_DIRS} + CACHE PATH "Boost include directory" FORCE) + + file(MAKE_DIRECTORY ${Boost_ROOT_DIR}/include) + file(MAKE_DIRECTORY ${Boost_ROOT_DIR}/lib) + + if(BOOST_LINK_STATIC) + set(_BOOST_LINK static) + else() + set(_BOOST_LINK shared) + endif() + + set(BOOST_ARGS link=${_BOOST_LINK} runtime-link=${_boost_runtime_link} + threading=${_boost_threading}) + if(WIN32) + # NB: We need to build both debug/release on windows as we don't use + # CMAKE_BUILD_TYPE + set(BOOST_BOOTSTRAP call bootstrap.bat) + set(BOOST_BUILD ".\\b2") + if(CMAKE_SIZEOF_VOID_P STREQUAL "8") + list(APPEND BOOST_ARGS address-model=64) + endif() + else() + set(BOOST_BOOTSTRAP "./bootstrap.sh") + set(BOOST_BUILD "./b2") + if(CMAKE_BUILD_TYPE MATCHES "^(Debug|DEBUG)$") + list(APPEND BOOST_ARGS variant=debug) + else() + list(APPEND BOOST_ARGS variant=release) + endif() + endif() + + # Join the component names together to pass to --with-libraries during bootstrap + set(_boost_lib_names "headers,") + foreach(c ${_boost_components}) + # list(JOIN ...) is in cmake 3.12 + string(CONCAT _boost_lib_names "${_boost_lib_names}${c},") + endforeach() + + if(CMAKE_CXX_COMPILER_ID MATCHES "(GNU|Clang|Intel)") + list(APPEND BOOST_ARGS cflags=-fPIC cxxflags=-fPIC) + endif() + + string(REPLACE "." "_" _boost_download_filename ${ROCPROFSYS_BOOST_DOWNLOAD_VERSION}) + # zip is subject to locales on Unix + set(_boost_download_ext "zip") + if(UNIX) + set(_boost_download_ext "tar.gz") + endif() + + set(_LIB_SUFFIX "${CMAKE_SHARED_LIBRARY_SUFFIX}") + if(BOOST_LINK_STATIC) + set(_LIB_SUFFIX "${CMAKE_STATIC_LIBRARY_SUFFIX}") + endif() + + if(WIN32) + # We need to specify different library names for debug vs release + set(Boost_LIBRARIES "") + foreach(c ${_boost_components}) + list(APPEND Boost_LIBRARIES "optimized libboost_${c} debug libboost_${c}-gd ") + list(APPEND _boost_build_byproducts + "{Boost_ROOT_DIR}/lib/libboost_${c}${_LIB_SUFFIX}") + set(Boost_${c}_LIBRARY + $ + $) + set(Boost_${c}_LIBRARY_DEBUG + $ + $) + + # Also export cache variables for the file location of each library + string(TOUPPER ${c} _basename) + set(Boost_${_basename}_LIBRARY_RELEASE + "${Boost_${c}_LIBRARY}" + CACHE FILEPATH "" FORCE) + set(Boost_${_basename}_LIBRARY_DEBUG + "${Boost_${c}_LIBRARY_DEBUG}" + CACHE FILEPATH "" FORCE) + endforeach() + else() + # Transform the component names into the library filenames e.g., system -> + # boost_system + set(Boost_LIBRARIES "") + foreach(c ${_boost_components}) + set(Boost_${c}_LIBRARY + $ + $/${INSTALL_LIB_DIR}/${TPL_INSTALL_LIB_DIR}/libboost_${c}${_LIB_SUFFIX}> + ) + list(APPEND _boost_build_byproducts + "${Boost_ROOT_DIR}/lib/libboost_${c}${_LIB_SUFFIX}") + list(APPEND Boost_LIBRARIES "${Boost_${c}_LIBRARY}") + + # Also export cache variables for the file location of each library + string(TOUPPER ${c} _basename) + set(Boost_${_basename}_LIBRARY_RELEASE + "${Boost_${c}_LIBRARY}" + CACHE FILEPATH "" FORCE) + set(Boost_${_basename}_LIBRARY_DEBUG + "${Boost_${c}_LIBRARY}" + CACHE FILEPATH "" FORCE) + endforeach() + endif() + + include(ExternalProject) + externalproject_add( + rocprofiler-systems-boost-build + PREFIX ${Boost_ROOT_DIR} + GIT_REPOSITORY https://github.com/boostorg/boost.git + GIT_TAG boost-${ROCPROFSYS_BOOST_DOWNLOAD_VERSION} + BUILD_IN_SOURCE 1 + CONFIGURE_COMMAND ${BOOST_BOOTSTRAP} --prefix=${Boost_ROOT_DIR} + --with-libraries=${_boost_lib_names} + BUILD_COMMAND ${BOOST_BUILD} --ignore-site-config --prefix=${Boost_ROOT_DIR} -j2 + ${BOOST_ARGS} -d0 install + BUILD_BYPRODUCTS ${_boost_build_byproducts} + INSTALL_COMMAND "") + + # target for re-executing the installation + add_custom_target( + rocprofiler-systems-boost-install + COMMAND ${BOOST_BUILD} ${BOOST_ARGS} -d0 install + WORKING_DIRECTORY ${Boost_ROOT_DIR}/src/Boost-External + COMMENT "Installing Boost...") +endif() + +# -------------- EXPORT VARIABLES --------------------------------------------- + +# Export Boost_THREAD_LIBRARY +list(FIND _boost_components "thread" _building_threads) +if(Boost_USE_MULTITHREADED AND ${_building_threads}) + # On Windows, always use the debug version On Linux, we don't use tagged builds, so + # the debug/release filenames are the same + set(Boost_THREAD_LIBRARY + ${Boost_THREAD_LIBRARY_DEBUG} + CACHE FILEPATH "Boost thread library") +endif() + +# Add the system thread library +if(Boost_USE_MULTITHREADED) + list(APPEND Boost_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) +endif() + +# Export the complete set of libraries +set(Boost_LIBRARIES + ${Boost_LIBRARIES} + CACHE FILEPATH "Boost library files" FORCE) + +target_include_directories(rocprofiler-systems-boost SYSTEM + INTERFACE ${Boost_INCLUDE_DIRS}) +target_compile_definitions(rocprofiler-systems-boost INTERFACE ${Boost_DEFINITIONS}) +target_link_directories(rocprofiler-systems-boost INTERFACE ${Boost_LIBRARY_DIRS}) +target_link_libraries(rocprofiler-systems-boost INTERFACE ${Boost_LIBRARIES}) + +rocprofiler_systems_message(STATUS "Boost includes: ${Boost_INCLUDE_DIRS}") +rocprofiler_systems_message(STATUS "Boost library dirs: ${Boost_LIBRARY_DIRS}") +rocprofiler_systems_message(STATUS "Boost thread library: ${Boost_THREAD_LIBRARY}") +rocprofiler_systems_message(STATUS "Boost libraries: ${Boost_LIBRARIES}") + +# Just the headers (effectively a simplified Boost::headers target) +add_library(Dyninst::Boost_headers INTERFACE IMPORTED) +target_include_directories(Dyninst::Boost_headers SYSTEM INTERFACE ${Boost_INCLUDE_DIRS}) diff --git a/cmake/DyninstElfUtils.cmake b/cmake/DyninstElfUtils.cmake new file mode 100644 index 000000000..6ee4c49f3 --- /dev/null +++ b/cmake/DyninstElfUtils.cmake @@ -0,0 +1,218 @@ +# ====================================================================================== +# elfutils.cmake +# +# Configure elfutils for Dyninst +# +# ---------------------------------------- +# +# Accepts the following CMake variables +# +# ElfUtils_ROOT_DIR - Base directory the of elfutils installation +# ElfUtils_INCLUDEDIR - Hint directory that contains the elfutils headers files +# ElfUtils_LIBRARYDIR - Hint directory that contains the elfutils library files +# ElfUtils_MIN_VERSION - Minimum acceptable version of elfutils +# +# Directly exports the following CMake variables +# +# ElfUtils_ROOT_DIR - Computed base directory the of elfutils installation +# ElfUtils_INCLUDE_DIRS - elfutils include directories ElfUtils_LIBRARY_DIRS - Link +# directories for elfutils libraries ElfUtils_LIBRARIES - elfutils library files +# +# NOTE: The exported ElfUtils_ROOT_DIR can be different from the value provided by the +# user in the case that it is determined to build elfutils from source. In such a case, +# ElfUtils_ROOT_DIR will contain the directory of the from-source installation. +# +# See Modules/FindLibElf.cmake and Modules/FindLibDwarf.cmake for details +# +# ====================================================================================== + +include_guard(GLOBAL) + +# always provide Dyninst::ElfUtils even if it is a dummy +rocprofiler_systems_add_interface_library(ElfUtils "ElfUtils interface library") + +if(NOT BUILD_ELFUTILS) + find_package(Elfutils) +endif() + +if(LibElf_FOUND + AND LibDwarf_FOUND + AND NOT ENABLE_DEBUGINFOD) + return() +endif() + +if(NOT UNIX) + return() +endif() + +# Minimum acceptable version of elfutils NB: We need >=0.178 because libdw isn't +# thread-safe before then +set(_min_version 0.178) + +set(ElfUtils_MIN_VERSION + ${_min_version} + CACHE STRING "Minimum acceptable elfutils version") +if(${ElfUtils_MIN_VERSION} VERSION_LESS ${_min_version}) + rocprofiler_systems_message( + FATAL_ERROR + "Requested version ${ElfUtils_MIN_VERSION} is less than minimum supported version (${_min_version})" + ) +endif() + +# -------------- PATHS -------------------------------------------------------- + +# Base directory the of elfutils installation +set(ElfUtils_ROOT_DIR + "/usr" + CACHE PATH "Base directory the of elfutils installation") + +# Hint directory that contains the elfutils headers files +set(ElfUtils_INCLUDEDIR + "${ElfUtils_ROOT_DIR}/include" + CACHE PATH "Hint directory that contains the elfutils headers files") + +# Hint directory that contains the elfutils library files +set(ElfUtils_LIBRARYDIR + "${ElfUtils_ROOT_DIR}/lib" + CACHE PATH "Hint directory that contains the elfutils library files") + +# libelf/dwarf-specific directory hints +foreach(l LibElf LibDwarf LibDebuginfod) + foreach(d ROOT_DIR INCLUDEDIR LIBRARYDIR) + set(${l}_${d} ${ElfUtils_${d}}) + endforeach() +endforeach() + +# -------------- PACKAGES------------------------------------------------------ + +if(NOT BUILD_ELFUTILS) + find_package(LibElf ${ElfUtils_MIN_VERSION}) + + # Don't search for libdw or libdebuginfod if we didn't find a suitable libelf + if(LibElf_FOUND) + find_package(LibDwarf ${ElfUtils_MIN_VERSION}) + if(ENABLE_DEBUGINFOD) + find_package(LibDebuginfod ${ElfUtils_MIN_VERSION} REQUIRED) + endif() + endif() +endif() + +# -------------- SOURCE BUILD ------------------------------------------------- +if(LibElf_FOUND + AND LibDwarf_FOUND + AND (NOT ENABLE_DEBUGINFOD OR LibDebuginfod_FOUND)) + if(ENABLE_DEBUGINFOD AND LibDebuginfod_FOUND) + set(_eu_root ${ElfUtils_ROOT_DIR}) + set(_eu_inc_dirs ${LibElf_INCLUDE_DIRS} ${LibDwarf_INCLUDE_DIRS} + ${LibDebuginfod_INCLUDE_DIRS}) + set(_eu_lib_dirs ${LibElf_LIBRARY_DIRS} ${LibDwarf_LIBRARY_DIRS} + ${LibDebuginfod_LIBRARY_DIRS}) + set(_eu_libs ${LibElf_LIBRARIES} ${LibDwarf_LIBRARIES} ${LibDebuginfod_LIBRARIES}) + else() + set(_eu_root ${ElfUtils_ROOT_DIR}) + set(_eu_inc_dirs ${LibElf_INCLUDE_DIRS} ${LibDwarf_INCLUDE_DIRS}) + set(_eu_lib_dirs ${LibElf_LIBRARY_DIRS} ${LibDwarf_LIBRARY_DIRS}) + set(_eu_libs ${LibElf_LIBRARIES} ${LibDwarf_LIBRARIES}) + endif() +elseif(NOT (LibElf_FOUND AND LibDwarf_FOUND) AND STERILE_BUILD) + rocprofiler_systems_message( + FATAL_ERROR + "ElfUtils not found and cannot be downloaded because build is sterile.") +elseif(NOT BUILD_ELFUTILS) + rocprofiler_systems_message( + FATAL_ERROR + "ElfUtils was not found. Either configure cmake to find ElfUtils properly or set BUILD_ELFUTILS=ON to download and build" + ) +else() + # If we didn't find a suitable version on the system, then download one from the web + rocprofiler_systems_add_cache_option( + ELFUTILS_DOWNLOAD_VERSION "Version of elfutils to download and install" STRING + "0.188") + set(ELFUTILS_DOWNLOAD_VERSION ${ElfUtils_DOWNLOAD_VERSION}) + + # make sure we are not downloading a version less than minimum + if(${ELFUTILS_DOWNLOAD_VERSION} VERSION_LESS ${ElfUtils_MIN_VERSION}) + rocprofiler_systems_message( + FATAL_ERROR + "elfutils download version is set to ${ELFUTILS_DOWNLOAD_VERSION} but elfutils minimum version is set to ${ElfUtils_MIN_VERSION}" + ) + endif() + + rocprofiler_systems_message(STATUS "${ElfUtils_ERROR_REASON}") + rocprofiler_systems_message( + STATUS + "Attempting to build elfutils(${ELFUTILS_DOWNLOAD_VERSION}) as external project") + + if(NOT (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") OR NOT (${CMAKE_C_COMPILER_ID} + STREQUAL "GNU")) + rocprofiler_systems_message(FATAL_ERROR + "ElfUtils will only build with the GNU compiler") + endif() + + set(_eu_root ${TPL_STAGING_PREFIX}) + set(_eu_inc_dirs $ + $) + set(_eu_lib_dirs $ + $) + set(_eu_libs + $ + $ + $/${INSTALL_LIB_DIR}/${TPL_INSTALL_LIB_DIR}/libdw${CMAKE_SHARED_LIBRARY_SUFFIX}> + $/${INSTALL_LIB_DIR}/${TPL_INSTALL_LIB_DIR}/libelf${CMAKE_SHARED_LIBRARY_SUFFIX}> + ) + set(_eu_build_byproducts "${_eu_root}/lib/libdw${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${_eu_root}/lib/libelf${CMAKE_SHARED_LIBRARY_SUFFIX}") + + include(ExternalProject) + externalproject_add( + ElfUtils-External + PREFIX ${PROJECT_BINARY_DIR}/elfutils + URL ${ElfUtils_DOWNLOAD_URL} + "https://sourceware.org/elfutils/ftp/${ELFUTILS_DOWNLOAD_VERSION}/elfutils-${ELFUTILS_DOWNLOAD_VERSION}.tar.bz2" + "https://mirrors.kernel.org/sourceware/elfutils/${ELFUTILS_DOWNLOAD_VERSION}/elfutils-${ELFUTILS_DOWNLOAD_VERSION}.tar.bz2" + BUILD_IN_SOURCE 1 + CONFIGURE_COMMAND + ${CMAKE_COMMAND} -E env CC=${CMAKE_C_COMPILER} CFLAGS=-fPIC\ -O3 + CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=-fPIC\ -O3 + [=[LDFLAGS=-Wl,-rpath='$$ORIGIN']=] /configure + --enable-install-elfh --prefix=${TPL_STAGING_PREFIX} --disable-libdebuginfod + --disable-debuginfod --enable-thread-safety ${ElfUtils_CONFIG_OPTIONS} + --libdir=${TPL_STAGING_PREFIX}/lib + BUILD_COMMAND make install + BUILD_BYPRODUCTS ${_eu_build_byproducts} + INSTALL_COMMAND "") + + # target for re-executing the installation + add_custom_target( + install-elfutils-external + COMMAND make install + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/elfutils/src/ElfUtils-External + COMMENT "Installing ElfUtils...") +endif() + +# -------------- EXPORT VARIABLES --------------------------------------------- + +set(ElfUtils_ROOT_DIR + ${_eu_root} + CACHE PATH "Base directory the of elfutils installation" FORCE) +set(ElfUtils_INCLUDE_DIRS + ${_eu_inc_dirs} + CACHE PATH "elfutils include directory" FORCE) +set(ElfUtils_LIBRARY_DIRS + ${_eu_lib_dirs} + CACHE PATH "elfutils library directory" FORCE) +set(ElfUtils_INCLUDE_DIR + ${ElfUtils_INCLUDE_DIRS} + CACHE PATH "elfutils include directory" FORCE) +set(ElfUtils_LIBRARIES + ${_eu_libs} + CACHE FILEPATH "elfutils library files" FORCE) + +target_include_directories(ElfUtils SYSTEM INTERFACE ${ElfUtils_INCLUDE_DIRS}) +target_compile_definitions(ElfUtils INTERFACE ${ElfUtils_DEFINITIONS}) +target_link_directories(ElfUtils INTERFACE ${ElfUtils_LIBRARY_DIRS}) +target_link_libraries(ElfUtils INTERFACE ${ElfUtils_LIBRARIES}) + +rocprofiler_systems_message(STATUS "ElfUtils includes: ${ElfUtils_INCLUDE_DIRS}") +rocprofiler_systems_message(STATUS "ElfUtils library dirs: ${ElfUtils_LIBRARY_DIRS}") +rocprofiler_systems_message(STATUS "ElfUtils libraries: ${ElfUtils_LIBRARIES}") diff --git a/cmake/DyninstExternals.cmake b/cmake/DyninstExternals.cmake new file mode 100644 index 000000000..3f9c71833 --- /dev/null +++ b/cmake/DyninstExternals.cmake @@ -0,0 +1,132 @@ +include(MacroUtilities) + +# Map deprecated DYNINST_BUILD_* variables to new ROCPROFSYS_BUILD_* variables +foreach(dep BOOST TBB ELFUTILS LIBIBERTY) + if(DYNINST_BUILD_${dep}) + message( + WARNING + "DYNINST_BUILD_${dep} is deprecated. Use ROCPROFSYS_BUILD_${dep} instead." + ) + set(ROCPROFSYS_BUILD_${dep} ON) + endif() +endforeach() + +# Set BUILD_* to ON if ROCPROFSYS_BUILD_* is ON +foreach(dep BOOST TBB ELFUTILS LIBIBERTY) + if(ROCPROFSYS_BUILD_${dep}) + if(dep STREQUAL "BOOST") + rocprofiler_systems_add_option(BUILD_BOOST "Enable building Boost internally" + ON) + elseif(dep STREQUAL "TBB") + rocprofiler_systems_add_option(BUILD_TBB "Enable building TBB internally" ON) + elseif(dep STREQUAL "ELFUTILS") + rocprofiler_systems_add_option(BUILD_ELFUTILS + "Enable building elfutils internally" ON) + elseif(dep STREQUAL "LIBIBERTY") + rocprofiler_systems_add_option(BUILD_LIBIBERTY + "Enable building libiberty internally" ON) + endif() + endif() +endforeach() + +set(TPL_STAGING_PREFIX + "${PROJECT_BINARY_DIR}/external" + CACHE PATH "Third-party library build-tree install prefix") +file(MAKE_DIRECTORY "${TPL_STAGING_PREFIX}") +file(MAKE_DIRECTORY "${TPL_STAGING_PREFIX}/include") + +add_custom_target(external-prebuild) + +# Add external dependencies to be built +include(DyninstBoost) +if(TARGET rocprofiler-systems-boost-build) + # Make Boost build serially + set_target_properties( + rocprofiler-systems-boost PROPERTIES JOB_POOL_COMPILE external_deps_pool + JOB_POOL_LINK external_deps_pool) + # Create a prebuild target that depends on Boost + add_dependencies(external-prebuild rocprofiler-systems-boost-build) +endif() + +include(DyninstTBB) +if(TARGET rocprofiler-systems-tbb-build AND TARGET external-prebuild) + # Make TBB build serially and wait for Boost + set_target_properties( + rocprofiler-systems-tbb-build PROPERTIES JOB_POOL_COMPILE external_deps_pool + JOB_POOL_LINK external_deps_pool) + add_dependencies(external-prebuild rocprofiler-systems-tbb-build) +endif() + +include(DyninstElfUtils) +if(TARGET ElfUtils-External AND TARGET external-prebuild) + set_target_properties(ElfUtils-External PROPERTIES JOB_POOL_COMPILE external_deps_pool + JOB_POOL_LINK external_deps_pool) + add_dependencies(external-prebuild ElfUtils-External) +endif() + +include(DyninstLibIberty) +if(TARGET rocprofiler-systems-libiberty-build AND TARGET external-prebuild) + set_target_properties( + rocprofiler-systems-libiberty-build PROPERTIES JOB_POOL_COMPILE external_deps_pool + JOB_POOL_LINK external_deps_pool) + add_dependencies(external-prebuild rocprofiler-systems-libiberty-build) +endif() + +# Final dependency check +if(NOT TARGET external-prebuild) + message(WARNING "Not all dyninst external dependencies found. Build may fail.") +endif() + +# Create a dummy target to ensure external dependencies are fully built +add_custom_target(external-deps-complete) +if(TARGET external-prebuild) + add_dependencies(external-deps-complete external-prebuild) +endif() + +if(NOT TARGET Dyninst::Boost AND TARGET rocprofiler-systems-boost) + add_library(Dyninst::Boost INTERFACE IMPORTED) + set_target_properties(Dyninst::Boost PROPERTIES INTERFACE_LINK_LIBRARIES + rocprofiler-systems-boost) + message( + STATUS + "Created imported target Dyninst::Boost linked to rocprofiler-systems-boost") +endif() + +if(NOT TARGET Dyninst::ElfUtils AND TARGET ElfUtils) + add_library(Dyninst::ElfUtils INTERFACE IMPORTED) + set_target_properties(Dyninst::ElfUtils PROPERTIES INTERFACE_LINK_LIBRARIES ElfUtils) + message(STATUS "Created imported target Dyninst::ElfUtils linked to ElfUtils") +endif() + +if(NOT TARGET Dyninst::TBB AND TARGET rocprofiler-systems-tbb) + add_library(Dyninst::TBB INTERFACE IMPORTED) + set_target_properties(Dyninst::TBB PROPERTIES INTERFACE_LINK_LIBRARIES + rocprofiler-systems-tbb) + message( + STATUS "Created imported target Dyninst::TBB linked to rocprofiler-systems-tbb") +endif() + +if(NOT TARGET Dyninst::LibIberty AND TARGET rocprofiler-systems-libiberty) + add_library(Dyninst::LibIberty INTERFACE IMPORTED) + set_target_properties(Dyninst::LibIberty PROPERTIES INTERFACE_LINK_LIBRARIES + rocprofiler-systems-libiberty) + message( + STATUS + "Created imported target Dyninst::LibIberty linked to rocprofiler-systems-libiberty" + ) +endif() + +# for packaging +install( + DIRECTORY ${TPL_STAGING_PREFIX}/lib/ + DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME} + FILES_MATCHING + PATTERN "*${CMAKE_SHARED_LIBRARY_SUFFIX}*" + PATTERN "*${CMAKE_STATIC_LIBRARY_SUFFIX}*") + +install( + DIRECTORY ${TPL_STAGING_PREFIX}/tbb/lib/ + DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME} + FILES_MATCHING + PATTERN "*${CMAKE_SHARED_LIBRARY_SUFFIX}*" + PATTERN "*${CMAKE_STATIC_LIBRARY_SUFFIX}*") diff --git a/cmake/DyninstLibIberty.cmake b/cmake/DyninstLibIberty.cmake new file mode 100644 index 000000000..c2248d982 --- /dev/null +++ b/cmake/DyninstLibIberty.cmake @@ -0,0 +1,151 @@ +# ====================================================================================== +# LibIberty.cmake +# +# Configure LibIberty for Dyninst +# +# ---------------------------------------- +# +# Directly exports the following CMake variables +# +# LibIberty_ROOT_DIR - Computed base directory the of LibIberty installation +# LibIberty_LIBRARY_DIRS - Link directories for LibIberty libraries LibIberty_LIBRARIES +# - LibIberty library files LibIberty_INCLUDE - LibIberty include files +# +# NOTE: The exported LibIberty_ROOT_DIR can be different from the value provided by the +# user in the case that it is determined to build LibIberty from source. In such a case, +# LibIberty_ROOT_DIR will contain the directory of the from-source installation. +# +# See Modules/FindLibIberty.cmake for details +# +# ====================================================================================== + +include_guard(GLOBAL) + +# always provide Dyninst::LibIberty even if it is empty +rocprofiler_systems_add_interface_library(rocprofiler-systems-libiberty + "LibIberty interface library") + +if(NOT UNIX) + return() +endif() + +# -------------- PATHS -------------------------------------------------------- + +# Base directory the of LibIberty installation +set(LibIberty_ROOT_DIR + "/usr" + CACHE PATH "Base directory the of LibIberty installation") + +# Hint directory that contains the LibIberty library files +set(LibIberty_LIBRARYDIR + "${LibIberty_ROOT_DIR}/lib" + CACHE PATH "Hint directory that contains the LibIberty library files") + +# -------------- PACKAGES ----------------------------------------------------- + +if(NOT BUILD_LIBIBERTY) + find_package(LibIberty) +endif() + +# -------------- SOURCE BUILD ------------------------------------------------- +if(LibIberty_FOUND) + set(_li_root ${LibIberty_ROOT_DIR}) + set(_li_inc_dirs ${LibIberty_INCLUDE_DIRS}) + set(_li_lib_dirs ${LibIberty_LIBRARY_DIRS}) + set(_li_libs ${LibIberty_LIBRARIES}) +elseif(STERILE_BUILD) + rocprofiler_systems_message( + FATAL_ERROR + "LibIberty not found and cannot be downloaded because build is sterile.") +elseif(NOT BUILD_LIBIBERTY) + rocprofiler_systems_message( + FATAL_ERROR + "LibIberty was not found. Either configure cmake to find TBB properly or set BUILD_LIBIBERTY=ON to download and build" + ) +else() + rocprofiler_systems_message(STATUS "${LibIberty_ERROR_REASON}") + rocprofiler_systems_message(STATUS + "Attempting to build LibIberty as external project") + + set(_li_root ${TPL_STAGING_PREFIX}/binutils) + set(_li_project_name rocprofiler-systems-libiberty-build) + set(_li_working_dir ${_li_root}/src/${_li_project_name}) + set(_li_inc_dirs $) + set(_li_lib_dirs $ + $) + set(_li_libs + $ + $/${INSTALL_LIB_DIR}/${TPL_INSTALL_LIB_DIR}/libiberty${CMAKE_STATIC_LIBRARY_SUFFIX}> + ) + set(_li_build_byproducts "${_li_root}/lib/libiberty${CMAKE_STATIC_LIBRARY_SUFFIX}") + + file(MAKE_DIRECTORY "${_li_root}/lib") + file(MAKE_DIRECTORY "${_li_root}/include") + + include(ExternalProject) + externalproject_add( + ${_li_project_name} + PREFIX ${_li_root} + URL ${DYNINST_BINUTILS_DOWNLOAD_URL} + http://ftpmirror.gnu.org/gnu/binutils/binutils-2.42.tar.gz + http://mirrors.kernel.org/sourceware/binutils/releases/binutils-2.42.tar.gz + BUILD_IN_SOURCE 1 + CONFIGURE_COMMAND + ${CMAKE_COMMAND} -E env CC=${CMAKE_C_COMPILER} CFLAGS=-fPIC\ -O3 + CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=-fPIC\ -O3 /configure + --prefix=${_li_root} + BUILD_COMMAND make + BUILD_BYPRODUCTS ${_li_build_byproducts} + INSTALL_COMMAND "") + + add_custom_command( + TARGET ${_li_project_name} + POST_BUILD + COMMAND install ARGS -C ${_li_working_dir}/libiberty/libiberty.a ${_li_root}/lib + COMMAND install ARGS -C ${_li_working_dir}/include/*.h ${_li_root}/include + COMMENT "Installing LibIberty...") + + # target for re-executing the installation + add_custom_target( + rocprofiler-systems-libiberty-install + COMMAND install -C ${_li_working_dir}/libiberty/libiberty.a ${_li_root}/lib + COMMAND install ARGS -C ${_li_working_dir}/include/*.h ${_li_root}/include + WORKING_DIRECTORY ${_li_working_dir} + COMMENT "Installing LibIberty...") + + # For backward compatibility + set(IBERTY_FOUND TRUE) + set(IBERTY_BUILD TRUE) +endif() + +# -------------- EXPORT VARIABLES --------------------------------------------- + +foreach(_DIR_TYPE inc lib) + if(_li_${_DIR_TYPE}_dirs) + list(REMOVE_DUPLICATES _li_${_DIR_TYPE}_dirs) + endif() +endforeach() + +target_include_directories(rocprofiler-systems-libiberty INTERFACE ${_li_inc_dirs}) +target_link_directories(rocprofiler-systems-libiberty INTERFACE ${_lib_lib_dirs}) +target_link_libraries(rocprofiler-systems-libiberty INTERFACE ${_li_libs}) + +set(LibIberty_ROOT_DIR + ${_li_root} + CACHE PATH "Base directory the of LibIberty installation" FORCE) +set(LibIberty_INCLUDE_DIRS + ${_li_inc_dirs} + CACHE PATH "LibIberty include directories" FORCE) +set(LibIberty_LIBRARY_DIRS + ${_li_lib_dirs} + CACHE PATH "LibIberty library directory" FORCE) +set(LibIberty_LIBRARIES + ${_li_libs} + CACHE FILEPATH "LibIberty library files" FORCE) + +# For backward compatibility only +set(IBERTY_LIBRARIES ${LibIberty_LIBRARIES}) + +rocprofiler_systems_message(STATUS "LibIberty include dirs: ${LibIberty_INCLUDE_DIRS}") +rocprofiler_systems_message(STATUS "LibIberty library dirs: ${LibIberty_LIBRARY_DIRS}") +rocprofiler_systems_message(STATUS "LibIberty libraries: ${LibIberty_LIBRARIES}") diff --git a/cmake/DyninstTBB.cmake b/cmake/DyninstTBB.cmake new file mode 100644 index 000000000..f95cd69a0 --- /dev/null +++ b/cmake/DyninstTBB.cmake @@ -0,0 +1,260 @@ +# ===================================================================================== +# ThreadingBuildingBlocks.cmake +# +# Configure Intel's Threading Building Blocks for Dyninst +# +# ---------------------------------------- +# +# Accepts the following CMake variables +# +# TBB_ROOT_DIR - Hint directory that contains the TBB installation TBB_INCLUDEDIR - +# Hint directory that contains the TBB headers files TBB_LIBRARYDIR - Hint directory +# that contains the TBB library files TBB_LIBRARY - Alias for TBB_LIBRARYDIR +# TBB_USE_DEBUG_BUILD - Use debug version of tbb libraries, if present TBB_MIN_VERSION - +# Minimum acceptable version of TBB +# +# Directly exports the following CMake variables +# +# TBB_ROOT_DIR - Computed base directory of TBB installation TBB_INCLUDE_DIRS - +# TBB include directory TBB_INCLUDE_DIR - Alias for TBB_INCLUDE_DIRS TBB_LIBRARY_DIRS +# - TBB library directory TBB_LIBRARY_DIR - Alias for TBB_LIBRARY_DIRS TBB_DEFINITIONS - +# TBB compiler definitions TBB_LIBRARIES - TBB library files +# +# TBB__LIBRARY_RELEASE - Path to the release version of component +# TBB__LIBRARY_DEBUG - Path to the debug version of component +# +# NOTE: The exported TBB_ROOT_DIR can be different from the value provided by the user in +# the case that it is determined to build TBB from source. In such a case, TBB_ROOT_DIR +# will contain the directory of the from-source installation. +# +# See Modules/FindTBB.cmake for additional input and exported variables +# +# ===================================================================================== + +include_guard(GLOBAL) + +# always provide Dyninst::TBB even if it is a dummy +rocprofiler_systems_add_interface_library(rocprofiler-systems-tbb + "Threading Building Blocks") + +if(TBB_FOUND) + return() +endif() + +# -------------- RUNTIME CONFIGURATION ---------------------------------------- + +# Use debug versions of TBB libraries +set(TBB_USE_DEBUG_BUILD + OFF + CACHE BOOL "Use debug versions of TBB libraries") + +# Minimum version of TBB (assumes a dotted-decimal format: YYYY.XX) +if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + set(_tbb_min_version 2019.7) +else() + set(_tbb_min_version 2018.6) +endif() + +set(TBB_MIN_VERSION + ${_tbb_min_version} + CACHE STRING "Minimum version of TBB (assumes a dotted-decimal format: YYYY.XX)") + +if(${TBB_MIN_VERSION} VERSION_LESS ${_tbb_min_version}) + dyninst_message( + FATAL_ERROR + "Requested TBB version ${TBB_MIN_VERSION} is less than minimum supported version ${_tbb_min_version}" + ) +endif() + +# -------------- PATHS -------------------------------------------------------- + +# TBB root directory +set(TBB_ROOT_DIR + "/usr" + CACHE PATH "TBB root directory") + +# TBB include directory hint +set(TBB_INCLUDEDIR + "${TBB_ROOT_DIR}/include" + CACHE INTERNAL "TBB include directory") + +# TBB library directory hint +set(TBB_LIBRARYDIR + "${TBB_ROOT_DIR}/lib" + CACHE INTERNAL "TBB library directory") + +# Translate to FindTBB names +set(TBB_LIBRARY ${TBB_LIBRARYDIR}) +set(TBB_INCLUDE_DIR ${TBB_INCLUDEDIR}) + +# The specific TBB libraries we need NB: This should _NOT_ be a cache variable +set(_tbb_components tbb tbbmalloc_proxy tbbmalloc) + +if(NOT BUILD_TBB) + find_package(TBB ${TBB_MIN_VERSION} COMPONENTS ${_tbb_components}) +endif() + +# -------------- SOURCE BUILD ------------------------------------------------- +if(TBB_FOUND) + # Force the cache entries to be updated Normally, these would not be exported. + # However, we need them in the Testsuite + set(TBB_INCLUDE_DIRS + ${TBB_INCLUDE_DIRS} + CACHE PATH "TBB include directory" FORCE) + set(TBB_LIBRARY_DIRS + ${TBB_LIBRARY_DIRS} + CACHE PATH "TBB library directory" FORCE) + set(TBB_DEFINITIONS + ${TBB_DEFINITIONS} + CACHE STRING "TBB compiler definitions" FORCE) + set(TBB_LIBRARIES + ${TBB_LIBRARIES} + CACHE FILEPATH "TBB library files" FORCE) +elseif(STERILE_BUILD) + rocprofiler_systems_message( + FATAL_ERROR "TBB not found and cannot be downloaded because build is sterile.") +elseif(NOT BUILD_TBB) + rocprofiler_systems_message( + FATAL_ERROR + "TBB was not found. Either configure cmake to find TBB properly or set BUILD_TBB=ON to download and build" + ) +else() + # If we didn't find a suitable version on the system, then download one from the web + rocprofiler_systems_message(STATUS "${ThreadingBuildingBlocks_ERROR_REASON}") + rocprofiler_systems_message( + STATUS "Attempting to build TBB(${TBB_MIN_VERSION}) as external project") + + if(NOT UNIX) + rocprofiler_systems_message( + FATAL_ERROR "Building TBB from source is not supported on this platform") + endif() + + set(TBB_ROOT_DIR + ${TPL_STAGING_PREFIX}/tbb + CACHE PATH "TBB root directory" FORCE) + + set(_tbb_libraries) + set(_tbb_components_cfg) + set(_tbb_library_dirs $ + $) + set(_tbb_include_dirs + $ + $) + + # Forcibly update the cache variables + set(TBB_INCLUDE_DIRS + "${_tbb_include_dirs}" + CACHE PATH "TBB include directory" FORCE) + set(TBB_LIBRARY_DIRS + "${_tbb_library_dirs}" + CACHE PATH "TBB library directory" FORCE) + set(TBB_DEFINITIONS + "" + CACHE STRING "TBB compiler definitions" FORCE) + + file(MAKE_DIRECTORY "${TBB_ROOT_DIR}/include") + file(MAKE_DIRECTORY "${TBB_ROOT_DIR}/lib") + + foreach(c ${_tbb_components}) + # Generate make target names + if(${c} STREQUAL tbbmalloc_proxy) + # tbbmalloc_proxy is spelled tbbproxy in their Makefiles + list(APPEND _tbb_components_cfg tbbproxy_release) + else() + list(APPEND _tbb_components_cfg ${c}_release) + endif() + + set(_tbb_${c}_lib + $ + $) + + # Generate library filenames + list(APPEND _tbb_libraries ${_tbb_${c}_lib}) + list(APPEND _tbb_build_byproducts + "${TBB_ROOT_DIR}/lib/lib${c}${CMAKE_SHARED_LIBRARY_SUFFIX}") + + foreach(t RELEASE DEBUG) + set(TBB_${c}_LIBRARY_${t} + "${_tbb_${c}_lib}" + CACHE FILEPATH "" FORCE) + endforeach() + endforeach() + + set(TBB_LIBRARIES + "${_tbb_libraries}" + CACHE FILEPATH "TBB library files" FORCE) + + # Split the dotted decimal version into major/minor parts + string(REGEX REPLACE "\\." ";" _tbb_download_name ${TBB_MIN_VERSION}) + list(GET _tbb_download_name 0 _tbb_ver_major) + list(GET _tbb_download_name 1 _tbb_ver_minor) + + # Set the compiler for TBB It assumes gcc and tests for Intel, so clang is the only + # one that needs special treatment. + if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + set(_tbb_compiler "compiler=clang") + endif() + + find_program( + MAKE_EXECUTABLE + NAMES make gmake + PATH_SUFFIXES bin) + + if(NOT MAKE_EXECUTABLE AND CMAKE_GENERATOR MATCHES "Ninja") + dyninst_message( + FATAL_ERROR + "make/gmake executable not found. Please re-run with -DMAKE_EXECUTABLE=/path/to/make" + ) + elseif(NOT MAKE_EXECUTABLE AND CMAKE_GENERATOR MATCHES "Makefiles") + set(MAKE_EXECUTABLE "$(MAKE)") + endif() + + include(ExternalProject) + externalproject_add( + rocprofiler-systems-tbb-build + PREFIX ${TBB_ROOT_DIR} + URL https://github.com/ajanicijamd/oneTBB/archive/refs/tags/v${_tbb_ver_major}.${_tbb_ver_minor}.01.tar.gz + BUILD_IN_SOURCE 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND + ${CMAKE_COMMAND} -E env CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} + [=[LDFLAGS=-Wl,-rpath='$$ORIGIN']=] ${MAKE_EXECUTABLE} -C src + ${_tbb_components_cfg} tbb_build_dir=${TBB_ROOT_DIR}/src tbb_build_prefix=tbb + ${_tbb_compiler} + BUILD_BYPRODUCTS ${_tbb_build_byproducts} + INSTALL_COMMAND "") + + # post-build target for installing build + add_custom_command( + TARGET rocprofiler-systems-tbb-build + POST_BUILD + COMMAND + ${CMAKE_COMMAND} ARGS -DLIBDIR=${TBB_LIBRARY_DIRS} + -DINCDIR=${TBB_INCLUDE_DIRS} -DPREFIX=${TBB_ROOT_DIR} + -DCMAKE_STRIP=${CMAKE_STRIP} -P + ${CMAKE_CURRENT_LIST_DIR}/DyninstTBBInstall.cmake + COMMENT "Installing TBB...") + + add_custom_target( + rocprofiler-systems-tbb-install + COMMAND + ${CMAKE_COMMAND} -DLIBDIR=${TBB_LIBRARY_DIRS} -DINCDIR=${TBB_INCLUDE_DIRS} + -DPREFIX=${TBB_ROOT_DIR} -P ${CMAKE_CURRENT_LIST_DIR}/DyninstTBBInstall.cmake + COMMENT "Installing TBB...") +endif() + +foreach(_DIR_TYPE INCLUDE LIBRARY) + if(TBB_${_DIR_TYPE}_DIRS) + list(REMOVE_DUPLICATES TBB_${_DIR_TYPE}_DIRS) + endif() +endforeach() + +target_include_directories(rocprofiler-systems-tbb SYSTEM INTERFACE ${TBB_INCLUDE_DIRS}) +target_compile_definitions(rocprofiler-systems-tbb INTERFACE ${TBB_DEFINITIONS}) +target_link_directories(rocprofiler-systems-tbb INTERFACE ${TBB_LIBRARY_DIRS}) +target_link_libraries(rocprofiler-systems-tbb INTERFACE ${TBB_LIBRARIES}) + +rocprofiler_systems_message(STATUS "TBB include directory: ${TBB_INCLUDE_DIRS}") +rocprofiler_systems_message(STATUS "TBB library directory: ${TBB_LIBRARY_DIRS}") +rocprofiler_systems_message(STATUS "TBB libraries: ${TBB_LIBRARIES}") +rocprofiler_systems_message(STATUS "TBB definitions: ${TBB_DEFINITIONS}") diff --git a/cmake/DyninstTBBInstall.cmake b/cmake/DyninstTBBInstall.cmake new file mode 100644 index 000000000..cbd2ca95e --- /dev/null +++ b/cmake/DyninstTBBInstall.cmake @@ -0,0 +1,43 @@ +# ######################################################################################## +# ThreadingBuildingBlocks.cmake +# +# Install Intel's Threading Building Blocks for Dyninst +# +# The default TBB build does not have an 'install' target, so we have to do it manually. +# This file contains the necessary CMake commands to complete the installation assuming it +# has been built using ExternalProject_Add. +# +# ######################################################################################## + +cmake_minimum_required(VERSION 3.13.0) + +if(NOT CMAKE_STRIP) + find_program(CMAKE_STRIP NAMES strip) +endif() + +file(MAKE_DIRECTORY ${LIBDIR} ${INCDIR}) +file( + COPY ${PREFIX}/src/tbb_release/ + DESTINATION ${LIBDIR} + FILES_MATCHING + PATTERN "*.so.*") +file(COPY ${PREFIX}/src/rocprofiler-systems-tbb-build/include/tbb DESTINATION ${INCDIR}) +file(GLOB _tbb_libs ${LIBDIR}/libtbb*.so.*) + +foreach(_lib ${_tbb_libs}) + string(REGEX REPLACE "\\.2$" "" _lib_short ${_lib}) + get_filename_component(_lib "${_lib}" NAME) + execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${_lib} ${_lib_short} + WORKING_DIRECTORY ${LIBDIR}) +endforeach() + +foreach(_lib ${_tbb_libs}) + get_filename_component(_lib_realpath "${_lib}" REALPATH) + if(NOT "${_lib_realpath}" IN_LIST _tbb_libs_realpath) + list(APPEND _tbb_libs_realpath ${_lib_realpath}) + endif() +endforeach() + +foreach(_lib ${_tbb_libs_realpath}) + execute_process(COMMAND ${CMAKE_STRIP} ${_lib}) +endforeach() diff --git a/cmake/Modules/FindLibDW.cmake b/cmake/Modules/FindLibDW.cmake new file mode 100644 index 000000000..6fa6f9077 --- /dev/null +++ b/cmake/Modules/FindLibDW.cmake @@ -0,0 +1,141 @@ +#[=======================================================================[.rst: +FindLibDW +--------- + +Find libdw, the elfutils library for DWARF data and ELF file or process inspection. + +Variables that affect this module + +``LibDW_NO_SYSTEM_PATHS`` + If `True`, no system paths are searched. + +Imported targets +^^^^^^^^^^^^^^^^ + +This module defines the following :prop_tgt:`IMPORTED` target: + +``LibDW::LibDW`` + The libdw library, if found. + +Result variables +^^^^^^^^^^^^^^^^ + +This module will set the following variables in your project: + +``LibDW_INCLUDE_DIRS`` + where to find libdw.h, etc. +``LibDW_LIBRARIES`` + the libraries to link against to use libdw. +``LibDW_FOUND`` + If false, do not try to use libdw. +``LibDW_VERSION`` + the version of the libdw library found + +#]=======================================================================] +cmake_policy(SET CMP0074 NEW) # Use _ROOT + +if(LibDW_NO_SYSTEM_PATHS) + set(_find_path_args NO_CMAKE_SYSTEM_PATH NO_SYSTEM_ENVIRONMENT_PATH) +endif() + +# There is no way to tell pkg-config to ignore directories, so disable it +if(NOT LibDW_NO_SYSTEM_PATHS) + find_package(PkgConfig QUIET) + if(PKG_CONFIG_FOUND) + if(NOT "x${LibDW_FIND_VERSION}" STREQUAL "x") + set(_version ">=${LibDW_FIND_VERSION}") + endif() + if(LibDW_FIND_QUIETLY) + set(_quiet "QUIET") + endif() + + pkg_check_modules(PC_LIBDW ${_quiet} "libdw${_version}") + unset(_version) + unset(_quiet) + endif() +endif() + +if(PC_LIBDW_FOUND) + # FindPkgConfig sometimes gets the include dir wrong + if("x${PC_LIBDW_INCLUDE_DIRS}" STREQUAL "x") + pkg_get_variable(PC_LIBDW_INCLUDE_DIRS libdw includedir) + endif() + + set(LibDW_INCLUDE_DIRS + ${PC_LIBDW_INCLUDE_DIRS} + CACHE PATH "") + set(LibDW_LIBRARIES + ${PC_LIBDW_LINK_LIBRARIES} + CACHE PATH "") + set(LibDW_VERSION + ${PC_LIBDW_VERSION} + CACHE STRING "") +else() + find_path( + LibDW_INCLUDE_DIRS + NAMES libdw.h + PATH_SUFFIXES elfutils ${_find_path_args}) + + find_library( + LibDW_LIBRARIES + NAMES libdw dw + PATH_SUFFIXES elfutils ${_find_path_args}) + + if(EXISTS "${LibDW_INCLUDE_DIRS}/version.h") + file(STRINGS "${LibDW_INCLUDE_DIRS}/version.h" _version_line + REGEX "^#define _ELFUTILS_VERSION[ \t]+[0-9]+") + string(REGEX MATCH "[0-9]+" _version "${_version_line}") + if(NOT "x${_version}" STREQUAL "x") + set(LibDW_VERSION "0.${_version}") + endif() + unset(_version_line) + unset(_version) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + LibDW + FOUND_VAR LibDW_FOUND + REQUIRED_VARS LibDW_LIBRARIES LibDW_INCLUDE_DIRS + VERSION_VAR LibDW_VERSION) + +if(LibDW_FOUND) + mark_as_advanced(LibDW_INCLUDE_DIRS) + mark_as_advanced(LibDW_LIBRARIES) + mark_as_advanced(LibDW_VERSION) + + # Some platforms explicitly list libelf as a dependency, so separate it out + list(LENGTH LibDW_LIBRARIES _cnt) + if(${_cnt} GREATER 1) + foreach(_l ${LibDW_LIBRARIES}) + if(${_l} MATCHES "libdw") + set(_libdw ${_l}) + else() + list(APPEND _link_libs ${_l}) + endif() + endforeach() + endif() + unset(_cnt) + + if(NOT TARGET LibDW::LibDW) + add_library(LibDW::LibDW UNKNOWN IMPORTED) + set_target_properties(LibDW::LibDW PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${LibDW_INCLUDE_DIRS}") + + if(NOT "x${_link_libs}" STREQUAL "x") + set_target_properties( + LibDW::LibDW PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LINK_DEPENDENT_LIBRARIES "${_link_libs}") + set(LibDW_LIBRARIES ${_libdw}) + unset(_libdw) + unset(_link_libs) + endif() + + set_target_properties( + LibDW::LibDW PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${LibDW_LIBRARIES}") + endif() +endif() + +unset(_find_path_args) diff --git a/cmake/Modules/FindLibDebuginfod.cmake b/cmake/Modules/FindLibDebuginfod.cmake new file mode 100644 index 000000000..50e697fba --- /dev/null +++ b/cmake/Modules/FindLibDebuginfod.cmake @@ -0,0 +1,118 @@ +#[=======================================================================[.rst: +FindLibDebuginfod +----------------- + +Find libdebuginfod, the elfutils library to query debuginfo files from debuginfod servers. + +Variables that affect this module + +``LibDebuginfod_NO_SYSTEM_PATHS`` + If `True`, no system paths are searched. + +Imported targets +^^^^^^^^^^^^^^^^ + +This module defines the following :prop_tgt:`IMPORTED` target: + +``LibDebuginfod::LibDebuginfod`` + The libdebuginfod library, if found. + +Result variables +^^^^^^^^^^^^^^^^ + +This module will set the following variables in your project: + +``LibDebuginfod_INCLUDE_DIRS`` + where to find debuginfod.h, etc. +``LibDebuginfod_LIBRARIES`` + the libraries to link against to use libdebuginfod. +``LibDebuginfod_FOUND`` + If false, do not try to use libdebuginfod. +``LibDebuginfod_VERSION`` + the version of the libdebuginfod library found + +#]=======================================================================] +cmake_policy(SET CMP0074 NEW) # Use _ROOT + +if(LibDebuginfod_NO_SYSTEM_PATHS) + set(_find_path_args NO_CMAKE_SYSTEM_PATH NO_SYSTEM_ENVIRONMENT_PATH) +endif() + +# There is no way to tell pkg-config to ignore directories, so disable it +if(NOT LibDebuginfod_NO_SYSTEM_PATHS) + find_package(PkgConfig QUIET) + if(PKG_CONFIG_FOUND) + if(NOT "x${LibDebuginfod_FIND_VERSION}" STREQUAL "x") + set(_version ">=${LibDebuginfod_FIND_VERSION}") + endif() + if(LibDebuginfod_FIND_QUIETLY) + set(_quiet "QUIET") + endif() + + pkg_check_modules(PC_LIBDEBUGINFOD ${_quiet} "libdebuginfod${_version}") + unset(_version) + unset(_quiet) + endif() +endif() + +if(PC_LIBDEBUGINFOD_FOUND) + # FindPkgConfig sometimes gets the include dir wrong + if("x${PC_LIBDEBUGINFOD_INCLUDE_DIRS}" STREQUAL "x") + pkg_get_variable(PC_LIBDEBUGINFOD_INCLUDE_DIRS libdebuginfod includedir) + endif() + + set(LibDebuginfod_INCLUDE_DIRS + ${PC_LIBDEBUGINFOD_INCLUDE_DIRS} + CACHE PATH "") + set(LibDebuginfod_LIBRARIES + ${PC_LIBDEBUGINFOD_LINK_LIBRARIES} + CACHE PATH "") + set(LibDebuginfod_VERSION + ${PC_LIBDEBUGINFOD_VERSION} + CACHE STRING "") +else() + find_path( + LibDebuginfod_INCLUDE_DIRS + NAMES debuginfod.h + PATH_SUFFIXES elfutils ${_find_path_args}) + + find_library( + LibDebuginfod_LIBRARIES + NAMES libdebuginfod debuginfod + PATH_SUFFIXES elfutils ${_find_path_args}) + + if(EXISTS "${LibDebuginfod_INCLUDE_DIRS}/version.h") + file(STRINGS "${LibDebuginfod_INCLUDE_DIRS}/version.h" _version_line + REGEX "^#define _ELFUTILS_VERSION[ \t]+[0-9]+") + string(REGEX MATCH "[0-9]+" _version "${_version_line}") + if(NOT "x${_version}" STREQUAL "x") + set(LibDebuginfod_VERSION "0.${_version}") + endif() + unset(_version_line) + unset(_version) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + LibDebuginfod + FOUND_VAR LibDebuginfod_FOUND + REQUIRED_VARS LibDebuginfod_LIBRARIES LibDebuginfod_INCLUDE_DIRS + VERSION_VAR LibDebuginfod_VERSION) + +if(LibDebuginfod_FOUND) + mark_as_advanced(LibDebuginfod_INCLUDE_DIR) + mark_as_advanced(LibDebuginfod_LIBRARIES) + mark_as_advanced(LibDebuginfod_VERSION) + + if(NOT TARGET LibDebuginfod::LibDebuginfod) + add_library(LibDebuginfod::LibDebuginfod UNKNOWN IMPORTED) + set_target_properties( + LibDebuginfod::LibDebuginfod + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LibDebuginfod_INCLUDE_DIRS}" + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${LibDebuginfod_LIBRARIES}") + endif() +endif() + +unset(_find_path_args) diff --git a/cmake/Modules/FindLibDwarf.cmake b/cmake/Modules/FindLibDwarf.cmake new file mode 100644 index 000000000..d5c4bd118 --- /dev/null +++ b/cmake/Modules/FindLibDwarf.cmake @@ -0,0 +1,86 @@ +# =================================================================================== +# FindLibDwarf.cmake +# +# Find libdw include dirs and libraries +# +# ---------------------------------------- +# +# Use this module by invoking find_package with the form:: +# +# find_package(LibDwarf [version] [EXACT] # Minimum or EXACT version e.g. 0.173 +# [REQUIRED] # Fail with error if libdw is not found ) +# +# This module reads hints about search locations from variables:: +# +# LibDwarf_ROOT_DIR - Base directory the of libdw installation +# LibDwarf_INCLUDEDIR - Hint directory that contains the libdw headers files +# LibDwarf_LIBRARYDIR - Hint directory that contains the libdw library files +# +# and saves search results persistently in CMake cache entries:: +# +# LibDwarf_FOUND - True if headers and requested libraries were found +# LibDwarf_INCLUDE_DIRS - libdw include directories LibDwarf_LIBRARY_DIRS - Link +# directories for libdw libraries LibDwarf_LIBRARIES - libdw library files +# +# =================================================================================== + +# Non-standard subdirectories to search +set(_path_suffixes libdw libdwarf elfutils) + +find_path( + LibDwarf_INCLUDE_DIR + NAMES libdw.h + HINTS ${LibDwarf_ROOT_DIR}/include ${LibDwarf_ROOT_DIR} ${LibDwarf_INCLUDEDIR} + PATHS ${DYNINST_SYSTEM_INCLUDE_PATHS} + PATH_SUFFIXES ${_path_suffixes} + DOC "libdw include directories") + +find_library( + LibDwarf_LIBRARIES + NAMES libdw.so.1 libdw.so + HINTS ${LibDwarf_ROOT_DIR}/lib ${LibDwarf_ROOT_DIR} ${LibDwarf_LIBRARYDIR} + PATHS ${DYNINST_SYSTEM_LIBRARY_PATHS} + PATH_SUFFIXES ${_path_suffixes}) + +# Find the library with the highest version +set(_max_ver 0.0) +set(_max_ver_lib) +foreach(l ${LibDwarf_LIBRARIES}) + get_filename_component(_dw_realpath ${LibDwarf_LIBRARIES} REALPATH) + string(REGEX MATCH "libdw\\-(.+)\\.so\\.*$" res ${_dw_realpath}) + + # The library version number is stored in CMAKE_MATCH_1 + set(_cur_ver ${CMAKE_MATCH_1}) + + if(${_cur_ver} VERSION_GREATER ${_max_ver}) + set(_max_ver ${_cur_ver}) + set(_max_ver_lib ${l}) + endif() +endforeach() + +# Set the exported variables to the best match +set(LibDwarf_LIBRARIES ${_max_ver_lib}) +set(LibDwarf_VERSION ${_max_ver}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + LibDwarf + FOUND_VAR LibDwarf_FOUND + REQUIRED_VARS LibDwarf_LIBRARIES LibDwarf_INCLUDE_DIR + VERSION_VAR LibDwarf_VERSION) + +# Export cache variables +if(LibDwarf_FOUND) + set(LibDwarf_INCLUDE_DIRS ${LibDwarf_INCLUDE_DIR}) + set(LibDwarf_LIBRARIES ${LibDwarf_LIBRARIES}) + + # Because we only report the library with the largest version, we are guaranteed there + # is only one file in LibDwarf_LIBRARIES + get_filename_component(_dw_dir ${LibDwarf_LIBRARIES} DIRECTORY) + set(LibDwarf_LIBRARY_DIRS ${_dw_dir}) + + add_library(LibDwarf::LibDwarf INTERFACE IMPORTED) + target_include_directories(LibDwarf::LibDwarf INTERFACE ${LibDwarf_INCLUDE_DIR}) + target_link_directories(LibDwarf::LibDwarf INTERFACE ${LibDwarf_LIBRARY_DIRS}) + target_link_libraries(LibDwarf::LibDwarf INTERFACE ${LibDwarf_LIBRARIES}) +endif() diff --git a/cmake/Modules/FindLibElf.cmake b/cmake/Modules/FindLibElf.cmake new file mode 100644 index 000000000..a339b0299 --- /dev/null +++ b/cmake/Modules/FindLibElf.cmake @@ -0,0 +1,88 @@ +# ======================================================================================== +# FindLibElf.cmake +# +# Find libelf include dirs and libraries +# +# ---------------------------------------- +# +# Use this module by invoking find_package with the form:: +# +# find_package(LibElf [version] [EXACT] # Minimum or EXACT version e.g. 0.173 +# [REQUIRED] # Fail with error if libelf is not found ) +# +# This module reads hints about search locations from variables:: +# +# LibElf_ROOT_DIR - Base directory the of libelf installation LibElf_INCLUDEDIR - +# Hint directory that contains the libelf headers files LibElf_LIBRARYDIR - Hint +# directory that contains the libelf library files +# +# and saves search results persistently in CMake cache entries:: +# +# LibElf_FOUND - True if headers and requested libraries were found +# LibElf_INCLUDE_DIRS - libelf include directories LibElf_LIBRARY_DIRS - Link +# directories for libelf libraries LibElf_LIBRARIES - libelf library files +# +# Based on the version by Bernhard Walle Copyright (c) 2008 +# +# ======================================================================================== + +# Non-standard subdirectories to search +set(_path_suffixes libelf libelfls elfutils) + +find_path( + LibElf_INCLUDE_DIR + NAMES libelf.h + HINTS ${LibElf_ROOT_DIR}/include ${LibElf_ROOT_DIR} ${LibElf_INCLUDEDIR} + PATHS ${DYNINST_SYSTEM_INCLUDE_PATHS} + PATH_SUFFIXES ${_path_suffixes} + DOC "libelf include directories") + +find_library( + LibElf_LIBRARIES + NAMES libelf.so.1 libelf.so + HINTS ${LibElf_ROOT_DIR}/lib ${LibElf_ROOT_DIR} ${LibElf_LIBRARYDIR} + PATHS ${DYNINST_SYSTEM_LIBRARY_PATHS} + PATH_SUFFIXES ${_path_suffixes}) + +# Find the library with the highest version +set(_max_ver 0.0) +set(_max_ver_lib) +foreach(l ${LibElf_LIBRARIES}) + get_filename_component(_elf_realpath ${LibElf_LIBRARIES} REALPATH) + string(REGEX MATCH "libelf\\-(.+)\\.so\\.*$" res ${_elf_realpath}) + + # The library version number is stored in CMAKE_MATCH_1 + set(_cur_ver ${CMAKE_MATCH_1}) + + if(${_cur_ver} VERSION_GREATER ${_max_ver}) + set(_max_ver ${_cur_ver}) + set(_max_ver_lib ${l}) + endif() +endforeach() + +# Set the exported variables to the best match +set(LibElf_LIBRARIES ${_max_ver_lib}) +set(LibElf_VERSION ${_max_ver}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + LibElf + FOUND_VAR LibElf_FOUND + REQUIRED_VARS LibElf_LIBRARIES LibElf_INCLUDE_DIR + VERSION_VAR LibElf_VERSION) + +# Export cache variables +if(LibElf_FOUND) + set(LibElf_INCLUDE_DIRS ${LibElf_INCLUDE_DIR}) + set(LibElf_LIBRARIES ${LibElf_LIBRARIES}) + + # Because we only report the library with the largest version, we are guaranteed there + # is only one file in LibElf_LIBRARIES + get_filename_component(_elf_dir ${LibElf_LIBRARIES} DIRECTORY) + set(LibElf_LIBRARY_DIRS ${_elf_dir} "${_elf_dir}/elfutils") + + add_library(LibElf::LibElf INTERFACE IMPORTED) + target_include_directories(LibElf::LibElf INTERFACE ${LibElf_INCLUDE_DIR}) + target_link_directories(LibElf::LibElf INTERFACE ${LibElf_LIBRARY_DIRS}) + target_link_libraries(LibElf::LibElf INTERFACE ${LibElf_LIBRARIES}) +endif() diff --git a/cmake/Modules/FindLibIberty.cmake b/cmake/Modules/FindLibIberty.cmake new file mode 100644 index 000000000..4caebe9d0 --- /dev/null +++ b/cmake/Modules/FindLibIberty.cmake @@ -0,0 +1,80 @@ +# ======================================================================================== +# FindLibIberty.cmake +# +# Find LibIberty include dirs and libraries +# +# ---------------------------------------- +# +# Use this module by invoking find_package with the form:: +# +# find_package(LibIberty [REQUIRED] # Fail with error if LibIberty is not +# found ) +# +# This module reads hints about search locations from variables:: +# +# LibIberty_ROOT_DIR - Base directory the of LibIberty installation +# LibIberty_LIBRARYDIR - Hint directory that contains the LibIberty library files +# IBERTY_LIBRARIES - Alias for LibIberty_LIBRARIES (backwards compatibility only) +# LibIberty_INCLUDEDIR - Hint directory that contains the libiberty headers files +# +# and saves search results persistently in CMake cache entries:: +# +# LibIberty_FOUND - True if headers and requested libraries were found +# IBERTY_FOUND - Alias for LibIberty_FOUND (backwards compatibility only) +# LibIberty_INCLUDE_DIRS - libiberty include directories LibIberty_LIBRARY_DIRS - Link +# directories for LibIberty libraries LibIberty_LIBRARIES - LibIberty library files +# IBERTY_LIBRARIES - Alias for LibIberty_LIBRARIES (backwards compatibility only) +# +# ======================================================================================== + +cmake_minimum_required(VERSION 3.13.0 FATAL_ERROR) + +# Keep the semantics of IBERTY_LIBRARIES for backward compatibility NB: If both are +# specified, LibIberty_LIBRARIES is ignored +if(NOT "${IBERTY_LIBRARIES}" STREQUAL "") + set(LibIberty_LIBRARIES ${IBERTY_LIBRARIES}) +endif() + +# Non-standard subdirectories to search +set(_path_suffixes libiberty iberty) + +find_path( + LibIberty_INCLUDE_DIRS + NAMES libiberty.h + HINTS ${LibIberty_ROOT_DIR} ${LibIberty_ROOT_DIR}/include ${LibIberty_INCLUDEDIR} + PATHS ${DYNINST_SYSTEM_INCLUDE_PATHS} + PATH_SUFFIXES ${_path_suffixes} + DOC "LibIberty include directories") + +# iberty_pic is for Debian <= wheezy +find_library( + LibIberty_LIBRARIES + NAMES iberty_pic iberty + HINTS ${LibIberty_ROOT_DIR} ${LibIberty_LIBRARYDIR} ${IBERTY_LIBRARIES} + PATHS ${DYNINST_SYSTEM_LIBRARY_PATHS} + PATH_SUFFIXES ${_path_suffixes}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + LibIberty + FOUND_VAR LibIberty_FOUND + REQUIRED_VARS LibIberty_INCLUDE_DIRS LibIberty_LIBRARIES) + +# For backwards compatibility only +set(IBERTY_FOUND ${LibIberty_FOUND}) + +if(LibIberty_FOUND) + foreach(l ${LibIberty_LIBRARIES}) + get_filename_component(_dir ${l} DIRECTORY) + if(NOT "${_dir}" IN_LIST LibIberty_LIBRARY_DIRS) + list(APPEND LibIberty_LIBRARY_DIRS ${_dir}) + endif() + endforeach() + + add_library(LibIberty::LibIberty INTERFACE IMPORTED) + target_include_directories(LibIberty::LibIberty INTERFACE ${LibIberty_INCLUDE_DIRS}) + target_link_libraries(LibIberty::LibIberty INTERFACE ${LibIberty_LIBRARIES}) + + # For backwards compatibility only + set(IBERTY_LIBRARIES ${LibIberty_LIBRARIES}) +endif() diff --git a/cmake/Modules/FindRCCL-Headers.cmake b/cmake/Modules/FindRCCL-Headers.cmake deleted file mode 100644 index 8d0befed3..000000000 --- a/cmake/Modules/FindRCCL-Headers.cmake +++ /dev/null @@ -1,94 +0,0 @@ -# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file -# Copyright.txt or https://cmake.org/licensing for details. - -include(FindPackageHandleStandardArgs) - -# ----------------------------------------------------------------------------------------# - -set(RCCL-Headers_INCLUDE_DIR_INTERNAL - "${PROJECT_SOURCE_DIR}/source/lib/rocprof-sys/library/tpls/rccl" - CACHE PATH "Path to internal rccl.h") - -# ----------------------------------------------------------------------------------------# - -if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "") - set(ROCM_PATH "$ENV{ROCM_PATH}") -endif() - -foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl) - if(EXISTS ${_DIR}) - get_filename_component(_ABS_DIR "${_DIR}" REALPATH) - list(APPEND _RCCL_PATHS ${_ABS_DIR}) - endif() -endforeach() - -# ----------------------------------------------------------------------------------------# - -find_package( - rccl - QUIET - CONFIG - HINTS - ${_RCCL_PATHS} - PATHS - ${_RCCL_PATHS} - PATH_SUFFIXES - rccl/lib/cmake) - -if(NOT rccl_FOUND) - set(RCCL-Headers_INCLUDE_DIR - "${RCCL-Headers_INCLUDE_DIR_INTERNAL}" - CACHE PATH "Path to RCCL headers") -else() - set(RCCL-Headers_INCLUDE_DIR - "${rccl_INCLUDE_DIR}" - CACHE PATH "Path to RCCL headers") -endif() - -# because of the annoying warning starting with v5.2.0, we've got to do this crap -if(ROCmVersion_NUMERIC_VERSION) - if(ROCmVersion_NUMERIC_VERSION LESS 50200) - set(_RCCL-Headers_FILE "rccl.h") - set(_RCCL-Headers_DIR "/rccl") - else() - set(_RCCL-Headers_FILE "rccl/rccl.h") - set(_RCCL-Headers_DIR "") - endif() -else() - set(_RCCL-Headers_FILE "rccl/rccl.h") - set(_RCCL-Headers_DIR "") -endif() - -if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}") - rocprofiler_systems_message( - AUTHOR_WARNING - "RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}" - ) - set(RCCL-Headers_INCLUDE_DIR - "${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}" - CACHE PATH "Path to RCCL headers" FORCE) -endif() - -unset(_RCCL-Headers_FILE) -unset(_RCCL-Headers_DIR) - -mark_as_advanced(RCCL-Headers_INCLUDE_DIR) - -# ----------------------------------------------------------------------------------------# - -find_package_handle_standard_args(RCCL-Headers DEFAULT_MSG RCCL-Headers_INCLUDE_DIR) - -# ------------------------------------------------------------------------------# - -if(RCCL-Headers_FOUND) - add_library(roc::rccl-headers INTERFACE IMPORTED) - set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR}) - - target_include_directories(roc::rccl-headers SYSTEM - INTERFACE ${RCCL-Headers_INCLUDE_DIR}) - - add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED) - target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers) -endif() - -# ------------------------------------------------------------------------------# diff --git a/cmake/Modules/FindTBB.cmake b/cmake/Modules/FindTBB.cmake new file mode 100644 index 000000000..e69e6f1a5 --- /dev/null +++ b/cmake/Modules/FindTBB.cmake @@ -0,0 +1,268 @@ +# ====================================================================================================== +# FindTBB.cmake +# +# Find TBB include directories and libraries. +# +# ---------------------------------------- +# +# Use this module by invoking find_package with the form:: +# +# find_package(TBB [major[.minor]] [EXACT] [QUIET] # Minimum or EXACT version e.g. +# 2018.6 [REQUIRED] # Fail with error if TBB is not found +# [[COMPONENTS] [components...]] # Required components [OPTIONAL_COMPONENTS +# components...] # Optional components ) +# +# This module reads hints about search locations from variables:: +# +# TBB_ROOT_DIR - The base directory the of TBB installation. TBB_INCLUDE_DIR - +# The directory that contains the TBB headers files. TBB_LIBRARY - The directory +# that contains the TBB library files. TBB__LIBRARY - The path of the TBB the +# corresponding TBB library. These libraries override the corresponding library search +# results. TBB_USE_DEBUG_BUILD - Use the debug version of tbb libraries +# +# Environment variable aliases for TBB_ROOT_DIR: +# +# TBB_INSTALL_DIR TBBROOT LIBRARY_PATH +# +# This module will set the following variables: +# +# TBB_FOUND - If false, or undefined, TBB not found, or don’t want to +# use TBB. TBB__FOUND - If False, optional part of TBB +# sytem is not available. TBB_VERSION - The full version string TBB_VERSION_MAJOR - The +# major version TBB_VERSION_MINOR - The minor version TBB_INTERFACE_VERSION - The +# interface version number defined in tbb/tbb_stddef.h. TBB__LIBRARY_RELEASE - +# The path of the TBB release version of . TBB__LIBRARY_DEBUG - The +# path of the TBB debug version of . +# +# The following varibles should be used to build and link with TBB: +# +# TBB_INCLUDE_DIRS - The include directory for TBB. TBB_LIBRARY_DIRS - The library +# directory for TBB. TBB_LIBRARIES - The libraries to link against to use TBB. +# TBB_LIBRARIES_RELEASE - The release libraries to link against to use TBB. +# TBB_LIBRARIES_DEBUG - The debug libraries to link against to use TBB. +# TBB_DEFINITIONS - Definitions to use when compiling code that uses TBB. +# TBB_DEFINITIONS_RELEASE - Definitions to use when compiling release code that uses TBB. +# TBB_DEFINITIONS_DEBUG - Definitions to use when compiling debug code that uses TBB. +# +# This module will also create the "TBB" target that may be used when building executables +# and libraries. +# +# Based on the version by Justus Calvin - Copyright (c) 2015 +# +# ====================================================================================================== + +if(TBB_FOUND) + return() +endif() + +include(FindPackageHandleStandardArgs) + +# +# Check the build type +# +if(NOT DEFINED TBB_USE_DEBUG_BUILD) + if(CMAKE_BUILD_TYPE MATCHES "(Debug|DEBUG|debug)") + set(TBB_BUILD_TYPE DEBUG) + else() + set(TBB_BUILD_TYPE RELEASE) + endif() +elseif(TBB_USE_DEBUG_BUILD) + set(TBB_BUILD_TYPE DEBUG) +else() + set(TBB_BUILD_TYPE RELEASE) +endif() + +# +# Set the TBB search directories +# + +# Define search paths based on user input and environment variables +set(TBB_SEARCH_DIR ${TBB_ROOT_DIR} $ENV{TBB_INSTALL_DIR} $ENV{TBBROOT}) + +# Define the search directories based on the current platform +if(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(TBB_DEFAULT_SEARCH_DIR "C:/Program Files/Intel/TBB" + "C:/Program Files (x86)/Intel/TBB") + + # Set the target architecture + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(TBB_ARCHITECTURE "intel64") + else() + set(TBB_ARCHITECTURE "ia32") + endif() + + # Set the TBB search library path search suffix based on the version of VC + if(WINDOWS_STORE) + set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11_ui") + elseif(MSVC14) + set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc14") + elseif(MSVC12) + set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc12") + elseif(MSVC11) + set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11") + elseif(MSVC10) + set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc10") + endif() + + # Add the library path search suffix for the VC independent version of TBB + list(APPEND TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc_mt") + +elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + # OS X + set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb") + + # TODO: Check to see which C++ library is being used by the compiler. + if(NOT ${CMAKE_SYSTEM_VERSION} VERSION_LESS 13.0) + # The default C++ library on OS X 10.9 and later is libc++ + set(TBB_LIB_PATH_SUFFIX "lib/libc++" "lib") + else() + set(TBB_LIB_PATH_SUFFIX "lib") + endif() +elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # Linux + set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb") + + # TODO: Check compiler version to see the suffix should be /gcc4.1 or + # /gcc4.1. For now, assume that the compiler is more recent than gcc 4.4.x or + # later. + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(TBB_LIB_PATH_SUFFIX "lib/intel64/gcc4.4") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$") + set(TBB_LIB_PATH_SUFFIX "lib/ia32/gcc4.4") + endif() +endif() + +# +# Find the TBB include dir +# +find_path( + TBB_INCLUDE_DIRS tbb/tbb.h + HINTS ${TBB_INCLUDE_DIRS} ${TBB_SEARCH_DIR} + PATHS ${TBB_DEFAULT_SEARCH_DIR} + PATH_SUFFIXES include) + +# +# Set version strings +# +if(TBB_INCLUDE_DIRS) + # Starting in 2020.1.1, tbb_stddef.h is replaced by version.h + set(_version_files "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" + "${TBB_INCLUDE_DIRS}/tbb/version.h") + foreach(f IN ITEMS ${_version_files}) + if(EXISTS ${f}) + set(_version_file ${f}) + endif() + endforeach() + unset(_version_files) + + file(READ ${_version_file} _tbb_version_file) + string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1" TBB_VERSION_MAJOR + "${_tbb_version_file}") + string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1" TBB_VERSION_MINOR + "${_tbb_version_file}") + string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" + TBB_INTERFACE_VERSION "${_tbb_version_file}") + + # The TBB_VERSION_MINOR isn't necessarily changed for minor releases Hence, we need to + # read the engineering versioning in TBB_INTERFACE_VERSION to get the minor version + # correct + if("${TBB_VERSION_MINOR}" STREQUAL "0") + math(EXPR _tbb_iface_major_ver "${TBB_INTERFACE_VERSION} / 100") + math(EXPR TBB_VERSION_MINOR + "${TBB_INTERFACE_VERSION} - ${_tbb_iface_major_ver} * 100") + endif() + set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}") +endif() + +# +# Find TBB components +# +if(TBB_VERSION VERSION_LESS 4.3) + set(TBB_SEARCH_COMPOMPONENTS tbb_preview tbbmalloc tbb) +else() + set(TBB_SEARCH_COMPOMPONENTS tbb_preview tbbmalloc_proxy tbbmalloc tbb) +endif() + +set(TBB_LIBRARY_DIRS) + +# Find each component +foreach(_comp ${TBB_SEARCH_COMPOMPONENTS}) + # message(STATUS "Searching for ${_comp}...") message(STATUS "Hints: ${TBB_LIBRARY} + # ${TBB_SEARCH_DIR}") + if(";${TBB_FIND_COMPONENTS};tbb;" MATCHES ";${_comp};") + + # Search for the libraries + find_library( + TBB_${_comp}_LIBRARY_RELEASE ${_comp} + HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR} + PATHS ${TBB_DEFAULT_SEARCH_DIR} ENV LIBRARY_PATH + PATH_SUFFIXES ${TBB_LIB_PATH_SUFFIX} lib_release) + + find_library( + TBB_${_comp}_LIBRARY_DEBUG ${_comp}_debug + HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR} + PATHS ${TBB_DEFAULT_SEARCH_DIR} ENV LIBRARY_PATH + PATH_SUFFIXES ${TBB_LIB_PATH_SUFFIX} lib_debug) + + if(TBB_${_comp}_LIBRARY_DEBUG) + list(APPEND TBB_LIBRARIES_DEBUG "${TBB_${_comp}_LIBRARY_DEBUG}") + # message(STATUS "Found ${TBB_${_comp}_LIBRARY_DEBUG}") + endif() + if(TBB_${_comp}_LIBRARY_RELEASE) + list(APPEND TBB_LIBRARIES_RELEASE "${TBB_${_comp}_LIBRARY_RELEASE}") + # message(STATUS "Found ${TBB_${_comp}_LIBRARY_RELEASE}") + endif() + if(TBB_${_comp}_LIBRARY_${TBB_BUILD_TYPE} AND NOT TBB_${_comp}_LIBRARY) + set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_${TBB_BUILD_TYPE}}") + endif() + + if(TBB_${_comp}_LIBRARY AND EXISTS "${TBB_${_comp}_LIBRARY}") + set(TBB_${_comp}_FOUND TRUE) + else() + set(TBB_${_comp}_FOUND FALSE) + endif() + + # Mark internal variables as advanced + mark_as_advanced(TBB_${_comp}_LIBRARY_RELEASE) + mark_as_advanced(TBB_${_comp}_LIBRARY_DEBUG) + mark_as_advanced(TBB_${_comp}_LIBRARY) + + # Save the directory names for each library component + if(TBB_USE_DEBUG_BUILD) + get_filename_component(_dir ${TBB_${_comp}_LIBRARY_DEBUG} DIRECTORY) + else() + get_filename_component(_dir ${TBB_${_comp}_LIBRARY_RELEASE} DIRECTORY) + endif() + list(APPEND TBB_LIBRARY_DIRS ${_dir}) + endif() +endforeach() + +# +# Set compile flags and libraries +# +set(TBB_DEFINITIONS_RELEASE "") +set(TBB_DEFINITIONS_DEBUG "-DTBB_USE_DEBUG=1") + +if(TBB_LIBRARIES_${TBB_BUILD_TYPE}) + set(TBB_DEFINITIONS "${TBB_DEFINITIONS_${TBB_BUILD_TYPE}}") + set(TBB_LIBRARIES "${TBB_LIBRARIES_${TBB_BUILD_TYPE}}") +elseif(TBB_LIBRARIES_RELEASE) + set(TBB_DEFINITIONS "${TBB_DEFINITIONS_RELEASE}") + set(TBB_LIBRARIES "${TBB_LIBRARIES_RELEASE}") +elseif(TBB_LIBRARIES_DEBUG) + set(TBB_DEFINITIONS "${TBB_DEFINITIONS_DEBUG}") + set(TBB_LIBRARIES "${TBB_LIBRARIES_DEBUG}") +endif() + +find_package_handle_standard_args( + TBB + REQUIRED_VARS TBB_INCLUDE_DIRS TBB_LIBRARIES + HANDLE_COMPONENTS + VERSION_VAR TBB_VERSION) + +mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARIES TBB_LIBRARY_DIRS) + +unset(TBB_ARCHITECTURE) +unset(TBB_BUILD_TYPE) +unset(TBB_LIB_PATH_SUFFIX) +unset(TBB_DEFAULT_SEARCH_DIR) diff --git a/cmake/Packages.cmake b/cmake/Packages.cmake index 596d2d132..19ccbe9e1 100644 --- a/cmake/Packages.cmake +++ b/cmake/Packages.cmake @@ -17,9 +17,6 @@ rocprofiler_systems_add_interface_library( "Provides flags and libraries for Dyninst (dynamic instrumentation)") rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm "Provides flags and libraries for ROCm") -rocprofiler_systems_add_interface_library( - rocprofiler-systems-rccl - "Provides flags for ROCm Communication Collectives Library (RCCL)") rocprofiler_systems_add_interface_library(rocprofiler-systems-mpi "Provides MPI or MPI headers") rocprofiler_systems_add_interface_library(rocprofiler-systems-libva @@ -47,7 +44,6 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-compile-definition # libraries with relevant compile definitions set(ROCPROFSYS_EXTENSION_LIBRARIES rocprofiler-systems::rocprofiler-systems-rocm - rocprofiler-systems::rocprofiler-systems-rccl rocprofiler-systems::rocprofiler-systems-bfd rocprofiler-systems::rocprofiler-systems-mpi rocprofiler-systems::rocprofiler-systems-ptl @@ -185,19 +181,6 @@ if(ROCPROFSYS_USE_ROCM) target_link_libraries(rocprofiler-systems-rocm INTERFACE amd-smi::amd-smi) endif() -# ----------------------------------------------------------------------------------------# -# -# RCCL -# -# ----------------------------------------------------------------------------------------# - -if(ROCPROFSYS_USE_RCCL) - find_package(RCCL-Headers ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) - target_link_libraries(rocprofiler-systems-rccl INTERFACE roc::rccl-headers) - rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rccl - INTERFACE ROCPROFSYS_USE_RCCL) -endif() - # ----------------------------------------------------------------------------------------# # # MPI @@ -210,13 +193,12 @@ set(_ROCPROFSYS_MPI_HEADERS_ALLOW_MPICH ${MPI_HEADERS_ALLOW_MPICH}) if(ROCPROFSYS_USE_MPI) find_package(MPI ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) target_link_libraries(rocprofiler-systems-mpi INTERFACE MPI::MPI_C MPI::MPI_CXX) - rocprofiler_systems_target_compile_definitions( - rocprofiler-systems-mpi INTERFACE TIMEMORY_USE_MPI=1 ROCPROFSYS_USE_MPI) + rocprofiler_systems_target_compile_definitions(rocprofiler-systems-mpi + INTERFACE ROCPROFSYS_USE_MPI) elseif(ROCPROFSYS_USE_MPI_HEADERS) find_package(MPI-Headers ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) - rocprofiler_systems_target_compile_definitions( - rocprofiler-systems-mpi INTERFACE TIMEMORY_USE_MPI_HEADERS=1 - ROCPROFSYS_USE_MPI_HEADERS) + rocprofiler_systems_target_compile_definitions(rocprofiler-systems-mpi + INTERFACE ROCPROFSYS_USE_MPI_HEADERS) target_link_libraries(rocprofiler-systems-mpi INTERFACE MPI::MPI_HEADERS) endif() @@ -248,13 +230,13 @@ target_link_libraries(rocprofiler-systems-elfutils INTERFACE ${ElfUtils_LIBRARIE # Dyninst # # ----------------------------------------------------------------------------------------# - +include(DyninstExternals) if(ROCPROFSYS_BUILD_DYNINST) rocprofiler_systems_checkout_git_submodule( RELATIVE_PATH external/dyninst WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - REPO_URL https://github.com/jrmadsen/dyninst.git - REPO_BRANCH omnitrace) + REPO_URL https://github.com/ROCm/dyninst.git + REPO_BRANCH dyninst_13) set(DYNINST_OPTION_PREFIX ON) set(DYNINST_BUILD_DOCS OFF) @@ -297,6 +279,7 @@ if(ROCPROFSYS_BUILD_DYNINST) set(DYNINST_TPL_INSTALL_LIB_DIR "${PROJECT_NAME}" CACHE PATH "Third-party library install-tree install library prefix" FORCE) + add_subdirectory(external/dyninst EXCLUDE_FROM_ALL) rocprofiler_systems_restore_variables( PIC VARIABLES CMAKE_POSITION_INDEPENDENT_CODE CMAKE_INSTALL_RPATH @@ -329,17 +312,37 @@ if(ROCPROFSYS_BUILD_DYNINST) endif() endforeach() - # for packaging - install( - DIRECTORY ${DYNINST_TPL_STAGING_PREFIX}/lib/ - DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME} - COMPONENT dyninst - FILES_MATCHING - PATTERN "*${CMAKE_SHARED_LIBRARY_SUFFIX}*") + foreach( + _LIB + common + dynDwarf + dynElf + dyninstAPI + instructionAPI + parseAPI + patchAPI + pcontrol + stackwalk + symtabAPI) + if(TARGET ${_LIB}) + add_dependencies(${_LIB} external-prebuild) + if(NOT TARGET Dyninst::${_LIB}) + add_library(Dyninst::${_LIB} ALIAS ${_LIB}) + endif() + endif() + endforeach() target_link_libraries(rocprofiler-systems-dyninst INTERFACE Dyninst::Dyninst) else() + # Find Boost before finding Dyninst + find_package(Boost) + if(NOT TARGET Dyninst::Boost_headers) + add_library(Dyninst::Boost_headers INTERFACE IMPORTED) + target_include_directories(Dyninst::Boost_headers SYSTEM + INTERFACE ${Boost_INCLUDE_DIRS}) + endif() + find_package(Dyninst ${rocprofiler_systems_FIND_QUIETLY} REQUIRED COMPONENTS dyninstAPI parseAPI instructionAPI symtabAPI) @@ -543,9 +546,6 @@ set(TIMEMORY_QUIET_CONFIG CACHE BOOL "Make timemory configuration quieter") # timemory feature settings -set(TIMEMORY_USE_MPI - ${ROCPROFSYS_USE_MPI} - CACHE BOOL "Enable MPI support in timemory" FORCE) set(TIMEMORY_USE_GOTCHA ON CACHE BOOL "Enable GOTCHA support in timemory") diff --git a/docker/Dockerfile.opensuse b/docker/Dockerfile.opensuse index d59f9bcb0..81edfb2dd 100644 --- a/docker/Dockerfile.opensuse +++ b/docker/Dockerfile.opensuse @@ -25,7 +25,7 @@ RUN zypper --non-interactive update -y && \ zypper --non-interactive install -y -t pattern devel_basis && \ zypper --non-interactive install -y binutils-gold chrpath cmake curl dpkg-devel \ gcc-c++ git libdrm-devel libnuma-devel openmpi3-devel python3-pip rpm-build \ - wget && \ + wget iproute2 && \ python3 -m pip install 'cmake==3.21' ARG ROCM_VERSION=0.0 @@ -46,12 +46,12 @@ RUN if [ "${ROCM_VERSION}" != "0.0" ]; then \ ARG PYTHON_VERSIONS="6 7 8 9 10 11 12 13" -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/conda && \ +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniforge.sh && \ + bash miniforge.sh -b -p /opt/conda && \ export PATH="/opt/conda/bin:${PATH}" && \ conda config --set always_yes yes --set changeps1 no && \ - conda update -c defaults -n base conda && \ - for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c defaults -c conda-forge python=3.${i} pip; done && \ + conda update -c conda-forge -n base conda && \ + for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c conda-forge python=3.${i} pip; done && \ for i in ${PYTHON_VERSIONS}; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done && \ conda clean -a -y && \ conda init diff --git a/docker/Dockerfile.opensuse.ci b/docker/Dockerfile.opensuse.ci index 88637abcf..fd8db6249 100644 --- a/docker/Dockerfile.opensuse.ci +++ b/docker/Dockerfile.opensuse.ci @@ -29,7 +29,7 @@ RUN zypper --non-interactive update -y && \ zypper --non-interactive install -y -t pattern devel_basis && \ zypper --non-interactive install -y binutils-gold chrpath cmake curl dpkg-devel \ gcc-c++ git libnuma-devel openmpi3-devel papi-devel python3-pip \ - rpm-build vim wget && \ + rpm-build vim wget iproute2 && \ zypper --non-interactive clean --all && \ python3 -m pip install 'cmake==3.21' perfetto @@ -48,12 +48,12 @@ RUN cd /tmp/dyninst && \ ARG PYTHON_VERSIONS="6 7 8 9 10 11 12 13" -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/conda && \ +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniforge.sh && \ + bash miniforge.sh -b -p /opt/conda && \ export PATH="/opt/conda/bin:${PATH}" && \ conda config --set always_yes yes --set changeps1 no && \ - conda update -c defaults -n base conda && \ - for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c defaults -c conda-forge python=3.${i} pip numpy; done && \ + conda update -c conda-forge -n base conda && \ + for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c conda-forge python=3.${i} pip numpy; done && \ for i in ${PYTHON_VERSIONS}; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done && \ conda clean -a -y && \ cd /tmp && \ diff --git a/docker/Dockerfile.rhel b/docker/Dockerfile.rhel index a5ee8b3f0..acc5c9c2f 100644 --- a/docker/Dockerfile.rhel +++ b/docker/Dockerfile.rhel @@ -16,7 +16,7 @@ ENV LIBRARY_PATH ${LIBRARY_PATH}:/opt/amdgpu/lib64 RUN yum groupinstall -y "Development Tools" && \ yum install -y epel-release && crb enable && \ yum install -y --allowerasing chrpath cmake curl dpkg-devel libdrm-devel numactl-devel \ - openmpi-devel papi-devel python3-pip texinfo wget which zlib-devel && \ + openmpi-devel papi-devel python3-pip texinfo wget which zlib-devel iproute && \ yum clean all && \ python3 -m pip install 'cmake==3.21' && \ python3 -m pip install 'perfetto' @@ -39,12 +39,12 @@ RUN if [ "${ROCM_VERSION}" != "0.0" ]; then \ ARG PYTHON_VERSIONS="6 7 8 9 10 11 12 13" -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/conda && \ +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniforge.sh && \ + bash miniforge.sh -b -p /opt/conda && \ export PATH="/opt/conda/bin:${PATH}" && \ conda config --set always_yes yes --set changeps1 no && \ - conda update -c defaults -n base conda && \ - for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c defaults -c conda-forge python=3.${i} pip; done && \ + conda update -c conda-forge -n base conda && \ + for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c conda-forge python=3.${i} pip; done && \ for i in ${PYTHON_VERSIONS}; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done && \ conda clean -a -y && \ conda init diff --git a/docker/Dockerfile.rhel.ci b/docker/Dockerfile.rhel.ci index 1fba6c2b9..95999cbd7 100644 --- a/docker/Dockerfile.rhel.ci +++ b/docker/Dockerfile.rhel.ci @@ -20,7 +20,7 @@ ARG NJOBS="8" RUN yum groupinstall -y "Development Tools" && \ yum install -y epel-release && crb enable && \ yum install -y --allowerasing chrpath cmake curl dpkg-devel numactl-devel \ - openmpi-devel papi-devel python3-pip texinfo wget which vim zlib-devel && \ + openmpi-devel papi-devel python3-pip texinfo wget which vim zlib-devel iproute && \ yum clean all && \ python3 -m pip install 'cmake==3.21' perfetto @@ -39,12 +39,12 @@ RUN cd /tmp/dyninst && \ ARG PYTHON_VERSIONS="6 7 8 9 10 11 12 13" -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/conda && \ +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniforge.sh && \ + bash miniforge.sh -b -p /opt/conda && \ export PATH="/opt/conda/bin:${PATH}" && \ conda config --set always_yes yes --set changeps1 no && \ - conda update -c defaults -n base conda && \ - for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c defaults -c conda-forge python=3.${i} pip numpy; done && \ + conda update -c conda-forge -n base conda && \ + for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c conda-forge python=3.${i} pip numpy; done && \ for i in ${PYTHON_VERSIONS}; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done && \ conda clean -a -y && \ cd /tmp && \ diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 4fc1c4cbb..dab6dea85 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -23,7 +23,7 @@ RUN apt-get update && \ apt-get install -y apt-utils autoconf autotools-dev bash-completion bison \ build-essential chrpath cmake curl flex gettext git-core gnupg2 libnuma1 libopenmpi-dev \ libpapi-dev libpfm4-dev librpm-dev libtool libudev1 lsb-release m4 \ - python3-pip rpm texinfo wget && \ + python3-pip rpm texinfo wget iproute2 && \ OS_VERSION=$(cat /etc/os-release | grep VERSION_ID | sed 's/=/ /'1 | awk '{print $NF}' | sed 's/"//g') && \ if [ "${OS_VERSION}" == "24.04" ]; then \ python3 -m pip install --break-system-packages 'cmake==3.21'; \ @@ -47,12 +47,12 @@ RUN if [ "${ROCM_VERSION}" != "0.0" ]; then \ ARG PYTHON_VERSIONS="6 7 8 9 10 11 12 13" -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/conda && \ +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniforge.sh && \ + bash miniforge.sh -b -p /opt/conda && \ export PATH="/opt/conda/bin:${PATH}" && \ conda config --set always_yes yes --set changeps1 no && \ - conda update -c defaults -n base conda && \ - for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c defaults -c conda-forge python=3.${i} pip; done && \ + conda update -c conda-forge -n base conda && \ + for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c conda-forge python=3.${i} pip; done && \ for i in ${PYTHON_VERSIONS}; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done && \ conda clean -a -y && \ conda init diff --git a/docker/Dockerfile.ubuntu.ci b/docker/Dockerfile.ubuntu.ci index e36ba9b64..9e61da769 100644 --- a/docker/Dockerfile.ubuntu.ci +++ b/docker/Dockerfile.ubuntu.ci @@ -28,7 +28,7 @@ RUN apt-get update && \ apt-get install -y autoconf autotools-dev bash-completion bison build-essential \ bzip2 chrpath cmake curl environment-modules flex gettext git-core gnupg2 gzip \ libiberty-dev libpapi-dev libpfm4-dev libtool locales lsb-release m4 \ - python3-pip texinfo unzip wget vim zip zlib1g-dev && \ + python3-pip texinfo unzip wget vim zip zlib1g-dev iproute2 && \ apt-get autoclean && \ if [ "${OS_VERSION}" == "24.04" ]; then \ python3 -m pip install --break-system-packages 'cmake==3.21' perfetto \ @@ -51,12 +51,12 @@ RUN cd /tmp/dyninst && \ ARG PYTHON_VERSIONS="6 7 8 9 10 11 12 13" -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/conda && \ +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniforge.sh && \ + bash miniforge.sh -b -p /opt/conda && \ export PATH="/opt/conda/bin:${PATH}" && \ conda config --set always_yes yes --set changeps1 no && \ - conda update -c defaults -n base conda && \ - for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c defaults -c conda-forge python=3.${i} pip numpy; done && \ + conda update -c conda-forge -n base conda && \ + for i in ${PYTHON_VERSIONS}; do conda create -n py3.${i} -c conda-forge python=3.${i} pip numpy; done && \ for i in ${PYTHON_VERSIONS}; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done && \ conda clean -a -y && \ cd /tmp && \ diff --git a/docs/install/install.rst b/docs/install/install.rst index db649bd93..1412ad600 100644 --- a/docs/install/install.rst +++ b/docs/install/install.rst @@ -163,10 +163,10 @@ while Dyninst requires TBB), and the CMake option to build the package alongside "Dyninst", "12.0", "ROCm Systems Profiler", "``ROCPROFSYS_BUILD_DYNINST`` (default: OFF)" "Libunwind", "", "ROCm Systems Profiler", "``ROCPROFSYS_BUILD_LIBUNWIND`` (default: ON)" - "TBB", "2018.6", "Dyninst", "``DYNINST_BUILD_TBB`` (default: OFF)" - "ElfUtils", "0.178", "Dyninst", "``DYNINST_BUILD_ELFUTILS`` (default: OFF)" - "LibIberty", "", "Dyninst", "``DYNINST_BUILD_LIBIBERTY`` (default: OFF)" - "Boost", "1.67.0", "Dyninst", "``DYNINST_BUILD_BOOST`` (default: OFF)" + "TBB", "2018.6", "Dyninst", "``ROCPROFSYS_BUILD_TBB`` (default: OFF)" + "ElfUtils", "0.178", "Dyninst", "``ROCPROFSYS_BUILD_ELFUTILS`` (default: OFF)" + "LibIberty", "", "Dyninst", "``ROCPROFSYS_BUILD_LIBIBERTY`` (default: OFF)" + "Boost", "1.67.0", "Dyninst", "``ROCPROFSYS_BUILD_BOOST`` (default: OFF)" "OpenMP", "4.x", "Dyninst", "" Optional third-party packages @@ -209,16 +209,16 @@ To install Dyninst alongside ROCm Systems Profiler, configure ROCm Systems Profi Depending on the version of Ubuntu, the ``apt`` package manager might have current enough versions of the Dyninst Boost, TBB, and LibIberty dependencies (use ``apt-get install libtbb-dev libiberty-dev libboost-dev``). -However, it is possible to request Dyninst to install -its dependencies via ``DYNINST_BUILD_=ON``, as follows: +However, it is possible to request Dyninst to build and install +its dependencies via ``ROCPROFSYS_BUILD_=ON``, as follows: .. code-block:: shell git clone https://github.com/ROCm/rocprofiler-systems.git rocprof-sys-source - cmake -B rocprof-sys-build -DROCPROFSYS_BUILD_DYNINST=ON -DDYNINST_BUILD_{TBB,ELFUTILS,BOOST,LIBIBERTY}=ON rocprof-sys-source + cmake -B rocprof-sys-build -DROCPROFSYS_BUILD_DYNINST=ON -DROCPROFSYS_BUILD_{TBB,ELFUTILS,BOOST,LIBIBERTY}=ON rocprof-sys-source -where ``-DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON`` is expanded by -the shell to ``-DDYNINST_BUILD_TBB=ON -DDYNINST_BUILD_BOOST=ON ...`` +where ``-DROCPROFSYS_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON`` is expanded by +the shell to ``-DROCPROFSYS_BUILD_TBB=ON -DROCPROFSYS_BUILD_BOOST=ON ...`` Installing Dyninst via Spack ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -266,10 +266,10 @@ in `the Perfetto UI `_. -D ROCPROFSYS_BUILD_PAPI=ON \ -D ROCPROFSYS_BUILD_LIBUNWIND=ON \ -D ROCPROFSYS_BUILD_DYNINST=ON \ - -D DYNINST_BUILD_TBB=ON \ - -D DYNINST_BUILD_BOOST=ON \ - -D DYNINST_BUILD_ELFUTILS=ON \ - -D DYNINST_BUILD_LIBIBERTY=ON \ + -D ROCPROFSYS_BUILD_TBB=ON \ + -D ROCPROFSYS_BUILD_BOOST=ON \ + -D ROCPROFSYS_BUILD_ELFUTILS=ON \ + -D ROCPROFSYS_BUILD_LIBIBERTY=ON \ rocprof-sys-source cmake --build rocprof-sys-build --target all --parallel 8 cmake --build rocprof-sys-build --target install diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 5dd627c43..73a360278 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==1.20.0 +rocm-docs-core[api_reference]==1.20.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 2e2ce9259..2943c6f36 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -227,7 +227,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.20.0 +rocm-docs-core[api-reference]==1.20.1 # via -r requirements.in rpds-py==0.22.3 # via diff --git a/examples/openmp/target/CMakeLists.txt b/examples/openmp/target/CMakeLists.txt index 9ac37181a..3bb695a0d 100644 --- a/examples/openmp/target/CMakeLists.txt +++ b/examples/openmp/target/CMakeLists.txt @@ -1,26 +1,5 @@ -# -# -# cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR) -# try to find a compatible HIP version -if(ROCmVersion_MAJOR_VERSION AND ROCmVersion_MAJOR_VERSION GREATER_EQUAL 6) - find_package(hip ${ROCmVersion_MAJOR_VERSION}.0.0) -else() - find_package(hip) -endif() - -if(NOT hip_FOUND) - message(WARNING "ROCm >= 5.6 not found. Skipping OpenMP target example.") - return() -elseif(hip_FOUND AND hip_VERSION VERSION_LESS 5.6.0) - message( - WARNING - "ROCm >= 5.6 not found (found ${hip_VERSION}). Skipping OpenMP target example." - ) - return() -endif() - if(NOT OMP_TARGET_COMPILER) find_program( amdclangpp_EXECUTABLE diff --git a/examples/transpose/CMakeLists.txt b/examples/transpose/CMakeLists.txt index 001571f58..bc7c721fc 100644 --- a/examples/transpose/CMakeLists.txt +++ b/examples/transpose/CMakeLists.txt @@ -41,7 +41,7 @@ if((NOT CMAKE_CXX_COMPILER_IS_HIPCC OR (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang return() endif() -option(TRANSPOSE_USE_MPI "Enable MPI support in transpose exe" ${TIMEMORY_USE_MPI}) +option(TRANSPOSE_USE_MPI "Enable MPI support in transpose exe" ${ROCPROFSYS_USE_MPI}) find_package(Threads REQUIRED) if(TRANSPOSE_USE_MPI) diff --git a/external/dyninst b/external/dyninst index 09e781d41..b4350c3c1 160000 --- a/external/dyninst +++ b/external/dyninst @@ -1 +1 @@ -Subproject commit 09e781d414c83b4ad587083d449a3e976546937d +Subproject commit b4350c3c1426a36d5d5e70b3b172bce1b0bff335 diff --git a/external/timemory b/external/timemory index 858cfc752..5400e1349 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit 858cfc752a02cc46ef03bef8ca34aaafe2609d19 +Subproject commit 5400e1349cb3266a1f9c605643c6d4f3871ced47 diff --git a/scripts/build-release.sh b/scripts/build-release.sh index 674eb5e68..e4b766f0a 100755 --- a/scripts/build-release.sh +++ b/scripts/build-release.sh @@ -222,7 +222,7 @@ ROCPROFSYS_GENERAL_ARGS="-DROCPROFSYS_CPACK_SYSTEM_NAME=${DISTRO} -DROCPROFSYS_R ROCPROFSYS_BUILD_ARGS="-DROCPROFSYS_BUILD_TESTING=OFF -DROCPROFSYS_BUILD_EXAMPLES=OFF -DROCPROFSYS_BUILD_PAPI=ON -DROCPROFSYS_BUILD_LTO=${LTO} -DROCPROFSYS_BUILD_HIDDEN_VISIBILITY=${HIDDEN_VIZ} -DROCPROFSYS_BUILD_STATIC_LIBGCC=${LIBGCC} -DROCPROFSYS_BUILD_STATIC_LIBSTDCXX=${LIBSTDCXX} -DROCPROFSYS_BUILD_RELEASE=ON" ROCPROFSYS_USE_ARGS="-DROCPROFSYS_USE_MPI_HEADERS=ON -DROCPROFSYS_USE_OMPT=ON -DROCPROFSYS_USE_PAPI=ON" TIMEMORY_ARGS="-DTIMEMORY_USE_LIBUNWIND=ON -DTIMEMORY_BUILD_LIBUNWIND=ON -DTIMEMORY_BUILD_PORTABLE=ON" -DYNINST_ARGS="-DROCPROFSYS_BUILD_DYNINST=ON -DDYNINST_USE_OpenMP=ON $(echo -DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON) -DDYNINST_BOOST_DOWNLOAD_VERSION=${BOOST_VERSION}" +DYNINST_ARGS="-DROCPROFSYS_BUILD_DYNINST=ON $(echo -DROCPROFSYS_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON) -DROCPROFSYS_BOOST_DOWNLOAD_VERSION=${BOOST_VERSION}" STANDARD_ARGS="${CMAKE_ARGS} ${ROCPROFSYS_GENERAL_ARGS} ${ROCPROFSYS_USE_ARGS} ${ROCPROFSYS_BUILD_ARGS} ${TIMEMORY_ARGS} ${DYNINST_ARGS} ${EXTRA_ARGS}" SCRIPT_DIR=$(realpath $(dirname ${BASH_SOURCE[0]})) diff --git a/source/bin/rocprof-sys-causal/impl.cpp b/source/bin/rocprof-sys-causal/impl.cpp index 39237443c..91b7bceb0 100644 --- a/source/bin/rocprof-sys-causal/impl.cpp +++ b/source/bin/rocprof-sys-causal/impl.cpp @@ -834,10 +834,6 @@ parse_args(int argc, char** argv, std::vector& _env, (defined(ROCPROFSYS_USE_MPI_HEADERS) && ROCPROFSYS_USE_MPI_HEADERS > 0) add_default_env(_env, "ROCPROFSYS_USE_MPIP", true); #endif - -#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0 - add_default_env(_env, "ROCPROFSYS_USE_RCCLP", true); -#endif } _fill("ROCPROFSYS_CAUSAL_BINARY_EXCLUDE", _binary_excludes, _generate_configs); diff --git a/source/bin/rocprof-sys-instrument/details.cpp b/source/bin/rocprof-sys-instrument/details.cpp index 729776944..698e8adf1 100644 --- a/source/bin/rocprof-sys-instrument/details.cpp +++ b/source/bin/rocprof-sys-instrument/details.cpp @@ -817,7 +817,7 @@ process_modules(const std::vector& _app_modules) } symtab_data.functions.emplace(itr, std::vector{}); - if(!itr->getAllFunctions(symtab_data.functions.at(itr))) continue; + if(itr->getAllFunctions().empty()) continue; _erase_nullptrs(symtab_data.functions.at(itr)); for(auto* fitr : symtab_data.functions.at(itr)) diff --git a/source/bin/rocprof-sys-instrument/internal_libs.cpp b/source/bin/rocprof-sys-instrument/internal_libs.cpp index 8f2b58b4d..3ed1ec972 100644 --- a/source/bin/rocprof-sys-instrument/internal_libs.cpp +++ b/source/bin/rocprof-sys-instrument/internal_libs.cpp @@ -486,8 +486,7 @@ get_internal_libs_data_impl() _data[itr.first].emplace(_mpath, func_set_t{}); _data[itr.first].emplace(_mname, func_set_t{}); - auto _funcs = std::vector{}; - mitr->getAllFunctions(_funcs); + auto _funcs = mitr->getAllFunctions(); for(const auto& fitr : _funcs) { diff --git a/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp b/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp index 365fdc7b5..77a11cedb 100644 --- a/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp +++ b/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp @@ -1711,8 +1711,10 @@ main(int argc, char** argv) { "rocprofsys_user_stop_thread_trace" }); #if ROCPROFSYS_USE_MPI > 0 || ROCPROFSYS_USE_MPI_HEADERS > 0 // if any of the below MPI functions are found, enable MPI support - for(const auto* itr : { "MPI_Init", "MPI_Init_thread", "MPI_Finalize", - "MPI_Comm_rank", "MPI_Comm_size" }) + for(const auto* itr : + { "MPI_Init", "MPI_Init_thread", "MPI_Finalize", "MPI_Comm_rank", "MPI_Comm_size", + "MPI_INIT", "mpi_init", "mpi_init_", "mpi_init__", "MPI_INIT_THREAD", + "mpi_init_thread", "mpi_init_thread_", "mpi_init_thread__" }) { if(find_function(app_image, itr) != nullptr) { diff --git a/source/bin/rocprof-sys-sample/impl.cpp b/source/bin/rocprof-sys-sample/impl.cpp index 37d177b5d..7225a21b6 100644 --- a/source/bin/rocprof-sys-sample/impl.cpp +++ b/source/bin/rocprof-sys-sample/impl.cpp @@ -730,11 +730,10 @@ parse_args(int argc, char** argv, std::vector& _env) } }); - std::set _backend_choices = { - "all", "kokkosp", "mpip", "ompt", - "rcclp", "amd-smi", "mutex-locks", "spin-locks", - "rw-locks", "rocprofiler-sdk", "rocm" - }; + std::set _backend_choices = { "all", "kokkosp", "mpip", + "ompt", "rcclp", "amd-smi", + "mutex-locks", "spin-locks", "rw-locks", + "rocm" }; #if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS) _backend_choices.erase("mpip"); @@ -744,14 +743,10 @@ parse_args(int argc, char** argv, std::vector& _env) _backend_choices.erase("ompt"); #endif -#if !defined(ROCPROFSYS_USE_RCCL) - _backend_choices.erase("rcclp"); -#endif - #if !defined(ROCPROFSYS_USE_ROCM) _backend_choices.erase("rocm"); _backend_choices.erase("amd-smi"); - _backend_choices.erase("rocprofiler-sdk"); + _backend_choices.erase("rcclp"); #endif parser.start_group("BACKEND OPTIONS", diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 29a65bf6c..0a2b8ea51 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -44,7 +44,6 @@ target_link_libraries( $ $ $ - $ $ $ $ diff --git a/source/lib/core/CMakeLists.txt b/source/lib/core/CMakeLists.txt index 184229642..5c0cbad99 100644 --- a/source/lib/core/CMakeLists.txt +++ b/source/lib/core/CMakeLists.txt @@ -15,6 +15,7 @@ set(core_sources ${CMAKE_CURRENT_LIST_DIR}/perf.cpp ${CMAKE_CURRENT_LIST_DIR}/perfetto.cpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.cpp + ${CMAKE_CURRENT_LIST_DIR}/amd_smi.cpp ${CMAKE_CURRENT_LIST_DIR}/state.cpp ${CMAKE_CURRENT_LIST_DIR}/timemory.cpp ${CMAKE_CURRENT_LIST_DIR}/utility.cpp) @@ -31,12 +32,13 @@ set(core_headers ${CMAKE_CURRENT_LIST_DIR}/exception.hpp ${CMAKE_CURRENT_LIST_DIR}/gpu.hpp ${CMAKE_CURRENT_LIST_DIR}/locking.hpp + ${CMAKE_CURRENT_LIST_DIR}/mpi.hpp ${CMAKE_CURRENT_LIST_DIR}/mproc.hpp ${CMAKE_CURRENT_LIST_DIR}/perf.hpp ${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp - ${CMAKE_CURRENT_LIST_DIR}/rccl.hpp ${CMAKE_CURRENT_LIST_DIR}/redirect.hpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp + ${CMAKE_CURRENT_LIST_DIR}/amd_smi.hpp ${CMAKE_CURRENT_LIST_DIR}/state.hpp ${CMAKE_CURRENT_LIST_DIR}/timemory.hpp ${CMAKE_CURRENT_LIST_DIR}/utility.hpp) diff --git a/source/lib/core/amd_smi.cpp b/source/lib/core/amd_smi.cpp new file mode 100644 index 000000000..842c7f53e --- /dev/null +++ b/source/lib/core/amd_smi.cpp @@ -0,0 +1,112 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "core/amd_smi.hpp" +#include "core/common.hpp" +#include "core/config.hpp" +#include "core/debug.hpp" +#include "core/gpu.hpp" +#include "timemory.hpp" + +#if defined(ROCPROFSYS_USE_ROCM) && ROCPROFSYS_USE_ROCM > 0 +namespace rocprofsys +{ +namespace amd_smi +{ +namespace +{ +std::string +get_setting_name(std::string _v) +{ + constexpr auto _prefix = tim::string_view_t{ "rocprofsys_" }; + for(auto& itr : _v) + itr = tolower(itr); + auto _pos = _v.find(_prefix); + if(_pos == 0) return _v.substr(_prefix.length()); + return _v; +} + +# define ROCPROFSYS_CONFIG_SETTING(TYPE, ENV_NAME, DESCRIPTION, INITIAL_VALUE, ...) \ + [&]() { \ + auto _ret = _config->insert( \ + ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, \ + TYPE{ INITIAL_VALUE }, \ + std::set{ "custom", "rocprofsys", "librocprof-sys", \ + __VA_ARGS__ }); \ + if(!_ret.second) \ + { \ + ROCPROFSYS_PRINT("Warning! Duplicate setting: %s / %s\n", \ + get_setting_name(ENV_NAME).c_str(), ENV_NAME); \ + } \ + return _config->find(ENV_NAME)->second; \ + }() +} // namespace + +void +config_settings(const std::shared_ptr& _config) +{ + if(!get_use_amd_smi() || !gpu::initialize_amdsmi()) return; + + std::string default_metrics = "busy, temp, power, mem_usage"; + // No distinction between busy and activity shown in description + std::string jpeg_activity_support = ""; + std::string vcn_activity_support = ""; + + size_t device_count = gpu::get_processor_count(); + for(size_t i = 0; i < device_count; i++) + { + if(gpu::is_vcn_activity_supported(i) || gpu::is_vcn_busy_supported(i)) + { + vcn_activity_support += ", vcn_activity"; + break; + } + } + for(size_t i = 0; i < device_count; i++) + { + if(gpu::is_jpeg_activity_supported(i) || gpu::is_jpeg_busy_supported(i)) + { + jpeg_activity_support += ", jpeg_activity"; + break; + } + } + + ROCPROFSYS_CONFIG_SETTING( + std::string, "ROCPROFSYS_AMD_SMI_METRICS", + "amd-smi metrics to collect: " + default_metrics + jpeg_activity_support + + vcn_activity_support + ". " + + "An empty value implies 'all' and 'none' suppresses all.", + "busy, temp, power, mem_usage", "backend", "amd_smi", "rocm", "process_sampling"); +} +} // namespace amd_smi +} // namespace rocprofsys + +#else +namespace rocprofsys +{ +namespace amd_smi +{ +void +config_settings(const std::shared_ptr&) +{} +} // namespace amd_smi +} // namespace rocprofsys +#endif diff --git a/source/lib/core/rccl.hpp b/source/lib/core/amd_smi.hpp similarity index 78% rename from source/lib/core/rccl.hpp rename to source/lib/core/amd_smi.hpp index b5f271ab0..0f95e82a5 100644 --- a/source/lib/core/rccl.hpp +++ b/source/lib/core/amd_smi.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,10 +20,17 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#pragma once +#include "core/timemory.hpp" -#include "core/defines.hpp" - -#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0 -# include +#if ROCPROFSYS_USE_ROCM > 0 +# include #endif + +namespace rocprofsys +{ +namespace amd_smi +{ +void +config_settings(const std::shared_ptr&); +} // namespace amd_smi +} // namespace rocprofsys diff --git a/source/lib/core/argparse.cpp b/source/lib/core/argparse.cpp index 30a41cc47..a8026c26f 100644 --- a/source/lib/core/argparse.cpp +++ b/source/lib/core/argparse.cpp @@ -562,10 +562,9 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _data.processed_environs.emplace("periods"); } - strset_t _backend_choices = { - "all", "kokkosp", "mpip", "ompt", "rcclp", - "amd-smi", "rocprofiler-sdk", "mutex-locks", "spin-locks", "rw-locks" - }; + strset_t _backend_choices = { "all", "kokkosp", "mpip", "ompt", + "rcclp", "amd-smi", "rocm", "mutex-locks", + "spin-locks", "rw-locks" }; #if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS) _backend_choices.erase("mpip"); @@ -575,14 +574,10 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _backend_choices.erase("ompt"); #endif -#if !defined(ROCPROFSYS_USE_RCCL) - _backend_choices.erase("rcclp"); -#endif - #if !defined(ROCPROFSYS_USE_ROCM) _backend_choices.erase("amd-smi"); - _backend_choices.erase("rocprofiler-sdk"); _backend_choices.erase("rocm"); + _backend_choices.erase("rcclp"); #endif if(gpu::device_count() == 0) @@ -590,13 +585,8 @@ add_core_arguments(parser_t& _parser, parser_data& _data) // remove GPU-specific backends _backend_choices.erase("rcclp"); _backend_choices.erase("amd-smi"); - _backend_choices.erase("rocprofiler-sdk"); _backend_choices.erase("rocm"); -#if defined(ROCPROFSYS_USE_RCCL) - update_env(_data, "ROCPROFSYS_USE_RCCLP", false); -#endif - #if defined(ROCPROFSYS_USE_ROCM) update_env(_data, "ROCPROFSYS_USE_AMD_SMI", false); update_env(_data, "ROCPROFSYS_USE_ROCM", false); diff --git a/source/lib/core/categories.hpp b/source/lib/core/categories.hpp index fe28aa548..bb9db10b0 100644 --- a/source/lib/core/categories.hpp +++ b/source/lib/core/categories.hpp @@ -102,6 +102,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGOR ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocdecode_api, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, "rocm_rocdecode_api", "ROCm RocDecode API") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocjpeg_api, ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API, "rocm_rocjpeg_api", "ROCm RocJPEG API") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl_api, ROCPROFSYS_CATEGORY_ROCM_RCCL_API, "rocm_rccl_api", "ROCm RCCL API") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi, ROCPROFSYS_CATEGORY_AMD_SMI, "amd_smi", "AMD-SMI data") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_gfx_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX, "device_busy_gfx", "Busy percentage of GFX engine on a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_umc_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC, "device_busy_umc", "Busy percentage of UMC engin on a GPU device") @@ -171,6 +172,7 @@ using name = perfetto_category; ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_marker_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocdecode_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocjpeg_api), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi), \ ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_gfx_busy), \ ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_umc_busy), \ diff --git a/source/lib/core/config.cpp b/source/lib/core/config.cpp index 3bb205bdf..4eabcd015 100644 --- a/source/lib/core/config.cpp +++ b/source/lib/core/config.cpp @@ -21,6 +21,7 @@ // SOFTWARE. #include "config.hpp" +#include "amd_smi.hpp" #include "common/defines.h" #include "common/static_object.hpp" #include "constraint.hpp" @@ -626,13 +627,7 @@ configure_settings(bool _init) ->set_choices(perf::get_config_choices()); rocprofiler_sdk::config_settings(_config); - - ROCPROFSYS_CONFIG_SETTING(std::string, "ROCPROFSYS_AMD_SMI_METRICS", - "amd-smi metrics to collect: busy, temp, power, " - "vcn_activity, jpeg_activity, mem_usage. " - "An empty value implies 'all' and 'none' suppresses all.", - "busy, temp, power, mem_usage", "backend", "amd_smi", - "rocm", "process_sampling"); + amd_smi::config_settings(_config); ROCPROFSYS_CONFIG_SETTING(size_t, "ROCPROFSYS_PERFETTO_SHMEM_SIZE_HINT_KB", "Hint for shared-memory buffer size in perfetto (in KB)", @@ -1046,7 +1041,7 @@ configure_settings(bool _init) settings::use_output_suffix() = _config->get("ROCPROFSYS_USE_PID"); if(settings::use_output_suffix()) settings::default_process_suffix() = process::get_id(); -#if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) +#if !defined(ROCPROFSYS_USE_MPI) && defined(ROCPROFSYS_USE_MPI_HEADERS) if(tim::dmp::is_initialized()) settings::default_process_suffix() = tim::dmp::rank(); #endif @@ -1359,6 +1354,14 @@ configure_disabled_settings(const std::shared_ptr& _config) _config->find("ROCPROFSYS_USE_AMD_SMI")->second->set_hidden(true); for(const auto& itr : _config->disable_category("amd_smi")) _config->find(itr)->second->set_hidden(true); + + _config->find("ROCPROFSYS_USE_RCCLP")->second->set_hidden(true); + for(const auto& itr : _config->disable_category("rcclp")) + _config->find(itr)->second->set_hidden(true); + + _config->find("ROCPROFSYS_USE_ROCM")->second->set_hidden(true); + for(const auto& itr : _config->disable_category("rocm")) + _config->find(itr)->second->set_hidden(true); #endif #if defined(ROCPROFSYS_USE_OMPT) || ROCPROFSYS_USE_OMPT == 0 @@ -1367,7 +1370,7 @@ configure_disabled_settings(const std::shared_ptr& _config) _config->find(itr)->second->set_hidden(true); #endif -#if !defined(TIMEMORY_USE_MPI) || TIMEMORY_USE_MPI == 0 +#if !defined(ROCPROFSYS_USE_MPI) || ROCPROFSYS_USE_MPI == 0 _config->disable("ROCPROFSYS_PERFETTO_COMBINE_TRACES"); _config->disable("ROCPROFSYS_COLLAPSE_PROCESSES"); _config->find("ROCPROFSYS_PERFETTO_COMBINE_TRACES")->second->set_hidden(true); @@ -1991,7 +1994,7 @@ get_perfetto_buffer_size() bool get_perfetto_combined_traces() { -#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0 +#if defined(ROCPROFSYS_USE_MPI) && ROCPROFSYS_USE_MPI > 0 static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_COMBINE_TRACES"); return static_cast&>(*_v->second).get(); #else diff --git a/source/lib/core/debug.hpp b/source/lib/core/debug.hpp index c82b33069..b933cf33f 100644 --- a/source/lib/core/debug.hpp +++ b/source/lib/core/debug.hpp @@ -159,9 +159,9 @@ as_hex(void*, size_t); #endif #if !defined(ROCPROFSYS_DEBUG_PROCESS_IDENTIFIER) -# if defined(TIMEMORY_USE_MPI) +# if defined(ROCPROFSYS_USE_MPI) # define ROCPROFSYS_DEBUG_PROCESS_IDENTIFIER static_cast(::tim::dmp::rank()) -# elif defined(TIMEMORY_USE_MPI_HEADERS) +# elif defined(ROCPROFSYS_USE_MPI_HEADERS) # define ROCPROFSYS_DEBUG_PROCESS_IDENTIFIER \ (::tim::dmp::is_initialized()) ? static_cast(::tim::dmp::rank()) \ : static_cast(::tim::process::get_id()) diff --git a/source/lib/core/gpu.cpp b/source/lib/core/gpu.cpp index 91b3aad63..c46a9db40 100644 --- a/source/lib/core/gpu.cpp +++ b/source/lib/core/gpu.cpp @@ -201,8 +201,12 @@ add_device_metadata() * Required amdsmi methods to get processors and handles */ -uint32_t processors::total_processor_count = 0; -std::vector processors::processors_list = {}; +uint32_t processors::total_processor_count = 0; +std::vector processors::processors_list = {}; +std::vector processors::vcn_activity_supported = {}; +std::vector processors::jpeg_activity_supported = {}; +std::vector processors::vcn_busy_supported = {}; +std::vector processors::jpeg_busy_supported = {}; void get_processor_handles() @@ -246,10 +250,87 @@ get_processor_handles() return; } processors::processors_list.push_back(processor); + + amdsmi_gpu_metrics_t gpu_metrics; + bool vcn_supported = false; + bool jpeg_supported = false; + bool v_busy_supported = false; + bool j_busy_supported = false; + ret = amdsmi_get_gpu_metrics_info(processor, &gpu_metrics); + if(ret == AMDSMI_STATUS_SUCCESS) + { + for(const auto& vcn_activity : gpu_metrics.vcn_activity) + { + if(vcn_activity != UINT16_MAX) + { + vcn_supported = true; + break; + } + } + for(const auto& jpeg_activity : gpu_metrics.jpeg_activity) + { + if(jpeg_activity != UINT16_MAX) + { + jpeg_supported = true; + break; + } + } + for(const auto& xcp : gpu_metrics.xcp_stats) + { + if(!v_busy_supported) + { + v_busy_supported = + std::any_of(std::begin(xcp.vcn_busy), std::end(xcp.vcn_busy), + [](uint16_t val) { return val != UINT16_MAX; }); + } + + if(!j_busy_supported) + { + j_busy_supported = std::any_of( + std::begin(xcp.jpeg_busy), std::end(xcp.jpeg_busy), + [](uint16_t val) { return val != UINT16_MAX; }); + } + + if(v_busy_supported && j_busy_supported) break; + } + } + processors::vcn_activity_supported.push_back(vcn_supported); + processors::jpeg_activity_supported.push_back(jpeg_supported); + processors::vcn_busy_supported.push_back(v_busy_supported); + processors::jpeg_busy_supported.push_back(j_busy_supported); } } processors::total_processor_count = processors::processors_list.size(); } + +bool +is_vcn_activity_supported(uint32_t dev_id) +{ + if(dev_id >= processors::vcn_activity_supported.size()) return false; + return processors::vcn_activity_supported[dev_id]; +} + +bool +is_jpeg_activity_supported(uint32_t dev_id) +{ + if(dev_id >= processors::jpeg_activity_supported.size()) return false; + return processors::jpeg_activity_supported[dev_id]; +} + +bool +is_vcn_busy_supported(uint32_t dev_id) +{ + if(dev_id >= processors::vcn_busy_supported.size()) return false; + return processors::vcn_busy_supported[dev_id]; +} + +bool +is_jpeg_busy_supported(uint32_t dev_id) +{ + if(dev_id >= processors::jpeg_busy_supported.size()) return false; + return processors::jpeg_busy_supported[dev_id]; +} + uint32_t get_processor_count() { diff --git a/source/lib/core/gpu.hpp b/source/lib/core/gpu.hpp index 42b693304..f883630c2 100644 --- a/source/lib/core/gpu.hpp +++ b/source/lib/core/gpu.hpp @@ -40,15 +40,35 @@ get_processor_count(); amdsmi_processor_handle get_handle_from_id(uint32_t dev_id); +bool +is_vcn_activity_supported(uint32_t dev_id); + +bool +is_jpeg_activity_supported(uint32_t dev_id); + +bool +is_vcn_busy_supported(uint32_t dev_id); + +bool +is_jpeg_busy_supported(uint32_t dev_id); + struct processors { static uint32_t total_processor_count; static std::vector processors_list; + static std::vector vcn_activity_supported; + static std::vector jpeg_activity_supported; + static std::vector vcn_busy_supported; + static std::vector jpeg_busy_supported; private: friend void rocprofsys::gpu::get_processor_handles(); friend uint32_t rocprofsys::gpu::get_processor_count(); friend amdsmi_processor_handle rocprofsys::gpu::get_handle_from_id(uint32_t dev_id); + friend bool rocprofsys::gpu::is_vcn_activity_supported(uint32_t dev_id); + friend bool rocprofsys::gpu::is_jpeg_activity_supported(uint32_t dev_id); + friend bool rocprofsys::gpu::is_vcn_busy_supported(uint32_t dev_id); + friend bool rocprofsys::gpu::is_jpeg_busy_supported(uint32_t dev_id); }; #endif diff --git a/source/lib/core/mpi.hpp b/source/lib/core/mpi.hpp new file mode 100644 index 000000000..981458655 --- /dev/null +++ b/source/lib/core/mpi.hpp @@ -0,0 +1,725 @@ +// MIT License +// +// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +/* + * Defines mpi functions and dummy functions when compiled without MPI + * + */ + +#pragma once + +#include "debug.hpp" +#include + +#include +#include + +#include +#include +#include + +#if !defined(ROCPROFSYS_USE_MPI) && defined(ROCPROFSYS_USE_MPI_HEADERS) && \ + !defined(OMPI_SKIP_MPICXX) +# define ROCPROFSYS_UNDEFINE_OMPI_SKIP_MPICXX 1 +# define OMPI_SKIP_MPICXX 1 +#endif + +#if defined(ROCPROFSYS_USE_MPI) || defined(ROCPROFSYS_USE_MPI_HEADERS) +# include +#endif + +#if defined(MPICH) && MPICH > 0 +# define ROCPROFSYS_MPI_MPICH 1 +#elif defined(OMPI_MAJOR_VERSION) && defined(OMPI_MINOR_VERSION) && \ + defined(OMPI_PATCH_VERSION) +# define ROCPROFSYS_MPI_OPENMPI 1 +#endif + +namespace rocprofsys +{ +namespace mpi +{ +//--------------------------------------------------------------------------------------// + +#if !defined(ROCPROFSYS_USE_MPI) +struct dummy_data_type +{ + enum type + { + int_t, + float_t, + double_t + }; +}; +#endif + +//--------------------------------------------------------------------------------------// + +#if !defined(ROCPROFSYS_USE_MPI) && !defined(MPI_INT) +# define MPI_INT ::rocprofsys::mpi::dummy_data_type::int_t +#endif + +#if !defined(ROCPROFSYS_USE_MPI) && !defined(MPI_FLOAT) +# define MPI_FLOAT ::rocprofsys::mpi::dummy_data_type::float_t +#endif + +#if !defined(ROCPROFSYS_USE_MPI) && !defined(MPI_DOUBLE) +# define MPI_DOUBLE ::rocprofsys::mpi::dummy_data_type::double_t +#endif + +//--------------------------------------------------------------------------------------// + +#if defined(ROCPROFSYS_USE_MPI) || defined(ROCPROFSYS_USE_MPI_HEADERS) +# if defined(MPICH) && (MPICH > 0) +static constexpr bool is_mpich = true; +# else +static constexpr bool is_mpich = false; +# endif +# if defined(OPEN_MPI) && (OPEN_MPI > 0) +static constexpr bool is_openmpi = true; +# else +static constexpr bool is_openmpi = false; +# endif +#endif + +//--------------------------------------------------------------------------------------// + +#if defined(ROCPROFSYS_USE_MPI) || defined(ROCPROFSYS_USE_MPI_HEADERS) + +using comm_t = MPI_Comm; +using info_t = MPI_Info; +using data_type_t = MPI_Datatype; +using status_t = MPI_Status; + +# if !defined(ROCPROFSYS_USE_MPI) && defined(ROCPROFSYS_USE_MPI_HEADERS) && \ + defined(OPEN_MPI) && (OPEN_MPI > 0) +static const comm_t comm_world_v = nullptr; +static const comm_t comm_self_v = nullptr; +static const info_t info_null_v = nullptr; +# else +static const comm_t comm_world_v = MPI_COMM_WORLD; +static const comm_t comm_self_v = MPI_COMM_SELF; +static const info_t info_null_v = MPI_INFO_NULL; +# endif +static const int success_v = MPI_SUCCESS; +static const int comm_type_shared_v = MPI_COMM_TYPE_SHARED; + +namespace threading +{ +enum : int +{ + /// Only one thread will execute. + single = MPI_THREAD_SINGLE, + /// Only main thread will do MPI calls. The process may be multi-threaded, but only + /// the main thread will make MPI calls (all MPI calls are funneled to the main + /// thread) + funneled = MPI_THREAD_FUNNELED, + /// Only one thread at the time do MPI calls. The process may be multi-threaded, and + /// multiple threads may make MPI calls, but only one at a time: MPI calls are not + /// made concurrently from two distinct threads (all MPI calls are serialized). + serialized = MPI_THREAD_SERIALIZED, + /// Multiple thread may do MPI calls with no restrictions. + multiple = MPI_THREAD_MULTIPLE +}; +} // namespace threading + +#else // dummy MPI types + +using comm_t = int32_t; +using info_t = int32_t; +using data_type_t = int32_t; +using status_t = int32_t; +static const comm_t comm_world_v = 0; +static const comm_t comm_self_v = 0; +static const info_t info_null_v = 0; +static const int success_v = 0; +static const int comm_type_shared_v = 0; + +namespace threading +{ +enum : int +{ + /// Only one thread will execute. + single = 0, + /// Only main thread will do MPI calls. The process may be multi-threaded, but only + /// the main thread will make MPI calls (all MPI calls are funneled to the main + /// thread) + funneled = 1, + /// Only one thread at the time do MPI calls. The process may be multi-threaded, and + /// multiple threads may make MPI calls, but only one at a time: MPI calls are not + /// made concurrently from two distinct threads (all MPI calls are serialized). + serialized = 2, + /// Multiple thread may do MPI calls with no restrictions. + multiple = 3 +}; +} // namespace threading + +#endif + +//--------------------------------------------------------------------------------------// + +namespace threading +{ +inline auto +get_id() +{ + return ::tim::threading::get_id(); +} +} // namespace threading + +template +using communicator_map_t = std::unordered_map; + +inline int32_t rank(comm_t = comm_world_v); +inline int32_t size(comm_t = comm_world_v); +inline void set_rank(int32_t, comm_t = comm_world_v); +inline void set_size(int32_t, comm_t = comm_world_v); + +//--------------------------------------------------------------------------------------// +// Currently ROCPROFSYS_MPI_THREAD is just a placeholder for future +// implementation. + +inline bool& +use_mpi_thread() +{ + static bool _instance = tim::get_env("ROCPROFSYS_MPI_THREAD", true); + return _instance; +} + +//--------------------------------------------------------------------------------------// + +inline std::string& +use_mpi_thread_type() +{ + static std::string _instance = + tim::get_env("ROCPROFSYS_MPI_THREAD_TYPE", ""); + return _instance; +} + +//--------------------------------------------------------------------------------------// + +inline bool& +fail_on_error() +{ + static bool _instance = tim::get_env("ROCPROFSYS_MPI_FAIL_ON_ERROR", false); + return _instance; +} + +//--------------------------------------------------------------------------------------// + +inline bool& +quiet() +{ + static bool _instance = tim::get_env("ROCPROFSYS_MPI_QUIET", false); + return _instance; +} + +//--------------------------------------------------------------------------------------// + +#if !defined(ROCPROFSYS_MPI_ERROR_FUNCTION) +# define ROCPROFSYS_MPI_ERROR_FUNCTION(FUNC, ...) # FUNC +#endif + +#if !defined(ROCPROFSYS_MPI_ERROR_CHECK) +# define ROCPROFSYS_MPI_ERROR_CHECK(...) \ + ::rocprofsys::mpi::check_error(ROCPROFSYS_MPI_ERROR_FUNCTION(__VA_ARGS__, ""), \ + __VA_ARGS__) +#endif + +//--------------------------------------------------------------------------------------// + +inline bool +check_error(const char* _func, int err_code, comm_t _comm = mpi::comm_world_v) +{ +#if defined(ROCPROFSYS_USE_MPI) + bool _success = (err_code == MPI_SUCCESS); + if(!_success && !mpi::quiet()) + { + int len = 0; + char msg[1024]; + PMPI_Error_string(err_code, msg, &len); + msg[std::min(len, 1023)] = '\0'; + int _rank = rank(); + fprintf(stderr, "[rank=%i][pid=%i][tid=%i][%s]> Error code (%i): %s\n", _rank, + (int) process::get_id(), (int) threading::get_id(), _func, err_code, msg); + } + if(!_success && fail_on_error()) PMPI_Abort(_comm, err_code); + return (err_code == MPI_SUCCESS); +#else + tim::consume_parameters(_func, err_code, _comm); + return false; +#endif +} + +//--------------------------------------------------------------------------------------// + +inline void +barrier(comm_t comm = comm_world_v); + +inline bool +is_supported() +{ +#if defined(ROCPROFSYS_USE_MPI) + return true; +#else + return false; +#endif +} + +//--------------------------------------------------------------------------------------// + +inline bool& +is_finalized() +{ +#if defined(ROCPROFSYS_USE_MPI) + int32_t _fini = 0; + PMPI_Finalized(&_fini); + static bool _instance = static_cast(_fini); + if(!_instance) _instance = static_cast(_fini); +#else + static bool _instance = true; +#endif + return _instance; +} + +//--------------------------------------------------------------------------------------// + +template +inline std::function& +is_initialized_callback() +{ + static std::function _v = []() -> bool { + int32_t _init = 0; +#if defined(ROCPROFSYS_USE_MPI) + if(!is_finalized()) PMPI_Initialized(&_init); +#endif + return (_init != 0) ? true : false; + }; + return _v; +} + +//--------------------------------------------------------------------------------------// + +inline bool +is_initialized() +{ + return is_initialized_callback()(); +} + +//--------------------------------------------------------------------------------------// + +inline void +initialize(int& argc, char**& argv) +{ +#if defined(ROCPROFSYS_USE_MPI) + if(!is_initialized()) + { + using namespace threading; + bool _success_v = false; + if(use_mpi_thread()) + { + auto _init = [&argc, &argv](int itr, const std::string& _type) { + int _actual = -1; + auto ret = MPI_Init_thread(&argc, &argv, itr, &_actual); + if(_actual != itr) + { + fprintf(stderr, "Warning! MPI_Init_thread does not support: %s\n", + _type.c_str()); + } + return ROCPROFSYS_MPI_ERROR_CHECK(ret); + }; + + // ROCPROFSYS_MPI_ERROR_CHECK(MPI_Init(&argc, &argv)); + // int _provided = 0; + // MPI_Query_thread(&_provided); + + auto _mpi_type = use_mpi_thread_type(); + if(_mpi_type == "single") + { + _success_v = _init(single, _mpi_type); + } + else if(_mpi_type == "serialized") + { + _success_v = _init(serialized, _mpi_type); + } + else if(_mpi_type == "funneled") + { + _success_v = _init(funneled, _mpi_type); + } + else if(_mpi_type == "multiple") + { + _success_v = _init(multiple, _mpi_type); + } + else + { + _success_v = _init(multiple, "multiple"); + } + } + + if(!_success_v) ROCPROFSYS_MPI_ERROR_CHECK(MPI_Init(&argc, &argv)); + } +#else + tim::consume_parameters(argc, argv); +#endif +} + +//--------------------------------------------------------------------------------------// + +inline void +initialize(int* argc, char*** argv) +{ + initialize(*argc, *argv); +} + +//--------------------------------------------------------------------------------------// + +inline void +finalize() +{ +#if defined(ROCPROFSYS_USE_MPI) + if(is_initialized()) + { + // barrier(); + MPI_Finalize(); + is_finalized() = true; + // finalized + } +#endif +} + +//--------------------------------------------------------------------------------------// + +#if defined(ROCPROFSYS_USE_MPI) + +int32_t +rank(comm_t comm) +{ + int32_t _rank = 0; + if(is_initialized()) + { + // this is used to guard against the queries that might happen after an + // application calls MPI_Finalize() directly + static communicator_map_t* _instance = new communicator_map_t(); + if(_instance->find(comm) == _instance->end()) + { + PMPI_Comm_rank(comm, &_rank); + (*_instance)[comm] = _rank; + } + else + { + _rank = (*_instance)[comm]; + } + } + return std::max(_rank, (int32_t) 0); +} + +int32_t +size(comm_t comm) +{ + int32_t _size = 1; + if(is_initialized()) + { + // this is used to guard against the queries that might happen after an + // application calls MPI_Finalize() directly + static communicator_map_t* _instance = new communicator_map_t(); + if(_instance->find(comm) == _instance->end()) + { + PMPI_Comm_size(comm, &_size); + (*_instance)[comm] = _size; + } + else + { + _size = (*_instance)[comm]; + } + } + return std::max(_size, (int32_t) 1); +} + +void set_rank(int32_t, comm_t) {} +void set_size(int32_t, comm_t) {} + +#else + +struct comm_data +{ + using entry_t = std::array; + + static int32_t rank(comm_t _comm) { return std::max(m_data()[_comm][0], 0); } + static int32_t size(comm_t _comm) { return std::max(m_data()[_comm][1], 1); } + + friend void set_rank(int32_t, comm_t); + friend void set_size(int32_t, comm_t); + +private: + static std::map& m_data() + { + static std::map _v = { { 0, entry_t{ 0, 1 } } }; + return _v; + } +}; + +int32_t +rank(comm_t comm) +{ + return comm_data::rank(comm); +} + +int32_t +size(comm_t comm) +{ + return comm_data::size(comm); +} + +void +set_rank(int32_t _rank, comm_t comm) +{ + comm_data::m_data()[comm][0] = _rank; +} + +void +set_size(int32_t _size, comm_t comm) +{ + comm_data::m_data()[comm][1] = _size; +} + +#endif + +//--------------------------------------------------------------------------------------// + +inline void +barrier(comm_t comm) +{ +#if defined(ROCPROFSYS_USE_MPI) + if(is_initialized()) PMPI_Barrier(comm); +#else + tim::consume_parameters(comm); +#endif +} + +//--------------------------------------------------------------------------------------// + +inline void +comm_split(comm_t comm, int split_size, int rank, comm_t* local_comm) +{ +#if defined(ROCPROFSYS_USE_MPI) + if(is_initialized()) + ROCPROFSYS_MPI_ERROR_CHECK(PMPI_Comm_split(comm, split_size, rank, local_comm)); +#else + tim::consume_parameters(comm, split_size, rank, local_comm); +#endif +} + +//--------------------------------------------------------------------------------------// + +inline void +comm_split_type(comm_t comm, int split_size, int key, info_t info, comm_t* local_comm) +{ +#if defined(ROCPROFSYS_USE_MPI) + if(is_initialized()) + { + ROCPROFSYS_MPI_ERROR_CHECK( + PMPI_Comm_split_type(comm, split_size, key, info, local_comm)); + } +#else + tim::consume_parameters(comm, split_size, key, info, local_comm); +#endif +} + +//--------------------------------------------------------------------------------------// +/// returns the communicator for the node +inline comm_t +get_node_comm() +{ + if(!is_initialized()) return comm_world_v; + auto _get_node_comm = []() { + comm_t local_comm; + comm_split_type(mpi::comm_world_v, mpi::comm_type_shared_v, 0, mpi::info_null_v, + &local_comm); + return local_comm; + }; + static comm_t _instance = _get_node_comm(); + return _instance; +} + +//--------------------------------------------------------------------------------------// +/// returns the number of ranks on a node +inline int32_t +get_num_ranks_per_node() +{ + if(!is_initialized()) return 1; + return size(get_node_comm()); +} + +//--------------------------------------------------------------------------------------// + +inline int32_t +get_num_nodes() +{ + if(!is_initialized()) return 1; + auto _world_size = size(comm_world_v); + auto _ncomm_size = get_num_ranks_per_node(); + return (_world_size >= _ncomm_size) ? (_world_size / _ncomm_size) : 1; +} + +//--------------------------------------------------------------------------------------// + +inline int32_t +get_node_index() +{ + if(!is_initialized()) return 0; + return rank() / get_num_ranks_per_node(); +} + +//--------------------------------------------------------------------------------------// + +inline void +send(const std::string& str, int dest, int tag, comm_t comm = mpi::comm_world_v) +{ +#if defined(ROCPROFSYS_USE_MPI) + using ulli_t = unsigned long long; + ulli_t len = str.size(); + ROCPROFSYS_MPI_ERROR_CHECK( + PMPI_Send(&len, 1, MPI_UNSIGNED_LONG_LONG, dest, tag, comm)); + if(len != 0) + { + ulli_t _cmax = std::numeric_limits::max(); + if(len <= _cmax) + { + ROCPROFSYS_MPI_ERROR_CHECK( + PMPI_Send(const_cast(str.data()), len, MPI_CHAR, dest, tag, comm)); + } + else + { + auto _len = str.length() / sizeof(long); + auto _rem = str.length() % sizeof(long); + auto _str = str; + if(_rem > 0) + { + _str.resize(_str.length() + _rem, '\0'); + _len += 1; + } + ROCPROFSYS_MPI_ERROR_CHECK(PMPI_Send(const_cast(_str.data()), _len, + MPI_LONG, dest, tag, comm)); + } + } +#else + tim::consume_parameters(str, dest, tag, comm); +#endif +} + +//--------------------------------------------------------------------------------------// + +inline void +recv(std::string& str, int src, int tag, comm_t comm = mpi::comm_world_v) +{ +#if defined(ROCPROFSYS_USE_MPI) + using ulli_t = unsigned long long; + ulli_t len = 0; + MPI_Status s; + ROCPROFSYS_MPI_ERROR_CHECK( + PMPI_Recv(&len, 1, MPI_UNSIGNED_LONG_LONG, src, tag, comm, &s)); + if(len != 0) + { + ulli_t _cmax = std::numeric_limits::max(); + if(len <= _cmax) + { + std::vector tmp(len); + ROCPROFSYS_MPI_ERROR_CHECK( + PMPI_Recv(tmp.data(), len, MPI_CHAR, src, tag, comm, &s)); + str.assign(tmp.begin(), tmp.end()); + } + else + { + auto _len = len / sizeof(long); + auto _rem = len % sizeof(long); + if(_rem > 0) _len += 1; + std::vector tmp(_len); + ROCPROFSYS_MPI_ERROR_CHECK( + PMPI_Recv(tmp.data(), _len, MPI_LONG, src, tag, comm, &s)); + std::vector chars = {}; + auto _ratio = sizeof(long) / sizeof(char); + chars.reserve(_len * _ratio); + for(auto& itr : tmp) + { + for(size_t i = 0; i < _ratio; ++i) + { + chars.emplace_back(itr >> (i * sizeof(void*))); + if(chars.size() == len) break; + } + } + str.assign(chars.begin(), chars.end()); + } + } + else + { + str.clear(); + } +#else + tim::consume_parameters(str, src, tag, comm); +#endif +} + +//--------------------------------------------------------------------------------------// + +inline void +gather(const void* sendbuf, int sendcount, data_type_t sendtype, void* recvbuf, + int recvcount, data_type_t recvtype, int root, comm_t comm = mpi::comm_world_v) +{ +#if defined(ROCPROFSYS_USE_MPI) + if(is_initialized()) + { + ROCPROFSYS_MPI_ERROR_CHECK(PMPI_Gather(sendbuf, sendcount, sendtype, recvbuf, + recvcount, recvtype, root, comm)); + } +#else + tim::consume_parameters(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, + root, comm); +#endif +} + +//--------------------------------------------------------------------------------------// + +inline void +comm_spawn_multiple(int count, char** commands, char*** argv, const int* maxprocs, + const info_t* info, int root, comm_t comm, comm_t* intercomm, + int* errcodes) +{ +#if defined(ROCPROFSYS_USE_MPI) + if(is_initialized()) + { + ROCPROFSYS_MPI_ERROR_CHECK(PMPI_Comm_spawn_multiple( + count, commands, argv, maxprocs, info, root, comm, intercomm, errcodes)); + } +#else + tim::consume_parameters(count, commands, argv, maxprocs, info, root, comm, intercomm, + errcodes); +#endif +} + +//--------------------------------------------------------------------------------------// + +} // namespace mpi +} // namespace rocprofsys + +#if defined(ROCPROFSYS_UNDEFINE_OMPI_SKIP_MPICXX) && ROCPROFSYS_UNDEFINE_OMPI_SKIP_MPICXX +# undef OMPI_SKIP_MPICXX +#endif diff --git a/source/lib/core/perfetto.cpp b/source/lib/core/perfetto.cpp index 41b0c71a4..50f366721 100644 --- a/source/lib/core/perfetto.cpp +++ b/source/lib/core/perfetto.cpp @@ -209,7 +209,7 @@ post_process(tim::manager* _timemory_manager, bool& _perfetto_output_error) }; auto trace_data = char_vec_t{}; -#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0 +#if defined(ROCPROFSYS_USE_MPI) && ROCPROFSYS_USE_MPI > 0 if(get_perfetto_combined_traces()) { using perfetto_mpi_get_t = tim::operation::finalize::mpi_get; diff --git a/source/lib/core/rocprofiler-sdk.cpp b/source/lib/core/rocprofiler-sdk.cpp index 8c5764d84..e39ca2902 100644 --- a/source/lib/core/rocprofiler-sdk.cpp +++ b/source/lib/core/rocprofiler-sdk.cpp @@ -218,6 +218,30 @@ get_operations_impl(const std::unordered_set& _complete, } // namespace +/// @brief Return the version of the rocprofiler-sdk +/// @return The version of the rocprofiler-sdk or 0 if not initialized +version_info& +get_version() +{ + static auto _version = version_info{ 0 }; + + if(_version.formatted == 0) + { + uint32_t _major = 0; + uint32_t _minor = 0; + uint32_t _patch = 0; + + ROCPROFILER_CALL(rocprofiler_get_version(&_major, &_minor, &_patch)); + + _version.major = _major; + _version.minor = _minor; + _version.patch = _patch; + _version.formatted = _major * 10000 + _minor * 100 + _patch; + } + + return _version; +} + void config_settings(const std::shared_ptr& _config) { @@ -319,6 +343,7 @@ config_settings(const std::shared_ptr& _config) join::join(join::array_config{ ", ", "", "" }, _domain_choices)); auto _domain_defaults = std::string{ "hip_runtime_api,marker_api,kernel_dispatch," "memory_copy,scratch_memory" }; + # if(ROCPROFILER_VERSION < 10000) _domain_defaults.append(",page_migration"); # endif @@ -353,21 +378,35 @@ std::unordered_set get_callback_domains() { const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names(); - const auto supported = std::unordered_set - { + auto supported = std::unordered_set{ ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API, - ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, - ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, - ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, - ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, + ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + }; + + auto _version = get_version(); + ROCPROFSYS_WARNING_IF(_version.formatted == 0, + "Warning! rocprofiler-sdk version not initialized\n"); + +# if(ROCPROFILER_VERSION >= 600) + if(_version.formatted >= 600) + { + // Argument tracing is supported in rocprofiler-sdk 0.6.0 and later + supported.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API); + supported.emplace(ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API); + } +# endif # if(ROCPROFILER_VERSION >= 700) - ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API, - ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API, + if(_version.formatted >= 700) + { + supported.emplace(ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API); + } # endif - }; auto _data = std::unordered_set{}; auto _domains = @@ -375,6 +414,12 @@ get_callback_domains() .value_or(std::string{}), " ,;:\t\n"); + if(config::get_use_rcclp() && _version.formatted >= 600) + { + // Translate ROCPROFSYS_USE_RCCLP to entry in ROCPROFSYS_ROCM_DOMAINS + _data.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API); + } + const auto valid_choices = settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices(); @@ -582,6 +627,12 @@ void config_settings(const std::shared_ptr&) {} +version_info& +get_version() +{ + static auto _version = version_info{ 0 }; + return _version; +} } // namespace rocprofiler_sdk } // namespace rocprofsys diff --git a/source/lib/core/rocprofiler-sdk.hpp b/source/lib/core/rocprofiler-sdk.hpp index 5ceee14e2..b70c1af63 100644 --- a/source/lib/core/rocprofiler-sdk.hpp +++ b/source/lib/core/rocprofiler-sdk.hpp @@ -39,9 +39,20 @@ namespace rocprofsys { namespace rocprofiler_sdk { +struct version_info +{ + uint32_t major = 0; + uint32_t minor = 0; + uint32_t patch = 0; + uint32_t formatted = 0; // major * 10000 + minor * 100 + patch +}; + void config_settings(const std::shared_ptr&); +version_info& +get_version(); + #if defined(ROCPROFSYS_USE_ROCM) std::unordered_set diff --git a/source/lib/core/timemory.hpp b/source/lib/core/timemory.hpp index 92363bdb2..0d35fe0d3 100644 --- a/source/lib/core/timemory.hpp +++ b/source/lib/core/timemory.hpp @@ -27,11 +27,9 @@ #include "defines.hpp" #include -#include #include #include #include -#include #include #include #include @@ -40,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h index 5357469cf..208411303 100644 --- a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h +++ b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h @@ -54,6 +54,7 @@ extern "C" ROCPROFSYS_CATEGORY_ROCM_MARKER_API, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API, + ROCPROFSYS_CATEGORY_ROCM_RCCL_API, ROCPROFSYS_CATEGORY_AMD_SMI, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC, diff --git a/source/lib/rocprof-sys/api.cpp b/source/lib/rocprof-sys/api.cpp index bda609ef6..ad2369ca5 100644 --- a/source/lib/rocprof-sys/api.cpp +++ b/source/lib/rocprof-sys/api.cpp @@ -122,7 +122,7 @@ rocprofsys_init_library(void) extern "C" void rocprofsys_init_tooling(void) { - rocprofsys_init_tooling_hidden(true); + rocprofsys_init_tooling_hidden(); } extern "C" void diff --git a/source/lib/rocprof-sys/api.hpp b/source/lib/rocprof-sys/api.hpp index 55dfbbdd0..478f96ec5 100644 --- a/source/lib/rocprof-sys/api.hpp +++ b/source/lib/rocprof-sys/api.hpp @@ -95,7 +95,7 @@ extern "C" // these are the real implementations for internal calling convention void rocprofsys_init_library_hidden(void) ROCPROFSYS_HIDDEN_API; - bool rocprofsys_init_tooling_hidden(bool postinit = false) ROCPROFSYS_HIDDEN_API; + bool rocprofsys_init_tooling_hidden(void) ROCPROFSYS_HIDDEN_API; void rocprofsys_init_hidden(const char*, bool, const char*) ROCPROFSYS_HIDDEN_API; void rocprofsys_finalize_hidden(void) ROCPROFSYS_HIDDEN_API; void rocprofsys_reset_preload_hidden(void) ROCPROFSYS_HIDDEN_API; diff --git a/source/lib/rocprof-sys/library.cpp b/source/lib/rocprof-sys/library.cpp index e90594264..ca6e4f692 100644 --- a/source/lib/rocprof-sys/library.cpp +++ b/source/lib/rocprof-sys/library.cpp @@ -53,7 +53,6 @@ #include "library/ompt.hpp" #include "library/process_sampler.hpp" #include "library/ptl.hpp" -#include "library/rcclp.hpp" #include "library/rocprofiler-sdk.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" @@ -404,42 +403,16 @@ rocprofsys_init_library_hidden() ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n"); } -// Initialize RCCL if: -// - postinit=true - so the code doesn't hang at the initialization stage -// - get_state() >= State::Init - so the code doesn't throw an exception -// - rccl_initialized=false - so we don't try to initialize RCCL twice -// - get_use_rcclp()=true - only if the environment is configured to use RCCL -static void -rccl_setup(bool postinit) -{ - // Flag used to avoid initializing RCCL twice - static bool rccl_initialized = false; - - if(postinit && (get_state() >= State::Init) && !rccl_initialized && get_use_rcclp()) - { - ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n"); - rcclp::setup(); - rccl_initialized = true; - } -} - -static void -rocprofsys_init_library_hidden_with_rccl(bool postinit) -{ - rocprofsys_init_library_hidden(); - rccl_setup(postinit); -} - //======================================================================================// extern "C" bool -rocprofsys_init_tooling_hidden(bool postinit) +rocprofsys_init_tooling_hidden(void) { if(get_env("ROCPROFSYS_MONOCHROME", false, false)) tim::log::monochrome() = true; if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true)) { - rocprofsys_init_library_hidden_with_rccl(postinit); + rocprofsys_init_library_hidden(); return false; } @@ -458,7 +431,6 @@ rocprofsys_init_tooling_hidden(bool postinit) if(get_state() != State::PreInit || get_state() == State::Init || _once) { - rccl_setup(postinit); return false; } _once = true; @@ -481,7 +453,7 @@ rocprofsys_init_tooling_hidden(bool postinit) ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "Calling rocprofsys_init_library()...\n"); - rocprofsys_init_library_hidden_with_rccl(postinit); + rocprofsys_init_library_hidden(); ROCPROFSYS_DEBUG_F("\n"); @@ -807,12 +779,6 @@ rocprofsys_finalize_hidden(void) component::vaapi_gotcha::shutdown(); } - if(get_use_rcclp()) - { - ROCPROFSYS_VERBOSE_F(1, "Shutting down RCCLP...\n"); - rcclp::shutdown(); - } - if(get_use_ompt()) { ROCPROFSYS_VERBOSE_F(1, "Shutting down OMPT...\n"); diff --git a/source/lib/rocprof-sys/library/CMakeLists.txt b/source/lib/rocprof-sys/library/CMakeLists.txt index 104ee1fa6..17b06c63d 100644 --- a/source/lib/rocprof-sys/library/CMakeLists.txt +++ b/source/lib/rocprof-sys/library/CMakeLists.txt @@ -20,7 +20,6 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/process_sampler.hpp ${CMAKE_CURRENT_LIST_DIR}/perf.hpp ${CMAKE_CURRENT_LIST_DIR}/ptl.hpp - ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/rocm.hpp ${CMAKE_CURRENT_LIST_DIR}/amd_smi.hpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp @@ -34,11 +33,6 @@ set(library_headers target_sources(rocprofiler-systems-object-library PRIVATE ${library_sources} ${library_headers}) -if(ROCPROFSYS_USE_RCCL) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) -endif() - if(ROCPROFSYS_USE_ROCM) target_sources( rocprofiler-systems-object-library @@ -56,7 +50,6 @@ add_subdirectory(tracing) set(ndebug_sources ${CMAKE_CURRENT_LIST_DIR}/components/mpi_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/components/backtrace_metrics.cpp - ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp ${CMAKE_CURRENT_LIST_DIR}/kokkosp.cpp ${CMAKE_CURRENT_LIST_DIR}/amd_smi.cpp ${CMAKE_CURRENT_LIST_DIR}/ompt.cpp) diff --git a/source/lib/rocprof-sys/library/amd_smi.cpp b/source/lib/rocprof-sys/library/amd_smi.cpp index c539eb7f7..f840f8972 100644 --- a/source/lib/rocprof-sys/library/amd_smi.cpp +++ b/source/lib/rocprof-sys/library/amd_smi.cpp @@ -463,9 +463,8 @@ setup() } amdsmi_version_t _version = get_version(); - ROCPROFSYS_VERBOSE_F(0, "AMD SMI version: %u.%u.%u.%u - str: %s.\n", _version.year, - _version.major, _version.minor, _version.release, - _version.build); + ROCPROFSYS_VERBOSE_F(0, "AMD SMI version: %u.%u.%u - str: %s.\n", _version.major, + _version.minor, _version.release, _version.build); data::device_count = gpu::get_processor_count(); diff --git a/source/lib/rocprof-sys/library/components/CMakeLists.txt b/source/lib/rocprof-sys/library/components/CMakeLists.txt index 3ccd1f981..15940da05 100644 --- a/source/lib/rocprof-sys/library/components/CMakeLists.txt +++ b/source/lib/rocprof-sys/library/components/CMakeLists.txt @@ -26,18 +26,13 @@ set(component_headers ${CMAKE_CURRENT_LIST_DIR}/ensure_storage.hpp ${CMAKE_CURRENT_LIST_DIR}/exit_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/fork_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/mpip.hpp ${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/numa_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/vaapi_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp) target_sources(rocprofiler-systems-object-library PRIVATE ${component_sources} ${component_headers}) - -if(ROCPROFSYS_USE_RCCL) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) -endif() diff --git a/source/lib/rocprof-sys/library/components/comm_data.cpp b/source/lib/rocprof-sys/library/components/comm_data.cpp index f733f53f0..50b5b437e 100644 --- a/source/lib/rocprof-sys/library/components/comm_data.cpp +++ b/source/lib/rocprof-sys/library/components/comm_data.cpp @@ -298,6 +298,8 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sen #endif #if defined(ROCPROFSYS_USE_RCCL) +// Kept for reference, but now gathered throught the SDK callbacks. + // ncclReduce void comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*, @@ -403,6 +405,7 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const v } // ncclAllGather +// ncclAllToAll void comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*, size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t) diff --git a/source/lib/rocprof-sys/library/components/comm_data.hpp b/source/lib/rocprof-sys/library/components/comm_data.hpp index 037358623..93dfac752 100644 --- a/source/lib/rocprof-sys/library/components/comm_data.hpp +++ b/source/lib/rocprof-sys/library/components/comm_data.hpp @@ -26,7 +26,6 @@ #include "core/common.hpp" #include "core/components/fwd.hpp" #include "core/defines.hpp" -#include "core/rccl.hpp" #include "core/timemory.hpp" #include "library/components/category_region.hpp" @@ -78,18 +77,6 @@ struct comm_data : base static constexpr auto label = "MPI Comm Send"; }; - struct rccl_recv - { - static constexpr auto value = "comm_data"; - static constexpr auto label = "RCCL Comm Recv"; - }; - - struct rccl_send - { - static constexpr auto value = "comm_data"; - static constexpr auto label = "RCCL Comm Send"; - }; - ROCPROFSYS_DEFAULT_OBJECT(comm_data) static void preinit(); @@ -148,55 +135,6 @@ struct comm_data : base MPI_Datatype recvtype, MPI_Comm); #endif -#if defined(ROCPROFSYS_USE_RCCL) - static auto rccl_type_size(ncclDataType_t datatype) - { - switch(datatype) - { - case ncclInt8: - case ncclUint8: return 1; - case ncclFloat16: return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: return 8; - default: return 0; - }; - } - - // ncclReduce - static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, - size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, - ncclComm_t, hipStream_t); - - // ncclSend - // ncclGather - // ncclBcast - // ncclRecv - static void audit(const gotcha_data& _data, audit::incoming, const void*, - size_t count, ncclDataType_t datatype, int peer, ncclComm_t, - hipStream_t); - - // ncclBroadcast - static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, - size_t count, ncclDataType_t datatype, int root, ncclComm_t, - hipStream_t); - - // ncclAllReduce - // ncclReduceScatter - static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, - size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, - hipStream_t); - - // ncclAllGather - // ncclAlltoAll - static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, - size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t); - -#endif - private: static auto& add(tracker_t& _t, data_type value) { diff --git a/source/lib/rocprof-sys/library/components/mpi_gotcha.cpp b/source/lib/rocprof-sys/library/components/mpi_gotcha.cpp index 94d694ced..bd65c608d 100644 --- a/source/lib/rocprof-sys/library/components/mpi_gotcha.cpp +++ b/source/lib/rocprof-sys/library/components/mpi_gotcha.cpp @@ -25,11 +25,12 @@ #include "core/components/fwd.hpp" #include "core/config.hpp" #include "core/debug.hpp" +#include "core/mpi.hpp" #include "core/mproc.hpp" #include "library/components/category_region.hpp" #include "library/components/comm_data.hpp" +#include "mpip.hpp" -#include #include #include #include @@ -46,8 +47,7 @@ namespace component { namespace { -using mpip_bundle_t = - tim::component_tuple, comp::comm_data>; +using mpip_bundle_t = tim::component_tuple, comm_data>; struct comm_rank_data { @@ -102,7 +102,7 @@ auto mpi_comm_records = std::map{}; using tim::auto_lock_t; using tim::type_mutex; -#if defined(TIMEMORY_USE_MPI) +#if defined(ROCPROFSYS_USE_MPI) int rocprofsys_mpi_copy(MPI_Comm, int, void*, void*, void*, int*) { @@ -117,7 +117,7 @@ rocprofsys_mpi_fini(MPI_Comm, int, void*, void*) if(!_blocked.empty()) tim::signals::block_signals(_blocked, tim::signals::sigmask_scope::process); if(mpip_index != std::numeric_limits::max()) - comp::deactivate_mpip(mpip_index); + deactivate_mpip(mpip_index); if(is_root_process()) rocprofsys_finalize_hidden(); return MPI_SUCCESS; } @@ -127,7 +127,7 @@ rocprofsys_mpi_fini(MPI_Comm, int, void*, void*) void rocprofsys_mpi_set_attr() { -#if defined(TIMEMORY_USE_MPI) +#if defined(ROCPROFSYS_USE_MPI) auto _blocked = get_sampling_signals(); if(!_blocked.empty()) tim::signals::block_signals(_blocked, tim::signals::sigmask_scope::process); @@ -162,17 +162,28 @@ mpi_gotcha::configure() mpi_gotcha_t::get_initializer() = []() { mpi_gotcha_t::template configure<0, int, int*, char***>("MPI_Init"); - mpi_gotcha_t::template configure<1, int, int*, char***, int, int*>( + mpi_gotcha_t::template configure<1, int, int*, char***>("PMPI_Init"); + mpi_gotcha_t::template configure<2, int, int*, char***, int, int*>( "MPI_Init_thread"); - mpi_gotcha_t::template configure<2, int>("MPI_Finalize"); + mpi_gotcha_t::template configure<3, int, int*, char***, int, int*>( + "PMPI_Init_thread"); + mpi_gotcha_t::template configure<4, int>("MPI_Finalize"); + mpi_gotcha_t::template configure<5, int>("PMPI_Finalize"); reject_bindings.emplace("MPI_Init"); + reject_bindings.emplace("PMPI_Init"); reject_bindings.emplace("MPI_Init_thread"); + reject_bindings.emplace("PMPI_Init_thread"); reject_bindings.emplace("MPI_Finalize"); + reject_bindings.emplace("PMPI_Finalize"); #if defined(ROCPROFSYS_USE_MPI_HEADERS) && ROCPROFSYS_USE_MPI_HEADERS > 0 - mpi_gotcha_t::template configure<3, int, comm_t, int*>("MPI_Comm_rank"); - mpi_gotcha_t::template configure<4, int, comm_t, int*>("MPI_Comm_size"); + mpi_gotcha_t::template configure<6, int, comm_t, int*>("MPI_Comm_rank"); + mpi_gotcha_t::template configure<7, int, comm_t, int*>("PMPI_Comm_rank"); + mpi_gotcha_t::template configure<8, int, comm_t, int*>("MPI_Comm_size"); + mpi_gotcha_t::template configure<9, int, comm_t, int*>("PMPI_Comm_size"); reject_bindings.emplace("MPI_Comm_rank"); + reject_bindings.emplace("PMPI_Comm_rank"); reject_bindings.emplace("MPI_Comm_size"); + reject_bindings.emplace("PMPI_Comm_size"); #endif }; } @@ -207,13 +218,13 @@ mpi_gotcha::update() auto _rank = _rank_data.rank; auto _size = _rank_data.size; - tim::mpi::set_rank(_rank); - tim::mpi::set_size(_size); - tim::settings::default_process_suffix() = _rank; + rocprofsys::mpi::set_rank(_rank); + rocprofsys::mpi::set_size(_size); + rocprofsys::settings::default_process_suffix() = _rank; ROCPROFSYS_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i), MPI size: %i (%i)\n", - process::get_id(), tim::mpi::rank(), _rank, - tim::mpi::size(), _size); + process::get_id(), rocprofsys::mpi::rank(), _rank, + rocprofsys::mpi::size(), _size); last_comm_record = _rank_data; config::get_use_pid() = true; return true; @@ -236,9 +247,9 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***) ROCPROFSYS_BASIC_DEBUG_F("%s(int*, char***)\n", _data.tool_id.c_str()); rocprofsys_push_trace_hidden(_data.tool_id.c_str()); -#if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) - tim::mpi::is_initialized_callback() = []() { return true; }; - tim::mpi::is_finalized() = false; +#if !defined(ROCPROFSYS_USE_MPI) && defined(ROCPROFSYS_USE_MPI_HEADERS) + rocprofsys::mpi::is_initialized_callback() = []() { return true; }; + rocprofsys::mpi::is_finalized() = false; #endif } @@ -248,9 +259,9 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, in ROCPROFSYS_BASIC_DEBUG_F("%s(int*, char***, int, int*)\n", _data.tool_id.c_str()); rocprofsys_push_trace_hidden(_data.tool_id.c_str()); -#if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) - tim::mpi::is_initialized_callback() = []() { return true; }; - tim::mpi::is_finalized() = false; +#if !defined(ROCPROFSYS_USE_MPI) && defined(ROCPROFSYS_USE_MPI_HEADERS) + rocprofsys::mpi::is_initialized_callback() = []() { return true; }; + rocprofsys::mpi::is_finalized() = false; #endif } @@ -264,11 +275,11 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming) tim::signals::block_signals(_blocked, tim::signals::sigmask_scope::process); if(mpip_index != std::numeric_limits::max()) - comp::deactivate_mpip(mpip_index); + deactivate_mpip(mpip_index); -#if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) - tim::mpi::is_initialized_callback() = []() { return false; }; - tim::mpi::is_finalized() = true; +#if !defined(ROCPROFSYS_USE_MPI) && defined(ROCPROFSYS_USE_MPI_HEADERS) + rocprofsys::mpi::is_initialized_callback() = []() { return false; }; + rocprofsys::mpi::is_finalized() = true; #else if(is_root_process() && rocprofsys::get_state() < rocprofsys::State::Finalized) rocprofsys_finalize_hidden(); @@ -278,15 +289,17 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming) void mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, comm_t _comm, int* _val) { - ROCPROFSYS_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str()); + ROCPROFSYS_BASIC_DEBUG_F("%s(comm_t _comm, int* _val)\n", _data.tool_id.c_str()); rocprofsys_push_trace_hidden(_data.tool_id.c_str()); - if(_data.tool_id == "MPI_Comm_rank") + if(_data.tool_id.find("MPI_Comm_rank") == 0 || + _data.tool_id.find("PMPI_Comm_rank") == 0) { m_comm_val = (uintptr_t) _comm; // NOLINT m_rank_ptr = _val; } - else if(_data.tool_id == "MPI_Comm_size") + else if(_data.tool_id.find("MPI_Comm_size") == 0 || + _data.tool_id.find("PMPI_Comm_size") == 0) { m_comm_val = (uintptr_t) _comm; // NOLINT m_size_ptr = _val; @@ -305,7 +318,8 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) if(!settings::use_output_suffix()) settings::use_output_suffix() = true; - if(_retval == tim::mpi::success_v && _data.tool_id.find("MPI_Init") == 0) + if(_retval == rocprofsys::mpi::success_v && + (_data.tool_id.find("MPI_Init") == 0 || _data.tool_id.find("PMPI_Init") == 0)) { rocprofsys_mpi_set_attr(); // rocprof-sys will set this environement variable to true in binary rewrite mode @@ -319,9 +333,9 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) // use env vars ROCPROFSYS_MPIP_PERMIT_LIST and ROCPROFSYS_MPIP_REJECT_LIST // to control the gotcha bindings at runtime - comp::configure_mpip(permit_bindings, - reject_bindings); - mpip_index = comp::activate_mpip(); + configure_mpip(permit_bindings, + reject_bindings); + mpip_index = activate_mpip(); } auto_lock_t _lk{ type_mutex() }; @@ -339,7 +353,9 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) } } } - else if(_retval == tim::mpi::success_v && _data.tool_id.find("MPI_Comm_") == 0) + else if(_retval == rocprofsys::mpi::success_v && + (_data.tool_id.find("MPI_Comm_") == 0 || + _data.tool_id.find("PMPI_Comm_") == 0)) { auto_lock_t _lk{ type_mutex() }; if(m_comm_val != null_comm()) @@ -356,7 +372,8 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) : std::max(m_size, _get_rank() + 1); }; - if(_data.tool_id == "MPI_Comm_rank" || _data.tool_id == "MPI_Comm_size") + if(_data.tool_id == "MPI_Comm_rank" || _data.tool_id == "MPI_Comm_size" || + _data.tool_id == "PMPI_Comm_rank" || _data.tool_id == "PMPI_Comm_size") { _comm_entry.rank = m_rank = std::max(_comm_entry.rank, _get_rank()); _comm_entry.size = m_size = std::max(_comm_entry.size, _get_size()); diff --git a/source/lib/rocprof-sys/library/components/mpi_gotcha.hpp b/source/lib/rocprof-sys/library/components/mpi_gotcha.hpp index cd0bac0ee..b4147292f 100644 --- a/source/lib/rocprof-sys/library/components/mpi_gotcha.hpp +++ b/source/lib/rocprof-sys/library/components/mpi_gotcha.hpp @@ -24,6 +24,7 @@ #include "core/common.hpp" #include "core/defines.hpp" +#include "core/mpi.hpp" #include "core/timemory.hpp" #include @@ -35,7 +36,7 @@ namespace component // this is used to wrap MPI_Init and MPI_Init_thread struct mpi_gotcha : comp::base { - using comm_t = tim::mpi::comm_t; + using comm_t = rocprofsys::mpi::comm_t; using gotcha_data_t = comp::gotcha_data; ROCPROFSYS_DEFAULT_OBJECT(mpi_gotcha) @@ -81,5 +82,5 @@ struct mpi_gotcha : comp::base } // namespace component using mpi_gotcha_t = - comp::gotcha<5, tim::component_tuple, project::rocprofsys>; + comp::gotcha<10, tim::component_tuple, project::rocprofsys>; } // namespace rocprofsys diff --git a/source/lib/rocprof-sys/library/components/mpip.hpp b/source/lib/rocprof-sys/library/components/mpip.hpp new file mode 100644 index 000000000..1c5e35991 --- /dev/null +++ b/source/lib/rocprof-sys/library/components/mpip.hpp @@ -0,0 +1,779 @@ +// MIT License +// +// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "core/timemory.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if !defined(ROCPROFSYS_USE_MPI) && defined(ROCPROFSYS_USE_MPI_HEADERS) && \ + !defined(OMPI_SKIP_MPICXX) +# define ROCPROFSYS_UNDEFINE_OMPI_SKIP_MPICXX 1 +# define OMPI_SKIP_MPICXX 1 +#endif + +#if defined(ROCPROFSYS_USE_MPI) || defined(ROCPROFSYS_USE_MPI_HEADERS) +# include +#endif + +#if !defined(NUM_ROCPROFSYS_MPIP_WRAPPERS) +# define NUM_ROCPROFSYS_MPIP_WRAPPERS 500 +#endif + +namespace rocprofsys +{ +namespace component +{ +// +//--------------------------------------------------------------------------------------// +// +template +TIMEMORY_VISIBILITY("default") +TIMEMORY_NOINLINE void configure_mpip(const std::set& permit = {}, + const std::set& reject = {}); +// +//--------------------------------------------------------------------------------------// +// +template +TIMEMORY_VISIBILITY("default") +TIMEMORY_NOINLINE uint64_t activate_mpip(); +// +//--------------------------------------------------------------------------------------// +// +template +TIMEMORY_VISIBILITY("default") +TIMEMORY_NOINLINE uint64_t deactivate_mpip(uint64_t); +// +//--------------------------------------------------------------------------------------// +// +template +struct mpip_handle : base, void> +{ + static constexpr size_t mpip_wrapper_count = NUM_ROCPROFSYS_MPIP_WRAPPERS; + + using value_type = void; + using this_type = mpip_handle; + using base_type = base; + + using mpi_toolset_t = Toolset; + using mpip_gotcha_t = tim::component::gotcha; + using mpip_tuple_t = tim::component_tuple; + using toolset_ptr_t = std::shared_ptr; + + static std::string label() { return "mpip_handle"; } + static std::string description() { return "Handle for activating MPI wrappers"; } + + void get() {} + + void start() + { + if(get_tool_count()++ == 0) + { + get_tool_instance() = std::make_shared("rocprofsys_mpip"); + get_tool_instance()->start(); + } + } + + void stop() + { + auto idx = --get_tool_count(); + if(get_tool_instance().get()) + { + get_tool_instance()->stop(); + if(idx == 0) get_tool_instance().reset(); + } + } + + int get_count() { return get_tool_count().load(); } + +private: + struct persistent_data + { + std::atomic m_configured; + std::atomic m_count; + toolset_ptr_t m_tool; + }; + + static persistent_data& get_persistent_data() + { + static persistent_data _instance; + return _instance; + } + + static std::atomic& get_configured() + { + return get_persistent_data().m_configured; + } + + static toolset_ptr_t& get_tool_instance() { return get_persistent_data().m_tool; } + + static std::atomic& get_tool_count() + { + return get_persistent_data().m_count; + } +}; +// +//======================================================================================// +// +} // namespace component +} // namespace rocprofsys +// +//======================================================================================// +// +#include +// +//======================================================================================// +// +/// \fn uint64_t rocprofsys::component::activate_mpip() +/// \brief The thread that first activates mpip will be the thread that turns it off. +/// Function returns the number of new mpip handles +/// +template +uint64_t +rocprofsys::component::activate_mpip() +{ + using handle_t = rocprofsys::component::mpip_handle; + + static std::shared_ptr _handle; + + if(!_handle.get()) + { + _handle = std::make_shared(); + _handle->start(); + + auto cleanup_functor = [=]() { + if(_handle) + { + _handle->stop(); + _handle.reset(); + } + }; + + static std::string _label = []() { + std::stringstream ss; + ss << "rocprofsys-mpip-" << demangle() << "-" << demangle(); + return ss.str(); + }(); + ROCPROFSYS_BASIC_DEBUG_F("Adding cleanup for %s", _label.c_str()); + tim::manager::instance()->add_cleanup(_label, cleanup_functor); + return 1; + } + return 0; +} +// +//======================================================================================// +// +/// \fn uint64_t rocprofsys::component::deactivate_mpip(uint64_t id) +/// \brief The thread that created the initial mpip handle will turn off. Returns +/// the number of handles active +/// +template +uint64_t +rocprofsys::component::deactivate_mpip(uint64_t id) +{ + if(id > 0) + { + static std::string _label = []() { + std::stringstream ss; + ss << "rocprofsys-mpip-" << demangle() << "-" << demangle(); + return ss.str(); + }(); + ROCPROFSYS_BASIC_DEBUG_F("Removing cleanup for %s", _label.c_str()); + tim::manager::instance()->cleanup(_label); + return 0; + } + return 1; +} +// +//======================================================================================// +// +#if !defined(TIMEMORY_USE_GOTCHA) || \ + (!defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS)) +// +template +void +rocprofsys::component::configure_mpip(const std::set&, + const std::set&) +{} +// +#else +// +template +void +rocprofsys::component::configure_mpip(const std::set& permit, + const std::set& reject) +{ + static constexpr size_t mpip_wrapper_count = NUM_ROCPROFSYS_MPIP_WRAPPERS; + static bool is_initialized = false; + + using mpip_gotcha_t = tim::component::gotcha; + + if(!is_initialized) + { + // generate the gotcha wrappers + mpip_gotcha_t::get_initializer() = []() { + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 0, MPI_Accumulate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 1, MPI_Add_error_class); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 2, MPI_Add_error_code); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 3, MPI_Add_error_string); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 4, MPI_Aint_add); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 5, MPI_Aint_diff); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 6, MPI_Allgather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 7, MPI_Allgatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 8, MPI_Alloc_mem); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 9, MPI_Allreduce); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 10, MPI_Alltoall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 11, MPI_Alltoallv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 12, MPI_Alltoallw); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 13, MPI_Barrier); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 14, MPI_Bcast); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 15, MPI_Bsend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 16, MPI_Bsend_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 17, MPI_Buffer_attach); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 18, MPI_Buffer_detach); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 19, MPI_Cancel); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 20, MPI_Cart_coords); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 21, MPI_Cart_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 22, MPI_Cart_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 23, MPI_Cart_map); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 24, MPI_Cart_rank); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 25, MPI_Cart_shift); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 26, MPI_Cart_sub); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 27, MPI_Cartdim_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 28, MPI_Close_port); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 29, MPI_Comm_accept); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 30, MPI_Comm_call_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 31, MPI_Comm_compare); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 32, MPI_Comm_connect); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 33, MPI_Comm_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 34, MPI_Comm_create_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 35, MPI_Comm_create_group); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 36, MPI_Comm_create_keyval); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 37, MPI_Comm_delete_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 38, MPI_Comm_disconnect); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 39, MPI_Comm_dup); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 40, MPI_Comm_dup_with_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 41, MPI_Comm_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 42, MPI_Comm_free_keyval); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 43, MPI_Comm_get_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 44, MPI_Comm_get_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 45, MPI_Comm_get_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 46, MPI_Comm_get_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 47, MPI_Comm_get_parent); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 48, MPI_Comm_group); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 49, MPI_Comm_idup); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 50, MPI_Comm_join); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 51, MPI_Comm_rank); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 52, MPI_Comm_remote_group); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 53, MPI_Comm_remote_size); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 54, MPI_Comm_set_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 55, MPI_Comm_set_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 56, MPI_Comm_set_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 57, MPI_Comm_set_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 58, MPI_Comm_size); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 59, MPI_Comm_spawn); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 60, MPI_Comm_spawn_multiple); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 61, MPI_Comm_split); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 62, MPI_Comm_split_type); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 63, MPI_Comm_test_inter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 64, MPI_Compare_and_swap); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 65, MPI_Dims_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 66, MPI_Dist_graph_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 67, MPI_Dist_graph_create_adjacent); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 68, MPI_Dist_graph_neighbors); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 69, MPI_Dist_graph_neighbors_count); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 70, MPI_Error_class); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 71, MPI_Error_string); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 72, MPI_Exscan); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 73, MPI_Fetch_and_op); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 74, MPI_File_call_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 75, MPI_File_create_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 76, MPI_File_get_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 77, MPI_File_set_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 78, MPI_Free_mem); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 79, MPI_Gather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 80, MPI_Gatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 81, MPI_Get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 82, MPI_Get_accumulate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 83, MPI_Get_address); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 84, MPI_Get_count); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 85, MPI_Get_elements); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 86, MPI_Get_elements_x); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 87, MPI_Get_library_version); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 88, MPI_Get_processor_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 89, MPI_Get_version); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 90, MPI_Graph_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 91, MPI_Graph_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 92, MPI_Graph_map); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 93, MPI_Graph_neighbors); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 94, MPI_Graph_neighbors_count); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 95, MPI_Graphdims_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 96, MPI_Grequest_complete); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 97, MPI_Grequest_start); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 98, MPI_Group_compare); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 99, MPI_Group_difference); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 100, MPI_Group_excl); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 101, MPI_Group_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 102, MPI_Group_incl); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 103, MPI_Group_intersection); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 104, MPI_Group_range_excl); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 105, MPI_Group_range_incl); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 106, MPI_Group_rank); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 107, MPI_Group_size); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 108, MPI_Group_translate_ranks); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 109, MPI_Group_union); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 110, MPI_Iallgather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 111, MPI_Iallgatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 112, MPI_Iallreduce); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 113, MPI_Ialltoall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 114, MPI_Ialltoallv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 115, MPI_Ialltoallw); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 116, MPI_Ibarrier); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 117, MPI_Ibcast); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 118, MPI_Ibsend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 119, MPI_Iexscan); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 120, MPI_Igather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 121, MPI_Igatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 122, MPI_Improbe); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 123, MPI_Imrecv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 124, MPI_Ineighbor_allgather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 125, MPI_Ineighbor_allgatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 126, MPI_Ineighbor_alltoall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 127, MPI_Ineighbor_alltoallv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 128, MPI_Ineighbor_alltoallw); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 129, MPI_Info_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 130, MPI_Info_delete); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 131, MPI_Info_dup); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 132, MPI_Info_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 133, MPI_Info_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 134, MPI_Info_get_nkeys); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 135, MPI_Info_get_nthkey); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 136, MPI_Info_get_valuelen); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 137, MPI_Info_set); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 138, MPI_Intercomm_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 139, MPI_Intercomm_merge); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 140, MPI_Iprobe); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 141, MPI_Irecv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 142, MPI_Ireduce); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 143, MPI_Ireduce_scatter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 144, MPI_Ireduce_scatter_block); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 145, MPI_Irsend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 146, MPI_Is_thread_main); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 147, MPI_Iscan); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 148, MPI_Iscatter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 149, MPI_Iscatterv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 150, MPI_Isend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 151, MPI_Issend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 152, MPI_Lookup_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 153, MPI_Mprobe); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 154, MPI_Mrecv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 155, MPI_Neighbor_allgather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 156, MPI_Neighbor_allgatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 157, MPI_Neighbor_alltoall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 158, MPI_Neighbor_alltoallv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 159, MPI_Neighbor_alltoallw); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 160, MPI_Op_commutative); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 161, MPI_Op_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 162, MPI_Op_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 163, MPI_Open_port); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 164, MPI_Pack); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 165, MPI_Pack_external); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 166, MPI_Pack_external_size); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 167, MPI_Pack_size); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 168, MPI_Probe); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 169, MPI_Publish_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 170, MPI_Put); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 171, MPI_Query_thread); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 172, MPI_Raccumulate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 173, MPI_Recv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 174, MPI_Recv_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 175, MPI_Reduce); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 176, MPI_Reduce_local); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 177, MPI_Reduce_scatter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 178, MPI_Reduce_scatter_block); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 179, MPI_Request_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 180, MPI_Request_get_status); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 181, MPI_Rget); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 182, MPI_Rget_accumulate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 183, MPI_Rput); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 184, MPI_Rsend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 185, MPI_Rsend_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 186, MPI_Scan); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 187, MPI_Scatter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 188, MPI_Scatterv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 189, MPI_Send); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 190, MPI_Send_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 191, MPI_Sendrecv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 192, MPI_Sendrecv_replace); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 193, MPI_Ssend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 194, MPI_Ssend_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 195, MPI_Start); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 196, MPI_Startall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 197, MPI_Status_f2c); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 198, MPI_Status_set_cancelled); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 199, MPI_Status_set_elements); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 200, MPI_Status_set_elements_x); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 201, MPI_Topo_test); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 202, MPI_Unpack); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 203, MPI_Unpack_external); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 204, MPI_Unpublish_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 205, MPI_Wait); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 206, MPI_Waitall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 207, MPI_Waitany); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 208, MPI_Waitsome); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 209, MPI_Win_allocate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 210, MPI_Win_allocate_shared); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 211, MPI_Win_attach); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 212, MPI_Win_call_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 213, MPI_Win_complete); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 214, MPI_Win_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 215, MPI_Win_create_dynamic); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 216, MPI_Win_create_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 217, MPI_Win_create_keyval); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 218, MPI_Win_delete_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 219, MPI_Win_detach); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 220, MPI_Win_fence); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 221, MPI_Win_flush); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 222, MPI_Win_flush_all); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 223, MPI_Win_flush_local); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 224, MPI_Win_flush_local_all); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 225, MPI_Win_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 226, MPI_Win_free_keyval); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 227, MPI_Win_get_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 228, MPI_Win_get_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 229, MPI_Win_get_group); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 230, MPI_Win_get_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 231, MPI_Win_get_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 232, MPI_Win_lock); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 233, MPI_Win_lock_all); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 234, MPI_Win_post); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 235, MPI_Win_set_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 236, MPI_Win_set_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 237, MPI_Win_set_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 238, MPI_Win_set_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 239, MPI_Win_shared_query); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 240, MPI_Win_start); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 241, MPI_Win_sync); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 242, MPI_Win_test); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 243, MPI_Win_unlock); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 244, MPI_Win_unlock_all); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 245, MPI_Win_wait); + + // MPI profiling interface wrappers + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 250, PMPI_Accumulate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 251, PMPI_Add_error_class); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 252, PMPI_Add_error_code); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 253, PMPI_Add_error_string); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 254, PMPI_Aint_add); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 255, PMPI_Aint_diff); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 256, PMPI_Allgather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 257, PMPI_Allgatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 258, PMPI_Alloc_mem); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 259, PMPI_Allreduce); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 260, PMPI_Alltoall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 261, PMPI_Alltoallv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 262, PMPI_Alltoallw); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 263, PMPI_Barrier); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 264, PMPI_Bcast); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 265, PMPI_Bsend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 266, PMPI_Bsend_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 267, PMPI_Buffer_attach); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 268, PMPI_Buffer_detach); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 269, PMPI_Cancel); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 270, PMPI_Cart_coords); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 271, PMPI_Cart_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 272, PMPI_Cart_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 273, PMPI_Cart_map); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 274, PMPI_Cart_rank); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 275, PMPI_Cart_shift); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 276, PMPI_Cart_sub); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 277, PMPI_Cartdim_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 278, PMPI_Close_port); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 279, PMPI_Comm_accept); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 280, PMPI_Comm_call_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 281, PMPI_Comm_compare); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 282, PMPI_Comm_connect); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 283, PMPI_Comm_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 284, PMPI_Comm_create_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 285, PMPI_Comm_create_group); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 286, PMPI_Comm_create_keyval); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 287, PMPI_Comm_delete_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 288, PMPI_Comm_disconnect); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 289, PMPI_Comm_dup); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 290, PMPI_Comm_dup_with_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 291, PMPI_Comm_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 292, PMPI_Comm_free_keyval); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 293, PMPI_Comm_get_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 294, PMPI_Comm_get_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 295, PMPI_Comm_get_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 296, PMPI_Comm_get_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 297, PMPI_Comm_get_parent); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 298, PMPI_Comm_group); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 299, PMPI_Comm_idup); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 300, PMPI_Comm_join); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 301, PMPI_Comm_rank); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 302, PMPI_Comm_remote_group); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 303, PMPI_Comm_remote_size); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 304, PMPI_Comm_set_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 305, PMPI_Comm_set_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 306, PMPI_Comm_set_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 307, PMPI_Comm_set_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 308, PMPI_Comm_size); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 309, PMPI_Comm_spawn); + // TIMEMORY_C_GOTCHA(mpip_gotcha_t, 310, PMPI_Comm_spawn_multiple); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 311, PMPI_Comm_split); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 312, PMPI_Comm_split_type); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 313, PMPI_Comm_test_inter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 314, PMPI_Compare_and_swap); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 315, PMPI_Dims_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 316, PMPI_Dist_graph_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 317, PMPI_Dist_graph_create_adjacent); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 318, PMPI_Dist_graph_neighbors); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 319, PMPI_Dist_graph_neighbors_count); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 320, PMPI_Error_class); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 321, PMPI_Error_string); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 322, PMPI_Exscan); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 323, PMPI_Fetch_and_op); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 324, PMPI_File_call_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 325, PMPI_File_create_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 326, PMPI_File_get_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 327, PMPI_File_set_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 328, PMPI_Free_mem); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 329, PMPI_Gather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 330, PMPI_Gatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 331, PMPI_Get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 332, PMPI_Get_accumulate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 333, PMPI_Get_address); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 334, PMPI_Get_count); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 335, PMPI_Get_elements); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 336, PMPI_Get_elements_x); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 337, PMPI_Get_library_version); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 338, PMPI_Get_processor_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 339, PMPI_Get_version); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 340, PMPI_Graph_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 341, PMPI_Graph_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 342, PMPI_Graph_map); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 343, PMPI_Graph_neighbors); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 344, PMPI_Graph_neighbors_count); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 345, PMPI_Graphdims_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 346, PMPI_Grequest_complete); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 347, PMPI_Grequest_start); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 348, PMPI_Group_compare); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 349, PMPI_Group_difference); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 350, PMPI_Group_excl); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 351, PMPI_Group_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 352, PMPI_Group_incl); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 353, PMPI_Group_intersection); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 354, PMPI_Group_range_excl); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 355, PMPI_Group_range_incl); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 356, PMPI_Group_rank); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 357, PMPI_Group_size); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 358, PMPI_Group_translate_ranks); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 359, PMPI_Group_union); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 360, PMPI_Iallgather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 361, PMPI_Iallgatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 362, PMPI_Iallreduce); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 363, PMPI_Ialltoall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 364, PMPI_Ialltoallv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 365, PMPI_Ialltoallw); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 366, PMPI_Ibarrier); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 367, PMPI_Ibcast); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 368, PMPI_Ibsend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 369, PMPI_Iexscan); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 370, PMPI_Igather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 371, PMPI_Igatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 372, PMPI_Improbe); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 373, PMPI_Imrecv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 374, PMPI_Ineighbor_allgather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 375, PMPI_Ineighbor_allgatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 376, PMPI_Ineighbor_alltoall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 377, PMPI_Ineighbor_alltoallv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 378, PMPI_Ineighbor_alltoallw); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 379, PMPI_Info_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 380, PMPI_Info_delete); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 381, PMPI_Info_dup); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 382, PMPI_Info_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 383, PMPI_Info_get); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 384, PMPI_Info_get_nkeys); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 385, PMPI_Info_get_nthkey); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 386, PMPI_Info_get_valuelen); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 387, PMPI_Info_set); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 388, PMPI_Intercomm_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 389, PMPI_Intercomm_merge); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 390, PMPI_Iprobe); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 391, PMPI_Irecv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 392, PMPI_Ireduce); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 393, PMPI_Ireduce_scatter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 394, PMPI_Ireduce_scatter_block); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 395, PMPI_Irsend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 396, PMPI_Is_thread_main); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 397, PMPI_Iscan); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 398, PMPI_Iscatter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 399, PMPI_Iscatterv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 400, PMPI_Isend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 401, PMPI_Issend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 402, PMPI_Lookup_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 403, PMPI_Mprobe); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 404, PMPI_Mrecv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 405, PMPI_Neighbor_allgather); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 406, PMPI_Neighbor_allgatherv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 407, PMPI_Neighbor_alltoall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 408, PMPI_Neighbor_alltoallv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 409, PMPI_Neighbor_alltoallw); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 410, PMPI_Op_commutative); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 411, PMPI_Op_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 412, PMPI_Op_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 413, PMPI_Open_port); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 414, PMPI_Pack); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 415, PMPI_Pack_external); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 416, PMPI_Pack_external_size); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 417, PMPI_Pack_size); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 418, PMPI_Probe); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 419, PMPI_Publish_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 420, PMPI_Put); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 421, PMPI_Query_thread); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 422, PMPI_Raccumulate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 423, PMPI_Recv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 424, PMPI_Recv_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 425, PMPI_Reduce); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 426, PMPI_Reduce_local); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 427, PMPI_Reduce_scatter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 428, PMPI_Reduce_scatter_block); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 429, PMPI_Request_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 430, PMPI_Request_get_status); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 431, PMPI_Rget); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 432, PMPI_Rget_accumulate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 433, PMPI_Rput); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 434, PMPI_Rsend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 435, PMPI_Rsend_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 436, PMPI_Scan); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 437, PMPI_Scatter); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 438, PMPI_Scatterv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 439, PMPI_Send); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 440, PMPI_Send_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 441, PMPI_Sendrecv); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 442, PMPI_Sendrecv_replace); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 443, PMPI_Ssend); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 444, PMPI_Ssend_init); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 445, PMPI_Start); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 446, PMPI_Startall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 447, PMPI_Status_f2c); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 448, PMPI_Status_set_cancelled); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 449, PMPI_Status_set_elements); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 450, PMPI_Status_set_elements_x); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 451, PMPI_Topo_test); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 452, PMPI_Unpack); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 453, PMPI_Unpack_external); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 454, PMPI_Unpublish_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 455, PMPI_Wait); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 456, PMPI_Waitall); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 457, PMPI_Waitany); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 458, PMPI_Waitsome); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 459, PMPI_Win_allocate); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 460, PMPI_Win_allocate_shared); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 461, PMPI_Win_attach); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 462, PMPI_Win_call_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 463, PMPI_Win_complete); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 464, PMPI_Win_create); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 465, PMPI_Win_create_dynamic); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 466, PMPI_Win_create_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 467, PMPI_Win_create_keyval); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 468, PMPI_Win_delete_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 469, PMPI_Win_detach); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 470, PMPI_Win_fence); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 471, PMPI_Win_flush); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 472, PMPI_Win_flush_all); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 473, PMPI_Win_flush_local); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 474, PMPI_Win_flush_local_all); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 475, PMPI_Win_free); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 476, PMPI_Win_free_keyval); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 477, PMPI_Win_get_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 478, PMPI_Win_get_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 479, PMPI_Win_get_group); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 480, PMPI_Win_get_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 481, PMPI_Win_get_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 482, PMPI_Win_lock); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 483, PMPI_Win_lock_all); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 484, PMPI_Win_post); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 485, PMPI_Win_set_attr); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 486, PMPI_Win_set_errhandler); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 487, PMPI_Win_set_info); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 488, PMPI_Win_set_name); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 489, PMPI_Win_shared_query); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 490, PMPI_Win_start); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 491, PMPI_Win_sync); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 492, PMPI_Win_test); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 493, PMPI_Win_unlock); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 494, PMPI_Win_unlock_all); + TIMEMORY_C_GOTCHA(mpip_gotcha_t, 495, PMPI_Win_wait); + }; + + // provide environment variable for suppressing wrappers + mpip_gotcha_t::get_reject_list() = [reject]() { + auto _reject = reject; + // check environment + auto reject_list = tim::get_env( + TIMEMORY_SETTINGS_PREFIX "ROCPROFSYS_MPIP_REJECT_LIST", ""); + // add environment setting + for(const auto& itr : tim::delimit(reject_list)) + _reject.insert(itr); + return _reject; + }; + + // provide environment variable for selecting wrappers + mpip_gotcha_t::get_permit_list() = [permit]() { + auto _permit = permit; + // check environment + auto permit_list = tim::get_env( + TIMEMORY_SETTINGS_PREFIX "ROCPROFSYS_MPIP_PERMIT_LIST", ""); + // add environment setting + for(const auto& itr : tim::delimit(permit_list)) + _permit.insert(itr); + return _permit; + }; + + is_initialized = true; + } +} +// +#endif +// +//======================================================================================// +// + +#if defined(ROCPROFSYS_UNDEFINE_OMPI_SKIP_MPICXX) && ROCPROFSYS_UNDEFINE_OMPI_SKIP_MPICXX +# undef OMPI_SKIP_MPICXX +#endif diff --git a/source/lib/rocprof-sys/library/components/rcclp.cpp b/source/lib/rocprof-sys/library/components/rcclp.cpp deleted file mode 100644 index 9b5f00d62..000000000 --- a/source/lib/rocprof-sys/library/components/rcclp.cpp +++ /dev/null @@ -1,195 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "library/components/rcclp.hpp" -#include "library/rcclp.hpp" - -#include - -std::ostream& -operator<<(std::ostream& _os, const ncclUniqueId& _v) -{ - for(auto itr : _v.internal) - _os << itr; - return _os; -} - -namespace rocprofsys -{ -namespace component -{ -uint64_t -activate_rcclp() -{ - using handle_t = tim::component::rcclp_handle; - - static auto _handle = std::shared_ptr{}; - - if(!_handle.get()) - { - _handle = std::make_shared(); - _handle->start(); - - auto cleanup_functor = [=]() { - if(_handle) - { - _handle->stop(); - _handle.reset(); - } - }; - - std::stringstream ss; - ss << "timemory-rcclp-" << demangle() << "-" - << demangle(); - tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor); - return 1; - } - return 0; -} -// -//======================================================================================// -// -uint64_t -deactivate_rcclp(uint64_t id) -{ - if(id > 0) - { - std::stringstream ss; - ss << "timemory-rcclp-" << demangle() << "-" - << demangle(); - tim::manager::instance()->cleanup(ss.str()); - return 0; - } - return 1; -} -// -//======================================================================================// -// -void -configure_rcclp(const std::set& permit, const std::set& reject) -{ - static bool is_initialized = false; - if(!is_initialized) - { - // generate the gotcha wrappers - rcclp_gotcha_t::get_initializer() = []() { - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion); - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclBcast); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclBroadcast); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclAllReduce); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclReduceScatter); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 13, ncclAllGather); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 14, ncclGroupStart); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 15, ncclGroupEnd); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 16, ncclSend); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 17, ncclRecv); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 18, ncclGather); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 19, ncclScatter); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 20, ncclAllToAll); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 21, ncclAllToAllv); - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 22, ncclRedOpCreatePreMulSum); - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 23, ncclRedOpDestroy); - }; - - // provide environment variable for suppressing wrappers - rcclp_gotcha_t::get_reject_list() = [reject]() { - auto _reject = reject; - // check environment - auto reject_list = - tim::get_env("ROCPROFSYS_RCCLP_REJECT_LIST", ""); - // add environment setting - for(const auto& itr : tim::delimit(reject_list)) - _reject.insert(itr); - return _reject; - }; - - // provide environment variable for selecting wrappers - rcclp_gotcha_t::get_permit_list() = [permit]() { - auto _permit = permit; - // check environment - auto permit_list = - tim::get_env("ROCPROFSYS_RCCLP_PERMIT_LIST", ""); - // add environment setting - for(const auto& itr : tim::delimit(permit_list)) - _permit.insert(itr); - return _permit; - }; - - is_initialized = true; - } -} - -void -rcclp_handle::start() -{ - if(get_tool_count()++ == 0) - { - get_tool_instance() = std::make_shared("timemory_rcclp"); - get_tool_instance()->start(); - } -} - -void -rcclp_handle::stop() -{ - auto idx = --get_tool_count(); - if(get_tool_instance().get()) - { - get_tool_instance()->stop(); - if(idx == 0) get_tool_instance().reset(); - } -} - -rcclp_handle::persistent_data& -rcclp_handle::get_persistent_data() -{ - static persistent_data _instance; - return _instance; -} - -std::atomic& -rcclp_handle::get_configured() -{ - return get_persistent_data().m_configured; -} - -rcclp_handle::toolset_ptr_t& -rcclp_handle::get_tool_instance() -{ - return get_persistent_data().m_tool; -} - -std::atomic& -rcclp_handle::get_tool_count() -{ - return get_persistent_data().m_count; -} -} // namespace component -} // namespace rocprofsys diff --git a/source/lib/rocprof-sys/library/components/rcclp.hpp b/source/lib/rocprof-sys/library/components/rcclp.hpp deleted file mode 100644 index 957dec1ca..000000000 --- a/source/lib/rocprof-sys/library/components/rcclp.hpp +++ /dev/null @@ -1,106 +0,0 @@ -// MIT License -// -// Copyright (c) 2020, The Regents of the University of California, -// through Lawrence Berkeley National Laboratory (subject to receipt of any -// required approvals from the U.S. Dept. of Energy). All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "core/common.hpp" -#include "core/components/fwd.hpp" -#include "core/defines.hpp" -#include "core/rccl.hpp" -#include "core/timemory.hpp" -#include "library/components/category_region.hpp" -#include "library/components/comm_data.hpp" - -#include -#include - -#include -#include -#include -#include -#include -#include - -#if !defined(ROCPROFSYS_NUM_RCCLP_WRAPPERS) -# define ROCPROFSYS_NUM_RCCLP_WRAPPERS 25 -#endif - -ROCPROFSYS_COMPONENT_ALIAS( - rccl_toolset_t, - ::tim::component_bundle, - comm_data>) -ROCPROFSYS_COMPONENT_ALIAS(rcclp_gotcha_t, - ::tim::component::gotcha) - -#if !defined(ROCPROFSYS_USE_RCCL) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type) -#endif - -namespace rocprofsys -{ -namespace component -{ -uint64_t -activate_rcclp(); - -uint64_t -deactivate_rcclp(uint64_t id); - -void -configure_rcclp(const std::set& permit = {}, - const std::set& reject = {}); - -struct rcclp_handle : base -{ - static constexpr size_t rcclp_wrapper_count = ROCPROFSYS_NUM_RCCLP_WRAPPERS; - - using value_type = void; - using this_type = rcclp_handle; - using base_type = base; - - using rcclp_tuple_t = tim::component_tuple; - using toolset_ptr_t = std::shared_ptr; - - static std::string label() { return "rcclp_handle"; } - static std::string description() { return "Handle for activating NCCL wrappers"; } - static void get() {} - static void start(); - static void stop(); - static int get_count() { return get_tool_count().load(); } - -private: - struct persistent_data - { - std::atomic m_configured{ 0 }; - std::atomic m_count{ 0 }; - toolset_ptr_t m_tool = toolset_ptr_t{}; - }; - - static persistent_data& get_persistent_data(); - static std::atomic& get_configured(); - static toolset_ptr_t& get_tool_instance(); - static std::atomic& get_tool_count(); -}; -} // namespace component -} // namespace rocprofsys diff --git a/source/lib/rocprof-sys/library/ompt.cpp b/source/lib/rocprof-sys/library/ompt.cpp index fada40cc4..4f8ba8953 100644 --- a/source/lib/rocprof-sys/library/ompt.cpp +++ b/source/lib/rocprof-sys/library/ompt.cpp @@ -163,10 +163,7 @@ struct ompt : comp::base auto _track = tracing::get_perfetto_track( category::ompt{}, - [](uint64_t _targ_id_v) { - return ::timemory::join::join("", "OMP Target ", _targ_id_v); - }, - targ_id); + [](auto) -> std::string { return "OpenMP Target Offloads"; }, 0); category_region::start( name, _track, beg_time, ::perfetto::Flow::ProcessScoped(id), diff --git a/source/lib/rocprof-sys/library/rcclp.cpp b/source/lib/rocprof-sys/library/rcclp.cpp deleted file mode 100644 index 541f0575f..000000000 --- a/source/lib/rocprof-sys/library/rcclp.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// MIT License -// -// Copyright (c) 2020, The Regents of the University of California, -// through Lawrence Berkeley National Laboratory (subject to receipt of any -// required approvals from the U.S. Dept. of Energy). All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "library/components/rcclp.hpp" -#include "core/components/fwd.hpp" -#include "core/defines.hpp" -#include "core/dynamic_library.hpp" -#include "core/rccl.hpp" -#include "core/timemory.hpp" -#include "library/components/category_region.hpp" - -#include - -#include -#include -#include -#include -#include - -namespace -{ -uint64_t global_id = std::numeric_limits::max(); -} - -namespace rocprofsys -{ -namespace rcclp -{ -void -configure() -{} - -void -setup() -{ - configure(); - - // make sure the symbols are loaded to be wrapped - dynamic_library _librccl{ - "ROCPROFSYS_RCCL_LIBRARY", "librccl.so", RTLD_NOW | RTLD_GLOBAL, true, true, true - }; - - auto _use_data = tim::get_env("ROCPROFSYS_RCCLP_COMM_DATA", get_use_timemory()); - if(!get_use_timemory()) - { - trait::runtime_enabled::set(false); - trait::runtime_enabled::set(false); - } - else - { - trait::runtime_enabled::set(_use_data); - trait::runtime_enabled::set(_use_data); - } - - component::configure_rcclp(); - global_id = component::activate_rcclp(); -} - -void -shutdown() -{ - if(global_id < std::numeric_limits::max()) - component::deactivate_rcclp(global_id); -} -} // namespace rcclp -} // namespace rocprofsys diff --git a/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp index a13d02426..ffbe9592b 100644 --- a/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp +++ b/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -34,6 +34,7 @@ #include "library/components/category_region.hpp" #include "library/rocprofiler-sdk/counters.hpp" #include "library/rocprofiler-sdk/fwd.hpp" +#include "library/rocprofiler-sdk/rccl.hpp" #include "library/thread_info.hpp" #include "library/tracing.hpp" @@ -482,6 +483,17 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, auto ts = rocprofiler_timestamp_t{}; ROCPROFILER_CALL(rocprofiler_get_timestamp(&ts)); + const char* name = nullptr; + rocprofiler_query_callback_tracing_kind_operation_name(record.kind, record.operation, + &name, nullptr); + + auto info = std::stringstream{}; + info << std::left << "tid=" << record.thread_id << ", cid=" << std::setw(3) + << record.correlation_id.internal << ", kind=" << std::setw(2) << record.kind + << ", operation=" << std::setw(3) << record.operation + << ", phase=" << record.phase << ", dt_nsec=" << std::setw(8) << ts + << ", name=" << name; + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) { user_data->value = ts; @@ -525,6 +537,12 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, break; } #endif + case ROCPROFILER_CALLBACK_TRACING_RCCL_API: + { + tool_tracing_callback_start(category::rocm_rccl_api{}, record, user_data, + ts); + break; + } case ROCPROFILER_CALLBACK_TRACING_NONE: case ROCPROFILER_CALLBACK_TRACING_LAST: case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API: @@ -533,7 +551,6 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: - case ROCPROFILER_CALLBACK_TRACING_RCCL_API: #if(ROCPROFILER_VERSION >= 600) case ROCPROFILER_CALLBACK_TRACING_OMPT: case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: @@ -616,6 +633,13 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, break; } #endif + case ROCPROFILER_CALLBACK_TRACING_RCCL_API: + { + tool_tracing_callback_rccl(record, user_data->value, ts); + tool_tracing_callback_stop(category::rocm_rccl_api{}, record, user_data, + ts, _bt_data); + break; + } case ROCPROFILER_CALLBACK_TRACING_NONE: case ROCPROFILER_CALLBACK_TRACING_LAST: case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API: @@ -624,7 +648,6 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: - case ROCPROFILER_CALLBACK_TRACING_RCCL_API: #if(ROCPROFILER_VERSION >= 600) case ROCPROFILER_CALLBACK_TRACING_OMPT: case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: @@ -651,6 +674,12 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, _data->dispatch_info.dispatch_id, timing_interval{ _data->start_timestamp, _data->end_timestamp }); } + else + { + ROCPROFSYS_WARNING_F( + 1, "tool_tracing_callback: unhandled PHASE_NONE callback record\n\t%s\n", + info.str().c_str()); + } } else { @@ -1025,13 +1054,16 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) { auto domains = settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS"); - ROCPROFSYS_VERBOSE_F(1, "rocprof-sys ROCm Domains:\n"); + ROCPROFSYS_VERBOSE_F(1, "Available ROCm Domains:\n"); for(const auto& itr : domains->get_choices()) ROCPROFSYS_VERBOSE_F(1, "- %s\n", itr.c_str()); auto _callback_domains = rocprofiler_sdk::get_callback_domains(); auto _buffered_domain = rocprofiler_sdk::get_buffered_domains(); auto _counter_events = rocprofiler_sdk::get_rocm_events(); + auto _version = rocprofiler_sdk::get_version(); + ROCPROFSYS_WARNING_IF(_version.formatted == 0, + "Warning! rocprofiler-sdk version not initialized\n"); auto* _data = as_client_data(user_data); _data->client_fini = fini_func; @@ -1052,11 +1084,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, -#if(ROCPROFILER_VERSION >= 700) + ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, + ROCPROFILER_CALLBACK_TRACING_RCCL_API, +#if(ROCPROFILER_VERSION >= 600) ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API, +#endif +#if(ROCPROFILER_VERSION >= 700) ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API, #endif - ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API }) { if(_callback_domains.count(itr) > 0) diff --git a/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt b/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt index 97446e34c..407037a67 100644 --- a/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt +++ b/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt @@ -1,9 +1,11 @@ # -set(rocprofiler_sdk_sources ${CMAKE_CURRENT_LIST_DIR}/counters.cpp - ${CMAKE_CURRENT_LIST_DIR}/fwd.cpp) +set(rocprofiler_sdk_sources + ${CMAKE_CURRENT_LIST_DIR}/counters.cpp ${CMAKE_CURRENT_LIST_DIR}/fwd.cpp + ${CMAKE_CURRENT_LIST_DIR}/rccl.cpp) -set(rocprofiler_sdk_headers ${CMAKE_CURRENT_LIST_DIR}/counters.hpp - ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp) +set(rocprofiler_sdk_headers + ${CMAKE_CURRENT_LIST_DIR}/counters.hpp ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp + ${CMAKE_CURRENT_LIST_DIR}/rccl.hpp) target_sources(rocprofiler-systems-object-library PRIVATE ${rocprofiler_sdk_sources} ${rocprofiler_sdk_headers}) diff --git a/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.cpp b/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.cpp new file mode 100644 index 000000000..7b2e04b25 --- /dev/null +++ b/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.cpp @@ -0,0 +1,182 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/rocprofiler-sdk/rccl.hpp" + +#include "core/config.hpp" +#include "core/debug.hpp" +#include "core/perfetto.hpp" + +#include "library/tracing.hpp" + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +namespace +{ +struct rccl_recv +{ + static constexpr auto value = "comm_data"; + static constexpr auto label = "RCCL Comm Recv"; +}; + +struct rccl_send +{ + static constexpr auto value = "comm_data"; + static constexpr auto label = "RCCL Comm Send"; +}; + +template +void +write_perfetto_counter_track(uint64_t _val, uint64_t _begin_ts, uint64_t _end_ts) +{ + using counter_track = rocprofsys::perfetto_counter_track; + + if(rocprofsys::get_use_perfetto() && + rocprofsys::get_state() == rocprofsys::State::Active) + { + const size_t _idx = 0; + + if(!counter_track::exists(_idx)) + { + std::string _label = + (_idx > 0) ? JOIN(" ", Tp::label, JOIN("", '[', _idx, ']')) : Tp::label; + counter_track::emplace(_idx, _label, "bytes"); + } + + TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _begin_ts, _val); + TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _end_ts, 0); + } +} + +static auto +rccl_type_size(ncclDataType_t datatype) +{ + switch(datatype) + { + case ncclInt8: + case ncclUint8: return 1; + case ncclFloat16: return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: return 8; + default: + ROCPROFSYS_CI_ABORT(true, "Unsupported RCCL datatype: %i", datatype); + return 0; + }; +} + +} // namespace + +/* + * @brief RCCL callback tracing handler + * + * This function processes RCCL API calls and writes the data transfer size to + * the Perfetto counter track. + * + * @param record The tracing record containing the RCCL API call information. + * @param begin_ts The timestamp when the operation started. + * @param end_ts The timestamp when the operation ended. + */ +void +tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record, + uint64_t begin_ts, uint64_t end_ts) +{ + if(record.kind == ROCPROFILER_CALLBACK_TRACING_RCCL_API) + { + auto* payload = + static_cast(record.payload); + + size_t size = 0; + bool is_send = false; + + auto set_recv = [&](size_t count, ncclDataType_t _dt) { + is_send = false; + size = count * rccl_type_size(_dt); + }; + + auto set_send = [&](size_t count, ncclDataType_t _dt) { + is_send = true; + size = count * rccl_type_size(_dt); + }; + + switch(record.operation) + { + // RCCL Data Receive + case ROCPROFILER_RCCL_API_ID_ncclAllGather: + set_recv(payload->args.ncclAllGather.sendcount, + payload->args.ncclAllGather.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclAllToAll: + set_recv(payload->args.ncclAllToAll.count, + payload->args.ncclAllToAll.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclAllReduce: + set_recv(payload->args.ncclAllReduce.count, + payload->args.ncclAllReduce.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclGather: + set_recv(payload->args.ncclGather.sendcount, + payload->args.ncclGather.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclRecv: + set_recv(payload->args.ncclRecv.count, payload->args.ncclRecv.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclReduce: + set_recv(payload->args.ncclReduce.count, + payload->args.ncclReduce.datatype); + break; + + // RCCL Data Send + case ROCPROFILER_RCCL_API_ID_ncclBroadcast: + set_send(payload->args.ncclBroadcast.count, + payload->args.ncclBroadcast.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclReduceScatter: + set_send(payload->args.ncclReduceScatter.recvcount, + payload->args.ncclReduceScatter.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclSend: + set_send(payload->args.ncclSend.count, payload->args.ncclSend.datatype); + break; + + default: + // Skip other RCCL operations + break; + } + + if(config::get_use_perfetto() && size > 0) + { + if(is_send) + write_perfetto_counter_track(size, begin_ts, end_ts); + else + write_perfetto_counter_track(size, begin_ts, end_ts); + } + } +} + +} // namespace rocprofiler_sdk +} // namespace rocprofsys diff --git a/source/lib/rocprof-sys/library/rcclp.hpp b/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.hpp similarity index 67% rename from source/lib/rocprof-sys/library/rcclp.hpp rename to source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.hpp index 053433671..0a1ab3b9c 100644 --- a/source/lib/rocprof-sys/library/rcclp.hpp +++ b/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.hpp @@ -1,8 +1,6 @@ // MIT License // -// Copyright (c) 2020, The Regents of the University of California, -// through Lawrence Berkeley National Laboratory (subject to receipt of any -// required approvals from the U.S. Dept. of Energy). All rights reserved. +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,34 +22,22 @@ #pragma once -#include "core/defines.hpp" +#include +#include +// #include +// #include +// #include +#include +#include namespace rocprofsys { -namespace rcclp +namespace rocprofiler_sdk { void -configure(); +tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record, + uint64_t begin_ts, uint64_t end_ts); -void -setup(); - -void -shutdown(); - -#if !defined(ROCPROFSYS_USE_RCCL) || \ - (defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL == 0) -inline void -configure() -{} - -inline void -setup() -{} +} // namespace rocprofiler_sdk -inline void -shutdown() -{} -#endif -} // namespace rcclp } // namespace rocprofsys diff --git a/source/lib/rocprof-sys/library/sampling.cpp b/source/lib/rocprof-sys/library/sampling.cpp index cb2ba13fe..45c984874 100644 --- a/source/lib/rocprof-sys/library/sampling.cpp +++ b/source/lib/rocprof-sys/library/sampling.cpp @@ -831,6 +831,7 @@ setup() std::set shutdown() { + threading::set_is_shutting_down(true); if(is_child_process()) { for(auto& itr : *sampler_instances::get()) diff --git a/source/lib/rocprof-sys/library/tpls/rccl/rccl/rccl.h b/source/lib/rocprof-sys/library/tpls/rccl/rccl/rccl.h deleted file mode 100644 index 5fb23b1ab..000000000 --- a/source/lib/rocprof-sys/library/tpls/rccl/rccl/rccl.h +++ /dev/null @@ -1,522 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_H_ -#define NCCL_H_ - -#include -#include - -#define NCCL_MAJOR 2 -#define NCCL_MINOR 11 -#define NCCL_PATCH 4 -#define NCCL_SUFFIX "" - -#define NCCL_VERSION_CODE 21104 -#define NCCL_VERSION(X, Y, Z) \ - (((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z)) - -#define RCCL_BFLOAT16 1 -#define RCCL_GATHER_SCATTER 1 -#define RCCL_ALLTOALLV 1 - -#ifdef __cplusplus -extern "C" -{ -#endif - - /*! @brief Opaque handle to communicator */ - typedef struct ncclComm* ncclComm_t; - -#define NCCL_UNIQUE_ID_BYTES 128 - typedef struct - { - char internal[NCCL_UNIQUE_ID_BYTES]; - } ncclUniqueId; - - /*! @brief Error type */ - typedef enum - { - ncclSuccess = 0, - ncclUnhandledCudaError = 1, - ncclSystemError = 2, - ncclInternalError = 3, - ncclInvalidArgument = 4, - ncclInvalidUsage = 5, - ncclNumResults = 6 - } ncclResult_t; - - /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. - * - * @details This integer is coded with the MAJOR, MINOR and PATCH level of the - * NCCL library - */ - ncclResult_t ncclGetVersion(int* version); - /// @cond include_hidden - ncclResult_t pncclGetVersion(int* version); - /// @endcond - - /*! @brief Generates an ID for ncclCommInitRank - - @details - Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be - called once and the Id should be distributed to all ranks in the - communicator before calling ncclCommInitRank. - - @param[in] - uniqueId ncclUniqueId* - pointer to uniqueId - - */ - ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); - /// @cond include_hidden - ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); - /// @endcond - - /*! @brief Creates a new communicator (multi thread/process version). - - @details - rank must be between 0 and nranks-1 and unique within a communicator clique. - Each rank is associated to a CUDA device, which has to be set before calling - ncclCommInitRank. - ncclCommInitRank implicitly syncronizes with other ranks, so it must be - called by different threads/processes or use ncclGroupStart/ncclGroupEnd. - - @param[in] - comm ncclComm_t* - communicator struct pointer - */ - ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, - int rank); - /// @cond include_hidden - ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, - int rank); - /// @endcond - - /*! @brief Creates a clique of communicators (single process version). - * - * @details This is a convenience function to create a single-process communicator - * clique. Returns an array of ndev newly initialized communicators in comm. comm - * should be pre-allocated with size at least ndev*sizeof(ncclComm_t). If devlist is - * NULL, the first ndev HIP devices are used. Order of devlist defines user-order of - * processors within the communicator. - * */ - ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); - /// @cond include_hidden - ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); - /// @endcond - - /*! @brief Frees resources associated with communicator object, but waits for any - * operations that might still be running on the device */ - ncclResult_t ncclCommDestroy(ncclComm_t comm); - /// @cond include_hidden - ncclResult_t pncclCommDestroy(ncclComm_t comm); - /// @endcond - - /*! @brief Frees resources associated with communicator object and aborts any - * operations that might still be running on the device. */ - ncclResult_t ncclCommAbort(ncclComm_t comm); - /// @cond include_hidden - ncclResult_t pncclCommAbort(ncclComm_t comm); - /// @endcond - - /*! @brief Returns a human-readable error message. */ - const char* ncclGetErrorString(ncclResult_t result); - const char* pncclGetErrorString(ncclResult_t result); - - /*! @brief Checks whether the comm has encountered any asynchronous errors */ - ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); - /// @cond include_hidden - ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); - /// @endcond - - /*! @brief Gets the number of ranks in the communicator clique. */ - ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); - /// @cond include_hidden - ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); - /// @endcond - - /*! @brief Returns the rocm device number associated with the communicator. */ - ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); - /// @cond include_hidden - ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); - /// @endcond - - /*! @brief Returns the user-ordered "rank" associated with the communicator. */ - ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); - /// @cond include_hidden - ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); - /// @endcond - - /*! @brief Reduction operation selector */ - /* Reduction operation selector */ - typedef enum - { - ncclNumOps_dummy = 5 - } ncclRedOp_dummy_t; - typedef enum - { - ncclSum = 0, - ncclProd = 1, - ncclMax = 2, - ncclMin = 3, - ncclAvg = 4, - /* ncclNumOps: The number of built-in ncclRedOp_t values. Also - * serves as the least possible value for dynamic ncclRedOp_t's - * as constructed by ncclRedOpCreate*** functions. */ - ncclNumOps = 5, - /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(ncclRedOp_t) when compared to previous NCCL versions to - * maintain ABI compatibility. */ - ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) - } ncclRedOp_t; - - /*! @brief Data types */ - typedef enum - { - ncclInt8 = 0, - ncclChar = 0, - ncclUint8 = 1, - ncclInt32 = 2, - ncclInt = 2, - ncclUint32 = 3, - ncclInt64 = 4, - ncclUint64 = 5, - ncclFloat16 = 6, - ncclHalf = 6, - ncclFloat32 = 7, - ncclFloat = 7, - ncclFloat64 = 8, - ncclDouble = 8, - ncclBfloat16 = 9, - ncclNumTypes = 10 - } ncclDataType_t; - - /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ - typedef enum - { - /* ncclScalarDevice: The scalar is in device-visible memory and will be - * dereferenced while the collective is running. */ - ncclScalarDevice = 0, - - /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be - * dereferenced before the ncclRedOpCreate***() function returns. */ - ncclScalarHostImmediate = 1 - } ncclScalarResidence_t; - - /* - * ncclRedOpCreatePreMulSum - * - * Creates a new reduction operator which pre-multiplies input values by a given - * scalar locally before reducing them with peer values via summation. For use - * only with collectives launched against *comm* and *datatype*. The - * *residence* argument indicates how/when the memory pointed to by *scalar* - * will be dereferenced. Upon return, the newly created operator's handle - * is stored in *op*. - */ - ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, - ncclDataType_t datatype, - ncclScalarResidence_t residence, - ncclComm_t comm); - ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, - ncclDataType_t datatype, - ncclScalarResidence_t residence, - ncclComm_t comm); - - /* - * ncclRedOpDestroy - * - * Destroys the reduction operator *op*. The operator must have been created by - * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be - * destroyed as soon as the last NCCL function which is given that operator returns. - */ - ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); - ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); - - /* - * Collective communication operations - * - * Collective communication operations must be called separately for each - * communicator in a communicator clique. - * - * They return when operations have been enqueued on the CUDA stream. - * - * Since they may perform inter-CPU synchronization, each call has to be done - * from a different thread or process, or need to use Group Semantics (see - * below). - */ - - /*! - * @brief Reduce - * - * @details Reduces data arrays of length count in sendbuff into recvbuff using op - * operation. - * recvbuff may be NULL on all calls except for root device. - * root is the rank (not the CUDA device) where data will reside after the - * operation is complete. - * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, - ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, - ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief (deprecated) Broadcast (in-place) - * - * @details Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * This operation is implicitely in place. - */ - ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief Broadcast - * - * @details Copies count values from root to all other devices. - * root is the rank (not the HIP device) where data resides before the - * operation is started. - * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief All-Reduce - * - * @details Reduces data arrays of length count in sendbuff using op operation, and - * leaves identical copies of result on each recvbuff. - * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! - * @brief Reduce-Scatter - * - * @details Reduces data in sendbuff using op operation and leaves reduced result - * scattered over the devices so that recvbuff on rank i will contain the i-th - * block of the result. - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ - ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, ncclRedOp_t op, - ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, - ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief All-Gather - * - * @details Each device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. - */ - ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief Send - * - * @details Send data from sendbuff to rank peer. - * Rank peer needs to call ncclRecv with the same datatype and the same count from - * this rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv - * operations need to progress concurrently to complete, they must be fused within a - * ncclGroupStart/ ncclGroupEnd section. - */ - ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, - int peer, ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, - int peer, ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief Receive - * - * @details Receive data from rank peer into recvbuff. - * Rank peer needs to call ncclSend with the same datatype and the same count to this - * rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv - * operations need to progress concurrently to complete, they must be fused within a - * ncclGroupStart/ ncclGroupEnd section. - */ - ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, - int peer, ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief Gather - * - * @details Root device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. - * - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. - */ - ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief Scatter - * - * @details Scattered over the devices so that recvbuff on rank i will contain the - * i-th block of the data on root. - * - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ - ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief All-To-All - * - * @details Device (i) send (j)th block of data to device (j) and be placed as (i)th - * block. Each block for sending/receiving has count elements, which means - * that recvbuff and sendbuff should have a size of nranks*count elements. - * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief All-To-Allv - * - * @details Device (i) sends sendcounts[j] of data from offset sdispls[j] - * to device (j). In the same time, device (i) receives recvcounts[j] of data - * from device (j) to be placed at rdispls[j]. - - * sendcounts, sdispls, recvcounts and rdispls are all measured in the units - * of datatype, not bytes. - * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[], - const size_t sdispls[], void* recvbuff, - const size_t recvcounts[], const size_t rdispls[], - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[], - const size_t sdispls[], void* recvbuff, - const size_t recvcounts[], const size_t rdispls[], - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /* - * Group semantics - * - * When managing multiple GPUs from a single thread, and since NCCL collective - * calls may perform inter-CPU synchronization, we need to "group" calls for - * different ranks/devices into a single call. - * - * Grouping NCCL calls as being part of the same collective operation is done - * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all - * collective calls until the ncclGroupEnd call, which will wait for all calls - * to be complete. Note that for collective communication, ncclGroupEnd only - * guarantees that the operations are enqueued on the streams, not that - * the operation is effectively done. - * - * Both collective communication and ncclCommInitRank can be used in conjunction - * of ncclGroupStart/ncclGroupEnd, but not together. - * - * Group semantics also allow to fuse multiple operations on the same device - * to improve performance (for aggregated collective calls), or to permit - * concurrent progress of multiple send/receive operations. - */ - - /*! @brief Group Start - * - * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into - * a single NCCL operation. Nothing will be started on the CUDA stream until - * ncclGroupEnd. - */ - ncclResult_t ncclGroupStart(); - /// @cond include_hidden - ncclResult_t pncclGroupStart(); - /// @endcond - - /*! @brief Group End - * - * End a group call. Start a fused NCCL operation consisting of all calls since - * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations - * need to be called after ncclGroupEnd. - */ - ncclResult_t ncclGroupEnd(); - /// @cond include_hidden - ncclResult_t pncclGroupEnd(); - /// @endcond - -#ifdef __cplusplus -} // end extern "C" -#endif - -#endif // end include guard diff --git a/source/lib/rocprof-sys/library/tracing.hpp b/source/lib/rocprof-sys/library/tracing.hpp index 80eb801d0..324c959a6 100644 --- a/source/lib/rocprof-sys/library/tracing.hpp +++ b/source/lib/rocprof-sys/library/tracing.hpp @@ -101,6 +101,20 @@ template auto& get_category_stack(); +template +auto +get_perfetto_string(T& name) +{ + if constexpr(std::is_const_v) + { + return ::perfetto::StaticString{ name }; + } + else + { + return ::perfetto::DynamicString{ name }; + } +} + template inline void push_perfetto(CategoryT, const char*, Args&&...); @@ -387,15 +401,13 @@ push_perfetto(CategoryT, const char* name, Args&&... args) uint64_t _ts = now(); if(config::get_perfetto_annotations()) { - TRACE_EVENT_BEGIN(trait::name::value, - ::perfetto::StaticString(name), _ts, "begin_ns", _ts, - std::forward(args)...); + TRACE_EVENT_BEGIN(trait::name::value, get_perfetto_string(name), + _ts, "begin_ns", _ts, std::forward(args)...); } else { - TRACE_EVENT_BEGIN(trait::name::value, - ::perfetto::StaticString(name), _ts, - std::forward(args)...); + TRACE_EVENT_BEGIN(trait::name::value, get_perfetto_string(name), + _ts, std::forward(args)...); } } else @@ -418,7 +430,7 @@ push_perfetto(CategoryT, const char* name, Args&&... args) ++get_tracing_stack(); uint64_t _ts = now(); TRACE_EVENT_BEGIN( - trait::name::value, ::perfetto::StaticString(name), _ts, + trait::name::value, get_perfetto_string(name), _ts, std::forward(args)..., [&](::perfetto::EventContext ctx) { if(config::get_perfetto_annotations()) { @@ -527,7 +539,7 @@ push_perfetto_ts(CategoryT, const char* name, uint64_t _ts, Args&&... args) if(category_push_disabled()) return; ++get_tracing_stack(); - TRACE_EVENT_BEGIN(trait::name::value, ::perfetto::StaticString(name), _ts, + TRACE_EVENT_BEGIN(trait::name::value, get_perfetto_string(name), _ts, std::forward(args)...); } @@ -555,8 +567,8 @@ push_perfetto_track(CategoryT, const char* name, ::perfetto::Track _track, uint6 if(category_push_disabled()) return; ++get_tracing_stack(); - TRACE_EVENT_BEGIN(trait::name::value, ::perfetto::StaticString(name), - _track, _ts, std::forward(args)...); + TRACE_EVENT_BEGIN(trait::name::value, get_perfetto_string(name), _track, + _ts, std::forward(args)...); } template @@ -588,15 +600,13 @@ mark_perfetto(CategoryT, const char* name, Args&&... args) uint64_t _ts = now(); if(config::get_perfetto_annotations()) { - TRACE_EVENT_INSTANT(trait::name::value, - ::perfetto::StaticString(name), _ts, "ns", _ts, - std::forward(args)...); + TRACE_EVENT_INSTANT(trait::name::value, get_perfetto_string(name), + _ts, "ns", _ts, std::forward(args)...); } else { - TRACE_EVENT_INSTANT(trait::name::value, - ::perfetto::StaticString(name), _ts, - std::forward(args)...); + TRACE_EVENT_INSTANT(trait::name::value, get_perfetto_string(name), + _ts, std::forward(args)...); } } else @@ -617,14 +627,14 @@ mark_perfetto(CategoryT, const char* name, Args&&... args) else { uint64_t _ts = now(); - TRACE_EVENT_INSTANT( - trait::name::value, ::perfetto::StaticString(name), _ts, - std::forward(args)..., [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "ns", _ts); - } - }); + TRACE_EVENT_INSTANT(trait::name::value, get_perfetto_string(name), + _ts, std::forward(args)..., + [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "ns", _ts); + } + }); } } } @@ -636,8 +646,8 @@ mark_perfetto_ts(CategoryT, const char* name, uint64_t _ts, Args&&... args) // skip if category is disabled if(category_mark_disabled()) return; - TRACE_EVENT_INSTANT(trait::name::value, ::perfetto::StaticString(name), - _ts, std::forward(args)...); + TRACE_EVENT_INSTANT(trait::name::value, get_perfetto_string(name), _ts, + std::forward(args)...); } template @@ -648,8 +658,8 @@ mark_perfetto_track(CategoryT, const char* name, ::perfetto::Track _track, uint6 // skip if category is disabled if(category_mark_disabled()) return; - TRACE_EVENT_INSTANT(trait::name::value, ::perfetto::DynamicString{ name }, - _track, _ts, std::forward(args)...); + TRACE_EVENT_INSTANT(trait::name::value, get_perfetto_string(name), _track, + _ts, std::forward(args)...); } template diff --git a/tests/get_default_nic.sh b/tests/get_default_nic.sh new file mode 100755 index 000000000..8c7d0af2d --- /dev/null +++ b/tests/get_default_nic.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + + +# This script gets the name of the default NIC and writes it to standard output. + +ip r | awk '/default/{print $5}' diff --git a/tests/rocprof-sys-nic-perf.cmake b/tests/rocprof-sys-nic-perf.cmake index ca1453bdb..1c1bcd163 100644 --- a/tests/rocprof-sys-nic-perf.cmake +++ b/tests/rocprof-sys-nic-perf.cmake @@ -26,7 +26,14 @@ # # -------------------------------------------------------------------------------------- # -set(_network_interface "lo") +# Get the name of the default NIC and write it to _network_interface. +execute_process( + COMMAND "${CMAKE_SOURCE_DIR}/tests/get_default_nic.sh" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE _network_interface) + +message(STATUS "Default network interface is ${_network_interface}") + set(_nic_perf_environment "${_base_environment}" "ROCPROFSYS_OUTPUT_PATH=${PROJECT_BINARY_DIR}/rocprof-sys-tests-output/nic-performance" @@ -38,18 +45,20 @@ set(_nic_perf_environment "ROCPROFSYS_USE_ROCM=OFF" "ROCPROFSYS_TIMEMORY_COMPONENTS=wall_clock,papi_array,network_stats" "ROCPROFSYS_NETWORK_INTERFACE=${_network_interface}" - "ROCPROFSYS_PAPI_EVENTS=net:::${_network_interface}:tx:byte,net:::${_network_interface}:rx:byte,net:::${_network_interface}:rx:packet,net:::${_network_interface}:tx:packet" - ) + "ROCPROFSYS_PAPI_EVENTS=net:::${_network_interface}:tx:byte net:::${_network_interface}:rx:byte net:::${_network_interface}:rx:packet net:::${_network_interface}:tx:packet" + "ROCPROFSYS_SAMPLING_DELAY=0.05") +# Set _download_url to a large file that will give rocprof-sys-sample time to collect NIC +# performance data. set(_download_url - "https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-6.3.1/rocprofiler-systems-0.1.0-ubuntu-20.04-ROCm-60200-PAPI-OMPT-Python3.sh" + "https://github.com/llvm/llvm-project/releases/download/llvmorg-20.1.0/clang+llvm-20.1.0-armv7a-linux-gnueabihf.tar.gz" ) # Run the NIC performance test add_test( NAME nic-performance COMMAND $ -- wget --no-check-certificate - --quiet ${_download_url} -O /tmp/rocprofiler-systems-install.sh + ${_download_url} -O /tmp/rocprofiler-systems.test.bin WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) set_tests_properties(nic-performance PROPERTIES ENVIRONMENT "${_nic_perf_environment}" diff --git a/tests/rocprof-sys-openmp-tests.cmake b/tests/rocprof-sys-openmp-tests.cmake index 0327b339f..7bd20a65f 100644 --- a/tests/rocprof-sys-openmp-tests.cmake +++ b/tests/rocprof-sys-openmp-tests.cmake @@ -74,7 +74,19 @@ rocprofiler_systems_add_validation_test( PERFETTO_METRIC "rocm_kernel_dispatch" PERFETTO_FILE "perfetto-trace.proto" LABELS "openmp;openmp-target" - ARGS --label-substrings Z4vmulIiEvPT_S1_S1_i_l51.kd -c 12 -d 0 -p) + ARGS --label-substrings + Z4vmulIiEvPT_S1_S1_i_l51.kd + Z4vmulIfEvPT_S1_S1_i_l51.kd + Z4vmulIdEvPT_S1_S1_i_l51.kd + -c + 4 + 4 + 4 + -d + 0 + 0 + 0 + -p) set(_ompt_sampling_environ "${_ompt_environment}" diff --git a/tests/rocprof-sys-rccl-tests.cmake b/tests/rocprof-sys-rccl-tests.cmake index ea17deccb..60d5bf003 100644 --- a/tests/rocprof-sys-rccl-tests.cmake +++ b/tests/rocprof-sys-rccl-tests.cmake @@ -30,12 +30,15 @@ foreach(_TARGET ${RCCL_TEST_TARGETS}) string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}") string(REPLACE "_" "-" _NAME "${_NAME}") rocprofiler_systems_add_test( + SKIP_RUNTIME NAME rccl-test-${_NAME} TARGET ${_TARGET} LABELS "rccl-tests;rcclp" MPI ON GPU ON NUM_PROCS 1 + SAMPLING_TIMEOUT 300 + REWRITE_TIMEOUT 300 REWRITE_ARGS -e -v diff --git a/tests/rocprof-sys-testing.cmake b/tests/rocprof-sys-testing.cmake index a72ddaf85..15d4d4258 100644 --- a/tests/rocprof-sys-testing.cmake +++ b/tests/rocprof-sys-testing.cmake @@ -166,9 +166,10 @@ set(_rccl_environment "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=OFF" "ROCPROFSYS_USE_PROCESS_SAMPLING=ON" - "ROCPROFSYS_USE_RCCLP=ON" "ROCPROFSYS_TIME_OUTPUT=OFF" "ROCPROFSYS_USE_PID=OFF" + "ROCPROFSYS_USE_RCCLP=ON" + "ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy" "${_test_openmp_env}" "${_test_library_path}") @@ -466,8 +467,8 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST) cmake_parse_arguments( TEST "SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME" - "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" "${_KWARGS}" - ${ARGN}) + "NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" + "${_KWARGS}" ${ARGN}) foreach(_PREFIX SAMPLING RUNTIME REWRITE REWRITE_RUN BASELINE) if("${${_PREFIX}_FAIL_REGEX}" STREQUAL "")