Skip to content

[MPIX Stream] workq-based MPIX P2P Stream test failure #7670

@R0n12

Description

@R0n12

Hi!

I am currently testing out stream.cu and have the following output:

mpirun -l -np 2 -ppn 2 -genv MPIR_CVAR_CH4_ENABLE_STREAM_WORKQ=1 -genv MPIR_CVAR_GPU_HAS_WAIT_KERNEL=1 -genv MPIR_CVAR_ENABLE_GPU=1 -genv MPIR_CVAR_CH4_RESERVE_VCIS=1 -genv MPIR_CVAR_CH4_NUM_VCIS=2 ./stream -progress-thread
[0] [a100-11:1399809:0:1399809]  ucp_worker.c:3063 Assertion `--worker->inprogress == 0' failed
[0] [a100-11:1399809:1:1399830]  ucp_worker.c:3058 Assertion `worker->inprogress++ == 0' failed
[1] [a100-11:1399810:0:1399810]  ucp_worker.c:3058 Assertion `worker->inprogress++ == 0' failed
[1] [a100-11:1399810:1:1399831]  ucp_worker.c:3063 Assertion `--worker->inprogress == 0' failed
[1] [1762972281.903354] [a100-11:1399810:1]           debug.c:1300 UCX  WARN  ucs_debug_disable_signal: signal 8 was not set in ucs
[1] [1762972281.903386] [a100-11:1399810:0]           debug.c:1300 UCX  WARN  ucs_debug_disable_signal: signal 1 was not set in ucs
[1] [1762972281.903391] [a100-11:1399810:1]        spinlock.c:29   UCX  WARN  ucs_recursive_spinlock_destroy() failed: busy
[0] 
[0] /home/xu.3304/project/mpix-stream/mpich/modules/ucx/src/ucp/core/ucp_worker.c: [ ucp_worker_progress() ]
[0]       ...
[1] 
[1] /home/xu.3304/project/mpix-stream/mpich/modules/ucx/src/ucp/core/ucp_worker.c: [ ucp_worker_progress() ]
[1]       ...
[0]      3055     UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker);
[0]      3056 
[0]      3057     /* check that ucp_worker_progress is not called from within ucp_worker_progress */
[0] ==>  3058     ucs_assert(worker->inprogress++ == 0);
[0]      3059     count = uct_worker_progress(worker->uct);
[0]      3060     ucs_async_check_miss(&worker->async);
[0]      3061 
[0] 
[1]      3060     ucs_async_check_miss(&worker->async);
[1]      3061 
[1]      3062     /* coverity[assert_side_effect] */
[1] ==>  3063     ucs_assert(--worker->inprogress == 0);
[1]      3064 
[1]      3065     UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker);
[1]      3066 
[1] 
[0] 
[0] /home/xu.3304/project/mpix-stream/mpich/modules/ucx/src/ucp/core/ucp_worker.c: [ ucp_worker_progress() ]
[0]       ...
[0]      3060     ucs_async_check_miss(&worker->async);
[0]      3061 
[0]      3062     /* coverity[assert_side_effect] */
[0] ==>  3063     ucs_assert(--worker->inprogress == 0);
[0]      3064 
[0]      3065     UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker);
[0]      3066 
[0] 
[1] 
[1] /home/xu.3304/project/mpix-stream/mpich/modules/ucx/src/ucp/core/ucp_worker.c: [ ucp_worker_progress() ]
[1]       ...
[1]      3055     UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker);
[1]      3056 
[1]      3057     /* check that ucp_worker_progress is not called from within ucp_worker_progress */
[1] ==>  3058     ucs_assert(worker->inprogress++ == 0);
[1]      3059     count = uct_worker_progress(worker->uct);
[1]      3060     ucs_async_check_miss(&worker->async);
[1]      3061 
[1] 
[0] ==== backtrace (tid:1399830) ====
[0]  0 0x00000000000664e8 ucp_worker_progress()  /home/xu.3304/project/mpix-stream/mpich/modules/ucx/src/ucp/core/ucp_worker.c:3058
[0]  1 0x00000000004139cf MPIDI_progress_test.isra.0()  init_async.c:0
[0]  2 0x0000000000414206 progress_fn()  init_async.c:0
[0]  3 0x00000000004cd7d5 MPLI_thread_start()  :0
[0]  4 0x00000000000081ca start_thread()  ???:0
[0]  5 0x00000000000398d3 __GI___clone()  :0
[0] =================================
[1] ==== backtrace (tid:1399831) ====
[1]  0 0x0000000000066528 ucp_worker_progress()  /home/xu.3304/project/mpix-stream/mpich/modules/ucx/src/ucp/core/ucp_worker.c:3063
[1]  1 0x0000000000066528 ucp_worker_progress()  /home/xu.3304/project/mpix-stream/mpich/modules/ucx/src/ucp/core/ucp_worker.c:3060
[1]  2 0x00000000004139cf MPIDI_progress_test.isra.0()  init_async.c:0
[1]  3 0x0000000000414206 progress_fn()  init_async.c:0
[1]  4 0x00000000004cd7d5 MPLI_thread_start()  :0
[1]  5 0x00000000000081ca start_thread()  ???:0
[1]  6 0x00000000000398d3 __GI___clone()  :0
[1] =================================

===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   PID 1399809 RUNNING AT a100-11
=   EXIT CODE: 134
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Aborted (signal 6)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

It does seem like two MPIDI_progress_test calls are entering the UCX worker concurrently; the worker was created using UCS_THREAD_MODE_SERIALIZED and doesn't support concurrent access.

I suspected that the main thread was still inside MPIX_Stream_comm_create while the progress thread was already running, so I moved the MPIX_Start_progress_thread call to after MPIX_Stream_comm_create:

    MPI_Comm stream_comm;
    MPIX_Stream_comm_create(MPI_COMM_WORLD, mpi_stream, &stream_comm);

    if (need_progress_thread) {
        MPIX_Start_progress_thread(mpi_stream);
    }

After the change, the UCP worker assertion went away, but the process now hangs in cudaFree():

#0  0x000014778a3802ae in ?? () from /lib64/libcuda.so.1
#1  0x000014778a13e183 in ?? () from /lib64/libcuda.so.1
#2  0x000014778a17cdeb in ?? () from /lib64/libcuda.so.1
#3  0x000014778aebb8f7 in ?? () from /lib64/libcuda.so.1
#4  0x000014778aebbd45 in ?? () from /lib64/libcuda.so.1
#5  0x000014778a14a9f4 in ?? () from /lib64/libcuda.so.1
#6  0x000014778aeb9f09 in ?? () from /lib64/libcuda.so.1
#7  0x000014778a0e97cb in ?? () from /lib64/libcuda.so.1
#8  0x000014778a25f437 in ?? () from /lib64/libcuda.so.1
#9  0x000014778a24c300 in cuMemFree_v2 () from /lib64/libcuda.so.1
#10 0x00001477331cb07c in ucm_cuMemFree_v2 (arg0=22502560497664) at cudamem.c:200
#11 0x000014779074b2a5 in ?? () from /opt/cuda/12.6/lib64/libcudart.so.12
#12 0x000014779078bbf8 in cudaFree () from /opt/cuda/12.6/lib64/libcudart.so.12
#13 0x0000000000401dc8 in main ()

I have no clue how to proceed from here to make this work. Could it be related to UCX intercepting CUDA memory calls (the backtrace goes through ucm_cuMemFree_v2)?

MPICH config:

MPICH Version:      5.0.0b1
MPICH Release date: unreleased development copy
MPICH ABI:          0:0:0
MPICH Device:       ch4:ucx
MPICH configure:    --prefix=/home/xu.3304/project/mpix-stream/install/mpich-2e32b90-cuda12.6 --disable-fortran --disable-f77 --enable-cuda --with-cuda=/opt/cuda/12.6 --with-device=ch4:ucx --with-ucx=embedded CC=/opt/gcc/13.3.0/bin/gcc CXX=/opt/gcc/13.3.0/bin/g++ LDFLAGS=-lstdc++ CFLAGS=-I/opt/cuda/12.6/include
MPICH CC:           /opt/gcc/13.3.0/bin/gcc -I/opt/cuda/12.6/include    -O2
MPICH CXX:          /opt/gcc/13.3.0/bin/g++   -O2
MPICH F77:          /opt/gcc/13.3.0/bin/gfortran  
MPICH FC:           /opt/gcc/13.3.0/bin/gfortran  
MPICH features:     threadcomm

Run commands:

mpirun -l -np 2 -ppn 2 -genv MPIR_CVAR_CH4_ENABLE_STREAM_WORKQ=1 -genv MPIR_CVAR_GPU_HAS_WAIT_KERNEL=1 -genv MPIR_CVAR_ENABLE_GPU=1 -genv MPIR_CVAR_CH4_RESERVE_VCIS=1 -genv MPIR_CVAR_CH4_NUM_VCIS=2 ./stream -progress-thread

UCX config:

  $ ./configure --disable-option-checking --prefix=/home/xu.3304/project/mpix-stream/install/mpich-2e32b90-cuda12.6 --disable-static --with-java=no --with-go=no --enable-embedded --disable-fortran --disable-f77 --enable-cuda --with-cuda=/opt/cuda/12.6 --with-device=ch4:ucx --with-ucx=embedded CC=/opt/gcc/13.3.0/bin/gcc CXX=/opt/gcc/13.3.0/bin/g++ LDFLAGS=-lstdc++  CFLAGS=-I/opt/cuda/12.6/include  FC=/opt/gcc/13.3.0/bin/gfortran FCFLAGS=  F77=/opt/gcc/13.3.0/bin/gfortran --cache-file=/dev/null --srcdir=.

configure: =========================================================
configure: UCX build configuration:
configure:         Build prefix:   /home/xu.3304/project/mpix-stream/install/mpich-2e32b90-cuda12.6
configure:    Configuration dir:   ${prefix}/etc/ucx
configure:                   CC:   /opt/gcc/13.3.0/bin/gcc
configure:                  CXX:   /opt/gcc/13.3.0/bin/g++
configure:             CPPFLAGS:   -DCPU_FLAGS="" -I${abs_top_srcdir}/src -I${abs_top_builddir} -I${abs_top_builddir}/src  
configure:               CFLAGS:   -O3 -g -Wall -funwind-tables -Wframe-larger-than=8192 -Wno-missing-field-initializers -Wno-unused-parameter -Wno-unused-label -Wno-long-long -Wno-endif-labels -Wno-sign-compare -Wno-multichar -Wno-deprecated-declarations -Winvalid-pch -Wno-pointer-sign -Werror-implicit-function-declaration -Wno-format-zero-length -Wnested-externs -Wshadow -Werror=declaration-after-statement -I/opt/cuda/12.6/include 
configure:             CXXFLAGS:   -O3 -g -Wall -funwind-tables -Wframe-larger-than=8192 -Wno-missing-field-initializers -Wno-unused-parameter -Wno-unused-label -Wno-long-long -Wno-endif-labels -Wno-sign-compare -Wno-multichar -Wno-deprecated-declarations -Winvalid-pch  
configure:           ASAN check:   no
configure:         Multi-thread:   disabled
configure:            MPI tests:   disabled
configure:          VFS support:   yes
configure:        Devel headers:   no
configure: io_demo CUDA support:   no
configure:             Bindings:   < >
configure:          UCS modules:   < fuse >
configure:          UCT modules:   < cuda ib rdmacm cma knem xpmem >
configure:         CUDA modules:   < gdrcopy >
configure:         ROCM modules:   < >
configure:           IB modules:   < mlx5 >
configure:          UCM modules:   < cuda >
configure:         Perf modules:   < cuda mad >
configure: =========================================================

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions