Open
Description
To report a non-security related issue, please provide:
- the version of the software with which you are encountering an issue
netcdf-fortran/4.6.0 netcdf-c/4.9.2 hdf5/1.14.0
- environmental information (e.g. Operating System, compiler info, java version, python version, etc.)
HPC system: Linux/Rocky8, Intel 2021.5.0
laptop: Linux/Fedora39, gcc 13.2.1
- a description of the issue with the steps needed to reproduce it
The following test program:
$ cat test_netcdf_chunking.F90
!> Check a netCDF return status and abort the whole MPI job on failure.
!!
!! When the status argument differs from nf90_noerr, the macro writes the
!! source file name, the source line number and the nf90_strerror text for
!! that status to unit 0, and then calls MPI_Abort on MPI_COMM_WORLD with
!! error code 1.
!!
!! The macro expands to two separate one-line IF statements joined by a
!! semicolon, so it must be used where a statement sequence is valid.
!! The expansion site needs nf90_noerr, nf90_strerror, MPI_Abort and
!! MPI_COMM_WORLD to be accessible, plus a variable named ierr that receives
!! the MPI_Abort error code.
#define NC_ERR_STOP(status) \
if (status /= nf90_noerr) write(0,*) "file: ", __FILE__, " line: ", __LINE__, trim(nf90_strerror(status)); \
if (status /= nf90_noerr) call MPI_Abort(MPI_COMM_WORLD,1,ierr)
!> Reproducer: collective parallel write of one deflated, chunked NetCDF-4 variable.
!!
!! The program creates test.nc through the parallel NetCDF-4 interface and
!! defines a single NF90_FLOAT variable named field with dimensions
!! (im, jm, lm). The variable is stored as exactly one chunk covering all
!! three dimensions, with deflate level 1 and collective access. Each MPI rank
!! then writes its own contiguous block of lm/nranks levels, filled with
!! random numbers.
!!
!! The number of MPI ranks must divide lm evenly; otherwise the job aborts.
!!
!! NOTE(review): the single chunk holds im*jm*lm = 829,440,000 values. With
!! 4-byte floats that is about 3.3e9 bytes, which is more than a 32-bit signed
!! count (2,147,483,647) can represent. This is presumably related to the
!! negative MPI_Type_contiguous count seen at run time; confirm against the
!! netCDF-C / HDF5 parallel filter code path.
program test_netcdf_chunking
  use mpi
  use netcdf
  implicit none

  ! Global extents of the variable. A larger alternative is kept for reference:
  !   integer, parameter :: im=3600, jm=1800, lm=256
  integer, parameter :: im=3600, jm=1800, lm=128

  ! MPI bookkeeping. The name ierr is required by the error-check macro.
  integer :: my_rank, nranks
  integer :: ierr

  ! netCDF handles and status code.
  integer :: rc
  integer :: ncid, field_id
  integer :: dim_i, dim_j, dim_l
  integer :: prev_fill_mode

  ! Per-rank decomposition along the third (lm) dimension.
  integer :: levs_per_rank, first_lev
  real, allocatable :: slab(:,:,:)

  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr)
  call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr)

  ! Every rank must own the same number of levels.
  if (mod(lm, nranks) /= 0) then
    write(0,*)'MPI_Comm_size=',nranks,' must evenly divide lm=', lm
    call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
  end if
  levs_per_rank = lm / nranks
  first_lev = (my_rank * levs_per_rank) + 1

  ! Create the file for parallel access on all ranks.
  rc = nf90_create('test.nc', &
                   cmode=ior(NF90_CLOBBER, NF90_NETCDF4), &
                   comm=MPI_COMM_WORLD, info=MPI_INFO_NULL, ncid=ncid)
  NC_ERR_STOP(rc)

  ! Disable automatic pre-filling of variables.
  rc = nf90_set_fill(ncid, NF90_NOFILL, prev_fill_mode)
  NC_ERR_STOP(rc)

  rc = nf90_def_dim(ncid, "im", im, dim_i)
  NC_ERR_STOP(rc)
  rc = nf90_def_dim(ncid, "jm", jm, dim_j)
  NC_ERR_STOP(rc)
  rc = nf90_def_dim(ncid, "lm", lm, dim_l)
  NC_ERR_STOP(rc)

  rc = nf90_def_var(ncid, "field", NF90_FLOAT, [dim_i, dim_j, dim_l], field_id)
  NC_ERR_STOP(rc)

  ! One chunk spanning the entire variable, compressed with deflate level 1
  ! (shuffle filter off), accessed collectively.
  rc = nf90_def_var_chunking(ncid, field_id, NF90_CHUNKED, [im, jm, lm])
  NC_ERR_STOP(rc)
  rc = nf90_def_var_deflate(ncid, field_id, NF90_NOSHUFFLE, 1, 1)
  NC_ERR_STOP(rc)
  rc = nf90_var_par_access(ncid, field_id, NF90_COLLECTIVE)
  NC_ERR_STOP(rc)

  rc = nf90_enddef(ncid)
  NC_ERR_STOP(rc)

  ! Fill this rank's block of levels with random data and write it at its
  ! offset along the third dimension.
  allocate(slab(im, jm, levs_per_rank))
  call random_number(slab)
  rc = nf90_put_var(ncid, field_id, values=slab, start=[1, 1, first_lev])
  NC_ERR_STOP(rc)
  deallocate(slab)

  rc = nf90_close(ncid=ncid)
  NC_ERR_STOP(rc)

  call MPI_Finalize(ierr)
end program test_netcdf_chunking
crashes with the following error on the HPC system (Intel compiler with Intel MPI):
0: Abort(671744002) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Type_contiguous: Invalid count, error stack:
0: PMPI_Type_contiguous(271): MPI_Type_contiguous(count=-1299292371, MPI_BYTE, new_type_p=0x7fff1d3777d4) failed
0: PMPI_Type_contiguous(238): Negative count, value is -1299292371
0: slurmstepd: error: *** STEP 58146089.1 ON h21c07 CANCELLED AT 2024-04-10T22:45:59 ***
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
srun: error: h21c07: tasks 0-3: Killed
srun: Terminating StepId=58146089.1
and on my laptop using gcc/13 and either OpenMPI or mpich:
$ mpirun -n 4 ./test_netcdf_chunking
[fedora:1007948] *** An error occurred in MPI_Type_contiguous
[fedora:1007948] *** reported by process [2256207873,0]
[fedora:1007948] *** on communicator MPI_COMM_WORLD
[fedora:1007948] *** MPI_ERR_COUNT: invalid count argument
[fedora:1007948] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[fedora:1007948] *** and potentially your MPI job)
$ mpirun -n 4 ./test_netcdf_chunking
Abort(205157890) on node 0 (rank 0 in comm 0): Fatal error in internal_Type_contiguous: Invalid count, error stack:
internal_Type_contiguous(75): MPI_Type_contiguous(count=-1311205156, MPI_BYTE, newtype=0x7ffdbae07a20) failed
internal_Type_contiguous(43): Negative count, value is -1311205156
Metadata
Metadata
Assignees
Labels
No labels