From 565e28e17d4e6fd0dd34ed1ff4fc4a23cd6184b7 Mon Sep 17 00:00:00 2001 From: CFLAG Date: Wed, 2 Feb 2022 21:03:41 +0100 Subject: [PATCH 1/5] Remove nvtx profiling commands --- src/navier.f90 | 16 ---------------- src/x3d_derive.f90 | 3 --- src/xcompact3d.f90 | 5 ----- 3 files changed, 24 deletions(-) diff --git a/src/navier.f90 b/src/navier.f90 index 7a129e4..783e522 100644 --- a/src/navier.f90 +++ b/src/navier.f90 @@ -53,7 +53,6 @@ SUBROUTINE solve_poisson(pp3, px1, py1, pz1, ux1, uy1, uz1) USE decomp_2d_poisson, ONLY : poisson USE variables, ONLY : nzm USE param, ONLY : npress - use nvtx implicit none @@ -69,17 +68,11 @@ SUBROUTINE solve_poisson(pp3, px1, py1, pz1, ux1, uy1, uz1) nlock = 1 !! Corresponds to computing div(u*) - call nvtxStartRange("divergence") CALL divergence(pp3(:,:,:,1),ux1,uy1,uz1,nlock) - call nvtxEndRange ! - call nvtxStartRange("poisson") CALL poisson(pp3(:,:,:,1)) - call nvtxEndRange ! - call nvtxStartRange("gradp") CALL gradp(px1,py1,pz1,pp3(:,:,:,1)) - call nvtxEndRange END SUBROUTINE solve_poisson !############################################################################ @@ -137,7 +130,6 @@ subroutine divergence (pp3,ux1,uy1,uz1,nlock) duxdxp2, uyp2, uzp2, duydypi2, upi2, ta2, & duxydxyp3, uzp3, po3 USE MPI - use nvtx implicit none @@ -167,15 +159,9 @@ subroutine divergence (pp3,ux1,uy1,uz1,nlock) call interxvp(pgy1,tb1,sx,x3d_op_intxvp,xsize(1),nxm,xsize(2),xsize(3)) call interxvp(pgz1,tc1,sx,x3d_op_intxvp,xsize(1),nxm,xsize(2),xsize(3)) - call nvtxStartRange("Transpose xty pp1") call transpose_x_to_y(pp1,duxdxp2,ph2)!->NXM NY NZ - call nvtxEndRange - call nvtxStartRange("Transpose xty pgy") call transpose_x_to_y(pgy1,uyp2,ph2) - call nvtxEndRange - call nvtxStartRange("Transpose xty pgz") call transpose_x_to_y(pgz1,uzp2,ph2) - call nvtxEndRange !WORK Y-PENCILS call interyvp(upi2,duxdxp2,sy,x3d_op_intyvp,(ph1%yen(1)-ph1%yst(1)+1),ysize(2),nym,ysize(3)) @@ -188,9 +174,7 @@ subroutine divergence (pp3,ux1,uy1,uz1,nlock) call interyvp(upi2,uzp2,sy,x3d_op_intyvp,(ph1%yen(1)-ph1%yst(1)+1),ysize(2),nym,ysize(3)) - call nvtxStartRange("Transpose ytz duy") call transpose_y_to_z(duydypi2,duxydxyp3,ph3)!->NXM NYM NZ - call nvtxEndRange call transpose_y_to_z(upi2,uzp3,ph3) !WORK Z-PENCILS diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index 8768d3e..f247cb9 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -526,7 +526,6 @@ end subroutine dery_22 subroutine derz_00(tz,uz,sz,x3dop,nx,ny,nz) use x3d_operator_z_data - use nvtx implicit none @@ -570,9 +569,7 @@ subroutine derz_00(tz,uz,sz,x3dop,nx,ny,nz) enddo ! Solve tri-diagonal system - call nvtxStartRange("zthomas") call zthomas(tz, sz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - call nvtxEndRange end subroutine derz_00 diff --git a/src/xcompact3d.f90 b/src/xcompact3d.f90 index dda827a..bd1340d 100644 --- a/src/xcompact3d.f90 +++ b/src/xcompact3d.f90 @@ -40,7 +40,6 @@ program xcompact3d use transeq, only : calculate_transeq_rhs use navier, only : solve_poisson, cor_vel use mom, only : test_du, test_dv, test_dw - use nvtx implicit none @@ -65,9 +64,7 @@ program xcompact3d tstart = MPI_Wtime() - call nvtxStartRange("transeq") call calculate_transeq_rhs(dux1,duy1,duz1,ux1,uy1,uz1) - call nvtxEndRange do concurrent (k=1:xsize(3), j=1:xsize(2), i=1:xsize(1)) ux1(i,j,k) = ux1(i,j,k) + dt * dux1(i,j,k,1) uy1(i,j,k) = uy1(i,j,k) + dt * duy1(i,j,k,1) @@ -77,9 +74,7 @@ program xcompact3d !do concurrent (k=1:zsize(3), j=1:zsize(2), i=1:zsize(1)) ! divu3(:,:,:) = zero !enddo - call nvtxStartRange("solve_poisson") call solve_poisson(pp3,px1,py1,pz1,ux1,uy1,uz1) - call nvtxEndRange call cor_vel(ux1,uy1,uz1,px1,py1,pz1) tend = MPI_Wtime() From 87f73bfaf6d3a1a3d7429486384b9c2c19cdda8c Mon Sep 17 00:00:00 2001 From: CFLAG Date: Wed, 2 Feb 2022 21:07:57 +0100 Subject: [PATCH 2/5] Switch makefile to gcc --- Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index e8e55dd..9429f13 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ DEFS = -DDOUBLE_PREC -DVERSION=\"$(GIT_VERSION)\" LCL = local# local,lad,sdu,archer IVER = 17# 15,16,17,18 -CMP = nvhpc# intel,gcc,nvhpc +CMP = gcc# intel,gcc,nvhpc FFT = generic# generic,fftw3,mkl #######CMP settings########### @@ -24,8 +24,8 @@ FFLAGS = -fpp -O3 -xSSE4.2 -axAVX,CORE-AVX-I,CORE-AVX2 -ipo -fp-model fast=2 -mc else ifeq ($(CMP),gcc) FC = mpif90 #FC = mpif90-mpich-mp -#FFLAGS = -O3 -funroll-loops -floop-optimize -g -Warray-bounds -fcray-pointer -x f95-cpp-input -FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu +FFLAGS = -O3 -funroll-loops -floop-optimize -Warray-bounds -fcray-pointer -cpp +#FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu #-cpp -O3 -funroll-loops -floop-optimize -g -Warray-bounds -fcray-pointer -fbacktrace -ffree-line-length-none -fallow-argument-mismatch #-ffpe-trap=invalid,zero else ifeq ($(CMP),nagfor) @@ -67,7 +67,7 @@ else ifeq ($(FFT),fftw3_f03) LIBFFT=-L$(FFTW3_PATH)/lib -lfftw3 -lfftw3f else ifeq ($(FFT),generic) INC= - LIBFFT=-lnvhpcwrapnvtx + LIBFFT=#-lnvhpcwrapnvtx else ifeq ($(FFT),mkl) SRCDECOMP := $(DECOMPDIR)/mkl_dfti.f90 $(SRCDECOMP) LIBFFT=-Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread @@ -76,7 +76,7 @@ endif #######OPTIONS settings########### OPT = -I$(SRCDIR) -I$(DECOMPDIR) $(FFLAGS) -LINKOPT = $(FFLAGS) -lnvhpcwrapnvtx +LINKOPT = $(FFLAGS) #-lnvhpcwrapnvtx #----------------------------------------------------------------------- # Normally no need to change anything below From f17793bad2330029af647f1f67f1a415477d2140 Mon Sep 17 00:00:00 2001 From: CFLAG Date: Wed, 2 Feb 2022 22:31:56 +0100 Subject: [PATCH 3/5] Fix for nvidia compiler --- src/x3d_derive.f90 | 11 +++++----- src/x3d_staggered.f90 | 48 +++++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index f247cb9..9a0baeb 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -171,10 +171,11 @@ subroutine derx_00(tx,ux,sx,x3dop,nx,ny,nz) tx(nx,j,k) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & + bfix*(ux(2,j,k)-ux(nx-2,j,k)) - ! Solve tri-diagonal system - call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + end subroutine derx_00 !******************************************************************** @@ -232,11 +233,11 @@ subroutine derx_ij(tx,ux,sx,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) tx(nx-1,j,k) = afmx*(ux(nx,j,k)-ux(nx-2,j,k)) tx(nx,j,k) = - afnx*ux(nx,j,k) - bfnx*ux(nx-1,j,k) - cfnx*ux(nx-2,j,k) endif - - ! Solve tri-diagonal system - call xthomas(tx(1:nx,j:j,k:k), ff, fs, fw, nx, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, ff, fs, fw, nx, ny, nz) + end subroutine derx_ij !******************************************************************** diff --git a/src/x3d_staggered.f90 b/src/x3d_staggered.f90 index 2bc3181..ce1c584 100644 --- a/src/x3d_staggered.f90 +++ b/src/x3d_staggered.f90 @@ -81,11 +81,11 @@ subroutine derxvp(tx,ux,sx,x3dop,nx,nxm,ny,nz) + bcix6*(ux(1 ,j,k)-ux(nx-2,j,k)) tx(nx ,j,k) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + bcix6*(ux(2,j,k)-ux(nx-1,j,k)) - - ! Solve tri-diagonal system - call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + else ! nxm = nx-1 do concurrent (k=1:nz, j=1:ny) @@ -117,11 +117,11 @@ subroutine derxvp(tx,ux,sx,x3dop,nx,nxm,ny,nz) tx(nxm,j,k) = acix6*(ux(nx,j,k)-ux(nxm,j,k)) & + bcix6*(two*ux(nx,j,k)-ux(nxm,j,k)-ux(nxm-1,j,k)) endif - - ! Solve tri-diagonal system - call xthomas(tx(1:nxm,j:j,k:k), x3dop%f, x3dop%s, x3dop%w, nxm, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nxm, ny, nz) + endif end subroutine derxvp @@ -183,11 +183,11 @@ subroutine interxvp(tx,ux,sx,x3dop,nx,nxm,ny,nz) + bicix6*(ux(2,j,k)+ux(nx-1,j,k)) & + cicix6*(ux(3,j,k)+ux(nx-2,j,k)) & + dicix6*(ux(4,j,k)+ux(nx-3,j,k)) - - ! Solve tri-diagonal system - call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + else ! nxm = nx-1 if (x3dop%npaire==1) then @@ -224,11 +224,11 @@ subroutine interxvp(tx,ux,sx,x3dop,nx,nxm,ny,nz) + bicix6*(ux(nxm,j,k)+ux(nxm-1,j,k)) & + cicix6*(ux(nxm-1,j,k)+ux(nxm-2,j,k)) & + dicix6*(ux(nxm-2,j,k)+ux(nxm-3,j,k)) - - ! Solve tri-diagonal system - call xthomas(tx(1:nxm,j:j,k:k), x3dop%f, x3dop%s, x3dop%w, nxm, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nxm, ny, nz) + endif endif @@ -269,11 +269,11 @@ subroutine derxpv(tx,ux,sx,x3dop,nxm,nx,ny,nz) + bcix6*(ux(nx ,j,k)-ux(nx-3,j,k)) tx(nx ,j,k) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + bcix6*(ux(1,j,k)-ux(nx-2,j,k)) - - ! Solve tri-diagonal system - call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + else ! nxm = nx-1 if (x3dop%npaire==1) then @@ -290,11 +290,11 @@ subroutine derxpv(tx,ux,sx,x3dop,nxm,nx,ny,nz) tx(nx-1,j,k) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + bcix6*(ux(nx-1,j,k)-ux(nx-3,j,k)) tx(nx,j,k) = zero - - ! Solve tri-diagonal system - call xthomas(tx(1:nx,j:j,k:k), x3dop%f, x3dop%s, x3dop%w, nx, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + endif endif @@ -357,11 +357,11 @@ subroutine interxpv(tx,ux,sx,x3dop,nxm,nx,ny,nz) + bicix6*(ux(1,j,k)+ux(nx-2,j,k)) & + cicix6*(ux(2,j,k)+ux(nx-3,j,k)) & + dicix6*(ux(3,j,k)+ux(nx-4,j,k)) - - ! Solve tri-diagonal system - call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + else ! nxm = nx-1 if (x3dop%npaire==1) then @@ -406,11 +406,11 @@ subroutine interxpv(tx,ux,sx,x3dop,nxm,nx,ny,nz) + bicix6*(ux(nx-2,j,k)+ux(nx-2,j,k)) & + cicix6*(ux(nx-3,j,k)+ux(nx-3,j,k)) & + dicix6*(ux(nx-4,j,k)+ux(nx-4,j,k)) - - ! Solve tri-diagonal system - call xthomas(tx(1:nx,j:j,k:k), x3dop%f, x3dop%s, x3dop%w, nx, 1, 1) enddo + ! Solve tri-diagonal system + call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + endif endif From a3948bee627fcb4cca13aabf8c673eddf51e8ac2 Mon Sep 17 00:00:00 2001 From: CFLAG Date: Thu, 3 Feb 2022 09:45:31 +0100 Subject: [PATCH 4/5] Update Makefile for benchmark --- Makefile | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 9429f13..7abb8fd 100644 --- a/Makefile +++ b/Makefile @@ -12,22 +12,18 @@ DEFS = -DDOUBLE_PREC -DVERSION=\"$(GIT_VERSION)\" LCL = local# local,lad,sdu,archer IVER = 17# 15,16,17,18 -CMP = gcc# intel,gcc,nvhpc +CMP = nvhpc# intel,gcc,nvhpc FFT = generic# generic,fftw3,mkl #######CMP settings########### ifeq ($(CMP),intel) FC = mpiifort -#FFLAGS = -fpp -O3 -xHost -heap-arrays -shared-intel -mcmodel=large -safe-cray-ptr -g -traceback -FFLAGS = -fpp -O3 -xSSE4.2 -axAVX,CORE-AVX-I,CORE-AVX2 -ipo -fp-model fast=2 -mcmodel=large -safe-cray-ptr -I$(MPI_ROOT)/lib -##debuggin test: -check all -check bounds -chintel eck uninit -gen-interfaces -warn interfaces +FFLAGS = -fpp -O3 -mavx2 -mlzcnt -march=core-avx2 -mf16c +FFLAGS += -fopenmp -I$(MPI_ROOT)/lib else ifeq ($(CMP),gcc) FC = mpif90 -#FC = mpif90-mpich-mp -FFLAGS = -O3 -funroll-loops -floop-optimize -Warray-bounds -fcray-pointer -cpp -#FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu -#-cpp -O3 -funroll-loops -floop-optimize -g -Warray-bounds -fcray-pointer -fbacktrace -ffree-line-length-none -fallow-argument-mismatch -#-ffpe-trap=invalid,zero +FFLAGS = -cpp -O3 -march=native +FFLAGS += -fopenmp -ftree-parallelize-loops=12 else ifeq ($(CMP),nagfor) FC = mpinagfor FFLAGS = -fpp @@ -36,7 +32,8 @@ FC = ftn FFLAGS = -eF -g -O3 -N 1023 else ifeq ($(CMP),nvhpc) FC = mpif90 -FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -stdpar=gpu -gpu=cc80,managed -O3 +FFLAGS = -cpp -O3 -march=native +FFLAGS += -Minfo=stdpar -stdpar=multicore -acc #FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu -fast -O3 -Minstrument endif From 6453acc011fb946efe44aa83ffe2cc65fbf68e37 Mon Sep 17 00:00:00 2001 From: Paul Bartholomew Date: Thu, 3 Feb 2022 10:31:23 +0000 Subject: [PATCH 5/5] Add an nvhpc FFT target --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7abb8fd..5a744a2 100644 --- a/Makefile +++ b/Makefile @@ -64,7 +64,10 @@ else ifeq ($(FFT),fftw3_f03) LIBFFT=-L$(FFTW3_PATH)/lib -lfftw3 -lfftw3f else ifeq ($(FFT),generic) INC= - LIBFFT=#-lnvhpcwrapnvtx + LIBFFT= +else ifeq ($(FFT),nvhpc) + INC= + LIBFFT=-lnvhpcwrapnvtx else ifeq ($(FFT),mkl) SRCDECOMP := $(DECOMPDIR)/mkl_dfti.f90 $(SRCDECOMP) LIBFFT=-Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread