Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nvhpc FFT option #12

Open
wants to merge 5 commits into
base: hack_doconcurrent
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,12 @@ FFT = generic# generic,fftw3,mkl
#######CMP settings###########
ifeq ($(CMP),intel)
FC = mpiifort
#FFLAGS = -fpp -O3 -xHost -heap-arrays -shared-intel -mcmodel=large -safe-cray-ptr -g -traceback
FFLAGS = -fpp -O3 -xSSE4.2 -axAVX,CORE-AVX-I,CORE-AVX2 -ipo -fp-model fast=2 -mcmodel=large -safe-cray-ptr -I$(MPI_ROOT)/lib
##debugging test: -check all -check bounds -check uninit -gen-interfaces -warn interfaces
FFLAGS = -fpp -O3 -mavx2 -mlzcnt -march=core-avx2 -mf16c
FFLAGS += -fopenmp -I$(MPI_ROOT)/lib
else ifeq ($(CMP),gcc)
FC = mpif90
#FC = mpif90-mpich-mp
#FFLAGS = -O3 -funroll-loops -floop-optimize -g -Warray-bounds -fcray-pointer -x f95-cpp-input
FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu
#-cpp -O3 -funroll-loops -floop-optimize -g -Warray-bounds -fcray-pointer -fbacktrace -ffree-line-length-none -fallow-argument-mismatch
#-ffpe-trap=invalid,zero
FFLAGS = -cpp -O3 -march=native
FFLAGS += -fopenmp -ftree-parallelize-loops=12
else ifeq ($(CMP),nagfor)
FC = mpinagfor
FFLAGS = -fpp
Expand All @@ -36,7 +32,8 @@ FC = ftn
FFLAGS = -eF -g -O3 -N 1023
else ifeq ($(CMP),nvhpc)
FC = mpif90
FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -stdpar=gpu -gpu=cc80,managed -O3
FFLAGS = -cpp -O3 -march=native
FFLAGS += -Minfo=stdpar -stdpar=multicore -acc
#FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu -fast -O3 -Minstrument
endif

Expand Down Expand Up @@ -66,6 +63,9 @@ else ifeq ($(FFT),fftw3_f03)
INC=-I$(FFTW3_PATH)/include
LIBFFT=-L$(FFTW3_PATH)/lib -lfftw3 -lfftw3f
else ifeq ($(FFT),generic)
INC=
LIBFFT=
else ifeq ($(FFT),nvhpc)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably broken when FFT=nvhpc, because there seems to be no file "fft_nvhpc.f90" in decomp2d.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. This code is totally untested, but in principle this is how it should work: the generic option should be portable and should not link NVIDIA libraries.

INC=
LIBFFT=-lnvhpcwrapnvtx
else ifeq ($(FFT),mkl)
Expand All @@ -76,7 +76,7 @@ endif

#######OPTIONS settings###########
OPT = -I$(SRCDIR) -I$(DECOMPDIR) $(FFLAGS)
LINKOPT = $(FFLAGS) -lnvhpcwrapnvtx
LINKOPT = $(FFLAGS) #-lnvhpcwrapnvtx
#-----------------------------------------------------------------------
# Normally no need to change anything below

Expand Down
16 changes: 0 additions & 16 deletions src/navier.f90
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ SUBROUTINE solve_poisson(pp3, px1, py1, pz1, ux1, uy1, uz1)
USE decomp_2d_poisson, ONLY : poisson
USE variables, ONLY : nzm
USE param, ONLY : npress
use nvtx

implicit none

Expand All @@ -69,17 +68,11 @@ SUBROUTINE solve_poisson(pp3, px1, py1, pz1, ux1, uy1, uz1)

nlock = 1 !! Corresponds to computing div(u*)

call nvtxStartRange("divergence")
CALL divergence(pp3(:,:,:,1),ux1,uy1,uz1,nlock)
call nvtxEndRange
!
call nvtxStartRange("poisson")
CALL poisson(pp3(:,:,:,1))
call nvtxEndRange
!
call nvtxStartRange("gradp")
CALL gradp(px1,py1,pz1,pp3(:,:,:,1))
call nvtxEndRange

END SUBROUTINE solve_poisson
!############################################################################
Expand Down Expand Up @@ -137,7 +130,6 @@ subroutine divergence (pp3,ux1,uy1,uz1,nlock)
duxdxp2, uyp2, uzp2, duydypi2, upi2, ta2, &
duxydxyp3, uzp3, po3
USE MPI
use nvtx

implicit none

Expand Down Expand Up @@ -167,15 +159,9 @@ subroutine divergence (pp3,ux1,uy1,uz1,nlock)
call interxvp(pgy1,tb1,sx,x3d_op_intxvp,xsize(1),nxm,xsize(2),xsize(3))
call interxvp(pgz1,tc1,sx,x3d_op_intxvp,xsize(1),nxm,xsize(2),xsize(3))

call nvtxStartRange("Transpose xty pp1")
call transpose_x_to_y(pp1,duxdxp2,ph2)!->NXM NY NZ
call nvtxEndRange
call nvtxStartRange("Transpose xty pgy")
call transpose_x_to_y(pgy1,uyp2,ph2)
call nvtxEndRange
call nvtxStartRange("Transpose xty pgz")
call transpose_x_to_y(pgz1,uzp2,ph2)
call nvtxEndRange

!WORK Y-PENCILS
call interyvp(upi2,duxdxp2,sy,x3d_op_intyvp,(ph1%yen(1)-ph1%yst(1)+1),ysize(2),nym,ysize(3))
Expand All @@ -188,9 +174,7 @@ subroutine divergence (pp3,ux1,uy1,uz1,nlock)

call interyvp(upi2,uzp2,sy,x3d_op_intyvp,(ph1%yen(1)-ph1%yst(1)+1),ysize(2),nym,ysize(3))

call nvtxStartRange("Transpose ytz duy")
call transpose_y_to_z(duydypi2,duxydxyp3,ph3)!->NXM NYM NZ
call nvtxEndRange
call transpose_y_to_z(upi2,uzp3,ph3)

!WORK Z-PENCILS
Expand Down
14 changes: 6 additions & 8 deletions src/x3d_derive.f90
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,11 @@ subroutine derx_00(tx,ux,sx,x3dop,nx,ny,nz)
tx(nx,j,k) = afix*(ux(1,j,k)-ux(nx-1,j,k)) &
+ bfix*(ux(2,j,k)-ux(nx-2,j,k))

! Solve tri-diagonal system
call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz)

end subroutine derx_00

!********************************************************************
Expand Down Expand Up @@ -232,11 +233,11 @@ subroutine derx_ij(tx,ux,sx,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln)
tx(nx-1,j,k) = afmx*(ux(nx,j,k)-ux(nx-2,j,k))
tx(nx,j,k) = - afnx*ux(nx,j,k) - bfnx*ux(nx-1,j,k) - cfnx*ux(nx-2,j,k)
endif

! Solve tri-diagonal system
call xthomas(tx(1:nx,j:j,k:k), ff, fs, fw, nx, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, ff, fs, fw, nx, ny, nz)

end subroutine derx_ij

!********************************************************************
Expand Down Expand Up @@ -526,7 +527,6 @@ end subroutine dery_22
subroutine derz_00(tz,uz,sz,x3dop,nx,ny,nz)

use x3d_operator_z_data
use nvtx

implicit none

Expand Down Expand Up @@ -570,9 +570,7 @@ subroutine derz_00(tz,uz,sz,x3dop,nx,ny,nz)
enddo

! Solve tri-diagonal system
call nvtxStartRange("zthomas")
call zthomas(tz, sz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz)
call nvtxEndRange

end subroutine derz_00

Expand Down
48 changes: 24 additions & 24 deletions src/x3d_staggered.f90
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,11 @@ subroutine derxvp(tx,ux,sx,x3dop,nx,nxm,ny,nz)
+ bcix6*(ux(1 ,j,k)-ux(nx-2,j,k))
tx(nx ,j,k) = acix6*(ux(1,j,k)-ux(nx ,j,k)) &
+ bcix6*(ux(2,j,k)-ux(nx-1,j,k))

! Solve tri-diagonal system
call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz)

else
! nxm = nx-1
do concurrent (k=1:nz, j=1:ny)
Expand Down Expand Up @@ -117,11 +117,11 @@ subroutine derxvp(tx,ux,sx,x3dop,nx,nxm,ny,nz)
tx(nxm,j,k) = acix6*(ux(nx,j,k)-ux(nxm,j,k)) &
+ bcix6*(two*ux(nx,j,k)-ux(nxm,j,k)-ux(nxm-1,j,k))
endif

! Solve tri-diagonal system
call xthomas(tx(1:nxm,j:j,k:k), x3dop%f, x3dop%s, x3dop%w, nxm, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nxm, ny, nz)

endif

end subroutine derxvp
Expand Down Expand Up @@ -183,11 +183,11 @@ subroutine interxvp(tx,ux,sx,x3dop,nx,nxm,ny,nz)
+ bicix6*(ux(2,j,k)+ux(nx-1,j,k)) &
+ cicix6*(ux(3,j,k)+ux(nx-2,j,k)) &
+ dicix6*(ux(4,j,k)+ux(nx-3,j,k))

! Solve tri-diagonal system
call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz)

else
! nxm = nx-1
if (x3dop%npaire==1) then
Expand Down Expand Up @@ -224,11 +224,11 @@ subroutine interxvp(tx,ux,sx,x3dop,nx,nxm,ny,nz)
+ bicix6*(ux(nxm,j,k)+ux(nxm-1,j,k)) &
+ cicix6*(ux(nxm-1,j,k)+ux(nxm-2,j,k)) &
+ dicix6*(ux(nxm-2,j,k)+ux(nxm-3,j,k))

! Solve tri-diagonal system
call xthomas(tx(1:nxm,j:j,k:k), x3dop%f, x3dop%s, x3dop%w, nxm, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nxm, ny, nz)

endif
endif

Expand Down Expand Up @@ -269,11 +269,11 @@ subroutine derxpv(tx,ux,sx,x3dop,nxm,nx,ny,nz)
+ bcix6*(ux(nx ,j,k)-ux(nx-3,j,k))
tx(nx ,j,k) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) &
+ bcix6*(ux(1,j,k)-ux(nx-2,j,k))

! Solve tri-diagonal system
call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz)

else
! nxm = nx-1
if (x3dop%npaire==1) then
Expand All @@ -290,11 +290,11 @@ subroutine derxpv(tx,ux,sx,x3dop,nxm,nx,ny,nz)
tx(nx-1,j,k) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) &
+ bcix6*(ux(nx-1,j,k)-ux(nx-3,j,k))
tx(nx,j,k) = zero

! Solve tri-diagonal system
call xthomas(tx(1:nx,j:j,k:k), x3dop%f, x3dop%s, x3dop%w, nx, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz)

endif
endif

Expand Down Expand Up @@ -357,11 +357,11 @@ subroutine interxpv(tx,ux,sx,x3dop,nxm,nx,ny,nz)
+ bicix6*(ux(1,j,k)+ux(nx-2,j,k)) &
+ cicix6*(ux(2,j,k)+ux(nx-3,j,k)) &
+ dicix6*(ux(3,j,k)+ux(nx-4,j,k))

! Solve tri-diagonal system
call xthomas(tx(1:nx,j:j,k:k), sx(j:j,k:k), x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, sx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz)

else
! nxm = nx-1
if (x3dop%npaire==1) then
Expand Down Expand Up @@ -406,11 +406,11 @@ subroutine interxpv(tx,ux,sx,x3dop,nxm,nx,ny,nz)
+ bicix6*(ux(nx-2,j,k)+ux(nx-2,j,k)) &
+ cicix6*(ux(nx-3,j,k)+ux(nx-3,j,k)) &
+ dicix6*(ux(nx-4,j,k)+ux(nx-4,j,k))

! Solve tri-diagonal system
call xthomas(tx(1:nx,j:j,k:k), x3dop%f, x3dop%s, x3dop%w, nx, 1, 1)
enddo

! Solve tri-diagonal system
call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz)

endif
endif

Expand Down
5 changes: 0 additions & 5 deletions src/xcompact3d.f90
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ program xcompact3d
use transeq, only : calculate_transeq_rhs
use navier, only : solve_poisson, cor_vel
use mom, only : test_du, test_dv, test_dw
use nvtx

implicit none

Expand All @@ -65,9 +64,7 @@ program xcompact3d

tstart = MPI_Wtime()

call nvtxStartRange("transeq")
call calculate_transeq_rhs(dux1,duy1,duz1,ux1,uy1,uz1)
call nvtxEndRange
do concurrent (k=1:xsize(3), j=1:xsize(2), i=1:xsize(1))
ux1(i,j,k) = ux1(i,j,k) + dt * dux1(i,j,k,1)
uy1(i,j,k) = uy1(i,j,k) + dt * duy1(i,j,k,1)
Expand All @@ -77,9 +74,7 @@ program xcompact3d
!do concurrent (k=1:zsize(3), j=1:zsize(2), i=1:zsize(1))
! divu3(:,:,:) = zero
!enddo
call nvtxStartRange("solve_poisson")
call solve_poisson(pp3,px1,py1,pz1,ux1,uy1,uz1)
call nvtxEndRange
call cor_vel(ux1,uy1,uz1,px1,py1,pz1)

tend = MPI_Wtime()
Expand Down