#include <misc.h>
#include <params.h>
!-----------------------------------------------------------------------
!
! Purpose:
! Reallocation routines for the Fourier coefficients
!
! Method:
! 1) After FFT preceding Legendre analysis, reallocate fftbuf
! to decompose over wavenumber, recombining latitudes.
! 2) Before FFT following Legendre synthesis, reallocate fftbuf
! to recombine wavenumbers, decomposing over latitude.
!
!-----------------------------------------------------------------------
!
! $Id: realloc4.F90,v 1.3.22.6 2004/04/28 23:43:56 eaton Exp $
! $Author: eaton $
!
!-----------------------------------------------------------------------
subroutine realloc4a(nlon_fft_in, nlon_fft_out, fftbuf_in, fftbuf_out ) 1,11
!-----------------------------------------------------------------------
!
! Purpose:
! Reallocation routines for the Fourier coefficients
!
! Method:
! After FFT preceding Legendre analysis, reallocate fftbuf
! to decompose over wavenumber, combining latitudes.
!
! Author:
! Original version: J. Rosinski
! Standardized: J. Rosinski, Oct 1995
! J. Truesdale, Feb. 1996
! Modified: P. Worley, September 2002, December 2003
!
!-----------------------------------------------------------------------
#ifdef SPMD
use shr_kind_mod
, only: r8 => shr_kind_r8
use pmgrid
use pspect
use comspe
use spmd_dyn
use mpishorthand
use swap_comm
!-----------------------------------------------------------------------
implicit none
!-----------------------------------------------------------------------
#include <comsta.h>
!------------------------------Parameters-------------------------------
!
integer, parameter :: msgtag = 1000
!---------------------------Input arguments-----------------------------
!
integer, intent(in) :: nlon_fft_in ! first dimension of input array
integer, intent(in) :: nlon_fft_out ! first dimension of output array
real(r8), intent(in) :: fftbuf_in(nlon_fft_in,plev,9,beglat:endlat)
! buffer used for in-place FFTs
real(r8), intent(out) :: fftbuf_out(nlon_fft_out,plev,9,plat)
! buffer used for reordered Fourier coefficients
!
!---------------------------Local workspace-----------------------------
!
! xxx_l: local decomposition
! xxx_r: remote decomposition
integer procid
integer length_r, length_l
integer bpos
integer step, ifld, k, i
integer lat_l, lat_r, beglat_r, endlat_r
integer offset_s ! send displacement + 1
integer offset_r ! receive displacement + 1
integer sndids(realloc4_steps) ! nonblocking MPI send request ids
integer rcvids(realloc4_steps) ! nonblocking MPI recv request ids
integer sndcnts(0:npes-1), sdispls(0:npes-1)
integer rcvcnts(0:npes-1), rdispls(0:npes-1)
logical delayed_recv ! local copy of delayed_swap_recv flag
!-----------------------------------------------------------------------
! Compute send/recv counts and displacements
sndcnts(:) = 0
do step=1,realloc4_steps
procid = realloc4_proc(step)
length_r = 2*numm(procid)
sndcnts(procid) = length_r*(plev*8 + 1)*numlats
enddo
!
sdispls(0) = 0
do procid=1,npes-1
sdispls(procid) = sdispls(procid-1) + sndcnts(procid-1)
enddo
!
length_l = 2*numm(iam)
rcvcnts(:) = 0
do step=1,realloc4_steps
procid = realloc4_proc(step)
rcvcnts(procid) = length_l*(plev*8 + 1)*nlat_p(procid)
enddo
!
rdispls(0) = 0
do procid=1,npes-1
rdispls(procid) = rdispls(procid-1) + rcvcnts(procid-1)
enddo
!
if (dyn_alltoall .eq. 0) then
!
! Copy local data to new location
length_l = 2*numm(iam)
!$omp parallel do private(lat_l, ifld, k, i)
!DIR$ NEXTSCALAR, NOSTREAM
do lat_l=beglat,endlat
!DIR$ STREAM
do ifld=1,8
!DIR$ PREFERVECTOR, PREFERSTREAM
do k=1,plev
do i=1,length_l
fftbuf_out(i,k,ifld,lat_l) = fftbuf_in(locrm(i,iam),k,ifld,lat_l)
enddo
enddo
enddo
do i=1,length_l
fftbuf_out(i,1,9,lat_l) = fftbuf_in(locrm(i,iam),1,9,lat_l)
enddo
enddo
!
! Fill message buffer
!$OMP PARALLEL DO PRIVATE (STEP, PROCID, LENGTH_R, BPOS, LAT_L, IFLD, K, I)
!CSD$ PARALLEL DO PRIVATE (STEP, PROCID, LENGTH_R, BPOS, LAT_L, IFLD, K, I)
do step=1,realloc4_steps
procid = realloc4_proc(step)
length_r = 2*numm(procid)
!
bpos = sdispls(procid)
do lat_l=beglat,endlat
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
do i=1,length_r
buf1(bpos+i) = fftbuf_in(locrm(i,procid),k,ifld,lat_l)
enddo
bpos = bpos+length_r
enddo
enddo
do i=1,length_r
buf1(bpos+i) = fftbuf_in(locrm(i,procid),1,9,lat_l)
enddo
bpos = bpos+length_r
enddo
enddo
!CSD$ END PARALLEL DO
!
! Get remote data
!
call mpialltoallv
(buf1, sndcnts, sdispls, mpir8, &
buf2, rcvcnts, rdispls, mpir8, &
mpicom)
!
! Copy out of message buffers
!
!$OMP PARALLEL DO PRIVATE (STEP, PROCID, BEGLAT_R, ENDLAT_R, BPOS, LAT_R, IFLD, K, I)
!CSD$ PARALLEL DO PRIVATE (STEP, PROCID, BEGLAT_R, ENDLAT_R, BPOS, LAT_R, IFLD, K, I)
do step=1,realloc4_steps
procid = realloc4_proc(step)
beglat_r = cut(1,procid)
endlat_r = cut(2,procid)
bpos = rdispls(procid)
do lat_r=beglat_r,endlat_r
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
do i=1,length_l
fftbuf_out(i,k,ifld,lat_r) = buf2(bpos+i)
enddo
bpos = bpos+length_l
enddo
enddo
do i=1,length_l
fftbuf_out(i,1,9,lat_r) = buf2(bpos+i)
enddo
bpos = bpos+length_l
enddo
!
end do
!CSD$ END PARALLEL DO
else
!
delayed_recv = delayed_swap_recv
()
!
! Post receive requests
call swap1m
(realloc4_steps, msgtag, realloc4_proc, rcvcnts, &
rdispls, spmdbuf_siz, buf2, rcvids)
!
! Copy local data to new location
length_l = 2*numm(iam)
!DIR$ NEXTSCALAR, NOSTREAM
do lat_l=beglat,endlat
!DIR$ STREAM
do ifld=1,8
!DIR$ PREFERVECTOR, PREFERSTREAM
do k=1,plev
do i=1,length_l
fftbuf_out(i,k,ifld,lat_l) = fftbuf_in(locrm(i,iam),k,ifld,lat_l)
enddo
enddo
enddo
do i=1,length_l
fftbuf_out(i,1,9,lat_l) = fftbuf_in(locrm(i,iam),1,9,lat_l)
enddo
enddo
!
! Send data, receive data (depending on communication protocol)
do step=1,realloc4_steps
procid = realloc4_proc(step)
length_r = 2*numm(procid)
!
bpos = sdispls(procid)
do lat_l=beglat,endlat
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
do i=1,length_r
buf1(bpos+i) = fftbuf_in(locrm(i,procid),k,ifld,lat_l)
enddo
bpos = bpos+length_r
enddo
enddo
do i=1,length_r
buf1(bpos+i) = fftbuf_in(locrm(i,procid),1,9,lat_l)
enddo
bpos = bpos+length_r
enddo
offset_s = sdispls(procid)+1
offset_r = rdispls(procid)+1
call swap2
(msgtag, procid, sndcnts(procid), buf1(offset_s), sndids(step), &
rcvcnts(procid), buf2(offset_r), rcvids(step))
if (.not. delayed_recv) then
beglat_r = cut(1,procid)
endlat_r = cut(2,procid)
bpos = rdispls(procid)
do lat_r=beglat_r,endlat_r
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
do i=1,length_l
fftbuf_out(i,k,ifld,lat_r) = buf2(bpos+i)
enddo
bpos = bpos+length_l
enddo
enddo
do i=1,length_l
fftbuf_out(i,1,9,lat_r) = buf2(bpos+i)
enddo
bpos = bpos+length_l
enddo
endif
!
enddo
!
! Wait for any outstanding send or receive requests to complete.
call swap3m
(realloc4_steps, msgtag, realloc4_proc, sndids, rcvcnts, &
rdispls, spmdbuf_siz, buf2, rcvids)
!
if ( delayed_recv ) then
!
! Copy out of message buffers
!
!$OMP PARALLEL DO PRIVATE (STEP, PROCID, BEGLAT_R, ENDLAT_R, BPOS, LAT_R, IFLD, K, I)
!CSD$ PARALLEL DO PRIVATE (STEP, PROCID, BEGLAT_R, ENDLAT_R, BPOS, LAT_R, IFLD, K, I)
do step=1,realloc4_steps
procid = realloc4_proc(step)
beglat_r = cut(1,procid)
endlat_r = cut(2,procid)
bpos = rdispls(procid)
do lat_r=beglat_r,endlat_r
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
do i=1,length_l
fftbuf_out(i,k,ifld,lat_r) = buf2(bpos+i)
enddo
bpos = bpos+length_l
enddo
enddo
do i=1,length_l
fftbuf_out(i,1,9,lat_r) = buf2(bpos+i)
enddo
bpos = bpos+length_l
enddo
!
end do
!CSD$ END PARALLEL DO
endif
endif
#endif
return
end subroutine realloc4a
subroutine realloc4b(nlon_fft_in, nlon_fft_out, fftbuf_in, fftbuf_out ) 1,11
!-----------------------------------------------------------------------
!
! Purpose:
! Reallocation routines for the Fourier coefficients
!
! Method:
! Before FFT following Legendre synthesis, reallocate fftbuf
! to combine wavenumbers, decomposing over latitude.
!
! Author: P. Worley, September 2002
! Modified: P. Worley, December 2003
!
!-----------------------------------------------------------------------
#ifdef SPMD
use shr_kind_mod
, only: r8 => shr_kind_r8
use pmgrid
use pspect
use comspe
use spmd_dyn
use mpishorthand
use swap_comm
!-----------------------------------------------------------------------
implicit none
!-----------------------------------------------------------------------
#include <comsta.h>
!------------------------------Parameters-------------------------------
!
integer, parameter :: msgtag = 2000
!---------------------------Input arguments--------------------------
!
integer, intent(in) :: nlon_fft_in ! first dimension of input array
integer, intent(in) :: nlon_fft_out ! first dimension of output array
real(r8), intent(in) :: fftbuf_in(nlon_fft_in,8,plevp,plat)
! buffer of Fourier coefficients to be reordered
real(r8), intent(out) :: fftbuf_out(nlon_fft_out,8,plevp,beglat:endlat)
! buffer used for in-place FFTs
!
!---------------------------Local workspace-----------------------------
!
! xxx_l: local decomposition
! xxx_r: remote decomposition
integer procid
integer length_r, length_l
integer bpos
integer step, ifld, k, i
integer lat_l, lat_r
integer beglat_r, endlat_r
integer offset_s ! send displacement + 1
integer offset_r ! receive displacement + 1
integer sndids(realloc4_steps) ! nonblocking MPI send request ids
integer rcvids(realloc4_steps) ! nonblocking MPI recv request ids
integer sndcnts(0:npes-1), sdispls(0:npes-1)
integer rcvcnts(0:npes-1), rdispls(0:npes-1)
logical delayed_recv ! local copy of delayed_swap_recv flag
!-----------------------------------------------------------------------
! Compute send/recv counts and displacements
length_l = 2*numm(iam)
sndcnts(:) = 0
do step=1,realloc4_steps
procid = realloc4_proc(step)
sndcnts(procid) = length_l*(8*plev + 4)*nlat_p(procid)
enddo
!
sdispls(0) = 0
do procid=1,npes-1
sdispls(procid) = sdispls(procid-1) + sndcnts(procid-1)
enddo
!
rcvcnts(:) = 0
do step=1,realloc4_steps
procid = realloc4_proc(step)
length_r = 2*numm(procid)
rcvcnts(procid) = length_r*(8*plev + 4)*numlats
enddo
!
rdispls(0) = 0
do procid=1,npes-1
rdispls(procid) = rdispls(procid-1) + rcvcnts(procid-1)
enddo
!
if (dyn_alltoall .eq. 0) then
!
! Copy local data to new location
length_l = 2*numm(iam)
!$omp parallel do private(lat_l, ifld, k, i)
!DIR$ NEXTSCALAR, NOSTREAM
do lat_l=beglat,endlat
!DIR$ STREAM
do k=1,plev
!DIR$ PREFERVECTOR, PREFERSTREAM
do ifld=1,8
do i=1,length_l
fftbuf_out(locrm(i,iam),ifld,k,lat_l) = fftbuf_in(i,ifld,k,lat_l)
enddo
enddo
enddo
!
do ifld=1,4
do i=1,length_l
fftbuf_out(locrm(i,iam),ifld,plevp,lat_l) = fftbuf_in(i,ifld,plevp,lat_l)
enddo
enddo
enddo
!
! Fill message buffer
!$OMP PARALLEL DO PRIVATE (STEP, PROCID, BEGLAT_R, ENDLAT_R, BPOS, LAT_R, K, IFLD, I)
!CSD$ PARALLEL DO PRIVATE (STEP, PROCID, BEGLAT_R, ENDLAT_R, BPOS, LAT_R, K, IFLD, I)
do step=1,realloc4_steps
procid = realloc4_proc(step)
beglat_r = cut(1,procid)
endlat_r = cut(2,procid)
bpos = sdispls(procid)
!
do lat_r=beglat_r,endlat_r
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
do i=1,length_l
buf1(bpos+i) = fftbuf_in(i,ifld,k,lat_r)
enddo
bpos = bpos+length_l
enddo
enddo
do ifld=1,4
do i=1,length_l
buf1(bpos+i) = fftbuf_in(i,ifld,plevp,lat_r)
enddo
bpos = bpos+length_l
enddo
enddo
enddo
!CSD$ END PARALLEL DO
!
! Get remote data
!
call mpialltoallv
(buf1, sndcnts, sdispls, mpir8, &
buf2, rcvcnts, rdispls, mpir8, &
mpicom)
!
! Copy out of message buffers
!
!$OMP PARALLEL DO PRIVATE (STEP, PROCID, LENGTH_R, BPOS, LAT_L, K, IFLD, I)
!CSD$ PARALLEL DO PRIVATE (STEP, PROCID, LENGTH_R, BPOS, LAT_L, K, IFLD, I)
do step=1,realloc4_steps
procid = realloc4_proc(step)
length_r = 2*numm(procid)
bpos = rdispls(procid)
do lat_l=beglat,endlat
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
do i=1,length_r
fftbuf_out(locrm(i,procid),ifld,k,lat_l) = buf2(bpos+i)
enddo
bpos = bpos+length_r
enddo
enddo
do ifld=1,4
do i=1,length_r
fftbuf_out(locrm(i,procid),ifld,plevp,lat_l) = buf2(bpos+i)
enddo
bpos = bpos+length_r
enddo
enddo
!
end do
!CSD$ END PARALLEL DO
!
else
!
delayed_recv = delayed_swap_recv
()
!
! Post receive requests
call swap1m
(realloc4_steps, msgtag, realloc4_proc, rcvcnts, &
rdispls, spmdbuf_siz, buf2, rcvids)
!
! Copy local data to new location
length_l = 2*numm(iam)
!DIR$ NEXTSCALAR, NOSTREAM
do lat_l=beglat,endlat
!DIR$ STREAM
do k=1,plev
!DIR$ PREFERVECTOR, PREFERSTREAM
do ifld=1,8
do i=1,length_l
fftbuf_out(locrm(i,iam),ifld,k,lat_l) = fftbuf_in(i,ifld,k,lat_l)
enddo
enddo
enddo
do ifld=1,4
do i=1,length_l
fftbuf_out(locrm(i,iam),ifld,plevp,lat_l) = fftbuf_in(i,ifld,plevp,lat_l)
enddo
enddo
enddo
!
! Send data, receive data (depending on communication protocol)
do step=1,realloc4_steps
procid = realloc4_proc(step)
length_r = 2*numm(procid)
!
beglat_r = cut(1,procid)
endlat_r = cut(2,procid)
bpos = sdispls(procid)
do lat_r=beglat_r,endlat_r
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
do i=1,length_l
buf1(bpos+i) = fftbuf_in(i,ifld,k,lat_r)
enddo
bpos = bpos+length_l
enddo
enddo
do ifld=1,4
do i=1,length_l
buf1(bpos+i) = fftbuf_in(i,ifld,plevp,lat_r)
enddo
bpos = bpos+length_l
enddo
enddo
offset_s = sdispls(procid)+1
offset_r = rdispls(procid)+1
call swap2
(msgtag, procid, sndcnts(procid), buf1(offset_s), sndids(step), &
rcvcnts(procid), buf2(offset_r), rcvids(step))
!
if (.not. delayed_recv) then
length_r = 2*numm(procid)
bpos = rdispls(procid)
!DIR$ NEXTSCALAR, NOSTREAM
do lat_l=beglat,endlat
!DIR$ CONCURRENT, PREFERVECTOR, STREAM
do k=1,plev
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
do i=1,length_r
fftbuf_out(locrm(i,procid),ifld,k,lat_l) = buf2(bpos+i)
enddo
bpos = bpos+length_r
enddo
enddo
do ifld=1,4
do i=1,length_r
fftbuf_out(locrm(i,procid),ifld,plevp,lat_l) = buf2(bpos+i)
enddo
bpos = bpos+length_r
enddo
enddo
!
endif
!
enddo
!
! Wait for any outstanding send or receive requests to complete.
call swap3m
(realloc4_steps, msgtag, realloc4_proc, sndids, rcvcnts, &
rdispls, spmdbuf_siz, buf2, rcvids)
!
if ( delayed_recv ) then
!
! Copy out of message buffers
!
!$OMP PARALLEL DO PRIVATE (STEP, PROCID, LENGTH_R, BPOS, LAT_L, K, IFLD, I)
!CSD$ PARALLEL DO PRIVATE (STEP, PROCID, LENGTH_R, BPOS, LAT_L, K, IFLD, I)
do step=1,realloc4_steps
procid = realloc4_proc(step)
length_r = 2*numm(procid)
bpos = rdispls(procid)
do lat_l=beglat,endlat
!DIR$ CONCURRENT, PREFERVECTOR
do k=1,plev
!DIR$ CONCURRENT, PREFERVECTOR
do ifld=1,8
do i=1,length_r
fftbuf_out(locrm(i,procid),ifld,k,lat_l) = buf2(bpos+i)
enddo
bpos = bpos+length_r
enddo
enddo
do ifld=1,4
do i=1,length_r
fftbuf_out(locrm(i,procid),ifld,plevp,lat_l) = buf2(bpos+i)
enddo
bpos = bpos+length_r
enddo
!
enddo
end do
!CSD$ END PARALLEL DO
endif
endif
!
#endif
return
end subroutine realloc4b