diff --git a/src/shmem.h b/src/shmem.h
index 31d5944..7936280 100644
--- a/src/shmem.h
+++ b/src/shmem.h
@@ -171,9 +171,9 @@
 F(uint,unsigned int) \
 F(ulong,unsigned long) \
 F(ulonglong,unsigned long long) \
 F(int32,int32_t) \
-F(int64,int64_t) /*\
+F(int64,int64_t) \
 F(uint32,uint32_t) \
-F(uint64,uint64_t)*/
+F(uint64,uint64_t)
 #define DECL_P2P(F) \
 F(short,short) \
@@ -317,8 +317,8 @@ DECL_P2P(DECL_SHMEM_X_TEST)
 
 SHMEM_SCOPE void shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync);
 SHMEM_SCOPE void shmem_barrier_all(void);
-#define shmem_sync(...) shmem_barrier(__VA_ARGS__)
-#define shmem_sync_all(...) shmem_barrier_all(__VA_ARGS__)
+SHMEM_SCOPE void shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync);
+SHMEM_SCOPE void shmem_sync_all(void);
 
 #define DECL_SHMEM_X_TO_ALL(N,T) \
 SHMEM_SCOPE void shmem_##N##_to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync);
@@ -593,14 +593,45 @@ DECL_SHMEM_TYPE_RMA(ptrdiff_t, ptrdiff, 32)
 #define shmem_wait_until(ivar,cmp,cmp_value) DECL_GENERIC_P2P(ivar,SHMEM_WAIT_UNTIL_GENERIC)(ivar,cmp,cmp_value)
 #define shmem_test(ivar,cmp,value) DECL_GENERIC_P2P(ivar,SHMEM_TEST_GENERIC)(ivar,cmp,value)
 
-#define shmem_finc(...) shmem_atomic_fetch_inc(__VA_ARGS__)
-#define shmem_inc(...) shmem_atomic_inc(__VA_ARGS__)
-#define shmem_fadd(...) shmem_atomic_fetch_add(__VA_ARGS__)
-#define shmem_add(...) shmem_atomic_add(__VA_ARGS__)
-#define shmem_cswap(...) shmem_atomic_compare_swap(__VA_ARGS__)
-#define shmem_swap(...) shmem_atomic_swap(__VA_ARGS__)
-#define shmem_fetch(...) shmem_atomic_fetch(__VA_ARGS__)
-#define shmem_set(...) shmem_atomic_set(__VA_ARGS__)
+#define shmem_add shmem_atomic_add
+#define shmem_cswap shmem_atomic_compare_swap
+#define shmem_fadd shmem_atomic_fetch_add
+#define shmem_fetch shmem_atomic_fetch
+#define shmem_finc shmem_atomic_fetch_inc
+#define shmem_inc shmem_atomic_inc
+#define shmem_set shmem_atomic_set
+#define shmem_swap shmem_atomic_swap
+
+#define shmem_int_add shmem_int_atomic_add
+#define shmem_long_add shmem_long_atomic_add
+#define shmem_longlong_add shmem_longlong_atomic_add
+#define shmem_int_cswap shmem_int_atomic_compare_swap
+#define shmem_long_cswap shmem_long_atomic_compare_swap
+#define shmem_longlong_cswap shmem_longlong_atomic_compare_swap
+#define shmem_int_fadd shmem_int_atomic_fetch_add
+#define shmem_long_fadd shmem_long_atomic_fetch_add
+#define shmem_longlong_fadd shmem_longlong_atomic_fetch_add
+#define shmem_int_fetch shmem_int_atomic_fetch
+#define shmem_long_fetch shmem_long_atomic_fetch
+#define shmem_longlong_fetch shmem_longlong_atomic_fetch
+#define shmem_float_fetch shmem_float_atomic_fetch
+#define shmem_double_fetch shmem_double_atomic_fetch
+#define shmem_int_finc shmem_int_atomic_fetch_inc
+#define shmem_long_finc shmem_long_atomic_fetch_inc
+#define shmem_longlong_finc shmem_longlong_atomic_fetch_inc
+#define shmem_int_inc shmem_int_atomic_inc
+#define shmem_long_inc shmem_long_atomic_inc
+#define shmem_longlong_inc shmem_longlong_atomic_inc
+#define shmem_int_set shmem_int_atomic_set
+#define shmem_long_set shmem_long_atomic_set
+#define shmem_longlong_set shmem_longlong_atomic_set
+#define shmem_float_set shmem_float_atomic_set
+#define shmem_double_set shmem_double_atomic_set
+#define shmem_int_swap shmem_int_atomic_swap
+#define shmem_long_swap shmem_long_atomic_swap
+#define shmem_longlong_swap shmem_longlong_atomic_swap
+#define shmem_float_swap shmem_float_atomic_swap
+#define shmem_double_swap shmem_double_atomic_swap
 
 #define __put_nbi(dest,source,nelems,pe) DECL_GENERIC_STANDARD_RMA(dest,SHMEM_PUT_NBI_GENERIC)(dest,source,nelems,pe)
 #define __get_nbi(dest,source,nelems,pe) DECL_GENERIC_STANDARD_RMA(dest,SHMEM_GET_NBI_GENERIC)(dest,source,nelems,pe)
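Note on the last hunk: the deprecated pre-OpenSHMEM-1.4 atomic names (shmem_finc, shmem_fadd, shmem_cswap, and friends) are no longer variadic forwarding macros but plain aliases for the standardized shmem_*_atomic_* routines, and the typed spellings are now covered as well. A small caller-side sketch of the old and new names side by side, assuming a standard OpenSHMEM 1.4 environment; the counter variable and the choice of PE 0 are illustrative only:

#include <stdio.h>
#include <shmem.h>

static long counter = 0;   /* symmetric counter, incremented on PE 0 */

int main(void)
{
    shmem_init();
    int me = shmem_my_pe();

    /* legacy spelling: now a plain alias ... */
    long a = shmem_long_finc(&counter, 0);
    /* ... for the standardized OpenSHMEM 1.4 name */
    long b = shmem_long_atomic_fetch_inc(&counter, 0);

    shmem_barrier_all();
    if (me == 0)
        printf("fetched %ld then %ld; counter = %ld\n", a, b, counter);

    shmem_finalize();
    return 0;
}

Either spelling compiles against this header; new code should prefer the shmem_*_atomic_* names.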
diff --git a/src/shmem_barrier.c b/src/shmem_barrier.c
index 7d7f27e..9e07a7b 100644
--- a/src/shmem_barrier.c
+++ b/src/shmem_barrier.c
@@ -34,39 +34,12 @@ extern "C" {
 #endif
 
-SHMEM_SCOPE void SHMEM_INLINE
-__shmem_barrier_lte2(int PE_start, int logPE_stride, int PE_size, long *pSync)
-{ /* Routine for PE_size <= 2. Looping over shmem_barrier() for npes = 2 may
- * not work correctly. Solution requires using testset because only
- * sychronization stage may not be reset before subsequent call */
-    if (PE_size == 1) return;
-    int PE_step = 0x1 << logPE_stride;
-    if (__shmem.my_pe != PE_start) PE_step *= -1;
-    int to = __shmem.my_pe + PE_step;
-    volatile long* lock = (volatile long*)pSync;
-    __shmem_set_lock((long*)shmem_ptr((void*)lock, to));
-    while (*lock == SHMEM_SYNC_VALUE);
-    *lock = 0;
-}
-
 SHMEM_SCOPE void
 shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync)
 {
-    if (PE_size < 3) return __shmem_barrier_lte2(PE_start, logPE_stride, PE_size, pSync);
-    int PE_size_stride = PE_size << logPE_stride;
-    int PE_end = PE_size_stride + PE_start;
-
-    int c, r;
-    for (c = 0, r = (1 << logPE_stride); r < PE_size_stride; c++, r <<= 1)
-    {
-        int to = __shmem.my_pe + r;
-        if (to >= PE_end) to -= PE_size_stride;
-        volatile long* lock = (volatile long*)(pSync + c);
-        long * remote_lock = (long*)shmem_ptr((void*)lock, to);
-        *remote_lock = 1;
-        while (*lock == SHMEM_SYNC_VALUE);
-        *lock = SHMEM_SYNC_VALUE;
-    }
+    shmem_quiet();
+    shmem_sync(PE_start, logPE_stride, PE_size, pSync);
+    __shmem.dma_used = 0; // reset
 }
 
 #ifdef __cplusplus
diff --git a/src/shmem_barrier_all.c b/src/shmem_barrier_all.c
index abe5e24..6224c6e 100644
--- a/src/shmem_barrier_all.c
+++ b/src/shmem_barrier_all.c
@@ -34,41 +34,14 @@ extern "C" {
 #endif
 
-#ifdef SHMEM_USE_WAND_BARRIER
-
 SHMEM_SCOPE void
 shmem_barrier_all(void)
 {
     shmem_quiet();
-    __asm__ __volatile__ (
-        "gid \n"               // disable interrupts
-        "wand \n"              // wait on AND
-        ".balignw 8,0x01a2 \n" // nop align gie/idle pair to block
-        "gie \n"               // enable interrupts
-        "idle \n"              // to go sleep
-    );
+    shmem_sync_all();
     __shmem.dma_used = 0; // reset
 }
-
-#else
-
-SHMEM_SCOPE void
-shmem_barrier_all(void)
-{
-    shmem_quiet();
-    int c;
-    for (c = 0; c < __shmem.n_pes_log2; c++)
-    {
-        volatile long* lock = (volatile long*)(__shmem.barrier_sync + c);
-        *(__shmem.barrier_psync[c]) = 1;
-        while (*lock == SHMEM_SYNC_VALUE);
-        *lock = SHMEM_SYNC_VALUE;
-    }
-    __shmem.dma_used = 0; // reset
-}
-
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/shmem_header_only.h b/src/shmem_header_only.h
index 8cd05c2..fd29be0 100644
--- a/src/shmem_header_only.h
+++ b/src/shmem_header_only.h
@@ -202,6 +202,8 @@
 #include "shmem_size_atomic_inc.c"
 #include "shmem_size_atomic_set.c"
 #include "shmem_size_atomic_swap.c"
+#include "shmem_sync.c"
+#include "shmem_sync_all.c"
 #include "shmem_test_lock.c"
 #include "shmem_uint32_atomic_add.c"
 #include "shmem_uint32_atomic_and.c"
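With the two barrier files reduced as above, shmem_barrier() and shmem_barrier_all() become thin wrappers: complete outstanding memory traffic with shmem_quiet(), synchronize with shmem_sync()/shmem_sync_all(), then clear the DMA-in-use flag. A caller-visible sketch of what the barrier still guarantees, assuming a standard OpenSHMEM 1.4 environment; the flag variable and neighbor choice are illustrative only:

#include <shmem.h>

static long pSync[SHMEM_BARRIER_SYNC_SIZE];
static int flag = 0;               /* symmetric destination */

int main(void)
{
    int i;
    shmem_init();
    int me   = shmem_my_pe();
    int npes = shmem_n_pes();

    for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++)
        pSync[i] = SHMEM_SYNC_VALUE;
    shmem_barrier_all();           /* pSync initialized on every PE */

    int one = 1;
    shmem_int_put(&flag, &one, 1, (me + 1) % npes);

    /* shmem_barrier() = shmem_quiet() + shmem_sync(): after it returns,
     * every PE may read flag == 1.  shmem_sync() alone would only order
     * the synchronization flags; the put would need an explicit
     * shmem_quiet() first. */
    shmem_barrier(0, 0, npes, pSync);

    shmem_finalize();
    return 0;
}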
diff --git a/src/shmem_quiet.c b/src/shmem_quiet.c
index 598f622..9f5dbd8 100644
--- a/src/shmem_quiet.c
+++ b/src/shmem_quiet.c
@@ -49,12 +49,15 @@ shmem_quiet(void)
         " bne .Loop%= \n" // spin until both complete
         : : : "r0", "r1", "r2", "cc"
     );
+    // XXX This isn't a great way to guarantee the data has finished
     if (__shmem.cdst0) {
-        while (*__shmem.cdst0 == __shmem.csrc0);
+        if (*__shmem.cdst0 == __shmem.csrc0)
+            *__shmem.cdst0 = ~__shmem.csrc0;
         __shmem.cdst0 = 0;
     }
     if (__shmem.cdst1) {
-        while (*__shmem.cdst1 == __shmem.csrc1);
+        if (*__shmem.cdst1 == __shmem.csrc1)
+            *__shmem.cdst1 = ~__shmem.csrc1;
         __shmem.cdst1 = 0;
     }
 }
diff --git a/src/shmem_sync.c b/src/shmem_sync.c
new file mode 100644
index 0000000..a32f24b
--- /dev/null
+++ b/src/shmem_sync.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016-2017 U.S. Army Research laboratory. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This software was developed by Brown Deer Technology, LLC. with Copyright
+ * assigned to the US Army Research laboratory as required by contract.
+ */
+
+#include "internals.h"
+#include "shmem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+SHMEM_SCOPE void SHMEM_INLINE
+__shmem_sync_lte2(int PE_start, int logPE_stride, int PE_size, long *pSync)
+{ /* Routine for PE_size <= 2. Looping over shmem_sync() for npes = 2 may
+ * not work correctly. The solution requires using testset because the
+ * synchronization flag may not be reset before a subsequent call */
+    if (PE_size == 1) return;
+    int PE_step = 0x1 << logPE_stride;
+    if (__shmem.my_pe != PE_start) PE_step *= -1;
+    int to = __shmem.my_pe + PE_step;
+    volatile long* lock = (volatile long*)pSync;
+    __shmem_set_lock((long*)shmem_ptr((void*)lock, to));
+    while (*lock == SHMEM_SYNC_VALUE);
+    *lock = 0;
+}
+
+SHMEM_SCOPE void
+shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync)
+{
+    if (PE_size < 3) return __shmem_sync_lte2(PE_start, logPE_stride, PE_size, pSync);
+    int PE_size_stride = PE_size << logPE_stride;
+    int PE_end = PE_size_stride + PE_start;
+
+    int c, r;
+    for (c = 0, r = (1 << logPE_stride); r < PE_size_stride; c++, r <<= 1)
+    {
+        int to = __shmem.my_pe + r;
+        if (to >= PE_end) to -= PE_size_stride;
+        volatile long* lock = (volatile long*)(pSync + c);
+        long* remote_lock = (long*)shmem_ptr((void*)lock, to);
+        *remote_lock = 1;
+        while (*lock == SHMEM_SYNC_VALUE);
+        *lock = SHMEM_SYNC_VALUE;
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
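shmem_sync() above uses a lock-based exchange for one or two PEs and a dissemination-style flag exchange otherwise. Callers pass an active set as the (PE_start, logPE_stride, PE_size) triple plus a symmetric pSync array of at least SHMEM_SYNC_SIZE longs, initialized to SHMEM_SYNC_VALUE on every participating PE before first use. A usage sketch that synchronizes only the even-numbered PEs, assuming a standard OpenSHMEM 1.4 environment:

#include <shmem.h>

static long pSync[SHMEM_SYNC_SIZE];

int main(void)
{
    int i;
    shmem_init();
    int me   = shmem_my_pe();
    int npes = shmem_n_pes();

    for (i = 0; i < SHMEM_SYNC_SIZE; i++)
        pSync[i] = SHMEM_SYNC_VALUE;
    shmem_barrier_all();           /* pSync visible everywhere before use */

    if (me % 2 == 0) {
        /* active set: PE_start = 0, stride 2^1 = 2, i.e. the even PEs */
        shmem_sync(0, 1, (npes + 1) / 2, pSync);
    }

    shmem_finalize();
    return 0;
}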
diff --git a/src/shmem_sync_all.c b/src/shmem_sync_all.c
new file mode 100644
index 0000000..3ec47aa
--- /dev/null
+++ b/src/shmem_sync_all.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016-2017 U.S. Army Research laboratory. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This software was developed by Brown Deer Technology, LLC. with Copyright
+ * assigned to the US Army Research laboratory as required by contract.
+ */
+
+#include "internals.h"
+#include "shmem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef SHMEM_USE_WAND_BARRIER
+
+SHMEM_SCOPE void
+shmem_sync_all(void)
+{
+    __asm__ __volatile__ (
+        "gid \n"               // disable interrupts
+        "wand \n"              // wait on AND
+        ".balignw 8,0x01a2 \n" // nop align gie/idle pair to block
+        "gie \n"               // enable interrupts
+        "idle \n"              // to go sleep
+    );
+}
+
+#else
+
+SHMEM_SCOPE void
+shmem_sync_all(void)
+{
+    int c;
+    for (c = 0; c < __shmem.n_pes_log2; c++)
+    {
+        volatile long* lock = (volatile long*)(__shmem.barrier_sync + c);
+        *(__shmem.barrier_psync[c]) = 1;
+        while (*lock == SHMEM_SYNC_VALUE);
+        *lock = SHMEM_SYNC_VALUE;
+    }
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
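shmem_sync_all() only waits for PE arrival, either through the Epiphany WAND interrupt or the flag loop above; unlike shmem_barrier_all() it performs no shmem_quiet(), so outstanding puts and DMA are not completed. A caller-side sketch of the ordering this implies, assuming a standard OpenSHMEM 1.4 environment; the value buffer and neighbor choice are illustrative only:

#include <shmem.h>

static long value = 0;             /* symmetric destination */

int main(void)
{
    shmem_init();
    int me   = shmem_my_pe();
    int npes = shmem_n_pes();

    long v = (long)me;
    shmem_long_put(&value, &v, 1, (me + 1) % npes);

    shmem_quiet();                  /* sync alone does not complete the put */
    shmem_sync_all();               /* every PE may now read its value */

    /* the two calls above are equivalent to a single shmem_barrier_all() */

    shmem_finalize();
    return 0;
}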
diff --git a/src/shmemx_memcpy_nbi.c b/src/shmemx_memcpy_nbi.c
index 3557aab..2bbd6c5 100644
--- a/src/shmemx_memcpy_nbi.c
+++ b/src/shmemx_memcpy_nbi.c
@@ -55,7 +55,7 @@ shmemx_memcpy_nbi(void *dst, const void *src, size_t nbytes)
     __shmem.dma_desc.outer_stride = stride,
     __shmem.dma_desc.config = config;
     __shmem.dma_used = 1;
-#if 0 // XXX dual channel DMA may be unstable
+#if 1 // XXX dual channel DMA may be unstable
     unsigned int dmachannel;
     __asm__ __volatile__ (
         "mov r0, #15 \n"
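The final hunk re-enables the dual-channel DMA path in shmemx_memcpy_nbi(), the non-blocking copy primitive whose completion shmem_quiet() tracks through __shmem.cdst0/__shmem.cdst1 and the dma_used flag reset by the barrier. A hedged usage sketch, assuming the shmemx_memcpy_nbi() prototype from this repository's headers, that the destination may be any address reachable through shmem_ptr(), and that shmem_quiet() is required before the copied data may be relied upon; the buffers and neighbor choice are illustrative only:

#include <shmem.h>

#define N 256
static int dest[N];                /* symmetric destination buffer */

int main(void)
{
    int i, src[N];
    shmem_init();
    int me   = shmem_my_pe();
    int npes = shmem_n_pes();

    for (i = 0; i < N; i++) src[i] = me * N + i;

    /* translate the symmetric address on the right-hand neighbor */
    int *remote = (int *)shmem_ptr(dest, (me + 1) % npes);
    if (remote)
        shmemx_memcpy_nbi(remote, src, sizeof(src));  /* queue DMA and return */

    shmem_quiet();        /* drain the DMA started above */
    shmem_barrier_all();  /* all PEs may now read their dest[] */

    shmem_finalize();
    return 0;
}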