Skip to content

Commit

Permalink
performance and bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesaross committed Sep 8, 2019
1 parent dbc64ed commit c9dd0c5
Show file tree
Hide file tree
Showing 15 changed files with 155 additions and 123 deletions.
4 changes: 2 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
CC = coprcc
AR = epiphany-elf-ar
READELF = epiphany-elf-readelf
CFLAGS = -Wall -Wno-unused-function -fno-unroll-loops -mfp-mode=caller
CFLAGS = -Os -Wall -Werror -Wno-unused-function
DEFS =
INCS =
LIBS =
Expand All @@ -20,7 +20,7 @@ run:
info: $(TARGETS)
@$(READELF) -s --wide libshmem_coprthr.a | \
awk '/FUNC|OBJECT/{printf "%7s %-6s %s\n",$$3,$$4,$$8;sum+=$$3}; \
END{printf "===================\n%7s TOTAL BYTES\n", sum}'
END{printf "===================\n%7s total bytes\n", sum}' | sort -n

.PHONY: clean install uninstall

Expand Down
14 changes: 7 additions & 7 deletions src/__shmem_set_lock.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ SHMEM_SCOPE void
__shmem_set_lock (volatile long* x)
{
	/* Spin until the lock word at *x is acquired.
	 * Post-commit version (r1/r2/r3): the scrape had superimposed the old
	 * r16/r18 sequence on top of this one; only one sequence can exist. */
	__asm__ __volatile__(
	"mov r1, #0                \n" // zero lock pointer offset
	"mov r2, #1                \n" // value to store into the lock word
	".Loop%=:                  \n"
	"	testset r2, [%[x], r1] \n" // atomic test-set; r2 <- previous *x
	"	sub r3, r2, #0         \n" // set flags from the previous lock value
	"	bne .Loop%=            \n" // nonzero -> lock was held; retry
	:
	: [x] "r" (x)
	: "r1", "r2", "r3"
	// NOTE(review): after a failed attempt r2 holds the previous (nonzero)
	// lock value and the retry stores that value, not #1 — fine as long as
	// the lock word only ever holds 0 or 1; confirm against unlock path.
	);
}

Expand Down
6 changes: 3 additions & 3 deletions src/__shmem_test_lock.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ __shmem_test_lock (volatile long* x)
{
long r = 1; // attempting to acquire the lock
__asm__ __volatile__(
"mov r16, #0 \n" // zero lock pointer offset
"testset %[r], [%[x], r16] \n" // test set
"mov r1, #0 \n" // zero lock pointer offset
"testset %[r], [%[x], r1] \n" // test set
: [r] "+r" (r)
: [x] "r" (x)
: "r16"
: "r1"
); // return 0 if the lock was originally cleared and call set lock
	return (r ? 0 : 1); // return 1 if the lock had already been set.
}
Expand Down
51 changes: 22 additions & 29 deletions src/def_shmem_collect_n.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,37 +37,30 @@
SHMEM_SCOPE void \
shmem_collect##N (void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync) \
{ \
int PE = __shmem.my_pe; \
int PE_step = 0x1 << logPE_stride; \
int PE_end = PE_start + PE_step * (PE_size - 1); \
int my_offset; \
/* The SHMEM_COLLECT_SYNC_SIZE is padded with one extra value for \
	 * synchronization and is not used in the shmem_barrier */ \
volatile long* vSync = pSync + SHMEM_COLLECT_SYNC_SIZE - 2; \
long* neighbor = (long*)shmem_ptr((void*)vSync, PE + PE_step); \
if (PE == PE_start) { \
my_offset = 0; \
neighbor[0] = nelems; /* XXX casting size_t to long */ \
neighbor[1] = 1; /* XXX must not be SHMEM_SYNC_VALUE */ \
} else { \
/* spin until neighbor sets offset */ \
while (!vSync[1]); \
my_offset = vSync[0]; \
if (PE != PE_end) { \
neighbor[0] = my_offset + nelems; \
neighbor[1] = 1; \
} \
const int my_pe = __shmem.my_pe; \
const int pe_step = 1 << logPE_stride; \
const int pe_end = PE_start + ((PE_size - 1) << logPE_stride); \
int my_offset = 0; \
/* The SHMEM_COLLECT_SYNC_SIZE is padded with two extra values for \
	 * synchronization and are not used in the shmem_sync */ \
volatile long* lsync = pSync + SHMEM_COLLECT_SYNC_SIZE - 2; \
if (my_pe != PE_start) { /* spin until neighbor sets offset */ \
while (!*lsync) my_offset = lsync[1]; \
*lsync = SHMEM_SYNC_VALUE; \
} \
vSync[0] = SHMEM_SYNC_VALUE; \
vSync[1] = SHMEM_SYNC_VALUE; \
int i; \
for (i = PE_start; i <= PE_end; i += PE_step) { \
T* dst = (T*)dest + my_offset; \
if (PE != i) dst = (T*)shmem_ptr((void*)dst, i); \
shmemx_memcpy##N(dst, source, nelems); \
int pe = my_pe + pe_step; \
if (my_pe != pe_end) { \
volatile long* rsync = (volatile long*)shmem_ptr((void*)lsync, pe); \
rsync[1] = my_offset + nelems; /* XXX casting size_t to long */ \
__shmem_set_lock(rsync); /* XXX this method isn't ideal */ \
} \
shmem_barrier(PE_start, logPE_stride, PE_size, pSync); \
dest = (T*)dest + my_offset; \
shmemx_memcpy##N(dest, source, nelems); \
for (; pe <= pe_end; pe += pe_step) \
shmemx_memcpy##N(shmem_ptr(dest, pe), source, nelems); \
for (pe = PE_start; pe < my_pe; pe += pe_step) \
shmemx_memcpy##N(shmem_ptr(dest, pe), source, nelems); \
shmem_sync(PE_start, logPE_stride, PE_size, pSync); \
}

#endif

20 changes: 10 additions & 10 deletions src/def_shmem_fcollect_n.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,23 +35,23 @@ SHMEM_SCOPE void \
shmem_fcollect##N (void* dest, const void* source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long* pSync) \
{ \
const int step = 1 << logPE_stride; \
const int pe_shift = PE_size << logPE_stride; \
const int pe_end = PE_start + pe_shift; \
const int pe_end = PE_start + (PE_size << logPE_stride); \
const int my_pe = __shmem.my_pe; \
const int nbytes = nelems << S; /* << 2 = 4 bytes, << 3 = 8 bytes */ \
const ptrdiff_t offset = nbytes * ((my_pe - PE_start) >> logPE_stride); \
const void* target = dest + offset; \
int pe = my_pe; \
do { \
shmemx_memcpy##N(shmem_ptr(target,pe), source, nelems); \
dest += offset; \
shmemx_memcpy##N(dest, source, nelems); \
int pe = my_pe + step; \
while (pe < pe_end) { \
shmemx_memcpy##N(shmem_ptr(dest,pe), source, nelems); \
pe += step; \
} while (pe < pe_end); \
pe -= pe_shift; \
} \
pe = PE_start; \
while (pe < my_pe) { \
shmemx_memcpy##N(shmem_ptr(target,pe), source, nelems); \
shmemx_memcpy##N(shmem_ptr(dest,pe), source, nelems); \
pe += step; \
} \
shmem_barrier(PE_start, logPE_stride, PE_size, pSync); \
shmem_sync(PE_start, logPE_stride, PE_size, pSync); \
}

#endif
32 changes: 11 additions & 21 deletions src/def_shmem_x_to_all.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,28 +43,20 @@ SHMEM_SCOPE void \
shmem_##N##_to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync) \
{ \
int PE_size_stride = PE_size << logPE_stride; \
int PE_step = 1 << logPE_stride; \
int PE_end = PE_size_stride + PE_start; \
int nreduced2p1 = (nreduce >> 1) + 1; \
int nwrk = (nreduced2p1 > SHMEM_REDUCE_MIN_WRKDATA_SIZE) ? nreduced2p1 : SHMEM_REDUCE_MIN_WRKDATA_SIZE; \
int nwrk = (nreduced2p1 > SHMEM_REDUCE_MIN_WRKDATA_SIZE) ? (nreduce >> 1) : SHMEM_REDUCE_MIN_WRKDATA_SIZE; \
volatile long* vSync = (volatile long*)(pSync + SHMEM_REDUCE_SYNC_SIZE - 2); \
int i, j, r; \
shmemx_memcpy##SZ(dest, source, nreduce); \
vSync[0] = SHMEM_SYNC_VALUE; /* XXX */ \
vSync[1] = SHMEM_SYNC_VALUE; /* XXX */ \
shmem_sync(PE_start, logPE_stride, PE_size, pSync); /* XXX */ \
int start = 1 << logPE_stride; \
int end = PE_size_stride; \
int step = start; \
vSync[0] = SHMEM_SYNC_VALUE; \
vSync[1] = SHMEM_SYNC_VALUE; \
shmem_sync(PE_start, logPE_stride, PE_size, pSync); \
int step = 1 << logPE_stride; \
int to = __shmem.my_pe; \
T* data = dest; \
if (PE_size & (PE_size - 1)) { /* Use ring algorithm for non-powers of 2 */ \
start = 1; \
end = PE_size; \
step = PE_step; \
data = (T*)source; \
} \
for (r = start; r < end;) { \
int ring = PE_size & (PE_size - 1); /* Use ring algorithm for non-powers of 2 */ \
T* data = (T*)((ring) ? source : dest); \
for (r = 1; r < PE_size_stride; r += step) { \
to += step; \
if (to >= PE_end) to -= PE_size_stride; \
uintptr_t remote_ptr = (uintptr_t)shmem_ptr(0, to); \
Expand All @@ -75,16 +67,14 @@ shmem_##N##_to_all(T *dest, const T *source, int nreduce, int PE_start, int logP
nrem = (nrem < nwrk) ? nrem : nwrk; \
__shmem_set_lock(remote_locks); \
shmemx_memcpy##SZ(remote_work, data + i, nrem); \
remote_locks[1] = 1; /* XXX assumes SHMEM_SYNC_VALUE != 1 */\
remote_locks[1] = 1; /* XXX signal assumes SHMEM_SYNC_VALUE != 1 */\
while (vSync[1] == SHMEM_SYNC_VALUE); \
for (j = 0; j < nrem; j++) dest[i+j] OP; \
vSync[1] = SHMEM_SYNC_VALUE; \
vSync[0] = SHMEM_SYNC_VALUE; \
} \
if ((PE_size & (PE_size - 1))) { \
r++; \
} else { \
r <<= 1; \
if (!ring) { \
to -= step; \
step <<= 1; \
} \
} \
Expand Down
1 change: 0 additions & 1 deletion src/shmem_barrier.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync)
{
shmem_quiet();
shmem_sync(PE_start, logPE_stride, PE_size, pSync);
__shmem.dma_used = 0; // reset
}

#ifdef __cplusplus
Expand Down
1 change: 0 additions & 1 deletion src/shmem_barrier_all.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ shmem_barrier_all(void)
{
shmem_quiet();
shmem_sync_all();
__shmem.dma_used = 0; // reset
}

#ifdef __cplusplus
Expand Down
1 change: 1 addition & 0 deletions src/shmem_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ shmem_init(void)
x >>= 1;
}
__shmem.dma_start = ((int)(&__shmem.dma_desc) << 16) | 0x8;
__shmem.cdst0 = __shmem.cdst1 = 0;
#ifdef SHMEM_USE_WAND_BARRIER
__shmem_wand_barrier_init();
#else
Expand Down
15 changes: 8 additions & 7 deletions src/shmem_quiet.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,26 +40,27 @@ shmem_quiet(void)
if (__shmem.dma_used) { // SHMEM doesn't guarantee value is available
__asm__ __volatile__ (
"mov r0, #15 \n" // setting r0 lower 4 bits on
".Loop%=: \n"
".LOOP%=: \n"
" movfs r1, DMA0STATUS \n" // copy DMA0STATUS to r1
" and r1,r1,r0 \n" // check if DMA0STATUS != 0
" movfs r2, DMA1STATUS \n" // copy DMA1STATUS to r2
" and r2, r2, r0 \n" // check if DMA1STATUS != 0
" orr r2, r2, r1 \n" // check if either are != 0
" bne .Loop%= \n" // spin until both complete
" bne .LOOP%= \n" // spin until both complete
: : : "r0", "r1", "r2", "cc"
);
// XXX This isn't a great way to guarantee the data has finished
// XXX Spinning isn't a great way to guarantee the data has finished
// XXX since another PE may have modified the value
// XXX Also see shmemx_memcpy_nbi.c
if (__shmem.cdst0) {
if(*__shmem.cdst0 == __shmem.csrc0);
*__shmem.cdst0 = ~__shmem.csrc0;
//while (*(__shmem.cdst0) == __shmem.csrc0);
__shmem.cdst0 = 0;
}
if (__shmem.cdst1) {
if(*__shmem.cdst1 == __shmem.csrc1);
*__shmem.cdst1 = ~__shmem.csrc1;
//while (*(__shmem.cdst1) == __shmem.csrc1);
__shmem.cdst1 = 0;
}
__shmem.dma_used = 0;
}
}

Expand Down
1 change: 1 addition & 0 deletions src/shmem_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync)
volatile long* lock = (volatile long*)(pSync + c);
long * remote_lock = (long*)shmem_ptr((void*)lock, to);
*remote_lock = 1;
//__shmem_set_lock(remote_lock);
while (*lock == SHMEM_SYNC_VALUE);
*lock = SHMEM_SYNC_VALUE;
}
Expand Down
2 changes: 1 addition & 1 deletion src/shmem_sync_all.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ shmem_sync_all(void)
int c;
for (c = 0; c < __shmem.n_pes_log2; c++)
{
volatile long* lock = (volatile long*)(__shmem.barrier_sync + c);
*(__shmem.barrier_psync[c]) = 1;
volatile long* lock = (volatile long*)(__shmem.barrier_sync + c);
while (*lock == SHMEM_SYNC_VALUE);
*lock = SHMEM_SYNC_VALUE;
}
Expand Down
7 changes: 7 additions & 0 deletions src/shmemx_memcpy32.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,15 @@ shmemx_memcpy32(void* dst, const void* src, size_t nelem)
"mov %[nelem], #1 \n" // This is here for alignment and is used below
"lsr r3, %[src], #3 \n" // Checking number dwords >= 4
"beq .LDremainder%= \n"
#ifdef SHMEM_USE_UNSAFE // Disabling interrupts and hardware loops might be sketchy
"gid \n"
"movts lc, r3 \n"
"mov r3, %%low(.LDstart%=) \n"
"movts ls, r3 \n"
"mov r3, %%low(.LDend%=-4) \n"
"movts le, r3 \n"
".balignw 8,0x01a2 \n" // If alignment is correct, no need for nops
#endif
".LDstart%=: \n"
"ldrd r16, [r24], #1 \n"
"ldrd r18, [r24], #1 \n"
Expand All @@ -102,8 +104,13 @@ shmemx_memcpy32(void* dst, const void* src, size_t nelem)
"ldrd r22, [r24], #1 \n"
"strd r20, [%[dst]], #1 \n"
"strd r22, [%[dst]], #1 \n"
#ifdef SHMEM_USE_UNSAFE
".LDend%=: \n"
"gie \n"
#else
"sub r3, r3, #1 \n"
"bne .LDstart%= \n"
#endif
".LDremainder%=: \n"
"lsl r3, %[src], #29 \n"
"lsr r3, r3, #30 \n"
Expand Down
7 changes: 7 additions & 0 deletions src/shmemx_memcpy64.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,15 @@ shmemx_memcpy64(void* dst, const void* src, size_t nelem)
__asm__ __volatile__(
"lsr r3, %[nelem], #2 \n" // Checking number dwords >= 4
"beq .LDremainder%= \n"
#ifdef SHMEM_USE_UNSAFE // Disabling interrupts and hardware loops might be sketchy
"gid \n"
"movts lc, r3 \n"
"mov r3, %%low(.LDstart%=) \n"
"movts ls, r3 \n"
"mov r3, %%low(.LDend%=-4) \n"
"movts le, r3 \n"
".balignw 8,0x01a2 \n" // If alignment is correct, no need for nops
#endif
".LDstart%=: \n"
"ldrd r16, [%[src]], #1 \n"
"ldrd r18, [%[src]], #1 \n"
Expand All @@ -73,8 +75,13 @@ shmemx_memcpy64(void* dst, const void* src, size_t nelem)
"ldrd r22, [%[src]], #1 \n"
"strd r20, [%[dst]], #1 \n"
"strd r22, [%[dst]], #1 \n"
#ifdef SHMEM_USE_UNSAFE
".LDend%=: \n"
"gie \n"
#else
"sub r3, r3, #1 \n"
"bne .LDstart%= \n"
#endif
".LDremainder%=: \n"
"lsl %[nelem], %[nelem], #30 \n"
"lsr %[nelem], %[nelem], #30 \n"
Expand Down
Loading

0 comments on commit c9dd0c5

Please sign in to comment.