From c9dd0c5168bd51a195d28fa1d1a058c7fa8ca665 Mon Sep 17 00:00:00 2001 From: "James A. Ross" Date: Sun, 8 Sep 2019 09:24:36 +0000 Subject: [PATCH] performance and bug fixes --- src/Makefile | 4 +- src/__shmem_set_lock.c | 14 ++--- src/__shmem_test_lock.c | 6 +- src/def_shmem_collect_n.h | 51 +++++++--------- src/def_shmem_fcollect_n.h | 20 +++---- src/def_shmem_x_to_all.h | 32 ++++------ src/shmem_barrier.c | 1 - src/shmem_barrier_all.c | 1 - src/shmem_init.c | 1 + src/shmem_quiet.c | 15 ++--- src/shmem_sync.c | 1 + src/shmem_sync_all.c | 2 +- src/shmemx_memcpy32.c | 7 +++ src/shmemx_memcpy64.c | 7 +++ src/shmemx_memcpy_nbi.c | 116 ++++++++++++++++++++++++------------- 15 files changed, 155 insertions(+), 123 deletions(-) diff --git a/src/Makefile b/src/Makefile index 9b7e9ab..e527c9b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ CC = coprcc AR = epiphany-elf-ar READELF = epiphany-elf-readelf -CFLAGS = -Wall -Wno-unused-function -fno-unroll-loops -mfp-mode=caller +CFLAGS = -Os -Wall -Werror -Wno-unused-function DEFS = INCS = LIBS = @@ -20,7 +20,7 @@ run: info: $(TARGETS) @$(READELF) -s --wide libshmem_coprthr.a | \ awk '/FUNC|OBJECT/{printf "%7s %-6s %s\n",$$3,$$4,$$8;sum+=$$3}; \ - END{printf "===================\n%7s TOTAL BYTES\n", sum}' + END{printf "===================\n%7s total bytes\n", sum}' | sort -n .PHONY: clean install uninstall diff --git a/src/__shmem_set_lock.c b/src/__shmem_set_lock.c index 5546411..cc3ef5f 100644 --- a/src/__shmem_set_lock.c +++ b/src/__shmem_set_lock.c @@ -38,15 +38,15 @@ SHMEM_SCOPE void __shmem_set_lock (volatile long* x) { __asm__ __volatile__( - "mov r16, #0 \n" // zero lock pointer offset - ".Loop%=: \n" - " mov r18, #1 \n" // copying value to write to lock - " testset r18, [%[x], r16] \n" // test set - " sub r18, r18, #0 \n" // checking result - " bne .Loop%= \n" // if zero, loop until we acquire lock + "mov r1, #0 \n" // zero lock pointer offset + "mov r2, #1 \n" // copying value to write to lock + ".Loop%=: \n" + " testset r2, [%[x], r1] \n" // test set + " sub r3, r2, #0 \n" // checking result + " bne .Loop%= \n" // if zero, loop until we acquire lock : : [x] "r" (x) - : "r16", "r18" + : "r1", "r2", "r3" ); } diff --git a/src/__shmem_test_lock.c b/src/__shmem_test_lock.c index 420e8b0..1485c99 100644 --- a/src/__shmem_test_lock.c +++ b/src/__shmem_test_lock.c @@ -39,11 +39,11 @@ __shmem_test_lock (volatile long* x) { long r = 1; // attempting to acquire the lock __asm__ __volatile__( - "mov r16, #0 \n" // zero lock pointer offset - "testset %[r], [%[x], r16] \n" // test set + "mov r1, #0 \n" // zero lock pointer offset + "testset %[r], [%[x], r1] \n" // test set : [r] "+r" (r) : [x] "r" (x) - : "r16" + : "r1" ); // return 0 if the lock was originally cleared and call set lock return (r ? 0 : 1); // return 1 of the lock had already been set. } diff --git a/src/def_shmem_collect_n.h b/src/def_shmem_collect_n.h index 11bf84f..74a6968 100644 --- a/src/def_shmem_collect_n.h +++ b/src/def_shmem_collect_n.h @@ -37,37 +37,30 @@ SHMEM_SCOPE void \ shmem_collect##N (void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync) \ { \ - int PE = __shmem.my_pe; \ - int PE_step = 0x1 << logPE_stride; \ - int PE_end = PE_start + PE_step * (PE_size - 1); \ - int my_offset; \ - /* The SHMEM_COLLECT_SYNC_SIZE is padded with one extra value for \ - * syncronization and is not used in the shmem_barrier */ \ - volatile long* vSync = pSync + SHMEM_COLLECT_SYNC_SIZE - 2; \ - long* neighbor = (long*)shmem_ptr((void*)vSync, PE + PE_step); \ - if (PE == PE_start) { \ - my_offset = 0; \ - neighbor[0] = nelems; /* XXX casting size_t to long */ \ - neighbor[1] = 1; /* XXX must not be SHMEM_SYNC_VALUE */ \ - } else { \ - /* spin until neighbor sets offset */ \ - while (!vSync[1]); \ - my_offset = vSync[0]; \ - if (PE != PE_end) { \ - neighbor[0] = my_offset + nelems; \ - neighbor[1] = 1; \ - } \ + const int my_pe = __shmem.my_pe; \ + const int pe_step = 1 << logPE_stride; \ + const int pe_end = PE_start + ((PE_size - 1) << logPE_stride); \ + int my_offset = 0; \ + /* The SHMEM_COLLECT_SYNC_SIZE is padded with two extra values for \ + * syncronization and are not used in the shmem_sync */ \ + volatile long* lsync = pSync + SHMEM_COLLECT_SYNC_SIZE - 2; \ + if (my_pe != PE_start) { /* spin until neighbor sets offset */ \ + while (!*lsync) my_offset = lsync[1]; \ + *lsync = SHMEM_SYNC_VALUE; \ } \ - vSync[0] = SHMEM_SYNC_VALUE; \ - vSync[1] = SHMEM_SYNC_VALUE; \ - int i; \ - for (i = PE_start; i <= PE_end; i += PE_step) { \ - T* dst = (T*)dest + my_offset; \ - if (PE != i) dst = (T*)shmem_ptr((void*)dst, i); \ - shmemx_memcpy##N(dst, source, nelems); \ + int pe = my_pe + pe_step; \ + if (my_pe != pe_end) { \ + volatile long* rsync = (volatile long*)shmem_ptr((void*)lsync, pe); \ + rsync[1] = my_offset + nelems; /* XXX casting size_t to long */ \ + __shmem_set_lock(rsync); /* XXX this method isn't ideal */ \ } \ - shmem_barrier(PE_start, logPE_stride, PE_size, pSync); \ + dest = (T*)dest + my_offset; \ + shmemx_memcpy##N(dest, source, nelems); \ + for (; pe <= pe_end; pe += pe_step) \ + shmemx_memcpy##N(shmem_ptr(dest, pe), source, nelems); \ + for (pe = PE_start; pe < my_pe; pe += pe_step) \ + shmemx_memcpy##N(shmem_ptr(dest, pe), source, nelems); \ + shmem_sync(PE_start, logPE_stride, PE_size, pSync); \ } #endif - diff --git a/src/def_shmem_fcollect_n.h b/src/def_shmem_fcollect_n.h index 9c975c1..8f833c6 100644 --- a/src/def_shmem_fcollect_n.h +++ b/src/def_shmem_fcollect_n.h @@ -35,23 +35,23 @@ SHMEM_SCOPE void \ shmem_fcollect##N (void* dest, const void* source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long* pSync) \ { \ const int step = 1 << logPE_stride; \ - const int pe_shift = PE_size << logPE_stride; \ - const int pe_end = PE_start + pe_shift; \ + const int pe_end = PE_start + (PE_size << logPE_stride); \ const int my_pe = __shmem.my_pe; \ const int nbytes = nelems << S; /* << 2 = 4 bytes, << 3 = 8 bytes */ \ const ptrdiff_t offset = nbytes * ((my_pe - PE_start) >> logPE_stride); \ - const void* target = dest + offset; \ - int pe = my_pe; \ - do { \ - shmemx_memcpy##N(shmem_ptr(target,pe), source, nelems); \ + dest += offset; \ + shmemx_memcpy##N(dest, source, nelems); \ + int pe = my_pe + step; \ + while (pe < pe_end) { \ + shmemx_memcpy##N(shmem_ptr(dest,pe), source, nelems); \ pe += step; \ - } while (pe < pe_end); \ - pe -= pe_shift; \ + } \ + pe = PE_start; \ while (pe < my_pe) { \ - shmemx_memcpy##N(shmem_ptr(target,pe), source, nelems); \ + shmemx_memcpy##N(shmem_ptr(dest,pe), source, nelems); \ pe += step; \ } \ - shmem_barrier(PE_start, logPE_stride, PE_size, pSync); \ + shmem_sync(PE_start, logPE_stride, PE_size, pSync); \ } #endif diff --git a/src/def_shmem_x_to_all.h b/src/def_shmem_x_to_all.h index 570d002..192ea6d 100644 --- a/src/def_shmem_x_to_all.h +++ b/src/def_shmem_x_to_all.h @@ -43,28 +43,20 @@ SHMEM_SCOPE void \ shmem_##N##_to_all(T *dest, const T *source, int nreduce, int PE_start, int logPE_stride, int PE_size, T *pWrk, long *pSync) \ { \ int PE_size_stride = PE_size << logPE_stride; \ - int PE_step = 1 << logPE_stride; \ int PE_end = PE_size_stride + PE_start; \ int nreduced2p1 = (nreduce >> 1) + 1; \ - int nwrk = (nreduced2p1 > SHMEM_REDUCE_MIN_WRKDATA_SIZE) ? nreduced2p1 : SHMEM_REDUCE_MIN_WRKDATA_SIZE; \ + int nwrk = (nreduced2p1 > SHMEM_REDUCE_MIN_WRKDATA_SIZE) ? (nreduce >> 1) : SHMEM_REDUCE_MIN_WRKDATA_SIZE; \ volatile long* vSync = (volatile long*)(pSync + SHMEM_REDUCE_SYNC_SIZE - 2); \ int i, j, r; \ shmemx_memcpy##SZ(dest, source, nreduce); \ - vSync[0] = SHMEM_SYNC_VALUE; /* XXX */ \ - vSync[1] = SHMEM_SYNC_VALUE; /* XXX */ \ - shmem_sync(PE_start, logPE_stride, PE_size, pSync); /* XXX */ \ - int start = 1 << logPE_stride; \ - int end = PE_size_stride; \ - int step = start; \ + vSync[0] = SHMEM_SYNC_VALUE; \ + vSync[1] = SHMEM_SYNC_VALUE; \ + shmem_sync(PE_start, logPE_stride, PE_size, pSync); \ + int step = 1 << logPE_stride; \ int to = __shmem.my_pe; \ - T* data = dest; \ - if (PE_size & (PE_size - 1)) { /* Use ring algorithm for non-powers of 2 */ \ - start = 1; \ - end = PE_size; \ - step = PE_step; \ - data = (T*)source; \ - } \ - for (r = start; r < end;) { \ + int ring = PE_size & (PE_size - 1); /* Use ring algorithm for non-powers of 2 */ \ + T* data = (T*)((ring) ? source : dest); \ + for (r = 1; r < PE_size_stride; r += step) { \ to += step; \ if (to >= PE_end) to -= PE_size_stride; \ uintptr_t remote_ptr = (uintptr_t)shmem_ptr(0, to); \ @@ -75,16 +67,14 @@ shmem_##N##_to_all(T *dest, const T *source, int nreduce, int PE_start, int logP nrem = (nrem < nwrk) ? nrem : nwrk; \ __shmem_set_lock(remote_locks); \ shmemx_memcpy##SZ(remote_work, data + i, nrem); \ - remote_locks[1] = 1; /* XXX assumes SHMEM_SYNC_VALUE != 1 */\ + remote_locks[1] = 1; /* XXX signal assumes SHMEM_SYNC_VALUE != 1 */\ while (vSync[1] == SHMEM_SYNC_VALUE); \ for (j = 0; j < nrem; j++) dest[i+j] OP; \ vSync[1] = SHMEM_SYNC_VALUE; \ vSync[0] = SHMEM_SYNC_VALUE; \ } \ - if ((PE_size & (PE_size - 1))) { \ - r++; \ - } else { \ - r <<= 1; \ + if (!ring) { \ + to -= step; \ step <<= 1; \ } \ } \ diff --git a/src/shmem_barrier.c b/src/shmem_barrier.c index 9e07a7b..3f79624 100644 --- a/src/shmem_barrier.c +++ b/src/shmem_barrier.c @@ -39,7 +39,6 @@ shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync) { shmem_quiet(); shmem_sync(PE_start, logPE_stride, PE_size, pSync); - __shmem.dma_used = 0; // reset } #ifdef __cplusplus diff --git a/src/shmem_barrier_all.c b/src/shmem_barrier_all.c index 6224c6e..dd1a658 100644 --- a/src/shmem_barrier_all.c +++ b/src/shmem_barrier_all.c @@ -39,7 +39,6 @@ shmem_barrier_all(void) { shmem_quiet(); shmem_sync_all(); - __shmem.dma_used = 0; // reset } #ifdef __cplusplus diff --git a/src/shmem_init.c b/src/shmem_init.c index b0e2cc3..0ed095d 100644 --- a/src/shmem_init.c +++ b/src/shmem_init.c @@ -157,6 +157,7 @@ shmem_init(void) x >>= 1; } __shmem.dma_start = ((int)(&__shmem.dma_desc) << 16) | 0x8; + __shmem.cdst0 = __shmem.cdst1 = 0; #ifdef SHMEM_USE_WAND_BARRIER __shmem_wand_barrier_init(); #else diff --git a/src/shmem_quiet.c b/src/shmem_quiet.c index 9f5dbd8..700ece5 100644 --- a/src/shmem_quiet.c +++ b/src/shmem_quiet.c @@ -40,26 +40,27 @@ shmem_quiet(void) if (__shmem.dma_used) { // SHMEM doesn't guarantee value is available __asm__ __volatile__ ( "mov r0, #15 \n" // setting r0 lower 4 bits on - ".Loop%=: \n" + ".LOOP%=: \n" " movfs r1, DMA0STATUS \n" // copy DMA0STATUS to r1 " and r1,r1,r0 \n" // check if DMA0STATUS != 0 " movfs r2, DMA1STATUS \n" // copy DMA1STATUS to r2 " and r2, r2, r0 \n" // check if DMA1STATUS != 0 " orr r2, r2, r1 \n" // check if either are != 0 - " bne .Loop%= \n" // spin until both complete + " bne .LOOP%= \n" // spin until both complete : : : "r0", "r1", "r2", "cc" ); - // XXX This isn't a great way to guarantee the data has finished + // XXX Spinning isn't a great way to guarantee the data has finished + // XXX since another PE may have modified the value + // XXX Also see shmemx_memcpy_nbi.c if (__shmem.cdst0) { - if(*__shmem.cdst0 == __shmem.csrc0); - *__shmem.cdst0 = ~__shmem.csrc0; + //while (*(__shmem.cdst0) == __shmem.csrc0); __shmem.cdst0 = 0; } if (__shmem.cdst1) { - if(*__shmem.cdst1 == __shmem.csrc1); - *__shmem.cdst1 = ~__shmem.csrc1; + //while (*(__shmem.cdst1) == __shmem.csrc1); __shmem.cdst1 = 0; } + __shmem.dma_used = 0; } } diff --git a/src/shmem_sync.c b/src/shmem_sync.c index e0210ac..a465994 100644 --- a/src/shmem_sync.c +++ b/src/shmem_sync.c @@ -64,6 +64,7 @@ shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) volatile long* lock = (volatile long*)(pSync + c); long * remote_lock = (long*)shmem_ptr((void*)lock, to); *remote_lock = 1; + //__shmem_set_lock(remote_lock); while (*lock == SHMEM_SYNC_VALUE); *lock = SHMEM_SYNC_VALUE; } diff --git a/src/shmem_sync_all.c b/src/shmem_sync_all.c index 3ec47aa..19b6e26 100644 --- a/src/shmem_sync_all.c +++ b/src/shmem_sync_all.c @@ -56,8 +56,8 @@ shmem_sync_all(void) int c; for (c = 0; c < __shmem.n_pes_log2; c++) { - volatile long* lock = (volatile long*)(__shmem.barrier_sync + c); *(__shmem.barrier_psync[c]) = 1; + volatile long* lock = (volatile long*)(__shmem.barrier_sync + c); while (*lock == SHMEM_SYNC_VALUE); *lock = SHMEM_SYNC_VALUE; } diff --git a/src/shmemx_memcpy32.c b/src/shmemx_memcpy32.c index 70f497d..5837940 100644 --- a/src/shmemx_memcpy32.c +++ b/src/shmemx_memcpy32.c @@ -86,6 +86,7 @@ shmemx_memcpy32(void* dst, const void* src, size_t nelem) "mov %[nelem], #1 \n" // This is here for alignment and is used below "lsr r3, %[src], #3 \n" // Checking number dwords >= 4 "beq .LDremainder%= \n" +#ifdef SHMEM_USE_UNSAFE // Disabling interrupts and hardware loops might be sketchy "gid \n" "movts lc, r3 \n" "mov r3, %%low(.LDstart%=) \n" @@ -93,6 +94,7 @@ shmemx_memcpy32(void* dst, const void* src, size_t nelem) "mov r3, %%low(.LDend%=-4) \n" "movts le, r3 \n" ".balignw 8,0x01a2 \n" // If alignment is correct, no need for nops +#endif ".LDstart%=: \n" "ldrd r16, [r24], #1 \n" "ldrd r18, [r24], #1 \n" @@ -102,8 +104,13 @@ shmemx_memcpy32(void* dst, const void* src, size_t nelem) "ldrd r22, [r24], #1 \n" "strd r20, [%[dst]], #1 \n" "strd r22, [%[dst]], #1 \n" +#ifdef SHMEM_USE_UNSAFE ".LDend%=: \n" "gie \n" +#else + "sub r3, r3, #1 \n" + "bne .LDstart%= \n" +#endif ".LDremainder%=: \n" "lsl r3, %[src], #29 \n" "lsr r3, r3, #30 \n" diff --git a/src/shmemx_memcpy64.c b/src/shmemx_memcpy64.c index f0c5de9..e665d36 100644 --- a/src/shmemx_memcpy64.c +++ b/src/shmemx_memcpy64.c @@ -57,6 +57,7 @@ shmemx_memcpy64(void* dst, const void* src, size_t nelem) __asm__ __volatile__( "lsr r3, %[nelem], #2 \n" // Checking number dwords >= 4 "beq .LDremainder%= \n" +#ifdef SHMEM_USE_UNSAFE // Disabling interrupts and hardware loops might be sketchy "gid \n" "movts lc, r3 \n" "mov r3, %%low(.LDstart%=) \n" @@ -64,6 +65,7 @@ shmemx_memcpy64(void* dst, const void* src, size_t nelem) "mov r3, %%low(.LDend%=-4) \n" "movts le, r3 \n" ".balignw 8,0x01a2 \n" // If alignment is correct, no need for nops +#endif ".LDstart%=: \n" "ldrd r16, [%[src]], #1 \n" "ldrd r18, [%[src]], #1 \n" @@ -73,8 +75,13 @@ shmemx_memcpy64(void* dst, const void* src, size_t nelem) "ldrd r22, [%[src]], #1 \n" "strd r20, [%[dst]], #1 \n" "strd r22, [%[dst]], #1 \n" +#ifdef SHMEM_USE_UNSAFE ".LDend%=: \n" "gie \n" +#else + "sub r3, r3, #1 \n" + "bne .LDstart%= \n" +#endif ".LDremainder%=: \n" "lsl %[nelem], %[nelem], #30 \n" "lsr %[nelem], %[nelem], #30 \n" diff --git a/src/shmemx_memcpy_nbi.c b/src/shmemx_memcpy_nbi.c index 2bbd6c5..33c58a5 100644 --- a/src/shmemx_memcpy_nbi.c +++ b/src/shmemx_memcpy_nbi.c @@ -46,59 +46,93 @@ shmemx_memcpy_nbi(void *dst, const void *src, size_t nbytes) unsigned int config = 0x3 | data_size; unsigned char* csrc = (unsigned char*)src; unsigned char value = ~csrc[nbytes-1]; - unsigned char* cdst = (unsigned char*)dst + nbytes - 1; - *cdst = value; + volatile unsigned char* cdst = (unsigned char*)dst + nbytes - 1; __shmem.dma_desc.count = count; __shmem.dma_desc.src_addr = (char*)src; __shmem.dma_desc.dst_addr = (char*)dst; __shmem.dma_desc.inner_stride = stride, __shmem.dma_desc.outer_stride = stride, __shmem.dma_desc.config = config; - __shmem.dma_used = 1; -#if 1 // XXX dual channel DMA may be unstable - unsigned int dmachannel; +#ifdef SHMEM_USE_UNSAFE // XXX dual channel DMA may not work correctly without calling shmem_quiet + // XXX this code is possibly broken __asm__ __volatile__ ( - "mov r0, #15 \n" - "mov r2, #0 \n" - ".Loop%=: \n" - " movfs r1, DMA0STATUS \n" - " and r1,r1,r0 \n" - " beq .Lconfig%= \n" // if DMA0 isn't busy, branch to start DMA0 - " movfs r1, DMA1STATUS \n" - " and r1,r1,r0 \n" - " bne .Loop%= \n" // loop until one DMA engine isn't busy - "movts DMA1CONFIG, %[x] \n" // start DMA1 - "mov r2, #1 \n" - "b .Ldone%= \n" - ".Lconfig%=: \n" - "movts DMA0CONFIG, %[x] \n" // start DMA0 - ".Ldone%=: \n" - "mov %[y], r2 \n" - : [y] "=r" (dmachannel) - : [x] "r" (__shmem.dma_start) - : "r0", "r1", "r2", "memory", "cc"); - if(dmachannel) { - __shmem.cdst1 = cdst; - __shmem.csrc1 = value; - } - else { - __shmem.cdst0 = cdst; - __shmem.csrc0 = value; - } + "mov r0, #15 \n" + ".LOOKUP_DMA0STATUS%=: \n" + " movfs r1, DMA0STATUS \n" + " and r1,r1,r0 \n" + " bne .LOOKUP_DMA1STATUS%= \n" // if DMA0 isn't busy, branch to start DMA0 + + "sub r0, %[cdst0], #0 \n" // check if prior destination set + "beq .LOAD_DMA0%= \n" + ".LOOP0%=: \n" + " ldrb r0, [%[cdst0], #0] \n" + " sub r1, r0, %[csrc0] \n" + " beq .LOOP0%= \n" // loop until prior data to complete + + ".LOAD_DMA0%=: \n" + "strb %[value], [%[cdst], #0] \n" // store trailing inverted value + "mov %[cdst0], %[cdst] \n" // save destination pointer + "mov %[csrc0], %[value] \n" // save inverted value to be overwritten + "movts DMA0CONFIG, %[x] \n" // start DMA0 + "b .LEAVE%= \n" + + ".LOOKUP_DMA1STATUS%=: \n" + " movfs r1, DMA1STATUS \n" + " and r1,r1,r0 \n" + " bne .LOOKUP_DMA0STATUS%= \n" // loop until one DMA engine isn't busy + + "sub r0, %[cdst1], #0 \n" // check if prior destination set + "beq .LOAD_DMA1%= \n" + ".LOOP1%=: \n" + " ldrb r0, [%[cdst1], #0] \n" + " sub r1, r0, %[csrc1] \n" + " beq .LOOP1%= \n" // loop until prior data to complete + + ".LOAD_DMA1%=: \n" + "strb %[value], [%[cdst], #0] \n" // store trailing inverted value + "mov %[cdst1], %[cdst] \n" // save destination pointer + "mov %[csrc1], %[value] \n" // save inverted value to be overwritten + "movts DMA1CONFIG, %[x] \n" // start DMA1 + + ".LEAVE%=: \n" + : [cdst0] "+r" (__shmem.cdst0), + [cdst1] "+r" (__shmem.cdst1), + [csrc0] "+r" (__shmem.csrc0), + [csrc1] "+r" (__shmem.csrc1) + : [x] "r" (__shmem.dma_start), + [cdst] "r" (cdst), + [value] "r" (value) + : "r0", "r1", "memory", "cc"); #else __asm__ __volatile__ ( - "mov r0, #15 \n" - ".Loop%=: \n" - " movfs r1, DMA0STATUS \n" - " and r1,r1,r0 \n" - " bne .Loop%= \n" // loop until one DMA engine isn't busy - "movts DMA0CONFIG, %[x] \n" // start DMA0 + "sub r1, %[cdst0], #0 \n" // check if prior destination set + "beq .LOAD_DMA0%= \n" + "mov r0, #15 \n" + ".LOOP_DMA0STATUS%=: \n" + " movfs r1, DMA0STATUS \n" + " and r1,r1,r0 \n" + " bne .LOOP_DMA0STATUS%= \n" // loop until DMA0 engine isn't busy +// ".LOOP0%=: \n" // XXX spinning on remote data is probably not +// " ldrb r0, [%[cdst0], #0] \n" // XXX safe since it could have been modified by +// " sub r1, r0, %[csrc0] \n" // XXX the remote PE +// " beq .LOOP0%= \n" // loop until prior data to complete + ".LOAD_DMA0%=: \n" + : [cdst0] "+r" (__shmem.cdst0), + [csrc0] "+r" (__shmem.csrc0) + : + : "r0", "r1", "cc" + ); + *cdst = value; // store trailing inverted value (to be overwritten) + __shmem.cdst0 = cdst; + __shmem.csrc0 = value; + __asm__ __volatile__ ( + "movts DMA0CONFIG, %[x] \n" // start DMA0 : : [x] "r" (__shmem.dma_start) - : "r0", "r1", "memory", "cc"); - __shmem.cdst0 = cdst; - __shmem.csrc0 = value; + : + ); #endif + __shmem.dma_used = 1; } #ifdef __cplusplus