Skip to content

Commit

Permalink
added GUPS example code
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesaross committed Dec 1, 2017
1 parent c261dfc commit 586f3e8
Show file tree
Hide file tree
Showing 4 changed files with 291 additions and 0 deletions.
41 changes: 41 additions & 0 deletions example/c_gups/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Build file for the GUPS (RandomAccess) example.
#   main.x           host program, built with $(CC)
#   shmem_tfunc.e32  device thread function, built with coprcc

# Extra defines for the host build / extra flags for the device build.
DEFS =
EDEFS = -fno-unroll-loops

CFLAGS += -O3 -std=c11 -fopenmp $(DEFS)

# Install location of COPRTHR-2 and location of the SHMEM library sources.
COPRTHR = /usr/local/browndeer/coprthr2
SHMEM = ../../src

INCS = -I. -I$(COPRTHR)/include -I$(SHMEM) -I../../common
LIBS = -L$(COPRTHR)/lib -lcoprthr -lcoprthrcc -lm

# Libraries linked into the device binary.
ELIBS = -lcoprthr2_dev -L$(SHMEM) -lshmem_coprthr

KERNEL = shmem_tfunc.e32

TARGETS = main.x $(KERNEL)

all: $(TARGETS)

# None of these names correspond to files that the rules create.
.PHONY: all info clean distclean

.SUFFIXES:
.SUFFIXES: .c .o .x .e32 .S

# Both programs include gups.h, so they must be rebuilt when it changes.
main.x: main.c gups.h
	$(CC) $(CFLAGS) $(INCS) main.c -o main.x $(LIBS)

shmem_tfunc.e32: shmem_tfunc.c gups.h
	coprcc $(INCS) $(EDEFS) $(ELIBS) shmem_tfunc.c -o shmem_tfunc.e32

.c.o:
	$(CC) $(CFLAGS) $(INCS) -c $<

# Report information about the built device binary.
info: $(KERNEL)
	coprcc-info -j -l 100 shmem_tfunc.e32

clean:
	rm -f *.o
	rm -f $(TARGETS)

distclean: clean
24 changes: 24 additions & 0 deletions example/c_gups/gups.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include <stdint.h>
#include "ctimer.h"

// Default log2 of the table size: T[] holds 2^N 32-bit entries.
#define NDEFAULT 16 // T[] is 2^N
// Default log2 of the number of cores; main.c also uses it as the upper bound.
#define NCORESLOG2DEFAULT 4 // cores = 2^C
// Quadratic polynomial 1 + i + i^2; the argument is evaluated three times, so
// do not pass expressions with side effects.
// NOTE(review): not referenced by the sources visible here -- confirm it is still needed.
#define A(i) (1 + (i) + (i) * (i))

// Argument block handed from the host program to the device thread function.
// The host fills in T and N; the thread function writes time.
typedef struct my_args
{
// Pointer to the table of 2^N 32-bit words to update.
uint32_t* T;
// log2 of the number of table entries.
uint32_t N;
// Elapsed time of the update loop as returned by ctimer_nsec(), stored by PE 0.
uint32_t time;
} my_args_t;

/*
 * Pseudo-random 32-bit value for update number i.
 *
 * The value depends only on i (a multiply followed by three xor-shift
 * steps), so every caller can compute the i'th value of the sequence
 * directly without stepping through the earlier ones.
 *
 * Declared static: this definition lives in a header, and a plain C99/C11
 * `inline` definition does not provide the external definition the linker
 * needs whenever a call is not inlined.
 */
static inline __attribute__((__always_inline__))
uint32_t INDEX(uint32_t i) // pseudo-random index access
{
	uint32_t a = i * 1234567891u;
	a ^= a << 13;
	a ^= a >> 17;
	a ^= a << 5;
	return a;
}
136 changes: 136 additions & 0 deletions example/c_gups/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "gups.h"
#include "coprthr_cc.h"
#include "coprthr_thread.h"

/*
 * Reference implementation of the update loop, run on the host.
 *
 * Performs 2^(N+2) updates T[k] ^= a, where a = INDEX(i) and k is the top
 * N bits of a.
 *
 * T  table of 2^N 32-bit words, updated in place
 * N  log2 of the table size; N + 2 must be less than 32
 *
 * Returns the elapsed time of the loop as reported by ctimer_nsec().
 */
uint32_t RandomAccessHost(uint32_t* T, uint32_t N)
{
	ctimer_start();
	uint64_t t0 = ctimer();

	uint32_t kshift = 32 - N;
	uint32_t I = 1u << (N + 2);
	for (uint32_t i = 0; i < I; i++)
	{
		uint32_t a = INDEX(i);
		// A shift by 32 is undefined in C; a one-entry table (N == 0)
		// always maps to index 0.
		uint32_t k = N ? (a >> kshift) : 0;
		T[k] ^= a;
	}

	// Elapsed ticks are computed as start minus end.
	t0 -= ctimer();
	return ctimer_nsec(t0);
}

/*
 * Runs the update loop on the Epiphany device with C threads.
 *
 * The table is copied into device-shared memory, the thread function
 * "SHMEMRandomAccessPowerOf2" from ./shmem_tfunc.e32 is launched on C
 * threads, and the updated table is copied back into T.
 *
 * T  table of 2^N 32-bit words, updated in place
 * N  log2 of the table size
 * C  number of threads to create
 *
 * Returns the time reported by the device through my_args_t.time, or 0
 * if the device could not be opened or the thread function not loaded.
 */
uint32_t RandomAccessEpiphany(uint32_t* T, uint32_t N, uint32_t C)
{
	uint32_t K = 1u << N;
	size_t Ksz = K * sizeof(uint32_t);
	uint32_t time = 0;

	// open device for threads
	int dd = coprthr_dopen(COPRTHR_DEVICE_E32, COPRTHR_O_THREAD);
	if (dd < 0) {
		fprintf(stderr, "device open failed\n");
		return 0;
	}

	// Allocate shared DRAM
	coprthr_mem_t T_mem = coprthr_dmalloc(dd, Ksz, 0);
	coprthr_mem_t argmem = coprthr_dmalloc(dd, sizeof(my_args_t), 0);

	// Write to shared DRAM
	coprthr_dwrite(dd, T_mem, 0, T, Ksz, COPRTHR_E_WAIT);

	my_args_t* args = (my_args_t*)coprthr_memptr(argmem, 0);
	*args = (my_args_t){.N = N, .T = coprthr_memptr(T_mem,0)};

	// Select program and thread function
	coprthr_program_t prg = coprthr_cc_read_bin("./shmem_tfunc.e32", 0);
	if (!prg) {
		fprintf(stderr, "could not read ./shmem_tfunc.e32\n");
		goto cleanup;
	}
	coprthr_sym_t thr = coprthr_getsym(prg, "SHMEMRandomAccessPowerOf2");
	if (!thr) {
		fprintf(stderr, "could not find SHMEMRandomAccessPowerOf2\n");
		goto cleanup;
	}

	// launch threads
	coprthr_attr_t attr;
	coprthr_attr_init(&attr);
	coprthr_attr_setdetachstate(&attr, COPRTHR_CREATE_JOINABLE);
	coprthr_attr_setdevice(&attr, dd);
	coprthr_td_t td;
	coprthr_ncreate(C, &td, &attr, thr, (void*)&argmem);
	void* status;
	coprthr_join(td, &status);
	coprthr_attr_destroy(&attr);

	// read back data from memory on device
	coprthr_dread(dd, T_mem, 0, T, Ksz, COPRTHR_E_WAIT);

	// args points into argmem, so fetch the result before that memory
	// is released below.
	time = args->time;

cleanup:
	coprthr_dfree(dd, T_mem);
	coprthr_dfree(dd, argmem);
	coprthr_dclose(dd);

	return time;
}

/* Prints the command-line help text to stdout. */
static void print_usage(const char* prog)
{
	printf("Usage: %s [OPTION]...\n"
		"Parallel 32-bit on-chip implementation of Random Access measuring Giga Updates Per Second (GUPS)\n"
		" -n log2 of size. Default: %u (size = %u)\n"
		" -c log2 of number of Epiphany cores. Default: %u (cores = %u)\n"
		" -h, --help print this\n",
		prog, (unsigned)NDEFAULT, 1u << NDEFAULT,
		(unsigned)NCORESLOG2DEFAULT, 1u << NCORESLOG2DEFAULT);
}

/*
 * Parses s as a non-negative decimal integer into *out.
 * Returns 0 on success, -1 if s is missing, empty, negative, has trailing
 * characters or does not fit in 32 bits.
 */
static int parse_u32(const char* s, uint32_t* out)
{
	char* end;
	unsigned long v;

	if (!s || *s == '\0' || *s == '-') return -1;
	v = strtoul(s, &end, 10);
	if (end == s || *end != '\0' || v > UINT32_MAX) return -1;
	*out = (uint32_t)v;
	return 0;
}

/*
 * Runs the RandomAccess update loop on the Epiphany device and on the
 * host, counts the table entries that differ and prints the update rate
 * of both.
 *
 * Options: -n <log2 table size>, -c <log2 core count>, -h / --help.
 * Returns 0 on success (and after printing help), 1 on invalid or
 * out-of-range arguments and on allocation failure.
 */
int main(int argc, char* argv[])
{
	uint32_t time_cpu, time_e32;
	uint32_t N = NDEFAULT;
	uint32_t NC = NCORESLOG2DEFAULT;

	for (int i = 1; i < argc; i++) {
		int is_n = !strncmp(argv[i],"-n",2);
		int is_c = !is_n && !strncmp(argv[i],"-c",2);
		if (is_n || is_c) {
			// The value is the following argument; it must exist
			// and be a valid number.
			if (i + 1 >= argc || parse_u32(argv[i + 1], is_n ? &N : &NC)) {
				fprintf(stderr, "option %s requires a non-negative integer argument\n", argv[i]);
				print_usage(argv[0]);
				return 1;
			}
			++i;
		}
		else if (!strncmp(argv[i],"-h",2) || !strncmp(argv[i],"--help",6)) {
			print_usage(argv[0]);
			return 0;
		}
		else {
			fprintf(stderr, "unrecognized option: %s\n",argv[i]);
			print_usage(argv[0]);
			return 0; // exit status kept as it was before
		}
	}
	if (NC > NCORESLOG2DEFAULT) {
		fprintf(stderr, "Try a smaller number of cores, C (%u)\n", (unsigned)NCORESLOG2DEFAULT);
		return 1;
	}
	uint32_t Nlimit = NDEFAULT - (NCORESLOG2DEFAULT - NC);
	if (N > Nlimit) {
		fprintf(stderr, "Try a smaller size, N (%u)\n", Nlimit);
		return 1;
	}
	// The table is split evenly across the cores, so it needs at least
	// one entry per core.
	if (N < NC) {
		fprintf(stderr, "Try a larger size, N (%u)\n", NC);
		return 1;
	}

	uint32_t K = 1u << N;
	uint32_t I = 1u << (N + 2);
	uint32_t C = 1u << NC;

	uint32_t* T = calloc(K, sizeof(*T));
	uint32_t* Thost = calloc(K, sizeof(*Thost));
	if (!T || !Thost) {
		fprintf(stderr, "Could not allocate memory\n");
		free(T);
		free(Thost);
		return 1;
	}

	// initializing T
	srand(12345);
	for (uint32_t k = 0; k < K; k++) T[k] = rand();
	memcpy(Thost, T, K * sizeof(*Thost));

	time_e32 = RandomAccessEpiphany(T, N, C);
	time_cpu = RandomAccessHost(Thost, N);

	// Count the entries where the device result differs from the host result.
	int err = 0;
	for (uint32_t k = 0; k < K; k++) if (T[k] != Thost[k]) err++;

	// The percentage is printed in red at 1% or more mismatches, green otherwise.
	printf("N = %u (2^N = %u), cores = %u, err = %d (\x1b[3%dm%0.2f%%\x1b[0m)\n",
		N, K, C, err, ((uint32_t)err*100 >= K) ? 1 : 2, (double)err*100.0/(double)K);
	printf("GUPS = %0.4f (host = %0.4f)\n", (double)I/(double)time_e32, (double)I/(double)time_cpu);

	free(T);
	free(Thost);

	return 0;
}
90 changes: 90 additions & 0 deletions example/c_gups/shmem_tfunc.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* This routine performs a distributed RandomAccess benchmark. Instead of
* 64-bit values, 32-bit values are used. Also, a different PRNG is used
* because the GF(2) generator isn't very random at small indices, and
* calculating its n'th value directly is non-trivial. The replacement,
* INDEX(i), computes the i'th value directly from i.
*
* -JAR
*/

#include <host_stdio.h>
#include "gups.h"

#define SHMEM_USE_HEADER_ONLY

#include "shmem.h"
#include "shmemx.h"

/*
 * Counts the trailing zero bits of x; returns 32 when x is 0.
 *
 * Declared static so the definition is self-contained in this translation
 * unit: a plain C99/C11 `inline` definition does not provide the external
 * definition needed whenever a call is not inlined.
 */
static inline __attribute__((__always_inline__))
uint32_t ctz(uint32_t x)
{
	x &= -x;	// isolate the lowest set bit
	x -= 1;		// turn it into a mask of the trailing zeros
	// population count of the mask
	x -= ((x >> 1) & 0x55555555);
	x = (((x >> 2) & 0x33333333) + (x & 0x33333333));
	x = (((x >> 4) + x) & 0x0f0f0f0f);
	x += (x >> 8);
	x += (x >> 16);
	return (x & 0x0000003f);
}

/*
 * Device thread function: distributed RandomAccess update loop.
 *
 * The table of 2^N words is split into equal contiguous parts, one per
 * PE, each copied into a symmetric buffer. Each PE then performs its share
 * of the 2^(N+2) updates: for update i, a = INDEX(i), the top N bits of a
 * select the global table entry, whose high bits give the owning PE and
 * low bits the offset within that PE's part. Afterwards each PE copies its
 * part back to the table, and PE 0 stores the elapsed time in args->time.
 *
 * The number of PEs is taken to be a power of two, and N must be at
 * least log2 of it.
 */
void __entry
SHMEMRandomAccessPowerOf2( my_args_t* args )
{
	uint32_t* T = args->T;
	uint32_t N = args->N;
	uint32_t K = 1u << N;
	uint32_t I = 1u << (N + 2);
	uint32_t N32 = 32 - N;

	shmem_init();
	int me = shmem_my_pe();
	int npes = shmem_n_pes();

	uint32_t log2_npes = ctz(npes);
	uint32_t peshift = N - log2_npes;	// log2 of the entries per PE
	// Mask selecting the offset within one PE's part. Written this way
	// it is also well defined (and zero) when peshift is 0.
	uint32_t kmask1 = (1u << peshift) - 1u;

	uint32_t npart = K >> log2_npes;
	int ipart = I >> log2_npes;
	int istart = ipart * me;
	int iend = ipart * (me + 1);
	size_t nsz = npart * sizeof(uint32_t);
	uint32_t* t = shmem_align(0x2000, nsz);
	shmemx_memcpy(t, T + npart * me, nsz);

	shmem_barrier_all();

	ctimer_start();
	uint32_t time = ctimer();

	for (int i = istart; i < iend; i++) {
		// pseudo random number 'a', can trivially calculate n'th value
		uint32_t a = INDEX(i);
		// A shift by 32 is undefined in C; a one-entry table (N == 0)
		// always maps to entry 0.
		uint32_t k_global = N ? (a >> N32) : 0;
		int k = k_global & kmask1;
		int q = k_global >> peshift; // the processor number
		if (N > 11) { // The percentage of errors is high for small N...
			// Table entries are 32-bit, so the remote word must be
			// updated through a 32-bit pointer; a wider access would
			// also touch the neighbouring entry.
			uint32_t* ptk = shmem_ptr(t + k, q);
			*ptk ^= a;
		} else { // ...so use atomic xor to be error-free
			shmem_uint32_atomic_xor(t + k, a, q);
		}
	}

	shmem_barrier_all();
	// Elapsed ticks are computed as start minus end.
	time -= ctimer();
	time = ctimer_nsec(time);

	// Copying local results to DRAM
	shmemx_memcpy(T + npart * me, t, nsz);
	if (!me) args->time = time;

	shmem_free(t);

	shmem_finalize();
}

0 comments on commit 586f3e8

Please sign in to comment.