1
#include <cuda.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <chrono>
using namespace std::chrono;
using nano_double = duration<double, std::nano>;
7
+
8
// EXPORT_API decorates functions that must be visible to users of the built
// library: it expands to `__declspec(dllexport)` when _WIN32 is defined and
// to nothing otherwise.
#if defined(_WIN32)
#  define EXPORT_API __declspec(dllexport)
#else
#  define EXPORT_API
#endif
13
+
14
// Row-major element accessors used by diffusion_kernel. They expand against
// the names `_A_new`, `_A` and `n`, so they are only meaningful in a scope
// that declares all three (the kernel's parameter list).
//   A(ix, iy)     -> element (ix, iy) of the n x n input grid
//   A_new(ix, iy) -> element (ix, iy) of the (n-2) x (n-2) output grid
// Every argument is parenthesized, and the offset is computed in size_t so
// that iy * n cannot overflow `int` once n * n exceeds INT_MAX.
#define A_new(ix, iy) _A_new[(size_t)(iy) * (size_t)(n - 2) + (size_t)(ix)]
#define A(ix, iy) _A[(size_t)(iy) * (size_t)(n) + (size_t)(ix)]

// One 5-point stencil update over the interior of an n x n grid:
//   out(ix, iy) = c + h * (left + right + down + up - 4 * c)
// where c = in(ix + 1, iy + 1) and the four neighbours are taken from the
// input grid around c.
//
// Launch layout: 2D grid of 2D blocks, one thread per output element. Threads
// whose (ix, iy) falls outside the (n-2) x (n-2) output do nothing, so the
// grid may be larger than the output. No shared memory is used.
//
// Parameters:
//   _A_new  output, (n-2) * (n-2) doubles, row-major
//   _A      input,  n * n doubles, row-major
//   n       side length of the input grid
//   h       factor applied to the neighbour sum minus 4 * centre
//
// Precondition: _A_new and _A must not overlap (both are declared
// __restrict__ so the compiler may route the reads through the read-only
// data path).
__global__ void diffusion_kernel(double *__restrict__ _A_new, const double *__restrict__ _A, const int n,
                                 const double h) {
    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard the grid tail: the launch grid is a whole number of blocks and
    // generally overshoots the output extent.
    if (ix < n - 2 && iy < n - 2) {
        A_new(ix, iy) = A(ix + 1, iy + 1) + h * (A(ix, iy + 1) + A(ix + 2, iy + 1) + A(ix + 1, iy) + A(ix + 1, iy + 2) -
                                                 4.0 * A(ix + 1, iy + 1));
    }
}
25
+
26
// Reports a failed CUDA runtime call on stderr; returns true iff `status` is cudaSuccess.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "run_benchmark: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Times `nsamples` back-to-back launches of diffusion_kernel on an n x n grid.
//
// Parameters:
//   times     host array with room for at least `nsamples` doubles; entry i
//             receives the wall-clock time of sample i in nanoseconds,
//             measured from just before the launch until the stream has
//             been synchronized
//   nsamples  number of launches to time
//   n         side length of the input grid (the output is (n-2) x (n-2))
//
// Error behaviour: if any CUDA call fails, a message is written to stderr,
// sampling stops, and the `times` entries from the failing sample onwards are
// left unwritten. Device buffers and the stream are always released.
extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples, const int n) {
    // Locals carry a d_ prefix so they can never collide with the file-level
    // A / A_new accessor macros.
    double *d_A_new = nullptr;
    double *d_A = nullptr;
    cudaStream_t stream = nullptr;

    // Size the buffers in size_t: `n * n * sizeof(double)` would multiply in
    // `int` first and overflow for n > 46340.
    const size_t side = n > 0 ? (size_t)n : 0;
    const size_t interior = n > 2 ? (size_t)(n - 2) : 0;
    const size_t bytes_in = side * side * sizeof(double);
    const size_t bytes_out = interior * interior * sizeof(double);

    bool ok = cuda_ok(cudaMalloc(&d_A_new, bytes_out), "cudaMalloc(A_new)");
    ok = ok && cuda_ok(cudaMalloc(&d_A, bytes_in), "cudaMalloc(A)");
    const bool have_stream = ok && cuda_ok(cudaStreamCreate(&stream), "cudaStreamCreate");
    ok = have_stream;

    // Give the kernel a defined input instead of whatever the allocation
    // happened to contain, and finish that work before any sample is timed.
    if (ok && bytes_in > 0) {
        ok = cuda_ok(cudaMemsetAsync(d_A, 0, bytes_in, stream), "cudaMemsetAsync(A)") &&
             cuda_ok(cudaStreamSynchronize(stream), "cudaStreamSynchronize(init)");
    }

    const double h = 1.0 / 5.0;

    // The grid is sized for n x n threads; the surplus threads beyond the
    // (n-2) x (n-2) output return at the kernel's bounds check.
    const dim3 nthreads(32, 8);
    const dim3 nblocks((n + nthreads.x - 1) / nthreads.x, (n + nthreads.y - 1) / nthreads.y);

    for (int isample = 0; ok && isample < nsamples; ++isample) {
        const auto start = high_resolution_clock::now();
        diffusion_kernel<<<nblocks, nthreads, 0, stream>>>(d_A_new, d_A, n, h);
        const cudaError_t sync_status = cudaStreamSynchronize(stream);
        const auto elapsed = high_resolution_clock::now() - start;

        // Launch-configuration errors are only visible through
        // cudaGetLastError(); query it after the clock has stopped so the
        // timed region is exactly launch + synchronize.
        const cudaError_t launch_status = cudaGetLastError();
        ok = cuda_ok(launch_status, "diffusion_kernel launch") &&
             cuda_ok(sync_status, "cudaStreamSynchronize");
        if (ok) {
            times[isample] = duration_cast<nano_double>(elapsed).count();
        }
    }

    // cudaFree(nullptr) is a no-op, so this is safe after a partial setup.
    (void)cuda_ok(cudaFree(d_A_new), "cudaFree(A_new)");
    (void)cuda_ok(cudaFree(d_A), "cudaFree(A)");

    if (have_stream) {
        (void)cuda_ok(cudaStreamDestroy(stream), "cudaStreamDestroy");
    }
}
0 commit comments