-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathmatrix_transpose_with_SM_nobc.cpp
97 lines (64 loc) · 2.62 KB
/
matrix_transpose_with_SM_nobc.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#include <hip/hip_runtime.h>
#include <cstdlib>
#include <vector>
const static int width = 4096;
const static int height = 4096;
const static int tile_dim = 16;
__global__ void transpose_SM_nobc_kernel(float *in, float *out, int width,
int height) {
__shared__ float tile[tile_dim][tile_dim+1];
int x_tile_index = blockIdx.x * tile_dim;
int y_tile_index = blockIdx.y * tile_dim;
int in_index =
(y_tile_index + threadIdx.y) * width + (x_tile_index + threadIdx.x);
int out_index =
(x_tile_index + threadIdx.y) * height + (y_tile_index + threadIdx.x);
tile[threadIdx.y][threadIdx.x] = in[in_index];
__syncthreads();
out[out_index] = tile[threadIdx.x][threadIdx.y];
}
int main() {
std::vector<float> matrix_in;
std::vector<float> matrix_out;
matrix_in.resize(width * height);
matrix_out.resize(width * height);
for (int i = 0; i < width * height; i++) {
matrix_in[i] = (float)rand() / (float)RAND_MAX;
}
float *d_in;
float *d_out;
hipMalloc((void **)&d_in, width * height * sizeof(float));
hipMalloc((void **)&d_out, width * height * sizeof(float));
hipMemcpy(d_in, matrix_in.data(), width * height * sizeof(float),
hipMemcpyHostToDevice);
printf("Setup complete. Launching kernel \n");
int block_x = width / tile_dim;
int block_y = height / tile_dim;
// Create events
hipEvent_t start_kernel_event;
hipEventCreate(&start_kernel_event);
hipEvent_t end_kernel_event;
hipEventCreate(&end_kernel_event);
printf("Warm up the gpu!\n");
for(int i=1;i<=10;i++){
hipLaunchKernelGGL(transpose_SM_nobc_kernel, dim3(block_x, block_y),
dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
height);}
hipEventRecord(start_kernel_event, 0);
for(int i=1;i<=10;i++){
hipLaunchKernelGGL(transpose_SM_nobc_kernel, dim3(block_x, block_y),
dim3(tile_dim, tile_dim), 0, 0, d_in, d_out, width,
height);}
hipEventRecord(end_kernel_event, 0);
hipEventSynchronize(end_kernel_event);
float time_kernel;
hipEventElapsedTime(&time_kernel, start_kernel_event, end_kernel_event);
printf("Kernel execution complete \n");
printf("Event timings:\n");
printf(" %.6f ms - shared memory with no bank conflicts \n Bandwidth %.6f GB/s\n", time_kernel/10, 2.0*10000*(((double)(width)*(double)height)*sizeof(float))/(time_kernel*1024*1024*1024));
hipMemcpy(matrix_out.data(), d_out, width * height * sizeof(float),
hipMemcpyDeviceToHost);
hipEventDestroy(start_kernel_event);
hipEventDestroy(end_kernel_event);
return 0;
}