-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathsaxpy.cpp
97 lines (78 loc) · 2.79 KB
/
saxpy.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include <hip/hip_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
// Wrap a HIP API call so that a failure is reported together with the source
// location of the call site, e.g. HIP_ERRCHK(hipMalloc(&ptr, num_bytes));
#define HIP_ERRCHK(result) hip_errchk(result, __FILE__, __LINE__)

/**
 * Abort the program if a HIP API call did not succeed.
 *
 * @param result Status code returned by the HIP API call.
 * @param file   Source file of the call site (supplied by HIP_ERRCHK).
 * @param line   Line number of the call site (supplied by HIP_ERRCHK).
 *
 * On any status other than hipSuccess, prints the HIP error string and the
 * call site, then terminates the process with EXIT_FAILURE. Returns normally
 * otherwise.
 */
static inline void hip_errchk(hipError_t result, const char *file, int line) {
    if (result == hipSuccess) {
        return;
    }

    // Diagnostics go to stderr so they are not mixed into (or lost with)
    // the result output that main() writes to stdout.
    fprintf(stderr, "\n\n%s in %s at line %d\n", hipGetErrorString(result),
            file, line);
    exit(EXIT_FAILURE);
}
/*
TODO: add a device kernel that calculates y = a * x + y for vectors x, y and
constant a
Hints:
What attribute(s) do you need to add on a kernel declaration?
- __device__?
- __global__?
- __shared__?
- no attribute(s) needed?
What is the return type of a kernel?
- int?
- float?
- void?
- depends on the kernel?
What data do you need in the kernel to compute y = a * x + y, for vectors x, y,
and constant a?
What built-in variables can you use to calculate the (global) index for a
thread?
- Is threadIdx enough or do you need blockIdx, blockDim, gridDim?
- Is the problem one or multi-dimensional?
- Remember the grid, block, thread hierarchy and the launch parameters
*/
/**
 * Exercise driver for SAXPY (y = a * x + y).
 *
 * Fills x and y on the host, computes the expected result y_ref on the CPU,
 * and then prints a few reference/result values plus the accumulated absolute
 * error between y_ref and y. The device part (allocation, copies, kernel
 * launch, cleanup) is left as TODOs; until those are implemented y still
 * holds its initial values, so the reported error is non-zero.
 *
 * @return 0 always.
 */
int main() {
    // Use HIP_ERRCHK to help you find any errors you make with the API calls

    // Read the HIP Runtime API documentation to help you with the API calls:
    // Ctrl-click this to open it in a browser:
    // https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/doxygen/html/group___memory.html

    static constexpr size_t n = 1000000;
    static constexpr size_t num_bytes = sizeof(float) * n;
    static constexpr float a = 3.4f;

    std::vector<float> x(n);
    std::vector<float> y(n);
    std::vector<float> y_ref(n);

    // Initialise data and calculate reference values on CPU
    for (size_t i = 0; i < n; i++) {
        // Evaluate sin/cos in double precision, then narrow to float
        // explicitly; the conversions are spelled out instead of implicit.
        const double angle = static_cast<double>(i);
        x[i] = static_cast<float>(sin(angle) * 2.3);
        y[i] = static_cast<float>(cos(angle) * 1.1);
        y_ref[i] = a * x[i] + y[i];
    }

    // TODO: Allocate + copy initial values
    // - hipMalloc, hipMemcpy

    // TODO: Define grid dimensions + launch the device kernel
    // int/dim3 threads = ...
    // int/dim3 blocks = ...
    // kernelName<<<blocks, threads>>>(arguments);

    // TODO: Copy results back to CPU
    // - hipMemcpy

    // TODO: Free device memory
    // - hipFree

    // Check the result of the GPU computation
    printf("reference: %f %f %f %f ... %f %f\n", y_ref[0], y_ref[1], y_ref[2],
           y_ref[3], y_ref[n - 2], y_ref[n - 1]);
    printf(" result: %f %f %f %f ... %f %f\n", y[0], y[1], y[2], y[3],
           y[n - 2], y[n - 1]);

    float error = 0.0f;
    static constexpr float tolerance = 1e-6f;
    for (size_t i = 0; i < n; i++) {
        // fabsf rather than unqualified abs: if only the C int abs(int) is
        // visible, abs would truncate the difference to an integer and hide
        // every error smaller than 1.0.
        const float diff = fabsf(y_ref[i] - y[i]);
        // Only differences above the tolerance contribute to the total.
        if (diff > tolerance)
            error += diff;
    }

    printf("total error: %f\n", error);
    printf(" reference: %f at (42)\n", y_ref[42]);
    printf(" result: %f at (42)\n", y[42]);

    return 0;
}