forked from ContinuumIO/numbapro-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsaxpy.py
136 lines (99 loc) · 2.51 KB
/
saxpy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
SAXPY
-----
Compute `a * x + y`
Where
`a` is a scalar
`x` and `y` are vectors
Prefix 'S' indicates single-precision float32 operations
"""
from __future__ import print_function
import sys
import numpy
from numba import cuda, vectorize, float32, void
# GPU code
# ---------
@cuda.jit(void(float32, float32[:], float32[:], float32[:]))
def saxpy(a, x, y, out):
# Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
i = cuda.grid(1)
# Map i to array elements
if i >= out.size:
# Out of range?
return
# Do actual work
out[i] = a * x[i] + y[i]
"""
Vectorize turns a scalar function into a
elementwise operation over the input arrays.
"""
@vectorize([float32(float32, float32, float32)],
target='cuda')
def vec_saxpy(a, x, y):
### Task 1 ###
# Complete the vectorize version
# Hint: this is a scalar function of
# float32(float32 a, float32 x, float32 y)
return a * x + y
# CPU code
# ---------
NUM_BLOCKS = 1
NUM_THREADS = 32
NELEM = NUM_BLOCKS * NUM_THREADS
def task1():
a = numpy.float32(2.) # Force value to be float32
# Generate numbers 0..(NELEM - 1)
x = numpy.arange(NELEM, dtype='float32')
y = numpy.arange(NELEM, dtype='float32')
out = numpy.empty_like(x)
griddim = NUM_BLOCKS
blockdim = NUM_THREADS
saxpy[griddim, blockdim](a, x, y, out)
print("out =", out)
vecout = vec_saxpy(a, x, y)
print("vecout =", vecout)
# Check output
if not (out == vecout).all():
print("Incorrect result")
else:
print("Correct result")
def task2():
a = numpy.float32(2.) # Force value to be float32
x = numpy.arange(NELEM, dtype='float32')
y = numpy.arange(NELEM, dtype='float32')
### Task2 ###
# a) Complete the memory transfer for x -> dx, y -> dy
# b) Allocate device memory for dout
# c) Transfer for out <- dout
dx = cuda.to_device(x)
dy = cuda.to_device(y)
dout = cuda.device_array_like(x)
griddim = NUM_BLOCKS
blockdim = NUM_THREADS
saxpy[griddim, blockdim](a, dx, dy, dout)
out = dout.copy_to_host()
print("out =", out)
if numpy.allclose(a * x + y, out):
print("Correct result")
else:
print("Incorrect result")
# ----------------------------------------------------------------------------
# All exercise code is above
HELPER = """
Lab 1
This exercise contains two tasks.
Usage:
python %(this)s --task1
Execute task1 codepath
python %(this)s --task2
Execute task2 codepath
"""
def main():
if '--task1' in sys.argv[1:]:
task1()
elif '--task2' in sys.argv[1:]:
task2()
else:
print(HELPER % {'this': sys.argv[0]})
if __name__ == '__main__':
main()