forked from ContinuumIO/numbapro-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvolve.py
144 lines (111 loc) · 3.72 KB
/
convolve.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
Convolve
"""
from __future__ import print_function
import sys
import numpy as np
from scipy.signal import fftconvolve
from scipy import misc, ndimage
from accelerate.cuda.fft import FFTPlan, fft_inplace, ifft_inplace
from numba import cuda, vectorize
from timeit import default_timer as timer
@vectorize(['complex64(complex64, complex64)'], target='cuda')
def vmult(a, b):
"""Element complex64 multiplication
"""
return a * b
def task1(d_image_complex, d_response_complex):
### Task1 ###
# Implement a inplace CUDA FFT convolution
# Pseduocode:
# freq_imag = fft(image)
# freq_resp = fft(response)
# freq_out = fftimag * fftresp
# output = ifft(freq_out)
#
# Use the cuFFT functions:
# - fft_inplace(ary)
# - ifft_inplace(ary)
#
# Call `vmult` which is our elementwise complex multiplication.
# Do a inplace operation on `d_image_complex`.
# Hints:
# - keyword argument 'out' specify the output array
# - length of d_image_complex and d_response_complex has the same length.
fft_inplace(d_image_complex)
fft_inplace(d_response_complex)
vmult(d_image_complex, d_response_complex, out=d_image_complex)
ifft_inplace(d_image_complex)
# At this point, we have applied the filter onto d_image_complex
return # Does not return anything
# ----------------------------------------------------------------------------
# Details of the program. Not necessary for the exercise.
def convolve():
# Build Filter
laplacian_pts = '''
-4 -1 0 -1 -4
-1 2 3 2 -1
0 3 4 3 0
-1 2 3 2 -1
-4 -1 0 -1 -4
'''.split()
laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)
image = get_image()
print("Image size: %s" % (image.shape,))
response = np.zeros_like(image)
response[:5, :5] = laplacian
# CPU
# Use SciPy to perform the FFT convolution
ts = timer()
cvimage_cpu = fftconvolve(image, laplacian, mode='same')
te = timer()
print('CPU: %.2fs' % (te - ts))
# GPU
threadperblock = 32, 8
blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
print('kernel config: %s x %s' % (blockpergrid, threadperblock))
# Initialize the cuFFT system.
FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)
# Start GPU timer
ts = timer()
image_complex = image.astype(np.complex64)
response_complex = response.astype(np.complex64)
d_image_complex = cuda.to_device(image_complex)
d_response_complex = cuda.to_device(response_complex)
task1(d_image_complex, d_response_complex)
cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)
te = timer()
print('GPU: %.2fs' % (te - ts))
return cvimage_cpu, cvimage_gpu
def best_grid_size(size, tpb):
bpg = np.ceil(np.array(size, dtype=np.float) / tpb).astype(np.int).tolist()
return tuple(bpg)
def get_image():
# Build Image
try:
filename = sys.argv[1]
image = ndimage.imread(filename, flatten=True).astype(np.float32)
except IndexError:
image = misc.lena().astype(np.float32)
return image
def main():
cvimage_cpu, cvimage_gpu = convolve()
# Plot the results
try:
from matplotlib import pyplot as plt
except ImportError:
print("To render the images")
print("Do `conda install matplotlib` on the terminal")
print("Then, rerun")
else:
plt.subplot(1, 2, 1)
plt.title('CPU')
plt.imshow(cvimage_cpu, cmap=plt.cm.gray)
plt.axis('off')
plt.subplot(1, 2, 2)
plt.title('GPU')
plt.imshow(cvimage_gpu, cmap=plt.cm.gray)
plt.axis('off')
plt.show()
if __name__ == '__main__':
main()