9
9
*/
10
10
11
11
#include <chrono>   // timing library
#include <cstdlib>  // std::exit(), EXIT_FAILURE
#include <iostream>
#include <numeric>  // std::accumulate()
#include <vector>
16
+
14
17
namespace { // anonymous
15
18
16
19
using time_series = float [ 4 ];
17
20
#include " ../21-simt-trees/4d-points.hpp"
18
21
19
- } // namespace anonymous
20
-
21
-
22
- namespace cpu {
23
-
24
22
/* *
25
23
* Determines whether two time_series are within a piecewise threshold of each other.
26
24
*/
25
+ __host__ __device__
27
26
bool near_match ( time_series const t1, time_series const t2, float const threshold )
28
27
{
29
28
for ( auto i = 0u ; i < 4u ; ++i )
@@ -36,6 +35,24 @@ bool near_match( time_series const t1, time_series const t2, float const thresho
36
35
return true ;
37
36
}
38
37
38
/**
 * Reports whether the time series at index i is a near match (as decided by
 * near_match() with the given threshold) of at least one *other* time series
 * among the first n entries of data.
 *
 * Callable from both host and device code.
 */
__host__ __device__
bool check_match( time_series * const data, uint32_t const i, float const threshold, size_t const n )
{
    for( unsigned long other = 0ul; other < n; ++other )
    {
        if( other == i )
        {
            continue; // a series is never compared against itself
        }

        if( near_match( data[ i ], data[ other ], threshold ) )
        {
            return true; // one partner is enough; stop scanning
        }
    }

    return false;
}
50
+
51
+ } // namespace anonymous
52
+
53
+
54
+ namespace cpu {
55
+
39
56
/* *
40
57
* Calculates the number of time series that match at least one other in the dataset,
41
58
* given a particular threshold for piecewise error tolerance.
@@ -48,21 +65,58 @@ template < typename T >
48
65
#pragma omp parallel for reduction ( +:count )
49
66
for ( auto i = 0lu; i < n; ++i )
50
67
{
51
- for ( auto j = 0lu; j < n; ++j )
68
+ if ( check_match ( points, i, threshold, n ) )
52
69
{
53
- if ( i != j && near_match ( points[ i ], points[ j ], threshold ) )
54
- {
55
- ++count;
56
- break ;
57
- }
70
+ ++count;
58
71
}
59
72
}
60
73
61
74
return count;
62
75
}
63
76
64
77
} // namespace cpu
78
namespace gpu {

auto const block_size = 512u; // threads per block used when launching has_match

/**
 * Aborts the program with a diagnostic when a CUDA runtime call did not succeed.
 *
 * @param status The status code returned by the CUDA runtime.
 * @param what   Short description of the operation, used in the error message.
 */
inline void check_cuda( cudaError_t const status, char const * const what )
{
    if( status != cudaSuccess )
    {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString( status ) << std::endl;
        std::exit( EXIT_FAILURE );
    }
}

/**
 * Kernel: each thread takes one index i and records in result[ i ] whether
 * data[ i ] has a match elsewhere in data, as decided by check_match().
 *
 * Expects a 1D launch with at least n threads in total; threads whose index
 * is >= n do nothing.
 *
 * @param result    Output array with room for n flags (1 = match found, 0 = none).
 * @param data      Input array of n elements, forwarded to check_match().
 * @param threshold Tolerance forwarded to check_match().
 * @param n         Number of elements in data and result.
 */
template < typename S, typename T >
__global__
void has_match( uint8_t * result, S * data, T const threshold, size_t const n )
{
    auto const i = threadIdx.x + blockIdx.x * blockDim.x;

    if( i < n )
    {
        result[ i ] = check_match( data, i, threshold, n );
    }
}

/**
 * Calculates, on the GPU, the number of time series among the first n entries
 * of the file-scope `points` array that match at least one other entry, given
 * a particular threshold.
 *
 * Every CUDA runtime call and the kernel launch are checked; on failure the
 * program prints a diagnostic and exits instead of silently reporting zero.
 *
 * @param threshold Tolerance forwarded to the matching kernel.
 * @param n         Number of leading entries of `points` to consider.
 * @return The number of entries that have at least one match, converted to T.
 */
template < typename T >
T similar_time_series( T const threshold, size_t const n )
{
    if( n == 0u )
    {
        // Nothing to compare; a zero-sized grid would also be an invalid launch.
        return static_cast< T >( 0 );
    }

    auto const input_bytes  = n * sizeof( points[ 0 ] );
    auto const output_bytes = n * sizeof( uint8_t );

    // Integer ceiling division: exact for any n, unlike rounding through float.
    auto const num_blocks = static_cast< unsigned int >(
        n / block_size + ( n % block_size != 0u ? 1u : 0u ) );

    std::vector< uint8_t > found_masks( n, 0 );
    uint8_t *dev_output = nullptr;
    time_series *dev_input = nullptr;

    check_cuda( cudaMalloc( (void **) &dev_output, output_bytes ), "cudaMalloc (output)" );
    check_cuda( cudaMalloc( (void **) &dev_input, input_bytes ), "cudaMalloc (input)" );

    check_cuda( cudaMemcpy( dev_input, points, input_bytes, cudaMemcpyHostToDevice ),
                "cudaMemcpy (host to device)" );

    has_match<<< num_blocks, block_size >>>( dev_output, dev_input, threshold, n );
    check_cuda( cudaGetLastError(), "has_match launch" ); // launch-configuration errors

    // Blocking copy: also surfaces any error raised while the kernel was running.
    check_cuda( cudaMemcpy( found_masks.data(), dev_output, output_bytes, cudaMemcpyDeviceToHost ),
                "cudaMemcpy (device to host)" );

    check_cuda( cudaFree( dev_input ), "cudaFree (input)" );
    check_cuda( cudaFree( dev_output ), "cudaFree (output)" );

    // Sum the 0/1 flags as integers so the count stays exact, then convert once.
    auto const matches = std::accumulate( found_masks.cbegin(), found_masks.cend(), size_t{ 0 } );
    return static_cast< T >( matches );
}

} // namespace gpu
66
120
67
121
68
122
int main ( int argc, char **argv )
@@ -75,15 +129,16 @@ int main( int argc, char **argv )
75
129
}
76
130
77
131
auto const num_trials = 20u ;
78
- auto const threshold = atof ( argv[ 1 ] );
132
+ auto const threshold = static_cast < float >( atof ( argv[ 1 ] ) );
79
133
auto const n = sizeof ( points ) / sizeof ( points[ 0 ] ) / 25 ; // increase at your own risk!
80
134
auto sum = 0.0 ;
81
135
82
136
auto const start_time = std::chrono::system_clock::now ();
83
137
84
138
for ( auto i = 0u ; i < num_trials; ++i )
85
139
{
86
- sum += cpu::similar_time_series ( threshold, n );
140
+ // sum += cpu::similar_time_series( threshold, n );
141
+ sum += gpu::similar_time_series ( threshold, n );
87
142
}
88
143
89
144
auto const end_time = std::chrono::system_clock::now ();
0 commit comments