⛓ Add CUDA version of time series similarity code

sean-chester · sean-chester · commit 6ad866e3d768 · 2020-03-27T10:27:02.000-07:00
diff --git a/lectures/25-divergence/branch-div.cu b/lectures/25-divergence/branch-div.cu
@@ -9,21 +9,20 @@
  */
 
 #include <chrono>     // timing library
+#include <cstdlib>    // std::exit(), EXIT_FAILURE
+#include <numeric>    // std::accumulate()
 #include <iostream>
 
+#include <vector>
+
 namespace { // anonymous
 
 using time_series = float[ 4 ];
 #include "../21-simt-trees/4d-points.hpp"
 
-} // namespace anonymous
-
-
-namespace cpu {
-
 /**
  * Determines whether two time_series are within a piecewise threshold of each other.
  */
+__host__ __device__
 bool near_match( time_series const t1, time_series const t2, float const threshold )
 {
     for( auto i = 0u; i < 4u; ++i )
@@ -36,6 +35,24 @@ bool near_match( time_series const t1, time_series const t2, float const thresho
     return true;
 }
 
+/**
+ * Determines whether the time series at index i is a near match of at least one
+ * other time series among the first n entries of data.
+ *
+ * Callable from both host and device code. The scan stops at the first match found.
+ */
+__host__ __device__
+bool check_match( time_series * const data, uint32_t const i, float const threshold, size_t const n )
+{
+    for( auto other = 0lu; other < n; ++other )
+    {
+        // A time series is never compared against itself.
+        if( other == i )
+        {
+            continue;
+        }
+
+        if( near_match( data[ i ], data[ other ], threshold ) )
+        {
+            return true;
+        }
+    }
+
+    // Every other candidate was inspected without finding a match.
+    return false;
+}
+
+} // namespace anonymous
+
+
+namespace cpu {
+
 /**
  * Calculates the number of time series that match at least one other in the dataset,
  * given a particular threshold for piecewise error tolerance.
@@ -48,21 +65,58 @@ template < typename T >
         #pragma omp parallel for reduction ( +:count )
         for( auto i = 0lu; i < n; ++i )
         {
-            for( auto j = 0lu; j < n; ++j )
+            if( check_match( points, i, threshold, n ) )
             {
-                if( i != j && near_match( points[ i ], points[ j ], threshold ) )
-                {
-                    ++count;
-                    break;
-                }
+                ++count;
             }
         }
 
         return count;
     }
 
 } // namespace cpu
+namespace gpu {
+
+// Number of threads per block used when launching has_match.
+constexpr unsigned int block_size = 512u;
+
+/**
+ * Kernel that records, for each of the first n entries of data, whether that entry
+ * matches at least one other entry according to check_match.
+ *
+ * Only the x dimension of the launch is used; each thread handles the single index
+ * blockIdx.x * blockDim.x + threadIdx.x, so the launch must supply at least n threads.
+ * Threads whose index is n or larger do nothing. result must have room for n entries.
+ */
+template < typename S, typename T >
+    __global__
+    void has_match( uint8_t * result, S * data, T const threshold, size_t const n )
+    {
+        auto const global_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+        // Surplus threads in the last block have no element to process.
+        if( global_id >= n )
+        {
+            return;
+        }
+
+        result[ global_id ] = check_match( data, global_id, threshold, n );
+    }
+
+/**
+ * Calculates, on the GPU, the number of time series that match at least one other in the
+ * dataset, given a particular threshold for piecewise error tolerance.
+ *
+ * Copies the first n entries of points to the device, launches has_match with block_size
+ * threads per block, copies the per-series flags back and sums them on the host. Returns
+ * zero without touching the device when n is zero. Any failed CUDA runtime call is
+ * reported on std::cerr and terminates the program with EXIT_FAILURE.
+ */
+template < typename T >
+    T similar_time_series( T const threshold, size_t const n )
+    {
+        // An empty input would otherwise request a zero-block grid, which is an invalid launch.
+        if( n == 0u )
+        {
+            return T{};
+        }
+
+        // Reports a failed CUDA runtime call and stops the program.
+        auto const check = []( cudaError_t const status, char const * const what )
+        {
+            if( status != cudaSuccess )
+            {
+                std::cerr << "CUDA error during " << what << ": "
+                          << cudaGetErrorString( status ) << std::endl;
+                std::exit( EXIT_FAILURE );
+            }
+        };
+
+        auto const input_bytes  = n * sizeof( points[ 0 ] );
+        auto const output_bytes = n * sizeof( uint8_t );
+
+        // Integer ceiling division: exact for every n, unlike rounding through float.
+        auto const num_blocks = static_cast< unsigned int >( ( n + block_size - 1u ) / block_size );
+
+        std::vector< uint8_t > found_masks( n, 0 );
+        uint8_t *dev_output = nullptr;
+        time_series *dev_input = nullptr;
+
+        check( cudaMalloc( (void **) &dev_output, output_bytes ), "cudaMalloc (output)" );
+        check( cudaMalloc( (void **) &dev_input,  input_bytes ),  "cudaMalloc (input)" );
+
+        check( cudaMemcpy( dev_input, points, input_bytes, cudaMemcpyHostToDevice ),
+               "host-to-device copy" );
+
+        has_match<<< num_blocks, block_size >>>( dev_output, dev_input, threshold, n );
+        // A kernel launch returns no status itself; configuration errors are fetched here.
+        check( cudaGetLastError(), "has_match launch" );
+
+        // This blocking copy also surfaces errors raised while the kernel was executing.
+        check( cudaMemcpy( found_masks.data(), dev_output, output_bytes, cudaMemcpyDeviceToHost ),
+               "device-to-host copy" );
+
+        check( cudaFree( dev_input ),  "cudaFree (input)" );
+        check( cudaFree( dev_output ), "cudaFree (output)" );
+
+        // Sum the flags in an integer so the total stays exact, then convert once to T.
+        return static_cast< T >(
+            std::accumulate( found_masks.cbegin(), found_masks.cend(), size_t{ 0 } ) );
+    }
 
+} // namespace gpu
 
 
 int main( int argc, char **argv )
@@ -75,15 +129,16 @@ int main( int argc, char **argv )
     }
 
     auto const num_trials = 20u;
-    auto const threshold = atof( argv[ 1 ] );
+    auto const threshold = static_cast< float >( atof( argv[ 1 ] ) );
     auto const n = sizeof( points ) / sizeof( points[ 0 ] ) / 25; // increase at your own risk!
     auto sum = 0.0;
 
     auto const start_time = std::chrono::system_clock::now();
 
     for( auto i = 0u; i < num_trials; ++i )
     {
-        sum += cpu::similar_time_series( threshold, n );
+        // sum += cpu::similar_time_series( threshold, n );
+        sum += gpu::similar_time_series( threshold, n );
     }
 
     auto const end_time = std::chrono::system_clock::now();