From 4b61388e8fb2314220b028cdaf30a5bbee21b0e9 Mon Sep 17 00:00:00 2001
From: Bryce Adelstein Lelbach aka wash <brycelelbach@gmail.com>
Date: Wed, 14 Feb 2018 08:47:38 -0800
Subject: [PATCH] CUB: Integrate CUB 1.7.5 into Thrust to pull in the
 (corrected) fix for small data type radix sorting performance regressions.
 Bug 1997368 Bug 200355591 git-commit b56409c060fe4c718066d19099fb12d8acdb2163
 git-author Bryce Adelstein Lelbach aka wash <brycelelbach@gmail.com> VDVS:
 http://ausdvs.nvidia.com/Build_Results?virtualId=1000099285&which_page=current_build

Jobs: 1997368-2006 200355591-2006
[git-p4: depot-paths = "//sw/gpgpu/thrust/": change = 23593281]
---
 internal/rename_cub_namespace.sh              |   7 +
 internal/reverse_rename_cub_namespace.sh      |   7 +
 internal/update_thrust_cub.sh                 |  18 --
 .../cuda/detail/cub/agent/agent_histogram.cuh |   2 +-
 .../cub/agent/agent_radix_sort_downsweep.cuh  |  29 +-
 .../cub/agent/agent_radix_sort_upsweep.cuh    |   2 +-
 .../cuda/detail/cub/agent/agent_reduce.cuh    |   2 +-
 .../detail/cub/agent/agent_reduce_by_key.cuh  |  12 +-
 .../cuda/detail/cub/agent/agent_rle.cuh       |   6 +-
 .../cuda/detail/cub/agent/agent_scan.cuh      |   2 +-
 .../detail/cub/agent/agent_segment_fixup.cuh  |   2 +-
 .../cuda/detail/cub/agent/agent_select_if.cuh |   2 +-
 .../cuda/detail/cub/agent/agent_spmv_orig.cuh | 261 +-----------------
 .../cub/agent/single_pass_scan_operators.cuh  |   4 +-
 .../cub/block/block_adjacent_difference.cuh   |   2 +-
 .../detail/cub/block/block_discontinuity.cuh  |   2 +-
 .../cuda/detail/cub/block/block_exchange.cuh  |   2 +-
 .../cuda/detail/cub/block/block_histogram.cuh |   2 +-
 .../cuda/detail/cub/block/block_load.cuh      |  47 +---
 .../detail/cub/block/block_radix_rank.cuh     |   7 +-
 .../detail/cub/block/block_radix_sort.cuh     |   7 +-
 .../detail/cub/block/block_raking_layout.cuh  |   2 +-
 .../cuda/detail/cub/block/block_reduce.cuh    |   2 +-
 .../cuda/detail/cub/block/block_scan.cuh      |   2 +-
 .../cuda/detail/cub/block/block_shuffle.cuh   |   2 +-
 .../cuda/detail/cub/block/block_store.cuh     |   2 +-
 .../block_histogram_atomic.cuh                |   2 +-
 .../specializations/block_histogram_sort.cuh  |   2 +-
 .../specializations/block_reduce_raking.cuh   |   2 +-
 .../block_reduce_raking_commutative_only.cuh  |   2 +-
 .../block_reduce_warp_reductions.cuh          |   2 +-
 .../specializations/block_scan_raking.cuh     |   2 +-
 .../specializations/block_scan_warp_scans.cuh |   2 +-
 .../block_scan_warp_scans2.cuh                |   2 +-
 .../block_scan_warp_scans3.cuh                |   2 +-
 thrust/system/cuda/detail/cub/cub.cuh         |   2 +-
 .../detail/cub/device/device_histogram.cuh    |   2 +-
 .../detail/cub/device/device_partition.cuh    |   2 +-
 .../detail/cub/device/device_radix_sort.cuh   |   7 +-
 .../cuda/detail/cub/device/device_reduce.cuh  |   2 +-
 .../cub/device/device_run_length_encode.cuh   |   2 +-
 .../cuda/detail/cub/device/device_scan.cuh    |   2 +-
 .../device/device_segmented_radix_sort.cuh    |   7 +-
 .../cub/device/device_segmented_reduce.cuh    |   2 +-
 .../cuda/detail/cub/device/device_select.cuh  |   2 +-
 .../cuda/detail/cub/device/device_spmv.cuh    |   2 +-
 .../device/dispatch/dispatch_histogram.cuh    |   2 +-
 .../device/dispatch/dispatch_radix_sort.cuh   | 151 ++++------
 .../cub/device/dispatch/dispatch_reduce.cuh   |  42 +--
 .../dispatch/dispatch_reduce_by_key.cuh       |   2 +-
 .../cub/device/dispatch/dispatch_rle.cuh      |   2 +-
 .../cub/device/dispatch/dispatch_scan.cuh     |  16 +-
 .../device/dispatch/dispatch_select_if.cuh    |   2 +-
 .../device/dispatch/dispatch_spmv_orig.cuh    |  90 +++---
 .../cuda/detail/cub/grid/grid_barrier.cuh     |   2 +-
 .../cuda/detail/cub/grid/grid_even_share.cuh  |   2 +-
 .../cuda/detail/cub/grid/grid_mapping.cuh     |   2 +-
 .../cuda/detail/cub/grid/grid_queue.cuh       |   2 +-
 thrust/system/cuda/detail/cub/host/mutex.cuh  |   2 +-
 .../cub/iterator/arg_index_input_iterator.cuh |   2 +-
 .../cache_modified_input_iterator.cuh         |   2 +-
 .../cache_modified_output_iterator.cuh        |   2 +-
 .../cub/iterator/constant_input_iterator.cuh  |   2 +-
 .../cub/iterator/counting_input_iterator.cuh  |   2 +-
 .../cub/iterator/discard_output_iterator.cuh  |   2 +-
 .../cub/iterator/tex_obj_input_iterator.cuh   |   2 +-
 .../cub/iterator/tex_ref_input_iterator.cuh   |   2 +-
 .../cub/iterator/transform_input_iterator.cuh |   2 +-
 .../cuda/detail/cub/thread/thread_load.cuh    |   2 +-
 .../detail/cub/thread/thread_operators.cuh    |   2 +-
 .../cuda/detail/cub/thread/thread_reduce.cuh  |   2 +-
 .../cuda/detail/cub/thread/thread_scan.cuh    |   2 +-
 .../cuda/detail/cub/thread/thread_search.cuh  |   2 +-
 .../cuda/detail/cub/thread/thread_store.cuh   |   2 +-
 .../system/cuda/detail/cub/util_allocator.cuh |   2 +-
 thrust/system/cuda/detail/cub/util_arch.cuh   |  36 +--
 thrust/system/cuda/detail/cub/util_debug.cuh  |   2 +-
 thrust/system/cuda/detail/cub/util_device.cuh |   2 +-
 thrust/system/cuda/detail/cub/util_macro.cuh  |   2 +-
 .../system/cuda/detail/cub/util_namespace.cuh |   2 +-
 thrust/system/cuda/detail/cub/util_ptx.cuh    |   2 +-
 thrust/system/cuda/detail/cub/util_type.cuh   |  36 ++-
 .../warp/specializations/warp_reduce_shfl.cuh |   2 +-
 .../warp/specializations/warp_reduce_smem.cuh |   2 +-
 .../warp/specializations/warp_scan_shfl.cuh   |   2 +-
 .../warp/specializations/warp_scan_smem.cuh   |   2 +-
 .../cuda/detail/cub/warp/warp_reduce.cuh      |   2 +-
 .../system/cuda/detail/cub/warp/warp_scan.cuh |   2 +-
 88 files changed, 318 insertions(+), 610 deletions(-)
 create mode 100755 internal/rename_cub_namespace.sh
 create mode 100755 internal/reverse_rename_cub_namespace.sh
 delete mode 100755 internal/update_thrust_cub.sh

diff --git a/internal/rename_cub_namespace.sh b/internal/rename_cub_namespace.sh
new file mode 100755
index 000000000..7a539e5d6
--- /dev/null
+++ b/internal/rename_cub_namespace.sh
@@ -0,0 +1,7 @@
+#! /bin/bash
+
+# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to add a THRUST_
+# prefix to CUB's namespace macro.
+
+sed -i -e 's/CUB_NS_P/THRUST_CUB_NS_P/g' `find . -type f`
+
diff --git a/internal/reverse_rename_cub_namespace.sh b/internal/reverse_rename_cub_namespace.sh
new file mode 100755
index 000000000..bc4858449
--- /dev/null
+++ b/internal/reverse_rename_cub_namespace.sh
@@ -0,0 +1,7 @@
+#! /bin/bash
+
+# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to undo the
+# renaming of CUB's namespace macro.
+
+sed -i -e 's|THRUST_CUB_NS_P|CUB_NS_P|g' `find . -type f`
+
diff --git a/internal/update_thrust_cub.sh b/internal/update_thrust_cub.sh
deleted file mode 100755
index eeaf9d7f8..000000000
--- a/internal/update_thrust_cub.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/sh
-
-# When a update version of CUB is fetched either from
-#   http://github.com/dumerrill/PrivateCUB (currently in use)
-# or
-#   http://github.com/NVLabs/cub 
-# Run this script from
-#   //sw/gpgpu/thrust/thrust/system/cuda/detail/cub
-# using the following command, only once
-#  find . -type f -exec //sw/gpgpu/thrust/internal/update_thrust_cub.sh '{}' \;
-
-# The purpose of this is to rename every instance of 
-#   CUB_NSP{EFIX|OSTFIX} -> THRUST_CUB_NS_P{EFIX|OSTFIX}
-# 
-
-echo $1
-cat $1|sed -e 's|CUB_NS_P|THRUST_CUB_NS_P|g' > /tmp/tmp.xxx
-mv /tmp/tmp.xxx $1
diff --git a/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh b/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh
index 634c67f5a..0833ed31b 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh
index f030ef788..1b1fd8a3e 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -293,7 +293,7 @@ struct AgentRadixSortDownsweep
         {
             ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
 
-            if (FULL_TILE || 
+            if (FULL_TILE ||
                 (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
             {
                 d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
@@ -332,6 +332,10 @@ struct AgentRadixSortDownsweep
         Int2Type<false>             is_full_tile,
         Int2Type<_RANK_ALGORITHM>   rank_algorithm)
     {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
+
         BlockLoadKeysT(temp_storage.load_keys).Load(
             d_keys_in + block_offset, keys, valid_items, oob_item);
 
@@ -365,6 +369,10 @@ struct AgentRadixSortDownsweep
         Int2Type<false>             is_full_tile,
         Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
     {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
+
         LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
     }
 
@@ -398,6 +406,10 @@ struct AgentRadixSortDownsweep
         Int2Type<false>             is_full_tile,
         Int2Type<_RANK_ALGORITHM>   rank_algorithm)
     {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
+
         BlockLoadValuesT(temp_storage.load_values).Load(
             d_values_in + block_offset, values, valid_items);
 
@@ -411,7 +423,7 @@ struct AgentRadixSortDownsweep
     __device__ __forceinline__ void LoadValues(
         ValueT                      (&values)[ITEMS_PER_THREAD],
         OffsetT                     block_offset,
-        volatile OffsetT                     valid_items,
+        OffsetT                     valid_items,
         Int2Type<true>              is_full_tile,
         Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
     {
@@ -425,10 +437,14 @@ struct AgentRadixSortDownsweep
     __device__ __forceinline__ void LoadValues(
         ValueT                      (&values)[ITEMS_PER_THREAD],
         OffsetT                     block_offset,
-        volatile OffsetT                     valid_items,
+        OffsetT                     valid_items,
         Int2Type<false>             is_full_tile,
         Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
     {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
+
         LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
     }
 
@@ -444,10 +460,10 @@ struct AgentRadixSortDownsweep
         OffsetT         valid_items,
         Int2Type<false> /*is_keys_only*/)
     {
-        CTA_SYNC();
-
         ValueT values[ITEMS_PER_THREAD];
 
+        CTA_SYNC();
+
         LoadValues(
             values,
             block_offset,
@@ -746,6 +762,7 @@ struct AgentRadixSortDownsweep
         else
         {
             // Process full tiles of tile_items
+            #pragma unroll 1
             while (block_offset + TILE_ITEMS <= block_end)
             {
                 ProcessTile<true>(block_offset);
diff --git a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh
index 541f923e2..efa69858d 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh b/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh
index c4085a777..df3f4a70f 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh b/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh
index b1692b8eb..d68201013 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -454,13 +454,13 @@ struct AgentReduceByKey
         // Perform exclusive tile scan
         OffsetValuePairT    block_aggregate;        // Inclusive block-wide scan aggregate
         OffsetT             num_segments_prefix;    // Number of segments prior to this tile
-        ValueOutputT        total_aggregate;        // The tile prefix folded with block_aggregate
+        OffsetValuePairT    total_aggregate;        // The tile prefix folded with block_aggregate
         if (tile_idx == 0)
         {
             // Scan first tile
             BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
             num_segments_prefix     = 0;
-            total_aggregate         = block_aggregate.value;
+            total_aggregate         = block_aggregate;
 
             // Update tile status if there are successor tiles
             if ((!IS_LAST_TILE) && (threadIdx.x == 0))
@@ -474,9 +474,7 @@ struct AgentReduceByKey
 
             block_aggregate         = prefix_op.GetBlockAggregate();
             num_segments_prefix     = prefix_op.GetExclusivePrefix().key;
-            total_aggregate         = reduction_op(
-                                        prefix_op.GetExclusivePrefix().value,
-                                        block_aggregate.value);
+            total_aggregate         = prefix_op.GetInclusivePrefix();
         }
 
         // Rezip scatter items and segment indices
@@ -506,7 +504,7 @@ struct AgentReduceByKey
             if (num_remaining == TILE_ITEMS)
             {
                 d_unique_out[num_segments]      = keys[ITEMS_PER_THREAD - 1];
-                d_aggregates_out[num_segments]  = total_aggregate;
+                d_aggregates_out[num_segments]  = total_aggregate.value;
                 num_segments++;
             }
 
diff --git a/thrust/system/cuda/detail/cub/agent/agent_rle.cuh b/thrust/system/cuda/detail/cub/agent/agent_rle.cuh
index 90ea81dbd..94f47eb5b 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_rle.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_rle.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -618,8 +618,8 @@ struct AgentRle
         OffsetT             num_items,          ///< Total number of global input items
         OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
         int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,       ///< Tile offset
-        ScanTileStateT       &tile_status)       ///< Global list of tile status
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT      &tile_status)       ///< Global list of tile status
     {
         if (tile_idx == 0)
         {
diff --git a/thrust/system/cuda/detail/cub/agent/agent_scan.cuh b/thrust/system/cuda/detail/cub/agent/agent_scan.cuh
index 512f1eafc..bd35b6932 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_scan.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_scan.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh b/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh
index b004beb33..dd5359b96 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh b/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh
index a8b89f848..327e66530 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh b/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh
index 9d3feb4b6..5a6c4c73c 100644
--- a/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh
+++ b/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -422,52 +422,8 @@ struct AgentSpmv
 
 #if (CUB_PTX_ARCH >= 520)
 
-/*
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.merge_items[tile_num_nonzeros].row_end_offset;
-        ValueT*     s_tile_nonzeros         = &temp_storage.merge_items[0].nonzero;
-
-        OffsetT col_indices[ITEMS_PER_THREAD];
-        ValueT mat_values[ITEMS_PER_THREAD];
-        int nonzero_indices[ITEMS_PER_THREAD];
-
-        // Gather the nonzeros for the merge tile into shared memory
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            nonzero_indices[ITEM]           = threadIdx.x + (ITEM * BLOCK_THREADS);
-
-            ValueIteratorT a                = wd_values + tile_start_coord.y + nonzero_indices[ITEM];
-            ColumnIndicesIteratorT ci       = wd_column_indices + tile_start_coord.y + nonzero_indices[ITEM];
-
-            col_indices[ITEM]               = (nonzero_indices[ITEM] < tile_num_nonzeros) ? *ci : 0;
-            mat_values[ITEM]                = (nonzero_indices[ITEM] < tile_num_nonzeros) ? *a : 0.0;
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            VectorValueIteratorT x = wd_vector_x + col_indices[ITEM];
-            mat_values[ITEM] *= *x;
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            ValueT *s = s_tile_nonzeros + nonzero_indices[ITEM];
-
-            *s = mat_values[ITEM];
-        }
-
-        CTA_SYNC();
-
-*/
-
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.merge_items[0].row_end_offset;
-        ValueT*     s_tile_nonzeros         = &temp_storage.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
 
         // Gather the nonzeros for the merge tile into shared memory
         #pragma unroll
@@ -640,217 +596,6 @@ struct AgentSpmv
     }
 
 
-
-
-
-
-
-    /**
-     * Consume a merge tile, specialized for indirect load of nonzeros
-     * /
-    template <typename IsDirectLoadT>
-    __device__ __forceinline__ KeyValuePairT ConsumeTile1(
-        int             tile_idx,
-        CoordinateT     tile_start_coord,
-        CoordinateT     tile_end_coord,
-        IsDirectLoadT   is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
-    {
-        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
-        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
-
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.merge_items[0].row_end_offset;
-
-        int warp_idx                        = threadIdx.x / WARP_THREADS;
-        int lane_idx                        = LaneId();
-
-        // Gather the row end-offsets for the merge tile into shared memory
-        #pragma unroll 1
-        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
-        {
-            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
-        }
-
-        CTA_SYNC();
-
-        // Search for warp start/end coords
-        if (lane_idx == 0)
-        {
-            MergePathSearch(
-                OffsetT(warp_idx * ITEMS_PER_WARP),                 // Diagonal
-                s_tile_row_end_offsets,                             // List A
-                CountingInputIterator<OffsetT>(tile_start_coord.y), // List B
-                tile_num_rows,
-                tile_num_nonzeros,
-                temp_storage.warp_coords[warp_idx]);
-
-            CoordinateT last = {tile_num_rows, tile_num_nonzeros};
-            temp_storage.warp_coords[WARPS] = last;
-        }
-
-        CTA_SYNC();
-
-        CoordinateT     warp_coord          = temp_storage.warp_coords[warp_idx];
-        CoordinateT     warp_end_coord      = temp_storage.warp_coords[warp_idx + 1];
-        OffsetT         warp_nonzero_idx    = tile_start_coord.y + warp_coord.y;
-
-        // Consume whole rows
-        #pragma unroll 1
-        for (; warp_coord.x < warp_end_coord.x; ++warp_coord.x)
-        {
-            ValueT  row_total       = 0.0;
-            OffsetT row_end_offset  = s_tile_row_end_offsets[warp_coord.x];
-
-            #pragma unroll 1
-            for (OffsetT nonzero_idx = warp_nonzero_idx + lane_idx;
-                nonzero_idx < row_end_offset;
-                nonzero_idx += WARP_THREADS)
-            {
-                OffsetT column_idx          = wd_column_indices[nonzero_idx];
-                ValueT  value               = wd_values[nonzero_idx];
-                ValueT  vector_value        = wd_vector_x[column_idx];
-                row_total                   += value * vector_value;
-            }
-
-            // Warp reduce
-            row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total);
-
-            // Output
-            if (lane_idx == 0)
-            {
-                spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total;
-            }
-
-            warp_nonzero_idx = row_end_offset;
-        }
-
-        // Consume partial portion of thread's last row
-        if (warp_nonzero_idx < tile_start_coord.y + warp_end_coord.y)
-        {
-            ValueT row_total = 0.0;
-            for (OffsetT nonzero_idx = warp_nonzero_idx + lane_idx;
-                nonzero_idx < tile_start_coord.y + warp_end_coord.y;
-                nonzero_idx += WARP_THREADS)
-            {
-
-                OffsetT column_idx          = wd_column_indices[nonzero_idx];
-                ValueT  value               = wd_values[nonzero_idx];
-                ValueT  vector_value        = wd_vector_x[column_idx];
-                row_total                   += value * vector_value;
-            }
-
-            // Warp reduce
-            row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total);
-
-            // Output
-            if (lane_idx == 0)
-            {
-                spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total;
-            }
-        }
-
-        // Return the tile's running carry-out
-        KeyValuePairT tile_carry(tile_num_rows, 0.0);
-        return tile_carry;
-    }
-*/
-
-
-
-
-
-
-
-    /**
-     * Consume a merge tile, specialized for indirect load of nonzeros
-     * /
-    __device__ __forceinline__ KeyValuePairT ConsumeTile2(
-        int             tile_idx,
-        CoordinateT     tile_start_coord,
-        CoordinateT     tile_end_coord,
-        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
-    {
-        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
-        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
-
-        ValueT*     s_tile_nonzeros         = &temp_storage.merge_items[0].nonzero;
-
-        ValueT      nonzeros[ITEMS_PER_THREAD];
-
-        // Gather the nonzeros for the merge tile into shared memory
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int     nonzero_idx         = threadIdx.x + (ITEM * BLOCK_THREADS);
-            nonzero_idx                 = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
-
-            OffsetT column_idx          = wd_column_indices[tile_start_coord.y + nonzero_idx];
-            ValueT  value               = wd_values[tile_start_coord.y + nonzero_idx];
-
-            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
-#if (CUB_PTX_ARCH >= 350)
-            vector_value                = wd_vector_x[column_idx];
-#endif
-
-            nonzeros[ITEM]              = value * vector_value;
-        }
-
-        // Exchange striped->blocked
-        BlockExchangeT(temp_storage.exchange).StripedToBlocked(nonzeros);
-
-        CTA_SYNC();
-
-        // Compute an inclusive prefix sum
-        BlockPrefixSumT(temp_storage.prefix_sum).InclusiveSum(nonzeros, nonzeros);
-
-        CTA_SYNC();
-
-        if (threadIdx.x == 0)
-            s_tile_nonzeros[0] = 0.0;
-
-        // Scatter back to smem
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM + 1;
-            s_tile_nonzeros[item_idx] = nonzeros[ITEM];
-        }
-
-        CTA_SYNC();
-
-        // Gather the row end-offsets for the merge tile into shared memory
-        #pragma unroll 1
-        for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
-        {
-            OffsetT start = CUB_MAX(wd_row_end_offsets[tile_start_coord.x + item - 1], tile_start_coord.y);
-            OffsetT end = wd_row_end_offsets[tile_start_coord.x + item];
-
-            start -= tile_start_coord.y;
-            end -= tile_start_coord.y;
-
-            ValueT row_partial = s_tile_nonzeros[end] - s_tile_nonzeros[start];
-
-            spmv_params.d_vector_y[tile_start_coord.x + item] = row_partial;
-        }
-
-        // Get the tile's carry-out
-        KeyValuePairT tile_carry;
-        if (threadIdx.x == 0)
-        {
-            tile_carry.key = tile_num_rows;
-
-            OffsetT start = CUB_MAX(wd_row_end_offsets[tile_end_coord.x - 1], tile_start_coord.y);
-            start -= tile_start_coord.y;
-            OffsetT end = tile_num_nonzeros;
-
-            tile_carry.value = s_tile_nonzeros[end] - s_tile_nonzeros[start];
-        }
-
-        // Return the tile's running carry-out
-        return tile_carry;
-    }
-*/
-
-
     /**
      * Consume input tile
      */
diff --git a/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh b/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh
index 5503c8cf0..438c643b4 100644
--- a/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh
+++ b/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -320,7 +320,7 @@ struct ScanTileState<T, false>
         cudaError_t error = cudaSuccess;
         do
         {
-            void*   allocations[3] = { NULL, NULL, NULL };
+            void*   allocations[3];
             size_t  allocation_sizes[3];
 
             allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);           // bytes needed for tile status descriptors
diff --git a/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh b/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh
index 5f212dce9..dae1f3018 100644
--- a/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh b/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh
index 17ef2ab37..f43ee39ee 100644
--- a/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/block_exchange.cuh b/thrust/system/cuda/detail/cub/block/block_exchange.cuh
index a8e386e04..7cc8c5abb 100644
--- a/thrust/system/cuda/detail/cub/block/block_exchange.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_exchange.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/block_histogram.cuh b/thrust/system/cuda/detail/cub/block/block_histogram.cuh
index 4a5233b91..f97f89ea6 100644
--- a/thrust/system/cuda/detail/cub/block/block_histogram.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_histogram.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/block_load.cuh b/thrust/system/cuda/detail/cub/block/block_load.cuh
index 5d97b6598..6f7671b4b 100644
--- a/thrust/system/cuda/detail/cub/block/block_load.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_load.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -809,10 +809,7 @@ private:
 
         /// Shared memory storage layout type
         struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
+        {};
 
         /// Alias wrapper allowing storage to be unioned
         struct TempStorage : Uninitialized<_TempStorage> {};
@@ -849,10 +846,7 @@ private:
             InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items)                    ///< [in] Number of valid items to load
         {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
             BlockExchange(temp_storage).StripedToBlocked(items, items);
         }
 
@@ -864,10 +858,7 @@ private:
             int             valid_items,                    ///< [in] Number of valid items to load
             DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
         {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
             BlockExchange(temp_storage).StripedToBlocked(items, items);
         }
 
@@ -893,10 +884,7 @@ private:
 
         /// Shared memory storage layout type
         struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
+        {};
 
         /// Alias wrapper allowing storage to be unioned
         struct TempStorage : Uninitialized<_TempStorage> {};
@@ -933,10 +921,7 @@ private:
             InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items)                    ///< [in] Number of valid items to load
         {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
             BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
         }
 
@@ -949,10 +934,7 @@ private:
             int             valid_items,                    ///< [in] Number of valid items to load
             DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
         {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
             BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
         }
     };
@@ -977,10 +959,7 @@ private:
 
         /// Shared memory storage layout type
         struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
+        {};
 
         /// Alias wrapper allowing storage to be unioned
         struct TempStorage : Uninitialized<_TempStorage> {};
@@ -1017,10 +996,7 @@ private:
             InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items)                    ///< [in] Number of valid items to load
         {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
             BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
         }
 
@@ -1033,10 +1009,7 @@ private:
             int             valid_items,                    ///< [in] Number of valid items to load
             DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
         {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
             BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
         }
     };
diff --git a/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh b/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh
index 743c10103..cfd0652ec 100644
--- a/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -140,7 +140,7 @@ public:
     enum
     {
         /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS),
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
     };
 
 private:
@@ -495,7 +495,7 @@ public:
     enum
     {
         /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS),
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
     };
 
 private:
@@ -589,7 +589,6 @@ public:
         // Each warp will strip-mine its section of input, one strip at a time
 
         volatile DigitCounterT  *digit_counters[KEYS_PER_THREAD];
-        uint32_t                lane_id         = LaneId();
         uint32_t                warp_id         = linear_tid >> LOG_WARP_THREADS;
         uint32_t                lane_mask_lt    = LaneMaskLt();
 
diff --git a/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh b/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh
index 27d61cb70..8a54b3fb9 100644
--- a/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -71,8 +71,9 @@ namespace cub {
  *   given input sequence of keys and a set of rules specifying a total ordering
  *   of the symbolic alphabet, the radix sorting method produces a lexicographic
  *   ordering of those keys.
- * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- *   <tt>unsigned char</tt>, \p int, \p double, etc.  Within each key, the implementation treats fixed-length
+ * - BlockRadixSort can sort all of the built-in C++ numeric primitive types
+ *   (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ *   half-precision floating-point type. Within each key, the implementation treats fixed-length
  *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
  *   method can only be applied to unsigned integral types, BlockRadixSort
  *   is able to sort signed and floating-point types via simple bit-wise transformations
diff --git a/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh b/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh
index c04af877a..9cf4ffa97 100644
--- a/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/block_reduce.cuh b/thrust/system/cuda/detail/cub/block/block_reduce.cuh
index f44113ed2..12a79ecea 100644
--- a/thrust/system/cuda/detail/cub/block/block_reduce.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_reduce.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/block_scan.cuh b/thrust/system/cuda/detail/cub/block/block_scan.cuh
index 80f0affe7..c553cfbe4 100644
--- a/thrust/system/cuda/detail/cub/block/block_scan.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_scan.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/block_shuffle.cuh b/thrust/system/cuda/detail/cub/block/block_shuffle.cuh
index b357e66f4..eb49fb6d4 100644
--- a/thrust/system/cuda/detail/cub/block/block_shuffle.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_shuffle.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/block_store.cuh b/thrust/system/cuda/detail/cub/block/block_store.cuh
index 6b5e1ae4a..c79c94f5b 100644
--- a/thrust/system/cuda/detail/cub/block/block_store.cuh
+++ b/thrust/system/cuda/detail/cub/block/block_store.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh
index 8ae7b46a5..c971f000a 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh
index 5955a3a4c..cdbbefd40 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh
index c8eb14718..612a5acf7 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh
index 29f7f6182..012c71d4e 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh
index edd501aad..2e8be1c3d 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh
index 0560235bb..0d49d0693 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh
index e7dcc6e1f..6f582a8e4 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh
index d6e61f059..2be0e749c 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh
index 0d13d3ce0..15a9cf54b 100644
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh
+++ b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/cub.cuh b/thrust/system/cuda/detail/cub/cub.cuh
index b1c8e3200..3ece0f658 100644
--- a/thrust/system/cuda/detail/cub/cub.cuh
+++ b/thrust/system/cuda/detail/cub/cub.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/device_histogram.cuh b/thrust/system/cuda/detail/cub/device/device_histogram.cuh
index e54fdd0b7..259bcad32 100644
--- a/thrust/system/cuda/detail/cub/device/device_histogram.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_histogram.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/device_partition.cuh b/thrust/system/cuda/detail/cub/device/device_partition.cuh
index 3ffcc9b81..178cfe938 100644
--- a/thrust/system/cuda/detail/cub/device/device_partition.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_partition.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh
index c767c4035..aead91103 100644
--- a/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -62,8 +62,9 @@ namespace cub {
  * ordering of those keys.
  *
  * \par
- * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- * <tt>unsigned char</tt>, \p int, \p double, etc.  Although the direct radix sorting
+ * DeviceRadixSort can sort all of the built-in C++ numeric primitive types
+ * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ * half-precision floating-point type.  Although the direct radix sorting
  * method can only be applied to unsigned integral types, DeviceRadixSort
  * is able to sort signed and floating-point types via simple bit-wise transformations
  * that ensure lexicographic key ordering.
diff --git a/thrust/system/cuda/detail/cub/device/device_reduce.cuh b/thrust/system/cuda/detail/cub/device/device_reduce.cuh
index 645e19988..43b91f799 100644
--- a/thrust/system/cuda/detail/cub/device/device_reduce.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_reduce.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh b/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh
index 7cdb1c3fa..236926c71 100644
--- a/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/device_scan.cuh b/thrust/system/cuda/detail/cub/device/device_scan.cuh
index 0742bdb4a..91827f230 100644
--- a/thrust/system/cuda/detail/cub/device/device_scan.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_scan.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh
index 624e64793..dc019331e 100644
--- a/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -62,8 +62,9 @@ namespace cub {
  * ordering of those keys.
  *
  * \par
- * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- * <tt>unsigned char</tt>, \p int, \p double, etc.  Although the direct radix sorting
+ * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types
+ * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ * half-precision floating-point type.  Although the direct radix sorting
  * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
  * is able to sort signed and floating-point types via simple bit-wise transformations
  * that ensure lexicographic key ordering.
diff --git a/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh b/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh
index c38d9f1c8..5626e0a00 100644
--- a/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/device_select.cuh b/thrust/system/cuda/detail/cub/device/device_select.cuh
index 909a37e22..3dc9d6ac3 100644
--- a/thrust/system/cuda/detail/cub/device/device_select.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_select.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/device_spmv.cuh b/thrust/system/cuda/detail/cub/device/device_spmv.cuh
index 60e7aa6ee..611d75d3a 100644
--- a/thrust/system/cuda/detail/cub/device/device_spmv.cuh
+++ b/thrust/system/cuda/detail/cub/device/device_spmv.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh
index f864a71ef..4bf7d6f85 100644
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh
+++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh
index 6c9a87f47..baf7f422c 100644
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -104,7 +104,7 @@ __global__ void DeviceRadixSortUpsweepKernel(
     CTA_SYNC();
 
     // Write out digit counts (striped)
-    upsweep.ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
+    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
 }
 
 
@@ -279,6 +279,10 @@ __global__ void DeviceRadixSortSingleTileKernel(
     // Load values
     if (!KEYS_ONLY)
     {
+        // Register pressure work-around: moving num_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        num_items = ShuffleIndex(num_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
+
         BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);
 
         CTA_SYNC();
@@ -496,64 +500,24 @@ struct DeviceRadixSortPolicy
     {
         // Whether this is a keys-only (or key-value) sort
         KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
-
-        // Relative size of KeyT type to a 4-byte word
-        SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
     };
 
+    // Dominant-sized key/value type
+    typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT;
+
     //------------------------------------------------------------------------------
     // Architecture-specific tuning policies
     //------------------------------------------------------------------------------
 
-    /// SM13
-    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 5,
-            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
-        };
-
-        // Keys-only upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyKeys;
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyKeys;
-
-        // Key-value pairs upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;
-
-        // Upsweep policies
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
-
-        // Scan policy
-        typedef AgentScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
-
-        // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;
-
-        // Key-value pairs downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;
-
-        // Downsweep policies
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
-
-        // Single-tile policy
-        typedef DownsweepPolicy SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-    };
-
     /// SM20
-    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy200>
     {
         enum {
             PRIMARY_RADIX_BITS      = 5,
             ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+
+            // Relative size of KeyT type to a 4-byte word
+            SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
         };
 
         // Keys-only upsweep policies
@@ -597,6 +561,9 @@ struct DeviceRadixSortPolicy
         enum {
             PRIMARY_RADIX_BITS      = 5,
             ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+
+            // Relative size of KeyT type to a 4-byte word
+            SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
         };
 
         // Keys-only upsweep policies
@@ -639,19 +606,19 @@ struct DeviceRadixSortPolicy
     struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
     {
         enum {
-            PRIMARY_RADIX_BITS      = 6,    // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 6 : 5,    // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
         };
 
         // Scan policy
         typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
 
         // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128,   CUB_MAX(1, 9 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <64,   CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(128, 9, DominantT), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(64, 18, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;
 
         // Key-value pairs downsweep policies
         typedef DownsweepPolicyKeys DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <128,  CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(128, 15, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;
 
         // Downsweep policies
         typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
@@ -676,28 +643,28 @@ struct DeviceRadixSortPolicy
     struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
     {
         enum {
-            PRIMARY_RADIX_BITS      = 7,    // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
-            SINGLE_TILE_RADIX_BITS  = 6,
-            SEGMENTED_RADIX_BITS    = 6,    // 3.1B 32b segmented keys/s (TitanX)
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.1B 32b segmented keys/s (TitanX)
         };
 
         // ScanPolicy
         typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
 
         // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <160, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(160, 39, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
 
         // Upsweep policies
         typedef DownsweepPolicy UpsweepPolicy;
         typedef AltDownsweepPolicy AltUpsweepPolicy;
 
         // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
 
         // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 31 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>   SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>       AltSegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 31, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>   SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 11, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>       AltSegmentedPolicy;
     };
 
 
@@ -705,28 +672,28 @@ struct DeviceRadixSortPolicy
     struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
     {
         enum {
-            PRIMARY_RADIX_BITS      = 7,    // 6.9B 32b keys/s (Quadro P100)
-            SINGLE_TILE_RADIX_BITS  = 6,
-            SEGMENTED_RADIX_BITS    = 6,    // 5.9B 32b segmented keys/s (Quadro P100)
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 6.9B 32b keys/s (Quadro P100)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 5.9B 32b segmented keys/s (Quadro P100)
         };
 
         // ScanPolicy
         typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
 
         // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 25 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
 
         // Upsweep policies
         typedef DownsweepPolicy UpsweepPolicy;
         typedef AltDownsweepPolicy AltUpsweepPolicy;
 
         // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
 
         // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
 
     };
 
@@ -735,28 +702,28 @@ struct DeviceRadixSortPolicy
     struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
     {
         enum {
-            PRIMARY_RADIX_BITS      = 7,    // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
-            SINGLE_TILE_RADIX_BITS  = 6,
-            SEGMENTED_RADIX_BITS    = 6,    // 3.3B 32b segmented keys/s (1080)
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.3B 32b segmented keys/s (1080)
         };
 
         // ScanPolicy
         typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
 
         // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 31 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT,       LOAD_DEFAULT,       RADIX_RANK_MATCH,   BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 35 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE,    LOAD_DEFAULT,   RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 31, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 35, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
 
         // Upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS>        UpsweepPolicy;
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS - 1>    AltUpsweepPolicy;
+        typedef AgentRadixSortUpsweepPolicy <CUB_SCALED_GRANULARITIES(128, 16, DominantT), LOAD_LDG, PRIMARY_RADIX_BITS>        UpsweepPolicy;
+        typedef AgentRadixSortUpsweepPolicy <CUB_SCALED_GRANULARITIES(128, 16, DominantT), LOAD_LDG, PRIMARY_RADIX_BITS - 1>    AltUpsweepPolicy;
 
         // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
 
         // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
     };
 
 
@@ -772,15 +739,15 @@ struct DeviceRadixSortPolicy
         typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
 
         // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS>       AltDownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS>       AltDownsweepPolicy;
 
         // Upsweep policies
         typedef DownsweepPolicy UpsweepPolicy;
         typedef AltDownsweepPolicy AltUpsweepPolicy;
 
         // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;
 
         // Segmented policies
         typedef DownsweepPolicy     SegmentedPolicy;
@@ -792,28 +759,28 @@ struct DeviceRadixSortPolicy
     struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
     {
         enum {
-            PRIMARY_RADIX_BITS      = 6,    // 7.62B 32b keys/s (GV100)
-            SINGLE_TILE_RADIX_BITS  = 6,
-            SEGMENTED_RADIX_BITS    = 6,    // 8.7B 32b segmented keys/s (GV100)
+            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 7.62B 32b keys/s (GV100)
+            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
+            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 8.7B 32b segmented keys/s (GV100)
         };
 
         // ScanPolicy
         typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
 
         // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 47 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 29 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
 
         // Upsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 47 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  UpsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 29 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>  AltUpsweepPolicy;
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
 
         // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
 
         // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
     };
 
 
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh
index dfc390c5a..44b1233a4 100644
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh
+++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -248,10 +248,10 @@ struct DeviceReducePolicy
     {
         // ReducePolicy
         typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(128, 8, OutputT), ///< Threads per block, items per thread
-                2,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                       ///< Cache load modifier
+                CUB_SCALED_GRANULARITIES(128, 8, OutputT), ///< Threads per block, items per thread
+                2,                                         ///< Number of items per vectorized load
+                BLOCK_REDUCE_RAKING,                       ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                              ///< Cache load modifier
             ReducePolicy;
 
         // SingleTilePolicy
@@ -267,10 +267,10 @@ struct DeviceReducePolicy
     {
         // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items)
         typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(128, 8, OutputT),     ///< Threads per block, items per thread
-                4,                                      ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                    ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                           ///< Cache load modifier
+                CUB_SCALED_GRANULARITIES(128, 8, OutputT), ///< Threads per block, items per thread
+                4,                                         ///< Number of items per vectorized load
+                BLOCK_REDUCE_RAKING,                       ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                              ///< Cache load modifier
             ReducePolicy;
 
         // SingleTilePolicy
@@ -286,10 +286,10 @@ struct DeviceReducePolicy
     {
         // ReducePolicy (GTX670: 154.0 @ 48M 4B items)
         typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(256, 20, OutputT),    ///< Threads per block, items per thread
-                2,                                      ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                           ///< Cache load modifier
+                CUB_SCALED_GRANULARITIES(256, 20, OutputT), ///< Threads per block, items per thread
+                2,                                          ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
+                LOAD_DEFAULT>                               ///< Cache load modifier
             ReducePolicy;
 
         // SingleTilePolicy
@@ -305,10 +305,10 @@ struct DeviceReducePolicy
     {
         // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)
         typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(256, 20, OutputT),    ///< Threads per block, items per thread
-                4,                                      ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
-                LOAD_LDG>                               ///< Cache load modifier
+                CUB_SCALED_GRANULARITIES(256, 20, OutputT), ///< Threads per block, items per thread
+                4,                                          ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
+                LOAD_LDG>                                   ///< Cache load modifier
             ReducePolicy;
 
         // SingleTilePolicy
@@ -323,10 +323,10 @@ struct DeviceReducePolicy
     {
         // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
         typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(256, 16, OutputT),    ///< Threads per block, items per thread
-                4,                                      ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
-                LOAD_LDG>                               ///< Cache load modifier
+                CUB_SCALED_GRANULARITIES(256, 16, OutputT), ///< Threads per block, items per thread
+                4,                                          ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
+                LOAD_LDG>                                   ///< Cache load modifier
             ReducePolicy;
 
         // SingleTilePolicy
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 501ae0da1..38bee414e 100644
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh
index 704968dd9..0d244a8a6 100644
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh
+++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh
index f1522aaf9..782e686d5 100644
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh
+++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -174,7 +174,7 @@ struct DispatchScan
     struct Policy600
     {
         typedef AgentScanPolicy<
-            CUB_NOMINAL_CONFIG(128, 15, OutputT),      ///< Threads per block, items per thread
+            CUB_SCALED_GRANULARITIES(128, 15, OutputT),      ///< Threads per block, items per thread
                 BLOCK_LOAD_TRANSPOSE,
                 LOAD_DEFAULT,
                 BLOCK_STORE_TRANSPOSE,
@@ -188,7 +188,7 @@ struct DispatchScan
     {
         // Titan X: 32.47B items/s @ 48M 32-bit T
         typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
+                CUB_SCALED_GRANULARITIES(128, 12, OutputT),      ///< Threads per block, items per thread
                 BLOCK_LOAD_DIRECT,
                 LOAD_LDG,
                 BLOCK_STORE_WARP_TRANSPOSE,
@@ -202,7 +202,7 @@ struct DispatchScan
     {
         // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
         typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
+                CUB_SCALED_GRANULARITIES(128, 12, OutputT),      ///< Threads per block, items per thread
                 BLOCK_LOAD_DIRECT,
                 LOAD_LDG,
                 BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
@@ -214,7 +214,7 @@ struct DispatchScan
     struct Policy300
     {
         typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(256, 9, OutputT),      ///< Threads per block, items per thread
+                CUB_SCALED_GRANULARITIES(256, 9, OutputT),      ///< Threads per block, items per thread
                 BLOCK_LOAD_WARP_TRANSPOSE,
                 LOAD_DEFAULT,
                 BLOCK_STORE_WARP_TRANSPOSE,
@@ -227,7 +227,7 @@ struct DispatchScan
     {
         // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
         typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
+                CUB_SCALED_GRANULARITIES(128, 12, OutputT),      ///< Threads per block, items per thread
                 BLOCK_LOAD_WARP_TRANSPOSE,
                 LOAD_DEFAULT,
                 BLOCK_STORE_WARP_TRANSPOSE,
@@ -239,7 +239,7 @@ struct DispatchScan
     struct Policy130
     {
         typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(96, 21, OutputT),      ///< Threads per block, items per thread
+                CUB_SCALED_GRANULARITIES(96, 21, OutputT),      ///< Threads per block, items per thread
                 BLOCK_LOAD_WARP_TRANSPOSE,
                 LOAD_DEFAULT,
                 BLOCK_STORE_WARP_TRANSPOSE,
@@ -251,7 +251,7 @@ struct DispatchScan
     struct Policy100
     {
         typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(64, 9, OutputT),      ///< Threads per block, items per thread
+                CUB_SCALED_GRANULARITIES(64, 9, OutputT),      ///< Threads per block, items per thread
                 BLOCK_LOAD_WARP_TRANSPOSE,
                 LOAD_DEFAULT,
                 BLOCK_STORE_WARP_TRANSPOSE,
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh
index 2b33879ec..1b3aa8dad 100644
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh
+++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh
index 54c2c8cad..a0bf515c1 100644
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -415,12 +415,41 @@ struct DispatchSpmv
     };
 
 
+    /// SM60
+    struct Policy600
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 64 : 128,
+                (sizeof(ValueT) > 4) ? 5 : 7,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+
 
     //---------------------------------------------------------------------
     // Tuning policies of current PTX compiler pass
     //---------------------------------------------------------------------
 
-#if (CUB_PTX_ARCH >= 500)
+#if (CUB_PTX_ARCH >= 600)
+    typedef Policy600 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 500)
     typedef Policy500 PtxPolicy;
 
 #elif (CUB_PTX_ARCH >= 370)
@@ -468,7 +497,12 @@ struct DispatchSpmv
     #else
 
         // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 500)
+        if (ptx_version >= 600)
+        {
+            spmv_config.template            Init<typename Policy600::SpmvPolicyT>();
+            segment_fixup_config.template   Init<typename Policy600::SegmentFixupPolicyT>();
+        }
+        else if (ptx_version >= 500)
         {
             spmv_config.template            Init<typename Policy500::SpmvPolicyT>();
             segment_fixup_config.template   Init<typename Policy500::SegmentFixupPolicyT>();
@@ -786,56 +820,6 @@ struct DispatchSpmv
                 DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
                 spmv_config, segment_fixup_config))) break;
 
-/*
-            // Dispatch
-            if (spmv_params.beta == 0.0)
-            {
-                if (spmv_params.alpha == 1.0)
-                {
-                    // Dispatch y = A*x
-                    if (CubDebug(error = Dispatch(
-                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
-                        DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
-                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
-                        DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
-                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
-                        spmv_config, segment_fixup_config))) break;
-                }
-                else
-                {
-                    // Dispatch y = alpha*A*x
-                    if (CubDebug(error = Dispatch(
-                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
-                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
-                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, false>,
-                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
-                        spmv_config, segment_fixup_config))) break;
-                }
-            }
-            else
-            {
-                if (spmv_params.alpha == 1.0)
-                {
-                    // Dispatch y = A*x + beta*y
-                    if (CubDebug(error = Dispatch(
-                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
-                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
-                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, false, true>,
-                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
-                        spmv_config, segment_fixup_config))) break;
-                }
-                else
-                {
-                    // Dispatch y = alpha*A*x + beta*y
-                    if (CubDebug(error = Dispatch(
-                        d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
-                        DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
-                        DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, true>,
-                        DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
-                        spmv_config, segment_fixup_config))) break;
-                }
-            }
-*/
         }
         while (0);
 
diff --git a/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh b/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh
index 8d1555269..5b12c66ed 100644
--- a/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh
+++ b/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh b/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh
index f1b1fe7e3..59fe5c909 100644
--- a/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh
+++ b/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh b/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh
index 14af378ee..6d1ab5846 100644
--- a/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh
+++ b/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/grid/grid_queue.cuh b/thrust/system/cuda/detail/cub/grid/grid_queue.cuh
index e9d81a01b..3c5330e4a 100644
--- a/thrust/system/cuda/detail/cub/grid/grid_queue.cuh
+++ b/thrust/system/cuda/detail/cub/grid/grid_queue.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/host/mutex.cuh b/thrust/system/cuda/detail/cub/host/mutex.cuh
index 8fe3e9287..30d64b7d4 100644
--- a/thrust/system/cuda/detail/cub/host/mutex.cuh
+++ b/thrust/system/cuda/detail/cub/host/mutex.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh
index d0a2678b8..e527202e4 100644
--- a/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh
index 484da0186..012a32180 100644
--- a/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh
index 1822be7e1..9038fed64 100644
--- a/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh
index 13fc75147..e2582db35 100644
--- a/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh
index 93a7c644f..69a736302 100644
--- a/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh
index 3a40e949b..497b2893a 100644
--- a/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
index 74ba6f926..7067ae001 100644
--- a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh
index 5a6f556fd..73904b787 100644
--- a/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh
index e85e899cb..5ab407b0c 100644
--- a/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh
+++ b/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/thread/thread_load.cuh b/thrust/system/cuda/detail/cub/thread/thread_load.cuh
index 3342759f7..888fa8ea8 100644
--- a/thrust/system/cuda/detail/cub/thread/thread_load.cuh
+++ b/thrust/system/cuda/detail/cub/thread/thread_load.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/thread/thread_operators.cuh b/thrust/system/cuda/detail/cub/thread/thread_operators.cuh
index d1f7cb6db..5bfa790e2 100644
--- a/thrust/system/cuda/detail/cub/thread/thread_operators.cuh
+++ b/thrust/system/cuda/detail/cub/thread/thread_operators.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh b/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh
index 8cc9cf4f1..7e525ea0c 100644
--- a/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh
+++ b/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/thread/thread_scan.cuh b/thrust/system/cuda/detail/cub/thread/thread_scan.cuh
index 44a318c83..94f3016f4 100644
--- a/thrust/system/cuda/detail/cub/thread/thread_scan.cuh
+++ b/thrust/system/cuda/detail/cub/thread/thread_scan.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/thread/thread_search.cuh b/thrust/system/cuda/detail/cub/thread/thread_search.cuh
index 70cf6bdfe..3fcdd628f 100644
--- a/thrust/system/cuda/detail/cub/thread/thread_search.cuh
+++ b/thrust/system/cuda/detail/cub/thread/thread_search.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/thread/thread_store.cuh b/thrust/system/cuda/detail/cub/thread/thread_store.cuh
index 05a9e1676..e79122c85 100644
--- a/thrust/system/cuda/detail/cub/thread/thread_store.cuh
+++ b/thrust/system/cuda/detail/cub/thread/thread_store.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/util_allocator.cuh b/thrust/system/cuda/detail/cub/util_allocator.cuh
index cc44a4944..3ed80d3c5 100644
--- a/thrust/system/cuda/detail/cub/util_allocator.cuh
+++ b/thrust/system/cuda/detail/cub/util_allocator.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/util_arch.cuh b/thrust/system/cuda/detail/cub/util_arch.cuh
index e2b42b44b..e869b85b5 100644
--- a/thrust/system/cuda/detail/cub/util_arch.cuh
+++ b/thrust/system/cuda/detail/cub/util_arch.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -116,31 +116,31 @@ namespace cub {
 #endif
 
 
-/// Scale down the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data.  Minimum of two warps.
-#ifndef CUB_BLOCK_THREADS
-    #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                        \
+/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data.  Minimum of two warps.
+#ifndef CUB_SCALED_BLOCK_THREADS
+    #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                   \
         (CUB_MIN(                                                                           \
-            NOMINAL_4B_BLOCK_THREADS * 2,                                                   \
+            NOMINAL_4B_BLOCK_THREADS,                                                       \
             CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX(                                           \
-                (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4,            \
+                2,                                                                          \
                 (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T))))
 #endif
 
-/// Scale up/down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data.  Minimum 1 item per thread
-#ifndef CUB_ITEMS_PER_THREAD
-    #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)    \
-	    (CUB_MIN(                                                                                       \
-	        NOMINAL_4B_ITEMS_PER_THREAD * 2,                                                            \
-	        CUB_MAX(                                                                                    \
-	            1,                                                                                      \
-	            (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))))
+/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data.  Minimum 1 item per thread
+#ifndef CUB_SCALED_ITEMS_PER_THREAD
+    #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)     \
+        CUB_MAX(                                                                                                \
+            1,                                                                                                  \
+            (sizeof(T) < 4) ?                                                                                   \
+                ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 :  \
+                ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))
 #endif
 
 /// Define both nominal threads-per-block and items-per-thread
-#ifndef CUB_NOMINAL_CONFIG
-    #define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)    \
-        CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                                \
-        CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
+#ifndef CUB_SCALED_GRANULARITIES
+    #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)      \
+        CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                                   \
+        CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
 #endif
 
 
diff --git a/thrust/system/cuda/detail/cub/util_debug.cuh b/thrust/system/cuda/detail/cub/util_debug.cuh
index 37f92db26..5dcacbaf7 100644
--- a/thrust/system/cuda/detail/cub/util_debug.cuh
+++ b/thrust/system/cuda/detail/cub/util_debug.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/util_device.cuh b/thrust/system/cuda/detail/cub/util_device.cuh
index 1b771e694..ca55bd530 100644
--- a/thrust/system/cuda/detail/cub/util_device.cuh
+++ b/thrust/system/cuda/detail/cub/util_device.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/util_macro.cuh b/thrust/system/cuda/detail/cub/util_macro.cuh
index 0474feb53..14bd9b12b 100644
--- a/thrust/system/cuda/detail/cub/util_macro.cuh
+++ b/thrust/system/cuda/detail/cub/util_macro.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/util_namespace.cuh b/thrust/system/cuda/detail/cub/util_namespace.cuh
index ef24c5550..0c2bf29fe 100644
--- a/thrust/system/cuda/detail/cub/util_namespace.cuh
+++ b/thrust/system/cuda/detail/cub/util_namespace.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/util_ptx.cuh b/thrust/system/cuda/detail/cub/util_ptx.cuh
index 9a72b3de2..aff170333 100644
--- a/thrust/system/cuda/detail/cub/util_ptx.cuh
+++ b/thrust/system/cuda/detail/cub/util_ptx.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/util_type.cuh b/thrust/system/cuda/detail/cub/util_type.cuh
index cbebb3e47..bd3bebd36 100644
--- a/thrust/system/cuda/detail/cub/util_type.cuh
+++ b/thrust/system/cuda/detail/cub/util_type.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,10 +37,16 @@
 #include <limits>
 #include <cfloat>
 
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    #include <cuda_fp16.h>
+#endif
+
 #include "util_macro.cuh"
 #include "util_arch.cuh"
 #include "util_namespace.cuh"
 
+
+
 /// Optional outer namespace(s)
 THRUST_CUB_NS_PREFIX
 
@@ -889,10 +895,10 @@ private:
     template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
     template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
 */
-    template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
 
     template <typename BinaryOpT> static int Test(...);
 
@@ -1057,6 +1063,23 @@ struct FpLimits<double>
 };
 
 
+#if (__CUDACC_VER_MAJOR__ >= 9)
+template <>
+struct FpLimits<__half>
+{
+    static __host__ __device__ __forceinline__ __half Max() {
+        unsigned short max_word = 0x7BFF;
+        return reinterpret_cast<__half&>(max_word);
+    }
+
+    static __host__ __device__ __forceinline__ __half Lowest() {
+        unsigned short lowest_word = 0xFBFF;
+        return reinterpret_cast<__half&>(lowest_word);
+    }
+};
+#endif
+
+
 /**
  * Basic type traits (fp primitive specialization)
  */
@@ -1120,6 +1143,9 @@ template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTE
 
 template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
 template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
+#if (__CUDACC_VER_MAJOR__ >= 9)
+    template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, false, unsigned short, __half> {};
+#endif
 
 template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, false, typename UnitWord<bool>::VolatileWord, bool> {};
 
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh
index 4a719625f..c92765297 100644
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh
index bec27e4e8..4325ca0c8 100644
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh
+++ b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh
index ebff77335..d5f40161b 100644
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh
index aaa3d095c..5bafb3559 100644
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh
+++ b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh b/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh
index 907053de5..baef93594 100644
--- a/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh
+++ b/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/system/cuda/detail/cub/warp/warp_scan.cuh b/thrust/system/cuda/detail/cub/warp/warp_scan.cuh
index 8966a1e4b..aa7149586 100644
--- a/thrust/system/cuda/detail/cub/warp/warp_scan.cuh
+++ b/thrust/system/cuda/detail/cub/warp/warp_scan.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met: