From 4b61388e8fb2314220b028cdaf30a5bbee21b0e9 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Wed, 14 Feb 2018 08:47:38 -0800 Subject: [PATCH] CUB: Integrate CUB 1.7.5 into Thrust to pull in the (corrected) fix for small data type radix sorting performance regressions. Bug 1997368 Bug 200355591 git-commit b56409c060fe4c718066d19099fb12d8acdb2163 git-author Bryce Adelstein Lelbach aka wash VDVS: http://ausdvs.nvidia.com/Build_Results?virtualId=1000099285&which_page=current_build Jobs: 1997368-2006 200355591-2006 [git-p4: depot-paths = "//sw/gpgpu/thrust/": change = 23593281] --- internal/rename_cub_namespace.sh | 7 + internal/reverse_rename_cub_namespace.sh | 7 + internal/update_thrust_cub.sh | 18 -- .../cuda/detail/cub/agent/agent_histogram.cuh | 2 +- .../cub/agent/agent_radix_sort_downsweep.cuh | 29 +- .../cub/agent/agent_radix_sort_upsweep.cuh | 2 +- .../cuda/detail/cub/agent/agent_reduce.cuh | 2 +- .../detail/cub/agent/agent_reduce_by_key.cuh | 12 +- .../cuda/detail/cub/agent/agent_rle.cuh | 6 +- .../cuda/detail/cub/agent/agent_scan.cuh | 2 +- .../detail/cub/agent/agent_segment_fixup.cuh | 2 +- .../cuda/detail/cub/agent/agent_select_if.cuh | 2 +- .../cuda/detail/cub/agent/agent_spmv_orig.cuh | 261 +----------------- .../cub/agent/single_pass_scan_operators.cuh | 4 +- .../cub/block/block_adjacent_difference.cuh | 2 +- .../detail/cub/block/block_discontinuity.cuh | 2 +- .../cuda/detail/cub/block/block_exchange.cuh | 2 +- .../cuda/detail/cub/block/block_histogram.cuh | 2 +- .../cuda/detail/cub/block/block_load.cuh | 47 +--- .../detail/cub/block/block_radix_rank.cuh | 7 +- .../detail/cub/block/block_radix_sort.cuh | 7 +- .../detail/cub/block/block_raking_layout.cuh | 2 +- .../cuda/detail/cub/block/block_reduce.cuh | 2 +- .../cuda/detail/cub/block/block_scan.cuh | 2 +- .../cuda/detail/cub/block/block_shuffle.cuh | 2 +- .../cuda/detail/cub/block/block_store.cuh | 2 +- .../block_histogram_atomic.cuh | 2 +- .../specializations/block_histogram_sort.cuh | 2 +- .../specializations/block_reduce_raking.cuh | 2 +- .../block_reduce_raking_commutative_only.cuh | 2 +- .../block_reduce_warp_reductions.cuh | 2 +- .../specializations/block_scan_raking.cuh | 2 +- .../specializations/block_scan_warp_scans.cuh | 2 +- .../block_scan_warp_scans2.cuh | 2 +- .../block_scan_warp_scans3.cuh | 2 +- thrust/system/cuda/detail/cub/cub.cuh | 2 +- .../detail/cub/device/device_histogram.cuh | 2 +- .../detail/cub/device/device_partition.cuh | 2 +- .../detail/cub/device/device_radix_sort.cuh | 7 +- .../cuda/detail/cub/device/device_reduce.cuh | 2 +- .../cub/device/device_run_length_encode.cuh | 2 +- .../cuda/detail/cub/device/device_scan.cuh | 2 +- .../device/device_segmented_radix_sort.cuh | 7 +- .../cub/device/device_segmented_reduce.cuh | 2 +- .../cuda/detail/cub/device/device_select.cuh | 2 +- .../cuda/detail/cub/device/device_spmv.cuh | 2 +- .../device/dispatch/dispatch_histogram.cuh | 2 +- .../device/dispatch/dispatch_radix_sort.cuh | 151 ++++------ .../cub/device/dispatch/dispatch_reduce.cuh | 42 +-- .../dispatch/dispatch_reduce_by_key.cuh | 2 +- .../cub/device/dispatch/dispatch_rle.cuh | 2 +- .../cub/device/dispatch/dispatch_scan.cuh | 16 +- .../device/dispatch/dispatch_select_if.cuh | 2 +- .../device/dispatch/dispatch_spmv_orig.cuh | 90 +++--- .../cuda/detail/cub/grid/grid_barrier.cuh | 2 +- .../cuda/detail/cub/grid/grid_even_share.cuh | 2 +- .../cuda/detail/cub/grid/grid_mapping.cuh | 2 +- .../cuda/detail/cub/grid/grid_queue.cuh | 2 +- thrust/system/cuda/detail/cub/host/mutex.cuh | 2 +- .../cub/iterator/arg_index_input_iterator.cuh | 2 +- .../cache_modified_input_iterator.cuh | 2 +- .../cache_modified_output_iterator.cuh | 2 +- .../cub/iterator/constant_input_iterator.cuh | 2 +- .../cub/iterator/counting_input_iterator.cuh | 2 +- .../cub/iterator/discard_output_iterator.cuh | 2 +- .../cub/iterator/tex_obj_input_iterator.cuh | 2 +- .../cub/iterator/tex_ref_input_iterator.cuh | 2 +- .../cub/iterator/transform_input_iterator.cuh | 2 +- .../cuda/detail/cub/thread/thread_load.cuh | 2 +- .../detail/cub/thread/thread_operators.cuh | 2 +- .../cuda/detail/cub/thread/thread_reduce.cuh | 2 +- .../cuda/detail/cub/thread/thread_scan.cuh | 2 +- .../cuda/detail/cub/thread/thread_search.cuh | 2 +- .../cuda/detail/cub/thread/thread_store.cuh | 2 +- .../system/cuda/detail/cub/util_allocator.cuh | 2 +- thrust/system/cuda/detail/cub/util_arch.cuh | 36 +-- thrust/system/cuda/detail/cub/util_debug.cuh | 2 +- thrust/system/cuda/detail/cub/util_device.cuh | 2 +- thrust/system/cuda/detail/cub/util_macro.cuh | 2 +- .../system/cuda/detail/cub/util_namespace.cuh | 2 +- thrust/system/cuda/detail/cub/util_ptx.cuh | 2 +- thrust/system/cuda/detail/cub/util_type.cuh | 36 ++- .../warp/specializations/warp_reduce_shfl.cuh | 2 +- .../warp/specializations/warp_reduce_smem.cuh | 2 +- .../warp/specializations/warp_scan_shfl.cuh | 2 +- .../warp/specializations/warp_scan_smem.cuh | 2 +- .../cuda/detail/cub/warp/warp_reduce.cuh | 2 +- .../system/cuda/detail/cub/warp/warp_scan.cuh | 2 +- 88 files changed, 318 insertions(+), 610 deletions(-) create mode 100755 internal/rename_cub_namespace.sh create mode 100755 internal/reverse_rename_cub_namespace.sh delete mode 100755 internal/update_thrust_cub.sh diff --git a/internal/rename_cub_namespace.sh b/internal/rename_cub_namespace.sh new file mode 100755 index 000000000..7a539e5d6 --- /dev/null +++ b/internal/rename_cub_namespace.sh @@ -0,0 +1,7 @@ +#! /bin/bash + +# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to add a THRUST_ +# prefix to CUB's namespace macro. + +sed -i -e 's/CUB_NS_P/THRUST_CUB_NS_P/g' `find . -type f` + diff --git a/internal/reverse_rename_cub_namespace.sh b/internal/reverse_rename_cub_namespace.sh new file mode 100755 index 000000000..bc4858449 --- /dev/null +++ b/internal/reverse_rename_cub_namespace.sh @@ -0,0 +1,7 @@ +#! /bin/bash + +# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to undo the +# renaming of CUB's namespace macro. + +sed -i -e 's|THRUST_CUB_NS_P|CUB_NS_P|g' `find . -type f` + diff --git a/internal/update_thrust_cub.sh b/internal/update_thrust_cub.sh deleted file mode 100755 index eeaf9d7f8..000000000 --- a/internal/update_thrust_cub.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh - -# When a update version of CUB is fetched either from -# http://github.com/dumerrill/PrivateCUB (currently in use) -# or -# http://github.com/NVLabs/cub -# Run this script from -# //sw/gpgpu/thrust/thrust/system/cuda/detail/cub -# using the following command, only once -# find . -type f -exec //sw/gpgpu/thrust/internal/update_thrust_cub.sh '{}' \; - -# The purpose of this is to rename every instance of -# CUB_NSP{EFIX|OSTFIX} -> THRUST_CUB_NS_P{EFIX|OSTFIX} -# - -echo $1 -cat $1|sed -e 's|CUB_NS_P|THRUST_CUB_NS_P|g' > /tmp/tmp.xxx -mv /tmp/tmp.xxx $1 diff --git a/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh b/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh index 634c67f5a..0833ed31b 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh index f030ef788..1b1fd8a3e 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -293,7 +293,7 @@ struct AgentRadixSortDownsweep { ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; - if (FULL_TILE || + if (FULL_TILE || (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) { d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; @@ -332,6 +332,10 @@ struct AgentRadixSortDownsweep Int2Type is_full_tile, Int2Type<_RANK_ALGORITHM> rank_algorithm) { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff); + BlockLoadKeysT(temp_storage.load_keys).Load( d_keys_in + block_offset, keys, valid_items, oob_item); @@ -365,6 +369,10 @@ struct AgentRadixSortDownsweep Int2Type is_full_tile, Int2Type rank_algorithm) { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff); + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); } @@ -398,6 +406,10 @@ struct AgentRadixSortDownsweep Int2Type is_full_tile, Int2Type<_RANK_ALGORITHM> rank_algorithm) { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff); + BlockLoadValuesT(temp_storage.load_values).Load( d_values_in + block_offset, values, valid_items); @@ -411,7 +423,7 @@ struct AgentRadixSortDownsweep __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, - volatile OffsetT valid_items, + OffsetT valid_items, Int2Type is_full_tile, Int2Type rank_algorithm) { @@ -425,10 +437,14 @@ struct AgentRadixSortDownsweep __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, - volatile OffsetT valid_items, + OffsetT valid_items, Int2Type is_full_tile, Int2Type rank_algorithm) { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff); + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); } @@ -444,10 +460,10 @@ struct AgentRadixSortDownsweep OffsetT valid_items, Int2Type /*is_keys_only*/) { - CTA_SYNC(); - ValueT values[ITEMS_PER_THREAD]; + CTA_SYNC(); + LoadValues( values, block_offset, @@ -746,6 +762,7 @@ struct AgentRadixSortDownsweep else { // Process full tiles of tile_items + #pragma unroll 1 while (block_offset + TILE_ITEMS <= block_end) { ProcessTile(block_offset); diff --git a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh index 541f923e2..efa69858d 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh b/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh index c4085a777..df3f4a70f 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh b/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh index b1692b8eb..d68201013 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -454,13 +454,13 @@ struct AgentReduceByKey // Perform exclusive tile scan OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate OffsetT num_segments_prefix; // Number of segments prior to this tile - ValueOutputT total_aggregate; // The tile prefix folded with block_aggregate + OffsetValuePairT total_aggregate; // The tile prefix folded with block_aggregate if (tile_idx == 0) { // Scan first tile BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); num_segments_prefix = 0; - total_aggregate = block_aggregate.value; + total_aggregate = block_aggregate; // Update tile status if there are successor tiles if ((!IS_LAST_TILE) && (threadIdx.x == 0)) @@ -474,9 +474,7 @@ struct AgentReduceByKey block_aggregate = prefix_op.GetBlockAggregate(); num_segments_prefix = prefix_op.GetExclusivePrefix().key; - total_aggregate = reduction_op( - prefix_op.GetExclusivePrefix().value, - block_aggregate.value); + total_aggregate = prefix_op.GetInclusivePrefix(); } // Rezip scatter items and segment indices @@ -506,7 +504,7 @@ struct AgentReduceByKey if (num_remaining == TILE_ITEMS) { d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; - d_aggregates_out[num_segments] = total_aggregate; + d_aggregates_out[num_segments] = total_aggregate.value; num_segments++; } diff --git a/thrust/system/cuda/detail/cub/agent/agent_rle.cuh b/thrust/system/cuda/detail/cub/agent/agent_rle.cuh index 90ea81dbd..94f47eb5b 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_rle.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_rle.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -618,8 +618,8 @@ struct AgentRle OffsetT num_items, ///< Total number of global input items OffsetT num_remaining, ///< Number of global input items remaining (including this tile) int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT &tile_status) ///< Global list of tile status + OffsetT tile_offset, ///< Tile offset + ScanTileStateT &tile_status) ///< Global list of tile status { if (tile_idx == 0) { diff --git a/thrust/system/cuda/detail/cub/agent/agent_scan.cuh b/thrust/system/cuda/detail/cub/agent/agent_scan.cuh index 512f1eafc..bd35b6932 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_scan.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_scan.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh b/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh index b004beb33..dd5359b96 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh b/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh index a8b89f848..327e66530 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh b/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh index 9d3feb4b6..5a6c4c73c 100644 --- a/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh +++ b/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -422,52 +422,8 @@ struct AgentSpmv #if (CUB_PTX_ARCH >= 520) -/* - OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[tile_num_nonzeros].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.merge_items[0].nonzero; - - OffsetT col_indices[ITEMS_PER_THREAD]; - ValueT mat_values[ITEMS_PER_THREAD]; - int nonzero_indices[ITEMS_PER_THREAD]; - - // Gather the nonzeros for the merge tile into shared memory - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - nonzero_indices[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); - - ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_indices[ITEM]; - ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_indices[ITEM]; - - col_indices[ITEM] = (nonzero_indices[ITEM] < tile_num_nonzeros) ? *ci : 0; - mat_values[ITEM] = (nonzero_indices[ITEM] < tile_num_nonzeros) ? *a : 0.0; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - VectorValueIteratorT x = wd_vector_x + col_indices[ITEM]; - mat_values[ITEM] *= *x; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - ValueT *s = s_tile_nonzeros + nonzero_indices[ITEM]; - - *s = mat_values[ITEM]; - } - - CTA_SYNC(); - -*/ - - OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; // Gather the nonzeros for the merge tile into shared memory #pragma unroll @@ -640,217 +596,6 @@ struct AgentSpmv } - - - - - - /** - * Consume a merge tile, specialized for indirect load of nonzeros - * / - template - __device__ __forceinline__ KeyValuePairT ConsumeTile1( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - IsDirectLoadT is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - - OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[0].row_end_offset; - - int warp_idx = threadIdx.x / WARP_THREADS; - int lane_idx = LaneId(); - - // Gather the row end-offsets for the merge tile into shared memory - #pragma unroll 1 - for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) - { - s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; - } - - CTA_SYNC(); - - // Search for warp start/end coords - if (lane_idx == 0) - { - MergePathSearch( - OffsetT(warp_idx * ITEMS_PER_WARP), // Diagonal - s_tile_row_end_offsets, // List A - CountingInputIterator(tile_start_coord.y), // List B - tile_num_rows, - tile_num_nonzeros, - temp_storage.warp_coords[warp_idx]); - - CoordinateT last = {tile_num_rows, tile_num_nonzeros}; - temp_storage.warp_coords[WARPS] = last; - } - - CTA_SYNC(); - - CoordinateT warp_coord = temp_storage.warp_coords[warp_idx]; - CoordinateT warp_end_coord = temp_storage.warp_coords[warp_idx + 1]; - OffsetT warp_nonzero_idx = tile_start_coord.y + warp_coord.y; - - // Consume whole rows - #pragma unroll 1 - for (; warp_coord.x < warp_end_coord.x; ++warp_coord.x) - { - ValueT row_total = 0.0; - OffsetT row_end_offset = s_tile_row_end_offsets[warp_coord.x]; - - #pragma unroll 1 - for (OffsetT nonzero_idx = warp_nonzero_idx + lane_idx; - nonzero_idx < row_end_offset; - nonzero_idx += WARP_THREADS) - { - OffsetT column_idx = wd_column_indices[nonzero_idx]; - ValueT value = wd_values[nonzero_idx]; - ValueT vector_value = wd_vector_x[column_idx]; - row_total += value * vector_value; - } - - // Warp reduce - row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total); - - // Output - if (lane_idx == 0) - { - spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total; - } - - warp_nonzero_idx = row_end_offset; - } - - // Consume partial portion of thread's last row - if (warp_nonzero_idx < tile_start_coord.y + warp_end_coord.y) - { - ValueT row_total = 0.0; - for (OffsetT nonzero_idx = warp_nonzero_idx + lane_idx; - nonzero_idx < tile_start_coord.y + warp_end_coord.y; - nonzero_idx += WARP_THREADS) - { - - OffsetT column_idx = wd_column_indices[nonzero_idx]; - ValueT value = wd_values[nonzero_idx]; - ValueT vector_value = wd_vector_x[column_idx]; - row_total += value * vector_value; - } - - // Warp reduce - row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total); - - // Output - if (lane_idx == 0) - { - spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total; - } - } - - // Return the tile's running carry-out - KeyValuePairT tile_carry(tile_num_rows, 0.0); - return tile_carry; - } -*/ - - - - - - - - /** - * Consume a merge tile, specialized for indirect load of nonzeros - * / - __device__ __forceinline__ KeyValuePairT ConsumeTile2( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - - ValueT* s_tile_nonzeros = &temp_storage.merge_items[0].nonzero; - - ValueT nonzeros[ITEMS_PER_THREAD]; - - // Gather the nonzeros for the merge tile into shared memory - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); - - OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; - ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; - - ValueT vector_value = spmv_params.t_vector_x[column_idx]; -#if (CUB_PTX_ARCH >= 350) - vector_value = wd_vector_x[column_idx]; -#endif - - nonzeros[ITEM] = value * vector_value; - } - - // Exchange striped->blocked - BlockExchangeT(temp_storage.exchange).StripedToBlocked(nonzeros); - - CTA_SYNC(); - - // Compute an inclusive prefix sum - BlockPrefixSumT(temp_storage.prefix_sum).InclusiveSum(nonzeros, nonzeros); - - CTA_SYNC(); - - if (threadIdx.x == 0) - s_tile_nonzeros[0] = 0.0; - - // Scatter back to smem - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM + 1; - s_tile_nonzeros[item_idx] = nonzeros[ITEM]; - } - - CTA_SYNC(); - - // Gather the row end-offsets for the merge tile into shared memory - #pragma unroll 1 - for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) - { - OffsetT start = CUB_MAX(wd_row_end_offsets[tile_start_coord.x + item - 1], tile_start_coord.y); - OffsetT end = wd_row_end_offsets[tile_start_coord.x + item]; - - start -= tile_start_coord.y; - end -= tile_start_coord.y; - - ValueT row_partial = s_tile_nonzeros[end] - s_tile_nonzeros[start]; - - spmv_params.d_vector_y[tile_start_coord.x + item] = row_partial; - } - - // Get the tile's carry-out - KeyValuePairT tile_carry; - if (threadIdx.x == 0) - { - tile_carry.key = tile_num_rows; - - OffsetT start = CUB_MAX(wd_row_end_offsets[tile_end_coord.x - 1], tile_start_coord.y); - start -= tile_start_coord.y; - OffsetT end = tile_num_nonzeros; - - tile_carry.value = s_tile_nonzeros[end] - s_tile_nonzeros[start]; - } - - // Return the tile's running carry-out - return tile_carry; - } -*/ - - /** * Consume input tile */ diff --git a/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh b/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh index 5503c8cf0..438c643b4 100644 --- a/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh +++ b/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -320,7 +320,7 @@ struct ScanTileState cudaError_t error = cudaSuccess; do { - void* allocations[3] = { NULL, NULL, NULL }; + void* allocations[3]; size_t allocation_sizes[3]; allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors diff --git a/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh b/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh index 5f212dce9..dae1f3018 100644 --- a/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh +++ b/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh b/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh index 17ef2ab37..f43ee39ee 100644 --- a/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh +++ b/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/block_exchange.cuh b/thrust/system/cuda/detail/cub/block/block_exchange.cuh index a8e386e04..7cc8c5abb 100644 --- a/thrust/system/cuda/detail/cub/block/block_exchange.cuh +++ b/thrust/system/cuda/detail/cub/block/block_exchange.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/block_histogram.cuh b/thrust/system/cuda/detail/cub/block/block_histogram.cuh index 4a5233b91..f97f89ea6 100644 --- a/thrust/system/cuda/detail/cub/block/block_histogram.cuh +++ b/thrust/system/cuda/detail/cub/block/block_histogram.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/block_load.cuh b/thrust/system/cuda/detail/cub/block/block_load.cuh index 5d97b6598..6f7671b4b 100644 --- a/thrust/system/cuda/detail/cub/block/block_load.cuh +++ b/thrust/system/cuda/detail/cub/block/block_load.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -809,10 +809,7 @@ private: /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; + {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -849,10 +846,7 @@ private: InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + LoadDirectStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).StripedToBlocked(items, items); } @@ -864,10 +858,7 @@ private: int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); + LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).StripedToBlocked(items, items); } @@ -893,10 +884,7 @@ private: /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; + {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -933,10 +921,7 @@ private: InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } @@ -949,10 +934,7 @@ private: int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } }; @@ -977,10 +959,7 @@ private: /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; + {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -1017,10 +996,7 @@ private: InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load int valid_items) ///< [in] Number of valid items to load { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } @@ -1033,10 +1009,7 @@ private: int valid_items, ///< [in] Number of valid items to load DefaultT oob_default) ///< [in] Default value to assign out-of-bound items { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } }; diff --git a/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh b/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh index 743c10103..cfd0652ec 100644 --- a/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh +++ b/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -140,7 +140,7 @@ public: enum { /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS), + BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), }; private: @@ -495,7 +495,7 @@ public: enum { /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS), + BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), }; private: @@ -589,7 +589,6 @@ public: // Each warp will strip-mine its section of input, one strip at a time volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; - uint32_t lane_id = LaneId(); uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; uint32_t lane_mask_lt = LaneMaskLt(); diff --git a/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh b/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh index 27d61cb70..8a54b3fb9 100644 --- a/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh +++ b/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -71,8 +71,9 @@ namespace cub { * given input sequence of keys and a set of rules specifying a total ordering * of the symbolic alphabet, the radix sorting method produces a lexicographic * ordering of those keys. - * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Within each key, the implementation treats fixed-length + * - BlockRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Within each key, the implementation treats fixed-length * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting * method can only be applied to unsigned integral types, BlockRadixSort * is able to sort signed and floating-point types via simple bit-wise transformations diff --git a/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh b/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh index c04af877a..9cf4ffa97 100644 --- a/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh +++ b/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/block_reduce.cuh b/thrust/system/cuda/detail/cub/block/block_reduce.cuh index f44113ed2..12a79ecea 100644 --- a/thrust/system/cuda/detail/cub/block/block_reduce.cuh +++ b/thrust/system/cuda/detail/cub/block/block_reduce.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/block_scan.cuh b/thrust/system/cuda/detail/cub/block/block_scan.cuh index 80f0affe7..c553cfbe4 100644 --- a/thrust/system/cuda/detail/cub/block/block_scan.cuh +++ b/thrust/system/cuda/detail/cub/block/block_scan.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/block_shuffle.cuh b/thrust/system/cuda/detail/cub/block/block_shuffle.cuh index b357e66f4..eb49fb6d4 100644 --- a/thrust/system/cuda/detail/cub/block/block_shuffle.cuh +++ b/thrust/system/cuda/detail/cub/block/block_shuffle.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/block_store.cuh b/thrust/system/cuda/detail/cub/block/block_store.cuh index 6b5e1ae4a..c79c94f5b 100644 --- a/thrust/system/cuda/detail/cub/block/block_store.cuh +++ b/thrust/system/cuda/detail/cub/block/block_store.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh index 8ae7b46a5..c971f000a 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh index 5955a3a4c..cdbbefd40 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh index c8eb14718..612a5acf7 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 29f7f6182..012c71d4e 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh index edd501aad..2e8be1c3d 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh index 0560235bb..0d49d0693 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh index e7dcc6e1f..6f582a8e4 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh index d6e61f059..2be0e749c 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh index 0d13d3ce0..15a9cf54b 100644 --- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh +++ b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/cub.cuh b/thrust/system/cuda/detail/cub/cub.cuh index b1c8e3200..3ece0f658 100644 --- a/thrust/system/cuda/detail/cub/cub.cuh +++ b/thrust/system/cuda/detail/cub/cub.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/device_histogram.cuh b/thrust/system/cuda/detail/cub/device/device_histogram.cuh index e54fdd0b7..259bcad32 100644 --- a/thrust/system/cuda/detail/cub/device/device_histogram.cuh +++ b/thrust/system/cuda/detail/cub/device/device_histogram.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/device_partition.cuh b/thrust/system/cuda/detail/cub/device/device_partition.cuh index 3ffcc9b81..178cfe938 100644 --- a/thrust/system/cuda/detail/cub/device/device_partition.cuh +++ b/thrust/system/cuda/detail/cub/device/device_partition.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh index c767c4035..aead91103 100644 --- a/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh +++ b/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -62,8 +62,9 @@ namespace cub { * ordering of those keys. * * \par - * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Although the direct radix sorting + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting * method can only be applied to unsigned integral types, DeviceRadixSort * is able to sort signed and floating-point types via simple bit-wise transformations * that ensure lexicographic key ordering. diff --git a/thrust/system/cuda/detail/cub/device/device_reduce.cuh b/thrust/system/cuda/detail/cub/device/device_reduce.cuh index 645e19988..43b91f799 100644 --- a/thrust/system/cuda/detail/cub/device/device_reduce.cuh +++ b/thrust/system/cuda/detail/cub/device/device_reduce.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh b/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh index 7cdb1c3fa..236926c71 100644 --- a/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh +++ b/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/device_scan.cuh b/thrust/system/cuda/detail/cub/device/device_scan.cuh index 0742bdb4a..91827f230 100644 --- a/thrust/system/cuda/detail/cub/device/device_scan.cuh +++ b/thrust/system/cuda/detail/cub/device/device_scan.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh index 624e64793..dc019331e 100644 --- a/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh +++ b/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -62,8 +62,9 @@ namespace cub { * ordering of those keys. * * \par - * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Although the direct radix sorting + * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort * is able to sort signed and floating-point types via simple bit-wise transformations * that ensure lexicographic key ordering. diff --git a/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh b/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh index c38d9f1c8..5626e0a00 100644 --- a/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh +++ b/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/device_select.cuh b/thrust/system/cuda/detail/cub/device/device_select.cuh index 909a37e22..3dc9d6ac3 100644 --- a/thrust/system/cuda/detail/cub/device/device_select.cuh +++ b/thrust/system/cuda/detail/cub/device/device_select.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/device_spmv.cuh b/thrust/system/cuda/detail/cub/device/device_spmv.cuh index 60e7aa6ee..611d75d3a 100644 --- a/thrust/system/cuda/detail/cub/device/device_spmv.cuh +++ b/thrust/system/cuda/detail/cub/device/device_spmv.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh index f864a71ef..4bf7d6f85 100644 --- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh +++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh index 6c9a87f47..baf7f422c 100644 --- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -104,7 +104,7 @@ __global__ void DeviceRadixSortUpsweepKernel( CTA_SYNC(); // Write out digit counts (striped) - upsweep.ExtractCounts(d_spine, gridDim.x, blockIdx.x); + upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); } @@ -279,6 +279,10 @@ __global__ void DeviceRadixSortSingleTileKernel( // Load values if (!KEYS_ONLY) { + // Register pressure work-around: moving num_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + num_items = ShuffleIndex(num_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff); + BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); CTA_SYNC(); @@ -496,64 +500,24 @@ struct DeviceRadixSortPolicy { // Whether this is a keys-only (or key-value) sort KEYS_ONLY = (Equals::VALUE), - - // Relative size of KeyT type to a 4-byte word - SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, }; + // Dominant-sized key/value type + typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT; + //------------------------------------------------------------------------------ // Architecture-specific tuning policies //------------------------------------------------------------------------------ - /// SM13 - struct Policy130 : ChainedPolicy<130, Policy130, Policy130> - { - enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - }; - - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - /// SM20 - struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + struct Policy200 : ChainedPolicy<200, Policy200, Policy200> { enum { PRIMARY_RADIX_BITS = 5, ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, }; // Keys-only upsweep policies @@ -597,6 +561,9 @@ struct DeviceRadixSortPolicy enum { PRIMARY_RADIX_BITS = 5, ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, }; // Keys-only upsweep policies @@ -639,19 +606,19 @@ struct DeviceRadixSortPolicy struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { enum { - PRIMARY_RADIX_BITS = 6, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) }; // Scan policy typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 9 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyKeys; // Key-value pairs downsweep policies typedef DownsweepPolicyKeys DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyPairs; // Downsweep policies typedef typename If::Type DownsweepPolicy; @@ -676,28 +643,28 @@ struct DeviceRadixSortPolicy struct Policy500 : ChainedPolicy<500, Policy500, Policy350> { enum { - PRIMARY_RADIX_BITS = 7, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) - SINGLE_TILE_RADIX_BITS = 6, - SEGMENTED_RADIX_BITS = 6, // 3.1B 32b segmented keys/s (TitanX) + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.1B 32b segmented keys/s (TitanX) }; // ScanPolicy typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <160, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; // Segmented policies - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 31 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; }; @@ -705,28 +672,28 @@ struct DeviceRadixSortPolicy struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { enum { - PRIMARY_RADIX_BITS = 7, // 6.9B 32b keys/s (Quadro P100) - SINGLE_TILE_RADIX_BITS = 6, - SEGMENTED_RADIX_BITS = 6, // 5.9B 32b segmented keys/s (Quadro P100) + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) }; // ScanPolicy typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 25 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; // Segmented policies - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; }; @@ -735,28 +702,28 @@ struct DeviceRadixSortPolicy struct Policy610 : ChainedPolicy<610, Policy610, Policy600> { enum { - PRIMARY_RADIX_BITS = 7, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) - SINGLE_TILE_RADIX_BITS = 6, - SEGMENTED_RADIX_BITS = 6, // 3.3B 32b segmented keys/s (1080) + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.3B 32b segmented keys/s (1080) }; // ScanPolicy typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 31 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 35 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; // Upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS> UpsweepPolicy; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy AltUpsweepPolicy; // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; // Segmented policies - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; }; @@ -772,15 +739,15 @@ struct DeviceRadixSortPolicy typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS> AltDownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy; + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; // Segmented policies typedef DownsweepPolicy SegmentedPolicy; @@ -792,28 +759,28 @@ struct DeviceRadixSortPolicy struct Policy700 : ChainedPolicy<700, Policy700, Policy620> { enum { - PRIMARY_RADIX_BITS = 6, // 7.62B 32b keys/s (GV100) - SINGLE_TILE_RADIX_BITS = 6, - SEGMENTED_RADIX_BITS = 6, // 8.7B 32b segmented keys/s (GV100) + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 8.7B 32b segmented keys/s (GV100) }; // ScanPolicy typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 47 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 29 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; // Upsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 47 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> UpsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 29 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; // Segmented policies - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; }; diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh index dfc390c5a..44b1233a4 100644 --- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh +++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -248,10 +248,10 @@ struct DeviceReducePolicy { // ReducePolicy typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(128, 8, OutputT), ///< Threads per block, items per thread - 2, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier + CUB_SCALED_GRANULARITIES(128, 8, OutputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier ReducePolicy; // SingleTilePolicy @@ -267,10 +267,10 @@ struct DeviceReducePolicy { // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(128, 8, OutputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier + CUB_SCALED_GRANULARITIES(128, 8, OutputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier ReducePolicy; // SingleTilePolicy @@ -286,10 +286,10 @@ struct DeviceReducePolicy { // ReducePolicy (GTX670: 154.0 @ 48M 4B items) typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(256, 20, OutputT), ///< Threads per block, items per thread - 2, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier + CUB_SCALED_GRANULARITIES(256, 20, OutputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier ReducePolicy; // SingleTilePolicy @@ -305,10 +305,10 @@ struct DeviceReducePolicy { // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(256, 20, OutputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier + CUB_SCALED_GRANULARITIES(256, 20, OutputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier ReducePolicy; // SingleTilePolicy @@ -323,10 +323,10 @@ struct DeviceReducePolicy { // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(256, 16, OutputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier + CUB_SCALED_GRANULARITIES(256, 16, OutputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier ReducePolicy; // SingleTilePolicy diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh index 501ae0da1..38bee414e 100644 --- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh index 704968dd9..0d244a8a6 100644 --- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh +++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh index f1522aaf9..782e686d5 100644 --- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh +++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -174,7 +174,7 @@ struct DispatchScan struct Policy600 { typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(128, 15, OutputT), ///< Threads per block, items per thread + CUB_SCALED_GRANULARITIES(128, 15, OutputT), ///< Threads per block, items per thread BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_TRANSPOSE, @@ -188,7 +188,7 @@ struct DispatchScan { // Titan X: 32.47B items/s @ 48M 32-bit T typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE, @@ -202,7 +202,7 @@ struct DispatchScan { // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, @@ -214,7 +214,7 @@ struct DispatchScan struct Policy300 { typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(256, 9, OutputT), ///< Threads per block, items per thread + CUB_SCALED_GRANULARITIES(256, 9, OutputT), ///< Threads per block, items per thread BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, @@ -227,7 +227,7 @@ struct DispatchScan { // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, @@ -239,7 +239,7 @@ struct DispatchScan struct Policy130 { typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(96, 21, OutputT), ///< Threads per block, items per thread + CUB_SCALED_GRANULARITIES(96, 21, OutputT), ///< Threads per block, items per thread BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, @@ -251,7 +251,7 @@ struct DispatchScan struct Policy100 { typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(64, 9, OutputT), ///< Threads per block, items per thread + CUB_SCALED_GRANULARITIES(64, 9, OutputT), ///< Threads per block, items per thread BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh index 2b33879ec..1b3aa8dad 100644 --- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh +++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh index 54c2c8cad..a0bf515c1 100644 --- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -415,12 +415,41 @@ struct DispatchSpmv }; + /// SM60 + struct Policy600 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 5 : 7, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + //--------------------------------------------------------------------- // Tuning policies of current PTX compiler pass //--------------------------------------------------------------------- -#if (CUB_PTX_ARCH >= 500) +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 500) typedef Policy500 PtxPolicy; #elif (CUB_PTX_ARCH >= 370) @@ -468,7 +497,12 @@ struct DispatchSpmv #else // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 500) + if (ptx_version >= 600) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 500) { spmv_config.template Init(); segment_fixup_config.template Init(); @@ -786,56 +820,6 @@ struct DispatchSpmv DeviceSegmentFixupKernel, spmv_config, segment_fixup_config))) break; -/* - // Dispatch - if (spmv_params.beta == 0.0) - { - if (spmv_params.alpha == 1.0) - { - // Dispatch y = A*x - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmv1ColKernel, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - } - else - { - // Dispatch y = alpha*A*x - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - } - } - else - { - if (spmv_params.alpha == 1.0) - { - // Dispatch y = A*x + beta*y - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - } - else - { - // Dispatch y = alpha*A*x + beta*y - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - } - } -*/ } while (0); diff --git a/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh b/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh index 8d1555269..5b12c66ed 100644 --- a/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh +++ b/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh b/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh index f1b1fe7e3..59fe5c909 100644 --- a/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh +++ b/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh b/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh index 14af378ee..6d1ab5846 100644 --- a/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh +++ b/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/grid/grid_queue.cuh b/thrust/system/cuda/detail/cub/grid/grid_queue.cuh index e9d81a01b..3c5330e4a 100644 --- a/thrust/system/cuda/detail/cub/grid/grid_queue.cuh +++ b/thrust/system/cuda/detail/cub/grid/grid_queue.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/host/mutex.cuh b/thrust/system/cuda/detail/cub/host/mutex.cuh index 8fe3e9287..30d64b7d4 100644 --- a/thrust/system/cuda/detail/cub/host/mutex.cuh +++ b/thrust/system/cuda/detail/cub/host/mutex.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh index d0a2678b8..e527202e4 100644 --- a/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh index 484da0186..012a32180 100644 --- a/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh index 1822be7e1..9038fed64 100644 --- a/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh index 13fc75147..e2582db35 100644 --- a/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh index 93a7c644f..69a736302 100644 --- a/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh index 3a40e949b..497b2893a 100644 --- a/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh index 74ba6f926..7067ae001 100644 --- a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh index 5a6f556fd..73904b787 100644 --- a/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh index e85e899cb..5ab407b0c 100644 --- a/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh +++ b/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/thread/thread_load.cuh b/thrust/system/cuda/detail/cub/thread/thread_load.cuh index 3342759f7..888fa8ea8 100644 --- a/thrust/system/cuda/detail/cub/thread/thread_load.cuh +++ b/thrust/system/cuda/detail/cub/thread/thread_load.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/thread/thread_operators.cuh b/thrust/system/cuda/detail/cub/thread/thread_operators.cuh index d1f7cb6db..5bfa790e2 100644 --- a/thrust/system/cuda/detail/cub/thread/thread_operators.cuh +++ b/thrust/system/cuda/detail/cub/thread/thread_operators.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh b/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh index 8cc9cf4f1..7e525ea0c 100644 --- a/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh +++ b/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/thread/thread_scan.cuh b/thrust/system/cuda/detail/cub/thread/thread_scan.cuh index 44a318c83..94f3016f4 100644 --- a/thrust/system/cuda/detail/cub/thread/thread_scan.cuh +++ b/thrust/system/cuda/detail/cub/thread/thread_scan.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/thread/thread_search.cuh b/thrust/system/cuda/detail/cub/thread/thread_search.cuh index 70cf6bdfe..3fcdd628f 100644 --- a/thrust/system/cuda/detail/cub/thread/thread_search.cuh +++ b/thrust/system/cuda/detail/cub/thread/thread_search.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/thread/thread_store.cuh b/thrust/system/cuda/detail/cub/thread/thread_store.cuh index 05a9e1676..e79122c85 100644 --- a/thrust/system/cuda/detail/cub/thread/thread_store.cuh +++ b/thrust/system/cuda/detail/cub/thread/thread_store.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/util_allocator.cuh b/thrust/system/cuda/detail/cub/util_allocator.cuh index cc44a4944..3ed80d3c5 100644 --- a/thrust/system/cuda/detail/cub/util_allocator.cuh +++ b/thrust/system/cuda/detail/cub/util_allocator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/util_arch.cuh b/thrust/system/cuda/detail/cub/util_arch.cuh index e2b42b44b..e869b85b5 100644 --- a/thrust/system/cuda/detail/cub/util_arch.cuh +++ b/thrust/system/cuda/detail/cub/util_arch.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -116,31 +116,31 @@ namespace cub { #endif -/// Scale down the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data. Minimum of two warps. -#ifndef CUB_BLOCK_THREADS - #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ +/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. +#ifndef CUB_SCALED_BLOCK_THREADS + #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ (CUB_MIN( \ - NOMINAL_4B_BLOCK_THREADS * 2, \ + NOMINAL_4B_BLOCK_THREADS, \ CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ - (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4, \ + 2, \ (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) #endif -/// Scale up/down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data. Minimum 1 item per thread -#ifndef CUB_ITEMS_PER_THREAD - #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ - (CUB_MIN( \ - NOMINAL_4B_ITEMS_PER_THREAD * 2, \ - CUB_MAX( \ - 1, \ - (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)))) +/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. Minimum 1 item per thread +#ifndef CUB_SCALED_ITEMS_PER_THREAD + #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + CUB_MAX( \ + 1, \ + (sizeof(T) < 4) ? \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) #endif /// Define both nominal threads-per-block and items-per-thread -#ifndef CUB_NOMINAL_CONFIG - #define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ - CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ - CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) +#ifndef CUB_SCALED_GRANULARITIES + #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ + CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ + CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) #endif diff --git a/thrust/system/cuda/detail/cub/util_debug.cuh b/thrust/system/cuda/detail/cub/util_debug.cuh index 37f92db26..5dcacbaf7 100644 --- a/thrust/system/cuda/detail/cub/util_debug.cuh +++ b/thrust/system/cuda/detail/cub/util_debug.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/util_device.cuh b/thrust/system/cuda/detail/cub/util_device.cuh index 1b771e694..ca55bd530 100644 --- a/thrust/system/cuda/detail/cub/util_device.cuh +++ b/thrust/system/cuda/detail/cub/util_device.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/util_macro.cuh b/thrust/system/cuda/detail/cub/util_macro.cuh index 0474feb53..14bd9b12b 100644 --- a/thrust/system/cuda/detail/cub/util_macro.cuh +++ b/thrust/system/cuda/detail/cub/util_macro.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/util_namespace.cuh b/thrust/system/cuda/detail/cub/util_namespace.cuh index ef24c5550..0c2bf29fe 100644 --- a/thrust/system/cuda/detail/cub/util_namespace.cuh +++ b/thrust/system/cuda/detail/cub/util_namespace.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/util_ptx.cuh b/thrust/system/cuda/detail/cub/util_ptx.cuh index 9a72b3de2..aff170333 100644 --- a/thrust/system/cuda/detail/cub/util_ptx.cuh +++ b/thrust/system/cuda/detail/cub/util_ptx.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/util_type.cuh b/thrust/system/cuda/detail/cub/util_type.cuh index cbebb3e47..bd3bebd36 100644 --- a/thrust/system/cuda/detail/cub/util_type.cuh +++ b/thrust/system/cuda/detail/cub/util_type.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,10 +37,16 @@ #include #include +#if (__CUDACC_VER_MAJOR__ >= 9) + #include +#endif + #include "util_macro.cuh" #include "util_arch.cuh" #include "util_namespace.cuh" + + /// Optional outer namespace(s) THRUST_CUB_NS_PREFIX @@ -889,10 +895,10 @@ private: template static char Test(SFINAE3 *); template static char Test(SFINAE4 *); */ - template static char Test(SFINAE5 *); - template static char Test(SFINAE6 *); - template static char Test(SFINAE7 *); - template static char Test(SFINAE8 *); + template __host__ __device__ static char Test(SFINAE5 *); + template __host__ __device__ static char Test(SFINAE6 *); + template __host__ __device__ static char Test(SFINAE7 *); + template __host__ __device__ static char Test(SFINAE8 *); template static int Test(...); @@ -1057,6 +1063,23 @@ struct FpLimits }; +#if (__CUDACC_VER_MAJOR__ >= 9) +template <> +struct FpLimits<__half> +{ + static __host__ __device__ __forceinline__ __half Max() { + unsigned short max_word = 0x7BFF; + return reinterpret_cast<__half&>(max_word); + } + + static __host__ __device__ __forceinline__ __half Lowest() { + unsigned short lowest_word = 0xFBFF; + return reinterpret_cast<__half&>(lowest_word); + } +}; +#endif + + /** * Basic type traits (fp primitive specialization) */ @@ -1120,6 +1143,9 @@ template <> struct NumericTraits : BaseTraits struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; +#if (__CUDACC_VER_MAJOR__ >= 9) + template <> struct NumericTraits<__half> : BaseTraits {}; +#endif template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh index 4a719625f..c92765297 100644 --- a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh index bec27e4e8..4325ca0c8 100644 --- a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh +++ b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh index ebff77335..d5f40161b 100644 --- a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh +++ b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh index aaa3d095c..5bafb3559 100644 --- a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh +++ b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh b/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh index 907053de5..baef93594 100644 --- a/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh +++ b/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/thrust/system/cuda/detail/cub/warp/warp_scan.cuh b/thrust/system/cuda/detail/cub/warp/warp_scan.cuh index 8966a1e4b..aa7149586 100644 --- a/thrust/system/cuda/detail/cub/warp/warp_scan.cuh +++ b/thrust/system/cuda/detail/cub/warp/warp_scan.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: