Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit 4b61388

Browse files
committed
CUB: Integrate CUB 1.7.5 into Thrust to pull in the (corrected) fix for small
data type radix sorting performance regressions. Bug 1997368 Bug 200355591 git-commit b56409c060fe4c718066d19099fb12d8acdb2163 git-author Bryce Adelstein Lelbach aka wash <[email protected]> VDVS: http://ausdvs.nvidia.com/Build_Results?virtualId=1000099285&which_page=current_build Jobs: 1997368-2006 200355591-2006 [git-p4: depot-paths = "//sw/gpgpu/thrust/": change = 23593281]
1 parent 5a0a118 commit 4b61388

File tree

88 files changed

+318
-610
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

88 files changed

+318
-610
lines changed

internal/rename_cub_namespace.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#! /bin/bash
2+
3+
# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to add a THRUST_
4+
# prefix to CUB's namespace macro.
5+
6+
sed -i -e 's/CUB_NS_P/THRUST_CUB_NS_P/g' `find . -type f`
7+
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#! /bin/bash
2+
3+
# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to undo the
4+
# renaming of CUB's namespace macro.
5+
6+
sed -i -e 's|THRUST_CUB_NS_P|CUB_NS_P|g' `find . -type f`
7+

internal/update_thrust_cub.sh

Lines changed: 0 additions & 18 deletions
This file was deleted.

thrust/system/cuda/detail/cub/agent/agent_histogram.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3-
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
3+
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
44
*
55
* Redistribution and use in source and binary forms, with or without
66
* modification, are permitted provided that the following conditions are met:

thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3-
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
3+
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
44
*
55
* Redistribution and use in source and binary forms, with or without
66
* modification, are permitted provided that the following conditions are met:
@@ -293,7 +293,7 @@ struct AgentRadixSortDownsweep
293293
{
294294
ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
295295

296-
if (FULL_TILE ||
296+
if (FULL_TILE ||
297297
(static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
298298
{
299299
d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
@@ -332,6 +332,10 @@ struct AgentRadixSortDownsweep
332332
Int2Type<false> is_full_tile,
333333
Int2Type<_RANK_ALGORITHM> rank_algorithm)
334334
{
335+
// Register pressure work-around: moving valid_items through shfl prevents compiler
336+
// from reusing guards/addressing from prior guarded loads
337+
valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
338+
335339
BlockLoadKeysT(temp_storage.load_keys).Load(
336340
d_keys_in + block_offset, keys, valid_items, oob_item);
337341

@@ -365,6 +369,10 @@ struct AgentRadixSortDownsweep
365369
Int2Type<false> is_full_tile,
366370
Int2Type<RADIX_RANK_MATCH> rank_algorithm)
367371
{
372+
// Register pressure work-around: moving valid_items through shfl prevents compiler
373+
// from reusing guards/addressing from prior guarded loads
374+
valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
375+
368376
LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
369377
}
370378

@@ -398,6 +406,10 @@ struct AgentRadixSortDownsweep
398406
Int2Type<false> is_full_tile,
399407
Int2Type<_RANK_ALGORITHM> rank_algorithm)
400408
{
409+
// Register pressure work-around: moving valid_items through shfl prevents compiler
410+
// from reusing guards/addressing from prior guarded loads
411+
valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
412+
401413
BlockLoadValuesT(temp_storage.load_values).Load(
402414
d_values_in + block_offset, values, valid_items);
403415

@@ -411,7 +423,7 @@ struct AgentRadixSortDownsweep
411423
__device__ __forceinline__ void LoadValues(
412424
ValueT (&values)[ITEMS_PER_THREAD],
413425
OffsetT block_offset,
414-
volatile OffsetT valid_items,
426+
OffsetT valid_items,
415427
Int2Type<true> is_full_tile,
416428
Int2Type<RADIX_RANK_MATCH> rank_algorithm)
417429
{
@@ -425,10 +437,14 @@ struct AgentRadixSortDownsweep
425437
__device__ __forceinline__ void LoadValues(
426438
ValueT (&values)[ITEMS_PER_THREAD],
427439
OffsetT block_offset,
428-
volatile OffsetT valid_items,
440+
OffsetT valid_items,
429441
Int2Type<false> is_full_tile,
430442
Int2Type<RADIX_RANK_MATCH> rank_algorithm)
431443
{
444+
// Register pressure work-around: moving valid_items through shfl prevents compiler
445+
// from reusing guards/addressing from prior guarded loads
446+
valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
447+
432448
LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
433449
}
434450

@@ -444,10 +460,10 @@ struct AgentRadixSortDownsweep
444460
OffsetT valid_items,
445461
Int2Type<false> /*is_keys_only*/)
446462
{
447-
CTA_SYNC();
448-
449463
ValueT values[ITEMS_PER_THREAD];
450464

465+
CTA_SYNC();
466+
451467
LoadValues(
452468
values,
453469
block_offset,
@@ -746,6 +762,7 @@ struct AgentRadixSortDownsweep
746762
else
747763
{
748764
// Process full tiles of tile_items
765+
#pragma unroll 1
749766
while (block_offset + TILE_ITEMS <= block_end)
750767
{
751768
ProcessTile<true>(block_offset);

thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3-
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
3+
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
44
*
55
* Redistribution and use in source and binary forms, with or without
66
* modification, are permitted provided that the following conditions are met:

thrust/system/cuda/detail/cub/agent/agent_reduce.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3-
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
3+
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
44
*
55
* Redistribution and use in source and binary forms, with or without
66
* modification, are permitted provided that the following conditions are met:

thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3-
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
3+
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
44
*
55
* Redistribution and use in source and binary forms, with or without
66
* modification, are permitted provided that the following conditions are met:
@@ -454,13 +454,13 @@ struct AgentReduceByKey
454454
// Perform exclusive tile scan
455455
OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate
456456
OffsetT num_segments_prefix; // Number of segments prior to this tile
457-
ValueOutputT total_aggregate; // The tile prefix folded with block_aggregate
457+
OffsetValuePairT total_aggregate; // The tile prefix folded with block_aggregate
458458
if (tile_idx == 0)
459459
{
460460
// Scan first tile
461461
BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
462462
num_segments_prefix = 0;
463-
total_aggregate = block_aggregate.value;
463+
total_aggregate = block_aggregate;
464464

465465
// Update tile status if there are successor tiles
466466
if ((!IS_LAST_TILE) && (threadIdx.x == 0))
@@ -474,9 +474,7 @@ struct AgentReduceByKey
474474

475475
block_aggregate = prefix_op.GetBlockAggregate();
476476
num_segments_prefix = prefix_op.GetExclusivePrefix().key;
477-
total_aggregate = reduction_op(
478-
prefix_op.GetExclusivePrefix().value,
479-
block_aggregate.value);
477+
total_aggregate = prefix_op.GetInclusivePrefix();
480478
}
481479

482480
// Rezip scatter items and segment indices
@@ -506,7 +504,7 @@ struct AgentReduceByKey
506504
if (num_remaining == TILE_ITEMS)
507505
{
508506
d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1];
509-
d_aggregates_out[num_segments] = total_aggregate;
507+
d_aggregates_out[num_segments] = total_aggregate.value;
510508
num_segments++;
511509
}
512510

thrust/system/cuda/detail/cub/agent/agent_rle.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3-
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
3+
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
44
*
55
* Redistribution and use in source and binary forms, with or without
66
* modification, are permitted provided that the following conditions are met:
@@ -618,8 +618,8 @@ struct AgentRle
618618
OffsetT num_items, ///< Total number of global input items
619619
OffsetT num_remaining, ///< Number of global input items remaining (including this tile)
620620
int tile_idx, ///< Tile index
621-
OffsetT tile_offset, ///< Tile offset
622-
ScanTileStateT &tile_status) ///< Global list of tile status
621+
OffsetT tile_offset, ///< Tile offset
622+
ScanTileStateT &tile_status) ///< Global list of tile status
623623
{
624624
if (tile_idx == 0)
625625
{

thrust/system/cuda/detail/cub/agent/agent_scan.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/******************************************************************************
22
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3-
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
3+
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
44
*
55
* Redistribution and use in source and binary forms, with or without
66
* modification, are permitted provided that the following conditions are met:

0 commit comments

Comments
 (0)