Skip to content

Commit 898dcb3

Browse files
NaraendaBeanavilex-rzrborysborys
authored
Develop stream 2025-03-10 (#622)
* Deprecate fortran interface * Resolve "Index performance of rocRAND on MI300 and curand on H100 and H200" * Stop using __AMDGCN_WAVEFRONT_SIZE and warpSize as compile-time constants * log more information during configure * docs(changelog): re-organize newly added changelog entries * Resolve "bug found on SOBOL generator" --------- Co-authored-by: Beatriz Navidad Vilches <[email protected]> Co-authored-by: Anton Gorenko <[email protected]> Co-authored-by: Borys Petrov <[email protected]>
1 parent 5187953 commit 898dcb3

15 files changed

+245
-274
lines changed

CHANGELOG.md

+12-7
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,25 @@ Documentation for rocRAND is available at
1212
### Changed
1313

1414
* Updated several `gfx942` auto tuning parameters.
15-
* Deprecated C++14 and set the default target to C++17.
16-
* Directly accessing the (scrambled) sobol32 and sobol64 constants and direction vectors is deprecated:
15+
* Modified error handling and expanded the error information for the case of double-deallocation of the (scrambled) sobol32 and sobol64 constants and direction vectors.
16+
17+
### Removed
18+
19+
* Removed inline assembly and the `ENABLE_INLINE_ASM` CMake option. Inline assembly was used to optimizate of multiplications in the Mrg32k3a and Philox 4x32-10 generators. It is no longer needed because the current HIP compiler is able to produce code with the same or better performance.
20+
* Removed instances of the deprecated clang definition `__AMDGCN_WAVEFRONT_SIZE`.
21+
22+
### Upcoming changes
23+
24+
* Deprecated the rocRAND Fortran API in favor of hipfort.
25+
* Deprecated C++14 and set the default target to C++17. C++14 will be removed in the next major release.
26+
* Directly accessing the (scrambled) sobol32 and sobol64 constants and direction vectors is deprecated and will be removed in the next major release. For:
1727
* `h_scrambled_sobol32_constants`, use `rocrand_get_scramble_constants32` instead.
1828
* `h_scrambled_sobol64_constants`, use `rocrand_get_scramble_constants64` instead.
1929
* `rocrand_h_sobol32_direction_vectors`, use `rocrand_get_direction_vectors32` instead.
2030
* `rocrand_h_sobol64_direction_vectors`, use `rocrand_get_direction_vectors64` instead.
2131
* `rocrand_h_scrambled_sobol32_direction_vectors`, use `rocrand_get_direction_vectors32` instead.
2232
* `rocrand_h_scrambled_sobol64_direction_vectors`, use `rocrand_get_direction_vectors64` instead.
2333

24-
### Upcoming changes
25-
* C++14 will be removed in the next major release.
26-
* Directly accessing the (scrambled) sobol32 and sobol64 constants and direction vectors will be removed in the next major release.
27-
28-
2934
### Fixed
3035

3136
* Fixed an issue where `mt19937.hpp` would cause kernel errors during auto tuning.

README.md

-4
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,6 @@ compilers) may cause a build failure; if you encounter errors with the existing
121121
other dependencies, you can pass the `DEPENDENCIES_FORCE_DOWNLOAD` flag to CMake, which can
122122
help to solve the problem.
123123

124-
To disable inline assembly optimizations in rocRAND (for both the host library and
125-
the device functions provided in `rocrand_kernel.h`), set the CMake option `ENABLE_INLINE_ASM`
126-
to `OFF`.
127-
128124
## Running unit tests
129125

130126
```shell

benchmark/benchmark_curand_generate.cpp

+41-18
Original file line numberDiff line numberDiff line change
@@ -150,25 +150,31 @@ void run_benchmarks(const cli::Parser& parser,
150150
const std::string format = parser.get<std::string>("format");
151151
if (distribution == "uniform-uint")
152152
{
153-
run_benchmark<unsigned int>(
154-
parser,
155-
rng_type,
156-
stream,
157-
[](curandGenerator_t gen, unsigned int* data, size_t size)
158-
{ return curandGenerate(gen, data, size); },
159-
distribution,
160-
engine);
153+
if(rng_type != CURAND_RNG_QUASI_SOBOL64 && rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
154+
{
155+
run_benchmark<unsigned int>(
156+
parser,
157+
rng_type,
158+
stream,
159+
[](curandGenerator_t gen, unsigned int* data, size_t size)
160+
{ return curandGenerate(gen, data, size); },
161+
distribution,
162+
engine);
163+
}
161164
}
162165
if (distribution == "uniform-long-long")
163166
{
164-
run_benchmark<unsigned long long>(
165-
parser,
166-
rng_type,
167-
stream,
168-
[](curandGenerator_t gen, unsigned long long* data, size_t size)
169-
{ return curandGenerateLongLong(gen, data, size); },
170-
distribution,
171-
engine);
167+
if(rng_type == CURAND_RNG_QUASI_SOBOL64 || rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
168+
{
169+
run_benchmark<unsigned long long>(
170+
parser,
171+
rng_type,
172+
stream,
173+
[](curandGenerator_t gen, unsigned long long* data, size_t size)
174+
{ return curandGenerateLongLong(gen, data, size); },
175+
distribution,
176+
engine);
177+
}
172178
}
173179
if (distribution == "uniform-float")
174180
{
@@ -311,6 +317,10 @@ int main(int argc, char *argv[])
311317
parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
312318
parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
313319
parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
320+
parser.set_optional<std::string>("format",
321+
"format",
322+
{"console"},
323+
"output format: console or csv");
314324
parser.run_and_exit_if_error();
315325

316326
std::vector<std::string> engines;
@@ -365,6 +375,17 @@ int main(int argc, char *argv[])
365375
cudaStream_t stream;
366376
CUDA_CALL(cudaStreamCreate(&stream));
367377

378+
std::string format = parser.get<std::string>("format");
379+
bool console_output = format.compare("console") == 0 ? true : false;
380+
381+
if(!console_output)
382+
{
383+
std::cout
384+
<< "Engine,Distribution,Throughput,Samples,AvgTime (1 Trial),Time(all),Size,Lambda"
385+
<< std::endl;
386+
std::cout << ",,GB/s,GSample/s,ms),ms),values," << std::endl;
387+
}
388+
368389
for (auto engine : engines)
369390
{
370391
rng_type_t rng_type = CURAND_RNG_PSEUDO_XORWOW;
@@ -392,11 +413,13 @@ int main(int argc, char *argv[])
392413
exit(1);
393414
}
394415

395-
std::cout << engine << ":" << std::endl;
416+
if(console_output)
417+
std::cout << engine << ":" << std::endl;
396418

397419
for (auto distribution : distributions)
398420
{
399-
std::cout << " " << distribution << ":" << std::endl;
421+
if(console_output)
422+
std::cout << " " << distribution << ":" << std::endl;
400423
run_benchmarks(parser, rng_type, distribution, engine, stream);
401424
}
402425
std::cout << std::endl;

benchmark/benchmark_curand_host_api.cpp

+28-21
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ void run_benchmark(benchmark::State& state,
5656

5757
if(benchmark_host)
5858
{
59-
data = new T[size];
59+
data = new T[rounded_size];
6060
CURAND_CALL(curandCreateGeneratorHost(&generator, rng_type));
6161
}
6262
else
@@ -84,7 +84,7 @@ void run_benchmark(benchmark::State& state,
8484
// Warm-up
8585
for(size_t i = 0; i < 15; i++)
8686
{
87-
CURAND_CALL(generate_func(generator, data, size));
87+
CURAND_CALL(generate_func(generator, data, rounded_size));
8888
}
8989
CUDA_CALL(cudaDeviceSynchronize());
9090

@@ -97,7 +97,7 @@ void run_benchmark(benchmark::State& state,
9797
CUDA_CALL(cudaEventRecord(start, stream));
9898
for(size_t i = 0; i < trials; i++)
9999
{
100-
CURAND_CALL(generate_func(generator, data, size));
100+
CURAND_CALL(generate_func(generator, data, rounded_size));
101101
}
102102
CUDA_CALL(cudaEventRecord(stop, stream));
103103
CUDA_CALL(cudaEventSynchronize(stop));
@@ -108,8 +108,8 @@ void run_benchmark(benchmark::State& state,
108108
state.SetIterationTime(elapsed / 1000.f);
109109
}
110110

111-
state.SetBytesProcessed(trials * state.iterations() * size * sizeof(T));
112-
state.SetItemsProcessed(trials * state.iterations() * size);
111+
state.SetBytesProcessed(trials * state.iterations() * rounded_size * sizeof(T));
112+
state.SetItemsProcessed(trials * state.iterations() * rounded_size);
113113

114114
CUDA_CALL(cudaEventDestroy(stop));
115115
CUDA_CALL(cudaEventDestroy(start));
@@ -128,6 +128,10 @@ void run_benchmark(benchmark::State& state,
128128
void configure_parser(cli::Parser& parser)
129129
{
130130
parser.set_optional<size_t>("size", "size", DEFAULT_RAND_N, "number of values");
131+
parser.set_optional<bool>("byte-size",
132+
"byte-size",
133+
false,
134+
"--size is interpreted as the number of generated bytes");
131135
parser.set_optional<size_t>("trials", "trials", 20, "number of trials");
132136
parser.set_optional<size_t>("offset", "offset", 0, "offset of generated pseudo-random values");
133137
parser.set_optional<size_t>("dimensions",
@@ -232,23 +236,26 @@ int main(int argc, char* argv[])
232236
const std::string name_engine_prefix
233237
= benchmark_name_prefix + "<" + name + "," + ordering_name_map.at(ordering) + ",";
234238

235-
benchmarks.emplace_back(benchmark::RegisterBenchmark(
236-
(name_engine_prefix + "uniform-uint>").c_str(),
237-
&run_benchmark<unsigned int>,
238-
[](curandGenerator_t gen, unsigned int* data, size_t size_gen)
239-
{ return curandGenerate(gen, data, size_gen); },
240-
size,
241-
byte_size,
242-
trials,
243-
dimensions,
244-
offset,
245-
engine_type,
246-
ordering,
247-
benchmark_host,
248-
stream));
239+
if(engine_type != CURAND_RNG_QUASI_SOBOL64
240+
&& engine_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
249241

250-
if(engine_type == CURAND_RNG_QUASI_SOBOL64
251-
|| engine_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
242+
{
243+
benchmarks.emplace_back(benchmark::RegisterBenchmark(
244+
(name_engine_prefix + "uniform-uint>").c_str(),
245+
&run_benchmark<unsigned int>,
246+
[](curandGenerator_t gen, unsigned int* data, size_t size_gen)
247+
{ return curandGenerate(gen, data, size_gen); },
248+
size,
249+
byte_size,
250+
trials,
251+
dimensions,
252+
offset,
253+
engine_type,
254+
ordering,
255+
benchmark_host,
256+
stream));
257+
}
258+
else
252259
{
253260
benchmarks.emplace_back(benchmark::RegisterBenchmark(
254261
(name_engine_prefix + "uniform-long-long>").c_str(),

cmake/Summary.cmake

+49-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,41 @@
1-
function (print_configuration_summary)
1+
function(print_configuration_summary)
2+
find_package(Git)
3+
if(GIT_FOUND)
4+
execute_process(
5+
COMMAND ${GIT_EXECUTABLE} show --format=%H --no-patch
6+
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
7+
OUTPUT_VARIABLE COMMIT_HASH
8+
OUTPUT_STRIP_TRAILING_WHITESPACE
9+
)
10+
execute_process(
11+
COMMAND ${GIT_EXECUTABLE} show --format=%s --no-patch
12+
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
13+
OUTPUT_VARIABLE COMMIT_SUBJECT
14+
OUTPUT_STRIP_TRAILING_WHITESPACE
15+
)
16+
endif()
17+
18+
execute_process(
19+
COMMAND ${CMAKE_CXX_COMPILER} --version
20+
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
21+
OUTPUT_VARIABLE CMAKE_CXX_COMPILER_VERBOSE_DETAILS
22+
OUTPUT_STRIP_TRAILING_WHITESPACE
23+
)
24+
25+
find_program(UNAME_EXECUTABLE uname)
26+
if(UNAME_EXECUTABLE)
27+
execute_process(
28+
COMMAND ${UNAME_EXECUTABLE} -a
29+
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
30+
OUTPUT_VARIABLE LINUX_KERNEL_DETAILS
31+
OUTPUT_STRIP_TRAILING_WHITESPACE
32+
)
33+
endif()
34+
35+
string(REPLACE "\n" ";" CMAKE_CXX_COMPILER_VERBOSE_DETAILS "${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}")
36+
list(TRANSFORM CMAKE_CXX_COMPILER_VERBOSE_DETAILS PREPEND "-- ")
37+
string(REPLACE ";" "\n" CMAKE_CXX_COMPILER_VERBOSE_DETAILS "${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}")
38+
239
message(STATUS "")
340
message(STATUS "******** Summary ********")
441
message(STATUS "General:")
@@ -35,4 +72,15 @@ function (print_configuration_summary)
3572
endif()
3673
message(STATUS " BUILD_ADDRESS_SANITIZER : ${BUILD_ADDRESS_SANITIZER}")
3774
message(STATUS " DEPENDENCIES_FORCE_DOWNLOAD: ${DEPENDENCIES_FORCE_DOWNLOAD}")
75+
message(STATUS "")
76+
message(STATUS "Detailed:")
77+
message(STATUS " C++ compiler details : \n${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}")
78+
if(GIT_FOUND)
79+
message(STATUS " Commit : ${COMMIT_HASH}")
80+
message(STATUS " ${COMMIT_SUBJECT}")
81+
endif()
82+
if(UNAME_EXECUTABLE)
83+
message(STATUS " Unix name : ${LINUX_KERNEL_DETAILS}")
84+
endif()
85+
3886
endfunction()

library/CMakeLists.txt

-14
Original file line numberDiff line numberDiff line change
@@ -43,20 +43,6 @@ set(rocRAND_HIP_SRCS
4343
"src/rocrand_mt19937_precomputed.cpp"
4444
"src/rocrand.cpp")
4545

46-
# When enabled, it defines ROCRAND_ENABLE_INLINE_ASM in rocrand_version.h, which
47-
# turns on inline asm in rocRAND (for both compiled library and device functions).
48-
option(ENABLE_INLINE_ASM "Enable inline asm optimisations in rocRAND" ON)
49-
if(ENABLE_INLINE_ASM)
50-
set(
51-
rocrand_ENABLE_INLINE_ASM
52-
"\n// Enables inline asm optimisations\n"
53-
"#if !defined(ROCRAND_ENABLE_INLINE_ASM) && !defined(ROCRAND_DISABLE_INLINE_ASM)\n"
54-
" #define ROCRAND_ENABLE_INLINE_ASM\n"
55-
"#endif"
56-
)
57-
string(REPLACE ";" "" rocrand_ENABLE_INLINE_ASM "${rocrand_ENABLE_INLINE_ASM}")
58-
endif()
59-
6046
# Configure a header file to pass the rocRAND version
6147
configure_file(
6248
"${PROJECT_SOURCE_DIR}/library/include/rocrand/rocrand_version.h.in"

library/include/rocrand/rocrand_common.h

+8-54
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
1+
// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
22
//
33
// Permission is hereby granted, free of charge, to any person obtaining a copy
44
// of this software and associated documentation files (the "Software"), to deal
@@ -92,63 +92,17 @@
9292
namespace rocrand_device {
9393
namespace detail {
9494

95-
#if ( defined(__HIP_PLATFORM_NVCC__) || \
96-
defined(__gfx801__) || \
97-
defined(__gfx802__) || \
98-
defined(__gfx803__) || \
99-
defined(__gfx810__) || \
100-
defined(__gfx900__) || \
101-
defined(__gfx902__) || \
102-
defined(__gfx904__) || \
103-
defined(__gfx906__) || \
104-
defined(__gfx908__) || \
105-
defined(__gfx909__) || \
106-
defined(__gfx1030__) )
107-
#if !defined(ROCRAND_ENABLE_INLINE_ASM)
108-
#define ROCRAND_ENABLE_INLINE_ASM
109-
#endif
110-
#else
111-
#if defined(__HIP_DEVICE_COMPILE__) && defined(ROCRAND_ENABLE_INLINE_ASM)
112-
#undef ROCRAND_ENABLE_INLINE_ASM
113-
#endif
114-
#endif
115-
116-
__forceinline__ __device__ __host__ unsigned long long
95+
__forceinline__ __device__ __host__
96+
unsigned long long
11797
mad_u64_u32(const unsigned int x, const unsigned int y, const unsigned long long z)
11898
{
119-
#if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_DEVICE_COMPILE__) \
120-
&& defined(ROCRAND_ENABLE_INLINE_ASM)
121-
122-
#if __AMDGCN_WAVEFRONT_SIZE == 64u
123-
using sgpr_t = unsigned long long;
124-
#elif __AMDGCN_WAVEFRONT_SIZE == 32u
125-
using sgpr_t = unsigned int;
126-
#endif
127-
128-
unsigned long long r;
129-
sgpr_t c; // carry bits, SGPR, unused
130-
// x has "r" constraint. This allows to use both VGPR and SGPR
131-
// (to save VGPR) as input.
132-
// y and z have "v" constraints, because only one SGPR or literal
133-
// can be read by the instruction.
134-
asm volatile("v_mad_u64_u32 %0, %1, %2, %3, %4"
135-
: "=v"(r), "=s"(c) : "r"(x), "v"(y), "v"(z)
136-
);
137-
return r;
138-
#elif defined(__HIP_PLATFORM_NVCC__) && defined(__HIP_DEVICE_COMPILE__) \
139-
&& defined(ROCRAND_ENABLE_INLINE_ASM)
140-
141-
unsigned long long r;
142-
asm("mad.wide.u32 %0, %1, %2, %3;"
143-
: "=l"(r) : "r"(x), "r"(y), "l"(z)
144-
);
145-
return r;
146-
147-
#else // host code
148-
14999
return static_cast<unsigned long long>(x) * static_cast<unsigned long long>(y) + z;
100+
}
150101

151-
#endif
102+
__forceinline__ __device__ __host__
103+
unsigned long long mul_u64_u32(const unsigned int x, const unsigned int y)
104+
{
105+
return static_cast<unsigned long long>(x) * static_cast<unsigned long long>(y);
152106
}
153107

154108
// This helps access fields of engine's internal state which

0 commit comments

Comments
 (0)