Skip to content

Commit

Permalink
Make tests more deterministic (#17008)
Browse files Browse the repository at this point in the history
Fixes #17045

This PR removes nondeterminism from our pytest suite and switches from `np.random.seed` to `np.random.default_rng` throughout the codebase.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)
  - Lawrence Mitchell (https://github.com/wence-)
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: #17008
  • Loading branch information
galipremsagar authored Oct 17, 2024
1 parent 3683e46 commit e493340
Show file tree
Hide file tree
Showing 79 changed files with 5,496 additions and 5,288 deletions.
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,16 @@ repos:
entry: 'pytest\.xfail'
language: pygrep
types: [python]
- id: no-unseeded-default-rng
  name: no-unseeded-default-rng
  description: 'Enforce that no non-seeded default_rng is used and default_rng is used instead of np.random.seed'
  # pygrep searches each file one line at a time, so the pattern must be
  # compiled in verbose mode ((?x)) for the newlines and `#` comment lines
  # below to be ignored; without it they are literal pattern text and the
  # hook can never match anything.
  entry: |
    (?x)
    # Check for usage of default_rng without seeding
    default_rng\(\)|
    # Check for usage of np.random.seed (dots escaped so they match literally)
    np\.random\.seed\(
  language: pygrep
  types: [python]
- id: cmake-format
name: cmake-format
entry: ./cpp/scripts/run-cmake-format.sh cmake-format
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ select = [
"UP007",
# Import from `collections.abc` instead: `Callable`
"UP035",
# usage of legacy `np.random` function calls
"NPY002",
]
ignore = [
# whitespace before :
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/benchmarks/API/bench_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,13 @@ def bench_pivot_table_simple(benchmark, dataframe):

@pytest_cases.parametrize("nr", NUM_ROWS)
def bench_crosstab_simple(benchmark, nr):
rng = np.random.default_rng(seed=0)
series_a = np.array(["foo", "bar"] * nr)
series_b = np.array(["one", "two"] * nr)
series_c = np.array(["dull", "shiny"] * nr)
np.random.shuffle(series_a)
np.random.shuffle(series_b)
np.random.shuffle(series_c)
rng.shuffle(series_a)
rng.shuffle(series_b)
rng.shuffle(series_c)
series_a = cudf.Series(series_a)
series_b = cudf.Series(series_b)
series_c = cudf.Series(series_c)
Expand Down
12 changes: 7 additions & 5 deletions python/cudf/benchmarks/API/bench_multiindex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

"""Benchmarks of MultiIndex methods."""

Expand All @@ -11,16 +11,18 @@
@pytest.fixture
def pidx():
num_elements = int(1e3)
a = np.random.randint(0, num_elements // 10, num_elements)
b = np.random.randint(0, num_elements // 10, num_elements)
rng = np.random.default_rng(seed=0)
a = rng.integers(0, num_elements // 10, num_elements)
b = rng.integers(0, num_elements // 10, num_elements)
return pd.MultiIndex.from_arrays([a, b], names=("a", "b"))


@pytest.fixture
def midx(pidx):
num_elements = int(1e3)
a = np.random.randint(0, num_elements // 10, num_elements)
b = np.random.randint(0, num_elements // 10, num_elements)
rng = np.random.default_rng(seed=0)
a = rng.integers(0, num_elements // 10, num_elements)
b = rng.integers(0, num_elements // 10, num_elements)
df = cudf.DataFrame({"a": a, "b": b})
return cudf.MultiIndex.from_frame(df)

Expand Down
15 changes: 8 additions & 7 deletions python/cudf/cudf/_fuzz_testing/avro.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import copy
import io
Expand Down Expand Up @@ -68,12 +68,12 @@ def generate_input(self):
# https://github.com/rapidsai/cudf/issues/6604
- cudf.utils.dtypes.TIMEDELTA_TYPES
)

seed = random.randint(0, 2**32 - 1)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2**32 - 1)

self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_cols"] = num_cols
Expand All @@ -100,17 +100,18 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if values == ALL_POSSIBLE_VALUES:
if param == "columns":
col_size = self._rand(len(self._df.columns))
params_dict[param] = list(
np.unique(np.random.choice(self._df.columns, col_size))
np.unique(rng.choice(self._df.columns, col_size))
)
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
params_dict[param] = rng.choice(
[None, self._rand(len(self._df))]
)
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
40 changes: 18 additions & 22 deletions python/cudf/cudf/_fuzz_testing/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def generate_input(self):
random.seed(seed)
dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
Expand All @@ -77,25 +77,22 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if values == ALL_POSSIBLE_VALUES:
if param == "usecols":
col_size = self._rand(len(self._df.columns))
col_val = np.random.choice(
col_val = rng.choice(
[
None,
np.unique(
np.random.choice(self._df.columns, col_size)
),
np.unique(rng.choice(self._df.columns, col_size)),
]
)
params_dict[param] = (
col_val if col_val is None else list(col_val)
)
elif param == "dtype":
dtype_val = np.random.choice(
[None, self._df.dtypes.to_dict()]
)
dtype_val = rng.choice([None, self._df.dtypes.to_dict()])
if dtype_val is not None:
dtype_val = {
col_name: "category"
Expand All @@ -105,25 +102,25 @@ def set_rand_params(self, params):
}
params_dict[param] = dtype_val
elif param == "header":
header_val = np.random.choice(
["infer", np.random.randint(low=0, high=len(self._df))]
header_val = rng.choice(
["infer", rng.integers(low=0, high=len(self._df))]
)
params_dict[param] = header_val
elif param == "skiprows":
params_dict[param] = np.random.randint(
params_dict[param] = rng.integers(
low=0, high=len(self._df)
)
elif param == "skipfooter":
params_dict[param] = np.random.randint(
params_dict[param] = rng.integers(
low=0, high=len(self._df)
)
elif param == "nrows":
nrows_val = np.random.choice(
[None, np.random.randint(low=0, high=len(self._df))]
nrows_val = rng.choice(
[None, rng.integers(low=0, high=len(self._df))]
)
params_dict[param] = nrows_val
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)


Expand Down Expand Up @@ -159,7 +156,7 @@ def generate_input(self):
random.seed(seed)
dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
Expand All @@ -182,26 +179,25 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if values == ALL_POSSIBLE_VALUES:
if param == "columns":
col_size = self._rand(len(self._current_buffer.columns))
params_dict[param] = list(
np.unique(
np.random.choice(
self._current_buffer.columns, col_size
)
rng.choice(self._current_buffer.columns, col_size)
)
)
elif param == "chunksize":
params_dict[param] = np.random.choice(
params_dict[param] = rng.choice(
[
None,
np.random.randint(
rng.integers(
low=1, high=max(1, len(self._current_buffer))
),
]
)
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
5 changes: 3 additions & 2 deletions python/cudf/cudf/_fuzz_testing/io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import copy
import json
Expand Down Expand Up @@ -91,8 +91,9 @@ def get_next_regression_params(self):
return dtypes_meta, num_rows, num_cols, seed

def set_rand_params(self, params):
rng = np.random.default_rng(seed=None)
params_dict = {
param: np.random.choice(values) for param, values in params.items()
param: rng.choice(values) for param, values in params.items()
}
self._current_params["test_kwargs"] = self.process_kwargs(
params_dict=params_dict
Expand Down
14 changes: 8 additions & 6 deletions python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def generate_input(self):
# https://github.com/rapidsai/cudf/issues/7086
# dtypes_list.extend(["list"])
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
Expand All @@ -105,14 +105,15 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if param == "dtype" and values == ALL_POSSIBLE_VALUES:
dtype_val = np.random.choice(
dtype_val = rng.choice(
[True, self._current_buffer.dtypes.to_dict()]
)
params_dict[param] = _get_dtype_param_value(dtype_val)
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)


Expand Down Expand Up @@ -155,7 +156,7 @@ def generate_input(self):
# https://github.com/rapidsai/cudf/issues/7086
# dtypes_list.extend(["list"])
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
Expand All @@ -180,12 +181,13 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if param == "dtype" and values == ALL_POSSIBLE_VALUES:
dtype_val = np.random.choice(
dtype_val = rng.choice(
[True, self._current_buffer.dtypes.to_dict()]
)
params_dict[param] = _get_dtype_param_value(dtype_val)
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
28 changes: 12 additions & 16 deletions python/cudf/cudf/_fuzz_testing/orc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import copy
import io
Expand Down Expand Up @@ -62,13 +62,11 @@ def generate_input(self):
- cudf.utils.dtypes.UNSIGNED_TYPES
- {"datetime64[ns]"}
)

seed = random.randint(0, 2**32 - 1)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)

self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2**32 - 1)
self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_cols"] = num_cols
Expand All @@ -94,42 +92,41 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if values == ALL_POSSIBLE_VALUES:
if param == "columns":
col_size = self._rand(len(self._df.columns))
params_dict[param] = list(
np.unique(np.random.choice(self._df.columns, col_size))
np.unique(rng.choice(self._df.columns, col_size))
)
elif param == "stripes":
f = io.BytesIO(self._current_buffer)
orcFile = pa.orc.ORCFile(f)
stripes = list(range(orcFile.nstripes))
params_dict[param] = np.random.choice(
params_dict[param] = rng.choice(
[
None,
list(
map(
int,
np.unique(
np.random.choice(
stripes, orcFile.nstripes
)
rng.choice(stripes, orcFile.nstripes)
),
)
),
]
)
elif param == "use_index":
params_dict[param] = np.random.choice([True, False])
params_dict[param] = rng.choice([True, False])
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
params_dict[param] = rng.choice(
[None, self._rand(len(self._df))]
)
else:
if not isinstance(values, list):
raise TypeError("values must be of type list")
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)


Expand Down Expand Up @@ -177,12 +174,11 @@ def generate_input(self):
# https://github.com/rapidsai/cudf/issues/7355
- cudf.utils.dtypes.DATETIME_TYPES
)

seed = random.randint(0, 2**32 - 1)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2**32 - 1)
self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_cols"] = num_cols
Expand Down
Loading

0 comments on commit e493340

Please sign in to comment.