diff --git a/esda/crand.py b/esda/crand.py index 523cfbd4..4577c8d7 100644 --- a/esda/crand.py +++ b/esda/crand.py @@ -4,25 +4,13 @@ import os import warnings - +from .significance import _permutation_significance import numpy as np try: - from numba import boolean, jit, njit, prange + from numba import boolean, njit, prange except (ImportError, ModuleNotFoundError): - - def jit(*dec_args, **dec_kwargs): # noqa: ARG001 - """ - decorator mimicking numba.jit - """ - - def intercepted_function(f, *f_args, **f_kwargs): # noqa: ARG001 - return f - - return intercepted_function - - njit = jit - + from libpysal.common import jit as njit prange = range boolean = bool @@ -35,7 +23,9 @@ def intercepted_function(f, *f_args, **f_kwargs): # noqa: ARG001 @njit(fastmath=True) -def vec_permutations(max_card: int, n: int, k_replications: int, seed: int): +def vec_permutations( + max_card: int, n: int, k_replications: int, seed: int + ): """ Generate `max_card` permuted IDs, sampled from `n` without replacement, `k_replications` times @@ -75,6 +65,7 @@ def crand( scaling=None, seed=None, island_weight=0, + alternative=None ): """ Conduct conditional randomization of a given input using the provided @@ -147,6 +138,24 @@ def crand( f"conditional randomization. Recieved `z` of shape {z.shape}" ) + if alternative is None: + warnings.warn( + "The alternative hypothesis for conditional randomization" + " is changing in the next major release of esda. We recommend" + " setting alternative='two-sided', which will generally" + " double the p-value returned." + " To retain the current behavior, set alternative='directed'." + " We strongly recommend moving to alternative='two-sided'.", + DeprecationWarning, + ) + # TODO: replace this with 'two-sided' by next major release + alternative = 'directed' + if alternative not in ("two-sided", "greater", "lesser", "directed", "folded"): + raise ValueError( + f"alternative='{alternative}' provided, but is not" + f" one of the supported options: 'two-sided', 'greater', 'lesser', 'directed', 'folded')" + ) + # paralellise over permutations? if seed is None: seed = np.random.randint(12345, 12345000) @@ -185,7 +194,7 @@ def crand( n_jobs = 1 if n_jobs == 1: - larger, rlocals = compute_chunk( + p_sims, rlocals = compute_chunk( 0, # chunk start z, # chunked z, for serial this is the entire data z, # all z, for serial this is also the entire data @@ -198,6 +207,7 @@ def crand( keep, # whether or not to keep the local statistics stat_func, island_weight, + alternative=alternative ) else: if n_jobs == -1: @@ -205,7 +215,7 @@ def crand( if n_jobs > len(z): n_jobs = len(z) # Parallel implementation - larger, rlocals = parallel_crand( + p_sims, rlocals = parallel_crand( z, observed, cardinalities, @@ -217,13 +227,10 @@ def crand( keep, stat_func, island_weight, + alternative=alternative ) - low_extreme = (permutations - larger) < larger - larger[low_extreme] = permutations - larger[low_extreme] - p_sim = (larger + 1.0) / (permutations + 1.0) - - return p_sim, rlocals + return p_sims, rlocals @njit(parallel=False, fastmath=True) @@ -240,6 +247,7 @@ def compute_chunk( keep: bool, stat_func, island_weight: float, + alternative: str ): """ Compute conditional randomisation for a single chunk @@ -302,11 +310,13 @@ def compute_chunk( the null of spatial randomness """ chunk_n = z_chunk.shape[0] - n = z.shape[0] - larger = np.zeros((chunk_n,), dtype=np.int64) + n_samples = z.shape[0] + p_permutations, k_max_card = permuted_ids.shape + p_sims = np.zeros((chunk_n,), dtype=np.float32) rlocals = np.empty((chunk_n, permuted_ids.shape[0])) if keep else np.empty((1, 1)) - mask = np.ones((n,), dtype=np.int8) == 1 + + mask = np.ones((n_samples,), dtype=np.int8) == 1 wloc = 0 for i in range(chunk_n): @@ -323,10 +333,13 @@ def compute_chunk( wloc += cardinality mask[chunk_start + i] = False rstats = stat_func(chunk_start + i, z, permuted_ids, weights_i, scaling) + p_sims[i] = _permutation_significance( + observed[i], rstats, alternative=alternative + ).item() if keep: rlocals[i] = rstats - larger[i] = np.sum(rstats >= observed[i]) - return larger, rlocals + + return p_sims, rlocals ####################################################################### @@ -449,6 +462,7 @@ def parallel_crand( keep: bool, stat_func, island_weight, + alternative: str = 'directed' ): """ Conduct conditional randomization in parallel using numba @@ -516,8 +530,8 @@ def parallel_crand( starts = np.arange(n_jobs + 1) * chunk_size # ------------------------------------------------------------------ # Set up output holders - larger = np.zeros((n,), dtype=np.int64) rlocals = np.empty((n, permuted_ids.shape[0])) if keep else np.empty((1, 1)) + # ------------------------------------------------------------------ # Joblib parallel loop by chunks @@ -536,14 +550,15 @@ def parallel_crand( with parallel_backend("loky", inner_max_num_threads=1): worker_out = Parallel(n_jobs=n_jobs)( delayed(compute_chunk)( - *pars, permuted_ids, scaling, keep, stat_func, island_weight + *pars, permuted_ids, scaling, keep, stat_func, island_weight, alternative ) for pars in chunks ) - larger, rlocals = zip(*worker_out, strict=True) - larger = np.hstack(larger).squeeze() + + p_sims, rlocals = zip(*worker_out) + p_sims = np.hstack(p_sims).squeeze() rlocals = np.row_stack(rlocals).squeeze() - return larger, rlocals + return p_sims, rlocals ####################################################################### diff --git a/esda/significance.py b/esda/significance.py new file mode 100644 index 00000000..8fb06be3 --- /dev/null +++ b/esda/significance.py @@ -0,0 +1,107 @@ +import numpy as np +import warnings + +try: + from numba import njit +except (ImportError, ModuleNotFoundError): + from libpysal.common import jit as njit + + +def calculate_significance(test_stat, reference_distribution, alternative="two-sided"): + """ + Calculate a pseudo p-value from a reference distribution. + + Pseudo-p values are calculated using the formula (M + 1) / (R + 1). Where R is the number of simulations + and M is the number of times that the simulated value was equal to, or more extreme than the observed test statistic. + + Parameters + ---------- + test_stat: float or numpy.ndarray + The observed test statistic, or a vector of observed test statistics + reference_distribution: numpy.ndarray + A numpy array containing simulated test statistics as a result of conditional permutation. + alternative: string + One of 'two-sided', 'lesser', 'greater', 'folded', or 'directed'. Indicates the alternative hypothesis. + - 'two-sided': the observed test statistic is in either tail of the reference distribution. This is an un-directed alternative hypothesis. + - 'folded': the observed test statistic is an extreme value of the reference distribution folded about its mean. This is an un-directed alternative hypothesis. + - 'lesser': the observed test statistic is small relative to the reference distribution. This is a directed alternative hypothesis. + - 'greater': the observed test statistic is large relative to the reference distribution. This is a directed alternative hypothesis. + - 'directed': the observed test statistic is in either tail of the reference distribution, but the tail is selected depending on the test statistic. This is a directed alternative hypothesis, but the direction is chosen dependent on the data. This is not advised, and included solely to reproduce past results. + + Notes + ----- + + the directed p-value is half of the two-sided p-value, and corresponds to running the + lesser and greater tests, then picking the smaller significance value. This is not advised, + since the p-value will be uniformly too small. + """ + reference_distribution = np.atleast_2d(reference_distribution) + n_samples, p_permutations = reference_distribution.shape + test_stat = np.atleast_2d(test_stat).reshape(n_samples, -1) + if alternative not in ( + 'folded', + 'two-sided', + 'greater', + 'lesser', + 'directed' + ): + raise ValueError( + f"alternative='{alternative}' provided, but is not" + f" one of the supported options: 'two-sided', 'greater', 'lesser', 'directed', 'folded')" + ) + return _permutation_significance( + test_stat, + reference_distribution, + alternative=alternative + ) + +@njit(parallel=False, fastmath=False) +def _permutation_significance(test_stat, reference_distribution, alternative='two-sided'): + reference_distribution = np.atleast_2d(reference_distribution) + n_samples, p_permutations = reference_distribution.shape + if alternative == "directed": + larger = (reference_distribution >= test_stat).sum(axis=1) + low_extreme = (p_permutations - larger) < larger + larger[low_extreme] = p_permutations - larger[low_extreme] + p_value = (larger + 1.0) / (p_permutations + 1.0) + elif alternative == "lesser": + p_value = (np.sum(reference_distribution <= test_stat, axis=1) + 1) / ( + p_permutations + 1 + ) + elif alternative == "greater": + p_value = (np.sum(reference_distribution >= test_stat, axis=1) + 1) / ( + p_permutations + 1 + ) + elif alternative == "two-sided": + # find percentile p at which the test statistic sits + # find "synthetic" test statistic at 1-p + # count how many observations are outisde of (p, 1-p) + # including the test statistic and its synthetic pair + percentile = (reference_distribution <= test_stat).mean(axis=1)*100 + lows = np.empty(n_samples) + highs = np.empty(n_samples) + for i in range(n_samples): + p_low = np.minimum(percentile[i], 100-percentile[i]) + lows[i] = np.percentile( + reference_distribution[i], + p_low + ) + highs[i] = np.percentile( + reference_distribution[i], + 100 - p_low + ) + n_outside = (reference_distribution <= lows[:,None]).sum(axis=1) + n_outside += (reference_distribution >= highs[:,None]).sum(axis=1) + p_value = (n_outside + 1) / (p_permutations + 1) + elif alternative == "folded": + means = reference_distribution.mean(axis=1, keepdims=True) + test_stat = np.abs(test_stat - means) + reference_distribution = np.abs(reference_distribution - means) + p_value = ((reference_distribution >= test_stat).sum(axis=1) + 1) / ( + p_permutations + 1 + ) + else: + p_value = np.ones((n_samples, 1))*np.nan + return p_value + + diff --git a/esda/tests/test_significance.py b/esda/tests/test_significance.py new file mode 100644 index 00000000..d0f4cf46 --- /dev/null +++ b/esda/tests/test_significance.py @@ -0,0 +1,60 @@ +import numpy +import esda +from esda.significance import calculate_significance +import pytest +from libpysal.weights import Voronoi + +numpy.random.seed(2478879) +coordinates = numpy.random.random(size=(800, 2)) +x = numpy.random.normal(size=(800,)) +w = Voronoi(coordinates, clip="bounding_box") +w.transform = "r" +stat = esda.Moran_Local(x, w, permutations=19) + +@pytest.mark.parametrize("alternative", ["two-sided", "directed", "lesser", "greater", "folded"]) +def test_execution_and_range(alternative): + out = calculate_significance(stat.Is, stat.rlisas, alternative=alternative) + assert (out > 0).all() & (out <= 1).all(), f'p-value out of bounds for method {alternative}' + if alternative == 'directed': + assert out.max() <= .5, f"max p-value is too large for method {alternative}" + else: + assert out.max() >= .5, f"max p-value is too small for method {alternative}" + + +def test_alternative_relationships(): + two_sided = calculate_significance( + stat.Is, + stat.rlisas, + alternative="two-sided" + ) + directed = calculate_significance( + stat.Is, + stat.rlisas, + alternative="directed" + ) + lesser = calculate_significance( + stat.Is, + stat.rlisas, + alternative="lesser" + ) + greater = calculate_significance( + stat.Is, + stat.rlisas, + alternative="greater" + ) + folded = calculate_significance( + stat.Is, + stat.rlisas, + alternative="folded" + ) + + numpy.testing.assert_allclose(lesser + greater, + numpy.ones_like(lesser) + (1/(stat.permutations + 1)), + err_msg = "greater p-value should be complement of lesser" + ) + assert (directed <= two_sided).all(), "directed is bigger than two_sided and should not be" + one_or_the_other = (directed == lesser) | (directed == greater) + assert one_or_the_other.all(), "some directed p-value is neither the greater nor lesser p-value" + assert numpy.abs(two_sided - folded).mean() < numpy.abs(lesser - folded).mean(), ( + "folded p-values should be more similar to the two-sided variant than the directed" + ) \ No newline at end of file