From b924a7dcda2109e7969ee18dd3a8927d88055372 Mon Sep 17 00:00:00 2001 From: MothNik Date: Sun, 19 May 2024 23:48:44 +0200 Subject: [PATCH] test/feat: implemented full tests for whittaker base --- tests/test_for_utils/test_whittaker_base.py | 558 ++++++++++++++++++++ 1 file changed, 558 insertions(+) create mode 100644 tests/test_for_utils/test_whittaker_base.py diff --git a/tests/test_for_utils/test_whittaker_base.py b/tests/test_for_utils/test_whittaker_base.py new file mode 100644 index 0000000..562ec70 --- /dev/null +++ b/tests/test_for_utils/test_whittaker_base.py @@ -0,0 +1,558 @@ +""" +Test suite for the utility functions in the :mod:`chemotools.utils.whittaker_base` +module. + +""" + +### Imports ### + +from math import log +from typing import Any, Tuple, Type, Union + +import numpy as np +import pytest + +from chemotools.utils import models +from chemotools.utils.whittaker_base.auto_lambda.shared import get_smooth_wrss +from chemotools.utils.whittaker_base.initialisation import ( + get_checked_lambda, + get_penalty_log_pseudo_det, +) +from chemotools.utils.whittaker_base.main import WhittakerLikeSolver +from chemotools.utils.whittaker_base.misc import get_weight_generator +from chemotools.utils.whittaker_base.solvers import solve_normal_equations +from tests.fixtures import noise_level_whittaker_auto_lambda # noqa: F401 +from tests.fixtures import spectrum_whittaker_auto_lambda # noqa: F401 +from tests.test_for_utils.utils_funcs import ( + find_whittaker_smooth_opt_lambda_log_marginal_likelihood, +) +from tests.test_for_utils.utils_models import ExpectedWhittakerSmoothLambda + +### Type Aliases ### + +_RealNumeric = Union[float, int] +_WhittakerMethod = Union[str, models.WhittakerSmoothMethods] +_LambdaSpecs = Union[_RealNumeric, Tuple[_RealNumeric, _RealNumeric, _WhittakerMethod]] +_LambdaSpecsOrFlawed = Union[_LambdaSpecs, str] + +### Constants ### + +_NAN: float = float("nan") + +### Test Suite ### + + +@pytest.mark.parametrize( + "combination", + [ + ( # Number 0 (fixed float) + 100.0, + ExpectedWhittakerSmoothLambda( + fixed_lambda=100.0, + auto_bounds=(_NAN, _NAN), + fit_auto=False, + method_used=models.WhittakerSmoothMethods.FIXED, + log_auto_bounds=(_NAN, _NAN), + ), + ), + ( # Number 1 (fixed integer) + 100, + ExpectedWhittakerSmoothLambda( + fixed_lambda=100.0, + auto_bounds=(_NAN, _NAN), + fit_auto=False, + method_used=models.WhittakerSmoothMethods.FIXED, + log_auto_bounds=(_NAN, _NAN), + ), + ), + ( # Number 2 (float search space, log marginal likelihood method enum) + (100.0, 10_000.0, models.WhittakerSmoothMethods.LOGML), + ExpectedWhittakerSmoothLambda( + fixed_lambda=_NAN, + auto_bounds=(100.0, 10_000.0), + fit_auto=True, + method_used=models.WhittakerSmoothMethods.LOGML, + log_auto_bounds=(log(100.0), log(10_000.0)), + ), + ), + ( # Number 3 (float search space, log marginal likelihood method string) + (100.0, 10_000.0, "logml"), + ExpectedWhittakerSmoothLambda( + fixed_lambda=_NAN, + auto_bounds=(100.0, 10_000.0), + fit_auto=True, + method_used=models.WhittakerSmoothMethods.LOGML, + log_auto_bounds=(log(100.0), log(10_000.0)), + ), + ), + ( # Number 4 (integer search space, log marginal likelihood method enum) + (100, 10_000, models.WhittakerSmoothMethods.LOGML), + ExpectedWhittakerSmoothLambda( + fixed_lambda=_NAN, + auto_bounds=(100.0, 10_000.0), + fit_auto=True, + method_used=models.WhittakerSmoothMethods.LOGML, + log_auto_bounds=(log(100.0), log(10_000.0)), + ), + ), + ( # Number 5 (integer search space, log marginal likelihood method string) + (100, 10_000, 
"logml"), + ExpectedWhittakerSmoothLambda( + fixed_lambda=_NAN, + auto_bounds=(100.0, 10_000.0), + fit_auto=True, + method_used=models.WhittakerSmoothMethods.LOGML, + log_auto_bounds=(log(100.0), log(10_000.0)), + ), + ), + ( # Number 6 (dataclass float specification; fixed method) + models.WhittakerSmoothLambda( + bounds=100.0, + method=models.WhittakerSmoothMethods.FIXED, + ), + ExpectedWhittakerSmoothLambda( + fixed_lambda=100.0, + auto_bounds=(_NAN, _NAN), + fit_auto=False, + method_used=models.WhittakerSmoothMethods.FIXED, + log_auto_bounds=(_NAN, _NAN), + ), + ), + ( # Number 7 (dataclass integer specification; fixed method) + models.WhittakerSmoothLambda( + bounds=100, + method=models.WhittakerSmoothMethods.FIXED, + ), + ExpectedWhittakerSmoothLambda( + fixed_lambda=100.0, + auto_bounds=(_NAN, _NAN), + fit_auto=False, + method_used=models.WhittakerSmoothMethods.FIXED, + log_auto_bounds=(_NAN, _NAN), + ), + ), + ( # Number 8 (dataclass float specification; log marginal likelihood method) + models.WhittakerSmoothLambda( + bounds=(100.0, 10_000.0), + method=models.WhittakerSmoothMethods.LOGML, + ), + ExpectedWhittakerSmoothLambda( + fixed_lambda=_NAN, + auto_bounds=(100.0, 10_000.0), + fit_auto=True, + method_used=models.WhittakerSmoothMethods.LOGML, + log_auto_bounds=(log(100.0), log(10_000.0)), + ), + ), + ( # Number 9 (dataclass integer specification; log marginal likelihood method) + models.WhittakerSmoothLambda( + bounds=(100, 10_000), + method=models.WhittakerSmoothMethods.LOGML, + ), + ExpectedWhittakerSmoothLambda( + fixed_lambda=_NAN, + auto_bounds=(100.0, 10_000.0), + fit_auto=True, + method_used=models.WhittakerSmoothMethods.LOGML, + log_auto_bounds=(log(100.0), log(10_000.0)), + ), + ), + ( # Number 10 (wrong length tuple) + (100.0, 10_000.0), + ValueError, + ), + ( # Number 11 (wrong type) + "error", + TypeError, + ), + ], +) +def test_get_checked_lambda( + combination: Tuple[ + _LambdaSpecsOrFlawed, Union[ExpectedWhittakerSmoothLambda, Type[Exception]] + ] +) -> None: + """ + Tests the function that casts a penalty weight lambda to the respective dataclass. + + The ``combination`` parameter defines + + - the lambda specification to be used and + - the expected result (will be an exception if the input should be considered + invalid by the function). + + """ + + # the input parameters are unpacked + lam, expected_result = combination + + # if the expected output is an exception, the test is run in a context manager + if not isinstance(expected_result, ExpectedWhittakerSmoothLambda): + with pytest.raises(expected_result): + get_checked_lambda(lam=lam) + + return + + # otherwise, the output dataclass is compared to the expected output + lambda_model = get_checked_lambda(lam=lam) + if isinstance(lambda_model, models.WhittakerSmoothLambda): + expected_result.assert_is_equal_to(other=lambda_model) + + return + + raise AssertionError( + "The lambda value could not be checked correctly since the returned value is " + "not an instance of the class 'WhittakerSmoothLambda'." 
+    )
+
+
+@pytest.mark.parametrize(
+    "combination",
+    [
+        (None, 1.0),  # Number 0
+        (  # Number 1
+            np.ones(shape=(10, 1_000), dtype=np.float64),
+            np.ones(shape=(1_000,), dtype=np.float64),
+        ),
+        (  # Number 2
+            np.ones(shape=(5, 1_000), dtype=np.float64),
+            IndexError,
+        ),
+        (  # Number 3
+            np.ones(shape=(1_000,), dtype=np.float64),
+            ValueError,
+        ),
+        (  # Number 4
+            np.ones(shape=(1, 5, 1_000), dtype=np.float64),
+            ValueError,
+        ),
+        ("error", TypeError),  # Number 5
+    ],
+)
+def test_weight_generator(
+    combination: Tuple[Any, Union[np.ndarray, float, Type[Exception]]]
+) -> None:
+    """
+    Tests the weight generator.
+
+    The ``combination`` parameter defines
+
+    - the weights to be used and
+    - the expected output at each iteration (will be an exception if the input should
+      be considered invalid by the function).
+
+    """
+
+    # the input parameters are unpacked
+    weights, expected_output = combination
+
+    # the number of series is defined
+    n_series = 10
+
+    # if the expected output is an exception, the test is run in a context manager
+    if not isinstance(expected_output, (np.ndarray, float, int)):
+        with pytest.raises(expected_output):
+            for _ in get_weight_generator(w=weights, n_series=n_series):
+                pass
+
+        return
+
+    # otherwise, the output is compared to the expected output
+    # Case 1: the expected output is a scalar
+    if isinstance(expected_output, (float, int)):
+        for w in get_weight_generator(w=weights, n_series=n_series):
+            assert isinstance(w, (float, int))
+            assert w == expected_output
+
+        return
+
+    # Case 2: the expected output is an array
+    for w in get_weight_generator(w=weights, n_series=n_series):
+        assert isinstance(w, np.ndarray)
+        assert np.array_equal(w, expected_output)
+
+
+@pytest.mark.parametrize("combination", [(True, 2_449_755_000.0), (False, 490_000.0)])
+def test_smooth_wrss(combination: Tuple[bool, float]) -> None:
+    """
+    Tests the weighted residual sum of squares (WRSS) calculation.
+
+    The ``combination`` parameter defines
+
+    - whether weights are used (``True``) or not (``False``) and
+    - the expected weighted residual sum of squares.
+
+    Since every residual is exactly ``7.0`` in absolute value, the expected values
+    follow as ``49.0 * 10_000 = 490_000.0`` without weights and, with the weights
+    ``0.0, 1.0, ..., 9_999.0``, as ``49.0 * sum(range(10_000)) = 2_449_755_000.0``.
+
+    """
+
+    # the input parameters are unpacked
+    with_weights, wrss_expected = combination
+
+    # two series are generated whose corresponding elements differ by exactly 7.0 in
+    # absolute value
+    np.random.seed(42)
+    n_data = 10_000
+    a_signs = np.random.choice([-1.0, 1.0], size=(n_data,), replace=True)
+    a_series = a_signs * 4.5
+    b_series = (-1.0) * a_signs * 2.5
+
+    # the weights are generated
+    weights = (
+        np.arange(start=0, stop=n_data, step=1.0, dtype=np.float64)
+        if with_weights
+        else 1.0
+    )
+
+    # the wrss is calculated ...
+    wrss = get_smooth_wrss(b=a_series, b_smooth=b_series, w=weights)
+
+    # ... and compared to the expected value with a very strict tolerance
+    assert np.isclose(wrss, wrss_expected, atol=1e-13, rtol=0.0)
+
+
+# TODO: due to ill-conditioning, this is highly limited in the differences and number
+#       of data points; in the future, this should be tackled by QR-decomposition for
+#       extra numerical stability
+@pytest.mark.parametrize(
+    "differences_and_n_data_from_to",
+    [
+        (1, 0, 2_000),
+        (1, 2_001, 4_000),
+        (1, 4_001, 6_000),
+        (1, 6_001, 8_000),
+        (1, 8_001, 10_000),
+        (2, 0, 2_000),
+        (2, 2_001, 4_000),
+        (2, 4_001, 6_000),
+        (2, 6_001, 8_000),
+        (2, 8_001, 10_000),
+    ],
+)
+def test_penalty_log_pseudo_det_can_compute(
+    differences_and_n_data_from_to: Tuple[int, int, int]
+) -> None:
+    """
+    Tests that the log pseudo-determinant of the penalty matrix can be computed for
+    all the given difference orders and numbers of data points.
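+
+    For orientation: the log pseudo-determinant is the sum of the natural logarithms
+    of the non-zero eigenvalues of the penalty matrix, which is assumed here to be
+    ``D.T @ D`` with ``D`` the forward difference matrix of the given order. A dense
+    NumPy sketch of this reference quantity (illustrative only, far slower than the
+    tested function)::
+
+        import numpy as np
+
+        n_data, differences = 50, 2
+        diff_mat = np.diff(np.eye(n_data), n=differences, axis=0)
+        eigenvalues = np.linalg.eigvalsh(diff_mat.T @ diff_mat)
+        # the first ``differences`` eigenvalues are (numerically) zero and skipped
+        log_pseudo_det = np.log(eigenvalues[differences::]).sum()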
+ + """ + + differences, n_data_from, n_data_to = differences_and_n_data_from_to + for nd in range(max(differences + 1, n_data_from), n_data_to + 1): + get_penalty_log_pseudo_det(n_data=nd, differences=differences, dtype=np.float64) + + +# TODO: this test will not 100% reflect reality as intended; in the future this should +# be tested with the LAPACK function ``dgbcon`` to check the condition number; +# right now, it is set to a number of data points that causes the intended +# failure, but in the future, the condition number has to be used to detect +# ill-conditioning +def test_penalty_log_pseudo_det_breaks_ill_conditioned() -> None: + """ + Tests that the log pseudo-determinant of the penalty matrix breaks when the matrix + is ill-conditioned. + + """ + + # the difference order and number of data points are set so high that the matrix + # becomes ill-conditioned + n_data = 1_000 + differences = 10 + + # the function is tested for breaking + with pytest.raises(RuntimeError): + get_penalty_log_pseudo_det( + n_data=n_data, differences=differences, dtype=np.float64 + ) + + return + + +# TODO: this test will not 100% reflect reality as intended; in the future this should +# be tested with the LAPACK function ``dgbcon`` to check the condition number; +# right now, the matrix is heavily altered to cause the intended failure, but in +# the future, the condition number has to be used to detect ill-conditioning +@pytest.mark.parametrize("with_pentapy", [True, False]) +def test_normal_condition_solve_breaks_ill_conditioned(with_pentapy: bool) -> None: + """ + Tests that the normal condition solver breaks when the matrix is ill-conditioned. + + Note that the conditions for the solver to break will never be met in practice. + + """ + + # if pentapy is not installed but required, the test is skipped + if with_pentapy: + try: + import pentapy # noqa: F401 + except ImportError: + pytest.skip("Pentapy is not installed.") + + # a banded ill-conditioned matrix is created that has zeros on the diagonal + n_data = 10_000 + differences = 2 + a_banded = np.ones(shape=(2 * differences + 1, n_data), dtype=np.float64) + a_banded[differences, :] = 0.0 + + # some further required variables are initialised + lam = 1e100 + b_vect = np.ones(shape=(n_data,), dtype=np.float64) + weights = 0.0 + + # Test that the solver breaks + with pytest.raises(RuntimeError): + solve_normal_equations( + lam=lam, + differences=differences, + l_and_u=(differences, differences), + penalty_mat_banded=a_banded, + b_weighted=b_vect, + w=weights, + pentapy_enabled=with_pentapy, + ) + + +def test_whittakerlike_issues_warning_difference_order_too_high() -> None: + """ + Tests that the class :class:`WhittakerLikeSolver` issues a warning when the + difference order is greater than 2. 
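+
+    The warning is presumably issued because the penalty matrix becomes increasingly
+    ill-conditioned for difference orders above 2 (compare the TODO notes in this
+    module). A purely illustrative dense sketch that hints at this: the ratio of the
+    largest to the smallest non-zero eigenvalue of the penalty matrix explodes with
+    the order, and for ``differences=3`` the smallest non-zero eigenvalue already
+    drowns in floating point noise::
+
+        import numpy as np
+
+        n_data = 500
+        for differences in (1, 2, 3):
+            diff_mat = np.diff(np.eye(n_data), n=differences, axis=0)
+            eigenvalues = np.linalg.eigvalsh(diff_mat.T @ diff_mat)
+            print(differences, eigenvalues[-1] / eigenvalues[differences])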
+
+    """
+
+    with pytest.warns(UserWarning):
+        whitt_base = WhittakerLikeSolver()
+        whitt_base._setup_for_fit(
+            n_data=500,
+            differences=3,
+            lam=models.WhittakerSmoothLambda(
+                bounds=(100.0, 10_000.0),
+                method=models.WhittakerSmoothMethods.LOGML,
+            ),
+            child_class_name="pytest_run",
+        )
+
+    return
+
+
+# TODO: this can only go to differences 2 due to ill-conditioning; in the future, this
+#       should be tackled by QR-decomposition for extra numerical stability
+@pytest.mark.parametrize("same_weights_for_all", [True, False])
+@pytest.mark.parametrize("differences", [1, 2])
+def test_auto_lambda_log_marginal_likelihood_refuses_no_weights(
+    differences: int,
+    same_weights_for_all: bool,
+) -> None:
+    """
+    Tests that the automatic lambda calculation using the log marginal likelihood
+    method refuses to work without weights.
+
+    """
+
+    # the smoother is initialised ...
+    n_data = 500
+    whitt_base = WhittakerLikeSolver()
+    whitt_base._setup_for_fit(
+        n_data=n_data,
+        differences=differences,
+        lam=models.WhittakerSmoothLambda(
+            bounds=(100.0, 10_000.0),
+            method=models.WhittakerSmoothMethods.LOGML,
+        ),
+        child_class_name="pytest_run",
+    )
+
+    # ... and the log marginal likelihood method is called without weights
+    np.random.seed(42)
+    X = np.random.rand(n_data)
+    with pytest.raises(ValueError):
+        whitt_base._whittaker_solve(
+            X=X,
+            w=None,
+            use_same_w_for_all=same_weights_for_all,
+        )
+
+
+@pytest.mark.parametrize("with_zero_weights", [True, False])
+@pytest.mark.parametrize("same_weights_for_all", [True, False])
+@pytest.mark.parametrize("differences", [1, 2])
+@pytest.mark.parametrize("n_series", [1, 5])
+def test_auto_lambda_log_marginal_likelihood(
+    spectrum_whittaker_auto_lambda: np.ndarray,  # noqa: F811
+    noise_level_whittaker_auto_lambda: np.ndarray,  # noqa: F811
+    n_series: int,
+    differences: int,
+    same_weights_for_all: bool,
+    with_zero_weights: bool,
+) -> None:
+    """
+    Tests the automatic lambda calculation using the log marginal likelihood method.
+
+    Some of the noise standard deviations in the respective fixture are set to NaN,
+    which allows for two different ways of handling them:
+
+    - zero weights, which set the weights of the NaN values to zero, or
+    - interpolated weights, which replace the NaN values with linearly interpolated
+      values that cannot be zero.
+
+    This has slightly different effects on the log marginal likelihood calculation.
+
+    Everything is tested against a from-scratch implementation based on SciPy to
+    ensure that the test is decoupled from the actual implementation used in
+    Chemotools.
+
+    """
+
+    # first of all, the NaN values in the noise level are handled
+    noise_level = noise_level_whittaker_auto_lambda.copy()
+
+    # Case 1: Zero weights
+    if with_zero_weights:
+        # this can be achieved by replacing the NaN values with +inf
+        noise_level = np.where(np.isnan(noise_level), np.inf, noise_level)
+
+    # Case 2: Interpolated weights
+    else:
+        # the NaN values are replaced by linearly interpolated values
+        nan_flags = np.isnan(noise_level)
+        noise_level[nan_flags] = np.interp(
+            x=np.where(nan_flags)[0],
+            xp=np.where(~nan_flags)[0],
+            fp=noise_level[~nan_flags],
+        )
+
+    # then, the weights are computed as the square of the inverse noise level ...
+    weights = (1.0 / np.square(noise_level))[np.newaxis, ::]
+    # ... and stacked as many times as required
+    weights = np.tile(weights, reps=(n_series, 1))
+
+    # then, the spectrum is repeated as many times as required
+    X = np.tile(spectrum_whittaker_auto_lambda[np.newaxis, ::], reps=(n_series, 1))
+
+    # the smoothing is performed using the chemotools implementation
+    lambda_bounds = (1e-15, 1e10)
+    whitt_base = WhittakerLikeSolver()
+    whitt_base._setup_for_fit(
+        n_data=X.shape[1],
+        differences=differences,
+        lam=models.WhittakerSmoothLambda(
+            bounds=lambda_bounds,
+            method=models.WhittakerSmoothMethods.LOGML,
+        ),
+        child_class_name="pytest_run",
+    )
+    _, lambda_opts = whitt_base._whittaker_solve(
+        X=X,
+        w=weights,
+        use_same_w_for_all=same_weights_for_all,
+    )
+
+    # the reference optimum lambda is found by a from-scratch implementation that
+    # relies on dense matrices
+    lambda_opt_ref, _, _ = find_whittaker_smooth_opt_lambda_log_marginal_likelihood(
+        b_vect=X[0, ::],
+        weight_vect=weights[0, ::],
+        differences=differences,
+        log_lambda_bounds=(log(lambda_bounds[0]), log(lambda_bounds[1])),
+        n_opts=100,
+    )
+
+    # the results are compared with 1% relative tolerance
+    for lam_opt in lambda_opts:
+        assert np.isclose(lam_opt, lambda_opt_ref, rtol=1e-2)
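+
+
+# NOTE: the helper below is purely illustrative and not executed by the suite; it is
+#       a minimal dense sketch of the Whittaker normal equations
+#       ``(W + lam * D.T @ D) @ x = W @ b`` that the banded solvers exercised above
+#       are assumed to implement far more efficiently
+def _whittaker_smooth_dense_sketch(
+    b: np.ndarray,
+    w: np.ndarray,
+    lam: float,
+    differences: int,
+) -> np.ndarray:
+    # the dense forward difference matrix of the given order and the penalty matrix
+    diff_mat = np.diff(np.eye(b.size), n=differences, axis=0)
+    penalty_mat = diff_mat.T @ diff_mat
+    # the normal equations are solved with the weights on the diagonal
+    return np.linalg.solve(np.diag(w) + lam * penalty_mat, w * b)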