Complete and modify test_evaluate_ar_model.py and test_fit_ar_model.py so that all functions are tested and the number of assert statements per test function is reduced.

Lenr4 · Lenr4 · commit 1d3a3cf40566 · 2025-02-25T11:53:01.000+01:00
diff --git a/tests/analysis/test_evaluate_ar_model.py b/tests/analysis/test_evaluate_ar_model.py
@@ -9,6 +9,7 @@
     _calculate_bic,
     _compute_residuals,
     _predict_ar,
+    evaluate_ar_models,
 )
 
 Max_Aic_Bic = 1e6
@@ -46,59 +47,58 @@ def test_model_results(test_dataframe):
 
 
 def test_compute_residuals(test_dataframe, test_model_results):
+    """Test _compute_residuals returns a pd.Series of expected length, not all NaN."""
     residuals = _compute_residuals(test_dataframe, test_model_results)
 
-    assert len(residuals) == len(test_dataframe) - (
-        len(test_model_results["integrated_coefficients"]) - 1
+    assert all(
+        [
+            isinstance(residuals, pd.Series),
+            len(residuals)
+            == len(test_dataframe)
+            - (len(test_model_results["integrated_coefficients"]) - 1),
+            not np.isnan(residuals).all(),
+        ]
     )
-    assert isinstance(residuals, pd.Series)
-    assert not np.isnan(residuals).all()
 
 
 def test_calculate_aic_output(test_dataframe, test_model_results):
+    """Test that _calculate_aic returns a valid float within a reasonable range."""
     residuals = _compute_residuals(test_dataframe, test_model_results)
     aic = _calculate_aic(residuals, p=2)
 
-    assert isinstance(aic, float)
-    assert aic < Max_Aic_Bic
+    assert all([isinstance(aic, float), aic < Max_Aic_Bic])
 
 
 def test_calculate_aic_correctness(test_dataframe, test_model_results):
+    """Test that _calculate_aic computes a nearly equal value to the formula."""
     residuals = _compute_residuals(test_dataframe, test_model_results)
 
     n = len(residuals)
-
     p = 2
-
     sigma_squared = np.var(residuals, ddof=1)
-
     expected_aic = 2 * p + n * np.log(sigma_squared)
-
     computed_aic = _calculate_aic(residuals, p)
     assert np.isclose(computed_aic, expected_aic, atol=1.5), (
         f"AIC does not match: expected {expected_aic}, got {computed_aic}"
     )
 
 
 def test_calculate_bic_output(test_dataframe, test_model_results):
+    """Test that _calculate_bic returns a valid float within a reasonable range."""
     residuals = _compute_residuals(test_dataframe, test_model_results)
     bic = _calculate_bic(residuals, p=2)
 
-    assert isinstance(bic, float)
-    assert bic < Max_Aic_Bic
+    assert all([isinstance(bic, float), bic < Max_Aic_Bic])
 
 
 def test_calculate_bic_correctness(test_dataframe, test_model_results):
+    """Test that _calculate_bic computes a nearly equal value to the formula."""
     residuals = _compute_residuals(test_dataframe, test_model_results)
 
     n = len(residuals)
-
     p = 2
-
     sigma_squared = np.var(residuals, ddof=1)
-
     expected_bic = p * np.log(n) + n * np.log(sigma_squared)
-
     computed_bic = _calculate_bic(residuals, p)
 
     assert np.isclose(computed_bic, expected_bic, atol=5.0), (
@@ -107,10 +107,67 @@ def test_calculate_bic_correctness(test_dataframe, test_model_results):
 
 
 def test_predict_ar(test_dataframe, test_model_results):
+    """Test _predict_ar returns an np.ndarray of expected length, not all NaN."""
     predictions = _predict_ar(test_dataframe, test_model_results)
 
-    assert len(predictions) == len(test_dataframe) - (
-        len(test_model_results["integrated_coefficients"]) - 1
+    assert all(
+        [
+            len(predictions)
+            == len(test_dataframe)
+            - (len(test_model_results["integrated_coefficients"]) - 1),
+            isinstance(predictions, np.ndarray),
+            not np.isnan(predictions).all(),
+        ]
     )
-    assert isinstance(predictions, np.ndarray)
-    assert not np.isnan(predictions).all()
+
+
+def test_evaluate_ar_models_output_structure(test_dataframe):
+    """Test that evaluate_ar_models returns a dictionary with expected keys."""
+    results = evaluate_ar_models(test_dataframe, max_p=5, criterion="aic")
+    assert set(results.keys()) == {"top_models", "model_metrics", "metadata"}
+
+
+def test_evaluate_ar_models_top_models_length(test_dataframe):
+    """Test that 'top_models' contains up to 3 models."""
+    count_top_models = 3
+    results = evaluate_ar_models(test_dataframe, max_p=5, criterion="aic")
+    assert len(results["top_models"]) <= count_top_models
+
+
+def test_evaluate_ar_models_model_metrics_length(test_dataframe):
+    """Test that 'model_metrics' contains max_p models."""
+    max_p = 5
+    results = evaluate_ar_models(test_dataframe, max_p=max_p, criterion="aic")
+    assert len(results["model_metrics"]) == max_p
+
+
+def test_evaluate_ar_models_metadata(test_dataframe):
+    """Test that 'metadata' contains correct max_p and criterion values."""
+    max_p = 5
+    criterion = "aic"
+    results = evaluate_ar_models(test_dataframe, max_p=max_p, criterion=criterion)
+    assert results["metadata"] == {"max_p": max_p, "criterion": criterion}
+
+
+def test_evaluate_ar_models_top_models_sorted(test_dataframe):
+    """Test that 'top_models' is in ascending AIC order for criterion='aic'."""
+    ranking = evaluate_ar_models(test_dataframe, max_p=5, criterion="aic")
+    aic_in_order = [entry["aic"] for entry in ranking["top_models"]]
+    assert sorted(aic_in_order) == aic_in_order
+
+
+def test_evaluate_ar_models_model_keys(test_dataframe):
+    """Test that every 'model_metrics' entry has at least the required keys."""
+    required_keys = {"p", "aic", "bic", "p_value", "differenced", "coefficients"}
+    metrics = evaluate_ar_models(test_dataframe, max_p=5, criterion="aic")
+
+    for entry in metrics["model_metrics"]:
+        assert required_keys <= set(entry.keys())
+
+
+def test_evaluate_ar_models_coefficients_type(test_dataframe):
+    """Test that 'coefficients' in model results are numpy arrays."""
+    results = evaluate_ar_models(test_dataframe, max_p=5, criterion="aic")
+
+    for model in results["model_metrics"]:
+        assert isinstance(model["coefficients"], np.ndarray)
diff --git a/tests/analysis/test_fit_ar_model.py b/tests/analysis/test_fit_ar_model.py
@@ -7,43 +7,71 @@
     _check_stationarity,
     _create_lagged_features,
     _difference_series,
+    _integrate_ar_coefficients,
+    fit_ar_model,
 )
 
-significane = 0.05
+significance = 0.05
 expected_coeff_count = 3  # For p=2: Intercept + 2 Lags
 expected_length_lagged = 3
 
 
-def test_check_stationarity():
+def test_check_stationarity_stationary():
+    """Test ADF stationarity check on a stationary time series."""
     rng = np.random.default_rng(12)
     df = pd.DataFrame({"price": rng.normal(0, 1, 100)})
-    stationary, p_value = _check_stationarity(df, "price", significance=significane)
-    assert bool(stationary) is True
 
-    # 2. Non-stationary series
-    df["price"] = np.linspace(1, 100, 100) + rng.normal(0, 0.5, 100)
-    stationary, p_value = _check_stationarity(df, "price", significance=significane)
-    assert bool(stationary) is False
-    assert p_value > significane
+    stationary, p_value, test_statistic = _check_stationarity(df, "price", significance)
+
+    assert all(
+        [
+            bool(stationary) is True,
+            isinstance(p_value, float),
+            isinstance(test_statistic, float),
+        ]
+    )
+
+
+def test_check_stationarity_non_stationary():
+    """Test ADF stationarity check on a nonstationary time series."""
+    rng = np.random.default_rng(12)
+    df = pd.DataFrame({"price": np.linspace(1, 100, 100) + rng.normal(0, 0.5, 100)})
+
+    stationary, p_value, test_statistic = _check_stationarity(df, "price", significance)
+
+    assert all(
+        [
+            bool(stationary) is False,
+            p_value > significance,
+            isinstance(test_statistic, float),
+        ]
+    )
 
 
 def test_difference_series():
+    """Test differencing function for correct column creation and values."""
     df = pd.DataFrame({"price": [100, 101, 103, 106]})
     result = _difference_series(df.copy(), "price")
-    assert "diff_price" in result.columns
-    # Check that the second value of the differenced series equals 1.
-    assert result["diff_price"].iloc[1] == 1
+
+    assert all(["diff_price" in result.columns, result["diff_price"].iloc[1] == 1])
 
 
 def test_create_lagged_features():
+    """Test creation of lagged features for an AR model."""
     df = pd.DataFrame({"price": [10, 20, 30, 40, 50]})
     result = _create_lagged_features(df.copy(), "price", 2)
-    assert "price_lag1" in result.columns
-    assert "price_lag2" in result.columns
-    assert len(result) == expected_length_lagged
+
+    assert all(
+        [
+            "price_lag1" in result.columns,
+            "price_lag2" in result.columns,
+            len(result) == expected_length_lagged,
+        ]
+    )
 
 
 def test_ar_model_output():
+    """Test output structure and type of AR model fitting function."""
     rng = np.random.default_rng(312)
 
     dates = pd.date_range("2020-01-01", periods=10, freq="D")
@@ -62,11 +90,11 @@ def test_ar_model_output():
 
     coeffs = _ar_model(df, "price", 2)
 
-    assert len(coeffs) == expected_coeff_count
-    assert isinstance(coeffs, np.ndarray)
+    assert all([len(coeffs) == expected_coeff_count, isinstance(coeffs, np.ndarray)])
 
 
 def test_ar_model_correctness():
+    """Test correctness of AR model coefficients against a reference implementation."""
     rng = np.random.default_rng(2)
     dates = pd.date_range("2020-01-01", periods=100, freq="D")
     df = pd.DataFrame(
@@ -89,3 +117,82 @@ def test_ar_model_correctness():
         f"Reference: {reference_coeffs}\n"
         f"Difference: {custom_coeffs - reference_coeffs}"
     )
+
+
+def test_integrate_ar_coefficients_no_differencing():
+    """Test integration of AR coefficients when no differencing is applied."""
+    diff_coeffs = np.array([0.5, -0.2, 0.1])
+
+    result = _integrate_ar_coefficients(diff_coeffs, differenced=False)
+    expected_lags = ["Intercept", "Lag 1", "Lag 2"]
+    assert all(
+        [
+            np.allclose(result["coefficient"].to_numpy(), diff_coeffs),
+            list(result["lag"]) == expected_lags,
+        ]
+    )
+
+
+def test_integrate_ar_coefficients_with_differencing():
+    """Test integration of AR coefficients when differencing is applied."""
+    diff_coeffs = np.array([0.5, -0.2, 0.1])
+
+    result = _integrate_ar_coefficients(diff_coeffs, differenced=True)
+
+    expected_coeffs = np.array([0.5, 1 - 0.2, -0.2 - 0.1, -0.1])
+    expected_lags = ["Intercept", "Lag 1", "Lag 2", "Lag 3"]
+    assert all(
+        [
+            np.allclose(result["coefficient"].to_numpy(), expected_coeffs),
+            list(result["lag"]) == expected_lags,
+        ]
+    )
+
+
+def test_fit_ar_model_stationary_series():
+    """Test AR model fitting on a stationary time series."""
+    rng = np.random.default_rng(42)
+    df = pd.DataFrame(
+        {"close_price": np.sin(np.linspace(0, 10, 100)) + rng.normal(0, 0.1, 100)}
+    )
+
+    result = fit_ar_model(df, column="close_price", p=2)
+
+    assert result["differenced"] is False
+
+
+def test_fit_ar_model_non_stationary_series():
+    """Test AR model fitting on a non-stationary time series."""
+    rng = np.random.default_rng(42)
+    df = pd.DataFrame(
+        {"close_price": np.cumsum(np.linspace(0.1, 1, 100)) + rng.normal(0, 0.1, 100)}
+    )
+
+    result = fit_ar_model(df, column="close_price", p=2)
+
+    assert result["differenced"] is True
+
+
+def test_fit_ar_model_coefficient_shape():
+    """Test if the number of coefficients matches AR order + intercept."""
+    rng = np.random.default_rng(42)
+    df = pd.DataFrame(
+        {"close_price": np.cos(np.linspace(0, 10, 100)) + rng.normal(0, 0.1, 100)}
+    )
+
+    p = 3
+    result = fit_ar_model(df, column="close_price", p=p)
+
+    assert result["coefficients"].shape[0] == p + 1
+
+
+def test_fit_ar_model_p_value():
+    """Test that the result dictionary holds a float under the 'p_value' key."""
+    rng = np.random.default_rng(42)
+    df = pd.DataFrame(
+        {"close_price": np.exp(np.linspace(0, 2, 100)) + rng.normal(0, 0.1, 100)}
+    )
+
+    result = fit_ar_model(df, column="close_price", p=2)
+
+    assert isinstance(result["p_value"], float)