Skip to content

Commit c145c8d

Browse files
committed
Add docstrings for functions in evaluate_ar_model.py and fit_ar_model.py.
1 parent e4d3ef1 commit c145c8d

File tree

2 files changed

+136
-13
lines changed

2 files changed

+136
-13
lines changed

src/lennart_epp/analysis/evaluate_ar_model.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,63 @@
55

66

77
def _compute_residuals(df: pd.DataFrame, model_results: dict) -> np.ndarray:
    """Return observed closing prices minus the AR model's fitted values.

    Args:
        df (pd.DataFrame): Data holding the observed series in 'close_price'.
        model_results (dict): Fitted-model dictionary, forwarded unchanged to
            ``_predict_ar``.

    Returns:
        np.ndarray: Observed minus fitted values for the trailing
            ``len(fitted_values)`` rows of ``df``.
            NOTE(review): the left operand is a pandas Series, so the result
            may actually be a Series rather than a plain ndarray - confirm.
    """
    predictions = _predict_ar(df, model_results)
    # Only the last len(predictions) observations have a fitted counterpart.
    first_row = len(df) - len(predictions)
    observed = df["close_price"].iloc[first_row:]
    return observed - predictions
1019

1120

1221
def _calculate_aic(residuals: np.ndarray, p: int) -> float:
22+
"""Calculate the Akaike Information Criterion (AIC) for model evaluation.
23+
24+
Args:
25+
residuals (np.ndarray): An array of residuals from the AR model.
26+
p (int): The number of autoregressive parameters in the model.
27+
28+
Returns:
29+
float: The computed AIC value.
30+
"""
1331
n = len(residuals)
1432
rss = np.sum(residuals**2)
1533
return n * np.log(rss / n) + 2 * (p + 1)
1634

1735

1836
def _calculate_bic(residuals: np.ndarray, p: int) -> float:
37+
"""Calculate the Bayesian Information Criterion (BIC) for model evaluation.
38+
39+
Args:
40+
residuals (np.ndarray): An array of residuals from the AR model.
41+
p (int): The number of autoregressive parameters in the model.
42+
43+
Returns:
44+
float: The computed BIC value.
45+
"""
1946
n = len(residuals)
2047
rss = np.sum(residuals**2)
2148
return n * np.log(rss / n) + np.log(n) * (p + 1)
2249

2350

2451
def _predict_ar(df: pd.DataFrame, model_results: dict) -> np.ndarray:
52+
"""Generate 1 step predictions (fitted values) using an autoregressive (AR) model.
53+
54+
The function applies AR model coefficients to past values of 'close_price'
55+
to make predictions.
56+
57+
Args:
58+
df (pd.DataFrame): The input DataFrame containing the 'close_price' column.
59+
model_results (dict): A dictionary containing model coefficients under
60+
'integrated_coefficients'.
61+
62+
Returns:
63+
np.ndarray: An array of predicted values based on the AR model.
64+
"""
2565
integrated_coeff = model_results["integrated_coefficients"][
2666
"coefficient"
2767
].to_numpy()
@@ -39,6 +79,25 @@ def _predict_ar(df: pd.DataFrame, model_results: dict) -> np.ndarray:
3979
def evaluate_ar_models(
4080
df: pd.DataFrame, max_p: int = 15, criterion: str = "aic"
4181
) -> dict:
82+
"""Evaluate multiple Autoregressive (AR) models and select the best ones.
83+
84+
This function fits AR models for different lag values (p) up to `max_p`,
85+
computes the AIC and BIC scores for each model, and returns the top-performing
86+
models based on the selected criterion.
87+
88+
Args:
89+
df (pd.DataFrame): The input DataFrame containing the 'close_price' column.
90+
max_p (int, optional): The maximum number of autoregressive lags to test.
91+
Defaults to 15.
92+
criterion (str, optional): The selection criterion for ranking models.
93+
Can be "aic" or "bic". Defaults to "aic".
94+
95+
Returns:
96+
dict: A dictionary containing:
97+
- **top_models** (list[dict]): The top 3 sorted by the given criterion.
98+
- **model_metrics** (list[dict]): A list of evaluated models with metrics.
99+
- **metadata** (dict): Information about evaluation, `max_p`, `criterion`.
100+
"""
42101
results = []
43102

44103
for p in range(1, max_p + 1):

src/lennart_epp/analysis/fit_ar_model.py

Lines changed: 77 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,53 @@
66

77
def _check_stationarity(
    df: pd.DataFrame, column: str, significance: float = 0.05
) -> tuple[bool, float, float]:
    """Run an Augmented Dickey-Fuller (ADF) test on one column of a DataFrame.

    Missing values in the column are dropped before the test is run.

    Args:
        df (pd.DataFrame): The DataFrame containing the time series data.
        column (str): Name of the column to test.
        significance (float, optional): Threshold the p-value is compared
            against. Defaults to 0.05.

    Returns:
        tuple:
            - bool: True when the p-value is below ``significance``.
            - float: The p-value reported by the ADF test.
            - float: The ADF test statistic.
    """
    series = df[column].dropna()
    adf_result = ADF(series)
    p_value, statistic = adf_result.pvalue, adf_result.stat
    is_stationary = p_value < significance
    return is_stationary, p_value, statistic
1428

1529

1630
def _difference_series(df: pd.DataFrame, column: str) -> pd.DataFrame:
17-
df[f"diff_{column}"] = df[column].diff().dropna()
18-
return df
31+
"""Apply first differencing to a column in a DataFrame.
32+
33+
Args:
34+
df (pd.DataFrame): The dataframe containing the time series.
35+
column (str): The column to be differenced.
36+
37+
Returns:
38+
pd.DataFrame: A new DataFrame with the differenced column.
39+
"""
40+
df_copy = df.copy()
41+
df_copy[f"diff_{column}"] = df_copy[column].diff().dropna()
42+
return df_copy[[f"diff_{column}"]]
1943

2044

2145
def _create_lagged_features(df: pd.DataFrame, column: str, p: int) -> pd.DataFrame:
46+
"""Generate lagged features for an autoregressive model.
47+
48+
Args:
49+
df (pd.DataFrame): The input DataFrame containing the time series data.
50+
column (str): The column for which lagged features should be created.
51+
p (int): The number of lagged periods to generate.
52+
53+
Returns:
54+
pd.DataFrame: A DataFrame with the original column and its lagged features.
55+
"""
2256
for lag in range(1, p + 1):
2357
df[f"{column}_lag{lag}"] = df[column].shift(lag)
2458

@@ -28,6 +62,16 @@ def _create_lagged_features(df: pd.DataFrame, column: str, p: int) -> pd.DataFra
2862

2963

3064
def _ar_model(df: pd.DataFrame, column: str, p: int) -> np.ndarray:
65+
"""Estimate autoregressive (AR) model parameters using the least squares method.
66+
67+
Args:
68+
df (pd.DataFrame): The DataFrame with time series data and lagged features.
69+
column (str): The target column for the autoregressive model.
70+
p (int): The order (number of lags) of the AR model.
71+
72+
Returns:
73+
np.ndarray: An array of estimated coefficients, including the intercept term.
74+
"""
3175
x = df[[f"{column}_lag{i}" for i in range(1, p + 1)]].to_numpy()
3276
y = df[column].to_numpy()
3377

@@ -41,23 +85,28 @@ def _ar_model(df: pd.DataFrame, column: str, p: int) -> np.ndarray:
4185
def _integrate_ar_coefficients(
4286
diff_coefficients: np.ndarray, *, differenced: bool
4387
) -> pd.DataFrame:
88+
"""Convert differenced AR model coefficients to integrated form.
89+
90+
Args:
91+
diff_coefficients (np.ndarray): The coefficients from the differenced AR model.
92+
differenced (bool): Whether the model was fitted on differenced data.
93+
94+
Returns:
95+
pd.DataFrame: A DataFrame with integrated coefficients and corresponding lags.
96+
"""
4497
if not differenced:
4598
integrated_coeff = diff_coefficients
4699
else:
47-
integrated_coeff = np.zeros(len(diff_coefficients) + 1) # Platz für AR(p+1)
48-
integrated_coeff[0] = diff_coefficients[0] # Intercept bleibt gleich
100+
integrated_coeff = np.zeros(len(diff_coefficients) + 1)
101+
integrated_coeff[0] = diff_coefficients[0]
49102

50-
# Erster AR-Koeffizient (vom differenzierten Model)
51103
integrated_coeff[1] = 1 + diff_coefficients[1]
52104

53-
# Nachfolgende AR-Koeffizienten
54105
for i in range(2, len(diff_coefficients)):
55106
integrated_coeff[i] = diff_coefficients[i - 1] - diff_coefficients[i]
56107

57-
# Zusätzliches Lag (durch Integration)
58108
integrated_coeff[-1] = -diff_coefficients[-1]
59109

60-
# DataFrame für besseren Überblick
61110
integrated_coeff_df = pd.DataFrame(
62111
{
63112
"coefficient": integrated_coeff,
@@ -71,9 +120,24 @@ def _integrate_ar_coefficients(
71120
return integrated_coeff_df
72121

73122

74-
def fit_ar_model(df: pd.DataFrame, column: str = "close_price", p: int = 3) -> dict:
75-
"""Fitte ein AR(p)-Modell und speichere differenzierte & originale Koeffizienten."""
76-
is_stationary, p_value = _check_stationarity(df, column)
123+
def fit_ar_model(df: pd.DataFrame, column: str = "close_price", p: int = 1) -> dict:
124+
"""Fit an autoregressive (AR) model of order p.
125+
126+
Args:
127+
df (pd.DataFrame): The DataFrame containing the time series data.
128+
column (str, optional): The target column to model. Defaults to "close_price".
129+
p (int, optional): The order of the AR model. Defaults to 1.
130+
131+
Returns:
132+
dict: A dictionary containing:
133+
- "coefficients" (np.ndarray): Estimated coefficients of the AR(p) model.
134+
- "integrated_coefficients" (pd.DataFrame): Integrated coefficients.
135+
- "lag_order" (int): The order of the AR model (p).
136+
- "p_value" (float): The p-value from the stationarity test.
137+
- "differenced" (bool): Whether the series was differenced before fitting.
138+
139+
"""
140+
is_stationary, p_value, test_statistic_adf = _check_stationarity(df, column)
77141
differenced = False
78142

79143
if not is_stationary:

0 commit comments

Comments
 (0)