Skip to content

Commit 2ecb9e3

Browse files
paucablopCopilot
andauthored
Refactor residuals (#144)
* update attributes in ModelResidualBase * update hotelling t2 residuals * update interface in Q residuals * update all interfaces in outlier algorithms * remove deprecated class * refactor _utils into _base * refactor useful residual functions to utils * rename functions * fix typing error * update functions in residual analysis * format with ruff * refactor residual x calculation in dmodx * change default method in qresiduals to "jackson-mudholkar" * normalize dmodx by SPE in training and DoF * format document with ruff * Update chemotools/outliers/leverage.py Swap order in inputs for calculate residuals Co-authored-by: Copilot <[email protected]> * Update studentized_residuals.py update initialization of the studentized_residuals --------- Co-authored-by: Copilot <[email protected]>
1 parent dbe9be7 commit 2ecb9e3

File tree

9 files changed

+220
-221
lines changed

9 files changed

+220
-221
lines changed

chemotools/outliers/_base.py

Lines changed: 75 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from typing import Union, Optional
2+
from typing import Optional, Tuple, Union
33

44
import numpy as np
55

@@ -9,7 +9,6 @@
99
from sklearn.pipeline import Pipeline
1010
from sklearn.utils.validation import check_is_fitted
1111

12-
from ._utils import validate_confidence, validate_and_extract_model
1312

1413
ModelTypes = Union[_BasePCA, _PLS]
1514

@@ -29,10 +28,10 @@ class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
2928
3029
Attributes
3130
----------
32-
model_ : ModelTypes
31+
estimator_ : ModelTypes
3332
The fitted model of type _BasePCA or _PLS
3433
35-
preprocessing_ : Optional[Pipeline]
34+
transformer_ : Optional[Pipeline]
3635
Preprocessing steps before the model
3736
3837
n_features_in_ : int
@@ -54,13 +53,13 @@ def __init__(
5453
confidence: float,
5554
) -> None:
5655
(
57-
self.model_,
58-
self.preprocessing_,
56+
self.estimator_,
57+
self.transformer_,
5958
self.n_features_in_,
6059
self.n_components_,
6160
self.n_samples_,
62-
) = validate_and_extract_model(model)
63-
self.confidence = validate_confidence(confidence)
61+
) = _validate_and_extract_model(model)
62+
self.confidence = _validate_confidence(confidence)
6463

6564
def fit_predict_residuals(
6665
self, X: np.ndarray, y: Optional[np.ndarray] = None
@@ -96,7 +95,7 @@ def predict_residuals(
9695
"""
9796

9897
@abstractmethod
99-
def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
98+
def _calculate_critical_value(self, X: np.ndarray) -> float:
10099
"""Calculate the critical value for outlier detection.
101100
102101
Returns
@@ -106,75 +105,84 @@ def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
106105
"""
107106

108107

109-
class _ModelDiagnosticsBase(ABC):
110-
"""Base class for model diagnostics methods. This does not implement outlier detection algorithms,
111-
but rather implements methods that are used to assess trained models.
108+
def _get_model_parameters(model: ModelTypes) -> Tuple[int, int, int]:
109+
"""
110+
Get the number of features, components and samples from a fitted PCA or PLS model.
112111
113112
Parameters
114113
----------
115-
model : Union[ModelTypes, Pipeline]
116-
A fitted PCA/PLS model or Pipeline ending with such a model
117-
118-
Attributes
119-
----------
120-
model_ : ModelTypes
121-
The fitted model of type _BasePCA or _PLS
122-
123-
preprocessing_ : Optional[Pipeline]
124-
Preprocessing steps before the model
114+
model : ModelTypes
115+
A fitted model of type _BasePCA or _PLS
125116
117+
Returns
118+
-------
119+
Tuple[int, int, int]
120+
The number of features, components and samples in the model
126121
"""
122+
if isinstance(model, _BasePCA):
123+
return model.n_features_in_, model.n_components_, model.n_samples_
124+
elif isinstance(model, _PLS):
125+
return model.n_features_in_, model.n_components, len(model.x_scores_)
126+
else:
127+
raise ValueError(
128+
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
129+
)
127130

128-
def __init__(self, model: Union[ModelTypes, Pipeline]):
129-
self.model_, self.preprocessing_ = self._validate_and_extract_model(model)
130131

131-
def _validate_and_extract_model(self, model):
132-
"""Validate and extract the model and preprocessing steps.
132+
def _validate_confidence(confidence: float) -> float:
133+
"""Validate parameters using sklearn conventions.
133134
134-
Parameters
135-
----------
136-
model : Union[ModelTypes, Pipeline]
137-
A fitted PCA/PLS model or Pipeline ending with such a model
135+
Parameters
136+
----------
137+
confidence : float
138+
Confidence level for statistical calculations (between 0 and 1)
138139
139-
Returns
140-
-------
141-
Tuple[ModelTypes, Optional[Pipeline]]
142-
The extracted model and preprocessing steps
140+
Returns
141+
-------
142+
float
143+
The validated confidence level
143144
144-
Raises
145-
------
146-
ValueError
147-
If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
148-
"""
149-
if isinstance(model, Pipeline):
150-
preprocessing = model[:-1]
151-
model = model[-1]
152-
else:
153-
preprocessing = None
154-
155-
if isinstance(model, (_BasePCA, _PLS)):
156-
check_is_fitted(model)
157-
else:
158-
raise ValueError(
159-
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
160-
)
161-
check_is_fitted(model)
162-
return model, preprocessing
145+
Raises
146+
------
147+
ValueError
148+
If confidence is not between 0 and 1
149+
"""
150+
if not 0 < confidence < 1:
151+
raise ValueError("Confidence must be between 0 and 1")
152+
return confidence
163153

164-
@abstractmethod
165-
def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
166-
"""Predict the output of the model.
167154

168-
Parameters
169-
----------
170-
X : array-like of shape (n_samples, n_features)
171-
Input data
155+
def _validate_and_extract_model(
156+
model: Union[ModelTypes, Pipeline],
157+
) -> Tuple[ModelTypes, Optional[Pipeline], int, int, int]:
158+
"""Validate and extract the model and preprocessing steps.
172159
173-
y : array-like of shape (n_samples,), default=None
174-
Target values
160+
Parameters
161+
----------
162+
model : Union[ModelTypes, Pipeline]
163+
A fitted PCA/PLS model or Pipeline ending with such a model
175164
176-
Returns
177-
-------
178-
ndarray of shape (n_samples,)
179-
Predicted values
180-
"""
165+
Returns
166+
-------
167+
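Tuple[ModelTypes, Optional[Pipeline], int, int, int]
168+
The extracted model, the preprocessing steps (None if the input is not a Pipeline), and the number of features, components and samples of the model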
169+
170+
Raises
171+
------
172+
ValueError
173+
If the model is not of type _BasePCA or _PLS or a Pipeline ending with one of these types or if the model is not fitted
174+
"""
175+
if isinstance(model, Pipeline):
176+
preprocessing = model[:-1]
177+
model = model[-1]
178+
else:
179+
preprocessing = None
180+
181+
if not isinstance(model, (_BasePCA, _PLS)):
182+
raise ValueError(
183+
"Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
184+
)
185+
186+
check_is_fitted(model)
187+
n_features_in, n_components, n_samples = _get_model_parameters(model)
188+
return model, preprocessing, n_features_in, n_components, n_samples

chemotools/outliers/_utils.py

Lines changed: 0 additions & 91 deletions
This file was deleted.

chemotools/outliers/dmodx.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88

99
from ._base import _ModelResidualsBase, ModelTypes
10+
from .utils import calculate_residual_spectrum
1011

1112

1213
class DModX(_ModelResidualsBase):
@@ -25,10 +26,10 @@ class DModX(_ModelResidualsBase):
2526
2627
Attributes
2728
----------
28-
model_ : ModelType
29+
estimator_ : ModelTypes
2930
The fitted model of type _BasePCA or _PLS
3031
31-
preprocessing_ : Optional[Pipeline]
32+
transformer_ : Optional[Pipeline]
3233
Preprocessing steps before the model
3334
3435
n_features_in_ : int
@@ -42,13 +43,17 @@ class DModX(_ModelResidualsBase):
4243
4344
critical_value_ : float
4445
The calculated critical value for outlier detection
46+
47+
train_spe_: float
48+
The training sum of squared errors (SSE) for the model normalized by degrees of freedom
4549
"""
4650

4751
def __init__(
4852
self,
4953
model: Union[ModelTypes, Pipeline],
5054
confidence: float = 0.95,
5155
) -> None:
56+
model, confidence = model, confidence
5257
super().__init__(model, confidence)
5358

5459
def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
@@ -62,7 +67,18 @@ def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "DModX":
6267
self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
6368
)
6469

70+
# Calculate the critical value
6571
self.critical_value_ = self._calculate_critical_value()
72+
73+
# Calculate the degrees of freedom normalized SPE of the training set
74+
residuals = calculate_residual_spectrum(X, self.estimator_)
75+
squared_errors = np.sum((residuals) ** 2, axis=1)
76+
self.train_spe_ = np.sqrt(
77+
squared_errors
78+
/ (self.n_samples_ - self.n_components_ - 1)
79+
* (self.n_features_in_ - self.n_components_)
80+
)
81+
6682
return self
6783

6884
def predict(self, X: np.ndarray) -> np.ndarray:
@@ -118,15 +134,17 @@ def predict_residuals(
118134
)
119135

120136
# Apply preprocessing if available
121-
if self.preprocessing_:
122-
X = self.preprocessing_.transform(X)
137+
if self.transformer_:
138+
X = self.transformer_.transform(X)
123139

124140
# Calculate the DModX statistics
125-
X_transformed = self.model_.transform(X)
126-
X_reconstructed = self.model_.inverse_transform(X_transformed)
127-
squared_errors = np.sum((X - X_reconstructed) ** 2, axis=1)
141+
residual = calculate_residual_spectrum(X, self.estimator_)
142+
squared_errors = np.sum((residual) ** 2, axis=1)
128143

129-
return np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
144+
return (
145+
np.sqrt(squared_errors / (self.n_features_in_ - self.n_components_))
146+
/ self.train_spe_
147+
)
130148

131149
def _calculate_critical_value(self, X: Optional[np.ndarray] = None) -> float:
132150
"""Calculate F-distribution based critical value.

0 commit comments

Comments
 (0)