Skip to content

Commit 7a925eb

Browse files
authored
Feature/add metrics (#137)
* add residuals * add leverage function * format with ruff * format with ruff * add leverage and studentized residuals * add tests for residuals * test outliers parametrizing * linter with ruff * format document with ruff * factor out functions from _base to utils * rename modeltype to modeltypes * rename utils to _utils * factor out validate confidence from _base to _utils * factor out validate and extract model * fix implementation in dmodx outlier detection * Update Hotelling T2 * update hotelling t2 model * format documents by ruff * finish hotelling, dmodx and q residuals * add test for leverage * update interfaces and add studentized Q residuals * finish implementation of studentized residuals
1 parent fe8ef60 commit 7a925eb

14 files changed

+1614
-1
lines changed

chemotools/augmentation/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from ._add_noise import AddNoise
22
from ._baseline_shift import BaselineShift
33
from ._fractional_shift import FractionalShift
4+
from ._gaussian_broadening import GaussianBroadening
45
from ._index_shift import IndexShift
56
from ._spectrum_scale import SpectrumScale
67

@@ -9,6 +10,7 @@
910
"AddNoise",
1011
"BaselineShift",
1112
"FractionalShift",
13+
"GaussianBroadening",
1214
"IndexShift",
1315
"SpectrumScale",
1416
]
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
from typing import Literal, Optional
2+
import numpy as np
3+
from scipy.ndimage import gaussian_filter1d
4+
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
5+
from sklearn.utils.validation import check_is_fitted, validate_data
6+
7+
8+
class GaussianBroadening(TransformerMixin, OneToOneFeatureMixin, BaseEstimator):
    """
    Transform spectral data by broadening peaks using Gaussian convolution.

    This transformer applies Gaussian smoothing to broaden peaks in spectral data.
    For each signal, a random sigma is drawn uniformly between 0 and the
    specified sigma value. The random generator is created in ``fit`` and is
    advanced by every call to ``transform``, so repeated ``transform`` calls on
    the same fitted instance draw different sigmas.

    Parameters
    ----------
    sigma : float, default=1.0
        Maximum standard deviation for the Gaussian kernel.
        The actual sigma used will be randomly chosen between 0 and this value.
        Must be a finite, non-negative number; ``0`` disables broadening.

    mode : {'reflect', 'constant', 'nearest', 'mirror', 'wrap'}, default='reflect'
        The mode parameter determines how the input array is extended when
        the filter overlaps a border. Default is 'reflect'.

    pad_value : float, default=0.0
        Value to fill past edges of input if mode is 'constant'.

    random_state : int, optional, default=None
        Random state for reproducible sigma selection.

    truncate : float, default=4.0
        Truncate the filter at this many standard deviations.
        Larger values increase computation time but improve accuracy.
    """

    def __init__(
        self,
        sigma: float = 1.0,
        mode: Literal["reflect", "constant", "nearest", "mirror", "wrap"] = "reflect",
        pad_value: float = 0.0,
        random_state: Optional[int] = None,
        truncate: float = 4.0,
    ):
        self.sigma = sigma
        self.mode = mode
        self.pad_value = pad_value
        self.random_state = random_state
        self.truncate = truncate

    def fit(self, X: np.ndarray, y=None) -> "GaussianBroadening":
        """
        Fit the transformer to the data (in this case, only validates input).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data to validate.

        y : None
            Ignored.

        Returns
        -------
        self : GaussianBroadening
            The fitted transformer.

        Raises
        ------
        ValueError
            If ``sigma`` is not a number, is negative, or is not finite.
        """
        X = validate_data(
            self, X, y="no_validation", ensure_2d=True, reset=True, dtype=np.float64
        )

        # Validate sigma parameter. NumPy scalar types are accepted as well so
        # that values coming out of arrays or parameter grids are not rejected.
        if not isinstance(self.sigma, (int, float, np.integer, np.floating)):
            raise ValueError("sigma must be a number")
        if self.sigma < 0:
            raise ValueError("sigma must be non-negative")
        # NaN and +inf slip through the comparison above; reject them here
        # instead of letting the random draw in transform fail later.
        if not np.isfinite(self.sigma):
            raise ValueError("sigma must be finite")

        # Initialize random number generator
        self._rng = np.random.default_rng(self.random_state)

        return self

    def transform(self, X: np.ndarray, y=None) -> np.ndarray:
        """
        Apply Gaussian broadening to the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        y : None
            Ignored.

        Returns
        -------
        X_transformed : ndarray of shape (n_samples, n_features)
            The transformed data with broadened peaks.
        """
        check_is_fitted(self, "n_features_in_")
        X_ = validate_data(
            self,
            X,
            y="no_validation",
            ensure_2d=True,
            copy=True,
            reset=False,
            dtype=np.float64,
        )

        # Transform each sample; every row gets its own random sigma.
        for i, x in enumerate(X_):
            X_[i] = self._broaden_signal(x)

        return X_

    def _broaden_signal(self, x: np.ndarray) -> np.ndarray:
        """
        Apply Gaussian broadening to a single signal.

        Parameters
        ----------
        x : ndarray of shape (n_features,)
            The input signal to broaden.

        Returns
        -------
        broadened_signal : ndarray of shape (n_features,)
            The broadened signal. When the drawn sigma is zero, a copy of the
            input is returned unchanged.
        """
        # Randomly choose sigma between 0 and max sigma. The draw always
        # happens so the generator advances once per signal.
        sigma = self._rng.uniform(0, self.sigma)

        # A zero-width kernel means "no broadening". gaussian_filter1d cannot
        # build a Gaussian kernel for sigma == 0, so short-circuit instead of
        # calling it (this is the case whenever self.sigma == 0).
        if sigma <= 0:
            return x.copy()

        # Apply Gaussian filter
        return gaussian_filter1d(
            x, sigma=sigma, mode=self.mode, cval=self.pad_value, truncate=self.truncate
        )

chemotools/outliers/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from .dmodx import DModX
2+
from .hotelling_t2 import HotellingT2
3+
from .q_residuals import QResiduals
4+
from .leverage import Leverage
5+
from .studentized_residuals import StudentizedResiduals
6+
7+
__all__ = ["DModX", "HotellingT2", "QResiduals", "Leverage", "StudentizedResiduals"]

chemotools/outliers/_base.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Union, Optional
3+
4+
import numpy as np
5+
6+
from sklearn.base import BaseEstimator, OutlierMixin
7+
from sklearn.decomposition._base import _BasePCA
8+
from sklearn.cross_decomposition._pls import _PLS
9+
from sklearn.pipeline import Pipeline
10+
from sklearn.utils.validation import check_is_fitted
11+
12+
from ._utils import validate_confidence, validate_and_extract_model
13+
14+
# Alias for the estimator base classes handled in this module: the private
# scikit-learn PCA base class (_BasePCA) and PLS base class (_PLS).
ModelTypes = Union[_BasePCA, _PLS]
15+
16+
17+
class _ModelResidualsBase(ABC, BaseEstimator, OutlierMixin):
    """Base class for model outlier calculations.

    Implements statistical calculations for outlier detection in dimensionality
    reduction models like PCA and PLS.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted _BasePCA or _PLS model or a Pipeline ending with such a model
    confidence : float
        Confidence level for statistical calculations (between 0 and 1)

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model

    n_features_in_ : int
        Number of features in the input data

    n_components_ : int
        Number of components in the model

    n_samples_ : int
        Number of samples used to train the model

    critical_value_ : float
        The calculated critical value for outlier detection
    """

    def __init__(
        self,
        model: Union[ModelTypes, Pipeline],
        confidence: float,
    ) -> None:
        # BaseEstimator.get_params() (and therefore repr()) reads every
        # constructor argument back with getattr(self, <name>), so the raw
        # ``model`` argument must be kept under its own name; without it those
        # calls raise AttributeError.
        self.model = model
        (
            self.model_,
            self.preprocessing_,
            self.n_features_in_,
            self.n_components_,
            self.n_samples_,
        ) = validate_and_extract_model(model)
        self.confidence = validate_confidence(confidence)

    def fit_predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray] = None
    ) -> np.ndarray:
        """Fit the model to the input data and calculate the residuals.

        ``fit`` is not defined in this base class; concrete subclasses are
        expected to provide it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,), default=None
            Target values

        Returns
        -------
        ndarray of shape (n_samples,)
            The residuals of the model
        """
        self.fit(X, y)
        return self.predict_residuals(X, y, validate=True)

    @abstractmethod
    def predict_residuals(
        self, X: np.ndarray, y: Optional[np.ndarray], validate: bool
    ) -> np.ndarray:
        """Calculate the residuals of the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,) or None
            Target values

        validate : bool
            Flag forwarded to the implementation; presumably toggles input
            validation -- confirm against the concrete subclasses.

        Returns
        -------
        ndarray of shape (n_samples,)
            The residuals of the model
        """

    @abstractmethod
    def _calculate_critical_value(self, X: Optional[np.ndarray]) -> float:
        """Calculate the critical value for outlier detection.

        Parameters
        ----------
        X : array-like or None
            Data made available to the implementation for the calculation.

        Returns
        -------
        float
            The calculated critical value for outlier detection
        """
107+
108+
109+
class _ModelDiagnosticsBase(ABC):
    """Base class for model diagnostics methods.

    This does not implement outlier detection algorithms, but rather
    implements methods that are used to assess trained models.

    Parameters
    ----------
    model : Union[ModelTypes, Pipeline]
        A fitted PCA/PLS model or Pipeline ending with such a model

    Attributes
    ----------
    model_ : ModelTypes
        The fitted model of type _BasePCA or _PLS

    preprocessing_ : Optional[Pipeline]
        Preprocessing steps before the model

    """

    def __init__(self, model: Union[ModelTypes, Pipeline]):
        self.model_, self.preprocessing_ = self._validate_and_extract_model(model)

    def _validate_and_extract_model(self, model):
        """Validate and extract the model and preprocessing steps.

        Parameters
        ----------
        model : Union[ModelTypes, Pipeline]
            A fitted PCA/PLS model or Pipeline ending with such a model

        Returns
        -------
        Tuple[ModelTypes, Optional[Pipeline]]
            The extracted model and preprocessing steps. The preprocessing
            entry is ``None`` when ``model`` is not a Pipeline.

        Raises
        ------
        ValueError
            If the model is not of type _BasePCA or _PLS or a Pipeline ending
            with one of these types, or if the model is not fitted
        """
        # For a pipeline, the final step is the model and everything before it
        # is kept as the preprocessing sub-pipeline.
        if isinstance(model, Pipeline):
            preprocessing = model[:-1]
            model = model[-1]
        else:
            preprocessing = None

        if not isinstance(model, (_BasePCA, _PLS)):
            raise ValueError(
                "Model not a valid model. Must be of base type _BasePCA or _PLS or a Pipeline ending with one of these types."
            )

        # One fitted-state check is enough once the type has been confirmed.
        check_is_fitted(model)
        return model, preprocessing

    @abstractmethod
    def predict(self, X: np.ndarray, y: Optional[np.ndarray]) -> np.ndarray:
        """Predict the output of the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data

        y : array-like of shape (n_samples,) or None
            Target values

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted values
        """

0 commit comments

Comments
 (0)