diff --git a/autoemulate/experimental/data/utils.py b/autoemulate/experimental/data/utils.py
index 126f38044..40529aa7a 100644
--- a/autoemulate/experimental/data/utils.py
+++ b/autoemulate/experimental/data/utils.py
@@ -93,6 +93,24 @@ def _convert_to_tensors(
             f"Unsupported type for dataset ({type(dataset)}). Must be TensorDataset."
         )
 
+    def _convert_to_numpy(
+        self,
+        x: InputLike,
+        y: InputLike | None = None,
+    ) -> tuple[np.ndarray, np.ndarray | None]:
+        """
+        Convert InputLike x, y to a tuple of numpy arrays.
+        """
+        if isinstance(x, np.ndarray) and (y is None or isinstance(y, np.ndarray)):
+            return x, y
+
+        result = self._convert_to_tensors(x, y)
+        if isinstance(result, tuple):
+            x, y = result
+            return x.numpy(), y.numpy()
+        x = result
+        return x.numpy(), None
+
     def _random_split(
         self,
         dataset: Dataset,
diff --git a/autoemulate/experimental/emulators/base.py b/autoemulate/experimental/emulators/base.py
index f8557f89e..1cd5b626d 100644
--- a/autoemulate/experimental/emulators/base.py
+++ b/autoemulate/experimental/emulators/base.py
@@ -24,6 +24,10 @@
     def __init__(
         self, x: InputLike | None = None, y: InputLike | None = None, **kwargs
     ): ...
 
+    @classmethod
+    def model_name(cls) -> str:
+        return cls.__name__
+
     @abstractmethod
     def fit(self, x: InputLike, y: InputLike | None): ...
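For context, a self-contained sketch of the conversion semantics the new `_convert_to_numpy` helper implements. The standalone `convert_to_numpy` below is a hypothetical stand-in that handles only numpy and tensor inputs, skipping the `_convert_to_tensors` fallback used for other `InputLike` types:

```python
import numpy as np
import torch


def convert_to_numpy(x, y=None):
    """Hypothetical stand-in for InputTypeMixin._convert_to_numpy."""
    if isinstance(x, np.ndarray) and (y is None or isinstance(y, np.ndarray)):
        return x, y  # already numpy: pass through unchanged
    # otherwise convert via tensors, mirroring the mixin's tensor branch
    x = torch.as_tensor(x)
    y = torch.as_tensor(y) if y is not None else None
    return x.numpy(), (y.numpy() if y is not None else None)


x_np, y_np = convert_to_numpy(torch.rand(8, 2), torch.rand(8))
assert isinstance(x_np, np.ndarray) and y_np.shape == (8,)
```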
+ """ + + def __init__( # noqa: PLR0913 allow too many arguments since all currently required + self, + x: InputLike | None = None, + y: InputLike | None = None, + boosting_type: str = "gbdt", + num_leaves: int = 31, + max_depth: int = -1, + learning_rate: float = 0.1, + n_estimators: int = 100, + subsample_for_bin: int = 200000, + objective: str | None = None, + class_weight: dict | str | None = None, + min_split_gain: float = 0.0, + min_child_weight: float = 0.001, + min_child_samples: int = 20, + subsample: float = 1.0, + colsample_bytree: float = 1.0, + reg_alpha: float = 0.0, + reg_lambda: float = 0.0, + random_state: int | None = None, + n_jobs: int | None = 1, + importance_type: str = "split", + verbose: int = -1, + ): + """Initializes a LightGBM object.""" + _, _ = x, y # ignore unused arguments + self.boosting_type = boosting_type + self.num_leaves = num_leaves + self.max_depth = max_depth + self.learning_rate = learning_rate + self.n_estimators = n_estimators + self.subsample_for_bin = subsample_for_bin + self.objective = objective + self.class_weight = class_weight + self.min_split_gain = min_split_gain + self.min_child_weight = min_child_weight + self.min_child_samples = min_child_samples + self.subsample = subsample + self.colsample_bytree = colsample_bytree + self.reg_alpha = reg_alpha + self.reg_lambda = reg_lambda + self.random_state = random_state + self.n_jobs = n_jobs + self.importance_type = importance_type + self.verbose = verbose + + def fit(self, x: InputLike, y: InputLike | None): + """ + Fits the emulator to the data. + The model expects the input data to be: + x (features): 2D array + y (target): 1D array + """ + + x, y = self._convert_to_numpy(x, y) + + if y is None: + msg = "y must be provided." + raise ValueError(msg) + if y.ndim > 2: + msg = f"y must be 1D or 2D array. Found {y.ndim}D array." 
diff --git a/autoemulate/experimental/tuner.py b/autoemulate/experimental/tuner.py
index 0754a1de9..e23a36bc2 100644
--- a/autoemulate/experimental/tuner.py
+++ b/autoemulate/experimental/tuner.py
@@ -64,6 +64,7 @@ def run(self, model_class: type[Emulator]) -> tuple[list[float], list[ModelConfi
             }
 
             # TODO: consider whether to pass as tensors or dataloader
+            # require training data for initialisation as well as fitting?
             m = model_class(train_x, train_y, **model_config)
             m.fit(train_x, train_y)
 
diff --git a/tests/experimental/test_experimental_lightgbm.py b/tests/experimental/test_experimental_lightgbm.py
new file mode 100644
index 000000000..3ed4e80ea
--- /dev/null
+++ b/tests/experimental/test_experimental_lightgbm.py
@@ -0,0 +1,27 @@
+from autoemulate.experimental.emulators.lightgbm import (
+    LightGBM,
+)
+from autoemulate.experimental.tuner import Tuner
+from autoemulate.experimental.types import TensorLike
+
+
+def test_predict_lightgbm(sample_data_y1d, new_data_y1d):
+    x, y = sample_data_y1d
+    lgbm = LightGBM()
+    lgbm.fit(x, y)
+    x2, _ = new_data_y1d
+    y_pred = lgbm.predict(x2)
+    assert isinstance(y_pred, TensorLike)
+
+
+def test_tune_lightgbm(sample_data_y1d):
+    x, y = sample_data_y1d
+    tuner = Tuner(x, y, n_iter=5)
+    scores, configs = tuner.run(LightGBM)
+    assert len(scores) == 5
+    assert len(configs) == 5
+
+
+def test_lightgbm_class_name_returned():
+    lgbm = LightGBM()
+    assert lgbm.model_name() == "LightGBM"
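As a sanity check on the sampling note in `get_tune_config` (illustrative, not part of the patch): `10 ** np.random.uniform(a, b)` is log-uniform on `[10**a, 10**b]`, so the `reg_alpha`/`reg_lambda` draws match `scipy.stats.loguniform(1e-3, 1)`:

```python
import numpy as np
from scipy.stats import loguniform

rng = np.random.default_rng(0)
a = 10 ** rng.uniform(-3, 0, size=100_000)                 # as in get_tune_config
b = loguniform(1e-3, 1).rvs(size=100_000, random_state=0)  # equivalent distribution

# both medians sit near 10 ** -1.5, roughly 0.0316
assert np.isclose(np.median(a), np.median(b), rtol=0.05)
```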