Add refactored LGBM model to experimental emulators #399
Changes from 47 commits
autoemulate/experimental/emulators/lightgbm.py (new file, +125 lines):

```python
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from torch import Tensor

from autoemulate.experimental.emulators.base import (
    Emulator,
    InputTypeMixin,
)
from autoemulate.experimental.types import InputLike, OutputLike


class LightGBM(Emulator, InputTypeMixin, BaseEstimator, RegressorMixin):
    """LightGBM emulator.

    Wraps the LGBMRegressor from the lightgbm package.
    """

    def __init__(  # noqa: PLR0913 allow too many arguments since all currently required
        self,
        boosting_type="gbdt",
        num_leaves=31,
        max_depth=-1,
        learning_rate=0.1,
        n_estimators=100,
        subsample_for_bin=200000,
        objective=None,
        class_weight=None,
        min_split_gain=0.0,
        min_child_weight=0.001,
        min_child_samples=20,
        subsample=1.0,
        colsample_bytree=1.0,
        reg_alpha=0.0,
        reg_lambda=0.0,
        random_state=None,
        n_jobs=1,
        importance_type="split",
        verbose=-1,
    ):
        """Initializes a LightGBM object."""
        self.boosting_type = boosting_type
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.subsample_for_bin = subsample_for_bin
        self.objective = objective
        self.class_weight = class_weight
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.importance_type = importance_type
        self.verbose = verbose

    def fit(self, x: InputLike, y: InputLike | None, sample_weight=None, **kwargs):
        """Fits the emulator to the data."""
        x, y = self._convert_to_numpy(x, y)
        self.n_features_in_ = x.shape[1]
        x, y = check_X_y(
            x, y, multi_output=self._more_tags()["multioutput"], y_numeric=True
        )
        self.model_ = LGBMRegressor(
            boosting_type=self.boosting_type,
            num_leaves=self.num_leaves,
            max_depth=self.max_depth,
            learning_rate=self.learning_rate,
            n_estimators=self.n_estimators,
            subsample_for_bin=self.subsample_for_bin,
            objective=self.objective,
            class_weight=self.class_weight,
            min_split_gain=self.min_split_gain,
            min_child_weight=self.min_child_weight,
            min_child_samples=self.min_child_samples,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            reg_alpha=self.reg_alpha,
            reg_lambda=self.reg_lambda,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            importance_type=self.importance_type,
            verbose=self.verbose,
        )
        self.model_.fit(x, y, sample_weight=sample_weight)
        self.is_fitted_ = True

    def predict(self, x: InputLike) -> OutputLike:
        """Predicts the output of the emulator for a given input."""
        x = check_array(x)
        check_is_fitted(self, "is_fitted_")
        y_pred = self.model_.predict(x)
        # Ensure the output is a 2D tensor with shape (n_samples, 1)
        return Tensor(y_pred.reshape(-1, 1))  # type: ignore PGH003

    @staticmethod
    def get_tune_config():
        # Note: 10 ** np.random.uniform(-3, 0)
        # is equivalent to scipy.stats.loguniform(0.001, 1)
        return {
            "num_leaves": [np.random.randint(10, 100)],
            "max_depth": [np.random.randint(-1, 12)],
            "learning_rate": [10 ** np.random.uniform(-3, -1)],
            "n_estimators": [np.random.randint(50, 1000)],
            "reg_alpha": [10 ** np.random.uniform(-3, 0)],
            "reg_lambda": [10 ** np.random.uniform(-3, 0)],
        }

    @property
    def model_name(self):
        return self.__class__.__name__

    def _more_tags(self):
        return {"multioutput": False}
```

Review comment from a maintainer on the `check_is_fitted` call in `predict`:

> If using this check function is dependent on this object inheriting from the sklearn base objects, I'd be in favour of not doing the inheritance and just getting rid of this (and replacing it with our own check, if we think that's necessary).
Tests for the new emulator (new file, +22 lines):

```python
from autoemulate.experimental.emulators.lightgbm import (
    LightGBM,
)
from autoemulate.experimental.tuner import Tuner
from autoemulate.experimental.types import TensorLike


def test_predict_lightgbm(sample_data_y1d, new_data_y1d):
    x, y = sample_data_y1d
    lgbm = LightGBM()
    lgbm.fit(x, y)
    x2, _ = new_data_y1d
    y_pred = lgbm.predict(x2)
    assert isinstance(y_pred, TensorLike)


def test_tune_lightgbm(sample_data_y1d):
    x, y = sample_data_y1d
    tuner = Tuner(x, y, n_iter=5)
    scores, configs = tuner.run(LightGBM)
    assert len(scores) == 5
    assert len(configs) == 5
```
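The tests rely on `sample_data_y1d` and `new_data_y1d` fixtures defined outside this diff (presumably in a shared `conftest.py`). A minimal sketch of what such fixtures might look like, assuming they return `(x, y)` tensor pairs with a 1D target; the shapes and data here are illustrative assumptions, not the project's actual fixtures:

```python
# Hypothetical conftest.py fixtures; the real definitions are not part of
# this diff, so sizes and feature counts are illustrative assumptions.
import pytest
import torch


@pytest.fixture
def sample_data_y1d():
    x = torch.rand(100, 5)  # 100 training samples, 5 input features
    y = x.sum(dim=1)        # 1D target, shape (100,)
    return x, y


@pytest.fixture
def new_data_y1d():
    x = torch.rand(20, 5)   # unseen inputs with the same feature count
    y = x.sum(dim=1)
    return x, y
```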