metoppv · SamGriffithsMO · Sep 25, 2024 · Sep 26, 2024 · Sep 27, 2024 · Sep 27, 2024
diff --git a/improver/calibration/__init__.py b/improver/calibration/__init__.py
@@ -15,7 +15,7 @@
     get_diagnostic_cube_name_from_probability_name,
 )
 from improver.utilities.cube_manipulation import MergeCubes
-
+from improver.utilities.load import load_cubelist
 
 def split_forecasts_and_truth(
     cubes: List[Cube], truth_attribute: str
@@ -266,3 +266,47 @@ def add_warning_comment(forecast: Cube) -> Cube:
             "however, no calibration has been applied."
         )
     return forecast
+
+from datetime import datetime, timedelta
+
+def get_cube_from_directory(directory, cycle_point=None, max_days_offset=None, date_format='%Y%m%dT%H%MZ'):
+    """
+    Load and merge all netCDF files in a directory into a single cube.
+
+    Files can optionally be filtered by age relative to a cycle point. To
+    switch on the max offset filter, both cycle_point and max_days_offset
+    need to be provided.
+
+    Args:
+        directory (pathlib.Path):
+            The path to the directory. Every "*.nc" file directly inside it
+            is considered.
+        cycle_point (str):
+            The cycle point of the forecast, in date_format, used to filter
+            files.
+        max_days_offset (int):
+            Maximum number of days before cycle_point to consider files.
+            Defined as a positive int that is subtracted from the cycle_point.
+            A value of 0 keeps only files dated at or after cycle_point.
+        date_format (str):
+            Format of the cycle point and of the datetime at the start of
+            each filename (the text before the first "-"), used by
+            datetime.strptime.
+
+    Returns:
+        Cube:
+            The result of applying MergeCubes to the cubes loaded from the
+            selected files.
+
+    Raises:
+        ValueError: If the directory contains no "*.nc" files.
+        ValueError: If fewer than two files remain after filtering.
+        ValueError: Raised by datetime.strptime if cycle_point or a filename
+            prefix does not match date_format.
+    """
+    # Keep Path objects so the filename can be taken with .name rather than
+    # by splitting on a hard-coded "/" separator. Sorting gives a stable
+    # load order, since glob does not promise one.
+    paths = sorted(directory.glob("*.nc"))
+    if not paths:
+        # NOTE(review): raising may be too severe here - consider whether a
+        # quieter way of handling an empty directory is wanted.
+        raise ValueError(f"No files found in {directory}")
+
+    # Compare max_days_offset against None so that an offset of 0 still
+    # enables the filter instead of being treated as "not provided".
+    if cycle_point and max_days_offset is not None:
+        # Ignore files if they are older than max_days_offset days from cycle_point
+        earliest_time = datetime.strptime(cycle_point, date_format) - timedelta(
+            days=max_days_offset
+        )
+        paths = [
+            path
+            for path in paths
+            if datetime.strptime(path.name.split("-")[0], date_format)
+            >= earliest_time
+        ]
+
+    # A lower limit of two files is enforced before loading.
+    if len(paths) < 2:
+        raise ValueError(f"Not enough files found in {directory}")
+
+    cubes = load_cubelist([str(path) for path in paths])
+    return MergeCubes()(cubes)
diff --git a/improver/cli/estimate_emos_coefficients.py b/improver/cli/estimate_emos_coefficients.py
@@ -13,9 +13,14 @@
 @cli.clizefy
 @cli.with_output
 def process(
-    *cubes: cli.inputcube,
+    forecast_directory: cli.inputpath,
+    truth_directory: cli.inputpath,
+    land_sea_mask: cli.inputcube = None,
-    forecast_directory: cli.inputpath,
-    truth_directory: cli.inputpath,
-    land_sea_mask: cli.inputcube = None,
+    forecast_cubes: cli.calib_input_dir,
+    truth_cubes: cli.calib_input_dir,
+    land_sea_mask: cli.inputcube = None,
-    forecast_directory: cli.inputpath,
-    truth_directory: cli.inputpath,
-    land_sea_mask: cli.inputcube = None,
+    *cubes: Union[cli.calib_input_dir, cli.inputcube],
-    forecast_directory: cli.inputpath,
-    truth_directory: cli.inputpath,
-    land_sea_mask: cli.inputcube = None,
+    forecast_cubes: cli.calib_input_dir,
+    truth_cubes: cli.calib_input_dir,
+    land_sea_mask: cli.inputcube = None,
-    forecast_directory: cli.inputpath,
-    truth_directory: cli.inputpath,
-    land_sea_mask: cli.inputcube = None,
+    *cubes: Union[cli.calib_input_dir, cli.inputcube],
+    *,
     distribution,
     truth_attribute,
+    cycle_point: str = None,
+    max_days_offset: int = None,
     point_by_point=False,
     use_default_initial_guess=False,
     units=None,
@@ -32,13 +37,12 @@ def process(
     The estimated coefficients are output as a cube.
 
     Args:
-        cubes (list of iris.cube.Cube):
-            A list of cubes containing the historical forecasts and
-            corresponding truth used for calibration. They must have the same
-            cube name and will be separated based on the truth attribute.
-            Optionally this may also contain a single land-sea mask cube on the
-            same domain as the historic forecasts and truth (where land points
-            are set to one and sea points are set to zero).
+        forecast_directory (pathlib.Path):
+            The path to a directory containing the historical forecasts.
+        truth_directory (pathlib.Path):
+            The path to a directory containing the truths to be used.
+        land_sea_mask (iris.cube.Cube):
+            Optional land-sea mask cube, used as a static additonal predictor.
-            Optional land-sea mask cube, used as a static additonal predictor.
+            Optional land-sea mask cube, used as a static additional predictor.
-            Optional land-sea mask cube, used as a static additonal predictor.
+            Optional land-sea mask cube, used as a static additional predictor.
         distribution (str):
             The distribution that will be used for minimising the
             Continuous Ranked Probability Score when estimating the EMOS
@@ -88,12 +92,13 @@ def process(
             coefficient is stored in a separate cube.
     """
 
-    from improver.calibration import split_forecasts_and_truth
     from improver.calibration.ensemble_calibration import (
         EstimateCoefficientsForEnsembleCalibration,
     )
+    from improver.calibration import get_cube_from_directory
 
-    forecast, truth, land_sea_mask = split_forecasts_and_truth(cubes, truth_attribute)
+    forecast = get_cube_from_directory(forecast_directory, cycle_point=cycle_point, max_days_offset=max_days_offset)
+    truth = get_cube_from_directory(truth_directory, cycle_point=cycle_point, max_days_offset=max_days_offset)
 
     plugin = EstimateCoefficientsForEnsembleCalibration(
         distribution,