Merge branch 'main' into 156_raw_data_location_configurable

zmaalick · web-flow · commit 95af99a92444 · 2026-03-11T10:46:08.000Z
diff --git a/CMEW/app/add_datasets/bin/add_datasets_to_share.py b/CMEW/app/add_datasets/bin/add_datasets_to_share.py
@@ -99,7 +99,7 @@ def convert_str_to_facets(section):
     return section_dict
 
 
-def add_common_facets(dataset_dict, project="CMIP6"):
+def add_common_facets(dataset_dict, project):
     """
     Add start year, end year and project to a dataset dictionary.
 
@@ -109,7 +109,6 @@ def add_common_facets(dataset_dict, project="CMIP6"):
         A dictionary containing the facets of a dataset.
     project: str
         A string indicating the project to which the dataset belongs.
-        Default is "CMIP6".
 
     Returns
     -------
@@ -127,17 +126,23 @@ def add_common_facets(dataset_dict, project="CMIP6"):
     dataset_dict["end_year"] = end_year
     dataset_dict["project"] = project
 
+    # Add MOHC as an institute only for generated CMEW runs
+    if dataset_dict["project"] == "ESMVal":
+        dataset_dict["institute"] = "MOHC"
+
     return dataset_dict
 
 
-def process_naml_file(naml_fp):
+def process_naml_file(naml_fp, project=None):
     """
     Extract the datasets and their facets from a namelist file.
 
     Parameters
     ----------
     naml_fp: str
         The file path to the namelist file containing the datasets.
+    project: str, optional
+        A string indicating the project to which the dataset belongs.
 
     Returns
     -------
@@ -148,7 +153,7 @@ def process_naml_file(naml_fp):
     sections = extract_sections_from_naml(naml_fp)
     for section in sections:
         dataset_dict = convert_str_to_facets(section)
-        dataset_dict = add_common_facets(dataset_dict)
+        dataset_dict = add_common_facets(dataset_dict, project)
         datasets.append(dataset_dict)
     return datasets
 
@@ -269,11 +274,19 @@ def use_facet_as_key(filepath, key_facet="suite_id"):
     # Loop over the namelist files in the work directory
     for basename, nl_fp in dict_namelists_in_work_dir().items():
 
-        # Extract the datasets from each file
-        datasets = process_naml_file(nl_fp)
+        # Check whether this is the model runs namelist
+        if basename == "model_runs":
+
+            # Write the datasets to a YAML file with ESMVal project
+            datasets = process_naml_file(nl_fp, "ESMVal")
+            write_datasets_to_yaml(datasets, basename, target_dir)
+
+        # Check whether this is the CMIP6 datasets namelist
+        if basename == "cmip6_datasets":
 
-        # Write the datasets to a YAML file in the target directory
-        write_datasets_to_yaml(datasets, basename, target_dir)
+            # Write the datasets to a YAML file with CMIP6 project
+            datasets = process_naml_file(nl_fp, "CMIP6")
+            write_datasets_to_yaml(datasets, basename, target_dir)
 
     # Reformat the model_runs YAML file to use suite_ids as keys
     use_facet_as_key(f"{target_dir}/model_runs.yml")
diff --git a/CMEW/app/add_datasets/bin/test_add_datasets_to_share.py b/CMEW/app/add_datasets/bin/test_add_datasets_to_share.py
@@ -133,7 +133,7 @@ def test_add_common_facets(mock_env_vars):
         "project": "CMIP6",
     }
 
-    actual = add_common_facets(dataset_dict)
+    actual = add_common_facets(dataset_dict, "CMIP6")
     assert actual == expected
 
 
@@ -161,7 +161,7 @@ def test_process_naml_file(path_to_mock_nl, mock_env_vars):
         },
     ]
 
-    actual = process_naml_file(path_to_mock_nl)
+    actual = process_naml_file(path_to_mock_nl, "CMIP6")
     assert actual == expected
 
 
diff --git a/CMEW/app/add_datasets/meta/rose-meta.conf b/CMEW/app/add_datasets/meta/rose-meta.conf
@@ -4,7 +4,28 @@
 [namelist:cmip6_datasets]
 duplicate=true
 
-[namelist:cmip6_datasets=alias]
+[namelist:cmip6_datasets=experiment_id]
+compulsory=true
+description=The experiment ID of the dataset.
+help=For example, 'historical', 'piControl' or 'abrupt4xco2'.
+sort-key=3
+type=quoted
+
+[namelist:cmip6_datasets=grid]
+compulsory=true
+description=Whether the data is on the native grid of the model.
+help=This will either be 'gn' for native grid or 'gr' for regridded.
+sort-key=5
+values="gn","gr"
+
+[namelist:cmip6_datasets=institute]
+compulsory=true
+description=The institute that produced the dataset.
+help=For example, 'MOHC' or 'EC-Earth-Consortium'.
+sort-key=2
+type=quoted
+
+[namelist:cmip6_datasets=label_for_plots]
 compulsory=false
 description=A label to use on plots for this dataset.
 help=It is recommended to limit the length of this string as much as is
@@ -16,42 +37,21 @@ pattern=^".{0,25}"$
 sort-key=6
 type=quoted
 
-[namelist:cmip6_datasets=dataset]
+[namelist:cmip6_datasets=model_id]
 compulsory=true
-description=The 'model_id' (or 'source_id') of the CMIP6 dataset.
+description=The 'source_id' of the CMIP6 dataset.
 help=For example, 'UKESM1-0-LL' or 'ACCESS-ESM1-5'.
 sort-key=1
 type=quoted
 
-[namelist:cmip6_datasets=ensemble]
+[namelist:cmip6_datasets=variant_label]
 compulsory=true
-description=Also known as 'variant label'.
+description=Also known as 'ensemble'.
 help=Must adhere to CMIP6 variant label format: r<int>i<int>p<int>f<int>.
     =For example, 'r2i1p1f3'. https://help.ceda.ac.uk/article/4801-cmip6-data
-pattern=^r[0-9]+i[0-9]+p[0-9]+f[0-9]+$
+pattern=^"r[0-9]+i[0-9]+p[0-9]+f[0-9]+"$
 sort-key=4
 
-[namelist:cmip6_datasets=exp]
-compulsory=true
-description=The experiment ID of the dataset.
-help=For example, 'historical', 'piControl' or 'abrupt4xco2'.
-sort-key=3
-type=quoted
-
-[namelist:cmip6_datasets=grid]
-compulsory=true
-description=Whether the data is on the native grid of the model.
-help=This will either be 'gn' for native grid or 'gr' for regridded.
-sort-key=5
-values="gn","gr"
-
-[namelist:cmip6_datasets=institute]
-compulsory=true
-description=The institute that produced the dataset.
-help=For example, 'MOHC' or 'EC-Earth-Consortium'.
-sort-key=2
-type=quoted
-
 [namelist:model_runs]
 duplicate=true
 
diff --git a/CMEW/app/add_datasets/rose-app.conf b/CMEW/app/add_datasets/rose-app.conf
@@ -11,12 +11,12 @@ source=(namelist:cmip6_datasets(:))
 source=(namelist:model_runs(:))
 
 [namelist:cmip6_datasets(1)]
-alias="CMIP6 dataset 001"
-dataset="UKESM1-0-LL"
-ensemble=r5i1p1f3
-exp="historical"
+experiment_id="historical"
 grid="gn"
 institute="MOHC"
+label_for_plots="CMIP6 dataset 001"
+model_id="UKESM1-0-LL"
+variant_label="r5i1p1f3"
 
 [namelist:model_runs(1)]
 calendar="gregorian"
diff --git a/CMEW/app/configure_for/bin/update_recipe_file.py b/CMEW/app/configure_for/bin/update_recipe_file.py
@@ -147,6 +147,20 @@ def add_extra_datasets(recipe, yaml_filepath):
     with open(yaml_filepath, "r") as file_handle:
         extra_datasets = yaml.safe_load(file_handle)
 
+    # ESMValTool recipes expect keys to be "dataset", "ensemble", "exp" etc.
+    variables_conversion = {
+        "label_for_plots": "alias",
+        "model_id": "dataset",
+        "variant_label": "ensemble",
+        "experiment_id": "exp",
+    }
+
+    # Convert the variable names in the extra datasets
+    for dataset in extra_datasets:
+        for old_key, new_key in variables_conversion.items():
+            if old_key in dataset:
+                dataset[new_key] = dataset.pop(old_key)
+
     # Add the datasets to the datasets section of the recipe
     recipe["datasets"].extend(extra_datasets)
 
diff --git a/CMEW/app/configure_standardise/bin/create_request_file.py b/CMEW/app/configure_standardise/bin/create_request_file.py
@@ -54,7 +54,8 @@ def create_request():
         "model_workflow_id": os.environ["SUITE_ID"],
         "model_workflow_revision": "not used except with data request",
         "start_date": f"{os.environ['START_YEAR']}-01-01T00:00:00",
-        "streams": "apm",
+        # For now there is only one stream, used for the Amon and Emon mips.
+        "streams": os.environ["STREAM_ID"],
         "variable_list_file": os.environ["VARIABLES_PATH"],
     }
     request["misc"] = {
diff --git a/CMEW/app/configure_standardise/bin/create_variables_file.py b/CMEW/app/configure_standardise/bin/create_variables_file.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# (C) Crown Copyright 2024-2025, Met Office.
+# (C) Crown Copyright 2024-2026, Met Office.
 # The LICENSE.md file contains full licensing details.
 """
 Generates the variables.txt file from the ESMValTool recipe.
@@ -52,8 +52,8 @@ def parse_variables_from_recipe(recipe_path):
         List of variables from the ESMValTool recipe,
         formatted as ``<mip>/<variable>:<stream>``.
     """
-    # For now, hard-code stream to apm, this is correct for Amon and Emon mip.
-    stream = "apm"
+    # For now there is only one stream, used for the Amon and Emon mips.
+    stream = os.environ["STREAM_ID"]
     recipe = Recipe(recipe_path)
     diagnostics = recipe.data["diagnostics"]
     formatted_variables = []
diff --git a/CMEW/app/configure_standardise/bin/test_create_request_file.py b/CMEW/app/configure_standardise/bin/test_create_request_file.py
@@ -18,6 +18,7 @@ def test_create_request(monkeypatch):
     monkeypatch.setenv("SUITE_ID", "u-az513")
     monkeypatch.setenv("VARIABLES_PATH", "/path/to/variables.txt")
     monkeypatch.setenv("VARIANT_LABEL", "r1i1p1f1")
+    monkeypatch.setenv("STREAM_ID", "apm")
 
     config = create_request()
     actual = {
diff --git a/CMEW/app/configure_standardise/bin/test_create_variables_file.py b/CMEW/app/configure_standardise/bin/test_create_variables_file.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# (C) Crown Copyright 2024-2025, Met Office.
+# (C) Crown Copyright 2024-2026, Met Office.
 # The LICENSE.md file contains full licensing details.
 """
 Tests for configure_standardise
@@ -8,7 +8,8 @@
 from pathlib import Path
 
 
-def test_parse_radiation_budget_variables():
+def test_parse_radiation_budget_variables(monkeypatch):
+    monkeypatch.setenv("STREAM_ID", "apm")
     recipe_path = (
         Path(__file__).parent.parent.parent
         / "unittest"
diff --git a/CMEW/app/unittest/kgo/extended_radiation_budget_recipe.yml b/CMEW/app/unittest/kgo/extended_radiation_budget_recipe.yml
@@ -9,7 +9,7 @@ datasets:
   dataset: HadGEM3-GC31-LL
   end_year: 1993
   ensemble: r1i1p1f3
-  exp: amip
+  exp: historical
   grid: gn
   institute: MOHC
   project: ESMVal
diff --git a/CMEW/flow.cylc b/CMEW/flow.cylc
@@ -121,6 +121,7 @@
             ROSE_TASK_APP = configure_standardise
             START_YEAR = {{ START_YEAR }}
             NUMBER_OF_YEARS = {{ NUMBER_OF_YEARS }}
+            STREAM_ID = "apm"
 
     [[standardise_model_data]]
         inherit = STANDARDISE, MODEL_RUNS

Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,7 @@ def test_add_common_facets(mock_env_vars):`
`133`	`133`	`"project": "CMIP6",`
`134`	`134`	`}`
`135`	`135`
`136`		`- actual = add_common_facets(dataset_dict)`
	`136`	`+ actual = add_common_facets(dataset_dict, "CMIP6")`
`137`	`137`	`assert actual == expected`
`138`	`138`
`139`	`139`
`@@ -161,7 +161,7 @@ def test_process_naml_file(path_to_mock_nl, mock_env_vars):`
`161`	`161`	`},`
`162`	`162`	`]`
`163`	`163`
`164`		`- actual = process_naml_file(path_to_mock_nl)`
	`164`	`+ actual = process_naml_file(path_to_mock_nl, "CMIP6")`
`165`	`165`	`assert actual == expected`
`166`	`166`
`167`	`167`