Use env for repro tests, improve repro names
Fixes for gpu repro tests, auto-compare all state variables

Improve error message, increment ref counter

Fix zero_dict calls

Fix dict init

Fixes to zero_dict

Improve debug info
charleskawczynski committed Nov 12, 2024
1 parent 682335f commit f802f8b
Showing 19 changed files with 292 additions and 747 deletions.
117 changes: 111 additions & 6 deletions .buildkite/pipeline.yml

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions config/default_configs/default_config.yml
@@ -217,9 +217,6 @@ non_orographic_gravity_wave:
nh_poly:
help: "Horizontal polynomial degree. Note: The number of quadrature points in 1D within each horizontal element is then Nq = <--nh_poly> + 1"
value: 3
reproducibility_test:
help: "(Bool) perform reproducibility test"
value: false
check_conservation:
help: "Check conservation of mass and energy [`false` (default), `true`]"
value: false
@@ -1,6 +1,5 @@
precip_model: "0M"
dt_save_state_to_disk: "2days"
reproducibility_test: true
initial_condition: "MoistBaroclinicWave"
dt: "450secs"
t_end: "10days"
1 change: 0 additions & 1 deletion config/model_configs/diagnostic_edmfx_aquaplanet.yml
@@ -19,6 +19,5 @@ cloud_model: "quadrature_sgs"
precip_model: 1M
dt: 120secs
t_end: 3hours
reproducibility_test: true
toml: [toml/diagnostic_edmfx.toml]
ode_algo: ARS343
1 change: 0 additions & 1 deletion config/model_configs/single_column_precipitation_test.yml
@@ -14,7 +14,6 @@ precip_model: "1M"
vert_diff: "FriersonDiffusion"
implicit_diffusion: true
approximate_linear_solve_iters: 2
reproducibility_test: false
toml: [toml/single_column_precipitation_test.toml]
diagnostics:
- short_name: [hus, clw, cli, husra, hussn, ta, wa]
@@ -16,7 +16,6 @@ cloud_model: "grid_scale"
surface_temperature: "ZonallyAsymmetric"
moist: "equil"
albedo_model: "RegressionFunctionAlbedo"
reproducibility_test: true
aerosol_radiation: true
prescribed_aerosols: ["CB1", "CB2", "DST01", "DST02", "DST03", "DST04", "OC1", "OC2", "SO4"]
toml: [toml/sphere_aquaplanet.toml]
@@ -1,6 +1,5 @@
precip_model: "0M"
dt_save_state_to_disk: "2days"
reproducibility_test: true
initial_condition: "MoistBaroclinicWave"
dt: "450secs"
t_end: "10days"
@@ -9,6 +9,5 @@ t_end: "4days"
vert_diff: true
forcing: "held_suarez"
precip_model: "0M"
reproducibility_test: true
moist: "equil"
toml: [toml/sphere_held_suarez.toml]
15 changes: 4 additions & 11 deletions examples/hybrid/driver.jl
@@ -87,8 +87,8 @@ end
include(
joinpath(@__DIR__, "..", "..", "reproducibility_tests", "mse_tables.jl"),
)
if config.parsed_args["reproducibility_test"]
# Test results against main branch
if get(ENV, "test_reproducibility", "false") == "true"
# Export reproducibility results, to later test against the main branch
include(
joinpath(
@__DIR__,
@@ -98,17 +98,10 @@
"reproducibility_tests.jl",
),
)
@testset "Test reproducibility table entries" begin
mse_keys = sort(collect(keys(all_best_mse[simulation.job_id])))
pcs = collect(Fields.property_chains(sol.u[end]))
for prop_chain in mse_keys
@test prop_chain in pcs
end
end
perform_reproducibility_tests(
export_reproducibility_results(
config.comms_ctx,
simulation.job_id,
sol.u[end],
all_best_mse,
simulation.output_dir,
)
end
17 changes: 7 additions & 10 deletions reproducibility_tests/README.md
@@ -64,9 +64,8 @@ To update the mse tables:

To add a new reproducibility test:

- Set the command-line `reproducibility_test` to true, and add `julia --color=yes --project=examples reproducibility_tests/test_mse.jl --job_id [job_id] --out_dir [job_id]` as a separate command for the new (or existing) job
- Copy the `all_best_mse` dict template from the job's log
- Paste the `all_best_mse` dict template into `reproducibility_test/mse_tables.jl`
- Add `julia --color=yes --project=examples reproducibility_tests/test_mse.jl --job_id [job_id] --out_dir [job_id]` as a separate command for the new (or existing) job, and set the `test_reproducibility` environment variable for that job (for example, `test_reproducibility: "true"`).
- Add the job's `job_id` to the `reproducibility_test_job_ids` vector in `reproducibility_tests/mse_tables.jl`, as sketched below.
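
For illustration, a minimal sketch of what the `reproducibility_test_job_ids` vector in `mse_tables.jl` might look like after adding a job; the job ids below are placeholders, not entries from the actual table:

```julia
# reproducibility_tests/mse_tables.jl (sketch; job ids are illustrative)
reproducibility_test_job_ids = [
    "sphere_held_suarez_rhoe_equilmoist",  # existing jobs stay as they are
    "my_new_job_id",                       # the newly added job
]
```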

<!-- TODO: improve names / mark off sections for all_best_mse dict -->

@@ -90,19 +89,17 @@ We cannot (easily) compare the output with a reference if we change the spatial

## A detailed procedure of how reproducibility tests are performed

Reproducibility tests are performed at the end of `examples/hybrid/driver.jl`, after a simulation completes, and rely on a unique job id (`job_id`). Here is an outline of the reproducibility test procedure:
Reproducibility results are computed at the end of the `examples/hybrid/driver.jl` script and tested in `reproducibility_tests/test_mse.jl`. This separation lets us compute the results during the simulation job and delay the comparison to a separate test step. Here is an outline of the reproducibility test procedure; a sketch of the driver-side hook follows the list:

0) Run a simulation, with a particular `job_id`, to the final time.
1) Load a dictionary, `all_best_mse`, of previous "best" mean-squared errors from `mse_tables.jl` and extract the mean squared errors for the given `job_id` (store in job-specific dictionary, `best_mse`).
2) Export the solution (a `FieldVector`) at the final simulation time to an `NCDataset` file.
3) Compute the errors between the exported solution and the exported solution from the reference `NCDataset` files (which are saved in dedicated folders on the Caltech Central cluster) and save them into a dictionary called `computed_mse`.
4) Export this dictionary (`computed_mse`) to the output folder.
5) Test that `computed_mse` is no worse than `best_mse` (determines if reproducibility test passes or not).
1) Export the solution (a `FieldVector`) at the final simulation time to an HDF5 file.
2) Compute the mean-squared errors (MSE) of all `FieldVector` variables in the prognostic state against all other comparable references (which are saved in dedicated folders on the Caltech Central cluster).
3) Convert this set of MSEs to a dictionary (called `computed_mse`), and export it to a file in the output folder.
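
As a rough sketch (using the names from the `examples/hybrid/driver.jl` diff above), the driver-side hook that triggers these steps looks roughly like this; the argument comments reflect the procedure described above rather than a documented API:

```julia
# Sketch of the hook at the end of examples/hybrid/driver.jl; `config`,
# `simulation`, and `sol` are the driver's existing objects.
if get(ENV, "test_reproducibility", "false") == "true"
    export_reproducibility_results(
        config.comms_ctx,       # communication context used to read/write HDF5
        simulation.job_id,      # unique job id for this configuration
        sol.u[end],             # prognostic FieldVector at the final time
        simulation.output_dir,  # where the exported results are written
    )
end
```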

After these steps are performed at the end of the driver, additional jobs are run:

1) Print `computed_mse` for all jobs to make updating `reproducibility_tests/mse_tables.jl` easy.
2) If we're on the GitHub merge queue branch (all tests have passed, and the PR is effectively merging), move the `NCDataset`s from the scratch directory onto the dedicated folder on the Caltech Central cluster.
2) If we're on the GitHub merge queue branch (all tests have passed, and the PR is effectively merging), move the HDF5 files from the scratch directory onto the dedicated folder on the Caltech Central cluster.

## How we track which dataset to compare against

184 changes: 63 additions & 121 deletions reproducibility_tests/compute_mse.jl
@@ -5,67 +5,67 @@ import ClimaCoreTempestRemap as CCTR

include("latest_comparable_paths.jl")

function get_nc_data(ds, var::String)
if haskey(ds, var)
return ds[var]
else
for key in keys(ds.group)
if haskey(ds.group[key], var)
return ds.group[key][var]
end
end
"""
to_dict(filename::String, comms_ctx)
Convert the HDF5 file containing the
prognostic field `Y` into a `Dict`
using ClimaCore's `property_chains` and
`single_field` functions.
"""
function to_dict(filename::String, comms_ctx)
dict = Dict{Any, AbstractArray}()
reader = InputOutput.HDF5Reader(filename, comms_ctx)
Y = InputOutput.read_field(reader, "Y")
Base.close(reader)
for prop_chain in Fields.property_chains(Y)
dict[prop_chain] =
vec(Array(parent(Fields.single_field(Y, prop_chain))))
end
error("No key $var for mse computation.")
return nothing
return dict
end

"""
to_dict(nc_filename::String, reference_keys::Vector{String})
zero_dict(filename::String, comms_ctx)
Convert an NCDatasets file to a `Dict`.
Return a dict of zeros for all `ClimaCore.Fields.property_chains`
in the fieldvector `Y` contained in the HDF5 file `filename`.
"""
function to_dict(nc_filename::String, reference_keys::Vector{String})
dict = Dict{String, AbstractArray}()
NCDatasets.Dataset(nc_filename, "r") do ds
for key in reference_keys
dict[key] = vec(Array(get_nc_data(ds, key)))
end
function zero_dict(filename::String, comms_ctx)
dict = Dict{Any, AbstractArray}()
reader = InputOutput.HDF5Reader(filename, comms_ctx)
Y = InputOutput.read_field(reader, "Y")
Base.close(reader)
for prop_chain in Fields.property_chains(Y)
dict[prop_chain] =
vec(Array(parent(Fields.single_field(Y, prop_chain)))) .* 0
end
return dict
end

"""
reproducibility_test(;
reproducibility_results(
comms_ctx;
job_id,
reference_mse,
ds_filename_computed,
ds_filename_reference = nothing,
varname,
)
Returns a `Dict` of mean-squared errors between
`NCDataset`s `ds_filename_computed` and
`ds_filename_reference` for all keys in `reference_mse`.
Keys in `reference_mse` may directly map to keys in
the `NCDataset`s, or they may be mapped to the keys
via `varname`.
datasets `ds_filename_computed` and
`ds_filename_reference` for all variables.
If running on buildkite, we get `ds_filename_reference`
from the latest merged dataset on Caltech central.
"""
function reproducibility_test(;
job_id,
reference_mse,
ds_filename_computed,
varname,
)
function reproducibility_results(comms_ctx; job_id, ds_filename_computed)
local ds_filename_reference
reference_keys = map(k -> varname(k), collect(keys(reference_mse)))
paths = String[] # initialize for later handling

if haskey(ENV, "BUILDKITE_COMMIT")
paths = latest_comparable_paths(10)
isempty(paths) && return (reference_mse, paths)
isempty(paths) &&
return (zero_dict(ds_filename_computed, comms_ctx), paths)
@info "`ds_filename_computed`: `$ds_filename_computed`"
ds_filename_references =
map(p -> joinpath(p, ds_filename_computed), paths)
@@ -94,40 +94,41 @@ function reproducibility_test(;
@warn "There is no reference dataset, and no NC tar file."
end
end
if !isfile(ds_filename_reference)
msg = "\n\n"
msg *= "Pull request author:\n"
msg *= " It seems that a new dataset,\n"
msg *= "\n"
msg *= "dataset file:`$(ds_filename_computed)`,"
msg *= "\n"
msg *= " was created, or the name of the dataset\n"
msg *= " has changed. Please increment the reference\n"
msg *= " counter in `reproducibility_tests/ref_counter.jl`.\n"
msg *= "\n"
msg *= " If this is not the case, then please\n"
msg *= " open an issue with a link pointing to this\n"
msg *= " PR and build.\n"
msg *= "\n"
msg *= "For more information, please find\n"
msg *= "`reproducibility_tests/README.md` and read the section\n\n"
msg *= " `How to merge pull requests (PR) that get approved\n"
msg *= " but *break* reproducibility tests`\n\n"
msg *= "for how to merge this PR."
error(msg)
end
end
non_existent_files = filter(x -> !isfile(x), ds_filename_references)
if !isempty(non_existent_files)
msg = "\n\n"
msg *= "Pull request author:\n"
msg *= " It seems that a new dataset,\n"
msg *= "\n"
msg *= "dataset file(s):`$(non_existent_files)`,"
msg *= "\n"
msg *= " was created, or the name of the dataset\n"
msg *= " has changed. Please increment the reference\n"
msg *= " counter in `reproducibility_tests/ref_counter.jl`.\n"
msg *= "\n"
msg *= " If this is not the case, then please\n"
msg *= " open an issue with a link pointing to this\n"
msg *= " PR and build.\n"
msg *= "\n"
msg *= "For more information, please find\n"
msg *= "`reproducibility_tests/README.md` and read the section\n\n"
msg *= " `How to merge pull requests (PR) that get approved\n"
msg *= " but *break* reproducibility tests`\n\n"
msg *= "for how to merge this PR."
error(msg)
end
else
@warn "Buildkite not detected. Skipping reproducibility tests."
@info "Please review output results before merging."
return (reference_mse, paths)
return (zero_dict(ds_filename_computed, comms_ctx), paths)
end

local computed_mse
@info "Prescribed reference keys $reference_keys"
dict_computed = to_dict(ds_filename_computed, reference_keys)
dict_references =
map(ds -> to_dict(ds, reference_keys), ds_filename_references)
dict_computed = to_dict(ds_filename_computed, comms_ctx)
dict_references = map(ds -> to_dict(ds, comms_ctx), ds_filename_references)
reference_keys = keys(first(dict_references))
@info "Reference keys $reference_keys"
@info "Computed keys $(collect(keys(dict_computed)))"
@info "Reference keys $(collect(keys(first(dict_references))))"
if all(dr -> keys(dict_computed) == keys(dr), dict_references) && all(
@@ -153,62 +153,3 @@
return (computed_mses, paths)

end


##### TODO: move below functions to ClimaCore

function first_center_space(fv::Fields.FieldVector)
for prop_chain in Fields.property_chains(fv)
f = Fields.single_field(fv, prop_chain)
space = axes(f)
if space isa Spaces.CenterExtrudedFiniteDifferenceSpace
return space
end
end
error("Unfound space")
end

function first_face_space(fv::Fields.FieldVector)
for prop_chain in Fields.property_chains(fv)
f = Fields.single_field(fv, prop_chain)
space = axes(f)
if space isa Spaces.FaceExtrudedFiniteDifferenceSpace
return space
end
end
error("Unfound space")
end

function export_nc(
Y::Fields.FieldVector;
nc_filename,
t_now = 0.0,
center_space = first_center_space,
face_space = first_face_space,
filter_prop_chain = pn -> true, # use all fields
varname::Function,
)
prop_chains = Fields.property_chains(Y)
filter!(filter_prop_chain, prop_chains)
cspace = center_space(Y)
fspace = face_space(Y)
# create a temporary dir for intermediate data
FT = eltype(Y)
NCDatasets.NCDataset(nc_filename, "c") do nc
# defines the appropriate dimensions and variables for a space coordinate
# defines the appropriate dimensions and variables for a time coordinate (by default, unlimited size)
nc_time = CCTR.def_time_coord(nc)
CCTR.def_space_coord(nc, cspace, type = "cgll")
CCTR.def_space_coord(nc, fspace, type = "cgll")
# define variables for the prognostic states
for prop_chain in Fields.property_chains(Y)
f = Fields.single_field(Y, prop_chain)
space = axes(f)
nc_var = CCTR.defVar(nc, varname(prop_chain), FT, space, ("time",))
nc_var[:, 1] = f
end
# TODO: interpolate w onto center space and save it the same way as the other vars
nc_time[1] = t_now
end
return nothing
end
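
For reference, a hypothetical call to the new `reproducibility_results` function might look like the following; the job id and file name are placeholders, and the exact shape of the returned MSE data is not spelled out in this diff:

```julia
# Hypothetical usage (placeholder names); compares the locally exported
# prognostic state against the latest comparable reference datasets.
computed_mses, paths = reproducibility_results(
    comms_ctx;
    job_id = "my_job_id",
    ds_filename_computed = "prog_state.hdf5",
)
# `paths` lists the reference directories that were compared against;
# `computed_mses` holds the mean-squared errors of the prognostic
# variables (keyed by property chain) against that reference data.
```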
10 changes: 3 additions & 7 deletions reproducibility_tests/move_output.jl
@@ -2,12 +2,8 @@
include(joinpath(@__DIR__, "latest_comparable_paths.jl"))
paths = latest_comparable_paths()

all_lines = readlines(joinpath(@__DIR__, "mse_tables.jl"))
lines = deepcopy(all_lines)
filter!(x -> occursin("] = OrderedCollections", x), lines)
job_ids = getindex.(split.(lines, "\""), 2)
@assert count(x -> occursin("OrderedDict", x), all_lines) == length(job_ids) + 1
@assert length(job_ids) > 0 # safety net
include(joinpath(@__DIR__, "mse_tables.jl"))
job_ids = reproducibility_test_job_ids

# Note: cluster_data_prefix is also defined in compute_mse.jl
cluster_data_prefix = "/central/scratch/esm/slurm-buildkite/climaatmos-main"
@@ -21,7 +17,7 @@ if buildkite_ci
@info "commit = $(commit)"

using Glob
@show readdir(joinpath(@__DIR__, ".."))
# @show readdir(joinpath(@__DIR__, ".."))
# if a contributor manually merged, we still want to move data
# from scratch to `cluster_data_prefix`. So, let's also try moving
# data if this is running on the main branch.