From 8874fb78bfbde7afef72445208a14c2f1d7fbdea Mon Sep 17 00:00:00 2001
From: Aliaksandr Yakutovich
Date: Wed, 13 Mar 2024 16:54:46 +0100
Subject: [PATCH] Implement output trajectory merge in Cp2kBaseWorkChain (#209)

* Implement output trajectory merge in Cp2kBaseWorkChain

* Add a unit test for the merge_trajectory_data function.

* Add an example of the MD restart.

* Modify the reftraj example to facilitate the check of trajectories.

---------

Co-authored-by: Carlo Antonio Pignedoli
---
 Dockerfile                                 |   1 +
 README.md                                  |   2 +-
 aiida_cp2k/parsers/__init__.py             |  10 +-
 aiida_cp2k/utils/__init__.py               |   6 +
 aiida_cp2k/utils/datatype_helpers.py       | 110 +++++++++---
 aiida_cp2k/workchains/base.py              |  20 +++
 .../example_base_md_reftraj_restart.py     |  39 ++++-
 .../workchains/example_base_md_restart.py  | 158 ++++++++++++++++++
 test/test_datatype_helpers.py              |  60 +++++++
 9 files changed, 373 insertions(+), 33 deletions(-)
 create mode 100644 examples/workchains/example_base_md_restart.py
 create mode 100644 test/test_datatype_helpers.py

diff --git a/Dockerfile b/Dockerfile
index cc0efbd..e245149 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,6 +5,7 @@
 # For further information on the license, see the LICENSE.txt file.          #
 ###############################################################################
+
 FROM aiidateam/aiida-core-with-services:2.5.0

 # To prevent the container from exiting prematurely.
diff --git a/README.md b/README.md
index 37c89eb..498bf8a 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ docker build -t aiida_cp2k_test .
 Then, you can launch the container:

 ```bash
-DOKERID=`docker run -it aiida_cp2k_test`
+DOKERID=`docker run -d aiida_cp2k_test`
 ```

 This will remember the container ID in the variable `DOKERID`. You can then run the tests with the following command:
diff --git a/aiida_cp2k/parsers/__init__.py b/aiida_cp2k/parsers/__init__.py
index 8b7bc18..440231b 100644
--- a/aiida_cp2k/parsers/__init__.py
+++ b/aiida_cp2k/parsers/__init__.py
@@ -145,12 +145,19 @@ def _parse_trajectory(self, structure):

         positions_traj = []
         stepids_traj = []
+        energies_traj = []
         for frame in parse(output_xyz_pos):
             _, positions = zip(*frame["atoms"])
             positions_traj.append(positions)
-            stepids_traj.append(int(frame["comment"].split()[2][:-1]))
+            comment_split = frame["comment"].split(",")
+            stepids_traj.append(int(comment_split[0].split()[-1]))
+            energy_index = next(
+                (i for i, s in enumerate(comment_split) if "E =" in s), None
+            )
+            energies_traj.append(float(comment_split[energy_index].split()[-1]))
         positions_traj = np.array(positions_traj)
         stepids_traj = np.array(stepids_traj)
+        energies_traj = np.array(energies_traj)

         cell_traj = None
         cell_traj_fname = self.node.process_class._DEFAULT_TRAJECT_CELL_FILE_NAME
@@ -190,6 +197,7 @@ def _parse_trajectory(self, structure):
             symbols=symbols,
             positions=positions_traj,
         )
+        trajectory.set_array("energies", energies_traj)
         if forces_traj is not None:
             trajectory.set_array("forces", forces_traj)
diff --git a/aiida_cp2k/utils/__init__.py b/aiida_cp2k/utils/__init__.py
index 285483d..fc9d772 100644
--- a/aiida_cp2k/utils/__init__.py
+++ b/aiida_cp2k/utils/__init__.py
@@ -6,6 +6,10 @@
 ###############################################################################
 """AiiDA-CP2K utils"""

+from .datatype_helpers import (
+    merge_trajectory_data_non_unique,
+    merge_trajectory_data_unique,
+)
 from .input_generator import (
     Cp2kInput,
     add_ext_restart_section,
@@ -42,4 +46,6 @@
     "merge_Dict",
     "ot_has_small_bandgap",
     "resize_unit_cell",
+    "merge_trajectory_data_unique",
"merge_trajectory_data_non_unique", ] diff --git a/aiida_cp2k/utils/datatype_helpers.py b/aiida_cp2k/utils/datatype_helpers.py index 8c12e9e..041bad6 100644 --- a/aiida_cp2k/utils/datatype_helpers.py +++ b/aiida_cp2k/utils/datatype_helpers.py @@ -9,8 +9,8 @@ import re from collections.abc import Sequence -from aiida.common import InputValidationError -from aiida.plugins import DataFactory +import numpy as np +from aiida import common, engine, orm, plugins def _unpack(adict): @@ -50,7 +50,9 @@ def _kind_element_from_kind_section(section): try: kind = section["_"] except KeyError: - raise InputValidationError("No default parameter '_' found in KIND section.") + raise common.InputValidationError( + "No default parameter '_' found in KIND section." + ) try: element = section["ELEMENT"] @@ -60,7 +62,7 @@ def _kind_element_from_kind_section(section): try: element = match["sym"] except TypeError: - raise InputValidationError( + raise common.InputValidationError( f"Unable to figure out atomic symbol from KIND '{kind}'." ) @@ -125,7 +127,7 @@ def _write_gdt(inp, entries, folder, key, fname): def validate_basissets_namespace(basissets, _): """A input_namespace validator to ensure passed down basis sets have the correct type.""" return _validate_gdt_namespace( - basissets, DataFactory("gaussian.basisset"), "basis set" + basissets, plugins.DataFactory("gaussian.basisset"), "basis set" ) @@ -176,7 +178,7 @@ def validate_basissets(inp, basissets, structure): bsets = [(t, b) for t, s, b in basissets if s == element] if not bsets: - raise InputValidationError( + raise common.InputValidationError( f"No basis set found for kind {kind} or element {element}" f" in basissets input namespace and not explicitly set." ) @@ -203,7 +205,7 @@ def validate_basissets(inp, basissets, structure): bsets = [(t, b) for t, s, b in basissets if s == element] if not bsets: - raise InputValidationError( + raise common.InputValidationError( f"'BASIS_SET {bstype} {bsname}' for element {element} (from kind {kind})" " not found in basissets input namespace" ) @@ -213,7 +215,7 @@ def validate_basissets(inp, basissets, structure): basissets_used.add(bset) break else: - raise InputValidationError( + raise common.InputValidationError( f"'BASIS_SET {bstype} {bsname}' for element {element} (from kind {kind})" " not found in basissets input namespace" ) @@ -222,14 +224,14 @@ def validate_basissets(inp, basissets, structure): if not structure and any( bset not in basissets_used for bset in basissets_specified ): - raise InputValidationError( + raise common.InputValidationError( "No explicit structure given and basis sets not referenced in input" ) if isinstance(inp["FORCE_EVAL"], Sequence) and any( kind.name not in explicit_kinds for kind in structure.kinds ): - raise InputValidationError( + raise common.InputValidationError( "Automated BASIS_SET keyword creation is not yet supported with multiple FORCE_EVALs." " Please explicitly reference a BASIS_SET for each KIND." ) @@ -250,13 +252,13 @@ def validate_basissets(inp, basissets, structure): bsets = [(t, b) for t, s, b in basissets if s == kind.symbol] if not bsets: - raise InputValidationError( + raise common.InputValidationError( f"No basis set found in the given basissets for kind '{kind.name}' of your structure." ) for _, bset in bsets: if bset.element != kind.symbol: - raise InputValidationError( + raise common.InputValidationError( f"Basis set '{bset.name}' for '{bset.element}' specified" f" for kind '{kind.name}' (of '{kind.symbol}')." 
                 )
@@ -274,7 +276,7 @@ def validate_basissets(inp, basissets, structure):

     for bset in basissets_specified:
         if bset not in basissets_used:
-            raise InputValidationError(
+            raise common.InputValidationError(
                 f"Basis set '{bset.name}' ('{bset.element}') specified in the basissets"
                 f" input namespace but not referenced by either input or structure."
             )
@@ -287,7 +289,9 @@ def write_basissets(inp, basissets, folder):

 def validate_pseudos_namespace(pseudos, _):
     """An input_namespace validator to ensure passed down pseudopotentials have the correct type."""
-    return _validate_gdt_namespace(pseudos, DataFactory("gaussian.pseudo"), "pseudo")
+    return _validate_gdt_namespace(
+        pseudos, plugins.DataFactory("gaussian.pseudo"), "pseudo"
+    )


 def validate_pseudos(inp, pseudos, structure):
@@ -318,7 +322,7 @@ def validate_pseudos(inp, pseudos, structure):
            try:
                pseudo = pseudos[element]
            except KeyError:
-                raise InputValidationError(
+                raise common.InputValidationError(
                    f"No pseudopotential found for kind {kind} or element {element}"
                    f" in pseudos input namespace and not explicitly set."
                )
@@ -335,19 +339,19 @@ def validate_pseudos(inp, pseudos, structure):
            try:
                pseudo = pseudos[element]
            except KeyError:
-                raise InputValidationError(
+                raise common.InputValidationError(
                    f"'POTENTIAL {ptype} {pname}' for element {element} (from kind {kind})"
                    " not found in pseudos input namespace"
                )

            if pname not in pseudo.aliases:
-                raise InputValidationError(
+                raise common.InputValidationError(
                    f"'POTENTIAL {ptype} {pname}' for element {element} (from kind {kind})"
                    " not found in pseudos input namespace"
                )

            if pseudo.element != element:
-                raise InputValidationError(
+                raise common.InputValidationError(
                    f"Pseudopotential '{pseudo.name}' for '{pseudo.element}' specified"
                    f" for element '{element}'."
                )
@@ -358,14 +362,14 @@ def validate_pseudos(inp, pseudos, structure):
     if not structure and any(
         pseudo not in pseudos_used for pseudo in pseudos_specified
     ):
-        raise InputValidationError(
+        raise common.InputValidationError(
             "No explicit structure given and pseudo not referenced in input"
         )

     if isinstance(inp["FORCE_EVAL"], Sequence) and any(
         kind.name not in explicit_kinds for kind in structure.kinds
     ):
-        raise InputValidationError(
+        raise common.InputValidationError(
             "Automated POTENTIAL keyword creation is not yet supported with multiple FORCE_EVALs."
             " Please explicitly reference a POTENTIAL for each KIND."
         )
@@ -383,13 +387,13 @@ def validate_pseudos(inp, pseudos, structure):
         try:
             pseudo = pseudos[kind.symbol]
         except KeyError:
-            raise InputValidationError(
+            raise common.InputValidationError(
                 f"No basis set found in the given basissets"
                 f" for kind '{kind.name}' (or '{kind.symbol}') of your structure."
             )

         if pseudo.element != kind.symbol:
-            raise InputValidationError(
+            raise common.InputValidationError(
                 f"Pseudopotential '{pseudo.name}' for '{pseudo.element}' specified"
                 f" for kind '{kind.name}' (of '{kind.symbol}')."
             )
@@ -402,7 +406,7 @@ def validate_pseudos(inp, pseudos, structure):

     for pseudo in pseudos_specified:
         if pseudo not in pseudos_used:
-            raise InputValidationError(
+            raise common.InputValidationError(
                 f"Pseudopotential '{pseudo.name}' specified in the pseudos input namespace"
                 f" but not referenced by either input or structure."
             )
@@ -411,3 +415,63 @@ def validate_pseudos(inp, pseudos, structure):
 def write_pseudos(inp, pseudos, folder):
     """Writes the unified POTENTIAL file with the used pseudos"""
     _write_gdt(inp, pseudos, folder, "POTENTIAL_FILE_NAME", "POTENTIAL")
+
+
+def _merge_trajectories_into_dictionary(*trajectories, unique_stepids=False):
+    if not trajectories:
+        return None
+    final_trajectory_dict = {}
+
+    array_names = trajectories[0].get_arraynames()
+
+    for array_name in array_names:
+        if any(array_name not in traj.get_arraynames() for traj in trajectories):
+            raise ValueError(
+                f"Array name '{array_name}' not found in all trajectories."
+            )
+        merged_array = np.concatenate(
+            [traj.get_array(array_name) for traj in trajectories], axis=0
+        )
+        final_trajectory_dict[array_name] = merged_array
+
+    # If unique_stepids is True, we only keep the unique stepids.
+    # The other arrays are then also reduced to the unique stepids.
+    if unique_stepids:
+        stepids = np.concatenate([traj.get_stepids() for traj in trajectories], axis=0)
+        final_trajectory_dict["stepids"], unique_indices = np.unique(
+            stepids, return_index=True
+        )
+
+        for array_name in array_names:
+            final_trajectory_dict[array_name] = final_trajectory_dict[array_name][
+                unique_indices
+            ]
+
+    return final_trajectory_dict
+
+
+def _dictionary_to_trajectory(trajectory_dict, symbols):
+    final_trajectory = orm.TrajectoryData()
+    final_trajectory.set_trajectory(
+        symbols=symbols, positions=trajectory_dict.pop("positions")
+    )
+    for array_name, array in trajectory_dict.items():
+        final_trajectory.set_array(array_name, array)
+
+    return final_trajectory
+
+
+@engine.calcfunction
+def merge_trajectory_data_unique(*trajectories):
+    trajectory_dict = _merge_trajectories_into_dictionary(
+        *trajectories, unique_stepids=True
+    )
+    return _dictionary_to_trajectory(trajectory_dict, trajectories[0].symbols)
+
+
+@engine.calcfunction
+def merge_trajectory_data_non_unique(*trajectories):
+    trajectory_dict = _merge_trajectories_into_dictionary(
+        *trajectories, unique_stepids=False
+    )
+    return _dictionary_to_trajectory(trajectory_dict, trajectories[0].symbols)
diff --git a/aiida_cp2k/workchains/base.py b/aiida_cp2k/workchains/base.py
index 1ca625f..1f26b78 100644
--- a/aiida_cp2k/workchains/base.py
+++ b/aiida_cp2k/workchains/base.py
@@ -46,11 +46,31 @@ def setup(self):
         super().setup()
         self.ctx.inputs = common.AttributeDict(self.exposed_inputs(Cp2kCalculation, 'cp2k'))

+    def _collect_all_trajectories(self):
+        """Collect all trajectories from the children calculations."""
+        trajectories = []
+        for called in self.ctx.children:
+            if isinstance(called, orm.CalcJobNode):
+                try:
+                    trajectories.append(called.outputs.output_trajectory)
+                except AttributeError:
+                    pass
+        return trajectories
+
     def results(self):
         super().results()
         if self.inputs.cp2k.parameters != self.ctx.inputs.parameters:
             self.out('final_input_parameters', self.ctx.inputs.parameters)

+        trajectories = self._collect_all_trajectories()
+        if trajectories:
+            self.report("Work chain completed successfully, collecting all trajectories")
+            if self.ctx.inputs.parameters.get("GLOBAL", {}).get("RUN_TYPE") == "GEO_OPT":
+                output_trajectory = utils.merge_trajectory_data_non_unique(*trajectories)
+            else:
+                output_trajectory = utils.merge_trajectory_data_unique(*trajectories)
+            self.out("output_trajectory", output_trajectory)
+
     def overwrite_input_structure(self):
         if "output_structure" in self.ctx.children[self.ctx.iteration-1].outputs:
             self.ctx.inputs.structure = self.ctx.children[self.ctx.iteration-1].outputs.output_structure
diff --git a/examples/workchains/example_base_md_reftraj_restart.py b/examples/workchains/example_base_md_reftraj_restart.py
index b0cbcfc..91b38c4 100644
--- a/examples/workchains/example_base_md_reftraj_restart.py
+++ b/examples/workchains/example_base_md_reftraj_restart.py
@@ -7,7 +7,6 @@
 """An example testing the restart calculation handler for an MD reftraj run in CP2K."""

 import os
-import random
 import sys

 import ase.io
@@ -44,14 +43,9 @@ def example_base(cp2k_code):

     # Trajectory.
     steps = 20
-    positions = np.array(
-        [[[2, 2, 2.73 + 0.05 * random.random()], [2, 2, 2]] for i in range(steps)]
-    )
+    positions = np.array([[[2, 2, 2.73 + 0.01 * i], [2, 2, 2]] for i in range(steps)])
     cells = np.array(
-        [
-            [[4, 0, 0], [0, 4, 0], [0, 0, 4.75 + 0.05 * random.random()]]
-            for i in range(steps)
-        ]
+        [[[4, 0, 0], [0, 4, 0], [0, 0, 4.75 + 0.01 * i]] for i in range(steps)]
     )
     symbols = ["H", "H"]
     trajectory = TrajectoryData()
@@ -172,6 +166,8 @@ def example_base(cp2k_code):
             "ERROR, EXT_RESTART section is NOT present in the final_input_parameters."
         )
         sys.exit(1)
+
+    # Check stepids extracted from each individual calculation.
     stepids = np.concatenate(
         [
             called.outputs.output_trajectory.get_stepids()
@@ -188,6 +184,33 @@ def example_base(cp2k_code):
         )
         sys.exit(1)

+    # Check the final trajectory.
+    final_trajectory = outputs["output_trajectory"]
+
+    if np.all(final_trajectory.get_stepids() == np.arange(1, steps + 1)):
+        print("OK, final trajectory stepids are correct.")
+    else:
+        print(
+            f"ERROR, final trajectory stepids are NOT correct. Expected: {np.arange(1, steps + 1)} but got: {final_trajectory.get_stepids()}"
+        )
+        sys.exit(1)
+
+    if final_trajectory.get_positions().shape == (steps, len(structure.sites), 3):
+        print("OK, the shape of the positions array is correct.")
+    else:
+        print(
+            f"ERROR, the shape of the positions array is NOT correct. Expected: {(steps, len(structure.sites), 3)} but got: {final_trajectory.get_positions().shape}"
+        )
+        sys.exit(1)
+
+    if final_trajectory.get_cells().shape == (steps, 3, 3):
+        print("OK, the shape of the cells array is correct.")
+    else:
+        print(
+            f"ERROR, the shape of the cells array is NOT correct. Expected: {(steps, 3, 3)} but got: {final_trajectory.get_cells().shape}"
+        )
+        sys.exit(1)
+

 @click.command("cli")
 @click.argument("codelabel")
diff --git a/examples/workchains/example_base_md_restart.py b/examples/workchains/example_base_md_restart.py
new file mode 100644
index 0000000..b20574f
--- /dev/null
+++ b/examples/workchains/example_base_md_restart.py
@@ -0,0 +1,158 @@
+###############################################################################
+# Copyright (c), The AiiDA-CP2K authors.                                      #
+# SPDX-License-Identifier: MIT                                                #
+# AiiDA-CP2K is hosted on GitHub at https://github.com/aiidateam/aiida-cp2k   #
+# For further information on the license, see the LICENSE.txt file.           #
# +############################################################################### +"""An example testing the restart calculation handler for geo_opt run in CP2K.""" + +import os +import sys + +import ase.io +import click +from aiida.common import NotExistent +from aiida.engine import run +from aiida.orm import Dict, SinglefileData, load_code +from aiida.plugins import DataFactory, WorkflowFactory + +Cp2kBaseWorkChain = WorkflowFactory("cp2k.base") +StructureData = DataFactory("core.structure") + + +def example_base(cp2k_code): + """Run simple DFT calculation through a workchain.""" + + thisdir = os.path.dirname(os.path.realpath(__file__)) + + print("Testing CP2K ENERGY on H2O (DFT) through a workchain...") + + # Basis set. + basis_file = SinglefileData( + file=os.path.join(thisdir, "..", "files", "BASIS_MOLOPT") + ) + + # Pseudopotentials. + pseudo_file = SinglefileData( + file=os.path.join(thisdir, "..", "files", "GTH_POTENTIALS") + ) + + # Structure. + structure = StructureData( + ase=ase.io.read(os.path.join(thisdir, "..", "files", "h2o.xyz")) + ) + + # Parameters. + parameters = Dict( + { + "GLOBAL": { + "RUN_TYPE": "MD", + "WALLTIME": "00:00:20", # too short + }, + "FORCE_EVAL": { + "METHOD": "Quickstep", + "STRESS_TENSOR": "analytical", + "DFT": { + "BASIS_SET_FILE_NAME": "BASIS_MOLOPT", + "POTENTIAL_FILE_NAME": "GTH_POTENTIALS", + "QS": { + "EPS_DEFAULT": 1.0e-12, + "WF_INTERPOLATION": "ps", + "EXTRAPOLATION_ORDER": 3, + }, + "MGRID": { + "NGRIDS": 4, + "CUTOFF": 280, + "REL_CUTOFF": 30, + }, + "XC": { + "XC_FUNCTIONAL": { + "_": "LDA", + }, + }, + "POISSON": { + "PERIODIC": "none", + "PSOLVER": "MT", + }, + "SCF": {"PRINT": {"RESTART": {"_": "ON"}}}, + }, + "SUBSYS": { + "KIND": [ + { + "_": "O", + "BASIS_SET": "DZVP-MOLOPT-SR-GTH", + "POTENTIAL": "GTH-LDA-q6", + }, + { + "_": "H", + "BASIS_SET": "DZVP-MOLOPT-SR-GTH", + "POTENTIAL": "GTH-LDA-q1", + }, + ], + }, + }, + "MOTION": { + "CONSTRAINT": {}, + "MD": { + "THERMOSTAT": {"CSVR": {}, "TYPE": "csvr"}, + "BAROSTAT": {}, + "MAX_STEPS": 8, + "STEPS": 10000, + "ENSEMBLE": "npt_f", + "TEMPERATURE": 300.0, + }, + "PRINT": { + "RESTART": {"EACH": {"MD": 1}}, + }, + }, + } + ) + + # Construct process builder. + builder = Cp2kBaseWorkChain.get_builder() + + # Switch on resubmit_unconverged_geometry disabled by default. + builder.handler_overrides = Dict( + {"restart_incomplete_calculation": {"enabled": True}} + ) + + # Input structure. + builder.cp2k.structure = structure + builder.cp2k.parameters = parameters + builder.cp2k.code = cp2k_code + builder.cp2k.file = { + "basis": basis_file, + "pseudo": pseudo_file, + } + builder.cp2k.metadata.options.resources = { + "num_machines": 1, + "num_mpiprocs_per_machine": 1, + } + builder.cp2k.metadata.options.max_wallclock_seconds = 1 * 3 * 60 + + print("Submitted calculation...") + calc = run(builder) + + if "EXT_RESTART" in calc["final_input_parameters"].dict: + print("OK, EXT_RESTART section is present in the final_input_parameters.") + else: + print( + "ERROR, EXT_RESTART section is NOT present in the final_input_parameters." 
+        )
+        sys.exit(3)
+
+
+@click.command("cli")
+@click.argument("codelabel")
+def cli(codelabel):
+    """Click interface."""
+    try:
+        code = load_code(codelabel)
+    except NotExistent:
+        print(f"The code '{codelabel}' does not exist")
+        sys.exit(1)
+    example_base(code)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/test/test_datatype_helpers.py b/test/test_datatype_helpers.py
new file mode 100644
index 0000000..989389a
--- /dev/null
+++ b/test/test_datatype_helpers.py
@@ -0,0 +1,60 @@
+import numpy as np
+import pytest
+from aiida import orm
+
+from aiida_cp2k.utils import (
+    merge_trajectory_data_non_unique,
+    merge_trajectory_data_unique,
+)
+
+
+@pytest.mark.parametrize(
+    "step_ranges",
+    (
+        [(1, 20), (21, 40)],
+        [(1, 20), (15, 30)],
+        [(1, 20), (21, 40), (41, 60)],
+        [(1, 25), (21, 30), (31, 60), (45, 80)],
+    ),
+)
+def test_merge_trajectory_data(step_ranges):
+    def get_trajectory(step1=1, step2=20):
+        nsteps = step2 - step1 + 1
+        positions = np.array(
+            [
+                [[2, 2, 2.73 + 0.05 * np.random.random()], [2, 2, 2]]
+                for i in range(nsteps)
+            ]
+        )
+        cells = np.array(
+            [
+                [[4, 0, 0], [0, 4, 0], [0, 0, 4.75 + 0.05 * np.random.random()]]
+                for i in range(nsteps)
+            ]
+        )
+        stepids = np.arange(step1, step2 + 1)
+        symbols = ["H", "H"]
+        trajectory = orm.TrajectoryData()
+        trajectory.set_trajectory(symbols, positions, cells=cells, stepids=stepids)
+        return trajectory
+
+    trajectories = [get_trajectory(*step_range) for step_range in step_ranges]
+
+    total_length = sum(
+        [step_range[1] - step_range[0] + 1 for step_range in step_ranges]
+    )
+
+    unique_elements = []
+    for step_range in step_ranges:
+        unique_elements.extend(range(step_range[0], step_range[1] + 1))
+    total_length_unique = len(set(unique_elements))
+
+    merged_trajectory = merge_trajectory_data_non_unique(*trajectories)
+    assert (
+        len(merged_trajectory.get_stepids()) == total_length
+    ), "The merged trajectory has the wrong length."
+
+    merged_trajectory_unique = merge_trajectory_data_unique(*trajectories)
+    assert (
+        len(merged_trajectory_unique.get_stepids()) == total_length_unique
+    ), "The merged trajectory with unique stepids has the wrong length."
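
Reviewer note: a minimal sketch of the new comment-line parsing in `_parse_trajectory`, exercised on a hypothetical CP2K MD `.xyz` comment line (the spacing and values below are illustrative, not taken from a real run):

```python
# Hypothetical CP2K MD comment line of the form "i = <step>, time = <t>, E = <energy>".
comment = " i =        5, time =         2.500, E =      -17.1463772724"

comment_split = comment.split(",")

# The step id is the last whitespace-separated token of the first field: " i =        5" -> 5.
stepid = int(comment_split[0].split()[-1])

# Locate the energy field by content instead of hard-coding its position.
energy_index = next((i for i, s in enumerate(comment_split) if "E =" in s), None)
energy = float(comment_split[energy_index].split()[-1])

print(stepid, energy)  # 5 -17.1463772724
```

Compared with the old `frame["comment"].split()[2][:-1]`, splitting on commas is robust to changing field widths, but it still assumes an "E =" field is present: if it is missing, `energy_index` stays `None` and the float conversion raises.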
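And a minimal usage sketch of the two new calcfunctions, assuming a configured AiiDA profile and an aiida-core version supporting variadic calcfunction arguments (which the work chain itself relies on); the toy trajectories mirror the unit test above, and all variable names are illustrative:

```python
import numpy as np
from aiida import load_profile, orm

from aiida_cp2k.utils import (
    merge_trajectory_data_non_unique,
    merge_trajectory_data_unique,
)

load_profile()


def make_trajectory(step1, step2):
    """Build a two-atom toy trajectory covering stepids step1..step2."""
    nsteps = step2 - step1 + 1
    positions = np.array([[[2.0, 2.0, 2.73], [2.0, 2.0, 2.0]]] * nsteps)
    cells = np.array([4.0 * np.eye(3)] * nsteps)
    trajectory = orm.TrajectoryData()
    trajectory.set_trajectory(
        symbols=["H", "H"],
        positions=positions,
        stepids=np.arange(step1, step2 + 1),
        cells=cells,
    )
    return trajectory


# Two overlapping segments, as a restarted MD run would produce.
first = make_trajectory(1, 20)
second = make_trajectory(15, 30)

# Keeps every frame, duplicates included: 20 + 16 = 36 frames.
merged = merge_trajectory_data_non_unique(first, second)
print(len(merged.get_stepids()))  # 36

# Deduplicates on stepid, which is what the work chain does for MD runs: steps 1..30.
merged_unique = merge_trajectory_data_unique(first, second)
print(len(merged_unique.get_stepids()))  # 30
```

Because both helpers are calcfunctions, the merged node stays linked to its input segments in the provenance graph, which is why `Cp2kBaseWorkChain.results` can expose it directly as `output_trajectory`.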