Source code for autoplex.auto.rss.flows

"""RSS workflows."""

from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path

from jobflow import Flow, Maker, Response, job
from ruamel.yaml import YAML

from autoplex.auto.rss.jobs import do_rss_iterations, initial_rss


[docs] @dataclass class RssMaker(Maker): """ Maker to set up and run RSS for exploring and learning potential-energy surfaces (from scratch). Parameters ---------- name: str Name of the flow. path_to_default_config_parameters: Path | str | None Path to the default RSS configuration file 'rss_default_configuration.yaml'. If None, the default path will be used. """ name: str = "ml-driven rss" path_to_default_config_parameters: Path | str | None = None
[docs] @job def make(self, config_file: str | None = None, **kwargs): """ Make a rss workflow using the specified configuration file and additional keyword arguments. Parameters ---------- config_file: str | None Path to the configuration file that defines the setup parameters for the whole RSS workflow. If not provided, the default file 'rss_default_configuration.yaml' will be used. kwargs: dict, optional Additional optional keyword arguments to customize the job execution. Keyword Arguments ----------------- - tag: str Tag of systems. It can also be used for setting up elements and stoichiometry. For example, the tag of 'SiO2' will be recognized as a 1:2 ratio of Si to O and passed into the parameters of buildcell. However, note that this will be overwritten if the stoichiometric ratio of elements is defined in the 'buildcell_options'. - train_from_scratch: bool If True, it starts the workflow from scratch. If False, it resumes from a previous state. - resume_from_previous_state: dict | None A dictionary containing the state information required to resume a previously interrupted or saved RSS workflow. When 'train_from_scratch' is set to False, this parameter is mandatory for the workflow to pick up from a saved state. Expected keys within this dictionary: - test_error: float The test error from the last completed training step. - pre_database_dir: str Path to the directory containing the pre-existing database for resuming. - mlip_path: str Path to the file of a previous MLIP model. - isolated_atom_energies: dict A dictionary of isolated atom energy values, with atomic numbers as keys and their energies as valuables. - generated_struct_numbers: list[int] Expected number of generated randomized unit cells by buildcell. - buildcell_options: list[dict] | None Customized parameters for buildcell. Default is None. - fragment: Atoms | list[Atoms] | None Fragment(s) for random structures, e.g., molecules, to be placed indivudally intact. atoms.arrays should have a 'fragment_id' key with unique identifiers for each fragment if in same Atoms. atoms.cell must be defined (e.g., Atoms.cell = np.eye(3)*20). - fragment_numbers: list[str] | None Numbers of each fragment to be included in the random structures. Defaults to 1 for all specified. - num_processes_buildcell: int Number of processes to use for parallel computation during buildcell generation. - num_of_initial_selected_structs: list[int] | None Number of structures to be sampled directly from the buildcell-generated randomized cells. - num_of_rss_selected_structs: int Number of structures to be selected from each RSS iteration. - initial_selection_enabled: bool If true, sample structures from initially generated randomized cells using CUR. - rss_selection_method: str Method for selecting samples from the RSS trajectories: Boltzmann flat histogram in enthalpy first, then CUR. Options include: - 'bcur1s': Execute bcur with one shot (1s) - 'bcur2i': Execute bcur with two iterations (2i) - bcur_params: dict | None Parameters for Boltzmann CUR selection. The default dictionary includes: - soap_paras: dict SOAP descriptor parameters: - l_max: int Maximum degree of spherical harmonics (default 12). - n_max: int Maximum number of radial basis functions (default 12). - atom_sigma: float Width of Gaussian smearing (default 0.0875). - cutoff: float Radial cutoff distance (default 10.5). - cutoff_transition_width: float Width of the transition region (default 1.0). - zeta: float Exponent for dot-product SOAP kernel (default 4.0). - average: bool Whether to average the SOAP vectors (default True). - species: bool Whether to consider species information (default True). - kb_temp: float Temperature in eV for Boltzmann weighting (default 0.3). - frac_of_bcur: float Fraction of Boltzmann CUR selections (default 0.8). - bolt_max_num: int Maximum number of Boltzmann selections (default 3000). - kernel_exp: float Exponent for the kernel (default 4.0). - energy_label: str Label for the energy data (default 'energy'). - random_seed: int | None A seed to ensure reproducibility of CUR selection. Default is None. - include_isolated_atom: bool If true, perform single-point calculations for isolated atoms. - isolatedatom_box: list[float] List of the lattice constants for an isolated atom configuration. - e0_spin: bool If true, include spin polarization in isolated atom and dimer calculations. Default is False. - include_dimer: bool If true, perform single-point calculations for dimers only once. Default is False. - dimer_box: list[float] The lattice constants of a dimer box. - dimer_range: list[float] Range of distances for dimer calculations. - dimer_num: int Number of different distances to consider for dimer calculations. Default is 21. - custom_incar: dict | None Dictionary of custom VASP input parameters. If provided, will update the default parameters. Default is None. - custom_potcar: dict | None Dictionary of POTCAR settings to update. Keys are element symbols, values are the desired POTCAR labels. Default is None. - vasp_ref_file: str Reference file for VASP data. Default is 'vasp_ref.extxyz'. - config_types: list[str] Configuration types for the VASP calculations. Default is None. - rss_group: list[str] Group name for RSS to setting up regularization. - test_ratio: float The proportion of the test set after splitting the data. The value is allowed to be set to 0; in this case, the testing error would not be meaningful anymore. - regularization: bool If True, apply regularization. This only works for GAP to date. Default is False. - scheme: str Method to use for regularization. Options are: - 'linear_hull': for single-composition system, use 2D convex hull (E, V) - 'volume-stoichiometry': for multi-composition system, use 3D convex hull of (E, V, mole-fraction) - reg_minmax: list[tuple] list of tuples of (min, max) values for energy, force, virial sigmas for regularization. - distillation: bool If true, apply data distillation. Default is True. - force_max: float | None Maximum force value to exclude structures. Default is 50. - force_label: str | None The label of force values to use for distillation. Default is 'REF_forces'. - pre_database_dir: str | None Directory where the previous database was saved. - mlip_type: str Choose one specific MLIP type to be fitted: 'GAP' | 'J-ACE' | 'NEQUIP' | 'M3GNET' | 'MACE'. Default is 'GAP'. - ref_energy_name: str Reference energy name. Default is 'REF_energy'. - ref_force_name: str Reference force name. Default is 'REF_forces'. - ref_virial_name: str Reference virial name. Default is 'REF_virial'. - auto_delta: bool If true, apply automatic determination of delta for GAP terms. Default is False. - num_processes_fit: int Number of processes used for fitting. Default is 1. - device_for_fitting: str Device to be used for model fitting, either "cpu" or "cuda". - **fit_kwargs: Additional keyword arguments for the MLIP fitting process. - scalar_pressure_method: str Method for adding external pressures. Acceptable options are: - 'exp': Applies pressure using an exponential distribution. - 'uniform': Applies pressure using a uniform distribution. - scalar_exp_pressure: float Scalar exponential pressure. Default is 100. - scalar_pressure_exponential_width: float Width for scalar pressure exponential. Default is 0.2. - scalar_pressure_low: float Low limit for scalar pressure. Default is 0. - scalar_pressure_high: float High limit for scalar pressure. Default is 50. - max_steps: int Maximum number of steps for relaxation. Default is 200. - force_tol: float Force residual tolerance for relaxation. Default is 0.05. - stress_tol: float Stress residual tolerance for relaxation. Default is 0.05. - hookean_repul: bool If true, apply Hookean repulsion. Default is False. - hookean_paras: dict[tuple[int, int], tuple[float, float]] | None Parameters for Hookean repulsion as a dictionary of tuples. Default is None. - keep_symmetry: bool If true, preserve symmetry during relaxation. Default is False. - write_traj: bool If true, write trajectory of RSS. Default is True. - num_processes_rss: int Number of processes used for running RSS. Default is 1. - device_for_rss: str Specify device to use "cuda" or "cpu" for running RSS. Default is "cpu". - stop_criterion: float Convergence criterion for stopping RSS iterations. Default is 0.01. - max_iteration_number: int Maximum number of RSS iterations to perform. Default is 25. - num_groups: int Number of structure groups, used for assigning tasks across multiple nodes. For example, if there are 10,000 trajectories to relax and 'num_groups=10', the trajectories will be divided into 10 groups and 10 independent jobs will be created, with each job handling 1,000 trajectories. - initial_kb_temp: float Initial temperature (in eV) for Boltzmann sampling. Default is 0.3. - current_iter_index: int Index for the current RSS iteration. Default is 1. Output ------ dict A dictionary whose keys contains: - test_error: float The test error of the fitted MLIP. - pre_database_dir: str The directory of the latest RSS database. - mlip_path: str The path to the latest fitted MLIP. - isolated_atom_energies: dict The isolated energy values. - current_iter: int The current iteration index. - kb_temp: float The temperature (in eV) for Boltzmann sampling. """ rss_default_config_path = ( self.path_to_default_config_parameters or Path(__file__).absolute().parent / "rss_default_configuration.yaml" ) yaml = YAML(typ="safe", pure=True) with open(rss_default_config_path) as f: config = yaml.load(f) if config_file and os.path.exists(config_file): with open(config_file) as f: new_config = yaml.load(f) config.update(new_config) config.update(kwargs) self._process_hookean_paras(config) config_params = config.copy() if "train_from_scratch" not in config_params: raise ValueError( "'train_from_scratch' must be set in the configuration file or passed as a keyword argument!!" ) rss_flow = [] if config_params["train_from_scratch"]: initial_exclude_keys = [ "train_from_scratch", "resume_from_previous_state", "config_types", "rss_group", "num_of_rss_selected_structs", "rss_selection_method", "scalar_pressure_method", "scalar_exp_pressure", "scalar_pressure_exponential_width", "scalar_pressure_low", "scalar_pressure_high", "max_steps", "force_tol", "stress_tol", "stop_criterion", "max_iteration_number", "num_groups", "initial_kb_temp", "current_iter_index", "hookean_repul", "hookean_paras", "keep_symmetry", "write_traj", "num_processes_rss", "device_for_rss", ] initial_params = { k: v for k, v in config_params.items() if k not in initial_exclude_keys } initial_params.update( { "config_type": config_params["config_types"][0], "rss_group": config_params["rss_group"][0], } ) initial_rss_job = initial_rss(**initial_params) rss_flow.append(initial_rss_job) rss_group = config_params["rss_group"] config_types = config_params["config_types"] do_rss_group = rss_group[0] if len(rss_group) == 1 else rss_group[-1] rss_config_type = ( config_types[0] if len(config_types) == 1 else config_types[1:] ) rss_exclude_keys = [ "train_from_scratch", "resume_from_previous_state", "pre_database_dir", ] rss_params = { k: v for k, v in config_params.items() if k not in rss_exclude_keys } rss_params.update( { "num_of_initial_selected_structs": None, "initial_selection_enabled": False, "rss_group": do_rss_group, "config_types": rss_config_type, } ) if config_params["train_from_scratch"]: rss_params.update({"include_isolated_atom": False}) rss_params.update({"include_dimer": False}) do_rss_job = do_rss_iterations( input=initial_rss_job.output, **rss_params, ) else: if "resume_from_previous_state" not in config_params: raise ValueError( "The parameter 'resume_from_previous_state' must be specified when 'train_from_scratch' is False." ) resume_from_previous_state = config_params["resume_from_previous_state"] do_rss_job = do_rss_iterations( input=resume_from_previous_state, **rss_params, ) rss_flow.append(do_rss_job) return Response(replace=Flow(rss_flow), output=do_rss_job.output)
def _process_hookean_paras(self, config): if "hookean_paras" in config: config["hookean_paras"] = { tuple(map(int, k.strip("()").split(", "))): tuple(v) for k, v in config["hookean_paras"].items() }