# Source code for autoplex.auto.rss.flows

"""RSS (random structure searching) flow for exploring and learning potential energy surfaces from scratch."""

from dataclasses import dataclass, field

from jobflow import Flow, Maker, Response, job

from autoplex.auto.rss.jobs import do_rss_iterations, initial_rss
from autoplex.settings import RssConfig



# [docs]
@dataclass
class RssMaker(Maker):
    """
    Maker that sets up and runs RSS to explore and learn potential-energy surfaces from scratch.

    Parameters
    ----------
    name: str
        Label given to the flow.
    rss_config: RssConfig
        Pydantic model holding the setup parameters of the complete RSS workflow.
        When it is not supplied, a new 'autoplex.settings.RssConfig' instance
        carrying its own default values is created for this maker.
    """

    name: str = "ml-driven rss"
    # The class itself serves as the factory, so every maker gets its own RssConfig instance.
    rss_config: RssConfig = field(default_factory=RssConfig)


    # [docs]
    @job
    def make(self, **kwargs):
        """
        Make an RSS workflow using the stored configuration and additional keyword arguments.

        The stored 'rss_config' is deep-copied, updated with 'kwargs', and dumped to a
        plain dictionary. Depending on 'train_from_scratch', the resulting flow consists of
        an initial RSS job followed by the iterative RSS job, or of the iterative RSS job
        alone fed with 'resume_from_previous_state'.

        Parameters
        ----------
        kwargs: dict, optional
            Additional optional keyword arguments to customize the job execution.

        Keyword Arguments
        -----------------
        tag: str
            Tag of systems. It can also be used for setting up elements and stoichiometry.
            For example, the tag of 'SiO2' will be recognized as a 1:2 ratio of Si to O and
            passed into the parameters of buildcell. However, note that this will be overwritten
            if the stoichiometric ratio of elements is defined in the 'buildcell_options'.
        train_from_scratch: bool
            If True, it starts the workflow from scratch.
            If False, it resumes from a previous state.
        resume_from_previous_state: dict | None
            A dictionary containing the state information required to resume a previously interrupted
            or saved RSS workflow. When 'train_from_scratch' is set to False, this parameter is mandatory
            for the workflow to pick up from a saved state. Expected keys within this dictionary are as follows

            - 'test_error': float, The test error from the last completed training step.
            - 'pre_database_dir': str, Path to the directory containing the pre-existing database for resuming.
            - 'mlip_path': str, Path to the file of a previous MLIP model.
            - 'isolated_atom_energies': dict, A dictionary with isolated atom energy values mapped to atomic numbers.

        generated_struct_numbers: list[int]
            Expected number of generated randomized unit cells by buildcell.
        buildcell_options: list[dict] | None
            Customized parameters for buildcell. Default is None.
        fragment: Atoms | list[Atoms] | None
            Fragment(s) for random structures, e.g., molecules, to be placed individually intact.
            atoms.arrays should have a 'fragment_id' key with unique identifiers for each fragment if in same Atoms.
            atoms.cell must be defined (e.g., Atoms.cell = np.eye(3)*20).
        fragment_numbers: list[str] | None
            Numbers of each fragment to be included in the random structures. Defaults to 1 for all specified.
        num_processes_buildcell: int
            Number of processes to use for parallel computation during buildcell generation.
        num_of_initial_selected_structs: list[int] | None
            Number of structures to be sampled directly from the buildcell-generated randomized cells.
        num_of_rss_selected_structs: int
            Number of structures to be selected from each RSS iteration.
        initial_selection_enabled: bool
            If true, sample structures from initially generated randomized cells using CUR.
        rss_selection_method: str
            Method for selecting samples from the RSS trajectories:
            Boltzmann flat histogram in enthalpy first, then CUR.
            Options are as follows

            - 'bcur1s': Execute bcur with one shot (1s)
            - 'bcur2i': Execute bcur with two iterations (2i)

        bcur_params: dict | None
            Parameters for Boltzmann CUR selection. The default dictionary includes following keys

            soap_paras: dict
                SOAP descriptor parameters dict with following acceptable keys

            - 'l_max': int, Maximum degree of spherical harmonics (default 12).
            - 'n_max': int, Maximum number of radial basis functions (default 12).
            - 'atom_sigma': float, Width of Gaussian smearing (default 0.0875).
            - 'cutoff': float, Radial cutoff distance (default 10.5).
            - 'cutoff_transition_width': float, Width of the transition region (default 1.0).
            - 'zeta': float, Exponent for dot-product SOAP kernel (default 4.0).
            - 'average': bool, Whether to average the SOAP vectors (default True).
            - 'species': bool, Whether to consider species information (default True).

            kb_temp: float
                Temperature in eV for Boltzmann weighting (default 0.3).
            frac_of_bcur: float
                Fraction of Boltzmann CUR selections (default 0.8).
            bolt_max_num: int
                Maximum number of Boltzmann selections (default 3000).
            kernel_exp: float
                Exponent for the kernel (default 4.0).
            energy_label: str
                Label for the energy data (default 'energy').
        random_seed: int | None
            A seed to ensure reproducibility of CUR selection. Default is None.
        include_isolated_atom: bool
            If true, perform single-point calculations for isolated atoms.
        isolatedatom_box: list[float]
            List of the lattice constants for an isolated atom configuration.
        e0_spin: bool
            If true, include spin polarization in isolated atom and dimer calculations. Default is False.
        include_dimer: bool
            If true, perform single-point calculations for dimers only once. Default is False.
        dimer_box: list[float]
            The lattice constants of a dimer box.
        dimer_range: list[float]
            Range of distances for dimer calculations.
        dimer_num: int
            Number of different distances to consider for dimer calculations. Default is 21.
        custom_incar: dict | None
            Dictionary of custom VASP input parameters. If provided, will update the
            default parameters. Default is None.
        custom_potcar: dict | None
            Dictionary of POTCAR settings to update. Keys are element symbols, values are the desired POTCAR labels.
            Default is None.
        vasp_ref_file: str
            Reference file for VASP data. Default is 'vasp_ref.extxyz'.
        config_types: list[str]
            Configuration types for the VASP calculations. Default is None.
            The first entry is passed to the initial RSS step; the RSS iterations receive the
            remaining entries (or the single entry when only one is given).
            A bare string is treated as a one-element list.
        rss_group: list[str] | str
            Group name for RSS to setting up regularization.
            The first entry is passed to the initial RSS step and the last entry to the RSS iterations.
            A bare string is treated as a one-element list.
        test_ratio: float
            The proportion of the test set after splitting the data. The value is allowed to be set to 0;
            in this case, the testing error would not be meaningful anymore.
        regularization: bool
            If True, apply regularization. This only works for GAP to date. Default is False.
        retain_existing_sigma: bool
            Whether to keep the current sigma values for specific configuration types.
            If set to True, existing sigma values for specific configurations will remain unchanged.
        scheme: str
            Method to use for regularization. Options are

            - 'linear_hull': for single-composition system, use 2D convex hull (E, V)
            - 'volume-stoichiometry': for multi-composition system, use 3D convex hull of (E, V, mole-fraction)

        reg_minmax: list[tuple]
            List of tuples of (min, max) values for energy, force, virial sigmas for regularization.
        distillation: bool
            If true, apply data distillation. Default is True.
        force_max: float | None
            Maximum force value to exclude structures. Default is 50.
        force_label: str | None
            The label of force values to use for distillation. Default is 'REF_forces'.
        pre_database_dir: str | None
            Directory where the previous database was saved.
        mlip_type: str
            Choose one specific MLIP type to be fitted: 'GAP' | 'J-ACE' | 'NEQUIP' | 'M3GNET' | 'MACE'.
            Default is 'GAP'.
        ref_energy_name: str
            Reference energy name. Default is 'REF_energy'.
        ref_force_name: str
            Reference force name. Default is 'REF_forces'.
        ref_virial_name: str
            Reference virial name. Default is 'REF_virial'.
        auto_delta: bool
            If true, apply automatic determination of delta for GAP terms. Default is False.
        num_processes_fit: int
            Number of processes used for fitting. Default is 1.
        device_for_fitting: str
            Device to be used for model fitting, either "cpu" or "cuda".
        scalar_pressure_method: str
            Method for adding external pressures. Acceptable options are as follows

            - 'exp': Applies pressure using an exponential distribution.
            - 'uniform': Applies pressure using a uniform distribution.

        scalar_exp_pressure: float
            Scalar exponential pressure. Default is 100.
        scalar_pressure_exponential_width: float
            Width for scalar pressure exponential. Default is 0.2.
        scalar_pressure_low: float
            Low limit for scalar pressure. Default is 0.
        scalar_pressure_high: float
            High limit for scalar pressure. Default is 50.
        max_steps: int
            Maximum number of steps for relaxation. Default is 200.
        force_tol: float
            Force residual tolerance for relaxation. Default is 0.05.
        stress_tol: float
            Stress residual tolerance for relaxation. Default is 0.05.
        hookean_repul: bool
            If true, apply Hookean repulsion. Default is False.
        hookean_paras: dict[tuple[int, int], tuple[float, float]] | None
            Parameters for Hookean repulsion as a dictionary of tuples. Default is None.
        keep_symmetry: bool
            If true, preserve symmetry during relaxation. Default is False.
        write_traj: bool
            If true, write trajectory of RSS. Default is True.
        num_processes_rss: int
            Number of processes used for running RSS. Default is 1.
        device_for_rss: str
            Specify device to use "cuda" or "cpu" for running RSS. Default is "cpu".
        stop_criterion: float
            Convergence criterion for stopping RSS iterations. Default is 0.01.
        max_iteration_number: int
            Maximum number of RSS iterations to perform. Default is 25.
        num_groups: int
            Number of structure groups, used for assigning tasks across multiple nodes.
            For example, if there are 10,000 trajectories to relax and 'num_groups=10',
            the trajectories will be divided into 10 groups and 10 independent jobs will be created,
            with each job handling 1,000 trajectories.
        initial_kb_temp: float
            Initial temperature (in eV) for Boltzmann sampling. Default is 0.3.
        current_iter_index: int
            Index for the current RSS iteration. Default is 1.
        **fit_kwargs:
            Additional keyword arguments for the MLIP fitting process.

        Returns
        -------
        Response:
            A response that replaces this job with the assembled flow. Its output is the output
            of the RSS-iteration job, a dictionary with following information

            - 'test_error': float, The test error of the fitted MLIP.
            - 'pre_database_dir': str, The directory of the latest RSS database.
            - 'mlip_path': List of path to the latest fitted MLIP.
            - 'isolated_atom_energies': dict, The isolated energy values.
            - 'current_iter': int, The current iteration index.
            - 'kb_temp': float, The temperature (in eV) for Boltzmann sampling.

        Raises
        ------
        ValueError
            If 'train_from_scratch' is missing from the resolved configuration, or if it is False
            and 'resume_from_previous_state' is missing.
        """
        # Apply the call-time overrides to a deep copy, not to the maker's stored configuration.
        default_config = self.rss_config.model_copy(deep=True)
        if kwargs:
            default_config.update_parameters(kwargs)

        # exclude_none=True: options left as None are absent from the dictionary below.
        config_params = default_config.model_dump(by_alias=True, exclude_none=True)

        # Replace the per-type 'mlip_hypers' table by the entries of the selected MLIP type
        mlip_hypers = config_params["mlip_hypers"][config_params["mlip_type"]]
        del config_params["mlip_hypers"]
        config_params.update(mlip_hypers)

        self._process_hookean_paras(config_params)

        if "train_from_scratch" not in config_params:
            raise ValueError(
                "'train_from_scratch' must be set in the configuration file or passed as a keyword argument!!"
            )
        train_from_scratch = config_params["train_from_scratch"]

        # Both options are indexed below. A bare string would be indexed character by
        # character (e.g. "rss"[0] == "r"), so wrap it into a one-element list first.
        config_types = config_params["config_types"]
        if isinstance(config_types, str):
            config_types = [config_types]
        rss_group = config_params["rss_group"]
        if isinstance(rss_group, str):
            rss_group = [rss_group]

        rss_flow = []
        initial_rss_job = None

        if train_from_scratch:
            # Options that are not forwarded to the initial step.
            initial_exclude_keys = {
                "train_from_scratch",
                "resume_from_previous_state",
                "config_types",
                "rss_group",
                "num_of_rss_selected_structs",
                "rss_selection_method",
                "scalar_pressure_method",
                "scalar_exp_pressure",
                "scalar_pressure_exponential_width",
                "scalar_pressure_low",
                "scalar_pressure_high",
                "max_steps",
                "force_tol",
                "stress_tol",
                "stop_criterion",
                "max_iteration_number",
                "num_groups",
                "initial_kb_temp",
                "current_iter_index",
                "hookean_repul",
                "hookean_paras",
                "keep_symmetry",
                "write_traj",
                "num_processes_rss",
                "device_for_rss",
            }
            initial_params = {
                k: v for k, v in config_params.items() if k not in initial_exclude_keys
            }
            # The initial step receives the first configuration type and the first group.
            initial_params.update(
                {
                    "config_type": config_types[0],
                    "rss_group": rss_group[0],
                }
            )
            initial_rss_job = initial_rss(**initial_params)
            rss_flow.append(initial_rss_job)

        # The iterations receive the last group (identical to the first one for a single entry)
        # and every configuration type after the first (or the only one given).
        do_rss_group = rss_group[-1]
        rss_config_type = (
            config_types[0] if len(config_types) == 1 else config_types[1:]
        )

        rss_exclude_keys = {
            "train_from_scratch",
            "resume_from_previous_state",
            "pre_database_dir",
        }
        rss_params = {
            k: v for k, v in config_params.items() if k not in rss_exclude_keys
        }
        rss_params.update(
            {
                "num_of_initial_selected_structs": None,
                "initial_selection_enabled": False,
                "rss_group": do_rss_group,
                "config_types": rss_config_type,
            }
        )

        if train_from_scratch:
            # These flags were already forwarded to the initial step above, so they are
            # switched off for the iterations (presumably to avoid repeating those calculations).
            rss_params.update(
                {"include_isolated_atom": False, "include_dimer": False}
            )
            rss_input = initial_rss_job.output
        else:
            if "resume_from_previous_state" not in config_params:
                raise ValueError(
                    "The parameter 'resume_from_previous_state' must be specified when 'train_from_scratch' is False."
                )
            rss_input = config_params["resume_from_previous_state"]

        do_rss_job = do_rss_iterations(input=rss_input, **rss_params)
        rss_flow.append(do_rss_job)

        return Response(replace=Flow(rss_flow), output=do_rss_job.output)


    @staticmethod
    def _process_hookean_paras(config):
        if "hookean_paras" in config and config["hookean_paras"] is not None:
            config["hookean_paras"] = {
                tuple(map(int, k.strip("()").split(", "))): tuple(v)
                for k, v in config["hookean_paras"].items()
            }