Source code for moxel.utils

# This file is part of MOXελ.
# Copyright (C) 2023-2024 Antonios P. Sarikas

# MOXελ is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

r"""
This module provides helper functions for creating voxels.

.. note::
    Currently, interactions are modelled with the Lennard-Jones (LJ) potential.

.. attention::
    Consider playing with the ``n_jobs`` parameter to get the best performance
    for your system::

        from timeit import timeit

        setup = 'from moxel.utils import voxels_from_file'
        n_jobs = [1, 2, 8, 16]  # Modify this according to your system.

        for n in n_jobs:
            stmt = f'voxels_from_file("path/to/cif", n_jobs={n})'
            time = timeit(stmt=stmt, setup=setup, number=1)
            print(f'Time with {n} jobs: {time:.3f} s')
"""

import os
import json
import itertools
from pathlib import Path
from multiprocessing import Pool
from itertools import repeat
import warnings
import numpy as np
from tqdm import tqdm
from pymatgen.core import Structure
from . _params import lj_params
warnings.filterwarnings('ignore')



[docs]
def load_json(fname):
    r"""
    Load a ``.json`` file.

    Parameters
    ----------
    fname : str
        Pathname to the ``.json`` file.

    Returns
    -------
    names : list
    """
    with open(fname, 'r') as fhand:
        data = json.load(fhand)

    return data




[docs]
def mic_scale_factors(r, lattice_vectors):
    r"""
    Return scale factors to satisfy minimum image convention [MIC]_.

    Parameters
    ----------
    r : float
        The cutoff radius used in MIC convetion.
    lattice_vectors : array of shape (3, 3)
        The lattice vectors of the unit cell.
        Each row corresponds to a lattice vector.

    Returns
    -------
    scale_factors : array of shape (3,)
        ``scale_factors[i]`` scales ``lattice_vectors[i]``.

    References
    ----------
    .. [MIC] W. Smith, "The Minimum Image Convention in Non-Cubic MD Cells", 1989.
    """
    a, b, c = lattice_vectors
    volume = np.linalg.norm(np.dot(a, np.cross(b, c)))

    w_a = volume/np.linalg.norm(np.cross(b, c))
    w_b = volume/np.linalg.norm(np.cross(a, c))
    w_c = volume/np.linalg.norm(np.cross(a, b))

    return np.ceil(2*r/np.array([w_a, w_b, w_c]))




[docs]
class Grid:
    r"""
    A 3D energy grid over a crystal structure.

    Parameters
    ----------
    grid_size : int, default=25
        Number of grid points along each dimension.
    cutoff : float, default=10
        Cutoff radius (Å) for the LJ potential.
    epsilon : float, default=50
        Epsilon value (ε/K) of the probe atom.
    sigma : float, default=2.5
        Sigma value (σ/Å) of the probe atom.

    Attributes
    ----------
    structure : :class:`pymatgen.core.structure.Structure`
        Available only after :meth:`Grid.load_structure` has been called.
    structure_name : str
        Available only after :meth:`Grid.load_structure` has been called.
    cubic_box : bool
        Available only after :meth:`Grid.calculate` has been called.
    voxels : array of shape (grid_size,)*3
       Available only after :meth:`Grid.calculate` has been called.
    """
    def __init__(self, grid_size=25, cutoff=10, epsilon=50, sigma=2.5):
        self.grid_size = grid_size
        self.cutoff = cutoff
        self.epsilon = epsilon
        self.sigma = sigma


[docs]
    def load_structure(self, pathname):
        r"""
        Load a crystal structure from a file in a format supported by
        :meth:`pymatgen.core.Structure.from_file`.

        Parameters
        ----------
        pathname : str
           Pathname to the file.
        """
        self.structure = Structure.from_file(pathname)
        self.structure_name = Path(pathname).stem



[docs]
    def calculate(self, cubic_box=False, length=30, potential='lj', n_jobs=None):
        r"""
        Iterate over the grid and return voxels.

        For computational efficiency and to assure (approximately) the same
        spatial resolution, the grid is overlayed over a supercell scaled
        according to MIC, see :func:`mic_scale_factors`.

        If lattice angles are significantly different than 90°, to avoid
        distortions set ``cubic_box`` to ``True``. In this case, the grid is
        overlayed over a cubic box of size ``length`` centered at the origin but
        periodicity is no longer guaranteed.

        Parameters
        ----------
        potential : str, default='lj'
            The potential used to calculate voxels. Currently, only the
            LJ potential is supported.
        cubic_box : bool, default=False
            If ``True``, the simulation box is cubic.
        length : float, default=30
            The size of the cubic box in Å. Takes effect only
            if ``cubic_box == True``.
        n_jobs : int, optional
            Number of jobs to run in parallel. If ``None``, then the number returned
            by ``os.cpu_count()`` is used.

        Returns
        -------
        voxels : array of shape (grid_size,)*3
            The energy voxels as :math:`e^{-\beta \mathcal{V}}`, to ensure
            numerical stability.
        Notes
        -----
        For structures that can not be processsed, their voxels are filled with
        zeros.
        """
        self.cubic_box = cubic_box

        if cubic_box:
            d = length / 2
            probe_coords = np.linspace(0-d, 0+d, self.grid_size)  # Cartesian.
            self._simulation_box = self.structure
        else:
            probe_coords = np.linspace(0, 1, self.grid_size)  # Fractional.
            scale = mic_scale_factors(self.cutoff, self.structure.lattice.matrix)
            self._simulation_box = self.structure * scale
        
        if potential == 'lj':
            # Cache LJ parameters for all atoms in the simulation box.
            self._lj_params = np.array([lj_params[atom.species_string] for atom in self._simulation_box])

            # Cache fractional coordinates since this is a slow function in pymatgen.
            self._frac_coords = self._simulation_box.frac_coords

            # Embarrassingly parallel.
            with Pool(processes=n_jobs) as p:
                energies = p.map(
                        self.lj_potential, itertools.product(*(probe_coords,)*3)
                        )

        self.voxels = np.array(energies, dtype=np.float32).reshape((self.grid_size,)*3)

        return self.voxels



[docs]
    def lj_potential(self, coords):
        r"""
        Calculate LJ potential at cartesian or fractional
        coordinates.

        Parameters
        ----------
        coordinates : array_like of shape (3,)
            If ``cubic_box == True`` cartesian. Else, fractional.

        Returns
        -------
        energy : float
            Energy as :math:`e^{-\beta \mathcal{V}}`, to ensure numerical stability.
        """
        if self.cubic_box:
            cartesian_coords = coords
        else:
            cartesian_coords = self._simulation_box.lattice.get_cartesian_coords(coords)

        _, r_ij, indices, _ = self._simulation_box._lattice.get_points_in_sphere(
                self._frac_coords, cartesian_coords,
                self.cutoff, zip_results=False,
                )

        '''
        Need to check for length of r_ij because of
        https://github.com/materialsproject/pymatgen/issues/3794
        '''
        if len(r_ij) == 0:  # No neighbor, zero energy.
            return 1.

        if np.any(r_ij < 1e-3):  # Close contact, infinite energy.
            return 0.

        es_j = self._lj_params[indices]
        x = (0.5 * (es_j[:, 1] + self.sigma)) / r_ij
        e = 4 * np.sqrt(es_j[:, 0] * self.epsilon)
        energy = sum(e * (x**12 - x**6))

        # This should be changed with clipping in future versions.
        return np.exp(-(1 / 298) * energy)  # For numerical stability.





[docs]
def voxels_from_file(
        cif_pathname, grid_size=25, cutoff=10,
        epsilon=50, sigma=2.5, cubic_box=False, length=30, 
        n_jobs=None, only_voxels=True,
        ):
    r"""
    Return voxels from ``.cif`` file.

    Parameters
    ----------
    cif_pathname : str
       Pathname to the ``.cif`` file.
    grid_size : int, default=25
        Number of grid points along each dimension.
    cutoff : float, default=10
        Cutoff radius (Å) for the LJ potential.
    epsilon : float, default=50
        Epsilon value (ε/K) of the probe atom.
    sigma : float, default=25
        Sigma value (σ/Å) of the probe atom.
    cubic_box : bool, default=False
        If ``True``, the simulation box is cubic.
    length : float, default=30
        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
    n_jobs : int, optional
        Number of jobs to run in parallel. If ``None``, then the number returned
        by ``os.cpu_count()`` is used.
    only_voxels : bool, default=True
        Determines ``out`` type.
        
    Returns
    -------
    out : ``array`` or :class:`Grid`
        If ``only_voxels == True``, array of shape ``(grid_size,)*3``.
        Otherwise, :class:`Grid`.

    Notes
    -----
    For structures that can not be processsed, their voxels are filled with zeros.
    """
    grid = Grid(grid_size, cutoff, epsilon, sigma)
    try:
        grid.load_structure(cif_pathname)
        grid.calculate(cubic_box=cubic_box, length=length, n_jobs=n_jobs)
    except:
        grid.voxels = np.full(shape=(grid_size,)*3, fill_value=0, dtype=np.float32)

    if only_voxels:
        return grid.voxels

    return grid




[docs]
def voxels_from_files(
        cif_pathnames, out_pathname, grid_size=25, cutoff=10,
        epsilon=50, sigma=2.5, cubic_box=False, length=30,
        n_jobs=None,
        ):
    r"""
    Calculate voxels from a list of ``.cif`` files and store them under
    ``out_pathname`` as :class:`numpy.array` of shape
    ``(n_samples, grid_size, grid_size, grid_size)``,
    where ``n_samples == len(cif_pathnames)``.

    After processing the following files are created::

        out_pathname
            ├──voxels.npy
            └──names.json

    The file ``names.json`` stores the names of the materials as a
    :class:`list`, which might be useful for later indexing.

    Parameters
    ----------
    cif_pathnames : list
       List of pathnames to the ``.cif`` files.
    out_pathname : str
        Pathname to the directory under which voxels are stored.
    grid_size : int, default=25
        Number of grid points along each dimension.
    cutoff : float, default=10
        Cutoff radius (Å) for the LJ potential.
    epsilon : float, default=50
        Epsilon value (ε/K) of the probe atom.
    sigma : float, default=25
        Sigma value (σ/Å) of the probe atom.
    cubic_box : bool, default=False
        If ``True``, the simulation box is cubic.
    length : float, default=30
        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
    n_jobs : int, optional
        Number of jobs to run in parallel. If ``None``, then the number returned
        by ``os.cpu_count()`` is used.

    Notes
    -----
    * Samples in output array follow the order in ``cif_pathnames``.
    * For structures that can not be processsed, their voxels are filled with zeros.
    """
    n_samples = len(cif_pathnames)
    names = [Path(i).stem for i in cif_pathnames]

    # Store the names.
    with open(f'{out_pathname}/names.json', mode='w') as fhand:
        json.dump(names, fhand, indent=4)

    fp = np.lib.format.open_memmap(
        f'{out_pathname}/voxels.npy', mode='w+',
        shape=(n_samples, *(grid_size,)*3),
        dtype=np.float32
        )

    grids = map(
            voxels_from_file, cif_pathnames,
            repeat(grid_size), repeat(cutoff),
            repeat(epsilon), repeat(sigma),
            repeat(cubic_box), repeat(length),
            repeat(n_jobs)
            )

    for i in tqdm(range(n_samples), desc='Creating voxels'):
        fp[i] = next(grids)

    fp.flush()




[docs]
def voxels_from_dir(
        cif_dirname, out_pathname, grid_size=25, cutoff=10,
        epsilon=50, sigma=2.5, cubic_box=False, length=30,
        n_jobs=None,
        ):
    r"""
    Calculate voxels from a directory of ``.cif`` files and save them under
    ``out_pathname`` as :class:`numpy.array` of shape
    ``(n_samples, grid_size, grid_size, grid_size)``,
    where ``n_samples == len(cif_pathnames)``.

    After processing the following files are created::

        out_pathname
            ├──voxels.npy
            └──names.json

    The file ``names.json`` stores the names of the materials as a
    :class:`list`, which might be useful for later indexing.

    Parameters
    ----------
    cif_dirname : str
       Pathname to the directory containing the ``.cif`` files.
    out_pathname : str
        Pathname to the directory under which voxels are stored.
    grid_size : int, default=25
        Number of grid points along each dimension.
    cutoff : float, default=10
        Cutoff radius (Å) for the LJ potential.
    epsilon : float, default=50
        Epsilon value (ε/K) of the probe atom.
    sigma : float, default=25
        Sigma value (σ/Å) of the probe atom.
    cubic_box : bool, default=False
        If ``True``, the simulation box is cubic.
    length : float, default=30
        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
    n_jobs : int, optional
        Number of jobs to run in parallel. If ``None``, then the number returned
        by ``os.cpu_count()`` is used.

    Notes
    -----
    * Samples in output array follow the order in ``sorted(os.listdir(cif_dirname))``.
    * For structures that can not be processsed, their voxels are filled with zeros.
    """
    files = [os.path.join(cif_dirname, f) for f in sorted(os.listdir(cif_dirname))]

    voxels_from_files(
            files, out_pathname,
            grid_size=grid_size, cutoff=cutoff,
            epsilon=epsilon, sigma=sigma,
            cubic_box=cubic_box, length=length,
            n_jobs=n_jobs,
            )




[docs]
def batch_clean(batch_dirname):
    """
    Clean a single batch.

    The batch must have the form::

        batch
        ├──voxels.npy
        └──names.json

    Cleaning is required since the voxels for some structures might be zero,
    see :func:`Grid.calculate`. After cleaning, the directory has the form::

        batch
        ├──voxels.npy
        ├──names.json
        ├──clean_voxels.npy
        └──clean_names.json

    Parameters
    ----------
    batch_dirname : str
        Pathname to the directory which requires cleaning.

    Returns
    -------
    exit_status : int
        If no voxels are missing ``0`` else ``1``.
    """
    # Case 1: no missing voxels.
    names = load_json(f'{batch_dirname}/names.json')
    voxels = np.load(f'{batch_dirname}/voxels.npy', mmap_mode='r')

    missing_idx = [i for i, x in enumerate(voxels) if np.all(x == 0)]

    if len(missing_idx) == 0:
        print('No missing voxels found!')
        return 0

    # Case 2: missing voxels.
    print('Missing voxels found! Cleaning...')

    clean_size = len(voxels) - len(missing_idx)

    # Create a new array to store the clean voxels.
    clean_fp = np.lib.format.open_memmap(
        f'{batch_dirname}/clean_voxels.npy',
        shape=(clean_size, *voxels.shape[1:]),  # Shape (N, grid, grid, grid).
        mode='w+', dtype='float32',
        )

    clean_idx = 0
    for idx, x in enumerate(voxels):
        if idx in missing_idx:
            pass
        else:
            clean_fp[clean_idx] = x
            clean_idx += 1

    clean_names = np.delete(names, missing_idx)

    with open(f'{batch_dirname}/clean_names.json', 'w') as fhand:
        json.dump(list(clean_names), fhand, indent=4)

    return 1