Source code for ilustrado.generation

""" This file implements the Generation class which
is used to store each generation of structures, and to
evaluate their fitness.
"""

import json

from matador.utils.chem_utils import get_formula_from_stoich
from matador.fingerprints.pdf import PDF


class Generation:
    """ Stores each generation of structures.

    Parameters:
        run_hash (str): hash for this GA run, used as the prefix of dump
            filenames,
        generation_idx (int): index of this generation,
        num_survivors (int): number of structures to aim for per generation,
        num_accepted (int): number to accept from this generation, i.e.
            excluding elites,

    Keyword Arguments:
        populace (list(dict)): initial structures to populate generation with
            (optional). The list is stored by reference, not copied.
        dumpfile (str): dumpfile name for this generation (optional). If
            given, the populace is loaded from it, replacing ``populace``.
        fitness_calculator: object exposing an ``evaluate(generation)``
            method, which ``rank`` calls with this generation (optional).

    """

    # Horizontal rule and templates shared by both tables printed by __str__.
    _RULE = 84 * "─" + "\n"
    _HEADER_FMT = "{:^10} {:^10} {:^25} {:^35}\n"
    _ROW_FMT = "{:^10} {: ^10.5f} {:^25.5f} {:^35}\n"

    def __init__(
        self,
        run_hash: str,
        generation_idx: int,
        num_survivors: int,
        num_accepted: int,
        populace=None,
        dumpfile=None,
        fitness_calculator=None,
    ):

        self.populace = populace if populace is not None else []
        self._num_survivors = num_survivors
        self._num_accepted = num_accepted
        self._fitness_calculator = fitness_calculator
        self.run_hash = run_hash
        self.generation_idx = generation_idx
        self.bourgeoisie = []
        # a dumpfile takes precedence over any populace passed in
        if dumpfile is not None:
            self.load(dumpfile)

    def __len__(self) -> int:
        return len(self.populace)

    def _format_table(self, members) -> str:
        """ Render a rule, the column header, a rule, then one row per
        structure in ``members``. No closing rule is added.

        Each structure must provide "stoichiometry", "fitness",
        "raw_fitness" and a non-empty "source" list.

        """
        lines = [
            self._RULE,
            self._HEADER_FMT.format(
                "Formula", "Fitness", "Hull distance (eV/atom)", "ID"
            ),
            self._RULE,
        ]
        for member in members:
            # the ID is the basename of the first source file, minus extension
            basename = member["source"][0].split("/")[-1]
            identifier = basename.replace(".res", "").replace(".castep", "")
            lines.append(
                self._ROW_FMT.format(
                    get_formula_from_stoich(member["stoichiometry"]),
                    member["fitness"],
                    member["raw_fitness"],
                    identifier,
                )
            )
        return "".join(lines)

    def __str__(self) -> str:
        """ Summarise the generation as two tables: populace, then bourgeoisie. """
        return "".join(
            [
                "\nCompleted generation {}:\n".format(self.generation_idx),
                "Number of members: {}\n".format(len(self.populace)),
                "Number of survivors: {}\n".format(len(self.bourgeoisie)),
                "Populace:\n",
                self._format_table(self.populace),
                self._RULE,
                "Bourgeoisie:\n",
                self._format_table(self.bourgeoisie),
                "\n",
            ]
        )

    def __getitem__(self, key):
        # key is coerced with int(), so slices are not supported
        return self.populace[int(key)]

    def __iter__(self):
        return iter(self.populace)

    def dump(self, gen_suffix):
        """ Dump the current generation to JSON file, named
        ``<run_hash>-gen<gen_suffix>.json``.

        Parameters:
            gen_suffix (str): text placed after the literal "gen" in the
                filename, e.g. the generation number.

        """
        fname = "{}-gen{}.json".format(self.run_hash, gen_suffix)
        with open(fname, "w", encoding="utf-8") as f:
            json.dump(self.populace, f, sort_keys=False, indent=2)

    def dump_bourgeoisie(self, gen_suffix):
        """ Dump the current generation's bourgeoisie to JSON file, named
        ``<run_hash>-gen<gen_suffix>-bourgeoisie.json``.

        Parameters:
            gen_suffix (str): text placed after the literal "gen" in the
                filename, e.g. the generation number.

        """
        fname = "{}-gen{}-bourgeoisie.json".format(self.run_hash, gen_suffix)
        with open(fname, "w", encoding="utf-8") as f:
            json.dump(self.bourgeoisie, f, sort_keys=False, indent=2)

    def load(self, gen_fname):
        """ Load populace of the generation from a JSON dump, replacing
        the current populace.

        Parameters:
            gen_fname (str) : filename to load.

        """
        with open(gen_fname, mode="r", encoding="utf-8") as f:
            self.populace = json.load(f)

    def load_bourgeoisie(self, bourge_fname):
        """ Load bourgeoisie of the generation from a JSON dump, replacing
        the current bourgeoisie.

        Parameters:
            bourge_fname (str) : filename to load.

        """
        with open(bourge_fname, mode="r", encoding="utf-8") as f:
            self.bourgeoisie = json.load(f)

    def birth(self, populum: dict):
        """ Add a structure to the populace.

        Parameters:
            populum (dict) : structure to add.

        """
        self.populace.append(populum)

    def rank(self):
        """ Evaluate the fitness of all structures in the generation by
        passing this generation to the fitness calculator's ``evaluate``.
        """
        self._fitness_calculator.evaluate(self)

    def clean(self) -> int:
        """ Remove structures with pathological formation enthalpies, i.e.
        keep only those with ``-3.5 < formation_enthalpy_per_atom < 1``.

        Returns:
            num_removed (int) : number of pathological structures removed.

        """
        init_len = len(self.populace)
        self.populace = [
            populum
            for populum in self.populace
            if -3.5 < populum["formation_enthalpy_per_atom"] < 1
        ]
        return init_len - len(self.populace)

    def set_bourgeoisie(self, elites=None, best_from_stoich=True):
        """ Set the structures that will continue to the next generation,
        i.e. the bourgeoisie.

        Keyword Arguments:
            elites list(dict)       : list of elite structures to
                include from the previous generation,
            best_from_stoich (bool) : whether to include one structure from
                each stoichiometry.

        """

        # first populate with best precomputed "num_accepted" structures,
        # where "num_accepted" takes into account the number of elites
        ranked = sorted(
            self.populace, key=lambda member: member["fitness"], reverse=True
        )
        self.bourgeoisie = ranked[: self._num_accepted]

        # find the fittest structure from each stoichiometry sampled; ties
        # keep the structure that appears first in the populace
        if best_from_stoich:
            best_from_stoichs = {}
            for struc in self.populace:
                stoich = get_formula_from_stoich(sorted(struc["stoichiometry"]))
                incumbent = best_from_stoichs.get(stoich)
                # seed from a real structure, never a placeholder, so
                # nothing but members of the populace can be promoted
                if incumbent is None or incumbent["fitness"] < struc["fitness"]:
                    best_from_stoichs[stoich] = struc

            # if it is not already included, prepend the best structure from
            # each stoichiometry; nothing is removed to make room, so the
            # bourgeoisie may end up larger than "num_accepted"
            for champion in best_from_stoichs.values():
                if champion not in self.bourgeoisie:
                    self.bourgeoisie.insert(0, champion)

        if elites is not None:
            self.bourgeoisie.extend(elites)

    def calc_pdfs(self):
        """ Compute PDFs for each structure in the generation, along with
        the matching list of sorted stoichiometries used by ``is_dupe``.
        """
        self._pdfs = []
        self._stoichs = []
        for structure in self.populace:
            self._pdfs.append(PDF(structure, projected=True))
            self._stoichs.append(sorted(structure["stoichiometry"]))

    def is_dupe(self, doc, sim_tol=5e-2, extra_pdfs=None):
        """ Compare doc with all other structures at same stoichiometry via PDF overlap.

        Parameters:
            doc (dict): structure to compare.

        Keyword Arguments:
            sim_tol (float): similarity tolerance to compare to
            extra_pdfs (list(dict)): list of structures with extra pdfs
                to compare against; each needs "pdf" and "stoichiometry" keys.

        Returns:
            bool: True if any PDF at the same stoichiometry lies within
                ``sim_tol`` of the PDF of ``doc``, otherwise False.

        """
        new_pdf = PDF(doc, projected=True)
        doc_stoich = sorted(doc["stoichiometry"])

        # self.pdfs is evaluated first, which builds self._stoichs if needed
        for pdf, stoich in zip(self.pdfs, self._stoichs):
            if stoich != doc_stoich:
                continue
            if new_pdf.get_sim_distance(pdf, projected=True) < sim_tol:
                return True

        if extra_pdfs is not None:
            for extra in extra_pdfs:
                pdf = extra["pdf"]
                if sorted(extra["stoichiometry"]) != doc_stoich:
                    continue
                if new_pdf.get_sim_distance(pdf, projected=pdf.projected) < sim_tol:
                    return True
        return False

    @property
    def pdfs(self):
        """ Returns list of PDFs for generation, calculating if necessary.

        The list is cached after the first calculation and is not refreshed
        when the populace later changes; call ``calc_pdfs`` to rebuild it.

        """
        try:
            return self._pdfs
        except AttributeError:
            self.calc_pdfs()
            return self._pdfs

    @property
    def fitnesses(self):
        """ Return list of normalised fitnesses for population."""
        return [populum["fitness"] for populum in self.populace]

    @property
    def raw_fitnesses(self):
        """ Return list of raw fitnesses for population. """
        return [populum["raw_fitness"] for populum in self.populace]

    @staticmethod
    def _mean_fitness(members):
        """ Return the mean "fitness" over ``members``, or 0 if it is empty. """
        population = len(members)
        # the division happens per member, so an empty list never divides by zero
        return sum(member["fitness"] / population for member in members)

    @property
    def average_pleb_fitness(self):
        """ Return the average normalised fitness of the generation. """
        return self._mean_fitness(self.populace)

    @property
    def average_bourgeois_fitness(self):
        """ Return the average normalised fitness of the bourgeoisie. """
        return self._mean_fitness(self.bourgeoisie)