From dc3f26381a7b2eab53d8b4cf46ed252ae6595036 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Sun, 30 May 2021 18:10:19 -0700
Subject: [PATCH 01/41] Very first committ of emmet supporting cp2k for defect
 calculations. Subject to much change.

---
 emmet-builders/emmet/builders/cp2k/defects.py | 645 ++++++++++++++++++
 .../emmet/builders/cp2k/materials.py          | 294 ++++++++
 emmet-builders/emmet/builders/cp2k/utils.py   |  91 +++
 emmet-builders/setup.py                       |   2 +-
 .../emmet/core/cp2k/calc_types/__init__.py    |   9 +
 .../emmet/core/cp2k/calc_types/enums.py       |  64 ++
 .../emmet/core/cp2k/calc_types/generate.py    |  90 +++
 .../emmet/core/cp2k/calc_types/run_types.yaml |  17 +
 .../emmet/core/cp2k/calc_types/utils.py       | 127 ++++
 emmet-core/emmet/core/cp2k/material.py        | 194 ++++++
 emmet-core/emmet/core/cp2k/task.py            | 210 ++++++
 emmet-core/emmet/core/cp2k/validation.py      | 121 ++++
 emmet-core/emmet/core/defect.py               | 387 +++++++++++
 emmet-core/emmet/core/settings.py             |  17 +
 14 files changed, 2267 insertions(+), 1 deletion(-)
 create mode 100644 emmet-builders/emmet/builders/cp2k/defects.py
 create mode 100644 emmet-builders/emmet/builders/cp2k/materials.py
 create mode 100644 emmet-builders/emmet/builders/cp2k/utils.py
 create mode 100644 emmet-core/emmet/core/cp2k/calc_types/__init__.py
 create mode 100644 emmet-core/emmet/core/cp2k/calc_types/enums.py
 create mode 100644 emmet-core/emmet/core/cp2k/calc_types/generate.py
 create mode 100644 emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
 create mode 100644 emmet-core/emmet/core/cp2k/calc_types/utils.py
 create mode 100644 emmet-core/emmet/core/cp2k/material.py
 create mode 100644 emmet-core/emmet/core/cp2k/task.py
 create mode 100644 emmet-core/emmet/core/cp2k/validation.py
 create mode 100644 emmet-core/emmet/core/defect.py

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
new file mode 100644
index 0000000000..7a3aff9423
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -0,0 +1,645 @@
+from datetime import datetime
+from itertools import chain, groupby, combinations
+from operator import itemgetter
+from typing import Dict, Iterator, List, Optional
+import numpy as np
+
+from maggma.builders import Builder
+from maggma.stores import Store
+from pymatgen.core import Structure
+from pymatgen.analysis.structure_analyzer import oxide_type
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator
+from atomate.utils.utils import load_class
+
+from pymatgen.analysis.defects.core import (
+    Defect, Vacancy, Substitution, Polaron, Interstitial, GhostVacancy, DefectEntry
+)
+from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
+from monty.json import MontyDecoder
+
+from emmet.builders.utils import maximal_spanning_non_intersecting_subsets
+from emmet.builders.cp2k.utils import matcher, get_dielectric, get_mpid
+from emmet.core import SETTINGS
+from emmet.core.utils import jsanitize, get_sg
+from emmet.core.cp2k.calc_types import TaskType
+from emmet.core.cp2k.material import MaterialsDoc
+from emmet.core.cp2k.task import TaskDocument
+from emmet.stubs import ComputedEntry
+from emmet.core.cp2k.calc_types.utils import run_type
+from emmet.core.defect import DefectDoc, DefectThermoDoc
+
+from pymatgen.ext.matproj import MPRester
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+
+__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
+__maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"
+
+
+class DefectBuilder(Builder):
+    """
+    The Materials Builder matches VASP task documents by structure similarity into materials
+    document. The purpose of this builder is group calculations and determine the best structure.
+    All other properties are derived from other builders.
+
+    The process is as follows:
+
+        1.) Find all documents with the same formula
+        2.) Select only task documents for the task_types we can select properties from
+        3.) Aggregate task documents based on structure similarity
+        4.) Convert task docs to property docs with metadata for selection and aggregation
+        5.) Select the best property doc for each property
+        6.) Build material document from best property docs
+        7.) Post-process material document
+        8.) Validate material document
+
+    """
+
+    def __init__(
+        self,
+        tasks: Store,
+        defects: Store,
+        task_validation: Optional[Store] = None,
+        query: Optional[Dict] = None,
+        allowed_task_types: Optional[List[str]] = None,
+        symprec: float = SETTINGS.SYMPREC,
+        ltol: float = SETTINGS.LTOL,
+        stol: float = SETTINGS.STOL,
+        angle_tol: float = SETTINGS.ANGLE_TOL,
+        **kwargs,
+    ):
+        """
+        Args:
+            tasks: Store of task documents
+            defects: Store of defect documents to generate
+            query: dictionary to limit tasks to be analyzed
+            allowed_task_types: list of task_types that can be processed
+            symprec: tolerance for SPGLib spacegroup finding
+            ltol: StructureMatcher tuning parameter for matching tasks to materials
+            stol: StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol: StructureMatcher tuning parameter for matching tasks to materials
+        """
+
+        self.tasks = tasks
+        self.tasks.key = 'task_id'
+        self.defects = defects
+        self.defects.key = 'task_ids'
+        self.task_validation = task_validation
+        self.allowed_task_types = (
+            [t.value for t in TaskType]
+            if allowed_task_types is None
+            else allowed_task_types
+        )
+
+        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
+
+        self.query = query if query else {}
+        self.symprec = symprec
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.kwargs = kwargs
+
+        sources = [tasks]
+        if self.task_validation:
+            sources.append(self.task_validation)
+        super().__init__(sources=sources, targets=[defects], **kwargs)
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        # Basic search index for tasks
+        self.tasks.ensure_index("task_id")
+        self.tasks.ensure_index("last_updated")
+        self.tasks.ensure_index("state")
+        self.tasks.ensure_index("formula_pretty")
+
+        # Search index for materials
+        self.defects.ensure_index("material_id")
+        self.defects.ensure_index("last_updated")
+        self.defects.ensure_index("sandboxes")
+        self.defects.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
+    @property
+    def defect_query(self) -> str:
+        """
+        The standard query for defect tasks. Update this if
+        schema changes in the future.
+
+        For example, if top level key exists 'defect' can be returned.
+        Alternatively, if an initial defect transformation was performed, then
+        you can check via 'transformations.history.0.defect'
+        """
+        return 'defect'
+
+    @property
+    def required_defect_properties(self) -> List:
+        return [
+            self.defect_query,
+            'output.energy',
+            'output.v_hartree_grid',
+            'output.v_hartree',
+            'output.structure',
+            'input',
+            'transformations',
+            'task_id',
+            'nsites'
+        ]
+
+    @property
+    def optional_defect_properties(self) -> List:
+        return [
+            'last_updated',
+            'created_on',
+            'tags'
+        ]
+
+    @property
+    def required_bulk_properties(self) -> List:
+        return [
+            'output.energy',
+            'output.v_hartree_grid',
+            'output.v_hartree',
+            'output.structure',
+            'output.vbm',
+            'output.cbm',
+            'input',
+            'transformations',
+        ]
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets all items to process into defect documents.
+        This does no datetime checking; relying on on whether
+        task_ids are included in the Defect Collection.
+
+        The procedure is as follows:
+
+            1. Get all tasks with standard "defect" query tag
+            2. Filter all tasks by skipping tasks which are already in the Defect Store
+            3. Get all tasks that could be used as bulk
+
+
+        Returns:
+            generator or list relevant tasks and materials to process into materials documents
+        """
+
+        self.logger.info("Defect builder started")
+        self.logger.info(
+            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"
+        )
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark buildtime for material documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all tasks
+        self.logger.info("Finding tasks to process")
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+
+        all_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        temp_query.update({self.defect_query: {'$exists': True}})
+        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})
+        defect_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        processed_defect_tasks = {
+            t_id
+            for d in self.defects.query({}, ["task_ids"])
+            for t_id in d.get("task_ids", [])
+        }
+
+        bulk_tasks = set(filter(self.preprocess_bulk, all_tasks - defect_tasks))
+        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
+
+        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")
+        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")
+
+        # Set total for builder bars to have a total
+        self.total = len(unprocessed_defect_tasks)
+
+        if self.task_validation:
+            invalid_ids = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {"is_valid": False}, [self.task_validation.key]
+                )
+            }
+        else:
+            invalid_ids = set()
+
+        for t in bulk_tasks.union(unprocessed_defect_tasks):
+            for doc in self.tasks.query({self.tasks.key: t}):
+                if t in invalid_ids:
+                    doc["is_valid"] = False
+                else:
+                    doc["is_valid"] = True
+
+        # yield list of defects that are of the same type, matched to an appropriate bulk calc
+        self.logger.debug(f"Processing ")
+
+        grouped_pairs = [
+            [
+                (
+                    next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),
+                    next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None))
+                )
+                for defect_tasks_id, bulk_tasks_id in self.match_defects_to_bulks(bulk_tasks, defect_task_group)
+            ] for defect_task_group in self.filter_and_group_tasks(defect_tasks)
+        ]
+
+        yield grouped_pairs
+
+    def preprocess_bulk(self, task):
+        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure']))
+        struc = Structure.from_dict(t.get('output').get('structure'))
+        mpid = get_mpid(struc)
+        if not mpid:
+            self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")
+            return False
+        diel = get_dielectric(mpid)
+        if not diel:
+            self.logger.debug(f"NO DIEL FOUND FOR {task} - {struc.composition}")
+            return False
+        return True
+
+    def filter_and_group_tasks(self, tasks):
+        """
+        Groups defect tasks. Tasks are grouped according to the reduced representation
+        of the defect, and so tasks with different settings (e.g. supercell size, functional)
+        will be grouped together.
+
+        Args:
+            defect_ids: task_ids for unprocessed defects
+
+        returns:
+            generator for groups of task_ids that correspond to the same defect
+        """
+
+        props = [
+            self.defect_query,
+            'task_id'
+        ]
+
+        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")
+
+        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)
+        defects = [
+            {'task_id': t['task_id'], 'defect': self.get_defect_from_task(t)}
+            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)
+        ]
+
+        def key(x):
+            s = x.get('defect').bulk_structure
+            return get_sg(s), s.composition.reduced_composition
+
+        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))
+        all_groups = []
+
+        # For each pre-grouped list of structures, perform actual matching.
+        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):
+            unmatched = list(g)
+            while len(unmatched) > 0:
+                i, refs = unmatched.pop(0)
+                matches = [i]
+
+                inds = filter(
+                    lambda i: pdc.are_equal(refs['defect'], unmatched[i][1]['defect']),
+                    list(range(len(unmatched))),
+                )
+
+                inds = list(inds)
+                matches.extend([unmatched[i][0] for i in inds])
+                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]
+                all_groups.append([defects[i]['task_id'] for i in matches])
+
+        self.logger.debug(f"All groups {all_groups}")
+        return all_groups
+
+    def match_defects_to_bulks(self, bulk_ids, defect_ids):
+        """
+        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has
+        commensurate:
+
+            - Composition
+            - Number of sites
+            - Symmetry
+
+        """
+
+        self.logger.debug(f"Finding bulk/defect task combinations.")
+
+        props = [
+            'task_id',
+            'input',
+            'output.structure'
+            'transformations',
+            'defect',
+            'scale',
+        ]
+
+        bulks = list(self.tasks.query(criteria={'task_id': {'$in': list(bulk_ids)}}, properties=props))
+        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))
+
+        sm = StructureMatcher(
+            ltol=SETTINGS.LTOL,
+            stol=SETTINGS.STOL,
+            angle_tol=SETTINGS.ANGLE_TOL,
+            primitive_cell=False,
+            scale=True,
+            attempt_supercell=False,
+            allow_subset=False,
+            comparator=ElementComparator(),
+        )
+
+        def _compare(b, d):
+            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \
+                run_type(d.get('input').get('dft')).value.split('+U')[0] and \
+                    sm.fit(DefectBuilder.get_pristine_supercell(d), DefectBuilder.get_pristine_supercell(b)):
+                return True
+            return False
+
+        pairs = [(defect['task_id'], bulk['task_id']) for bulk in bulks for defect in defects if _compare(bulk, defect)]
+        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
+        return pairs
+
+    @staticmethod
+    def get_pristine_supercell(x):
+        if x.get('defect'):
+            s = load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).bulk_structure
+            s.make_supercell(x.get('scale'))
+        elif x.get('transformations'):
+            s = Structure.from_dict(x['transformations']['history'][0]['input_structure']),
+        else:
+            s = Structure.from_dict(x['input']['structure'])
+
+        return s
+
+    def process_item(self, tasks):
+        """
+        """
+        self.logger.debug(f"Processing tasks")
+        for group in tasks:
+            self.logger.info(f"Processing group of size {len(group)}")
+
+        defect_docs = [DefectDoc.from_tasks(tasks=defect_group, query=self.defect_query) for defect_group in tasks if defect_group]
+
+        self.logger.debug(f"Produced {len(defect_docs)} ")
+        return [d.dict() for d in defect_docs]
+
+    def get_defect_entry_from_tasks(self, defect_task, bulk_task):
+        parameters = self.get_parameters_from_tasks(defect_task, bulk_task)
+
+        defect_entry = DefectEntry(
+            self.get_defect_from_task(defect_task),
+            uncorrected_energy=parameters['defect_energy'] - parameters['bulk_energy'],
+            parameters=parameters,
+            entry_id=parameters['task_id']
+        )
+        DefectCompatibility().process_entry(defect_entry, perform_corrections=True)
+        defect_entry_as_dict = defect_entry.as_dict()
+        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
+
+        self.logger.info(f"Produced defect entry for {parameters['material_id']}")
+
+        return defect_entry_as_dict
+
+    def get_parameters_from_tasks(self, defect_task, bulk_task):
+
+        self.logger.debug("Getting parameters from defect and bulk task")
+
+        def get_init(x):
+            if x.get('transformations'):
+                return Structure.from_dict(x['transformations']['history'][0]['input_structure'])
+            return Structure.from_dict(x['input']['structure'])
+
+        init_defect_structure = get_init(defect_task)
+        init_bulk_structure = get_init(bulk_task)  # use init to avoid site_properties in matching
+
+        final_defect_structure = Structure.from_dict(defect_task['output']['structure'])
+        final_bulk_structure = Structure.from_dict(bulk_task['output']['structure'])
+
+        axis_grid = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_grid']]
+        bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree']]
+        defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree']]
+
+        dfi, site_matching_indices = matcher(init_bulk_structure, init_defect_structure)
+
+        mpid = get_mpid(init_bulk_structure)
+
+        dielectric = get_dielectric(mpid)
+
+        parameters = {
+            'defect_energy': defect_task['output']['energy'],
+            'bulk_energy': bulk_task['output']['energy'],
+            'axis_grid': axis_grid,
+            'defect_frac_sc_coords': final_defect_structure[dfi].frac_coords,
+            'defect_planar_averages': defect_planar_averages,
+            'bulk_planar_averages': bulk_planar_averages,
+            'site_matching_indices': site_matching_indices,
+            'initial_defect_structure': init_defect_structure,
+            'final_defect_structure': final_defect_structure,
+            'vbm': bulk_task['output']['vbm'],
+            'cbm': bulk_task['output']['cbm'],
+            'dielectric': dielectric,
+            'material_id': mpid,
+            'entry_id': defect_task.get('task_id')
+        }
+
+        # cannot be easily queried for, so check here.
+        if 'v_hartree' in final_defect_structure.site_properties:
+            parameters['bulk_atomic_site_averages'] = final_bulk_structure.site_properties['v_hartree']
+        if 'v_hartree' in final_defect_structure.site_properties:
+            parameters['defect_atomic_site_averages'] = final_defect_structure.site_properties['v_hartree']
+
+        return parameters
+
+    def get_defect_from_task(self, task):
+        defect = unpack(self.defect_query.split('.'), task)
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
+    def update_targets(self, items):
+        """
+        Inserts the new task_types into the task_types collection
+        """
+
+        items = [item for item in chain.from_iterable(items) if item]
+
+        for item in items:
+            item.update({"_bt": self.timestamp})
+
+        task_ids = list(chain.from_iterable([item['task_ids'] for item in items]))
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} materials")
+            self.defects.remove_docs({self.defects.key: {"$in": task_ids}})
+            self.defects.update(
+                docs=jsanitize(items, allow_bson=True),
+                key='task_ids',
+            )
+        else:
+            self.logger.info("No items to update")
+
+
+class DefectThermoBuilder(Builder):
+
+    def __init__(
+            self,
+            defects: Store,
+            defect_thermos: Store,
+            materials: Store,
+            query: Optional[Dict] = None,
+            symprec: float = SETTINGS.SYMPREC,
+            ltol: float = SETTINGS.LTOL,
+            stol: float = SETTINGS.STOL,
+            angle_tol: float = SETTINGS.ANGLE_TOL,
+            **kwargs,
+    ):
+        """
+        Args:
+            defects: Store of defect documents (generated by DefectBuilder)
+            defect_thermos: Store of DefectThermoDocs to generate.
+            materials: Store of MaterialDocs to construct phase diagram
+            query: dictionary to limit tasks to be analyzed
+            allowed_task_types: list of task_types that can be processed
+            symprec: tolerance for SPGLib spacegroup finding
+            ltol: StructureMatcher tuning parameter for matching tasks to materials
+            stol: StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol: StructureMatcher tuning parameter for matching tasks to materials
+        """
+
+        self.defects = defects
+        self.defect_thermos = defect_thermos
+        self.materials = materials
+
+        self.query = query if query else {}
+        self.symprec = symprec
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.kwargs = kwargs
+
+        super().__init__(sources=[defects], targets=[defect_thermos], **kwargs)
+
+    def connect(self):
+        """
+        Connect to the builder sources and targets.
+        """
+        for s in [self.defects, self.defect_thermos, self.materials]:
+            s.connect()
+
+    @property
+    def defect_doc_query(self):
+        return 'material_id'
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the collections
+        """
+
+        # Basic search index for tasks
+        self.defects.ensure_index("material_id")
+
+        # Search index for materials
+        self.defects.ensure_index("material_id")
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets items to process into DefectThermoDocs.
+
+        returns:
+            iterator yielding tuples containing:
+                - group of DefectDocs belonging to the same bulk material as indexed by material_id,
+                - materials in the chemsys of the bulk material for constructing phase diagram
+
+        """
+
+        self.logger.info("Defect thermo builder started")
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark build time for defect thermo documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all tasks
+        self.logger.info("Finding defect docs to process")
+
+        all_docs = [doc for doc in self.defects.query()]
+        for key, group in groupby(sorted(all_docs, key=lambda x: x['material_id']), key=lambda x: x['material_id']):
+            group = [g for g in group]
+            yield (group, self.get_materials(group))
+
+    def get_materials(self, group) -> List:
+        """
+        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the
+        materials store.
+        """
+        bulk = self.materials.query(criteria={'material_id': group[0]['material_id']}, properties=None)
+        elements = group[0]['chemsys']
+
+        if isinstance(elements, str):
+            elements = elements.split("-")
+
+        all_chemsyses = []
+        for i in range(len(elements)):
+            for els in combinations(elements, i + 1):
+                all_chemsyses.append("-".join(sorted(els)))
+
+        return list(chain(self.materials.query(criteria={"chemsys": {"$in": all_chemsyses}}, properties=None), bulk))
+
+    def process_item(self, docs):
+        """
+        Process a group of defects belonging to the same material into a defect thermo doc
+        :param item:
+        :return:
+        """
+        self.logger.info(f"Processing defects")
+        defects, materials = docs
+        defects = [DefectDoc(**d) for d in defects]
+        materials = [MaterialsDoc(**m) for m in materials]
+        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials)
+        return defect_thermo_doc.dict()
+
+    def update_targets(self, items):
+        """
+        Inserts the new task_types into the task_types collection
+        """
+
+        #item.update({"_bt": self.timestamp})
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defect thermo docs")
+            #self.defects.remove_docs({self.defects.key: {"$in": task_ids}})
+            self.defect_thermos.update(
+                docs=jsanitize(items, allow_bson=True),
+                key='material_id',
+            )
+        else:
+            self.logger.info("No items to update")
+
+
+def unpack(query, d):
+    if not query:
+        return d
+    if isinstance(d, List):
+        return unpack(query[1:], d.__getitem__(int(query.pop(0))))
+    return unpack(query[1:], d.__getitem__(query.pop(0)))
+
+
+
+
diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
new file mode 100644
index 0000000000..48d21bbb75
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -0,0 +1,294 @@
+from datetime import datetime
+from itertools import chain
+from operator import itemgetter
+from typing import Dict, Iterator, List, Optional
+
+from maggma.builders import Builder
+from maggma.stores import Store
+from pymatgen.core import Structure
+from pymatgen.analysis.structure_analyzer import oxide_type
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
+
+from emmet.builders.utils import maximal_spanning_non_intersecting_subsets
+from emmet.core import SETTINGS
+from emmet.core.utils import group_structures, jsanitize
+from emmet.core.cp2k.calc_types import TaskType
+from emmet.core.cp2k.material import MaterialsDoc
+from emmet.core.cp2k.task import TaskDocument
+from emmet.stubs import ComputedEntry
+
+__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
+__maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"
+
+
+class MaterialsBuilder(Builder):
+    """
+    The Materials Builder matches VASP task documents by structure similarity into materials
+    document. The purpose of this builder is group calculations and determine the best structure.
+    All other properties are derived from other builders.
+
+    The process is as follows:
+
+        1.) Find all documents with the same formula
+        2.) Select only task documents for the task_types we can select properties from
+        3.) Aggregate task documents based on structure similarity
+        4.) Convert task docs to property docs with metadata for selection and aggregation
+        5.) Select the best property doc for each property
+        6.) Build material document from best property docs
+        7.) Post-process material document
+        8.) Validate material document
+
+    """
+
+    def __init__(
+        self,
+        tasks: Store,
+        materials: Store,
+        task_validation: Optional[Store] = None,
+        query: Optional[Dict] = None,
+        allowed_task_types: Optional[List[str]] = None,
+        symprec: float = SETTINGS.SYMPREC,
+        ltol: float = SETTINGS.LTOL,
+        stol: float = SETTINGS.STOL,
+        angle_tol: float = SETTINGS.ANGLE_TOL,
+        **kwargs,
+    ):
+        """
+        Args:
+            tasks: Store of task documents
+            materials: Store of materials documents to generate
+            query: dictionary to limit tasks to be analyzed
+            allowed_task_types: list of task_types that can be processed
+            symprec: tolerance for SPGLib spacegroup finding
+            ltol: StructureMatcher tuning parameter for matching tasks to materials
+            stol: StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol: StructureMatcher tuning parameter for matching tasks to materials
+        """
+
+        self.tasks = tasks
+        self.materials = materials
+        self.task_validation = task_validation
+        self.allowed_task_types = (
+            [t.value for t in TaskType]
+            if allowed_task_types is None
+            else allowed_task_types
+        )
+
+        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
+
+        self.query = query if query else {}
+        self.symprec = symprec
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.kwargs = kwargs
+
+        sources = [tasks]
+        if self.task_validation:
+            sources.append(self.task_validation)
+        super().__init__(sources=sources, targets=[materials], **kwargs)
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        # Basic search index for tasks
+        self.tasks.ensure_index("task_id")
+        self.tasks.ensure_index("last_updated")
+        self.tasks.ensure_index("state")
+        self.tasks.ensure_index("formula_pretty")
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("last_updated")
+        self.materials.ensure_index("sandboxes")
+        self.materials.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets all items to process into materials documents.
+        This does no datetime checking; relying on on whether
+        task_ids are included in the Materials Collection
+
+        Returns:
+            generator or list relevant tasks and materials to process into materials documents
+        """
+
+        self.logger.info("Materials builder started")
+        self.logger.info(
+            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"
+        )
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark buildtime for material documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all processed tasks:
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+
+        self.logger.info("Finding tasks to process")
+        all_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(temp_query, [self.tasks.key])
+        }
+        processed_tasks = {
+            t_id
+            for d in self.materials.query({}, ["task_ids"])
+            for t_id in d.get("task_ids", [])
+        }
+        to_process_tasks = all_tasks - processed_tasks
+        to_process_forms = self.tasks.distinct(
+            "formula_pretty", {self.tasks.key: {"$in": list(to_process_tasks)}}
+        )
+        self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks")
+        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
+
+        # Set total for builder bars to have a total
+        self.total = len(to_process_forms)
+
+        if self.task_validation:
+            invalid_ids = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {"is_valid": False}, [self.task_validation.key]
+                )
+            }
+        else:
+            invalid_ids = set()
+
+        projected_fields = [
+            "last_updated",
+            "completed_at",
+            "task_id",
+            "formula_pretty",
+            "output.energy_per_atom",
+            "output.structure",
+            "input",
+            "orig_inputs",
+            "input.structure",
+            "tags",
+        ]
+
+        for formula in to_process_forms:
+            tasks_query = dict(temp_query)
+            tasks_query["formula_pretty"] = formula
+            tasks = list(
+                self.tasks.query(criteria=tasks_query, properties=None)
+            )
+            for t in tasks:
+                if t[self.tasks.key] in invalid_ids:
+                    t["is_valid"] = False
+                else:
+                    t["is_valid"] = True
+
+            yield tasks
+
+    def process_item(self, tasks: List[Dict]) -> List[Dict]:
+        """
+        Process the tasks into a list of materials
+
+        Args:
+            tasks [dict] : a list of task docs
+
+        Returns:
+            ([dict],list) : a list of new materials docs and a list of task_ids that were processsed
+        """
+
+        tasks = [TaskDocument(**task) for task in tasks]
+        formula = tasks[0].formula_pretty
+        task_ids = [task.task_id for task in tasks]
+        self.logger.debug(f"Processing {formula} : {task_ids}")
+
+        grouped_tasks = self.filter_and_group_tasks(tasks)
+
+        materials = [MaterialsDoc.from_tasks(group) for group in grouped_tasks]
+        self.logger.debug(f"Produced {len(materials)} materials for {formula}")
+
+        return [mat.dict() for mat in materials]
+
+    def update_targets(self, items: List[List[Dict]]):
+        """
+        Inserts the new task_types into the task_types collection
+
+        Args:
+            items ([([dict],[int])]): A list of tuples of materials to update and the corresponding
+                processed task_ids
+        """
+
+        items = list(filter(None, chain.from_iterable(items)))
+
+        for item in items:
+            item.update({"_bt": self.timestamp})
+
+        material_ids = list({item["material_id"] for item in items})
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} materials")
+            self.materials.remove_docs({self.materials.key: {"$in": material_ids}})
+            self.materials.update(
+                docs=jsanitize(items, allow_bson=True),
+                key=["material_id", "sandboxes"],
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def filter_and_group_tasks(self, tasks: List[TaskDocument]) -> Iterator[List[Dict]]:
+        """
+        Groups tasks by structure matching
+        """
+
+        filtered_tasks = [
+            task
+            for task in tasks
+            if any(
+                allowed_type is task.task_type
+                for allowed_type in self._allowed_task_types
+            )
+        ]
+
+        structures = []
+
+        for idx, task in enumerate(filtered_tasks):
+            s = task.output.structure
+            s.index = idx
+            structures.append(s)
+
+        grouped_structures = group_structures(
+            structures,
+            ltol=self.ltol,
+            stol=self.stol,
+            angle_tol=self.angle_tol,
+            symprec=self.symprec,
+        )
+        from itertools import groupby
+
+        def get_sg(struc, symprec=SETTINGS.SYMPREC) -> int:
+            struc.remove_oxidation_states()
+            struc.remove_spin()
+            for p in struc.site_properties:
+                struc.remove_site_property(p)
+            """helper function to get spacegroup with a loose tolerance"""
+            try:
+                return struc.get_space_group_info(symprec=symprec)[0]
+            except Exception:
+                return -1
+
+        for key, group in groupby(sorted(structures, key=get_sg), key=get_sg):
+#        for group in grouped_structures:
+            grouped_tasks = [filtered_tasks[struc.index] for struc in group]
+            sandboxes = {frozenset(task.sandboxes) for task in grouped_tasks}
+
+            for sbx_set in maximal_spanning_non_intersecting_subsets(sandboxes):
+                yield [
+                    task
+                    for task in grouped_tasks
+                    if len(set(task.sandboxes).intersection(sbx_set)) > 0
+                ]
diff --git a/emmet-builders/emmet/builders/cp2k/utils.py b/emmet-builders/emmet/builders/cp2k/utils.py
new file mode 100644
index 0000000000..933ca49991
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/utils.py
@@ -0,0 +1,91 @@
+import numpy as np
+from pymatgen.ext.matproj import MPRester
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+from typing import List
+
+
+def unpack(query, d):
+    if not query:
+        return d
+    if isinstance(d, List):
+        return unpack(query[1:], d.__getitem__(int(query.pop(0))))
+    return unpack(query[1:], d.__getitem__(query.pop(0)))
+
+
+# TODO Move polaron compare to a different function to make validation easier
+def matcher(bulk_struc, defect_struc, final_bulk_struc=None, final_defect_struc=None):
+    matching_indices = []
+    dis = []
+
+    for j in range(len(defect_struc)):
+        for i in range(len(bulk_struc)):
+            if (
+                    bulk_struc[i].species == defect_struc[j].species
+                    and bulk_struc[i].distance(defect_struc[j]) < .02
+                    and bulk_struc[i].properties.get('ghost', False) == defect_struc[j].properties.get('ghost', False)
+            ):
+                matching_indices.append((i, j))
+                dis.append(j)
+                break
+
+    def_index = list(set(range(len(defect_struc))).difference(set(dis)))
+
+    # Consider a possible polaron
+    if len(def_index) == 0:
+        matching_indices = []
+
+        for j in range(len(defect_struc)):
+            for i in range(len(bulk_struc)):
+                if (
+                        bulk_struc[i].specie.as_dict().get('element') == defect_struc[j].specie.as_dict().get('element')
+                        and bulk_struc[i].distance(defect_struc[j]) < .02
+                        and bulk_struc[i].properties.get('ghost', False) == defect_struc[j].properties.get('ghost',
+                                                                                                           False)):
+                    matching_indices.append((i, j))
+                    break
+
+        oxi_diff = [abs(final_defect_struc[d].specie.oxi_state - final_bulk_struc[b].specie.oxi_state) for b, d in matching_indices]
+        def_index = np.argmax(oxi_diff)
+        matching_indices.pop(def_index)
+        return def_index, matching_indices
+
+    elif len(def_index) > 1:
+        raise ValueError("The provided defect structure and bulk structure "
+                         "have more than one potential defect site")
+
+    return def_index[0], matching_indices
+
+
+def get_dielectric(mpid):
+    with MPRester() as mp:
+        dat = mp.get_data(mpid, prop='diel')
+    try:
+        return dat[0]['diel']['e_total']
+    except:
+        return None
+
+
+from copy import deepcopy
+
+
+def get_mpid(s):
+    struc = deepcopy(s)
+    struc.remove_oxidation_states()
+    struc.remove_spin()
+    for p in struc.site_properties:
+        struc.remove_site_property(p)
+    sga = SpacegroupAnalyzer(struc)
+    with MPRester() as mp:
+        dat = mp.query(
+            criteria={
+                'chemsys': struc.composition.chemical_system,
+                'spacegroup.symbol': sga.get_space_group_symbol()
+            },
+            properties=['material_id', 'formation_energy_per_atom']
+        )
+
+    dat.sort(key=lambda x: x['formation_energy_per_atom'])
+    try:
+        return dat[0]['material_id']
+    except:
+        return None
diff --git a/emmet-builders/setup.py b/emmet-builders/setup.py
index 93fa6f28f4..0ab2ceb2f9 100644
--- a/emmet-builders/setup.py
+++ b/emmet-builders/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name="emmet-builders",
-    use_scm_version={"relative_to": Path(__file__).parent},
+    use_scm_version={"root": "..", "relative_to": __file__},
     setup_requires=["setuptools_scm"],
     description="Builders for the Emmet Library",
     author="The Materials Project",
diff --git a/emmet-core/emmet/core/cp2k/calc_types/__init__.py b/emmet-core/emmet/core/cp2k/calc_types/__init__.py
new file mode 100644
index 0000000000..9503f160d7
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/__init__.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+try:
+    import emmet.core.cp2k.calc_types.enums
+except ImportError:
+    import emmet.core.cp2k.calc_types.generate
+
+from emmet.core.cp2k.calc_types.enums import RunType, TaskType, CalcType
+from emmet.core.cp2k.calc_types.utils import run_type, task_type, calc_type
diff --git a/emmet-core/emmet/core/cp2k/calc_types/enums.py b/emmet-core/emmet/core/cp2k/calc_types/enums.py
new file mode 100644
index 0000000000..b0d280fa8e
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/enums.py
@@ -0,0 +1,64 @@
+"""
+Autogenerated Enums for CP2K RunType, TaskType, and CalcType
+Do not edit this by hand. Edit generate.py or run_types.yaml instead
+"""
+from emmet.core.utils import ValueEnum
+
+
+class RunType(ValueEnum):
+    """ CP2K calculation run types """
+
+    PBE = "PBE"
+    GGA = "GGA"
+    HYBRID = "HYBRID"
+    HSE06 = "HSE06"
+    PB0 = "PB0"
+    RSH = "RSH"
+    PBE_U = "PBE+U"
+    GGA_U = "GGA+U"
+    HYBRID_U = "HYBRID+U"
+    HSE06_U = "HSE06+U"
+    PB0_U = "PB0+U"
+    RSH_U = "RSH+U"
+    LDA = "LDA"
+    LDA_U = "LDA+U"
+
+
+class TaskType(ValueEnum):
+    """ CP2K calculation task types """
+
+    Static = "Static"
+    Structure_Optimization = "Structure Optimization"
+
+
+class CalcType(ValueEnum):
+    """ CP2K calculation types """
+
+    PBE_Static = "PBE Static"
+    PBE_Structure_Optimization = "PBE Structure Optimization"
+    GGA_Static = "GGA Static"
+    GGA_Structure_Optimization = "GGA Structure Optimization"
+    HYBRID_Static = "HYBRID Static"
+    HYBRID_Structure_Optimization = "HYBRID Structure Optimization"
+    HSE06_Static = "HSE06 Static"
+    HSE06_Structure_Optimization = "HSE06 Structure Optimization"
+    PB0_Static = "PB0 Static"
+    PB0_Structure_Optimization = "PB0 Structure Optimization"
+    RSH_Static = "RSH Static"
+    RSH_Structure_Optimization = "RSH Structure Optimization"
+    PBE_U_Static = "PBE+U Static"
+    PBE_U_Structure_Optimization = "PBE+U Structure Optimization"
+    GGA_U_Static = "GGA+U Static"
+    GGA_U_Structure_Optimization = "GGA+U Structure Optimization"
+    HYBRID_U_Static = "HYBRID+U Static"
+    HYBRID_U_Structure_Optimization = "HYBRID+U Structure Optimization"
+    HSE06_U_Static = "HSE06+U Static"
+    HSE06_U_Structure_Optimization = "HSE06+U Structure Optimization"
+    PB0_U_Static = "PB0+U Static"
+    PB0_U_Structure_Optimization = "PB0+U Structure Optimization"
+    RSH_U_Static = "RSH+U Static"
+    RSH_U_Structure_Optimization = "RSH+U Structure Optimization"
+    LDA_Static = "LDA Static"
+    LDA_Structure_Optimization = "LDA Structure Optimization"
+    LDA_U_Static = "LDA+U Static"
+    LDA_U_Structure_Optimization = "LDA+U Structure Optimization"
diff --git a/emmet-core/emmet/core/cp2k/calc_types/generate.py b/emmet-core/emmet/core/cp2k/calc_types/generate.py
new file mode 100644
index 0000000000..a6b2b21cd8
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/generate.py
@@ -0,0 +1,90 @@
+""" Module to define various calculation types as Enums for CP2K """
+import datetime
+from itertools import groupby, product
+from pathlib import Path
+from typing import Dict, Iterator, List
+
+import bson
+import numpy as np
+from monty.json import MSONable
+from monty.serialization import loadfn
+from pydantic import BaseModel
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
+from pymatgen.core.structure import Structure
+from typing_extensions import Literal
+
+from emmet.core import SETTINGS
+from emmet.core.utils import ValueEnum
+
+_RUN_TYPE_DATA = loadfn(str(Path(__file__).parent.joinpath("run_types.yaml").resolve()))
+_TASK_TYPES = [
+    "Static",
+    "Structure Optimization",
+]
+
+_RUN_TYPES = (
+    [
+        rt
+        for functional_class in _RUN_TYPE_DATA
+        for rt in _RUN_TYPE_DATA[functional_class]
+    ]
+    + [
+        f"{rt}+U"
+        for functional_class in _RUN_TYPE_DATA
+        for rt in _RUN_TYPE_DATA[functional_class]
+    ]
+    + ["LDA", "LDA+U"]
+)
+
+
+def get_enum_source(enum_name, doc, items):
+    header = f"""
+class {enum_name}(ValueEnum):
+    \"\"\" {doc} \"\"\"\n
+"""
+    items = [f'    {const} = "{val}"' for const, val in items.items()]
+
+    return header + "\n".join(items)
+
+
+run_type_enum = get_enum_source(
+    "RunType",
+    "CP2K calculation run types",
+    dict(
+        {
+            "_".join(rt.split()).replace("+", "_").replace("-", "_"): rt
+            for rt in _RUN_TYPES
+        }
+    ),
+)
+task_type_enum = get_enum_source(
+    "TaskType",
+    "CP2K calculation task types",
+    {"_".join(tt.split()): tt for tt in _TASK_TYPES},
+)
+calc_type_enum = get_enum_source(
+    "CalcType",
+    "CP2K calculation types",
+    {
+        f"{'_'.join(rt.split()).replace('+','_').replace('-','_')}_{'_'.join(tt.split())}": f"{rt} {tt}"
+        for rt, tt in product(_RUN_TYPES, _TASK_TYPES)
+    },
+)
+
+
+with open(Path(__file__).parent / "enums.py", "w") as f:
+    f.write(
+        """\"\"\"
+Autogenerated Enums for CP2K RunType, TaskType, and CalcType
+Do not edit this by hand. Edit generate.py or run_types.yaml instead
+\"\"\"
+from emmet.core.utils import ValueEnum
+
+"""
+    )
+    f.write(run_type_enum)
+    f.write("\n\n")
+    f.write(task_type_enum)
+    f.write("\n\n")
+    f.write(calc_type_enum)
+    f.write("\n")
diff --git a/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
new file mode 100644
index 0000000000..aab07a0092
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
@@ -0,0 +1,17 @@
+GGA:
+  PBE:
+    FUNCTIONAL: PBE
+    FRACTION: 0
+  GGA:
+    GGA: --
+HYBRID:
+  HYBRID:
+    HYBRID: --
+  HSE06:
+    FRACTION: 0.25
+    INTERACTION_POTENTIAL: SHORTRANGE
+  PB0:
+    FRACTION: 0.25
+    INTERACTION_POTENTIAL: TRUNCATED
+  RSH:
+    INTERACTION_POTENTIAL: TRUNCATED_MIX_CL
diff --git a/emmet-core/emmet/core/cp2k/calc_types/utils.py b/emmet-core/emmet/core/cp2k/calc_types/utils.py
new file mode 100644
index 0000000000..0ca93d4964
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/utils.py
@@ -0,0 +1,127 @@
+""" Module to define various calculation types as Enums for CP2K """
+from pathlib import Path
+from typing import Dict
+from monty.serialization import loadfn
+
+
+from emmet.core.cp2k.calc_types.enums import RunType, TaskType, CalcType
+
+_RUN_TYPE_DATA = loadfn(str(Path(__file__).parent.joinpath("run_types.yaml").resolve()))
+
+
+def run_type(dft: Dict) -> RunType:
+    """
+    Determines the run_type from the CP2K input dict
+    This is adapted from pymatgen to be far less unstable
+
+    Args:
+        dft: dictionary of dft parameters (standard from task doc)
+    """
+
+    def _variant_equal(v1, v2) -> bool:
+        """
+        helper function to deal with strings
+        """
+        if isinstance(v1, str) and isinstance(v2, str):
+            return v1.strip().upper() == v2.strip().upper()
+        else:
+            return v1 == v2
+
+    is_hubbard = '+U' if dft.get('dft_plus_u') else ''
+
+    parameters = {
+        'FUNCTIONAL': dft.get('functional'),
+        'INTERACTION_POTENTIAL': dft.get('hfx', {}).get('Interaction_Potential'),
+        'FRACTION': dft.get('hfx', {}).get('FRACTION', 0)
+    }
+
+    # Standard calc will only have one functional. If there are multiple functionals
+    # used this is either a hybrid calc or a non-generic mixed calculation.
+    if len(parameters['FUNCTIONAL']) == 1:
+        parameters['FUNCTIONAL'] = parameters['FUNCTIONAL'][0]
+
+    # If all parameters in for the functional_class.special_type located in
+    # run_types.yaml are met, then that is the run type.
+    for functional_class in _RUN_TYPE_DATA:
+        for special_type, params in _RUN_TYPE_DATA[functional_class].items():
+            if all(
+                [
+                    _variant_equal(parameters.get(param, None), value)
+                    for param, value in params.items()
+                ]
+            ):
+                return RunType(f"{functional_class}{is_hubbard}")
+
+
+def task_type(
+    inputs: Dict
+) -> TaskType:
+    """
+    Determines the task type
+
+    Args:
+        inputs
+    """
+
+    calc_type = []
+
+    cp2k_run_type = inputs.get('Run_type', '')
+
+    if cp2k_run_type.upper() in ['ENERGY', 'ENERGY_FORCE', 'WAVEFUNCTION_OPTIMIZATION', 'WFN_OPT']:
+        calc_type.append('Static')
+
+    elif cp2k_run_type.upper() in ['GEO_OPT', 'GEOMETRY_OPTIMIZATION', 'CELL_OPT']:
+        calc_type.append('Structure Optimization')
+
+    elif cp2k_run_type.upper() in ['BAND']:
+        calc_type.append('Band')
+
+    elif cp2k_run_type.upper() in ['MOLECULAR_DYNAMICS', 'MD']:
+        calc_type.append('Molecular Dynamics')
+
+    elif cp2k_run_type.upper() in ['MONTE_CARLO', 'MC', 'TMC', 'TAMC']:
+        calc_type.append('Monte Carlo')
+
+    elif cp2k_run_type.upper() in ['LINEAR_RESPONSE', 'LR']:
+        calc_type.append('Linear Response')
+
+    elif cp2k_run_type.upper() in ['VIBRATIONAL_ANALYSIS', 'NORMAL_MODES']:
+        calc_type.append('Vibrational Analysis')
+
+    elif cp2k_run_type.upper() in ['ELECTRONIC_SPECTRA', 'SPECTRA']:
+        calc_type.append('Electronic Spectra')
+
+    elif cp2k_run_type.upper() in ['NEGF']:
+        calc_type.append('Non-equilibrium Green\'s Function')
+
+    elif cp2k_run_type.upper() in ['PINT', 'DRIVER']:
+        calc_type.append('Path Integral')
+
+    elif cp2k_run_type.upper() in ['RT_PROPAGATION', 'EHRENFEST_DYN']:
+        calc_type.append('Real-time propagation')
+
+    elif cp2k_run_type.upper() in ['BSSE']:
+        calc_type.append('Base set superposition error')
+
+    elif cp2k_run_type.upper() in ['DEBUG']:
+        calc_type.append('Debug analysis')
+
+    elif cp2k_run_type.upper() in ['NONE']:
+        calc_type.append('None')
+
+    return TaskType(" ".join(calc_type))
+
+
+def calc_type(
+    inputs: Dict,
+) -> CalcType:
+    """
+    Determines the calc type
+
+    Args:
+        inputs: inputs dict with an incar, kpoints, potcar, and poscar dictionaries
+        parameters: Dictionary of VASP parameters from Vasprun.xml
+    """
+    rt = run_type(inputs).value
+    tt = task_type(inputs).value
+    return CalcType(f"{rt} {tt}")
diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py
new file mode 100644
index 0000000000..47390f0990
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/material.py
@@ -0,0 +1,194 @@
+""" Core definition of a Materials Document """
+from datetime import datetime
+from functools import partial
+from typing import ClassVar, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union
+
+from pydantic import BaseModel, Field, create_model
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
+
+from emmet.core import SETTINGS
+from emmet.core.material import MaterialsDoc as CoreMaterialsDoc
+from emmet.core.material import PropertyOrigin as PropertyOrigin
+from emmet.core.structure import StructureMetadata
+from emmet.core.cp2k.calc_types import CalcType, RunType, TaskType
+from emmet.core.cp2k.task import TaskDocument
+from emmet.stubs import ComputedEntry, Structure
+
+
+class MaterialsDoc(CoreMaterialsDoc, StructureMetadata):
+
+    calc_types: Mapping[str, CalcType] = Field(  # type: ignore
+        None,
+        description="Calculation types for all the calculations that make up this material",
+    )
+    task_types: Mapping[str, TaskType] = Field(
+        None,
+        description="Task types for all the calculations that make up this material",
+    )
+    run_types: Mapping[str, RunType] = Field(
+        None,
+        description="Run types for all the calculations that make up this material",
+    )
+
+    origins: Sequence[PropertyOrigin] = Field(
+        None, description="Mappingionary for tracking the provenance of properties"
+    )
+
+    entries: Mapping[RunType, ComputedEntry] = Field(
+        None, description="Dictionary for tracking entries for CP2K calculations"
+    )
+
+    @classmethod
+    def from_tasks(
+        cls,
+        task_group: List[TaskDocument],
+        quality_scores=SETTINGS.CP2K_QUALITY_SCORES,
+        special_tags=SETTINGS.CP2K_SPECIAL_TAGS,
+    ) -> "MaterialsDoc":
+        """
+        Converts a group of tasks into one material
+        """
+
+        # Metadata
+        last_updated = max(task.last_updated for task in task_group)
+        created_at = min(task.completed_at for task in task_group)
+        task_ids = list({task.task_id for task in task_group})
+        sandboxes = list({sbxn for task in task_group for sbxn in task.sandboxes})
+
+        deprecated_tasks = list(
+            {task.task_id for task in task_group if not task.is_valid}
+        )
+        run_types = {task.task_id: task.run_type for task in task_group}
+        task_types = {task.task_id: task.task_type for task in task_group}
+        calc_types = {task.task_id: task.calc_type for task in task_group}
+
+        # TODO: Fix the type checking by hardcoding the Enums?
+        structure_optimizations = [
+            task
+            for task in task_group
+            if task.task_type == TaskType.Structure_Optimization  # type: ignore
+        ]
+        statics = [task for task in task_group if task.task_type == TaskType.Static]  # type: ignore
+
+        # Material ID
+        possible_mat_ids = [task.task_id for task in structure_optimizations] # TODO remove + statics ?
+        possible_mat_ids = sorted(possible_mat_ids, key=ID_to_int)
+
+        from emmet.builders.cp2k.utils import get_mpid
+        matched_id = get_mpid([task.output.structure for task in structure_optimizations + statics][0])
+        if matched_id: possible_mat_ids.insert(0, matched_id)
+
+        if len(possible_mat_ids) == 0:
+            raise Exception(f"Could not find a material ID for {task_ids}")
+        else:
+            material_id = possible_mat_ids[0]
+
+        def _structure_eval(task: TaskDocument):
+            """
+            Helper function to order structures optimization and statics calcs by
+            - Functional Type
+            - Spin polarization
+            - Special Tags
+            - Energy
+            """
+
+            task_run_type = task.run_type
+
+            is_valid = task.task_id in deprecated_tasks
+
+            return (
+                -1 * is_valid,
+                -1 * quality_scores.get(task_run_type.value, 0),
+                -1 * (2 if task.input.dft.get('UKS') else 1),
+                -1 * task.nsites  # TODO Make k-point density
+                -1 * sum(task.input.get(tag, False) for tag in special_tags),  # TODO what is this?
+                task.output.energy_per_atom,
+            )
+
+        structure_calcs = structure_optimizations + statics
+        best_structure_calc = sorted(structure_calcs, key=_structure_eval)[0]
+        structure = best_structure_calc.output.structure
+
+        # Initial Structures
+        initial_structures = [task.input.structure for task in task_group]
+        sm = StructureMatcher(
+            ltol=0.1, stol=0.1, angle_tol=0.1, scale=False, attempt_supercell=False
+        )
+        initial_structures = [
+            group[0] for group in sm.group_structures(initial_structures)
+        ]
+
+        # Deprecated
+        deprecated = all(
+            task.task_id in deprecated_tasks for task in structure_optimizations
+        )
+
+        # Origins
+        origins = [
+            PropertyOrigin(
+                name="structure",
+                task_id=best_structure_calc.task_id,
+                last_updated=best_structure_calc.last_updated,
+            )
+        ]
+        from pymatgen.entries.computed_entries import ComputedStructureEntry
+        # entries
+        entries = {}
+        all_run_types = set(run_types.values())
+        for rt in all_run_types:
+            relevant_calcs = sorted(
+                [doc for doc in structure_calcs if doc.run_type == rt],
+                key=_structure_eval,
+            )
+            if len(relevant_calcs) > 0:
+                best_task_doc = relevant_calcs[0]
+                entry = best_task_doc.entry
+                entry.data["task_id"] = entry.entry_id
+                entry.entry_id = material_id
+                # TODO Standard material doc doesn't have this
+                entries[rt] = ComputedStructureEntry(
+                    structure=best_task_doc.output.structure,
+                    energy=entry.energy,
+                    correction=entry.correction,
+                    energy_adjustments=entry.energy_adjustments,
+                    parameters=entry.parameters,
+                    data=entry.data,
+                    entry_id=entry.entry_id
+                )
+
+        # Warnings
+        # TODO: What warning should we process?
+
+        return cls.from_structure(
+            structure=structure,
+            material_id=material_id,
+            last_updated=last_updated,
+            created_at=created_at,
+            task_ids=task_ids,
+            calc_types=calc_types,
+            run_types=run_types,
+            task_types=task_types,
+            initial_structures=initial_structures,
+            deprecated=deprecated,
+            deprecated_tasks=deprecated_tasks,
+            origins=origins,
+            entries=entries,
+            sandboxes=sandboxes if sandboxes else None,
+        )
+
+
+def ID_to_int(s_id: str) -> Tuple[str, int]:
+    """
+    Converts a string id to tuple
+    falls back to assuming ID is an Int if it can't process
+    Assumes string IDs are of form "[chars]-[int]" such as mp-234
+    """
+    if isinstance(s_id, str):
+        return (s_id.split("-")[0], int(str(s_id).split("-")[-1]))
+    elif isinstance(s_id, (int, float)):
+        return ("", s_id)
+    else:
+        return None
+
+
+
diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py
new file mode 100644
index 0000000000..160870614e
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/task.py
@@ -0,0 +1,210 @@
+""" Core definition of a CP2K Task Document """
+from datetime import datetime
+from functools import lru_cache, partial
+from typing import ClassVar, Dict, List, Optional, Union
+
+from pydantic import BaseModel, Field, validator
+from pymatgen.analysis.magnetism import CollinearMagneticStructureAnalyzer, Ordering
+from pymatgen.analysis.structure_analyzer import oxide_type
+from pymatgen.io.cp2k.inputs import Cp2kInput
+
+from emmet.core import SETTINGS
+from emmet.core.structure import StructureMetadata
+from emmet.core.utils import ValueEnum
+from emmet.core.cp2k.calc_types import (
+    CalcType,
+    RunType,
+    TaskType,
+    calc_type,
+    run_type,
+    task_type,
+)
+from emmet.stubs import ComputedEntry, Matrix3D, Structure, Vector3D
+
+
+class Status(ValueEnum):
+    """
+    CP2K Calculation State
+    """
+
+    SUCESS = "successful"
+    FAILED = "failed"
+
+
+class InputSummary(BaseModel):
+    """
+    Summary of inputs for a CP2K calculation
+    """
+
+    structure: Structure = Field(None, description="The input structure object")
+
+    atomic_kind_info: Dict = Field(None, description="Description of parameters used for each atomic kind")
+
+    cp2k_input_set: Dict = Field(None, description="The cp2k input used for this task")
+
+    dft: Dict = Field(None, description="Description of the DFT parameters used in the last calc of this task")
+
+    cp2k_global: Dict = Field(None, description="CP2K global parameters used in the last calc of this task")
+
+    @validator('atomic_kind_info')
+    def remove_unnecessary(cls, atomic_kind_info):
+        for k in atomic_kind_info:
+            if 'total_pseudopotential_energy' in atomic_kind_info[k]:
+                del atomic_kind_info[k]['total_pseudopotential_energy']
+        return atomic_kind_info
+
+    @validator('dft')
+    def cleanup_dft(cls, dft):
+        if any(v.upper() == 'UKS' for v in dft.values()):
+            dft['UKS'] = True
+        return dft
+
+
+class OutputSummary(BaseModel):
+    """
+    Summary of the outputs for a CP2K calculation
+    """
+
+    structure: Structure = Field(None, description="The output structure object")
+    energy: float = Field(
+        None, description="The final total DFT energy for the last calculation"
+    )
+    energy_per_atom: float = Field(
+        None, description="The final DFT energy per atom for the last calculation"
+    )
+    bandgap: float = Field(None, description="The DFT band gap for the last calculation")
+    forces: List[Vector3D] = Field(
+        None, description="Forces on atoms from the last calculation"
+    )
+    stress: Matrix3D = Field(
+        None, description="Stress on the unit cell from the last calculation"
+    )
+
+
+class RunStatistics(BaseModel):
+    """
+    Summary of the Run statistics for a CP2K calculation
+    """
+
+    average_memory: float = Field(None, description="The average memory used in kb")
+    max_memory: float = Field(None, description="The maximum memory used in kb")
+    elapsed_time: float = Field(None, description="The real time elapsed in seconds")
+    system_time: float = Field(None, description="The system CPU time in seconds")
+    user_time: float = Field(
+        None, description="The user CPU time spent by CP2K in seconds"
+    )
+    total_time: float = Field(
+        None, description="The total CPU time for this calculation"
+    )
+    cores: int = Field(None, description="The number of cores used by CP2K")
+
+
+class TaskDocument(StructureMetadata):
+    """
+    Definition of CP2K Task Document
+    """
+
+    dir_name: str = Field(None, description="The directory for this CP2K task")
+    run_stats: RunStatistics = Field(
+        None,
+        description="Summary of runtime statistics for each calculation in this task",
+    )
+    completed_at: datetime = Field(
+        None, description="Timestamp for when this task was completed"
+    )
+    last_updated: datetime = Field(
+        None, description="Timestamp for this task document was last updated"
+    )
+
+    is_valid: bool = Field(
+        True, description="Whether this task document passed validation or not"
+    )
+
+    input: InputSummary = Field(None)
+
+    output: OutputSummary = Field(None)
+
+    state: Status = Field(None, description="State of this calculation")
+
+    orig_inputs: Dict = Field(
+        None, description="Summary of the original CP2K inputs"
+    )
+    task_id: str = Field(None, description="the Task ID For this document")
+    tags: List[str] = Field([], description="Metadata tags for this task document")
+
+    sandboxes: List[str] = Field(
+        None, description="List of sandboxes this task document is allowed in"
+    )
+
+    run_type: RunType = Field(None)
+    task_type: TaskType = Field(None)
+    calc_type: CalcType = Field(None)
+
+    @validator('input', pre=True, always=True)
+    def rename_global(cls, input):
+        if 'cp2k_global' not in input:
+            input['cp2k_global'] = input.pop('global')
+        return input
+
+    @validator('run_type', pre=True, always=True)
+    def find_run_type(cls, v, values):
+        return run_type(values.get('input').dft)
+
+    @validator('task_type', pre=True, always=True)
+    def find_task_type(cls, v, values):
+        return task_type(values.get('input').cp2k_global)
+
+    @validator('calc_type', pre=True, always=True)
+    def find_calc_type(cls, v, values):
+        d = values.get('input').dft
+        d.update(values.get('input').cp2k_global)
+        return calc_type(d)
+
+    @property
+    def entry(self):
+        """ Turns a Task Doc into a ComputedEntry"""
+        entry_dict = {
+            "correction": 0.0,
+            "entry_id": self.task_id,
+            "composition": self.output.structure.composition,
+            "energy": self.output.energy,
+            "parameters": {
+                "atomic_kind_info": self.input.atomic_kind_info,
+                # This is done to be compatible with MontyEncoder for the ComputedEntry
+                "run_type": str(self.run_type),
+            },
+            "data": {
+                "oxide_type": oxide_type(self.output.structure),
+                "last_updated": self.last_updated,
+            },
+        }
+
+        return ComputedEntry.from_dict(entry_dict)
+
+    @validator("sandboxes", always=True)
+    def tags_to_sandboxes(cls, v, values):
+        tag_mapping = SETTINGS.TAGS_TO_SANDBOXES
+
+        if v is None:
+
+            if tag_mapping is not None:
+
+                sandboxed_tags = {
+                    tag for tag_list in tag_mapping.values() for tag in tag_list
+                }
+
+                if any(tag in sandboxed_tags for tag in values.get("tags", [])):
+
+                    v = sorted(
+                        {
+                            sandbox
+                            for sandbox, tags in tag_mapping.items()
+                            if len(set(tags).intersection(values.get("tags", []))) > 0
+                        }
+                    )
+                else:
+                    v = ["core"]
+            else:
+                v = ["core"]
+
+        return v
diff --git a/emmet-core/emmet/core/cp2k/validation.py b/emmet-core/emmet/core/cp2k/validation.py
new file mode 100644
index 0000000000..d0c8a1062a
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/validation.py
@@ -0,0 +1,121 @@
+from datetime import datetime
+from typing import Dict, List, Union
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+from emmet.core import SETTINGS
+from emmet.core.utils import DocEnum
+from emmet.core.vasp.task import TaskDocument
+from emmet.stubs import Structure
+
+
+class DeprecationMessage(DocEnum):
+
+    encut = "cutoff", "PW cutoff too low"
+    ldau = "ldau", "LDAU parameters don't match"
+    manual = "manual", "Manually deprecated"
+
+
+class ValidationDoc(BaseModel):
+    """
+    Validation document for a CP2K calculation
+    """
+
+    task_id: str = Field(..., description="The task_id for this validation document")
+    valid: bool = Field(False, description="Whether this task is valid or not")
+    last_updated: datetime = Field(
+        description="Last updated date for this document",
+        default_factory=datetime.utcnow,
+    )
+    reasons: List[Union[DeprecationMessage, str]] = Field(
+        [], description="List of deprecation tags detailing why this task isn't valid"
+    )
+
+    class Config:
+        extra = "allow"
+
+    @classmethod
+    def from_task_doc(
+        cls,
+        task_doc: TaskDocument,
+        kpts_tolerance: float = SETTINGS.VASP_KPTS_TOLERANCE,
+        input_sets: Dict[str, type] = SETTINGS.VASP_DEFAULT_INPUT_SETS,
+        LDAU_fields: List[str] = SETTINGS.VASP_CHECKED_LDAU_FIELDS,
+    ) -> "ValidationDoc":
+        """
+        Determines if a calculation is valid based on expected input parameters from a pymatgen inputset
+
+        Args:
+            task_doc: the task document to process
+            input_sets (dict): a dictionary of task_types -> pymatgen input set for validation
+            kpts_tolerance (float): the tolerance to allow kpts to lag behind the input set settings
+            LDAU_fields (list(String)): LDAU fields to check for consistency
+        """
+
+        structure = task_doc.output.structure
+        task_type = task_doc.task_type
+        inputs = task_doc.orig_inputs
+
+        is_valid = True
+        reasons = []
+        data = {}
+
+        if task_type in input_sets:
+            valid_input_set = input_sets[task_type](structure)
+
+            # Checking K-Points
+            valid_num_kpts = valid_input_set.kpoints.num_kpts or np.prod(
+                valid_input_set.kpoints.kpts[0]
+            )
+            num_kpts = inputs.get("kpoints", {}).get("nkpoints", 0) or np.prod(
+                inputs.get("kpoints", {}).get("kpoints", [1, 1, 1])
+            )
+            data["kpts_ratio"] = num_kpts / valid_num_kpts
+            if data["kpts_ratio"] < kpts_tolerance:
+                is_valid = False
+                reasons.append(DeprecationMessage.kpoints)
+
+            # Checking ENCUT
+            encut = inputs.get("incar", {}).get("ENCUT")
+            valid_encut = valid_input_set.incar["ENCUT"]
+            data["encut_ratio"] = float(encut) / valid_encut  # type: ignore
+            if data["encut_ratio"] < 1:
+                is_valid = False
+                reasons.append(DeprecationMessage.encut)
+
+            # Checking U-values
+            if valid_input_set.incar.get("LDAU"):
+                # Assemble actual input LDAU params into dictionary to account for possibility
+                # of differing order of elements
+                structure_set_symbol_set = structure.symbol_set
+                inputs_ldau_fields = [structure_set_symbol_set] + [
+                    inputs.get("incar", {}).get(k, []) for k in LDAU_fields
+                ]
+                input_ldau_params = {d[0]: d[1:] for d in zip(*inputs_ldau_fields)}
+
+                # Assemble required input_set LDAU params into dictionary
+                input_set_symbol_set = valid_input_set.poscar.structure.symbol_set
+                input_set_ldau_fields = [input_set_symbol_set] + [
+                    valid_input_set.incar.get(k) for k in LDAU_fields
+                ]
+                input_set_ldau_params = {
+                    d[0]: d[1:] for d in zip(*input_set_ldau_fields)
+                }
+
+                if any(
+                    input_set_ldau_params[el] != input_params
+                    for el, input_params in input_ldau_params.items()
+                ):
+                    is_valid = False
+                    reasons.append(DeprecationMessage.ldau)
+
+        doc = ValidationDoc(
+            task_id=task_doc.task_id,
+            task_type=task_doc.task_type,
+            run_type=task_doc.run_type,
+            valid=is_valid,
+            reasons=reasons,
+            **data
+        )
+        return doc
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
new file mode 100644
index 0000000000..8a270a9644
--- /dev/null
+++ b/emmet-core/emmet/core/defect.py
@@ -0,0 +1,387 @@
+""" Core definition for Defect property Document """
+from datetime import datetime
+from typing import ClassVar, Dict, Sequence, Tuple, Mapping, List
+
+import numpy as np
+from pydantic import BaseModel, Field
+from pymatgen.analysis.defects.core import Defect, DefectEntry
+
+from emmet.core import SETTINGS
+from emmet.core.material import PropertyDoc
+from emmet.core.cp2k.task import TaskDocument
+from emmet.core.cp2k.material import MaterialsDoc
+from emmet.core.structure import StructureMetadata
+from emmet.stubs import Matrix3D, Vector3D
+from emmet.core.cp2k.calc_types.utils import run_type
+
+from pymatgen.core import Structure, Composition
+from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
+import numpy as np
+from pymatgen.ext.matproj import MPRester
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+from monty.json import MontyDecoder
+from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
+from itertools import groupby
+from pydantic import BaseModel, Field, validator, dataclasses
+from pydantic.dataclasses import dataclass
+from emmet.builders.cp2k.utils import get_mpid, get_dielectric, matcher
+from pydantic.class_validators import root_validator
+from pymatgen.entries.computed_entries import CompositionEnergyAdjustment, ComputedStructureEntry
+from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, DefectPredominanceDiagram
+from pydantic import BaseModel, ValidationError, root_validator
+from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
+from pymatgen.core import Element, Species
+
+
+class DefectDoc(BaseModel):
+    """
+    A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
+
+    This document can contain an arbitrary number of defect entries, originating from
+    pairs (defect and bulk) of calculations. This document provides access to the "best"
+    calculation of each run_type.
+    """
+
+    # TODO VASP MatDocs dont need this, but I get error requiring arbitrary type
+    class Config:
+        arbitrary_types_allowed = True
+
+    property_name: ClassVar[str] = "defect"
+
+    material_id: str = Field(None, description="Unique material ID for the host material")
+
+    chemsys: List = Field(None, description="Chemical system of the bulk")
+
+    calc_types: Mapping[str, CalcType] = Field(  # type: ignore
+        None,
+        description="Calculation types for all the calculations that make up this material",
+    )
+    task_types: Mapping[str, TaskType] = Field(
+        None,
+        description="Task types for all the calculations that make up this material",
+    )
+    run_types: Mapping[str, RunType] = Field(
+        None,
+        description="Run types for all the calculations that make up this material",
+    )
+
+    tasks: Mapping[RunType, Tuple[TaskDocument, TaskDocument]] = Field(
+        None, description="Task documents (defect task, bulk task) for the defect entry of RunType"
+    )
+
+    task_ids: List = Field(
+        None, description="All task ids used in creating this defect doc."
+    )
+
+    entries: Mapping[RunType, DefectEntry] = Field(
+        None, description="Dictionary for tracking entries for CP2K calculations"
+    )
+
+    # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
+    @validator("entries", pre=True)
+    def decode(cls, entries):
+        for e in entries:
+            if isinstance(entries[e], dict):
+                entries[e] = MontyDecoder().process_decoded({k: v for k, v in entries[e].items()})
+        return entries
+
+    @classmethod
+    def from_tasks(cls, tasks: List, query='defect') -> "DefectDoc":
+        """
+        The standard way to create this document.
+
+        Args:
+            tasks: A list of defect,bulk task pairs which will be used to construct a
+                series of DefectEntry objects.
+            query: How to retrieve the defect object stored in the task.
+        """
+
+        task_group = [TaskDocument(**defect_task) for defect_task, bulk_task in tasks]
+
+        # Metadata
+        last_updated = datetime.now() or max(task.last_updated for task in task_group)
+        created_at = datetime.now() or min(task.completed_at for task in task_group)
+        task_ids = list({task.task_id for task in task_group})
+        sandboxes = list({sbxn for task in task_group for sbxn in task.sandboxes})
+
+        deprecated_tasks = list(
+            {task.task_id for task in task_group if not task.is_valid}
+        )
+
+        run_types = {task.task_id: task.run_type for task in task_group}
+        task_types = {task.task_id: task.task_type for task in task_group}
+        calc_types = {task.task_id: task.calc_type for task in task_group}
+
+        def _run_type(x):
+            return run_type(x[0]['input']['dft']).value
+
+        entries = {}
+        final_tasks = {}
+        for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
+            entry_and_docs = [
+                (
+                    cls.get_defect_entry_from_tasks(defect_task=defect_task, bulk_task=bulk_task, query=query),
+                    TaskDocument(**defect_task), TaskDocument(**bulk_task)
+                )
+                for defect_task, bulk_task in tasks_for_runtype
+            ]
+            entry_and_docs.sort(key=lambda x: x[1].nsites, reverse=True)  # TODO Turn into kpoint density sorting
+            best_entry, best_defect_task, best_bulk_task = entry_and_docs[0]
+            entries[best_defect_task.run_type] = best_entry
+            final_tasks[best_defect_task.run_type] = (best_defect_task, best_bulk_task)
+
+        data = {
+                'entries': entries,
+                'run_types': run_types,
+                'task_types': task_types,
+                'calc_types': calc_types,
+                'last_updated': last_updated,
+                'created_at': created_at,
+                'task_ids': set(task_ids),
+                'sandboxes': sandboxes,
+                'deprecated_tasks': deprecated_tasks,
+                'tasks': final_tasks,
+                'material_id': list({v.parameters['material_id'] for v in entries.values()})[0],
+                'entry_ids': {rt: entries[rt].entry_id for rt in entries},
+                'chemsys': list([v.defect.bulk_structure.composition.elements for v in entries.values()])[0],
+        }
+
+        return cls(**{k: v for k, v in data.items()})
+
+    @classmethod
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, query='defect'):
+        """
+        Extract a defect entry from a single pair (defect and bulk) of tasks.
+
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+            query: Mongo-style query to retrieve the defect object from the defect task
+        """
+        parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
+
+        defect_entry = DefectEntry(
+            cls.get_defect_from_task(query=query, task=defect_task),
+            uncorrected_energy=parameters['defect_energy'] - parameters['bulk_energy'],
+            parameters=parameters,
+            entry_id=parameters['entry_id']
+        )
+        DefectCompatibility().process_entry(defect_entry, perform_corrections=True)
+        defect_entry_as_dict = defect_entry.as_dict()
+        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
+
+        return defect_entry
+
+    @classmethod
+    def get_parameters_from_tasks(cls, defect_task, bulk_task):
+        """
+        Get parameters necessary to create a defect entry from defect and bulk task dicts
+
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+        """
+
+        def get_init(x):
+            if x.get('transformations', {}).get('history'):
+                return Structure.from_dict(x['transformations']['history'][0]['input_structure'])
+            return Structure.from_dict(x['input']['structure'])
+
+        init_defect_structure = get_init(defect_task)
+        init_bulk_structure = get_init(bulk_task)  # use init to avoid site_properties in matching
+
+        final_defect_structure = Structure.from_dict(defect_task['output']['structure'])
+        final_bulk_structure = Structure.from_dict(bulk_task['output']['structure'])
+
+        axis_grid = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_grid']]
+        bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree']]
+        defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree']]
+
+        dfi, site_matching_indices = matcher(
+            init_bulk_structure, init_defect_structure,
+            final_bulk_struc=final_bulk_structure, final_defect_struc=final_defect_structure
+        )
+
+        mpid = get_mpid(init_bulk_structure)
+
+        dielectric = get_dielectric(mpid)
+
+        parameters = {
+            'defect_energy': defect_task['output']['energy'],
+            'bulk_energy': bulk_task['output']['energy'],
+            'axis_grid': axis_grid,
+            'defect_frac_sc_coords': final_defect_structure[dfi].frac_coords,
+            'defect_planar_averages': defect_planar_averages,
+            'bulk_planar_averages': bulk_planar_averages,
+            'site_matching_indices': site_matching_indices,
+            'initial_defect_structure': init_defect_structure,
+            'final_defect_structure': final_defect_structure,
+            'vbm': bulk_task['output']['vbm'],
+            'cbm': bulk_task['output']['cbm'],
+            'dielectric': dielectric,
+            'material_id': mpid,
+            'entry_id': defect_task.get('task_id')
+        }
+
+        # cannot be easily queried for, so check here.
+        if 'v_hartree' in final_defect_structure.site_properties:
+            parameters['bulk_atomic_site_averages'] = final_bulk_structure.site_properties['v_hartree']
+        if 'v_hartree' in final_defect_structure.site_properties:
+            parameters['defect_atomic_site_averages'] = final_defect_structure.site_properties['v_hartree']
+
+        return parameters
+
+    @classmethod
+    def get_defect_from_task(cls, query, task):
+        """
+        Unpack a Mongo-style query and retrieve a defect object from a task.
+        """
+        defect = unpack(query.split('.'), task)
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
+
+class DefectThermoDoc(BaseModel):
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    property_name: ClassVar[str] = "Defect Thermo"
+
+    material_id: str = Field(None, description="Unique material ID for the host material")
+
+    task_ids: Dict = Field(
+        None, description="All task ids used in creating these phase diagrams"
+    )
+
+    defect_predominance_diagrams: Mapping[RunType, DefectPredominanceDiagram] = Field(
+        None, description="Defect predominance diagrams"
+    )
+
+    # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
+    @validator("defect_predominance_diagrams", pre=True)
+    def decode(cls, defect_predominance_diagrams):
+        for e in defect_predominance_diagrams:
+            if isinstance(defect_predominance_diagrams[e], dict):
+                defect_predominance_diagrams[e] = MontyDecoder().process_decoded({k: v for k, v in defect_predominance_diagrams[e].items()})
+        return defect_predominance_diagrams
+
+    @classmethod
+    def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc]) -> "DefectThermoDoc":
+
+        DEFAULT_RT = RunType('GGA')  # TODO NEED A procedure for getting all GGA or GGA+U keys
+        DEFAULT_RT_U = RunType('GGA+U')
+
+        mpid = docs[0].material_id
+        cls.get_adjusted_entries(materials=materials, defects=docs)
+
+        defect_entries = {}
+        defect_phase_diagram = {}
+        vbms = {}
+        band_gaps = {}
+        defect_predominance_diagrams = {}
+        task_ids = {}
+
+        for d in docs:
+            for run_type in d.entries:
+                if run_type not in defect_entries:
+                    defect_entries[run_type] = []
+                if run_type not in task_ids:
+                    task_ids[run_type] = set()
+                defect_entries[run_type].append(d.entries[run_type])
+                vbms[run_type] = d.entries[run_type].parameters['vbm']  # TODO Need to find best vbm
+                band_gaps[run_type] = d.entries[run_type].parameters['cbm'] - vbms[run_type]  # TODO Need to find best bg
+                task_ids[run_type].update(d.task_ids)
+
+        for run_type in defect_entries:
+            # TODO MUST FILTER COMPATIBLE AT SOME POINT
+            defect_phase_diagram[run_type] = DefectPhaseDiagram(
+                entries=defect_entries[run_type],
+                vbm=vbms[run_type],
+                band_gap=band_gaps[run_type],
+                filter_compatible=False
+            )
+            defect_predominance_diagrams[run_type] = DefectPredominanceDiagram(
+                defect_phase_diagram=defect_phase_diagram[run_type],
+                bulk_dos=get_dos(mpid),
+                entries=[
+                    m.entries[run_type]
+                    if run_type in m.entries else m.entries[DEFAULT_RT_U]
+                    if DEFAULT_RT_U in m.entries else m.entries[DEFAULT_RT]
+                    for m in materials
+                ]
+            )
+
+        data = {
+            'material_id': mpid,
+            'task_ids': task_ids,
+            'defect_predominance_diagrams': defect_predominance_diagrams,
+        }
+
+        return cls(**{k: v for k, v in data.items()})
+
+    @classmethod
+    def get_adjusted_entries(cls, materials: List[MaterialsDoc], defects: List[DefectDoc]):
+        """
+        Shift the energy of all entries to formation energy space, i.e. elemental entries to 0eV
+
+        The chemical potentials (elemental energies) are acquired for each run type that is available
+        in materials. If a chempot does not exist for a non-standard run type, but does exist for
+        a GGA calculation, then this will be used as the fall-back.
+
+        Args:
+            materials: list of MaterialsDocs with *ComputedStructureEntries* as the entries
+
+            defects: list of DefectDocs
+
+        returns:
+            None
+        """
+
+        DEFAULT_RT = RunType('GGA')  # TODO NEED A procedure for getting all GGA or GGA+U keys
+
+        chempots = {
+            m.structure.composition.elements[0].element:
+                {rt: m.entries[rt].energy_per_atom for rt in m.entries}
+            for m in materials if m.structure.composition.is_element
+        }
+
+        for m in materials:
+            for rt, ent in m.entries.items():
+                ent.parameters['software'] = 'cp2k'
+                ent.structure.remove_spin()
+                ent.structure.remove_oxidation_states()
+                MaterialsProject2020Compatibility().process_entry(ent)
+                for el, amt in ent.composition.element_composition.items():
+                    _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
+                    adj = CompositionEnergyAdjustment(-chempots[Element(el)][_rt],
+                                                      n_atoms=amt,
+                                                      name=f"Elemental shift {el} to formation energy space"
+                                                      )
+                    ent.energy_adjustments.append(adj)
+
+        for d in defects:
+            for rt, ent in d.entries.items():
+                comp = Composition(ent.defect.defect_composition, allow_negative=True) - \
+                       ent.defect.bulk_structure.composition
+                for el, amt in comp.items():
+                    _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
+                    ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt*chempots[Element(el)][_rt]
+
+
+def get_dos(mpid):
+    with MPRester() as mp:
+        return mp.get_dos_by_material_id(mpid)
+
+
+def get_entries(chemsys):
+    with MPRester() as mp:
+        return mp.get_entries_in_chemsys(chemsys)
+
+
+def unpack(query, d):
+    if not query:
+        return d
+    if isinstance(d, List):
+        return unpack(query[1:], d.__getitem__(int(query.pop(0))))
+    return unpack(query[1:], d.__getitem__(query.pop(0)))
diff --git a/emmet-core/emmet/core/settings.py b/emmet-core/emmet/core/settings.py
index e302cb7c6e..f67396cd1f 100644
--- a/emmet-core/emmet/core/settings.py
+++ b/emmet-core/emmet/core/settings.py
@@ -76,6 +76,23 @@ class EmmetSettings(BaseSettings):
         ["LDAUU", "LDAUJ", "LDAUL"], description="LDAU fields to validate for tasks"
     )
 
+    CP2K_SPECIAL_TAGS: List[str] = Field(
+        [], description="Special tags to prioritize for CP2K task documents"
+    )
+
+    CP2K_QUALITY_SCORES: Dict[str, int] = Field(
+        {"HYBRID": 4, "SCAN": 3, "GGA+U": 2, "GGA": 1},
+        description="Dictionary Mapping CP2K calculation run types to rung level for CP2K materials builders",
+    )
+
+    CP2K_DEFAULT_INPUT_SETS: Dict = Field(
+        {
+            "GGA Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet",
+            "GGA+U Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet",
+        },
+        description="Default input sets for task validation",
+    )
+
     class Config:
         env_prefix = "emmet_"
         extra = "ignore"

From 38c10fcc617c7a31bdf4412cdb3ecf5970b047e6 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Wed, 28 Jul 2021 16:45:51 -0700
Subject: [PATCH 02/41] Update.

---
 emmet-builders/emmet/builders/cp2k/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/emmet-builders/emmet/builders/cp2k/utils.py b/emmet-builders/emmet/builders/cp2k/utils.py
index 933ca49991..73bf4dda45 100644
--- a/emmet-builders/emmet/builders/cp2k/utils.py
+++ b/emmet-builders/emmet/builders/cp2k/utils.py
@@ -59,6 +59,9 @@ def matcher(bulk_struc, defect_struc, final_bulk_struc=None, final_defect_struc=
 def get_dielectric(mpid):
     with MPRester() as mp:
         dat = mp.get_data(mpid, prop='diel')
+        band_gap = mp.get_data(mpid, prop='band_gap')[0]['band_gap']
+    if band_gap == 0.0:
+        return np.inf
     try:
         return dat[0]['diel']['e_total']
     except:

From 5955cf5a2aa3ecd26e3af0758db212fa06e6ed2a Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Wed, 28 Jul 2021 16:46:39 -0700
Subject: [PATCH 03/41] Remove unecessary methods. Update condition.

---
 emmet-builders/emmet/builders/cp2k/defects.py | 87 ++-----------------
 1 file changed, 9 insertions(+), 78 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 7a3aff9423..e6e3b2ca9b 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -272,7 +272,7 @@ def preprocess_bulk(self, task):
             self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")
             return False
         diel = get_dielectric(mpid)
-        if not diel:
+        if diel is None:
             self.logger.debug(f"NO DIEL FOUND FOR {task} - {struc.composition}")
             return False
         return True
@@ -346,6 +346,7 @@ def match_defects_to_bulks(self, bulk_ids, defect_ids):
         props = [
             'task_id',
             'input',
+            'nsites',
             'output.structure'
             'transformations',
             'defect',
@@ -361,7 +362,7 @@ def match_defects_to_bulks(self, bulk_ids, defect_ids):
             angle_tol=SETTINGS.ANGLE_TOL,
             primitive_cell=False,
             scale=True,
-            attempt_supercell=False,
+            attempt_supercell=True,
             allow_subset=False,
             comparator=ElementComparator(),
         )
@@ -370,7 +371,8 @@ def _compare(b, d):
             if run_type(b.get('input').get('dft')).value.split('+U')[0] == \
                 run_type(d.get('input').get('dft')).value.split('+U')[0] and \
                     sm.fit(DefectBuilder.get_pristine_supercell(d), DefectBuilder.get_pristine_supercell(b)):
-                return True
+                if abs(b['nsites'] - d['nsites']) <= 1:
+                    return True
             return False
 
         pairs = [(defect['task_id'], bulk['task_id']) for bulk in bulks for defect in defects if _compare(bulk, defect)]
@@ -380,14 +382,10 @@ def _compare(b, d):
     @staticmethod
     def get_pristine_supercell(x):
         if x.get('defect'):
-            s = load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).bulk_structure
-            s.make_supercell(x.get('scale'))
+            return load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).bulk_structure
         elif x.get('transformations'):
-            s = Structure.from_dict(x['transformations']['history'][0]['input_structure']),
-        else:
-            s = Structure.from_dict(x['input']['structure'])
-
-        return s
+            return Structure.from_dict(x['transformations']['history'][0]['input_structure'])
+        return Structure.from_dict(x['input']['structure'])
 
     def process_item(self, tasks):
         """
@@ -401,73 +399,6 @@ def process_item(self, tasks):
         self.logger.debug(f"Produced {len(defect_docs)} ")
         return [d.dict() for d in defect_docs]
 
-    def get_defect_entry_from_tasks(self, defect_task, bulk_task):
-        parameters = self.get_parameters_from_tasks(defect_task, bulk_task)
-
-        defect_entry = DefectEntry(
-            self.get_defect_from_task(defect_task),
-            uncorrected_energy=parameters['defect_energy'] - parameters['bulk_energy'],
-            parameters=parameters,
-            entry_id=parameters['task_id']
-        )
-        DefectCompatibility().process_entry(defect_entry, perform_corrections=True)
-        defect_entry_as_dict = defect_entry.as_dict()
-        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
-
-        self.logger.info(f"Produced defect entry for {parameters['material_id']}")
-
-        return defect_entry_as_dict
-
-    def get_parameters_from_tasks(self, defect_task, bulk_task):
-
-        self.logger.debug("Getting parameters from defect and bulk task")
-
-        def get_init(x):
-            if x.get('transformations'):
-                return Structure.from_dict(x['transformations']['history'][0]['input_structure'])
-            return Structure.from_dict(x['input']['structure'])
-
-        init_defect_structure = get_init(defect_task)
-        init_bulk_structure = get_init(bulk_task)  # use init to avoid site_properties in matching
-
-        final_defect_structure = Structure.from_dict(defect_task['output']['structure'])
-        final_bulk_structure = Structure.from_dict(bulk_task['output']['structure'])
-
-        axis_grid = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_grid']]
-        bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree']]
-        defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree']]
-
-        dfi, site_matching_indices = matcher(init_bulk_structure, init_defect_structure)
-
-        mpid = get_mpid(init_bulk_structure)
-
-        dielectric = get_dielectric(mpid)
-
-        parameters = {
-            'defect_energy': defect_task['output']['energy'],
-            'bulk_energy': bulk_task['output']['energy'],
-            'axis_grid': axis_grid,
-            'defect_frac_sc_coords': final_defect_structure[dfi].frac_coords,
-            'defect_planar_averages': defect_planar_averages,
-            'bulk_planar_averages': bulk_planar_averages,
-            'site_matching_indices': site_matching_indices,
-            'initial_defect_structure': init_defect_structure,
-            'final_defect_structure': final_defect_structure,
-            'vbm': bulk_task['output']['vbm'],
-            'cbm': bulk_task['output']['cbm'],
-            'dielectric': dielectric,
-            'material_id': mpid,
-            'entry_id': defect_task.get('task_id')
-        }
-
-        # cannot be easily queried for, so check here.
-        if 'v_hartree' in final_defect_structure.site_properties:
-            parameters['bulk_atomic_site_averages'] = final_bulk_structure.site_properties['v_hartree']
-        if 'v_hartree' in final_defect_structure.site_properties:
-            parameters['defect_atomic_site_averages'] = final_defect_structure.site_properties['v_hartree']
-
-        return parameters
-
     def get_defect_from_task(self, task):
         defect = unpack(self.defect_query.split('.'), task)
         needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
@@ -486,7 +417,7 @@ def update_targets(self, items):
         task_ids = list(chain.from_iterable([item['task_ids'] for item in items]))
 
         if len(items) > 0:
-            self.logger.info(f"Updating {len(items)} materials")
+            self.logger.info(f"Updating {len(items)} defects")
             self.defects.remove_docs({self.defects.key: {"$in": task_ids}})
             self.defects.update(
                 docs=jsanitize(items, allow_bson=True),

From bd173827392056aaf9e7919004802827e464c7fc Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Thu, 29 Jul 2021 19:35:28 -0700
Subject: [PATCH 04/41] Update to parameter getter.

---
 emmet-core/emmet/core/defect.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index 8a270a9644..f78a38fa42 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -204,8 +204,6 @@ def get_init(x):
 
         mpid = get_mpid(init_bulk_structure)
 
-        dielectric = get_dielectric(mpid)
-
         parameters = {
             'defect_energy': defect_task['output']['energy'],
             'bulk_energy': bulk_task['output']['energy'],
@@ -218,13 +216,16 @@ def get_init(x):
             'final_defect_structure': final_defect_structure,
             'vbm': bulk_task['output']['vbm'],
             'cbm': bulk_task['output']['cbm'],
-            'dielectric': dielectric,
             'material_id': mpid,
             'entry_id': defect_task.get('task_id')
         }
 
+        dielectric = get_dielectric(mpid)
+        if dielectric is not np.inf or None:
+            parameters['dielectric'] = dielectric
+
         # cannot be easily queried for, so check here.
-        if 'v_hartree' in final_defect_structure.site_properties:
+        if 'v_hartree' in final_bulk_structure.site_properties:
             parameters['bulk_atomic_site_averages'] = final_bulk_structure.site_properties['v_hartree']
         if 'v_hartree' in final_defect_structure.site_properties:
             parameters['defect_atomic_site_averages'] = final_defect_structure.site_properties['v_hartree']
@@ -263,7 +264,9 @@ class Config:
     def decode(cls, defect_predominance_diagrams):
         for e in defect_predominance_diagrams:
             if isinstance(defect_predominance_diagrams[e], dict):
-                defect_predominance_diagrams[e] = MontyDecoder().process_decoded({k: v for k, v in defect_predominance_diagrams[e].items()})
+                defect_predominance_diagrams[e] = MontyDecoder().process_decoded(
+                    {k: v for k, v in defect_predominance_diagrams[e].items()}
+                )
         return defect_predominance_diagrams
 
     @classmethod
@@ -282,6 +285,13 @@ def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc]) -> "Def
         defect_predominance_diagrams = {}
         task_ids = {}
 
+        dos = get_dos(mpid)
+
+        for m in materials:
+            for run_type in m.entries:
+                bg = dos.get_gap()
+                band_gaps[run_type] = bg
+
         for d in docs:
             for run_type in d.entries:
                 if run_type not in defect_entries:
@@ -290,7 +300,6 @@ def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc]) -> "Def
                     task_ids[run_type] = set()
                 defect_entries[run_type].append(d.entries[run_type])
                 vbms[run_type] = d.entries[run_type].parameters['vbm']  # TODO Need to find best vbm
-                band_gaps[run_type] = d.entries[run_type].parameters['cbm'] - vbms[run_type]  # TODO Need to find best bg
                 task_ids[run_type].update(d.task_ids)
 
         for run_type in defect_entries:
@@ -303,7 +312,7 @@ def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc]) -> "Def
             )
             defect_predominance_diagrams[run_type] = DefectPredominanceDiagram(
                 defect_phase_diagram=defect_phase_diagram[run_type],
-                bulk_dos=get_dos(mpid),
+                bulk_dos=dos,
                 entries=[
                     m.entries[run_type]
                     if run_type in m.entries else m.entries[DEFAULT_RT_U]

From 4e96e0600d5e67c87e9dd9f6ab341e56ad3c7ef8 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 10:35:02 -0700
Subject: [PATCH 05/41] Clean imports and update doc.

---
 emmet-builders/emmet/builders/cp2k/defects.py | 52 +++++++++----------
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index e6e3b2ca9b..f82fa1e75a 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -1,57 +1,51 @@
 from datetime import datetime
 from itertools import chain, groupby, combinations
-from operator import itemgetter
 from typing import Dict, Iterator, List, Optional
-import numpy as np
 
 from maggma.builders import Builder
 from maggma.stores import Store
 from pymatgen.core import Structure
-from pymatgen.analysis.structure_analyzer import oxide_type
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator
 from atomate.utils.utils import load_class
 
-from pymatgen.analysis.defects.core import (
-    Defect, Vacancy, Substitution, Polaron, Interstitial, GhostVacancy, DefectEntry
-)
-from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
+
 from monty.json import MontyDecoder
 
-from emmet.builders.utils import maximal_spanning_non_intersecting_subsets
-from emmet.builders.cp2k.utils import matcher, get_dielectric, get_mpid
+from emmet.builders.cp2k.utils import get_dielectric, get_mpid
 from emmet.core import SETTINGS
 from emmet.core.utils import jsanitize, get_sg
 from emmet.core.cp2k.calc_types import TaskType
 from emmet.core.cp2k.material import MaterialsDoc
-from emmet.core.cp2k.task import TaskDocument
-from emmet.stubs import ComputedEntry
+
 from emmet.core.cp2k.calc_types.utils import run_type
 from emmet.core.defect import DefectDoc, DefectThermoDoc
 
-from pymatgen.ext.matproj import MPRester
-from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
-
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 __maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"
 
 
 class DefectBuilder(Builder):
     """
-    The Materials Builder matches VASP task documents by structure similarity into materials
-    document. The purpose of this builder is group calculations and determine the best structure.
-    All other properties are derived from other builders.
+    The DefectBuilder collects task documents performed on structures containing a single point defect.
+    The builder is intended to group tasks corresponding to the same defect (species including charge state),
+    find the best ones, and perform finite-size defect corrections to create a defect document. These
+    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.
 
-    The process is as follows:
+    In order to make the build process easier, an entry must exist inside of the task doc that identifies it
+    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,
+    this may be changed to having a defect transformation in the transformation history.
 
-        1.) Find all documents with the same formula
-        2.) Select only task documents for the task_types we can select properties from
-        3.) Aggregate task documents based on structure similarity
-        4.) Convert task docs to property docs with metadata for selection and aggregation
-        5.) Select the best property doc for each property
-        6.) Build material document from best property docs
-        7.) Post-process material document
-        8.) Validate material document
+    The process is as follows:
 
+        1.) Find all documents containing the defect query.
+        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already
+            calculated. These are the candidate bulk tasks.
+        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites
+            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding
+            bulk calculation.
+        4.) Convert (defect, bulk task) doc pairs to DefectDocs
+        5.) Post-process and validate defect document
+        6.) Update the defect store
     """
 
     def __init__(
@@ -71,7 +65,7 @@ def __init__(
         Args:
             tasks: Store of task documents
             defects: Store of defect documents to generate
-            query: dictionary to limit tasks to be analyzed
+            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property
             allowed_task_types: list of task_types that can be processed
             symprec: tolerance for SPGLib spacegroup finding
             ltol: StructureMatcher tuning parameter for matching tasks to materials
@@ -264,7 +258,11 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         yield grouped_pairs
 
+    # TODO: This must be changed once access to internal MP database is avaiable. RN it uses the MAPI
     def preprocess_bulk(self, task):
+        """
+        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used.
+        """
         t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure']))
         struc = Structure.from_dict(t.get('output').get('structure'))
         mpid = get_mpid(struc)

From cd814b916cd7ccdec0d95d7dbaaaffb344b9a2de Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 10:47:39 -0700
Subject: [PATCH 06/41] CP2K settings.

---
 emmet-core/emmet/core/settings.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/emmet-core/emmet/core/settings.py b/emmet-core/emmet/core/settings.py
index bb655c9efb..d960c173b8 100644
--- a/emmet-core/emmet/core/settings.py
+++ b/emmet-core/emmet/core/settings.py
@@ -29,6 +29,23 @@ class EmmetSettings(BaseSettings):
         DEFAULT_CONFIG_FILE_PATH, description="File to load alternative defaults from"
     )
 
+    CP2K_SPECIAL_TAGS: List[str] = Field(
+        [], description="Special tags to prioritize for CP2K task documents"
+    )
+
+    CP2K_QUALITY_SCORES: Dict[str, int] = Field(
+        {"HYBRID": 4, "SCAN": 3, "GGA+U": 2, "GGA": 1},
+        description="Dictionary Mapping CP2K calculation run types to rung level for CP2K materials builders",
+    )
+
+    CP2K_DEFAULT_INPUT_SETS: Dict = Field(
+        {
+            "GGA Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet",
+            "GGA+U Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet",
+        },
+        description="Default input sets for task validation",
+    )
+
     LTOL: float = Field(
         0.2, description="Fractional length tolerance for structure matching"
     )

From dce71e87d20573cbb151c874a8d6df444efa0d36 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 12:09:54 -0700
Subject: [PATCH 07/41] Update imports. stubs gone

---
 emmet-core/emmet/core/cp2k/task.py | 31 +++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py
index 160870614e..08a2e906f0 100644
--- a/emmet-core/emmet/core/cp2k/task.py
+++ b/emmet-core/emmet/core/cp2k/task.py
@@ -1,12 +1,11 @@
 """ Core definition of a CP2K Task Document """
 from datetime import datetime
-from functools import lru_cache, partial
-from typing import ClassVar, Dict, List, Optional, Union
+from typing import Dict, List
 
 from pydantic import BaseModel, Field, validator
-from pymatgen.analysis.magnetism import CollinearMagneticStructureAnalyzer, Ordering
 from pymatgen.analysis.structure_analyzer import oxide_type
-from pymatgen.io.cp2k.inputs import Cp2kInput
+from pymatgen.core import Structure
+from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 
 from emmet.core import SETTINGS
 from emmet.core.structure import StructureMetadata
@@ -19,7 +18,7 @@
     run_type,
     task_type,
 )
-from emmet.stubs import ComputedEntry, Matrix3D, Structure, Vector3D
+from emmet.core.math import Matrix3D, Vector3D
 
 
 class Status(ValueEnum):
@@ -181,6 +180,28 @@ def entry(self):
 
         return ComputedEntry.from_dict(entry_dict)
 
+    @property
+    def structure_entry(self) -> ComputedStructureEntry:
+        """Turns a Task Doc into a ComputedStructureEntry"""
+        entry_dict = {
+            "correction": 0.0,
+            "entry_id": self.task_id,
+            "composition": self.output.structure.composition,
+            "energy": self.output.energy,
+            "parameters": {
+                "atomic_kind_info": self.input.atomic_kind_info,
+                # This is done to be compatible with MontyEncoder for the ComputedEntry
+                "run_type": str(self.run_type),
+            },
+            "data": {
+                "oxide_type": oxide_type(self.output.structure),
+                "last_updated": self.last_updated,
+            },
+            "structure": self.output.structure,
+        }
+
+        return ComputedStructureEntry.from_dict(entry_dict)
+
     @validator("sandboxes", always=True)
     def tags_to_sandboxes(cls, v, values):
         tag_mapping = SETTINGS.TAGS_TO_SANDBOXES

From a3dd4b0189f195551861fd98f29276745a44d8b6 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 12:12:00 -0700
Subject: [PATCH 08/41] Fix imports. Stubs gone.

---
 emmet-core/emmet/core/cp2k/material.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py
index 47390f0990..b310480fab 100644
--- a/emmet-core/emmet/core/cp2k/material.py
+++ b/emmet-core/emmet/core/cp2k/material.py
@@ -1,10 +1,10 @@
 """ Core definition of a Materials Document """
-from datetime import datetime
-from functools import partial
-from typing import ClassVar, List, Mapping, Optional, Sequence, Tuple, TypeVar, Union
+from typing import List, Mapping, Sequence, Tuple
+
+from pydantic import Field
+from pymatgen.analysis.structure_matcher import StructureMatcher
+from pymatgen.entries.computed_entries import ComputedStructureEntry
 
-from pydantic import BaseModel, Field, create_model
-from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
 
 from emmet.core import SETTINGS
 from emmet.core.material import MaterialsDoc as CoreMaterialsDoc
@@ -12,7 +12,6 @@
 from emmet.core.structure import StructureMetadata
 from emmet.core.cp2k.calc_types import CalcType, RunType, TaskType
 from emmet.core.cp2k.task import TaskDocument
-from emmet.stubs import ComputedEntry, Structure
 
 
 class MaterialsDoc(CoreMaterialsDoc, StructureMetadata):
@@ -34,7 +33,7 @@ class MaterialsDoc(CoreMaterialsDoc, StructureMetadata):
         None, description="Mappingionary for tracking the provenance of properties"
     )
 
-    entries: Mapping[RunType, ComputedEntry] = Field(
+    entries: Mapping[RunType, ComputedStructureEntry] = Field(
         None, description="Dictionary for tracking entries for CP2K calculations"
     )
 

From 87a62863070fb1a71ce5c05fb5cebece9057fcb7 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 12:12:16 -0700
Subject: [PATCH 09/41] Clean imports

---
 emmet-builders/emmet/builders/cp2k/materials.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
index 48d21bbb75..7b35239d33 100644
--- a/emmet-builders/emmet/builders/cp2k/materials.py
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -1,13 +1,9 @@
 from datetime import datetime
 from itertools import chain
-from operator import itemgetter
 from typing import Dict, Iterator, List, Optional
 
 from maggma.builders import Builder
 from maggma.stores import Store
-from pymatgen.core import Structure
-from pymatgen.analysis.structure_analyzer import oxide_type
-from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
 
 from emmet.builders.utils import maximal_spanning_non_intersecting_subsets
 from emmet.core import SETTINGS
@@ -15,7 +11,6 @@
 from emmet.core.cp2k.calc_types import TaskType
 from emmet.core.cp2k.material import MaterialsDoc
 from emmet.core.cp2k.task import TaskDocument
-from emmet.stubs import ComputedEntry
 
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 __maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"

From 9590dc98c2482478094ff7ff376d6185371c6fb2 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 12:16:44 -0700
Subject: [PATCH 10/41] Clean imports

---
 emmet-core/emmet/core/defect.py | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index f78a38fa42..b2bd530eed 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -1,36 +1,28 @@
 """ Core definition for Defect property Document """
 from datetime import datetime
-from typing import ClassVar, Dict, Sequence, Tuple, Mapping, List
+from typing import ClassVar, Dict, Tuple, Mapping, List
 
-import numpy as np
-from pydantic import BaseModel, Field
-from pymatgen.analysis.defects.core import Defect, DefectEntry
+from pymatgen.analysis.defects.core import DefectEntry
 
-from emmet.core import SETTINGS
-from emmet.core.material import PropertyDoc
 from emmet.core.cp2k.task import TaskDocument
 from emmet.core.cp2k.material import MaterialsDoc
-from emmet.core.structure import StructureMetadata
-from emmet.stubs import Matrix3D, Vector3D
 from emmet.core.cp2k.calc_types.utils import run_type
 
 from pymatgen.core import Structure, Composition
 from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
 import numpy as np
 from pymatgen.ext.matproj import MPRester
-from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 from monty.json import MontyDecoder
 from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
 from itertools import groupby
-from pydantic import BaseModel, Field, validator, dataclasses
-from pydantic.dataclasses import dataclass
+from pydantic import Field, validator
 from emmet.builders.cp2k.utils import get_mpid, get_dielectric, matcher
-from pydantic.class_validators import root_validator
-from pymatgen.entries.computed_entries import CompositionEnergyAdjustment, ComputedStructureEntry
+
+from pymatgen.entries.computed_entries import CompositionEnergyAdjustment
 from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, DefectPredominanceDiagram
-from pydantic import BaseModel, ValidationError, root_validator
+from pydantic import BaseModel
 from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
-from pymatgen.core import Element, Species
+from pymatgen.core import Element
 
 
 class DefectDoc(BaseModel):

From 9d2947f70968f95b043e627301e1b2632bbc82fe Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 12:51:35 -0700
Subject: [PATCH 11/41] Remove validator

---
 emmet-core/emmet/core/cp2k/task.py | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py
index 08a2e906f0..8711be5c50 100644
--- a/emmet-core/emmet/core/cp2k/task.py
+++ b/emmet-core/emmet/core/cp2k/task.py
@@ -202,30 +202,3 @@ def structure_entry(self) -> ComputedStructureEntry:
 
         return ComputedStructureEntry.from_dict(entry_dict)
 
-    @validator("sandboxes", always=True)
-    def tags_to_sandboxes(cls, v, values):
-        tag_mapping = SETTINGS.TAGS_TO_SANDBOXES
-
-        if v is None:
-
-            if tag_mapping is not None:
-
-                sandboxed_tags = {
-                    tag for tag_list in tag_mapping.values() for tag in tag_list
-                }
-
-                if any(tag in sandboxed_tags for tag in values.get("tags", [])):
-
-                    v = sorted(
-                        {
-                            sandbox
-                            for sandbox, tags in tag_mapping.items()
-                            if len(set(tags).intersection(values.get("tags", []))) > 0
-                        }
-                    )
-                else:
-                    v = ["core"]
-            else:
-                v = ["core"]
-
-        return v

From f6d3b7942cdb0bb95739d01eb722711f3965cc6e Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 13:11:37 -0700
Subject: [PATCH 12/41] Remove sandboxes

---
 emmet-core/emmet/core/defect.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index b2bd530eed..5b6577900f 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -94,7 +94,6 @@ def from_tasks(cls, tasks: List, query='defect') -> "DefectDoc":
         last_updated = datetime.now() or max(task.last_updated for task in task_group)
         created_at = datetime.now() or min(task.completed_at for task in task_group)
         task_ids = list({task.task_id for task in task_group})
-        sandboxes = list({sbxn for task in task_group for sbxn in task.sandboxes})
 
         deprecated_tasks = list(
             {task.task_id for task in task_group if not task.is_valid}
@@ -130,7 +129,6 @@ def _run_type(x):
                 'last_updated': last_updated,
                 'created_at': created_at,
                 'task_ids': set(task_ids),
-                'sandboxes': sandboxes,
                 'deprecated_tasks': deprecated_tasks,
                 'tasks': final_tasks,
                 'material_id': list({v.parameters['material_id'] for v in entries.values()})[0],

From bc8f4af24c5a59ec55c99bc550d62f9828b9dffb Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Jul 2021 13:11:54 -0700
Subject: [PATCH 13/41] Another log point

---
 emmet-builders/emmet/builders/cp2k/defects.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index f82fa1e75a..355ddd2bc3 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -244,7 +244,7 @@ def get_items(self) -> Iterator[List[Dict]]:
                     doc["is_valid"] = True
 
         # yield list of defects that are of the same type, matched to an appropriate bulk calc
-        self.logger.debug(f"Processing ")
+        self.logger.info(f"Starting defect matching.")
 
         grouped_pairs = [
             [

From 7bfec2b87e41c277be60f05398014412e01e4d15 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 2 Aug 2021 18:56:58 -0700
Subject: [PATCH 14/41] Update to material doc

---
 emmet-core/emmet/core/cp2k/material.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py
index b310480fab..32089b6aa3 100644
--- a/emmet-core/emmet/core/cp2k/material.py
+++ b/emmet-core/emmet/core/cp2k/material.py
@@ -52,7 +52,6 @@ def from_tasks(
         last_updated = max(task.last_updated for task in task_group)
         created_at = min(task.completed_at for task in task_group)
         task_ids = list({task.task_id for task in task_group})
-        sandboxes = list({sbxn for task in task_group for sbxn in task.sandboxes})
 
         deprecated_tasks = list(
             {task.task_id for task in task_group if not task.is_valid}
@@ -172,7 +171,6 @@ def _structure_eval(task: TaskDocument):
             deprecated_tasks=deprecated_tasks,
             origins=origins,
             entries=entries,
-            sandboxes=sandboxes if sandboxes else None,
         )
 
 

From 983331713ffe052aea61c7b124c79581ddde4db1 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 2 Aug 2021 18:58:54 -0700
Subject: [PATCH 15/41] Temporary MaterialsBuilder update. Some things are not
 implemented.

---
 .../emmet/builders/cp2k/materials.py          | 106 ++++++++----------
 1 file changed, 48 insertions(+), 58 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
index 7b35239d33..128aa6e112 100644
--- a/emmet-builders/emmet/builders/cp2k/materials.py
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -6,6 +6,7 @@
 from maggma.stores import Store
 
 from emmet.builders.utils import maximal_spanning_non_intersecting_subsets
+from emmet.builders.settings import EmmetBuildSettings
 from emmet.core import SETTINGS
 from emmet.core.utils import group_structures, jsanitize
 from emmet.core.cp2k.calc_types import TaskType
@@ -41,11 +42,7 @@ def __init__(
         materials: Store,
         task_validation: Optional[Store] = None,
         query: Optional[Dict] = None,
-        allowed_task_types: Optional[List[str]] = None,
-        symprec: float = SETTINGS.SYMPREC,
-        ltol: float = SETTINGS.LTOL,
-        stol: float = SETTINGS.STOL,
-        angle_tol: float = SETTINGS.ANGLE_TOL,
+        settings: Optional[EmmetBuildSettings] = None,
         **kwargs,
     ):
         """
@@ -63,19 +60,8 @@ def __init__(
         self.tasks = tasks
         self.materials = materials
         self.task_validation = task_validation
-        self.allowed_task_types = (
-            [t.value for t in TaskType]
-            if allowed_task_types is None
-            else allowed_task_types
-        )
-
-        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
-
         self.query = query if query else {}
-        self.symprec = symprec
-        self.ltol = ltol
-        self.stol = stol
-        self.angle_tol = angle_tol
+        self.settings = EmmetBuildSettings.autoload(settings)
         self.kwargs = kwargs
 
         sources = [tasks]
@@ -115,8 +101,9 @@ def get_items(self) -> Iterator[List[Dict]]:
         """
 
         self.logger.info("Materials builder started")
+        # TODO make a cp2k allowed type setting
         self.logger.info(
-            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"
+            f"Allowed task types: {[task_type.value for task_type in self.settings.VASP_ALLOWED_VASP_TYPES]}"
         )
 
         self.logger.info("Setting indexes")
@@ -203,11 +190,31 @@ def process_item(self, tasks: List[Dict]) -> List[Dict]:
         self.logger.debug(f"Processing {formula} : {task_ids}")
 
         grouped_tasks = self.filter_and_group_tasks(tasks)
+        materials = []
+        for group in grouped_tasks:
+            try:
+                materials.append(
+                    MaterialsDoc.from_tasks(
+                        group,
+                        quality_scores=self.settings.CP2K_QUALITY_SCORES,
+                    )
+                )
+            except Exception as e:
+
+                # TODO construct deprecated
+
+                failed_ids = list({t_.task_id for t_ in group})
+                doc = MaterialsDoc.construct_deprecated_material(tasks)
+                doc.warnings.append(str(e))
+                materials.append(doc)
+                self.logger.warn(
+                    f"Failed making material for {failed_ids}."
+                    f" Inserted as deprecated Material: {doc.material_id}"
+                )
 
-        materials = [MaterialsDoc.from_tasks(group) for group in grouped_tasks]
         self.logger.debug(f"Produced {len(materials)} materials for {formula}")
 
-        return [mat.dict() for mat in materials]
+        return jsanitize([mat.dict() for mat in materials], allow_bson=True)
 
     def update_targets(self, items: List[List[Dict]]):
         """
@@ -230,7 +237,7 @@ def update_targets(self, items: List[List[Dict]]):
             self.materials.remove_docs({self.materials.key: {"$in": material_ids}})
             self.materials.update(
                 docs=jsanitize(items, allow_bson=True),
-                key=["material_id", "sandboxes"],
+                key=["material_id"],
             )
         else:
             self.logger.info("No items to update")
@@ -240,50 +247,33 @@ def filter_and_group_tasks(self, tasks: List[TaskDocument]) -> Iterator[List[Dic
         Groups tasks by structure matching
         """
 
-        filtered_tasks = [
-            task
-            for task in tasks
-            if any(
-                allowed_type is task.task_type
-                for allowed_type in self._allowed_task_types
-            )
-        ]
+        """
+        Groups tasks by structure matching
+        """
+
+        # TODO why did the way vasp builder did it not work here?
+        filtered_tasks = []
+        for task in tasks:
+            for allowed_type in self.settings.CP2K_ALLOWED_CP2K_TYPES:
+                if task.task_type is allowed_type:
+                    filtered_tasks.append(task)
+                    continue
 
         structures = []
 
         for idx, task in enumerate(filtered_tasks):
             s = task.output.structure
-            s.index = idx
+            s.index: int = idx  # type: ignore
             structures.append(s)
 
         grouped_structures = group_structures(
             structures,
-            ltol=self.ltol,
-            stol=self.stol,
-            angle_tol=self.angle_tol,
-            symprec=self.symprec,
+            ltol=self.settings.LTOL,
+            stol=self.settings.STOL,
+            angle_tol=self.settings.ANGLE_TOL,
+            symprec=self.settings.SYMPREC,
         )
-        from itertools import groupby
-
-        def get_sg(struc, symprec=SETTINGS.SYMPREC) -> int:
-            struc.remove_oxidation_states()
-            struc.remove_spin()
-            for p in struc.site_properties:
-                struc.remove_site_property(p)
-            """helper function to get spacegroup with a loose tolerance"""
-            try:
-                return struc.get_space_group_info(symprec=symprec)[0]
-            except Exception:
-                return -1
-
-        for key, group in groupby(sorted(structures, key=get_sg), key=get_sg):
-#        for group in grouped_structures:
-            grouped_tasks = [filtered_tasks[struc.index] for struc in group]
-            sandboxes = {frozenset(task.sandboxes) for task in grouped_tasks}
-
-            for sbx_set in maximal_spanning_non_intersecting_subsets(sandboxes):
-                yield [
-                    task
-                    for task in grouped_tasks
-                    if len(set(task.sandboxes).intersection(sbx_set)) > 0
-                ]
+
+        for group in grouped_structures:
+            grouped_tasks = [filtered_tasks[struc.index] for struc in group]  # type: ignore
+            yield grouped_tasks

From c133909bfb887feed514d6bbe993ef91aeff8a0c Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 2 Aug 2021 18:59:34 -0700
Subject: [PATCH 16/41] Some updates, subject to change, for the Defect
 builder.

---
 emmet-builders/emmet/builders/cp2k/defects.py | 123 ++++++++++++------
 1 file changed, 81 insertions(+), 42 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 355ddd2bc3..8bf4fd09ac 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -11,7 +11,9 @@
 
 from monty.json import MontyDecoder
 
-from emmet.builders.cp2k.utils import get_dielectric, get_mpid
+from emmet.builders.cp2k.utils import get_mpid
+from emmet.builders.settings import EmmetBuildSettings
+
 from emmet.core import SETTINGS
 from emmet.core.utils import jsanitize, get_sg
 from emmet.core.cp2k.calc_types import TaskType
@@ -48,10 +50,13 @@ class DefectBuilder(Builder):
         6.) Update the defect store
     """
 
+    # TODO: should dielectric/electronic_structure be optional or required?
     def __init__(
         self,
         tasks: Store,
         defects: Store,
+        dielectric: Store,
+        electronic_structure: Store,
         task_validation: Optional[Store] = None,
         query: Optional[Dict] = None,
         allowed_task_types: Optional[List[str]] = None,
@@ -59,6 +64,7 @@ def __init__(
         ltol: float = SETTINGS.LTOL,
         stol: float = SETTINGS.STOL,
         angle_tol: float = SETTINGS.ANGLE_TOL,
+        settings: Optional[EmmetBuildSettings] = None,
         **kwargs,
     ):
         """
@@ -77,6 +83,11 @@ def __init__(
         self.tasks.key = 'task_id'
         self.defects = defects
         self.defects.key = 'task_ids'
+        self.dielectric = dielectric
+        self.dielectric.key = 'task_id'
+        self.electronic_structure = electronic_structure
+        self.electronic_structure.key = "material_id"
+
         self.task_validation = task_validation
         self.allowed_task_types = (
             [t.value for t in TaskType]
@@ -85,6 +96,7 @@ def __init__(
         )
 
         self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
+        self.settings = EmmetBuildSettings.autoload(settings)
 
         self.query = query if query else {}
         self.symprec = symprec
@@ -93,7 +105,7 @@ def __init__(
         self.angle_tol = angle_tol
         self.kwargs = kwargs
 
-        sources = [tasks]
+        sources = [tasks, dielectric, electronic_structure]
         if self.task_validation:
             sources.append(self.task_validation)
         super().__init__(sources=sources, targets=[defects], **kwargs)
@@ -166,6 +178,9 @@ def required_bulk_properties(self) -> List:
             'transformations',
         ]
 
+    def prechunk(self, number_splits: int) -> Iterator[Dict]:
+        raise NotImplementedError
+
     def get_items(self) -> Iterator[List[Dict]]:
         """
         Gets all items to process into defect documents.
@@ -246,17 +261,15 @@ def get_items(self) -> Iterator[List[Dict]]:
         # yield list of defects that are of the same type, matched to an appropriate bulk calc
         self.logger.info(f"Starting defect matching.")
 
-        grouped_pairs = [
-            [
+        for defect_task_group in self.filter_and_group_tasks(defect_tasks):
+            yield [
                 (
                     next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),
-                    next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None))
+                    next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),
+                    self.get_dielectric(bulk_tasks_id)
                 )
                 for defect_tasks_id, bulk_tasks_id in self.match_defects_to_bulks(bulk_tasks, defect_task_group)
-            ] for defect_task_group in self.filter_and_group_tasks(defect_tasks)
-        ]
-
-        yield grouped_pairs
+                ]
 
     # TODO: This must be changed once access to internal MP database is avaiable. RN it uses the MAPI
     def preprocess_bulk(self, task):
@@ -269,10 +282,14 @@ def preprocess_bulk(self, task):
         if not mpid:
             self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")
             return False
-        diel = get_dielectric(mpid)
-        if diel is None:
-            self.logger.debug(f"NO DIEL FOUND FOR {task} - {struc.composition}")
-            return False
+
+        elec = next(self.electronic_structure.query(properties=['band_gap'], criteria={"material_id": mpid}))
+        if elec['band_gap'] > 0:
+            diel = next(self.dielectric.query(criteria={"material_id": mpid}))
+            if diel is None:
+                self.logger.info(f"Task {task} for composition {struc.composition} requires"
+                                 f"dielectric properties, but none found in dielectric store")
+                return False
         return True
 
     def filter_and_group_tasks(self, tasks):
@@ -282,10 +299,10 @@ def filter_and_group_tasks(self, tasks):
         will be grouped together.
 
         Args:
-            defect_ids: task_ids for unprocessed defects
+            tasks: task_ids for unprocessed defects
 
         returns:
-            generator for groups of task_ids that correspond to the same defect
+            Groups of task_ids that correspond to the same defect
         """
 
         props = [
@@ -373,31 +390,55 @@ def _compare(b, d):
                     return True
             return False
 
-        pairs = [(defect['task_id'], bulk['task_id']) for bulk in bulks for defect in defects if _compare(bulk, defect)]
+        pairs = [
+            (defect['task_id'], bulk['task_id'])
+            for bulk in bulks
+            for defect in defects
+            if _compare(bulk, defect)
+        ]
         self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
         return pairs
 
-    @staticmethod
-    def get_pristine_supercell(x):
-        if x.get('defect'):
-            return load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).bulk_structure
-        elif x.get('transformations'):
-            return Structure.from_dict(x['transformations']['history'][0]['input_structure'])
-        return Structure.from_dict(x['input']['structure'])
+    def get_dielectric(self, task_id):
+        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))
+        struc = Structure.from_dict(t.get('output').get('structure'))
+        for diel in self.dielectric.query(criteria={'task_id': get_mpid(struc)}, properties=['total']):
+            return diel
+        return None
 
-    def process_item(self, tasks):
+    @staticmethod
+    def get_pristine_supercell(task):
         """
+        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.
+        If defect cannot be found in task, return the input structure.
         """
-        self.logger.debug(f"Processing tasks")
-        for group in tasks:
-            self.logger.info(f"Processing group of size {len(group)}")
+        if task.get('defect'):
+            return load_class(
+                task['defect']['@module'], task['defect']['@class']
+            ).from_dict(task['defect']).bulk_structure
+        elif task.get('transformations'):
+            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])
+        return Structure.from_dict(task['input']['structure'])
 
-        defect_docs = [DefectDoc.from_tasks(tasks=defect_group, query=self.defect_query) for defect_group in tasks if defect_group]
+    def process_item(self, items):
+        """
+        Process a group of defect tasks into a single defect document.
 
-        self.logger.debug(f"Produced {len(defect_docs)} ")
-        return [d.dict() for d in defect_docs]
+        Args:
+            tasks: list of task pairs corresponding to 1 defect (same species/oxidation state), but
+                for any number of calculation settings.
+             e.g. [(defect task, bulk task), (defect task, bulk task), ...]
+
+        returns: the defect document as a dictionary
+        """
+        self.logger.info(f"Processing group of {len(items)} defects into DefectDoc")
+        defect_doc = DefectDoc.from_tasks(tasks=items, query=self.defect_query)
+        return defect_doc.dict()
 
     def get_defect_from_task(self, task):
+        """
+        Using the defect_query property, retrieve a pymatgen defect object from the task document
+        """
         defect = unpack(self.defect_query.split('.'), task)
         needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
         return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
@@ -407,7 +448,7 @@ def update_targets(self, items):
         Inserts the new task_types into the task_types collection
         """
 
-        items = [item for item in chain.from_iterable(items) if item]
+        items = [item for item in items if item]
 
         for item in items:
             item.update({"_bt": self.timestamp})
@@ -432,6 +473,7 @@ def __init__(
             defects: Store,
             defect_thermos: Store,
             materials: Store,
+            electronic_structures: Store,
             query: Optional[Dict] = None,
             symprec: float = SETTINGS.SYMPREC,
             ltol: float = SETTINGS.LTOL,
@@ -455,6 +497,7 @@ def __init__(
         self.defects = defects
         self.defect_thermos = defect_thermos
         self.materials = materials
+        self.electronic_structures = electronic_structures
 
         self.query = query if query else {}
         self.symprec = symprec
@@ -463,14 +506,7 @@ def __init__(
         self.angle_tol = angle_tol
         self.kwargs = kwargs
 
-        super().__init__(sources=[defects], targets=[defect_thermos], **kwargs)
-
-    def connect(self):
-        """
-        Connect to the builder sources and targets.
-        """
-        for s in [self.defects, self.defect_thermos, self.materials]:
-            s.connect()
+        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)
 
     @property
     def defect_doc_query(self):
@@ -511,7 +547,10 @@ def get_items(self) -> Iterator[List[Dict]]:
         all_docs = [doc for doc in self.defects.query()]
         for key, group in groupby(sorted(all_docs, key=lambda x: x['material_id']), key=lambda x: x['material_id']):
             group = [g for g in group]
-            yield (group, self.get_materials(group))
+            yield (group, self.get_materials(group), self.get_electronic_structure(group))
+
+    def get_electronic_structure(self, group):
+        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))
 
     def get_materials(self, group) -> List:
         """
@@ -538,10 +577,10 @@ def process_item(self, docs):
         :return:
         """
         self.logger.info(f"Processing defects")
-        defects, materials = docs
+        defects, materials, elec_struc = docs
         defects = [DefectDoc(**d) for d in defects]
         materials = [MaterialsDoc(**m) for m in materials]
-        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials)
+        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)
         return defect_thermo_doc.dict()
 
     def update_targets(self, items):

From f36f5d0fcd497b6041aa5e38d3ff71f50232b9b6 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 3 Aug 2021 10:13:34 -0700
Subject: [PATCH 17/41] Update for dielectric. Will need further adjustments.

---
 emmet-builders/emmet/builders/cp2k/defects.py | 11 +++---
 emmet-core/emmet/core/defect.py               | 36 ++++++++++++-------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 8bf4fd09ac..58d59ac834 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -285,7 +285,7 @@ def preprocess_bulk(self, task):
 
         elec = next(self.electronic_structure.query(properties=['band_gap'], criteria={"material_id": mpid}))
         if elec['band_gap'] > 0:
-            diel = next(self.dielectric.query(criteria={"material_id": mpid}))
+            diel = next(self.dielectric.query(criteria={self.dielectric.key: mpid}))
             if diel is None:
                 self.logger.info(f"Task {task} for composition {struc.composition} requires"
                                  f"dielectric properties, but none found in dielectric store")
@@ -402,8 +402,8 @@ def _compare(b, d):
     def get_dielectric(self, task_id):
         t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))
         struc = Structure.from_dict(t.get('output').get('structure'))
-        for diel in self.dielectric.query(criteria={'task_id': get_mpid(struc)}, properties=['total']):
-            return diel
+        for diel in self.dielectric.query(criteria={'task_id': get_mpid(struc)}, properties=['dielectric.total']):
+            return diel['dielectric']['total']
         return None
 
     @staticmethod
@@ -432,8 +432,9 @@ def process_item(self, items):
         returns: the defect document as a dictionary
         """
         self.logger.info(f"Processing group of {len(items)} defects into DefectDoc")
-        defect_doc = DefectDoc.from_tasks(tasks=items, query=self.defect_query)
-        return defect_doc.dict()
+        if items:
+            defect_doc = DefectDoc.from_tasks(tasks=items, query=self.defect_query)
+            return defect_doc.dict()
 
     def get_defect_from_task(self, task):
         """
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index 5b6577900f..b488dd4b85 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -4,6 +4,7 @@
 
 from pymatgen.analysis.defects.core import DefectEntry
 
+from emmet.core.mpid import MPID
 from emmet.core.cp2k.task import TaskDocument
 from emmet.core.cp2k.material import MaterialsDoc
 from emmet.core.cp2k.calc_types.utils import run_type
@@ -24,6 +25,9 @@
 from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
 from pymatgen.core import Element
 
+from emmet.core.polar import Dielectric
+from emmet.core.electronic_structure import ElectronicStructureDoc
+
 
 class DefectDoc(BaseModel):
     """
@@ -40,7 +44,9 @@ class Config:
 
     property_name: ClassVar[str] = "defect"
 
-    material_id: str = Field(None, description="Unique material ID for the host material")
+    material_id: MPID = Field(None, description="Unique material ID for the host material")
+
+    defect_id: MPID = Field(None, description="Unique ID for this defect")
 
     chemsys: List = Field(None, description="Chemical system of the bulk")
 
@@ -61,7 +67,7 @@ class Config:
         None, description="Task documents (defect task, bulk task) for the defect entry of RunType"
     )
 
-    task_ids: List = Field(
+    task_ids: List[MPID] = Field(
         None, description="All task ids used in creating this defect doc."
     )
 
@@ -87,8 +93,7 @@ def from_tasks(cls, tasks: List, query='defect') -> "DefectDoc":
                 series of DefectEntry objects.
             query: How to retrieve the defect object stored in the task.
         """
-
-        task_group = [TaskDocument(**defect_task) for defect_task, bulk_task in tasks]
+        task_group = [TaskDocument(**defect_task) for defect_task, bulk_task, dielectric in tasks]
 
         # Metadata
         last_updated = datetime.now() or max(task.last_updated for task in task_group)
@@ -111,10 +116,15 @@ def _run_type(x):
         for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
             entry_and_docs = [
                 (
-                    cls.get_defect_entry_from_tasks(defect_task=defect_task, bulk_task=bulk_task, query=query),
+                    cls.get_defect_entry_from_tasks(
+                        defect_task=defect_task,
+                        bulk_task=bulk_task,
+                        dielectric=dielectric,
+                        query=query
+                    ),
                     TaskDocument(**defect_task), TaskDocument(**bulk_task)
                 )
-                for defect_task, bulk_task in tasks_for_runtype
+                for defect_task, bulk_task, dielectric in tasks_for_runtype
             ]
             entry_and_docs.sort(key=lambda x: x[1].nsites, reverse=True)  # TODO Turn into kpoint density sorting
             best_entry, best_defect_task, best_bulk_task = entry_and_docs[0]
@@ -139,16 +149,20 @@ def _run_type(x):
         return cls(**{k: v for k, v in data.items()})
 
     @classmethod
-    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, query='defect'):
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='defect'):
         """
         Extract a defect entry from a single pair (defect and bulk) of tasks.
 
         Args:
             defect_task: task dict for the defect calculation
             bulk_task: task dict for the bulk calculation
+            dielectric: Dielectric doc if the defect is charged. If not present, no dielectric
+                corrections will be performed, even if the defect is charged.
             query: Mongo-style query to retrieve the defect object from the defect task
         """
         parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
+        if dielectric:
+            parameters['dielectric'] = dielectric
 
         defect_entry = DefectEntry(
             cls.get_defect_from_task(query=query, task=defect_task),
@@ -210,10 +224,6 @@ def get_init(x):
             'entry_id': defect_task.get('task_id')
         }
 
-        dielectric = get_dielectric(mpid)
-        if dielectric is not np.inf or None:
-            parameters['dielectric'] = dielectric
-
         # cannot be easily queried for, so check here.
         if 'v_hartree' in final_bulk_structure.site_properties:
             parameters['bulk_atomic_site_averages'] = final_bulk_structure.site_properties['v_hartree']
@@ -260,7 +270,7 @@ def decode(cls, defect_predominance_diagrams):
         return defect_predominance_diagrams
 
     @classmethod
-    def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc]) -> "DefectThermoDoc":
+    def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc], electronic_structure: ElectronicStructureDoc) -> "DefectThermoDoc":
 
         DEFAULT_RT = RunType('GGA')  # TODO NEED A procedure for getting all GGA or GGA+U keys
         DEFAULT_RT_U = RunType('GGA+U')
@@ -275,7 +285,7 @@ def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc]) -> "Def
         defect_predominance_diagrams = {}
         task_ids = {}
 
-        dos = get_dos(mpid)
+        dos = electronic_structure.dos.total
 
         for m in materials:
             for run_type in m.entries:

From 35067ad170607c1d86f6e3f753323b4f61631af7 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 3 Aug 2021 10:20:49 -0700
Subject: [PATCH 18/41] Cp2kTaskType. Should be unified?

---
 emmet-builders/emmet/builders/settings.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py
index d94a30437a..fdefe009e0 100644
--- a/emmet-builders/emmet/builders/settings.py
+++ b/emmet-builders/emmet/builders/settings.py
@@ -8,6 +8,7 @@
 from emmet.core.provenance import Author, History
 from emmet.core.settings import EmmetSettings
 from emmet.core.vasp.calc_types import TaskType
+from emmet.core.cp2k.calc_types import TaskType as Cp2kTaskType
 
 
 class EmmetBuildSettings(EmmetSettings):
@@ -34,6 +35,11 @@ class EmmetBuildSettings(EmmetSettings):
         description="Allowed task_types to build materials from",
     )
 
+    CP2K_ALLOWED_CP2K_TYPES: List[Cp2kTaskType] = Field(
+        [t.value for t in Cp2kTaskType],
+        description="Allowed task_types to build materials from",
+    )
+
     DEFAULT_REFERENCE: str = Field(
         "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and "
         "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and "

From 83a15bfd0cfeec0cacc8679c22ee863bdbf44628 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 3 Aug 2021 10:32:39 -0700
Subject: [PATCH 19/41] Store as int so MPID can work with it.

---
 emmet-core/emmet/core/cp2k/task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py
index 8711be5c50..3c21dbddf7 100644
--- a/emmet-core/emmet/core/cp2k/task.py
+++ b/emmet-core/emmet/core/cp2k/task.py
@@ -128,7 +128,7 @@ class TaskDocument(StructureMetadata):
     orig_inputs: Dict = Field(
         None, description="Summary of the original CP2K inputs"
     )
-    task_id: str = Field(None, description="the Task ID For this document")
+    task_id: int = Field(None, description="the Task ID For this document")
     tags: List[str] = Field([], description="Metadata tags for this task document")
 
     sandboxes: List[str] = Field(

From ad481366b154ca81c849080897e54f16a0b6e875 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 3 Aug 2021 13:53:55 -0700
Subject: [PATCH 20/41] Use integers

---
 emmet-core/emmet/core/defect.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index b488dd4b85..e3d47a68f3 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -4,14 +4,16 @@
 
 from pymatgen.analysis.defects.core import DefectEntry
 
+from emmet.core.base import EmmetBaseModel
 from emmet.core.mpid import MPID
 from emmet.core.cp2k.task import TaskDocument
 from emmet.core.cp2k.material import MaterialsDoc
 from emmet.core.cp2k.calc_types.utils import run_type
+from emmet.core.material import PropertyOrigin
 
 from pymatgen.core import Structure, Composition
 from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
-import numpy as np
+
 from pymatgen.ext.matproj import MPRester
 from monty.json import MontyDecoder
 from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
@@ -29,7 +31,7 @@
 from emmet.core.electronic_structure import ElectronicStructureDoc
 
 
-class DefectDoc(BaseModel):
+class DefectDoc(EmmetBaseModel):
     """
     A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
 
@@ -44,21 +46,24 @@ class Config:
 
     property_name: ClassVar[str] = "defect"
 
-    material_id: MPID = Field(None, description="Unique material ID for the host material")
+    chemsys: List = Field(None, description="Chemical system of the bulk")
 
-    defect_id: MPID = Field(None, description="Unique ID for this defect")
 
-    chemsys: List = Field(None, description="Chemical system of the bulk")
+    material_id: MPID = Field(None, description="Unique material ID for the host material")
+
+    task_ids: List[int] = Field(
+        None, description="All task ids used in creating this defect doc."
+    )
 
-    calc_types: Mapping[str, CalcType] = Field(  # type: ignore
+    calc_types: Mapping[int, CalcType] = Field(  # type: ignore
         None,
         description="Calculation types for all the calculations that make up this material",
     )
-    task_types: Mapping[str, TaskType] = Field(
+    task_types: Mapping[int, TaskType] = Field(
         None,
         description="Task types for all the calculations that make up this material",
     )
-    run_types: Mapping[str, RunType] = Field(
+    run_types: Mapping[int, RunType] = Field(
         None,
         description="Run types for all the calculations that make up this material",
     )
@@ -67,10 +72,6 @@ class Config:
         None, description="Task documents (defect task, bulk task) for the defect entry of RunType"
     )
 
-    task_ids: List[MPID] = Field(
-        None, description="All task ids used in creating this defect doc."
-    )
-
     entries: Mapping[RunType, DefectEntry] = Field(
         None, description="Dictionary for tracking entries for CP2K calculations"
     )
@@ -98,7 +99,7 @@ def from_tasks(cls, tasks: List, query='defect') -> "DefectDoc":
         # Metadata
         last_updated = datetime.now() or max(task.last_updated for task in task_group)
         created_at = datetime.now() or min(task.completed_at for task in task_group)
-        task_ids = list({task.task_id for task in task_group})
+        task_ids = {task.task_id for task in task_group}
 
         deprecated_tasks = list(
             {task.task_id for task in task_group if not task.is_valid}
@@ -138,14 +139,13 @@ def _run_type(x):
                 'calc_types': calc_types,
                 'last_updated': last_updated,
                 'created_at': created_at,
-                'task_ids': set(task_ids),
+                'task_ids': task_ids,
                 'deprecated_tasks': deprecated_tasks,
                 'tasks': final_tasks,
                 'material_id': list({v.parameters['material_id'] for v in entries.values()})[0],
                 'entry_ids': {rt: entries[rt].entry_id for rt in entries},
                 'chemsys': list([v.defect.bulk_structure.composition.elements for v in entries.values()])[0],
         }
-
         return cls(**{k: v for k, v in data.items()})
 
     @classmethod

From ccee065a7a31f6c4c86480bbed3b18d247b7a384 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Thu, 5 Aug 2021 16:26:01 -0700
Subject: [PATCH 21/41] Move imports.

---
 emmet-core/emmet/core/cp2k/material.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py
index 32089b6aa3..2f2549003c 100644
--- a/emmet-core/emmet/core/cp2k/material.py
+++ b/emmet-core/emmet/core/cp2k/material.py
@@ -5,13 +5,13 @@
 from pymatgen.analysis.structure_matcher import StructureMatcher
 from pymatgen.entries.computed_entries import ComputedStructureEntry
 
-
 from emmet.core import SETTINGS
 from emmet.core.material import MaterialsDoc as CoreMaterialsDoc
 from emmet.core.material import PropertyOrigin as PropertyOrigin
 from emmet.core.structure import StructureMetadata
 from emmet.core.cp2k.calc_types import CalcType, RunType, TaskType
 from emmet.core.cp2k.task import TaskDocument
+from emmet.builders.cp2k.utils import get_mpid
 
 
 class MaterialsDoc(CoreMaterialsDoc, StructureMetadata):
@@ -69,12 +69,12 @@ def from_tasks(
         statics = [task for task in task_group if task.task_type == TaskType.Static]  # type: ignore
 
         # Material ID
-        possible_mat_ids = [task.task_id for task in structure_optimizations] # TODO remove + statics ?
+        possible_mat_ids = [task.task_id for task in structure_optimizations]  # TODO remove + statics ?
         possible_mat_ids = sorted(possible_mat_ids, key=ID_to_int)
 
-        from emmet.builders.cp2k.utils import get_mpid
         matched_id = get_mpid([task.output.structure for task in structure_optimizations + statics][0])
-        if matched_id: possible_mat_ids.insert(0, matched_id)
+        if matched_id:
+            possible_mat_ids.insert(0, matched_id)
 
         if len(possible_mat_ids) == 0:
             raise Exception(f"Could not find a material ID for {task_ids}")
@@ -129,7 +129,7 @@ def _structure_eval(task: TaskDocument):
                 last_updated=best_structure_calc.last_updated,
             )
         ]
-        from pymatgen.entries.computed_entries import ComputedStructureEntry
+
         # entries
         entries = {}
         all_run_types = set(run_types.values())
@@ -143,7 +143,9 @@ def _structure_eval(task: TaskDocument):
                 entry = best_task_doc.entry
                 entry.data["task_id"] = entry.entry_id
                 entry.entry_id = material_id
+
                 # TODO Standard material doc doesn't have this
+                best_task_doc.output.structure.remove_oxidation_states()
                 entries[rt] = ComputedStructureEntry(
                     structure=best_task_doc.output.structure,
                     energy=entry.energy,

From e731d28242e984ad7c41a471aa5a82229a909eab Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Thu, 5 Aug 2021 16:27:17 -0700
Subject: [PATCH 22/41] Improved builders with private methods, more
 docstrings, and support for updating existing docs without re-building.

---
 emmet-builders/emmet/builders/cp2k/defects.py | 403 ++++++++++--------
 1 file changed, 235 insertions(+), 168 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 58d59ac834..cb6928b022 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -1,17 +1,18 @@
 from datetime import datetime
 from itertools import chain, groupby, combinations
 from typing import Dict, Iterator, List, Optional
+from copy import deepcopy
 
 from maggma.builders import Builder
 from maggma.stores import Store
 from pymatgen.core import Structure
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 from atomate.utils.utils import load_class
 
 
 from monty.json import MontyDecoder
 
-from emmet.builders.cp2k.utils import get_mpid
 from emmet.builders.settings import EmmetBuildSettings
 
 from emmet.core import SETTINGS
@@ -57,13 +58,10 @@ def __init__(
         defects: Store,
         dielectric: Store,
         electronic_structure: Store,
+        materials: Store,
         task_validation: Optional[Store] = None,
         query: Optional[Dict] = None,
         allowed_task_types: Optional[List[str]] = None,
-        symprec: float = SETTINGS.SYMPREC,
-        ltol: float = SETTINGS.LTOL,
-        stol: float = SETTINGS.STOL,
-        angle_tol: float = SETTINGS.ANGLE_TOL,
         settings: Optional[EmmetBuildSettings] = None,
         **kwargs,
     ):
@@ -80,13 +78,10 @@ def __init__(
         """
 
         self.tasks = tasks
-        self.tasks.key = 'task_id'
         self.defects = defects
-        self.defects.key = 'task_ids'
+        self.materials = materials
         self.dielectric = dielectric
-        self.dielectric.key = 'task_id'
         self.electronic_structure = electronic_structure
-        self.electronic_structure.key = "material_id"
 
         self.task_validation = task_validation
         self.allowed_task_types = (
@@ -97,40 +92,15 @@ def __init__(
 
         self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
         self.settings = EmmetBuildSettings.autoload(settings)
-
         self.query = query if query else {}
-        self.symprec = symprec
-        self.ltol = ltol
-        self.stol = stol
-        self.angle_tol = angle_tol
+        self.timestamp = None
         self.kwargs = kwargs
 
-        sources = [tasks, dielectric, electronic_structure]
+        sources = [tasks, dielectric, electronic_structure, materials]
         if self.task_validation:
             sources.append(self.task_validation)
         super().__init__(sources=sources, targets=[defects], **kwargs)
 
-    def ensure_indexes(self):
-        """
-        Ensures indicies on the tasks and materials collections
-        """
-
-        # Basic search index for tasks
-        self.tasks.ensure_index("task_id")
-        self.tasks.ensure_index("last_updated")
-        self.tasks.ensure_index("state")
-        self.tasks.ensure_index("formula_pretty")
-
-        # Search index for materials
-        self.defects.ensure_index("material_id")
-        self.defects.ensure_index("last_updated")
-        self.defects.ensure_index("sandboxes")
-        self.defects.ensure_index("task_ids")
-
-        if self.task_validation:
-            self.task_validation.ensure_index("task_id")
-            self.task_validation.ensure_index("valid")
-
     @property
     def defect_query(self) -> str:
         """
@@ -178,6 +148,32 @@ def required_bulk_properties(self) -> List:
             'transformations',
         ]
 
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        # Basic search index for tasks
+        self.tasks.ensure_index("task_id")
+        self.tasks.ensure_index("last_updated")
+        self.tasks.ensure_index("state")
+        self.tasks.ensure_index("formula_pretty")
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("last_updated")
+        self.materials.ensure_index("task_ids")
+
+        # Search index for materials
+        self.defects.ensure_index("material_id")
+        self.defects.ensure_index("defect_id")
+        self.defects.ensure_index("last_updated")
+        self.defects.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
     def prechunk(self, number_splits: int) -> Iterator[Dict]:
         raise NotImplementedError
 
@@ -232,7 +228,7 @@ def get_items(self) -> Iterator[List[Dict]]:
             for t_id in d.get("task_ids", [])
         }
 
-        bulk_tasks = set(filter(self.preprocess_bulk, all_tasks - defect_tasks))
+        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))
         unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
 
         self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")
@@ -261,38 +257,52 @@ def get_items(self) -> Iterator[List[Dict]]:
         # yield list of defects that are of the same type, matched to an appropriate bulk calc
         self.logger.info(f"Starting defect matching.")
 
-        for defect_task_group in self.filter_and_group_tasks(defect_tasks):
-            yield [
-                (
-                    next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),
-                    next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),
-                    self.get_dielectric(bulk_tasks_id)
-                )
-                for defect_tasks_id, bulk_tasks_id in self.match_defects_to_bulks(bulk_tasks, defect_task_group)
-                ]
+        for defect_id, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):
+            yield self.__get_defect_doc(defect_id), self.__get_item_bundle(bulk_tasks, defect_task_group)
 
-    # TODO: This must be changed once access to internal MP database is avaiable. RN it uses the MAPI
-    def preprocess_bulk(self, task):
+    def process_item(self, items):
         """
-        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used.
+        Process a group of defect tasks that correspond to the same defect into a single defect
+        document. If the DefectDoc already exists, then update it and return it. If it does not,
+        create a new DefectDoc
+
+        Args:
+            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]
+
+        returns: the defect document as a dictionary
         """
-        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure']))
-        struc = Structure.from_dict(t.get('output').get('structure'))
-        mpid = get_mpid(struc)
-        if not mpid:
-            self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")
-            return False
+        defect_doc, item_bundle = items
+        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
+        if item_bundle:
+            if defect_doc:
+                defect_doc.update_all(item_bundle, query=self.defect_query)
+            else:
+                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query)
+            return defect_doc.dict()
 
-        elec = next(self.electronic_structure.query(properties=['band_gap'], criteria={"material_id": mpid}))
-        if elec['band_gap'] > 0:
-            diel = next(self.dielectric.query(criteria={self.dielectric.key: mpid}))
-            if diel is None:
-                self.logger.info(f"Task {task} for composition {struc.composition} requires"
-                                 f"dielectric properties, but none found in dielectric store")
-                return False
-        return True
+    def update_targets(self, items):
+        """
+        Inserts the new task_types into the task_types collection
+        """
+
+        items = [item for item in items if item]
+
+        for item in items:
+            item.update({"_bt": self.timestamp})
+
+        defect_id = [item['defect_id'] for item in items]
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defects")
+            self.defects.remove_docs({self.defects.key: {"$in": defect_id}})
+            self.defects.update(
+                docs=jsanitize(items, allow_bson=True),
+                key='task_ids',
+            )
+        else:
+            self.logger.info("No items to update")
 
-    def filter_and_group_tasks(self, tasks):
+    def __filter_and_group_tasks(self, tasks):
         """
         Groups defect tasks. Tasks are grouped according to the reduced representation
         of the defect, and so tasks with different settings (e.g. supercell size, functional)
@@ -302,7 +312,8 @@ def filter_and_group_tasks(self, tasks):
             tasks: task_ids for unprocessed defects
 
         returns:
-            Groups of task_ids that correspond to the same defect
+            [(defect_id, [task_ids]), ...] where task_ids correspond to the same defect, which
+            is uniquely labeled by defect_id
         """
 
         props = [
@@ -314,7 +325,7 @@ def filter_and_group_tasks(self, tasks):
 
         pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)
         defects = [
-            {'task_id': t['task_id'], 'defect': self.get_defect_from_task(t)}
+            {'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t)}
             for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)
         ]
 
@@ -340,12 +351,92 @@ def key(x):
                 inds = list(inds)
                 matches.extend([unmatched[i][0] for i in inds])
                 unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]
-                all_groups.append([defects[i]['task_id'] for i in matches])
+                all_groups.append((defects[i]['defect'].get_hash(), [defects[i]['task_id'] for i in matches]))
 
         self.logger.debug(f"All groups {all_groups}")
         return all_groups
 
-    def match_defects_to_bulks(self, bulk_ids, defect_ids):
+    def __get_defect_from_task(self, task):
+        """
+        Using the defect_query property, retrieve a pymatgen defect object from the task document
+        """
+        defect = unpack(self.defect_query.split('.'), task)
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
+    def __get_defect_doc(self, defect_id):
+        """
+        Given a unique defect id, retrieve the defect document from the defect store if it exists
+
+        Args:
+            defect_id: Unique defect ID created by the Defect.get_hash() function
+
+        returns: DefectDoc or None
+        """
+        doc = list(self.defects.query(criteria={'defect_id': defect_id}, properties=None))
+        return doc[0] if doc else None
+
+    def __get_dielectric(self, task_id):
+        """
+        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store
+        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would
+        be the case for metallic systems, return None.
+        """
+        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))
+        struc = Structure.from_dict(t.get('output').get('structure'))
+        for diel in self.dielectric.query(criteria={self.dielectric.key: self.__get_mpid(struc)}, properties=['dielectric.total']):
+            return diel['dielectric']['total']
+        return None
+
+    def __get_item_bundle(self, bulk_tasks, defect_task_group):
+        """
+        Gets a group of items that can be processed together into a defect document.
+
+        Args:
+            bulk_tasks: possible bulk tasks to match to defects
+            defect_task_group: group of equivalent defects (defined by PointDefectComparator)
+
+        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]
+        """
+        return [
+            (
+                next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),
+                next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),
+                self.__get_dielectric(bulk_tasks_id),
+            )
+            for defect_tasks_id, bulk_tasks_id
+            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
+        ]
+
+    def __get_mpid(self, structure):
+        """
+        Given a structure, determine if an equivalent structure exists, with a material_id,
+        in the materials store.
+
+        Args:
+            structure: Candidate structure
+
+        returns: material_id, if one exists, else None
+        """
+        struc = deepcopy(structure)
+        struc.remove_oxidation_states()
+        struc.remove_spin()
+        for p in struc.site_properties:
+            struc.remove_site_property(p)
+        sga = SpacegroupAnalyzer(struc)
+
+        sm = StructureMatcher(ltol=self.settings.LTOL, stol=self.settings.STOL, angle_tol=self.settings.ANGLE_TOL)
+        data = self.materials.query(
+            criteria={
+                'chemsys': struc.composition.chemical_system,
+                'spacegroup.symbol': sga.get_space_group_symbol()
+            },
+            properties=['material_id', 'formation_energy_per_atom']
+        )
+        data = filter(lambda x: sm.fit(x[0], x[1]), list(data))
+        return None if not data else sorted(data, key=lambda x: x)[0]['material_id']
+    
+    def __match_defects_to_bulks(self, bulk_ids, defect_ids):
         """
         Given task_ids of bulk and defect tasks, match the defects to a bulk task that has
         commensurate:
@@ -385,7 +476,7 @@ def match_defects_to_bulks(self, bulk_ids, defect_ids):
         def _compare(b, d):
             if run_type(b.get('input').get('dft')).value.split('+U')[0] == \
                 run_type(d.get('input').get('dft')).value.split('+U')[0] and \
-                    sm.fit(DefectBuilder.get_pristine_supercell(d), DefectBuilder.get_pristine_supercell(b)):
+                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):
                 if abs(b['nsites'] - d['nsites']) <= 1:
                     return True
             return False
@@ -399,15 +490,33 @@ def _compare(b, d):
         self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
         return pairs
 
-    def get_dielectric(self, task_id):
-        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))
+    def __preprocess_bulk(self, task):
+        """
+        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk
+        tasks must have:
+
+            (1) Correspond to an existing material_id in the materials store
+            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store
+        """
+        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure']))
         struc = Structure.from_dict(t.get('output').get('structure'))
-        for diel in self.dielectric.query(criteria={'task_id': get_mpid(struc)}, properties=['dielectric.total']):
-            return diel['dielectric']['total']
-        return None
+        mpid = self.__get_mpid(struc)
+        if not mpid:
+            self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")
+            return False
+
+        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))
+        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})
+        if dos.get_gap():
+            diel = next(self.dielectric.query(criteria={self.dielectric.key: mpid}))
+            if diel is None:
+                self.logger.info(f"Task {task} for composition {struc.composition} requires"
+                                 f"dielectric properties, but none found in dielectric store")
+                return False
+        return True
 
     @staticmethod
-    def get_pristine_supercell(task):
+    def __get_pristine_supercell(self, task):
         """
         Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.
         If defect cannot be found in task, return the input structure.
@@ -420,54 +529,17 @@ def get_pristine_supercell(task):
             return Structure.from_dict(task['transformations']['history'][0]['input_structure'])
         return Structure.from_dict(task['input']['structure'])
 
-    def process_item(self, items):
-        """
-        Process a group of defect tasks into a single defect document.
-
-        Args:
-            tasks: list of task pairs corresponding to 1 defect (same species/oxidation state), but
-                for any number of calculation settings.
-             e.g. [(defect task, bulk task), (defect task, bulk task), ...]
-
-        returns: the defect document as a dictionary
-        """
-        self.logger.info(f"Processing group of {len(items)} defects into DefectDoc")
-        if items:
-            defect_doc = DefectDoc.from_tasks(tasks=items, query=self.defect_query)
-            return defect_doc.dict()
-
-    def get_defect_from_task(self, task):
-        """
-        Using the defect_query property, retrieve a pymatgen defect object from the task document
-        """
-        defect = unpack(self.defect_query.split('.'), task)
-        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
-        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
-
-    def update_targets(self, items):
-        """
-        Inserts the new task_types into the task_types collection
-        """
-
-        items = [item for item in items if item]
-
-        for item in items:
-            item.update({"_bt": self.timestamp})
-
-        task_ids = list(chain.from_iterable([item['task_ids'] for item in items]))
 
-        if len(items) > 0:
-            self.logger.info(f"Updating {len(items)} defects")
-            self.defects.remove_docs({self.defects.key: {"$in": task_ids}})
-            self.defects.update(
-                docs=jsanitize(items, allow_bson=True),
-                key='task_ids',
-            )
-        else:
-            self.logger.info("No items to update")
+class DefectThermoBuilder(Builder):
 
+    """
+    This builder creates collections of the DefectThermoDoc object.
 
-class DefectThermoBuilder(Builder):
+        (1) Find all DefectDocs that correspond to the same bulk material
+            given by material_id
+        (2) Create a new DefectThermoDoc for all of those documents
+        (3) Insert/Update the defect_thermos store with the new documents
+    """
 
     def __init__(
             self,
@@ -476,10 +548,6 @@ def __init__(
             materials: Store,
             electronic_structures: Store,
             query: Optional[Dict] = None,
-            symprec: float = SETTINGS.SYMPREC,
-            ltol: float = SETTINGS.LTOL,
-            stol: float = SETTINGS.STOL,
-            angle_tol: float = SETTINGS.ANGLE_TOL,
             **kwargs,
     ):
         """
@@ -487,12 +555,8 @@ def __init__(
             defects: Store of defect documents (generated by DefectBuilder)
             defect_thermos: Store of DefectThermoDocs to generate.
             materials: Store of MaterialDocs to construct phase diagram
+            electronic_structures: Store of DOS objects
             query: dictionary to limit tasks to be analyzed
-            allowed_task_types: list of task_types that can be processed
-            symprec: tolerance for SPGLib spacegroup finding
-            ltol: StructureMatcher tuning parameter for matching tasks to materials
-            stol: StructureMatcher tuning parameter for matching tasks to materials
-            angle_tol: StructureMatcher tuning parameter for matching tasks to materials
         """
 
         self.defects = defects
@@ -501,18 +565,11 @@ def __init__(
         self.electronic_structures = electronic_structures
 
         self.query = query if query else {}
-        self.symprec = symprec
-        self.ltol = ltol
-        self.stol = stol
-        self.angle_tol = angle_tol
+        self.timestamp = None
         self.kwargs = kwargs
 
         super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)
 
-    @property
-    def defect_doc_query(self):
-        return 'material_id'
-
     def ensure_indexes(self):
         """
         Ensures indicies on the collections
@@ -520,10 +577,12 @@ def ensure_indexes(self):
 
         # Basic search index for tasks
         self.defects.ensure_index("material_id")
+        self.defects.ensure_index("defect_id")
 
         # Search index for materials
-        self.defects.ensure_index("material_id")
+        self.defect_thermos.ensure_index("material_id")
 
+    # TODO need to only process new tasks. Fast builder so currently is OK for small collections
     def get_items(self) -> Iterator[List[Dict]]:
         """
         Gets items to process into DefectThermoDocs.
@@ -532,6 +591,7 @@ def get_items(self) -> Iterator[List[Dict]]:
             iterator yielding tuples containing:
                 - group of DefectDocs belonging to the same bulk material as indexed by material_id,
                 - materials in the chemsys of the bulk material for constructing phase diagram
+                - Dos of the bulk material for constructing phase diagrams/getting doping
 
         """
 
@@ -543,39 +603,20 @@ def get_items(self) -> Iterator[List[Dict]]:
         self.timestamp = datetime.utcnow()
 
         # Get all tasks
-        self.logger.info("Finding defect docs to process")
+        self.logger.info("Finding tasks to process")
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+
+        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks
 
         all_docs = [doc for doc in self.defects.query()]
         for key, group in groupby(sorted(all_docs, key=lambda x: x['material_id']), key=lambda x: x['material_id']):
             group = [g for g in group]
-            yield (group, self.get_materials(group), self.get_electronic_structure(group))
-
-    def get_electronic_structure(self, group):
-        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))
-
-    def get_materials(self, group) -> List:
-        """
-        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the
-        materials store.
-        """
-        bulk = self.materials.query(criteria={'material_id': group[0]['material_id']}, properties=None)
-        elements = group[0]['chemsys']
-
-        if isinstance(elements, str):
-            elements = elements.split("-")
-
-        all_chemsyses = []
-        for i in range(len(elements)):
-            for els in combinations(elements, i + 1):
-                all_chemsyses.append("-".join(sorted(els)))
-
-        return list(chain(self.materials.query(criteria={"chemsys": {"$in": all_chemsyses}}, properties=None), bulk))
+            yield (group, self.__get_materials(key), self.__get_electronic_structure(group))
 
     def process_item(self, docs):
         """
         Process a group of defects belonging to the same material into a defect thermo doc
-        :param item:
-        :return:
         """
         self.logger.info(f"Processing defects")
         defects, materials, elec_struc = docs
@@ -586,23 +627,49 @@ def process_item(self, docs):
 
     def update_targets(self, items):
         """
-        Inserts the new task_types into the task_types collection
+        Inserts the new DefectThermoDocs into the defect_thermos store
         """
-
-        #item.update({"_bt": self.timestamp})
+        items = [item for item in items if item]
+        for item in items:
+            item.update({"_bt": self.timestamp})
 
         if len(items) > 0:
             self.logger.info(f"Updating {len(items)} defect thermo docs")
-            #self.defects.remove_docs({self.defects.key: {"$in": task_ids}})
             self.defect_thermos.update(
                 docs=jsanitize(items, allow_bson=True),
-                key='material_id',
+                key=self.defect_thermos.key,
             )
         else:
             self.logger.info("No items to update")
 
+    def __get_electronic_structure(self, group):
+        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))
+
+    def __get_materials(self, key) -> List:
+        """
+        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the
+        materials store.
+        """
+        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))
+        if not bulk:
+            raise LookupError
+        elements = bulk[0]['chemsys']
+
+        if isinstance(elements, str):
+            elements = elements.split("-")
+
+        all_chemsyses = []
+        for i in range(len(elements)):
+            for els in combinations(elements, i + 1):
+                all_chemsyses.append("-".join(sorted(els)))
+
+        return list(chain(self.materials.query(criteria={"chemsys": {"$in": all_chemsyses}}, properties=None), bulk))
+
 
 def unpack(query, d):
+    """
+    Unpack a mongo-style query into dictionary retrieval
+    """
     if not query:
         return d
     if isinstance(d, List):

From a5edc7b1437d49a813ecfc24f853ffd4853b1f15 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Thu, 5 Aug 2021 16:27:49 -0700
Subject: [PATCH 23/41] New methods to update document without re-building the
 whole thing.

---
 emmet-core/emmet/core/defect.py | 188 ++++++++++++++++++++------------
 1 file changed, 118 insertions(+), 70 deletions(-)

diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index e3d47a68f3..9c68531d4e 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -13,6 +13,7 @@
 
 from pymatgen.core import Structure, Composition
 from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
+from pymatgen.electronic_structure.dos import CompleteDos
 
 from pymatgen.ext.matproj import MPRester
 from monty.json import MontyDecoder
@@ -27,10 +28,8 @@
 from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
 from pymatgen.core import Element
 
-from emmet.core.polar import Dielectric
-from emmet.core.electronic_structure import ElectronicStructureDoc
-
 
+# TODO Update DefectDoc on defect entry level so you don't re-do uncessary corrections
 class DefectDoc(EmmetBaseModel):
     """
     A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
@@ -48,9 +47,10 @@ class Config:
 
     chemsys: List = Field(None, description="Chemical system of the bulk")
 
-
     material_id: MPID = Field(None, description="Unique material ID for the host material")
 
+    defect_id: int = Field(None, description="Unique ID for this defect")
+
     task_ids: List[int] = Field(
         None, description="All task ids used in creating this defect doc."
     )
@@ -76,6 +76,16 @@ class Config:
         None, description="Dictionary for tracking entries for CP2K calculations"
     )
 
+    last_updated: datetime = Field(
+        description="Timestamp for when this document was last updated",
+        default_factory=datetime.utcnow,
+    )
+
+    created_at: datetime = Field(
+        description="Timestamp for when this material document was first created",
+        default_factory=datetime.utcnow,
+    )
+
     # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
     @validator("entries", pre=True)
     def decode(cls, entries):
@@ -84,6 +94,52 @@ def decode(cls, entries):
                 entries[e] = MontyDecoder().process_decoded({k: v for k, v in entries[e].items()})
         return entries
 
+    def update(self, defect_task, bulk_task, dielectric, query='defect'):
+
+        defect_task = TaskDocument(**defect_task)
+        bulk_task = TaskDocument(**bulk_task)
+
+        rt = defect_task.run_type
+        tt = defect_task.task_type
+        ct = defect_task.calc_type
+
+        # Metadata
+        last_updated = max(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
+        created_at = min(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
+        task_ids = {dtsk.task_id for dtsk, btsk in self.tasks.values()} if self.tasks else {}
+
+        if defect_task.task_id in task_ids:
+            return
+        else:
+            self.last_updated = last_updated
+            self.created_at = created_at
+            self.task_ids.append(defect_task.task_id)
+            #self['deprecated_tasks'].update(defect_task.task_id)
+
+            def _run_type(x):
+                return run_type(x[0]['input']['dft']).value
+
+            def _compare(new, old):
+                # TODO return kpoint density
+                return new['nsites'] > old['nsites']
+
+            if _compare(defect_task, self.tasks[rt]):
+                self.run_types.update({defect_task.task_id: rt})
+                self.task_types.update({defect_task.task_id: tt})
+                self.calc_types.update({defect_task.task_id: ct})
+                entry = self.get_defect_entry_from_tasks(
+                            defect_task=defect_task,
+                            bulk_task=bulk_task,
+                            dielectric=dielectric,
+                            query=query
+                        )
+                self.entries[rt] = entry
+                self.tasks[rt] = (defect_task, bulk_task)
+
+    def update_all(self, tasks, query='defect'):
+        for defect_task, bulk_task, dielectric in tasks:
+            self.update(defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric, query=query)
+
     @classmethod
     def from_tasks(cls, tasks: List, query='defect') -> "DefectDoc":
         """
@@ -112,6 +168,10 @@ def from_tasks(cls, tasks: List, query='defect') -> "DefectDoc":
         def _run_type(x):
             return run_type(x[0]['input']['dft']).value
 
+        def _sort(x):
+            # return kpoint density
+            pass
+
         entries = {}
         final_tasks = {}
         for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
@@ -142,7 +202,8 @@ def _run_type(x):
                 'task_ids': task_ids,
                 'deprecated_tasks': deprecated_tasks,
                 'tasks': final_tasks,
-                'material_id': list({v.parameters['material_id'] for v in entries.values()})[0],
+                'material_id': best_entry.parameters['material_id'],
+                'defect_id': best_entry.defect.get_hash(),
                 'entry_ids': {rt: entries[rt].entry_id for rt in entries},
                 'chemsys': list([v.defect.bulk_structure.composition.elements for v in entries.values()])[0],
         }
@@ -270,13 +331,18 @@ def decode(cls, defect_predominance_diagrams):
         return defect_predominance_diagrams
 
     @classmethod
-    def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc], electronic_structure: ElectronicStructureDoc) -> "DefectThermoDoc":
+    def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], electronic_structure: CompleteDos) -> "DefectThermoDoc":
 
         DEFAULT_RT = RunType('GGA')  # TODO NEED A procedure for getting all GGA or GGA+U keys
         DEFAULT_RT_U = RunType('GGA+U')
 
-        mpid = docs[0].material_id
-        cls.get_adjusted_entries(materials=materials, defects=docs)
+        mpid = defects[0].material_id
+
+        chempots = {
+            m.structure.composition.elements[0]:
+                {rt: m.entries[rt].energy_per_atom for rt in m.entries}
+            for m in materials if m.structure.composition.is_element
+        }
 
         defect_entries = {}
         defect_phase_diagram = {}
@@ -285,22 +351,52 @@ def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc], electro
         defect_predominance_diagrams = {}
         task_ids = {}
 
-        dos = electronic_structure.dos.total
+        dos = CompleteDos.from_dict(electronic_structure)
+        bg = dos.get_gap()
 
         for m in materials:
-            for run_type in m.entries:
-                bg = dos.get_gap()
-                band_gaps[run_type] = bg
-
-        for d in docs:
-            for run_type in d.entries:
-                if run_type not in defect_entries:
-                    defect_entries[run_type] = []
-                if run_type not in task_ids:
-                    task_ids[run_type] = set()
-                defect_entries[run_type].append(d.entries[run_type])
-                vbms[run_type] = d.entries[run_type].parameters['vbm']  # TODO Need to find best vbm
-                task_ids[run_type].update(d.task_ids)
+            for rt, ent in m.entries.items():
+                # Chempot shift
+                for el, amt in ent.composition.element_composition.items():
+                    _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
+                    adj = CompositionEnergyAdjustment(-chempots[Element(el)][_rt],
+                                                      n_atoms=amt,
+                                                      name=f"Elemental shift {el} to formation energy space"
+                                                      )
+                    ent.energy_adjustments.append(adj)
+
+                # Other stuff
+                band_gaps[rt] = bg
+                ent.parameters['software'] = 'cp2k'
+                ent.structure.remove_spin()
+                ent.structure.remove_oxidation_states()
+                MaterialsProject2020Compatibility().process_entry(ent)
+
+        for d in defects:
+            for rt, ent in d.entries.items():
+                # Chempot shift
+                __found_chempots__ = True
+                comp = Composition(ent.defect.defect_composition, allow_negative=True) - \
+                       ent.defect.bulk_structure.composition
+                for el, amt in comp.items():
+                    if Element(el) not in chempots:
+                        __found_chempots__ = False
+                        break
+                    _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
+                    ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt * chempots[Element(el)][
+                        _rt]
+
+                if not __found_chempots__:
+                    continue
+
+                # Other stuff
+                if rt not in defect_entries:
+                    defect_entries[rt] = []
+                if rt not in task_ids:
+                    task_ids[rt] = set()
+                defect_entries[rt].append(d.entries[rt])
+                vbms[rt] = d.entries[rt].parameters['vbm']  # TODO Need to find best vbm
+                task_ids[rt].update(d.task_ids)
 
         for run_type in defect_entries:
             # TODO MUST FILTER COMPATIBLE AT SOME POINT
@@ -329,54 +425,6 @@ def from_docs(cls, docs: List[DefectDoc], materials: List[MaterialsDoc], electro
 
         return cls(**{k: v for k, v in data.items()})
 
-    @classmethod
-    def get_adjusted_entries(cls, materials: List[MaterialsDoc], defects: List[DefectDoc]):
-        """
-        Shift the energy of all entries to formation energy space, i.e. elemental entries to 0eV
-
-        The chemical potentials (elemental energies) are acquired for each run type that is available
-        in materials. If a chempot does not exist for a non-standard run type, but does exist for
-        a GGA calculation, then this will be used as the fall-back.
-
-        Args:
-            materials: list of MaterialsDocs with *ComputedStructureEntries* as the entries
-
-            defects: list of DefectDocs
-
-        returns:
-            None
-        """
-
-        DEFAULT_RT = RunType('GGA')  # TODO NEED A procedure for getting all GGA or GGA+U keys
-
-        chempots = {
-            m.structure.composition.elements[0].element:
-                {rt: m.entries[rt].energy_per_atom for rt in m.entries}
-            for m in materials if m.structure.composition.is_element
-        }
-
-        for m in materials:
-            for rt, ent in m.entries.items():
-                ent.parameters['software'] = 'cp2k'
-                ent.structure.remove_spin()
-                ent.structure.remove_oxidation_states()
-                MaterialsProject2020Compatibility().process_entry(ent)
-                for el, amt in ent.composition.element_composition.items():
-                    _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
-                    adj = CompositionEnergyAdjustment(-chempots[Element(el)][_rt],
-                                                      n_atoms=amt,
-                                                      name=f"Elemental shift {el} to formation energy space"
-                                                      )
-                    ent.energy_adjustments.append(adj)
-
-        for d in defects:
-            for rt, ent in d.entries.items():
-                comp = Composition(ent.defect.defect_composition, allow_negative=True) - \
-                       ent.defect.bulk_structure.composition
-                for el, amt in comp.items():
-                    _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
-                    ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt*chempots[Element(el)][_rt]
-
 
 def get_dos(mpid):
     with MPRester() as mp:

From 54627959568cc09775e6c2ce0eba9722d9b33262 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 6 Aug 2021 12:03:52 -0700
Subject: [PATCH 24/41] Include BaseTaskDocument

---
 emmet-core/emmet/core/cp2k/task.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py
index 3c21dbddf7..19bfe8b364 100644
--- a/emmet-core/emmet/core/cp2k/task.py
+++ b/emmet-core/emmet/core/cp2k/task.py
@@ -7,7 +7,7 @@
 from pymatgen.core import Structure
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 
-from emmet.core import SETTINGS
+from emmet.core.task import TaskDocument as BaseTaskDocument
 from emmet.core.structure import StructureMetadata
 from emmet.core.utils import ValueEnum
 from emmet.core.cp2k.calc_types import (
@@ -98,7 +98,7 @@ class RunStatistics(BaseModel):
     cores: int = Field(None, description="The number of cores used by CP2K")
 
 
-class TaskDocument(StructureMetadata):
+class TaskDocument(BaseTaskDocument, StructureMetadata):
     """
     Definition of CP2K Task Document
     """
@@ -131,10 +131,6 @@ class TaskDocument(StructureMetadata):
     task_id: int = Field(None, description="the Task ID For this document")
     tags: List[str] = Field([], description="Metadata tags for this task document")
 
-    sandboxes: List[str] = Field(
-        None, description="List of sandboxes this task document is allowed in"
-    )
-
     run_type: RunType = Field(None)
     task_type: TaskType = Field(None)
     calc_type: CalcType = Field(None)

From a28ea1409d243233042d07b7162e116e99bc496d Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 6 Aug 2021 12:04:44 -0700
Subject: [PATCH 25/41] (1) Reordered the imports.

(2) Updated the update() method
---
 emmet-core/emmet/core/defect.py | 98 ++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 51 deletions(-)

diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index 9c68531d4e..4ba9cbe5b0 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -1,36 +1,31 @@
 """ Core definition for Defect property Document """
 from datetime import datetime
 from typing import ClassVar, Dict, Tuple, Mapping, List
+from monty.json import MontyDecoder
+from itertools import groupby
+from pydantic import Field, validator, BaseModel
 
+from pymatgen.core import Structure, Composition, Element
 from pymatgen.analysis.defects.core import DefectEntry
+from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
+from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, DefectPredominanceDiagram
+from pymatgen.electronic_structure.dos import CompleteDos
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+from pymatgen.entries.computed_entries import CompositionEnergyAdjustment
+from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
+from pymatgen.ext.matproj import MPRester
 
-from emmet.core.base import EmmetBaseModel
+from emmet.core.structure import StructureMetadata
 from emmet.core.mpid import MPID
 from emmet.core.cp2k.task import TaskDocument
+from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
 from emmet.core.cp2k.material import MaterialsDoc
 from emmet.core.cp2k.calc_types.utils import run_type
-from emmet.core.material import PropertyOrigin
-
-from pymatgen.core import Structure, Composition
-from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
-from pymatgen.electronic_structure.dos import CompleteDos
-
-from pymatgen.ext.matproj import MPRester
-from monty.json import MontyDecoder
-from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
-from itertools import groupby
-from pydantic import Field, validator
-from emmet.builders.cp2k.utils import get_mpid, get_dielectric, matcher
-
-from pymatgen.entries.computed_entries import CompositionEnergyAdjustment
-from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, DefectPredominanceDiagram
-from pydantic import BaseModel
-from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
-from pymatgen.core import Element
+from emmet.builders.cp2k.utils import get_mpid, matcher
 
 
 # TODO Update DefectDoc on defect entry level so you don't re-do uncessary corrections
-class DefectDoc(EmmetBaseModel):
+class DefectDoc(StructureMetadata):
     """
     A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
 
@@ -45,11 +40,11 @@ class Config:
 
     property_name: ClassVar[str] = "defect"
 
-    chemsys: List = Field(None, description="Chemical system of the bulk")
+    name: str = Field(None, description="Name of this defect")
 
-    material_id: MPID = Field(None, description="Unique material ID for the host material")
+    charge: int = Field(None, description="Charge of this defect")
 
-    defect_id: int = Field(None, description="Unique ID for this defect")
+    material_id: MPID = Field(None, description="Unique material ID for the bulk material")
 
     task_ids: List[int] = Field(
         None, description="All task ids used in creating this defect doc."
@@ -96,24 +91,23 @@ def decode(cls, entries):
 
     def update(self, defect_task, bulk_task, dielectric, query='defect'):
 
-        defect_task = TaskDocument(**defect_task)
-        bulk_task = TaskDocument(**bulk_task)
+        defect_task_doc = TaskDocument(**defect_task)
+        bulk_task_doc = TaskDocument(**bulk_task)
 
-        rt = defect_task.run_type
-        tt = defect_task.task_type
-        ct = defect_task.calc_type
+        rt = defect_task_doc.run_type
+        tt = defect_task_doc.task_type
+        ct = defect_task_doc.calc_type
 
         # Metadata
         last_updated = max(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
         created_at = min(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
-        task_ids = {dtsk.task_id for dtsk, btsk in self.tasks.values()} if self.tasks else {}
 
-        if defect_task.task_id in task_ids:
+        if defect_task_doc.task_id in self.task_ids:
             return
         else:
             self.last_updated = last_updated
             self.created_at = created_at
-            self.task_ids.append(defect_task.task_id)
+            self.task_ids.append(defect_task_doc.task_id)
             #self['deprecated_tasks'].update(defect_task.task_id)
 
             def _run_type(x):
@@ -123,10 +117,10 @@ def _compare(new, old):
                 # TODO return kpoint density
                 return new['nsites'] > old['nsites']
 
-            if _compare(defect_task, self.tasks[rt]):
-                self.run_types.update({defect_task.task_id: rt})
-                self.task_types.update({defect_task.task_id: tt})
-                self.calc_types.update({defect_task.task_id: ct})
+            if defect_task_doc.run_type not in self.tasks or _compare(defect_task, self.tasks[rt]):
+                self.run_types.update({defect_task_doc.task_id: rt})
+                self.task_types.update({defect_task_doc.task_id: tt})
+                self.calc_types.update({defect_task_doc.task_id: ct})
                 entry = self.get_defect_entry_from_tasks(
                             defect_task=defect_task,
                             bulk_task=bulk_task,
@@ -134,7 +128,7 @@ def _compare(new, old):
                             query=query
                         )
                 self.entries[rt] = entry
-                self.tasks[rt] = (defect_task, bulk_task)
+                self.tasks[rt] = (defect_task_doc, bulk_task_doc)
 
     def update_all(self, tasks, query='defect'):
         for defect_task, bulk_task, dielectric in tasks:
@@ -169,8 +163,8 @@ def _run_type(x):
             return run_type(x[0]['input']['dft']).value
 
         def _sort(x):
-            # return kpoint density
-            pass
+            # TODO return kpoint density, currently just does supercell size
+            return x[1].nsites
 
         entries = {}
         final_tasks = {}
@@ -187,7 +181,7 @@ def _sort(x):
                 )
                 for defect_task, bulk_task, dielectric in tasks_for_runtype
             ]
-            entry_and_docs.sort(key=lambda x: x[1].nsites, reverse=True)  # TODO Turn into kpoint density sorting
+            entry_and_docs.sort(key=_sort, reverse=True)
             best_entry, best_defect_task, best_bulk_task = entry_and_docs[0]
             entries[best_defect_task.run_type] = best_entry
             final_tasks[best_defect_task.run_type] = (best_defect_task, best_bulk_task)
@@ -203,11 +197,13 @@ def _sort(x):
                 'deprecated_tasks': deprecated_tasks,
                 'tasks': final_tasks,
                 'material_id': best_entry.parameters['material_id'],
-                'defect_id': best_entry.defect.get_hash(),
                 'entry_ids': {rt: entries[rt].entry_id for rt in entries},
-                'chemsys': list([v.defect.bulk_structure.composition.elements for v in entries.values()])[0],
+                'name': best_entry.defect.name,
+                'charge': best_entry.defect.charge,
         }
-        return cls(**{k: v for k, v in data.items()})
+        prim = SpacegroupAnalyzer(best_entry.defect.bulk_structure).get_primitive_standard_structure()
+        data.update(StructureMetadata.from_structure(prim).dict())
+        return cls(**data)
 
     @classmethod
     def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='defect'):
@@ -237,6 +233,15 @@ def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, qu
 
         return defect_entry
 
+    @classmethod
+    def get_defect_from_task(cls, query, task):
+        """
+        Unpack a Mongo-style query and retrieve a defect object from a task.
+        """
+        defect = unpack(query.split('.'), task)
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
     @classmethod
     def get_parameters_from_tasks(cls, defect_task, bulk_task):
         """
@@ -293,15 +298,6 @@ def get_init(x):
 
         return parameters
 
-    @classmethod
-    def get_defect_from_task(cls, query, task):
-        """
-        Unpack a Mongo-style query and retrieve a defect object from a task.
-        """
-        defect = unpack(query.split('.'), task)
-        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
-        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
-
 
 class DefectThermoDoc(BaseModel):
 

From 2d362403235f86adb4bcccfad773de36fbbc8b8d Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 6 Aug 2021 12:05:45 -0700
Subject: [PATCH 26/41] Update query for defects and reorder imports.

---
 emmet-builders/emmet/builders/cp2k/defects.py | 83 +++++++++++++------
 1 file changed, 58 insertions(+), 25 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index cb6928b022..1a4d3f944a 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -2,26 +2,26 @@
 from itertools import chain, groupby, combinations
 from typing import Dict, Iterator, List, Optional
 from copy import deepcopy
+from monty.json import MontyDecoder
 
 from maggma.builders import Builder
 from maggma.stores import Store
+
 from pymatgen.core import Structure
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
-from atomate.utils.utils import load_class
 
-
-from monty.json import MontyDecoder
-
-from emmet.builders.settings import EmmetBuildSettings
+from atomate.utils.utils import load_class
 
 from emmet.core import SETTINGS
 from emmet.core.utils import jsanitize, get_sg
+from emmet.core.defect import DefectDoc, DefectThermoDoc
 from emmet.core.cp2k.calc_types import TaskType
+from emmet.core.cp2k.calc_types.utils import run_type
 from emmet.core.cp2k.material import MaterialsDoc
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.builders.cp2k.utils import get_mpid
 
-from emmet.core.cp2k.calc_types.utils import run_type
-from emmet.core.defect import DefectDoc, DefectThermoDoc
 
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 __maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"
@@ -113,6 +113,12 @@ def defect_query(self) -> str:
         """
         return 'defect'
 
+    @property
+    def identifying_defect_properties(self):
+        return [
+            'charge'
+        ]
+
     @property
     def required_defect_properties(self) -> List:
         return [
@@ -287,14 +293,19 @@ def update_targets(self, items):
 
         items = [item for item in items if item]
 
-        for item in items:
-            item.update({"_bt": self.timestamp})
-
-        defect_id = [item['defect_id'] for item in items]
-
         if len(items) > 0:
             self.logger.info(f"Updating {len(items)} defects")
-            self.defects.remove_docs({self.defects.key: {"$in": defect_id}})
+            for item in items:
+                item.update({"_bt": self.timestamp})
+                self.defects.remove_docs(
+                    {
+                       "name": item['name'],
+                       "symmetry.symbol": item['symmetry']['symbol'],
+                       "density": item['density'],
+                       "composition_reduced": item['composition_reduced'].as_dict(),
+                       "charge": item['charge'],
+                    }
+                )
             self.defects.update(
                 docs=jsanitize(items, allow_bson=True),
                 key='task_ids',
@@ -351,7 +362,9 @@ def key(x):
                 inds = list(inds)
                 matches.extend([unmatched[i][0] for i in inds])
                 unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]
-                all_groups.append((defects[i]['defect'].get_hash(), [defects[i]['task_id'] for i in matches]))
+                all_groups.append(
+                    (self.__get_defect_identifer(defects[i]['defect']), [defects[i]['task_id'] for i in matches])
+                )
 
         self.logger.debug(f"All groups {all_groups}")
         return all_groups
@@ -373,8 +386,32 @@ def __get_defect_doc(self, defect_id):
 
         returns: DefectDoc or None
         """
-        doc = list(self.defects.query(criteria={'defect_id': defect_id}, properties=None))
-        return doc[0] if doc else None
+        doc = list(self.defects.query(criteria={**defect_id}, properties=None))
+        return DefectDoc(**doc[0]) if doc else None
+
+    # TODO Should query by more than just space group symbol
+    # TODO I think I need one more identifier to uniquely ID I think... volume of reduced cell maybe?
+    def __get_defect_identifer(self, defect):
+        """
+        Get a dict of mongo queries that can be used to uniquely ID a defect. Currently this is:
+            (1) "name" the defect name which contains the class (defect type) and multiplicity
+            (2) "charge" on the defect
+            (3) "symmetry.symbol" the space group symbol for the bulk structure
+            (4) "density" of the bulk structure
+            (5) "composition_reduced" of the bulk structure
+
+        (1) and (2) identify the defect while (3)-(5) identify the host material
+        """
+        sga = SpacegroupAnalyzer(
+            defect.bulk_structure, symprec=self.settings.SYMPREC, angle_tolerance=self.settings.ANGLE_TOL
+        )
+        return {
+                   "name": defect.name,
+                   "charge": defect.charge,
+                   "symmetry.symbol": sga.get_space_group_symbol(),
+                   "density": defect.bulk_structure.density,
+                   "composition_reduced": defect.bulk_structure.composition.reduced_composition.as_dict(),
+        }
 
     def __get_dielectric(self, task_id):
         """
@@ -384,7 +421,7 @@ def __get_dielectric(self, task_id):
         """
         t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))
         struc = Structure.from_dict(t.get('output').get('structure'))
-        for diel in self.dielectric.query(criteria={self.dielectric.key: self.__get_mpid(struc)}, properties=['dielectric.total']):
+        for diel in self.dielectric.query(criteria={self.dielectric.key: get_mpid(struc)}, properties=['dielectric.total']):
             return diel['dielectric']['total']
         return None
 
@@ -408,6 +445,7 @@ def __get_item_bundle(self, bulk_tasks, defect_task_group):
             in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
         ]
 
+    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC
     def __get_mpid(self, structure):
         """
         Given a structure, determine if an equivalent structure exists, with a material_id,
@@ -424,14 +462,13 @@ def __get_mpid(self, structure):
         for p in struc.site_properties:
             struc.remove_site_property(p)
         sga = SpacegroupAnalyzer(struc)
-
         sm = StructureMatcher(ltol=self.settings.LTOL, stol=self.settings.STOL, angle_tol=self.settings.ANGLE_TOL)
         data = self.materials.query(
             criteria={
                 'chemsys': struc.composition.chemical_system,
                 'spacegroup.symbol': sga.get_space_group_symbol()
             },
-            properties=['material_id', 'formation_energy_per_atom']
+            properties=['material_id']
         )
         data = filter(lambda x: sm.fit(x[0], x[1]), list(data))
         return None if not data else sorted(data, key=lambda x: x)[0]['material_id']
@@ -500,7 +537,7 @@ def __preprocess_bulk(self, task):
         """
         t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure']))
         struc = Structure.from_dict(t.get('output').get('structure'))
-        mpid = self.__get_mpid(struc)
+        mpid = get_mpid(struc)
         if not mpid:
             self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")
             return False
@@ -516,7 +553,7 @@ def __preprocess_bulk(self, task):
         return True
 
     @staticmethod
-    def __get_pristine_supercell(self, task):
+    def __get_pristine_supercell(task):
         """
         Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.
         If defect cannot be found in task, return the input structure.
@@ -675,7 +712,3 @@ def unpack(query, d):
     if isinstance(d, List):
         return unpack(query[1:], d.__getitem__(int(query.pop(0))))
     return unpack(query[1:], d.__getitem__(query.pop(0)))
-
-
-
-

From 4fd13c7981bef58962ff316ead83c4c93af24c91 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 6 Aug 2021 13:55:56 -0700
Subject: [PATCH 27/41] Extra log message.

---
 emmet-builders/emmet/builders/cp2k/defects.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 1a4d3f944a..ef31265f2c 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -237,6 +237,10 @@ def get_items(self) -> Iterator[List[Dict]]:
         bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))
         unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
 
+        if not unprocessed_defect_tasks:
+            self.logger.info("No unprocessed to tasks, exiting")
+            return
+
         self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")
         self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")
 

From ae1698f678a5d0a28c24524cafe1693fa526ef52 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 10 Aug 2021 18:32:51 -0700
Subject: [PATCH 28/41] Update.

builders/defects.py: use the defect object itself for matching
 after grouping by mpid

core/defect.py: store defect so that new builder works.
---
 emmet-builders/emmet/builders/cp2k/defects.py | 719 +-----------------
 emmet-core/emmet/core/defect.py               |   8 +-
 2 files changed, 5 insertions(+), 722 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index ef31265f2c..e136d325c3 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -1,718 +1 @@
-from datetime import datetime
-from itertools import chain, groupby, combinations
-from typing import Dict, Iterator, List, Optional
-from copy import deepcopy
-from monty.json import MontyDecoder
-
-from maggma.builders import Builder
-from maggma.stores import Store
-
-from pymatgen.core import Structure
-from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator
-from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
-
-from atomate.utils.utils import load_class
-
-from emmet.core import SETTINGS
-from emmet.core.utils import jsanitize, get_sg
-from emmet.core.defect import DefectDoc, DefectThermoDoc
-from emmet.core.cp2k.calc_types import TaskType
-from emmet.core.cp2k.calc_types.utils import run_type
-from emmet.core.cp2k.material import MaterialsDoc
-from emmet.builders.settings import EmmetBuildSettings
-from emmet.builders.cp2k.utils import get_mpid
-
-
-__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
-__maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"
-
-
-class DefectBuilder(Builder):
-    """
-    The DefectBuilder collects task documents performed on structures containing a single point defect.
-    The builder is intended to group tasks corresponding to the same defect (species including charge state),
-    find the best ones, and perform finite-size defect corrections to create a defect document. These
-    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.
-
-    In order to make the build process easier, an entry must exist inside of the task doc that identifies it
-    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,
-    this may be changed to having a defect transformation in the transformation history.
-
-    The process is as follows:
-
-        1.) Find all documents containing the defect query.
-        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already
-            calculated. These are the candidate bulk tasks.
-        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites
-            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding
-            bulk calculation.
-        4.) Convert (defect, bulk task) doc pairs to DefectDocs
-        5.) Post-process and validate defect document
-        6.) Update the defect store
-    """
-
-    # TODO: should dielectric/electronic_structure be optional or required?
-    def __init__(
-        self,
-        tasks: Store,
-        defects: Store,
-        dielectric: Store,
-        electronic_structure: Store,
-        materials: Store,
-        task_validation: Optional[Store] = None,
-        query: Optional[Dict] = None,
-        allowed_task_types: Optional[List[str]] = None,
-        settings: Optional[EmmetBuildSettings] = None,
-        **kwargs,
-    ):
-        """
-        Args:
-            tasks: Store of task documents
-            defects: Store of defect documents to generate
-            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property
-            allowed_task_types: list of task_types that can be processed
-            symprec: tolerance for SPGLib spacegroup finding
-            ltol: StructureMatcher tuning parameter for matching tasks to materials
-            stol: StructureMatcher tuning parameter for matching tasks to materials
-            angle_tol: StructureMatcher tuning parameter for matching tasks to materials
-        """
-
-        self.tasks = tasks
-        self.defects = defects
-        self.materials = materials
-        self.dielectric = dielectric
-        self.electronic_structure = electronic_structure
-
-        self.task_validation = task_validation
-        self.allowed_task_types = (
-            [t.value for t in TaskType]
-            if allowed_task_types is None
-            else allowed_task_types
-        )
-
-        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
-        self.settings = EmmetBuildSettings.autoload(settings)
-        self.query = query if query else {}
-        self.timestamp = None
-        self.kwargs = kwargs
-
-        sources = [tasks, dielectric, electronic_structure, materials]
-        if self.task_validation:
-            sources.append(self.task_validation)
-        super().__init__(sources=sources, targets=[defects], **kwargs)
-
-    @property
-    def defect_query(self) -> str:
-        """
-        The standard query for defect tasks. Update this if
-        schema changes in the future.
-
-        For example, if top level key exists 'defect' can be returned.
-        Alternatively, if an initial defect transformation was performed, then
-        you can check via 'transformations.history.0.defect'
-        """
-        return 'defect'
-
-    @property
-    def identifying_defect_properties(self):
-        return [
-            'charge'
-        ]
-
-    @property
-    def required_defect_properties(self) -> List:
-        return [
-            self.defect_query,
-            'output.energy',
-            'output.v_hartree_grid',
-            'output.v_hartree',
-            'output.structure',
-            'input',
-            'transformations',
-            'task_id',
-            'nsites'
-        ]
-
-    @property
-    def optional_defect_properties(self) -> List:
-        return [
-            'last_updated',
-            'created_on',
-            'tags'
-        ]
-
-    @property
-    def required_bulk_properties(self) -> List:
-        return [
-            'output.energy',
-            'output.v_hartree_grid',
-            'output.v_hartree',
-            'output.structure',
-            'output.vbm',
-            'output.cbm',
-            'input',
-            'transformations',
-        ]
-
-    def ensure_indexes(self):
-        """
-        Ensures indicies on the tasks and materials collections
-        """
-
-        # Basic search index for tasks
-        self.tasks.ensure_index("task_id")
-        self.tasks.ensure_index("last_updated")
-        self.tasks.ensure_index("state")
-        self.tasks.ensure_index("formula_pretty")
-
-        # Search index for materials
-        self.materials.ensure_index("material_id")
-        self.materials.ensure_index("last_updated")
-        self.materials.ensure_index("task_ids")
-
-        # Search index for materials
-        self.defects.ensure_index("material_id")
-        self.defects.ensure_index("defect_id")
-        self.defects.ensure_index("last_updated")
-        self.defects.ensure_index("task_ids")
-
-        if self.task_validation:
-            self.task_validation.ensure_index("task_id")
-            self.task_validation.ensure_index("valid")
-
-    def prechunk(self, number_splits: int) -> Iterator[Dict]:
-        raise NotImplementedError
-
-    def get_items(self) -> Iterator[List[Dict]]:
-        """
-        Gets all items to process into defect documents.
-        This does no datetime checking; relying on on whether
-        task_ids are included in the Defect Collection.
-
-        The procedure is as follows:
-
-            1. Get all tasks with standard "defect" query tag
-            2. Filter all tasks by skipping tasks which are already in the Defect Store
-            3. Get all tasks that could be used as bulk
-
-
-        Returns:
-            generator or list relevant tasks and materials to process into materials documents
-        """
-
-        self.logger.info("Defect builder started")
-        self.logger.info(
-            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"
-        )
-
-        self.logger.info("Setting indexes")
-        self.ensure_indexes()
-
-        # Save timestamp to mark buildtime for material documents
-        self.timestamp = datetime.utcnow()
-
-        # Get all tasks
-        self.logger.info("Finding tasks to process")
-        temp_query = dict(self.query)
-        temp_query["state"] = "successful"
-
-        all_tasks = {
-            doc[self.tasks.key]
-            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
-        }
-
-        temp_query.update({self.defect_query: {'$exists': True}})
-        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})
-        defect_tasks = {
-            doc[self.tasks.key]
-            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
-        }
-
-        processed_defect_tasks = {
-            t_id
-            for d in self.defects.query({}, ["task_ids"])
-            for t_id in d.get("task_ids", [])
-        }
-
-        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))
-        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
-
-        if not unprocessed_defect_tasks:
-            self.logger.info("No unprocessed to tasks, exiting")
-            return
-
-        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")
-        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")
-
-        # Set total for builder bars to have a total
-        self.total = len(unprocessed_defect_tasks)
-
-        if self.task_validation:
-            invalid_ids = {
-                doc[self.tasks.key]
-                for doc in self.task_validation.query(
-                    {"is_valid": False}, [self.task_validation.key]
-                )
-            }
-        else:
-            invalid_ids = set()
-
-        for t in bulk_tasks.union(unprocessed_defect_tasks):
-            for doc in self.tasks.query({self.tasks.key: t}):
-                if t in invalid_ids:
-                    doc["is_valid"] = False
-                else:
-                    doc["is_valid"] = True
-
-        # yield list of defects that are of the same type, matched to an appropriate bulk calc
-        self.logger.info(f"Starting defect matching.")
-
-        for defect_id, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):
-            yield self.__get_defect_doc(defect_id), self.__get_item_bundle(bulk_tasks, defect_task_group)
-
-    def process_item(self, items):
-        """
-        Process a group of defect tasks that correspond to the same defect into a single defect
-        document. If the DefectDoc already exists, then update it and return it. If it does not,
-        create a new DefectDoc
-
-        Args:
-            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]
-
-        returns: the defect document as a dictionary
-        """
-        defect_doc, item_bundle = items
-        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
-        if item_bundle:
-            if defect_doc:
-                defect_doc.update_all(item_bundle, query=self.defect_query)
-            else:
-                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query)
-            return defect_doc.dict()
-
-    def update_targets(self, items):
-        """
-        Inserts the new task_types into the task_types collection
-        """
-
-        items = [item for item in items if item]
-
-        if len(items) > 0:
-            self.logger.info(f"Updating {len(items)} defects")
-            for item in items:
-                item.update({"_bt": self.timestamp})
-                self.defects.remove_docs(
-                    {
-                       "name": item['name'],
-                       "symmetry.symbol": item['symmetry']['symbol'],
-                       "density": item['density'],
-                       "composition_reduced": item['composition_reduced'].as_dict(),
-                       "charge": item['charge'],
-                    }
-                )
-            self.defects.update(
-                docs=jsanitize(items, allow_bson=True),
-                key='task_ids',
-            )
-        else:
-            self.logger.info("No items to update")
-
-    def __filter_and_group_tasks(self, tasks):
-        """
-        Groups defect tasks. Tasks are grouped according to the reduced representation
-        of the defect, and so tasks with different settings (e.g. supercell size, functional)
-        will be grouped together.
-
-        Args:
-            tasks: task_ids for unprocessed defects
-
-        returns:
-            [(defect_id, [task_ids]), ...] where task_ids correspond to the same defect, which
-            is uniquely labeled by defect_id
-        """
-
-        props = [
-            self.defect_query,
-            'task_id'
-        ]
-
-        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")
-
-        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)
-        defects = [
-            {'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t)}
-            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)
-        ]
-
-        def key(x):
-            s = x.get('defect').bulk_structure
-            return get_sg(s), s.composition.reduced_composition
-
-        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))
-        all_groups = []
-
-        # For each pre-grouped list of structures, perform actual matching.
-        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):
-            unmatched = list(g)
-            while len(unmatched) > 0:
-                i, refs = unmatched.pop(0)
-                matches = [i]
-
-                inds = filter(
-                    lambda i: pdc.are_equal(refs['defect'], unmatched[i][1]['defect']),
-                    list(range(len(unmatched))),
-                )
-
-                inds = list(inds)
-                matches.extend([unmatched[i][0] for i in inds])
-                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]
-                all_groups.append(
-                    (self.__get_defect_identifer(defects[i]['defect']), [defects[i]['task_id'] for i in matches])
-                )
-
-        self.logger.debug(f"All groups {all_groups}")
-        return all_groups
-
-    def __get_defect_from_task(self, task):
-        """
-        Using the defect_query property, retrieve a pymatgen defect object from the task document
-        """
-        defect = unpack(self.defect_query.split('.'), task)
-        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
-        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
-
-    def __get_defect_doc(self, defect_id):
-        """
-        Given a unique defect id, retrieve the defect document from the defect store if it exists
-
-        Args:
-            defect_id: Unique defect ID created by the Defect.get_hash() function
-
-        returns: DefectDoc or None
-        """
-        doc = list(self.defects.query(criteria={**defect_id}, properties=None))
-        return DefectDoc(**doc[0]) if doc else None
-
-    # TODO Should query by more than just space group symbol
-    # TODO I think I need one more identifier to uniquely ID I think... volume of reduced cell maybe?
-    def __get_defect_identifer(self, defect):
-        """
-        Get a dict of mongo queries that can be used to uniquely ID a defect. Currently this is:
-            (1) "name" the defect name which contains the class (defect type) and multiplicity
-            (2) "charge" on the defect
-            (3) "symmetry.symbol" the space group symbol for the bulk structure
-            (4) "density" of the bulk structure
-            (5) "composition_reduced" of the bulk structure
-
-        (1) and (2) identify the defect while (3)-(5) identify the host material
-        """
-        sga = SpacegroupAnalyzer(
-            defect.bulk_structure, symprec=self.settings.SYMPREC, angle_tolerance=self.settings.ANGLE_TOL
-        )
-        return {
-                   "name": defect.name,
-                   "charge": defect.charge,
-                   "symmetry.symbol": sga.get_space_group_symbol(),
-                   "density": defect.bulk_structure.density,
-                   "composition_reduced": defect.bulk_structure.composition.reduced_composition.as_dict(),
-        }
-
-    def __get_dielectric(self, task_id):
-        """
-        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store
-        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would
-        be the case for metallic systems, return None.
-        """
-        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))
-        struc = Structure.from_dict(t.get('output').get('structure'))
-        for diel in self.dielectric.query(criteria={self.dielectric.key: get_mpid(struc)}, properties=['dielectric.total']):
-            return diel['dielectric']['total']
-        return None
-
-    def __get_item_bundle(self, bulk_tasks, defect_task_group):
-        """
-        Gets a group of items that can be processed together into a defect document.
-
-        Args:
-            bulk_tasks: possible bulk tasks to match to defects
-            defect_task_group: group of equivalent defects (defined by PointDefectComparator)
-
-        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]
-        """
-        return [
-            (
-                next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),
-                next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),
-                self.__get_dielectric(bulk_tasks_id),
-            )
-            for defect_tasks_id, bulk_tasks_id
-            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
-        ]
-
-    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC
-    def __get_mpid(self, structure):
-        """
-        Given a structure, determine if an equivalent structure exists, with a material_id,
-        in the materials store.
-
-        Args:
-            structure: Candidate structure
-
-        returns: material_id, if one exists, else None
-        """
-        struc = deepcopy(structure)
-        struc.remove_oxidation_states()
-        struc.remove_spin()
-        for p in struc.site_properties:
-            struc.remove_site_property(p)
-        sga = SpacegroupAnalyzer(struc)
-        sm = StructureMatcher(ltol=self.settings.LTOL, stol=self.settings.STOL, angle_tol=self.settings.ANGLE_TOL)
-        data = self.materials.query(
-            criteria={
-                'chemsys': struc.composition.chemical_system,
-                'spacegroup.symbol': sga.get_space_group_symbol()
-            },
-            properties=['material_id']
-        )
-        data = filter(lambda x: sm.fit(x[0], x[1]), list(data))
-        return None if not data else sorted(data, key=lambda x: x)[0]['material_id']
-    
-    def __match_defects_to_bulks(self, bulk_ids, defect_ids):
-        """
-        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has
-        commensurate:
-
-            - Composition
-            - Number of sites
-            - Symmetry
-
-        """
-
-        self.logger.debug(f"Finding bulk/defect task combinations.")
-
-        props = [
-            'task_id',
-            'input',
-            'nsites',
-            'output.structure'
-            'transformations',
-            'defect',
-            'scale',
-        ]
-
-        bulks = list(self.tasks.query(criteria={'task_id': {'$in': list(bulk_ids)}}, properties=props))
-        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))
-
-        sm = StructureMatcher(
-            ltol=SETTINGS.LTOL,
-            stol=SETTINGS.STOL,
-            angle_tol=SETTINGS.ANGLE_TOL,
-            primitive_cell=False,
-            scale=True,
-            attempt_supercell=True,
-            allow_subset=False,
-            comparator=ElementComparator(),
-        )
-
-        def _compare(b, d):
-            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \
-                run_type(d.get('input').get('dft')).value.split('+U')[0] and \
-                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):
-                if abs(b['nsites'] - d['nsites']) <= 1:
-                    return True
-            return False
-
-        pairs = [
-            (defect['task_id'], bulk['task_id'])
-            for bulk in bulks
-            for defect in defects
-            if _compare(bulk, defect)
-        ]
-        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
-        return pairs
-
-    def __preprocess_bulk(self, task):
-        """
-        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk
-        tasks must have:
-
-            (1) Correspond to an existing material_id in the materials store
-            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store
-        """
-        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure']))
-        struc = Structure.from_dict(t.get('output').get('structure'))
-        mpid = get_mpid(struc)
-        if not mpid:
-            self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")
-            return False
-
-        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))
-        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})
-        if dos.get_gap():
-            diel = next(self.dielectric.query(criteria={self.dielectric.key: mpid}))
-            if diel is None:
-                self.logger.info(f"Task {task} for composition {struc.composition} requires"
-                                 f"dielectric properties, but none found in dielectric store")
-                return False
-        return True
-
-    @staticmethod
-    def __get_pristine_supercell(task):
-        """
-        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.
-        If defect cannot be found in task, return the input structure.
-        """
-        if task.get('defect'):
-            return load_class(
-                task['defect']['@module'], task['defect']['@class']
-            ).from_dict(task['defect']).bulk_structure
-        elif task.get('transformations'):
-            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])
-        return Structure.from_dict(task['input']['structure'])
-
-
-class DefectThermoBuilder(Builder):
-
-    """
-    This builder creates collections of the DefectThermoDoc object.
-
-        (1) Find all DefectDocs that correspond to the same bulk material
-            given by material_id
-        (2) Create a new DefectThermoDoc for all of those documents
-        (3) Insert/Update the defect_thermos store with the new documents
-    """
-
-    def __init__(
-            self,
-            defects: Store,
-            defect_thermos: Store,
-            materials: Store,
-            electronic_structures: Store,
-            query: Optional[Dict] = None,
-            **kwargs,
-    ):
-        """
-        Args:
-            defects: Store of defect documents (generated by DefectBuilder)
-            defect_thermos: Store of DefectThermoDocs to generate.
-            materials: Store of MaterialDocs to construct phase diagram
-            electronic_structures: Store of DOS objects
-            query: dictionary to limit tasks to be analyzed
-        """
-
-        self.defects = defects
-        self.defect_thermos = defect_thermos
-        self.materials = materials
-        self.electronic_structures = electronic_structures
-
-        self.query = query if query else {}
-        self.timestamp = None
-        self.kwargs = kwargs
-
-        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)
-
-    def ensure_indexes(self):
-        """
-        Ensures indicies on the collections
-        """
-
-        # Basic search index for tasks
-        self.defects.ensure_index("material_id")
-        self.defects.ensure_index("defect_id")
-
-        # Search index for materials
-        self.defect_thermos.ensure_index("material_id")
-
-    # TODO need to only process new tasks. Fast builder so currently is OK for small collections
-    def get_items(self) -> Iterator[List[Dict]]:
-        """
-        Gets items to process into DefectThermoDocs.
-
-        returns:
-            iterator yielding tuples containing:
-                - group of DefectDocs belonging to the same bulk material as indexed by material_id,
-                - materials in the chemsys of the bulk material for constructing phase diagram
-                - Dos of the bulk material for constructing phase diagrams/getting doping
-
-        """
-
-        self.logger.info("Defect thermo builder started")
-        self.logger.info("Setting indexes")
-        self.ensure_indexes()
-
-        # Save timestamp to mark build time for defect thermo documents
-        self.timestamp = datetime.utcnow()
-
-        # Get all tasks
-        self.logger.info("Finding tasks to process")
-        temp_query = dict(self.query)
-        temp_query["state"] = "successful"
-
-        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks
-
-        all_docs = [doc for doc in self.defects.query()]
-        for key, group in groupby(sorted(all_docs, key=lambda x: x['material_id']), key=lambda x: x['material_id']):
-            group = [g for g in group]
-            yield (group, self.__get_materials(key), self.__get_electronic_structure(group))
-
-    def process_item(self, docs):
-        """
-        Process a group of defects belonging to the same material into a defect thermo doc
-        """
-        self.logger.info(f"Processing defects")
-        defects, materials, elec_struc = docs
-        defects = [DefectDoc(**d) for d in defects]
-        materials = [MaterialsDoc(**m) for m in materials]
-        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)
-        return defect_thermo_doc.dict()
-
-    def update_targets(self, items):
-        """
-        Inserts the new DefectThermoDocs into the defect_thermos store
-        """
-        items = [item for item in items if item]
-        for item in items:
-            item.update({"_bt": self.timestamp})
-
-        if len(items) > 0:
-            self.logger.info(f"Updating {len(items)} defect thermo docs")
-            self.defect_thermos.update(
-                docs=jsanitize(items, allow_bson=True),
-                key=self.defect_thermos.key,
-            )
-        else:
-            self.logger.info("No items to update")
-
-    def __get_electronic_structure(self, group):
-        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))
-
-    def __get_materials(self, key) -> List:
-        """
-        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the
-        materials store.
-        """
-        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))
-        if not bulk:
-            raise LookupError
-        elements = bulk[0]['chemsys']
-
-        if isinstance(elements, str):
-            elements = elements.split("-")
-
-        all_chemsyses = []
-        for i in range(len(elements)):
-            for els in combinations(elements, i + 1):
-                all_chemsyses.append("-".join(sorted(els)))
-
-        return list(chain(self.materials.query(criteria={"chemsys": {"$in": all_chemsyses}}, properties=None), bulk))
-
-
-def unpack(query, d):
-    """
-    Unpack a mongo-style query into dictionary retrieval
-    """
-    if not query:
-        return d
-    if isinstance(d, List):
-        return unpack(query[1:], d.__getitem__(int(query.pop(0))))
-    return unpack(query[1:], d.__getitem__(query.pop(0)))
+from datetime import datetimefrom itertools import chain, groupby, combinationsfrom typing import Dict, Iterator, List, Optionalfrom copy import deepcopyfrom monty.json import MontyDecoderfrom maggma.builders import Builderfrom maggma.stores import Storefrom pymatgen.core import Structurefrom pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparatorfrom pymatgen.symmetry.analyzer import SpacegroupAnalyzerfrom atomate.utils.utils import load_classfrom emmet.core import SETTINGSfrom emmet.core.utils import jsanitize, get_sgfrom emmet.core.defect import DefectDoc, DefectThermoDocfrom emmet.core.cp2k.calc_types import TaskTypefrom emmet.core.cp2k.calc_types.utils import run_typefrom emmet.core.cp2k.material import MaterialsDocfrom emmet.builders.settings import EmmetBuildSettingsfrom emmet.builders.cp2k.utils import get_mpid__author__ = "Nicholas Winner <nwinner@berkeley.edu>"__maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"class DefectBuilder(Builder):    """    The DefectBuilder collects task documents performed on structures containing a single point defect.    The builder is intended to group tasks corresponding to the same defect (species including charge state),    find the best ones, and perform finite-size defect corrections to create a defect document. These    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.    In order to make the build process easier, an entry must exist inside of the task doc that identifies it    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,    this may be changed to having a defect transformation in the transformation history.    The process is as follows:        1.) Find all documents containing the defect query.        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already            calculated. These are the candidate bulk tasks.        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding            bulk calculation.        4.) Convert (defect, bulk task) doc pairs to DefectDocs        5.) Post-process and validate defect document        6.) Update the defect store    """    # TODO: should dielectric/electronic_structure be optional or required?    def __init__(        self,        tasks: Store,        defects: Store,        dielectric: Store,        electronic_structure: Store,        materials: Store,        task_validation: Optional[Store] = None,        query: Optional[Dict] = None,        allowed_task_types: Optional[List[str]] = None,        settings: Optional[EmmetBuildSettings] = None,        **kwargs,    ):        """        Args:            tasks: Store of task documents            defects: Store of defect documents to generate            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property            allowed_task_types: list of task_types that can be processed            symprec: tolerance for SPGLib spacegroup finding            ltol: StructureMatcher tuning parameter for matching tasks to materials            stol: StructureMatcher tuning parameter for matching tasks to materials            angle_tol: StructureMatcher tuning parameter for matching tasks to materials        """        self.tasks = tasks        self.defects = defects        self.materials = materials        self.dielectric = dielectric        self.electronic_structure = electronic_structure        self.task_validation = task_validation        self.allowed_task_types = (            [t.value for t in TaskType]            if allowed_task_types is None            else allowed_task_types        )        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}        self.settings = EmmetBuildSettings.autoload(settings)        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        sources = [tasks, dielectric, electronic_structure, materials]        if self.task_validation:            sources.append(self.task_validation)        super().__init__(sources=sources, targets=[defects], **kwargs)    @property    def defect_query(self) -> str:        """        The standard query for defect tasks. Update this if        schema changes in the future.        For example, if top level key exists 'defect' can be returned.        Alternatively, if an initial defect transformation was performed, then        you can check via 'transformations.history.0.defect'        """        return 'defect'    @property    def identifying_defect_properties(self):        return [            'charge'        ]    @property    def required_defect_properties(self) -> List:        return [            self.defect_query,            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'input',            'transformations',            'task_id',            'nsites'        ]    @property    def optional_defect_properties(self) -> List:        return [            'last_updated',            'created_on',            'tags'        ]    @property    def required_bulk_properties(self) -> List:        return [            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'output.vbm',            'output.cbm',            'input',            'transformations',        ]    def ensure_indexes(self):        """        Ensures indicies on the tasks and materials collections        """        # Basic search index for tasks        self.tasks.ensure_index("task_id")        self.tasks.ensure_index("last_updated")        self.tasks.ensure_index("state")        self.tasks.ensure_index("formula_pretty")        # Search index for materials        self.materials.ensure_index("material_id")        self.materials.ensure_index("last_updated")        self.materials.ensure_index("task_ids")        # Search index for materials        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        self.defects.ensure_index("last_updated")        self.defects.ensure_index("task_ids")        if self.task_validation:            self.task_validation.ensure_index("task_id")            self.task_validation.ensure_index("valid")    def prechunk(self, number_splits: int) -> Iterator[Dict]:        raise NotImplementedError    def get_items(self) -> Iterator[List[Dict]]:        """        Gets all items to process into defect documents.        This does no datetime checking; relying on on whether        task_ids are included in the Defect Collection.        The procedure is as follows:            1. Get all tasks with standard "defect" query tag            2. Filter all tasks by skipping tasks which are already in the Defect Store            3. Get all tasks that could be used as bulk        Returns:            generator or list relevant tasks and materials to process into materials documents        """        self.logger.info("Defect builder started")        self.logger.info(            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"        )        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark buildtime for material documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        all_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        temp_query.update({self.defect_query: {'$exists': True}})        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})        defect_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        processed_defect_tasks = {            t_id            for d in self.defects.query({}, ["task_ids"])            for t_id in d.get("task_ids", [])        }        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks        if not unprocessed_defect_tasks:            self.logger.info("No unprocessed to tasks, exiting")            return        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")        # Set total for builder bars to have a total        self.total = len(unprocessed_defect_tasks)        if self.task_validation:            invalid_ids = {                doc[self.tasks.key]                for doc in self.task_validation.query(                    {"is_valid": False}, [self.task_validation.key]                )            }        else:            invalid_ids = set()        for t in bulk_tasks.union(unprocessed_defect_tasks):            for doc in self.tasks.query({self.tasks.key: t}):                if t in invalid_ids:                    doc["is_valid"] = False                else:                    doc["is_valid"] = True        # yield list of defects that are of the same type, matched to an appropriate bulk calc        self.logger.info(f"Starting defect matching.")        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):            yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)    def process_item(self, items):        """        Process a group of defect tasks that correspond to the same defect into a single defect        document. If the DefectDoc already exists, then update it and return it. If it does not,        create a new DefectDoc        Args:            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]        returns: the defect document as a dictionary        """        defect_doc, item_bundle = items        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")        if item_bundle:            if defect_doc:                defect_doc.update_all(item_bundle, query=self.defect_query)            else:                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query)            return defect_doc.dict()    def update_targets(self, items):        """        Inserts the new task_types into the task_types collection        """        items = [item for item in items if item]        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defects")            for item in items:                item.update({"_bt": self.timestamp})                self.defects.remove_docs(                    {                       "task_ids": {'$in': item['task_ids']},                    }                )            self.defects.update(                docs=jsanitize(items, allow_bson=True),                key='task_ids',            )        else:            self.logger.info("No items to update")    def __filter_and_group_tasks(self, tasks):        """        Groups defect tasks. Tasks are grouped according to the reduced representation        of the defect, and so tasks with different settings (e.g. supercell size, functional)        will be grouped together.        Args:            tasks: task_ids for unprocessed defects        returns:            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect        """        props = [            self.defect_query,            'task_id'        ]        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)        defects = [            {'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t)}            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)        ]        def key(x):            s = x.get('defect').bulk_structure            return get_sg(s), s.composition.reduced_composition        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))        all_groups = []        # For each pre-grouped list of structures, perform actual matching.        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):            unmatched = list(g)            while len(unmatched) > 0:                i, refs = unmatched.pop(0)                matches = [i]                inds = filter(                    lambda i: pdc.are_equal(refs['defect'], unmatched[i][1]['defect']),                    list(range(len(unmatched))),                )                inds = list(inds)                matches.extend([unmatched[i][0] for i in inds])                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]                all_groups.append(                    (defects[i]['defect'], [defects[i]['task_id'] for i in matches])                )        self.logger.debug(f"All groups {all_groups}")        return all_groups    def __get_defect_from_task(self, task):        """        Using the defect_query property, retrieve a pymatgen defect object from the task document        """        defect = unpack(self.defect_query.split('.'), task)        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})    def __get_defect_doc(self, defect):        """        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists        returns: DefectDoc or None        """        material_id = self.__get_mpid(defect.bulk_structure)        docs = [            DefectDoc(**doc)            for doc in self.defects.query(criteria={'material_id': material_id}, properties=['defect'])        ]        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True)        for doc in docs:            if pdc.are_equal(defect, doc.defect):                return doc        return None    def __get_dielectric(self, task_id):        """        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would        be the case for metallic systems, return None.        """        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))        struc = Structure.from_dict(t.get('output').get('structure'))        for diel in self.dielectric.query(criteria={self.dielectric.key: get_mpid(struc)}, properties=['dielectric.total']):            return diel['dielectric']['total']        return None    def __get_item_bundle(self, bulk_tasks, defect_task_group):        """        Gets a group of items that can be processed together into a defect document.        Args:            bulk_tasks: possible bulk tasks to match to defects            defect_task_group: group of equivalent defects (defined by PointDefectComparator)        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]        """        return [            (                next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),                next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),                self.__get_dielectric(bulk_tasks_id),            )            for defect_tasks_id, bulk_tasks_id            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)        ]    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC    def __get_mpid(self, structure):        """        Given a structure, determine if an equivalent structure exists, with a material_id,        in the materials store.        Args:            structure: Candidate structure        returns: material_id, if one exists, else None        """        struc = deepcopy(structure)        struc.remove_oxidation_states()        struc.remove_spin()        for p in struc.site_properties:            struc.remove_site_property(p)        sga = SpacegroupAnalyzer(struc)        sm = StructureMatcher(ltol=self.settings.LTOL, stol=self.settings.STOL, angle_tol=self.settings.ANGLE_TOL)        data = self.materials.query(            criteria={                'chemsys': struc.composition.chemical_system,                'symmetry.symbol': sga.get_space_group_symbol()            },            properties=['material_id', 'structure']        )        data = list(filter(lambda x: sm.fit(struc, Structure.from_dict(x['structure'])), list(data)))        return None if not data else data[0]['material_id']        def __match_defects_to_bulks(self, bulk_ids, defect_ids):        """        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has        commensurate:            - Composition            - Number of sites            - Symmetry        """        self.logger.debug(f"Finding bulk/defect task combinations.")        props = [            'task_id',            'input',            'nsites',            'output.structure'            'transformations',            'defect',            'scale',        ]        bulks = list(self.tasks.query(criteria={'task_id': {'$in': list(bulk_ids)}}, properties=props))        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))        sm = StructureMatcher(            ltol=SETTINGS.LTOL,            stol=SETTINGS.STOL,            angle_tol=SETTINGS.ANGLE_TOL,            primitive_cell=False,            scale=True,            attempt_supercell=True,            allow_subset=False,            comparator=ElementComparator(),        )        def _compare(b, d):            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \                run_type(d.get('input').get('dft')).value.split('+U')[0] and \                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):                if abs(b['nsites'] - d['nsites']) <= 1:                    return True            return False        pairs = [            (defect['task_id'], bulk['task_id'])            for bulk in bulks            for defect in defects            if _compare(bulk, defect)        ]        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")        return pairs    def __preprocess_bulk(self, task):        """        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk        tasks must have:            (1) Correspond to an existing material_id in the materials store            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store        """        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure']))        struc = Structure.from_dict(t.get('output').get('structure'))        mpid = get_mpid(struc)        if not mpid:            self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")            return False        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})        if dos.get_gap():            diel = next(self.dielectric.query(criteria={self.dielectric.key: mpid}))            if diel is None:                self.logger.info(f"Task {task} for composition {struc.composition} requires"                                 f"dielectric properties, but none found in dielectric store")                return False        return True    @staticmethod    def __get_pristine_supercell(task):        """        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.        If defect cannot be found in task, return the input structure.        """        if task.get('defect'):            return load_class(                task['defect']['@module'], task['defect']['@class']            ).from_dict(task['defect']).bulk_structure        elif task.get('transformations'):            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])        return Structure.from_dict(task['input']['structure'])class DefectThermoBuilder(Builder):    """    This builder creates collections of the DefectThermoDoc object.        (1) Find all DefectDocs that correspond to the same bulk material            given by material_id        (2) Create a new DefectThermoDoc for all of those documents        (3) Insert/Update the defect_thermos store with the new documents    """    def __init__(            self,            defects: Store,            defect_thermos: Store,            materials: Store,            electronic_structures: Store,            query: Optional[Dict] = None,            **kwargs,    ):        """        Args:            defects: Store of defect documents (generated by DefectBuilder)            defect_thermos: Store of DefectThermoDocs to generate.            materials: Store of MaterialDocs to construct phase diagram            electronic_structures: Store of DOS objects            query: dictionary to limit tasks to be analyzed        """        self.defects = defects        self.defect_thermos = defect_thermos        self.materials = materials        self.electronic_structures = electronic_structures        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)    def ensure_indexes(self):        """        Ensures indicies on the collections        """        # Basic search index for tasks        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        # Search index for materials        self.defect_thermos.ensure_index("material_id")    # TODO need to only process new tasks. Fast builder so currently is OK for small collections    def get_items(self) -> Iterator[List[Dict]]:        """        Gets items to process into DefectThermoDocs.        returns:            iterator yielding tuples containing:                - group of DefectDocs belonging to the same bulk material as indexed by material_id,                - materials in the chemsys of the bulk material for constructing phase diagram                - Dos of the bulk material for constructing phase diagrams/getting doping        """        self.logger.info("Defect thermo builder started")        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark build time for defect thermo documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks        all_docs = [doc for doc in self.defects.query()]        for key, group in groupby(sorted(all_docs, key=lambda x: x['material_id']), key=lambda x: x['material_id']):            group = [g for g in group]            yield (group, self.__get_materials(key), self.__get_electronic_structure(group))    def process_item(self, docs):        """        Process a group of defects belonging to the same material into a defect thermo doc        """        self.logger.info(f"Processing defects")        defects, materials, elec_struc = docs        defects = [DefectDoc(**d) for d in defects]        materials = [MaterialsDoc(**m) for m in materials]        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)        return defect_thermo_doc.dict()    def update_targets(self, items):        """        Inserts the new DefectThermoDocs into the defect_thermos store        """        items = [item for item in items if item]        for item in items:            item.update({"_bt": self.timestamp})        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defect thermo docs")            self.defect_thermos.update(                docs=jsanitize(items, allow_bson=True),                key=self.defect_thermos.key,            )        else:            self.logger.info("No items to update")    def __get_electronic_structure(self, group):        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))    def __get_materials(self, key) -> List:        """        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the        materials store.        """        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))        if not bulk:            raise LookupError        elements = bulk[0]['chemsys']        if isinstance(elements, str):            elements = elements.split("-")        all_chemsyses = []        for i in range(len(elements)):            for els in combinations(elements, i + 1):                all_chemsyses.append("-".join(sorted(els)))        return list(chain(self.materials.query(criteria={"chemsys": {"$in": all_chemsyses}}, properties=None), bulk))def unpack(query, d):    """    Unpack a mongo-style query into dictionary retrieval    """    if not query:        return d    if isinstance(d, List):        return unpack(query[1:], d.__getitem__(int(query.pop(0))))    return unpack(query[1:], d.__getitem__(query.pop(0)))
\ No newline at end of file
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index 4ba9cbe5b0..343fe3de20 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -6,7 +6,7 @@
 from pydantic import Field, validator, BaseModel
 
 from pymatgen.core import Structure, Composition, Element
-from pymatgen.analysis.defects.core import DefectEntry
+from pymatgen.analysis.defects.core import DefectEntry, Defect
 from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
 from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, DefectPredominanceDiagram
 from pymatgen.electronic_structure.dos import CompleteDos
@@ -40,9 +40,9 @@ class Config:
 
     property_name: ClassVar[str] = "defect"
 
-    name: str = Field(None, description="Name of this defect")
+    defect: Defect = Field(None, description="Pymatgen defect object for this defect doc")
 
-    charge: int = Field(None, description="Charge of this defect")
+    name: str = Field(None, description="Name of this defect as generated by the defect object")
 
     material_id: MPID = Field(None, description="Unique material ID for the bulk material")
 
@@ -198,8 +198,8 @@ def _sort(x):
                 'tasks': final_tasks,
                 'material_id': best_entry.parameters['material_id'],
                 'entry_ids': {rt: entries[rt].entry_id for rt in entries},
+                'defect': best_entry.defect,
                 'name': best_entry.defect.name,
-                'charge': best_entry.defect.charge,
         }
         prim = SpacegroupAnalyzer(best_entry.defect.bulk_structure).get_primitive_standard_structure()
         data.update(StructureMetadata.from_structure(prim).dict())

From 0b4b103ea57d99b2548cecb3939f3c0d75b387a0 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Thu, 26 Aug 2021 11:38:24 -0700
Subject: [PATCH 29/41] Add R2SCAN option

---
 emmet-core/emmet/core/cp2k/calc_types/enums.py       | 12 ++++++++++++
 emmet-core/emmet/core/cp2k/calc_types/run_types.yaml |  8 ++++++++
 emmet-core/emmet/core/cp2k/calc_types/utils.py       |  6 ++++--
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/emmet-core/emmet/core/cp2k/calc_types/enums.py b/emmet-core/emmet/core/cp2k/calc_types/enums.py
index b0d280fa8e..5d5ab109eb 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/enums.py
+++ b/emmet-core/emmet/core/cp2k/calc_types/enums.py
@@ -10,12 +10,16 @@ class RunType(ValueEnum):
 
     PBE = "PBE"
     GGA = "GGA"
+    R2SCAN = "R2SCAN"
+    METAGGA = "METAGGA"
     HYBRID = "HYBRID"
     HSE06 = "HSE06"
     PB0 = "PB0"
     RSH = "RSH"
     PBE_U = "PBE+U"
     GGA_U = "GGA+U"
+    R2SCAN_U = "R2SCAN+U"
+    METAGGA_U = "METAGGA+U"
     HYBRID_U = "HYBRID+U"
     HSE06_U = "HSE06+U"
     PB0_U = "PB0+U"
@@ -38,6 +42,10 @@ class CalcType(ValueEnum):
     PBE_Structure_Optimization = "PBE Structure Optimization"
     GGA_Static = "GGA Static"
     GGA_Structure_Optimization = "GGA Structure Optimization"
+    R2SCAN_Static = "R2SCAN Static"
+    R2SCAN_Structure_Optimization = "R2SCAN Structure Optimization"
+    METAGGA_Static = "METAGGA Static"
+    METAGGA_Structure_Optimization = "METAGGA Structure Optimization"
     HYBRID_Static = "HYBRID Static"
     HYBRID_Structure_Optimization = "HYBRID Structure Optimization"
     HSE06_Static = "HSE06 Static"
@@ -50,6 +58,10 @@ class CalcType(ValueEnum):
     PBE_U_Structure_Optimization = "PBE+U Structure Optimization"
     GGA_U_Static = "GGA+U Static"
     GGA_U_Structure_Optimization = "GGA+U Structure Optimization"
+    R2SCAN_U_Static = "R2SCAN+U Static"
+    R2SCAN_U_Structure_Optimization = "R2SCAN+U Structure Optimization"
+    METAGGA_U_Static = "METAGGA+U Static"
+    METAGGA_U_Structure_Optimization = "METAGGA+U Structure Optimization"
     HYBRID_U_Static = "HYBRID+U Static"
     HYBRID_U_Structure_Optimization = "HYBRID+U Structure Optimization"
     HSE06_U_Static = "HSE06+U Static"
diff --git a/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
index aab07a0092..9d22ee02a8 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
+++ b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
@@ -4,6 +4,14 @@ GGA:
     FRACTION: 0
   GGA:
     GGA: --
+METAGGA:
+  R2SCAN:
+    FUNCTIONAL:
+      - MGGA_C_R2SCAN
+      - MGGA_X_R2SCAN
+    FRACTION: 0
+  METAGGA:
+    METAGGA: --
 HYBRID:
   HYBRID:
     HYBRID: --
diff --git a/emmet-core/emmet/core/cp2k/calc_types/utils.py b/emmet-core/emmet/core/cp2k/calc_types/utils.py
index 0ca93d4964..b245e93c12 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/utils.py
+++ b/emmet-core/emmet/core/cp2k/calc_types/utils.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 from typing import Dict
 from monty.serialization import loadfn
-
+from typing import Iterable
 
 from emmet.core.cp2k.calc_types.enums import RunType, TaskType, CalcType
 
@@ -24,6 +24,8 @@ def _variant_equal(v1, v2) -> bool:
         """
         if isinstance(v1, str) and isinstance(v2, str):
             return v1.strip().upper() == v2.strip().upper()
+        elif isinstance(v1, Iterable) and isinstance(v2, Iterable):
+            return set(v1) == set(v2)
         else:
             return v1 == v2
 
@@ -46,7 +48,7 @@ def _variant_equal(v1, v2) -> bool:
         for special_type, params in _RUN_TYPE_DATA[functional_class].items():
             if all(
                 [
-                    _variant_equal(parameters.get(param, None), value)
+                    _variant_equal(parameters.get(param, True), value)
                     for param, value in params.items()
                 ]
             ):

From f2b70e08f9b5e9758eed54aed7af848953124e6c Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 2 Nov 2021 15:12:11 -0700
Subject: [PATCH 30/41] Updates and refinements.

---
 emmet-builders/emmet/builders/cp2k/defects.py |  2 +-
 .../emmet/builders/cp2k/materials.py          |  3 +-
 emmet-builders/emmet/builders/cp2k/utils.py   |  3 +
 emmet-core/emmet/core/cp2k/material.py        |  3 +-
 emmet-core/emmet/core/cp2k/validation.py      | 11 ++-
 emmet-core/emmet/core/defect.py               | 78 ++++++++++++-------
 emmet-core/emmet/core/mpid.py                 |  2 +-
 7 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index e136d325c3..978ad7ed38 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -1 +1 @@
-from datetime import datetimefrom itertools import chain, groupby, combinationsfrom typing import Dict, Iterator, List, Optionalfrom copy import deepcopyfrom monty.json import MontyDecoderfrom maggma.builders import Builderfrom maggma.stores import Storefrom pymatgen.core import Structurefrom pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparatorfrom pymatgen.symmetry.analyzer import SpacegroupAnalyzerfrom atomate.utils.utils import load_classfrom emmet.core import SETTINGSfrom emmet.core.utils import jsanitize, get_sgfrom emmet.core.defect import DefectDoc, DefectThermoDocfrom emmet.core.cp2k.calc_types import TaskTypefrom emmet.core.cp2k.calc_types.utils import run_typefrom emmet.core.cp2k.material import MaterialsDocfrom emmet.builders.settings import EmmetBuildSettingsfrom emmet.builders.cp2k.utils import get_mpid__author__ = "Nicholas Winner <nwinner@berkeley.edu>"__maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"class DefectBuilder(Builder):    """    The DefectBuilder collects task documents performed on structures containing a single point defect.    The builder is intended to group tasks corresponding to the same defect (species including charge state),    find the best ones, and perform finite-size defect corrections to create a defect document. These    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.    In order to make the build process easier, an entry must exist inside of the task doc that identifies it    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,    this may be changed to having a defect transformation in the transformation history.    The process is as follows:        1.) Find all documents containing the defect query.        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already            calculated. These are the candidate bulk tasks.        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding            bulk calculation.        4.) Convert (defect, bulk task) doc pairs to DefectDocs        5.) Post-process and validate defect document        6.) Update the defect store    """    # TODO: should dielectric/electronic_structure be optional or required?    def __init__(        self,        tasks: Store,        defects: Store,        dielectric: Store,        electronic_structure: Store,        materials: Store,        task_validation: Optional[Store] = None,        query: Optional[Dict] = None,        allowed_task_types: Optional[List[str]] = None,        settings: Optional[EmmetBuildSettings] = None,        **kwargs,    ):        """        Args:            tasks: Store of task documents            defects: Store of defect documents to generate            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property            allowed_task_types: list of task_types that can be processed            symprec: tolerance for SPGLib spacegroup finding            ltol: StructureMatcher tuning parameter for matching tasks to materials            stol: StructureMatcher tuning parameter for matching tasks to materials            angle_tol: StructureMatcher tuning parameter for matching tasks to materials        """        self.tasks = tasks        self.defects = defects        self.materials = materials        self.dielectric = dielectric        self.electronic_structure = electronic_structure        self.task_validation = task_validation        self.allowed_task_types = (            [t.value for t in TaskType]            if allowed_task_types is None            else allowed_task_types        )        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}        self.settings = EmmetBuildSettings.autoload(settings)        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        sources = [tasks, dielectric, electronic_structure, materials]        if self.task_validation:            sources.append(self.task_validation)        super().__init__(sources=sources, targets=[defects], **kwargs)    @property    def defect_query(self) -> str:        """        The standard query for defect tasks. Update this if        schema changes in the future.        For example, if top level key exists 'defect' can be returned.        Alternatively, if an initial defect transformation was performed, then        you can check via 'transformations.history.0.defect'        """        return 'defect'    @property    def identifying_defect_properties(self):        return [            'charge'        ]    @property    def required_defect_properties(self) -> List:        return [            self.defect_query,            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'input',            'transformations',            'task_id',            'nsites'        ]    @property    def optional_defect_properties(self) -> List:        return [            'last_updated',            'created_on',            'tags'        ]    @property    def required_bulk_properties(self) -> List:        return [            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'output.vbm',            'output.cbm',            'input',            'transformations',        ]    def ensure_indexes(self):        """        Ensures indicies on the tasks and materials collections        """        # Basic search index for tasks        self.tasks.ensure_index("task_id")        self.tasks.ensure_index("last_updated")        self.tasks.ensure_index("state")        self.tasks.ensure_index("formula_pretty")        # Search index for materials        self.materials.ensure_index("material_id")        self.materials.ensure_index("last_updated")        self.materials.ensure_index("task_ids")        # Search index for materials        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        self.defects.ensure_index("last_updated")        self.defects.ensure_index("task_ids")        if self.task_validation:            self.task_validation.ensure_index("task_id")            self.task_validation.ensure_index("valid")    def prechunk(self, number_splits: int) -> Iterator[Dict]:        raise NotImplementedError    def get_items(self) -> Iterator[List[Dict]]:        """        Gets all items to process into defect documents.        This does no datetime checking; relying on on whether        task_ids are included in the Defect Collection.        The procedure is as follows:            1. Get all tasks with standard "defect" query tag            2. Filter all tasks by skipping tasks which are already in the Defect Store            3. Get all tasks that could be used as bulk        Returns:            generator or list relevant tasks and materials to process into materials documents        """        self.logger.info("Defect builder started")        self.logger.info(            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"        )        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark buildtime for material documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        all_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        temp_query.update({self.defect_query: {'$exists': True}})        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})        defect_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        processed_defect_tasks = {            t_id            for d in self.defects.query({}, ["task_ids"])            for t_id in d.get("task_ids", [])        }        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks        if not unprocessed_defect_tasks:            self.logger.info("No unprocessed to tasks, exiting")            return        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")        # Set total for builder bars to have a total        self.total = len(unprocessed_defect_tasks)        if self.task_validation:            invalid_ids = {                doc[self.tasks.key]                for doc in self.task_validation.query(                    {"is_valid": False}, [self.task_validation.key]                )            }        else:            invalid_ids = set()        for t in bulk_tasks.union(unprocessed_defect_tasks):            for doc in self.tasks.query({self.tasks.key: t}):                if t in invalid_ids:                    doc["is_valid"] = False                else:                    doc["is_valid"] = True        # yield list of defects that are of the same type, matched to an appropriate bulk calc        self.logger.info(f"Starting defect matching.")        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):            yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)    def process_item(self, items):        """        Process a group of defect tasks that correspond to the same defect into a single defect        document. If the DefectDoc already exists, then update it and return it. If it does not,        create a new DefectDoc        Args:            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]        returns: the defect document as a dictionary        """        defect_doc, item_bundle = items        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")        if item_bundle:            if defect_doc:                defect_doc.update_all(item_bundle, query=self.defect_query)            else:                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query)            return defect_doc.dict()    def update_targets(self, items):        """        Inserts the new task_types into the task_types collection        """        items = [item for item in items if item]        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defects")            for item in items:                item.update({"_bt": self.timestamp})                self.defects.remove_docs(                    {                       "task_ids": {'$in': item['task_ids']},                    }                )            self.defects.update(                docs=jsanitize(items, allow_bson=True),                key='task_ids',            )        else:            self.logger.info("No items to update")    def __filter_and_group_tasks(self, tasks):        """        Groups defect tasks. Tasks are grouped according to the reduced representation        of the defect, and so tasks with different settings (e.g. supercell size, functional)        will be grouped together.        Args:            tasks: task_ids for unprocessed defects        returns:            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect        """        props = [            self.defect_query,            'task_id'        ]        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)        defects = [            {'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t)}            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)        ]        def key(x):            s = x.get('defect').bulk_structure            return get_sg(s), s.composition.reduced_composition        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))        all_groups = []        # For each pre-grouped list of structures, perform actual matching.        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):            unmatched = list(g)            while len(unmatched) > 0:                i, refs = unmatched.pop(0)                matches = [i]                inds = filter(                    lambda i: pdc.are_equal(refs['defect'], unmatched[i][1]['defect']),                    list(range(len(unmatched))),                )                inds = list(inds)                matches.extend([unmatched[i][0] for i in inds])                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]                all_groups.append(                    (defects[i]['defect'], [defects[i]['task_id'] for i in matches])                )        self.logger.debug(f"All groups {all_groups}")        return all_groups    def __get_defect_from_task(self, task):        """        Using the defect_query property, retrieve a pymatgen defect object from the task document        """        defect = unpack(self.defect_query.split('.'), task)        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})    def __get_defect_doc(self, defect):        """        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists        returns: DefectDoc or None        """        material_id = self.__get_mpid(defect.bulk_structure)        docs = [            DefectDoc(**doc)            for doc in self.defects.query(criteria={'material_id': material_id}, properties=['defect'])        ]        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True)        for doc in docs:            if pdc.are_equal(defect, doc.defect):                return doc        return None    def __get_dielectric(self, task_id):        """        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would        be the case for metallic systems, return None.        """        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))        struc = Structure.from_dict(t.get('output').get('structure'))        for diel in self.dielectric.query(criteria={self.dielectric.key: get_mpid(struc)}, properties=['dielectric.total']):            return diel['dielectric']['total']        return None    def __get_item_bundle(self, bulk_tasks, defect_task_group):        """        Gets a group of items that can be processed together into a defect document.        Args:            bulk_tasks: possible bulk tasks to match to defects            defect_task_group: group of equivalent defects (defined by PointDefectComparator)        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]        """        return [            (                next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),                next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),                self.__get_dielectric(bulk_tasks_id),            )            for defect_tasks_id, bulk_tasks_id            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)        ]    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC    def __get_mpid(self, structure):        """        Given a structure, determine if an equivalent structure exists, with a material_id,        in the materials store.        Args:            structure: Candidate structure        returns: material_id, if one exists, else None        """        struc = deepcopy(structure)        struc.remove_oxidation_states()        struc.remove_spin()        for p in struc.site_properties:            struc.remove_site_property(p)        sga = SpacegroupAnalyzer(struc)        sm = StructureMatcher(ltol=self.settings.LTOL, stol=self.settings.STOL, angle_tol=self.settings.ANGLE_TOL)        data = self.materials.query(            criteria={                'chemsys': struc.composition.chemical_system,                'symmetry.symbol': sga.get_space_group_symbol()            },            properties=['material_id', 'structure']        )        data = list(filter(lambda x: sm.fit(struc, Structure.from_dict(x['structure'])), list(data)))        return None if not data else data[0]['material_id']        def __match_defects_to_bulks(self, bulk_ids, defect_ids):        """        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has        commensurate:            - Composition            - Number of sites            - Symmetry        """        self.logger.debug(f"Finding bulk/defect task combinations.")        props = [            'task_id',            'input',            'nsites',            'output.structure'            'transformations',            'defect',            'scale',        ]        bulks = list(self.tasks.query(criteria={'task_id': {'$in': list(bulk_ids)}}, properties=props))        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))        sm = StructureMatcher(            ltol=SETTINGS.LTOL,            stol=SETTINGS.STOL,            angle_tol=SETTINGS.ANGLE_TOL,            primitive_cell=False,            scale=True,            attempt_supercell=True,            allow_subset=False,            comparator=ElementComparator(),        )        def _compare(b, d):            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \                run_type(d.get('input').get('dft')).value.split('+U')[0] and \                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):                if abs(b['nsites'] - d['nsites']) <= 1:                    return True            return False        pairs = [            (defect['task_id'], bulk['task_id'])            for bulk in bulks            for defect in defects            if _compare(bulk, defect)        ]        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")        return pairs    def __preprocess_bulk(self, task):        """        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk        tasks must have:            (1) Correspond to an existing material_id in the materials store            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store        """        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure']))        struc = Structure.from_dict(t.get('output').get('structure'))        mpid = get_mpid(struc)        if not mpid:            self.logger.debug(f"NO MPID FOUND FOR {task} - {struc.composition}")            return False        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})        if dos.get_gap():            diel = next(self.dielectric.query(criteria={self.dielectric.key: mpid}))            if diel is None:                self.logger.info(f"Task {task} for composition {struc.composition} requires"                                 f"dielectric properties, but none found in dielectric store")                return False        return True    @staticmethod    def __get_pristine_supercell(task):        """        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.        If defect cannot be found in task, return the input structure.        """        if task.get('defect'):            return load_class(                task['defect']['@module'], task['defect']['@class']            ).from_dict(task['defect']).bulk_structure        elif task.get('transformations'):            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])        return Structure.from_dict(task['input']['structure'])class DefectThermoBuilder(Builder):    """    This builder creates collections of the DefectThermoDoc object.        (1) Find all DefectDocs that correspond to the same bulk material            given by material_id        (2) Create a new DefectThermoDoc for all of those documents        (3) Insert/Update the defect_thermos store with the new documents    """    def __init__(            self,            defects: Store,            defect_thermos: Store,            materials: Store,            electronic_structures: Store,            query: Optional[Dict] = None,            **kwargs,    ):        """        Args:            defects: Store of defect documents (generated by DefectBuilder)            defect_thermos: Store of DefectThermoDocs to generate.            materials: Store of MaterialDocs to construct phase diagram            electronic_structures: Store of DOS objects            query: dictionary to limit tasks to be analyzed        """        self.defects = defects        self.defect_thermos = defect_thermos        self.materials = materials        self.electronic_structures = electronic_structures        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)    def ensure_indexes(self):        """        Ensures indicies on the collections        """        # Basic search index for tasks        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        # Search index for materials        self.defect_thermos.ensure_index("material_id")    # TODO need to only process new tasks. Fast builder so currently is OK for small collections    def get_items(self) -> Iterator[List[Dict]]:        """        Gets items to process into DefectThermoDocs.        returns:            iterator yielding tuples containing:                - group of DefectDocs belonging to the same bulk material as indexed by material_id,                - materials in the chemsys of the bulk material for constructing phase diagram                - Dos of the bulk material for constructing phase diagrams/getting doping        """        self.logger.info("Defect thermo builder started")        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark build time for defect thermo documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks        all_docs = [doc for doc in self.defects.query()]        for key, group in groupby(sorted(all_docs, key=lambda x: x['material_id']), key=lambda x: x['material_id']):            group = [g for g in group]            yield (group, self.__get_materials(key), self.__get_electronic_structure(group))    def process_item(self, docs):        """        Process a group of defects belonging to the same material into a defect thermo doc        """        self.logger.info(f"Processing defects")        defects, materials, elec_struc = docs        defects = [DefectDoc(**d) for d in defects]        materials = [MaterialsDoc(**m) for m in materials]        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)        return defect_thermo_doc.dict()    def update_targets(self, items):        """        Inserts the new DefectThermoDocs into the defect_thermos store        """        items = [item for item in items if item]        for item in items:            item.update({"_bt": self.timestamp})        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defect thermo docs")            self.defect_thermos.update(                docs=jsanitize(items, allow_bson=True),                key=self.defect_thermos.key,            )        else:            self.logger.info("No items to update")    def __get_electronic_structure(self, group):        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))    def __get_materials(self, key) -> List:        """        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the        materials store.        """        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))        if not bulk:            raise LookupError        elements = bulk[0]['chemsys']        if isinstance(elements, str):            elements = elements.split("-")        all_chemsyses = []        for i in range(len(elements)):            for els in combinations(elements, i + 1):                all_chemsyses.append("-".join(sorted(els)))        return list(chain(self.materials.query(criteria={"chemsys": {"$in": all_chemsyses}}, properties=None), bulk))def unpack(query, d):    """    Unpack a mongo-style query into dictionary retrieval    """    if not query:        return d    if isinstance(d, List):        return unpack(query[1:], d.__getitem__(int(query.pop(0))))    return unpack(query[1:], d.__getitem__(query.pop(0)))
\ No newline at end of file
+from datetime import datetimefrom itertools import chain, groupby, combinationsfrom typing import Dict, Iterator, List, Optionalfrom copy import deepcopyfrom monty.json import MontyDecoderfrom maggma.builders import Builderfrom maggma.stores import Storefrom pymatgen.core import Structurefrom pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparatorfrom pymatgen.symmetry.analyzer import SpacegroupAnalyzerfrom atomate.utils.utils import load_classfrom emmet.core import SETTINGSfrom emmet.core.utils import jsanitize, get_sgfrom emmet.core.defect import DefectDoc, DefectThermoDocfrom emmet.core.cp2k.calc_types import TaskTypefrom emmet.core.cp2k.calc_types.utils import run_typefrom emmet.core.cp2k.material import MaterialsDocfrom emmet.builders.settings import EmmetBuildSettingsfrom emmet.builders.cp2k.utils import get_mpid__author__ = "Nicholas Winner <nwinner@berkeley.edu>"__maintainer__ = "Jason Munro"class DefectBuilder(Builder):    """    The DefectBuilder collects task documents performed on structures containing a single point defect.    The builder is intended to group tasks corresponding to the same defect (species including charge state),    find the best ones, and perform finite-size defect corrections to create a defect document. These    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.    In order to make the build process easier, an entry must exist inside of the task doc that identifies it    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,    this may be changed to having a defect transformation in the transformation history.    The process is as follows:        1.) Find all documents containing the defect query.        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already            calculated. These are the candidate bulk tasks.        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding            bulk calculation.        4.) Convert (defect, bulk task) doc pairs to DefectDocs        5.) Post-process and validate defect document        6.) Update the defect store    """    # TODO: should dielectric/electronic_structure be optional or required?    def __init__(        self,        tasks: Store,        defects: Store,        dielectric: Store,        electronic_structure: Store,        materials: Store,        task_validation: Optional[Store] = None,        query: Optional[Dict] = None,        allowed_task_types: Optional[List[str]] = None,        settings: Optional[EmmetBuildSettings] = None,        **kwargs,    ):        """        Args:            tasks: Store of task documents            defects: Store of defect documents to generate            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property            allowed_task_types: list of task_types that can be processed            symprec: tolerance for SPGLib spacegroup finding            ltol: StructureMatcher tuning parameter for matching tasks to materials            stol: StructureMatcher tuning parameter for matching tasks to materials            angle_tol: StructureMatcher tuning parameter for matching tasks to materials        """        self.tasks = tasks        self.defects = defects        self.materials = materials        self.dielectric = dielectric        self.electronic_structure = electronic_structure        self.task_validation = task_validation        self.allowed_task_types = (            [t.value for t in TaskType]            if allowed_task_types is None            else allowed_task_types        )        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}        self.settings = EmmetBuildSettings.autoload(settings)        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        sources = [tasks, dielectric, electronic_structure, materials]        if self.task_validation:            sources.append(self.task_validation)        super().__init__(sources=sources, targets=[defects], **kwargs)    @property    def defect_query(self) -> str:        """        The standard query for defect tasks. Update this if        schema changes in the future.        For example, if top level key exists 'defect' can be returned.        Alternatively, if an initial defect transformation was performed, then        you can check via 'transformations.history.0.defect'        """        return 'defect'    @property    def identifying_defect_properties(self):        return [            'charge'        ]    @property    def required_defect_properties(self) -> List:        return [            self.defect_query,            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'input',            'transformations',            'task_id',            'nsites'        ]    @property    def optional_defect_properties(self) -> List:        return [            'last_updated',            'created_on',            'tags'        ]    @property    def required_bulk_properties(self) -> List:        return [            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'output.vbm',            'output.cbm',            'input',            'transformations',        ]    def ensure_indexes(self):        """        Ensures indicies on the tasks and materials collections        """        # Basic search index for tasks        self.tasks.ensure_index("task_id")        self.tasks.ensure_index("last_updated")        self.tasks.ensure_index("state")        self.tasks.ensure_index("formula_pretty")        # Search index for materials        self.materials.ensure_index("material_id")        self.materials.ensure_index("last_updated")        self.materials.ensure_index("task_ids")        # Search index for materials        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        self.defects.ensure_index("last_updated")        self.defects.ensure_index("task_ids")        if self.task_validation:            self.task_validation.ensure_index("task_id")            self.task_validation.ensure_index("valid")    def prechunk(self, number_splits: int) -> Iterator[Dict]:        raise NotImplementedError    def get_items(self) -> Iterator[List[Dict]]:        """        Gets all items to process into defect documents.        This does no datetime checking; relying on on whether        task_ids are included in the Defect Collection.        The procedure is as follows:            1. Get all tasks with standard "defect" query tag            2. Filter all tasks by skipping tasks which are already in the Defect Store            3. Get all tasks that could be used as bulk        Returns:            generator or list relevant tasks and materials to process into materials documents        """        self.logger.info("Defect builder started")        self.logger.info(            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"        )        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark buildtime for material documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        all_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        temp_query.update({self.defect_query: {'$exists': True}})        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})        defect_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        processed_defect_tasks = {            t_id            for d in self.defects.query({}, ["task_ids"])            for t_id in d.get("task_ids", [])        }        self.logger.debug("All tasks: {}".format(len(all_tasks)))        self.logger.debug("Bulk tasks before filter: {}".format(len(all_tasks-defect_tasks)))        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))        self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks        if not unprocessed_defect_tasks:            self.logger.info("No unprocessed to tasks. Exiting")            return        elif not bulk_tasks:            self.logger.info("No compatible bulk calculations. Exiting.")            return        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")        # Set total for builder bars to have a total        self.total = len(unprocessed_defect_tasks)        if self.task_validation:            invalid_ids = {                doc[self.tasks.key]                for doc in self.task_validation.query(                    {"is_valid": False}, [self.task_validation.key]                )            }        else:            invalid_ids = set()        for t in bulk_tasks.union(unprocessed_defect_tasks):            for doc in self.tasks.query({self.tasks.key: t}):                if t in invalid_ids:                    doc["is_valid"] = False                else:                    doc["is_valid"] = True        # yield list of defects that are of the same type, matched to an appropriate bulk calc        self.logger.info(f"Starting defect matching.")        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):            yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)    def process_item(self, items):        """        Process a group of defect tasks that correspond to the same defect into a single defect        document. If the DefectDoc already exists, then update it and return it. If it does not,        create a new DefectDoc        Args:            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]        returns: the defect document as a dictionary        """        defect_doc, item_bundle = items        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")        if item_bundle:            material_id = self.__get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..            if defect_doc:                defect_doc.update_all(item_bundle, query=self.defect_query)            else:                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)            return defect_doc.dict()    def update_targets(self, items):        """        Inserts the new task_types into the task_types collection        """        items = [item for item in items if item]        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defects")            for item in items:                item.update({"_bt": self.timestamp})                self.defects.remove_docs(                    {                       "task_ids": item['task_ids'],                    }                )            self.defects.update(                docs=jsanitize(items, allow_bson=True),                key='task_ids',            )        else:            self.logger.info("No items to update")    def __filter_and_group_tasks(self, tasks):        """        Groups defect tasks. Tasks are grouped according to the reduced representation        of the defect, and so tasks with different settings (e.g. supercell size, functional)        will be grouped together.        Args:            tasks: task_ids for unprocessed defects        returns:            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect        """        props = [            self.defect_query,            'task_id',            'output.structure'        ]        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)        sm = StructureMatcher(            ltol=self.settings.LTOL, stol=self.settings.STOL,            angle_tol=self.settings.ANGLE_TOL, allow_subset=False        )        defects = [            {                'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t),                'structure': Structure.from_dict(t['output']['structure'])            }            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)        ]        for d in defects:            # TODO remove oxidation state because spins/oxidation cause errors in comparison.            #  but they shouldnt if those props are close in value            d['structure'].remove_oxidation_states()        def key(x):            s = x.get('defect').bulk_structure            return get_sg(s), s.composition.reduced_composition        def are_equal(x, y):            """            To decide if defects are equal. Either the defect objects are            equal, OR two different defect objects relaxed to the same final structure            (common with interstitials).            TODO Need a way to do the output structure comparison for a X atom defect cell            TODO which can be embedded in a Y atom defect cell up to tolerance.            """            if pdc.are_equal(x['defect'], y['defect']):                return True            # TODO This is needed for ghost vacancy unfortunately, since  sm.fit can't distinguish ghosts            if sm.fit(x['structure'], y['structure']) and \                    x['defect'].defect_composition == y['defect'].defect_composition and \                    x['defect'].charge == y['defect'].charge:                return True            return False        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))        all_groups = []        # For each pre-grouped list of structures, perform actual matching.        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):            unmatched = list(g)            while len(unmatched) > 0:                i, refs = unmatched.pop(0)                matches = [i]                inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched)))))                matches.extend([unmatched[i][0] for i in inds])                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]                all_groups.append(                    (defects[i]['defect'], [defects[i]['task_id'] for i in matches])                )        self.logger.debug(f"All groups {all_groups}")        return all_groups    def __get_defect_from_task(self, task):        """        Using the defect_query property, retrieve a pymatgen defect object from the task document        """        defect = unpack(self.defect_query.split('.'), task)        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})    def __get_defect_doc(self, defect):        """        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists        returns: DefectDoc or None        """        material_id = self.__get_mpid(defect.bulk_structure)        docs = [            DefectDoc(**doc)            for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)        ]        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True)        for doc in docs:            if pdc.are_equal(defect, doc.defect):                return doc        return None    def __get_dielectric(self, task_id):        """        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would        be the case for metallic systems, return None.        """        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))        struc = Structure.from_dict(t.get('output').get('structure'))        for diel in self.dielectric.query(criteria={self.dielectric.key: get_mpid(struc)}, properties=['dielectric.total']):            return diel['dielectric']['total']        return None    def __get_item_bundle(self, bulk_tasks, defect_task_group):        """        Gets a group of items that can be processed together into a defect document.        Args:            bulk_tasks: possible bulk tasks to match to defects            defect_task_group: group of equivalent defects (defined by PointDefectComparator)        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]        """        return [            (                next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),                next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),                self.__get_dielectric(bulk_tasks_id),            )            for defect_tasks_id, bulk_tasks_id            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)        ]    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC    def __get_mpid(self, structure):        """        Given a structure, determine if an equivalent structure exists, with a material_id,        in the materials store.        Args:            structure: Candidate structure        returns: material_id, if one exists, else None        """        sga = SpacegroupAnalyzer(structure)        mats = self.materials.query(            criteria={                'chemsys': structure.composition.chemical_system,                'symmetry.symbol': sga.get_space_group_symbol()            }, properties=['structure', 'material_id']        )        sm = StructureMatcher()        for m in mats:            if sm.fit(structure, Structure.from_dict(m['structure'])):                return m['material_id']        return None    def __match_defects_to_bulks(self, bulk_ids, defect_ids):        """        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has        commensurate:            - Composition            - Number of sites            - Symmetry        """        self.logger.debug(f"Finding bulk/defect task combinations.")        props = [            'task_id',            'input',            'nsites',            'output.structure',            'transformations',            'defect',            'scale',        ]        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))        ps = DefectBuilder.__get_pristine_supercell(defects[0])        bulks = list(            self.tasks.query(                criteria={                    'task_id': {'$in': list(bulk_ids)},                    'composition_reduced': ps.composition.reduced_composition.as_dict(),                },                properties=props            )        )        sm = StructureMatcher(            ltol=SETTINGS.LTOL,            stol=SETTINGS.STOL,            angle_tol=SETTINGS.ANGLE_TOL,            primitive_cell=False,            scale=True,            attempt_supercell=True,            allow_subset=False,            comparator=ElementComparator(),        )        def _compare(b, d):            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \                run_type(d.get('input').get('dft')).value.split('+U')[0] and \                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):                if abs(b['nsites'] - d['nsites']) <= 1:                    return True            return False        pairs = [            (defect['task_id'], bulk['task_id'])            for bulk in bulks            for defect in defects            if _compare(bulk, defect)        ]        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")        return pairs    def __preprocess_bulk(self, task):        """        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk        tasks must have:            (1) Correspond to an existing material_id in the materials store            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store        """        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure', 'mpid']))        # TODO: This is for my personal use to get around the 2D materials problem. Should not be made official        if t['mpid']:            self.logger.debug(f"Found monolayer for...")            mpid = t['mpid'].split('-ML')[0]        else:            struc = Structure.from_dict(t.get('output').get('structure'))            mpid = self.__get_mpid(struc)            if not mpid:                return False        self.logger.debug(f"Material ID: {mpid}")        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})        if dos.get_gap():            diel = list(self.dielectric.query(criteria={self.dielectric.key: mpid}))            if not diel:                self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "                                 f"dielectric properties, but none found in dielectric store")                return False        return True    @staticmethod    def __get_pristine_supercell(task):        """        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.        If defect cannot be found in task, return the input structure.        """        if task.get('defect'):            return load_class(                task['defect']['@module'], task['defect']['@class']            ).from_dict(task['defect']).bulk_structure        elif task.get('transformations'):            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])        return Structure.from_dict(task['input']['structure'])class DefectThermoBuilder(Builder):    """    This builder creates collections of the DefectThermoDoc object.        (1) Find all DefectDocs that correspond to the same bulk material            given by material_id        (2) Create a new DefectThermoDoc for all of those documents        (3) Insert/Update the defect_thermos store with the new documents    """    def __init__(            self,            defects: Store,            defect_thermos: Store,            materials: Store,            electronic_structures: Store,            query: Optional[Dict] = None,            **kwargs,    ):        """        Args:            defects: Store of defect documents (generated by DefectBuilder)            defect_thermos: Store of DefectThermoDocs to generate.            materials: Store of MaterialDocs to construct phase diagram            electronic_structures: Store of DOS objects            query: dictionary to limit tasks to be analyzed        """        self.defects = defects        self.defect_thermos = defect_thermos        self.materials = materials        self.electronic_structures = electronic_structures        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)    def ensure_indexes(self):        """        Ensures indicies on the collections        """        # Basic search index for tasks        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        # Search index for materials        self.defect_thermos.ensure_index("material_id")    # TODO need to only process new tasks. Fast builder so currently is OK for small collections    def get_items(self) -> Iterator[List[Dict]]:        """        Gets items to process into DefectThermoDocs.        returns:            iterator yielding tuples containing:                - group of DefectDocs belonging to the same bulk material as indexed by material_id,                - materials in the chemsys of the bulk material for constructing phase diagram                - Dos of the bulk material for constructing phase diagrams/getting doping        """        self.logger.info("Defect thermo builder started")        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark build time for defect thermo documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks        all_docs = [doc for doc in self.defects.query()]        def filterfunc(x):            return bool(list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)))        for key, group in groupby(                filter(                    filterfunc,                    sorted(all_docs, key=lambda x: x['material_id'])                ), key=lambda x: x['material_id']        ):            group = [g for g in group]            try:                yield (group, self.__get_materials(key), self.__get_electronic_structure(group))            except LookupError as exception:                raise exception    def process_item(self, docs):        """        Process a group of defects belonging to the same material into a defect thermo doc        """        self.logger.info(f"Processing defects")        defects, materials, elec_struc = docs        defects = [DefectDoc(**d) for d in defects]        materials = [MaterialsDoc(**m) for m in materials]        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)        return defect_thermo_doc.dict()    def update_targets(self, items):        """        Inserts the new DefectThermoDocs into the defect_thermos store        """        items = [item for item in items if item]        for item in items:            item.update({"_bt": self.timestamp})        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defect thermo docs")            self.defect_thermos.update(                docs=jsanitize(items, allow_bson=True),                key=self.defect_thermos.key,            )        else:            self.logger.info("No items to update")    def __get_electronic_structure(self, group):        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))    def __get_materials(self, key) -> List:        """        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the        materials store.        """        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))        if not bulk:            raise LookupError(                f"The bulk material ({key}) for these defects cannot be found in the materials store"            )        elements = bulk[0]['chemsys']        if isinstance(elements, str):            elements = elements.split("-")        return list(chain(self.materials.query(criteria={"chemsys": {"$in": elements}}, properties=None), bulk))def unpack(query, d):    """    Unpack a mongo-style query into dictionary retrieval    """    if not query:        return d    if isinstance(d, List):        return unpack(query[1:], d.__getitem__(int(query.pop(0))))    return unpack(query[1:], d.__getitem__(query.pop(0)))
\ No newline at end of file
diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
index 128aa6e112..15b5c0a07a 100644
--- a/emmet-builders/emmet/builders/cp2k/materials.py
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -200,7 +200,6 @@ def process_item(self, tasks: List[Dict]) -> List[Dict]:
                     )
                 )
             except Exception as e:
-
                 # TODO construct deprecated
 
                 failed_ids = list({t_.task_id for t_ in group})
@@ -264,6 +263,8 @@ def filter_and_group_tasks(self, tasks: List[TaskDocument]) -> Iterator[List[Dic
         for idx, task in enumerate(filtered_tasks):
             s = task.output.structure
             s.index: int = idx  # type: ignore
+            s.remove_oxidation_states()
+            s.remove_spin()
             structures.append(s)
 
         grouped_structures = group_structures(
diff --git a/emmet-builders/emmet/builders/cp2k/utils.py b/emmet-builders/emmet/builders/cp2k/utils.py
index 73bf4dda45..47fbc65fe5 100644
--- a/emmet-builders/emmet/builders/cp2k/utils.py
+++ b/emmet-builders/emmet/builders/cp2k/utils.py
@@ -13,6 +13,8 @@ def unpack(query, d):
 
 
 # TODO Move polaron compare to a different function to make validation easier
+# TODO Matching should have have an easier way of setting distance tolerance
+    # can't this all be done inside of structure matcher to find mapping or something?
 def matcher(bulk_struc, defect_struc, final_bulk_struc=None, final_defect_struc=None):
     matching_indices = []
     dis = []
@@ -50,6 +52,7 @@ def matcher(bulk_struc, defect_struc, final_bulk_struc=None, final_defect_struc=
         return def_index, matching_indices
 
     elif len(def_index) > 1:
+        print(def_index)
         raise ValueError("The provided defect structure and bulk structure "
                          "have more than one potential defect site")
 
diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py
index 2f2549003c..cf6d8b4b65 100644
--- a/emmet-core/emmet/core/cp2k/material.py
+++ b/emmet-core/emmet/core/cp2k/material.py
@@ -69,10 +69,11 @@ def from_tasks(
         statics = [task for task in task_group if task.task_type == TaskType.Static]  # type: ignore
 
         # Material ID
-        possible_mat_ids = [task.task_id for task in structure_optimizations]  # TODO remove + statics ?
+        possible_mat_ids = [task.task_id for task in structure_optimizations + statics]  # TODO remove + statics ?
         possible_mat_ids = sorted(possible_mat_ids, key=ID_to_int)
 
         matched_id = get_mpid([task.output.structure for task in structure_optimizations + statics][0])
+
         if matched_id:
             possible_mat_ids.insert(0, matched_id)
 
diff --git a/emmet-core/emmet/core/cp2k/validation.py b/emmet-core/emmet/core/cp2k/validation.py
index d0c8a1062a..9055bdf39d 100644
--- a/emmet-core/emmet/core/cp2k/validation.py
+++ b/emmet-core/emmet/core/cp2k/validation.py
@@ -7,14 +7,17 @@
 from emmet.core import SETTINGS
 from emmet.core.utils import DocEnum
 from emmet.core.vasp.task import TaskDocument
-from emmet.stubs import Structure
 
 
 class DeprecationMessage(DocEnum):
 
-    encut = "cutoff", "PW cutoff too low"
-    ldau = "ldau", "LDAU parameters don't match"
-    manual = "manual", "Manually deprecated"
+    MANUAL = "M", "manual deprecation"
+    CUTOFF = "C002", "PW cutoff too low"
+    FORCES = "C003", "Forces too large"
+    CONVERGENCE = "E001", "Calculation did not converge"
+    MAX_SCF = "E002", "Max SCF gradient too large"
+    LDAU = "I001", "LDAU Parameters don't match the inputset"
+    BASIS = "", "Improper basis sets"
 
 
 class ValidationDoc(BaseModel):
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index 343fe3de20..e962291bff 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -115,9 +115,9 @@ def _run_type(x):
 
             def _compare(new, old):
                 # TODO return kpoint density
-                return new['nsites'] > old['nsites']
+                return new['nsites'] > old.nsites
 
-            if defect_task_doc.run_type not in self.tasks or _compare(defect_task, self.tasks[rt]):
+            if defect_task_doc.run_type not in self.tasks or _compare(defect_task, self.tasks[rt][0]):
                 self.run_types.update({defect_task_doc.task_id: rt})
                 self.task_types.update({defect_task_doc.task_id: tt})
                 self.calc_types.update({defect_task_doc.task_id: ct})
@@ -135,7 +135,7 @@ def update_all(self, tasks, query='defect'):
             self.update(defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric, query=query)
 
     @classmethod
-    def from_tasks(cls, tasks: List, query='defect') -> "DefectDoc":
+    def from_tasks(cls, tasks: List, query='defect', material_id=None) -> "DefectDoc":
         """
         The standard way to create this document.
 
@@ -164,25 +164,29 @@ def _run_type(x):
 
         def _sort(x):
             # TODO return kpoint density, currently just does supercell size
-            return x[1].nsites
+            return -x[0]['nsites'], x[0]['output']['energy']
 
         entries = {}
         final_tasks = {}
         for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
-            entry_and_docs = [
+            sorted_tasks = sorted(tasks_for_runtype, key=_sort)
+
+            convergence = [
                 (
-                    cls.get_defect_entry_from_tasks(
-                        defect_task=defect_task,
-                        bulk_task=bulk_task,
-                        dielectric=dielectric,
-                        query=query
-                    ),
-                    TaskDocument(**defect_task), TaskDocument(**bulk_task)
+                    t[0]['nsites'],
+                    t[0]['output']['energy'], t[1]['output']['energy'],
+                    t[0]['defect']['@class'],
                 )
-                for defect_task, bulk_task, dielectric in tasks_for_runtype
+                for t in sorted_tasks
             ]
-            entry_and_docs.sort(key=_sort, reverse=True)
-            best_entry, best_defect_task, best_bulk_task = entry_and_docs[0]
+            #print(convergence)
+            #for t in sorted_tasks:
+            #    print('\t', TaskDocument(**t[0]).task_id)
+            #    print('\t', TaskDocument(**t[1]).task_id)
+
+            best_defect_task, best_bulk_task, dielectric = sorted_tasks[0]
+            best_entry = cls.get_defect_entry_from_tasks(best_defect_task, best_bulk_task, dielectric, query)
+            best_defect_task, best_bulk_task = TaskDocument(**best_defect_task), TaskDocument(**best_bulk_task)
             entries[best_defect_task.run_type] = best_entry
             final_tasks[best_defect_task.run_type] = (best_defect_task, best_bulk_task)
 
@@ -196,7 +200,7 @@ def _sort(x):
                 'task_ids': task_ids,
                 'deprecated_tasks': deprecated_tasks,
                 'tasks': final_tasks,
-                'material_id': best_entry.parameters['material_id'],
+                'material_id': material_id if material_id else best_entry.parameters['material_id'],
                 'entry_ids': {rt: entries[rt].entry_id for rt in entries},
                 'defect': best_entry.defect,
                 'name': best_entry.defect.name,
@@ -253,7 +257,17 @@ def get_parameters_from_tasks(cls, defect_task, bulk_task):
         """
 
         def get_init(x):
+            """
+            Helper function. If transformations were applied, get the structure post defect
+            transformation
+            """
             if x.get('transformations', {}).get('history'):
+                for i, y in enumerate(x['transformations']['history']):
+                    if y['@class'] == 'DefectTransformation':
+                        if len(x['transformations']['history']) == 1:
+                            return Structure.from_dict(x['input']['structure'])
+                        else:
+                            return Structure.from_dict(x['transformations']['history'][i+1]['input_structure'])
                 return Structure.from_dict(x['transformations']['history'][0]['input_structure'])
             return Structure.from_dict(x['input']['structure'])
 
@@ -263,25 +277,11 @@ def get_init(x):
         final_defect_structure = Structure.from_dict(defect_task['output']['structure'])
         final_bulk_structure = Structure.from_dict(bulk_task['output']['structure'])
 
-        axis_grid = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_grid']]
-        bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree']]
-        defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree']]
-
-        dfi, site_matching_indices = matcher(
-            init_bulk_structure, init_defect_structure,
-            final_bulk_struc=final_bulk_structure, final_defect_struc=final_defect_structure
-        )
-
         mpid = get_mpid(init_bulk_structure)
 
         parameters = {
             'defect_energy': defect_task['output']['energy'],
             'bulk_energy': bulk_task['output']['energy'],
-            'axis_grid': axis_grid,
-            'defect_frac_sc_coords': final_defect_structure[dfi].frac_coords,
-            'defect_planar_averages': defect_planar_averages,
-            'bulk_planar_averages': bulk_planar_averages,
-            'site_matching_indices': site_matching_indices,
             'initial_defect_structure': init_defect_structure,
             'final_defect_structure': final_defect_structure,
             'vbm': bulk_task['output']['vbm'],
@@ -290,6 +290,24 @@ def get_init(x):
             'entry_id': defect_task.get('task_id')
         }
 
+        # TODO Should probably get these even if not needed for corrections
+        if init_defect_structure.charge != 0:
+            axis_grid = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_grid']]
+            bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree']]
+            defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree']]
+
+            dfi, site_matching_indices = matcher(
+                init_bulk_structure, init_defect_structure,
+                final_bulk_struc=final_bulk_structure, final_defect_struc=final_defect_structure
+            )
+            defect_frac_sc_coords = final_defect_structure[dfi].frac_coords
+
+            parameters['axis_grid'] = axis_grid
+            parameters['bulk_planar_averages'] = bulk_planar_averages
+            parameters['defect_planar_averages'] = defect_planar_averages
+            parameters['defect_frac_sc_coords'] = defect_frac_sc_coords
+            parameters['site_matching_indices'] = site_matching_indices
+
         # cannot be easily queried for, so check here.
         if 'v_hartree' in final_bulk_structure.site_properties:
             parameters['bulk_atomic_site_averages'] = final_bulk_structure.site_properties['v_hartree']
diff --git a/emmet-core/emmet/core/mpid.py b/emmet-core/emmet/core/mpid.py
index e037f80642..68741fd264 100644
--- a/emmet-core/emmet/core/mpid.py
+++ b/emmet-core/emmet/core/mpid.py
@@ -24,7 +24,7 @@ def __init__(self, val: Union["MPID", int, str]):
 
         elif isinstance(val, str):
             parts = val.split("-")
-            parts[1] = int(parts[1])  # type: ignore
+            parts[-1] = int(parts[-1])  # type: ignore
             self.parts = tuple(parts)
             self.string = val
 

From f51ddd1062b71a2a1024b46edca1d246107630b1 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 2 Nov 2021 18:14:37 -0700
Subject: [PATCH 31/41] bug

---
 emmet-builders/emmet/builders/cp2k/defects.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 978ad7ed38..7c5c81d852 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -1 +1 @@
-from datetime import datetimefrom itertools import chain, groupby, combinationsfrom typing import Dict, Iterator, List, Optionalfrom copy import deepcopyfrom monty.json import MontyDecoderfrom maggma.builders import Builderfrom maggma.stores import Storefrom pymatgen.core import Structurefrom pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparatorfrom pymatgen.symmetry.analyzer import SpacegroupAnalyzerfrom atomate.utils.utils import load_classfrom emmet.core import SETTINGSfrom emmet.core.utils import jsanitize, get_sgfrom emmet.core.defect import DefectDoc, DefectThermoDocfrom emmet.core.cp2k.calc_types import TaskTypefrom emmet.core.cp2k.calc_types.utils import run_typefrom emmet.core.cp2k.material import MaterialsDocfrom emmet.builders.settings import EmmetBuildSettingsfrom emmet.builders.cp2k.utils import get_mpid__author__ = "Nicholas Winner <nwinner@berkeley.edu>"__maintainer__ = "Jason Munro"class DefectBuilder(Builder):    """    The DefectBuilder collects task documents performed on structures containing a single point defect.    The builder is intended to group tasks corresponding to the same defect (species including charge state),    find the best ones, and perform finite-size defect corrections to create a defect document. These    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.    In order to make the build process easier, an entry must exist inside of the task doc that identifies it    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,    this may be changed to having a defect transformation in the transformation history.    The process is as follows:        1.) Find all documents containing the defect query.        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already            calculated. These are the candidate bulk tasks.        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding            bulk calculation.        4.) Convert (defect, bulk task) doc pairs to DefectDocs        5.) Post-process and validate defect document        6.) Update the defect store    """    # TODO: should dielectric/electronic_structure be optional or required?    def __init__(        self,        tasks: Store,        defects: Store,        dielectric: Store,        electronic_structure: Store,        materials: Store,        task_validation: Optional[Store] = None,        query: Optional[Dict] = None,        allowed_task_types: Optional[List[str]] = None,        settings: Optional[EmmetBuildSettings] = None,        **kwargs,    ):        """        Args:            tasks: Store of task documents            defects: Store of defect documents to generate            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property            allowed_task_types: list of task_types that can be processed            symprec: tolerance for SPGLib spacegroup finding            ltol: StructureMatcher tuning parameter for matching tasks to materials            stol: StructureMatcher tuning parameter for matching tasks to materials            angle_tol: StructureMatcher tuning parameter for matching tasks to materials        """        self.tasks = tasks        self.defects = defects        self.materials = materials        self.dielectric = dielectric        self.electronic_structure = electronic_structure        self.task_validation = task_validation        self.allowed_task_types = (            [t.value for t in TaskType]            if allowed_task_types is None            else allowed_task_types        )        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}        self.settings = EmmetBuildSettings.autoload(settings)        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        sources = [tasks, dielectric, electronic_structure, materials]        if self.task_validation:            sources.append(self.task_validation)        super().__init__(sources=sources, targets=[defects], **kwargs)    @property    def defect_query(self) -> str:        """        The standard query for defect tasks. Update this if        schema changes in the future.        For example, if top level key exists 'defect' can be returned.        Alternatively, if an initial defect transformation was performed, then        you can check via 'transformations.history.0.defect'        """        return 'defect'    @property    def identifying_defect_properties(self):        return [            'charge'        ]    @property    def required_defect_properties(self) -> List:        return [            self.defect_query,            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'input',            'transformations',            'task_id',            'nsites'        ]    @property    def optional_defect_properties(self) -> List:        return [            'last_updated',            'created_on',            'tags'        ]    @property    def required_bulk_properties(self) -> List:        return [            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'output.vbm',            'output.cbm',            'input',            'transformations',        ]    def ensure_indexes(self):        """        Ensures indicies on the tasks and materials collections        """        # Basic search index for tasks        self.tasks.ensure_index("task_id")        self.tasks.ensure_index("last_updated")        self.tasks.ensure_index("state")        self.tasks.ensure_index("formula_pretty")        # Search index for materials        self.materials.ensure_index("material_id")        self.materials.ensure_index("last_updated")        self.materials.ensure_index("task_ids")        # Search index for materials        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        self.defects.ensure_index("last_updated")        self.defects.ensure_index("task_ids")        if self.task_validation:            self.task_validation.ensure_index("task_id")            self.task_validation.ensure_index("valid")    def prechunk(self, number_splits: int) -> Iterator[Dict]:        raise NotImplementedError    def get_items(self) -> Iterator[List[Dict]]:        """        Gets all items to process into defect documents.        This does no datetime checking; relying on on whether        task_ids are included in the Defect Collection.        The procedure is as follows:            1. Get all tasks with standard "defect" query tag            2. Filter all tasks by skipping tasks which are already in the Defect Store            3. Get all tasks that could be used as bulk        Returns:            generator or list relevant tasks and materials to process into materials documents        """        self.logger.info("Defect builder started")        self.logger.info(            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"        )        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark buildtime for material documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        all_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        temp_query.update({self.defect_query: {'$exists': True}})        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})        defect_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        processed_defect_tasks = {            t_id            for d in self.defects.query({}, ["task_ids"])            for t_id in d.get("task_ids", [])        }        self.logger.debug("All tasks: {}".format(len(all_tasks)))        self.logger.debug("Bulk tasks before filter: {}".format(len(all_tasks-defect_tasks)))        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))        self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks        if not unprocessed_defect_tasks:            self.logger.info("No unprocessed to tasks. Exiting")            return        elif not bulk_tasks:            self.logger.info("No compatible bulk calculations. Exiting.")            return        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")        # Set total for builder bars to have a total        self.total = len(unprocessed_defect_tasks)        if self.task_validation:            invalid_ids = {                doc[self.tasks.key]                for doc in self.task_validation.query(                    {"is_valid": False}, [self.task_validation.key]                )            }        else:            invalid_ids = set()        for t in bulk_tasks.union(unprocessed_defect_tasks):            for doc in self.tasks.query({self.tasks.key: t}):                if t in invalid_ids:                    doc["is_valid"] = False                else:                    doc["is_valid"] = True        # yield list of defects that are of the same type, matched to an appropriate bulk calc        self.logger.info(f"Starting defect matching.")        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):            yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)    def process_item(self, items):        """        Process a group of defect tasks that correspond to the same defect into a single defect        document. If the DefectDoc already exists, then update it and return it. If it does not,        create a new DefectDoc        Args:            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]        returns: the defect document as a dictionary        """        defect_doc, item_bundle = items        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")        if item_bundle:            material_id = self.__get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..            if defect_doc:                defect_doc.update_all(item_bundle, query=self.defect_query)            else:                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)            return defect_doc.dict()    def update_targets(self, items):        """        Inserts the new task_types into the task_types collection        """        items = [item for item in items if item]        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defects")            for item in items:                item.update({"_bt": self.timestamp})                self.defects.remove_docs(                    {                       "task_ids": item['task_ids'],                    }                )            self.defects.update(                docs=jsanitize(items, allow_bson=True),                key='task_ids',            )        else:            self.logger.info("No items to update")    def __filter_and_group_tasks(self, tasks):        """        Groups defect tasks. Tasks are grouped according to the reduced representation        of the defect, and so tasks with different settings (e.g. supercell size, functional)        will be grouped together.        Args:            tasks: task_ids for unprocessed defects        returns:            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect        """        props = [            self.defect_query,            'task_id',            'output.structure'        ]        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)        sm = StructureMatcher(            ltol=self.settings.LTOL, stol=self.settings.STOL,            angle_tol=self.settings.ANGLE_TOL, allow_subset=False        )        defects = [            {                'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t),                'structure': Structure.from_dict(t['output']['structure'])            }            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)        ]        for d in defects:            # TODO remove oxidation state because spins/oxidation cause errors in comparison.            #  but they shouldnt if those props are close in value            d['structure'].remove_oxidation_states()        def key(x):            s = x.get('defect').bulk_structure            return get_sg(s), s.composition.reduced_composition        def are_equal(x, y):            """            To decide if defects are equal. Either the defect objects are            equal, OR two different defect objects relaxed to the same final structure            (common with interstitials).            TODO Need a way to do the output structure comparison for a X atom defect cell            TODO which can be embedded in a Y atom defect cell up to tolerance.            """            if pdc.are_equal(x['defect'], y['defect']):                return True            # TODO This is needed for ghost vacancy unfortunately, since  sm.fit can't distinguish ghosts            if sm.fit(x['structure'], y['structure']) and \                    x['defect'].defect_composition == y['defect'].defect_composition and \                    x['defect'].charge == y['defect'].charge:                return True            return False        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))        all_groups = []        # For each pre-grouped list of structures, perform actual matching.        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):            unmatched = list(g)            while len(unmatched) > 0:                i, refs = unmatched.pop(0)                matches = [i]                inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched)))))                matches.extend([unmatched[i][0] for i in inds])                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]                all_groups.append(                    (defects[i]['defect'], [defects[i]['task_id'] for i in matches])                )        self.logger.debug(f"All groups {all_groups}")        return all_groups    def __get_defect_from_task(self, task):        """        Using the defect_query property, retrieve a pymatgen defect object from the task document        """        defect = unpack(self.defect_query.split('.'), task)        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})    def __get_defect_doc(self, defect):        """        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists        returns: DefectDoc or None        """        material_id = self.__get_mpid(defect.bulk_structure)        docs = [            DefectDoc(**doc)            for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)        ]        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True)        for doc in docs:            if pdc.are_equal(defect, doc.defect):                return doc        return None    def __get_dielectric(self, task_id):        """        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would        be the case for metallic systems, return None.        """        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))        struc = Structure.from_dict(t.get('output').get('structure'))        for diel in self.dielectric.query(criteria={self.dielectric.key: get_mpid(struc)}, properties=['dielectric.total']):            return diel['dielectric']['total']        return None    def __get_item_bundle(self, bulk_tasks, defect_task_group):        """        Gets a group of items that can be processed together into a defect document.        Args:            bulk_tasks: possible bulk tasks to match to defects            defect_task_group: group of equivalent defects (defined by PointDefectComparator)        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]        """        return [            (                next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),                next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),                self.__get_dielectric(bulk_tasks_id),            )            for defect_tasks_id, bulk_tasks_id            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)        ]    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC    def __get_mpid(self, structure):        """        Given a structure, determine if an equivalent structure exists, with a material_id,        in the materials store.        Args:            structure: Candidate structure        returns: material_id, if one exists, else None        """        sga = SpacegroupAnalyzer(structure)        mats = self.materials.query(            criteria={                'chemsys': structure.composition.chemical_system,                'symmetry.symbol': sga.get_space_group_symbol()            }, properties=['structure', 'material_id']        )        sm = StructureMatcher()        for m in mats:            if sm.fit(structure, Structure.from_dict(m['structure'])):                return m['material_id']        return None    def __match_defects_to_bulks(self, bulk_ids, defect_ids):        """        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has        commensurate:            - Composition            - Number of sites            - Symmetry        """        self.logger.debug(f"Finding bulk/defect task combinations.")        props = [            'task_id',            'input',            'nsites',            'output.structure',            'transformations',            'defect',            'scale',        ]        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))        ps = DefectBuilder.__get_pristine_supercell(defects[0])        bulks = list(            self.tasks.query(                criteria={                    'task_id': {'$in': list(bulk_ids)},                    'composition_reduced': ps.composition.reduced_composition.as_dict(),                },                properties=props            )        )        sm = StructureMatcher(            ltol=SETTINGS.LTOL,            stol=SETTINGS.STOL,            angle_tol=SETTINGS.ANGLE_TOL,            primitive_cell=False,            scale=True,            attempt_supercell=True,            allow_subset=False,            comparator=ElementComparator(),        )        def _compare(b, d):            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \                run_type(d.get('input').get('dft')).value.split('+U')[0] and \                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):                if abs(b['nsites'] - d['nsites']) <= 1:                    return True            return False        pairs = [            (defect['task_id'], bulk['task_id'])            for bulk in bulks            for defect in defects            if _compare(bulk, defect)        ]        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")        return pairs    def __preprocess_bulk(self, task):        """        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk        tasks must have:            (1) Correspond to an existing material_id in the materials store            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store        """        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure', 'mpid']))        # TODO: This is for my personal use to get around the 2D materials problem. Should not be made official        if t['mpid']:            self.logger.debug(f"Found monolayer for...")            mpid = t['mpid'].split('-ML')[0]        else:            struc = Structure.from_dict(t.get('output').get('structure'))            mpid = self.__get_mpid(struc)            if not mpid:                return False        self.logger.debug(f"Material ID: {mpid}")        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})        if dos.get_gap():            diel = list(self.dielectric.query(criteria={self.dielectric.key: mpid}))            if not diel:                self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "                                 f"dielectric properties, but none found in dielectric store")                return False        return True    @staticmethod    def __get_pristine_supercell(task):        """        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.        If defect cannot be found in task, return the input structure.        """        if task.get('defect'):            return load_class(                task['defect']['@module'], task['defect']['@class']            ).from_dict(task['defect']).bulk_structure        elif task.get('transformations'):            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])        return Structure.from_dict(task['input']['structure'])class DefectThermoBuilder(Builder):    """    This builder creates collections of the DefectThermoDoc object.        (1) Find all DefectDocs that correspond to the same bulk material            given by material_id        (2) Create a new DefectThermoDoc for all of those documents        (3) Insert/Update the defect_thermos store with the new documents    """    def __init__(            self,            defects: Store,            defect_thermos: Store,            materials: Store,            electronic_structures: Store,            query: Optional[Dict] = None,            **kwargs,    ):        """        Args:            defects: Store of defect documents (generated by DefectBuilder)            defect_thermos: Store of DefectThermoDocs to generate.            materials: Store of MaterialDocs to construct phase diagram            electronic_structures: Store of DOS objects            query: dictionary to limit tasks to be analyzed        """        self.defects = defects        self.defect_thermos = defect_thermos        self.materials = materials        self.electronic_structures = electronic_structures        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)    def ensure_indexes(self):        """        Ensures indicies on the collections        """        # Basic search index for tasks        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        # Search index for materials        self.defect_thermos.ensure_index("material_id")    # TODO need to only process new tasks. Fast builder so currently is OK for small collections    def get_items(self) -> Iterator[List[Dict]]:        """        Gets items to process into DefectThermoDocs.        returns:            iterator yielding tuples containing:                - group of DefectDocs belonging to the same bulk material as indexed by material_id,                - materials in the chemsys of the bulk material for constructing phase diagram                - Dos of the bulk material for constructing phase diagrams/getting doping        """        self.logger.info("Defect thermo builder started")        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark build time for defect thermo documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks        all_docs = [doc for doc in self.defects.query()]        def filterfunc(x):            return bool(list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)))        for key, group in groupby(                filter(                    filterfunc,                    sorted(all_docs, key=lambda x: x['material_id'])                ), key=lambda x: x['material_id']        ):            group = [g for g in group]            try:                yield (group, self.__get_materials(key), self.__get_electronic_structure(group))            except LookupError as exception:                raise exception    def process_item(self, docs):        """        Process a group of defects belonging to the same material into a defect thermo doc        """        self.logger.info(f"Processing defects")        defects, materials, elec_struc = docs        defects = [DefectDoc(**d) for d in defects]        materials = [MaterialsDoc(**m) for m in materials]        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)        return defect_thermo_doc.dict()    def update_targets(self, items):        """        Inserts the new DefectThermoDocs into the defect_thermos store        """        items = [item for item in items if item]        for item in items:            item.update({"_bt": self.timestamp})        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defect thermo docs")            self.defect_thermos.update(                docs=jsanitize(items, allow_bson=True),                key=self.defect_thermos.key,            )        else:            self.logger.info("No items to update")    def __get_electronic_structure(self, group):        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))    def __get_materials(self, key) -> List:        """        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the        materials store.        """        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))        if not bulk:            raise LookupError(                f"The bulk material ({key}) for these defects cannot be found in the materials store"            )        elements = bulk[0]['chemsys']        if isinstance(elements, str):            elements = elements.split("-")        return list(chain(self.materials.query(criteria={"chemsys": {"$in": elements}}, properties=None), bulk))def unpack(query, d):    """    Unpack a mongo-style query into dictionary retrieval    """    if not query:        return d    if isinstance(d, List):        return unpack(query[1:], d.__getitem__(int(query.pop(0))))    return unpack(query[1:], d.__getitem__(query.pop(0)))
\ No newline at end of file
+from datetime import datetimefrom itertools import chain, groupby, combinationsfrom typing import Dict, Iterator, List, Optionalfrom copy import deepcopyfrom monty.json import MontyDecoderfrom maggma.builders import Builderfrom maggma.stores import Storefrom pymatgen.core import Structurefrom pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparatorfrom pymatgen.symmetry.analyzer import SpacegroupAnalyzerfrom atomate.utils.utils import load_classfrom emmet.core import SETTINGSfrom emmet.core.utils import jsanitize, get_sgfrom emmet.core.defect import DefectDoc, DefectThermoDocfrom emmet.core.cp2k.calc_types import TaskTypefrom emmet.core.cp2k.calc_types.utils import run_typefrom emmet.core.cp2k.material import MaterialsDocfrom emmet.builders.settings import EmmetBuildSettingsfrom emmet.builders.cp2k.utils import get_mpid__author__ = "Nicholas Winner <nwinner@berkeley.edu>"__maintainer__ = "Jason Munro"class DefectBuilder(Builder):    """    The DefectBuilder collects task documents performed on structures containing a single point defect.    The builder is intended to group tasks corresponding to the same defect (species including charge state),    find the best ones, and perform finite-size defect corrections to create a defect document. These    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.    In order to make the build process easier, an entry must exist inside of the task doc that identifies it    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,    this may be changed to having a defect transformation in the transformation history.    The process is as follows:        1.) Find all documents containing the defect query.        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already            calculated. These are the candidate bulk tasks.        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding            bulk calculation.        4.) Convert (defect, bulk task) doc pairs to DefectDocs        5.) Post-process and validate defect document        6.) Update the defect store    """    # TODO: should dielectric/electronic_structure be optional or required?    def __init__(        self,        tasks: Store,        defects: Store,        dielectric: Store,        electronic_structure: Store,        materials: Store,        task_validation: Optional[Store] = None,        query: Optional[Dict] = None,        allowed_task_types: Optional[List[str]] = None,        settings: Optional[EmmetBuildSettings] = None,        **kwargs,    ):        """        Args:            tasks: Store of task documents            defects: Store of defect documents to generate            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property            allowed_task_types: list of task_types that can be processed            symprec: tolerance for SPGLib spacegroup finding            ltol: StructureMatcher tuning parameter for matching tasks to materials            stol: StructureMatcher tuning parameter for matching tasks to materials            angle_tol: StructureMatcher tuning parameter for matching tasks to materials        """        self.tasks = tasks        self.defects = defects        self.materials = materials        self.dielectric = dielectric        self.electronic_structure = electronic_structure        self.task_validation = task_validation        self.allowed_task_types = (            [t.value for t in TaskType]            if allowed_task_types is None            else allowed_task_types        )        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}        self.settings = EmmetBuildSettings.autoload(settings)        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        sources = [tasks, dielectric, electronic_structure, materials]        if self.task_validation:            sources.append(self.task_validation)        super().__init__(sources=sources, targets=[defects], **kwargs)    @property    def defect_query(self) -> str:        """        The standard query for defect tasks. Update this if        schema changes in the future.        For example, if top level key exists 'defect' can be returned.        Alternatively, if an initial defect transformation was performed, then        you can check via 'transformations.history.0.defect'        """        return 'defect'    @property    def identifying_defect_properties(self):        return [            'charge'        ]    @property    def required_defect_properties(self) -> List:        return [            self.defect_query,            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'input',            'transformations',            'task_id',            'nsites'        ]    @property    def optional_defect_properties(self) -> List:        return [            'last_updated',            'created_on',            'tags'        ]    @property    def required_bulk_properties(self) -> List:        return [            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'output.vbm',            'output.cbm',            'input',            'transformations',        ]    def ensure_indexes(self):        """        Ensures indicies on the tasks and materials collections        """        # Basic search index for tasks        self.tasks.ensure_index("task_id")        self.tasks.ensure_index("last_updated")        self.tasks.ensure_index("state")        self.tasks.ensure_index("formula_pretty")        # Search index for materials        self.materials.ensure_index("material_id")        self.materials.ensure_index("last_updated")        self.materials.ensure_index("task_ids")        # Search index for materials        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        self.defects.ensure_index("last_updated")        self.defects.ensure_index("task_ids")        if self.task_validation:            self.task_validation.ensure_index("task_id")            self.task_validation.ensure_index("valid")    def prechunk(self, number_splits: int) -> Iterator[Dict]:        raise NotImplementedError    def get_items(self) -> Iterator[List[Dict]]:        """        Gets all items to process into defect documents.        This does no datetime checking; relying on on whether        task_ids are included in the Defect Collection.        The procedure is as follows:            1. Get all tasks with standard "defect" query tag            2. Filter all tasks by skipping tasks which are already in the Defect Store            3. Get all tasks that could be used as bulk        Returns:            generator or list relevant tasks and materials to process into materials documents        """        self.logger.info("Defect builder started")        self.logger.info(            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"        )        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark buildtime for material documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        all_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        temp_query.update({self.defect_query: {'$exists': True}})        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})        defect_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        processed_defect_tasks = {            t_id            for d in self.defects.query({}, ["task_ids"])            for t_id in d.get("task_ids", [])        }        self.logger.debug("All tasks: {}".format(len(all_tasks)))        self.logger.debug("Bulk tasks before filter: {}".format(len(all_tasks-defect_tasks)))        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))        self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks        if not unprocessed_defect_tasks:            self.logger.info("No unprocessed to tasks. Exiting")            return        elif not bulk_tasks:            self.logger.info("No compatible bulk calculations. Exiting.")            return        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")        # Set total for builder bars to have a total        self.total = len(unprocessed_defect_tasks)        if self.task_validation:            invalid_ids = {                doc[self.tasks.key]                for doc in self.task_validation.query(                    {"is_valid": False}, [self.task_validation.key]                )            }        else:            invalid_ids = set()        for t in bulk_tasks.union(unprocessed_defect_tasks):            for doc in self.tasks.query({self.tasks.key: t}):                if t in invalid_ids:                    doc["is_valid"] = False                else:                    doc["is_valid"] = True        # yield list of defects that are of the same type, matched to an appropriate bulk calc        self.logger.info(f"Starting defect matching.")        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):            yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)    def process_item(self, items):        """        Process a group of defect tasks that correspond to the same defect into a single defect        document. If the DefectDoc already exists, then update it and return it. If it does not,        create a new DefectDoc        Args:            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]        returns: the defect document as a dictionary        """        defect_doc, item_bundle = items        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")        if item_bundle:            material_id = self.__get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..            if defect_doc:                defect_doc.update_all(item_bundle, query=self.defect_query)            else:                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)            return defect_doc.dict()    def update_targets(self, items):        """        Inserts the new task_types into the task_types collection        """        items = [item for item in items if item]        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defects")            for item in items:                item.update({"_bt": self.timestamp})                self.defects.remove_docs(                    {                       "task_ids": item['task_ids'],                    }                )            self.defects.update(                docs=jsanitize(items, allow_bson=True),                key='task_ids',            )        else:            self.logger.info("No items to update")    def __filter_and_group_tasks(self, tasks):        """        Groups defect tasks. Tasks are grouped according to the reduced representation        of the defect, and so tasks with different settings (e.g. supercell size, functional)        will be grouped together.        Args:            tasks: task_ids for unprocessed defects        returns:            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect        """        props = [            self.defect_query,            'task_id',            'output.structure'        ]        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)        sm = StructureMatcher(            ltol=self.settings.LTOL, stol=self.settings.STOL,            angle_tol=self.settings.ANGLE_TOL, allow_subset=False        )        defects = [            {                'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t),                'structure': Structure.from_dict(t['output']['structure'])            }            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)        ]        for d in defects:            # TODO remove oxidation state because spins/oxidation cause errors in comparison.            #  but they shouldnt if those props are close in value            d['structure'].remove_oxidation_states()        def key(x):            s = x.get('defect').bulk_structure            return get_sg(s), s.composition.reduced_composition        def are_equal(x, y):            """            To decide if defects are equal. Either the defect objects are            equal, OR two different defect objects relaxed to the same final structure            (common with interstitials).            TODO Need a way to do the output structure comparison for a X atom defect cell            TODO which can be embedded in a Y atom defect cell up to tolerance.            """            if pdc.are_equal(x['defect'], y['defect']):                return True            # TODO This is needed for ghost vacancy unfortunately, since  sm.fit can't distinguish ghosts            if sm.fit(x['structure'], y['structure']) and \                    x['defect'].defect_composition == y['defect'].defect_composition and \                    x['defect'].charge == y['defect'].charge:                return True            return False        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))        all_groups = []        # For each pre-grouped list of structures, perform actual matching.        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):            unmatched = list(g)            while len(unmatched) > 0:                i, refs = unmatched.pop(0)                matches = [i]                inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched)))))                matches.extend([unmatched[i][0] for i in inds])                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]                all_groups.append(                    (defects[i]['defect'], [defects[i]['task_id'] for i in matches])                )        self.logger.debug(f"All groups {all_groups}")        return all_groups    def __get_defect_from_task(self, task):        """        Using the defect_query property, retrieve a pymatgen defect object from the task document        """        defect = unpack(self.defect_query.split('.'), task)        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})    def __get_defect_doc(self, defect):        """        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists        returns: DefectDoc or None        """        material_id = self.__get_mpid(defect.bulk_structure)        docs = [            DefectDoc(**doc)            for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)        ]        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True)        for doc in docs:            if pdc.are_equal(defect, doc.defect):                return doc        return None    def __get_dielectric(self, task_id):        """        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would        be the case for metallic systems, return None.        """        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))        struc = Structure.from_dict(t.get('output').get('structure'))        for diel in self.dielectric.query(criteria={self.dielectric.key: get_mpid(struc)}, properties=['dielectric.total']):            return diel['dielectric']['total']        return None    def __get_item_bundle(self, bulk_tasks, defect_task_group):        """        Gets a group of items that can be processed together into a defect document.        Args:            bulk_tasks: possible bulk tasks to match to defects            defect_task_group: group of equivalent defects (defined by PointDefectComparator)        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]        """        return [            (                next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),                next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),                self.__get_dielectric(bulk_tasks_id),            )            for defect_tasks_id, bulk_tasks_id            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)        ]    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC    def __get_mpid(self, structure):        """        Given a structure, determine if an equivalent structure exists, with a material_id,        in the materials store.        Args:            structure: Candidate structure        returns: material_id, if one exists, else None        """        sga = SpacegroupAnalyzer(structure)        mats = self.materials.query(            criteria={                'chemsys': structure.composition.chemical_system,                'symmetry.symbol': sga.get_space_group_symbol()            }, properties=['structure', 'material_id']        )        sm = StructureMatcher()        for m in mats:            if sm.fit(structure, Structure.from_dict(m['structure'])):                return m['material_id']        return None    def __match_defects_to_bulks(self, bulk_ids, defect_ids):        """        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has        commensurate:            - Composition            - Number of sites            - Symmetry        """        self.logger.debug(f"Finding bulk/defect task combinations.")        props = [            'task_id',            'input',            'nsites',            'output.structure',            'transformations',            'defect',            'scale',        ]        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))        ps = DefectBuilder.__get_pristine_supercell(defects[0])        bulks = list(            self.tasks.query(                criteria={                    'task_id': {'$in': list(bulk_ids)},                    'composition_reduced': ps.composition.reduced_composition.as_dict(),                },                properties=props            )        )        sm = StructureMatcher(            ltol=SETTINGS.LTOL,            stol=SETTINGS.STOL,            angle_tol=SETTINGS.ANGLE_TOL,            primitive_cell=False,            scale=True,            attempt_supercell=True,            allow_subset=False,            comparator=ElementComparator(),        )        def _compare(b, d):            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \                run_type(d.get('input').get('dft')).value.split('+U')[0] and \                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):                if abs(b['nsites'] - d['nsites']) <= 1:                    return True            return False        pairs = [            (defect['task_id'], bulk['task_id'])            for bulk in bulks            for defect in defects            if _compare(bulk, defect)        ]        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")        return pairs    def __preprocess_bulk(self, task):        """        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk        tasks must have:            (1) Correspond to an existing material_id in the materials store            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store        """        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure', 'mpid']))        # TODO: This is for my personal use to get around the 2D materials problem. Should not be made official        if 'ML' in t['mpid']:            self.logger.debug(f"Found monolayer for...")            mpid = t['mpid'].split('-ML')[0]        else:            struc = Structure.from_dict(t.get('output').get('structure'))            mpid = self.__get_mpid(struc)            if not mpid:                return False        self.logger.debug(f"Material ID: {mpid}")        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})        if dos.get_gap():            diel = list(self.dielectric.query(criteria={self.dielectric.key: mpid}))            if not diel:                self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "                                 f"dielectric properties, but none found in dielectric store")                return False        return True    @staticmethod    def __get_pristine_supercell(task):        """        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.        If defect cannot be found in task, return the input structure.        """        if task.get('defect'):            return load_class(                task['defect']['@module'], task['defect']['@class']            ).from_dict(task['defect']).bulk_structure        elif task.get('transformations'):            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])        return Structure.from_dict(task['input']['structure'])class DefectThermoBuilder(Builder):    """    This builder creates collections of the DefectThermoDoc object.        (1) Find all DefectDocs that correspond to the same bulk material            given by material_id        (2) Create a new DefectThermoDoc for all of those documents        (3) Insert/Update the defect_thermos store with the new documents    """    def __init__(            self,            defects: Store,            defect_thermos: Store,            materials: Store,            electronic_structures: Store,            query: Optional[Dict] = None,            **kwargs,    ):        """        Args:            defects: Store of defect documents (generated by DefectBuilder)            defect_thermos: Store of DefectThermoDocs to generate.            materials: Store of MaterialDocs to construct phase diagram            electronic_structures: Store of DOS objects            query: dictionary to limit tasks to be analyzed        """        self.defects = defects        self.defect_thermos = defect_thermos        self.materials = materials        self.electronic_structures = electronic_structures        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)    def ensure_indexes(self):        """        Ensures indicies on the collections        """        # Basic search index for tasks        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        # Search index for materials        self.defect_thermos.ensure_index("material_id")    # TODO need to only process new tasks. Fast builder so currently is OK for small collections    def get_items(self) -> Iterator[List[Dict]]:        """        Gets items to process into DefectThermoDocs.        returns:            iterator yielding tuples containing:                - group of DefectDocs belonging to the same bulk material as indexed by material_id,                - materials in the chemsys of the bulk material for constructing phase diagram                - Dos of the bulk material for constructing phase diagrams/getting doping        """        self.logger.info("Defect thermo builder started")        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark build time for defect thermo documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks        all_docs = [doc for doc in self.defects.query()]        def filterfunc(x):            return bool(list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)))        for key, group in groupby(                filter(                    filterfunc,                    sorted(all_docs, key=lambda x: x['material_id'])                ), key=lambda x: x['material_id']        ):            group = [g for g in group]            try:                yield (group, self.__get_materials(key), self.__get_electronic_structure(group))            except LookupError as exception:                raise exception    def process_item(self, docs):        """        Process a group of defects belonging to the same material into a defect thermo doc        """        self.logger.info(f"Processing defects")        defects, materials, elec_struc = docs        defects = [DefectDoc(**d) for d in defects]        materials = [MaterialsDoc(**m) for m in materials]        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)        return defect_thermo_doc.dict()    def update_targets(self, items):        """        Inserts the new DefectThermoDocs into the defect_thermos store        """        items = [item for item in items if item]        for item in items:            item.update({"_bt": self.timestamp})        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defect thermo docs")            self.defect_thermos.update(                docs=jsanitize(items, allow_bson=True),                key=self.defect_thermos.key,            )        else:            self.logger.info("No items to update")    def __get_electronic_structure(self, group):        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))    def __get_materials(self, key) -> List:        """        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the        materials store.        """        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))        if not bulk:            raise LookupError(                f"The bulk material ({key}) for these defects cannot be found in the materials store"            )        elements = bulk[0]['chemsys']        if isinstance(elements, str):            elements = elements.split("-")        return list(chain(self.materials.query(criteria={"chemsys": {"$in": elements}}, properties=None), bulk))def unpack(query, d):    """    Unpack a mongo-style query into dictionary retrieval    """    if not query:        return d    if isinstance(d, List):        return unpack(query[1:], d.__getitem__(int(query.pop(0))))    return unpack(query[1:], d.__getitem__(query.pop(0)))
\ No newline at end of file

From 6a3a73b8da235ea20daac577589cd87b39de9d45 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 7 Mar 2022 10:59:42 -0800
Subject: [PATCH 32/41] Updates

Updates to emmet as I work on integrating 2d defects. Fresysoldt sxdefectalign2d code cannot run on MacOS so I'll have to bounce back and force with the cluster to debug some of it.
---
 emmet-builders/emmet/builders/cp2k/defects.py | 794 +++++++++++++++++-
 .../emmet/builders/cp2k/task_validator.py     |  74 ++
 emmet-builders/emmet/builders/cp2k/utils.py   |  40 +-
 .../emmet/core/cp2k/calc_types/run_types.yaml |   4 +-
 .../emmet/core/cp2k/calc_types/utils.py       |   2 +
 emmet-core/emmet/core/defect.py               | 125 ++-
 6 files changed, 985 insertions(+), 54 deletions(-)
 create mode 100644 emmet-builders/emmet/builders/cp2k/task_validator.py

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 7c5c81d852..7344248519 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -1 +1,793 @@
-from datetime import datetimefrom itertools import chain, groupby, combinationsfrom typing import Dict, Iterator, List, Optionalfrom copy import deepcopyfrom monty.json import MontyDecoderfrom maggma.builders import Builderfrom maggma.stores import Storefrom pymatgen.core import Structurefrom pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparatorfrom pymatgen.symmetry.analyzer import SpacegroupAnalyzerfrom atomate.utils.utils import load_classfrom emmet.core import SETTINGSfrom emmet.core.utils import jsanitize, get_sgfrom emmet.core.defect import DefectDoc, DefectThermoDocfrom emmet.core.cp2k.calc_types import TaskTypefrom emmet.core.cp2k.calc_types.utils import run_typefrom emmet.core.cp2k.material import MaterialsDocfrom emmet.builders.settings import EmmetBuildSettingsfrom emmet.builders.cp2k.utils import get_mpid__author__ = "Nicholas Winner <nwinner@berkeley.edu>"__maintainer__ = "Jason Munro"class DefectBuilder(Builder):    """    The DefectBuilder collects task documents performed on structures containing a single point defect.    The builder is intended to group tasks corresponding to the same defect (species including charge state),    find the best ones, and perform finite-size defect corrections to create a defect document. These    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.    In order to make the build process easier, an entry must exist inside of the task doc that identifies it    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,    this may be changed to having a defect transformation in the transformation history.    The process is as follows:        1.) Find all documents containing the defect query.        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already            calculated. These are the candidate bulk tasks.        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding            bulk calculation.        4.) Convert (defect, bulk task) doc pairs to DefectDocs        5.) Post-process and validate defect document        6.) Update the defect store    """    # TODO: should dielectric/electronic_structure be optional or required?    def __init__(        self,        tasks: Store,        defects: Store,        dielectric: Store,        electronic_structure: Store,        materials: Store,        task_validation: Optional[Store] = None,        query: Optional[Dict] = None,        allowed_task_types: Optional[List[str]] = None,        settings: Optional[EmmetBuildSettings] = None,        **kwargs,    ):        """        Args:            tasks: Store of task documents            defects: Store of defect documents to generate            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property            allowed_task_types: list of task_types that can be processed            symprec: tolerance for SPGLib spacegroup finding            ltol: StructureMatcher tuning parameter for matching tasks to materials            stol: StructureMatcher tuning parameter for matching tasks to materials            angle_tol: StructureMatcher tuning parameter for matching tasks to materials        """        self.tasks = tasks        self.defects = defects        self.materials = materials        self.dielectric = dielectric        self.electronic_structure = electronic_structure        self.task_validation = task_validation        self.allowed_task_types = (            [t.value for t in TaskType]            if allowed_task_types is None            else allowed_task_types        )        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}        self.settings = EmmetBuildSettings.autoload(settings)        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        sources = [tasks, dielectric, electronic_structure, materials]        if self.task_validation:            sources.append(self.task_validation)        super().__init__(sources=sources, targets=[defects], **kwargs)    @property    def defect_query(self) -> str:        """        The standard query for defect tasks. Update this if        schema changes in the future.        For example, if top level key exists 'defect' can be returned.        Alternatively, if an initial defect transformation was performed, then        you can check via 'transformations.history.0.defect'        """        return 'defect'    @property    def identifying_defect_properties(self):        return [            'charge'        ]    @property    def required_defect_properties(self) -> List:        return [            self.defect_query,            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'input',            'transformations',            'task_id',            'nsites'        ]    @property    def optional_defect_properties(self) -> List:        return [            'last_updated',            'created_on',            'tags'        ]    @property    def required_bulk_properties(self) -> List:        return [            'output.energy',            'output.v_hartree_grid',            'output.v_hartree',            'output.structure',            'output.vbm',            'output.cbm',            'input',            'transformations',        ]    def ensure_indexes(self):        """        Ensures indicies on the tasks and materials collections        """        # Basic search index for tasks        self.tasks.ensure_index("task_id")        self.tasks.ensure_index("last_updated")        self.tasks.ensure_index("state")        self.tasks.ensure_index("formula_pretty")        # Search index for materials        self.materials.ensure_index("material_id")        self.materials.ensure_index("last_updated")        self.materials.ensure_index("task_ids")        # Search index for materials        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        self.defects.ensure_index("last_updated")        self.defects.ensure_index("task_ids")        if self.task_validation:            self.task_validation.ensure_index("task_id")            self.task_validation.ensure_index("valid")    def prechunk(self, number_splits: int) -> Iterator[Dict]:        raise NotImplementedError    def get_items(self) -> Iterator[List[Dict]]:        """        Gets all items to process into defect documents.        This does no datetime checking; relying on on whether        task_ids are included in the Defect Collection.        The procedure is as follows:            1. Get all tasks with standard "defect" query tag            2. Filter all tasks by skipping tasks which are already in the Defect Store            3. Get all tasks that could be used as bulk        Returns:            generator or list relevant tasks and materials to process into materials documents        """        self.logger.info("Defect builder started")        self.logger.info(            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"        )        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark buildtime for material documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        all_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        temp_query.update({self.defect_query: {'$exists': True}})        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})        defect_tasks = {            doc[self.tasks.key]            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])        }        processed_defect_tasks = {            t_id            for d in self.defects.query({}, ["task_ids"])            for t_id in d.get("task_ids", [])        }        self.logger.debug("All tasks: {}".format(len(all_tasks)))        self.logger.debug("Bulk tasks before filter: {}".format(len(all_tasks-defect_tasks)))        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))        self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks        if not unprocessed_defect_tasks:            self.logger.info("No unprocessed to tasks. Exiting")            return        elif not bulk_tasks:            self.logger.info("No compatible bulk calculations. Exiting.")            return        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")        # Set total for builder bars to have a total        self.total = len(unprocessed_defect_tasks)        if self.task_validation:            invalid_ids = {                doc[self.tasks.key]                for doc in self.task_validation.query(                    {"is_valid": False}, [self.task_validation.key]                )            }        else:            invalid_ids = set()        for t in bulk_tasks.union(unprocessed_defect_tasks):            for doc in self.tasks.query({self.tasks.key: t}):                if t in invalid_ids:                    doc["is_valid"] = False                else:                    doc["is_valid"] = True        # yield list of defects that are of the same type, matched to an appropriate bulk calc        self.logger.info(f"Starting defect matching.")        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):            yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)    def process_item(self, items):        """        Process a group of defect tasks that correspond to the same defect into a single defect        document. If the DefectDoc already exists, then update it and return it. If it does not,        create a new DefectDoc        Args:            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]        returns: the defect document as a dictionary        """        defect_doc, item_bundle = items        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")        if item_bundle:            material_id = self.__get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..            if defect_doc:                defect_doc.update_all(item_bundle, query=self.defect_query)            else:                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)            return defect_doc.dict()    def update_targets(self, items):        """        Inserts the new task_types into the task_types collection        """        items = [item for item in items if item]        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defects")            for item in items:                item.update({"_bt": self.timestamp})                self.defects.remove_docs(                    {                       "task_ids": item['task_ids'],                    }                )            self.defects.update(                docs=jsanitize(items, allow_bson=True),                key='task_ids',            )        else:            self.logger.info("No items to update")    def __filter_and_group_tasks(self, tasks):        """        Groups defect tasks. Tasks are grouped according to the reduced representation        of the defect, and so tasks with different settings (e.g. supercell size, functional)        will be grouped together.        Args:            tasks: task_ids for unprocessed defects        returns:            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect        """        props = [            self.defect_query,            'task_id',            'output.structure'        ]        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)        sm = StructureMatcher(            ltol=self.settings.LTOL, stol=self.settings.STOL,            angle_tol=self.settings.ANGLE_TOL, allow_subset=False        )        defects = [            {                'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t),                'structure': Structure.from_dict(t['output']['structure'])            }            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)        ]        for d in defects:            # TODO remove oxidation state because spins/oxidation cause errors in comparison.            #  but they shouldnt if those props are close in value            d['structure'].remove_oxidation_states()        def key(x):            s = x.get('defect').bulk_structure            return get_sg(s), s.composition.reduced_composition        def are_equal(x, y):            """            To decide if defects are equal. Either the defect objects are            equal, OR two different defect objects relaxed to the same final structure            (common with interstitials).            TODO Need a way to do the output structure comparison for a X atom defect cell            TODO which can be embedded in a Y atom defect cell up to tolerance.            """            if pdc.are_equal(x['defect'], y['defect']):                return True            # TODO This is needed for ghost vacancy unfortunately, since  sm.fit can't distinguish ghosts            if sm.fit(x['structure'], y['structure']) and \                    x['defect'].defect_composition == y['defect'].defect_composition and \                    x['defect'].charge == y['defect'].charge:                return True            return False        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))        all_groups = []        # For each pre-grouped list of structures, perform actual matching.        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):            unmatched = list(g)            while len(unmatched) > 0:                i, refs = unmatched.pop(0)                matches = [i]                inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched)))))                matches.extend([unmatched[i][0] for i in inds])                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]                all_groups.append(                    (defects[i]['defect'], [defects[i]['task_id'] for i in matches])                )        self.logger.debug(f"All groups {all_groups}")        return all_groups    def __get_defect_from_task(self, task):        """        Using the defect_query property, retrieve a pymatgen defect object from the task document        """        defect = unpack(self.defect_query.split('.'), task)        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})    def __get_defect_doc(self, defect):        """        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists        returns: DefectDoc or None        """        material_id = self.__get_mpid(defect.bulk_structure)        docs = [            DefectDoc(**doc)            for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)        ]        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True)        for doc in docs:            if pdc.are_equal(defect, doc.defect):                return doc        return None    def __get_dielectric(self, task_id):        """        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would        be the case for metallic systems, return None.        """        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))        struc = Structure.from_dict(t.get('output').get('structure'))        for diel in self.dielectric.query(criteria={self.dielectric.key: get_mpid(struc)}, properties=['dielectric.total']):            return diel['dielectric']['total']        return None    def __get_item_bundle(self, bulk_tasks, defect_task_group):        """        Gets a group of items that can be processed together into a defect document.        Args:            bulk_tasks: possible bulk tasks to match to defects            defect_task_group: group of equivalent defects (defined by PointDefectComparator)        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]        """        return [            (                next(self.tasks.query({'task_id': defect_tasks_id}, properties=None)),                next(self.tasks.query({'task_id': bulk_tasks_id}, properties=None)),                self.__get_dielectric(bulk_tasks_id),            )            for defect_tasks_id, bulk_tasks_id            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)        ]    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC    def __get_mpid(self, structure):        """        Given a structure, determine if an equivalent structure exists, with a material_id,        in the materials store.        Args:            structure: Candidate structure        returns: material_id, if one exists, else None        """        sga = SpacegroupAnalyzer(structure)        mats = self.materials.query(            criteria={                'chemsys': structure.composition.chemical_system,                'symmetry.symbol': sga.get_space_group_symbol()            }, properties=['structure', 'material_id']        )        sm = StructureMatcher()        for m in mats:            if sm.fit(structure, Structure.from_dict(m['structure'])):                return m['material_id']        return None    def __match_defects_to_bulks(self, bulk_ids, defect_ids):        """        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has        commensurate:            - Composition            - Number of sites            - Symmetry        """        self.logger.debug(f"Finding bulk/defect task combinations.")        props = [            'task_id',            'input',            'nsites',            'output.structure',            'transformations',            'defect',            'scale',        ]        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))        ps = DefectBuilder.__get_pristine_supercell(defects[0])        bulks = list(            self.tasks.query(                criteria={                    'task_id': {'$in': list(bulk_ids)},                    'composition_reduced': ps.composition.reduced_composition.as_dict(),                },                properties=props            )        )        sm = StructureMatcher(            ltol=SETTINGS.LTOL,            stol=SETTINGS.STOL,            angle_tol=SETTINGS.ANGLE_TOL,            primitive_cell=False,            scale=True,            attempt_supercell=True,            allow_subset=False,            comparator=ElementComparator(),        )        def _compare(b, d):            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \                run_type(d.get('input').get('dft')).value.split('+U')[0] and \                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):                if abs(b['nsites'] - d['nsites']) <= 1:                    return True            return False        pairs = [            (defect['task_id'], bulk['task_id'])            for bulk in bulks            for defect in defects            if _compare(bulk, defect)        ]        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")        return pairs    def __preprocess_bulk(self, task):        """        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk        tasks must have:            (1) Correspond to an existing material_id in the materials store            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store        """        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure', 'mpid']))        # TODO: This is for my personal use to get around the 2D materials problem. Should not be made official        if 'ML' in t['mpid']:            self.logger.debug(f"Found monolayer for...")            mpid = t['mpid'].split('-ML')[0]        else:            struc = Structure.from_dict(t.get('output').get('structure'))            mpid = self.__get_mpid(struc)            if not mpid:                return False        self.logger.debug(f"Material ID: {mpid}")        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})        if dos.get_gap():            diel = list(self.dielectric.query(criteria={self.dielectric.key: mpid}))            if not diel:                self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "                                 f"dielectric properties, but none found in dielectric store")                return False        return True    @staticmethod    def __get_pristine_supercell(task):        """        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.        If defect cannot be found in task, return the input structure.        """        if task.get('defect'):            return load_class(                task['defect']['@module'], task['defect']['@class']            ).from_dict(task['defect']).bulk_structure        elif task.get('transformations'):            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])        return Structure.from_dict(task['input']['structure'])class DefectThermoBuilder(Builder):    """    This builder creates collections of the DefectThermoDoc object.        (1) Find all DefectDocs that correspond to the same bulk material            given by material_id        (2) Create a new DefectThermoDoc for all of those documents        (3) Insert/Update the defect_thermos store with the new documents    """    def __init__(            self,            defects: Store,            defect_thermos: Store,            materials: Store,            electronic_structures: Store,            query: Optional[Dict] = None,            **kwargs,    ):        """        Args:            defects: Store of defect documents (generated by DefectBuilder)            defect_thermos: Store of DefectThermoDocs to generate.            materials: Store of MaterialDocs to construct phase diagram            electronic_structures: Store of DOS objects            query: dictionary to limit tasks to be analyzed        """        self.defects = defects        self.defect_thermos = defect_thermos        self.materials = materials        self.electronic_structures = electronic_structures        self.query = query if query else {}        self.timestamp = None        self.kwargs = kwargs        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)    def ensure_indexes(self):        """        Ensures indicies on the collections        """        # Basic search index for tasks        self.defects.ensure_index("material_id")        self.defects.ensure_index("defect_id")        # Search index for materials        self.defect_thermos.ensure_index("material_id")    # TODO need to only process new tasks. Fast builder so currently is OK for small collections    def get_items(self) -> Iterator[List[Dict]]:        """        Gets items to process into DefectThermoDocs.        returns:            iterator yielding tuples containing:                - group of DefectDocs belonging to the same bulk material as indexed by material_id,                - materials in the chemsys of the bulk material for constructing phase diagram                - Dos of the bulk material for constructing phase diagrams/getting doping        """        self.logger.info("Defect thermo builder started")        self.logger.info("Setting indexes")        self.ensure_indexes()        # Save timestamp to mark build time for defect thermo documents        self.timestamp = datetime.utcnow()        # Get all tasks        self.logger.info("Finding tasks to process")        temp_query = dict(self.query)        temp_query["state"] = "successful"        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks        all_docs = [doc for doc in self.defects.query()]        def filterfunc(x):            return bool(list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)))        for key, group in groupby(                filter(                    filterfunc,                    sorted(all_docs, key=lambda x: x['material_id'])                ), key=lambda x: x['material_id']        ):            group = [g for g in group]            try:                yield (group, self.__get_materials(key), self.__get_electronic_structure(group))            except LookupError as exception:                raise exception    def process_item(self, docs):        """        Process a group of defects belonging to the same material into a defect thermo doc        """        self.logger.info(f"Processing defects")        defects, materials, elec_struc = docs        defects = [DefectDoc(**d) for d in defects]        materials = [MaterialsDoc(**m) for m in materials]        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)        return defect_thermo_doc.dict()    def update_targets(self, items):        """        Inserts the new DefectThermoDocs into the defect_thermos store        """        items = [item for item in items if item]        for item in items:            item.update({"_bt": self.timestamp})        if len(items) > 0:            self.logger.info(f"Updating {len(items)} defect thermo docs")            self.defect_thermos.update(                docs=jsanitize(items, allow_bson=True),                key=self.defect_thermos.key,            )        else:            self.logger.info("No items to update")    def __get_electronic_structure(self, group):        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))    def __get_materials(self, key) -> List:        """        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the        materials store.        """        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))        if not bulk:            raise LookupError(                f"The bulk material ({key}) for these defects cannot be found in the materials store"            )        elements = bulk[0]['chemsys']        if isinstance(elements, str):            elements = elements.split("-")        return list(chain(self.materials.query(criteria={"chemsys": {"$in": elements}}, properties=None), bulk))def unpack(query, d):    """    Unpack a mongo-style query into dictionary retrieval    """    if not query:        return d    if isinstance(d, List):        return unpack(query[1:], d.__getitem__(int(query.pop(0))))    return unpack(query[1:], d.__getitem__(query.pop(0)))
\ No newline at end of file
+from datetime import datetime
+from itertools import chain, groupby, combinations
+from typing import Dict, Iterator, List, Optional
+from copy import deepcopy
+from monty.json import MontyDecoder
+
+from maggma.builders import Builder
+from maggma.stores import Store
+
+from pymatgen.core import Structure
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+
+from atomate.utils.utils import load_class
+
+from emmet.core import SETTINGS
+from emmet.core.utils import jsanitize, get_sg
+from emmet.core.defect import DefectDoc, DefectDoc2d, DefectThermoDoc
+from emmet.core.cp2k.calc_types import TaskType
+from emmet.core.cp2k.calc_types.utils import run_type
+from emmet.core.cp2k.material import MaterialsDoc
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.builders.cp2k.utils import get_mpid, synchronous_query
+
+from line_profiler import *
+
+__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
+__maintainer__ = "Jason Munro"
+
+
+class DefectBuilder(Builder):
+    """
+    The DefectBuilder collects task documents performed on structures containing a single point defect.
+    The builder is intended to group tasks corresponding to the same defect (species including charge state),
+    find the best ones, and perform finite-size defect corrections to create a defect document. These
+    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.
+
+    In order to make the build process easier, an entry must exist inside of the task doc that identifies it
+    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,
+    this may be changed to having a defect transformation in the transformation history.
+
+    The process is as follows:
+
+        1.) Find all documents containing the defect query.
+        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already
+            calculated. These are the candidate bulk tasks.
+        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites
+            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding
+            bulk calculation.
+        4.) Convert (defect, bulk task) doc pairs to DefectDocs
+        5.) Post-process and validate defect document
+        6.) Update the defect store
+    """
+
+    # TODO: should dielectric/electronic_structure be optional or required?
+    def __init__(
+        self,
+        tasks: Store,
+        defects: Store,
+        dielectric: Store,
+        electronic_structure: Store,
+        materials: Store,
+        electrostatic_potentials: Store,
+        task_validation: Optional[Store] = None,
+        query: Optional[Dict] = None,
+        allowed_task_types: Optional[List[str]] = None,
+        settings: Optional[EmmetBuildSettings] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            tasks: Store of task documents
+            defects: Store of defect documents to generate
+            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property
+            allowed_task_types: list of task_types that can be processed
+            symprec: tolerance for SPGLib spacegroup finding
+            ltol: StructureMatcher tuning parameter for matching tasks to materials
+            stol: StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol: StructureMatcher tuning parameter for matching tasks to materials
+        """
+
+        self.tasks = tasks
+        self.defects = defects
+        self.materials = materials
+        self.dielectric = dielectric
+        self.electronic_structure = electronic_structure
+        self.electrostatic_potentials = electrostatic_potentials
+
+        self.task_validation = task_validation
+        self.allowed_task_types = (
+            [t.value for t in TaskType]
+            if allowed_task_types is None
+            else allowed_task_types
+        )
+
+        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
+        self.settings = EmmetBuildSettings.autoload(settings)
+        self.query = query if query else {}
+        self.timestamp = None
+        self.kwargs = kwargs
+
+        sources = [tasks, dielectric, electronic_structure, materials, electrostatic_potentials]
+        if self.task_validation:
+            sources.append(self.task_validation)
+        super().__init__(sources=sources, targets=[defects], **kwargs)
+
+    @property
+    def defect_query(self) -> str:
+        """
+        The standard query for defect tasks. Update this if
+        schema changes in the future.
+
+        For example, if top level key exists 'defect' can be returned.
+        Alternatively, if an initial defect transformation was performed, then
+        you can check via 'transformations.history.0.defect'
+        """
+        return 'transformations.history.0.defect'
+
+    @property
+    def identifying_defect_properties(self):
+        return [
+            'charge'
+        ]
+
+    @property
+    def required_defect_properties(self) -> List:
+        return [
+            self.defect_query,
+            'output.energy',
+            'output.structure',
+            'input',
+            'transformations',
+            'task_id',
+            'nsites'
+        ]
+
+    @property
+    def optional_defect_properties(self) -> List:
+        return [
+            'last_updated',
+            'created_on',
+            'tags'
+        ]
+
+    @property
+    def required_bulk_properties(self) -> List:
+        return [
+            'output.energy',
+            'output.structure',
+            'output.vbm',
+            'output.cbm',
+            'input',
+            'transformations',
+        ]
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        # Basic search index for tasks
+        self.tasks.ensure_index("task_id")
+        self.tasks.ensure_index("last_updated")
+        self.tasks.ensure_index("state")
+        self.tasks.ensure_index("formula_pretty")
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("last_updated")
+        self.materials.ensure_index("task_ids")
+
+        # Search index for materials
+        self.defects.ensure_index("material_id")
+        self.defects.ensure_index("defect_id")
+        self.defects.ensure_index("last_updated")
+        self.defects.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
+    def prechunk(self, number_splits: int) -> Iterator[Dict]:
+        raise NotImplementedError
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets all items to process into defect documents.
+        This does no datetime checking; relying on on whether
+        task_ids are included in the Defect Collection.
+
+        The procedure is as follows:
+
+            1. Get all tasks with standard "defect" query tag
+            2. Filter all tasks by skipping tasks which are already in the Defect Store
+            3. Get all tasks that could be used as bulk
+
+
+        Returns:
+            generator or list relevant tasks and materials to process into materials documents
+        """
+
+        self.logger.info("Defect builder started")
+        self.logger.info(
+            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"
+        )
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark buildtime for material documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all tasks
+        self.logger.info("Finding tasks to process")
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+
+        all_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        temp_query = {
+            "$and": [
+                temp_query, 
+                {self.defect_query: {'$exists': True}}, 
+                {d: {'$exists': True} for d in self.required_defect_properties}
+                ]
+            }
+        defect_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        processed_defect_tasks = {
+            t_id
+            for d in self.defects.query({}, ["task_ids"])
+            for t_id in d.get("task_ids", [])
+        }
+
+        self.logger.debug("All tasks: {}".format(len(all_tasks)))
+        self.logger.debug("Bulk tasks before filter: {}".format(len(all_tasks-defect_tasks)))
+        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))
+        self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))
+        self.logger.debug("All defect tasks: {}".format(len(defect_tasks)))
+        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
+
+        if not unprocessed_defect_tasks:
+            self.logger.info("No unprocessed to tasks. Exiting")
+            return
+        elif not bulk_tasks:
+            self.logger.info("No compatible bulk calculations. Exiting.")
+            return
+
+        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")
+        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")
+
+        # Set total for builder bars to have a total
+        self.total = len(unprocessed_defect_tasks)
+
+        if self.task_validation:
+            invalid_ids = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {"is_valid": False}, [self.task_validation.key]
+                )
+            }
+            for t in bulk_tasks.union(unprocessed_defect_tasks):
+                for doc in self.tasks.query({self.tasks.key: t}):
+                    if t in invalid_ids:
+                        doc["is_valid"] = False
+                    else:
+                        doc["is_valid"] = True
+
+        # yield list of defects that are of the same type, matched to an appropriate bulk calc
+        self.logger.info(f"Starting defect matching.")
+
+        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):
+            yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)
+
+    def process_item(self, items):
+        """
+        Process a group of defect tasks that correspond to the same defect into a single defect
+        document. If the DefectDoc already exists, then update it and return it. If it does not,
+        create a new DefectDoc
+
+        Args:
+            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]
+
+        returns: the defect document as a dictionary
+        """
+        defect_doc, item_bundle = items
+        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
+        if item_bundle:
+            material_id = self._get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..
+            print("THE MATRIAL ID I FOUND IS ", material_id)
+            if defect_doc:
+                defect_doc.update_all(item_bundle, query=self.defect_query)
+            else:
+                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)
+            return defect_doc.dict()
+
+    def update_targets(self, items):
+        """
+        Inserts the new task_types into the task_types collection
+        """
+
+        items = [item for item in items if item]
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defects")
+            for item in items:
+                item.update({"_bt": self.timestamp})
+                self.defects.remove_docs(
+                    {
+                       "task_ids": item['task_ids'],
+                    }
+                )
+            self.defects.update(
+                docs=jsanitize(items, allow_bson=True),
+                key='task_ids',
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def __filter_and_group_tasks(self, tasks):
+        """
+        Groups defect tasks. Tasks are grouped according to the reduced representation
+        of the defect, and so tasks with different settings (e.g. supercell size, functional)
+        will be grouped together.
+
+        Args:
+            tasks: task_ids for unprocessed defects
+
+        returns:
+            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect
+        """
+
+        props = [
+            #self.defect_query,
+            'transformations',
+            'task_id',
+            'output.structure'
+        ]
+
+        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")
+
+        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)
+        sm = StructureMatcher(
+            ltol=self.settings.LTOL, stol=self.settings.STOL,
+            angle_tol=self.settings.ANGLE_TOL, allow_subset=False
+        )
+        defects = [
+            {
+                'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t),
+                'structure': Structure.from_dict(t['output']['structure'])
+            }
+            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)
+        ]
+        for d in defects:
+            # TODO remove oxidation state because spins/oxidation cause errors in comparison.
+            #  but they shouldnt if those props are close in value
+            d['structure'].remove_oxidation_states()
+
+        def key(x):
+            s = x.get('defect').bulk_structure
+            return get_sg(s), s.composition.reduced_composition
+
+        def are_equal(x, y):
+            """
+            To decide if defects are equal. Either the defect objects are
+            equal, OR two different defect objects relaxed to the same final structure
+            (common with interstitials).
+
+            TODO Need a way to do the output structure comparison for a X atom defect cell
+            TODO which can be embedded in a Y atom defect cell up to tolerance.
+            """
+            if pdc.are_equal(x['defect'], y['defect']):
+                return True
+
+            # TODO This is needed for ghost vacancy unfortunately, since  sm.fit can't distinguish ghosts
+            if x['defect'].defect_composition == y['defect'].defect_composition and \
+                    x['defect'].charge == y['defect'].charge and \
+                    sm.fit(x['structure'], y['structure']):
+                return True
+            return False
+
+        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))
+        all_groups = []
+
+        # For each pre-grouped list of structures, perform actual matching.
+        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):
+            unmatched = list(g)
+            while len(unmatched) > 0:
+                i, refs = unmatched.pop(0)
+                matches = [i]
+                inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched)))))
+                matches.extend([unmatched[i][0] for i in inds])
+                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]
+                all_groups.append(
+                    (defects[i]['defect'], [defects[i]['task_id'] for i in matches])
+                )
+
+        self.logger.debug(f"All groups {all_groups}")
+        return all_groups
+
+    def __get_defect_from_task(self, task):
+        """
+        Using the defect_query property, retrieve a pymatgen defect object from the task document
+        """
+        defect = unpack(self.defect_query.split('.'), task)
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
+    def __get_defect_doc(self, defect):
+        """
+        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists
+
+        returns: DefectDoc or None
+        """
+        material_id = self._get_mpid(defect.bulk_structure)
+        docs = [
+            DefectDoc(**doc)
+            for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)
+        ]
+        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True)
+        for doc in docs:
+            if pdc.are_equal(defect, doc.defect):
+                return doc
+        return None
+
+    def __get_dielectric(self, task_id):
+        """
+        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store
+        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would
+        be the case for metallic systems, return None.
+        """
+        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))
+        struc = Structure.from_dict(t.get('output').get('structure'))
+        self.logger.debug("Finding dielectric for task_id {} for MPID {}".format(task_id, self._get_mpid(struc)))
+        for diel in self.dielectric.query(criteria={self.dielectric.key: self._get_mpid(struc)}, properties=['dielectric.total']):
+            return diel['dielectric']['total']
+        return None
+
+    def __get_item_bundle(self, bulk_tasks, defect_task_group):
+        """
+        Gets a group of items that can be processed together into a defect document.
+
+        Args:
+            bulk_tasks: possible bulk tasks to match to defects
+            defect_task_group: group of equivalent defects (defined by PointDefectComparator)
+
+        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]
+        """
+        return [
+            (
+                next(synchronous_query(self.tasks, self.electrostatic_potentials, query={'task_id': defect_tasks_id}, properties=None)),
+                next(synchronous_query(self.tasks, self.electrostatic_potentials, query={'task_id': bulk_tasks_id}, properties=None)),
+                self.__get_dielectric(bulk_tasks_id),
+            )
+            for defect_tasks_id, bulk_tasks_id
+            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
+        ]
+
+    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC
+    def _get_mpid(self, structure):
+        """
+        Given a structure, determine if an equivalent structure exists, with a material_id,
+        in the materials store.
+
+        Args:
+            structure: Candidate structure
+
+        returns: material_id, if one exists, else None
+        """
+        sga = SpacegroupAnalyzer(structure)
+        mats = self.materials.query(
+            criteria={
+                'chemsys': structure.composition.chemical_system,
+                'symmetry.symbol': sga.get_space_group_symbol()
+            }, properties=['structure', 'material_id']
+        )
+        sm = StructureMatcher()
+        for m in mats:
+            if sm.fit(structure, Structure.from_dict(m['structure'])):
+                return m['material_id']
+        return None
+
+    def __match_defects_to_bulks(self, bulk_ids, defect_ids):
+        """
+        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has
+        commensurate:
+
+            - Composition
+            - Number of sites
+            - Symmetry
+
+        """
+
+        self.logger.debug(f"Finding bulk/defect task combinations.")
+        self.logger.debug(f"Bulk tasks: {bulk_ids}")
+        self.logger.debug(f"Defect tasks: {defect_ids}")
+
+        props = [
+            'task_id',
+            'input',
+            'nsites',
+            'output.structure',
+            'transformations',
+            'defect',
+            'scale',
+        ]
+        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))
+        ps = DefectBuilder.__get_pristine_supercell(defects[0])
+        bulks = list(
+            self.tasks.query(
+                criteria={
+                    'task_id': {'$in': list(bulk_ids)},
+                    'composition_reduced': ps.composition.reduced_composition.as_dict(),
+                },
+                properties=props
+            )
+        )
+
+        sm = StructureMatcher(
+            ltol=SETTINGS.LTOL,
+            stol=SETTINGS.STOL,
+            angle_tol=SETTINGS.ANGLE_TOL,
+            primitive_cell=False,
+            scale=True,
+            attempt_supercell=True,
+            allow_subset=False,
+            comparator=ElementComparator(),
+        )
+
+        # TODO I think the secondary (nsites) comparison might have some edge case issues
+        def _compare(b, d):
+            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \
+                run_type(d.get('input').get('dft')).value.split('+U')[0] and \
+                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):
+                if abs(b['nsites'] - d['nsites']) <= 1:
+                    return True
+            return False
+
+        pairs = []
+        for defect in defects:
+            for bulk in bulks:
+                if _compare(bulk, defect):
+                    pairs.append((defect['task_id'], bulk['task_id']))
+                    break
+
+        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
+        return pairs
+
+    def __preprocess_bulk(self, task):
+        """
+        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk
+        tasks must have:
+
+            (1) Correspond to an existing material_id in the materials store
+            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store
+
+        """
+        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure', 'mpid']))
+
+        # TODO: This is for my personal use to get around the 2D materials problem. Should not be made official
+
+        if 'ML' in t['mpid']:
+            self.logger.debug(f"Found monolayer for...")
+            mpid = t['mpid'].split('-ML')[0]
+        else:
+            struc = Structure.from_dict(t.get('output').get('structure'))
+            mpid = self._get_mpid(struc)
+            if not mpid:
+                return False
+        self.logger.debug(f"Material ID: {mpid}")
+
+        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))
+        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})
+        if dos.get_gap():
+            diel = list(self.dielectric.query(criteria={self.dielectric.key: mpid}))
+            if not diel:
+                self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "
+                                 f"dielectric properties, but none found in dielectric store")
+                return False
+
+        return True
+
+    @staticmethod
+    def __get_pristine_supercell(task):
+        """
+        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.
+        If defect cannot be found in task, return the input structure.
+        """
+        if task.get('defect'):
+            return load_class(
+                task['defect']['@module'], task['defect']['@class']
+            ).from_dict(task['defect']).bulk_structure
+        elif task.get('transformations'):
+            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])
+        return Structure.from_dict(task['input']['structure'])
+
+# TODO This needs to be unified into one builder somehow
+class DefectBuilder2d(DefectBuilder):
+
+    def process_item(self, items):
+        """
+        Process a group of defect tasks that correspond to the same defect into a single defect
+        document. If the DefectDoc already exists, then update it and return it. If it does not,
+        create a new DefectDoc
+
+        Args:
+            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]
+
+        returns: the defect document as a dictionary
+        """
+        defect_doc, item_bundle = items
+        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
+
+        if item_bundle:
+            material_id = self._get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..
+            if defect_doc:
+                defect_doc.update_all(item_bundle, query=self.defect_query)
+            else:
+                defect_doc = DefectDoc2d.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)
+            return defect_doc.dict()
+
+class DefectThermoBuilder(Builder):
+
+    """
+    This builder creates collections of the DefectThermoDoc object.
+
+        (1) Find all DefectDocs that correspond to the same bulk material
+            given by material_id
+        (2) Create a new DefectThermoDoc for all of those documents
+        (3) Insert/Update the defect_thermos store with the new documents
+    """
+
+    def __init__(
+            self,
+            defects: Store,
+            defect_thermos: Store,
+            materials: Store,
+            electronic_structures: Store,
+            query: Optional[Dict] = None,
+            **kwargs,
+    ):
+        """
+        Args:
+            defects: Store of defect documents (generated by DefectBuilder)
+            defect_thermos: Store of DefectThermoDocs to generate.
+            materials: Store of MaterialDocs to construct phase diagram
+            electronic_structures: Store of DOS objects
+            query: dictionary to limit tasks to be analyzed
+        """
+
+        self.defects = defects
+        self.defect_thermos = defect_thermos
+        self.materials = materials
+        self.electronic_structures = electronic_structures
+
+        self.query = query if query else {}
+        self.timestamp = None
+        self.kwargs = kwargs
+
+        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the collections
+        """
+
+        # Basic search index for tasks
+        self.defects.ensure_index("material_id")
+        self.defects.ensure_index("defect_id")
+
+        # Search index for materials
+        self.defect_thermos.ensure_index("material_id")
+
+    # TODO need to only process new tasks. Fast builder so currently is OK for small collections
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets items to process into DefectThermoDocs.
+
+        returns:
+            iterator yielding tuples containing:
+                - group of DefectDocs belonging to the same bulk material as indexed by material_id,
+                - materials in the chemsys of the bulk material for constructing phase diagram
+                - Dos of the bulk material for constructing phase diagrams/getting doping
+
+        """
+
+        self.logger.info("Defect thermo builder started")
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark build time for defect thermo documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all tasks
+        self.logger.info("Finding tasks to process")
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+
+        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks
+
+        all_docs = [doc for doc in self.defects.query(self.query)]
+
+        def filterfunc(x):
+            # material for defect x exists
+            if not list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)):
+                return False
+
+            # All chempots exist in material store
+            if not all(
+                bool(list(self.materials.query(criteria={'chemsys': str(el)}, properties=None)))
+                for el in
+                load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).defect_composition
+            ):
+                return False
+
+            return True
+
+        for key, group in groupby(
+                filter(
+                    filterfunc,
+                    sorted(all_docs, key=lambda x: x['material_id'])
+                ), key=lambda x: x['material_id']
+        ):
+            group = [g for g in group]
+            try:
+                yield (group, self.__get_materials(key), self.__get_electronic_structure(group))
+            except LookupError as exception:
+                raise exception
+
+    def process_item(self, docs):
+        """
+        Process a group of defects belonging to the same material into a defect thermo doc
+        """
+        self.logger.info(f"Processing defects")
+        defects, materials, elec_struc = docs
+        defects = [DefectDoc(**d) for d in defects]
+        materials = [MaterialsDoc(**m) for m in materials]
+        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)
+        return defect_thermo_doc.dict()
+
+    def update_targets(self, items):
+        """
+        Inserts the new DefectThermoDocs into the defect_thermos store
+        """
+        items = [item for item in items if item]
+        for item in items:
+            item.update({"_bt": self.timestamp})
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defect thermo docs")
+            self.defect_thermos.update(
+                docs=jsanitize(items, allow_bson=True),
+                key=self.defect_thermos.key,
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def __get_electronic_structure(self, group):
+        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))
+
+    def __get_materials(self, key) -> List:
+        """
+        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the
+        materials store.
+        """
+        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))
+        if not bulk:
+            raise LookupError(
+                f"The bulk material ({key}) for these defects cannot be found in the materials store"
+            )
+        elements = bulk[0]['chemsys']
+
+        if isinstance(elements, str):
+            elements = elements.split("-")
+
+        return list(chain(self.materials.query(criteria={"chemsys": {"$in": elements}}, properties=None), bulk))
+
+
+def unpack(query, d):
+    """
+    Unpack a mongo-style query into dictionary retrieval
+    """
+    if not query:
+        return d
+    if isinstance(d, List):
+        return unpack(query[1:], d.__getitem__(int(query.pop(0))))
+    return unpack(query[1:], d.__getitem__(query.pop(0)))
diff --git a/emmet-builders/emmet/builders/cp2k/task_validator.py b/emmet-builders/emmet/builders/cp2k/task_validator.py
new file mode 100644
index 0000000000..061ace03b5
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/task_validator.py
@@ -0,0 +1,74 @@
+from typing import Dict, Optional
+
+from maggma.builders import MapBuilder
+from maggma.core import Store
+
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.core.vasp.task import TaskDocument
+from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc
+
+
+class TaskValidator(MapBuilder):
+    def __init__(
+        self,
+        tasks: Store,
+        task_validation: Store,
+        settings: Optional[EmmetBuildSettings] = None,
+        query: Optional[Dict] = None,
+        **kwargs,
+    ):
+        """
+        Creates task_types from tasks and type definitions
+
+        Args:
+            tasks: Store of task documents
+            task_validation: Store of task_types for tasks
+        """
+        self.tasks = tasks
+        self.task_validation = task_validation
+        self.settings = EmmetBuildSettings.autoload(settings)
+        self.query = query
+        self.kwargs = kwargs
+
+        super().__init__(
+            source=tasks,
+            target=task_validation,
+            projection=[
+                "orig_inputs",
+                "input.hubbards",
+                "output.structure",
+                "output.bandgap",
+                "calcs_reversed.output.ionic_steps.electronic_steps.e_fr_energy",
+                "tags",
+                # Need these two for proper run_type determination
+                "calcs_reversed.input.parameters",
+                "calcs_reversed.input.incar",
+            ],
+            query=query,
+            **kwargs,
+        )
+
+    def unary_function(self, item):
+        """
+        Find the task_type for the item
+
+        Args:
+            item (dict): a (projection of a) task doc
+        """
+        task_doc = TaskDocument(**item)
+        validation_doc = ValidationDoc.from_task_doc(
+            task_doc=task_doc,
+            kpts_tolerance=self.settings.VASP_KPTS_TOLERANCE,
+            kspacing_tolerance=self.settings.VASP_KSPACING_TOLERANCE,
+            input_sets=self.settings.VASP_DEFAULT_INPUT_SETS,
+            LDAU_fields=self.settings.VASP_CHECKED_LDAU_FIELDS,
+            max_allowed_scf_gradient=self.settings.VASP_MAX_SCF_GRADIENT,
+        )
+
+        bad_tags = list(set(task_doc.tags).intersection(self.settings.DEPRECATED_TAGS))
+        if len(bad_tags) > 0:
+            validation_doc.warnings.append(f"Manual Deprecation by tags: {bad_tags}")
+            validation_doc.valid = False
+            validation_doc.reasons.append(DeprecationMessage.MANUAL)
+
+        return validation_doc
diff --git a/emmet-builders/emmet/builders/cp2k/utils.py b/emmet-builders/emmet/builders/cp2k/utils.py
index 47fbc65fe5..e3bd98c8a3 100644
--- a/emmet-builders/emmet/builders/cp2k/utils.py
+++ b/emmet-builders/emmet/builders/cp2k/utils.py
@@ -1,7 +1,9 @@
+from typing import List
+from copy import deepcopy
 import numpy as np
 from pymatgen.ext.matproj import MPRester
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
-from typing import List
+from pymatgen.analysis.structure_matcher import StructureMatcher
 
 
 def unpack(query, d):
@@ -46,7 +48,13 @@ def matcher(bulk_struc, defect_struc, final_bulk_struc=None, final_defect_struc=
                     matching_indices.append((i, j))
                     break
 
-        oxi_diff = [abs(final_defect_struc[d].specie.oxi_state - final_bulk_struc[b].specie.oxi_state) for b, d in matching_indices]
+        oxi_diff = [
+            abs(
+                final_defect_struc.site_properties['oxidation_state'][d] -
+                final_bulk_struc.site_properties['oxidation_state'][b]
+            )
+            for b, d in matching_indices
+        ]
         def_index = np.argmax(oxi_diff)
         matching_indices.pop(def_index)
         return def_index, matching_indices
@@ -70,10 +78,6 @@ def get_dielectric(mpid):
     except:
         return None
 
-
-from copy import deepcopy
-
-
 def get_mpid(s):
     struc = deepcopy(s)
     struc.remove_oxidation_states()
@@ -81,17 +85,29 @@ def get_mpid(s):
     for p in struc.site_properties:
         struc.remove_site_property(p)
     sga = SpacegroupAnalyzer(struc)
+    sm = StructureMatcher()
     with MPRester() as mp:
         dat = mp.query(
             criteria={
                 'chemsys': struc.composition.chemical_system,
                 'spacegroup.symbol': sga.get_space_group_symbol()
             },
-            properties=['material_id', 'formation_energy_per_atom']
+            properties=['material_id', 'structure']
         )
-
-    dat.sort(key=lambda x: x['formation_energy_per_atom'])
-    try:
+    dat = list(filter(lambda x: sm.fit(s, x['structure']), dat))
+    if len(dat) == 1:
         return dat[0]['material_id']
-    except:
-        return None
+    return None
+
+
+def synchronous_query(store1, store2, query, properties=None):
+    for t in store1.query(query, properties):
+        for key in t.keys():
+            if isinstance(t[key], dict) and t[key].get('gfs_id'):
+                t.update(
+                    {
+                        k: v for k,v in next(store2.query({"_id": t[key].get('gfs_id')}, properties)).items()
+                        if properties is None or k in properties
+                    }
+                    )
+        yield t
diff --git a/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
index 9d22ee02a8..8d52d481fb 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
+++ b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
@@ -18,8 +18,8 @@ HYBRID:
   HSE06:
     FRACTION: 0.25
     INTERACTION_POTENTIAL: SHORTRANGE
-  PB0:
+  PBE0:
     FRACTION: 0.25
     INTERACTION_POTENTIAL: TRUNCATED
   RSH:
-    INTERACTION_POTENTIAL: TRUNCATED_MIX_CL
+    INTERACTION_POTENTIAL: TRUNCATED_MIX_CL
\ No newline at end of file
diff --git a/emmet-core/emmet/core/cp2k/calc_types/utils.py b/emmet-core/emmet/core/cp2k/calc_types/utils.py
index b245e93c12..0e13af3e8a 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/utils.py
+++ b/emmet-core/emmet/core/cp2k/calc_types/utils.py
@@ -54,6 +54,8 @@ def _variant_equal(v1, v2) -> bool:
             ):
                 return RunType(f"{functional_class}{is_hubbard}")
 
+    if parameters.get('FRACTION'):
+        return RunType(f"HYBRID{is_hubbard}")
 
 def task_type(
     inputs: Dict
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index e962291bff..ed059efe64 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -2,13 +2,14 @@
 from datetime import datetime
 from typing import ClassVar, Dict, Tuple, Mapping, List
 from monty.json import MontyDecoder
+from monty.tempfile import ScratchDir
 from itertools import groupby
 from pydantic import Field, validator, BaseModel
 
 from pymatgen.core import Structure, Composition, Element
 from pymatgen.analysis.defects.core import DefectEntry, Defect
 from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
-from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, DefectPredominanceDiagram
+from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, BrouwerDiagram
 from pymatgen.electronic_structure.dos import CompleteDos
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 from pymatgen.entries.computed_entries import CompositionEnergyAdjustment
@@ -23,6 +24,8 @@
 from emmet.core.cp2k.calc_types.utils import run_type
 from emmet.builders.cp2k.utils import get_mpid, matcher
 
+from pymatgen.analysis.defects.corrections import FreysoldtCorrection2d
+from pymatgen.io.vasp.outputs import VolumetricData
 
 # TODO Update DefectDoc on defect entry level so you don't re-do uncessary corrections
 class DefectDoc(StructureMetadata):
@@ -170,19 +173,9 @@ def _sort(x):
         final_tasks = {}
         for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
             sorted_tasks = sorted(tasks_for_runtype, key=_sort)
-
-            convergence = [
-                (
-                    t[0]['nsites'],
-                    t[0]['output']['energy'], t[1]['output']['energy'],
-                    t[0]['defect']['@class'],
-                )
-                for t in sorted_tasks
-            ]
-            #print(convergence)
-            #for t in sorted_tasks:
-            #    print('\t', TaskDocument(**t[0]).task_id)
-            #    print('\t', TaskDocument(**t[1]).task_id)
+            ents = [cls.get_defect_entry_from_tasks(t[0], t[1], t[2], query) for t in sorted_tasks]
+            for e in ents:
+                print(e.energy)
 
             best_defect_task, best_bulk_task, dielectric = sorted_tasks[0]
             best_entry = cls.get_defect_entry_from_tasks(best_defect_task, best_bulk_task, dielectric, query)
@@ -277,7 +270,7 @@ def get_init(x):
         final_defect_structure = Structure.from_dict(defect_task['output']['structure'])
         final_bulk_structure = Structure.from_dict(bulk_task['output']['structure'])
 
-        mpid = get_mpid(init_bulk_structure)
+        #mpid = get_mpid(init_bulk_structure)
 
         parameters = {
             'defect_energy': defect_task['output']['energy'],
@@ -286,15 +279,17 @@ def get_init(x):
             'final_defect_structure': final_defect_structure,
             'vbm': bulk_task['output']['vbm'],
             'cbm': bulk_task['output']['cbm'],
-            'material_id': mpid,
+            #'material_id': mpid,
             'entry_id': defect_task.get('task_id')
         }
 
         # TODO Should probably get these even if not needed for corrections
         if init_defect_structure.charge != 0:
-            axis_grid = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_grid']]
-            bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree']]
-            defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree']]
+            axis_grid = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_grid']] if 'v_hartree_grid' in bulk_task['output'] else None
+            bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_planar']] if 'v_hartree_planar' in bulk_task['output'] else None
+            defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_planar']] if 'v_hartree_planar' in defect_task['output'] else None
+            bulk_site_averages = [float(x) for x in bulk_task['output']['v_hartree_sites']] if 'v_hartree_sites' in bulk_task['output'] else None
+            defect_site_averages = [float(x) for x in defect_task['output']['v_hartree_sites']] if 'v_hartree_sites' in defect_task['output'] else None
 
             dfi, site_matching_indices = matcher(
                 init_bulk_structure, init_defect_structure,
@@ -307,16 +302,61 @@ def get_init(x):
             parameters['defect_planar_averages'] = defect_planar_averages
             parameters['defect_frac_sc_coords'] = defect_frac_sc_coords
             parameters['site_matching_indices'] = site_matching_indices
-
-        # cannot be easily queried for, so check here.
-        if 'v_hartree' in final_bulk_structure.site_properties:
-            parameters['bulk_atomic_site_averages'] = final_bulk_structure.site_properties['v_hartree']
-        if 'v_hartree' in final_defect_structure.site_properties:
-            parameters['defect_atomic_site_averages'] = final_defect_structure.site_properties['v_hartree']
+            parameters['bulk_site_averages'] = bulk_site_averages
+            parameters['defect_site_averages'] = defect_site_averages
 
         return parameters
 
 
+# TODO Some of this should be done by DefectCompatibility,
+# but it's not clear how to do that since 2d materials 
+# are not tagged in any particular way to allow defect compatibility
+# to decide which correction to apply
+class DefectDoc2d(DefectDoc):
+    """
+    DefectDoc subclass for 2D defects
+    """
+
+    @classmethod
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='defect'):
+        parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
+        if dielectric:
+            parameters['dielectric'] = dielectric
+
+        defect_entry = DefectEntry(
+            cls.get_defect_from_task(query=query, task=defect_task),
+            uncorrected_energy=parameters['defect_energy'] - parameters['bulk_energy'],
+            parameters=parameters,
+            entry_id=parameters['entry_id']
+        )
+
+        DefectCompatibility().process_entry(defect_entry, perform_corrections=False)
+
+        if False: 
+            with ScratchDir('.'):
+                fc = FreysoldtCorrection2d(
+                    defect_entry.parameters.get('dielectric', 22), 
+                    "LOCPOT.ref", "LOCPOT.def", encut=520, buffer=2
+                ) 
+                lref = VolumetricData(
+                    structure=Structure.from_dict(bulk_task['output']['structure']), 
+                    data={'total': MontyDecoder().process_decoded(bulk_task['v_hartree'])}
+                )
+                ldef = VolumetricData(
+                    structure=Structure.from_dict(defect_task['output']['structure']), 
+                    data={'total': MontyDecoder().process_decoded(defect_task['v_hartree'])}
+                )
+                lref.write_file("LOCPOT.ref")
+                ldef.write_file("LOCPOT.def")
+                ecorr = fc.get_correction(defect_entry)
+                defect_entry.corrections.update(ecorr)
+                defect_entry.parameters['freysoldt2d_meta'] = fc.metadata
+
+        defect_entry_as_dict = defect_entry.as_dict()
+        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
+
+        return defect_entry
+
 class DefectThermoDoc(BaseModel):
 
     class Config:
@@ -330,19 +370,19 @@ class Config:
         None, description="All task ids used in creating these phase diagrams"
     )
 
-    defect_predominance_diagrams: Mapping[RunType, DefectPredominanceDiagram] = Field(
-        None, description="Defect predominance diagrams"
+    brouwer_diagrams: Mapping[RunType, BrouwerDiagram] = Field(
+        None, description="Brouwer diagrams"
     )
 
     # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
-    @validator("defect_predominance_diagrams", pre=True)
-    def decode(cls, defect_predominance_diagrams):
-        for e in defect_predominance_diagrams:
-            if isinstance(defect_predominance_diagrams[e], dict):
-                defect_predominance_diagrams[e] = MontyDecoder().process_decoded(
-                    {k: v for k, v in defect_predominance_diagrams[e].items()}
+    @validator("brouwer_diagrams", pre=True)
+    def decode(cls, brouwer_diagrams):
+        for e in brouwer_diagrams:
+            if isinstance(brouwer_diagrams[e], dict):
+                brouwer_diagrams[e] = MontyDecoder().process_decoded(
+                    {k: v for k, v in brouwer_diagrams[e].items()}
                 )
-        return defect_predominance_diagrams
+        return brouwer_diagrams
 
     @classmethod
     def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], electronic_structure: CompleteDos) -> "DefectThermoDoc":
@@ -362,7 +402,7 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
         defect_phase_diagram = {}
         vbms = {}
         band_gaps = {}
-        defect_predominance_diagrams = {}
+        brouwer_diagrams = {}
         task_ids = {}
 
         dos = CompleteDos.from_dict(electronic_structure)
@@ -370,8 +410,13 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
 
         for m in materials:
             for rt, ent in m.entries.items():
+                __found_chempots__ = True
+
                 # Chempot shift
                 for el, amt in ent.composition.element_composition.items():
+                    if Element(el) not in chempots:
+                        __found_chempots__ = False
+                        break
                     _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
                     adj = CompositionEnergyAdjustment(-chempots[Element(el)][_rt],
                                                       n_atoms=amt,
@@ -379,6 +424,9 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
                                                       )
                     ent.energy_adjustments.append(adj)
 
+                if not __found_chempots__:
+                    continue
+
                 # Other stuff
                 band_gaps[rt] = bg
                 ent.parameters['software'] = 'cp2k'
@@ -397,8 +445,7 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
                         __found_chempots__ = False
                         break
                     _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
-                    ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt * chempots[Element(el)][
-                        _rt]
+                    ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt * chempots[Element(el)][_rt]
 
                 if not __found_chempots__:
                     continue
@@ -420,7 +467,7 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
                 band_gap=band_gaps[run_type],
                 filter_compatible=False
             )
-            defect_predominance_diagrams[run_type] = DefectPredominanceDiagram(
+            brouwer_diagrams[run_type] = BrouwerDiagram(
                 defect_phase_diagram=defect_phase_diagram[run_type],
                 bulk_dos=dos,
                 entries=[
@@ -434,7 +481,7 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
         data = {
             'material_id': mpid,
             'task_ids': task_ids,
-            'defect_predominance_diagrams': defect_predominance_diagrams,
+            'brouwer_diagrams': brouwer_diagrams,
         }
 
         return cls(**{k: v for k, v in data.items()})

From c0c434e84b1a0df9e76abcc33a5e9c1108ddf54f Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 7 Mar 2022 20:40:56 -0800
Subject: [PATCH 33/41] Todos

---
 emmet-builders/emmet/builders/cp2k/defects.py |   5 +-
 emmet-builders/emmet/builders/vasp/defects.py | 936 ++++++++++++++++++
 .../emmet/core/cp2k/calc_types/utils.py       |   2 +
 3 files changed, 941 insertions(+), 2 deletions(-)
 create mode 100644 emmet-builders/emmet/builders/vasp/defects.py

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 7344248519..978fa777a6 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -22,8 +22,6 @@
 from emmet.builders.settings import EmmetBuildSettings
 from emmet.builders.cp2k.utils import get_mpid, synchronous_query
 
-from line_profiler import *
-
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 __maintainer__ = "Jason Munro"
 
@@ -542,6 +540,9 @@ def _compare(b, d):
                     return True
             return False
 
+        # TODO This loop will terminate the match when the first bulk match for a defect is found. This should 
+        # be fine if we can ensure that the commenserate bulks are all the same given they both compare True.
+        # Need to double check that they really are the same.
         pairs = []
         for defect in defects:
             for bulk in bulks:
diff --git a/emmet-builders/emmet/builders/vasp/defects.py b/emmet-builders/emmet/builders/vasp/defects.py
new file mode 100644
index 0000000000..09424127e2
--- /dev/null
+++ b/emmet-builders/emmet/builders/vasp/defects.py
@@ -0,0 +1,936 @@
+"""
+This module contains 2 builders for defect properties in non-metals.
+
+1.  The DefectBuilder builds individual Defect documents
+    with bulk and dielectric properties included, along
+    with properties neccessary for accessing delocalization
+    and parsing further defect thermodynamics.
+2.  The DefectThermoBuilder builds DefectPhaseDiagram
+    documents from defect objects with identical bulk structures
+    and calculation metadata.
+"""
+
+from datetime import datetime
+import numpy as np
+
+from monty.json import MontyDecoder, jsanitize
+
+from pymatgen import Structure, MPRester, PeriodicSite
+from pymatgen.analysis.structure_matcher import StructureMatcher, PointDefectComparator
+from pymatgen.electronic_structure.bandstructure import BandStructure
+from pymatgen.analysis.defects.core import Vacancy, Substitution, Interstitial, DefectEntry
+from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram
+from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
+
+from maggma.builder import Builder
+
+
+__author__ = "Danny Broberg, Shyam Dwaraknath"
+
+
+class DefectBuilder(Builder):
+    def __init__(self,
+                 tasks,
+                 defects,
+                 query=None,
+                 compatibility=DefectCompatibility(),
+                 ltol=0.2,
+                 stol=0.3,
+                 angle_tol=5,
+                 max_items_size=0,
+                 update_all=False,
+                 **kwargs):
+        """
+        Creates DefectEntry from vasp task docs
+
+        Args:
+            tasks (Store): Store of tasks documents
+            defects (Store): Store of defect entries with all metadata required for followup decisions on defect thermo
+            query (dict): dictionary to limit materials to be analyzed
+            compatibility (DefectCompatability): Compatability module to ensure defect calculations are compatible
+            ltol (float): StructureMatcher tuning parameter for matching tasks to materials
+            stol (float): StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol (float): StructureMatcher tuning parameter for matching tasks to materials
+            max_items_size (int): limits number of items approached from tasks (zero places no limit on number of items)
+            update_all (bool): Whether to consider all task ids from defects store.
+                Default is False (will not re-consider task-ids which have been considered previously.
+        """
+
+        self.tasks = tasks
+        self.defects = defects
+        self.query = query if query else {}
+        self.compatibility = compatibility
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.max_items_size = max_items_size
+        self.update_all=update_all
+        super().__init__(sources=[tasks], targets=[defects], **kwargs)
+
+    def get_items(self):
+        """
+        Gets sets of entries from chemical systems that need to be processed
+
+        Returns:
+            generator of relevant entries from one chemical system
+
+        """
+        self.logger.info("Defect Builder Started")
+
+        self.logger.info("Setting indexes")
+        self.ensure_indicies()
+
+        # Save timestamp for update operation
+        self.time_stamp = datetime.utcnow()
+
+        # Get all successful defect tasks that have been updated since
+        # defect_store was last updated
+        q = dict(self.query)
+        q["state"] = "successful"
+        if not self.update_all:
+            if 'task_id' in q:
+                if isinstance(q['task_id'], int) or isinstance(q['task_id'], float):
+                    q['task_id'] = {'$nin': [], '$in': [q['task_id']]}
+                if '$nin' in q['task_id']:
+                    q['task_id']['$nin'].extend( self.defects.distinct('entry_id'))
+                else:
+                    q['task_id'].update( {'$nin': self.defects.distinct('entry_id')})
+            else:
+                q.update({'task_id': {'$nin': self.defects.distinct('entry_id')}})
+
+        q.update({'transformations.history.@module':
+                      {'$in': ['pymatgen.transformations.defect_transformations']}})
+        defect_tasks = list(self.tasks.query(criteria=q,
+                                             properties=['task_id', 'transformations.history.0.defect.structure']))
+        if self.max_items_size and len(defect_tasks) > self.max_items_size:
+            defect_tasks = [dtask for dind, dtask in enumerate(defect_tasks) if dind < self.max_items_size]
+        task_ids = [dtask['task_id'] for dtask in defect_tasks]
+        self.logger.info("Found {} new defect tasks to consider:\n{}".format( len(defect_tasks), task_ids))
+
+        # get a few other tasks which are needed for defect entries (regardless of when they were last updated):
+        # bulk_supercell, dielectric calc, BS calc, HSE-BS calc
+        bulksc = {"state": "successful", 'transformations.history.0.@module':
+            {'$in': ['pymatgen.transformations.standard_transformations']}}
+        dielq = {"state": "successful", "input.incar.LEPSILON": True, "input.incar.LPEAD": True}
+        HSE_BSq = {"state": "successful", 'calcs_reversed.0.input.incar.LHFCALC': True,
+                   'transformations.history.0.@module':
+                        {'$nin': ['pymatgen.transformations.defect_transformations',
+                                  'pymatgen.transformations.standard_transformations']}}
+        # TODO: add smarter capability for getting HSE bandstructure from tasks
+        # TODO: add capability for getting GGA bandstructure from tasks?
+
+        # now load up all defect tasks with relevant information required for process_item step
+        # includes querying bulk, diel and hybrid calcs as you go along.
+        log_additional_tasks = dict() #to minimize number of bulk + diel + hse queries, log by chemsys
+
+        needed_defect_properties = ['task_id', 'transformations', 'input',
+                                    'task_label', 'last_updated',
+                                    'output', 'calcs_reversed', 'chemsys']
+        needed_bulk_properties = ['task_id', 'chemsys', 'task_label',
+                                  'last_updated', 'transformations',
+                                  'input', 'output', 'calcs_reversed']
+        for d_task_id in task_ids:
+            d_task = list(self.tasks.query(criteria={"task_id": d_task_id},
+                                           properties=needed_defect_properties))[0]
+            chemsys = "-".join(sorted((Structure.from_dict(
+                d_task['transformations']['history'][0]['defect']['structure']).symbol_set)))
+
+            if chemsys not in log_additional_tasks.keys():
+                #grab all bulk calcs for chemsys
+                q = bulksc.copy()
+                q.update( {"chemsys": chemsys})
+                bulk_tasks = list(self.tasks.query(criteria=q, properties=needed_bulk_properties))
+
+                #grab all diel calcs for chemsys
+                q = dielq.copy()
+                q.update( {"chemsys": chemsys})
+                diel_tasks = list(self.tasks.query(criteria=q,
+                                                   properties=['task_id', 'task_label', 'last_updated',
+                                                               'input', 'output']))
+
+                #grab all hse bs calcs for chemsys
+                q = HSE_BSq.copy()
+                q.update( {"chemsys": chemsys})
+                hybrid_tasks = list(self.tasks.query(criteria=q,
+                                                     properties=['task_id', 'input',
+                                                                 'output', 'task_label']))
+
+                self.logger.debug("\t{} has {} bulk loaded {} diel and {} hse"
+                                  "".format( chemsys, len(bulk_tasks),
+                                             len(diel_tasks), len(hybrid_tasks)))
+
+                log_additional_tasks.update({chemsys: {'bulksc': bulk_tasks[:],
+                                                       'diel': diel_tasks[:],
+                                                       'hsebs': hybrid_tasks[:]}})
+
+            yield [d_task, log_additional_tasks[chemsys]]
+
+    def process_item(self, item):
+        """
+        Process a defect item (containing defect, bulk and dielectric information as processed in get_items)
+
+        Args:
+            item (defect_task): a defect_task to process into a DefectEntry object
+
+        Returns:
+            dict: a DefectEntry dictionary to update defect database with
+        """
+        d_task, chemsys_additional_tasks = item
+
+        defect_task = self.find_and_load_bulk_tasks(d_task, chemsys_additional_tasks)
+        if defect_task is None:
+            self.logger.error("Could not determine defect bulk properties for {}".format( d_task['task_id']))
+            return
+        elif 'bulk_task' not in defect_task.keys():
+            self.logger.error("bulk_task is not in item! Cannot parse task id = {}.".format( d_task['task_id']))
+            return
+        elif 'diel_task_meta' not in defect_task.keys():
+            self.logger.error("diel_task_meta is not in item! Cannot parse task id = {}.".format( d_task['task_id']))
+            return
+
+        #initialize parameters with dielectric data
+        eps_ionic = defect_task['diel_task_meta']['epsilon_ionic']
+        eps_static = defect_task['diel_task_meta']['epsilon_static']
+        eps_total = []
+        for i in range(3):
+            eps_total.append([e[0]+e[1] for e in zip(eps_ionic[i], eps_static[i])])
+        parameters = {'epsilon_ionic': eps_ionic, 'epsilon_static':  eps_static,
+                      'dielectric': eps_total,
+                      'task_level_metadata':
+                          {'diel_taskdb_task_id': defect_task['diel_task_meta']['diel_taskdb_task_id']}}
+
+        parameters = self.get_run_metadata( defect_task, parameters)
+
+        # add bulk data to parameters
+        bulk_task = defect_task['bulk_task']
+        bulk_energy = bulk_task['output']['energy']
+        bulk_sc_structure = Structure.from_dict( bulk_task['input']['structure'])
+        parameters.update( {'bulk_energy': bulk_energy, 'bulk_sc_structure': bulk_sc_structure})
+
+        parameters = self.get_bulk_gap_data( bulk_task, parameters)
+        parameters = self.get_bulk_chg_correction_metadata( bulk_task, parameters)
+
+
+        # Add defect data to parameters
+        defect_energy = defect_task['output']['energy']
+        parameters.update({'defect_energy': defect_energy})
+
+        defect, parameters = self.load_defect_and_structure_data( defect_task, parameters)
+
+        parameters = self.load_defect_chg_correction_metadata( defect, defect_task, parameters)
+
+        if 'vr_eigenvalue_dict' in defect_task['calcs_reversed'][0]['output'].keys():
+            eigenvalues = defect_task['calcs_reversed'][0]['output']['vr_eigenvalue_dict']['eigenvalues']
+            kpoint_weights = defect_task['calcs_reversed'][0]['output']['vr_eigenvalue_dict']['kpoint_weights']
+            parameters.update( {'eigenvalues': eigenvalues,
+                                'kpoint_weights': kpoint_weights} )
+        else:
+            self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have eigenvalue data for parsing '
+                  'bandfilling.'.format(defect_task['task_label'], defect_task['task_id']))
+
+        if 'defect' in defect_task['calcs_reversed'][0]['output'].keys():
+            parameters.update( {'defect_ks_delocal_data':
+                                    defect_task['calcs_reversed'][0]['output']['defect'].copy()})
+        else:
+            self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have defect data for parsing '
+                  'delocalization.'.format(defect_task['task_label'], defect_task['task_id']))
+
+        defect_entry = DefectEntry( defect, parameters['defect_energy'] - parameters['bulk_energy'],
+                                    corrections = {}, parameters = parameters, entry_id= defect_task['task_id'])
+
+        defect_entry = self.compatibility.process_entry( defect_entry)
+        defect_entry.parameters = jsanitize( defect_entry.parameters, strict=True, allow_bson=True)
+        defect_entry_as_dict = defect_entry.as_dict()
+        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id'] #this seemed neccessary for legacy db
+
+        return defect_entry_as_dict
+
+    def update_targets(self, items):
+        """
+        Inserts the defect docs into the defect collection
+
+        Args:
+            items ([dict]): a list of defect entries as dictionaries
+        """
+        items = [item for item in items if item]
+        self.logger.info("Updating {} defect documents".format(len(items)))
+
+        self.defects.update(items, update_lu=True, key='entry_id')
+
+    def ensure_indicies(self):
+        """
+        Ensures indicies on the tasks and defects collections
+        :return:
+        """
+        # Search indicies for tasks
+        self.tasks.ensure_index(self.tasks.key, unique=True)
+        self.tasks.ensure_index("chemsys")
+
+        # Search indicies for defects
+        self.defects.ensure_index(self.defects.key, unique=True)
+        self.defects.ensure_index("chemsys")
+
+    def find_and_load_bulk_tasks(self, defect_task, additional_tasks):
+        """
+        This takes defect_task and finds bulk task, diel task, and other things as appropriate (example hse_data...)
+
+        needs to make sure INCAR settings and structure matching works for all
+
+        :param defect_task:
+        :param additional_tasks:
+        :return:
+        """
+        out_defect_task = defect_task.copy()
+
+        # get suitable BULK calc by matching structures (right lattice
+        # constant + same supercell size...) and matching essential INCAR + POTCAR settings...
+        bulk_tasks = additional_tasks['bulksc']
+        bulk_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+            primitive_cell=False, scale=False, attempt_supercell=False, allow_subset=False)
+        bulk_matched = []
+        dstruct_withoutdefect = Structure.from_dict(out_defect_task['transformations']['history'][0]['defect']['structure'])
+        scaling_matrix = out_defect_task['transformations']['history'][0]['scaling_matrix']
+        dstruct_withoutdefect.make_supercell( scaling_matrix)
+        dincar = out_defect_task["calcs_reversed"][0]["input"]["incar"] #identify essential INCAR properties which differentiate different calcs
+        dincar_reduced = {k: dincar.get(k, None) if dincar.get(k) not in  ['None', 'False', False] else None
+                          for k in ["LHFCALC", "HFSCREEN", "IVDW", "LUSE_VDW", "LDAU", "METAGGA"]
+                          }
+        d_potcar_base = {'pot_spec': [potelt["titel"] for potelt in out_defect_task['input']['potcar_spec']],
+                         'pot_labels': out_defect_task['input']['pseudo_potential']['labels'][:],
+                         'pot_type': out_defect_task['input']['pseudo_potential']['pot_type'],
+                         'functional': out_defect_task['input']['pseudo_potential']['functional']}
+
+        for b_task in bulk_tasks:
+            bstruct = Structure.from_dict(b_task['input']['structure'])
+            if bulk_sm.fit( bstruct, dstruct_withoutdefect):
+                #also match essential INCAR and POTCAR settings
+                bincar = b_task["calcs_reversed"][0]["input"]["incar"]
+                bincar_reduced = {k: bincar.get(k, None) if bincar.get(k) not in ['None', 'False', False] else None
+                                  for k in dincar_reduced.keys()
+                                  }
+
+                b_potcar = {'pot_spec': set([potelt["titel"] for potelt in b_task['input']['potcar_spec']]),
+                            'pot_labels': set(b_task['input']['pseudo_potential']['labels'][:]),
+                            'pot_type': b_task['input']['pseudo_potential']['pot_type'],
+                            'functional': b_task['input']['pseudo_potential']['functional']}
+                d_potcar = d_potcar_base.copy() #need to reduce in cases of extrinsic species or reordering
+                d_potcar['pot_spec'] = set([d for d in d_potcar['pot_spec'] if d in b_potcar['pot_spec']])
+                d_potcar['pot_labels'] = set([d for d in d_potcar['pot_labels'] if d in b_potcar['pot_labels']])
+
+                #track to make sure that cartesian coords are same (important for several levels of analysis in defect builder)
+                same_cart_positions = True
+                for bsite_coords in bstruct.cart_coords:
+                    if not len( dstruct_withoutdefect.get_sites_in_sphere(bsite_coords, .1)):
+                        same_cart_positions = False
+
+                if bincar_reduced == dincar_reduced and b_potcar == d_potcar and same_cart_positions:
+                    bulk_matched.append( b_task.copy())
+                else:
+                    try:
+                        self.logger.debug("Bulk structure match was found for {} with {}, "
+                                          "but:".format( b_task['task_label'], out_defect_task['task_label']))
+                    except:
+                        self.logger.debug("BULK STRUCT MATCH was found, but task_label did not exist AND:")
+                    if bincar_reduced != dincar_reduced:
+                        out_inc = {k:[v, bincar_reduced[k]] for k,v in dincar_reduced.items() if v != bincar_reduced[k]}
+                        self.logger.debug("\tIncars were different: {} ".format( out_inc))
+                    if b_potcar != d_potcar:
+                        out_pot = {k:[v, b_potcar[k]] for k,v in d_potcar.items() if v != b_potcar[k]}
+                        self.logger.debug("\tPotcar specs were different: {} ".format( out_pot))
+                    if not same_cart_positions:
+                        self.logger.debug("\tBulk site coords were different")
+
+        #if bulk_task found then take most recently updated bulk_task for defect
+        if len( bulk_matched):
+            bulk_matched = sorted( bulk_matched, key=lambda x: x["last_updated"], reverse=True)
+            out_defect_task["bulk_task"] = bulk_matched[0].copy()
+            self.logger.debug("Found {} possible bulk supercell structures. Taking most recent entry updated "
+                  "on: {}".format(len(bulk_matched), bulk_matched[0]['last_updated']))
+        else:
+            self.logger.error("Bulk task doesnt exist for: {} ({})! Cant create defect "
+                             "object...\nMetadata: {}\n{}".format( out_defect_task['task_label'],
+                                                                   out_defect_task['task_id'],
+                                                                   d_potcar, dincar_reduced))
+            return None
+
+
+        # get suitable dielectric calc by matching structures (only lattice
+        # constant fitting needed - not supercell) and POTCAR settings...
+        diel_task_list = additional_tasks['diel']
+        diel_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+            primitive_cell=True, scale=False, attempt_supercell=True, allow_subset=False)
+        diel_matched = []
+        for diel_task in diel_task_list:
+            diel_struct = Structure.from_dict( diel_task['input']['structure'])
+            if diel_sm.fit( diel_struct, dstruct_withoutdefect):
+                #also match essential POTCAR settings and confirm LVTOT = True and LVHAR = True
+
+                diel_potcar = {'pot_spec': set([potelt["titel"] for potelt in diel_task['input']['potcar_spec']]),
+                            'pot_labels': set(diel_task['input']['pseudo_potential']['labels'][:]),
+                            'pot_type': diel_task['input']['pseudo_potential']['pot_type'],
+                            'functional': diel_task['input']['pseudo_potential']['functional']}
+                d_potcar = d_potcar_base.copy() #need to reduce in cases of extrinsic species or reordering
+                d_potcar['pot_spec'] = set([d for d in d_potcar['pot_spec'] if d in diel_potcar['pot_spec']])
+                d_potcar['pot_labels'] = set([d for d in d_potcar['pot_labels'] if d in diel_potcar['pot_labels']])
+
+                if diel_potcar == d_potcar:
+                    diel_matched.append( diel_task.copy())
+                else:
+                    try:
+                        self.logger.debug("Dielectric structure match was found for {} with {}, "
+                                          "but:".format( diel_task['task_label'], out_defect_task['task_label']))
+                    except:
+                        self.logger.debug("Dielectric STRUCT MATCH was found, but task_label did not exist AND:")
+                    out_pot = {k:[v, diel_potcar[k]] for k,v in d_potcar.items() if v != diel_potcar[k]}
+                    self.logger.debug("\tPotcar specs were different: {} ".format( out_pot))
+            # else:
+            #     self.logger.debug("{} ({}) had a structure which did not match {} for use "
+            #                       "as a dielectric calculation".format( diel_task['task_label'],
+            #                                                          diel_task['task_id'],
+            #                                                          out_defect_task['task_label']))
+
+        #if diel_tasks found then take most recently updated bulk_task for defect
+        if len( diel_matched):
+            diel_matched = sorted( diel_matched, key=lambda x: x["last_updated"], reverse=True)
+            diel_dict = {'diel_taskdb_task_id': diel_matched[0]['task_id'],
+                         'epsilon_static': diel_matched[0]['output']['epsilon_static'],
+                         'epsilon_ionic': diel_matched[0]['output']['epsilon_ionic']}
+            out_defect_task["diel_task_meta"] = diel_dict.copy()
+            self.logger.debug("Found {} possible dieletric calcs. Taking most recent entry updated "
+                  "on: {}".format(len(diel_matched), diel_matched[0]['last_updated']))
+        else:
+            try:
+                self.logger.error("Dielectric task doesnt exist for: {} ({})! Cant create defect "
+                                 "object...\nMetadata for defect: {}\n{}".format( out_defect_task['task_label'],
+                                                                                  out_defect_task['task_id'],
+                                                                                  d_potcar, dincar_reduced))
+            except:
+                self.logger.debug("DIEL TASK DNE.")
+
+            return None
+
+        # FINALLY consider grabbing extra hybrid BS information...
+        # first confirm from the INCAR setting that this defect is NOT an HSE calculation itself...
+        if dincar_reduced['LHFCALC'] in [None, False, 'False'] and len(additional_tasks['hsebs']):
+            hse_bs_matched = []
+            for hse_bs_task in additional_tasks['hsebs']:
+                hse_bs_struct = Structure.from_dict(hse_bs_task['input']['structure'])
+                if diel_sm.fit(hse_bs_struct, dstruct_withoutdefect): #can use same matching scheme as the dielectric structure matcher
+                    hse_bs_matched.append( hse_bs_task.copy())
+                else:
+                    self.logger.debug("task id {} has a structure which did not match {} for use "
+                                      "as an HSE BS calculation".format( hse_bs_task['task_id'],
+                                                                         out_defect_task['task_label']))
+
+            if len(hse_bs_matched): #match the lowest CBM  and highest VBM values, keeping track of their task_ids
+                hybrid_cbm_data = min([[htask['output']['cbm'], htask['task_id']] for htask in hse_bs_matched])
+                hybrid_vbm_data = max([[htask['output']['vbm'], htask['task_id']] for htask in hse_bs_matched])
+                hybrid_meta = {'hybrid_cbm': hybrid_cbm_data[0], 'hybrid_CBM_task_id': hybrid_cbm_data[1],
+                               'hybrid_vbm': hybrid_vbm_data[0], 'hybrid_VBM_task_id': hybrid_vbm_data[1]}
+                out_defect_task["hybrid_bs_meta"] = hybrid_meta.copy()
+                self.logger.debug("Found hybrid band structure properties for {}:\n\t{}".format( out_defect_task['task_label'],
+                                                                                               hybrid_meta))
+            else:
+                self.logger.debug("Could NOT find hybrid band structure properties for {} despite "
+                                  "there being {} eligible hse calculations".format( out_defect_task['task_label'],
+                                                                                     len(additional_tasks['hsebs'])))
+
+        return out_defect_task
+
+    def get_run_metadata(self, item, parameters):
+        bulk_task = item['bulk_task']
+
+        # load INCAR, KPOINTS, POTCAR and task_id (for both bulk and defect)
+        potcar_summary = {'pot_spec': list([potelt["titel"] for potelt in item['input']['potcar_spec']]),
+                          'pot_labels': list(item['input']['pseudo_potential']['labels'][:]),
+                          'pot_type': item['input']['pseudo_potential']['pot_type'],
+                          'functional': item['input']['pseudo_potential'][
+                              'functional']}  # note bulk has these potcar values also, other wise it would not get to process_items
+        dincar = item["input"]["incar"].copy()
+        dincar_reduced = {k: dincar.get(k, None) for k in ["LHFCALC", "HFSCREEN", "IVDW", "LUSE_VDW",
+                                                           "LDAU", "METAGGA"]}  # same as bulk
+        bincar = item["input"]["incar"].copy()
+        d_kpoints = item['calcs_reversed'][0]['input']['kpoints']
+        if type(d_kpoints) != dict:
+            d_kpoints = d_kpoints.as_dict()
+        b_kpoints = bulk_task['calcs_reversed'][0]['input']['kpoints']
+        if type(b_kpoints) != dict:
+            b_kpoints = b_kpoints.as_dict()
+
+        dir_name = item['calcs_reversed'][0]['dir_name']
+        bulk_dir_name = bulk_task['calcs_reversed'][0]['dir_name']
+
+        parameters['task_level_metadata'].update({'defect_dir_name': dir_name,
+                                                  'bulk_dir_name': bulk_dir_name,
+                                                  'bulk_taskdb_task_id': bulk_task['task_id'],
+                                                  'potcar_summary': potcar_summary.copy(),
+                                                  'incar_calctype_summary': dincar_reduced.copy(),
+                                                  'defect_incar': dincar.copy(),
+                                                  'bulk_incar': bincar.copy(),
+                                                  'defect_kpoints': d_kpoints.copy(),
+                                                  'bulk_kpoints': b_kpoints.copy(),
+                                                  'defect_task_last_updated': item['last_updated']})
+
+        if (dincar_reduced['HFSCREEN'] in [None, False, 'False']) and ("hybrid_bs_meta" in item):
+            parameters.update({k: item["hybrid_bs_meta"][k] for k in ['hybrid_cbm', 'hybrid_vbm']})
+            parameters.update({'hybrid_gap': parameters['hybrid_cbm'] - parameters['hybrid_vbm']})
+            parameters['task_level_metadata'].update({k: item["hybrid_bs_meta"][k] for k in ['hybrid_CBM_task_id',
+                                                                                             'hybrid_VBM_task_id']})
+        return parameters
+
+    def get_bulk_chg_correction_metadata(self, bulk_task, parameters):
+        if 'locpot' in bulk_task['calcs_reversed'][0]['output'].keys():
+            bulklpt = bulk_task['calcs_reversed'][0]['output']['locpot']
+            axes = list(bulklpt.keys())
+            axes.sort()
+            parameters.update( {'bulk_planar_averages': [bulklpt[ax] for ax in axes]} )
+        else:
+            self.logger.error('BULKTYPEcalc: {} (task-id {}) does not '
+                              'have locpot values for parsing'.format( bulk_task['task_label'],
+                                                                       bulk_task['task_id']))
+
+        if 'outcar' in bulk_task['calcs_reversed'][0]['output'].keys():
+            bulkoutcar = bulk_task['calcs_reversed'][0]['output']['outcar']
+            bulk_atomic_site_averages = bulkoutcar['electrostatic_potential']
+            parameters.update( {'bulk_atomic_site_averages': bulk_atomic_site_averages})
+        else:
+            self.logger.error('BULKTYPEcalc: {} (task-id {}) does not '
+                              'have outcar values for parsing'.format( bulk_task['task_label'],
+                                                                       bulk_task['task_id']))
+        return parameters
+
+    def get_bulk_gap_data(self, bulk_task, parameters):
+        bulk_structure = parameters['bulk_sc_structure']
+        try:
+            with MPRester() as mp:
+                # mplist = mp.find_structure(bulk_structure) #had to hack this because this wasnt working??
+                tmp_mplist = mp.get_entries_in_chemsys(list(bulk_structure.symbol_set))
+            mplist = [ment.entry_id for ment in tmp_mplist if ment.composition.reduced_composition == \
+                      bulk_structure.composition.reduced_composition]
+            #TODO: this is a hack because find_structure was data intensive. simplify the hack to do less queries...
+        except:
+            raise ValueError("Error with querying MPRester for {}".format( bulk_structure.composition.reduced_formula))
+
+        mpid_fit_list = []
+        for trial_mpid in mplist:
+            with MPRester() as mp:
+                mpstruct = mp.get_structure_by_material_id(trial_mpid)
+            if StructureMatcher(ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+                                primitive_cell=True, scale=False, attempt_supercell=True,
+                                allow_subset=False).fit(bulk_structure, mpstruct):
+                mpid_fit_list.append( trial_mpid)
+
+        if len(mpid_fit_list) == 1:
+            mpid = mpid_fit_list[0]
+            self.logger.debug("Single mp-id found for bulk structure:{}.".format( mpid))
+        elif len(mpid_fit_list) > 1:
+            num_mpid_list = [int(mp.split('-')[1]) for mp in mpid_fit_list]
+            num_mpid_list.sort()
+            mpid  = 'mp-'+str(num_mpid_list[0])
+            self.logger.debug("Multiple mp-ids found for bulk structure:{}\nWill use lowest number mpid "
+                  "for bulk band structure = {}.".format(str(mpid_fit_list), mpid))
+        else:
+            self.logger.debug("Could not find bulk structure in MP database after tying the "
+                              "following list:\n{}".format( mplist))
+            mpid = None
+
+        if mpid and not parameters['task_level_metadata']['incar_calctype_summary']['LHFCALC']:
+            #TODO: NEED to be smarter about use of +U etc in MP gga band structure calculations...
+            with MPRester() as mp:
+                bs = mp.get_bandstructure_by_material_id(mpid)
+
+            parameters['task_level_metadata'].update( {'MP_gga_BScalc_data':
+                                                           bs.get_band_gap().copy()} ) #contains gap kpt transition
+            cbm = bs.get_cbm()['energy']
+            vbm = bs.get_vbm()['energy']
+            gap = bs.get_band_gap()['energy']
+        else:
+            parameters['task_level_metadata'].update( {'MP_gga_BScalc_data': None}) #to signal no MP BS is used
+            cbm = bulk_task['output']['cbm']
+            vbm = bulk_task['output']['vbm']
+            gap = bulk_task['output']['bandgap']
+
+        parameters.update( {'mpid': mpid,
+                            "cbm": cbm, "vbm": vbm, "gap": gap} )
+
+        return parameters
+
+    def load_defect_and_structure_data(self, defect_task, parameters):
+        """
+        This loads defect object from task_dict AND
+        loads initial and final structures (making sure their sites are indexed in an equivalent manner)
+
+        if can't confirm that indices are equivalent, then initial_defect_structure = None
+
+        :param defect_task:
+        :param parameters:
+        :return:
+        """
+
+        if type(defect_task['transformations']) != dict:
+            defect_task['transformations'] = defect_task['transformations'].as_dict()
+
+        defect = defect_task['transformations']['history'][0]['defect']
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        defect = MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
+        scaling_matrix = MontyDecoder().process_decoded(defect_task['transformations']['history'][0]['scaling_matrix'])
+
+        final_defect_structure = defect_task['output']['structure']
+        if type(final_defect_structure) != Structure:
+            final_defect_structure = Structure.from_dict(final_defect_structure)
+
+        # build initial_defect_structure from very first ionic relaxation step (ensures they are indexed the same]
+        initial_defect_structure = Structure.from_dict(
+            defect_task['calcs_reversed'][-1]['output']['ionic_steps'][0]['structure'])
+
+        # confirm structure matching
+        ids = defect.generate_defect_structure(scaling_matrix)
+        ids_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+                                   primitive_cell=False, scale=False, attempt_supercell=False,
+                                   allow_subset=False)
+        if not ids_sm.fit( ids, initial_defect_structure):
+            self.logger.error("Could not match initial-to-final structure. Will not load initial structure.")
+            initial_defect_structure = None
+
+        parameters.update({'final_defect_structure': final_defect_structure,
+                           'initial_defect_structure': initial_defect_structure,
+                           'scaling_matrix': scaling_matrix})
+
+        return defect, parameters
+
+    def load_defect_chg_correction_metadata(self, defect, defect_task, parameters):
+
+        # --> Load information for Freysoldt related parsing
+        if 'locpot' in defect_task['calcs_reversed'][0]['output'].keys():
+            deflpt = defect_task['calcs_reversed'][0]['output']['locpot']
+            axes = list(deflpt.keys())
+            axes.sort()
+            defect_planar_averages = [deflpt[ax] for ax in axes]
+            abc = parameters['initial_defect_structure'].lattice.abc
+            axis_grid = []
+            for ax in range(3):
+                num_pts = len(defect_planar_averages[ax])
+                axis_grid.append([i / num_pts * abc[ax] for i in range(num_pts)])
+            parameters.update({'axis_grid': axis_grid,
+                               'defect_planar_averages': defect_planar_averages})
+        else:
+            self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have locpot values for '
+                              'parsing Freysoldt correction'.format(defect_task['task_label'], defect_task['task_id']))
+
+
+        # --> Load information for Kumagai related parsing
+        if 'outcar' in defect_task['calcs_reversed'][0]['output'].keys() and \
+                parameters['initial_defect_structure']:
+
+            defoutcar = defect_task['calcs_reversed'][0]['output']['outcar']
+            defect_atomic_site_averages = defoutcar['electrostatic_potential']
+            bulk_sc_structure = parameters['bulk_sc_structure']
+            initial_defect_structure = parameters['initial_defect_structure']
+
+            bulksites = [site.frac_coords for site in bulk_sc_structure]
+            initsites = [site.frac_coords for site in initial_defect_structure]
+            distmatrix = initial_defect_structure.lattice.get_all_distances(bulksites, initsites) #first index of this list is bulk index
+            min_dist_with_index = [[min(distmatrix[bulk_index]), int(bulk_index),
+                                    int(distmatrix[bulk_index].argmin())] for bulk_index in range(len(distmatrix))] # list of [min dist, bulk ind, defect ind]
+
+            found_defect = False
+            site_matching_indices = []
+            poss_defect = []
+            if isinstance(defect, (Vacancy, Interstitial)):
+                for mindist, bulk_index, defect_index in min_dist_with_index:
+                    if mindist < self.ltol:
+                        site_matching_indices.append( [bulk_index, defect_index])
+                    elif isinstance(defect, Vacancy):
+                        poss_defect.append( [bulk_index, bulksites[ bulk_index][:]])
+
+                if isinstance(defect, Interstitial):
+                    poss_defect = [ [ind, fc[:]] for ind, fc in enumerate(initsites) \
+                                    if ind not in np.array(site_matching_indices)[:,1]]
+
+            elif isinstance(defect, Substitution):
+                for mindist, bulk_index, defect_index in min_dist_with_index:
+                    species_match = bulk_sc_structure[bulk_index].specie == \
+                                    initial_defect_structure[defect_index].specie
+                    if mindist < self.ltol and species_match:
+                        site_matching_indices.append( [bulk_index, defect_index])
+
+                    elif not species_match:
+                        poss_defect.append( [defect_index, initsites[ defect_index][:]])
+
+
+            if len(poss_defect) == 1:
+                found_defect = True
+                defect_index_sc_coords = poss_defect[0][0]
+                defect_frac_sc_coords = poss_defect[0][1]
+            else:
+                self.logger.error("Found {} possible defect sites when matching bulk and "
+                                  "defect structure".format(len(poss_defect)))
+
+
+            if len(set(np.array(site_matching_indices)[:, 0])) != len(set(np.array(site_matching_indices)[:, 1])):
+                self.logger.error("Error occured in site_matching routine. Double counting of site matching "
+                                  "occured:{}\nAbandoning Kumagai parsing.".format(site_matching_indices))
+                found_defect = False
+
+            # assuming Wigner-Seitz radius for sampling radius
+            wz = initial_defect_structure.lattice.get_wigner_seitz_cell()
+            dist = []
+            for facet in wz:
+                midpt = np.mean(np.array(facet), axis=0)
+                dist.append(np.linalg.norm(midpt))
+            sampling_radius = min(dist)
+
+            if found_defect:
+                parameters.update({'defect_atomic_site_averages': defect_atomic_site_averages,
+                                   'site_matching_indices': site_matching_indices,
+                                   'sampling_radius': sampling_radius,
+                                   'defect_frac_sc_coords': defect_frac_sc_coords,
+                                   'defect_index_sc_coords': defect_index_sc_coords})
+            else:
+                self.logger.error("Error in mapping procedure for bulk to initial defect structure.")
+
+        else:
+            self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have outcar values for '
+                              'parsing Kumagai'.format(defect_task['task_label'], defect_task['task_id']))
+
+        return parameters
+
+
+class DefectThermoBuilder(Builder):
+    def __init__(self,
+                 defects,
+                 defectthermo,
+                 query=None,
+                 ltol=0.2,
+                 stol=0.3,
+                 angle_tol=5,
+                 update_all=False,
+                 **kwargs):
+        """
+        Creates DefectEntry from vasp task docs
+
+        Args:
+            defects (Store): Store of defect entries
+            defectthermo (Store): Store of DefectPhaseDiagram documents
+            query (dict): dictionary to limit materials to be analyzed (note query is for defects Store)
+            ltol (float): StructureMatcher tuning parameter for matching tasks to materials
+            stol (float): StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol (float): StructureMatcher tuning parameter for matching tasks to materials
+            update_all (bool): Whether to consider all task ids from defects store.
+                Default is False (will not re-consider task-ids which have been considered previously.
+        """
+
+        self.defects = defects
+        self.defectthermo = defectthermo
+        self.query = query if query else {}
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.update_all = update_all
+        super().__init__(sources=[defects], targets=[defectthermo], **kwargs)
+
+    def get_items(self):
+        self.logger.info("DefectThermo Builder Started")
+
+        # Save timestamp for update operation
+        self.time_stamp = datetime.utcnow()
+
+        #get all new Defect Entries since last time DefectThermo was updated...
+        q = dict(self.query)
+        self.logger.debug('query is initially: {}'.format( q))
+        if not self.update_all:
+            # if not update_all then grab entry_ids of defects that have been analyzed already...
+            prev_dpd = list(self.defectthermo.query(properties=['metadata.all_entry_ids_considered']))
+            self.logger.debug('Found {} previous dpd objects'.format( len(prev_dpd)))
+
+            if "entry_id" not in q.keys():
+                q["entry_id"] = {"$nin": []}
+            elif "$nin" not in q["entry_id"].keys():
+                q["entry_id"]["$nin"] = []
+            for dpd in prev_dpd:
+                q["entry_id"]["$nin"].extend( list( dpd['metadata']['all_entry_ids_considered']))
+
+            self.logger.debug('query after removing previously considered entry_ids is: {}'.format( q))
+
+        # q.update(self.defects.lu_filter(self.defectthermo))  #TODO: does this work?? / is it needed?
+
+        # restricted amount of defect info for PD, so not an overwhelming database query
+        entry_keys_needed_for_thermo = ['defect', 'parameters.task_level_metadata', 'parameters.last_updated',
+                                        'task_id', 'entry_id', '@module', '@class',
+                                        'uncorrected_energy', 'corrections', 'parameters.dielectric',
+                                        'parameters.cbm', 'parameters.vbm', 'parameters.gap',
+                                        'parameters.hybrid_cbm', 'parameters.hybrid_vbm',
+                                        'parameters.hybrid_gap', 'parameters.potalign',
+                                        'parameters.freysoldt_meta',
+                                        'parameters.kumagai_meta', 'parameters.is_compatible',
+                                        'parameters.delocalization_meta', 'parameters.phasediagram_meta']
+        defect_entries = list(self.defects.query(criteria=q,
+                                                        properties=entry_keys_needed_for_thermo))
+        thermo_entries = list(self.defectthermo.query(properties=['bulk_chemsys', 'bulk_prim_struct',
+                                                                  'run_metadata', 'entry_id']))
+        self.logger.info("Found {} new defect entries to consider".format( len(defect_entries)))
+
+        #group defect entries based on bulk types and task level metadata info
+
+        sm = StructureMatcher(ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+                              primitive_cell=True, scale=False, attempt_supercell=True,
+                                allow_subset=False)
+        grpd_entry_list = []
+        for entry_dict in defect_entries:
+            #get bulk chemsys and struct
+            base_bulk_struct = Structure.from_dict(entry_dict['defect']['structure'])
+            base_bulk_struct = base_bulk_struct.get_primitive_structure()
+            bulk_chemsys = "-".join(sorted((base_bulk_struct.symbol_set)))
+
+            #get run metadata info
+            entry_dict_md = entry_dict['parameters']['task_level_metadata']
+            run_metadata = entry_dict_md['incar_calctype_summary'].copy()
+            if 'potcar_summary' in entry_dict_md:
+                run_metadata.update( entry_dict_md['potcar_summary'].copy())
+                #only want to filter based on bulk POTCAR
+                pspec, plab = set(), set()
+                for symbol, full_potcar_label in zip(run_metadata['pot_labels'],
+                                                     run_metadata['pot_spec']):
+                    red_sym = symbol.split('_')[0]
+                    if red_sym in base_bulk_struct.symbol_set:
+                        pspec.add( symbol)
+                        plab.add( full_potcar_label)
+
+                run_metadata['pot_spec'] = "-".join(sorted(set(pspec))) #had to switch to this approach for proper dict comparison later on
+                run_metadata['pot_labels'] = "-".join(sorted(set(plab)))
+
+            # see if defect_entry matches an grouped entry list item already in progress
+            # does this by matching bulk structure and run metadata
+            matched = False
+            for grp_ind, grpd_entry in enumerate(grpd_entry_list):
+                if grpd_entry[0] == bulk_chemsys and grpd_entry[1] == run_metadata:
+                    if sm.fit( base_bulk_struct, grpd_entry[2]):
+                        grpd_entry[3].append( entry_dict.copy())
+                        matched = True
+                        break
+                    else:
+                        continue
+                else:
+                    continue
+
+            if not matched: #have not logged yet, see if previous thermo phase diagram was created for this system
+                for t_ent in thermo_entries:
+                    if t_ent['bulk_chemsys'] == bulk_chemsys and t_ent['run_metadata'] == run_metadata:
+                        if sm.fit(base_bulk_struct, Structure.from_dict(t_ent['bulk_prim_struct'])):
+                            if not self.update_all:
+                                #if not wanting to update everything, then get previous entries from thermo database
+                                old_entry_set = list(self.defectthermo.query( {'entry_id': t_ent['entry_id']},
+                                                                               properties=['entries']))[0]['entries']
+                                old_entries_list = [entry.as_dict() if type(entry) != dict else entry for entry in
+                                                    old_entry_set]
+                            else:
+                                old_entries_list = []
+
+                            old_entries_list.append( entry_dict.copy())
+                            grpd_entry_list.append( [bulk_chemsys, run_metadata.copy(),  base_bulk_struct.copy(),
+                                                     old_entries_list, t_ent['entry_id']])
+                            matched = True
+                            break
+                        else:
+                            continue
+                    else:
+                        continue
+
+            if not matched:
+                # create new thermo phase diagram for this system,
+                # entry_id will be generated later on
+                grpd_entry_list.append( [bulk_chemsys, run_metadata.copy(), base_bulk_struct.copy(),
+                                         [entry_dict.copy()], 'None'])
+
+
+        for new_defects_for_thermo_entry in grpd_entry_list:
+            yield new_defects_for_thermo_entry
+
+
+    def process_item(self, items):
+        #group defect entries into groups of same defect type (charge included)
+        distinct_entries_full = [] #for storing full defect_dict
+        distinct_entries = {} #for storing defect object
+        bulk_chemsys, run_metadata, bulk_prim_struct, entrylist, entry_id = items
+        self.logger.debug("Processing bulk_chemsys {}".format(bulk_chemsys))
+
+        needed_entry_keys = ['@module', '@class', 'defect', 'uncorrected_energy', 'corrections',
+                             'parameters', 'entry_id', 'task_id']
+        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)
+        for entry_dict in entrylist:
+            entry = DefectEntry.from_dict({k:v for k,v in entry_dict.items() if k in needed_entry_keys})
+            matched = False
+            for grpind, grpdefect in distinct_entries.items():
+                if pdc.are_equal( entry.defect, grpdefect.defect):
+                    matched = True
+                    break
+
+            if matched:
+                distinct_entries_full[grpind].append( entry_dict.copy())
+            else:
+                distinct_entries_full.append( [entry_dict.copy()])
+                nxt_ind = max( distinct_entries.keys()) + 1 if len(distinct_entries.keys()) else 0
+                distinct_entries[nxt_ind] = entry.copy()
+
+        #now sort through entries and pick one that has been updated last (but track all previous entry_ids)
+        all_entry_ids_considered, entries = [], []
+        for ident_entries in distinct_entries_full:
+            all_entry_ids_considered.extend([ent['entry_id'] for ent in ident_entries])
+            # sort based on which was done most recently
+            lu_list = []
+            for ent_ind, ent in enumerate(ident_entries):
+                try:
+                    lu = ent['parameters']['task_level_metadata']['defect_task_last_updated']
+                except:
+                    lu = ent['parameters']['last_updated']
+                if isinstance( lu, dict): #deal with case when datetime objects were not properly loaded
+                    lu = MontyDecoder().process_decoded( lu)
+                lu_list.append( [lu, ent_ind])
+            try:
+                lu_list.sort(reverse=True)
+            except:
+                for_print_lu_list = [[lu, ident_entries[ent_ind]['entry_id']] for lu, ent_ind in lu_list]
+                raise ValueError("Error with last_updated sorting of list:\n{}".format( for_print_lu_list))
+            recent_entry_dict = ident_entries[ lu_list[0][1]]
+
+            new_entry = DefectEntry.from_dict( {k:v for k,v in recent_entry_dict.items() if k in needed_entry_keys})
+            entries.append( new_entry.copy())
+
+        # get vbm and bandgap; also verify all vbms and bandgaps are the same
+        vbm = entries[0].parameters['vbm']
+        band_gap = entries[0].parameters['gap']
+        for ent in entries:
+            if vbm != ent.parameters['vbm']:
+                self.logger.error("Logging vbm = {} (retrieved from {}, task-id {}) but {} "
+                                  "(task-id {}) has vbm = {}. Be careful with this defectphasediagram "
+                                  "if these are very different".format( vbm, entries[0].name, entries[0].entry_id,
+                                                                        ent.name, ent.entry_id, ent.parameters['vbm']))
+            if band_gap != ent.parameters['gap']:
+                self.logger.error("Logging gap = {} (retrieved from {}, task-id {}) but {} "
+                                  "(task-id {}) has gap = {}. Be careful with this defectphasediagram "
+                                  "if these are very different".format( band_gap, entries[0].name, entries[0].entry_id,
+                                                                        ent.name, ent.entry_id, ent.parameters['gap']))
+
+        defect_phase_diagram = DefectPhaseDiagram( entries, vbm, band_gap, filter_compatible=False,
+                                                   metadata={'all_entry_ids_considered': all_entry_ids_considered})
+        defect_phase_diagram_as_dict = defect_phase_diagram.as_dict()
+        defect_phase_diagram_as_dict.update( {'bulk_chemsys': bulk_chemsys, 'bulk_prim_struct': bulk_prim_struct.as_dict(),
+                                'run_metadata': run_metadata, 'entry_id': entry_id, 'last_updated': datetime.utcnow()})
+
+        return defect_phase_diagram_as_dict
+
+    def update_targets(self, items):
+        list_entry_ids = list(self.defectthermo.distinct('entry_id'))
+        if len(list_entry_ids):
+            next_entry_id = max(list_entry_ids) + 1
+        else:
+            next_entry_id = 1
+        for item in items:
+            if item['entry_id'] == 'None':
+                item['entry_id'] = next_entry_id
+                next_entry_id += 1
+
+        self.logger.info("Updating {} DefectThermo documents".format(len(items)))
+
+        self.defectthermo.update(items, update_lu=True, key='entry_id')
diff --git a/emmet-core/emmet/core/cp2k/calc_types/utils.py b/emmet-core/emmet/core/cp2k/calc_types/utils.py
index 0e13af3e8a..1bd448d77a 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/utils.py
+++ b/emmet-core/emmet/core/cp2k/calc_types/utils.py
@@ -54,6 +54,8 @@ def _variant_equal(v1, v2) -> bool:
             ):
                 return RunType(f"{functional_class}{is_hubbard}")
 
+    # TODO elegant way to handle this? 
+    # This is a hack to get the non-standard hybrids to work
     if parameters.get('FRACTION'):
         return RunType(f"HYBRID{is_hubbard}")
 

From e6310e9ae21edc399b2987681139d00db59df1ec Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 8 Mar 2022 14:18:52 -0800
Subject: [PATCH 34/41] Versions

---
 emmet-builders/_version.py | 2 +-
 emmet-core/_version.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/emmet-builders/_version.py b/emmet-builders/_version.py
index 75a69a764e..4e2738d992 100644
--- a/emmet-builders/_version.py
+++ b/emmet-builders/_version.py
@@ -1 +1 @@
-__version__ = "0.21.19"
\ No newline at end of file
+__version__ = "0.21.21.dev40+g598f9125.d20220308"
\ No newline at end of file
diff --git a/emmet-core/_version.py b/emmet-core/_version.py
index b350b0bcd3..4e2738d992 100644
--- a/emmet-core/_version.py
+++ b/emmet-core/_version.py
@@ -1 +1 @@
-__version__ = "0.21.20"
\ No newline at end of file
+__version__ = "0.21.21.dev40+g598f9125.d20220308"
\ No newline at end of file

From 182e8208debbf72752bc85f6f0b1cc27b0a6201f Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 8 Mar 2022 16:56:26 -0800
Subject: [PATCH 35/41] Import updates

---
 emmet-builders/emmet/builders/cp2k/defects.py |  5 ++--
 .../emmet/builders/cp2k/materials.py          |  2 +-
 emmet-core/_version.py                        |  2 +-
 emmet-core/emmet/core/cp2k/material.py        |  3 +-
 emmet-core/emmet/core/cp2k/task.py            | 30 ++++++++++++++++++-
 emmet-core/emmet/core/defect.py               |  6 +++-
 6 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 978fa777a6..6f8880a863 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -13,7 +13,7 @@
 
 from atomate.utils.utils import load_class
 
-from emmet.core import SETTINGS
+from emmet.core.settings import EmmetSettings
 from emmet.core.utils import jsanitize, get_sg
 from emmet.core.defect import DefectDoc, DefectDoc2d, DefectThermoDoc
 from emmet.core.cp2k.calc_types import TaskType
@@ -22,6 +22,8 @@
 from emmet.builders.settings import EmmetBuildSettings
 from emmet.builders.cp2k.utils import get_mpid, synchronous_query
 
+SETTINGS = EmmetSettings()
+
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 __maintainer__ = "Jason Munro"
 
@@ -617,7 +619,6 @@ def process_item(self, items):
         """
         defect_doc, item_bundle = items
         self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
-
         if item_bundle:
             material_id = self._get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..
             if defect_doc:
diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
index 15b5c0a07a..f74a37d696 100644
--- a/emmet-builders/emmet/builders/cp2k/materials.py
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -7,12 +7,12 @@
 
 from emmet.builders.utils import maximal_spanning_non_intersecting_subsets
 from emmet.builders.settings import EmmetBuildSettings
-from emmet.core import SETTINGS
 from emmet.core.utils import group_structures, jsanitize
 from emmet.core.cp2k.calc_types import TaskType
 from emmet.core.cp2k.material import MaterialsDoc
 from emmet.core.cp2k.task import TaskDocument
 
+SETTINGS = EmmetBuildSettings()
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 __maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"
 
diff --git a/emmet-core/_version.py b/emmet-core/_version.py
index 4e2738d992..db3d19f16c 100644
--- a/emmet-core/_version.py
+++ b/emmet-core/_version.py
@@ -1 +1 @@
-__version__ = "0.21.21.dev40+g598f9125.d20220308"
\ No newline at end of file
+__version__ = "0.21.21.dev41+ge6310e9a.d20220309"
\ No newline at end of file
diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py
index cf6d8b4b65..8f85abeb8e 100644
--- a/emmet-core/emmet/core/cp2k/material.py
+++ b/emmet-core/emmet/core/cp2k/material.py
@@ -5,7 +5,7 @@
 from pymatgen.analysis.structure_matcher import StructureMatcher
 from pymatgen.entries.computed_entries import ComputedStructureEntry
 
-from emmet.core import SETTINGS
+from emmet.core.settings import EmmetSettings
 from emmet.core.material import MaterialsDoc as CoreMaterialsDoc
 from emmet.core.material import PropertyOrigin as PropertyOrigin
 from emmet.core.structure import StructureMetadata
@@ -13,6 +13,7 @@
 from emmet.core.cp2k.task import TaskDocument
 from emmet.builders.cp2k.utils import get_mpid
 
+SETTINGS = EmmetSettings()
 
 class MaterialsDoc(CoreMaterialsDoc, StructureMetadata):
 
diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py
index 19bfe8b364..d30db31acb 100644
--- a/emmet-core/emmet/core/cp2k/task.py
+++ b/emmet-core/emmet/core/cp2k/task.py
@@ -7,7 +7,7 @@
 from pymatgen.core import Structure
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 
-from emmet.core.task import TaskDocument as BaseTaskDocument
+from emmet.core.base import EmmetBaseModel
 from emmet.core.structure import StructureMetadata
 from emmet.core.utils import ValueEnum
 from emmet.core.cp2k.calc_types import (
@@ -19,6 +19,33 @@
     task_type,
 )
 from emmet.core.math import Matrix3D, Vector3D
+from emmet.core.mpid import MPID
+
+
+class BaseTaskDocument(EmmetBaseModel):
+    """
+    Definition of base Task Document
+    """
+
+    calc_code: str = Field(description="The calculation code used to compute this task")
+    version: str = Field(None, description="The version of the calculation code")
+    dir_name: str = Field(None, description="The directory for this task")
+    task_id: MPID = Field(None, description="the Task ID For this document")
+
+    completed: bool = Field(False, description="Whether this calcuation completed")
+    completed_at: datetime = Field(
+        None, description="Timestamp for when this task was completed"
+    )
+    last_updated: datetime = Field(
+        default_factory=datetime.utcnow,
+        description="Timestamp for this task document was last updated",
+    )
+
+    tags: List[str] = Field([], description="Metadata tags for this task document")
+
+    warnings: List[str] = Field(
+        None, description="Any warnings related to this property"
+    )
 
 
 class Status(ValueEnum):
@@ -103,6 +130,7 @@ class TaskDocument(BaseTaskDocument, StructureMetadata):
     Definition of CP2K Task Document
     """
 
+    calc_code = "CP2K"
     dir_name: str = Field(None, description="The directory for this CP2K task")
     run_stats: RunStatistics = Field(
         None,
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index ed059efe64..e2826d9e54 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -1,10 +1,14 @@
 """ Core definition for Defect property Document """
+from __future__ import annotations
+
 from datetime import datetime
 from typing import ClassVar, Dict, Tuple, Mapping, List
+from pydantic import BaseModel, Field
+from pydantic import validator
+
 from monty.json import MontyDecoder
 from monty.tempfile import ScratchDir
 from itertools import groupby
-from pydantic import Field, validator, BaseModel
 
 from pymatgen.core import Structure, Composition, Element
 from pymatgen.analysis.defects.core import DefectEntry, Defect

From 9a9baac51bcc64858faf3bb60a3b7aaecf412b34 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 15 Mar 2022 15:49:26 -0700
Subject: [PATCH 36/41] Changes

Fixing imports, some formatting problems, add Constrained Optimization
---
 emmet-builders/_version.py                    |  2 +-
 emmet-builders/emmet/builders/cp2k/defects.py |  2 +
 .../emmet/builders/cp2k/materials.py          |  2 +-
 emmet-core/_version.py                        |  2 +-
 .../emmet/core/cp2k/calc_types/enums.py       | 31 +++++++--
 .../emmet/core/cp2k/calc_types/generate.py    | 15 +----
 .../emmet/core/cp2k/calc_types/utils.py       | 17 ++---
 emmet-core/emmet/core/cp2k/task.py            | 17 ++---
 emmet-core/emmet/core/defect.py               | 63 ++++++++++---------
 9 files changed, 81 insertions(+), 70 deletions(-)

diff --git a/emmet-builders/_version.py b/emmet-builders/_version.py
index 4e2738d992..129f1c9dcc 100644
--- a/emmet-builders/_version.py
+++ b/emmet-builders/_version.py
@@ -1 +1 @@
-__version__ = "0.21.21.dev40+g598f9125.d20220308"
\ No newline at end of file
+__version__ = "0.21.21.dev42+g182e8208.d20220315"
\ No newline at end of file
diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index 6f8880a863..e0c8b9140c 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -603,6 +603,7 @@ def __get_pristine_supercell(task):
             return Structure.from_dict(task['transformations']['history'][0]['input_structure'])
         return Structure.from_dict(task['input']['structure'])
 
+
 # TODO This needs to be unified into one builder somehow
 class DefectBuilder2d(DefectBuilder):
 
@@ -627,6 +628,7 @@ def process_item(self, items):
                 defect_doc = DefectDoc2d.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)
             return defect_doc.dict()
 
+
 class DefectThermoBuilder(Builder):
 
     """
diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
index f74a37d696..82ec58f099 100644
--- a/emmet-builders/emmet/builders/cp2k/materials.py
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -8,11 +8,11 @@
 from emmet.builders.utils import maximal_spanning_non_intersecting_subsets
 from emmet.builders.settings import EmmetBuildSettings
 from emmet.core.utils import group_structures, jsanitize
-from emmet.core.cp2k.calc_types import TaskType
 from emmet.core.cp2k.material import MaterialsDoc
 from emmet.core.cp2k.task import TaskDocument
 
 SETTINGS = EmmetBuildSettings()
+
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 __maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"
 
diff --git a/emmet-core/_version.py b/emmet-core/_version.py
index db3d19f16c..129f1c9dcc 100644
--- a/emmet-core/_version.py
+++ b/emmet-core/_version.py
@@ -1 +1 @@
-__version__ = "0.21.21.dev41+ge6310e9a.d20220309"
\ No newline at end of file
+__version__ = "0.21.21.dev42+g182e8208.d20220315"
\ No newline at end of file
diff --git a/emmet-core/emmet/core/cp2k/calc_types/enums.py b/emmet-core/emmet/core/cp2k/calc_types/enums.py
index 5d5ab109eb..8f82f8fcd7 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/enums.py
+++ b/emmet-core/emmet/core/cp2k/calc_types/enums.py
@@ -14,7 +14,7 @@ class RunType(ValueEnum):
     METAGGA = "METAGGA"
     HYBRID = "HYBRID"
     HSE06 = "HSE06"
-    PB0 = "PB0"
+    PBE0 = "PBE0"
     RSH = "RSH"
     PBE_U = "PBE+U"
     GGA_U = "GGA+U"
@@ -22,7 +22,7 @@ class RunType(ValueEnum):
     METAGGA_U = "METAGGA+U"
     HYBRID_U = "HYBRID+U"
     HSE06_U = "HSE06+U"
-    PB0_U = "PB0+U"
+    PBE0_U = "PBE0+U"
     RSH_U = "RSH+U"
     LDA = "LDA"
     LDA_U = "LDA+U"
@@ -33,6 +33,7 @@ class TaskType(ValueEnum):
 
     Static = "Static"
     Structure_Optimization = "Structure Optimization"
+    Constrained_Structure_Optimization = "Constrained Structure Optimization"
 
 
 class CalcType(ValueEnum):
@@ -40,37 +41,55 @@ class CalcType(ValueEnum):
 
     PBE_Static = "PBE Static"
     PBE_Structure_Optimization = "PBE Structure Optimization"
+    PBE_Constrained_Structure_Optimization = "PBE Constrained Structure Optimization"
     GGA_Static = "GGA Static"
     GGA_Structure_Optimization = "GGA Structure Optimization"
+    GGA_Constrained_Structure_Optimization = "GGA Constrained Structure Optimization"
     R2SCAN_Static = "R2SCAN Static"
     R2SCAN_Structure_Optimization = "R2SCAN Structure Optimization"
+    R2SCAN_Constrained_Structure_Optimization = "R2SCAN Constrained Structure Optimization"
     METAGGA_Static = "METAGGA Static"
     METAGGA_Structure_Optimization = "METAGGA Structure Optimization"
+    METAGGA_Constrained_Structure_Optimization = "METAGGA Constrained Structure Optimization"
     HYBRID_Static = "HYBRID Static"
     HYBRID_Structure_Optimization = "HYBRID Structure Optimization"
+    HYBRID_Constrained_Structure_Optimization = "HYBRID Constrained Structure Optimization"
     HSE06_Static = "HSE06 Static"
     HSE06_Structure_Optimization = "HSE06 Structure Optimization"
-    PB0_Static = "PB0 Static"
-    PB0_Structure_Optimization = "PB0 Structure Optimization"
+    HSE06_Constrained_Structure_Optimization = "HSE06 Constrained Structure Optimization"
+    PBE0_Static = "PBE0 Static"
+    PBE0_Structure_Optimization = "PBE0 Structure Optimization"
+    PBE0_Constrained_Structure_Optimization = "PBE0 Constrained Structure Optimization"
     RSH_Static = "RSH Static"
     RSH_Structure_Optimization = "RSH Structure Optimization"
+    RSH_Constrained_Structure_Optimization = "RSH Constrained Structure Optimization"
     PBE_U_Static = "PBE+U Static"
     PBE_U_Structure_Optimization = "PBE+U Structure Optimization"
+    PBE_U_Constrained_Structure_Optimization = "PBE+U Constrained Structure Optimization"
     GGA_U_Static = "GGA+U Static"
     GGA_U_Structure_Optimization = "GGA+U Structure Optimization"
+    GGA_U_Constrained_Structure_Optimization = "GGA+U Constrained Structure Optimization"
     R2SCAN_U_Static = "R2SCAN+U Static"
     R2SCAN_U_Structure_Optimization = "R2SCAN+U Structure Optimization"
+    R2SCAN_U_Constrained_Structure_Optimization = "R2SCAN+U Constrained Structure Optimization"
     METAGGA_U_Static = "METAGGA+U Static"
     METAGGA_U_Structure_Optimization = "METAGGA+U Structure Optimization"
+    METAGGA_U_Constrained_Structure_Optimization = "METAGGA+U Constrained Structure Optimization"
     HYBRID_U_Static = "HYBRID+U Static"
     HYBRID_U_Structure_Optimization = "HYBRID+U Structure Optimization"
+    HYBRID_U_Constrained_Structure_Optimization = "HYBRID+U Constrained Structure Optimization"
     HSE06_U_Static = "HSE06+U Static"
     HSE06_U_Structure_Optimization = "HSE06+U Structure Optimization"
-    PB0_U_Static = "PB0+U Static"
-    PB0_U_Structure_Optimization = "PB0+U Structure Optimization"
+    HSE06_U_Constrained_Structure_Optimization = "HSE06+U Constrained Structure Optimization"
+    PBE0_U_Static = "PBE0+U Static"
+    PBE0_U_Structure_Optimization = "PBE0+U Structure Optimization"
+    PBE0_U_Constrained_Structure_Optimization = "PBE0+U Constrained Structure Optimization"
     RSH_U_Static = "RSH+U Static"
     RSH_U_Structure_Optimization = "RSH+U Structure Optimization"
+    RSH_U_Constrained_Structure_Optimization = "RSH+U Constrained Structure Optimization"
     LDA_Static = "LDA Static"
     LDA_Structure_Optimization = "LDA Structure Optimization"
+    LDA_Constrained_Structure_Optimization = "LDA Constrained Structure Optimization"
     LDA_U_Static = "LDA+U Static"
     LDA_U_Structure_Optimization = "LDA+U Structure Optimization"
+    LDA_U_Constrained_Structure_Optimization = "LDA+U Constrained Structure Optimization"
diff --git a/emmet-core/emmet/core/cp2k/calc_types/generate.py b/emmet-core/emmet/core/cp2k/calc_types/generate.py
index a6b2b21cd8..96b7f6f2cd 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/generate.py
+++ b/emmet-core/emmet/core/cp2k/calc_types/generate.py
@@ -1,25 +1,14 @@
 """ Module to define various calculation types as Enums for CP2K """
-import datetime
-from itertools import groupby, product
+from itertools import product
 from pathlib import Path
-from typing import Dict, Iterator, List
 
-import bson
-import numpy as np
-from monty.json import MSONable
 from monty.serialization import loadfn
-from pydantic import BaseModel
-from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
-from pymatgen.core.structure import Structure
-from typing_extensions import Literal
-
-from emmet.core import SETTINGS
-from emmet.core.utils import ValueEnum
 
 _RUN_TYPE_DATA = loadfn(str(Path(__file__).parent.joinpath("run_types.yaml").resolve()))
 _TASK_TYPES = [
     "Static",
     "Structure Optimization",
+    "Constrained Structure Optimization"
 ]
 
 _RUN_TYPES = (
diff --git a/emmet-core/emmet/core/cp2k/calc_types/utils.py b/emmet-core/emmet/core/cp2k/calc_types/utils.py
index 1bd448d77a..df9d9126d6 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/utils.py
+++ b/emmet-core/emmet/core/cp2k/calc_types/utils.py
@@ -4,7 +4,9 @@
 from monty.serialization import loadfn
 from typing import Iterable
 
-from emmet.core.cp2k.calc_types.enums import RunType, TaskType, CalcType
+from pymatgen.io.cp2k.inputs import Cp2kInput
+
+from emmet.core.cp2k.calc_types import RunType, TaskType, CalcType
 
 _RUN_TYPE_DATA = loadfn(str(Path(__file__).parent.joinpath("run_types.yaml").resolve()))
 
@@ -70,8 +72,10 @@ def task_type(
     """
 
     calc_type = []
-
-    cp2k_run_type = inputs.get('Run_type', '')
+    ci = Cp2kInput.from_dict(inputs.get('cp2k_input'))
+    if ci.check("MOTION/CONSTRAINT"):
+        calc_type.append("Constrained")
+    cp2k_run_type = inputs.get('cp2k_global').get('Run_type', '')
 
     if cp2k_run_type.upper() in ['ENERGY', 'ENERGY_FORCE', 'WAVEFUNCTION_OPTIMIZATION', 'WFN_OPT']:
         calc_type.append('Static')
@@ -117,7 +121,6 @@ def task_type(
 
     return TaskType(" ".join(calc_type))
 
-
 def calc_type(
     inputs: Dict,
 ) -> CalcType:
@@ -125,9 +128,9 @@ def calc_type(
     Determines the calc type
 
     Args:
-        inputs: inputs dict with an incar, kpoints, potcar, and poscar dictionaries
-        parameters: Dictionary of VASP parameters from Vasprun.xml
+        inputs: dict from InputSummary containing necessary data for determining 
+            calc type
     """
-    rt = run_type(inputs).value
+    rt = run_type(inputs.get("dft")).value
     tt = task_type(inputs).value
     return CalcType(f"{rt} {tt}")
diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py
index d30db31acb..9d569fbac0 100644
--- a/emmet-core/emmet/core/cp2k/task.py
+++ b/emmet-core/emmet/core/cp2k/task.py
@@ -10,14 +10,7 @@
 from emmet.core.base import EmmetBaseModel
 from emmet.core.structure import StructureMetadata
 from emmet.core.utils import ValueEnum
-from emmet.core.cp2k.calc_types import (
-    CalcType,
-    RunType,
-    TaskType,
-    calc_type,
-    run_type,
-    task_type,
-)
+from emmet.core.cp2k.calc_types import calc_type, run_type, task_type, CalcType, RunType, TaskType
 from emmet.core.math import Matrix3D, Vector3D
 from emmet.core.mpid import MPID
 
@@ -66,7 +59,7 @@ class InputSummary(BaseModel):
 
     atomic_kind_info: Dict = Field(None, description="Description of parameters used for each atomic kind")
 
-    cp2k_input_set: Dict = Field(None, description="The cp2k input used for this task")
+    cp2k_input: Dict = Field(None, description="The cp2k input used for this task")
 
     dft: Dict = Field(None, description="Description of the DFT parameters used in the last calc of this task")
 
@@ -175,13 +168,11 @@ def find_run_type(cls, v, values):
 
     @validator('task_type', pre=True, always=True)
     def find_task_type(cls, v, values):
-        return task_type(values.get('input').cp2k_global)
+        return task_type(values.get('input').dict())
 
     @validator('calc_type', pre=True, always=True)
     def find_calc_type(cls, v, values):
-        d = values.get('input').dft
-        d.update(values.get('input').cp2k_global)
-        return calc_type(d)
+        return calc_type(values.get("input").dict())
 
     @property
     def entry(self):
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index e2826d9e54..562a370396 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 from datetime import datetime
+from tokenize import group
 from typing import ClassVar, Dict, Tuple, Mapping, List
 from pydantic import BaseModel, Field
 from pydantic import validator
@@ -25,7 +26,7 @@
 from emmet.core.cp2k.task import TaskDocument
 from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
 from emmet.core.cp2k.material import MaterialsDoc
-from emmet.core.cp2k.calc_types.utils import run_type
+from emmet.core.cp2k.calc_types.utils import run_type, task_type
 from emmet.builders.cp2k.utils import get_mpid, matcher
 
 from pymatgen.analysis.defects.corrections import FreysoldtCorrection2d
@@ -88,6 +89,8 @@ class Config:
         default_factory=datetime.utcnow,
     )
 
+    metadata: Dict = Field(description="Metadata for this defect")
+
     # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
     @validator("entries", pre=True)
     def decode(cls, entries):
@@ -128,7 +131,7 @@ def _compare(new, old):
                 self.run_types.update({defect_task_doc.task_id: rt})
                 self.task_types.update({defect_task_doc.task_id: tt})
                 self.calc_types.update({defect_task_doc.task_id: ct})
-                entry = self.get_defect_entry_from_tasks(
+                entry = self.__class__.get_defect_entry_from_tasks(
                             defect_task=defect_task,
                             bulk_task=bulk_task,
                             dielectric=dielectric,
@@ -169,17 +172,20 @@ def from_tasks(cls, tasks: List, query='defect', material_id=None) -> "DefectDoc
         def _run_type(x):
             return run_type(x[0]['input']['dft']).value
 
+        def _task_type(x):
+            return task_type(x[0]['input']['dft']).value
+
         def _sort(x):
             # TODO return kpoint density, currently just does supercell size
             return -x[0]['nsites'], x[0]['output']['energy']
 
         entries = {}
         final_tasks = {}
+        metadata = {}
         for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
             sorted_tasks = sorted(tasks_for_runtype, key=_sort)
             ents = [cls.get_defect_entry_from_tasks(t[0], t[1], t[2], query) for t in sorted_tasks]
-            for e in ents:
-                print(e.energy)
+            metadata[key] = {'convergence': [(sorted_tasks[i][0]['nsites'], ents[i].energy) for i in range(len(ents))]}
 
             best_defect_task, best_bulk_task, dielectric = sorted_tasks[0]
             best_entry = cls.get_defect_entry_from_tasks(best_defect_task, best_bulk_task, dielectric, query)
@@ -201,6 +207,7 @@ def _sort(x):
                 'entry_ids': {rt: entries[rt].entry_id for rt in entries},
                 'defect': best_entry.defect,
                 'name': best_entry.defect.name,
+                'metadata': metadata,
         }
         prim = SpacegroupAnalyzer(best_entry.defect.bulk_structure).get_primitive_standard_structure()
         data.update(StructureMetadata.from_structure(prim).dict())
@@ -289,11 +296,11 @@ def get_init(x):
 
         # TODO Should probably get these even if not needed for corrections
         if init_defect_structure.charge != 0:
-            axis_grid = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_grid']] if 'v_hartree_grid' in bulk_task['output'] else None
-            bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_planar']] if 'v_hartree_planar' in bulk_task['output'] else None
-            defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_planar']] if 'v_hartree_planar' in defect_task['output'] else None
-            bulk_site_averages = [float(x) for x in bulk_task['output']['v_hartree_sites']] if 'v_hartree_sites' in bulk_task['output'] else None
-            defect_site_averages = [float(x) for x in defect_task['output']['v_hartree_sites']] if 'v_hartree_sites' in defect_task['output'] else None
+            axis_grid = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_grid']] if bulk_task['output'].get('v_hartree_grid') else None
+            bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_planar']] if bulk_task['output'].get('v_hartree_planar') else None
+            defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_planar']] if defect_task['output'].get('v_hartree_planar') else None
+            bulk_site_averages = [float(x) for x in bulk_task['output']['v_hartree_sites']] if bulk_task['output'].get('v_hartree_sites') else None
+            defect_site_averages = [float(x) for x in defect_task['output']['v_hartree_sites']] if defect_task['output'].get('v_hartree_sites') else None
 
             dfi, site_matching_indices = matcher(
                 init_bulk_structure, init_defect_structure,
@@ -325,7 +332,9 @@ class DefectDoc2d(DefectDoc):
     def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='defect'):
         parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
         if dielectric:
-            parameters['dielectric'] = dielectric
+            eps_parallel = (dielectric[0][0] + dielectric[1][1]) / 2
+            eps_perp = dielectric[2][2]
+            parameters['dielectric'] = (eps_parallel - 1) / (1 - 1/eps_perp)
 
         defect_entry = DefectEntry(
             cls.get_defect_from_task(query=query, task=defect_task),
@@ -335,32 +344,30 @@ def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, qu
         )
 
         DefectCompatibility().process_entry(defect_entry, perform_corrections=False)
-
-        if False: 
-            with ScratchDir('.'):
-                fc = FreysoldtCorrection2d(
+        with ScratchDir('.'):
+            fc = FreysoldtCorrection2d(
                     defect_entry.parameters.get('dielectric', 22), 
                     "LOCPOT.ref", "LOCPOT.def", encut=520, buffer=2
                 ) 
-                lref = VolumetricData(
-                    structure=Structure.from_dict(bulk_task['output']['structure']), 
-                    data={'total': MontyDecoder().process_decoded(bulk_task['v_hartree'])}
-                )
-                ldef = VolumetricData(
-                    structure=Structure.from_dict(defect_task['output']['structure']), 
-                    data={'total': MontyDecoder().process_decoded(defect_task['v_hartree'])}
-                )
-                lref.write_file("LOCPOT.ref")
-                ldef.write_file("LOCPOT.def")
-                ecorr = fc.get_correction(defect_entry)
-                defect_entry.corrections.update(ecorr)
-                defect_entry.parameters['freysoldt2d_meta'] = fc.metadata
+            lref = VolumetricData(
+                structure=Structure.from_dict(bulk_task['input']['structure']), 
+                data={'total': MontyDecoder().process_decoded(bulk_task['v_hartree'])}
+            )
+            ldef = VolumetricData(
+                structure=Structure.from_dict(defect_task['input']['structure']), 
+                data={'total': MontyDecoder().process_decoded(defect_task['v_hartree'])}
+            )
+            lref.write_file("LOCPOT.ref")
+            ldef.write_file("LOCPOT.def")
+            ecorr = fc.get_correction(defect_entry)
+            defect_entry.corrections.update(ecorr)
+            defect_entry.parameters['freysoldt2d_meta'] = fc.metadata
 
         defect_entry_as_dict = defect_entry.as_dict()
         defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
 
         return defect_entry
-
+    
 class DefectThermoDoc(BaseModel):
 
     class Config:

From d53da0137ff261172c011236274f108001c9022f Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Wed, 23 Mar 2022 14:06:57 -0700
Subject: [PATCH 37/41] Updates

---
 emmet-builders/emmet/builders/cp2k/defects.py | 388 ++++++++++--------
 .../emmet/builders/cp2k/materials.py          |   1 -
 .../emmet/builders/cp2k/task_validator.py     |  19 +-
 .../emmet/core/cp2k/calc_types/utils.py       |   3 -
 emmet-core/emmet/core/cp2k/validation.py      | 107 ++---
 emmet-core/emmet/core/defect.py               |  42 +-
 6 files changed, 287 insertions(+), 273 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index e0c8b9140c..f751c83553 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -21,13 +21,15 @@
 from emmet.core.cp2k.material import MaterialsDoc
 from emmet.builders.settings import EmmetBuildSettings
 from emmet.builders.cp2k.utils import get_mpid, synchronous_query
-
+from maggma.stores.gridfs import GridFSStore
 SETTINGS = EmmetSettings()
 
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 __maintainer__ = "Jason Munro"
 
 
+# TODO this builder is very close to being code agnostic. We need only resolve the standard key names and
+# how they are fed to the DefectDoc class. e.g. VASP calcs store "locpot", but CP2K store "v_hartree"
 class DefectBuilder(Builder):
     """
     The DefectBuilder collects task documents performed on structures containing a single point defect.
@@ -52,6 +54,14 @@ class DefectBuilder(Builder):
         6.) Update the defect store
     """
 
+    #TODO how to incorporate into settings?
+    DEFAULT_ALLOWED_TASKS = [
+            TaskType.Structure_Optimization.value, 
+            TaskType.Static.value
+            ]
+
+    # TODO: should electrostatic_potential store be optional in case people
+    # unify their LOCPOT data?
     # TODO: should dielectric/electronic_structure be optional or required?
     def __init__(
         self,
@@ -63,20 +73,24 @@ def __init__(
         electrostatic_potentials: Store,
         task_validation: Optional[Store] = None,
         query: Optional[Dict] = None,
-        allowed_task_types: Optional[List[str]] = None,
+        allowed_task_types: Optional[List[str]] = DEFAULT_ALLOWED_TASKS,
         settings: Optional[EmmetBuildSettings] = None,
+        defects_2d: Optional[bool] = False,
         **kwargs,
     ):
         """
         Args:
             tasks: Store of task documents
             defects: Store of defect documents to generate
+            dielectric: Store of dielectric data
+            electronic_structure: Store of electronic structure data
+            materials: Store of materials documents
+            electrostatic_potentials: Store of electrostatic potential data. These
+                are generally stored in seperately from the tasks on GridFS due to their size.
+            task_validation: Store of task validation documents.
             query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property
             allowed_task_types: list of task_types that can be processed
-            symprec: tolerance for SPGLib spacegroup finding
-            ltol: StructureMatcher tuning parameter for matching tasks to materials
-            stol: StructureMatcher tuning parameter for matching tasks to materials
-            angle_tol: StructureMatcher tuning parameter for matching tasks to materials
+            settings: EmmetBuildSettings object
         """
 
         self.tasks = tasks
@@ -85,20 +99,53 @@ def __init__(
         self.dielectric = dielectric
         self.electronic_structure = electronic_structure
         self.electrostatic_potentials = electrostatic_potentials
-
         self.task_validation = task_validation
-        self.allowed_task_types = (
-            [t.value for t in TaskType]
-            if allowed_task_types is None
-            else allowed_task_types
-        )
+        self.allowed_task_types = allowed_task_types #TODO How to incorporate into getitems?
 
         self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
         self.settings = EmmetBuildSettings.autoload(settings)
         self.query = query if query else {}
         self.timestamp = None
+        self._mpid_map = {}
+        self.defects_2d = defects_2d
         self.kwargs = kwargs
 
+        self._defect_query = 'transformations.history.0.defect'
+
+        self._required_defect_properties = [
+            self._defect_query,
+            'output.energy',
+            'output.structure',
+            'input',
+            'transformations',
+            'task_id',
+            'nsites'
+        ]
+
+        self._required_bulk_properties = [
+            'output.energy',
+            'output.structure',
+            'output.vbm',
+            'output.cbm',
+            'input',
+            'task_id',
+        ]
+
+        self._optional_defect_properties = []
+        self._optional_bulk_properties = []
+
+        # TODO This seems pretty inelegant
+        if defects_2d:
+            self._required_defect_properties.append('v_hartree')
+            self._required_bulk_properties.append('v_hartree')
+        else:
+            self._required_defect_properties.append('output.v_hartree_planar')
+            self._required_defect_properties.append('output.v_hartree_grid')
+            self._optional_defect_properties.append('output.v_hartree_sites')
+            self._required_bulk_properties.append('output.v_hartree_planar')
+            self._required_bulk_properties.append('output.v_hartree_grid')
+            self._optional_bulk_properties.append('output.v_hartree_sites')
+
         sources = [tasks, dielectric, electronic_structure, materials, electrostatic_potentials]
         if self.task_validation:
             sources.append(self.task_validation)
@@ -107,51 +154,38 @@ def __init__(
     @property
     def defect_query(self) -> str:
         """
-        The standard query for defect tasks. Update this if
-        schema changes in the future.
-
-        For example, if top level key exists 'defect' can be returned.
-        Alternatively, if an initial defect transformation was performed, then
-        you can check via 'transformations.history.0.defect'
+        The standard query for defect tasks.
         """
-        return 'transformations.history.0.defect'
+        return self._defect_query
 
+    #TODO Hartree pot should be required but only for charged defects
     @property
-    def identifying_defect_properties(self):
-        return [
-            'charge'
-        ]
+    def required_defect_properties(self) -> List:
+        """
+        Properties essential to processing a defect task.
+        """
+        return self._required_defect_properties
 
     @property
-    def required_defect_properties(self) -> List:
-        return [
-            self.defect_query,
-            'output.energy',
-            'output.structure',
-            'input',
-            'transformations',
-            'task_id',
-            'nsites'
-        ]
+    def required_bulk_properties(self) -> List:
+        """
+        Properties essential to processing a bulk task.
+        """
+        return self._required_bulk_properties
 
     @property
     def optional_defect_properties(self) -> List:
-        return [
-            'last_updated',
-            'created_on',
-            'tags'
-        ]
+        """
+        Properties that are optional for processing a defect task.
+        """
+        return self._optional_defect_properties
 
     @property
-    def required_bulk_properties(self) -> List:
-        return [
-            'output.energy',
-            'output.structure',
-            'output.vbm',
-            'output.cbm',
-            'input',
-            'transformations',
-        ]
+    def optional_bulk_properties(self) -> List:
+        """
+        Properties that are optional for bulk tasks.
+        """
+        return self._optional_bulk_properties
 
     def ensure_indexes(self):
         """
@@ -169,9 +203,8 @@ def ensure_indexes(self):
         self.materials.ensure_index("last_updated")
         self.materials.ensure_index("task_ids")
 
-        # Search index for materials
+        # Search index for defects
         self.defects.ensure_index("material_id")
-        self.defects.ensure_index("defect_id")
         self.defects.ensure_index("last_updated")
         self.defects.ensure_index("task_ids")
 
@@ -179,6 +212,10 @@ def ensure_indexes(self):
             self.task_validation.ensure_index("task_id")
             self.task_validation.ensure_index("valid")
 
+    # TODO this is tricky to implement.
+    # Prechunking normally chunks all of the tasks, and then runs get_items on each
+    # chunk. This is a problem when we need to do defect matching, where the first chiunk
+    # may not contain the defect that we are trying to match. 
     def prechunk(self, number_splits: int) -> Iterator[Dict]:
         raise NotImplementedError
 
@@ -193,11 +230,21 @@ def get_items(self) -> Iterator[List[Dict]]:
             1. Get all tasks with standard "defect" query tag
             2. Filter all tasks by skipping tasks which are already in the Defect Store
             3. Get all tasks that could be used as bulk
-
+            4. Filter all bulks which do not have corresponding Dielectric and 
+               ElectronicStructure data (if a band gap exists for that task).
+            5. Group defect tasks by defect matching 
+            6. Given defect object in a group, bundle them with bulk tasks
+               identified with structure matching
+            7. Yield the item bundles
 
         Returns:
-            generator or list relevant tasks and materials to process into materials documents
-        """
+            Iterator of (defect documents, task bundles)
+
+                The defect document is an existing defect doc to be updated with new data, or None
+            
+                task bundles bundle are all the tasks that correspond to the same defect and all possible
+                bulk tasks that could be matched to them.
+d        """
 
         self.logger.info("Defect builder started")
         self.logger.info(
@@ -210,37 +257,59 @@ def get_items(self) -> Iterator[List[Dict]]:
         # Save timestamp to mark buildtime for material documents
         self.timestamp = datetime.utcnow()
 
-        # Get all tasks
         self.logger.info("Finding tasks to process")
-        temp_query = dict(self.query)
-        temp_query["state"] = "successful"
 
-        all_tasks = {
+        # Get defect tasks
+        temp_query = self.query.copy()
+        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})
+        temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"})
+        defect_tasks = {
             doc[self.tasks.key]
             for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
         }
 
-        temp_query = {
-            "$and": [
-                temp_query, 
-                {self.defect_query: {'$exists': True}}, 
-                {d: {'$exists': True} for d in self.required_defect_properties}
-                ]
-            }
-        defect_tasks = {
+        # Get bulk tasks
+        temp_query = {d: {'$exists': True} for d in self.required_bulk_properties}
+        temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"})
+        bulk_tasks = {
             doc[self.tasks.key]
             for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
         }
 
+        # TODO Not the same validation behavior as material builders?
+        # If validation store exists, find tasks that are invalid and remove them
+        if self.task_validation:
+            validated = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {}, [self.task_validation.key]        
+                )
+            }
+            
+            defect_tasks = defect_tasks.intersection(validated)
+            bulk_tasks = bulk_tasks.intersection(validated)
+
+            invalid_ids = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {"is_valid": False}, [self.task_validation.key]
+                )
+            }
+            self.logger.info("Removing {} invalid tasks".format(len(invalid_ids)))
+            defect_tasks = defect_tasks - invalid_ids
+            bulk_tasks = bulk_tasks - invalid_ids
+
         processed_defect_tasks = {
             t_id
             for d in self.defects.query({}, ["task_ids"])
             for t_id in d.get("task_ids", [])
         }
 
+        all_tasks = defect_tasks | bulk_tasks
+
         self.logger.debug("All tasks: {}".format(len(all_tasks)))
-        self.logger.debug("Bulk tasks before filter: {}".format(len(all_tasks-defect_tasks)))
-        bulk_tasks = set(filter(self.__preprocess_bulk, all_tasks - defect_tasks))
+        self.logger.debug("Bulk tasks before filter: {}".format(len(bulk_tasks)))
+        bulk_tasks = set(filter(self.__preprocess_bulk, bulk_tasks))
         self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))
         self.logger.debug("All defect tasks: {}".format(len(defect_tasks)))
         unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
@@ -258,25 +327,12 @@ def get_items(self) -> Iterator[List[Dict]]:
         # Set total for builder bars to have a total
         self.total = len(unprocessed_defect_tasks)
 
-        if self.task_validation:
-            invalid_ids = {
-                doc[self.tasks.key]
-                for doc in self.task_validation.query(
-                    {"is_valid": False}, [self.task_validation.key]
-                )
-            }
-            for t in bulk_tasks.union(unprocessed_defect_tasks):
-                for doc in self.tasks.query({self.tasks.key: t}):
-                    if t in invalid_ids:
-                        doc["is_valid"] = False
-                    else:
-                        doc["is_valid"] = True
-
         # yield list of defects that are of the same type, matched to an appropriate bulk calc
         self.logger.info(f"Starting defect matching.")
 
         for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):
-            yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)
+            #yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)
+            yield defect, self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
 
     def process_item(self, items):
         """
@@ -289,15 +345,18 @@ def process_item(self, items):
 
         returns: the defect document as a dictionary
         """
-        defect_doc, item_bundle = items
+        defect, task_ids = items
+        defect_doc, item_bundle = self.__get_defect_doc(defect), self.__get_item_bundle(task_ids) 
         self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
         if item_bundle:
-            material_id = self._get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..
-            print("THE MATRIAL ID I FOUND IS ", material_id)
+            material_id = self._mpid_map[item_bundle[0][1]['task_id']]
+            tags = item_bundle[0][0].get('tags', [])
+            # TODO: This is a hack to get the 2d doc
+            DEFECT_DOC = DefectDoc2d if '2d' in tags else DefectDoc
             if defect_doc:
                 defect_doc.update_all(item_bundle, query=self.defect_query)
             else:
-                defect_doc = DefectDoc.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)
+                defect_doc = DEFECT_DOC.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)
             return defect_doc.dict()
 
     def update_targets(self, items):
@@ -336,6 +395,8 @@ def __filter_and_group_tasks(self, tasks):
             [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect
         """
 
+        # TODO self.defect_query does not work here because mongo does not allow projection 
+        # that retrieves a list index. So, we have to retrieve the whole transformations
         props = [
             #self.defect_query,
             'transformations',
@@ -378,7 +439,7 @@ def are_equal(x, y):
             if pdc.are_equal(x['defect'], y['defect']):
                 return True
 
-            # TODO This is needed for ghost vacancy unfortunately, since  sm.fit can't distinguish ghosts
+            # TODO This is needed for ghost vacancy unfortunately, since sm.fit can't distinguish ghosts
             if x['defect'].defect_composition == y['defect'].defect_composition and \
                     x['defect'].charge == y['defect'].charge and \
                     sm.fit(x['structure'], y['structure']):
@@ -429,20 +490,19 @@ def __get_defect_doc(self, defect):
                 return doc
         return None
 
-    def __get_dielectric(self, task_id):
+    # TODO should move to returning dielectric doc or continue returning the total diel tensor?
+    def __get_dielectric(self, key):
         """
         Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store
         and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would
         be the case for metallic systems, return None.
         """
-        t = next(self.tasks.query(criteria={'task_id': task_id}, properties=['output.structure']))
-        struc = Structure.from_dict(t.get('output').get('structure'))
-        self.logger.debug("Finding dielectric for task_id {} for MPID {}".format(task_id, self._get_mpid(struc)))
-        for diel in self.dielectric.query(criteria={self.dielectric.key: self._get_mpid(struc)}, properties=['dielectric.total']):
-            return diel['dielectric']['total']
+        for diel in self.dielectric.query(criteria={"material_id": key}, properties=['total']):
+            return diel['total']
         return None
 
-    def __get_item_bundle(self, bulk_tasks, defect_task_group):
+    #TODO retrieving the electrostatic potential is by far the most expesive part of the builder. Any way to reduce?
+    def __get_item_bundle(self, task_ids):
         """
         Gets a group of items that can be processed together into a defect document.
 
@@ -454,15 +514,23 @@ def __get_item_bundle(self, bulk_tasks, defect_task_group):
         """
         return [
             (
-                next(synchronous_query(self.tasks, self.electrostatic_potentials, query={'task_id': defect_tasks_id}, properties=None)),
-                next(synchronous_query(self.tasks, self.electrostatic_potentials, query={'task_id': bulk_tasks_id}, properties=None)),
+                next(
+                    synchronous_query(
+                        self.tasks, 
+                        self.electrostatic_potentials, 
+                        query={'task_id': defect_tasks_id}, properties=None) #self.required_defect_properties) #TODO
+                        ),
+                next(
+                    synchronous_query(
+                        self.tasks, 
+                        self.electrostatic_potentials, 
+                        query={'task_id': bulk_tasks_id}, properties=self.required_bulk_properties)
+                        ),
                 self.__get_dielectric(bulk_tasks_id),
             )
-            for defect_tasks_id, bulk_tasks_id
-            in self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
+            for defect_tasks_id, bulk_tasks_id in task_ids
         ]
 
-    # TODO NEED TO GET FORM EN FOR SORTING FROM MATDOC
     def _get_mpid(self, structure):
         """
         Given a structure, determine if an equivalent structure exists, with a material_id,
@@ -473,14 +541,14 @@ def _get_mpid(self, structure):
 
         returns: material_id, if one exists, else None
         """
-        sga = SpacegroupAnalyzer(structure)
+        sga = SpacegroupAnalyzer(structure, symprec=SETTINGS.SYMPREC, angle_tolerance=SETTINGS.ANGLE_TOL)
         mats = self.materials.query(
             criteria={
                 'chemsys': structure.composition.chemical_system,
-                'symmetry.symbol': sga.get_space_group_symbol()
             }, properties=['structure', 'material_id']
         )
-        sm = StructureMatcher()
+        # TODO coudl more than one material match true?
+        sm = StructureMatcher(ltol=SETTINGS.LTOL, stol=SETTINGS.STOL, angle_tol=SETTINGS.ANGLE_TOL)
         for m in mats:
             if sm.fit(structure, Structure.from_dict(m['structure'])):
                 return m['material_id']
@@ -501,6 +569,7 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids):
         self.logger.debug(f"Bulk tasks: {bulk_ids}")
         self.logger.debug(f"Defect tasks: {defect_ids}")
 
+        # TODO mongo projection on array doesn't work (see above)
         props = [
             'task_id',
             'input',
@@ -533,24 +602,19 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids):
             comparator=ElementComparator(),
         )
 
-        # TODO I think the secondary (nsites) comparison might have some edge case issues
         def _compare(b, d):
             if run_type(b.get('input').get('dft')).value.split('+U')[0] == \
                 run_type(d.get('input').get('dft')).value.split('+U')[0] and \
-                    sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):
-                if abs(b['nsites'] - d['nsites']) <= 1:
-                    return True
+                   sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):
+                       return True
             return False
 
-        # TODO This loop will terminate the match when the first bulk match for a defect is found. This should 
-        # be fine if we can ensure that the commenserate bulks are all the same given they both compare True.
-        # Need to double check that they really are the same.
-        pairs = []
-        for defect in defects:
-            for bulk in bulks:
-                if _compare(bulk, defect):
-                    pairs.append((defect['task_id'], bulk['task_id']))
-                    break
+        pairs = [
+            (defect['task_id'], bulk['task_id'])
+            for bulk in bulks
+            for defect in defects
+            if _compare(bulk, defect)
+        ]
 
         self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
         return pairs
@@ -562,26 +626,30 @@ def __preprocess_bulk(self, task):
 
             (1) Correspond to an existing material_id in the materials store
             (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store
+            (3) If bulk is not a metal, electronic structure document must exist in the store
 
         """
+        self.logger.debug("Preprocessing bulk task {}".format(task))
         t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure', 'mpid']))
 
-        # TODO: This is for my personal use to get around the 2D materials problem. Should not be made official
-
-        if 'ML' in t['mpid']:
-            self.logger.debug(f"Found monolayer for...")
-            mpid = t['mpid'].split('-ML')[0]
-        else:
-            struc = Structure.from_dict(t.get('output').get('structure'))
-            mpid = self._get_mpid(struc)
-            if not mpid:
-                return False
+        struc = Structure.from_dict(t.get('output').get('structure'))
+        mpid = self._get_mpid(struc)
+        if not mpid:
+            self.logger.debug(f"No material id found for bulk task {task}")
+            return False
+        self._mpid_map[task] = mpid
         self.logger.debug(f"Material ID: {mpid}")
 
-        elec = next(self.electronic_structure.query(properties=None, criteria={"material_id": mpid}))
-        dos = MontyDecoder().process_decoded({k: v for k, v in elec.items()})
-        if dos.get_gap():
-            diel = list(self.dielectric.query(criteria={self.dielectric.key: mpid}))
+        elec = self.electronic_structure.query_one(
+            properties=['band_gap'], criteria={self.electronic_structure.key: mpid}
+            )
+        if not elec:
+            self.logger.debug(f"Electronic structure data not found for {mpid}")
+            return False
+
+        # TODO right now pulling dos from electronic structure, should just pull summary document
+        if elec['band_gap'] > 0:
+            diel = self.__get_dielectric(mpid)
             if not diel:
                 self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "
                                  f"dielectric properties, but none found in dielectric store")
@@ -593,42 +661,20 @@ def __preprocess_bulk(self, task):
     def __get_pristine_supercell(task):
         """
         Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.
+            - If defect transform exists, the following transform's input will be returned
+            - If no follow up transform exists, the calculation input will be returned
+
         If defect cannot be found in task, return the input structure.
         """
-        if task.get('defect'):
-            return load_class(
-                task['defect']['@module'], task['defect']['@class']
-            ).from_dict(task['defect']).bulk_structure
-        elif task.get('transformations'):
-            return Structure.from_dict(task['transformations']['history'][0]['input_structure'])
+        if task.get('transformations'):
+            for i, transformation in enumerate(task['transformations']['history']):
+                    if transformation['@class'] == 'DefectTransformation':
+                        tmp = Structure.from_dict(transformation['input_structure'])
+                        tmp.make_supercell(transformation['scaling_matrix'])
+                        return tmp
         return Structure.from_dict(task['input']['structure'])
 
 
-# TODO This needs to be unified into one builder somehow
-class DefectBuilder2d(DefectBuilder):
-
-    def process_item(self, items):
-        """
-        Process a group of defect tasks that correspond to the same defect into a single defect
-        document. If the DefectDoc already exists, then update it and return it. If it does not,
-        create a new DefectDoc
-
-        Args:
-            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]
-
-        returns: the defect document as a dictionary
-        """
-        defect_doc, item_bundle = items
-        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
-        if item_bundle:
-            material_id = self._get_mpid(Structure.from_dict(item_bundle[0][1]['output']['structure']))  # TODO more mpid messes..
-            if defect_doc:
-                defect_doc.update_all(item_bundle, query=self.defect_query)
-            else:
-                defect_doc = DefectDoc2d.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)
-            return defect_doc.dict()
-
-
 class DefectThermoBuilder(Builder):
 
     """
@@ -710,6 +756,8 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         all_docs = [doc for doc in self.defects.query(self.query)]
 
+        self.logger.debug(f"Found {len(all_docs)} defect docs to process")
+
         def filterfunc(x):
             # material for defect x exists
             if not list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)):
@@ -733,7 +781,11 @@ def filterfunc(x):
         ):
             group = [g for g in group]
             try:
-                yield (group, self.__get_materials(key), self.__get_electronic_structure(group))
+                yield (
+                    group, 
+                    self.__get_materials(key), 
+                    self.__get_electronic_structure(group[0]['material_id']),
+                    )
             except LookupError as exception:
                 raise exception
 
@@ -765,8 +817,20 @@ def update_targets(self, items):
         else:
             self.logger.info("No items to update")
 
-    def __get_electronic_structure(self, group):
-        return next(self.electronic_structures.query(criteria={'material_id': group[0]['material_id']}, properties=None))
+    def __get_electronic_structure(self, material_id):
+        """
+        Gets the electronic structure of the bulk material
+        """
+        self.logger.info(f"Getting electronic structure for {material_id}")
+
+        # TODO This is updated to return the whole query because a.t.m. the
+        # DOS part of the electronic builder isn't working, so I'm using
+        # this to pull direct from the store of dos objects with no processing.
+        dosdoc = self.electronic_structures.query(
+            criteria={self.electronic_structures.key: material_id},
+            properties=None,
+        )[0]
+        return dosdoc
 
     def __get_materials(self, key) -> List:
         """
diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
index 82ec58f099..7d6b95dabc 100644
--- a/emmet-builders/emmet/builders/cp2k/materials.py
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -5,7 +5,6 @@
 from maggma.builders import Builder
 from maggma.stores import Store
 
-from emmet.builders.utils import maximal_spanning_non_intersecting_subsets
 from emmet.builders.settings import EmmetBuildSettings
 from emmet.core.utils import group_structures, jsanitize
 from emmet.core.cp2k.material import MaterialsDoc
diff --git a/emmet-builders/emmet/builders/cp2k/task_validator.py b/emmet-builders/emmet/builders/cp2k/task_validator.py
index 061ace03b5..6ae2e4b684 100644
--- a/emmet-builders/emmet/builders/cp2k/task_validator.py
+++ b/emmet-builders/emmet/builders/cp2k/task_validator.py
@@ -4,8 +4,8 @@
 from maggma.core import Store
 
 from emmet.builders.settings import EmmetBuildSettings
-from emmet.core.vasp.task import TaskDocument
-from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc
+from emmet.core.cp2k.task import TaskDocument
+from emmet.core.cp2k.validation import DeprecationMessage, ValidationDoc
 
 
 class TaskValidator(MapBuilder):
@@ -34,15 +34,9 @@ def __init__(
             source=tasks,
             target=task_validation,
             projection=[
-                "orig_inputs",
-                "input.hubbards",
-                "output.structure",
-                "output.bandgap",
-                "calcs_reversed.output.ionic_steps.electronic_steps.e_fr_energy",
+                "input",
+                "output.forces",
                 "tags",
-                # Need these two for proper run_type determination
-                "calcs_reversed.input.parameters",
-                "calcs_reversed.input.incar",
             ],
             query=query,
             **kwargs,
@@ -58,11 +52,6 @@ def unary_function(self, item):
         task_doc = TaskDocument(**item)
         validation_doc = ValidationDoc.from_task_doc(
             task_doc=task_doc,
-            kpts_tolerance=self.settings.VASP_KPTS_TOLERANCE,
-            kspacing_tolerance=self.settings.VASP_KSPACING_TOLERANCE,
-            input_sets=self.settings.VASP_DEFAULT_INPUT_SETS,
-            LDAU_fields=self.settings.VASP_CHECKED_LDAU_FIELDS,
-            max_allowed_scf_gradient=self.settings.VASP_MAX_SCF_GRADIENT,
         )
 
         bad_tags = list(set(task_doc.tags).intersection(self.settings.DEPRECATED_TAGS))
diff --git a/emmet-core/emmet/core/cp2k/calc_types/utils.py b/emmet-core/emmet/core/cp2k/calc_types/utils.py
index df9d9126d6..99a1112264 100644
--- a/emmet-core/emmet/core/cp2k/calc_types/utils.py
+++ b/emmet-core/emmet/core/cp2k/calc_types/utils.py
@@ -72,9 +72,6 @@ def task_type(
     """
 
     calc_type = []
-    ci = Cp2kInput.from_dict(inputs.get('cp2k_input'))
-    if ci.check("MOTION/CONSTRAINT"):
-        calc_type.append("Constrained")
     cp2k_run_type = inputs.get('cp2k_global').get('Run_type', '')
 
     if cp2k_run_type.upper() in ['ENERGY', 'ENERGY_FORCE', 'WAVEFUNCTION_OPTIMIZATION', 'WFN_OPT']:
diff --git a/emmet-core/emmet/core/cp2k/validation.py b/emmet-core/emmet/core/cp2k/validation.py
index 9055bdf39d..bfb0fe066e 100644
--- a/emmet-core/emmet/core/cp2k/validation.py
+++ b/emmet-core/emmet/core/cp2k/validation.py
@@ -2,37 +2,42 @@
 from typing import Dict, List, Union
 
 import numpy as np
-from pydantic import BaseModel, Field
+from pydantic import Field, PyObject
 
-from emmet.core import SETTINGS
+from emmet.core.settings import EmmetSettings
+from emmet.core.base import EmmetBaseModel
+from emmet.core.mpid import MPID
 from emmet.core.utils import DocEnum
 from emmet.core.vasp.task import TaskDocument
 
+SETTINGS = EmmetSettings()
 
-class DeprecationMessage(DocEnum):
 
+class DeprecationMessage(DocEnum):
     MANUAL = "M", "manual deprecation"
-    CUTOFF = "C002", "PW cutoff too low"
-    FORCES = "C003", "Forces too large"
+    FORCES = "C001", "Forces too large"
     CONVERGENCE = "E001", "Calculation did not converge"
-    MAX_SCF = "E002", "Max SCF gradient too large"
-    LDAU = "I001", "LDAU Parameters don't match the inputset"
-    BASIS = "", "Improper basis sets"
-
 
-class ValidationDoc(BaseModel):
+class ValidationDoc(EmmetBaseModel):
     """
-    Validation document for a CP2K calculation
+    Validation document for a VASP calculation
     """
 
-    task_id: str = Field(..., description="The task_id for this validation document")
+    task_id: MPID = Field(..., description="The task_id for this validation document")
     valid: bool = Field(False, description="Whether this task is valid or not")
     last_updated: datetime = Field(
         description="Last updated date for this document",
         default_factory=datetime.utcnow,
     )
     reasons: List[Union[DeprecationMessage, str]] = Field(
-        [], description="List of deprecation tags detailing why this task isn't valid"
+        None, description="List of deprecation tags detailing why this task isn't valid"
+    )
+    warnings: List[str] = Field(
+        [], description="List of potential warnings about this calculation"
+    )
+    data: Dict = Field(
+        description="Dictioary of data used to perform validation."
+        " Useful for post-mortem analysis"
     )
 
     class Config:
@@ -42,83 +47,29 @@ class Config:
     def from_task_doc(
         cls,
         task_doc: TaskDocument,
-        kpts_tolerance: float = SETTINGS.VASP_KPTS_TOLERANCE,
-        input_sets: Dict[str, type] = SETTINGS.VASP_DEFAULT_INPUT_SETS,
-        LDAU_fields: List[str] = SETTINGS.VASP_CHECKED_LDAU_FIELDS,
     ) -> "ValidationDoc":
         """
-        Determines if a calculation is valid based on expected input parameters from a pymatgen inputset
 
-        Args:
-            task_doc: the task document to process
-            input_sets (dict): a dictionary of task_types -> pymatgen input set for validation
-            kpts_tolerance (float): the tolerance to allow kpts to lag behind the input set settings
-            LDAU_fields (list(String)): LDAU fields to check for consistency
         """
 
-        structure = task_doc.output.structure
-        task_type = task_doc.task_type
-        inputs = task_doc.orig_inputs
+        calc_type = task_doc.calc_type
+
+        # Force check
+        forces = np.array(task_doc.output.forces) > 0.05
+        if np.any(forces):
+            reasons = [DeprecationMessage.FORCES] 
 
-        is_valid = True
         reasons = []
         data = {}
-
-        if task_type in input_sets:
-            valid_input_set = input_sets[task_type](structure)
-
-            # Checking K-Points
-            valid_num_kpts = valid_input_set.kpoints.num_kpts or np.prod(
-                valid_input_set.kpoints.kpts[0]
-            )
-            num_kpts = inputs.get("kpoints", {}).get("nkpoints", 0) or np.prod(
-                inputs.get("kpoints", {}).get("kpoints", [1, 1, 1])
-            )
-            data["kpts_ratio"] = num_kpts / valid_num_kpts
-            if data["kpts_ratio"] < kpts_tolerance:
-                is_valid = False
-                reasons.append(DeprecationMessage.kpoints)
-
-            # Checking ENCUT
-            encut = inputs.get("incar", {}).get("ENCUT")
-            valid_encut = valid_input_set.incar["ENCUT"]
-            data["encut_ratio"] = float(encut) / valid_encut  # type: ignore
-            if data["encut_ratio"] < 1:
-                is_valid = False
-                reasons.append(DeprecationMessage.encut)
-
-            # Checking U-values
-            if valid_input_set.incar.get("LDAU"):
-                # Assemble actual input LDAU params into dictionary to account for possibility
-                # of differing order of elements
-                structure_set_symbol_set = structure.symbol_set
-                inputs_ldau_fields = [structure_set_symbol_set] + [
-                    inputs.get("incar", {}).get(k, []) for k in LDAU_fields
-                ]
-                input_ldau_params = {d[0]: d[1:] for d in zip(*inputs_ldau_fields)}
-
-                # Assemble required input_set LDAU params into dictionary
-                input_set_symbol_set = valid_input_set.poscar.structure.symbol_set
-                input_set_ldau_fields = [input_set_symbol_set] + [
-                    valid_input_set.incar.get(k) for k in LDAU_fields
-                ]
-                input_set_ldau_params = {
-                    d[0]: d[1:] for d in zip(*input_set_ldau_fields)
-                }
-
-                if any(
-                    input_set_ldau_params[el] != input_params
-                    for el, input_params in input_ldau_params.items()
-                ):
-                    is_valid = False
-                    reasons.append(DeprecationMessage.ldau)
+        warnings = []
 
         doc = ValidationDoc(
             task_id=task_doc.task_id,
-            task_type=task_doc.task_type,
+            calc_type=calc_type,
             run_type=task_doc.run_type,
-            valid=is_valid,
+            valid=len(reasons) == 0,
             reasons=reasons,
-            **data
+            data=data,
+            warnings=warnings,
         )
         return doc
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index 562a370396..d7ca93432a 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -27,7 +27,7 @@
 from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
 from emmet.core.cp2k.material import MaterialsDoc
 from emmet.core.cp2k.calc_types.utils import run_type, task_type
-from emmet.builders.cp2k.utils import get_mpid, matcher
+from emmet.builders.cp2k.utils import matcher
 
 from pymatgen.analysis.defects.corrections import FreysoldtCorrection2d
 from pymatgen.io.vasp.outputs import VolumetricData
@@ -214,7 +214,7 @@ def _sort(x):
         return cls(**data)
 
     @classmethod
-    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='defect'):
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'):
         """
         Extract a defect entry from a single pair (defect and bulk) of tasks.
 
@@ -281,7 +281,11 @@ def get_init(x):
         final_defect_structure = Structure.from_dict(defect_task['output']['structure'])
         final_bulk_structure = Structure.from_dict(bulk_task['output']['structure'])
 
-        #mpid = get_mpid(init_bulk_structure)
+        dfi, site_matching_indices = matcher(
+            init_bulk_structure, init_defect_structure,
+            final_bulk_struc=final_bulk_structure, final_defect_struc=final_defect_structure
+        )
+        defect_frac_sc_coords = final_defect_structure[dfi].frac_coords
 
         parameters = {
             'defect_energy': defect_task['output']['energy'],
@@ -290,7 +294,7 @@ def get_init(x):
             'final_defect_structure': final_defect_structure,
             'vbm': bulk_task['output']['vbm'],
             'cbm': bulk_task['output']['cbm'],
-            #'material_id': mpid,
+            'defect_frac_sc_coords': defect_frac_sc_coords,
             'entry_id': defect_task.get('task_id')
         }
 
@@ -302,12 +306,6 @@ def get_init(x):
             bulk_site_averages = [float(x) for x in bulk_task['output']['v_hartree_sites']] if bulk_task['output'].get('v_hartree_sites') else None
             defect_site_averages = [float(x) for x in defect_task['output']['v_hartree_sites']] if defect_task['output'].get('v_hartree_sites') else None
 
-            dfi, site_matching_indices = matcher(
-                init_bulk_structure, init_defect_structure,
-                final_bulk_struc=final_bulk_structure, final_defect_struc=final_defect_structure
-            )
-            defect_frac_sc_coords = final_defect_structure[dfi].frac_coords
-
             parameters['axis_grid'] = axis_grid
             parameters['bulk_planar_averages'] = bulk_planar_averages
             parameters['defect_planar_averages'] = defect_planar_averages
@@ -329,7 +327,16 @@ class DefectDoc2d(DefectDoc):
     """
 
     @classmethod
-    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='defect'):
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'):
+        """
+        Get defect entry from defect and bulk tasks. 
+
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+            dielectric: dielectric tensor for the defect calculation
+            query: query string for defect entry
+        """
         parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
         if dielectric:
             eps_parallel = (dielectric[0][0] + dielectric[1][1]) / 2
@@ -338,15 +345,15 @@ def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, qu
 
         defect_entry = DefectEntry(
             cls.get_defect_from_task(query=query, task=defect_task),
-            uncorrected_energy=parameters['defect_energy'] - parameters['bulk_energy'],
+            uncorrected_energy=parameters.pop('defect_energy') - parameters.pop('bulk_energy'),
             parameters=parameters,
-            entry_id=parameters['entry_id']
+            entry_id=parameters.pop('entry_id')
         )
 
         DefectCompatibility().process_entry(defect_entry, perform_corrections=False)
         with ScratchDir('.'):
             fc = FreysoldtCorrection2d(
-                    defect_entry.parameters.get('dielectric', 22), 
+                    defect_entry.parameters.get('dielectric'), 
                     "LOCPOT.ref", "LOCPOT.def", encut=520, buffer=2
                 ) 
             lref = VolumetricData(
@@ -381,6 +388,13 @@ class Config:
         None, description="All task ids used in creating these phase diagrams"
     )
 
+    #TODO not by run type... in principle shouldn't be this way, but DOS is almost always GGA
+    bulk_dos: CompleteDos = Field(None, "Complete Density of States for the bulk Structure")
+
+    defect_phase_diagrams: Mapping[RunType, DefectPhaseDiagram] = Field(
+        None, description="Defect phase diagrams for each run type"
+    )
+
     brouwer_diagrams: Mapping[RunType, BrouwerDiagram] = Field(
         None, description="Brouwer diagrams"
     )

From a45100d6ff09fd47a154ac7e69aeac7e241c887a Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 3 May 2022 13:09:28 -0700
Subject: [PATCH 38/41] Updates

---
 emmet-builders/emmet/builders/cp2k/defects.py | 72 +++++++++++--------
 .../emmet/builders/cp2k/materials.py          |  8 +--
 .../emmet/builders/materials/dielectric.py    |  3 +-
 .../materials/electronic_structure.py         | 19 +++--
 emmet-builders/emmet/builders/settings.py     |  9 ++-
 .../emmet/builders/vasp/materials.py          |  4 +-
 emmet-core/_version.py                        |  2 +-
 emmet-core/emmet/core/cp2k/material.py        | 64 +++++++++++++----
 emmet-core/emmet/core/cp2k/validation.py      |  3 +-
 emmet-core/emmet/core/defect.py               | 40 +++++++----
 emmet-core/emmet/core/electronic_structure.py |  4 +-
 emmet-core/emmet/core/settings.py             |  1 +
 .../emmet/core/vasp/calc_types/enums.py       | 48 ++++++-------
 .../emmet/core/vasp/calc_types/run_types.yaml | 14 ++--
 emmet-core/emmet/core/vasp/validation.py      |  2 +
 15 files changed, 178 insertions(+), 115 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index f751c83553..d7a934da74 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 from itertools import chain, groupby, combinations
+from tkinter import W
 from typing import Dict, Iterator, List, Optional
 from copy import deepcopy
 from monty.json import MontyDecoder
@@ -14,14 +15,19 @@
 from atomate.utils.utils import load_class
 
 from emmet.core.settings import EmmetSettings
+from emmet.core.thermo import ThermoDoc
+from emmet.core.material import MaterialsDoc
+
 from emmet.core.utils import jsanitize, get_sg
 from emmet.core.defect import DefectDoc, DefectDoc2d, DefectThermoDoc
 from emmet.core.cp2k.calc_types import TaskType
 from emmet.core.cp2k.calc_types.utils import run_type
-from emmet.core.cp2k.material import MaterialsDoc
 from emmet.builders.settings import EmmetBuildSettings
 from emmet.builders.cp2k.utils import get_mpid, synchronous_query
 from maggma.stores.gridfs import GridFSStore
+
+from emmet.core.electronic_structure import ElectronicStructureDoc
+
 SETTINGS = EmmetSettings()
 
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
@@ -261,7 +267,7 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         # Get defect tasks
         temp_query = self.query.copy()
-        temp_query.update({d: {'$exists': True} for d in self.required_defect_properties})
+        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
         temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"})
         defect_tasks = {
             doc[self.tasks.key]
@@ -285,7 +291,7 @@ def get_items(self) -> Iterator[List[Dict]]:
                     {}, [self.task_validation.key]        
                 )
             }
-            
+
             defect_tasks = defect_tasks.intersection(validated)
             bulk_tasks = bulk_tasks.intersection(validated)
 
@@ -315,7 +321,7 @@ def get_items(self) -> Iterator[List[Dict]]:
         unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
 
         if not unprocessed_defect_tasks:
-            self.logger.info("No unprocessed to tasks. Exiting")
+            self.logger.info("No unprocessed defect tasks. Exiting")
             return
         elif not bulk_tasks:
             self.logger.info("No compatible bulk calculations. Exiting.")
@@ -526,7 +532,7 @@ def __get_item_bundle(self, task_ids):
                         self.electrostatic_potentials, 
                         query={'task_id': bulk_tasks_id}, properties=self.required_bulk_properties)
                         ),
-                self.__get_dielectric(bulk_tasks_id),
+                self.__get_dielectric(self._mpid_map[bulk_tasks_id]),
             )
             for defect_tasks_id, bulk_tasks_id in task_ids
         ]
@@ -597,7 +603,7 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids):
             angle_tol=SETTINGS.ANGLE_TOL,
             primitive_cell=False,
             scale=True,
-            attempt_supercell=True,
+            attempt_supercell=False,
             allow_subset=False,
             comparator=ElementComparator(),
         )
@@ -675,6 +681,9 @@ def __get_pristine_supercell(task):
         return Structure.from_dict(task['input']['structure'])
 
 
+#TODO Major problem with this builder. materials store is used to sync the diel, elec, and pd with a single material id
+#TODO This is a problem because the material id in vasp store is not synced to cp2k store
+#TODO Also the chempots needed to adjust entries must come from cp2k, but you need to give vasp to sync the others
 class DefectThermoBuilder(Builder):
 
     """
@@ -691,7 +700,9 @@ def __init__(
             defects: Store,
             defect_thermos: Store,
             materials: Store,
+            thermo: Store,
             electronic_structures: Store,
+            dos: Store,
             query: Optional[Dict] = None,
             **kwargs,
     ):
@@ -707,13 +718,15 @@ def __init__(
         self.defects = defects
         self.defect_thermos = defect_thermos
         self.materials = materials
+        self.thermo = thermo
+        self.dos = dos
         self.electronic_structures = electronic_structures
 
         self.query = query if query else {}
         self.timestamp = None
         self.kwargs = kwargs
 
-        super().__init__(sources=[defects, materials, electronic_structures], targets=[defect_thermos], **kwargs)
+        super().__init__(sources=[defects, materials, thermo, electronic_structures, dos], targets=[defect_thermos], **kwargs)
 
     def ensure_indexes(self):
         """
@@ -761,15 +774,13 @@ def get_items(self) -> Iterator[List[Dict]]:
         def filterfunc(x):
             # material for defect x exists
             if not list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)):
+                self.logger.debug(f"No material with MPID={x['material_id']} in the material store")
                 return False
 
-            # All chempots exist in material store
-            if not all(
-                bool(list(self.materials.query(criteria={'chemsys': str(el)}, properties=None)))
-                for el in
-                load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).defect_composition
-            ):
-                return False
+            for el in load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).defect_composition:
+                if not list(self.thermo.query(criteria={'chemsys': str(el)}, properties=None)):
+                    self.logger.debug(f"No entry for {el} in Thermo Store")
+                    return False
 
             return True
 
@@ -781,11 +792,10 @@ def filterfunc(x):
         ):
             group = [g for g in group]
             try:
-                yield (
-                    group, 
-                    self.__get_materials(key), 
-                    self.__get_electronic_structure(group[0]['material_id']),
-                    )
+                mat = self.__get_materials(key)
+                thermo = self.__get_thermos(mat.composition)
+                elec = self.__get_electronic_structure(group[0]['material_id'])
+                yield (group, mat, thermo, elec)
             except LookupError as exception:
                 raise exception
 
@@ -794,10 +804,10 @@ def process_item(self, docs):
         Process a group of defects belonging to the same material into a defect thermo doc
         """
         self.logger.info(f"Processing defects")
-        defects, materials, elec_struc = docs
+        defects, material, thermos, elec_struc = docs
         defects = [DefectDoc(**d) for d in defects]
-        materials = [MaterialsDoc(**m) for m in materials]
-        defect_thermo_doc = DefectThermoDoc.from_docs(defects, materials=materials, electronic_structure=elec_struc)
+        thermos = [ThermoDoc(**t) for t in thermos]
+        defect_thermo_doc = DefectThermoDoc.from_docs(defects, thermos=thermos, electronic_structure=elec_struc)
         return defect_thermo_doc.dict()
 
     def update_targets(self, items):
@@ -826,28 +836,28 @@ def __get_electronic_structure(self, material_id):
         # TODO This is updated to return the whole query because a.t.m. the
         # DOS part of the electronic builder isn't working, so I'm using
         # this to pull direct from the store of dos objects with no processing.
-        dosdoc = self.electronic_structures.query(
+        dosdoc = self.electronic_structures.query_one(
             criteria={self.electronic_structures.key: material_id},
             properties=None,
-        )[0]
-        return dosdoc
+        )
+        t_id = ElectronicStructureDoc(**dosdoc).dos.total['1'].task_id
+        dos = self.dos.query_one(criteria={'task_id': int(t_id)}, properties=None) #TODO MPID str/int issues
+        return dos
 
     def __get_materials(self, key) -> List:
         """
         Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the
         materials store.
         """
-        bulk = list(self.materials.query(criteria={'material_id': key}, properties=None))
+        bulk = self.materials.query_one(criteria={'material_id': key}, properties=None)
         if not bulk:
             raise LookupError(
                 f"The bulk material ({key}) for these defects cannot be found in the materials store"
             )
-        elements = bulk[0]['chemsys']
-
-        if isinstance(elements, str):
-            elements = elements.split("-")
+        return MaterialsDoc(**bulk)
 
-        return list(chain(self.materials.query(criteria={"chemsys": {"$in": elements}}, properties=None), bulk))
+    def __get_thermos(self, composition) -> List:
+        return list(self.thermo.query(criteria={"elements": {"$in": jsanitize(composition.elements)}}))
 
 
 def unpack(query, d):
diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
index 7d6b95dabc..1846bcc57d 100644
--- a/emmet-builders/emmet/builders/cp2k/materials.py
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -102,7 +102,7 @@ def get_items(self) -> Iterator[List[Dict]]:
         self.logger.info("Materials builder started")
         # TODO make a cp2k allowed type setting
         self.logger.info(
-            f"Allowed task types: {[task_type.value for task_type in self.settings.VASP_ALLOWED_VASP_TYPES]}"
+            f"Allowed task types: {[task_type.value for task_type in self.settings.CP2K_ALLOWED_TASK_TYPES]}"
         )
 
         self.logger.info("Setting indexes")
@@ -245,14 +245,10 @@ def filter_and_group_tasks(self, tasks: List[TaskDocument]) -> Iterator[List[Dic
         Groups tasks by structure matching
         """
 
-        """
-        Groups tasks by structure matching
-        """
-
         # TODO why did the way vasp builder did it not work here?
         filtered_tasks = []
         for task in tasks:
-            for allowed_type in self.settings.CP2K_ALLOWED_CP2K_TYPES:
+            for allowed_type in self.settings.CP2K_ALLOWED_TASK_TYPES:
                 if task.task_type is allowed_type:
                     filtered_tasks.append(task)
                     continue
diff --git a/emmet-builders/emmet/builders/materials/dielectric.py b/emmet-builders/emmet/builders/materials/dielectric.py
index 68d9a8aa86..262dbcf5ae 100644
--- a/emmet-builders/emmet/builders/materials/dielectric.py
+++ b/emmet-builders/emmet/builders/materials/dielectric.py
@@ -74,7 +74,6 @@ def get_items(self):
 
         for mat in mats:
             doc = self._get_processed_doc(mat)
-
             if doc is not None:
                 yield doc
             else:
@@ -151,7 +150,7 @@ def _get_processed_doc(self, mat):
                     "output.epsilon_ionic",
                     "output.bandgap",
                 ],
-                criteria={self.tasks.key: str(task_id)},
+                criteria={self.tasks.key: int(task_id)},
             )
 
             if task_query["output"]["bandgap"] > 0:
diff --git a/emmet-builders/emmet/builders/materials/electronic_structure.py b/emmet-builders/emmet/builders/materials/electronic_structure.py
index 95efd0167c..85b72d8f8c 100644
--- a/emmet-builders/emmet/builders/materials/electronic_structure.py
+++ b/emmet-builders/emmet/builders/materials/electronic_structure.py
@@ -17,7 +17,7 @@
 from emmet.core.settings import EmmetSettings
 from emmet.core.electronic_structure import ElectronicStructureDoc
 from emmet.core.utils import jsanitize
-
+import bson
 SETTINGS = EmmetSettings()
 
 
@@ -201,7 +201,6 @@ def process_item(self, mat):
             doc = ElectronicStructureDoc.from_structure(**d)
 
         else:
-
             try:
                 doc = ElectronicStructureDoc.from_bsdos(
                     material_id=mat[self.materials.key],
@@ -367,7 +366,6 @@ def _update_materials_doc(self, mat_id):
             if "NSCF Line" in mat["task_types"][task_id]:
 
                 bs_type = None
-
                 task_query = self.tasks.query_one(
                     properties=[
                         "calcs_reversed",
@@ -378,7 +376,7 @@ def _update_materials_doc(self, mat_id):
                         "input.parameters",
                         "output.structure",
                     ],
-                    criteria={"task_id": str(task_id)},
+                    criteria={"task_id": int(task_id)},
                 )
 
                 fs_id = str(task_query["calcs_reversed"][0]["bandstructure_fs_id"])
@@ -441,7 +439,7 @@ def _update_materials_doc(self, mat_id):
                         "input.parameters",
                         "output.structure",
                     ],
-                    criteria={"task_id": str(task_id)},
+                    criteria={"task_id": int(task_id)},
                 )
 
                 fs_id = str(task_query["calcs_reversed"][0]["dos_fs_id"])
@@ -493,7 +491,7 @@ def _update_materials_doc(self, mat_id):
                         "calcs_reversed",
                         "output.structure",
                     ],
-                    criteria={"task_id": str(task_id)},
+                    criteria={"task_id": int(task_id)},
                 )
 
                 structure = Structure.from_dict(task_query["output"]["structure"])
@@ -566,11 +564,11 @@ def _obtain_blessed_calculations(
                 ]
 
                 bs_obj = self.bandstructure_fs.query_one(
-                    criteria={"fs_id": sorted_bs_data[0]["fs_id"]}
+                    criteria={"_id": bson.ObjectId(sorted_bs_data[0]["fs_id"])}
                 )
 
                 materials_doc["bandstructure"][bs_type]["object"] = (
-                    bs_obj["data"] if bs_obj is not None else None
+                    bs_obj if bs_obj is not None else None
                 )
 
                 materials_doc["bandstructure"][bs_type][
@@ -593,12 +591,11 @@ def _obtain_blessed_calculations(
             materials_doc["dos"]["task_id"] = sorted_dos_data[0]["task_id"]
 
             materials_doc["dos"]["lmaxmix"] = sorted_dos_data[0]["lmaxmix"]
-
             dos_obj = self.dos_fs.query_one(
-                criteria={"fs_id": sorted_dos_data[0]["fs_id"]}
+                criteria={"_id": bson.ObjectId(sorted_dos_data[0]["fs_id"])}
             )
             materials_doc["dos"]["object"] = (
-                dos_obj["data"] if dos_obj is not None else None
+                dos_obj if dos_obj is not None else None
             )
 
             materials_doc["dos"]["output_structure"] = sorted_dos_data[0][
diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py
index fdefe009e0..a511945a48 100644
--- a/emmet-builders/emmet/builders/settings.py
+++ b/emmet-builders/emmet/builders/settings.py
@@ -30,12 +30,17 @@ class EmmetBuildSettings(EmmetSettings):
         [], description="Tags for calculations to deprecate"
     )
 
-    VASP_ALLOWED_VASP_TYPES: List[TaskType] = Field(
+    ALLOWED_CALC_CODES: List[str] = Field(
+        ['VASP', 'CP2K'],
+        description="Calculation codes to allow in the build pipeline"
+    )
+
+    VASP_ALLOWED_TASK_TYPES: List[TaskType] = Field(
         [t.value for t in TaskType],
         description="Allowed task_types to build materials from",
     )
 
-    CP2K_ALLOWED_CP2K_TYPES: List[Cp2kTaskType] = Field(
+    CP2K_ALLOWED_TASK_TYPES: List[Cp2kTaskType] = Field(
         [t.value for t in Cp2kTaskType],
         description="Allowed task_types to build materials from",
     )
diff --git a/emmet-builders/emmet/builders/vasp/materials.py b/emmet-builders/emmet/builders/vasp/materials.py
index 34d1f89a8d..28c1586cb0 100644
--- a/emmet-builders/emmet/builders/vasp/materials.py
+++ b/emmet-builders/emmet/builders/vasp/materials.py
@@ -126,7 +126,7 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         self.logger.info("Materials builder started")
         self.logger.info(
-            f"Allowed task types: {[task_type.value for task_type in self.settings.VASP_ALLOWED_VASP_TYPES]}"
+            f"Allowed task types: {[task_type.value for task_type in self.settings.VASP_ALLOWED_TASK_TYPES]}"
         )
 
         self.logger.info("Setting indexes")
@@ -286,7 +286,7 @@ def filter_and_group_tasks(
             for task in tasks
             if any(
                 allowed_type is task.task_type
-                for allowed_type in self.settings.VASP_ALLOWED_VASP_TYPES
+                for allowed_type in self.settings.VASP_ALLOWED_TASK_TYPES
             )
         ]
 
diff --git a/emmet-core/_version.py b/emmet-core/_version.py
index 129f1c9dcc..16fb744178 100644
--- a/emmet-core/_version.py
+++ b/emmet-core/_version.py
@@ -1 +1 @@
-__version__ = "0.21.21.dev42+g182e8208.d20220315"
\ No newline at end of file
+__version__ = "0.21.21.dev46+gf61a481e.d20220503"
\ No newline at end of file
diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py
index 8f85abeb8e..46f633e14c 100644
--- a/emmet-core/emmet/core/cp2k/material.py
+++ b/emmet-core/emmet/core/cp2k/material.py
@@ -13,6 +13,8 @@
 from emmet.core.cp2k.task import TaskDocument
 from emmet.builders.cp2k.utils import get_mpid
 
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+
 SETTINGS = EmmetSettings()
 
 class MaterialsDoc(CoreMaterialsDoc, StructureMetadata):
@@ -70,18 +72,8 @@ def from_tasks(
         statics = [task for task in task_group if task.task_type == TaskType.Static]  # type: ignore
 
         # Material ID
-        possible_mat_ids = [task.task_id for task in structure_optimizations + statics]  # TODO remove + statics ?
-        possible_mat_ids = sorted(possible_mat_ids, key=ID_to_int)
-
-        matched_id = get_mpid([task.output.structure for task in structure_optimizations + statics][0])
-
-        if matched_id:
-            possible_mat_ids.insert(0, matched_id)
-
-        if len(possible_mat_ids) == 0:
-            raise Exception(f"Could not find a material ID for {task_ids}")
-        else:
-            material_id = possible_mat_ids[0]
+        possible_mat_ids = [task.task_id for task in structure_optimizations + statics]
+        material_id = min(possible_mat_ids)
 
         def _structure_eval(task: TaskDocument):
             """
@@ -177,6 +169,54 @@ def _structure_eval(task: TaskDocument):
             entries=entries,
         )
 
+    @classmethod
+    def construct_deprecated_material(
+        cls, task_group: List[TaskDocument],
+    ) -> "MaterialsDoc":
+        """
+        Converts a group of tasks into a deprecated material
+
+        Args:
+            task_group: List of task document
+        """
+        if len(task_group) == 0:
+            raise Exception("Must have more than one task in the group.")
+
+        # Metadata
+        last_updated = max(task.last_updated for task in task_group)
+        created_at = min(task.completed_at for task in task_group)
+        task_ids = list({task.task_id for task in task_group})
+
+        deprecated_tasks = {task.task_id for task in task_group}
+        run_types = {task.task_id: task.run_type for task in task_group}
+        task_types = {task.task_id: task.task_type for task in task_group}
+        calc_types = {task.task_id: task.calc_type for task in task_group}
+
+        # Material ID
+        material_id = min([task.task_id for task in task_group])
+
+        # Choose any random structure for metadata
+        structure = SpacegroupAnalyzer(
+            task_group[0].output.structure, symprec=0.1
+        ).get_conventional_standard_structure()
+
+        # Deprecated
+        deprecated = True
+
+        return cls.from_structure(
+            structure=structure,
+            material_id=material_id,
+            last_updated=last_updated,
+            created_at=created_at,
+            task_ids=task_ids,
+            calc_types=calc_types,
+            run_types=run_types,
+            task_types=task_types,
+            deprecated=deprecated,
+            deprecated_tasks=deprecated_tasks,
+        )
+
+
 
 def ID_to_int(s_id: str) -> Tuple[str, int]:
     """
diff --git a/emmet-core/emmet/core/cp2k/validation.py b/emmet-core/emmet/core/cp2k/validation.py
index bfb0fe066e..f727158cee 100644
--- a/emmet-core/emmet/core/cp2k/validation.py
+++ b/emmet-core/emmet/core/cp2k/validation.py
@@ -8,7 +8,7 @@
 from emmet.core.base import EmmetBaseModel
 from emmet.core.mpid import MPID
 from emmet.core.utils import DocEnum
-from emmet.core.vasp.task import TaskDocument
+from emmet.core.cp2k.task import TaskDocument
 
 SETTINGS = EmmetSettings()
 
@@ -22,6 +22,7 @@ class ValidationDoc(EmmetBaseModel):
     """
     Validation document for a VASP calculation
     """
+    calc_code = "cp2k"
 
     task_id: MPID = Field(..., description="The task_id for this validation document")
     valid: bool = Field(False, description="Whether this task is valid or not")
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index d7ca93432a..30aec7e24a 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -23,6 +23,7 @@
 
 from emmet.core.structure import StructureMetadata
 from emmet.core.mpid import MPID
+from emmet.core.thermo import ThermoDoc
 from emmet.core.cp2k.task import TaskDocument
 from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
 from emmet.core.cp2k.material import MaterialsDoc
@@ -389,16 +390,17 @@ class Config:
     )
 
     #TODO not by run type... in principle shouldn't be this way, but DOS is almost always GGA
-    bulk_dos: CompleteDos = Field(None, "Complete Density of States for the bulk Structure")
+    bulk_dos: CompleteDos = Field(None, description="Complete Density of States for the bulk Structure")
 
     defect_phase_diagrams: Mapping[RunType, DefectPhaseDiagram] = Field(
         None, description="Defect phase diagrams for each run type"
     )
 
-    brouwer_diagrams: Mapping[RunType, BrouwerDiagram] = Field(
-        None, description="Brouwer diagrams"
-    )
+    #brouwer_diagrams: Mapping[RunType, BrouwerDiagram] = Field(
+    #    None, description="Brouwer diagrams"
+    #)
 
+    '''
     # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
     @validator("brouwer_diagrams", pre=True)
     def decode(cls, brouwer_diagrams):
@@ -408,20 +410,27 @@ def decode(cls, brouwer_diagrams):
                     {k: v for k, v in brouwer_diagrams[e].items()}
                 )
         return brouwer_diagrams
-
+    '''
     @classmethod
-    def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], electronic_structure: CompleteDos) -> "DefectThermoDoc":
+    def from_docs(cls, defects: List[DefectDoc], thermos: List[ThermoDoc], electronic_structure: CompleteDos) -> "DefectThermoDoc":
 
         DEFAULT_RT = RunType('GGA')  # TODO NEED A procedure for getting all GGA or GGA+U keys
         DEFAULT_RT_U = RunType('GGA+U')
 
         mpid = defects[0].material_id
 
+        #chempots = {
+        #    m.structure.composition.elements[0]:
+        #        {rt: m.entries[rt].energy_per_atom for rt in m.entries}
+        #    for m in materials if m.structure.composition.is_element
+        #}
+
         chempots = {
-            m.structure.composition.elements[0]:
-                {rt: m.entries[rt].energy_per_atom for rt in m.entries}
-            for m in materials if m.structure.composition.is_element
+           td.composition.elements[0]: td.energy_per_atom 
+           for td in thermos if td.composition.is_element
         }
+        print("!!!!!!!")
+        print(chempots)
 
         defect_entries = {}
         defect_phase_diagram = {}
@@ -433,6 +442,7 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
         dos = CompleteDos.from_dict(electronic_structure)
         bg = dos.get_gap()
 
+        """
         for m in materials:
             for rt, ent in m.entries.items():
                 __found_chempots__ = True
@@ -458,7 +468,7 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
                 ent.structure.remove_spin()
                 ent.structure.remove_oxidation_states()
                 MaterialsProject2020Compatibility().process_entry(ent)
-
+        """
         for d in defects:
             for rt, ent in d.entries.items():
                 # Chempot shift
@@ -469,8 +479,7 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
                     if Element(el) not in chempots:
                         __found_chempots__ = False
                         break
-                    _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
-                    ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt * chempots[Element(el)][_rt]
+                    ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt * chempots[Element(el)]
 
                 if not __found_chempots__:
                     continue
@@ -489,9 +498,10 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
             defect_phase_diagram[run_type] = DefectPhaseDiagram(
                 entries=defect_entries[run_type],
                 vbm=vbms[run_type],
-                band_gap=band_gaps[run_type],
+                band_gap=bg, #band_gaps[run_type], # TODO Need to do rt dependent band gap
                 filter_compatible=False
             )
+            """
             brouwer_diagrams[run_type] = BrouwerDiagram(
                 defect_phase_diagram=defect_phase_diagram[run_type],
                 bulk_dos=dos,
@@ -502,11 +512,13 @@ def from_docs(cls, defects: List[DefectDoc], materials: List[MaterialsDoc], elec
                     for m in materials
                 ]
             )
+            """
 
         data = {
             'material_id': mpid,
             'task_ids': task_ids,
-            'brouwer_diagrams': brouwer_diagrams,
+            'bulk_dos': dos,
+            'defect_phase_diagrams': defect_phase_diagram,
         }
 
         return cls(**{k: v for k, v in data.items()})
diff --git a/emmet-core/emmet/core/electronic_structure.py b/emmet-core/emmet/core/electronic_structure.py
index 9352d11039..57d1608584 100644
--- a/emmet-core/emmet/core/electronic_structure.py
+++ b/emmet-core/emmet/core/electronic_structure.py
@@ -258,8 +258,8 @@ def from_bsdos(  # type: ignore[override]
 
             try:
                 spin_polarization = dos_obj.spin_polarization
-                if isnan(spin_polarization):
-                    spin_polarization = None
+                #if isnan(spin_polarization):
+                #    spin_polarization = None
             except KeyError:
                 spin_polarization = None
 
diff --git a/emmet-core/emmet/core/settings.py b/emmet-core/emmet/core/settings.py
index d960c173b8..ffae76649c 100644
--- a/emmet-core/emmet/core/settings.py
+++ b/emmet-core/emmet/core/settings.py
@@ -40,6 +40,7 @@ class EmmetSettings(BaseSettings):
 
     CP2K_DEFAULT_INPUT_SETS: Dict = Field(
         {
+            "Static DFT": "pymatgen.io.cp2k.sets.StaticSet",
             "GGA Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet",
             "GGA+U Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet",
         },
diff --git a/emmet-core/emmet/core/vasp/calc_types/enums.py b/emmet-core/emmet/core/vasp/calc_types/enums.py
index 583d48a907..2650efbc59 100644
--- a/emmet-core/emmet/core/vasp/calc_types/enums.py
+++ b/emmet-core/emmet/core/vasp/calc_types/enums.py
@@ -21,7 +21,7 @@ class RunType(ValueEnum):
     HF = "HF"
     HSE03 = "HSE03"
     HSE06 = "HSE06"
-    PB0 = "PB0"
+    PBE0 = "PBE0"
     M06L = "M06L"
     MBJL = "MBJL"
     MS0 = "MS0"
@@ -52,7 +52,7 @@ class RunType(ValueEnum):
     HF_U = "HF+U"
     HSE03_U = "HSE03+U"
     HSE06_U = "HSE06+U"
-    PB0_U = "PB0+U"
+    PBE0_U = "PBE0+U"
     M06L_U = "M06L+U"
     MBJL_U = "MBJL+U"
     MS0_U = "MS0+U"
@@ -236,17 +236,17 @@ class CalcType(ValueEnum):
     HSE06_Structure_Optimization = "HSE06 Structure Optimization"
     HSE06_Deformation = "HSE06 Deformation"
     HSE06_Unrecognized = "HSE06 Unrecognized"
-    PB0_NSCF_Line = "PB0 NSCF Line"
-    PB0_NSCF_Uniform = "PB0 NSCF Uniform"
-    PB0_Dielectric = "PB0 Dielectric"
-    PB0_DFPT = "PB0 DFPT"
-    PB0_DFPT_Dielectric = "PB0 DFPT Dielectric"
-    PB0_NMR_Nuclear_Shielding = "PB0 NMR Nuclear Shielding"
-    PB0_NMR_Electric_Field_Gradient = "PB0 NMR Electric Field Gradient"
-    PB0_Static = "PB0 Static"
-    PB0_Structure_Optimization = "PB0 Structure Optimization"
-    PB0_Deformation = "PB0 Deformation"
-    PB0_Unrecognized = "PB0 Unrecognized"
+    PBE0_NSCF_Line = "PBE0 NSCF Line"
+    PBE0_NSCF_Uniform = "PBE0 NSCF Uniform"
+    PBE0_Dielectric = "PBE0 Dielectric"
+    PBE0_DFPT = "PBE0 DFPT"
+    PBE0_DFPT_Dielectric = "PBE0 DFPT Dielectric"
+    PBE0_NMR_Nuclear_Shielding = "PBE0 NMR Nuclear Shielding"
+    PBE0_NMR_Electric_Field_Gradient = "PBE0 NMR Electric Field Gradient"
+    PBE0_Static = "PBE0 Static"
+    PBE0_Structure_Optimization = "PBE0 Structure Optimization"
+    PBE0_Deformation = "PBE0 Deformation"
+    PBE0_Unrecognized = "PBE0 Unrecognized"
     M06L_NSCF_Line = "M06L NSCF Line"
     M06L_NSCF_Uniform = "M06L NSCF Uniform"
     M06L_Dielectric = "M06L Dielectric"
@@ -577,17 +577,17 @@ class CalcType(ValueEnum):
     HSE06_U_Structure_Optimization = "HSE06+U Structure Optimization"
     HSE06_U_Deformation = "HSE06+U Deformation"
     HSE06_U_Unrecognized = "HSE06+U Unrecognized"
-    PB0_U_NSCF_Line = "PB0+U NSCF Line"
-    PB0_U_NSCF_Uniform = "PB0+U NSCF Uniform"
-    PB0_U_Dielectric = "PB0+U Dielectric"
-    PB0_U_DFPT = "PB0+U DFPT"
-    PB0_U_DFPT_Dielectric = "PB0+U DFPT Dielectric"
-    PB0_U_NMR_Nuclear_Shielding = "PB0+U NMR Nuclear Shielding"
-    PB0_U_NMR_Electric_Field_Gradient = "PB0+U NMR Electric Field Gradient"
-    PB0_U_Static = "PB0+U Static"
-    PB0_U_Structure_Optimization = "PB0+U Structure Optimization"
-    PB0_U_Deformation = "PB0+U Deformation"
-    PB0_U_Unrecognized = "PB0+U Unrecognized"
+    PBE0_U_NSCF_Line = "PBE0+U NSCF Line"
+    PBE0_U_NSCF_Uniform = "PBE0+U NSCF Uniform"
+    PBE0_U_Dielectric = "PBE0+U Dielectric"
+    PBE0_U_DFPT = "PBE0+U DFPT"
+    PBE0_U_DFPT_Dielectric = "PBE0+U DFPT Dielectric"
+    PBE0_U_NMR_Nuclear_Shielding = "PBE0+U NMR Nuclear Shielding"
+    PBE0_U_NMR_Electric_Field_Gradient = "PBE0+U NMR Electric Field Gradient"
+    PBE0_U_Static = "PBE0+U Static"
+    PBE0_U_Structure_Optimization = "PBE0+U Structure Optimization"
+    PBE0_U_Deformation = "PBE0+U Deformation"
+    PBE0_U_Unrecognized = "PBE0+U Unrecognized"
     M06L_U_NSCF_Line = "M06L+U NSCF Line"
     M06L_U_NSCF_Uniform = "M06L+U NSCF Uniform"
     M06L_U_Dielectric = "M06L+U Dielectric"
diff --git a/emmet-core/emmet/core/vasp/calc_types/run_types.yaml b/emmet-core/emmet/core/vasp/calc_types/run_types.yaml
index db92d3c655..95f74b9078 100644
--- a/emmet-core/emmet/core/vasp/calc_types/run_types.yaml
+++ b/emmet-core/emmet/core/vasp/calc_types/run_types.yaml
@@ -35,22 +35,22 @@ HF:
   HSE03:
     AEXX: 0.25
     AGGAC: 1.0
-    AGGAX: 1.0
-    ALDCAC: 1.0
+    AGGAX: 0.75
+    ALDAC: 1.0
     HFSCREEN: 0.3
     LHFCALC: true
   HSE06:
     AEXX: 0.25
     AGGAC: 1.0
-    AGGAX: 1.0
-    ALDCAC: 1.0
+    AGGAX: 0.75
+    ALDAC: 1.0
     HFSCREEN: 0.2
     LHFCALC: true
-  PB0:
+  PBE0:
     AEXX: 0.25
     AGGAC: 1.0
-    AGGAX: 1.0
-    ALDCAC: 1.0
+    AGGAX: 0.75
+    ALDAC: 1.0
     LHFCALC: true
 METAGGA:
   M06L:
diff --git a/emmet-core/emmet/core/vasp/validation.py b/emmet-core/emmet/core/vasp/validation.py
index 79120d4110..d13bb9f404 100644
--- a/emmet-core/emmet/core/vasp/validation.py
+++ b/emmet-core/emmet/core/vasp/validation.py
@@ -31,6 +31,8 @@ class ValidationDoc(EmmetBaseModel):
     Validation document for a VASP calculation
     """
 
+    calc_code = "vasp"
+
     task_id: MPID = Field(..., description="The task_id for this validation document")
     valid: bool = Field(False, description="Whether this task is valid or not")
     last_updated: datetime = Field(

From c12e5fc02e4f1d558ba799dbbf2ce824984857a0 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 3 May 2022 13:10:30 -0700
Subject: [PATCH 39/41] Possible additions

---
 emmet-builders/emmet/builders/cp2k/thermo.py  | 368 ++++++++++++++++++
 .../emmet/builders/materials/materials.py     | 336 ++++++++++++++++
 2 files changed, 704 insertions(+)
 create mode 100644 emmet-builders/emmet/builders/cp2k/thermo.py
 create mode 100644 emmet-builders/emmet/builders/materials/materials.py

diff --git a/emmet-builders/emmet/builders/cp2k/thermo.py b/emmet-builders/emmet/builders/cp2k/thermo.py
new file mode 100644
index 0000000000..9c8345a018
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/thermo.py
@@ -0,0 +1,368 @@
+import warnings
+from collections import defaultdict
+from itertools import chain
+from typing import Dict, Iterable, Iterator, List, Optional, Set
+from math import ceil
+
+from maggma.core import Builder, Store
+from maggma.utils import grouper
+from monty.json import MontyDecoder
+from pymatgen.analysis.phase_diagram import PhaseDiagramError
+from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+
+from emmet.builders.utils import chemsys_permutations
+from emmet.core.thermo import ThermoDoc, PhaseDiagramDoc
+from emmet.core.utils import jsanitize
+
+
+class ThermoBuilder(Builder):
+    def __init__(
+        self,
+        materials: Store,
+        thermo: Store,
+        phase_diagram: Optional[Store] = None,
+        oxidation_states: Optional[Store] = None,
+        query: Optional[Dict] = None,
+        compatibility=None,
+        num_phase_diagram_eles: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Calculates thermodynamic quantities for materials from phase
+        diagram constructions
+
+        Args:
+            materials (Store): Store of materials documents
+            thermo (Store): Store of thermodynamic data such as formation
+                energy and decomposition pathway
+            phase_diagram (Store): Store of phase diagram data for each unique chemical system
+            oxidation_states (Store): Store of oxidation state data to use in correction scheme application
+            query (dict): dictionary to limit materials to be analyzed
+            compatibility (PymatgenCompatability): Compatability module
+                to ensure energies are compatible
+            num_phase_diagram_eles (int): Maximum number of elements to use in phase diagram construction
+                for data within the separate phase_diagram collection
+        """
+
+        self.materials = materials
+        self.thermo = thermo
+        self.query = query if query else {}
+        self.compatibility = compatibility
+        self.oxidation_states = oxidation_states
+        self.phase_diagram = phase_diagram
+        self.num_phase_diagram_eles = num_phase_diagram_eles
+        self._completed_tasks: Set[str] = set()
+        self._entries_cache: Dict[str, List[ComputedStructureEntry]] = defaultdict(list)
+
+        sources = [materials]
+        if oxidation_states is not None:
+            sources.append(oxidation_states)
+
+        targets = [thermo]
+        if phase_diagram is not None:
+            targets.append(phase_diagram)
+
+        super().__init__(sources=sources, targets=targets, **kwargs)
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("chemsys")
+        self.materials.ensure_index("last_updated")
+
+        # Search index for thermo
+        self.thermo.ensure_index("material_id")
+        self.thermo.ensure_index("last_updated")
+
+        # Search index for phase_diagram
+        if self.phase_diagram:
+            self.phase_diagram.ensure_index("chemsys")
+
+    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
+        updated_chemsys = self.get_updated_chemsys()
+        new_chemsys = self.get_new_chemsys()
+
+        affected_chemsys = self.get_affected_chemsys(updated_chemsys | new_chemsys)
+
+        # Remove overlapping chemical systems
+        to_process_chemsys = set()
+        for chemsys in updated_chemsys | new_chemsys | affected_chemsys:
+            if chemsys not in to_process_chemsys:
+                to_process_chemsys |= chemsys_permutations(chemsys)
+
+        N = ceil(len(to_process_chemsys) / number_splits)
+
+        for chemsys_chunk in grouper(to_process_chemsys, N):
+
+            yield {"query": {"chemsys": {"$in": list(chemsys_chunk)}}}
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets whole chemical systems of entries to process
+        """
+
+        self.logger.info("Thermo Builder Started")
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        updated_chemsys = self.get_updated_chemsys()
+        new_chemsys = self.get_new_chemsys()
+
+        affected_chemsys = self.get_affected_chemsys(updated_chemsys | new_chemsys)
+
+        # Remove overlapping chemical systems
+        processed = set()
+        to_process_chemsys = []
+        for chemsys in sorted(
+            updated_chemsys | new_chemsys | affected_chemsys,
+            key=lambda x: len(x),
+            reverse=True,
+        ):
+            if chemsys not in processed:
+                processed |= chemsys_permutations(chemsys)
+                to_process_chemsys.append(chemsys)
+
+        self.logger.info(
+            f"Found {len(to_process_chemsys)} chemical systems with new/updated materials to process"
+        )
+        self.total = len(to_process_chemsys)
+
+        # Yield the chemical systems in order of increasing size
+        # Will build them in a similar manner to fast Pourbaix
+        for chemsys in sorted(
+            to_process_chemsys, key=lambda x: len(x.split("-")), reverse=False
+        ):
+            entries = self.get_entries(chemsys)
+            yield entries
+
+    def process_item(self, item: List[Dict]):
+
+        if len(item) == 0:
+            return []
+
+        entries = [ComputedStructureEntry.from_dict(entry) for entry in item]
+        # determine chemsys
+        elements = sorted(
+            set([el.symbol for e in entries for el in e.composition.elements])
+        )
+        chemsys = "-".join(elements)
+
+        self.logger.debug(f"Processing {len(entries)} entries for {chemsys}")
+
+        material_entries: Dict[str, Dict[str, ComputedStructureEntry]] = defaultdict(
+            dict
+        )
+        pd_entries = []
+        for entry in entries:
+            material_entries[entry.entry_id][entry.data["run_type"]] = entry
+
+        if self.compatibility:
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore", message="Failed to guess oxidation states.*"
+                )
+                pd_entries = self.compatibility.process_entries(entries)
+        else:
+            pd_entries = entries
+        self.logger.debug(f"{len(pd_entries)} remain in {chemsys} after filtering")
+
+        try:
+            docs, pd = ThermoDoc.from_entries(pd_entries, deprecated=False)
+            for doc in docs:
+                doc.entries = material_entries[doc.material_id]
+                doc.entry_types = list(material_entries[doc.material_id].keys())
+
+            pd_data = None
+
+            if self.phase_diagram:
+                if (
+                    self.num_phase_diagram_eles is None
+                    or len(elements) <= self.num_phase_diagram_eles
+                ):
+                    pd_doc = PhaseDiagramDoc(chemsys=chemsys, phase_diagram=pd)
+                    pd_data = jsanitize(pd_doc.dict(), allow_bson=True)
+
+            docs_pd_pair = (
+                jsanitize([d.dict() for d in docs], allow_bson=True),
+                [pd_data],
+            )
+
+        except PhaseDiagramError as p:
+            elsyms = []
+            for e in entries:
+                elsyms.extend([el.symbol for el in e.composition.elements])
+
+            self.logger.warning(
+                f"Phase diagram error in chemsys {'-'.join(sorted(set(elsyms)))}: {p}"
+            )
+            return []
+        except Exception as e:
+            self.logger.error(
+                f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}"
+            )
+            return []
+
+        return docs_pd_pair
+
+    def update_targets(self, items):
+        """
+        Inserts the thermo and phase diagram docs into the thermo collection
+        Args:
+            items ([[tuple(List[dict],List[dict])]]): a list of list of thermo dictionaries to update
+        """
+
+        # print(len(items))
+        thermo_docs = [item[0] for item in items if item]
+        phase_diagram_docs = [item[1] for item in items if item]
+
+        # flatten out lists
+        thermo_docs = list(filter(None, chain.from_iterable(thermo_docs)))
+        phase_diagram_docs = list(filter(None, chain.from_iterable(phase_diagram_docs)))
+
+        # Check if already updated this run
+        thermo_docs = [
+            i for i in thermo_docs if i["material_id"] not in self._completed_tasks
+        ]
+
+        self._completed_tasks |= {i["material_id"] for i in thermo_docs}
+
+        for item in thermo_docs:
+            if isinstance(item["last_updated"], dict):
+                item["last_updated"] = MontyDecoder().process_decoded(
+                    item["last_updated"]
+                )
+
+        if self.phase_diagram:
+            self.phase_diagram.update(phase_diagram_docs)
+
+        if len(thermo_docs) > 0:
+            self.logger.info(f"Updating {len(thermo_docs)} thermo documents")
+            self.thermo.update(docs=thermo_docs, key=["material_id"])
+        else:
+            self.logger.info("No thermo items to update")
+
+    def get_entries(self, chemsys: str) -> List[Dict]:
+        """
+        Gets a entries from the tasks collection for the corresponding chemical systems
+        Args:
+            chemsys(str): a chemical system represented by string elements seperated by a dash (-)
+        Returns:
+            set(ComputedEntry): a set of entries for this system
+        """
+
+        self.logger.info(f"Getting entries for: {chemsys}")
+        # First check the cache
+        all_chemsys = chemsys_permutations(chemsys)
+        cached_chemsys = all_chemsys & set(self._entries_cache.keys())
+        query_chemsys = all_chemsys - cached_chemsys
+        all_entries = list(
+            chain.from_iterable(self._entries_cache[c] for c in cached_chemsys)
+        )
+
+        self.logger.debug(
+            f"Getting {len(cached_chemsys)} sub-chemsys from cache for {chemsys}"
+        )
+        self.logger.debug(
+            f"Getting {len(query_chemsys)} sub-chemsys from DB for {chemsys}"
+        )
+
+        # Second grab the materials docs
+        new_q = dict(self.query)
+        new_q["chemsys"] = {"$in": list(query_chemsys)}
+        new_q["deprecated"] = False
+        materials_docs = list(
+            self.materials.query(
+                criteria=new_q, properties=["material_id", "entries", "deprecated"]
+            )
+        )
+
+        # Get Oxidation state data for each material
+        oxi_states_data = {}
+        if self.oxidation_states:
+            material_ids = [t["material_id"] for t in materials_docs]
+            oxi_states_data = {
+                d["material_id"]: d.get("average_oxidation_states", {})
+                for d in self.oxidation_states.query(
+                    properties=["material_id", "average_oxidation_states"],
+                    criteria={
+                        "material_id": {"$in": material_ids},
+                        "state": "successful",
+                    },
+                )
+            }
+
+        self.logger.debug(
+            f"Got {len(materials_docs)} entries from DB for {len(query_chemsys)} sub-chemsys for {chemsys}"
+        )
+
+        # Convert the entries into ComputedEntries and store
+        for doc in materials_docs:
+            for r_type, entry_dict in doc.get("entries", {}).items():
+                entry_dict["data"]["oxidation_states"] = oxi_states_data.get(
+                    entry_dict["entry_id"], {}
+                )
+                entry_dict["data"]["run_type"] = r_type
+                elsyms = sorted(set([el for el in entry_dict["composition"]]))
+                self._entries_cache["-".join(elsyms)].append(entry_dict)
+                all_entries.append(entry_dict)
+
+        self.logger.info(f"Total entries in {chemsys} : {len(all_entries)}")
+
+        return all_entries
+
+    def get_updated_chemsys(self,) -> Set:
+        """Gets updated chemical system as defined by the updating of an existing material"""
+
+        updated_mats = self.thermo.newer_in(self.materials, criteria=self.query)
+        updated_chemsys = set(
+            self.materials.distinct(
+                "chemsys", {"material_id": {"$in": list(updated_mats)}, **self.query}
+            )
+        )
+        self.logger.debug(f"Found {len(updated_chemsys)} updated chemical systems")
+
+        return updated_chemsys
+
+    def get_new_chemsys(self) -> Set:
+        """Gets newer chemical system as defined by introduction of a new material"""
+
+        # All materials that are not present in the thermo collection
+        thermo_mat_ids = self.thermo.distinct("material_id")
+        mat_ids = self.materials.distinct("material_id", self.query)
+        dif_task_ids = list(set(mat_ids) - set(thermo_mat_ids))
+        q = {"material_id": {"$in": dif_task_ids}}
+        new_mat_chemsys = set(self.materials.distinct("chemsys", q))
+        self.logger.debug(f"Found {len(new_mat_chemsys)} new chemical systems")
+
+        return new_mat_chemsys
+
+    def get_affected_chemsys(self, chemical_systems: Set) -> Set:
+        """Gets chemical systems affected by changes in the supplied chemical systems"""
+        # First get all chemsys with any of the elements we've marked
+        affected_chemsys = set()
+        affected_els = list({el for c in chemical_systems for el in c.split("-")})
+        possible_affected_chemsys = self.materials.distinct(
+            "chemsys", {"elements": {"$in": affected_els}}
+        )
+
+        sub_chemsys = defaultdict(list)
+        # Build a dictionary mapping sub_chemsys to all super_chemsys
+        for chemsys in possible_affected_chemsys:
+            for permutation in chemsys_permutations(chemsys):
+                sub_chemsys[permutation].append(chemsys)
+
+        # Select and merge distinct super chemsys from sub_chemsys
+        for chemsys in chemical_systems:
+            affected_chemsys |= set(sub_chemsys[chemsys])
+
+        self.logger.debug(
+            f"Found {len(affected_chemsys)} chemical systems affected by this build"
+        )
+
+        return affected_chemsys
diff --git a/emmet-builders/emmet/builders/materials/materials.py b/emmet-builders/emmet/builders/materials/materials.py
new file mode 100644
index 0000000000..322bdc1017
--- /dev/null
+++ b/emmet-builders/emmet/builders/materials/materials.py
@@ -0,0 +1,336 @@
+"""
+Unified materials builder for tasks docs from different calc codes.
+"""
+
+from datetime import datetime
+from itertools import chain, groupby
+import itertools
+from math import ceil
+from typing import Dict, Iterable, Iterator, List, Optional, Union
+
+from maggma.builders import Builder
+from maggma.stores import Store
+from maggma.utils import grouper
+
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.core.utils import group_structures, jsanitize
+from pymatgen.core import Structure
+
+__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
+
+SETTINGS = EmmetBuildSettings()
+
+class MaterialsBuilder(Builder):
+    """
+    The Materials Builder matches VASP task documents by structure similarity into materials
+    document. The purpose of this builder is group calculations and determine the best structure.
+    All other properties are derived from other builders.
+
+    The process is as follows:
+
+        1.) Find all documents with the same formula
+        2.) Select only task documents for the task_types we can select properties from
+        3.) Aggregate task documents based on strucutre similarity
+        4.) Create a MaterialDoc from the group of task documents
+        5.) Validate material document
+
+    """
+
+    def __init__(
+        self,
+        tasks: Union[Store, Iterable[Store]],
+        materials: Store,
+        task_validation: Store,
+        query: Optional[Dict] = None,
+        settings: Optional[EmmetBuildSettings] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            tasks: Store of task documents
+            materials: Store of materials documents to generate
+            task_validation: Store for storing task validation results
+            query: dictionary to limit tasks to be analyzed
+            settings: EmmetSettings to use in the build process
+        """
+
+        self.tasks = tasks if isinstance(tasks, Iterable) else [tasks]
+        self.materials = materials
+        self.task_validation = task_validation
+        self.query = query if query else {}
+        self.settings = EmmetBuildSettings.autoload(settings)
+        
+        if len(set(t.key for t in self.tasks)) != 1:
+            raise ValueError("All task stores must have the same key")
+
+        self._tasks_key = self.tasks[0].key
+        self._TASK_DOCS = dict()
+        self._MAT_DOCS = dict()
+        for calc_code in self.settings.ALLOWED_CALC_CODES:
+            mod = __import__(f"emmet.core.{calc_code.lower()}.task", globals(), locals(), ["TaskDocument"], 0)
+            self._TASK_DOCS[calc_code] = getattr(mod, "TaskDocument") 
+            mod = __import__(f"emmet.core.{calc_code.lower()}.material", globals(), locals(), ["MaterialsDoc"], 0)
+            self._MAT_DOCS[calc_code] = getattr(mod, "MaterialsDoc")
+
+        self.kwargs = kwargs
+
+        sources = self.tasks + [self.task_validation]
+        super().__init__(sources=sources, targets=[materials], **kwargs)
+
+    def _get_allowed_task_types(self, calc_code):
+        if calc_code == 'vasp':
+            return self.settings.VASP_ALLOWED_VASP_TYPES
+        if calc_code == 'cp2k':
+            return self.settings.CP2K_ALLOWED_CP2K_TYPES
+
+    @property
+    def tasks_key(self):
+        return self._tasks_key
+
+    @property
+    def task_docs(self):
+        """
+        Returns:
+            dict: A dictionary of task document classes
+        """
+        return self._TASK_DOCS
+    
+    @property
+    def mat_docs(self):
+        """
+        Returns:
+            dict: A dictionary of material document classes
+        """
+        return self._MAT_DOCS
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        for tasks in self.tasks:
+            # Basic search index for tasks
+            tasks.ensure_index("task_id")
+            tasks.ensure_index("last_updated")
+            tasks.ensure_index("state")
+            tasks.ensure_index("formula_pretty")
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("last_updated")
+        self.materials.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
+    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
+        """Prechunk the materials builder for distributed computation"""
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+        if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0:
+            temp_query["$and"] = [
+                {"tags": {"$in": self.settings.BUILD_TAGS}},
+                {"tags": {"$nin": self.settings.EXCLUDED_TAGS}},
+            ]
+        elif len(self.settings.BUILD_TAGS) > 0:
+            temp_query["tags"] = {"$in": self.settings.BUILD_TAGS}
+
+        self.logger.info("Finding tasks to process")
+        all_tasks = list(
+            self.tasks.query(temp_query, [self.tasks.key, "formula_pretty"])
+        )
+
+        processed_tasks = set(self.materials.distinct("task_ids"))
+        to_process_tasks = {d[self.tasks.key] for d in all_tasks} - processed_tasks
+        to_process_forms = {
+            d["formula_pretty"]
+            for d in all_tasks
+            if d[self.tasks.key] in to_process_tasks
+        }
+
+        N = ceil(len(to_process_forms) / number_splits)
+
+        for formula_chunk in grouper(to_process_forms, N):
+
+            yield {"query": {"formula_pretty": {"$in": list(formula_chunk)}}}
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets all items to process into materials documents.
+        This does no datetime checking; relying on on whether
+        task_ids are included in the Materials Colection
+
+        Returns:
+            generator or list relevant tasks and materials to process into materials documents
+        """
+
+        self.logger.info("Materials builder started")
+        for t in self.settings.ALLOWED_CALC_CODES:
+            tt = getattr(self.settings, f"{t.upper()}_ALLOWED_TASK_TYPES") 
+            self.logger.info(
+                f"Allowed {t} task types: {[task_type.value for task_type in tt]}"
+            )
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark buildtime for material documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all processed tasks:
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+        if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0:
+            temp_query["$and"] = [
+                {"tags": {"$in": self.settings.BUILD_TAGS}},
+                {"tags": {"$nin": self.settings.EXCLUDED_TAGS}},
+            ]
+        elif len(self.settings.BUILD_TAGS) > 0:
+            temp_query["tags"] = {"$in": self.settings.BUILD_TAGS}
+
+        self.logger.info("Finding tasks to process")
+        self.logger.info(self.tasks)
+        all_tasks = []
+        for tasks in self.tasks:
+            all_tasks.extend(
+               [t for t in tasks.query(temp_query, [self.tasks_key, "formula_pretty"])] 
+            )
+        processed_tasks = set(self.materials.distinct("task_ids"))
+        to_process_tasks = {d[self.tasks_key] for d in all_tasks} - processed_tasks
+        to_process_forms = {
+            d["formula_pretty"]
+            for d in all_tasks
+            if d[self.tasks_key] in to_process_tasks
+        }
+
+        self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks")
+        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
+
+        # Set total for builder bars to have a total
+        self.total = len(to_process_forms)
+
+        validation = {
+            doc[self.tasks_key]: {'valid': doc["valid"], 'calc_code': doc['calc_code']}
+            for doc in self.task_validation.query(
+                criteria={"calc_code": {"$exists": True}, "valid": {"$exists": True}},
+                properties=[self.task_validation.key, "valid", "calc_code"],
+            )
+        }
+
+        for formula in to_process_forms:
+            tasks_query = dict(temp_query)
+            tasks_query["formula_pretty"] = formula
+            tasks = []
+            for t in self.tasks:
+                tasks.extend(
+                    [tsk for tsk in t.query(criteria=tasks_query, properties=None)]
+                )
+
+            for t in tasks:
+                t["is_valid"] = validation[t[self.tasks_key]]['valid']
+                t["calc_code"] = validation[t[self.tasks_key]]['calc_code']
+
+            yield tasks
+
+    def process_item(self, items: List[Dict]) -> List[Dict]:
+        """
+        Process the tasks into a list of materials
+
+        Args:
+            tasks [dict] : a list of task docs
+
+        Returns:
+            ([dict],list) : a list of new materials docs and a list of task_ids that were processsed
+        """
+
+        materials = []
+        for structure_group in self.group_tasks(items):
+            materials.append([])
+            sorted_structures = sorted(structure_group, key=lambda x: x['calc_code'])
+            self.logger.debug(f"Processing {len(sorted_structures)} tasks for a structure")
+
+            for key, task_group in groupby(sorted_structures, key=lambda x: x["calc_code"]):
+                self.logger.debug(f"Processing tasks for the calc_code {key}")
+
+                tasks = [self.task_docs[key.upper()](**task) for task in task_group]
+                formula = tasks[0].formula_pretty
+                task_ids = [task.task_id for task in tasks]
+                self.logger.debug(f"Processing {formula} : {task_ids}")
+
+                try:
+                    doc = self.mat_docs[key.upper()].from_tasks(
+                            tasks,
+                            quality_scores=self.settings.VASP_QUALITY_SCORES,
+                            use_statics=self.settings.VASP_USE_STATICS,
+                        )
+                    materials[-1].append(doc)
+                    
+                except Exception as e:
+                    failed_ids = list({t_.task_id for t_ in tasks})
+                    doc = self.mat_docs[key.upper()].construct_deprecated_material(tasks)
+                    doc.warnings.append(str(e))
+                    materials[-1].append(doc)
+                    self.logger.warn(
+                        f"Failed making material for {failed_ids}."
+                        f" Inserted as deprecated Material: {doc.material_id}"
+                    )
+
+                if key == 'vasp':
+                    material_id = doc.material_id
+            
+            for m in materials[-1]:
+                m.material_id = material_id
+        
+        self.logger.debug(f"Produced {len(materials)} materials for {formula}")
+        mats = list(itertools.chain.from_iterable(materials))
+        return jsanitize([m.dict() for m in mats], allow_bson=True)
+
+    def update_targets(self, items: List[List[Dict]]):
+        """
+        Inserts the new task_types into the task_types collection
+
+        Args:
+            items ([([dict],[int])]): A list of tuples of materials to update and the corresponding
+                processed task_ids
+        """
+
+        docs = list(chain.from_iterable(items))  # type: ignore
+
+        for item in docs:
+            item.update({"_bt": self.timestamp})
+
+        material_ids = list({item["material_id"] for item in docs})
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(docs)} materials")
+            self.materials.remove_docs({self.materials.key: {"$in": material_ids}})
+            self.materials.update(
+                docs=docs, key=["material_id"],
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def group_tasks(
+        self, tasks: List
+    ) -> Iterator[List]:
+        """
+        Groups tasks by structure matching
+        """
+        structures = []
+
+        for idx, task in enumerate(tasks):
+            s = Structure.from_dict(task['output']['structure'])
+            s.index: int = idx  # type: ignore
+            structures.append(s)
+
+        grouped_structures = group_structures(
+            structures,
+            ltol=self.settings.LTOL,
+            stol=self.settings.STOL,
+            angle_tol=self.settings.ANGLE_TOL,
+            symprec=self.settings.SYMPREC,
+        )
+        for group in grouped_structures:
+            grouped_tasks = [tasks[struc.index] for struc in group]  # type: ignore
+            yield grouped_tasks

From 80f66729e565752c26fe6bb6304c734c0322977b Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Wed, 4 May 2022 14:08:02 -0700
Subject: [PATCH 40/41] Update

---
 emmet-builders/emmet/builders/cp2k/defects.py | 2 +-
 emmet-core/emmet/core/defect.py               | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index d7a934da74..d5f375b1e3 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -857,7 +857,7 @@ def __get_materials(self, key) -> List:
         return MaterialsDoc(**bulk)
 
     def __get_thermos(self, composition) -> List:
-        return list(self.thermo.query(criteria={"elements": {"$in": jsanitize(composition.elements)}}))
+        return list(self.thermo.query(criteria={'elements': {"$size": 1}}, properties=None))
 
 
 def unpack(query, d):
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
index 30aec7e24a..d82e19959e 100644
--- a/emmet-core/emmet/core/defect.py
+++ b/emmet-core/emmet/core/defect.py
@@ -186,10 +186,9 @@ def _sort(x):
         for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
             sorted_tasks = sorted(tasks_for_runtype, key=_sort)
             ents = [cls.get_defect_entry_from_tasks(t[0], t[1], t[2], query) for t in sorted_tasks]
-            metadata[key] = {'convergence': [(sorted_tasks[i][0]['nsites'], ents[i].energy) for i in range(len(ents))]}
-
+            best_entry = ents[0]
             best_defect_task, best_bulk_task, dielectric = sorted_tasks[0]
-            best_entry = cls.get_defect_entry_from_tasks(best_defect_task, best_bulk_task, dielectric, query)
+            metadata[key] = {'convergence': [(sorted_tasks[i][0]['nsites'], ents[i].energy) for i in range(len(ents))]}
             best_defect_task, best_bulk_task = TaskDocument(**best_defect_task), TaskDocument(**best_bulk_task)
             entries[best_defect_task.run_type] = best_entry
             final_tasks[best_defect_task.run_type] = (best_defect_task, best_bulk_task)
@@ -429,8 +428,6 @@ def from_docs(cls, defects: List[DefectDoc], thermos: List[ThermoDoc], electroni
            td.composition.elements[0]: td.energy_per_atom 
            for td in thermos if td.composition.is_element
         }
-        print("!!!!!!!")
-        print(chempots)
 
         defect_entries = {}
         defect_phase_diagram = {}

From bea2f8694cbd5c393c27bdd696dbf3989611fb8a Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 10 May 2022 17:25:31 -0700
Subject: [PATCH 41/41] Updates

---
 emmet-builders/emmet/builders/cp2k/defects.py | 65 +++++++++++++++----
 emmet-builders/emmet/builders/settings.py     |  2 +-
 2 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
index d5f375b1e3..e988665d7e 100644
--- a/emmet-builders/emmet/builders/cp2k/defects.py
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -1,12 +1,15 @@
 from datetime import datetime
 from itertools import chain, groupby, combinations
+from re import A
 from tkinter import W
 from typing import Dict, Iterator, List, Optional
 from copy import deepcopy
+from math import ceil
 from monty.json import MontyDecoder
 
 from maggma.builders import Builder
 from maggma.stores import Store
+from maggma.utils import grouper
 
 from pymatgen.core import Structure
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator
@@ -23,8 +26,7 @@
 from emmet.core.cp2k.calc_types import TaskType
 from emmet.core.cp2k.calc_types.utils import run_type
 from emmet.builders.settings import EmmetBuildSettings
-from emmet.builders.cp2k.utils import get_mpid, synchronous_query
-from maggma.stores.gridfs import GridFSStore
+from emmet.builders.cp2k.utils import synchronous_query
 
 from emmet.core.electronic_structure import ElectronicStructureDoc
 
@@ -79,6 +81,7 @@ def __init__(
         electrostatic_potentials: Store,
         task_validation: Optional[Store] = None,
         query: Optional[Dict] = None,
+        bulk_query: Optional[Dict] = None,
         allowed_task_types: Optional[List[str]] = DEFAULT_ALLOWED_TASKS,
         settings: Optional[EmmetBuildSettings] = None,
         defects_2d: Optional[bool] = False,
@@ -111,6 +114,7 @@ def __init__(
         self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
         self.settings = EmmetBuildSettings.autoload(settings)
         self.query = query if query else {}
+        self.bulk_query = bulk_query if bulk_query else {}
         self.timestamp = None
         self._mpid_map = {}
         self.defects_2d = defects_2d
@@ -193,6 +197,10 @@ def optional_bulk_properties(self) -> List:
         """
         return self._optional_bulk_properties
 
+    @property
+    def mpid_map(self) -> Dict:
+        return self._mpid_map
+
     def ensure_indexes(self):
         """
         Ensures indicies on the tasks and materials collections
@@ -218,12 +226,40 @@ def ensure_indexes(self):
             self.task_validation.ensure_index("task_id")
             self.task_validation.ensure_index("valid")
 
-    # TODO this is tricky to implement.
-    # Prechunking normally chunks all of the tasks, and then runs get_items on each
-    # chunk. This is a problem when we need to do defect matching, where the first chiunk
-    # may not contain the defect that we are trying to match. 
     def prechunk(self, number_splits: int) -> Iterator[Dict]:
-        raise NotImplementedError
+
+        tag_query = {} 
+        if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0:
+            tag_query["$and"] = [
+                {"tags": {"$in": self.settings.BUILD_TAGS}},
+                {"tags": {"$nin": self.settings.EXCLUDED_TAGS}},
+            ]
+        elif len(self.settings.BUILD_TAGS) > 0:
+            tag_query["tags"] = {"$in": self.settings.BUILD_TAGS}
+
+        # Get defect tasks
+        temp_query = self.query.copy()
+        temp_query.update(tag_query)
+        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
+        temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"})
+        defect_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        # Get bulk tasks
+        temp_query = self.bulk_query.copy()
+        temp_query.update(tag_query)
+        temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
+        temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"})
+        bulk_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        N = ceil(len(defect_tasks) / number_splits)
+        for task_chunk in grouper(defect_tasks, N):
+            yield {"query": {"task_id": {"$in": task_chunk + list(bulk_tasks)}}}
 
     def get_items(self) -> Iterator[List[Dict]]:
         """
@@ -275,7 +311,8 @@ def get_items(self) -> Iterator[List[Dict]]:
         }
 
         # Get bulk tasks
-        temp_query = {d: {'$exists': True} for d in self.required_bulk_properties}
+        temp_query = self.bulk_query.copy()
+        temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
         temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"})
         bulk_tasks = {
             doc[self.tasks.key]
@@ -337,8 +374,11 @@ def get_items(self) -> Iterator[List[Dict]]:
         self.logger.info(f"Starting defect matching.")
 
         for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):
-            #yield self.__get_defect_doc(defect), self.__get_item_bundle(bulk_tasks, defect_task_group)
-            yield defect, self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
+            task_ids = self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
+            doc = self.__get_defect_doc(defect)
+            item_bundle = self.__get_item_bundle(task_ids)
+            material_id = self.mpid_map[item_bundle[0][1]['task_id']]
+            yield doc, item_bundle, material_id
 
     def process_item(self, items):
         """
@@ -351,11 +391,9 @@ def process_item(self, items):
 
         returns: the defect document as a dictionary
         """
-        defect, task_ids = items
-        defect_doc, item_bundle = self.__get_defect_doc(defect), self.__get_item_bundle(task_ids) 
+        defect_doc, item_bundle, material_id = items
         self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
         if item_bundle:
-            material_id = self._mpid_map[item_bundle[0][1]['task_id']]
             tags = item_bundle[0][0].get('tags', [])
             # TODO: This is a hack to get the 2d doc
             DEFECT_DOC = DefectDoc2d if '2d' in tags else DefectDoc
@@ -364,6 +402,7 @@ def process_item(self, items):
             else:
                 defect_doc = DEFECT_DOC.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)
             return defect_doc.dict()
+        return {}
 
     def update_targets(self, items):
         """
diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py
index be00c46bac..9cfadbad50 100644
--- a/emmet-builders/emmet/builders/settings.py
+++ b/emmet-builders/emmet/builders/settings.py
@@ -7,7 +7,7 @@
 
 from emmet.core.provenance import Author, History
 from emmet.core.settings import EmmetSettings
-from emmet.core.vasp.calc_types import TaskType as VaspTastType
+from emmet.core.vasp.calc_types import TaskType as VaspTaskType
 from emmet.core.qchem.calc_types import TaskType as QChemTaskType
 from emmet.core.cp2k.calc_types import TaskType as Cp2kTaskType