materialsproject
diff --git a/‎emmet-core/emmet/core/tasks.py
Lines changed: 90 additions & 96 deletions b/‎emmet-core/emmet/core/tasks.py
Lines changed: 90 additions & 96 deletions
@@ -5,10 +5,9 @@
 import json
 import logging
 import re
-from collections.abc import Mapping
 from datetime import datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Optional
 
 import numpy as np
 from monty.json import MontyDecoder
@@ -29,7 +28,7 @@
 
 from emmet.core import ARROW_COMPATIBLE
 from emmet.core.common import convert_datetime
-from emmet.core.mpid import MPID
+from emmet.core.mpid import MPID, AlphaID
 from emmet.core.structure import StructureMetadata
 from emmet.core.utils import jsanitize, type_override, utcnow
 from emmet.core.vasp.calc_types import (
@@ -43,6 +42,7 @@
 from emmet.core.vasp.calculation import (
     Calculation,
     CalculationInput,
+    CalculationOutput,
     PotcarSpec,
     RunStatistics,
     VaspObject,
@@ -88,7 +88,6 @@ class OrigInputs(CalculationInput):
 
 
 class OutputDoc(BaseModel):
-    # structure: Optional[Structure] = Field(
     structure: Optional[AnnotatedStructure] = Field(
         None,
         title="Output Structure",
@@ -350,121 +349,135 @@ def from_vasp_calc_docs(
         )
 
 
-@type_override(
-    {
-        "additional_json": str,
-        "transformations": str,
-        "vasp_objects": str,
-    }
-)
-class TaskDoc(StructureMetadata, extra="allow", use_enum_values=True):
+class ProductionTaskDoc(StructureMetadata):
     """Calculation-level details about VASP calculations that power Materials Project."""
 
-    tags: list[str] | None = Field(
-        None, title="tag", description="Metadata tagged to a given task."
-    )
-    dir_name: str | None = Field(None, description="The directory for this VASP task")
-    state: TaskState | None = Field(None, description="State of this calculation")
-
-    calcs_reversed: list[Calculation] | None = Field(
+    batch_id: Optional[str] = Field(
         None,
-        title="Calcs reversed data",
-        description="Detailed data for each VASP calculation contributing to the task document.",
+        description="Identifier for this calculation; should provide rough information about the calculation origin and purpose.",
     )
-
-    # structure: Structure | None = Field(
-    structure: AnnotatedStructure | None = Field(
-        None, description="Final output structure from the task"
+    calc_type: Optional[CalcType] = Field(
+        None, description="The functional and task type used in the calculation."
     )
-
-    task_type: TaskType | CalcType | None = Field(
-        None, description="The type of calculation."
+    completed_at: Optional[datetime] = Field(
+        None, description="Timestamp for when this task was completed"
     )
-
-    run_type: RunType | None = Field(
-        None, description="The functional used in the calculation."
+    dir_name: Optional[str] = Field(
+        None, description="The directory for this VASP task"
     )
-
-    calc_type: CalcType | None = Field(
-        None, description="The functional and task type used in the calculation."
+    icsd_id: Optional[str | int] = Field(
+        None, description="Inorganic Crystal Structure Database id of the structure"
     )
-
-    task_id: MPID | str | None = Field(
+    input: CalculationInput | None = Field(
         None,
-        description="The (task) ID of this calculation, used as a universal reference across property documents."
-        "This comes in the form: mp-******.",
+        description="The input structure used to generate the current task document.",
     )
-
-    orig_inputs: OrigInputs | None = Field(
-        None,
-        description="The exact set of input parameters used to generate the current task document.",
+    last_updated: Optional[datetime] = Field(
+        utcnow(),
+        description="Timestamp for the most recent calculation for this task document",
     )
-
-    input: InputDoc | None = Field(
+    orig_inputs: CalculationInput | None = Field(
         None,
-        description="The input structure used to generate the current task document.",
+        description="The exact set of input parameters used to generate the current task document.",
     )
-
-    output: OutputDoc | None = Field(
+    output: CalculationOutput | None = Field(
         None,
         description="The exact set of output parameters used to generate the current task document.",
     )
-
-    included_objects: list[VaspObject] | None = Field(
-        None, description="List of VASP objects included with this task document"
+    run_type: Optional[RunType] = Field(
+        None, description="The functional used in the calculation."
     )
-    vasp_objects: dict[VaspObject, Any] | None = Field(
-        None, description="Vasp objects associated with this task"
+    structure: Optional[AnnotatedStructure] = Field(
+        None, description="Final output structure from the task"
     )
-    entry: ComputedEntry | None = Field(
-        None, description="The ComputedEntry from the task doc"
+    tags: list[str] | None = Field(
+        None, title="tag", description="Metadata tagged to a given task."
     )
-    task_label: str | None = Field(None, description="A description of the task")
-    author: str | None = Field(
-        None, description="Author extracted from transformations"
+    task_id: AlphaID | MPID | str | None = Field(
+        None,
+        description="The (task) ID of this calculation, used as a universal reference across property documents."
+        "This comes in the form: mp-******.",
     )
-    icsd_id: int | None = Field(
-        None, description="Inorganic Crystal Structure Database id of the structure"
+    task_type: Optional[TaskType | CalcType] = Field(
+        None, description="The type of calculation."
     )
     transformations: Any | None = Field(
         None,
         description="Information on the structural transformations, parsed from a "
         "transformations.json file",
     )
-    additional_json: dict[str, Any] | None = Field(
-        None, description="Additional json loaded from the calculation directory"
+    vasp_objects: Optional[dict[VaspObject, Any]] = Field(
+        None, description="Vasp objects associated with this task"
     )
 
-    custodian: list[CustodianDoc] | None = Field(
-        None,
-        title="Calcs reversed data",
-        description="Detailed custodian data for each VASP calculation contributing to the task document.",
-    )
+    @model_validator(mode="before")
+    @classmethod
+    def set_prod_model_pre_fields(cls, values: Any) -> Any:
+        """Ensure all important model fields are set and refreshed."""
+        values["last_updated"] = convert_datetime(
+            cls, values.get("last_updated", utcnow())
+        )
+
+        if (batch_id := values.get("batch_id")) is not None:
+            invalid_chars = set(
+                char
+                for char in batch_id
+                if (not char.isalnum()) and (char not in {"-", "_"})
+            )
+            if len(invalid_chars) > 0:
+                raise ValueError(
+                    f"Invalid characters in batch_id: {' '.join(invalid_chars)}"
+                )
+
 
-    analysis: AnalysisDoc | None = Field(
+@type_override(
+    {
+        "additional_json": str,
+        "transformations": str,
+        "vasp_objects": str,
+    }
+)
+class TaskDoc(ProductionTaskDoc, extra="allow", use_enum_values=True):
+    """Flexible wrapper around ProductionTaskDoc"""
+
+    additional_json: Optional[dict[str, Any]] = Field(
+        None, description="Additional json loaded from the calculation directory"
+    )
+    analysis: Optional[AnalysisDoc] = Field(
         None,
         title="Calculation Analysis",
         description="Some analysis of calculation data after collection.",
     )
-
-    last_updated: datetime = Field(
-        default_factory=utcnow,
-        description="Timestamp for the most recent calculation for this task document",
+    author: Optional[str] = Field(
+        None, description="Author extracted from transformations"
     )
-
-    completed_at: datetime | None = Field(
-        None, description="Timestamp for when this task was completed"
+    calcs_reversed: Optional[list[Calculation]] = Field(
+        None,
+        title="Calcs reversed data",
+        description="Detailed data for each VASP calculation contributing to the task document.",
     )
-
-    batch_id: str | None = Field(
+    custodian: Optional[list[CustodianDoc]] = Field(
         None,
-        description="Identifier for this calculation; should provide rough information about the calculation origin and purpose.",
+        title="Calcs reversed data",
+        description="Detailed custodian data for each VASP calculation contributing to the task document.",
+    )
+    entry: Optional[ComputedEntry] = Field(
+        None, description="The ComputedEntry from the task doc"
+    )
+    included_objects: Optional[list[VaspObject]] = Field(
+        None, description="List of VASP objects included with this task document"
+    )
+    output: Optional[OutputDoc] = Field(
+        None,
+        description="The exact set of output parameters used to generate the current task document.",
     )
 
     run_stats: dict[str, RunStatistics] | None = Field(
         None,
         description="Summary of runtime statistics for each calculation in this task",
     )
+    state: Optional[TaskState] = Field(None, description="State of this calculation")
+    task_label: Optional[str] = Field(None, description="A description of the task")
 
     # Note that private fields are needed because TaskDoc permits extra info
     # added to the model, unlike TaskDocument. Because of this, when pydantic looks up
@@ -554,25 +567,6 @@ def deserialize_entry(cls, entry):
     @classmethod
     def set_model_pre_fields(cls, values: Any) -> Any:
         """Ensure all important model fields are set and refreshed."""
-
-        # Make sure that the datetime field is properly formatted
-        # (Unclear when this is not the case, please leave comment if observed)
-        values["last_updated"] = convert_datetime(
-            cls, values.get("last_updated", utcnow())
-        )
-
-        # Ensure batch_id includes only valid characters
-        if (batch_id := values.get("batch_id")) is not None:
-            invalid_chars = set(
-                char
-                for char in batch_id
-                if (not char.isalnum()) and (char not in {"-", "_"})
-            )
-            if len(invalid_chars) > 0:
-                raise ValueError(
-                    f"Invalid characters in batch_id: {' '.join(invalid_chars)}"
-                )
-
         # Always refresh task_type, calc_type, run_type
         # if attributes containing input sets are available.
         # See, e.g. https://github.com/materialsproject/emmet/issues/960