With the new DFT schema, identify examples and point to main fields in the schema. (#55)

JosePizarro3 · Copilot · web-flow · commit c03e8dc5d661 · 2025-10-29T15:46:24.000+01:00
* Refine example.json Added script.py for local testing * Added multiple file support in CLI prompt * Added dft constraints and target in PROMPT_REGISTRY Improved StructuredPrompt * Fix test expectations for updated StructuredPrompt instruction format (#56) * Fix failing test for StructuredPrompt instructions format --------- Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
diff --git a/nerxiv/cli/cli.py b/nerxiv/cli/cli.py
@@ -64,8 +64,9 @@ def cli():
     "-path",
     type=str,
     required=True,
+    multiple=True,
     help="""
-    The path to the HDF5 file used to prompt the LLM.
+    The path to the HDF5 file or files used to prompt the LLM.
     """,
 )
 @click.option(
@@ -167,22 +168,23 @@ def prompt(
     chunker_kwargs = parse_llm_option_to_args(chunker_option)
 
     # Transform to Path and get the hdf5 data
-    paper = Path(file_path)
-    paper_time = run_prompt_paper(
-        paper=paper,
-        chunker=chunker,
-        retriever_model=retriever_model,
-        n_top_chunks=n_top_chunks,
-        model=model,
-        retriever_query=retriever_query,
-        prompt=prompt,
-        query=query,
-        paper_time=start_time,
-        logger=logger,
-        **chunker_kwargs,
-        **llm_kwargs,
-    )
-    click.echo(f"Processed arXiv papers in {paper_time:.2f} seconds\n\n")
+    for file in file_path:
+        paper = Path(file)
+        paper_time = run_prompt_paper(
+            paper=paper,
+            chunker=chunker,
+            retriever_model=retriever_model,
+            n_top_chunks=n_top_chunks,
+            model=model,
+            retriever_query=retriever_query,
+            prompt=prompt,
+            query=query,
+            paper_time=start_time,
+            logger=logger,
+            **chunker_kwargs,
+            **llm_kwargs,
+        )
+        click.echo(f"Processed arXiv paper {file} in {paper_time:.2f} seconds\n\n")
 
 
 @cli.command(
diff --git a/nerxiv/datamodel/example.json b/nerxiv/datamodel/example.json
@@ -1,4 +1,5 @@
 {
+  "crystal_structure": [],
   "dft": [
     {
       "code": "VASP",
@@ -92,5 +93,9 @@
       "spin_treatment": "unrestricted",
       "relativistic_treatment": "atomic ZORA"
     }
-  ]
+  ],
+  "projection": [],
+  "interactions": [],
+  "dmft": [],
+  "analytical_continuation": []
 }
diff --git a/nerxiv/prompts/prompts.py b/nerxiv/prompts/prompts.py
@@ -203,7 +203,7 @@ class StructuredPrompt(BasePrompt):
     target_fields: list[str] = Field(
         ...,
         description="""
-        The fields within `output_schema` that the prompt should extract.
+        The fields within `output_schema` that the prompt should extract. If set to `all`, all fields defined in `output_schema` will be extracted.
         """,
     )
 
@@ -221,6 +221,9 @@ def validate_target_fields_in_schema(cls, data: Any) -> Any:
         """
         model_properties = data.output_schema.model_json_schema().get("properties", {})
         for field in data.target_fields:
+            if field == "all":
+                data.target_fields = list(model_properties.keys())
+                break
             if field not in model_properties:
                 raise ValueError(
                     f"Field '{field}' is not defined in the output schema '{data.output_schema.__name__}'."
@@ -243,8 +246,8 @@ def _build_instructions(self) -> str:
         description = clean_description(
             model.get("description", "<<no definition provided>>")
         )
-        instruction_lines = f"Given the following scientific text, your task is: to identify all mentions of the {name}. "
-        instruction_lines += f"This is defined as {description}. "
+        instruction_lines = f"Given the following scientific text, your task is: to identify all mentions of the {name} section. "
+        instruction_lines += f"This is defined as a {description} "
 
         instruction_lines += "You must extract the values of the following fields:"
         # getting the fields defined for the class and maching them with `target_fields`
@@ -255,18 +258,26 @@ def _build_instructions(self) -> str:
             prop_types = [
                 p.get("type") for p in prop.get("anyOf", []) if p.get("type") != "null"
             ]  # only non-null types
-            instruction_lines += f"\n- {field} defined as '{prop_description}' and which is of type {prop_types[0]}"
+            if not prop_types:
+                instruction_lines += f"\n- {field} defined as {prop_description}"
+            else:
+                prop_type = prop_types[0]
+                if prop_type == "object":
+                    prop_type = "dictionary"
+                instruction_lines += f"\n- {field} defined as {prop_description} and which is of type {prop_type}"
             # TODO add data type
 
         instruction_lines += (
-            "\nYou must return the extracted values in the following format:"
+            "\nYou must return the extracted values in JSON format:"
             "\n```json\n"
-            f"'{name}': " + "{\n"
+            "{\n"
+            f"  '{name}': " + "{\n"
         )
         for field in self.target_fields:
             instruction_lines += f"    '{field}': <parsed-value>,\n"
 
-        instruction_lines += "}\n```\n"
+        instruction_lines += "  }\n}\n```\n"
+        instruction_lines += "Note that <parsed-value> means a value of the correct type defined for that field."
         return instruction_lines
 
     def build(self, text: str) -> str:
diff --git a/nerxiv/prompts/prompts_registry.py b/nerxiv/prompts/prompts_registry.py
@@ -37,8 +37,14 @@
         prompt=StructuredPrompt(
             expert="Condensed Matter Physics",
             output_schema=DFT,
-            target_fields=[],
-            constraints=[],
+            target_fields=["all"],
+            constraints=[
+                "Return ONLY the requested JSON object without any additional text or explanation.",
+                "If you do NOT find the value of a field in the text, do NOT make up a value. Leave it as null in the JSON output.",
+                "Do NOT infere values of fields that are not explicitly mentioned in the text.",
+                "Return the JSON as specified in the prompt. Do NOT make up a new JSON with different field names or structure.",
+                "Ensure that all parsed values are of the correct data type as defined in the DFT schema.",
+            ],
             examples=[],
         ),
     ),
diff --git a/tests/prompts/test_prompts.py b/tests/prompts/test_prompts.py
@@ -167,18 +167,19 @@ def test_build_instructions(self):
         )
         assert prompt._build_instructions() == (
             "Given the following scientific text, your task is: to identify all mentions of "
-            "the ChemicalFormulation. This is defined as A ChemicalFormulation is a descriptive "
+            "the ChemicalFormulation section. This is defined as a A ChemicalFormulation is a descriptive "
             "representation of the chemical composition of a material system, expressed in one or "
             "more standardized formula formats (e.g., IUPAC, anonymous, Hill, or reduced), each "
             "encoding the stoichiometry and elemental ordering according to specific conventions. "
             "For the compound H2O2 (hydrogen peroxide), the different formulations would be: iupac: "
-            "H2O2 anonymous: AB hill: H2O2 reduced: H2O2. You must extract the values of the following "
-            "fields:\n- iupac defined as 'Chemical formula where the elements are ordered using a "
+            "H2O2 anonymous: AB hill: H2O2 reduced: H2O2 You must extract the values of the following "
+            "fields:\n- iupac defined as Chemical formula where the elements are ordered using a "
             "formal list based on electronegativity as defined in the IUPAC nomenclature of inorganic "
             "chemistry (2005): - https://en.wikipedia.org/wiki/List_of_inorganic_compounds Contains "
             "reduced integer chemical proportion numbers where the proportion number is omitted if it "
-            "is 1.' and which is of type string\n- reduced defined as 'Alphabetically sorted chemical "
+            "is 1. and which is of type string\n- reduced defined as Alphabetically sorted chemical "
             "formula with reduced integer chemical proportion numbers. The proportion number is omitted "
-            "if it is 1.' and which is of type string\nYou must return the extracted values in the "
-            "following format:\n```json\n'ChemicalFormulation': {\n    'iupac': <parsed-value>,\n    'reduced': <parsed-value>,\n}\n```\n"
+            "if it is 1. and which is of type string\nYou must return the extracted values in JSON "
+            "format:\n```json\n{\n  'ChemicalFormulation': {\n    'iupac': <parsed-value>,\n    'reduced': <parsed-value>,\n  }\n}\n```\n"
+            "Note that <parsed-value> means a value of the correct type defined for that field."
         )
diff --git a/tutorials/script.py b/tutorials/script.py
@@ -0,0 +1,18 @@
+from pathlib import Path
+
+import h5py
+
+for i, path in enumerate(Path("./data").glob("*.hdf5")):
+    # NOTE: `i` is only needed by a debugging early-exit (`if i > 0: break`)
+    # that is currently disabled, so every matching file is processed.
+    with h5py.File(path, "r+") as f:  # "r+": read/write on an existing file
+        if "raw_llm_answers" not in f:
+            continue  # nothing to archive in this file
+        raw = f["raw_llm_answers"]
+        old = raw.require_group("20251028_OLD_raw_llm_answers")  # archive group; created if missing
+        for run in list(raw.keys()):  # snapshot the keys: entries are deleted inside the loop
+            if not run.startswith("run_"):
+                continue  # leaves non-run entries (including the archive group) in place
+            old.copy(raw[run], run)
+            del raw[run]
+        print(f"Processed file: {path}")

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`{`
	`2`	`+ "crystal_structure": [],`
`2`	`3`	`"dft": [`
`3`	`4`	`{`
`4`	`5`	`"code": "VASP",`
`@@ -92,5 +93,9 @@`
`92`	`93`	`"spin_treatment": "unrestricted",`
`93`	`94`	`"relativistic_treatment": "atomic ZORA"`
`94`	`95`	`}`
`95`		`- ]`
	`96`	`+ ],`
	`97`	`+ "projection": [],`
	`98`	`+ "interactions": [],`
	`99`	`+ "dmft": [],`
	`100`	`+ "analytical_continuation": []`
`96`	`101`	`}`