Skip to content

Commit e6cb7a8

Browse files
committedDec 9, 2024
testing
1 parent 99c6d80 commit e6cb7a8

File tree

8 files changed

+634
-4
lines changed

8 files changed

+634
-4
lines changed
 

‎.DS_Store

6 KB
Binary file not shown.

‎src/.DS_Store

6 KB
Binary file not shown.

‎src/functions.py

+179
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
from typing import Any, List, Optional

from loguru import logger
from pydantic import BaseModel, Field
from symai import Symbol
from symai.components import ExceptionWithUsage, LengthConstrainedFunction
6+
7+
8+
class ResultValidator(LengthConstrainedFunction):
    """Validate an LLM-produced result and retry with a remedy prompt on failure.

    Wraps ``LengthConstrainedFunction.forward``: after the initial call the
    result is checked by :meth:`validate`; while errors remain (up to
    ``validation_retry_count`` attempts), the original task is re-issued with
    the faulty output and the error list embedded in a remedy prompt.
    """

    def __init__(
        self,
        validation_retry_count: int = 5,
        *args,
        **kwargs,
    ):
        # No character constraints of our own; length checks come from the base class.
        super().__init__(character_constraints=[], *args, **kwargs)
        self.validation_retry_count = validation_retry_count

    def validate(self, result) -> List[str]:
        """Return a list of validation error messages (empty list == valid).

        NOTE(review): content-type-specific validation criteria previously
        lived here but were disabled; this implementation always reports
        success. Override in a subclass to implement real checks.
        """
        return []

    def forward(self, *args, **kwargs):
        """Run the wrapped function, retrying with remedy prompts until valid.

        Returns:
            A ``(result, usage)`` tuple; token usage from remedy attempts is
            accumulated into the initial call's usage object.

        Raises:
            ExceptionWithUsage: when all retries are exhausted and the result
                still violates the base class's constraints.
        """
        result, usage = super().forward(*args, **kwargs)

        if self.validation_retry_count > 0:
            # Keep the original task so each remedy prompt can restate it.
            original_task = args[0]

            # One distinct seed per retry, so the same failing input does not
            # deterministically reproduce the same failing remedy output.
            remedy_seeds = self.prepare_seeds(self.validation_retry_count, **kwargs)

            # Validate the result, remedying on failure.
            for i in range(self.validation_retry_count):
                validation_errors = self.validate(result)
                if not validation_errors:
                    break

                for violation in validation_errors:
                    logger.info(f"Validation error: {violation}")
                logger.debug(str(result))

                # Re-issue the task together with the faulty output and errors.
                remedy_task = self.wrap_task(
                    original_task, result.model_dump_json(), validation_errors
                )

                # Attempt to remedy the result with a fresh seed.
                kwargs["seed"] = remedy_seeds[i]
                result, remedy_usage = super().forward(remedy_task, *args[1:], **kwargs)

                # Accumulate token usage across remedy attempts.
                usage.prompt_tokens += remedy_usage.prompt_tokens
                usage.completion_tokens += remedy_usage.completion_tokens
                usage.total_tokens += remedy_usage.total_tokens

            validation_errors = self.check_constraints(result)
            # BUG FIX: the original compared ``i == self.validation_retry_count``,
            # a value range() never yields, so this exception was unreachable.
            if i == self.validation_retry_count - 1 and len(validation_errors) > 0:
                raise ExceptionWithUsage(
                    f"Failed to enforce constraints: {' | '.join(validation_errors)}",
                    usage,
                )

        return result, usage

    def wrap_task(self, task: str, result: str, validation_errors: List[str]) -> str:
        """Build a remedy prompt embedding the task, faulty output, and errors."""
        joined_validation_errors = "\n".join(validation_errors)

        # BUG FIX: corrected "origianl" -> "original" in the prompt text.
        remedy_task = f"""
You had the following task:

[Original Task]
{task}

[Original Output]
{result}

However, the output has the following validation errors:

[Validation Errors]
{joined_validation_errors}

[Task]
Follow the original task but fix the validation errors.
"""

        return remedy_task

    @property
    def static_context(self):
        # System-level instruction injected by the symai Function machinery.
        return (
            "You are an agent for validating 'JSON' schemas and fixing errors."
        )
117+
118+
class LLMDataModel(BaseModel):
    """
    A base class for Pydantic models that provides nicely formatted string output,
    suitable for LLM prompts, with support for nested models, lists, and optional
    section headers.
    """

    # Optional section header rendered as "[[HEADER]]" above top-level models.
    # FIX: annotated Optional[str]; the original declared ``str`` with a None
    # default, which only worked because Pydantic v2 skips default validation.
    section_header: Optional[str] = Field(
        default=None, exclude=True, frozen=True
    )

    def format_field(self, key: str, value: Any, indent: int = 0) -> str:
        """
        Formats a single field for output. Handles nested models, lists, and dictionaries.
        """
        indent_str = " " * indent
        if isinstance(value, LLMDataModel):
            # Nested model: recurse with a deeper indent.
            nested_str = value.__str__(indent + 2).strip()
            return f"{indent_str}{key}:\n{nested_str}" if key else nested_str
        elif isinstance(value, list):
            # List of items (handles nested models inside lists).
            formatted_items = "\n".join(
                f"{indent_str} - {self.format_field('', item, indent).strip()}"
                for item in value
            )
            return f"{indent_str}{key}:\n{formatted_items}" if key else formatted_items
        elif isinstance(value, dict):
            # Dictionary of key-value pairs.
            formatted_items = "\n".join(
                f"{indent_str} {k}: {self.format_field('', v, indent + 4).strip()}"
                for k, v in value.items()
            )
            return f"{indent_str}{key}:\n{formatted_items}" if key else formatted_items
        else:
            # Primitive types.
            return f"{indent_str}{key}: {value}" if key else f"{indent_str}{value}"

    def __str__(self, indent: int = 0) -> str:
        """
        Converts the model into a formatted string for LLM prompts.
        Handles indentation for nested models and includes an optional section header.
        """
        indent_str = " " * indent
        # FIX: access model_fields on the class, not the instance — instance
        # access is deprecated as of Pydantic 2.11.
        fields = "\n".join(
            self.format_field(name, getattr(self, name), indent + 2)
            for name, field in type(self).model_fields.items()
            if (
                getattr(self, name, None) is not None
                and not getattr(field, "exclude", False)
                and not name == "section_header"
            )  # Exclude None values and "exclude" fields
        )
        fields += "\n"  # add line break at the end to separate from the next section

        if self.section_header and indent == 0:
            header = f"{indent_str}[[{self.section_header}]]\n"
            return f"{header}{fields}"
        return fields

‎src/hierarchical.py

+31-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@
77

88
from symai.components import FileReader, Function, ValidatedFunction
99
from symai.core_ext import bind
10+
from functions import LLMDataModel, ResultValidator
11+
12+
13+
class ChunkSummary(LLMDataModel):
    # Prompt-formatted summary of a single document chunk; string rendering
    # comes from LLMDataModel.
    summary: str  # free-text summary of the chunk
    facts: List[str]  # key facts extracted from the chunk
    # NOTE(review): annotated ``str`` but defaults to None — should be
    # Optional[str]; works today only because Pydantic v2 does not validate
    # defaults. Confirm Optional is importable here before changing.
    type: str = None
    section_header: str = "CHUNK SUMMARY"  # rendered as a [[CHUNK SUMMARY]] header
1018

1119

1220
class Summary(BaseModel):
@@ -43,6 +51,10 @@ def __init__(
4351
self.max_output_tokens = max_output_tokens
4452
self.content_types = content_types
4553
self.seed = seed
54+
self.result_validator = ResultValidator(
55+
data_model=Summary,
56+
validation_retry_count=3,
57+
)
4658

4759
file_content = None
4860
file_name = None
@@ -203,20 +215,36 @@ def summarize_chunks(self, chunks):
203215
chunk_summaries = []
204216
chunk_facts = []
205217

206-
for chunk in chunks:
218+
for i, chunk in enumerate(chunks):
219+
# Use ChunkSummary for individual chunk validation
207220
res, usage = super().forward(
208221
chunk,
209222
preview=False,
210223
response_format={"type": "json_object"},
211224
)
225+
226+
# Validate each chunk using LLMDataModel
227+
chunk_summary = ChunkSummary(
228+
summary=res.summary,
229+
facts=res.facts,
230+
type=self._content_type,
231+
)
232+
self.print_verbose(f"Chunk {i+1} Summary:\n{str(chunk_summary)}")
233+
212234
chunk_summaries.append(res.summary)
213235
chunk_facts.extend(res.facts)
214236

237+
# Create final summary
215238
res = Summary(
216239
summary="\n".join(chunk_summaries),
217240
facts=chunk_facts,
241+
type=self._content_type
218242
)
219-
return res, self.compute_required_tokens(res.summary, count_context=False)
243+
244+
# Validate entire summary using ResultValidator
245+
validated_res, validator_usage = self.result_validator(self.prompt)
246+
247+
return validated_res, self.compute_required_tokens(validated_res.summary, count_context=False)
220248

221249
def calculate_chunk_size(self, total_tokens):
222250
num_prompt_tokens = self.compute_required_tokens("", count_context=True)
@@ -328,8 +356,8 @@ def forward(self) -> Summary:
328356
res = Summary(
329357
summary=data,
330358
facts=facts,
359+
type=asset_type,
331360
)
332-
res.type = asset_type
333361
return res, self.get_usage()
334362
else:
335363
asset_type = self.get_asset_type(self.content)

0 commit comments

Comments
 (0)