Commit b237386

Merge pull request #56 from enoch3712/55-vision-without-documentloader-is-not-working
55 vision without documentloader is not working
2 parents b317c8c + d2abd28 · commit b237386

File tree: 14 files changed (+1811 / -1459 lines)

extract_thinker/document_loader/document_loader_llm_image.py

Lines changed: 24 additions & 44 deletions
@@ -1,54 +1,34 @@
 from abc import ABC
 from io import BytesIO
+from typing import Any, List, Union
 from PIL import Image
 from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
-from extract_thinker.utils import extract_json
-
 
 class DocumentLoaderLLMImage(CachedDocumentLoader, ABC):
+    SUPPORTED_FORMATS = ['pdf', 'jpg', 'jpeg', 'png']
+
     def __init__(self, content=None, cache_ttl=300, llm=None):
         super().__init__(content, cache_ttl)
         self.llm = llm
 
-    def extract_image_content(self, image_stream: BytesIO) -> str:
-        """
-        Extracts text or data from an image using an LLM.
-        The actual implementation uses an LLM to process the image content.
-        """
-        # Load the image from the stream
-        image = Image.open(image_stream)
-
-        # Encode the image to base64
-        base64_image = self.encode_image(image)
-
-        # Use the LLM to extract the content from the image
-        resp = self.llm.completion(
-            model="claude-3-sonnet-20240229",
-            messages=[
-                {
-                    "role": "system",
-                    "content": 'You are a worldclass Image data extractor. You receive an image and extract useful information from it. You output a JSON with the extracted information.',
-                },
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": "data:image/jpeg;base64," + base64_image
-                            },
-                        },
-                        {"type": "text", "text": "###JSON Output\n"},
-                    ],
-                },
-            ],
-        )
-
-        # Extract the JSON text from the response
-        jsonText = resp.choices[0].message.content
-
-        # Extract the JSON from the text
-        jsonText = extract_json(jsonText)
-
-        # Return the extracted content
-        return jsonText
+    def load_content_from_file(self, file_path: str) -> Union[str, object]:
+        images = self.convert_to_images(file_path)
+        results = []
+        for _, image_bytes in images.items():
+            image_stream = BytesIO(image_bytes)
+            results.append({"image": image_stream})
+        return results
+
+    def load_content_from_stream(self, stream: BytesIO) -> Union[str, object]:
+        images = self.convert_to_images(stream)
+        results = []
+        for _, image_bytes in images.items():
+            image_stream = BytesIO(image_bytes)
+            results.append({"image": image_stream})
+        return results
+
+    def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
+        return self.load_content_from_stream(stream)
+
+    def load_content_from_file_list(self, file_path: str) -> List[Any]:
+        return self.load_content_from_file(file_path)
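
The net effect of this rewrite is that the loader no longer calls an LLM itself: it only renders pages and hands them to the extractor as a list of {"image": BytesIO} dicts. A hedged sketch of that hand-off (the sample byte strings are placeholders, and encode_image accepting a BytesIO relies on the utils.py change later in this commit):

from io import BytesIO
from extract_thinker.utils import encode_image

# One {"image": BytesIO} dict per rendered page, as returned by
# load_content_from_file / load_content_from_stream above.
pages = [
    {"image": BytesIO(b"raw bytes of page 1 rendered as an image")},
    {"image": BytesIO(b"raw bytes of page 2 rendered as an image")},
]

# This mirrors how the vision branch in extractor.py (further down) consumes it.
message_content = []
for page in pages:
    if "image" in page:
        base64_image = encode_image(page["image"])
        message_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
        })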

extract_thinker/document_loader/document_loader_pypdf.py

Lines changed: 2 additions & 3 deletions
@@ -1,12 +1,11 @@
 import io
 from typing import Any, Dict, List, Union
 from PyPDF2 import PdfReader
-from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage
 from extract_thinker.utils import get_file_extension
-
+from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
 SUPPORTED_FORMATS = ['pdf']
 
-class DocumentLoaderPyPdf(DocumentLoaderLLMImage):
+class DocumentLoaderPyPdf(CachedDocumentLoader):
     def __init__(self, content: Any = None, cache_ttl: int = 300):
         super().__init__(content, cache_ttl)

extract_thinker/document_loader/document_loader_tesseract.py

Lines changed: 22 additions & 12 deletions
@@ -84,13 +84,15 @@ def process_pdf(self, stream: BytesIO) -> str:
 
         extracted_text = []
         for page_number, image_bytes in images.items():
-            # Check if image_bytes is not empty and has the expected structure
-            # if not image_bytes or not isinstance(image_bytes, (list, tuple)):
-            #     print(f"Skipping page {page_number}: Invalid image data")
-            #     continue
-
-            # image = BytesIO(image_bytes[0])
-            text = self.process_image(image_bytes)
+            # Convert image data to proper BytesIO stream
+            if isinstance(image_bytes, bytes):
+                image_stream = BytesIO(image_bytes)
+            elif isinstance(image_bytes, BytesIO):
+                image_stream = image_bytes
+            else:
+                raise ValueError(f"Unexpected image data type for page {page_number}: {type(image_bytes)}")
+
+            text = self.process_image(image_stream)
             extracted_text.append(text)
 
         if not extracted_text:
@@ -117,14 +119,22 @@ def process_image(self, image: BytesIO) -> str:
 
     def worker(self, input_queue: Queue, output_queue: Queue):
         while True:
-            image = input_queue.get()
-            if image is None: # Sentinel to indicate shutdown
+            image_data = input_queue.get()
+            if image_data is None: # Sentinel to indicate shutdown
                 break
             try:
-                text = self.process_image(image)
-                output_queue.put((image, text))
+                # Convert bytes to BytesIO if needed
+                if isinstance(image_data, bytes):
+                    image_stream = BytesIO(image_data)
+                elif isinstance(image_data, BytesIO):
+                    image_stream = image_data
+                else:
+                    raise ValueError(f"Unexpected image data type: {type(image_data)}")
+
+                text = self.process_image(image_stream)
+                output_queue.put((image_data, text))
             except Exception as e:
-                output_queue.put((image, str(e)))
+                output_queue.put((image_data, str(e)))
             input_queue.task_done()
 
     @cachedmethod(cache=attrgetter('cache'), key=lambda self, stream: hashkey(id(stream)))
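
Both hunks apply the same defensive conversion before handing data to process_image. As a standalone sketch of the pattern (the helper name _to_stream is illustrative, not part of this commit):

from io import BytesIO
from typing import Union

def _to_stream(image_data: Union[bytes, BytesIO]) -> BytesIO:
    # Accept raw bytes or an existing stream; fail fast on anything else.
    if isinstance(image_data, bytes):
        return BytesIO(image_data)
    if isinstance(image_data, BytesIO):
        return image_data
    raise ValueError(f"Unexpected image data type: {type(image_data)}")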

extract_thinker/extractor.py

Lines changed: 72 additions & 36 deletions
@@ -6,6 +6,7 @@
 import litellm
 from pydantic import BaseModel
 from extract_thinker.document_loader.document_loader import DocumentLoader
+from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage
 from extract_thinker.models.classification import Classification
 from extract_thinker.models.classification_response import ClassificationResponse
 from extract_thinker.llm import LLM
@@ -84,6 +85,12 @@ def extract(
         if not issubclass(response_model, BaseModel):
             raise ValueError("response_model must be a subclass of Pydantic's BaseModel.")
 
+        if vision and not self.get_document_loader_for_file(source):
+            if not litellm.supports_vision(self.llm.model):
+                raise ValueError(f"Model {self.llm.model} does not support vision. Please provide a document loader or a model that supports vision.")
+            else:
+                self.document_loader = DocumentLoaderLLMImage(llm=self.llm)
+
         if isinstance(source, str):
             if os.path.exists(source):
                 return self.extract_from_file(source, response_model, vision)
@@ -449,55 +456,84 @@ def _extract(
         for interceptor in self.llm_interceptors:
             interceptor.intercept(self.llm)
 
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a server API that receives document information "
-                "and returns specific fields in JSON format.",
-            },
-        ]
-
-        if self.extra_content is not None:
-            if isinstance(self.extra_content, dict):
-                self.extra_content = yaml.dump(self.extra_content)
-            messages.append(
-                {
-                    "role": "user",
-                    "content": "##Extra Content\n\n" + self.extra_content,
-                }
-            )
-
-        if content is not None:
-            if isinstance(content, dict):
-                if content.get("is_spreadsheet", False):
-                    content = json_to_formatted_string(content.get("data", {}))
-                content = yaml.dump(content, default_flow_style=True)
-            messages.append(
-                {"role": "user", "content": "##Content\n\n" + content}
-            )
-
         if vision:
             if not litellm.supports_vision(model=self.llm.model):
                 raise ValueError(
                     f"Model {self.llm.model} is not supported for vision, since it's not a vision model."
                 )
 
-            base64_encoded_image = encode_image(file_or_stream, is_stream)
+            # Initialize the content list for the message
+            message_content = []
+
+            # Add text content if it exists
+            if isinstance(content, str):
+                message_content.append({
+                    "type": "text",
+                    "text": content
+                })
+
+            # Add images
+            if isinstance(content, list): # Assuming content is a list of dicts with 'image' key
+                for page in content:
+                    if 'image' in page:
+                        base64_image = encode_image(page['image'])
+                        message_content.append({
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            }
+                        })
 
+            # Create the messages array with the correct structure
             messages = [
+                {
+                    "role": "system",
+                    "content": "You are a server API that receives document information and returns specific fields in JSON format.",
+                },
                 {
                     "role": "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": "data:image/jpeg;base64," + base64_encoded_image
-                            },
-                        },
-                    ],
+                    "content": message_content
                 }
             ]
 
+            # Add extra content if it exists
+            if self.extra_content is not None:
+                if isinstance(self.extra_content, dict):
+                    self.extra_content = yaml.dump(self.extra_content)
+                messages.insert(1, {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "##Extra Content\n\n" + self.extra_content}]
+                })
+
+        else:
+            # Non-vision logic remains the same
+            messages = [
+                {
+                    "role": "system",
+                    "content": "You are a server API that receives document information and returns specific fields in JSON format.",
+                },
+            ]
+
+            if self.extra_content is not None:
+                if isinstance(self.extra_content, dict):
+                    self.extra_content = yaml.dump(self.extra_content)
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": "##Extra Content\n\n" + self.extra_content,
+                    }
+                )
+
+            if content is not None:
+                if isinstance(content, dict):
+                    if content.get("is_spreadsheet", False):
+                        content = json_to_formatted_string(content.get("data", {}))
+                    content = yaml.dump(content, default_flow_style=True)
+                if isinstance(content, str):
+                    messages.append(
+                        {"role": "user", "content": "##Content\n\n" + content}
+                    )
+
         if self.llm.token_limit:
             max_tokens_per_request = self.llm.token_limit - 1000
             content_tokens = num_tokens_from_string(str(content))
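
Taken together with the loader changes, this is the behaviour PR #56 enables: calling extract() with vision=True and no document loader attached. A hedged usage sketch, assuming extract_thinker's usual Extractor API (load_llm, extract); the model name, file name, and contract fields are placeholders:

from pydantic import BaseModel
from extract_thinker import Extractor

class InvoiceContract(BaseModel):
    invoice_number: str
    total_amount: float

extractor = Extractor()
extractor.load_llm("gpt-4o")  # assumed vision-capable, so litellm.supports_vision() passes

# No load_document_loader() call: extract() now falls back to
# DocumentLoaderLLMImage(llm=self.llm) instead of failing.
result = extractor.extract("invoice.png", InvoiceContract, vision=True)
print(result.invoice_number, result.total_amount)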

extract_thinker/utils.py

Lines changed: 28 additions & 3 deletions
@@ -10,9 +10,34 @@
 from io import BytesIO
 from typing import Union
 
-def encode_image(image_path):
-    with open(image_path, "rb") as image_file:
-        return base64.b64encode(image_file.read()).decode("utf-8")
+def encode_image(image_source: Union[str, BytesIO]) -> str:
+    """
+    Encode an image to base64 string from either a file path or BytesIO stream.
+
+    Args:
+        image_source (Union[str, BytesIO]): The image source, either a file path or BytesIO stream
+
+    Returns:
+        str: Base64 encoded string of the image
+    """
+    try:
+        if isinstance(image_source, str):
+            with open(image_source, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        elif isinstance(image_source, BytesIO):
+            # Save current position
+            current_position = image_source.tell()
+            # Move to start of stream
+            image_source.seek(0)
+            # Encode stream content
+            encoded = base64.b64encode(image_source.read()).decode("utf-8")
+            # Restore original position
+            image_source.seek(current_position)
+            return encoded
+        else:
+            raise ValueError("Image source must be either a file path (str) or BytesIO stream")
+    except Exception as e:
+        raise Exception(f"Failed to encode image: {str(e)}")
 
 def is_pdf_stream(stream: Union[BytesIO, str]) -> bool:
     """

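A short usage sketch of the reworked encode_image (the file name is a placeholder): both call styles return the same base64 string, and a stream's read position is saved and restored around the encode.

from io import BytesIO
from extract_thinker.utils import encode_image

# From a file path (unchanged behaviour)
b64_from_path = encode_image("invoice.jpg")

# From an in-memory stream (new): the function seeks to 0, reads,
# then restores the previous position.
with open("invoice.jpg", "rb") as f:
    stream = BytesIO(f.read())
stream.seek(5)  # pretend a consumer is mid-read
b64_from_stream = encode_image(stream)
assert stream.tell() == 5
assert b64_from_path == b64_from_stream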