Skip to content

Commit

Permalink
Merge pull request #56 from enoch3712/55-vision-without-documentloade…
Browse files Browse the repository at this point in the history
…r-is-not-working

55 vision without documentloader is not working
  • Loading branch information
enoch3712 authored Nov 4, 2024
2 parents b317c8c + d2abd28 commit b237386
Show file tree
Hide file tree
Showing 14 changed files with 1,811 additions and 1,459 deletions.
68 changes: 24 additions & 44 deletions extract_thinker/document_loader/document_loader_llm_image.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,34 @@
from abc import ABC
from io import BytesIO
from typing import Any, List, Union
from PIL import Image
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from extract_thinker.utils import extract_json


class DocumentLoaderLLMImage(CachedDocumentLoader, ABC):
    """
    Document loader that exposes a document as images for a vision-capable LLM.

    The ``load_content_*`` methods return a list with one ``{"image": BytesIO}``
    entry per image produced by ``convert_to_images``. ``extract_image_content``
    sends a single image to the configured LLM and returns the JSON extracted
    from its reply.

    NOTE(review): ``convert_to_images`` and ``encode_image`` are called on
    ``self`` but are not defined in this class; they are presumably provided
    by ``CachedDocumentLoader`` or one of its bases -- confirm.
    """

    SUPPORTED_FORMATS = ['pdf', 'jpg', 'jpeg', 'png']

    def __init__(self, content=None, cache_ttl=300, llm=None):
        """
        Args:
            content: Forwarded unchanged to the parent loader.
            cache_ttl: Forwarded unchanged to the parent loader.
            llm: Client used by ``extract_image_content``; it must expose a
                ``completion(model=..., messages=...)`` method.
        """
        super().__init__(content, cache_ttl)
        self.llm = llm

    def extract_image_content(self, image_stream: BytesIO) -> str:
        """
        Extract data from a single image by asking the LLM for a JSON summary.

        Args:
            image_stream: Stream containing the image; opened with PIL.

        Returns:
            The result of ``extract_json`` applied to the LLM's reply text.

        Raises:
            ValueError: If no LLM was supplied to the constructor.
        """
        # Fail early with a clear message instead of an AttributeError on None.
        if self.llm is None:
            raise ValueError("An LLM is required to extract content from an image.")

        # Load the image from the stream and encode it to base64
        image = Image.open(image_stream)
        base64_image = self.encode_image(image)

        # Use the LLM to extract the content from the image
        resp = self.llm.completion(
            model="claude-3-sonnet-20240229",
            messages=[
                {
                    "role": "system",
                    "content": 'You are a worldclass Image data extractor. You receive an image and extract useful information from it. You output a JSON with the extracted information.',
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": "data:image/jpeg;base64," + base64_image
                            },
                        },
                        {"type": "text", "text": "###JSON Output\n"},
                    ],
                },
            ],
        )

        # The reply may wrap the JSON in extra text; keep only the JSON part.
        response_text = resp.choices[0].message.content
        return extract_json(response_text)

    def _images_to_content(self, images) -> List[Any]:
        """Wrap each image's raw bytes in a fresh BytesIO under the "image" key."""
        return [{"image": BytesIO(image_bytes)} for image_bytes in images.values()]

    def load_content_from_file(self, file_path: str) -> Union[str, object]:
        """Convert the file at ``file_path`` to images and return them as a list of ``{"image": BytesIO}``."""
        return self._images_to_content(self.convert_to_images(file_path))

    def load_content_from_stream(self, stream: BytesIO) -> Union[str, object]:
        """Convert ``stream`` to images and return them as a list of ``{"image": BytesIO}``."""
        return self._images_to_content(self.convert_to_images(stream))

    def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
        """List-returning variant; identical to ``load_content_from_stream``."""
        return self.load_content_from_stream(stream)

    def load_content_from_file_list(self, file_path: str) -> List[Any]:
        """List-returning variant; identical to ``load_content_from_file``."""
        return self.load_content_from_file(file_path)
5 changes: 2 additions & 3 deletions extract_thinker/document_loader/document_loader_pypdf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import io
from typing import Any, Dict, List, Union
from PyPDF2 import PdfReader
from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage
from extract_thinker.utils import get_file_extension

from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
SUPPORTED_FORMATS = ['pdf']

class DocumentLoaderPyPdf(DocumentLoaderLLMImage):
class DocumentLoaderPyPdf(CachedDocumentLoader):
def __init__(self, content: Any = None, cache_ttl: int = 300):
super().__init__(content, cache_ttl)

Expand Down
34 changes: 22 additions & 12 deletions extract_thinker/document_loader/document_loader_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,15 @@ def process_pdf(self, stream: BytesIO) -> str:

extracted_text = []
for page_number, image_bytes in images.items():
# Check if image_bytes is not empty and has the expected structure
# if not image_bytes or not isinstance(image_bytes, (list, tuple)):
# print(f"Skipping page {page_number}: Invalid image data")
# continue

# image = BytesIO(image_bytes[0])
text = self.process_image(image_bytes)
# Convert image data to proper BytesIO stream
if isinstance(image_bytes, bytes):
image_stream = BytesIO(image_bytes)
elif isinstance(image_bytes, BytesIO):
image_stream = image_bytes
else:
raise ValueError(f"Unexpected image data type for page {page_number}: {type(image_bytes)}")

text = self.process_image(image_stream)
extracted_text.append(text)

if not extracted_text:
Expand All @@ -117,14 +119,22 @@ def process_image(self, image: BytesIO) -> str:

def worker(self, input_queue: Queue, output_queue: Queue):
while True:
image = input_queue.get()
if image is None: # Sentinel to indicate shutdown
image_data = input_queue.get()
if image_data is None: # Sentinel to indicate shutdown
break
try:
text = self.process_image(image)
output_queue.put((image, text))
# Convert bytes to BytesIO if needed
if isinstance(image_data, bytes):
image_stream = BytesIO(image_data)
elif isinstance(image_data, BytesIO):
image_stream = image_data
else:
raise ValueError(f"Unexpected image data type: {type(image_data)}")

text = self.process_image(image_stream)
output_queue.put((image_data, text))
except Exception as e:
output_queue.put((image, str(e)))
output_queue.put((image_data, str(e)))
input_queue.task_done()

@cachedmethod(cache=attrgetter('cache'), key=lambda self, stream: hashkey(id(stream)))
Expand Down
108 changes: 72 additions & 36 deletions extract_thinker/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import litellm
from pydantic import BaseModel
from extract_thinker.document_loader.document_loader import DocumentLoader
from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage
from extract_thinker.models.classification import Classification
from extract_thinker.models.classification_response import ClassificationResponse
from extract_thinker.llm import LLM
Expand Down Expand Up @@ -84,6 +85,12 @@ def extract(
if not issubclass(response_model, BaseModel):
raise ValueError("response_model must be a subclass of Pydantic's BaseModel.")

if vision and not self.get_document_loader_for_file(source):
if not litellm.supports_vision(self.llm.model):
raise ValueError(f"Model {self.llm.model} does not support vision. Please provide a document loader or a model that supports vision.")
else:
self.document_loader = DocumentLoaderLLMImage(llm=self.llm)

if isinstance(source, str):
if os.path.exists(source):
return self.extract_from_file(source, response_model, vision)
Expand Down Expand Up @@ -449,55 +456,84 @@ def _extract(
for interceptor in self.llm_interceptors:
interceptor.intercept(self.llm)

messages = [
{
"role": "system",
"content": "You are a server API that receives document information "
"and returns specific fields in JSON format.",
},
]

if self.extra_content is not None:
if isinstance(self.extra_content, dict):
self.extra_content = yaml.dump(self.extra_content)
messages.append(
{
"role": "user",
"content": "##Extra Content\n\n" + self.extra_content,
}
)

if content is not None:
if isinstance(content, dict):
if content.get("is_spreadsheet", False):
content = json_to_formatted_string(content.get("data", {}))
content = yaml.dump(content, default_flow_style=True)
messages.append(
{"role": "user", "content": "##Content\n\n" + content}
)

if vision:
if not litellm.supports_vision(model=self.llm.model):
raise ValueError(
f"Model {self.llm.model} is not supported for vision, since it's not a vision model."
)

base64_encoded_image = encode_image(file_or_stream, is_stream)
# Initialize the content list for the message
message_content = []

# Add text content if it exists
if isinstance(content, str):
message_content.append({
"type": "text",
"text": content
})

# Add images
if isinstance(content, list): # Assuming content is a list of dicts with 'image' key
for page in content:
if 'image' in page:
base64_image = encode_image(page['image'])
message_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
})

# Create the messages array with the correct structure
messages = [
{
"role": "system",
"content": "You are a server API that receives document information and returns specific fields in JSON format.",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "data:image/jpeg;base64," + base64_encoded_image
},
},
],
"content": message_content
}
]

# Add extra content if it exists
if self.extra_content is not None:
if isinstance(self.extra_content, dict):
self.extra_content = yaml.dump(self.extra_content)
messages.insert(1, {
"role": "user",
"content": [{"type": "text", "text": "##Extra Content\n\n" + self.extra_content}]
})

else:
# Non-vision logic remains the same
messages = [
{
"role": "system",
"content": "You are a server API that receives document information and returns specific fields in JSON format.",
},
]

if self.extra_content is not None:
if isinstance(self.extra_content, dict):
self.extra_content = yaml.dump(self.extra_content)
messages.append(
{
"role": "user",
"content": "##Extra Content\n\n" + self.extra_content,
}
)

if content is not None:
if isinstance(content, dict):
if content.get("is_spreadsheet", False):
content = json_to_formatted_string(content.get("data", {}))
content = yaml.dump(content, default_flow_style=True)
if isinstance(content, str):
messages.append(
{"role": "user", "content": "##Content\n\n" + content}
)

if self.llm.token_limit:
max_tokens_per_request = self.llm.token_limit - 1000
content_tokens = num_tokens_from_string(str(content))
Expand Down
31 changes: 28 additions & 3 deletions extract_thinker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,34 @@
from io import BytesIO
from typing import Union

def encode_image(image_path):
    """Return the base64 text for the contents of the file at ``image_path``."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def encode_image(image_source: Union[str, BytesIO]) -> str:
    """
    Encode an image to a base64 string from either a file path or a BytesIO stream.

    For a stream, the whole buffer is encoded (starting at offset 0) and the
    caller's read position is restored afterwards, even if reading fails.

    Args:
        image_source (Union[str, BytesIO]): The image source, either a file
            path or an in-memory stream holding the image bytes.

    Returns:
        str: Base64 encoded content of the image, as UTF-8 text.

    Raises:
        Exception: If the source type is unsupported or the data cannot be
            read. The message is prefixed with "Failed to encode image: " and
            the underlying error is attached as ``__cause__``.
    """
    try:
        if isinstance(image_source, str):
            with open(image_source, "rb") as image_file:
                raw = image_file.read()
        elif isinstance(image_source, BytesIO):
            original_position = image_source.tell()
            try:
                image_source.seek(0)
                raw = image_source.read()
            finally:
                # Leave the stream where the caller had it, success or not.
                image_source.seek(original_position)
        else:
            raise ValueError("Image source must be either a file path (str) or BytesIO stream")
        return base64.b64encode(raw).decode("utf-8")
    except Exception as e:
        # Keep the original exception type and message for existing callers,
        # but chain the cause so the real failure stays visible in tracebacks.
        raise Exception(f"Failed to encode image: {e}") from e

def is_pdf_stream(stream: Union[BytesIO, str]) -> bool:
"""
Expand Down
Loading

0 comments on commit b237386

Please sign in to comment.