10 changes: 10 additions & 0 deletions autogpt_platform/backend/Dockerfile
@@ -29,6 +29,9 @@ RUN apt-get update \
libssl-dev \
postgresql-client \
nodejs \
tesseract-ocr \
libtesseract-dev \
tesseract-ocr-eng \
&& rm -rf /var/lib/apt/lists/*

ENV POETRY_HOME=/opt/poetry
@@ -45,6 +48,10 @@ COPY autogpt_platform/backend/poetry.lock autogpt_platform/backend/pyproject.tom
WORKDIR /app/autogpt_platform/backend
RUN poetry install --no-ansi --no-root


# pytesseract
RUN poetry add pytesseract --no-ansi || true

# Generate Prisma client
COPY autogpt_platform/backend/schema.prisma ./
COPY autogpt_platform/backend/backend/data/partial_types.py ./backend/data/partial_types.py
@@ -65,6 +72,9 @@ ENV PATH=/opt/poetry/bin:$PATH
RUN apt-get update && apt-get install -y \
python3.13 \
python3-pip \
tesseract-ocr \
libtesseract-dev \
tesseract-ocr-eng \
&& rm -rf /var/lib/apt/lists/*

# Copy only necessary files from builder
110 changes: 92 additions & 18 deletions autogpt_platform/backend/backend/blocks/llm.py
@@ -9,6 +9,14 @@
from json import JSONDecodeError
from typing import Any, Iterable, List, Literal, NamedTuple, Optional


import pytesseract

Bug: pytesseract is unconditionally imported in llm.py but is missing from pyproject.toml, leading to ModuleNotFoundError at startup.
Severity: CRITICAL | Confidence: 1.00

🔍 Detailed Analysis

The application will crash at startup with a ModuleNotFoundError: No module named 'pytesseract' because pytesseract is imported unconditionally in llm.py at line 13, but it is not declared as a permanent dependency in pyproject.toml. The poetry add pytesseract --no-ansi || true command in the Dockerfile is an unreliable installation method that does not guarantee the dependency is always present, especially in non-Docker environments.

💡 Suggested Fix

Add pytesseract as a formal dependency to pyproject.toml. Remove the unreliable poetry add pytesseract --no-ansi || true from the Dockerfile, allowing Poetry to manage dependencies correctly.

🤖 Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent.
Verify if this is a real issue. If it is, propose a fix; if not, explain why it's not
valid.

Location: autogpt_platform/backend/backend/blocks/llm.py#L13

Potential issue: The application will crash at startup with a `ModuleNotFoundError: No
module named 'pytesseract'` because `pytesseract` is imported unconditionally in
`llm.py` at line 13, but it is not declared as a permanent dependency in
`pyproject.toml`. The `poetry add pytesseract --no-ansi || true` command in the
Dockerfile is an unreliable installation method that does not guarantee the dependency
is always present, especially in non-Docker environments.


Reference_id: 2669854
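
A minimal sketch of an optional-import guard that would avoid the startup crash while pytesseract remains an informal dependency. This is an interim mitigation only, not the suggested fix above (declaring pytesseract under `[tool.poetry.dependencies]` in pyproject.toml); the helper name is illustrative:

```python
# Interim mitigation sketch: guard the import so llm.py still loads when
# pytesseract (or the tesseract-ocr system package) is absent.
try:
    import pytesseract
    from PIL import Image
except ImportError:  # OCR support becomes optional instead of crashing at import time
    pytesseract = None
    Image = None


def ocr_available() -> bool:
    """Report whether OCR can be used in this environment."""
    return pytesseract is not None and Image is not None
```

Call sites would then check `ocr_available()` before the OCR branch and fall back to the plain prompt otherwise.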

from PIL import Image
import requests
import base64
import io


import anthropic
import ollama
import openai
@@ -35,6 +43,7 @@
from backend.util.logging import TruncatedLogger
from backend.util.prompt import compress_prompt, estimate_token_count
from backend.util.text import TextFormatter
from backend.util.type import MediaFileType

logger = TruncatedLogger(logging.getLogger(__name__), "[LLM-Block]")
fmt = TextFormatter(autoescape=False)
@@ -838,6 +847,13 @@ class Input(BlockSchemaInput):
default="localhost:11434",
description="Ollama host for local models",
)
image: MediaFileType | None = SchemaField(
title="Media Input",
default=None,
description=(
"Optional media input file (URL, local path, or base64 data URI)."
),
)

class Output(BlockSchemaOutput):
response: dict[str, Any] | list[dict[str, Any]] = SchemaField(
@@ -918,6 +934,41 @@ async def run(
logger.debug(f"Calling LLM with input data: {input_data}")
prompt = [json.to_dict(p) for p in input_data.conversation_history]

# Process image with OCR if present
if input_data.image:
try:
# Handle different image input formats
if input_data.image.startswith('http'):
# URL image
response = requests.get(input_data.image)
image = Image.open(io.BytesIO(response.content))
elif input_data.image.startswith('data:image'):
# Base64 image
base64_data = re.sub('^data:image/.+;base64,', '', input_data.image)
image_data = base64.b64decode(base64_data)
image = Image.open(io.BytesIO(image_data))
else:
# Local file path
image = Image.open(input_data.image)

# Perform OCR
ocr_text = pytesseract.image_to_string(image)
logger.debug(f"OCR extracted text: {ocr_text}")

# Append OCR text to prompt if text was extracted
if ocr_text.strip():
if input_data.prompt:
input_data.prompt += f"\n\nExtracted text from image:\n{ocr_text}"
else:
input_data.prompt = f"Extracted text from image:\n{ocr_text}"

except Exception as e:
logger.error(f"Error processing image with OCR: {str(e)}")
if input_data.prompt:
input_data.prompt += f"\n\nError processing image: {str(e)}"
else:
input_data.prompt = f"Error processing image: {str(e)}"

values = input_data.prompt_values
if values:
input_data.prompt = fmt.format_string(input_data.prompt, values)
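
The OCR branch above can be read as a standalone helper. The sketch below restates it under the assumption that pytesseract, Pillow, and requests are installed, adding a request timeout and an explicit error on failed downloads; the function name and signature are illustrative, not part of this PR:

```python
import base64
import io
import re

import pytesseract
import requests
from PIL import Image


def extract_image_text(image_ref: str, timeout: float = 10.0) -> str:
    """Run OCR on a URL, base64 data URI, or local file path and return the text."""
    if image_ref.startswith("http"):
        # Remote image: fail fast on slow or broken downloads.
        resp = requests.get(image_ref, timeout=timeout)
        resp.raise_for_status()
        image = Image.open(io.BytesIO(resp.content))
    elif image_ref.startswith("data:image"):
        # Base64 data URI: strip the prefix before decoding.
        b64 = re.sub(r"^data:image/.+;base64,", "", image_ref)
        image = Image.open(io.BytesIO(base64.b64decode(b64)))
    else:
        # Treat anything else as a local file path.
        image = Image.open(image_ref)
    return pytesseract.image_to_string(image).strip()
```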
@@ -1214,52 +1265,60 @@ def trim_prompt(s: str) -> str:
class AITextGeneratorBlock(AIBlockBase):
class Input(BlockSchemaInput):
prompt: str = SchemaField(
description="The prompt to send to the language model. You can use any of the {keys} from Prompt Values to fill in the prompt with values from the prompt values dictionary by putting them in curly braces.",
description="The prompt to send to the language model.",
placeholder="Enter your prompt here...",
)
model: LlmModel = SchemaField(
title="LLM Model",
default=LlmModel.GPT4O,
description="The language model to use for answering the prompt.",
advanced=False,
)
credentials: AICredentials = AICredentialsField()
sys_prompt: str = SchemaField(
title="System Prompt",
default="",
description="The system prompt to provide additional context to the model.",
description="Optional system prompt for additional context.",
)
retry: int = SchemaField(
title="Retry Count",
default=3,
description="Number of times to retry the LLM call if the response does not match the expected format.",
description="Number of times to retry the LLM call if needed.",
)
prompt_values: dict[str, str] = SchemaField(
advanced=False,
default_factory=dict,
description="Values used to fill in the prompt. The values can be used in the prompt by putting them in a double curly braces, e.g. {{variable_name}}.",
description="Values used to fill in the prompt ({{var}} syntax).",
)
ollama_host: str = SchemaField(
advanced=True,
default="localhost:11434",
description="Ollama host for local models",
description="Ollama host for local models.",
)
max_tokens: int | None = SchemaField(
advanced=True,
default=None,
description="The maximum number of tokens to generate in the chat completion.",
description="Maximum number of tokens to generate.",
)
image: MediaFileType | None = SchemaField(
title="Media Input",
default=None,
description=(
"Optional media input file (URL, local path, or base64 data URI)."
),
)

class Output(BlockSchemaOutput):
response: str = SchemaField(
description="The response generated by the language model."
)
prompt: list = SchemaField(description="The prompt sent to the language model.")
prompt: list = SchemaField(
description="The prompt sent to the language model."
)

def __init__(self):
super().__init__(
id="1f292d4a-41a4-4977-9684-7c8d560b9f91",
description="Call a Large Language Model (LLM) to generate a string based on the given prompt.",
description="Generate text using a language model.",
categories={BlockCategory.AI},
input_schema=AITextGeneratorBlock.Input,
output_schema=AITextGeneratorBlock.Output,
@@ -1277,26 +1336,40 @@ def __init__(self):

async def llm_call(
self,
input_data: AIStructuredResponseGeneratorBlock.Input,
credentials: APIKeyCredentials,
) -> dict:
input_data: "AIStructuredResponseGeneratorBlock.Input",
credentials: "APIKeyCredentials",
) -> str:
"""Delegate to structured response block and return only the text string."""
block = AIStructuredResponseGeneratorBlock()
response = await block.run_once(input_data, "response", credentials=credentials)
# Track stats
self.merge_llm_stats(block)

# Return plain string for the response
return response["response"]

async def run(
self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
) -> BlockOutput:
object_input_data = AIStructuredResponseGeneratorBlock.Input(
self,
input_data: Input,
*,
credentials: "APIKeyCredentials",
**kwargs,
) -> "BlockOutput":
"""Run the block and yield outputs for FastAPI and tests."""
# Prepare input for the structured response generator
structured_input = AIStructuredResponseGeneratorBlock.Input(
**{
attr: getattr(input_data, attr)
for attr in AITextGeneratorBlock.Input.model_fields
},
expected_format={},
)
response = await self.llm_call(object_input_data, credentials)
yield "response", response
print(structured_input)
# Call the underlying LLM (mocked in test)
response_text = await self.llm_call(structured_input, credentials)

# Yield outputs
yield "response", response_text
yield "prompt", self.prompt


@@ -1738,7 +1811,7 @@ async def run(
self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
) -> BlockOutput:
logger.debug(f"Starting AIListGeneratorBlock.run with input data: {input_data}")

# Check for API key
api_key_check = credentials.api_key.get_secret_value()
if not api_key_check:
@@ -1780,6 +1853,7 @@ async def run(
|Do not include any explanations or additional text, just respond with the list in the format specified above.
|Do not include code fences or any other formatting, just the raw list.
"""

# If a focus is provided, add it to the prompt
if input_data.focus:
prompt = f"Generate a list with the following focus:\n<focus>\n\n{input_data.focus}</focus>"
66 changes: 66 additions & 0 deletions autogpt_platform/backend/backend/server/routers/v1.py
@@ -23,6 +23,17 @@
Security,
UploadFile,
)

from backend.blocks.llm import (
TEST_CREDENTIALS,
TEST_CREDENTIALS_INPUT,
AIBlockBase,
AICredentials,
AICredentialsField,
LlmModel,
LLMResponse,
llm_call,
)
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel
from starlette.status import HTTP_204_NO_CONTENT, HTTP_404_NOT_FOUND
@@ -63,6 +74,8 @@
update_user_notification_preference,
update_user_timezone,
)

from backend.blocks.llm import AITextGeneratorBlock
from backend.executor import scheduler
from backend.executor import utils as execution_utils
from backend.integrations.webhooks.graph_lifecycle_hooks import (
@@ -1358,3 +1371,56 @@ async def update_permissions(
return await api_key_db.update_api_key_permissions(
key_id, user_id, request.permissions
)


# @v1_router.post(
# "/VLAD/TEST",
# summary="Run AITextGeneratorBlock with prompt via Ollama",
# tags=["AI"],
# )
# async def llm_call_endpoint(
# body: dict = Body(
# ...,
# example={
# "prompt": "Write a poem about the ocean.",
# "sys_prompt": "You are a helpful AI assistant.",
# "credentials": TEST_CREDENTIALS_INPUT,
# "model": "llama3.2",
# "retry": 2,
# "prompt_values": {"mood": "calm"},
# "ollama_host": "http://host.docker.internal:11434",
# "max_tokens": 512,
# "image": "https://marketplace.canva.com/EAGZEaj1Dl0/1/0/1280w/canva-beige-aesthetic-motivational-quote-instagram-post-WAe2YFurmmg.jpg"
# },
# ),
# ):
# """Endpoint to run the AITextGeneratorBlock using Ollama."""
# print("➡️ Entered /VLAD/TEST endpoint")
# print(body["image"])
# ai_block = AITextGeneratorBlock()
# print('passed ai block')
# # Validate and create input schema
# try:
# input_data = ai_block.Input(**body)
# print('passed input data')
# except Exception as e:
# print("❌ Validation error:", e.errors())
# raise HTTPException(status_code=422, detail=e.errors())

# # print ports and what is running on them


# # Run the block asynchronously
# output = {}
# async for key, value in ai_block.run(input_data, credentials=None):
# output[key] = value

# print("✅ Block finished executing")

# return {
# "llm_response": output.get("response"),
# "prompt_used": output.get("prompt"),
# "ollama_host": input_data.ollama_host,
# "stats": getattr(ai_block, "execution_stats", None),
# }

2 changes: 2 additions & 0 deletions autogpt_platform/docker-compose.yml
@@ -46,6 +46,8 @@ services:
extends:
file: ./docker-compose.platform.yml
service: rest_server
# extra_hosts:
# - "host.docker.internal:host-gateway"

executor:
<<: *agpt-services