Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,7 @@ jobs:
env:
ANTHROPIC_API_KEY: "ci_dummy_key"
run: uv run pytest tests/

# --- 8. Smoke test cli.py
- name: Run CLI Smoke Test (--help)
run: uv run python cli.py --help
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
omnimcp.egg-info/
omnimcp.log
__pycache__
runs/
logs/
images/*/
67 changes: 44 additions & 23 deletions cli.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
# cli.py

"""
Expand All @@ -10,20 +11,7 @@

import fire

# Import necessary components from the project
from omnimcp.agent_executor import AgentExecutor
from omnimcp.config import config
from omnimcp.core import plan_action_for_ui
from omnimcp.input import InputController, _pynput_error # Check pynput import status
from omnimcp.omniparser.client import OmniParserClient
from omnimcp.omnimcp import VisualState
from omnimcp.utils import (
logger,
draw_bounding_boxes,
draw_action_highlight,
NSScreen, # Check for AppKit on macOS
)

from omnimcp.utils import logger

# Default configuration
DEFAULT_OUTPUT_DIR = "runs"
Expand All @@ -35,6 +23,7 @@ def run(
goal: str = DEFAULT_GOAL,
max_steps: int = DEFAULT_MAX_STEPS,
output_dir: str = DEFAULT_OUTPUT_DIR,
ci_mode: bool = False,
):
"""
Runs the OmniMCP agent to achieve a specified goal.
Expand All @@ -43,9 +32,34 @@ def run(
goal: The natural language goal for the agent.
max_steps: Maximum number of steps to attempt.
output_dir: Base directory to save run artifacts (timestamped subdirs).
ci_mode: Run in CI mode (skips API validation and actual execution).
"""
# --- Initial Checks ---
logger.info("--- OmniMCP CLI ---")

# Skip import-time checks if we're in CI mode
if ci_mode:
logger.info("Running in CI mode - skipping credential checks and execution")
return 0

# Delay imports to avoid credential checks at import time
try:
# Import necessary components from the project
from omnimcp.config import config
from omnimcp.input import InputController, _pynput_error
from omnimcp.agent_executor import AgentExecutor
from omnimcp.core import plan_action_for_ui
from omnimcp.omniparser.client import OmniParserClient
from omnimcp.visual_state import VisualState
from omnimcp.utils import (
draw_bounding_boxes,
draw_action_highlight,
NSScreen, # Check for AppKit on macOS
)
except ImportError as e:
logger.critical(f"Required dependency not found: {e}")
return 1

logger.info("Performing initial checks...")
success = True

Expand Down Expand Up @@ -84,7 +98,7 @@ def run(

if not success:
logger.error("Prerequisite checks failed. Exiting.")
sys.exit(1)
return 1

# --- Component Initialization ---
logger.info("\nInitializing components...")
Expand Down Expand Up @@ -116,10 +130,10 @@ def run(
logger.critical(
" Ensure all requirements are installed (`uv pip install -e .`)"
)
sys.exit(1)
return 1
except Exception as e:
logger.critical(f"❌ Component initialization failed: {e}", exc_info=True)
sys.exit(1)
return 1

# --- Agent Executor Initialization ---
logger.info("\nInitializing Agent Executor...")
Expand All @@ -134,7 +148,7 @@ def run(
logger.success("✅ Agent Executor initialized successfully.")
except Exception as e:
logger.critical(f"❌ Agent Executor initialization failed: {e}", exc_info=True)
sys.exit(1)
return 1

# --- User Confirmation & Start ---
print("\n" + "=" * 60)
Expand All @@ -159,13 +173,13 @@ def run(
)
except KeyboardInterrupt:
logger.warning("\nExecution interrupted by user (Ctrl+C).")
sys.exit(1)
return 1
except Exception as run_e:
logger.critical(
f"\nAn unexpected error occurred during the agent run: {run_e}",
exc_info=True,
)
sys.exit(1)
return 1
finally:
# Optional: Add cleanup here if needed (e.g., stopping parser server)
logger.info(
Expand All @@ -176,13 +190,20 @@ def run(
# --- Exit ---
if overall_success:
logger.success("\nAgent run finished successfully (goal achieved).")
sys.exit(0)
return 0
else:
logger.error(
"\nAgent run finished unsuccessfully (goal not achieved or error occurred)."
)
sys.exit(1)
return 1


def main():
    """Script entry point: dispatch to ``run`` through Fire.

    Fire hands back whatever ``run`` returned. When that value is an
    integer it is used as the process exit status via ``sys.exit``;
    any other value lets this function return normally.
    """
    exit_code = fire.Fire(run)
    if not isinstance(exit_code, int):
        # Nothing usable as an exit status; fall through to a normal return.
        return
    sys.exit(exit_code)


if __name__ == "__main__":
fire.Fire(run)
main()
51 changes: 36 additions & 15 deletions omnimcp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,45 @@
# omnimcp/__init__.py

import sys
import os

from loguru import logger

from omnimcp.config import config

log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
# Define file path using a format string recognized by loguru's sink
log_file_path = os.path.join(log_dir, "run_{time:YYYY-MM-DD_HH-mm-ss}.log")
# Remove default handler
logger.remove()

logger.remove() # Remove default handler to configure levels precisely
# Log INFO and above to stderr
# Add stderr handler (keep this functionality)
logger.add(sys.stderr, level=config.LOG_LEVEL.upper() if config.LOG_LEVEL else "INFO")
# Log DEBUG and above to a rotating file
logger.add(
log_file_path, rotation="50 MB", level="DEBUG", encoding="utf8", enqueue=True
) # enqueue for async safety

logger.info("Logger configured.")
# You might want to set LOG_LEVEL=DEBUG in your .env file now

# Define a function to configure run-specific logging
def setup_run_logging(run_dir=None):
    """
    Attach an extra DEBUG-level file sink to the shared logger.

    Args:
        run_dir: Directory that should receive the log for one run. When
            given, the log is written to ``run.log`` inside it. When falsy,
            the log goes to ``config.LOG_DIR`` (or ``"logs"`` if that is
            unset) under a time-stamped file name.

    Returns:
        The path string that was handed to ``logger.add``. In the default
        case this is the template containing the ``{time:...}`` placeholder,
        not an expanded file name.
    """
    # Pick the target directory and file name for the new sink.
    if not run_dir:
        target_dir = config.LOG_DIR or "logs"
        # Format string recognized by loguru's sink.
        file_name = "run_{time:YYYY-MM-DD_HH-mm-ss}.log"
    else:
        target_dir = run_dir
        file_name = "run.log"

    os.makedirs(target_dir, exist_ok=True)
    sink_path = os.path.join(target_dir, file_name)

    # Register the file handler alongside whatever handlers already exist.
    logger.add(
        sink_path,
        rotation="50 MB",
        level="DEBUG",
        encoding="utf8",
        enqueue=True,
    )

    logger.info(f"Run logging configured. Log path: {sink_path}")
    return sink_path


# Set up default logging (for non-run use)
if not config.DISABLE_DEFAULT_LOGGING:
setup_run_logging()
28 changes: 23 additions & 5 deletions omnimcp/agent_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from PIL import Image


# Used for type hinting if Protocol is simple:
from .types import LLMActionPlan, UIElement
from .utils import (
from omnimcp import config, setup_run_logging
from omnimcp.types import LLMActionPlan, UIElement
from omnimcp.utils import (
denormalize_coordinates,
draw_action_highlight,
draw_bounding_boxes,
Expand Down Expand Up @@ -194,10 +194,16 @@ def _execute_scroll(

# Comparison Note:
# This `run` method implements an explicit, sequential perceive-plan-act loop.
# Alternative agent architectures exist... (rest of comment remains same)
# Alternative agent architectures exist, such as:
# - ReAct (Reasoning-Acting): Where the LLM explicitly decides between
# reasoning steps and action steps.
# - Callback-driven: Where UI events or timers might trigger agent actions.
# - More complex state machines or graph-based execution flows.
# This simple sequential loop provides a clear baseline. Future work might explore
# these alternatives for more complex or reactive tasks.

def run(
self, goal: str, max_steps: int = 10, output_base_dir: str = "runs"
self, goal: str, max_steps: int = 10, output_base_dir: Optional[str] = None
) -> bool:
"""
Runs the main perceive-plan-act loop to achieve the goal.
Expand All @@ -206,16 +212,28 @@ def run(
goal: The natural language goal for the agent.
max_steps: Maximum number of steps to attempt.
output_base_dir: Base directory to save run artifacts (timestamped).
If None, uses config.RUN_OUTPUT_DIR.

Returns:
True if the goal was achieved, False otherwise (error or max steps reached).
"""

# Use configured output dir if none provided
if output_base_dir is None:
output_base_dir = config.RUN_OUTPUT_DIR

run_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
run_output_dir = os.path.join(output_base_dir, run_timestamp)

try:
os.makedirs(run_output_dir, exist_ok=True)

# Configure run-specific logging
log_path = setup_run_logging(run_output_dir)

logger.info(f"Starting agent run. Goal: '{goal}'")
logger.info(f"Saving outputs to: {run_output_dir}")
logger.info(f"Run log file: {log_path}")
except OSError as e:
logger.error(f"Failed to create output directory {run_output_dir}: {e}")
return False
Expand Down
2 changes: 1 addition & 1 deletion omnimcp/completions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
)

MAX_RETRIES = 3
DEFAULT_MODEL = "claude-3-7-sonnet-20250219"
DEFAULT_MODEL = config.ANTHROPIC_DEFAULT_MODEL or "claude-3-7-sonnet-20250219"


@retry(
Expand Down
18 changes: 16 additions & 2 deletions omnimcp/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ class OmniMCPConfig(BaseSettings):

# Claude API configuration
ANTHROPIC_API_KEY: Optional[str] = None
ANTHROPIC_DEFAULT_MODEL: str = "claude-3-7-sonnet-20250219"
# ANTHROPIC_DEFAULT_MODEL: str = "claude-3-haiku-20240307"

# Auto-shutdown OmniParser after 60min inactivity
INACTIVITY_TIMEOUT_MINUTES: int = 60
Expand All @@ -29,13 +31,25 @@ class OmniMCPConfig(BaseSettings):
# OmniParser deployment configuration
PROJECT_NAME: str = "omniparser"
REPO_URL: str = "https://github.com/microsoft/OmniParser.git"
AWS_EC2_AMI: str = "ami-06835d15c4de57810"
# AWS_EC2_AMI: str = "ami-06835d15c4de57810"
AWS_EC2_AMI: str = (
"ami-04631c7d8811d9bae" # Official AWS DLAMI Base Ubuntu 22.04 (G6 Compatible)
)
AWS_EC2_DISK_SIZE: int = 128 # GB
AWS_EC2_INSTANCE_TYPE: str = "g4dn.xlarge" # (T4 16GB $0.526/hr x86_64)
# AWS_EC2_INSTANCE_TYPE: str = "g4dn.xlarge" # (T4 16GB $0.526/hr x86_64)
AWS_EC2_INSTANCE_TYPE: str = "g6.xlarge" # (L4 24GB $0.805/hr x86_64)
# AWS_EC2_INSTANCE_TYPE: str = "p3.2xlarge" # (V100 16GB $3.06/hr x86_64)
AWS_EC2_USER: str = "ubuntu"
PORT: int = 8000 # FastAPI port
COMMAND_TIMEOUT: int = 600 # 10 minutes

# Logging configuration
LOG_DIR: Optional[str] = "logs"
DISABLE_DEFAULT_LOGGING: bool = False

# Run output configuration
RUN_OUTPUT_DIR: str = "runs"

# Debug settings
# DEBUG: bool = False
LOG_LEVEL: str = "INFO"
Expand Down