OpenAdaptAI · abrichr · Apr 5, 2025 · Apr 5, 2025 · Apr 5, 2025 · Apr 5, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -48,3 +48,7 @@ jobs:
         env:
           ANTHROPIC_API_KEY: "ci_dummy_key"
         run: uv run pytest tests/
+
+      # --- 8. Smoke test cli.py
+      - name: Run CLI Smoke Test (--help)
+        run: uv run python cli.py --help
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,6 @@
 omnimcp.egg-info/
 omnimcp.log
 __pycache__
+runs/
+logs/
+images/*/
diff --git a/cli.py b/cli.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 # cli.py
 
 """
@@ -10,20 +11,7 @@
 
 import fire
 
-# Import necessary components from the project
-from omnimcp.agent_executor import AgentExecutor
-from omnimcp.config import config
-from omnimcp.core import plan_action_for_ui
-from omnimcp.input import InputController, _pynput_error  # Check pynput import status
-from omnimcp.omniparser.client import OmniParserClient
-from omnimcp.omnimcp import VisualState
-from omnimcp.utils import (
-    logger,
-    draw_bounding_boxes,
-    draw_action_highlight,
-    NSScreen,  # Check for AppKit on macOS
-)
-
+from omnimcp.utils import logger
 
 # Default configuration
 DEFAULT_OUTPUT_DIR = "runs"
@@ -35,6 +23,7 @@ def run(
     goal: str = DEFAULT_GOAL,
     max_steps: int = DEFAULT_MAX_STEPS,
     output_dir: str = DEFAULT_OUTPUT_DIR,
+    ci_mode: bool = False,
 ):
     """
     Runs the OmniMCP agent to achieve a specified goal.
@@ -43,9 +32,34 @@ def run(
         goal: The natural language goal for the agent.
         max_steps: Maximum number of steps to attempt.
         output_dir: Base directory to save run artifacts (timestamped subdirs).
+        ci_mode: Run in CI mode (skips API validation and actual execution).
     """
     # --- Initial Checks ---
     logger.info("--- OmniMCP CLI ---")
+
+    # In CI mode, return before the project imports below so no credential checks run
+    if ci_mode:
+        logger.info("Running in CI mode - skipping credential checks and execution")
+        return 0
+
+    # Delay imports to avoid credential checks at import time
+    try:
+        # Import necessary components from the project
+        from omnimcp.config import config
+        from omnimcp.input import InputController, _pynput_error
+        from omnimcp.agent_executor import AgentExecutor
+        from omnimcp.core import plan_action_for_ui
+        from omnimcp.omniparser.client import OmniParserClient
+        from omnimcp.visual_state import VisualState
+        from omnimcp.utils import (
+            draw_bounding_boxes,
+            draw_action_highlight,
+            NSScreen,  # Check for AppKit on macOS
+        )
+    except ImportError as e:
+        logger.critical(f"Required dependency not found: {e}")
+        return 1
+
     logger.info("Performing initial checks...")
     success = True
 
@@ -84,7 +98,7 @@ def run(
 
     if not success:
         logger.error("Prerequisite checks failed. Exiting.")
-        sys.exit(1)
+        return 1
 
     # --- Component Initialization ---
     logger.info("\nInitializing components...")
@@ -116,10 +130,10 @@ def run(
         logger.critical(
             "   Ensure all requirements are installed (`uv pip install -e .`)"
         )
-        sys.exit(1)
+        return 1
     except Exception as e:
         logger.critical(f"❌ Component initialization failed: {e}", exc_info=True)
-        sys.exit(1)
+        return 1
 
     # --- Agent Executor Initialization ---
     logger.info("\nInitializing Agent Executor...")
@@ -134,7 +148,7 @@ def run(
         logger.success("✅ Agent Executor initialized successfully.")
     except Exception as e:
         logger.critical(f"❌ Agent Executor initialization failed: {e}", exc_info=True)
-        sys.exit(1)
+        return 1
 
     # --- User Confirmation & Start ---
     print("\n" + "=" * 60)
@@ -159,13 +173,13 @@ def run(
         )
     except KeyboardInterrupt:
         logger.warning("\nExecution interrupted by user (Ctrl+C).")
-        sys.exit(1)
+        return 1
     except Exception as run_e:
         logger.critical(
             f"\nAn unexpected error occurred during the agent run: {run_e}",
             exc_info=True,
         )
-        sys.exit(1)
+        return 1
     finally:
         # Optional: Add cleanup here if needed (e.g., stopping parser server)
         logger.info(
@@ -176,13 +190,20 @@ def run(
     # --- Exit ---
     if overall_success:
         logger.success("\nAgent run finished successfully (goal achieved).")
-        sys.exit(0)
+        return 0
     else:
         logger.error(
             "\nAgent run finished unsuccessfully (goal not achieved or error occurred)."
         )
-        sys.exit(1)
+        return 1
+
+
+def main():
+    """Main entry point that handles Fire's return code conversion."""
+    result = fire.Fire(run)
+    if isinstance(result, int):
+        sys.exit(result)
 
 
 if __name__ == "__main__":
-    fire.Fire(run)
+    main()
diff --git a/omnimcp/__init__.py b/omnimcp/__init__.py
@@ -1,24 +1,45 @@
-# omnimcp/__init__.py
-
 import sys
 import os
-
 from loguru import logger
 
 from omnimcp.config import config
 
-log_dir = "logs"
-os.makedirs(log_dir, exist_ok=True)
-# Define file path using a format string recognized by loguru's sink
-log_file_path = os.path.join(log_dir, "run_{time:YYYY-MM-DD_HH-mm-ss}.log")
+# Remove default handler
+logger.remove()
 
-logger.remove()  # Remove default handler to configure levels precisely
-# Log INFO and above to stderr
+# Add stderr handler (keep this functionality)
 logger.add(sys.stderr, level=config.LOG_LEVEL.upper() if config.LOG_LEVEL else "INFO")
-# Log DEBUG and above to a rotating file
-logger.add(
-    log_file_path, rotation="50 MB", level="DEBUG", encoding="utf8", enqueue=True
-)  # enqueue for async safety
 
-logger.info("Logger configured.")
-# You might want to set LOG_LEVEL=DEBUG in your .env file now
+
+# Define a function to configure run-specific logging
+def setup_run_logging(run_dir=None):
+    """
+    Configure additional logging for a specific run.
+
+    Args:
+        run_dir: Directory to store the run-specific log ("run.log"). If None, a timestamped log file goes in config.LOG_DIR (falling back to "logs").
+
+    Returns:
+        The log file path
+    """
+    # Determine log file location
+    if run_dir:
+        os.makedirs(run_dir, exist_ok=True)
+        log_file_path = os.path.join(run_dir, "run.log")
+    else:
+        log_dir = config.LOG_DIR or "logs"
+        os.makedirs(log_dir, exist_ok=True)
+        log_file_path = os.path.join(log_dir, "run_{time:YYYY-MM-DD_HH-mm-ss}.log")
+
+    # Add run-specific log handler
+    logger.add(
+        log_file_path, rotation="50 MB", level="DEBUG", encoding="utf8", enqueue=True
+    )
+
+    logger.info(f"Run logging configured. Log path: {log_file_path}")
+    return log_file_path
+
+
+# Set up default logging (for non-run use)
+if not config.DISABLE_DEFAULT_LOGGING:
+    setup_run_logging()
diff --git a/omnimcp/agent_executor.py b/omnimcp/agent_executor.py
@@ -8,9 +8,9 @@
 from PIL import Image
 
 
-# Used for type hinting if Protocol is simple:
-from .types import LLMActionPlan, UIElement
-from .utils import (
+from omnimcp import config, setup_run_logging
+from omnimcp.types import LLMActionPlan, UIElement
+from omnimcp.utils import (
     denormalize_coordinates,
     draw_action_highlight,
     draw_bounding_boxes,
@@ -194,10 +194,16 @@ def _execute_scroll(
 
     # Comparison Note:
     # This `run` method implements an explicit, sequential perceive-plan-act loop.
-    # Alternative agent architectures exist... (rest of comment remains same)
+    # Alternative agent architectures exist, such as:
+    # - ReAct (Reasoning-Acting): Where the LLM explicitly decides between
+    #   reasoning steps and action steps.
+    # - Callback-driven: Where UI events or timers might trigger agent actions.
+    # - More complex state machines or graph-based execution flows.
+    # This simple sequential loop provides a clear baseline. Future work might explore
+    # these alternatives for more complex or reactive tasks.
 
     def run(
-        self, goal: str, max_steps: int = 10, output_base_dir: str = "runs"
+        self, goal: str, max_steps: int = 10, output_base_dir: Optional[str] = None
     ) -> bool:
         """
         Runs the main perceive-plan-act loop to achieve the goal.
@@ -206,16 +212,28 @@ def run(
             goal: The natural language goal for the agent.
             max_steps: Maximum number of steps to attempt.
             output_base_dir: Base directory to save run artifacts (timestamped).
+                            If None, uses config.RUN_OUTPUT_DIR.
 
         Returns:
             True if the goal was achieved, False otherwise (error or max steps reached).
         """
+
+        # Use configured output dir if none provided
+        if output_base_dir is None:
+            output_base_dir = config.RUN_OUTPUT_DIR
+
         run_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
         run_output_dir = os.path.join(output_base_dir, run_timestamp)
+
         try:
             os.makedirs(run_output_dir, exist_ok=True)
+
+            # Configure run-specific logging
+            log_path = setup_run_logging(run_output_dir)
+
             logger.info(f"Starting agent run. Goal: '{goal}'")
             logger.info(f"Saving outputs to: {run_output_dir}")
+            logger.info(f"Run log file: {log_path}")
         except OSError as e:
             logger.error(f"Failed to create output directory {run_output_dir}: {e}")
             return False

diff --git a/omnimcp/completions.py b/omnimcp/completions.py
@@ -32,7 +32,7 @@
 )
 
 MAX_RETRIES = 3
-DEFAULT_MODEL = "claude-3-7-sonnet-20250219"
+DEFAULT_MODEL = config.ANTHROPIC_DEFAULT_MODEL or "claude-3-7-sonnet-20250219"
 
 
 @retry(

diff --git a/omnimcp/config.py b/omnimcp/config.py
@@ -14,6 +14,8 @@ class OmniMCPConfig(BaseSettings):
 
     # Claude API configuration
     ANTHROPIC_API_KEY: Optional[str] = None
+    ANTHROPIC_DEFAULT_MODEL: str = "claude-3-7-sonnet-20250219"
+    # ANTHROPIC_DEFAULT_MODEL: str = "claude-3-haiku-20240307"
 
     # Auto-shutdown OmniParser after 60min inactivity
     INACTIVITY_TIMEOUT_MINUTES: int = 60
@@ -29,13 +31,25 @@ class OmniMCPConfig(BaseSettings):
     # OmniParser deployment configuration
     PROJECT_NAME: str = "omniparser"
     REPO_URL: str = "https://github.com/microsoft/OmniParser.git"
-    AWS_EC2_AMI: str = "ami-06835d15c4de57810"
+    # AWS_EC2_AMI: str = "ami-06835d15c4de57810"
+    AWS_EC2_AMI: str = (
+        "ami-04631c7d8811d9bae"  # Official AWS DLAMI Base Ubuntu 22.04 (G6 Compatible)
+    )
     AWS_EC2_DISK_SIZE: int = 128  # GB
-    AWS_EC2_INSTANCE_TYPE: str = "g4dn.xlarge"  # (T4 16GB $0.526/hr x86_64)
+    # AWS_EC2_INSTANCE_TYPE: str = "g4dn.xlarge"  # (T4 16GB $0.526/hr x86_64)
+    AWS_EC2_INSTANCE_TYPE: str = "g6.xlarge"  # (L4 24GB $0.805/hr x86_64)
+    # AWS_EC2_INSTANCE_TYPE: str = "p3.2xlarge"  # (V100 16GB $3.06/hr x86_64)
     AWS_EC2_USER: str = "ubuntu"
     PORT: int = 8000  # FastAPI port
     COMMAND_TIMEOUT: int = 600  # 10 minutes
 
+    # Logging configuration
+    LOG_DIR: Optional[str] = "logs"
+    DISABLE_DEFAULT_LOGGING: bool = False
+
+    # Run output configuration
+    RUN_OUTPUT_DIR: str = "runs"
+
     # Debug settings
     # DEBUG: bool = False
     LOG_LEVEL: str = "INFO"
-Original file line number
+Diff line change
@@ Expand Up / @@ -32,7 +32,7 @@ @@
     )
     MAX_RETRIES = 3
-    DEFAULT_MODEL = "claude-3-7-sonnet-20250219"
+    DEFAULT_MODEL = config.ANTHROPIC_DEFAULT_MODEL or "claude-3-7-sonnet-20250219"
     @retry(
@@ Expand Down @@