Commit 1415f77
perf: docker build

Improve Docker build performance by enabling caching, using uv for package management, and optimising the layers.

- Leverage pip recursive dependencies to simplify package management.
- Add a .dockerignore so that only relevant context is transferred.
- Remove duplicate keys in deploy/docker/config.yml.
- Remove the unnecessary crawl4ai dependency from deploy/docker/requirements.txt.
- Download the multilabel classifier in the foreground to improve speeds on limited networks.
- Add the missing spacy dependency, pinned to avoid an issue on Mac hardware.
- Remove duplicate nltk downloads.
- Use a multi-stage build for the GitHub/local install paths, replacing USE_LOCAL=true|false with BUILD_ENV=github|local.

Results for building an arm64 image on a MacBook M3 Max with base images already cached:

- Original: 12 minutes 15 seconds
- New (cached): 27 seconds
- New (no cache): 7 minutes 25 seconds
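
As a usage sketch (the image tags and the buildx invocation here are illustrative assumptions, not part of this commit), the install source is now selected with the BUILD_ENV build argument:

    # Build from the local checkout (the default); BuildKit is required
    # for the cache mounts used in the Dockerfile.
    docker buildx build --build-arg BUILD_ENV=local -t crawl4ai:local .

    # Build from GitHub instead, optionally selecting a branch.
    docker buildx build --build-arg BUILD_ENV=github \
        --build-arg GITHUB_BRANCH=main -t crawl4ai:github .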
Parent: 6eed4ad

8 files changed (+211, -140 lines)

.dockerignore

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+# Ignore everything
+*
+
+# But not these files...
+!/crawl4ai
+!/deploy/docker
+!uv.lock
+!pyproject.toml
+!requirements.txt
+!setup.cfg
+!setup.py
+!README.md
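
The file ignores everything and then allowlists only what the build needs, so the context sent to the daemon stays small. A quick sanity check (assuming BuildKit's plain progress output) is to watch the context-transfer step:

    # Print the build-context transfer size reported by BuildKit.
    docker build --progress=plain . 2>&1 | grep "transferring context"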

Dockerfile

Lines changed: 110 additions & 87 deletions
@@ -1,34 +1,46 @@
-FROM python:3.10-slim
-
 # Set build arguments
-ARG APP_HOME=/app
-ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
-ARG GITHUB_BRANCH=main
-ARG USE_LOCAL=true
+ARG UV_IMAGE=ghcr.io/astral-sh/uv:0.6.6 \
+    PYTHON_IMAGE=python:3.10-slim \
+    BUILD_ENV=local
+
+# Create an alias for the UV image so we can reference it in the base stage.
+FROM ${UV_IMAGE} AS uv
 
-ENV PYTHONFAULTHANDLER=1 \
+FROM ${PYTHON_IMAGE} AS base
+COPY --from=uv /uv /uvx /bin/
+
+# Enable bytecode compilation during build to improve runtime
+# performance, set the link mode to copy to avoid warnings
+# with the default mode.
+ENV \
+    PYTHONFAULTHANDLER=1 \
     PYTHONHASHSEED=random \
     PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
     PIP_DEFAULT_TIMEOUT=100 \
     DEBIAN_FRONTEND=noninteractive \
    REDIS_HOST=localhost \
-    REDIS_PORT=6379
-
-ARG PYTHON_VERSION=3.10
-ARG INSTALL_TYPE=default
-ARG ENABLE_GPU=false
-ARG TARGETARCH
-
-LABEL maintainer="unclecode"
-LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
-LABEL version="1.0"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential \
+    REDIS_PORT=6379 \
+    UV_COMPILE_BYTECODE=1 \
+    UV_LINK_MODE=copy \
+    UV_SYSTEM_PYTHON=1 \
+    PLAYWRIGHT_DOWNLOAD_CONNECTION_TIMEOUT=120000
+
+LABEL \
+    maintainer="unclecode" \
+    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" \
+    version="1.0"
+
+# Install dependencies with caching to speed up the build process.
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    rm -f /etc/apt/apt.conf.d/docker-clean && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
     curl \
+    ca-certificates \
+    build-essential \
     wget \
     gnupg \
     git \
@@ -38,9 +50,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libjpeg-dev \
     redis-server \
     supervisor \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
     libglib2.0-0 \
     libnss3 \
     libnspr4 \
@@ -61,83 +70,97 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libpango-1.0-0 \
     libcairo2 \
     libasound2 \
-    libatspi2.0-0 \
-    && rm -rf /var/lib/apt/lists/*
+    libatspi2.0-0
 
-RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
-    apt-get update && apt-get install -y --no-install-recommends \
-    nvidia-cuda-toolkit \
-    && rm -rf /var/lib/apt/lists/* ; \
+ARG \
+    TARGETARCH \
+    ENABLE_GPU=false
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
+    apt-get install -y --no-install-recommends \
+    nvidia-cuda-toolkit; \
     else \
     echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
     fi
 
-RUN if [ "$TARGETARCH" = "arm64" ]; then \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    if [ "$TARGETARCH" = "arm64" ]; then \
     echo "🦾 Installing ARM-specific optimizations"; \
-    apt-get update && apt-get install -y --no-install-recommends \
-    libopenblas-dev \
-    && rm -rf /var/lib/apt/lists/*; \
+    apt-get install -y --no-install-recommends \
+    libopenblas-dev; \
     elif [ "$TARGETARCH" = "amd64" ]; then \
     echo "🖥️ Installing AMD64-specific optimizations"; \
-    apt-get update && apt-get install -y --no-install-recommends \
-    libomp-dev \
-    && rm -rf /var/lib/apt/lists/*; \
+    apt-get install -y --no-install-recommends \
+    libomp-dev; \
     else \
-    echo "Skipping platform-specific optimizations (unsupported platform)"; \
+    echo "Skipping platform-specific optimizations (unsupported platform ${TARGETARCH})"; \
     fi
 
+ARG APP_HOME=/app
 WORKDIR ${APP_HOME}
 
-RUN echo '#!/bin/bash\n\
-if [ "$USE_LOCAL" = "true" ]; then\n\
-    echo "📦 Installing from local source..."\n\
-    pip install --no-cache-dir /tmp/project/\n\
-else\n\
-    echo "🌐 Installing from GitHub..."\n\
-    for i in {1..3}; do \n\
-        git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\
-        { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\
-    done\n\
-    pip install --no-cache-dir /tmp/crawl4ai\n\
-fi' > /tmp/install.sh && chmod +x /tmp/install.sh
-
-COPY . /tmp/project/
-
-COPY deploy/docker/supervisord.conf .
-
-COPY deploy/docker/requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
-    pip install --no-cache-dir \
-    torch \
-    torchvision \
-    torchaudio \
-    scikit-learn \
-    nltk \
-    transformers \
-    tokenizers && \
-    python -m nltk.downloader punkt stopwords ; \
-    fi
-
-RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
-    pip install "/tmp/project/[all]" && \
-    python -m crawl4ai.model_loader ; \
-    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
-    pip install "/tmp/project/[torch]" ; \
-    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
-    pip install "/tmp/project/[transformer]" && \
-    python -m crawl4ai.model_loader ; \
-    else \
-    pip install "/tmp/project" ; \
-    fi
-
-RUN pip install --no-cache-dir --upgrade pip && \
-    /tmp/install.sh && \
+COPY --link . /tmp/project/
+
+COPY deploy/docker/requirements.txt deploy/docker/supervisord.conf ./
+
+# Install the docker dependencies.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --upgrade pip -r requirements.txt
+
+# Install the dependencies for the specified install type.
+ARG INSTALL_TYPE=default
+RUN --mount=type=cache,target=/root/.cache/uv \
+    GROUP=$([ "$INSTALL_TYPE" = "default" ] && echo "" || echo "[$INSTALL_TYPE]") ; \
+    uv pip install "/tmp/project/$GROUP"
+
+# If the install type is all or transformer, download the models.
+RUN --mount=type=cache,target=/tmp/.cache/huggingface \
+    --mount=type=cache,target=/tmp/.cache/nltk \
+    if [ "$INSTALL_TYPE" = "all" ] || [ "$INSTALL_TYPE" = "transformer" ] ; then \
+    NLTK_DATA=/tmp/.cache/nltk \
+    HF_HOME=/tmp/.cache/huggingface \
+    python -m crawl4ai.model_loader && \
+    mkdir -p /root/.cache && \
+    rm -rf /root/.cache/ms-playwright/ /root/nltk_data/ && \
+    cp -R /tmp/.cache/nltk/ /root/nltk_data/ && \
+    cp -R /tmp/.cache/huggingface/ /root/.cache/huggingface/ ; \
+    fi
+
+# Install from local source.
+FROM base AS local
+RUN --mount=type=cache,target=/root/.cache/uv \
+    echo "📦 Installing from local source..." ; \
+    uv pip install /tmp/project/
+
+# Install from GitHub.
+FROM base AS github
+ARG \
+    GITHUB_REPO=https://github.com/unclecode/crawl4ai.git \
+    GITHUB_BRANCH=main
+RUN --mount=type=cache,target=/root/.cache/uv \
+    echo "🌐 Installing from GitHub..." ; \
+    for i in {1..3}; do \
+    git clone --depth 1 --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \
+    { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \
+    done ; \
+    uv pip install /tmp/crawl4ai
+
+FROM ${BUILD_ENV} AS final
+
+# Test the installation.
+RUN \
     python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
     python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
-
-RUN playwright install --with-deps chromium
+
+# Install Playwright browsers.
+RUN --mount=type=cache,target=/tmp/.cache/ms-playwright \
+    PLAYWRIGHT_BROWSERS_PATH=/tmp/.cache/ms-playwright playwright install --no-shell chromium && \
+    mkdir -p /root/.cache && \
+    rm -rf /root/.cache/ms-playwright/ && \
+    cp -R /tmp/.cache/ms-playwright/ /root/.cache/ms-playwright/
 
 COPY deploy/docker/* ${APP_HOME}/
 
@@ -153,4 +176,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
 
 EXPOSE 6379
 CMD ["supervisord", "-c", "supervisord.conf"]
-
+
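
A note on the cache mounts above: RUN --mount=type=cache requires BuildKit, and the named caches (apt, uv, Hugging Face, NLTK, Playwright) persist on the build host between builds, which is where the 27-second cached rebuild comes from. A sketch of inspecting and reclaiming that cache with the standard BuildKit commands:

    # Show how much space BuildKit's build caches are using,
    # then clear them if disk space becomes a problem.
    docker buildx du
    docker builder prune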

crawl4ai/model_loader.py

Lines changed: 44 additions & 18 deletions
@@ -1,9 +1,12 @@
+import argparse
+import os
+import shutil
+import subprocess
 from functools import lru_cache
 from pathlib import Path
-import subprocess, os
-import shutil
-from .model_loader import *
-import argparse
+
+from transformers import AutoModel
+
 from crawl4ai.config import MODEL_REPO_BRANCH
 
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
@@ -73,7 +76,7 @@ def get_home_folder():
 
 @lru_cache()
 def load_bert_base_uncased():
-    from transformers import BertTokenizer, BertModel
+    from transformers import BertModel, BertTokenizer
 
     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", resume_download=None)
     model = BertModel.from_pretrained("bert-base-uncased", resume_download=None)
@@ -92,7 +95,7 @@ def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
     Returns:
         tuple: The tokenizer and model.
     """
-    from transformers import AutoTokenizer, AutoModel
+    from transformers import AutoTokenizer
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
     model = AutoModel.from_pretrained(model_name, resume_download=None)
@@ -103,8 +106,7 @@ def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
 
 @lru_cache()
 def load_text_classifier():
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    from transformers import pipeline
+    from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 
     tokenizer = AutoTokenizer.from_pretrained(
         "dstefa/roberta-base_topic_classification_nyt_news"
@@ -118,11 +120,25 @@ def load_text_classifier():
     return pipe
 
 
+MODEL = "cardiffnlp/tweet-topic-21-multi"
+
+
+def download_text_multilabel_classifier():
+    """Download the multilabel classifier model from the Hugging Face Hub.
+
+    Unlike load_text_multilabel_classifier, this function does not
+    load the model into memory; it only downloads it to the local
+    cache in the foreground."""
+    from huggingface_hub import snapshot_download
+
+    snapshot_download(MODEL)
+
+
 @lru_cache()
-def load_text_multilabel_classifier():
-    from transformers import AutoModelForSequenceClassification, AutoTokenizer
-    from scipy.special import expit
+def load_text_multilabel_classifier(download_only=False):
     import torch
+    from scipy.special import expit
+    from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
     # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
     # if torch.cuda.is_available():
@@ -133,11 +149,8 @@ def load_text_multilabel_classifier():
     #     device = torch.device("cpu")
     # # return load_spacy_model(), torch.device("cpu")
 
-    MODEL = "cardiffnlp/tweet-topic-21-multi"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
-    model = AutoModelForSequenceClassification.from_pretrained(
-        MODEL, resume_download=None
-    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
     model.eval()
     model, device = set_model_device(model)
     class_mapping = model.config.id2label
@@ -184,6 +197,17 @@ def load_nltk_punkt():
     return nltk.data.find("tokenizers/punkt")
 
 
+@lru_cache()
+def load_nltk_stopwords():
+    import nltk
+
+    try:
+        nltk.data.find("corpora/stopwords")
+    except LookupError:
+        nltk.download("stopwords")
+    return nltk.data.find("corpora/stopwords")
+
+
 @lru_cache()
 def load_spacy_model():
     import spacy
@@ -271,10 +295,12 @@ def download_all_models(remove_existing=False):
     # print("[LOG] Downloading ONNX model...")
     # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")
-    _, device = load_text_multilabel_classifier()
-    print(f"[LOG] Text classifier loaded on {device}")
+    download_text_multilabel_classifier()
+    print("[LOG] Text classifier downloaded")
     print("[LOG] Downloading custom NLTK Punkt model...")
     load_nltk_punkt()
+    print("[LOG] Downloading custom NLTK stopwords model...")
+    load_nltk_stopwords()
     print("[LOG] ✅ All models downloaded successfully.")
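
A minimal sketch of the new download-then-load flow (assuming a default Hugging Face cache location; both functions come from the diff above):

    from crawl4ai.model_loader import (
        download_text_multilabel_classifier,
        load_text_multilabel_classifier,
    )

    # Blocks until the model snapshot is in the local Hugging Face cache.
    download_text_multilabel_classifier()

    # Subsequent loads resolve from the cache instead of re-downloading;
    # the second return value is the selected device.
    _, device = load_text_multilabel_classifier()
    print(f"Classifier ready on {device}")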
