Commit 1415f77
perf: docker build

Improve Docker build performance by enabling caching, using uv for package management, and optimising the layers.

- Leverage pip recursive dependencies to simplify package management.
- Add a .dockerignore so that only relevant context is transferred.
- Remove duplicate keys in deploy/docker/config.yml.
- Remove the unnecessary crawl4ai dependency from deploy/docker/requirements.txt.
- Download the multilabel classifier in the foreground to improve speeds on limited networks.
- Add the missing spacy dependency, pinned to avoid an issue on Mac hardware.
- Remove duplicate nltk downloads.
- Use a multi-stage build for the GitHub/local install paths, replacing USE_LOCAL=true|false with BUILD_ENV=github|local.

Results for building an arm64 image on a MacBook M3 Max with base images already cached:

- Original: 12 minutes 15 seconds
- New (cached): 27 seconds
- New (no cache): 7 minutes 25 seconds
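
As a usage sketch (the image tags and the buildx invocation here are illustrative assumptions, not part of this commit), the install source is now selected with the BUILD_ENV build argument:

    # Build from the local checkout (the default); BuildKit is required
    # for the cache mounts used in the Dockerfile.
    docker buildx build --build-arg BUILD_ENV=local -t crawl4ai:local .

    # Build from GitHub instead, optionally selecting a branch.
    docker buildx build --build-arg BUILD_ENV=github \
        --build-arg GITHUB_BRANCH=main -t crawl4ai:github .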
Parent: 6eed4ad

8 files changed (+211, -140 lines)

.dockerignore

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+# Ignore everything
+*
+
+# But not these files...
+!/crawl4ai
+!/deploy/docker
+!uv.lock
+!pyproject.toml
+!requirements.txt
+!setup.cfg
+!setup.py
+!README.md
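
The file ignores everything and then allowlists only what the build needs, so the context sent to the daemon stays small. A quick sanity check (assuming BuildKit's plain progress output) is to watch the context-transfer step:

    # Print the build-context transfer size reported by BuildKit.
    docker build --progress=plain . 2>&1 | grep "transferring context"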

Dockerfile

Lines changed: 110 additions & 87 deletions
@@ -1,34 +1,46 @@
-FROM python:3.10-slim
-
 # Set build arguments
-ARG APP_HOME=/app
-ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
-ARG GITHUB_BRANCH=main
-ARG USE_LOCAL=true
+ARG UV_IMAGE=ghcr.io/astral-sh/uv:0.6.6 \
+    PYTHON_IMAGE=python:3.10-slim \
+    BUILD_ENV=local
+
+# Create an alias for the UV image so we can reference it in the base stage.
+FROM ${UV_IMAGE} AS uv
 
-ENV PYTHONFAULTHANDLER=1 \
+FROM ${PYTHON_IMAGE} AS base
+COPY --from=uv /uv /uvx /bin/
+
+# Enable bytecode compilation during build to improve runtime
+# performance, set the link mode to copy to avoid warnings
+# with the default mode.
+ENV \
+    PYTHONFAULTHANDLER=1 \
     PYTHONHASHSEED=random \
     PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
     PIP_DEFAULT_TIMEOUT=100 \
     DEBIAN_FRONTEND=noninteractive \
    REDIS_HOST=localhost \
-    REDIS_PORT=6379
-
-ARG PYTHON_VERSION=3.10
-ARG INSTALL_TYPE=default
-ARG ENABLE_GPU=false
-ARG TARGETARCH
-
-LABEL maintainer="unclecode"
-LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
-LABEL version="1.0"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential \
+    REDIS_PORT=6379 \
+    UV_COMPILE_BYTECODE=1 \
+    UV_LINK_MODE=copy \
+    UV_SYSTEM_PYTHON=1 \
+    PLAYWRIGHT_DOWNLOAD_CONNECTION_TIMEOUT=120000
+
+LABEL \
+    maintainer="unclecode" \
+    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" \
+    version="1.0"
+
+# Install dependencies with caching to speed up the build process.
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    rm -f /etc/apt/apt.conf.d/docker-clean && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
     curl \
+    ca-certificates \
+    build-essential \
     wget \
     gnupg \
     git \
@@ -38,9 +50,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libjpeg-dev \
     redis-server \
     supervisor \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
     libglib2.0-0 \
     libnss3 \
     libnspr4 \
@@ -61,83 +70,97 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libpango-1.0-0 \
     libcairo2 \
     libasound2 \
-    libatspi2.0-0 \
-    && rm -rf /var/lib/apt/lists/*
+    libatspi2.0-0
 
-RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
-    apt-get update && apt-get install -y --no-install-recommends \
-    nvidia-cuda-toolkit \
-    && rm -rf /var/lib/apt/lists/* ; \
+ARG \
+    TARGETARCH \
+    ENABLE_GPU=false
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
+    apt-get install -y --no-install-recommends \
+    nvidia-cuda-toolkit; \
     else \
     echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
     fi
 
-RUN if [ "$TARGETARCH" = "arm64" ]; then \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    if [ "$TARGETARCH" = "arm64" ]; then \
     echo "🦾 Installing ARM-specific optimizations"; \
-    apt-get update && apt-get install -y --no-install-recommends \
-    libopenblas-dev \
-    && rm -rf /var/lib/apt/lists/*; \
+    apt-get install -y --no-install-recommends \
+    libopenblas-dev; \
     elif [ "$TARGETARCH" = "amd64" ]; then \
     echo "🖥️ Installing AMD64-specific optimizations"; \
-    apt-get update && apt-get install -y --no-install-recommends \
-    libomp-dev \
-    && rm -rf /var/lib/apt/lists/*; \
+    apt-get install -y --no-install-recommends \
+    libomp-dev; \
     else \
-    echo "Skipping platform-specific optimizations (unsupported platform)"; \
+    echo "Skipping platform-specific optimizations (unsupported platform ${TARGETARCH})"; \
     fi
 
+ARG APP_HOME=/app
 WORKDIR ${APP_HOME}
 
-RUN echo '#!/bin/bash\n\
-if [ "$USE_LOCAL" = "true" ]; then\n\
-    echo "📦 Installing from local source..."\n\
-    pip install --no-cache-dir /tmp/project/\n\
-else\n\
-    echo "🌐 Installing from GitHub..."\n\
-    for i in {1..3}; do \n\
-        git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\
-        { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\
-    done\n\
-    pip install --no-cache-dir /tmp/crawl4ai\n\
-fi' > /tmp/install.sh && chmod +x /tmp/install.sh
-
-COPY . /tmp/project/
-
-COPY deploy/docker/supervisord.conf .
-
-COPY deploy/docker/requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
-    pip install --no-cache-dir \
-    torch \
-    torchvision \
-    torchaudio \
-    scikit-learn \
-    nltk \
-    transformers \
-    tokenizers && \
-    python -m nltk.downloader punkt stopwords ; \
-    fi
-
-RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
-    pip install "/tmp/project/[all]" && \
-    python -m crawl4ai.model_loader ; \
-    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
-    pip install "/tmp/project/[torch]" ; \
-    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
-    pip install "/tmp/project/[transformer]" && \
-    python -m crawl4ai.model_loader ; \
-    else \
-    pip install "/tmp/project" ; \
-    fi
-
-RUN pip install --no-cache-dir --upgrade pip && \
-    /tmp/install.sh && \
+COPY --link . /tmp/project/
+
+COPY deploy/docker/requirements.txt deploy/docker/supervisord.conf ./
+
+# Install the docker dependencies.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --upgrade pip -r requirements.txt
+
+# Install the dependencies for the specified install type.
+ARG INSTALL_TYPE=default
+RUN --mount=type=cache,target=/root/.cache/uv \
+    GROUP=$([ "$INSTALL_TYPE" = "default" ] && echo "" || echo "[$INSTALL_TYPE]") ; \
+    uv pip install "/tmp/project/$GROUP"
+
+# If the install type is all or transformer, download the models.
+RUN --mount=type=cache,target=/tmp/.cache/huggingface \
+    --mount=type=cache,target=/tmp/.cache/nltk \
+    if [ "$INSTALL_TYPE" = "all" ] || [ "$INSTALL_TYPE" = "transformer" ] ; then \
+    NLTK_DATA=/tmp/.cache/nltk \
+    HF_HOME=/tmp/.cache/huggingface \
+    python -m crawl4ai.model_loader && \
+    mkdir -p /root/.cache && \
+    rm -rf /root/.cache/ms-playwright/ /root/nltk_data/ && \
+    cp -R /tmp/.cache/nltk/ /root/nltk_data/ && \
+    cp -R /tmp/.cache/huggingface/ /root/.cache/huggingface/ ; \
+    fi
+
+# Install from local source.
+FROM base AS local
+RUN --mount=type=cache,target=/root/.cache/uv \
+    echo "📦 Installing from local source..." ; \
+    uv pip install /tmp/project/
+
+# Install from GitHub.
+FROM base AS github
+ARG \
+    GITHUB_REPO=https://github.com/unclecode/crawl4ai.git \
+    GITHUB_BRANCH=main
+RUN --mount=type=cache,target=/root/.cache/uv \
+    echo "🌐 Installing from GitHub..." ; \
+    for i in {1..3}; do \
+    git clone --depth 1 --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \
+    { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \
+    done ; \
+    uv pip install /tmp/crawl4ai
+
+FROM ${BUILD_ENV} AS final
+
+# Test the installation.
+RUN \
     python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
     python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
-
-RUN playwright install --with-deps chromium
+
+# Install Playwright browsers.
+RUN --mount=type=cache,target=/tmp/.cache/ms-playwright \
+    PLAYWRIGHT_BROWSERS_PATH=/tmp/.cache/ms-playwright playwright install --no-shell chromium && \
+    mkdir -p /root/.cache && \
+    rm -rf /root/.cache/ms-playwright/ && \
+    cp -R /tmp/.cache/ms-playwright/ /root/.cache/ms-playwright/
 
 COPY deploy/docker/* ${APP_HOME}/
 
@@ -153,4 +176,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
 
 EXPOSE 6379
 CMD ["supervisord", "-c", "supervisord.conf"]
-
+
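
A note on the cache mounts above: RUN --mount=type=cache requires BuildKit, and the named caches (apt, uv, Hugging Face, NLTK, Playwright) persist on the build host between builds, which is where the 27-second cached rebuild comes from. A sketch of inspecting and reclaiming that cache with the standard BuildKit commands:

    # Show how much space BuildKit's build caches are using,
    # then clear them if disk space becomes a problem.
    docker buildx du
    docker builder prune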

crawl4ai/model_loader.py

Lines changed: 44 additions & 18 deletions
@@ -1,9 +1,12 @@
+import argparse
+import os
+import shutil
+import subprocess
 from functools import lru_cache
 from pathlib import Path
-import subprocess, os
-import shutil
-from .model_loader import *
-import argparse
+
+from transformers import AutoModel
+
 from crawl4ai.config import MODEL_REPO_BRANCH
 
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
@@ -73,7 +76,7 @@ def get_home_folder():
 
 @lru_cache()
 def load_bert_base_uncased():
-    from transformers import BertTokenizer, BertModel
+    from transformers import BertModel, BertTokenizer
 
     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", resume_download=None)
     model = BertModel.from_pretrained("bert-base-uncased", resume_download=None)
@@ -92,7 +95,7 @@ def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
     Returns:
         tuple: The tokenizer and model.
     """
-    from transformers import AutoTokenizer, AutoModel
+    from transformers import AutoTokenizer
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
     model = AutoModel.from_pretrained(model_name, resume_download=None)
@@ -103,8 +106,7 @@ def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
 
 @lru_cache()
 def load_text_classifier():
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    from transformers import pipeline
+    from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 
     tokenizer = AutoTokenizer.from_pretrained(
         "dstefa/roberta-base_topic_classification_nyt_news"
@@ -118,11 +120,25 @@ def load_text_classifier():
     return pipe
 
 
+MODEL = "cardiffnlp/tweet-topic-21-multi"
+
+
+def download_text_multilabel_classifier():
+    """Download the multilabel classifier model from the Hugging Face Hub.
+
+    Unlike load_text_multilabel_classifier, this function does not
+    load the model into memory; it only downloads it to the local
+    cache in the foreground."""
+    from huggingface_hub import snapshot_download
+
+    snapshot_download(MODEL)
+
+
 @lru_cache()
-def load_text_multilabel_classifier():
-    from transformers import AutoModelForSequenceClassification, AutoTokenizer
-    from scipy.special import expit
+def load_text_multilabel_classifier(download_only=False):
     import torch
+    from scipy.special import expit
+    from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
     # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
     # if torch.cuda.is_available():
@@ -133,11 +149,8 @@ def load_text_multilabel_classifier():
     #     device = torch.device("cpu")
     # # return load_spacy_model(), torch.device("cpu")
 
-    MODEL = "cardiffnlp/tweet-topic-21-multi"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
-    model = AutoModelForSequenceClassification.from_pretrained(
-        MODEL, resume_download=None
-    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
     model.eval()
     model, device = set_model_device(model)
     class_mapping = model.config.id2label
@@ -184,6 +197,17 @@ def load_nltk_punkt():
     return nltk.data.find("tokenizers/punkt")
 
 
+@lru_cache()
+def load_nltk_stopwords():
+    import nltk
+
+    try:
+        nltk.data.find("corpora/stopwords")
+    except LookupError:
+        nltk.download("stopwords")
+    return nltk.data.find("corpora/stopwords")
+
+
 @lru_cache()
 def load_spacy_model():
     import spacy
@@ -271,10 +295,12 @@ def download_all_models(remove_existing=False):
     # print("[LOG] Downloading ONNX model...")
     # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")
-    _, device = load_text_multilabel_classifier()
-    print(f"[LOG] Text classifier loaded on {device}")
+    download_text_multilabel_classifier()
+    print("[LOG] Text classifier downloaded")
     print("[LOG] Downloading custom NLTK Punkt model...")
     load_nltk_punkt()
+    print("[LOG] Downloading custom NLTK stopwords model...")
+    load_nltk_stopwords()
     print("[LOG] ✅ All models downloaded successfully.")
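
A minimal sketch of the new download-then-load flow (assuming a default Hugging Face cache location; both functions come from the diff above):

    from crawl4ai.model_loader import (
        download_text_multilabel_classifier,
        load_text_multilabel_classifier,
    )

    # Blocks until the model snapshot is in the local Hugging Face cache.
    download_text_multilabel_classifier()

    # Subsequent loads resolve from the cache instead of re-downloading;
    # the second return value is the selected device.
    _, device = load_text_multilabel_classifier()
    print(f"Classifier ready on {device}")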
