@@ -1,34 +1,46 @@
-FROM python:3.10-slim
-
 # Set build arguments
-ARG APP_HOME=/app
-ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
-ARG GITHUB_BRANCH=main
-ARG USE_LOCAL=true
+ARG UV_IMAGE=ghcr.io/astral-sh/uv:0.6.6 \
+    PYTHON_IMAGE=python:3.10-slim \
+    BUILD_ENV=local
+
+# Create an alias for the uv image so we can reference it in the base stage.
+FROM ${UV_IMAGE} AS uv
 
-ENV PYTHONFAULTHANDLER=1 \
+FROM ${PYTHON_IMAGE} AS base
+COPY --from=uv /uv /uvx /bin/
+
+# Enable bytecode compilation during the build to improve runtime
+# performance, and set the uv link mode to "copy" to avoid the
+# warnings emitted with the default (hardlink) mode.
+ENV \
+    PYTHONFAULTHANDLER=1 \
     PYTHONHASHSEED=random \
     PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
     PIP_DEFAULT_TIMEOUT=100 \
     DEBIAN_FRONTEND=noninteractive \
     REDIS_HOST=localhost \
-    REDIS_PORT=6379
-
-ARG PYTHON_VERSION=3.10
-ARG INSTALL_TYPE=default
-ARG ENABLE_GPU=false
-ARG TARGETARCH
-
-LABEL maintainer="unclecode"
-LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
-LABEL version="1.0"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential \
+    REDIS_PORT=6379 \
+    UV_COMPILE_BYTECODE=1 \
+    UV_LINK_MODE=copy \
+    UV_SYSTEM_PYTHON=1 \
+    PLAYWRIGHT_DOWNLOAD_CONNECTION_TIMEOUT=120000
+
+LABEL \
+    maintainer="unclecode" \
+    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" \
+    version="1.0"
+
+# Install dependencies with caching to speed up the build process.
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    rm -f /etc/apt/apt.conf.d/docker-clean && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
     curl \
+    ca-certificates \
+    build-essential \
     wget \
     gnupg \
     git \
@@ -38,9 +50,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libjpeg-dev \
     redis-server \
     supervisor \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
     libglib2.0-0 \
     libnss3 \
     libnspr4 \
@@ -61,83 +70,97 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libpango-1.0-0 \
     libcairo2 \
     libasound2 \
-    libatspi2.0-0 \
-    && rm -rf /var/lib/apt/lists/*
+    libatspi2.0-0
 
-RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
-    apt-get update && apt-get install -y --no-install-recommends \
-    nvidia-cuda-toolkit \
-    && rm -rf /var/lib/apt/lists/* ; \
+ARG \
+    TARGETARCH \
+    ENABLE_GPU=false
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
+    apt-get install -y --no-install-recommends \
+    nvidia-cuda-toolkit; \
 else \
     echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)" ; \
 fi
 
-RUN if [ "$TARGETARCH" = "arm64" ]; then \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    if [ "$TARGETARCH" = "arm64" ]; then \
     echo "🦾 Installing ARM-specific optimizations" ; \
-    apt-get update && apt-get install -y --no-install-recommends \
-    libopenblas-dev \
-    && rm -rf /var/lib/apt/lists/*; \
+    apt-get install -y --no-install-recommends \
+    libopenblas-dev; \
 elif [ "$TARGETARCH" = "amd64" ]; then \
     echo "🖥️ Installing AMD64-specific optimizations" ; \
-    apt-get update && apt-get install -y --no-install-recommends \
-    libomp-dev \
-    && rm -rf /var/lib/apt/lists/*; \
+    apt-get install -y --no-install-recommends \
+    libomp-dev; \
 else \
-    echo "Skipping platform-specific optimizations (unsupported platform)" ; \
+    echo "Skipping platform-specific optimizations (unsupported platform ${TARGETARCH})" ; \
 fi
 
+ARG APP_HOME=/app
 WORKDIR ${APP_HOME}
 
-RUN echo '#!/bin/bash\n \
-    if [ "$USE_LOCAL" = "true" ]; then\n \
-        echo "📦 Installing from local source..."\n \
-        pip install --no-cache-dir /tmp/project/\n \
-    else\n \
-        echo "🌐 Installing from GitHub..."\n \
-        for i in {1..3}; do \n \
-            git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n \
-            { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n \
-        done\n \
-        pip install --no-cache-dir /tmp/crawl4ai\n \
-    fi' > /tmp/install.sh && chmod +x /tmp/install.sh
-
-COPY . /tmp/project/
-
-COPY deploy/docker/supervisord.conf .
-
-COPY deploy/docker/requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
-    pip install --no-cache-dir \
-    torch \
-    torchvision \
-    torchaudio \
-    scikit-learn \
-    nltk \
-    transformers \
-    tokenizers && \
-    python -m nltk.downloader punkt stopwords ; \
-    fi
-
-RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
-    pip install "/tmp/project/[all]" && \
-    python -m crawl4ai.model_loader ; \
-    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
-    pip install "/tmp/project/[torch]" ; \
-    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
-    pip install "/tmp/project/[transformer]" && \
-    python -m crawl4ai.model_loader ; \
-    else \
-    pip install "/tmp/project" ; \
-    fi
-
-RUN pip install --no-cache-dir --upgrade pip && \
-    /tmp/install.sh && \
+COPY --link . /tmp/project/
+
+COPY deploy/docker/requirements.txt deploy/docker/supervisord.conf ./
+
+# Install the dependencies for the Docker deployment.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --upgrade pip -r requirements.txt
+
+# Install the dependencies for the specified install type.
+ARG INSTALL_TYPE=default
+RUN --mount=type=cache,target=/root/.cache/uv \
+    GROUP=$([ "$INSTALL_TYPE" = "default" ] && echo "" || echo "[$INSTALL_TYPE]") ; \
+    uv pip install "/tmp/project/$GROUP"
+
+# If the install type is all or transformer, download the models.
+RUN --mount=type=cache,target=/tmp/.cache/huggingface \
+    --mount=type=cache,target=/tmp/.cache/nltk \
+    if [ "$INSTALL_TYPE" = "all" ] || [ "$INSTALL_TYPE" = "transformer" ] ; then \
+        NLTK_DATA=/tmp/.cache/nltk \
+        HF_HOME=/tmp/.cache/huggingface \
+        python -m crawl4ai.model_loader && \
+        mkdir -p /root/.cache && \
+        rm -rf /root/.cache/ms-playwright/ /root/nltk_data/ && \
+        cp -R /tmp/.cache/nltk/ /root/nltk_data/ && \
+        cp -R /tmp/.cache/huggingface/ /root/.cache/huggingface/ ; \
+    fi
+
+# Install from local source.
+FROM base AS local
+RUN --mount=type=cache,target=/root/.cache/uv \
+    echo "📦 Installing from local source..." ; \
+    uv pip install /tmp/project/
+
+# Install from GitHub.
+FROM base AS github
+ARG \
+    GITHUB_REPO=https://github.com/unclecode/crawl4ai.git \
+    GITHUB_BRANCH=main
+RUN --mount=type=cache,target=/root/.cache/uv \
+    echo "🌐 Installing from GitHub..." ; \
+    for i in 1 2 3; do \
+        git clone --depth 1 --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \
+        { echo "Attempt $i/3 failed! Taking a short break... ☕" ; sleep 5; }; \
+    done ; \
+    uv pip install /tmp/crawl4ai
+
+FROM ${BUILD_ENV} AS final
+
+# Test the installation.
+RUN \
     python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
     python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
-
-RUN playwright install --with-deps chromium
+
+# Install Playwright browsers.
+RUN --mount=type=cache,target=/tmp/.cache/ms-playwright \
+    PLAYWRIGHT_BROWSERS_PATH=/tmp/.cache/ms-playwright playwright install --no-shell chromium && \
+    mkdir -p /root/.cache && \
+    rm -rf /root/.cache/ms-playwright/ && \
+    cp -R /tmp/.cache/ms-playwright/ /root/.cache/ms-playwright/
 
 COPY deploy/docker/* ${APP_HOME}/
 
@@ -153,4 +176,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
 
 EXPOSE 6379
 CMD ["supervisord" , "-c" , "supervisord.conf" ]
-
+
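
Usage note: the rewritten Dockerfile resolves its install source at build time via FROM ${BUILD_ENV} AS final, and it depends on BuildKit features (--mount=type=cache, COPY --link), so it must be built with BuildKit enabled (the default in Docker 23+, or DOCKER_BUILDKIT=1 on older versions). The rm -f /etc/apt/apt.conf.d/docker-clean line exists because Debian's stock apt configuration would otherwise delete downloaded packages after each install and defeat the apt cache mounts. A minimal sketch of how the build arguments could be exercised (the image tags are illustrative, not from the source):

    # Default build: install crawl4ai from the local checkout (BUILD_ENV=local).
    docker build -t crawl4ai:local .

    # Install from GitHub instead by resolving the final stage to "github".
    docker build --build-arg BUILD_ENV=github --build-arg GITHUB_BRANCH=main -t crawl4ai:github .

    # Pull in the optional ML extras and, on amd64, the CUDA toolkit.
    docker build --build-arg INSTALL_TYPE=all --build-arg ENABLE_GPU=true -t crawl4ai:gpu .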
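The extras-selection one-liner in the INSTALL_TYPE step is terse: it maps the build argument onto pip's optional-extras syntax, so one RUN instruction replaces the old if/elif chain. A standalone shell illustration, with the value assumed for demonstration:

    INSTALL_TYPE=all
    GROUP=$([ "$INSTALL_TYPE" = "default" ] && echo "" || echo "[$INSTALL_TYPE]")
    echo "/tmp/project/$GROUP"    # prints /tmp/project/[all]
    # With INSTALL_TYPE=default, GROUP is empty and plain /tmp/project/ is installed.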