diff --git a/Dockerfile b/Dockerfile
index e53efcbd..7c43e743 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,53 +1,63 @@
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS DEPENDENCIES
+# Base image with CUDA support
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS base

-ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py"
-ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth"
-
-ENV FORCE_CUDA="1"
-ENV MMCV_WITH_OPS=1
+# Set environment variables
+ENV FORCE_CUDA="1" \
+    MMCV_WITH_OPS=1 \
+    DEBIAN_FRONTEND=noninteractive

+# Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3-pip \
+    python3-pip \
     libgl1-mesa-glx \
-    libsm6 \
-    libxext6 \
-    libxrender-dev \
-    libglib2.0-0 \
-    git \
-    python3-dev \
-    python3-wheel \
-    curl
-
-# Uncomment the following if you want to download a specific set of weights
-# RUN mkdir weights
-# RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT
-
-RUN pip3 install --upgrade pip \
-    && pip3 install wheel \
-    && pip3 install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 \
-    && pip3 install \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libglib2.0-0 \
+    git \
+    python3-dev \
+    python3-wheel \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+FROM base AS python_deps
+
+RUN pip3 install --upgrade pip wheel \
+    && pip3 install --no-cache-dir torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 \
+    && pip3 install --no-cache-dir \
     gradio==4.16.0 \
     opencv-python==4.9.0.80 \
     supervision \
     mmengine==0.10.4 \
     setuptools \
     openmim \
+    onnx \
+    onnxsim \
     && mim install mmcv==2.1.0 \
     && mim install mmdet==3.3.0 \
-    && pip install git+https://github.com/onuralpszr/mmyolo.git
+    && pip3 install --no-cache-dir git+https://github.com/onuralpszr/mmyolo.git
+
+# Clone and install YOLO-World
+FROM python_deps AS yolo_world

-FROM DEPENDENCIES as INSTALLING_YOLO
-RUN git clone --recursive https://github.com/tim-win/YOLO-World /yolo/
-#COPY . /yolo
+RUN git clone --recursive https://github.com/AILab-CVC/YOLO-World /yolo/

 WORKDIR /yolo
 RUN pip3 install -e .[demo]
-RUN pip3 install onnx onnxsim

+# Final stage
+FROM yolo_world AS final
+
+ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py"
+ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth"

-FROM INSTALLING_YOLO as OK_THIS_PART_IS_TRICKY_DONT_HATE
+# Create weights directory and set permissions
+RUN mkdir /weights/ \
+    && chmod a+rwx /yolo/configs/*/*

-RUN mkdir /weights/
-RUN chmod a+rwx /yolo/configs/*/*
+# Optionally download weights (commented out by default)
+# RUN curl -o /weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT

-CMD [ "bash" ]
+# Set the default command
+CMD ["bash"]
\ No newline at end of file
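For reference while reviewing the stage split above, a minimal sketch of building the image by hand; the stage names (`base`, `python_deps`, `yolo_world`, `final`), the `yolo-demo:latest` tag, and the MODEL/WEIGHT pair are taken from this Dockerfile and from the `seg-l` entry in `build_and_run.sh` below, which wraps the same commands (the intermediate tag name is only an example):

```bash
# Sketch: manual build of the multi-stage image (build_and_run.sh normally does this).
# MODEL/WEIGHT values below are the seg-l entry from build_and_run.sh.
docker build -f ./Dockerfile \
    --build-arg="MODEL=yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py" \
    --build-arg="WEIGHT=yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth" \
    -t "yolo-demo:latest" .

# Stop at an intermediate stage, e.g. to reuse the heavy Python dependency layer:
docker build -f ./Dockerfile --target python_deps -t yolo-demo:python-deps .
```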
diff --git a/README.md b/README.md
index d89e9a4f..c213924d 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ We recommend that everyone **use English to communicate on issues**, as this hel
 For business licensing and other related inquiries, don't hesitate to contact `yixiaoge@tencent.com`.

 ## 🔥 Updates
+`[2024-8-31]`: Segmentation demo added to the demo/ folder. Try it out in Docker with `./build_and_run.sh seg-l`!
 `[2024-7-8]`: YOLO-World now has been integrated into [ComfyUI](https://github.com/StevenGrove/ComfyUI-YOLOWorld)! Come and try adding YOLO-World to your workflow now! You can access it at [StevenGrove/ComfyUI-YOLOWorld](https://github.com/StevenGrove/ComfyUI-YOLOWorld)!
 `[2024-5-18]:` YOLO-World models have been [integrated with the FiftyOne computer vision toolkit](https://docs.voxel51.com/integrations/ultralytics.html#open-vocabulary-detection) for streamlined open-vocabulary inference across image and video datasets.
 `[2024-5-16]:` Hey guys! Long time no see! This update contains (1) [fine-tuning guide](https://github.com/AILab-CVC/YOLO-World?#highlights--introduction) and (2) [TFLite Export](./docs/tflite_deploy.md) with INT8 Quantization.
diff --git a/build_and_run.sh b/build_and_run.sh
index edb73942..2ee55e8d 100755
--- a/build_and_run.sh
+++ b/build_and_run.sh
@@ -1,8 +1,33 @@
 #!/usr/bin/env bash
+
+# Exit immediately if a command exits with a non-zero status.
 set -e

-MODEL_DIR="../models/models-yoloworld"
+# Set MODEL_DIR if not already set in the environment
+: "${MODEL_DIR:="../models/models-yoloworld"}"
+
+# DocString for the script
+: '
+This script builds and runs a Docker container for YOLO-World demos.
+It supports various pre-trained models and configurations for object detection and segmentation.
+
+Usage:
+  ./build_and_run.sh <model_key>
+
+Environment Variables:
+  MODEL_DIR: Path to the directory containing model weights (default: "../models/models-yoloworld")
+Arguments:
+  <model_key>: Key for the desired model configuration (see available keys below)
+
+Available model keys:
+  seg-l, seg-l-seghead, seg-m, seg-m-seghead,
+  pretrain-l-clip-800ft, pretrain-l-clip, pretrain-l-1280ft, pretrain-l,
+  pretrain-m-1280ft, pretrain-m, pretrain-s-1280ft, pretrain-s,
+  pretrain-x-cc3mlite, pretrain-x-1280ft
+'
+
+# Define associative array for model configurations
 declare -A models
 models["seg-l"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth"
 models["seg-l-seghead"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-5a642d30.pth"
@@ -19,33 +44,52 @@ models["pretrain-s"]="yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_
 models["pretrain-x-cc3mlite"]="yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth"
 models["pretrain-x-1280ft"]="yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth"

-if [ $# -eq 0 ]; then
+# Function to display usage information
+show_usage() {
+    echo "Usage: $0 <model_key>"
     echo "Available model keys:"
     for key in "${!models[@]}"; do
         echo "  $key"
     done
-    echo "Usage: $0 <model_key>"
+}
+
+# Check if a model key is provided
+if [ $# -eq 0 ]; then
+    show_usage
     exit 1
 fi

 model_key=$1

+# Validate the model key
 if [ -z "${models[$model_key]}" ]; then
-    echo "Invalid model key. Available keys are:"
-    for key in "${!models[@]}"; do
-        echo "  $key"
-    done
+    echo "Invalid model key."
+    show_usage
     exit 1
 fi

-read MODEL WEIGHT <<< "${models[$model_key]}"
+# Extract model and weight information
+read -r MODEL WEIGHT <<< "${models[$model_key]}"

+# Set configuration directory and demo file based on model type
 config_dir="configs/pretrain"
-demo_file=demo/gradio_demo.py
+demo_file="demo/gradio_demo.py"
 if [[ $model_key == seg-* ]]; then
-    export config_dir="configs/segmentation"
-    export demo_file="demo/segmentation_demo.py"
+    config_dir="configs/segmentation"
+    demo_file="demo/segmentation_demo.py"
 fi

-# docker build -f ./Dockerfile --build-arg="MODEL=$MODEL" --build-arg="WEIGHT=$WEIGHT" -t "yolo-demo:latest" . && \
-docker run -it -v "$(readlink -f $MODEL_DIR):/weights/" --runtime nvidia -p 8080:8080 "yolo-demo:latest" python3 $demo_file "$config_dir/$MODEL" "/weights/$WEIGHT"
\ No newline at end of file
+# Build Docker image and run container
+echo "Building Docker image..."
+docker build -f ./Dockerfile --no-cache \
+    --build-arg="MODEL=$MODEL" \
+    --build-arg="WEIGHT=$WEIGHT" \
+    -t "yolo-demo:latest" .
+
+echo "Running Docker container..."
+docker run -it \
+    -v "$(readlink -f "$MODEL_DIR"):/weights/" \
+    --runtime nvidia \
+    -p 8080:8080 \
+    "yolo-demo:latest" \
+    python3 "$demo_file" "$config_dir/$MODEL" "/weights/$WEIGHT"
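To make the lookup in the script above concrete, a small self-contained sketch of how a key resolves to a config/checkpoint pair; the `seg-l` values are copied verbatim from the `models` table in `build_and_run.sh`:

```bash
#!/usr/bin/env bash
# Sketch: how build_and_run.sh turns a model key into MODEL and WEIGHT.
declare -A models
models["seg-l"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth"

model_key="seg-l"
# The value holds two whitespace-separated fields: config file, then checkpoint.
read -r MODEL WEIGHT <<< "${models[$model_key]}"

echo "config:  $MODEL"   # yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py
echo "weights: $WEIGHT"  # yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth

# seg-* keys switch to the segmentation config directory and demo script.
if [[ $model_key == seg-* ]]; then
    echo "configs/segmentation + demo/segmentation_demo.py"
fi
```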
diff --git a/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py
index 32fcc51c..714e1492 100644
--- a/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py
+++ b/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py
@@ -18,7 +18,6 @@ weight_decay = 0.05
 train_batch_size_per_gpu = 16
 load_from = 'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth'

-# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
 text_model_name = 'openai/clip-vit-base-patch32'
 persistent_workers = False
diff --git a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
index 1c34f3a4..630f5710 100644
--- a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
+++ b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
@@ -16,7 +16,6 @@ weight_decay = 0.025
 train_batch_size_per_gpu = 4
 load_from = "pretrained_models/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth"

-# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
 text_model_name = 'openai/clip-vit-base-patch32'
 img_scale = (1280, 1280)
diff --git a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
index cb8beec0..5a770bce 100644
--- a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
+++ b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
@@ -15,7 +15,6 @@ base_lr = 2e-3
 weight_decay = 0.05 / 2
 train_batch_size_per_gpu = 16

-# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
 text_model_name = 'openai/clip-vit-base-patch32'
 # model settings
 model = dict(
diff --git a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py
index 70b19b28..197289bb 100644
--- a/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py
+++ b/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py
@@ -15,7 +15,6 @@ base_lr = 2e-3
 weight_decay = 0.05 / 2
 train_batch_size_per_gpu = 16

-# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
 text_model_name = 'openai/clip-vit-base-patch32'
 # model settings
 model = dict(
diff --git a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py
index a2ba421e..4d8ff3aa 100644
--- a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py
+++ b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py
@@ -15,7 +15,6 @@ base_lr = 2e-3
 weight_decay = 0.05 / 2
 train_batch_size_per_gpu = 16

-# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
 text_model_name = 'openai/clip-vit-base-patch32'
 # model settings
 model = dict(
diff --git a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
index 40c2e5c1..35050ecc 100644
--- a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
+++ b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py
@@ -15,7 +15,6 @@ base_lr = 2e-3
 weight_decay = 0.05 / 2
 train_batch_size_per_gpu = 16

-# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
 text_model_name = 'openai/clip-vit-base-patch32'
 img_scale = (1280, 1280)
diff --git a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
index e3c1226d..92afae3b 100644
--- a/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
+++ b/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
@@ -15,7 +15,6 @@ base_lr = 2e-3
 weight_decay = 0.05 / 2
 train_batch_size_per_gpu = 16

-# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
 text_model_name = 'openai/clip-vit-base-patch32'
 # model settings
 model = dict(
diff --git a/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py b/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py
index 062c9e31..d2006659 100644
--- a/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py
+++ b/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py
@@ -16,7 +16,6 @@ weight_decay = 0.05
 train_batch_size_per_gpu = 8
 load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'

-# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
 text_model_name = 'openai/clip-vit-base-patch32'
 persistent_workers = False
diff --git a/demo/README.md b/demo/README.md
index c6f607c5..9fe600d9 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -19,11 +19,24 @@ pip install gradio==4.16.0
 python demo/demo.py path/to/config path/to/weights
 ```

-Additionaly, you can use a Dockerfile to build an image with gradio. As a prerequisite, make sure you have respective drivers installed alongside [nvidia-container-runtime](https://stackoverflow.com/questions/59691207/docker-build-with-nvidia-runtime). Replace MODEL_NAME and WEIGHT_NAME with the respective values or ommit this and use default values from the [Dockerfile](Dockerfile#3)
+Additionally, you can use our Docker build system for an easier setup:

 ```bash
-docker build --build-arg="MODEL=MODEL_NAME" --build-arg="WEIGHT=WEIGHT_NAME" -t yolo_demo .
-docker run --runtime nvidia -p 8080:8080
+./build_and_run.sh <model_key>
+```
+
+Available model keys include:
+- seg-l, seg-l-seghead, seg-m, seg-m-seghead
+- pretrain-l-clip-800ft, pretrain-l-clip, pretrain-l-1280ft, pretrain-l
+- pretrain-m-1280ft, pretrain-m, pretrain-s-1280ft, pretrain-s
+- pretrain-x-cc3mlite, pretrain-x-1280ft
+
+This script will build the Docker image and run the container with the specified model configuration. The Gradio interface will be accessible at `http://localhost:8080`.
+
+You can also customize the model weights directory by setting the `MODEL_DIR` environment variable:
+
+```bash
+MODEL_DIR=/path/to/your/weights ./build_and_run.sh <model_key>
 ```

 #### Image Demo
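As a quick sanity check alongside the demo instructions above, a hedged sketch of an end-to-end run; it assumes the `yolo-demo:latest` image has already been built by `build_and_run.sh`, the NVIDIA container runtime is installed, and the weights directory path is only an example:

```bash
# Sketch: verify the NVIDIA container runtime is usable, then launch a demo.
# Assumes yolo-demo:latest already exists (build_and_run.sh builds it).
docker run --rm --runtime nvidia yolo-demo:latest nvidia-smi

# Point MODEL_DIR at your local checkpoint directory (example path) and pick a key:
MODEL_DIR=/path/to/your/weights ./build_and_run.sh seg-l
# The Gradio UI is then served at http://localhost:8080
```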