
Commit a0d672c

tellet-q, pre-commit-ci[bot], and KShivendu authored
Add benchmark on parallel upload and search (#215)
* Add parallel upload&search workflow
* Introduce new step: search without upsert
* Update tools/upload_parallel_results_postgres.sh
  Co-authored-by: Kumar Shivendu <[email protected]>
* Update tools/run_ci.sh
  Co-authored-by: Kumar Shivendu <[email protected]>
* Fix indent
* Explicit mode check
* Provide CONTAINER_MEM_LIMIT explicitly
* Store parallel results in separate folder

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Kumar Shivendu <[email protected]>
1 parent ea53db4 commit a0d672c

9 files changed: +275 −11 lines changed

.github/workflows/continuous-benchmark.yaml

+87
@@ -283,6 +283,93 @@ jobs:
                 }
               ]
             }
+      env:
+        SLACK_WEBHOOK_URL: ${{ secrets.CI_ALERTS_CHANNEL_WEBHOOK_URL }}
+        SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
+
+  runParallelBenchmark:
+    runs-on: ubuntu-latest
+    needs: [ runLoadTimeBenchmark, runTenantsBenchmark ]
+    if: ${{ always() }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: webfactory/[email protected]
+        with:
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+      - name: Benches
+        id: benches
+        run: |
+          export HCLOUD_TOKEN=${{ secrets.HCLOUD_TOKEN }}
+          export POSTGRES_PASSWORD=${{ secrets.POSTGRES_PASSWORD }}
+          export POSTGRES_HOST=${{ secrets.POSTGRES_HOST }}
+          bash -x tools/setup_ci.sh
+
+          set +e
+
+          # Benchmark parallel search&upload
+
+          export ENGINE_NAME="qdrant-continuous-benchmark"
+          export DATASETS="laion-small-clip"
+          export BENCHMARK_STRATEGY="parallel"
+          export POSTGRES_TABLE="benchmark_parallel_search_upload"
+
+          # Benchmark the dev branch:
+          export QDRANT_VERSION=ghcr/dev
+          timeout 30m bash -x tools/run_ci.sh
+
+          # Benchmark the master branch:
+          export QDRANT_VERSION=docker/master
+          timeout 30m bash -x tools/run_ci.sh
+
+          set -e
+      - name: Fail job if any of the benches failed
+        if: steps.benches.outputs.failed == 'error' || steps.benches.outputs.failed == 'timeout'
+        run: exit 1
+      - name: Send Notification
+        if: failure() || cancelled()
+        uses: slackapi/[email protected]
+        with:
+          payload: |
+            {
+              "text": "CI benchmarks (runTenantsBenchmark) run status: ${{ job.status }}",
+              "blocks": [
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "CI benchmarks (runTenantsBenchmark) failed because of *${{ steps.benches.outputs.failed }}*."
+                  }
+                },
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "Qdrant version: *${{ steps.benches.outputs.qdrant_version }}*."
+                  }
+                },
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "Engine: *${{ steps.benches.outputs.engine_name }}*."
+                  }
+                },
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "Dataset: *${{ steps.benches.outputs.dataset }}*."
+                  }
+                },
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "View the results <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|here>"
+                  }
+                }
+              ]
+            }
       env:
         SLACK_WEBHOOK_URL: ${{ secrets.CI_ALERTS_CHANNEL_WEBHOOK_URL }}
         SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
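The new runParallelBenchmark job follows the same pattern as the existing jobs: the two tools/run_ci.sh runs are wrapped in `set +e` / `set -e` so a failing benchmark does not abort the step, and the follow-up steps rely on `steps.benches.outputs.failed` to fail the job and fill in the Slack message. This commit does not show where that output is written; the sketch below is only a hypothetical illustration of how such an output is typically produced (the helper name and wiring are assumptions, the real logic lives in tools/run_ci.sh / tools/setup_ci.sh):

```bash
# Hypothetical sketch of how steps.benches.outputs.failed could be populated
# inside the Benches step; not part of this commit.
report_failure() {
  # Writing key=value to $GITHUB_OUTPUT exposes it as steps.<step-id>.outputs.<key>
  echo "failed=$1" >> "$GITHUB_OUTPUT"
}

timeout 30m bash -x tools/run_ci.sh
rc=$?
if [[ $rc -eq 124 ]]; then
  report_failure "timeout"   # GNU timeout exits with 124 when the limit is hit
elif [[ $rc -ne 0 ]]; then
  report_failure "error"
fi
```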

engine/base_client/client.py

+5 −3

@@ -1,7 +1,7 @@
 import json
 import os
 from datetime import datetime
-from typing import List
+from typing import List, Optional
 
 from benchmark import ROOT_DIR
 from benchmark.dataset import Dataset
@@ -84,6 +84,7 @@ def run_experiment(
         skip_upload: bool = False,
         skip_search: bool = False,
         skip_if_exists: bool = True,
+        skip_configure: Optional[bool] = False,
     ):
         execution_params = self.configurator.execution_params(
             distance=dataset.config.distance, vector_size=dataset.config.vector_size
@@ -101,8 +102,9 @@ def run_experiment(
             return
 
         if not skip_upload:
-            print("Experiment stage: Configure")
-            self.configurator.configure(dataset)
+            if not skip_configure:
+                print("Experiment stage: Configure")
+                self.configurator.configure(dataset)
 
         print("Experiment stage: Upload")
         upload_stats = self.uploader.upload(
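The new `skip_configure` flag exists because the parallel benchmark runs two clients against one server (see tools/run_experiment.sh below): the collection is created and configured once by an earlier upload pass, so the concurrent upload client must append to it rather than recreate it, while the concurrent search client never reaches the configure stage anyway because it is guarded by `if not skip_upload`. A minimal sketch of the two concurrent invocations, with the flag combinations taken from this commit (engine, dataset, and host values are placeholders):

```bash
# Uploader: writes points into the already-configured collection
python run.py --engines "qdrant-continuous-benchmark" --datasets "laion-small-clip" \
    --host "$SERVER_IP" --no-skip-if-exists --skip-search --skip-configure &

# Searcher: runs queries only; upload (and therefore configure) is skipped
python run.py --engines "qdrant-continuous-benchmark" --datasets "laion-small-clip" \
    --host "$SERVER_IP" --no-skip-if-exists --skip-upload &

wait
```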

run.py

+7 −2

@@ -1,6 +1,6 @@
 import fnmatch
 import traceback
-from typing import List
+from typing import List, Optional
 
 import stopit
 import typer
@@ -23,6 +23,7 @@ def run(
     skip_if_exists: bool = False,
     exit_on_error: bool = True,
     timeout: float = 86400.0,
+    skip_configure: Optional[bool] = False,
 ):
     """
     Example:
@@ -57,7 +58,11 @@ def run(
 
     with stopit.ThreadingTimeout(timeout) as tt:
         client.run_experiment(
-            dataset, skip_upload, skip_search, skip_if_exists
+            dataset,
+            skip_upload,
+            skip_search,
+            skip_if_exists,
+            skip_configure,
         )
         client.delete_client()
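run.py is a Typer CLI, so the new `skip_configure: Optional[bool] = False` parameter should surface as a `--skip-configure / --no-skip-configure` flag pair while defaulting to the old behaviour. Note that `run_experiment` is called with positional arguments, so their order has to stay in sync with the signature in engine/base_client/client.py. A hedged usage example (engine and dataset names are placeholders):

```bash
# Default behaviour (configure, upload, search) is unchanged:
python run.py --engines "qdrant-continuous-benchmark" --datasets "laion-small-clip"

# Reuse an existing collection instead of re-creating it:
python run.py --engines "qdrant-continuous-benchmark" --datasets "laion-small-clip" --skip-configure
```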

tools/run_ci.sh

+11 −1

@@ -39,10 +39,20 @@ else
     # any other strategies are considered to have search & upload results
     export SEARCH_RESULTS_FILE=$(ls -t results/*-search-*.json | head -n 1)
     export UPLOAD_RESULTS_FILE=$(ls -t results/*-upload-*.json | head -n 1)
+
+    if [[ "$BENCHMARK_STRATEGY" == "parallel" ]]; then
+        export PARALLEL_UPLOAD_RESULTS_FILE=$(ls -t results/parallel/*-upload-*.json | head -n 1)
+        export PARALLEL_SEARCH_RESULTS_FILE=$(ls -t results/parallel/*-search-*.json | head -n 1)
+    fi
 fi
 
 export VM_RSS_MEMORY_USAGE_FILE=$(ls -t results/vm-rss-memory-usage-*.txt | head -n 1)
 export RSS_ANON_MEMORY_USAGE_FILE=$(ls -t results/rss-anon-memory-usage-*.txt | head -n 1)
 export ROOT_API_RESPONSE_FILE=$(ls -t results/root-api-*.json | head -n 1)
 
-bash -x "${SCRIPT_PATH}/upload_results_postgres.sh"
+if [[ "$BENCHMARK_STRATEGY" == "parallel" ]]; then
+    bash -x "${SCRIPT_PATH}/upload_parallel_results_postgres.sh"
+else
+    bash -x "${SCRIPT_PATH}/upload_results_postgres.sh"
+fi
+
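As elsewhere in this script, `ls -t ... | head -n 1` exports the newest matching result file. If a parallel run produced nothing under results/parallel/, those variables end up empty and the upload script would receive blank paths; a small guard along these lines (illustrative only, not part of the commit) would make that case fail loudly:

```bash
# Illustrative guard, not in the commit: stop if the expected parallel results are missing
if [[ "$BENCHMARK_STRATEGY" == "parallel" ]]; then
    if [[ -z "${PARALLEL_UPLOAD_RESULTS_FILE:-}" || -z "${PARALLEL_SEARCH_RESULTS_FILE:-}" ]]; then
        echo "No parallel upload/search results found under results/parallel/" >&2
        exit 1
    fi
fi
```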

tools/run_client_script.sh

+15 −1

@@ -3,7 +3,7 @@
 PS4='ts=$(date "+%Y-%m-%dT%H:%M:%SZ") level=DEBUG line=$LINENO file=$BASH_SOURCE '
 set -euo pipefail
 
-# Possible values are: full|upload|search
+# Possible values are: full|upload|search|parallel|snapshot
 EXPERIMENT_MODE=${1:-"full"}
 
 CLOUD_NAME=${CLOUD_NAME:-"hetzner"}
@@ -52,6 +52,7 @@ fi
 
 echo "Gather experiment results..."
 result_files_arr=()
+result_parallel_files_arr=()
 
 if [[ "$EXPERIMENT_MODE" == "full" ]] || [[ "$EXPERIMENT_MODE" == "upload" ]]; then
     UPLOAD_RESULT_FILE=$(ssh "${SERVER_USERNAME}@${IP_OF_THE_CLIENT}" "ls -t results/*-upload-*.json | head -n 1")
@@ -63,9 +64,22 @@ if [[ "$EXPERIMENT_MODE" == "full" ]] || [[ "$EXPERIMENT_MODE" == "search" ]]; t
     result_files_arr+=("$SEARCH_RESULT_FILE")
 fi
 
+if [[ "$EXPERIMENT_MODE" == "parallel" ]]; then
+    UPLOAD_RESULT_FILE=$(ssh "${SERVER_USERNAME}@${IP_OF_THE_CLIENT}" "ls -t results/parallel/*-upload-*.json | head -n 1")
+    result_parallel_files_arr+=("$UPLOAD_RESULT_FILE")
+
+    SEARCH_RESULT_FILE=$(ssh "${SERVER_USERNAME}@${IP_OF_THE_CLIENT}" "ls -t results/parallel/*-search-*.json | head -n 1")
+    result_parallel_files_arr+=("$SEARCH_RESULT_FILE")
+fi
+
 mkdir -p results
+mkdir -p results/parallel
 
 for RESULT_FILE in "${result_files_arr[@]}"; do
     # -p preseves modification time, access time, and modes (but not change time)
     scp -p "${SERVER_USERNAME}@${IP_OF_THE_CLIENT}:~/${RESULT_FILE}" "./results"
 done
+
+for RESULT_FILE in "${result_parallel_files_arr[@]}"; do
+    scp -p "${SERVER_USERNAME}@${IP_OF_THE_CLIENT}:~/${RESULT_FILE}" "./results/parallel"
+done
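The parallel result files are copied into a separate ./results/parallel directory so that the existing `results/*-search-*.json` / `results/*-upload-*.json` globs in tools/run_ci.sh keep matching only the sequential results. One hedged portability note: this script runs under `set -euo pipefail`, and on Bash older than 4.4 expanding an empty array such as `result_parallel_files_arr` trips the `nounset` check; if the client VM ever uses such a shell, a guarded expansion like the sketch below keeps the new loop a no-op when no parallel results were gathered:

```bash
# Portability sketch (not part of the commit): safe expansion of a possibly-empty
# array under `set -u` on bash < 4.4
for RESULT_FILE in ${result_parallel_files_arr[@]+"${result_parallel_files_arr[@]}"}; do
    scp -p "${SERVER_USERNAME}@${IP_OF_THE_CLIENT}:~/${RESULT_FILE}" "./results/parallel"
done
```
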
tools/run_experiment.sh

+32 −1

@@ -29,7 +29,7 @@ if [[ -z "$PRIVATE_IP_OF_THE_SERVER" ]]; then
 fi
 
 if [[ -z "$EXPERIMENT_MODE" ]]; then
-    echo "EXPERIMENT_MODE is not set, possible values are: full | upload | search | snapshot"
+    echo "EXPERIMENT_MODE is not set, possible values are: full | upload | search | snapshot | parallel"
     exit 1
 fi
 
@@ -75,6 +75,37 @@ if [[ "$EXPERIMENT_MODE" == "full" ]] || [[ "$EXPERIMENT_MODE" == "search" ]]; t
 fi
 
 
+if [[ "$EXPERIMENT_MODE" == "parallel" ]]; then
+    echo "EXPERIMENT_MODE=$EXPERIMENT_MODE"
+
+    docker pull qdrant/vector-db-benchmark:latest
+
+    echo "Starting ci-benchmark-upload container"
+    docker run \
+        --rm \
+        --name ci-benchmark-upload \
+        -v "$HOME/results/parallel:/code/results" \
+        qdrant/vector-db-benchmark:latest \
+        python run.py --engines "${ENGINE_NAME}" --datasets "${DATASETS}" --host "${PRIVATE_IP_OF_THE_SERVER}" --no-skip-if-exists --skip-search --skip-configure &
+    UPLOAD_PID=$!
+
+    echo "Starting ci-benchmark-search container"
+    docker run \
+        --rm \
+        --name ci-benchmark-search \
+        -v "$HOME/results/parallel:/code/results" \
+        qdrant/vector-db-benchmark:latest \
+        python run.py --engines "${ENGINE_NAME}" --datasets "${DATASETS}" --host "${PRIVATE_IP_OF_THE_SERVER}" --no-skip-if-exists --skip-upload &
+    SEARCH_PID=$!
+
+    echo "Waiting for both containers to finish"
+    wait $UPLOAD_PID
+    wait $SEARCH_PID
+
+    echo "EXPERIMENT_MODE=$EXPERIMENT_MODE DONE"
+fi
+
+
 if [[ "$EXPERIMENT_MODE" == "snapshot" ]]; then
     echo "EXPERIMENT_MODE=$EXPERIMENT_MODE"
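Backgrounding the two `docker run` commands and `wait`ing on their PIDs is what makes the upload and the search phase overlap; both containers mount the same $HOME/results/parallel directory, which is where run_client_script.sh later picks the results up. `wait <pid>` returns that process's exit status, so if this script runs under `set -e`, a failing upload container ends the experiment before the search container is even reaped. A purely illustrative variant (not part of the commit) that collects both statuses before deciding:

```bash
# Illustrative only: reap both containers, then decide
upload_rc=0
search_rc=0
wait "$UPLOAD_PID" || upload_rc=$?
wait "$SEARCH_PID" || search_rc=$?

if [[ $upload_rc -ne 0 || $search_rc -ne 0 ]]; then
    echo "Parallel experiment failed: upload_rc=$upload_rc search_rc=$search_rc" >&2
    exit 1
fi
```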

tools/run_remote_benchmark.sh

+22 −2

@@ -58,7 +58,7 @@ case "$BENCHMARK_STRATEGY" in
 
         SERVER_CONTAINER_NAME=${SERVER_CONTAINER_NAME:-"qdrant-continuous-benchmarks-with-volume"}
 
-        bash -x "${SCRIPT_PATH}/run_server_container_with_volume.sh" "$SERVER_CONTAINER_NAME"
+        bash -x "${SCRIPT_PATH}/run_server_container_with_volume.sh" "$SERVER_CONTAINER_NAME" "25Gb"
 
         bash -x "${SCRIPT_PATH}/run_client_script.sh" "upload"
 
@@ -74,7 +74,7 @@ case "$BENCHMARK_STRATEGY" in
 
         SERVER_CONTAINER_NAME=${SERVER_CONTAINER_NAME:-"qdrant-continuous-benchmarks-snapshot"}
 
-        bash -x "${SCRIPT_PATH}/run_server_container_with_volume.sh" "$SERVER_CONTAINER_NAME"
+        bash -x "${SCRIPT_PATH}/run_server_container_with_volume.sh" "$SERVER_CONTAINER_NAME" "25Gb"
 
         bash -x "${SCRIPT_PATH}/run_client_script.sh" "snapshot"
 
@@ -85,6 +85,26 @@ case "$BENCHMARK_STRATEGY" in
         bash -x "${SCRIPT_PATH}/qdrant_collect_stats.sh" "$SERVER_CONTAINER_NAME"
         ;;
 
+    "parallel")
+        echo "Parallel benchmark, run upload&search at the same time"
+
+        SERVER_CONTAINER_NAME=${SERVER_CONTAINER_NAME:-"qdrant-continuous-benchmarks-with-volume"}
+
+        bash -x "${SCRIPT_PATH}/run_server_container_with_volume.sh" "$SERVER_CONTAINER_NAME" "25Gb"
+
+        bash -x "${SCRIPT_PATH}/run_client_script.sh" "upload"
+
+        bash -x "${SCRIPT_PATH}/run_server_container_with_volume.sh" "$SERVER_CONTAINER_NAME" "25Gb" "continue"
+
+        bash -x "${SCRIPT_PATH}/run_client_script.sh" "search"
+
+        bash -x "${SCRIPT_PATH}/run_server_container_with_volume.sh" "$SERVER_CONTAINER_NAME" "25Gb" "continue"
+
+        bash -x "${SCRIPT_PATH}/run_client_script.sh" "parallel"
+
+        bash -x "${SCRIPT_PATH}/qdrant_collect_stats.sh" "$SERVER_CONTAINER_NAME"
+        ;;
+
     *)
         echo "Invalid BENCHMARK_STRATEGY value: $BENCHMARK_STRATEGY"
         exit 1
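The new `parallel` case reuses the volume-backed server: a plain upload pass populates the collection, the server is restarted in `continue` mode (which also drops the page cache), a search-only pass runs against the existing data, the server is restarted once more, and only then does the combined upload+search pass run before stats are collected; every server start now passes the `25Gb` memory limit explicitly. In CI this sequence is driven through tools/run_ci.sh with the environment exported by the workflow; reproducing it by hand would look roughly like this (Hetzner and Postgres secrets omitted and assumed to be exported already):

```bash
# Roughly how the CI job invokes this strategy (secrets assumed to be set)
export ENGINE_NAME="qdrant-continuous-benchmark"
export DATASETS="laion-small-clip"
export BENCHMARK_STRATEGY="parallel"
export POSTGRES_TABLE="benchmark_parallel_search_upload"
export QDRANT_VERSION=ghcr/dev      # or docker/master

timeout 30m bash -x tools/run_ci.sh
```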

tools/run_server_container_with_volume.sh

+4 −1

@@ -38,10 +38,13 @@ if [[ ${QDRANT_VERSION} == docker/* ]] || [[ ${QDRANT_VERSION} == ghcr/* ]]; the
 if [[ "$EXECUTION_MODE" == "init" ]]; then
     echo "Initialize qdrant from scratch, with qdrant_storage volume"
     DOCKER_COMPOSE="export QDRANT_VERSION=${QDRANT_VERSION}; export CONTAINER_REGISTRY=${CONTAINER_REGISTRY}; export CONTAINER_MEM_LIMIT=${CONTAINER_MEM_LIMIT}; docker compose down; pkill qdrant; docker rm -f qdrant-continuous || true; docker rmi -f ${CONTAINER_REGISTRY}/qdrant/qdrant:${QDRANT_VERSION} || true; docker volume rm -f qdrant_storage || true; docker compose up -d; docker container ls -a"
-else
+elif [[ "$EXECUTION_MODE" == "continue" ]]; then
     # suggest that volume qdrant_storage exist and start qdrant
     echo "Reload qdrant with existing data"
     DOCKER_COMPOSE="export QDRANT_VERSION=${QDRANT_VERSION}; export CONTAINER_REGISTRY=${CONTAINER_REGISTRY}; export CONTAINER_MEM_LIMIT=${CONTAINER_MEM_LIMIT}; docker compose down; pkill qdrant; docker rm -f qdrant-continuous || true; docker rmi -f ${CONTAINER_REGISTRY}/qdrant/qdrant:${QDRANT_VERSION} || true ; sudo bash -c 'sync; echo 1 > /proc/sys/vm/drop_caches'; docker compose up -d; docker container ls -a"
+else
+    echo "Error: unknown execution mode ${EXECUTION_MODE}. Execution mode should be 'init' or 'continue'"
+    exit 1
 fi
 
 ssh -t -o ServerAliveInterval=60 -o ServerAliveCountMax=3 "${SERVER_USERNAME}@${IP_OF_THE_SERVER}" "cd ./projects/vector-db-benchmark/engine/servers/${CONTAINER_NAME} ; $DOCKER_COMPOSE"
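With the explicit mode check, callers must now pass either `init` or `continue`; previously any other value silently fell through to the `continue` branch. This diff does not show how the positional arguments are parsed, but based on how run_remote_benchmark.sh now calls the script, the top of the file presumably maps them roughly as follows (a sketch only, variable defaults assumed):

```bash
# Assumed argument handling near the top of the script (not shown in this diff)
CONTAINER_NAME=${1:-"qdrant-continuous-benchmarks-with-volume"}
CONTAINER_MEM_LIMIT=${2:-"25Gb"}   # now always passed explicitly by the callers
EXECUTION_MODE=${3:-"init"}        # "init" recreates the volume, "continue" reuses it
```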
