SpatialBench Benchmark #36

Workflow file for this run

.github/workflows/benchmark.yml at ae57d71

	name: SpatialBench Benchmark

	on:
	  # Run twice a month, on the 1st and 15th, at 06:00 UTC.
	  # The former expression '0 6 1-7,15-21 * 1' restricted both day-of-month and
	  # day-of-week. Cron treats two restricted day fields as "either one matches",
	  # so it fired on days 1-7 and 15-21 as well as on every Monday, not on every
	  # other Monday. Plain cron cannot express "every other Monday" without an
	  # additional date check inside a job.
	  schedule:
	    - cron: '0 6 1,15 * *'
	  # Run on PRs that modify benchmark code
	  pull_request:
	    paths:
	      - 'benchmark/**'
	      - 'spatialbench-queries/**'
	      - '.github/workflows/benchmark.yml'
	  # Run on pushes to main that modify benchmark code
	  push:
	    branches: ["main"]
	    paths:
	      - 'benchmark/**'
	      - 'spatialbench-queries/**'
	      - '.github/workflows/benchmark.yml'
	  # Allow manual triggering with extended options
	  workflow_dispatch:
	    inputs:
	      scale_factor:
	        description: 'Scale factor for benchmark'
	        required: false
	        default: '1'
	        type: choice
	        options:
	          - '0.1'
	          - '1'
	          - '10'
	      engines:
	        description: 'Engines to benchmark (comma-separated)'
	        required: false
	        default: 'duckdb,geopandas,sedonadb,spatial_polars'
	        type: string
	      timeout:
	        description: 'Query timeout in seconds (default: 60, increase for full benchmark)'
	        required: false
	        default: '60'
	        type: string
	      sedonadb_version:
	        description: 'SedonaDB version (e.g., 1.0.0, leave empty for latest)'
	        required: false
	        default: ''
	        type: string
	      duckdb_version:
	        description: 'DuckDB version (e.g., 1.0.0, leave empty for latest)'
	        required: false
	        default: ''
	        type: string
	      geopandas_version:
	        description: 'GeoPandas version (e.g., 1.0.0, leave empty for latest)'
	        required: false
	        default: ''
	        type: string
	      spatial_polars_version:
	        description: 'Spatial Polars version (e.g., 1.0.0, leave empty for latest)'
	        required: false
	        default: ''
	        type: string
	      runs:
	        description: 'Number of runs per query (average taken for fair comparison)'
	        required: false
	        default: '3'
	        type: choice
	        options:
	          - '1'
	          - '3'
	          - '5'
	      sedonadb_nightly:
	        description: 'Use SedonaDB nightly build from Gemfury (ignores version if true)'
	        required: false
	        default: true
	        type: boolean
	      duckdb_nightly:
	        description: 'Use DuckDB pre-release/nightly build (ignores version if true)'
	        required: false
	        default: true
	        type: boolean

	# One benchmark run per repository and ref at a time; starting a new run
	# cancels the run that is still in progress for the same group.
	concurrency:
	  group: "${{ github.repository }}-${{ github.ref }}-benchmark"
	  cancel-in-progress: true

	# Workflow-wide environment, exported to every step of every job.
	# Each `||` fallback is used when the matching workflow_dispatch input is
	# empty or absent.
	env:
	  CARGO_TERM_COLOR: always
	  SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '1' }}
	  BENCHMARK_ENGINES: ${{ github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars' }}
	  QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '60' }}
	  BENCHMARK_RUNS: ${{ github.event.inputs.runs || '3' }}
	  # Package versions (empty = latest, can be overridden via workflow_dispatch)
	  SEDONADB_VERSION: ${{ github.event.inputs.sedonadb_version }}
	  DUCKDB_VERSION: ${{ github.event.inputs.duckdb_version }}
	  GEOPANDAS_VERSION: ${{ github.event.inputs.geopandas_version }}
	  SPATIAL_POLARS_VERSION: ${{ github.event.inputs.spatial_polars_version }}
	  # Nightly build options (default: true)
	  SEDONADB_NIGHTLY: ${{ github.event.inputs.sedonadb_nightly || 'true' }}
	  DUCKDB_NIGHTLY: ${{ github.event.inputs.duckdb_nightly || 'true' }}
	  # Hugging Face dataset for benchmark data
	  HF_DATASET: apache-sedona/spatialbench
	  HF_DATA_VERSION: v0.1.0

	jobs:
	# Download benchmark data from Hugging Face, or reuse the copy stored in the
	# Actions cache under the same data-version / scale-factor key.
	download-data:
	  name: Download Data (SF${{ github.event.inputs.scale_factor || '1' }})
	  runs-on: ubuntu-latest
	  steps:
	    - uses: actions/checkout@v4

	    - name: Cache benchmark data
	      id: cache-data
	      uses: actions/cache@v4
	      with:
	        path: benchmark-data-sf${{ env.SCALE_FACTOR }}
	        key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}

	    # The three steps below only run when the cache did not already hold the data.
	    - name: Setup Python
	      if: steps.cache-data.outputs.cache-hit != 'true'
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.11'

	    - name: Install huggingface-hub
	      if: steps.cache-data.outputs.cache-hit != 'true'
	      run: pip install huggingface-hub

	    - name: Download benchmark data from Hugging Face
	      if: steps.cache-data.outputs.cache-hit != 'true'
	      run: |
	        # SCALE_FACTOR, HF_DATASET and HF_DATA_VERSION are exported by the
	        # workflow-level env block, so they are read as ordinary variables
	        # instead of being spliced into the script text.
	        # The HF folder name is always "sf<scale factor>" (e.g. sf0.1, sf1, sf10).
	        HF_SF="sf${SCALE_FACTOR}"
	        DATA_DIR="benchmark-data-sf${SCALE_FACTOR}"

	        echo "Downloading data from HF: ${HF_DATASET}/${HF_DATA_VERSION}/${HF_SF}"

	        python -c "
	        import os
	        from huggingface_hub import snapshot_download

	        version = os.environ['HF_DATA_VERSION']
	        hf_sf = 'sf' + os.environ['SCALE_FACTOR']

	        # Fetch only the folder for the requested data version and scale factor.
	        snapshot_download(
	            repo_id=os.environ['HF_DATASET'],
	            repo_type='dataset',
	            local_dir='hf-data',
	            allow_patterns=[f'{version}/{hf_sf}/**'],
	        )
	        "

	        # Move data to expected location
	        mkdir -p "$DATA_DIR"
	        cp -r "hf-data/${HF_DATA_VERSION}/${HF_SF}"/* "$DATA_DIR"/

	        echo "Downloaded data structure:"
	        find "$DATA_DIR" -type f -name "*.parquet" | head -20
	        echo ""
	        echo "Directory contents:"
	        ls -la "$DATA_DIR"/
	        echo ""
	        echo "Total size:"
	        du -sh "$DATA_DIR"/

	    - name: Show cached data info
	      if: steps.cache-data.outputs.cache-hit == 'true'
	      run: |
	        DATA_DIR="benchmark-data-sf${SCALE_FACTOR}"
	        echo "Using cached benchmark data"
	        echo "Directory contents:"
	        ls -la "$DATA_DIR"/
	        echo ""
	        echo "Total size:"
	        du -sh "$DATA_DIR"/

	# Run the benchmark queries with DuckDB and upload the JSON results.
	# Skipped when the requested engine list does not contain "duckdb".
	benchmark-duckdb:
	  name: Benchmark DuckDB (SF${{ github.event.inputs.scale_factor || '1' }})
	  needs: download-data
	  runs-on: ubuntu-latest
	  if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'duckdb')
	  steps:
	    - uses: actions/checkout@v4

	    - name: Restore benchmark data from cache
	      uses: actions/cache/restore@v4
	      with:
	        path: benchmark-data-sf${{ env.SCALE_FACTOR }}
	        key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
	        fail-on-cache-miss: true

	    - name: Setup Python
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.11'

	    - name: Install dependencies
	      run: |
	        # DUCKDB_NIGHTLY / DUCKDB_VERSION come from the workflow-level env and
	        # are read as shell variables, never spliced into the script text.
	        echo "=== DuckDB Installation Parameters ==="
	        echo "DUCKDB_NIGHTLY: ${DUCKDB_NIGHTLY}"
	        echo "DUCKDB_VERSION: ${DUCKDB_VERSION}"
	        echo "======================================"
	        # Install the helper packages on their own first: --pre applies to every
	        # requirement on the same pip command line, so keeping them separate
	        # prevents pre-release pyarrow/pandas from being selected.
	        pip install pyarrow pandas
	        if [ "$DUCKDB_NIGHTLY" = "true" ]; then
	          # Use --pre to install pre-release dev builds (e.g., 1.4.4.dev48)
	          # Constraint <1.5.0 ensures we get 1.4.x branch dev builds
	          pip install --pre "duckdb<1.5.0"
	        elif [ -n "$DUCKDB_VERSION" ]; then
	          pip install "duckdb==${DUCKDB_VERSION}"
	        else
	          pip install duckdb
	        fi
	        echo "Installed DuckDB version: $(python -c 'import duckdb; print(duckdb.__version__)')"

	    - name: Pre-install DuckDB spatial extension
	      run: |
	        # Dev builds don't have spatial extension in core_nightly, so always use default repo
	        python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial'); print('DuckDB spatial extension installed')"

	    - name: Run DuckDB benchmark
	      run: |
	        python benchmark/run_benchmark.py \
	          --data-dir "benchmark-data-sf${SCALE_FACTOR}" \
	          --engines duckdb \
	          --timeout "$QUERY_TIMEOUT" \
	          --runs "$BENCHMARK_RUNS" \
	          --scale-factor "$SCALE_FACTOR" \
	          --output duckdb_results.json

	    - name: Upload results
	      uses: actions/upload-artifact@v4
	      with:
	        name: duckdb-results-sf${{ env.SCALE_FACTOR }}
	        path: duckdb_results.json
	        retention-days: 30

	# Run the benchmark queries with GeoPandas and upload the JSON results.
	# Skipped when the requested engine list does not contain "geopandas".
	benchmark-geopandas:
	  name: Benchmark GeoPandas (SF${{ github.event.inputs.scale_factor || '1' }})
	  needs: download-data
	  runs-on: ubuntu-latest
	  if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'geopandas')
	  steps:
	    - uses: actions/checkout@v4

	    - name: Restore benchmark data from cache
	      uses: actions/cache/restore@v4
	      with:
	        path: benchmark-data-sf${{ env.SCALE_FACTOR }}
	        key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
	        fail-on-cache-miss: true

	    - name: Setup Python
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.11'

	    - name: Install dependencies
	      run: |
	        # GEOPANDAS_VERSION comes from the workflow-level env; empty means
	        # "install the latest release". It is read as a shell variable so the
	        # free-text dispatch input is never spliced into the script text.
	        if [ -n "$GEOPANDAS_VERSION" ]; then
	          pip install "geopandas==${GEOPANDAS_VERSION}" pandas pyarrow shapely
	        else
	          pip install geopandas pandas pyarrow shapely
	        fi
	        echo "Installed GeoPandas version: $(python -c 'from importlib.metadata import version; print(version("geopandas"))')"

	    - name: Run GeoPandas benchmark
	      run: |
	        python benchmark/run_benchmark.py \
	          --data-dir "benchmark-data-sf${SCALE_FACTOR}" \
	          --engines geopandas \
	          --timeout "$QUERY_TIMEOUT" \
	          --runs "$BENCHMARK_RUNS" \
	          --scale-factor "$SCALE_FACTOR" \
	          --output geopandas_results.json

	    - name: Upload results
	      uses: actions/upload-artifact@v4
	      with:
	        name: geopandas-results-sf${{ env.SCALE_FACTOR }}
	        path: geopandas_results.json
	        retention-days: 30

	# Run the benchmark queries with SedonaDB and upload the JSON results.
	# Skipped when the requested engine list does not contain "sedonadb".
	benchmark-sedonadb:
	  name: Benchmark SedonaDB (SF${{ github.event.inputs.scale_factor || '1' }})
	  needs: download-data
	  runs-on: ubuntu-latest
	  if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'sedonadb')
	  steps:
	    - uses: actions/checkout@v4

	    - name: Restore benchmark data from cache
	      uses: actions/cache/restore@v4
	      with:
	        path: benchmark-data-sf${{ env.SCALE_FACTOR }}
	        key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
	        fail-on-cache-miss: true

	    - name: Setup Python
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.11'

	    - name: Install dependencies
	      run: |
	        # SEDONADB_NIGHTLY / SEDONADB_VERSION come from the workflow-level env
	        # and are read as shell variables, never spliced into the script text.
	        echo "=== SedonaDB Installation Parameters ==="
	        echo "SEDONADB_NIGHTLY: ${SEDONADB_NIGHTLY}"
	        echo "SEDONADB_VERSION: ${SEDONADB_VERSION}"
	        echo "========================================"
	        if [ "$SEDONADB_NIGHTLY" = "true" ]; then
	          # Install the stable third-party packages from PyPI first. --pre and
	          # the index options apply to every requirement on the same pip
	          # command line, so installing them separately keeps pre-release
	          # pandas/pyarrow/pyproj from being selected.
	          # NOTE(review): geopandas is pre-installed on the assumption that the
	          # [geopandas] extra pulls it in - confirm against the package metadata.
	          pip install geopandas pandas pyarrow pyproj
	          # Use Gemfury as primary index and --pre to install nightly alpha builds (e.g., 0.3.0a69)
	          pip install "sedonadb[geopandas]" \
	            --pre \
	            --index-url https://repo.fury.io/sedona-nightlies/ \
	            --extra-index-url https://pypi.org/simple/
	        elif [ -n "$SEDONADB_VERSION" ]; then
	          pip install "sedonadb[geopandas]==${SEDONADB_VERSION}" pandas pyarrow pyproj
	        else
	          pip install "sedonadb[geopandas]" pandas pyarrow pyproj
	        fi
	        echo "Installed SedonaDB version: $(python -c 'from importlib.metadata import version; print(version("sedonadb"))')"

	    - name: Run SedonaDB benchmark
	      run: |
	        python benchmark/run_benchmark.py \
	          --data-dir "benchmark-data-sf${SCALE_FACTOR}" \
	          --engines sedonadb \
	          --timeout "$QUERY_TIMEOUT" \
	          --runs "$BENCHMARK_RUNS" \
	          --scale-factor "$SCALE_FACTOR" \
	          --output sedonadb_results.json

	    - name: Upload results
	      uses: actions/upload-artifact@v4
	      with:
	        name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
	        path: sedonadb_results.json
	        retention-days: 30

	# Run the benchmark queries with Spatial Polars and upload the JSON results.
	# Skipped when the requested engine list does not contain "spatial_polars".
	benchmark-spatial-polars:
	  name: Benchmark Spatial Polars (SF${{ github.event.inputs.scale_factor || '1' }})
	  needs: download-data
	  runs-on: ubuntu-latest
	  if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'spatial_polars')
	  steps:
	    - uses: actions/checkout@v4

	    - name: Restore benchmark data from cache
	      uses: actions/cache/restore@v4
	      with:
	        path: benchmark-data-sf${{ env.SCALE_FACTOR }}
	        key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
	        fail-on-cache-miss: true

	    - name: Setup Python
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.11'

	    - name: Install dependencies
	      run: |
	        # SPATIAL_POLARS_VERSION comes from the workflow-level env; empty means
	        # "install the latest release". It is read as a shell variable so the
	        # free-text dispatch input is never spliced into the script text.
	        if [ -n "$SPATIAL_POLARS_VERSION" ]; then
	          pip install "spatial-polars[knn]==${SPATIAL_POLARS_VERSION}" pyarrow
	        else
	          pip install "spatial-polars[knn]" pyarrow
	        fi
	        echo "Installed Spatial Polars version: $(python -c 'from importlib.metadata import version; print(version("spatial-polars"))')"

	    - name: Run Spatial Polars benchmark
	      run: |
	        python benchmark/run_benchmark.py \
	          --data-dir "benchmark-data-sf${SCALE_FACTOR}" \
	          --engines spatial_polars \
	          --timeout "$QUERY_TIMEOUT" \
	          --runs "$BENCHMARK_RUNS" \
	          --scale-factor "$SCALE_FACTOR" \
	          --output spatial_polars_results.json

	    - name: Upload results
	      uses: actions/upload-artifact@v4
	      with:
	        name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
	        path: spatial_polars_results.json
	        retention-days: 30

	# Collect the per-engine result artifacts and publish a combined Markdown summary.
	# Runs when at least one engine job succeeded, even if others failed or were
	# skipped. !cancelled() replaces always() so a cancelled workflow run (this
	# workflow uses cancel-in-progress) does not still start the summary job.
	summarize-results:
	  name: Summarize Results (SF${{ github.event.inputs.scale_factor || '1' }})
	  needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb, benchmark-spatial-polars]
	  if: ${{ !cancelled() && (needs.benchmark-duckdb.result == 'success' || needs.benchmark-geopandas.result == 'success' || needs.benchmark-sedonadb.result == 'success' || needs.benchmark-spatial-polars.result == 'success') }}
	  runs-on: ubuntu-latest
	  steps:
	    - uses: actions/checkout@v4

	    # Each download is attempted only for an engine job that succeeded and is
	    # allowed to fail without failing the summary job.
	    - name: Download DuckDB results
	      if: needs.benchmark-duckdb.result == 'success'
	      uses: actions/download-artifact@v4
	      with:
	        name: duckdb-results-sf${{ env.SCALE_FACTOR }}
	        path: results
	      continue-on-error: true

	    - name: Download GeoPandas results
	      if: needs.benchmark-geopandas.result == 'success'
	      uses: actions/download-artifact@v4
	      with:
	        name: geopandas-results-sf${{ env.SCALE_FACTOR }}
	        path: results
	      continue-on-error: true

	    - name: Download SedonaDB results
	      if: needs.benchmark-sedonadb.result == 'success'
	      uses: actions/download-artifact@v4
	      with:
	        name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
	        path: results
	      continue-on-error: true

	    - name: Download Spatial Polars results
	      if: needs.benchmark-spatial-polars.result == 'success'
	      uses: actions/download-artifact@v4
	      with:
	        name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
	        path: results
	      continue-on-error: true

	    - name: Setup Python
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.11'

	    - name: Generate summary
	      run: |
	        # QUERY_TIMEOUT and BENCHMARK_RUNS come from the workflow-level env and
	        # are passed as quoted shell variables rather than spliced into the script.
	        python benchmark/summarize_results.py \
	          --results-dir results \
	          --timeout "$QUERY_TIMEOUT" \
	          --runs "$BENCHMARK_RUNS" \
	          --output benchmark_summary.md

	    - name: Display summary
	      run: cat benchmark_summary.md

	    - name: Add summary to job output
	      run: cat benchmark_summary.md >> "$GITHUB_STEP_SUMMARY"

	    - name: Upload combined results
	      uses: actions/upload-artifact@v4
	      with:
	        name: benchmark-summary-sf${{ env.SCALE_FACTOR }}
	        path: |
	          results/
	          benchmark_summary.md
	        retention-days: 90

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

SpatialBench Benchmark #36

Workflow file

SpatialBench Benchmark #36

Uh oh!

Workflow file for this run