diff --git a/notebooks/advanced/autodask.ipynb b/notebooks/advanced/autodask.ipynb new file mode 100644 index 00000000..2c26b3e2 --- /dev/null +++ b/notebooks/advanced/autodask.ipynb @@ -0,0 +1,95 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import fsspec\n", + "import xarray as xr\n", + "from distributed import Client\n", + "from kerchunk.combine import auto_dask\n", + "from kerchunk.hdf import SingleHdf5ToZarr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = Client(n_workers=8, silence_logs=logging.ERROR)\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initiate fsspec filesystems for reading\n", + "fs_read = fsspec.filesystem(\"s3\", anon=True, skip_instance_cache=True)\n", + "\n", + "files_paths = fs_read.glob(\"s3://smn-ar-wrf/DATA/WRF/DET/2022/12/31/12/*\")\n", + "\n", + "# Here we prepend the prefix 's3://', which points to AWS.\n", + "file_pattern = sorted([\"s3://\" + f for f in files_paths])\n", + "\n", + "# Keep just the \"01H\" data\n", + "file_pattern = file_pattern[0:-5]\n", + "print(len(file_pattern))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mzz = auto_dask(\n", + " urls=file_pattern,\n", + " single_kwargs=dict(\n", + " inline_threshold=300,\n", + " storage_options=dict(\n", + " mode=\"rb\", anon=True, default_fill_cache=False, default_cache_type=\"first\"\n", + " ),\n", + " ),\n", + " single_driver=SingleHdf5ToZarr,\n", + " mzz_kwargs={\"concat_dims\": [\"time\"], \"identical_dims\": [\"y\", \"x\"]},\n", + " n_batches=20,\n", + " remote_protocol=\"s3\",\n", + " remote_options={\"anon\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = xr.open_dataset(\n", + " \"reference://\",\n", + " engine=\"zarr\",\n", + " backend_kwargs={\n", + " \"storage_options\": {\n", + " \"fo\": mzz,\n", + " },\n", + " \"consolidated\": False,\n", + " },\n", + ")\n", + "ds" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}