Error while streaming rasters from a private S3 with dask #843
Hello, I'm trying to stream rasters from a private S3 bucket, using rioxarray and dask through a SLURMCluster. I tried running the same code without initializing a SLURMCluster, and I could load the data without any issue. Here is the code:

import os
import boto3
import rasterio
import rioxarray as rxr
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
# Configure S3 credentials
credentials = {
    "AWS_ACCESS_KEY_ID": "...",
    "AWS_SECRET_ACCESS_KEY": "...",
    "AWS_SESSION_TOKEN": "...",
    "AWS_DEFAULT_REGION": "...",
}
# Set environment variables (required)
os.environ['AWS_ACCESS_KEY_ID'] = credentials["AWS_ACCESS_KEY_ID"]
os.environ['AWS_SECRET_ACCESS_KEY'] = credentials["AWS_SECRET_ACCESS_KEY"]
os.environ['AWS_SESSION_TOKEN'] = credentials["AWS_SESSION_TOKEN"]
os.environ['AWS_DEFAULT_REGION']="us-east-1"
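# Note: these environment variables are set in the client process; the SLURM
# workers spawned by dask-jobqueue do not necessarily inherit them (this turns
# out to matter below).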
# Create the dask cluster
cluster = SLURMCluster(
    cores=4,
    memory="50GB",
    walltime="01:00:00",
    job_script_prologue="...",
)
client = Client(cluster)
# Configure the boto3 S3 client (not used in the read path below, but I used it to list files)
ENDPOINT_URL = "https://s3..."
s3_client = boto3.client(
    's3',
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=credentials['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=credentials['AWS_SECRET_ACCESS_KEY'],
    aws_session_token=credentials['AWS_SESSION_TOKEN'],
    region_name='us-east-1',
)
bucket_name = "..."
object_key = "..."
file_name = "..."
s3_file_path = f"zip+s3://{bucket_name}/{object_key}!/{file_name}"
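# "zip+s3://bucket/key!/file" is rasterio's chained-scheme syntax: it maps to
# GDAL's /vsizip/ + /vsis3/ virtual filesystems, so the raster is read directly
# out of the zipped object on S3.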
rio_env = rasterio.Env(
    AWS_S3_ENDPOINT=ENDPOINT_URL,
    AWS_VIRTUAL_HOSTING='FALSE',
    GDAL_HTTP_UNSAFESSL='YES',
    AWS_REGION='us-east-1',
    GDAL_DISABLE_READDIR_ON_OPEN='EMPTY_DIR',
    GDAL_HTTP_COOKIEFILE=os.path.expanduser('~/cookies.txt'),
    GDAL_HTTP_COOKIEJAR=os.path.expanduser('~/cookies.txt'),
)
with rio_env:
    ds = rxr.open_rasterio(s3_file_path)
    print(ds)       # Gives the right metadata
    print(ds.data)  # Displays the actual data array

    ds = rxr.open_rasterio(s3_file_path, chunks=True)
    print(ds)       # Gives the right metadata
    print(ds.data)  # Still shows only the lazy dask array: dask.array<open_rasterio..., shape=(1, 5490, 5490), dtype=int16, chunksize=(1, 5490, 5490), chunktype=numpy.ndarray>
    ds.load()       # Fails here: load() triggers the actual reads on the dask workers

I tried different solutions, without success.
Is there something I'm missing? Maybe a specific configuration parameter? The documentation of the S3 service I'm accessing doesn't provide any help on that matter.
Replies: 1 comment
I found a solution to this problem, using the client.register_worker_callbacks method (https://distributed.dask.org/en/latest/api.html#distributed.Client.register_worker_callbacks). It sets up the right environment for every dask worker. For example:

import os
import rasterio
from dask_jobqueue import SLURMCluster
from distributed import Client
from osgeo import gdal
cluster = SLURMCluster(...)
client = Client(cluster)
credentials = {...}
def configure_workers_environment():
    gdal.SetConfigOption('AWS_REGION', 'eu-central-1')
    gdal.SetConfigOption('AWS_SECRET_ACCESS_KEY', credentials["AWS_SECRET_ACCESS_KEY"])
    gdal.SetConfigOption('AWS_ACCESS_KEY_ID', credentials["AWS_ACCESS_KEY_ID"])
    gdal.SetConfigOption('AWS_SESSION_TOKEN', credentials["AWS_SESSION_TOKEN"])
    rio_env = rasterio.Env(
        AWS_S3_ENDPOINT='...',
        AWS_VIRTUAL_HOSTING='FALSE',
        GDAL_HTTP_UNSAFESSL='YES',
        AWS_REGION='us-east-1',
        GDAL_DISABLE_READDIR_ON_OPEN='EMPTY_DIR',
        GDAL_HTTP_COOKIEFILE=os.path.expanduser('~/cookies.txt'),
        GDAL_HTTP_COOKIEJAR=os.path.expanduser('~/cookies.txt'),
    )
    # Enter the Env and deliberately never exit it, so the GDAL configuration
    # stays active for the lifetime of the worker process.
    rio_env.__enter__()
    return None
client.register_worker_callbacks(configure_workers_environment)
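A quick way to confirm that the callback actually ran everywhere is client.run, which executes a function on every worker and returns the results keyed by worker address. A minimal sketch, assuming the cluster and client above:

from osgeo import gdal

def check_worker_config():
    # Each worker reports the AWS region currently set in its GDAL configuration.
    return gdal.GetConfigOption('AWS_REGION')

print(client.run(check_worker_config))  # e.g. {'tcp://...': 'eu-central-1', ...}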