
Commit 641ecd5

feat(_put_file): dynamically adjust chunksize based on file size (#990)
Co-authored-by: Martin Durant <[email protected]>
1 parent 291fef9 · commit 641ecd5
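
Context for the change: S3 multipart uploads allow at most 10,000 parts per object, so the previous fixed 50 MiB chunksize capped uploads at roughly 488 GiB, well below S3's 5 TiB object size limit. The arithmetic, as a standalone sketch (the limits are documented AWS constraints; this snippet is illustrative and not part of the diff):

    # S3 multipart uploads allow at most 10,000 parts per object
    MAX_PARTS = 10_000
    FIXED_CHUNK = 50 * 2**20            # old fixed chunksize: 50 MiB
    print(FIXED_CHUNK * MAX_PARTS)      # 524288000000 bytes, ~488 GiB ceiling
    # reaching the 5 TiB object limit requires parts of ~524 MiB
    print((5 * 2**40) / MAX_PARTS)      # 549755813.888 bytes per part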

2 files changed: +45 -4 lines


s3fs/core.py

Lines changed: 19 additions & 3 deletions
@@ -3,6 +3,7 @@
 import errno
 import io
 import logging
+import math
 import mimetypes
 import os
 import socket
@@ -69,6 +70,8 @@ def setup_logging(level=None):
     ResponseParserError,
 )
 
+MAX_UPLOAD_PARTS = 10_000  # maximum number of parts for S3 multipart upload
+
 if ClientPayloadError is not None:
     S3_RETRYABLE_ERRORS += (ClientPayloadError,)
 
@@ -174,6 +177,18 @@ def _coalesce_version_id(*args):
     return version_ids.pop()
 
 
+def calculate_chunksize(filesize, chunksize=None, max_parts=MAX_UPLOAD_PARTS) -> int:
+    if chunksize is None:
+        chunksize = 50 * 2**20  # default chunksize set to 50 MiB
+    required_chunks = math.ceil(filesize / chunksize)
+    # increase chunksize to fit within the max_parts limit
+    if required_chunks > max_parts:
+        # S3 supports uploading objects up to 5 TiB in size,
+        # so each chunk can be up to ~524 MiB.
+        chunksize = math.ceil(filesize / max_parts)
+    return chunksize
+
+
 class S3FileSystem(AsyncFileSystem):
     """
     Access S3 as if it were a file system.
@@ -1242,7 +1257,7 @@ async def _put_file(
         lpath,
         rpath,
         callback=_DEFAULT_CALLBACK,
-        chunksize=50 * 2**20,
+        chunksize=None,
         max_concurrency=None,
         mode="overwrite",
         **kwargs,
@@ -1270,6 +1285,7 @@ async def _put_file(
         if content_type is not None:
            kwargs["ContentType"] = content_type
 
+        chunksize = calculate_chunksize(size, chunksize=chunksize)
        with open(lpath, "rb") as f0:
            if size < min(5 * 2**30, 2 * chunksize):
                chunk = f0.read()
@@ -1288,8 +1304,8 @@ async def _put_file(
                     key,
                     mpu,
                     f0,
+                    chunksize,
                     callback=callback,
-                    chunksize=chunksize,
                     max_concurrency=max_concurrency,
                 )
                 parts = [
@@ -1317,8 +1333,8 @@ async def _upload_file_part_concurrent(
         key,
         mpu,
         f0,
+        chunksize,
         callback=_DEFAULT_CALLBACK,
-        chunksize=50 * 2**20,
         max_concurrency=None,
     ):
         max_concurrency = max_concurrency or self.max_concurrency
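
For illustration, a small usage sketch of the new helper around the part-count boundary (this assumes calculate_chunksize exactly as defined in the hunk above; the expected values match the tests added below):

    import math
    from s3fs.core import calculate_chunksize

    MiB, GiB, TiB = 2**20, 2**30, 2**40

    # at or below 10,000 parts, the 50 MiB default is kept unchanged
    assert calculate_chunksize(50 * MiB * 10_000) == 50 * MiB
    # one part past the limit, the chunksize grows just enough to fit
    assert calculate_chunksize(50 * MiB * 10_001) == math.ceil(50 * MiB * 10_001 / 10_000)
    # a 5 TiB object (the S3 maximum) needs ~524 MiB chunks
    assert calculate_chunksize(5 * TiB) == 549_755_814
    # an explicit chunksize is respected while it fits within 10,000 parts
    assert calculate_chunksize(5 * GiB, chunksize=10 * MiB) == 10 * MiB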

s3fs/tests/test_s3fs.py

Lines changed: 26 additions & 1 deletion
@@ -20,7 +20,7 @@
 
 import botocore
 import s3fs.core
-from s3fs.core import S3FileSystem
+from s3fs.core import MAX_UPLOAD_PARTS, S3FileSystem, calculate_chunksize
 from s3fs.utils import ignoring, SSEParams
 from botocore.exceptions import NoCredentialsError
 from fsspec.asyn import sync
@@ -2995,6 +2995,31 @@ def test_bucket_info(s3):
     assert info["name"] == test_bucket_name
 
 
+MB = 2**20
+GB = 2**30
+TB = 2**40
+
+
+@pytest.mark.parametrize(
+    ["filesize", "chunksize", "expected"],
+    [
+        # small file, use default chunksize
+        (1000, None, 50 * MB),
+        # exact boundary, use default chunksize
+        (50 * MB * MAX_UPLOAD_PARTS, None, 50 * MB),
+        # file requiring increased chunksize
+        (50 * MB * (MAX_UPLOAD_PARTS + 1), None, 52_434_043),
+        # very large files, expect increased chunksize
+        (1 * TB, None, 109_951_163),
+        (5 * TB, None, 549_755_814),
+        # respect explicit chunksize
+        (5 * GB, 10 * MB, 10 * MB),
+    ],
+)
+def test_calculate_chunksize(filesize, chunksize, expected):
+    assert calculate_chunksize(filesize, chunksize) == expected
+
+
 def test_find_ls_fail(s3):
     # because of https://github.com/fsspec/s3fs/pull/989
     client = get_boto3_client()
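
The hard-coded expectations in the parametrization above follow directly from math.ceil(filesize / MAX_UPLOAD_PARTS); a quick standalone check of the two large-file cases:

    import math

    MAX_UPLOAD_PARTS = 10_000
    TB = 2**40
    print(math.ceil(1 * TB / MAX_UPLOAD_PARTS))  # 109951163
    print(math.ceil(5 * TB / MAX_UPLOAD_PARTS))  # 549755814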
