Skip to content

Commit 57e1d23

Browse files
authored
s3io_benchmark: Add test object generator (#1767)
**What changed and why?** Currently, `s3io_benchmark` requires the S3 object to exist in the bucket (for `read` jobs), which is inconvenient. - Added the ability to automatically generate new test objects during initialization for read jobs. - Uses a separate S3 CRT/Uploader client stack so that it does not influence benchmark jobs. - Didn't use the Rust AWS SDK as it's slower than the CRT - Didn't use the Rust Transfer Manager as it's not GA yet - Added a new config flag `generate_object` with a default value of `true`. - Manually tested generating a 100GB object. - Automatically overwrite the write part size if it would exceed S3's 10,000-part MPU limit - Do not perform the upload if an object with the correct size already exists ### Does this change impact existing behavior? Only the benchmark, which is currently not used anywhere. ### Does this change need a changelog entry? Does it require a version change? No - benchmark script only. --- By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license and I agree to the terms of the [Developer Certificate of Origin (DCO)](https://developercertificate.org/). --------- Signed-off-by: Yerzhan Mazhkenov <[email protected]>
1 parent 6976976 commit 57e1d23

File tree

4 files changed

+152
-5
lines changed

4 files changed

+152
-5
lines changed

mountpoint-s3-fs/examples/s3io_benchmark/config.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ use std::path::Path;
44
use std::time::Duration;
55
use thiserror::Error;
66

7+
use crate::test_object_generator::generate_test_objects;
8+
79
/// Top-level configuration structure
810
#[derive(Debug, Clone, Deserialize, PartialEq)]
911
pub struct Config {
@@ -58,6 +60,9 @@ pub struct JobConfig {
5860
/// If specified then job is time-based instead of reading until total bytes equal to file size.
5961
#[serde(default, with = "humantime_serde")]
6062
pub iteration_duration: Option<Duration>,
63+
/// Whether to generate test objects before running read benchmarks.
64+
/// Only applies to read workloads. Default: true.
65+
pub generate_object: Option<bool>,
6166
}
6267

6368
/// Configuration for a single job execution
@@ -77,6 +82,7 @@ pub struct ResolvedJobConfig {
7782
pub iterations: usize,
7883
pub max_duration: Option<Duration>,
7984
pub iteration_duration: Option<Duration>,
85+
pub generate_object: bool,
8086
}
8187

8288
/// Workload type: read or write
@@ -144,7 +150,7 @@ pub fn parse_config_string(content: &str) -> Result<Config, ConfigError> {
144150
}
145151

146152
/// Prepare jobs by resolving configuration inheritance and validating
147-
pub fn prepare_jobs(config: Config) -> Result<Vec<ResolvedJobConfig>, ConfigError> {
153+
pub async fn prepare_jobs(config: Config) -> Result<Vec<ResolvedJobConfig>, ConfigError> {
148154
// Validate global network interfaces if specified
149155
if let Some(bind) = &config.global.bind {
150156
if bind.is_empty() {
@@ -194,7 +200,9 @@ pub fn prepare_jobs(config: Config) -> Result<Vec<ResolvedJobConfig>, ConfigErro
194200
}
195201
}
196202

197-
// TODO: Generate/upload test objects for read workloads
203+
generate_test_objects(&resolved_jobs, &config.global)
204+
.await
205+
.map_err(|e| ConfigError::Validation(format!("Object generation failed: {}", e)))?;
198206

199207
Ok(resolved_jobs)
200208
}
@@ -254,6 +262,12 @@ fn merge_and_resolve(job_name: &str, job: &JobConfig, global: &GlobalConfig) ->
254262
// iteration_duration: Optional, no default (random read only)
255263
let iteration_duration = job.iteration_duration.or(global.job_defaults.iteration_duration);
256264

265+
// generate_object: Optional with default of true
266+
let generate_object = job
267+
.generate_object
268+
.or(global.job_defaults.generate_object)
269+
.unwrap_or(true);
270+
257271
Ok(ResolvedJobConfig {
258272
name: job_name.to_string(),
259273
workload_type,
@@ -268,5 +282,6 @@ fn merge_and_resolve(job_name: &str, job: &JobConfig, global: &GlobalConfig) ->
268282
iterations,
269283
max_duration,
270284
iteration_duration,
285+
generate_object,
271286
})
272287
}

mountpoint-s3-fs/examples/s3io_benchmark/executor.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ pub enum ExecutionError {
3535
}
3636

3737
/// Client stack shared by benchmark jobs: a CRT-based S3 client plus the
/// uploader and prefetcher built on top of it.
pub struct Executor {
    /// Underlying CRT S3 client. Public so the test object generator can
    /// issue HeadObject requests through the same stack.
    pub client: S3CrtClient,
    /// Uploader built on `client`. Public so the test object generator can
    /// create missing test objects before read jobs run.
    pub uploader: Uploader<S3CrtClient>,
    prefetcher: Prefetcher<S3CrtClient>,
}
4242

mountpoint-s3-fs/examples/s3io_benchmark/main.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ mod config;
22
mod executor;
33
mod monitoring;
44
mod results;
5+
mod test_object_generator;
56

67
use anyhow::{Context, Result};
78
use clap::Parser;
@@ -45,7 +46,7 @@ async fn run_benchmark() -> Result<()> {
4546
let config = parse_config_file(&cli.config_file).context("Failed to load configuration file")?;
4647

4748
eprintln!("Preparing and validating jobs...");
48-
let resolved_jobs = prepare_jobs(config.clone()).context("Failed to prepare jobs")?;
49+
let resolved_jobs = prepare_jobs(config.clone()).await.context("Failed to prepare jobs")?;
4950

5051
eprintln!("Found {} job(s) to execute", resolved_jobs.len());
5152

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
use mountpoint_s3_client::ObjectClient;
2+
use mountpoint_s3_client::types::HeadObjectParams;
3+
use thiserror::Error;
4+
5+
use crate::config::{GlobalConfig, ResolvedJobConfig, WorkloadType};
6+
use crate::executor::Executor;
7+
8+
/// S3 multipart upload has a hard limit of 10,000 parts per upload
/// https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
const MAX_PARTS: u64 = 10_000;

/// Errors that can occur while generating benchmark test objects.
#[derive(Debug, Error)]
pub enum ObjectGenerationError {
    /// Building the dedicated S3 client/uploader stack failed.
    #[error("Setup failed: {0}")]
    Setup(String),
    /// Uploading a specific test object failed; `reason` describes which
    /// phase of the upload (start, write, or complete) went wrong.
    #[error("Upload failed for '{key}': {reason}")]
    Upload { key: String, reason: String },
}
20+
// Note: This intentionally creates a separate Executor instance to ensure
// test object generation doesn't influence benchmark jobs
/// Ensure a test object exists in S3 for every read job that has
/// `generate_object` enabled, uploading one if it is missing or has the
/// wrong size.
///
/// If any job's `object_size` would require more than [`MAX_PARTS`] parts
/// at the configured `write_part_size`, the part size is bumped (on a clone
/// of `global`, so the caller's config is untouched) before building the
/// uploader.
///
/// # Errors
///
/// Returns [`ObjectGenerationError::Setup`] if the executor cannot be
/// created, or [`ObjectGenerationError::Upload`] if an upload fails.
pub async fn generate_test_objects(
    jobs: &[ResolvedJobConfig],
    global: &GlobalConfig,
) -> Result<(), ObjectGenerationError> {
    // Only read jobs that opted into generation need objects; write jobs
    // create their own data.
    let jobs_requiring_generation: Vec<&ResolvedJobConfig> = jobs
        .iter()
        .filter(|job| job.workload_type == WorkloadType::Read && job.generate_object)
        .collect();

    if jobs_requiring_generation.is_empty() {
        return Ok(());
    }

    // Override write_part_size if any job would exceed S3's 10,000 part limit
    let mut adjusted_global = global.clone();
    let max_object_size = jobs_requiring_generation
        .iter()
        .map(|job| job.object_size)
        .max()
        .unwrap_or(0);
    // 8 MiB fallback when no write_part_size is configured.
    let default_write_part_size = global.write_part_size.unwrap_or(8 * 1024 * 1024) as u64;
    // Smallest part size that fits max_object_size in MAX_PARTS parts.
    let min_required_part_size = max_object_size.div_ceil(MAX_PARTS);
    if min_required_part_size > default_write_part_size {
        adjusted_global.write_part_size = Some(min_required_part_size as usize);
        eprintln!(
            "Test Object Generator: Adjusted write_part_size from {} to {} bytes to stay within S3's 10,000 part limit for object size {} bytes",
            default_write_part_size, min_required_part_size, max_object_size
        );
    }

    let executor = Executor::new(&adjusted_global).map_err(|e| ObjectGenerationError::Setup(e.to_string()))?;
    for job in jobs_requiring_generation {
        // Skip generation if object already exists with correct size
        match executor
            .client
            .head_object(&job.bucket, &job.object_key, &HeadObjectParams::new())
            .await
        {
            Ok(head_result) => {
                if head_result.size == job.object_size {
                    eprintln!(
                        "Test object for job '{}' already exists with correct size: key={}, size={} bytes",
                        job.name, job.object_key, job.object_size
                    );
                    continue;
                } else {
                    eprintln!(
                        "Test object for job '{}' exists but has wrong size (expected: {}, actual: {}), re-uploading: key={}",
                        job.name, job.object_size, head_result.size, job.object_key
                    );
                }
            }
            // NOTE(review): any HeadObject error (not just 404 — e.g. access
            // denied or throttling) is treated as "object missing" and
            // triggers an upload attempt; confirm this is intended.
            Err(_) => {
                eprintln!(
                    "Test object for job '{}' does not exist, uploading: key={}",
                    job.name, job.object_key
                );
            }
        }

        upload_test_object(&executor, &job.bucket, &job.object_key, job.object_size, job.write_size).await?;
        eprintln!(
            "Generated test object for job '{}': key={}, size={} bytes",
            job.name, job.object_key, job.object_size
        );
    }

    Ok(())
}
91+
92+
async fn upload_test_object(
93+
executor: &Executor,
94+
bucket: &str,
95+
key: &str,
96+
size: u64,
97+
write_size: usize,
98+
) -> Result<(), ObjectGenerationError> {
99+
let mut request = executor
100+
.uploader
101+
.start_atomic_upload(bucket.to_string(), key.to_string())
102+
.map_err(|e| ObjectGenerationError::Upload {
103+
key: key.to_string(),
104+
reason: format!("Failed to start upload: {}", e),
105+
})?;
106+
107+
let contents = vec![0xab; write_size];
108+
let mut offset = 0u64;
109+
110+
while offset < size {
111+
let remaining = size - offset;
112+
let chunk_size = remaining.min(write_size as u64) as usize;
113+
114+
let bytes_written = request
115+
.write(offset as i64, &contents[..chunk_size])
116+
.await
117+
.map_err(|e| ObjectGenerationError::Upload {
118+
key: key.to_string(),
119+
reason: format!("Write failed at offset {}: {}", offset, e),
120+
})?;
121+
122+
offset += bytes_written as u64;
123+
}
124+
125+
request.complete().await.map_err(|e| ObjectGenerationError::Upload {
126+
key: key.to_string(),
127+
reason: format!("Failed to complete upload: {}", e),
128+
})?;
129+
130+
Ok(())
131+
}

0 commit comments

Comments
 (0)