 import signal
 import subprocess
 from datetime import datetime, timezone
+from pathlib import Path
 from typing import List, Dict, Any, Optional
 
 import hydra
+from hydra.core.hydra_config import HydraConfig
+from hydra.types import RunMode
 from omegaconf import DictConfig, OmegaConf
 import urllib.request
 
@@ -60,6 +63,46 @@ def write_metadata(metadata: Dict[str, Any]) -> None:
         log.error("Failed to write metadata", exc_info=True)
 
 
+def upload_results_to_s3(bucket_name: str, region: str) -> None:
+    """
+    Upload benchmark results to an S3 bucket using the AWS CLI.
+    Only uploads results from multirun directories.
+    """
+
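+    # HydraConfig is only populated inside a Hydra-managed run; calling
+    # HydraConfig.get() outside of one raises an error.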
+    hydra_config = HydraConfig.get()
+
+    if hydra_config.mode == RunMode.MULTIRUN:
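+        # Hydra writes each multirun job to <sweep_dir>/<job_num>, so the parent
+        # of runtime.output_dir is the sweep root, e.g. multirun/<date>/<time>.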
+        source_path = Path(hydra_config.runtime.output_dir).parent
+
+        assert len(source_path.parts) >= 2, "Source path must have at least 2 parts for date/time extraction"
+        date_part, time_part = source_path.parts[-2:]
+
+        s3_target_path = f"s3://{bucket_name}/results/{date_part}/{time_part}"
+
+        aws_cmd = [
+            "aws",
+            "s3",
+            "sync",
+            str(source_path),
+            s3_target_path,
+            "--region",
+            region,
+        ]
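+        # `aws s3 sync` only copies new or changed files, so re-running the
+        # upload for the same sweep directory is effectively idempotent.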
+        result = subprocess.run(aws_cmd, capture_output=True, text=True)
+        if result.returncode == 0:
+            log.info("Successfully uploaded benchmark results to S3")
+        else:
+            log.error(f"S3 upload failed: {result.stderr.strip()}")
+    else:
+        log.info("Skipping benchmark upload for non-multirun execution")
+
+
 class ResourceMonitoring:
     def __init__(self, target_pid, with_bwm: bool, with_perf_stat: bool):
         """Resource monitoring setup.
@@ -200,6 +237,17 @@ def run_experiment(cfg: DictConfig) -> None:
 
         # Mark success if we get here without exceptions
         metadata["success"] = True
+
+        result_bucket_name = common_config.get("s3_result_bucket")
+
+        # If region is not specified, default to 'us-east-1', as that is the only region where we can be relatively assured that Trainium instances are available
+        region = common_config.get("region", "us-east-1")
+        if result_bucket_name:
+            log.info(f"Uploading benchmark results to S3 bucket '{result_bucket_name}'")
+            upload_results_to_s3(result_bucket_name, region)
+        else:
+            log.info("No results bucket specified (s3_result_bucket), skipping upload")
+
     except Exception:
         log.error("Benchmark execution failed:", exc_info=True)
         raise
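For reference, a minimal sketch of the path handling in `upload_results_to_s3`, assuming Hydra's default multirun output layout; the sweep path and bucket name below are hypothetical:

```python
from pathlib import Path

# Per-job output dir as Hydra lays it out for a multirun (hypothetical example).
output_dir = Path("multirun/2024-05-01/13-45-07/0")

source_path = output_dir.parent               # sweep root: multirun/2024-05-01/13-45-07
date_part, time_part = source_path.parts[-2:]

print(f"s3://example-bucket/results/{date_part}/{time_part}")
# -> s3://example-bucket/results/2024-05-01/13-45-07
```

Shelling out to `aws s3 sync` rather than using an SDK such as boto3 presumably keeps the benchmark free of an extra Python dependency while still getting recursive, incremental uploads.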