Skip to content

Commit d0a3163

Browse files
Angellow and Xiandong Meng authored
[INFRA-5590] Add functions to Download file from Cloud Providers (#2)
* Download file from AWS S3 * [INFRA-5590] Bazel functions and rules to download file from AWS S3 * Update README.md * change the file name and do some cleanups --------- Co-authored-by: Xiandong Meng <[email protected]>
1 parent 0e1fda8 commit d0a3163

File tree

5 files changed

+209
-0
lines changed

5 files changed

+209
-0
lines changed

.github/pull_request_template.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
- [ ] Include relevant Jira ticket ID(s) in the PR title
2+
3+
Why?
4+
----
5+
- Why is this PR necessary? What problem is this trying to solve?
6+
7+
How?
8+
----
9+
- How are we solving the above challenge?
10+
- Explain the design
11+
- Add any relevant reference links
12+
13+
Testing Evidence
14+
----------------
15+
- Examples:
16+
- Screenshots/Videos
17+
- API request/responses
18+
- DAG inspect results
19+
- Query plan
20+
- CLI command input/output
21+
- etc
22+
23+
Deployment Steps
24+
----------------
25+
- Examples:
26+
- Migration plan (e.g. databases, services)
27+
- Config changes (e.g. flag creation/setup)
28+
- Any PR dependencies (link/specify which PRs need to be merged and deployed before or after)
29+
30+
Deployment Verification
31+
-----------------------
32+
- How do you test the PR is working correctly?
33+
- What do SLA/on-call teams need to know to monitor for any issues as a result of this PR?

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,24 @@
11
# Rules Cloud Files
22

33
This repo contains Bazel rules related to fetching files from a cloud storage provider, namely, AWS S3, etc.
4+
5+
## Usage
6+
- Example
7+
```starlark
8+
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
9+
git_repository(
10+
name = "rules_cloud_files",
11+
remote = "https://github.com/6si/rules_cloud_files.git",
12+
commit = "d8097550e5c507f29c760a670daa3230c52dda59",
13+
)
14+
15+
load("@rules_cloud_files//cloud_file:cloud_file_rules.bzl", "s3_file")
16+
17+
s3_file(
18+
name = "my_file",
19+
bucket = "bootstrap-software",
20+
file_path = "hadoop2-configs-20180306012540.tgz",
21+
sha256 = "74a0bdd648f009ebce72494f54903230a9dcebaca1d438a13c1c691ad2f1e110",
22+
)
23+
```
24+
This example downloads the file s3://bootstrap-software/hadoop2-configs-20180306012540.tgz.

WORKSPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
# Names this Bazel workspace; consumers reference it as @rules_cloud_files.
workspace(name = "rules_cloud_files")

cloud_file/BUILD

Whitespace-only changes.

cloud_file/cloud_file_rules.bzl

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
"""
2+
Module for downloading and validating the checksum of the downloaded file.
3+
4+
This module contains the `validate_checksum` function and `cloud_file_download` function
5+
"""
6+
def validate_checksum(repo_ctx, url, local_path, expected_sha256):
    """
    Verify the sha256 checksum of the downloaded file.

    Runs the system `sha256sum` utility on the file located at `local_path`
    and compares the computed digest with `expected_sha256`. Fails the
    repository rule on a missing tool, a tool error, or a digest mismatch.

    Args:
        repo_ctx (object): The repository context object.
        url (str): The URL of the file (used only in error messages).
        local_path (str): The local path of the file on the system.
        expected_sha256 (str): The expected sha256 hex digest of the file.

    Raises:
        Exception: If `sha256sum` is unavailable, fails to run, or the
        checksum of the file does not match the expected_sha256 value.
    """
    sha256_path = repo_ctx.which("sha256sum")

    # `which` returns None when the tool is absent (e.g. stock macOS ships
    # `shasum`, not `sha256sum`); fail with a clear message instead of
    # letting execute() choke on a None argv[0].
    if sha256_path == None:
        fail("Could not find sha256sum on PATH; it is required to verify {}".format(url))

    repo_ctx.report_progress("Checksumming {}.".format(local_path))
    sha256_result = repo_ctx.execute([sha256_path, local_path])
    if sha256_result.return_code != 0:
        fail("Failed to verify checksum: {}".format(sha256_result.stderr))

    # sha256sum prints "<digest>  <filename>"; the digest is the first token.
    sha256 = sha256_result.stdout.split(" ")[0]
    if sha256 != expected_sha256:
        fail("Checksum mismatch for {}, expected {}, got {}.".format(
            url,
            expected_sha256,
            sha256,
        ))
35+
36+
# BUILD file template written into the generated repository: it exposes the
# single downloaded file as the public filegroup target `:file`. The `{}`
# placeholder is filled with the downloaded file's basename via .format().
_CLOUD_FILE_DOWNLOAD = """
package(default_visibility = ["//visibility:public"])

filegroup(
    name = "file",
    srcs = ["{}"],
)
"""
44+
45+
def cloud_file_download(
        repo_ctx,
        file_path,
        expected_sha256,
        provider,
        bucket = "",
        build_file = "",
        profile = ""):
    """
    Securely download a file from a cloud storage provider.

    Downloads `file_path` from `bucket` using the provider's command line
    tool (`aws` for S3, `gsutil` for GCP), verifies the file's sha256
    checksum, and writes a BUILD file exposing the result as `:file`.

    Args:
        repo_ctx (object): Bazel repository context.
        file_path (str): Path to the file to download from the cloud provider.
        expected_sha256 (str): Expected sha256 hash of the downloaded file.
        provider (str): Cloud provider identifier; "s3" and "gcp" are supported.
        bucket (str): Name of the bucket containing the file.
        build_file (str): Optional BUILD file to use instead of the default.
        profile (str): CLI profile to use for authentication. Only honored
            for "s3"; the gsutil invocation does not take a profile.

    Raises:
        Exception: If the command line utility is not found, if downloading
        the file fails, or if the sha256 hash of the downloaded file does
        not match the expected value.
    """
    if provider == "s3":
        tool_path = repo_ctx.which("aws")
        if tool_path == None:
            fail("Could not find command line utility for S3")
        extra_flags = ["--profile", profile] if profile else []
        src_url = "s3://{}/{}".format(bucket, file_path)
        cmd = [tool_path] + extra_flags + ["s3", "cp", src_url, "."]
    elif provider == "gcp":
        tool_path = repo_ctx.which("gsutil")
        if tool_path == None:
            fail("Could not find command line utility for GCP")
        src_url = "gs://{}/{}".format(bucket, file_path)
        cmd = [tool_path, "cp", src_url, "."]
    else:
        fail("Provider not supported: " + provider.capitalize())

    # Download.
    repo_ctx.report_progress("Downloading {}.".format(src_url))
    result = repo_ctx.execute(cmd, timeout = 1800)
    if result.return_code != 0:
        fail("Failed to download {} from {}: {}".format(src_url, provider.capitalize(), result.stderr))

    # Verify. The CLI copies the object into the repository root under its
    # basename, which is the local path we checksum. (The original computed
    # this basename twice from different strings; once is enough.)
    filename = repo_ctx.path(src_url).basename
    validate_checksum(repo_ctx, file_path, filename, expected_sha256)

    # Default build file exposing the downloaded file as //:file.
    repo_ctx.file("BUILD.bazel", _CLOUD_FILE_DOWNLOAD.format(filename), executable = False)

    # If the user supplied their own build file, replace the default with a
    # symlink to it.
    bash_path = repo_ctx.os.environ.get("BAZEL_SH", "bash")
    if build_file:
        repo_ctx.execute([bash_path, "-c", "rm -f BUILD BUILD.bazel"])
        repo_ctx.symlink(build_file, "BUILD.bazel")
108+
109+
def _cloud_file_impl(ctx):
    """Shared repository-rule implementation behind s3_file and gcp_file."""
    rule_attrs = ctx.attr
    cloud_file_download(
        ctx,
        rule_attrs.file_path,
        rule_attrs.sha256,
        provider = rule_attrs._provider,
        build_file = rule_attrs.build_file,
        profile = getattr(rule_attrs, "profile", ""),
        bucket = getattr(rule_attrs, "bucket", ""),
    )
119+
120+
# Repository rule that downloads a single file from an AWS S3 bucket and
# exposes it as the target @<name>//:file. The download goes through the
# `aws` CLI, optionally authenticated via the `profile` attribute.
s3_file = repository_rule(
    implementation = _cloud_file_impl,
    attrs = {
        "bucket": attr.string(mandatory = True, doc = "Bucket name"),
        "file_path": attr.string(
            mandatory = True,
            doc = "Relative path to the archive file within the bucket",
        ),
        "profile": attr.string(doc = "Profile to use for authentication."),
        "sha256": attr.string(mandatory = True, doc = "SHA256 checksum of the archive"),
        "build_file": attr.label(
            allow_single_file = True,
            doc = "BUILD file for the downloaded file",
        ),
        # Private attribute routing _cloud_file_impl to the S3 download path.
        "_provider": attr.string(default = "s3"),
    },
)
137+
138+
# Repository rule that downloads a single file from a GCP bucket (via
# `gsutil`) and exposes it as the target @<name>//:file.
# NOTE(review): the `profile` attribute is declared here but the gcp branch
# of cloud_file_download never passes it to gsutil — it is currently a no-op
# for this rule.
gcp_file = repository_rule(
    implementation = _cloud_file_impl,
    attrs = {
        "bucket": attr.string(mandatory = True, doc = "Bucket name"),
        "file_path": attr.string(
            mandatory = True,
            doc = "Relative path to the archive file within the bucket",
        ),
        "profile": attr.string(doc = "Profile to use for authentication."),
        "sha256": attr.string(mandatory = True, doc = "SHA256 checksum of the archive"),
        "build_file": attr.label(
            allow_single_file = True,
            doc = "BUILD file for the downloaded file",
        ),
        # Private attribute routing _cloud_file_impl to the GCP download path.
        "_provider": attr.string(default = "gcp"),
    },
)

0 commit comments

Comments
 (0)