
Commit 4b3a174

Merge pull request #11 from BaizeAI/codex/fix-sync.py-for-qiniu-upload
feat: stream Hugging Face repos to Qiniu
2 parents 58f82ce + 6895754 · commit 4b3a174

File tree

README.md
requirements.txt
sync.py
sync_hf.sh

4 files changed: +120 -11 lines changed

README.md

Lines changed: 20 additions & 1 deletion

````diff
@@ -1 +1,20 @@
-# baize-ai-download
+# baize-ai-download
+
+Utilities for syncing Hugging Face repositories or local directories to Qiniu cloud storage.
+
+## Usage
+
+### Upload a local directory
+
+```bash
+python sync.py /path/to/local/dir qiniu/remote/path
+```
+
+### Upload directly from Hugging Face
+
+```bash
+python sync.py --hf repo-id qiniu/remote/path
+```
+
+Use optional arguments `--ak`, `--sk` and `--bucket` or environment variables `QUNIU_ACCESS_TOKEN`, `QUNIU_SECRET_KEY` and `QUNIU_BUCKET_NAME` to configure Qiniu credentials. Provide a Hugging Face access token with `--hf-token` or the `HF_TOKEN` environment variable when required.
+
````
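The README covers the CLI entry points; the same helpers can also be driven from Python. A minimal sketch, assuming `sync.py` is importable from the working directory (the repo id and remote prefix are placeholders, and credentials come from the same environment variables the CLI reads):

```python
# Hypothetical programmatic use of the helpers added in this commit.
# The repo id and remote prefix are placeholders; credentials are read
# from the same QUNIU_* environment variables the CLI uses.
import os

from sync import upload_directory, upload_hf_repo

ak = os.environ["QUNIU_ACCESS_TOKEN"]
sk = os.environ["QUNIU_SECRET_KEY"]
bucket = os.environ.get("QUNIU_BUCKET_NAME", "baize-ai")

# Equivalent of: python sync.py /path/to/local/dir qiniu/remote/path
upload_directory(ak, sk, bucket, "/path/to/local/dir", "qiniu/remote/path")

# Equivalent of: python sync.py --hf repo-id qiniu/remote/path
# Streams each repo file from the Hugging Face Hub straight to Qiniu.
upload_hf_repo(
    ak,
    sk,
    bucket,
    "some-org/some-model",          # placeholder Hugging Face repo id
    "qiniu/remote/path",
    repo_type="model",
    revision="main",
    token=os.environ.get("HF_TOKEN"),
)
```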

requirements.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -1,3 +1,4 @@
 qiniu
 jinja2
 huggingface_hub[cli]
+fsspec[http]
```

sync.py

Lines changed: 98 additions & 3 deletions

```diff
@@ -1,7 +1,15 @@
 import argparse
 import os
-from qiniu import Auth, put_file, BucketManager
+import io
+from collections import defaultdict
+from typing import Optional
+
+import requests
+from qiniu import Auth, put_file, put_stream_v2, put_data
 from jinja2 import Environment, BaseLoader
+from huggingface_hub import HfApi, hf_hub_url
+from huggingface_hub.hf_api import RepoFolder
+import fsspec
 
 
 # Define the template directly in the script
@@ -61,15 +69,102 @@ def upload_file(client, bucket_name, local_file, remote_file):
     print(f"Uploaded {local_file} to {remote_file}")
 
 
+def upload_data(client, bucket_name, data, remote_file):
+    token = client.upload_token(bucket_name, remote_file, 3600)
+    put_data(token, remote_file, data)
+    print(f"Uploaded bytes data to {remote_file}")
+
+
+def upload_stream(client, bucket_name, stream, remote_file, file_name, file_size):
+    token = client.upload_token(bucket_name, remote_file, 3600)
+    print(f"Uploading stream to {remote_file}")
+    print(put_stream_v2(token, remote_file, stream, file_name, file_size))
+
+
+def upload_hf_repo(
+    ak,
+    sk,
+    bucket_name,
+    repo_id,
+    target,
+    repo_type="model",
+    revision="main",
+    token: Optional[str] = None,
+):
+    q = Auth(ak, sk)
+    env = Environment(loader=BaseLoader())
+    template = env.from_string(template_string)
+
+    api = HfApi(token=token)
+    tree = api.list_repo_tree(
+        repo_id=repo_id,
+        repo_type=repo_type,
+        revision=revision,
+        recursive=True,
+    )
+
+    structure: dict[str, dict[str, set[str]]] = defaultdict(
+        lambda: {"dirs": set(), "files": set()}
+    )
+
+    headers = {"Authorization": f"Bearer {token}"} if token else {}
+
+    for item in tree:
+        if isinstance(item, RepoFolder):
+            continue
+
+        dir_path = os.path.dirname(item.path)
+        file_name = os.path.basename(item.path)
+        parts = dir_path.split("/") if dir_path else []
+        for i, part in enumerate(parts):
+            parent_dir = "/".join(parts[:i])
+            child_dir = parts[i]
+            structure[parent_dir]["dirs"].add(child_dir)
+        structure[dir_path]["files"].add(file_name)
+
+        url = hf_hub_url(repo_id, item.path, repo_type=repo_type, revision=revision)
+        remote_path = os.path.join(target, item.path)
+        # with requests.get(url, stream=True, headers=headers) as r:
+        #     r.raise_for_status()
+        with fsspec.open(url, block_size=1_000_000) as f:
+            upload_stream(q, bucket_name, f, remote_path, file_name, item.size)
+
+    for dir_path, content in structure.items():
+        index_content = template.render(
+            directory=os.path.basename(dir_path) or "/",
+            directories=sorted(content["dirs"]),
+            files=sorted(content["files"]),
+        )
+        remote_index = os.path.join(target, dir_path, "index.html")
+        upload_data(q, bucket_name, index_content.encode("utf-8"), remote_index)
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--ak', type=str, help='qiniu access key')
     parser.add_argument('--sk', type=str, help='qiniu secret key')
     parser.add_argument('--bucket', type=str, required=False, help='qiniu bucket name')
-    parser.add_argument('src', type=str, help='Local dir to be upload')
+    parser.add_argument('--hf', action='store_true', help='treat src as HuggingFace repo id')
+    parser.add_argument('--repo-type', type=str, default='model', help='HuggingFace repo type')
+    parser.add_argument('--revision', type=str, default='main', help='HuggingFace repo revision')
+    parser.add_argument('--hf-token', type=str, help='HuggingFace access token')
+    parser.add_argument('src', type=str, help='Local dir or HF repo id to be upload')
     parser.add_argument('dst', type=str, help='Qiniu dir prefix')
     args = parser.parse_args()
     ak = args.ak or os.getenv('QUNIU_ACCESS_TOKEN', '')
     sk = args.sk or os.getenv('QUNIU_SECRET_KEY', '')
     bucket = args.bucket or os.getenv('QUNIU_BUCKET_NAME', '') or 'baize-ai'
-    upload_directory(ak, sk, bucket, args.src, args.dst)
+    if args.hf:
+        hf_token = args.hf_token or os.getenv('HF_TOKEN', None)
+        upload_hf_repo(
+            ak,
+            sk,
+            bucket,
+            args.src,
+            args.dst,
+            repo_type=args.repo_type,
+            revision=args.revision,
+            token=hf_token,
+        )
+    else:
+        upload_directory(ak, sk, bucket, args.src, args.dst)
```
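The `index.html` pages that `upload_hf_repo` uploads are driven by the `structure` mapping it builds while walking the repo tree. A minimal standalone sketch of that grouping step, using hypothetical file paths, shows what each directory listing ends up containing:

```python
# Standalone illustration of how upload_hf_repo groups repo files by
# directory before rendering index.html pages. The paths are hypothetical;
# the grouping logic mirrors the loop in this commit.
import os
from collections import defaultdict

paths = [
    "config.json",
    "tokenizer/tokenizer.json",
    "weights/model-00001.safetensors",
    "weights/model-00002.safetensors",
]

structure = defaultdict(lambda: {"dirs": set(), "files": set()})

for path in paths:
    dir_path = os.path.dirname(path)       # "" for files at the repo root
    parts = dir_path.split("/") if dir_path else []
    # Register every ancestor directory under its parent so each level
    # of the tree gets its own listing.
    for i, part in enumerate(parts):
        parent_dir = "/".join(parts[:i])    # "" means the root listing
        structure[parent_dir]["dirs"].add(part)
    structure[dir_path]["files"].add(os.path.basename(path))

for dir_path, content in sorted(structure.items()):
    print(dir_path or "/", sorted(content["dirs"]), sorted(content["files"]))
# Prints one line per directory, e.g. the root ("/") lists the child
# directories ['tokenizer', 'weights'] and the file ['config.json'].
```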

sync_hf.sh

Lines changed: 1 addition & 7 deletions

```diff
@@ -7,10 +7,4 @@ CUR_DIR=$(
   pwd -P
 )
 
-mkdir -p outputs
-
-d=$(mktemp -d -p outputs)
-
-huggingface-cli download --local-dir ${d} --local-dir-use-symlinks=False --resume-download --token "${HF_TOKEN}" $1
-
-python ${CUR_DIR}/sync.py ${d} $2
+python ${CUR_DIR}/sync.py --hf "$1" "$2"
```
