
Commit 2dd4464

init: first commit
0 parents  commit 2dd4464

23 files changed, +257 -0 lines changed

.pre-commit-config.yaml

+16
@@ -0,0 +1,16 @@
default_stages: [commit]
repos:
  - repo: https://github.com/yingzi113/pre-commit-hooks
    rev: 5863e162f1bed1f63eeb716e77d622ff8e3d9af9
    hooks:
      - id: check-case-conflict
  - repo: https://github.com/pre-commit/mirrors-autopep8
    rev: v1.4.4
    hooks:
      - id: autopep8
        args: [-i, --global-config=.flake8, -v, --max-line-length=120]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.4.0
    hooks:
      - id: flake8
        args: [--max-line-length=120]

README.md

+15
@@ -0,0 +1,15 @@
# fs_datasets

Dataset management

DEMO:
[Small dataset management](http://wiki.team.idea.edu.cn/pages/viewpage.action?pageId=12945137)
[Large dataset management](http://wiki.team.idea.edu.cn/pages/viewpage.action?pageId=12945147)

## usage

```
import fs_datasets
fs_datasets.list_datasets()  # still buggy, not fully implemented yet
fs_datasets.load_dataset('afqmc')
```

__init__.py

+2
@@ -0,0 +1,2 @@
from .load import load_dataset, list_datasets
__all__ = ['load_dataset', 'list_datasets']

afqmc/__init__.py

+2
@@ -0,0 +1,2 @@
from .afqmc_load import load_dataset
__all__ = ['load_dataset']

afqmc/afqmc_load.py

+22
@@ -0,0 +1,22 @@
import datasets


# supports either a remote or a local path
_TRAIN_DATA_PATH = '/cognitive_comp/common_data/afqmc/train.json'
_VAL_DATA_PATH = '/cognitive_comp/common_data/afqmc/dev.json'
_TEST_DATA_PATH = '/cognitive_comp/common_data/afqmc/test.json'


def load_dataset(**kargs):
    afqmc_features = datasets.Features({
        'sentence1': datasets.Value('string'),
        'sentence2': datasets.Value('string'),
        'label': datasets.Value('int32')})

    afqmc_raw_ds = datasets.load_dataset('json',
                                         data_files={
                                             'train': _TRAIN_DATA_PATH,
                                             'validation': _VAL_DATA_PATH, },
                                         # 'test': _TEST_DATA_PATH},
                                         features=afqmc_features, **kargs)
    return afqmc_raw_ds
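A minimal usage sketch for this loader, assuming the package is importable as `fs_datasets` and the AFQMC JSON files exist at the paths above:

```python
import fs_datasets

# AFQMC comes back as a datasets.DatasetDict with 'train' and 'validation' splits
afqmc = fs_datasets.load_dataset('afqmc')
print(afqmc)
print(afqmc['train'][0])  # {'sentence1': ..., 'sentence2': ..., 'label': ...}
```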

lcsts/__init__.py

+2
@@ -0,0 +1,2 @@
from .lcsts_load import load_dataset
__all__ = ['load_dataset']

lcsts/lcsts_load.py

+23
@@ -0,0 +1,23 @@
import datasets


_TRAIN_DATA_PATH = '/cognitive_comp/common_data/LCSTS/train.jsonl'
_VAL_DATA_PATH = '/cognitive_comp/common_data/LCSTS/valid.jsonl'
_TEST_DATA_PATH = '/cognitive_comp/common_data/LCSTS/test_public.jsonl'


def load_dataset(**kargs):
    # train and val/test have different column names, so they are loaded separately
    train = datasets.load_dataset('json',
                                  data_files=_TRAIN_DATA_PATH,
                                  split=datasets.Split.TRAIN, **kargs)
    val = datasets.load_dataset('json',
                                data_files={
                                    datasets.Split.VALIDATION: _VAL_DATA_PATH},
                                split=datasets.Split.VALIDATION, **kargs)
    test = datasets.load_dataset('json',
                                 data_files={
                                     datasets.Split.TEST: _TEST_DATA_PATH},
                                 split=datasets.Split.TEST, **kargs)

    return datasets.DatasetDict(train=train, val=val, test=test)
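Because each split is loaded separately, the returned DatasetDict is keyed 'train' / 'val' / 'test' (note: 'val', not 'validation'). A minimal sketch, assuming the LCSTS files above are readable:

```python
import fs_datasets

lcsts = fs_datasets.load_dataset('lcsts')
for split in ('train', 'val', 'test'):
    # column_names may differ between 'train' and the other two splits
    print(split, lcsts[split].num_rows, lcsts[split].column_names)
```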

load.py

+36
@@ -0,0 +1,36 @@
import datasets
import os
import glob
from typing import Union
from importlib import import_module


def list_datasets():
    split_path = os.path.split(os.path.abspath(__file__))
    files = glob.glob(os.path.join(split_path[0], '*'))
    ds_list = []
    ignore_dir = ['__pycache__']
    for file in files:
        if os.path.isdir(file):
            h = os.path.split(file)
            if h[1] in ignore_dir:
                continue
            ds_list.append(h[1])
    return ds_list


def load_dataset(name, **kargs) -> Union[
        datasets.DatasetDict,
        datasets.Dataset,
        datasets.IterableDatasetDict,
        datasets.IterableDataset]:
    '''
    Load a dataset by its name.
    '''
    load_func = getattr(import_module('.' + name,
                                      package='fs_datasets'), 'load_dataset')
    return load_func(**kargs)


if __name__ == '__main__':
    print(load_dataset('afqmc'))
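`load_dataset` dispatches by package name: every sibling directory of `load.py` (except `__pycache__`) is treated as a dataset package whose own `load_dataset` is resolved via `import_module`, while `list_datasets` (flagged as still buggy in the README) simply scans those directories. A minimal sketch, assuming the package is importable as `fs_datasets`:

```python
import fs_datasets

# Lists the dataset packages found next to load.py, e.g. ['afqmc', 'lcsts', ...]
print(fs_datasets.list_datasets())

# Resolves fs_datasets.afqmc.load_dataset via import_module and calls it
ds = fs_datasets.load_dataset('afqmc')
print(type(ds))  # datasets.DatasetDict
```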

wudao_180g/__init__.py

+2
@@ -0,0 +1,2 @@
from .wudao_180g_load import load_dataset
__all__ = ['load_dataset']

wudao_180g/wudao_180g_load.py

+51
@@ -0,0 +1,51 @@
import datasets
import glob
import os
from concurrent.futures import ProcessPoolExecutor

_SPLIT_DATA_PATH = '/cognitive_comp/gaoxinyu/data/WuDaoCorpus180G_split_100k/*'
# cache files
_CACHE_TRAIN_DATA_PATH = '/cognitive_comp/common_data/wudao_180g/hf_cache_split_100k/'


feats = datasets.Features({"text": datasets.Value('string')})


def load_dataset(num_proc=1, **kargs):
    cache_dict_paths = glob.glob(os.path.join(_CACHE_TRAIN_DATA_PATH, '*'))
    ds = []
    res = []
    p = ProcessPoolExecutor(max_workers=num_proc)
    for path in cache_dict_paths:
        res.append(p.submit(datasets.load_from_disk,
                            path, **kargs))

    p.shutdown(wait=True)
    for future in res:
        ds.append(future.result())
    return datasets.DatasetDict({"train": datasets.concatenate_datasets(ds)})


def _generate_cache_arrow(index, path):
    print('saving dataset shard {}'.format(index))
    ds = (datasets.load_dataset('json', data_files=path,
                                cache_dir='/cognitive_comp/gaoxinyu/data/huggingface-cache',
                                features=feats)['train'])
    ds.save_to_disk(os.path.join(_CACHE_TRAIN_DATA_PATH, os.path.basename(path)))
    return 'saving dataset shard {} done'.format(index)


def generate_cache_arrow(num_proc=1) -> None:
    '''
    Generate HF-compatible cache files to speed up subsequent loading.
    '''
    data_dict_paths = glob.glob(_SPLIT_DATA_PATH)
    p = ProcessPoolExecutor(max_workers=num_proc)
    res = []

    for index, path in enumerate(data_dict_paths):
        res.append(p.submit(_generate_cache_arrow, index, path))

    p.shutdown(wait=True)
    for future in res:
        print(future.result(), flush=True)
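The intended workflow appears to be a one-time cache-generation pass followed by fast multi-process loading of the Arrow shards. A sketch, assuming the raw split_100k shards and the cache directory are reachable from the current machine (num_proc=16 is only illustrative):

```python
from fs_datasets.wudao_180g import wudao_180g_load

# One-time: convert each raw JSON shard into an Arrow cache directory
wudao_180g_load.generate_cache_arrow(num_proc=16)

# Afterwards: load the cached shards in parallel and concatenate them into one 'train' split
ds = wudao_180g_load.load_dataset(num_proc=16)
print(ds['train'].num_rows)
```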

wudao_180g_mt5_tokenized/__init__.py

+2
@@ -0,0 +1,2 @@
from .wudao_180g_mt5_tokenized_load import load_dataset
__all__ = ['load_dataset']
wudao_180g_mt5_tokenized/wudao_180g_mt5_tokenized_load.py

+51
@@ -0,0 +1,51 @@
import datasets
import glob
import os
from concurrent.futures import ProcessPoolExecutor

_SPLIT_DATA_PATH = ''
# cache files
_CACHE_TRAIN_DATA_PATH = '/cognitive_comp/common_data/wudao_180g_mt5_tokenized/'


feats = datasets.Features({"input_ids": datasets.Value('int32')})


def load_dataset(num_proc=1, **kargs):
    cache_dict_paths = glob.glob(os.path.join(_CACHE_TRAIN_DATA_PATH, '*'))
    ds = []
    res = []
    p = ProcessPoolExecutor(max_workers=num_proc)
    for path in cache_dict_paths:
        res.append(p.submit(datasets.load_from_disk,
                            path, **kargs))

    p.shutdown(wait=True)
    for future in res:
        ds.append(future.result())
    return datasets.DatasetDict({"train": datasets.concatenate_datasets(ds)})


def _generate_cache_arrow(index, path):
    print('saving dataset shard {}'.format(index))
    ds = (datasets.load_dataset('json', data_files=path,
                                cache_dir='',
                                features=feats)['train'])
    ds.save_to_disk(os.path.join(_CACHE_TRAIN_DATA_PATH, os.path.basename(path)))
    return 'saving dataset shard {} done'.format(index)


def generate_cache_arrow(num_proc=1) -> None:
    '''
    Generate HF-compatible cache files to speed up subsequent loading.
    '''
    data_dict_paths = glob.glob(_SPLIT_DATA_PATH)
    p = ProcessPoolExecutor(max_workers=num_proc)
    res = []

    for index, path in enumerate(data_dict_paths):
        res.append(p.submit(_generate_cache_arrow, index, path))

    p.shutdown(wait=True)
    for future in res:
        print(future.result(), flush=True)
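This module mirrors the wudao_180g loader but is meant to read shards that were already tokenized for MT5; the raw split path and cache_dir are left empty in this commit. A consumption sketch, assuming the Arrow caches under _CACHE_TRAIN_DATA_PATH have been generated (num_proc=8 is illustrative):

```python
from fs_datasets.wudao_180g_mt5_tokenized import load_dataset

# Assumes cached Arrow shards already exist under _CACHE_TRAIN_DATA_PATH
ds = load_dataset(num_proc=8)
print(ds['train'].features)          # expected to expose an 'input_ids' column
print(ds['train'][0]['input_ids'])
```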

wudao_280g/__init__.py

+2
@@ -0,0 +1,2 @@
from .wudao_280g_load import load_dataset
__all__ = ['load_dataset']

wudao_280g/wudao_280g_load.py

+31
@@ -0,0 +1,31 @@
import datasets
import glob
import os


# all the raw json files under this directory
_SPLIT_DATA_PATH = '/cognitive_comp/gaoxinyu/data/WuDaoCorpus280G_split_100k/*'
# cache files
_CACHE_TRAIN_DATA_PATH = '/cognitive_comp/common_data/wudao_280g/hf_cache_split_100k/'


def load_dataset(**kargs):
    cache_dict_paths = glob.glob(os.path.join(_CACHE_TRAIN_DATA_PATH, '*'))
    ds = []
    for path in cache_dict_paths:
        print('loading ', path, flush=True)
        ds.append(datasets.load_from_disk(path, **kargs))
    return datasets.DatasetDict({"train": datasets.concatenate_datasets(ds)})


def generate_cache_arrow() -> None:
    '''
    Generate HF-compatible cache files to speed up subsequent loading.
    '''
    f = datasets.Features({"content": datasets.Value('string')})
    data_dict_paths = glob.glob(_SPLIT_DATA_PATH)
    for path in data_dict_paths:
        ds = (datasets.load_dataset('json', data_files=path,
                                    cache_dir='/cognitive_comp/gaoxinyu/data/huggingface-cache',
                                    features=f)['train'])
        ds.save_to_disk(os.path.join(_CACHE_TRAIN_DATA_PATH, os.path.basename(path)))
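`datasets.concatenate_datasets` expects a single list of datasets rather than unpacked positional arguments. A small self-contained sketch of the concatenation step, using tiny in-memory shards with the same 'content' column as the cached data:

```python
import datasets

# Two tiny in-memory shards standing in for the cached Arrow shards
a = datasets.Dataset.from_dict({'content': ['doc 1', 'doc 2']})
b = datasets.Dataset.from_dict({'content': ['doc 3']})

# concatenate_datasets takes a list of datasets
merged = datasets.concatenate_datasets([a, b])
print(merged.num_rows)  # 3
```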
