
Commit 2dd4464

init: first commit
0 parents  commit 2dd4464

23 files changed, +257 -0 lines changed

.pre-commit-config.yaml

+16
@@ -0,0 +1,16 @@
default_stages: [commit]
repos:
  - repo: https://github.com/yingzi113/pre-commit-hooks
    rev: 5863e162f1bed1f63eeb716e77d622ff8e3d9af9
    hooks:
      - id: check-case-conflict
  - repo: https://github.com/pre-commit/mirrors-autopep8
    rev: v1.4.4
    hooks:
      - id: autopep8
        args: [-i, --global-config=.flake8, -v, --max-line-length=120]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.4.0
    hooks:
      - id: flake8
        args: [--max-line-length=120]

README.md

+15
@@ -0,0 +1,15 @@
# fs_datasets

Dataset management

DEMO:
[Small dataset management](http://wiki.team.idea.edu.cn/pages/viewpage.action?pageId=12945137)
[Large dataset management](http://wiki.team.idea.edu.cn/pages/viewpage.action?pageId=12945147)

## usage

```
import fs_datasets
fs_datasets.list_datasets()  # still buggy, not fully implemented yet
fs_datasets.load_dataset('afqmc')
```

__init__.py

+2
@@ -0,0 +1,2 @@
from .load import load_dataset, list_datasets
__all__ = ['load_dataset', 'list_datasets']

afqmc/__init__.py

+2
@@ -0,0 +1,2 @@
from .afqmc_load import load_dataset
__all__ = ['load_dataset']

afqmc/afqmc_load.py

+22
@@ -0,0 +1,22 @@
import datasets


# supports either a remote or a local path
_TRAIN_DATA_PATH = '/cognitive_comp/common_data/afqmc/train.json'
_VAL_DATA_PATH = '/cognitive_comp/common_data/afqmc/dev.json'
_TEST_DATA_PATH = '/cognitive_comp/common_data/afqmc/test.json'


def load_dataset(**kargs):
    afqmc_features = datasets.Features({
        'sentence1': datasets.Value('string'),
        'sentence2': datasets.Value('string'),
        'label': datasets.Value('int32')})

    afqmc_raw_ds = datasets.load_dataset('json',
                                         data_files={
                                             'train': _TRAIN_DATA_PATH,
                                             'validation': _VAL_DATA_PATH, },
                                         # 'test': _TEST_DATA_PATH},
                                         features=afqmc_features, **kargs)
    return afqmc_raw_ds
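A minimal usage sketch for this loader, assuming the package is importable as `fs_datasets` and the AFQMC JSON files exist at the paths above:

```python
import fs_datasets

# AFQMC comes back as a datasets.DatasetDict with 'train' and 'validation' splits
afqmc = fs_datasets.load_dataset('afqmc')
print(afqmc)
print(afqmc['train'][0])  # {'sentence1': ..., 'sentence2': ..., 'label': ...}
```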

lcsts/__init__.py

+2
@@ -0,0 +1,2 @@
from .lcsts_load import load_dataset
__all__ = ['load_dataset']

lcsts/lcsts_load.py

+23
@@ -0,0 +1,23 @@
import datasets


_TRAIN_DATA_PATH = '/cognitive_comp/common_data/LCSTS/train.jsonl'
_VAL_DATA_PATH = '/cognitive_comp/common_data/LCSTS/valid.jsonl'
_TEST_DATA_PATH = '/cognitive_comp/common_data/LCSTS/test_public.jsonl'


def load_dataset(**kargs):
    # train and val/test have different column names, so they are loaded separately
    train = datasets.load_dataset('json',
                                  data_files=_TRAIN_DATA_PATH,
                                  split=datasets.Split.TRAIN, **kargs)
    val = datasets.load_dataset('json',
                                data_files={
                                    datasets.Split.VALIDATION: _VAL_DATA_PATH},
                                split=datasets.Split.VALIDATION, **kargs)
    test = datasets.load_dataset('json',
                                 data_files={
                                     datasets.Split.TEST: _TEST_DATA_PATH},
                                 split=datasets.Split.TEST, **kargs)

    return datasets.DatasetDict(train=train, val=val, test=test)
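Because each split is loaded separately, the returned DatasetDict is keyed 'train' / 'val' / 'test' (note: 'val', not 'validation'). A minimal sketch, assuming the LCSTS files above are readable:

```python
import fs_datasets

lcsts = fs_datasets.load_dataset('lcsts')
for split in ('train', 'val', 'test'):
    # column_names may differ between 'train' and the other two splits
    print(split, lcsts[split].num_rows, lcsts[split].column_names)
```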

load.py

+36
@@ -0,0 +1,36 @@
import datasets
import os
import glob
from typing import Union
from importlib import import_module


def list_datasets():
    split_path = os.path.split(os.path.abspath(__file__))
    files = glob.glob(os.path.join(split_path[0], '*'))
    ds_list = []
    ignore_dir = ['__pycache__']
    for file in files:
        if os.path.isdir(file):
            h = os.path.split(file)
            if h[1] in ignore_dir:
                continue
            ds_list.append(h[1])
    return ds_list


def load_dataset(name, **kargs) -> Union[
        datasets.DatasetDict,
        datasets.Dataset,
        datasets.IterableDatasetDict,
        datasets.IterableDataset]:
    '''
    Load a dataset by its name.
    '''
    load_func = getattr(import_module('.' + name,
                                      package='fs_datasets'), 'load_dataset')
    return load_func(**kargs)


if __name__ == '__main__':
    print(load_dataset('afqmc'))
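`load_dataset` dispatches by package name: every sibling directory of `load.py` (except `__pycache__`) is treated as a dataset package whose own `load_dataset` is resolved via `import_module`, while `list_datasets` (flagged as still buggy in the README) simply scans those directories. A minimal sketch, assuming the package is importable as `fs_datasets`:

```python
import fs_datasets

# Lists the dataset packages found next to load.py, e.g. ['afqmc', 'lcsts', ...]
print(fs_datasets.list_datasets())

# Resolves fs_datasets.afqmc.load_dataset via import_module and calls it
ds = fs_datasets.load_dataset('afqmc')
print(type(ds))  # datasets.DatasetDict
```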

wudao_180g/__init__.py

+2
@@ -0,0 +1,2 @@
from .wudao_180g_load import load_dataset
__all__ = ['load_dataset']

wudao_180g/wudao_180g_load.py

+51
@@ -0,0 +1,51 @@
import datasets
import glob
import os
from concurrent.futures import ProcessPoolExecutor

_SPLIT_DATA_PATH = '/cognitive_comp/gaoxinyu/data/WuDaoCorpus180G_split_100k/*'
# cache files
_CACHE_TRAIN_DATA_PATH = '/cognitive_comp/common_data/wudao_180g/hf_cache_split_100k/'


feats = datasets.Features({"text": datasets.Value('string')})


def load_dataset(num_proc=1, **kargs):
    cache_dict_paths = glob.glob(os.path.join(_CACHE_TRAIN_DATA_PATH, '*'))
    ds = []
    res = []
    p = ProcessPoolExecutor(max_workers=num_proc)
    for path in cache_dict_paths:
        res.append(p.submit(datasets.load_from_disk,
                            path, **kargs))

    p.shutdown(wait=True)
    for future in res:
        ds.append(future.result())
    return datasets.DatasetDict({"train": datasets.concatenate_datasets(ds)})


def _generate_cache_arrow(index, path):
    print('saving dataset shard {}'.format(index))
    ds = (datasets.load_dataset('json', data_files=path,
                                cache_dir='/cognitive_comp/gaoxinyu/data/huggingface-cache',
                                features=feats)['train'])
    ds.save_to_disk(os.path.join(_CACHE_TRAIN_DATA_PATH, os.path.basename(path)))
    return 'saving dataset shard {} done'.format(index)


def generate_cache_arrow(num_proc=1) -> None:
    '''
    Generate HF-compatible cache files to speed up subsequent loading.
    '''
    data_dict_paths = glob.glob(_SPLIT_DATA_PATH)
    p = ProcessPoolExecutor(max_workers=num_proc)
    res = []

    for index, path in enumerate(data_dict_paths):
        res.append(p.submit(_generate_cache_arrow, index, path))

    p.shutdown(wait=True)
    for future in res:
        print(future.result(), flush=True)
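The intended workflow appears to be a one-time cache-generation pass followed by fast multi-process loading of the Arrow shards. A sketch, assuming the raw split_100k shards and the cache directory are reachable from the current machine (num_proc=16 is only illustrative):

```python
from fs_datasets.wudao_180g import wudao_180g_load

# One-time: convert each raw JSON shard into an Arrow cache directory
wudao_180g_load.generate_cache_arrow(num_proc=16)

# Afterwards: load the cached shards in parallel and concatenate them into one 'train' split
ds = wudao_180g_load.load_dataset(num_proc=16)
print(ds['train'].num_rows)
```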

wudao_180g_mt5_tokenized/__init__.py

+2
@@ -0,0 +1,2 @@
from .wudao_180g_mt5_tokenized_load import load_dataset
__all__ = ['load_dataset']
wudao_180g_mt5_tokenized/wudao_180g_mt5_tokenized_load.py

+51
@@ -0,0 +1,51 @@
import datasets
import glob
import os
from concurrent.futures import ProcessPoolExecutor

_SPLIT_DATA_PATH = ''
# cache files
_CACHE_TRAIN_DATA_PATH = '/cognitive_comp/common_data/wudao_180g_mt5_tokenized/'


feats = datasets.Features({"input_ids": datasets.Value('int32')})


def load_dataset(num_proc=1, **kargs):
    cache_dict_paths = glob.glob(os.path.join(_CACHE_TRAIN_DATA_PATH, '*'))
    ds = []
    res = []
    p = ProcessPoolExecutor(max_workers=num_proc)
    for path in cache_dict_paths:
        res.append(p.submit(datasets.load_from_disk,
                            path, **kargs))

    p.shutdown(wait=True)
    for future in res:
        ds.append(future.result())
    return datasets.DatasetDict({"train": datasets.concatenate_datasets(ds)})


def _generate_cache_arrow(index, path):
    print('saving dataset shard {}'.format(index))
    ds = (datasets.load_dataset('json', data_files=path,
                                cache_dir='',
                                features=feats)['train'])
    ds.save_to_disk(os.path.join(_CACHE_TRAIN_DATA_PATH, os.path.basename(path)))
    return 'saving dataset shard {} done'.format(index)


def generate_cache_arrow(num_proc=1) -> None:
    '''
    Generate HF-compatible cache files to speed up subsequent loading.
    '''
    data_dict_paths = glob.glob(_SPLIT_DATA_PATH)
    p = ProcessPoolExecutor(max_workers=num_proc)
    res = []

    for index, path in enumerate(data_dict_paths):
        res.append(p.submit(_generate_cache_arrow, index, path))

    p.shutdown(wait=True)
    for future in res:
        print(future.result(), flush=True)
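This module mirrors the wudao_180g loader but is meant to read shards that were already tokenized for MT5; the raw split path and cache_dir are left empty in this commit. A consumption sketch, assuming the Arrow caches under _CACHE_TRAIN_DATA_PATH have been generated (num_proc=8 is illustrative):

```python
from fs_datasets.wudao_180g_mt5_tokenized import load_dataset

# Assumes cached Arrow shards already exist under _CACHE_TRAIN_DATA_PATH
ds = load_dataset(num_proc=8)
print(ds['train'].features)          # expected to expose an 'input_ids' column
print(ds['train'][0]['input_ids'])
```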

wudao_280g/__init__.py

+2
@@ -0,0 +1,2 @@
from .wudao_280g_load import load_dataset
__all__ = ['load_dataset']

wudao_280g/wudao_280g_load.py

+31
@@ -0,0 +1,31 @@
import datasets
import glob
import os


# all the raw json files under this directory
_SPLIT_DATA_PATH = '/cognitive_comp/gaoxinyu/data/WuDaoCorpus280G_split_100k/*'
# cache files
_CACHE_TRAIN_DATA_PATH = '/cognitive_comp/common_data/wudao_280g/hf_cache_split_100k/'


def load_dataset(**kargs):
    cache_dict_paths = glob.glob(os.path.join(_CACHE_TRAIN_DATA_PATH, '*'))
    ds = []
    for path in cache_dict_paths:
        print('loading ', path, flush=True)
        ds.append(datasets.load_from_disk(path, **kargs))
    return datasets.DatasetDict({"train": datasets.concatenate_datasets(ds)})


def generate_cache_arrow() -> None:
    '''
    Generate HF-compatible cache files to speed up subsequent loading.
    '''
    f = datasets.Features({"content": datasets.Value('string')})
    data_dict_paths = glob.glob(_SPLIT_DATA_PATH)
    for path in data_dict_paths:
        ds = (datasets.load_dataset('json', data_files=path,
                                    cache_dir='/cognitive_comp/gaoxinyu/data/huggingface-cache',
                                    features=f)['train'])
        ds.save_to_disk(os.path.join(_CACHE_TRAIN_DATA_PATH, os.path.basename(path)))
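`datasets.concatenate_datasets` expects a single list of datasets rather than unpacked positional arguments. A small self-contained sketch of the concatenation step, using tiny in-memory shards with the same 'content' column as the cached data:

```python
import datasets

# Two tiny in-memory shards standing in for the cached Arrow shards
a = datasets.Dataset.from_dict({'content': ['doc 1', 'doc 2']})
b = datasets.Dataset.from_dict({'content': ['doc 3']})

# concatenate_datasets takes a list of datasets
merged = datasets.concatenate_datasets([a, b])
print(merged.num_rows)  # 3
```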
