
Commit d50efce

ADD: add wudao_180g_spbpe_tokenized & wudao_280g_test
1 parent 4dd23cb · commit d50efce

14 files changed: +275 −0 lines

.gitignore (+134 lines)

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Log
*.log
*.err
*.json

6 binary files changed (contents not shown).

wudao_180g_spbpe_tokenized/__init__.py (+2 lines)

from .load import load_dataset
__all__ = ['load_dataset']

wudao_180g_spbpe_tokenized/load.py (+125 lines)

import datasets
import re
import glob
from transformers import MT5Tokenizer
import os
from concurrent.futures import ProcessPoolExecutor


_SENTENCE_PIECE_TOKENIZERS = {
    'model': '/cognitive_comp/common_data/tokenizers/sentence_piece_bpe/bpe_v40000_s42_cov0.9995_max6_corpus1M.model',
}

# Cache paths
_CACHE_TRAIN_DATA_PATH = '/cognitive_comp/common_data/wudao_180g_spbpe_tokenized/train_split'
_CACHE_TEST_DATA_PATH = '/cognitive_comp/common_data/wudao_180g_spbpe_tokenized/test'


def load_dataset(num_proc=1, **kargs):
    '''
    Load the cached dataset shards and assemble them into a DatasetDict.
    '''
    cache_dict_paths = glob.glob(os.path.join(_CACHE_TRAIN_DATA_PATH, '*'))
    ds = []
    res = []
    p = ProcessPoolExecutor(max_workers=num_proc)
    for path in cache_dict_paths:
        res.append(p.submit(datasets.load_from_disk,
                            path, **kargs))

    p.shutdown(wait=True)
    for future in res:
        ds.append(future.result())
    train_ds = datasets.concatenate_datasets(ds)
    test_ds = datasets.load_from_disk(_CACHE_TEST_DATA_PATH)
    return datasets.DatasetDict({
        "train": train_ds,
        "test": test_ds})


class ChineseSentenceSplitter(object):
    def merge_symmetry(self, sentences, symmetry=('“', '”')):
        # Merge fragments that were split inside a pair of symmetric symbols,
        # e.g. Chinese double quotes.
        effective_ = []
        merged = True
        for index in range(len(sentences)):
            if symmetry[0] in sentences[index] and symmetry[1] not in sentences[index]:
                merged = False
                effective_.append(sentences[index])
            elif symmetry[1] in sentences[index] and not merged:
                merged = True
                effective_[-1] += sentences[index]
            elif symmetry[0] not in sentences[index] and symmetry[1] not in sentences[index] and not merged:
                effective_[-1] += sentences[index]
            else:
                effective_.append(sentences[index])
        return [i.strip() for i in effective_ if len(i.strip()) > 0]

    def to_sentences(self, paragraph):
        # Split a paragraph into sentences on end-of-sentence punctuation,
        # keeping the punctuation attached to the preceding sentence.
        sentences = re.split(r"(\?|。|[!]+|!|\…\…)", paragraph)
        sentences.append("")
        sentences = ["".join(i) for i in zip(sentences[0::2], sentences[1::2])]
        sentences = [i.strip() for i in sentences if len(i.strip()) > 0]
        for j in range(1, len(sentences)):
            if sentences[j][0] == '”':
                sentences[j-1] = sentences[j-1] + '”'
                sentences[j] = sentences[j][1:]
        return self.merge_symmetry(sentences)

    def tokenize(self, text):
        return self.to_sentences(text)


def _generate_cache_arrow(index, ds):
    print('saving dataset shard {}'.format(index))
    ds.save_to_disk(os.path.join(_CACHE_TRAIN_DATA_PATH, 'part_{}'.format(index)))
    return 'saving dataset shard {} done'.format(index)


def generate_arrow_cache(num_proc=1) -> None:
    '''
    Read the raw wudao_180g data, split it into sentences, tokenize the
    sentences, shuffle with seed 42, and cache the resulting datasets to disk.
    '''
    import sys
    sys.path.append('../../')
    from fs_datasets import load_dataset
    ds = load_dataset('wudao_180g', num_proc=num_proc)
    ds = ds['train'].train_test_split(train_size=0.995, test_size=0.005, seed=42)
    print(ds)
    sentence_splitter = ChineseSentenceSplitter()
    tokenizer = MT5Tokenizer.from_pretrained(_SENTENCE_PIECE_TOKENIZERS['model'])

    def _tokenizer(example):
        sentences = sentence_splitter.tokenize(example['text'])
        samples = [tokenizer.tokenize(s) for s in sentences]
        return {
            'tokenized_text': samples,
        }

    tokenized_ds = ds.map(
        _tokenizer,
        num_proc=num_proc,
        remove_columns=ds['train'].column_names)

    p = ProcessPoolExecutor(max_workers=num_proc)
    res = []
    train_shard_part = 500
    for i in range(0, train_shard_part):
        res.append(p.submit(_generate_cache_arrow, i,
                            tokenized_ds['train'].shard(train_shard_part, i)))

    p.shutdown(wait=True)
    for future in res:
        print(future.result(), flush=True)

    tokenized_ds['test'].save_to_disk(_CACHE_TEST_DATA_PATH)

    print('done')


if __name__ == '__main__':
    # generate_arrow_cache(num_proc=100)
    ds = load_dataset(num_proc=100)
    print(ds)
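
A minimal usage sketch of the splitter and the loader (assuming the repository root is on PYTHONPATH, that datasets/transformers are installed, and that the /cognitive_comp cache paths are reachable; the sample paragraph and num_proc value below are illustrative only):

from wudao_180g_spbpe_tokenized import load_dataset
from wudao_180g_spbpe_tokenized.load import ChineseSentenceSplitter

# Sentence splitting alone needs no cached data or tokenizer model.
splitter = ChineseSentenceSplitter()
print(splitter.tokenize('今天天气很好。我们去公园散步!“好啊!”他说。'))
# -> ['今天天气很好。', '我们去公园散步!', '“好啊!”', '他说。']

# Loading the cached, sentence-split and SP-BPE-tokenized corpus.
ds = load_dataset(num_proc=8)
print(ds)                                # DatasetDict with 'train' and 'test'
print(ds['train'][0]['tokenized_text'])  # one list of tokens per sentence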

3 binary files changed (contents not shown).

wudao_280g_test/__init__.py (+3 lines)

from .load import load_dataset

__all__ = ['load_dataset']

wudao_280g_test/load.py (+11 lines)

import datasets

ds = datasets.load_from_disk('/cognitive_comp/common_data/wudao_10k_for_test/hf_cache/')


_CACHE_TRAIN_DATA_PATH = '/cognitive_comp/common_data/wudao_10k_for_test/hf_cache/'


def load_dataset(**kargs):
    ds = datasets.load_from_disk(_CACHE_TRAIN_DATA_PATH)
    return ds
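
Likewise, a minimal sketch for the small test loader (assuming the wudao_10k_for_test cache directory above is reachable):

from wudao_280g_test import load_dataset

# Returns whatever datasets.load_from_disk finds under the cached hf_cache path.
ds = load_dataset()
print(ds)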
