-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_dataset.py
83 lines (71 loc) · 3.17 KB
/
build_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import csv
import pathlib
import random
import click
import librosa
import numpy as np
import soundfile
import tqdm
from textgrid import TextGrid
def _load_phone_alignment(tgfile):
    """Read a TextGrid file and return ``(ph_seq, ph_dur)`` from its 'phones' tier.

    Args:
        tgfile: Path to the TextGrid file.

    Returns:
        A tuple of two lists of equal length: the interval marks and the
        interval durations (``maxTime - minTime``) of the first tier whose
        name is ``'phones'``.

    Raises:
        ValueError: If the TextGrid contains no tier named ``'phones'``.
    """
    grid = TextGrid()
    grid.read(str(tgfile))
    # Take the first tier called 'phones'; fail loudly instead of letting a
    # None tier surface later as "'NoneType' object is not iterable".
    phones_tier = next((tier for tier in grid if tier.name == 'phones'), None)
    if phones_tier is None:
        raise ValueError(f"No 'phones' tier found in TextGrid file: {tgfile}")
    ph_seq = [ph.mark for ph in phones_tier]
    ph_dur = [ph.maxTime - ph.minTime for ph in phones_tier]
    return ph_seq, ph_dur


def _pad_with_silence(y, ph_seq, ph_dur, len_sil, samplerate, at_start):
    """Add ``len_sil`` zero samples to one end of ``y`` and account for it as 'SP'.

    ``ph_seq`` and ``ph_dur`` are modified in place: if the phone at the padded
    end is already ``'SP'`` its duration is extended, otherwise (including when
    the sequence is empty) a new ``'SP'`` entry is added at that end.

    Args:
        y: 1-D waveform array.
        ph_seq: List of phone marks (mutated).
        ph_dur: List of phone durations in seconds (mutated).
        len_sil: Number of silent samples to add.
        samplerate: Sample rate used to convert ``len_sil`` to seconds.
        at_start: Pad the beginning if true, the end otherwise.

    Returns:
        The padded waveform.
    """
    silence = np.zeros((len_sil,), dtype=np.float32)
    sil_dur = len_sil / samplerate
    edge = 0 if at_start else -1
    if at_start:
        y = np.concatenate((silence, y))
    else:
        y = np.concatenate((y, silence))
    # `ph_seq and ...` guards against an empty phones tier, where indexing
    # the edge element would raise IndexError.
    if ph_seq and ph_seq[edge] == 'SP':
        ph_dur[edge] += sil_dur
    elif at_start:
        ph_seq.insert(0, 'SP')
        ph_dur.insert(0, sil_dur)
    else:
        ph_seq.append('SP')
        ph_dur.append(sil_dur)
    return y


def build_dataset(wavs, tg, dataset, skip_silence_insertion, wav_subtype):
    """Collect wavs and their phoneme alignments into a dataset directory.

    Every ``*.wav`` file directly inside ``wavs`` is loaded as mono audio at
    44100 Hz and paired with the TextGrid of the same stem in ``tg``. Unless
    ``skip_silence_insertion`` is set, each end of the audio independently
    gets, with probability 0.5, a random amount of silence (at least 0.1 s,
    below 0.5 s) that is reflected in the phone sequence as ``'SP'``.

    The audio is written to ``<dataset>/raw/wavs/<name>.wav`` and one row per
    file (``name``, ``ph_seq``, ``ph_dur``; sequences space-separated,
    durations rounded to 6 decimals) is written to
    ``<dataset>/raw/transcriptions.csv``.

    Args:
        wavs: Path to the directory containing the wav segments.
        tg: Path to the directory containing the TextGrid files.
        dataset: Path to the output dataset directory (created if missing).
        skip_silence_insertion: If true, do not add silence around segments.
        wav_subtype: Subtype passed to ``soundfile.write`` (e.g. ``"PCM_16"``).

    Raises:
        ValueError: If a TextGrid file has no tier named ``'phones'``.
    """
    wavs = pathlib.Path(wavs)
    tg_dir = pathlib.Path(tg)
    dataset = pathlib.Path(dataset)
    filelist = list(wavs.glob('*.wav'))

    wav_out_dir = dataset / 'raw' / 'wavs'
    dataset.mkdir(parents=True, exist_ok=True)
    wav_out_dir.mkdir(exist_ok=True, parents=True)

    transcriptions = []
    samplerate = 44100
    min_sil = int(0.1 * samplerate)
    max_sil = int(0.5 * samplerate)
    for wavfile in tqdm.tqdm(filelist):
        y, _ = librosa.load(wavfile, sr=samplerate, mono=True)
        tgfile = tg_dir / wavfile.with_suffix('.TextGrid').name
        ph_seq, ph_dur = _load_phone_alignment(tgfile)

        if not skip_silence_insertion:
            # Head first, then tail; the order of random draws is kept so a
            # seeded run produces the same padding as before.
            for at_start in (True, False):
                if random.random() < 0.5:
                    len_sil = random.randrange(min_sil, max_sil)
                    y = _pad_with_silence(
                        y, ph_seq, ph_dur, len_sil, samplerate, at_start
                    )

        soundfile.write(wav_out_dir / wavfile.name, y, samplerate, subtype=wav_subtype)
        transcriptions.append({
            'name': wavfile.stem,
            'ph_seq': ' '.join(ph_seq),
            'ph_dur': ' '.join([str(round(d, 6)) for d in ph_dur]),
        })

    with open(dataset / 'raw' / 'transcriptions.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur'])
        writer.writeheader()
        writer.writerows(transcriptions)
    print(f'All wavs and transcriptions saved in {dataset}')
@click.command(help='Collect phoneme alignments into transcriptions.csv')
@click.option(
    '--wavs',
    required=True,
    help='Path to the segments directory',
)
@click.option(
    '--tg',
    required=True,
    help='Path to the final TextGrids directory',
)
@click.option(
    '--dataset',
    required=True,
    help='Path to dataset directory',
)
@click.option(
    '--skip_silence_insertion',
    is_flag=True,
    show_default=True,
    help='Do not insert silence around segments',
)
@click.option(
    '--wav_subtype',
    default='PCM_16',
    show_default=True,
    help='WAV subtype',
)
def main(wavs, tg, dataset, skip_silence_insertion, wav_subtype):
    """Command-line entry point: forward the parsed options to ``build_dataset``."""
    build_dataset(
        wavs=wavs,
        tg=tg,
        dataset=dataset,
        skip_silence_insertion=skip_silence_insertion,
        wav_subtype=wav_subtype,
    )
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()