-
-
Notifications
You must be signed in to change notification settings - Fork 6
/
write_records.py
79 lines (63 loc) · 2.46 KB
/
write_records.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from os import path
from avsr.dataset_writer import RecordFileWriter
from avsr.io_utils import get_files
def main():
    """Write the label, audio, and video .tfrecord files for a dataset.

    This is a template: every path below is a placeholder, so tailor the
    script to your own use case before running it.

    Required files:
      - train.scp and test.scp: one example per line, given as a
        relative path from the dataset root, e.g.
            foo/bar/file1noext
            foo/bar/file2noext
            bar/baz/file9000noext
      - label_file: one (example name - transcription) pair per line,
        delimited by a space, e.g.
            foo/bar/file1noext if liberty is not entire it is not liberty
            foo/bar/file2noext you must unlearn what you have learned
            bar/baz/file9000noext i'm sorry i don't want to be an emperor
      - unit_list_file: defines the vocabulary, one token per line

    Before the video record files can be written, the video clips have
    to be processed in advance (with OpenFace) so that the aligned faces
    are stored on disk. See the provided example `extract_faces.py`.

    :return: None
    """
    dataset_root = '/path/to/your/dataset_root/'
    train_scp = '/path/to/train.scp'
    test_scp = '/path/to/test.scp'

    train_files = get_files(train_scp, dataset_root)
    test_files = get_files(test_scp, dataset_root)

    def _without_extension(name):
        # Drop the trailing file extension, if any.
        return path.splitext(name)[0]

    # Key: the listed file path minus its extension.
    # Value: the same path with everything up to and including the last
    # 'dataset_name/' marker removed (the whole path is kept when the
    # marker does not occur), again minus its extension.
    label_map = {
        _without_extension(entry):
            _without_extension(entry.split('dataset_name/')[-1])
        for entry in train_files + test_files
    }

    writer = RecordFileWriter(
        train_files=train_files,
        test_files=test_files,
        label_map=label_map,
    )

    # Transcription labels.
    writer.write_labels_records(
        unit='character',
        unit_list_file='./avsr/misc/character_list',
        label_file='/path/to/label_file/',
        train_record_name='/output/path/characters_train.tfrecord',
        test_record_name='/output/path/characters_test.tfrecord',
    )

    # Audio records; the SNR list mixes the 'clean' marker with numeric levels.
    writer.write_audio_records(
        content_type='feature',
        extension='wav',
        transform='logmel_stack_w8s3',
        snr_list=['clean', 10, 0, -5],
        target_sr=16000,
        noise_type='cafe',
        train_record_name='/output/path/logmel_train',
        test_record_name='/output/path/logmel_test',
    )

    # Video records, read from the aligned-face directory.
    writer.write_bmp_records(
        train_record_name='/output/path/rgb36lips_train.tfrecord',
        test_record_name='/output/path/rgb36lips_test.tfrecord',
        bmp_dir='/path/to/your/dataset_root/aligned_openface/',
        output_resolution=(36, 36),
        crop_lips=True,
    )
# Run the record-writing pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()