# build_dataset.py

import csv
import pathlib
import random

import click
import librosa
import numpy as np
import soundfile
import tqdm
from textgrid import TextGrid


def _read_phones(tgfile):
    """Read the phoneme marks and durations from a TextGrid file.

    Args:
        tgfile: Path of the TextGrid file to read.

    Returns:
        A ``(ph_seq, ph_dur)`` pair of lists: the ``mark`` of every interval
        in the tier named 'phones', and ``maxTime - minTime`` of every
        interval, in tier order.

    Raises:
        ValueError: If the file contains no tier named 'phones'.
    """
    grid = TextGrid()
    grid.read(str(tgfile))
    # Take the first tier called 'phones'; fail loudly instead of letting a
    # None tier surface later as an opaque "NoneType is not iterable".
    phones_tier = next((tier for tier in grid if tier.name == 'phones'), None)
    if phones_tier is None:
        raise ValueError(f"No 'phones' tier found in {tgfile}")
    ph_seq = [ph.mark for ph in phones_tier]
    ph_dur = [ph.maxTime - ph.minTime for ph in phones_tier]
    return ph_seq, ph_dur


def _pad_with_silence(y, ph_seq, ph_dur, samplerate, min_sil, max_sil):
    """Randomly prepend and/or append zero-valued samples to a waveform.

    Each end is padded independently with probability 0.5 by a random number
    of samples drawn from ``[min_sil, max_sil)``. ``ph_seq`` and ``ph_dur``
    are updated in place to account for the padding: when the phoneme next to
    the padding is already 'SP' its duration is extended, otherwise a new
    'SP' entry is added. An empty phoneme sequence simply receives a new 'SP'.

    Args:
        y: 1-D waveform array.
        ph_seq: List of phoneme marks (modified in place).
        ph_dur: List of phoneme durations, same length as ``ph_seq``
            (modified in place). The padding is added as
            ``len_sil / samplerate``, i.e. in seconds.
        samplerate: Sample rate of ``y``.
        min_sil: Minimum padding length in samples (inclusive).
        max_sil: Maximum padding length in samples (exclusive).

    Returns:
        The (possibly) padded waveform.
    """
    if random.random() < 0.5:
        len_sil = random.randrange(min_sil, max_sil)
        y = np.concatenate((np.zeros((len_sil,), dtype=np.float32), y))
        # `ph_seq and ...` keeps an empty tier from raising IndexError.
        if ph_seq and ph_seq[0] == 'SP':
            ph_dur[0] += len_sil / samplerate
        else:
            ph_seq.insert(0, 'SP')
            ph_dur.insert(0, len_sil / samplerate)
    if random.random() < 0.5:
        len_sil = random.randrange(min_sil, max_sil)
        y = np.concatenate((y, np.zeros((len_sil,), dtype=np.float32)))
        if ph_seq and ph_seq[-1] == 'SP':
            ph_dur[-1] += len_sil / samplerate
        else:
            ph_seq.append('SP')
            ph_dur.append(len_sil / samplerate)
    return y


def build_dataset(wavs, tg, dataset, skip_silence_insertion, wav_subtype):
    """Collect wav segments and their phoneme alignments into a dataset.

    For every ``*.wav`` file in ``wavs`` the audio is loaded as mono at
    44100 Hz, the matching ``<stem>.TextGrid`` in ``tg`` is read for its
    'phones' tier, random leading/trailing silence is optionally inserted,
    and the audio is written to ``<dataset>/raw/wavs``. The phoneme
    sequences and durations of all segments are written to
    ``<dataset>/raw/transcriptions.csv``.

    Args:
        wavs: Path to the directory containing the wav segments.
        tg: Path to the directory containing the TextGrid files.
        dataset: Path to the output dataset directory (created if missing).
        skip_silence_insertion: If true, do not pad segments with silence.
        wav_subtype: Subtype passed to ``soundfile.write`` (e.g. "PCM_16").

    Raises:
        ValueError: If a TextGrid file has no tier named 'phones'.
    """
    wavs = pathlib.Path(wavs)
    tg_dir = pathlib.Path(tg)
    dataset = pathlib.Path(dataset)
    # Sorted so that the row order of transcriptions.csv does not depend on
    # the filesystem's directory enumeration order.
    filelist = sorted(wavs.glob('*.wav'))

    wav_out_dir = dataset / 'raw' / 'wavs'
    wav_out_dir.mkdir(parents=True, exist_ok=True)

    samplerate = 44100
    # Inserted silence lasts between 0.1 s and 0.5 s, expressed in samples.
    min_sil = int(0.1 * samplerate)
    max_sil = int(0.5 * samplerate)

    transcriptions = []
    for wavfile in tqdm.tqdm(filelist):
        y, _ = librosa.load(wavfile, sr=samplerate, mono=True)
        tgfile = tg_dir / wavfile.with_suffix('.TextGrid').name
        ph_seq, ph_dur = _read_phones(tgfile)
        if not skip_silence_insertion:
            y = _pad_with_silence(y, ph_seq, ph_dur, samplerate, min_sil, max_sil)
        soundfile.write(wav_out_dir / wavfile.name, y, samplerate, subtype=wav_subtype)
        transcriptions.append({
            'name': wavfile.stem,
            'ph_seq': ' '.join(ph_seq),
            'ph_dur': ' '.join(str(round(d, 6)) for d in ph_dur),
        })

    with open(dataset / 'raw' / 'transcriptions.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur'])
        writer.writeheader()
        writer.writerows(transcriptions)

    print(f'All wavs and transcriptions saved in {dataset}')


@click.command(help='Collect phoneme alignments into transcriptions.csv')
@click.option(
    '--wavs',
    required=True,
    help='Path to the segments directory',
)
@click.option(
    '--tg',
    required=True,
    help='Path to the final TextGrids directory',
)
@click.option(
    '--dataset',
    required=True,
    help='Path to dataset directory',
)
@click.option(
    '--skip_silence_insertion',
    is_flag=True,
    show_default=True,
    help='Do not insert silence around segments',
)
@click.option(
    '--wav_subtype',
    default='PCM_16',
    show_default=True,
    help='WAV subtype',
)
def main(wavs, tg, dataset, skip_silence_insertion, wav_subtype):
    """Command-line entry point: forward the parsed options to build_dataset."""
    build_dataset(
        wavs=wavs,
        tg=tg,
        dataset=dataset,
        skip_silence_insertion=skip_silence_insertion,
        wav_subtype=wav_subtype,
    )

# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()