sudo apt install git-lfs # for UTMOS
conda create -y -n py39 python=3.9.21 pip=24.0
conda activate py39
pip install -r requirements.txt
pip install flash-attn --no-build-isolation # optional
sh scripts/setup.sh # download textlesslib and UTMOS
cd src/textlesslib
pip install -e .
cd -
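Before downloading any pretrained models, it can be worth confirming that the environment is usable. The following is a minimal sanity-check sketch; it only assumes the packages installed above (flash-attn and the editable textlesslib install are optional/local, so their absence is reported rather than treated as an error).

```python
# sanity check: confirm PyTorch sees a GPU and that optional packages import cleanly
import torch

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())

try:
    import flash_attn  # optional; only present if flash-attn was installed above
    print("flash-attn:", flash_attn.__version__)
except ImportError:
    print("flash-attn not installed (optional)")

try:
    import textless  # provided by the editable install in src/textlesslib
    print("textlesslib import OK")
except ImportError:
    print("textlesslib not found; re-run `pip install -e .` in src/textlesslib")
```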
import torchaudio
from textless.data.speech_encoder import SpeechEncoder
from src.flow_matching.models import ConditionalFlowMatchingWithBigVGan
wav_path = "/path/to/wav"
encoder = SpeechEncoder.by_name(
dense_model_name="mhubert-base-vp_mls_cv_8lang",
quantizer_model_name="kmeans-expresso",
vocab_size=2000,
deduplicate=False,
need_f0=False,
).cuda()
# download a pretrained model from the Hugging Face Hub
decoder = ConditionalFlowMatchingWithBigVGan.from_pretrained("ryota-komatsu/flow_matching_with_bigvgan").cuda()
# load a waveform
waveform, sr = torchaudio.load(wav_path)
waveform = torchaudio.functional.resample(waveform, sr, 16000)
# encode a waveform into pseudo-phonetic units
units = encoder(waveform.cuda())["units"]
units = units.unsqueeze(0) + 1 # 0: pad
# resynthesis
audio_values = decoder(units)
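To listen to the result, you can write the generated waveform to disk. This is a sketch under two assumptions: `audio_values` is a `(1, num_samples)` float tensor on the GPU, and the vocoder outputs 16 kHz audio (check the pretrained model's config if its sample rate differs).

```python
import torchaudio

# save the resynthesized audio
# assumption: audio_values has shape (1, num_samples) and the vocoder outputs 16 kHz audio
torchaudio.save("resynth.wav", audio_values.detach().cpu(), 16000)
```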
import torch
import torchaudio
from textless.data.speech_encoder import SpeechEncoder
from transformers import AutoModelForCausalLM, AutoTokenizer
wav_path = "/path/to/wav"
encoder = SpeechEncoder.by_name(
dense_model_name="hubert-base-ls960",
quantizer_model_name="kmeans",
vocab_size=100,
deduplicate=True,
need_f0=False,
).cuda()
model = AutoModelForCausalLM.from_pretrained("/path/to/pretrained/model").cuda()
tokenizer = AutoTokenizer.from_pretrained("/path/to/pretrained/model")
# load a waveform
waveform, sr = torchaudio.load(wav_path)
waveform = torchaudio.functional.resample(waveform, sr, 16000)
# encode a waveform into pseudo-phonetic units
input_ids = encoder(waveform.cuda())["units"].tolist()
input_ids = tokenizer("".join([f"<{unit}>" for unit in input_ids]), return_tensors="pt").input_ids.cuda()
# Speech LM
logits = model(input_ids=input_ids).logits
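Computing logits is enough for scoring, but you may also want to sample a continuation of the unit sequence. The sketch below assumes each unit is a single `<k>` token in the tokenizer's vocabulary (as in the encoding step above); the generation settings and the regex-based parsing are illustrative, not part of the repository's documented API.

```python
import re

# sample a continuation of the unit sequence (generation settings are illustrative)
generated = model.generate(input_ids=input_ids, do_sample=True, top_k=50, max_new_tokens=50)

# keep only the newly generated tokens and parse them back into unit IDs
new_tokens = generated[0, input_ids.shape[1]:]
decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
continuation_units = [int(u) for u in re.findall(r"<(\d+)>", decoded)]
```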
Visit the demo page for speech samples.
If you already have LibriTTS-R, you can use it by editing a config file:
dataset:
  wav_dir_orig: "/path/to/LibriTTS-R" # ${dataset.wav_dir_orig}/train-clean-100, train-clean-360, ...
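If you point the config at an existing copy, a quick check that the expected split directories are present can save a failed run; this is a small sketch using the layout noted in the comment above.

```python
# check that an existing LibriTTS-R copy has the expected split directories (sketch)
from pathlib import Path

wav_dir_orig = Path("/path/to/LibriTTS-R")
for split in ["train-clean-100", "train-clean-360"]:
    print(split, "found" if (wav_dir_orig / split).is_dir() else "missing")
```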
Otherwise, you can download a fresh copy under `dataset_root`:
dataset_root=data
sh scripts/download_libritts.sh ${dataset_root}
To perform speech language modeling, download Libri-Light under `dataset_root`:
dataset_root=data
sh scripts/download_librilight.sh ${dataset_root} # 7TB
sh scripts/download_slm21.sh # download sWUGGY and sBLIMP
To run only a specific stage, pass it as an argument.
Supported processing stages:
- resample
- extract_features # can be skipped when using a pretrained BigVGan
- train_bigvgan # can be skipped when using a pretrained BigVGan
- tokenize_dataset # can be skipped when using a Hugging Face dataset
- train_flow_matching
- synthesize
python main_resynth.py train_flow_matching --config=configs/unit2speech/mhubert-expresso-2000.yaml
Set `nproc_per_node` to the number of GPUs to enable multi-GPU training.
torchrun \
--nnodes=1 \
--nproc_per_node=1 \
--rdzv_id=100 \
--rdzv_backend=c10d \
--rdzv_endpoint=localhost:29400 \
main_speechlm.py \
--config=configs/speechlm/hubert.yaml
To run only a sub-task (encode, tokenize, or train), specify it as an argument.
torchrun \
--nnodes=1 \
--nproc_per_node=1 \
--rdzv_id=100 \
--rdzv_backend=c10d \
--rdzv_endpoint=localhost:29400 \
main_speechlm.py encode \
--config=configs/speechlm/hubert.yaml
See the Zero Resource Speech Challenge homepage and paper for task details.
python main_speechlm.py eval --config=configs/speechlm/hubert.yaml