Commit

Merge pull request #2 from LEEYOONHYUNG/master

add audios

LEEYOONHYUNG authored Jan 21, 2020
2 parents e14a7df + 34f5e55 commit 3c121e0
Showing 90 changed files with 14,966 additions and 0 deletions.
93 changes: 93 additions & 0 deletions audio_processing.py
@@ -0,0 +1,93 @@
import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.
    This is used to estimate modulation effects induced by windowing
    observations in short-time Fourier transforms.
    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope by overlap-adding the squared window at each frame offset
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """
    # Start from uniformly random phases
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    # Iteratively re-estimate the phases while keeping the given magnitudes fixed
    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C
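
A minimal usage sketch for the helpers above (not part of this commit): it exercises window_sumsquare and the compression pair with dummy data. The window type, frame counts, and spectrogram shape are illustrative assumptions, not values taken from this repository.

# Sketch only: dummy shapes/values below are assumptions, not repo settings.
import torch
from audio_processing import (window_sumsquare,
                              dynamic_range_compression,
                              dynamic_range_decompression)

# Sum-square envelope of a Hann window over 100 frames, as used for
# overlap-add correction in an inverse STFT.
env = window_sumsquare('hann', n_frames=100, hop_length=200,
                       win_length=800, n_fft=800)
print(env.shape)  # (n_fft + hop_length * (n_frames - 1),) = (20600,)

# Log-compression of a fake magnitude spectrogram and its inverse.
mag = torch.rand(80, 100) + 1e-3               # 80 bins x 100 frames (assumed shape)
compressed = dynamic_range_compression(mag)    # log(clamp(mag, 1e-5) * C), C=1
restored = dynamic_range_decompression(compressed)
print(torch.allclose(mag, restored))           # True: round trip is lossless above clip_val

griffin_lim is not exercised here because it requires the repository's STFT object exposing transform/inverse methods, which this file does not define.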
Binary file added audios/LJ001-0029_100000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0029_100000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0029_100000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0029_100000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0029_100000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_9.wav
Binary file not shown.
111 changes: 111 additions & 0 deletions data_inspection.ipynb
@@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-01-07T08:42:08.097687Z",
"start_time": "2020-01-07T08:42:05.681005Z"
},
"code_folding": []
},
"outputs": [],
"source": [
"import random\n",
"import hparams\n",
"import torch\n",
"import os\n",
"import pickle as pkl\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"\n",
"targets = []\n",
"alignments = []\n",
"\n",
"for file in os.listdir(f'{hparams.teacher_dir}/targets'):\n",
" file_name = file[:-4]\n",
"\n",
" with open(f'{hparams.teacher_dir}/targets/{file_name}.pkl', 'rb') as f:\n",
" targets.append( (file_name, pkl.load(f)) )\n",
" with open(f'{hparams.teacher_dir}/alignments/{file_name}.pkl', 'rb') as f:\n",
" alignments.append( (file_name, pkl.load(f)) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-01-07T08:42:39.457484Z",
"start_time": "2020-01-07T08:42:08.099257Z"
}
},
"outputs": [],
"source": [
"idx = random.choice(range(len(alignments)))\n",
"\n",
"print(f'{targets[idx][0]}')\n",
"fig, axes = plt.subplots(2, 1, figsize=(16,8))\n",
"axes[0].imshow(targets[idx][1].numpy(),\n",
" origin='lower',\n",
" aspect='auto')\n",
"axes[1].imshow(alignments[idx][1].numpy().T,\n",
" origin='lower',\n",
" aspect='auto')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
97 changes: 97 additions & 0 deletions data_utils.py
@@ -0,0 +1,97 @@
import random
import numpy as np
import hparams
import torch
import torch.utils.data
import os
import pickle as pkl

from text import text_to_sequence


def load_filepaths_and_text(metadata, teacher_path, split="|"):
    filepaths_and_text = []
    with open(metadata, encoding='utf-8') as f:
        for line in f:
            file_name, text1, text2 = line.strip().split('|')
            if os.path.exists(f'{teacher_path}/alignments/{file_name}.pkl'):
                filepaths_and_text.append((file_name, text1, text2))
    return filepaths_and_text


class TextMelLoader(torch.utils.data.Dataset):
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text, hparams.teacher_dir)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        file_name = audiopath_and_text[0][:10]
        seq = os.path.join(hparams.data_path, 'sequence')
        mel = os.path.join(hparams.data_path, 'melspectrogram')

        with open(f'{seq}/{file_name}_sequence.pkl', 'rb') as f:
            text = pkl.load(f)

        if hparams.distillation == True:
            with open(f'{hparams.teacher_dir}/targets/{file_name}.pkl', 'rb') as f:
                mel = pkl.load(f)
        else:
            with open(f'{mel}/{file_name}_melspectrogram.pkl', 'rb') as f:
                mel = pkl.load(f)

        with open(f'{hparams.teacher_dir}/alignments/{file_name}.pkl', 'rb') as f:
            alignments = pkl.load(f)

        return (text, mel, alignments)

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class TextMelCollate():
    def __init__(self):
        return

    def __call__(self, batch):
        # Right zero-pad all text sequences to the max input length in the batch
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x[0]) for x in batch]),
            dim=0, descending=True)
        max_input_len = input_lengths[0]

        text_padded = torch.zeros(len(batch), max_input_len, dtype=torch.long)
        for i in range(len(ids_sorted_decreasing)):
            text = batch[ids_sorted_decreasing[i]][0]
            text_padded[i, :text.size(0)] = text

        # Right zero-pad the mel-spectrograms to the longest target in the batch
        num_mels = batch[0][1].size(0)
        max_target_len = max([x[1].size(1) for x in batch])

        mel_padded = torch.zeros(len(batch), num_mels, max_target_len)
        output_lengths = torch.LongTensor(len(batch))
        for i in range(len(ids_sorted_decreasing)):
            mel = batch[ids_sorted_decreasing[i]][1]
            mel_padded[i, :, :mel.size(1)] = mel
            output_lengths[i] = mel.size(1)

        # Right zero-pad the teacher alignments (mel frames x text tokens)
        align_padded = torch.zeros(len(batch), max_target_len, max_input_len)
        for i in range(len(ids_sorted_decreasing)):
            align = batch[ids_sorted_decreasing[i]][2]
            align_padded[i, :align.size(0), :align.size(1)] = align

        return text_padded, input_lengths, mel_padded, output_lengths, align_padded
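
A minimal sketch (not part of this commit) of how TextMelLoader and TextMelCollate would typically be wired into a torch DataLoader; the metadata filename and batch size here are assumptions, not values defined in this diff.

# Sketch only: 'metadata.csv' and batch_size=16 are illustrative assumptions.
import hparams
from torch.utils.data import DataLoader
from data_utils import TextMelLoader, TextMelCollate

dataset = TextMelLoader('metadata.csv', hparams)   # LJSpeech-style "id|text|normalized text" lines
loader = DataLoader(dataset,
                    batch_size=16,
                    shuffle=True,
                    collate_fn=TextMelCollate())

for text_padded, input_lengths, mel_padded, output_lengths, align_padded in loader:
    # text_padded:  (B, max_text_len)               zero-padded token ids
    # mel_padded:   (B, n_mel, max_mel_len)         zero-padded (teacher or ground-truth) mels
    # align_padded: (B, max_mel_len, max_text_len)  zero-padded teacher alignments
    break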
Binary file added figures/duration_loss.JPG
Binary file added figures/train_loss.JPG
Binary file added figures/train_plots.JPG
Binary file added figures/val_loss.JPG
Binary file added figures/val_plots.JPG