Commit
start
LEEYOONHYUNG committed Jan 13, 2020
1 parent 204e921 commit 5a06c0f
Showing 32 changed files with 15,358 additions and 0 deletions.
98 changes: 98 additions & 0 deletions audio_processing.py
@@ -0,0 +1,98 @@
import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(window,
                     n_frames,
                     hop_length=200,
                     win_length=800,
                     n_fft=800,
                     dtype=np.float32,
                     norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time Fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    norm : [optional]
        Normalization mode passed to `librosa.util.normalize`

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Overlap-add the squared window at each frame offset to fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    Griffin-Lim phase reconstruction from spectrogram magnitudes.

    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    n_iters: number of phase-refinement iterations
    """
    # Start from a random phase estimate, then alternate ISTFT / STFT,
    # keeping the given magnitudes and updating only the phases.
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    clip_val: lower bound applied before the log to avoid log(0)
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C
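
Below is a minimal usage sketch for the helpers above that do not depend on the rest of the commit (`griffin_lim` additionally needs the `STFT` wrapper with `transform`/`inverse` methods that this commit adds elsewhere, so it is omitted here). The window, hop, and FFT sizes are arbitrary example values, not values taken from this repository.

```python
import torch

from audio_processing import (window_sumsquare, dynamic_range_compression,
                              dynamic_range_decompression)

# Sum-square window envelope, used to undo windowing modulation after an ISTFT.
env = window_sumsquare('hann', n_frames=100, hop_length=200,
                       win_length=800, n_fft=800)
print(env.shape)  # (n_fft + hop_length * (n_frames - 1),) == (20600,)

# Log-compression round trip: values stay above clip_val (1e-5), so exp undoes log.
mel = torch.rand(80, 120) + 1e-3                 # e.g. (n_mels, frames)
log_mel = dynamic_range_compression(mel)         # log(clamp(x, min=1e-5) * C)
recovered = dynamic_range_decompression(log_mel)
assert torch.allclose(mel, recovered, atol=1e-4)
```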
73 changes: 73 additions & 0 deletions data_utils.py
@@ -0,0 +1,73 @@
import random
import numpy as np
import hparams
import torch
import torch.utils.data
import torch.nn.functional as F
import os
import pickle as pkl

from text import text_to_sequence


def load_filepaths_and_text(metadata, split="|"):
    with open(metadata, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text


class TextMelLoader(torch.utils.data.Dataset):
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # the first 10 characters of the audio path are used as the file id
        file_name = audiopath_and_text[0][:10]
        seq_dir = f'{hparams.data_path}/preprocessed/sequence'
        mel_dir = f'{hparams.data_path}/preprocessed/melspectrogram'

        # load the precomputed text sequence and mel-spectrogram
        with open(f'{seq_dir}/{file_name}_sequence.pkl', 'rb') as f:
            text = pkl.load(f)
        with open(f'{mel_dir}/{file_name}_melspectrogram.pkl', 'rb') as f:
            mel = pkl.load(f)

        return (text, mel)

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class TextMelCollate():
    def __init__(self):
        return

    def __call__(self, batch):
        # Right zero-pad all one-hot text sequences to max input length
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x[0]) for x in batch]),
            dim=0, descending=True)
        max_input_len = input_lengths[0]

        text_padded = torch.zeros(len(batch), max_input_len, dtype=torch.long)
        for i in range(len(ids_sorted_decreasing)):
            text = batch[ids_sorted_decreasing[i]][0]
            text_padded[i, :text.size(0)] = text

        # Right zero-pad mel-spectrograms to the longest target in the batch
        num_mels = batch[0][1].size(0)
        max_target_len = max([x[1].size(1) for x in batch])

        mel_padded = torch.zeros(len(batch), num_mels, max_target_len)
        output_lengths = torch.LongTensor(len(batch))
        for i in range(len(ids_sorted_decreasing)):
            mel = batch[ids_sorted_decreasing[i]][1]
            mel_padded[i, :, :mel.size(1)] = mel
            output_lengths[i] = mel.size(1)

        return text_padded, input_lengths, mel_padded, output_lengths
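
Below is a minimal sketch of how the dataset and collate function above might be wired into a `torch.utils.data.DataLoader`. The metadata filename and batch size are placeholders (the training script is not shown in this section), and `hparams.data_path` is assumed to point at the preprocessed data referenced above.

```python
import hparams
import torch
from torch.utils.data import DataLoader

from data_utils import TextMelLoader, TextMelCollate

# Hypothetical metadata file: one "<audiopath>|<text>" line per utterance.
metadata = f'{hparams.data_path}/metadata.csv'

train_set = TextMelLoader(metadata, hparams)
train_loader = DataLoader(train_set,
                          batch_size=16,
                          shuffle=True,
                          collate_fn=TextMelCollate(),
                          drop_last=True)

for text_padded, input_lengths, mel_padded, output_lengths in train_loader:
    # text_padded:    (B, max_text_len)         zero-padded token ids, sorted by length
    # input_lengths:  (B,)                      text lengths in descending order
    # mel_padded:     (B, n_mels, max_mel_len)  zero-padded mel-spectrograms
    # output_lengths: (B,)                      true mel lengths before padding
    break
```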