Commit

Merge pull request #2 from LEEYOONHYUNG/master

add audios

LEEYOONHYUNG authored Jan 21, 2020
2 parents e14a7df + 34f5e55 commit 3c121e0
Showing 90 changed files with 14,966 additions and 0 deletions.
93 changes: 93 additions & 0 deletions audio_processing.py
@@ -0,0 +1,93 @@
import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.
    This is used to estimate modulation effects induced by windowing
    observations in short-time Fourier transforms.
    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope by overlap-adding the squared window at each frame offset
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """
    # Start from uniformly random phases
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    # Iteratively re-estimate the phases while keeping the given magnitudes fixed
    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C
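
A minimal usage sketch for the helpers above (not part of this commit): it exercises window_sumsquare and the compression pair with dummy data. The window type, frame counts, and spectrogram shape are illustrative assumptions, not values taken from this repository.

# Sketch only: dummy shapes/values below are assumptions, not repo settings.
import torch
from audio_processing import (window_sumsquare,
                              dynamic_range_compression,
                              dynamic_range_decompression)

# Sum-square envelope of a Hann window over 100 frames, as used for
# overlap-add correction in an inverse STFT.
env = window_sumsquare('hann', n_frames=100, hop_length=200,
                       win_length=800, n_fft=800)
print(env.shape)  # (n_fft + hop_length * (n_frames - 1),) = (20600,)

# Log-compression of a fake magnitude spectrogram and its inverse.
mag = torch.rand(80, 100) + 1e-3               # 80 bins x 100 frames (assumed shape)
compressed = dynamic_range_compression(mag)    # log(clamp(mag, 1e-5) * C), C=1
restored = dynamic_range_decompression(compressed)
print(torch.allclose(mag, restored))           # True: round trip is lossless above clip_val

griffin_lim is not exercised here because it requires the repository's STFT object exposing transform/inverse methods, which this file does not define.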
Binary file added audios/LJ001-0029_100000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0029_100000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0029_100000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0029_100000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0029_100000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0029_20000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0029_50000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0029_70000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0085_100000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0085_20000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0085_50000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ001-0085_70000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ002-0106_100000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ002-0106_20000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ002-0106_50000_fastspeech_9.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_10.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_11.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_12.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_8.wav
Binary file not shown.
Binary file added audios/LJ002-0106_70000_fastspeech_9.wav
Binary file not shown.
111 changes: 111 additions & 0 deletions data_inspection.ipynb
@@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-01-07T08:42:08.097687Z",
"start_time": "2020-01-07T08:42:05.681005Z"
},
"code_folding": []
},
"outputs": [],
"source": [
"import random\n",
"import hparams\n",
"import torch\n",
"import os\n",
"import pickle as pkl\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"\n",
"targets = []\n",
"alignments = []\n",
"\n",
"for file in os.listdir(f'{hparams.teacher_dir}/targets'):\n",
" file_name = file[:-4]\n",
"\n",
" with open(f'{hparams.teacher_dir}/targets/{file_name}.pkl', 'rb') as f:\n",
" targets.append( (file_name, pkl.load(f)) )\n",
" with open(f'{hparams.teacher_dir}/alignments/{file_name}.pkl', 'rb') as f:\n",
" alignments.append( (file_name, pkl.load(f)) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-01-07T08:42:39.457484Z",
"start_time": "2020-01-07T08:42:08.099257Z"
}
},
"outputs": [],
"source": [
"idx = random.choice(range(len(alignments)))\n",
"\n",
"print(f'{targets[idx][0]}')\n",
"fig, axes = plt.subplots(2, 1, figsize=(16,8))\n",
"axes[0].imshow(targets[idx][1].numpy(),\n",
" origin='lower',\n",
" aspect='auto')\n",
"axes[1].imshow(alignments[idx][1].numpy().T,\n",
" origin='lower',\n",
" aspect='auto')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
97 changes: 97 additions & 0 deletions data_utils.py
@@ -0,0 +1,97 @@
import random
import numpy as np
import hparams
import torch
import torch.utils.data
import os
import pickle as pkl

from text import text_to_sequence


def load_filepaths_and_text(metadata, teacher_path, split="|"):
    filepaths_and_text = []
    with open(metadata, encoding='utf-8') as f:
        for line in f:
            file_name, text1, text2 = line.strip().split('|')
            if os.path.exists(f'{teacher_path}/alignments/{file_name}.pkl'):
                filepaths_and_text.append((file_name, text1, text2))
    return filepaths_and_text


class TextMelLoader(torch.utils.data.Dataset):
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text, hparams.teacher_dir)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        file_name = audiopath_and_text[0][:10]
        seq = os.path.join(hparams.data_path, 'sequence')
        mel = os.path.join(hparams.data_path, 'melspectrogram')

        with open(f'{seq}/{file_name}_sequence.pkl', 'rb') as f:
            text = pkl.load(f)

        if hparams.distillation == True:
            with open(f'{hparams.teacher_dir}/targets/{file_name}.pkl', 'rb') as f:
                mel = pkl.load(f)
        else:
            with open(f'{mel}/{file_name}_melspectrogram.pkl', 'rb') as f:
                mel = pkl.load(f)

        with open(f'{hparams.teacher_dir}/alignments/{file_name}.pkl', 'rb') as f:
            alignments = pkl.load(f)

        return (text, mel, alignments)

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class TextMelCollate():
    def __init__(self):
        return

    def __call__(self, batch):
        # Right zero-pad all text sequences to the max input length in the batch
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x[0]) for x in batch]),
            dim=0, descending=True)
        max_input_len = input_lengths[0]

        text_padded = torch.zeros(len(batch), max_input_len, dtype=torch.long)
        for i in range(len(ids_sorted_decreasing)):
            text = batch[ids_sorted_decreasing[i]][0]
            text_padded[i, :text.size(0)] = text

        # Right zero-pad the mel-spectrograms to the longest target in the batch
        num_mels = batch[0][1].size(0)
        max_target_len = max([x[1].size(1) for x in batch])

        mel_padded = torch.zeros(len(batch), num_mels, max_target_len)
        output_lengths = torch.LongTensor(len(batch))
        for i in range(len(ids_sorted_decreasing)):
            mel = batch[ids_sorted_decreasing[i]][1]
            mel_padded[i, :, :mel.size(1)] = mel
            output_lengths[i] = mel.size(1)

        # Right zero-pad the teacher alignments (mel frames x text tokens)
        align_padded = torch.zeros(len(batch), max_target_len, max_input_len)
        for i in range(len(ids_sorted_decreasing)):
            align = batch[ids_sorted_decreasing[i]][2]
            align_padded[i, :align.size(0), :align.size(1)] = align

        return text_padded, input_lengths, mel_padded, output_lengths, align_padded
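
A minimal sketch (not part of this commit) of how TextMelLoader and TextMelCollate would typically be wired into a torch DataLoader; the metadata filename and batch size here are assumptions, not values defined in this diff.

# Sketch only: 'metadata.csv' and batch_size=16 are illustrative assumptions.
import hparams
from torch.utils.data import DataLoader
from data_utils import TextMelLoader, TextMelCollate

dataset = TextMelLoader('metadata.csv', hparams)   # LJSpeech-style "id|text|normalized text" lines
loader = DataLoader(dataset,
                    batch_size=16,
                    shuffle=True,
                    collate_fn=TextMelCollate())

for text_padded, input_lengths, mel_padded, output_lengths, align_padded in loader:
    # text_padded:  (B, max_text_len)               zero-padded token ids
    # mel_padded:   (B, n_mel, max_mel_len)         zero-padded (teacher or ground-truth) mels
    # align_padded: (B, max_mel_len, max_text_len)  zero-padded teacher alignments
    break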
Binary file added figures/duration_loss.JPG
Binary file added figures/train_loss.JPG
Binary file added figures/train_plots.JPG
Binary file added figures/val_loss.JPG
Binary file added figures/val_plots.JPG