import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import math
import os, re, gc
from glob import glob
from scipy import signal
from scipy.io import wavfile
from scipy.fftpack import fft

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization

from keras.utils import np_utils
from keras.utils import plot_model
from keras.models import load_model

from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard
from keras.callbacks import LearningRateScheduler
from keras.callbacks import EarlyStopping

import librosa
import librosa.display
from tqdm import tqdm
from random import shuffle
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

sns.set_style("whitegrid")

SAVE_PATH = '/data/vision/fisher/data1/kaggle/speech/'
DATA_PATH = '/data/vision/fisher/data1/kaggle/speech/data/'

SAMPLE_LEN = 16000
NEW_SAMPLE_RATE = 8000
CLASS_LABELS = 'yes no up down left right on off stop go silence unknown'.split()

def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N // 2)
    vals = 2.0/N * np.abs(yf[0:N//2])
    return xf, vals
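#usage sketch (hedged): one-sided amplitude spectrum of a pure tone --
#    t = np.arange(16000) / 16000.0
#    xf, vals = custom_fft(np.sin(2*np.pi*440*t), 16000)
#vals then peaks at xf ~= 440 Hz; custom_fft is kept for exploration and
#is not used by the training pipeline below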

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    #window_size and step_size are in milliseconds; scipy wants sample
    #counts with noverlap = nperseg - hop (here window = 2 x step, so the
    #old noverlap = step formula gave the same value by coincidence)
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = nperseg - int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio, fs=sample_rate, window='hann',
                                            nperseg=nperseg, noverlap=noverlap, detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)
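#quick shape check (hedged): a 1 s clip at 8 kHz gives nperseg = 160 and
#hop = 80, so the transposed output is (time frames, frequency bins) --
#    _, _, spec = log_specgram(np.zeros(8000, dtype=np.float32), 8000)
#    assert spec.shape == (99, 81)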

def list_wavs_fname(dirpath, ext='wav'):
    print(dirpath)
    fpaths = glob(os.path.join(dirpath, '*/*.' + ext))
    pat = r'.+/(\w+)/\w+\.' + ext + '$'
    labels = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            labels.append(r.group(1))
        #end if
    #end for
    pat = r'.+/(\w+\.' + ext + ')$'
    fnames = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            fnames.append(r.group(1))
        #end if
    #end for
    return labels, fnames

def pad_audio(samples):
    #left-pad clips shorter than 1 s (SAMPLE_LEN samples) with zeros
    if len(samples) >= SAMPLE_LEN:
        return samples
    return np.pad(samples, pad_width=(SAMPLE_LEN - len(samples), 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]
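#chop_audio is only reached for clips longer than 1 s -- in this dataset
#the long _background_noise_ recordings -- and draws num random 1 s
#windows from each to synthesize 'silence' training examples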

def label_transform(labels):
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in CLASS_LABELS:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
        #end if
    #end for
    return pd.get_dummies(pd.Series(nlabels))
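#hypothetical example: label_transform(['yes', 'bird', '_background_noise_'])
#returns a one-hot DataFrame with indicator columns 'silence', 'unknown',
#'yes' (get_dummies sorts column names alphabetically)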

def test_data_generator(batch=128):
    test_data_path = DATA_PATH + '/test/audio/'
    fpaths = glob(os.path.join(test_data_path, '*.wav'))
    i = 0
    for path in fpaths:
        if i == 0:
            imgs = []
            fnames = []
        #end if
        i += 1
        rate, samples = wavfile.read(path)
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int((NEW_SAMPLE_RATE / float(rate)) * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=NEW_SAMPLE_RATE)
        imgs.append(specgram)
        fnames.append(path.split('/')[-1])
        if i == batch:
            i = 0
            imgs = np.array(imgs)
            imgs = np.expand_dims(imgs, axis=-1)
            yield fnames, imgs
        #end if
    #end for
    if 0 < i < batch:
        #yield the final partial batch (the old i < batch test re-yielded a
        #full batch whenever the file count was an exact multiple of batch)
        imgs = np.array(imgs)
        imgs = np.expand_dims(imgs, axis=-1)
        yield fnames, imgs
    #end if
    #no explicit StopIteration: under PEP 479 raising it inside a generator
    #becomes a RuntimeError in Python 3.7+; falling off the end is enough
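#streaming the test set in batches keeps memory bounded: the test folder
#is far larger than train, so materializing every spectrogram at once
#would likely not fit in RAM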

def step_decay(epoch):
    lr_init = 0.001
    drop = 0.5
    epochs_drop = 4.0
    lr_new = lr_init * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lr_new
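#the schedule halves the rate every 4 epochs:
#epochs 0-2 -> 1e-3, epochs 3-6 -> 5e-4, epochs 7-10 -> 2.5e-4, ...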

class LR_hist(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = []
        self.lr = []
    def on_epoch_end(self, epoch, logs={}):
        self.losses.append(logs.get('loss'))
        self.lr.append(step_decay(len(self.losses)))
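#note: LR_hist re-computes step_decay instead of reading the optimizer's
#lr back from Adam, so the recorded curve is the intended schedule and
#assumes the LearningRateScheduler below is active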


#load data
train_data_path = DATA_PATH + '/train/audio/'
labels, fnames = list_wavs_fname(train_data_path)

#visualize data
sample_file = '/yes/0a7c2a8d_nohash_0.wav'
sample_rate, samples = wavfile.read(train_data_path + sample_file)
freqs, times, spectrogram = log_specgram(samples, sample_rate)

#librosa expects floating-point audio, but wavfile.read returns int16
S = librosa.feature.melspectrogram(y=samples.astype(np.float32), sr=sample_rate, n_mels=128)
log_S = librosa.power_to_db(S, ref=np.max)
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)
delta_mfcc = librosa.feature.delta(mfcc, order=1)
delta2_mfcc = librosa.feature.delta(mfcc, order=2)
mfcc_feat = np.vstack((mfcc, delta_mfcc, delta2_mfcc))

f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=False)
#x-axis in seconds: 0 .. duration with one point per sample (the old
#linspace swapped the ratio and only looked right for exactly-1 s clips)
ax1.plot(np.linspace(0, len(samples)/float(sample_rate), num=len(samples)), samples)
ax1.set_title("raw wave of " + sample_file); ax1.set_ylabel("amplitude")
ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
ax2.set_yticks(freqs[::16]); ax2.set_xticks(times[::16])
ax2.set_title('spectrogram of ' + sample_file)
ax2.set_ylabel('freq in Hz'); ax2.set_xlabel('seconds')
plt.savefig('./figures/speech_features1.png')

plt.figure()
ax1 = plt.subplot(2,1,1)
librosa.display.specshow(mfcc)
plt.title("MFCC")
ax2 = plt.subplot(2,1,2, sharex=ax1)
librosa.display.specshow(delta_mfcc, x_axis='time')
plt.title("delta MFCC")
plt.savefig('./figures/speech_features2.png')

#reduce training size
labels_fnames = list(zip(labels, fnames))
shuffle(labels_fnames)
NUM_TRAIN = int(0.1 * len(labels_fnames))
#NUM_TRAIN = -1

print("loading training data...")
x_train, y_train = [], []
for label, fname in tqdm(labels_fnames[:NUM_TRAIN]):
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    samples = pad_audio(samples)
    if len(samples) > SAMPLE_LEN:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]
    #end if

    for samples in n_samples:
        resampled = signal.resample(samples, int((NEW_SAMPLE_RATE / float(sample_rate)) * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=NEW_SAMPLE_RATE)
        y_train.append(label)
        x_train.append(specgram)
    #end for
#end for

x_train = np.array(x_train)
x_train = np.expand_dims(x_train, axis=-1)
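#expand_dims turns each spectrogram into a single-channel NHWC 'image':
#x_train has shape (num_examples, 99, 81, 1), which Conv2D expects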

y_train = label_transform(y_train)
label_index = y_train.columns.values
num_classes = len(label_index)
y_train = y_train.values.astype(np.float32) #get_dummies may return bools in newer pandas

#free up memory
del labels, fnames, labels_fnames
gc.collect()

#TODO: try without re-sampling (more data)
#TODO: try merging MFCC coefficients (multi-input model)
#TODO: add batch normalization
#TODO: check over-fitting on dev and add regularization
#TODO: better pre-processing of the input

#training params
batch_size = 128
num_epochs = 16

#model parameters
img_rows = 99
img_cols = 81
weight_decay = 1e-4
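#99 x 81 = (time frames, frequency bins) of the log spectrograms built
#above; see the shape check after log_specgram for the arithmetic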

#CNN architecture
print("training CNN ...")
model = Sequential()
model.add(BatchNormalization(input_shape=(img_rows, img_cols, 1)))
model.add(Conv2D(32, kernel_size=(3,3), padding='same', activation='relu'))
model.add(Conv2D(32, kernel_size=(3,3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(64, kernel_size=(3,3), padding='same', activation='relu'))
model.add(Conv2D(64, kernel_size=(3,3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(128, kernel_size=(3,3), padding='same', activation='relu'))
model.add(Conv2D(128, kernel_size=(3,3), padding='same', activation='relu'))
model.add(Conv2D(128, kernel_size=(3,3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(128, kernel_size=(3,3), padding='same', activation='relu'))
model.add(Conv2D(128, kernel_size=(3,3), padding='same', activation='relu'))
model.add(Conv2D(128, kernel_size=(3,3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Flatten())
model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
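#shape flow: 99x81x1 -> four VGG-style conv blocks, each ending in a 2x2
#max-pool (99->49->24->12->6, 81->40->20->10->5), so Flatten sees
#6 x 5 x 128 = 3840 features -> 1024 -> 256 -> num_classes softmax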

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

#define callbacks
file_name = SAVE_PATH + 'speech-weights-checkpoint.h5'
checkpoint = ModelCheckpoint(file_name, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
tensor_board = TensorBoard(log_dir='./logs', write_graph=True)
hist_lr = LR_hist()
reduce_lr = LearningRateScheduler(step_decay)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=8, verbose=1)
callbacks_list = [checkpoint, tensor_board, hist_lr, reduce_lr, early_stopping]
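#the checkpoint keeps the best-val_loss weights on disk even if training
#later overfits; LearningRateScheduler overrides Adam's lr at each epoch,
#and early stopping halts once val_loss fails to improve by >= 0.01 for
#8 consecutive epochs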

#model training
hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.2, shuffle=True, verbose=2)
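#note: validation_split carves the val set off the *end* of x_train
#before any shuffling by fit, so the earlier shuffle of labels_fnames is
#what keeps the 20% holdout class-mixed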

model.save(SAVE_PATH + 'speech_final_model.h5', overwrite=True)
model.save_weights(SAVE_PATH + 'speech_final_weights.h5', overwrite=True)

#load saved model
#model = load_model(SAVE_PATH + 'speech_final_model.h5')

#model prediction
index, results = [], []
for fnames, imgs in test_data_generator(batch=batch_size):
    preds = model.predict(imgs)
    preds_class = np.argmax(preds, axis=-1)
    preds_labels = [label_index[p] for p in preds_class]
    index.extend(fnames)
    results.extend(preds_labels)
#end for

#create a submission
submission_df = pd.DataFrame(columns=['fname', 'label'])
submission_df['fname'] = index
submission_df['label'] = results
submission_df.to_csv("./data/first_speech.csv", index=False)


#generate plots
plt.figure()
plt.plot(hist.history['loss'], c='b', lw=2.0, label='train')
plt.plot(hist.history['val_loss'], c='r', lw=2.0, label='val')
plt.title('TF speech model')
plt.xlabel('Epochs')
plt.ylabel('Cross-Entropy Loss')
plt.legend(loc='upper right')
plt.savefig('./figures/speech_cnn_loss.png')

plt.figure()
plt.plot(hist.history['acc'], c='b', lw=2.0, label='train')
plt.plot(hist.history['val_acc'], c='r', lw=2.0, label='val')
plt.title('TF speech model')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
plt.savefig('./figures/speech_cnn_acc.png')

plt.figure()
plt.plot(hist_lr.lr, lw=2.0, label='learning rate')
plt.title('TF speech model')
plt.xlabel('Epochs')
plt.ylabel('Learning Rate')
plt.legend()
plt.savefig('./figures/speech_learning_rate.png')

plot_model(model, show_shapes=True, to_file='./figures/speech_model.png')