Skip to content

Commit 8780432

Browse files
committed
speech
speech
1 parent 722e593 commit 8780432

11 files changed

+1424
-0
lines changed

speech/figures/speech_cnn_acc.png

30.3 KB
Loading

speech/figures/speech_cnn_loss.png

31.2 KB
Loading
228 KB
Loading
14.9 KB
Loading
35.5 KB
Loading

speech/figures/speech_merged.png

462 KB
Loading

speech/figures/speech_model.png

90.9 KB
Loading

speech/speech_kernel.py

Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
import numpy as np
2+
import pandas as pd
3+
import seaborn as sns
4+
import matplotlib.pyplot as plt
5+
6+
import math
7+
import os, re, gc
8+
from glob import glob
9+
from scipy import signal
10+
from scipy.io import wavfile
11+
from scipy.fftpack import fft
12+
13+
import keras
14+
from keras import optimizers
15+
from keras import backend as K
16+
from keras import regularizers
17+
from keras.models import Sequential
18+
from keras.layers import Dense, Activation, Dropout, Flatten
19+
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
20+
21+
from keras.utils import np_utils
22+
from keras.utils import plot_model
23+
from keras.models import load_model
24+
25+
from keras.callbacks import ModelCheckpoint
26+
from keras.callbacks import TensorBoard
27+
from keras.callbacks import LearningRateScheduler
28+
from keras.callbacks import EarlyStopping
29+
30+
import librosa
31+
import librosa.display
32+
from tqdm import tqdm
33+
from random import shuffle
34+
from sklearn.decomposition import PCA
35+
from sklearn.metrics import accuracy_score
36+
37+
sns.set_style("whitegrid")
38+
39+
SAVE_PATH = '/data/vision/fisher/data1/kaggle/speech/'
40+
DATA_PATH = '/data/vision/fisher/data1/kaggle/speech/data/'
41+
42+
SAMPLE_LEN = 16000
43+
NEW_SAMPLE_RATE = 8000
44+
CLASS_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
45+
46+
def custom_fft(y, fs):
    """Return the one-sided amplitude spectrum of a signal.

    Args:
        y: array of samples; its first dimension gives the sample count N.
        fs: sampling frequency of ``y``.

    Returns:
        A ``(xf, vals)`` tuple. ``xf`` holds ``N // 2`` evenly spaced values
        from 0 to ``1 / (2 * T)`` with ``T = 1 / fs``; ``vals`` holds the
        magnitudes of the first ``N // 2`` FFT bins scaled by ``2 / N``.
    """
    n_samples = y.shape[0]
    half = n_samples // 2
    period = 1.0 / fs
    spectrum = fft(y)
    freq_axis = np.linspace(0.0, 1.0 / (2.0 * period), half)
    magnitudes = 2.0 / n_samples * np.abs(spectrum[:half])
    return freq_axis, magnitudes
53+
54+
def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    """Compute a log-scaled spectrogram of ``audio`` using a Hann window.

    Args:
        audio: signal handed to ``scipy.signal.spectrogram``.
        sample_rate: sampling frequency of ``audio``.
        window_size: segment length in milliseconds (converted to samples
            and passed as ``nperseg``).
        step_size: length in milliseconds that is converted to samples and
            passed as ``noverlap``.
        eps: small constant added before the logarithm so zero power does
            not produce ``-inf``.

    Returns:
        ``(freqs, times, log_spec)`` where ``log_spec`` is the float32
        natural log of the spectrogram, transposed relative to what
        ``scipy.signal.spectrogram`` returns.
    """
    def _ms_to_samples(ms):
        # Same arithmetic as a direct ms -> sample-count conversion.
        return int(round(ms * sample_rate / 1e3))

    freqs, times, spec = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=_ms_to_samples(window_size),
        noverlap=_ms_to_samples(step_size),
        detrend=False,
    )
    log_spec = np.log(spec.T.astype(np.float32) + eps)
    return freqs, times, log_spec
60+
61+
def list_wavs_fname(dirpath, ext='wav'):
    """List audio files one directory level below ``dirpath``.

    Globs ``dirpath/*/*<ext>`` and, for every path of the form
    ``.../<label>/<name>.<ext>`` (label and name made of word characters),
    records the parent directory name and the file name.

    Args:
        dirpath: root directory whose sub-directories hold the audio files.
        ext: file extension without the leading dot.

    Returns:
        ``(labels, fnames)``: two lists of equal length where ``labels[i]``
        is the directory containing ``fnames[i]``.
    """
    print(dirpath)
    fpaths = glob(os.path.join(dirpath, r'*/*' + ext))
    # A single pattern captures label and file name together. Matching them
    # in two separate passes let a path satisfy only one pattern (e.g. a
    # directory name containing '-'), which left the lists misaligned.
    pat = re.compile(r'.+/(\w+)/(\w+\.' + ext + ')$')
    labels, fnames = [], []
    for fpath in fpaths:
        match = pat.match(fpath)
        if match:
            labels.append(match.group(1))
            fnames.append(match.group(2))
    return labels, fnames
81+
82+
def pad_audio(samples):
    """Left-pad ``samples`` with zeros up to ``SAMPLE_LEN`` entries.

    Inputs that already have at least ``SAMPLE_LEN`` entries are returned
    unchanged (they are not truncated).
    """
    missing = SAMPLE_LEN - len(samples)
    if missing <= 0:
        return samples
    # All padding goes in front of the signal; nothing is appended after it.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))
85+
86+
def chop_audio(samples, L=16000, num=20):
    """Yield ``num`` randomly positioned windows of length ``L``.

    Args:
        samples: sequence to cut windows from; must hold at least ``L``
            entries.
        L: window length in samples.
        num: number of windows to yield.

    Yields:
        ``samples[beg: beg + L]`` for a uniformly drawn start ``beg``.

    Raises:
        ValueError: (from ``np.random.randint``, on first iteration) when
            ``samples`` is shorter than ``L``.
    """
    # randint's upper bound is exclusive: the +1 makes the final window
    # (the one ending on the last sample) reachable and lets an input of
    # exactly L samples yield itself instead of raising.
    n_starts = len(samples) - L + 1
    for _ in range(num):
        beg = np.random.randint(0, n_starts)
        yield samples[beg: beg + L]
90+
91+
def label_transform(labels):
    """One-hot encode directory labels after mapping them to class names.

    ``'_background_noise_'`` becomes ``'silence'``, any label missing from
    ``CLASS_LABELS`` becomes ``'unknown'``, and everything else is kept.

    Returns:
        The ``pandas.get_dummies`` DataFrame of the mapped labels; it has
        one column per class that actually occurs in ``labels``.
    """
    def _to_class(label):
        if label == '_background_noise_':
            return 'silence'
        return label if label in CLASS_LABELS else 'unknown'

    mapped = [_to_class(label) for label in labels]
    return pd.get_dummies(pd.Series(mapped))
103+
104+
def test_data_generator(batch=128):
    """Yield the test set as batches of log-spectrogram images.

    Every ``*wav`` file under ``DATA_PATH + '/test/audio/'`` is read,
    padded with ``pad_audio``, resampled to ``NEW_SAMPLE_RATE`` and turned
    into a spectrogram with ``log_specgram``.

    Args:
        batch: maximum number of files per yielded batch.

    Yields:
        ``(fnames, imgs)``: the list of file base names in the batch and an
        array of their spectrograms with a trailing axis of size 1 added.
        The last batch may hold fewer than ``batch`` items; nothing is
        yielded when no files are found.
    """
    test_data_path = DATA_PATH + '/test/audio/'
    fpaths = glob(os.path.join(test_data_path, '*wav'))

    imgs, fnames = [], []
    for path in fpaths:
        rate, samples = wavfile.read(path)
        # pad_audio only pads; clips longer than SAMPLE_LEN pass through
        # unchanged and would give a differently shaped spectrogram.
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int((NEW_SAMPLE_RATE / float(rate)) * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=NEW_SAMPLE_RATE)
        imgs.append(specgram)
        fnames.append(os.path.basename(path))
        if len(imgs) == batch:
            yield fnames, np.expand_dims(np.array(imgs), axis=-1)
            # Start fresh lists so the yielded batch is never mutated.
            imgs, fnames = [], []

    # Flush a trailing partial batch only if one exists. The previous
    # `i < batch` check re-yielded the last full batch (with an extra axis)
    # whenever the file count was a multiple of `batch`.
    if imgs:
        yield fnames, np.expand_dims(np.array(imgs), axis=-1)
    # The generator ends by returning; raising StopIteration explicitly is
    # turned into RuntimeError by PEP 479.
133+
134+
def step_decay(epoch):
    """Step learning-rate schedule.

    Starts at 0.001 and multiplies by 0.5 once for every full multiple of
    4 contained in ``1 + epoch`` (so the first drop happens at epoch 3).

    Args:
        epoch: epoch index the rate is requested for.

    Returns:
        The learning rate for that epoch.
    """
    base_lr, factor, period = 0.001, 0.5, 4.0
    n_drops = math.floor((1 + epoch) / period)
    return base_lr * math.pow(factor, n_drops)
140+
141+
class LR_hist(keras.callbacks.Callback):
    """Keras callback that records, per epoch, the training loss and the
    learning rate given by ``step_decay``.

    After training, ``losses`` and ``lr`` are lists with one entry per
    completed epoch.
    """

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of training."""
        self.losses = []
        self.lr = []

    def on_epoch_end(self, batch, logs=None):
        """Append this epoch's loss and scheduled learning rate.

        Args:
            batch: the epoch index supplied by Keras (the parameter name is
                kept as-is for backward compatibility).
            logs: metrics dict for the epoch; ``None`` is treated as empty.
        """
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        # Record the rate for the epoch that just finished. Using
        # len(self.losses) here evaluated the schedule one epoch ahead.
        # NOTE(review): relies on Keras passing a 0-based epoch index.
        self.lr.append(step_decay(batch))
148+
149+
150+
#load data
train_data_path = DATA_PATH + '/train/audio/'
labels, fnames = list_wavs_fname(train_data_path)

#visualize data
sample_file = '/yes/0a7c2a8d_nohash_0.wav'
sample_rate, samples = wavfile.read(train_data_path + sample_file)
freqs, times, spectrogram = log_specgram(samples, sample_rate)

# Mel spectrogram -> dB -> 13 MFCCs, plus first- and second-order deltas.
# The audio is passed by keyword and as float because librosa expects a
# floating-point signal, while wavfile.read typically returns integer PCM.
S = librosa.feature.melspectrogram(y=samples.astype(np.float32), sr=sample_rate, n_mels=128)
log_S = librosa.power_to_db(S, ref=np.max)
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)
delta_mfcc = librosa.feature.delta(mfcc, order=1)
delta2_mfcc = librosa.feature.delta(mfcc, order=2)
mfcc_feat = np.vstack((mfcc, delta_mfcc, delta2_mfcc))

# Figure 1: raw waveform above its log-spectrogram (shared time axis).
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=False)
# Time axis: one point per sample, ending at the clip duration in seconds.
# (Using sample_rate as the point count only worked for exactly 1 s clips.)
ax1.plot(np.linspace(0, len(samples) / float(sample_rate), len(samples)), samples)
ax1.set_title("raw wave of " + sample_file); ax1.set_ylabel("amplitude")
ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
ax2.set_yticks(freqs[::16]); ax2.set_xticks(times[::16])
ax2.set_title('spectrogram of ' + sample_file)
ax2.set_ylabel('freq in Hz'); ax2.set_xlabel('seconds')
plt.savefig('./figures/speech_features1.png')

# Figure 2: MFCC and delta-MFCC.
plt.figure()
ax1 = plt.subplot(2,1,1)
librosa.display.specshow(mfcc)
plt.title("MFCC")
ax2 = plt.subplot(2,1,2, sharex=ax1)
librosa.display.specshow(delta_mfcc, x_axis='time')
plt.title("delta MFCC")
plt.savefig('./figures/speech_features2.png')

# The leftover pdb.set_trace() breakpoint was removed so the script runs
# unattended through training and prediction.
186+
187+
#reduce training size
# zip() is materialised so it can be shuffled in place and sliced on both
# Python 2 and Python 3.
labels_fnames = list(zip(labels, fnames))
shuffle(labels_fnames)
# Keep 10% of the shuffled files (np.int was only an alias of int and has
# been removed from recent NumPy releases).
NUM_TRAIN = int(0.1 * len(labels_fnames))
#NUM_TRAIN = -1

print("loading training data...")
x_train, y_train = [], []
for label, fname in tqdm(labels_fnames[:NUM_TRAIN]):
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    samples = pad_audio(samples)
    # Recordings longer than SAMPLE_LEN are cut into random SAMPLE_LEN-long
    # windows; everything else is used as a single clip.
    if len(samples) > SAMPLE_LEN:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]

    for clip in n_samples:
        resampled = signal.resample(clip, int((NEW_SAMPLE_RATE / float(sample_rate)) * clip.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=NEW_SAMPLE_RATE)
        y_train.append(label)
        x_train.append(specgram)

# Stack the spectrograms and add a trailing channel axis for Conv2D.
x_train = np.array(x_train)
x_train = np.expand_dims(x_train, axis=-1)

# One-hot targets; the column order defines the class index -> name mapping
# used later to decode predictions.
y_train = label_transform(y_train)
label_index = y_train.columns.values
num_classes = len(label_index)
y_train = y_train.values

#free up memory
del labels, fnames, labels_fnames
gc.collect()
223+
224+
#TODO: try without re-sampling (more data)
225+
#TODO: try merging MFCC coefficients (multi-input model)
226+
#TODO: add batch normalization
227+
#TODO: check over-fitting on dev and add regularization
228+
#TODO: better pre-processing of the input
229+
230+
#training params
batch_size = 128
num_epochs = 16

#model parameters
# NOTE(review): 99 x 81 appears to be the log_specgram output shape for a
# SAMPLE_LEN clip resampled to NEW_SAMPLE_RATE with the default window
# settings - confirm if either constant changes.
img_rows = 99
img_cols = 81
weight_decay = 1e-4

#CNN architecture
print("training CNN ...")
model = Sequential()
model.add(BatchNormalization(input_shape=(img_rows, img_cols, 1)))

# Four stages of stacked 3x3 'same' convolutions, each closed by 2x2
# max-pooling: (filters, number of conv layers).
for n_filters, n_convs in [(32, 2), (64, 2), (128, 3), (128, 3)]:
    for _ in range(n_convs):
        model.add(Conv2D(n_filters, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head: two L2-regularised dense layers, then softmax.
model.add(Flatten())
model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()
271+
272+
#define callbacks
# Keep only the weights with the lowest validation loss seen so far.
file_name = SAVE_PATH + 'speech-weights-checkpoint.h5'
checkpoint = ModelCheckpoint(
    file_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
tensor_board = TensorBoard(log_dir='./logs', write_graph=True)
# hist_lr records loss / learning rate per epoch; reduce_lr applies step_decay.
hist_lr = LR_hist()
reduce_lr = LearningRateScheduler(step_decay)
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=8,
    verbose=1,
)
callbacks_list = [checkpoint, tensor_board, hist_lr, reduce_lr, early_stopping]

#model training
# validation_split=0.2 is passed so val_loss is available to the callbacks.
hist = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    callbacks=callbacks_list,
    validation_split=0.2,
    shuffle=True,
    verbose=2,
)

# Persist both the full model and the bare weights after the final epoch.
model.save(SAVE_PATH + 'speech_final_model.h5', overwrite=True)
model.save_weights(SAVE_PATH + 'speech_final_weights.h5', overwrite=True)
286+
287+
#load saved model
288+
#model = load_model(SAVE_PATH + 'speech_final_model.h5')
289+
290+
#model prediction
# Accumulate file names and decoded class names batch by batch.
index, results = [], []
for batch_fnames, batch_imgs in test_data_generator(batch=batch_size):
    class_ids = np.argmax(model.predict(batch_imgs), axis=-1)
    index.extend(batch_fnames)
    # label_index maps a one-hot column position back to its class name.
    results.extend(label_index[class_id] for class_id in class_ids)

#create a submission
submission_df = pd.DataFrame({'fname': index, 'label': results}, columns=['fname', 'label'])
submission_df.to_csv("./data/first_speech.csv", index=False)
305+
306+
307+
#generate plots
def _plot_train_val(metric, ylabel, legend_loc, out_file):
    """Plot the train and validation curves of one `hist` metric and save."""
    plt.figure()
    plt.plot(hist.history[metric], c='b', lw=2.0, label='train')
    plt.plot(hist.history['val_' + metric], c='r', lw=2.0, label='val')
    plt.title('TF speech model')
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend(loc=legend_loc)
    plt.savefig(out_file)


_plot_train_val('loss', 'Cross-Entropy Loss', 'upper right', './figures/speech_cnn_loss.png')

# Older Keras logs accuracy as 'acc'/'val_acc', newer releases as
# 'accuracy'/'val_accuracy'; use whichever key this run produced.
acc_key = 'acc' if 'acc' in hist.history else 'accuracy'
_plot_train_val(acc_key, 'Accuracy', 'upper left', './figures/speech_cnn_acc.png')

# Learning rate recorded per epoch by the LR_hist callback.
plt.figure()
plt.plot(hist_lr.lr, lw=2.0, label='learning rate')
plt.title('TF speech Model')
plt.xlabel('Epochs')
plt.ylabel('Learning Rate')
plt.legend()
plt.savefig('./figures/speech_learning_rate.png')

plot_model(model, show_shapes=True, to_file='./figures/speech_model.png')
335+
336+

0 commit comments

Comments
 (0)