I have a problem with one of my applications. I'm trying to develop a system which manages to denoise (remove the background noise) from speech recordings. For that purpose, I generated artificial data by adding noise to clean speech. I tried training a LSTM Network. My code for the training is the following:
import os
import tensorflow as tf
import numpy as np
import random
import scipy.io.wavfile as wav
import scipy.signal as signal
import math
import tensorboard
from datetime import datetime
log_path = '../model/graph' + '/train_{}'.format(datetime.utcnow().strftime("%Y%m%d%H%M%S"))
def normalize(x, xmax, xmin):
for i in range(0,len(x)):
x[i] = (x[i] - xmin)/(xmax - xmin)
return x
def length(sequence):
used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
length = tf.reduce_sum(used, 1)
length = tf.cast(length, tf.int32)
return length
def perform_stft(data_vec, data_vec_val):
starting_point = 0
temp_vec = []
temp_vec_val = []
for samp in range(0, len(data_vec)):
_, _, stft = signal.stft(data_vec[samp], fs=sample_rate, nperseg=stft_size, return_onesided = False)
_, _, stft_val = signal.stft(data_vec_val[samp], fs=sample_rate, nperseg=stft_size, return_onesided = False)
stft = stft.real
stft_val = stft.real
segm_num = int(math.ceil(stft.shape[1]/lstm_seq_length))
placeholder_vec = np.empty((0,lstm_seq_length,stft_size))
placeholder_vec_val = np.empty((0,lstm_seq_length,stft_size))
for segm in range(0, segm_num):
placeholder = np.zeros((stft_size, lstm_seq_length))
placeholder_val = np.zeros((stft_size, lstm_seq_length))
curr_seq = segm*lstm_seq_length
if lstm_seq_length <= stft.shape[1]-curr_seq:
stft_seq = stft[:,curr_seq:curr_seq+lstm_seq_length].astype('float64')
stft_seq_val = stft_val[:,curr_seq:curr_seq+lstm_seq_length].astype('float64')
np.copyto(placeholder[:,:len(stft_seq)],stft_seq)
np.copyto(placeholder_val[:,:len(stft_seq_val)],stft_seq_val)
placeholder = np.transpose(placeholder)
placeholder = np.reshape(placeholder, (1,lstm_seq_length,stft_size))
placeholder_vec = np.append(placeholder_vec, placeholder, 0)
placeholder_val = np.transpose(placeholder_val)
placeholder_val = np.reshape(placeholder_val, (1,lstm_seq_length,stft_size))
placeholder_vec_val = np.append(placeholder_vec_val, placeholder_val, 0)
elif stft.shape[1]-curr_seq < lstm_seq_length:
rest = stft.shape[1]-curr_seq
placeholder[:,:rest] = stft[:,curr_seq:].astype('float64')
placeholder_val[:,:rest] = stft_val[:,curr_seq:].astype('float64')
placeholder = np.transpose(placeholder)
placeholder = np.reshape(placeholder, (1,lstm_seq_length,stft_size))
placeholder_vec = np.append(placeholder_vec, placeholder, 0)
placeholder_val = np.transpose(placeholder_val)
placeholder_val = np.reshape(placeholder_val, (1,lstm_seq_length,stft_size))
placeholder_vec_val = np.append(placeholder_vec_val, placeholder_val, 0)
temp_vec.append(placeholder_vec)
temp_vec_val.append(placeholder_vec_val)
xmax_fin = 0
xmin_fin = 0
for i in range(0,len(temp_vec)):
if temp_vec[i].max() > xmax_fin:
xmax_fin = temp_vec[i].max()
if temp_vec[i].min() < xmin_fin:
xmin_fin = temp_vec[i].min()
temp_vec = normalize(temp_vec, xmax_fin, xmin_fin)
temp_vec_val = normalize(temp_vec_val, xmax_fin, xmin_fin)
return temp_vec, temp_vec_val
# Settings and Directories
total_train_size = 1000
buffer_size = 200
batch_size = 50
epochs = 50
sample_rate = 44100
stft_size = 1024
hop_size = 512
lstm_seq_length = 100
num_layers = 8
folder_dir = "../data/output/"
train_dir = "train/"
train_valid_dir = "train_valid/"
test_dir = "test/"
test_valid_dir = "test_valid/"
train_data_names = []
test_data_names = []
for train_fileName in os.listdir(folder_dir + train_dir):
train_data_names.append(train_fileName[10:-4])
for test_fileName in os.listdir(folder_dir + test_dir):
test_data_names.append(test_fileName[10:-4])
file_num = len(train_data_names)
test_file_num = len(test_data_names)
# TF Vars and Graph
mixed_data = tf.placeholder(tf.float32, [None, lstm_seq_length, stft_size])
valid_data = tf.placeholder(tf.float32, [None, lstm_seq_length, stft_size])
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=stft_size, forget_bias = 1.0, state_is_tuple = True)
stacked_lstm = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(num_layers)])
rnn_outputs, final_state = tf.nn.dynamic_rnn(stacked_lstm, mixed_data, dtype=tf.float32, sequence_length=length(mixed_data), swap_memory=True, time_major=False)
mse_loss = tf.losses.mean_squared_error(rnn_outputs, valid_data)
train_optimizer = tf.train.AdamOptimizer(0.000001).minimize(mse_loss)
loss_value_ = tf.placeholder(tf.float32, shape=())
loss_valid_value_ = tf.placeholder(tf.float32, shape=())
cost_summary_train = tf.summary.scalar('Train Loss', loss_value_)
cost_summary_test = tf.summary.scalar('Test Loss', loss_valid_value_)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)
file_writer = tf.summary.FileWriter(log_path, tf.get_default_graph())
with sess.as_default():
loss_step = 0
for iteration in range(0,epochs):
for it in range(int(total_train_size/buffer_size)):
mix_data_list = []
clean_data_list = []
for rand_num in random.sample(range(0, file_num), buffer_size):
file_name_m = "final_mix_" + train_data_names[rand_num] + ".wav"
_, data_m = wav.read(folder_dir + train_dir + file_name_m)
mix_data_list.append(np.array(data_m, dtype='float32'))
file_name_c = "speech_" + train_data_names[rand_num] + ".wav"
_, data_c = wav.read(folder_dir + train_valid_dir + file_name_c)
data_c = np.array(data_c, dtype='float32')
clean_data_list.append(data_c)
loss_value = 0
seg_num = 0
for batch in range(int(buffer_size/batch_size)):
mix_spec, clean_spec = perform_stft(mix_data_list[batch*batch_size:(batch+1)*batch_size], clean_data_list[batch*batch_size:(batch+1)*batch_size])
for item in range(0, batch_size):
_, temp_loss_value, final_state_value, rnn_outputs_val = sess.run([train_optimizer, mse_loss, final_state, rnn_outputs], {mixed_data: mix_spec[item], valid_data: clean_spec[item]})
loss_value += temp_loss_value
seg_num +=len(mix_spec[item])
print("Epoch " + str(iteration+1) + " of " + str(epochs))
print("Buffer Seg " + str(it+1) + " of " + str(int(total_train_size/buffer_size)) + " (" + str(int(buffer_size/batch_size)) + " Batches)")
print("\tBatch Loss: " + str(loss_value) + "\tSegment No.: " + str(seg_num))
avg_loss_val = loss_value/seg_num
print("\tAverage Batch Loss: " + str(avg_loss_val))
mix_valid_list = []
clean_valid_list = []
for rand_num in random.sample(range(0, test_file_num), batch_size):
file_name_m = "final_mix_" + test_data_names[rand_num] + ".wav"
_, data_m = wav.read(folder_dir + test_dir + file_name_m)
mix_valid_list.append(np.array(data_m, dtype='float32'))
file_name_c = "speech_" + test_data_names[rand_num] + ".wav"
_, data_c = wav.read(folder_dir + test_valid_dir + file_name_c)
data_c = np.array(data_c, dtype='float32')
clean_valid_list.append(data_c)
mix_spec_val, clean_spec_val = perform_stft(mix_valid_list, clean_valid_list)
sum_loss = 0
sum_loss_val = 0
total_len = 0
for item in range(batch_size):
_, temp_loss, _, _ = sess.run([train_optimizer, mse_loss, final_state, rnn_outputs], {mixed_data: mix_spec[item], valid_data: clean_spec[item]})
_, temp_loss_val, _, _ = sess.run([train_optimizer, mse_loss, final_state, rnn_outputs], {mixed_data: mix_spec_val[item], valid_data: clean_spec_val[item]})
total_len += len(mix_spec[item])
sum_loss += temp_loss
sum_loss_val += temp_loss_val
sum_loss = sum_loss/total_len
sum_loss_val = sum_loss_val/total_len
sum_loss = sess.run(cost_summary_train, feed_dict={loss_value_: sum_loss})
sum_loss_val = sess.run(cost_summary_test, feed_dict={loss_valid_value_: sum_loss_val})
file_writer.add_summary(sum_loss, loss_step)
file_writer.add_summary(sum_loss_val, loss_step)
loss_step +=1
saver.save(sess, '../model/current_checkpoint.ckpt', global_step=iteration)
print("Model saved!")
saver.save(sess, '../model/final_checkpoint.ckpt')
print("Finished Training!")
file_writer.close()
The training appeared to go well. This is the loss-graph from Tensorboard:
I then tried loading the model in order to denoise an audio file. I even tried it with a file used during training. The code is the following:
import os
import tensorflow as tf
import numpy as np
import scipy.io.wavfile as wav
import scipy.signal as signal
import resampy
import math, random
def normalize(x):
xmax, xmin = x.max(), x.min()
x = (x - xmin)/(xmax - xmin)
return x, xmax, xmin
def length(sequence):
used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
length = tf.reduce_sum(used, 1)
length = tf.cast(length, tf.int32)
return length
def preprocess_file(rate, data):
resampled = resampy.resample(data, rate, 44100)
if resampled.ndim == 2:
resampled_mono = np.array(resampled[:, 0])
elif resampled.ndim == 1:
resampled_mono = np.array(resampled)
return resampled_mono
def perform_stft(data):
temp_vec = np.empty((0,lstm_seq_length,stft_size))
_, _, stft = signal.stft(data, fs=sample_rate, nperseg=stft_size, return_onesided = False)
stft = stft.real
segm_num = int(math.ceil(stft.shape[1]/lstm_seq_length))
for segm in range(0, segm_num):
placeholder = np.zeros((stft_size, lstm_seq_length))
curr_seq = segm*lstm_seq_length
if lstm_seq_length <= stft.shape[1]-curr_seq:
stft_seq = stft[:,curr_seq:curr_seq+lstm_seq_length].astype('float64')
np.copyto(placeholder[:,:len(stft_seq)],stft_seq)
placeholder = np.transpose(placeholder)
placeholder = np.reshape(placeholder, (1,lstm_seq_length,stft_size))
temp_vec = np.append(temp_vec, placeholder, 0)
elif stft.shape[1]-curr_seq < lstm_seq_length:
rest = stft.shape[1]-curr_seq
placeholder[:,:rest] = stft[:,curr_seq:].astype('float64')
placeholder = np.transpose(placeholder)
placeholder = np.reshape(placeholder, (1,lstm_seq_length,stft_size))
temp_vec = np.append(temp_vec, placeholder, 0)
temp_vec = np.array(temp_vec)
temp_vec, xmax, xmin = normalize(temp_vec)
return temp_vec, stft.shape[1], xmax, xmin
# Variables
sample_rate = 44100
stft_size = 1024
hop_size = 512
lstm_seq_length = 100
num_layers = 8
# TF Vars and Graph
input_data = tf.placeholder(tf.float32, [None, lstm_seq_length, stft_size])
lstm_cell = tf.contrib.rnn.BasicLSTMCell(stft_size, forget_bias = 1.0, state_is_tuple = True)
stacked_lstm = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(num_layers)])
rnn_outputs, final_state = tf.nn.dynamic_rnn(stacked_lstm, input_data, dtype=tf.float32, sequence_length=length(input_data), swap_memory=True, time_major=False)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)
with tf.Session() as sess:
saver.restore(sess, tf.train.latest_checkpoint('../model/'))
folder_dir = "../data/Denoise/"
file_name = "final_mix_1122.wav"
file_rate, file_orig = wav.read(str(folder_dir) + str(file_name))
file = preprocess_file(file_rate, file_orig)
spec, file_len, xmax, xmin = perform_stft(file)
_, rnn_outputs_value = sess.run([final_state, rnn_outputs], {input_data: spec})
rnn_outputs_value = np.transpose(rnn_outputs_value, [0, 2, 1])
rnn_outputs_value = np.reshape(rnn_outputs_value, (stft_size, rnn_outputs_value.shape[0]*lstm_seq_length))
outputData_STFT = np.zeros([stft_size, file_len])
outputData_STFT = rnn_outputs_value[:,:file_len]
outputData_STFT = (outputData_STFT - 0.5)*(xmax - xmin)
# Compute ISTFT
_, outputData_ISTFT = signal.istft(outputData_STFT, fs=sample_rate, nperseg=stft_size, input_onesided = False)
outputData_ISTFT = outputData_ISTFT.real
wav.write("../data/Denoised_stuff/" + "test.wav", file_rate, outputData_ISTFT.astype(np.int16))
print("File saved to ../data/Denoised_stuff/" + "test.wav")
The result, however, of this denoising attempt is more than disappointing. The output is nothing but noise with a "tick" every second. It becomes fairly obvious that the training failed completely when comparing the original file and the result of the "denoising":
Obviously nothing useful whatsoever was produced by the network.
Now my questions are:
Am I misinterpreting the loss graph?
I assumed that the training went well, as it appears to be the picturebook loss curve. (Or is a loss value of approx. 5.0000e-6 simply not sufficient?)
Is the way I try to transform the output of the RNN back to a WAV file wrong?
I thought it was sufficient to reshape the output, reverse the normalization and then use ISTFT to restore the audio file the network generated. (As done in the bottom of the second code block)
Am a training the network wrong?
Looking at the tensorflow website, the input is said to be [batch_size, max_time, depth] (as seen here: https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn ). Does that mean I need to input the sequences of an audio file one at a time, but can input the same sequence from several files in a batch? (I input one whole audio file at once, but split into several sequences.)
- If I made such a fundamental mistake during training, how did the training seem to go so well then?
I know that it is a lot of code, but I hope that somebody with more experience is able to spot my mistake quickly.
If more information is needed, I am more than happy to provide it!
Any further comments are also very much appreciated.
I will of course also provide updates, but training takes quite a while unfortunately.
Thanks in advance and I'm looking forward to your replies!
Bests