Plateauing loss in neural style transfer

I am writing an implementation of style transfer by loading a vgg model from keras and supplying it to a tensorflow model.

I am using an adam optimizer. The loss function is reducing but it is very slow and plateaus off at about 10⁸. Also the style loss is huge (order of 10⁸) whereas content loss is much smaller(order of 10⁵). This is weird as the paper for style transfer says to scale content loss down by a factor of 100 or 1000 when calculating total loss.

I tried increasing the learning rate but that only makes the gradient overshoot.

I suspect there must be a bug in my implementation but despite searching endlessly I have been unable to find what's wrong.

Here's the code:

# coding: utf-8 # In[1]: from keras.applications.vgg16 import VGG16 from keras.models import Model import tensorflow as tf import tensorflow.contrib.eager as tfe import numpy as np import matplotlib.pyplot as plt # In[2]: content_image_path = './skyline.jpg' style_image_path = './starry_night.jpg' output_image_path = './output.jpg' # In[4]: from keras.preprocessing import image from keras.applications.vgg16 import preprocess_input # In[5]: content_image = image.load_img(content_image_path, target_size=(224, 224)) #plt.imshow(content_image) content_arr = image.img_to_array(content_image) content_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(content_arr, axis=0)), tf.float64) sess.run(tf.shape(content_arr)) # In[6]: style_image = image.load_img(style_image_path, target_size=(224, 224)) #plt.imshow(style_image) style_arr = image.img_to_array(style_image) style_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(style_arr, axis=0)), tf.float64) sess.run(tf.shape(style_arr)) # In[7]: #generate random image with pixel values b/w 0 -> 255 o_input = np.random.randint(low=0, high=256, size=(224, 224, 3)).astype('float64') plt.imshow(o_input) o_input_old = np.copy(o_input) o_input = preprocess_input(np.expand_dims(o_input, axis=0)) print(o_input_old) o_input_var = tf.Variable(o_input, name="gen_img_vector", trainable=True) # In[8]: content_model = VGG16(include_top=False, weights='imagenet', input_tensor=content_arr, input_shape=(224, 224, 3)) style_model = VGG16(include_top=False, weights='imagenet', input_tensor=style_arr, input_shape=(224, 224, 3)) train_model = VGG16(include_top=False, weights='imagenet', input_tensor=o_input_var, input_shape=(224, 224, 3)) # In[10]: content_model.summary() # In[11]: def get_feature_rep(layer_type, layer_names, model): outputs = for name in layer_names: out = model.get_layer(name=name).output N = tf.shape(out)[3]#number of channels M = tf.multiply(tf.shape(out)[1], tf.shape(out)[2])#product of dimensions out = tf.transpose(tf.reshape(out, (M, N)))#Flattens each channel into 1-D tensor & reshapes layer if layer_type == 'style': out = get_gram_matrix(out) print(out) outputs.append(out) return outputs # In[12]: def get_gram_matrix(F): G = tf.matmul(F, tf.transpose(F)) return G # In[13]: def style_loss(Gs, As): total = tf.Variable(tf.constant(0.0, tf.float64), name="style_loss", trainable=False) style_reps = list(zip(Gs, As)) for layer in style_reps: loss = tf.reduce_sum(tf.cast(tf.squared_difference(layer[0], layer[1]), tf.float64), [0, 1]) N_layer = tf.shape(layer[0])[0] M_layer = tf.shape(layer[0])[1] den = tf.square(tf.cast(tf.multiply(N_layer, M_layer), tf.float64)) loss = loss/den loss = loss*0.2/4.0 #weighting loss total = total + loss return total # In[14]: def content_loss(P, F): # loss = tf.Variable(tf.constant(0.0, tf.float64), name="content_loss", trainable=False) loss = tf.reduce_sum(tf.cast(tf.squared_difference(P, F), tf.float64), [0, 1]) loss = loss/2.0 return loss # In[15]: content_layer_names = ['block4_conv2'] style_layer_names = ['block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1'] # In[32]: P = tf.squeeze(get_feature_rep('content', content_layer_names, content_model)) # In[34]: F = tf.squeeze(get_feature_rep('content', content_layer_names, train_model)) # In[18]: #Each member of As consists of a feature map corresponding to a particular layer (dim. channels x pixels per channel) As = get_feature_rep('style', style_layer_names, style_model) # In[19]: Gs = get_feature_rep('style', style_layer_names, train_model) # In[20]: styleloss = style_loss(Gs, As) # In[21]: contentloss = content_loss(P, F) # In[22]: total_loss = tf.add(styleloss, tf.multiply(tf.constant(0.01, tf.float64), contentloss)) # In[23]: optimizer = tf.train.AdamOptimizer(5).minimize(total_loss, var_list=[o_input_var]) # In[26]: def reprocess(x): VGG_MEAN = [123.68, 116.78, 103.94] means = tf.reshape(tf.constant(VGG_MEAN, tf.float64), [1, 1, 3]) #Undo mean imagenet scale preprocessing x = tf.add(x, means) tf.clip_by_value(x, 0, 255) #bgr to rgb x = x[..., ::-1] return x # In[27]: saver = tf.train.Saver(tf.global_variables()) # In[28]: init = tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) # saver.restore(sess, './model/nst_model.ckpt') for epoch in range(100): _, styleloss_curr, contentloss_curr, loss_curr, new_arr = sess.run([optimizer, styleloss, contentloss, total_loss, o_input_var]) print('Epoch: %i Content Loss: %.2f Style Loss: %.2f Total Loss: %.2f' % (epoch, contentloss_curr, styleloss_curr, loss_curr)) if epoch % 15 == 0: saver.save(sess, './model/nst_model.ckpt') # In[30]: with tf.Session() as sess: new_arr = reprocess(new_arr) new_im = sess.run(tf.cast(tf.round(tf.squeeze(new_arr)), tf.uint8)) # new_im = new_im[...,::-1] # print(sess.run(new_arr[0]/255)) print(sess.run(tf.shape(new_im))) plt.imshow(new_im)

Why do you use tf.float64? This totally wrecks the performance of the system. Also instead of reduce sum and then somehow dividing by the shape, just use a reduce_mean.
– Thomas Pinetz
Jun 29 at 11:36

There were overflow errors, actually. And I'm not dividing by the shape exactly. According to the paper referenced above the factor of division is different. Still, I'll keep that in mind.
– CodeChef123
Jun 29 at 11:44

it may be nothing I am confused about your use of the total variable use tf.add_n it is prettier. Otherwise at first glance your code seems okay-ish. Could you try to use a reference tf implementation and do a diff step by step ?
– jean
Jun 29 at 12:19

By clicking "Post Your Answer", you acknowledge that you have read our updated terms of service, privacy policy and cookie policy, and that your continued use of the website is subject to these policies.

Search This Blog

Mgiyuk