Plateauing loss in neural style transfer
Plateauing loss in neural style transfer
I am writing an implementation of style transfer by loading a vgg model from keras and supplying it to a tensorflow model.
I am using an adam optimizer. The loss function is reducing but it is very slow and plateaus off at about 108. Also the style loss is huge (order of 108) whereas content loss is much smaller(order of 105). This is weird as the paper for style transfer says to scale content loss down by a factor of 100 or 1000 when calculating total loss.
I tried increasing the learning rate but that only makes the gradient overshoot.
I suspect there must be a bug in my implementation but despite searching endlessly I have been unable to find what's wrong.
Here's the code:
# coding: utf-8
# In[1]:
from keras.applications.vgg16 import VGG16
from keras.models import Model
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import matplotlib.pyplot as plt
# In[2]:
content_image_path = './skyline.jpg'
style_image_path = './starry_night.jpg'
output_image_path = './output.jpg'
# In[4]:
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
# In[5]:
content_image = image.load_img(content_image_path, target_size=(224, 224))
#plt.imshow(content_image)
content_arr = image.img_to_array(content_image)
content_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(content_arr, axis=0)), tf.float64)
sess.run(tf.shape(content_arr))
# In[6]:
style_image = image.load_img(style_image_path, target_size=(224, 224))
#plt.imshow(style_image)
style_arr = image.img_to_array(style_image)
style_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(style_arr, axis=0)), tf.float64)
sess.run(tf.shape(style_arr))
# In[7]:
#generate random image with pixel values b/w 0 -> 255
o_input = np.random.randint(low=0, high=256, size=(224, 224, 3)).astype('float64')
plt.imshow(o_input)
o_input_old = np.copy(o_input)
o_input = preprocess_input(np.expand_dims(o_input, axis=0))
print(o_input_old)
o_input_var = tf.Variable(o_input, name="gen_img_vector", trainable=True)
# In[8]:
content_model = VGG16(include_top=False, weights='imagenet', input_tensor=content_arr, input_shape=(224, 224, 3))
style_model = VGG16(include_top=False, weights='imagenet', input_tensor=style_arr, input_shape=(224, 224, 3))
train_model = VGG16(include_top=False, weights='imagenet', input_tensor=o_input_var, input_shape=(224, 224, 3))
# In[10]:
content_model.summary()
# In[11]:
def get_feature_rep(layer_type, layer_names, model):
outputs =
for name in layer_names:
out = model.get_layer(name=name).output
N = tf.shape(out)[3]#number of channels
M = tf.multiply(tf.shape(out)[1], tf.shape(out)[2])#product of dimensions
out = tf.transpose(tf.reshape(out, (M, N)))#Flattens each channel into 1-D tensor & reshapes layer
if layer_type == 'style':
out = get_gram_matrix(out)
print(out)
outputs.append(out)
return outputs
# In[12]:
def get_gram_matrix(F):
G = tf.matmul(F, tf.transpose(F))
return G
# In[13]:
def style_loss(Gs, As):
total = tf.Variable(tf.constant(0.0, tf.float64), name="style_loss", trainable=False)
style_reps = list(zip(Gs, As))
for layer in style_reps:
loss = tf.reduce_sum(tf.cast(tf.squared_difference(layer[0], layer[1]), tf.float64), [0, 1])
N_layer = tf.shape(layer[0])[0]
M_layer = tf.shape(layer[0])[1]
den = tf.square(tf.cast(tf.multiply(N_layer, M_layer), tf.float64))
loss = loss/den
loss = loss*0.2/4.0 #weighting loss
total = total + loss
return total
# In[14]:
def content_loss(P, F):
# loss = tf.Variable(tf.constant(0.0, tf.float64), name="content_loss", trainable=False)
loss = tf.reduce_sum(tf.cast(tf.squared_difference(P, F), tf.float64), [0, 1])
loss = loss/2.0
return loss
# In[15]:
content_layer_names = ['block4_conv2']
style_layer_names = ['block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1']
# In[32]:
P = tf.squeeze(get_feature_rep('content', content_layer_names, content_model))
# In[34]:
F = tf.squeeze(get_feature_rep('content', content_layer_names, train_model))
# In[18]:
#Each member of As consists of a feature map corresponding to a particular layer (dim. channels x pixels per channel)
As = get_feature_rep('style', style_layer_names, style_model)
# In[19]:
Gs = get_feature_rep('style', style_layer_names, train_model)
# In[20]:
styleloss = style_loss(Gs, As)
# In[21]:
contentloss = content_loss(P, F)
# In[22]:
total_loss = tf.add(styleloss, tf.multiply(tf.constant(0.01, tf.float64), contentloss))
# In[23]:
optimizer = tf.train.AdamOptimizer(5).minimize(total_loss, var_list=[o_input_var])
# In[26]:
def reprocess(x):
VGG_MEAN = [123.68, 116.78, 103.94]
means = tf.reshape(tf.constant(VGG_MEAN, tf.float64), [1, 1, 3])
#Undo mean imagenet scale preprocessing
x = tf.add(x, means)
tf.clip_by_value(x, 0, 255)
#bgr to rgb
x = x[..., ::-1]
return x
# In[27]:
saver = tf.train.Saver(tf.global_variables())
# In[28]:
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
# saver.restore(sess, './model/nst_model.ckpt')
for epoch in range(100):
_, styleloss_curr, contentloss_curr, loss_curr, new_arr = sess.run([optimizer, styleloss, contentloss, total_loss, o_input_var])
print('Epoch: %i Content Loss: %.2f Style Loss: %.2f Total Loss: %.2f' % (epoch, contentloss_curr, styleloss_curr, loss_curr))
if epoch % 15 == 0:
saver.save(sess, './model/nst_model.ckpt')
# In[30]:
with tf.Session() as sess:
new_arr = reprocess(new_arr)
new_im = sess.run(tf.cast(tf.round(tf.squeeze(new_arr)), tf.uint8))
# new_im = new_im[...,::-1]
# print(sess.run(new_arr[0]/255))
print(sess.run(tf.shape(new_im)))
plt.imshow(new_im)
There were overflow errors, actually. And I'm not dividing by the shape exactly. According to the paper referenced above the factor of division is different. Still, I'll keep that in mind.
– CodeChef123
Jun 29 at 11:44
it may be nothing I am confused about your use of the total variable use tf.add_n it is prettier. Otherwise at first glance your code seems okay-ish. Could you try to use a reference tf implementation and do a diff step by step ?
– jean
Jun 29 at 12:19
By clicking "Post Your Answer", you acknowledge that you have read our updated terms of service, privacy policy and cookie policy, and that your continued use of the website is subject to these policies.
Why do you use tf.float64? This totally wrecks the performance of the system. Also instead of reduce sum and then somehow dividing by the shape, just use a reduce_mean.
– Thomas Pinetz
Jun 29 at 11:36