0
votes

I am training the "Show and tell" model using tensorflow in which the model automatically generates the captions of the images. How ever I am getting this error.

This is the traceback:

------------------------------------------------------------------------
---
ValueError                                Traceback (most recent call 
last)
<ipython-input-36-b6da0a27b701> in <module>()
  1 try:
  2     #train(.001,False,False) #train from scratch
----> 3     train(.001,True,True)    #continue training from pretrained weights @epoch500
  4     #train(.001)  #train from previously saved weights
  5 except KeyboardInterrupt:
  ipython-input-35-39693d0edd0a> in train(learning_rate, continue_training, transfer)
 31     learning_rate = tf.train.exponential_decay(learning_rate, global_step,
 32                                        int(len(index)/batch_size), 0.95)
---> 33     train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
 34     tf.global_variables_initializer().run()
 35 
/home/niraj/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/optimizer.pyc in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
320           "No gradients provided for any variable, check your graph for ops"
321           " that do not support gradients, between variables %s 
and loss %s." %
--> 322           ([str(v) for _, v in grads_and_vars], loss))
323 
324     return self.apply_gradients(grads_and_vars, 
global_step=global_step,

ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables ["tf.Variable 'word_embedding:0' shape=(2943, 256) dtype=float32_ref>", "tf.Variable 'embedding_bias:0' shape=(256,) dtype=float32_ref>", "tf.Variable 'img_embedding:0' shape=(4096, 256) dtype=float32_ref>", "tf.Variable 'img_embedding_bias:0' shape=(256,) dtype=float32_ref>", "tf.Variable 'word_encoding:0' shape=(256, 2943) dtype=float32_ref>", "tf.Variable 'word_encoding_bias:0' shape=(2943,) dtype=float32_ref>"] and loss Tensor("RNN/div:0", shape=(), dtype=float32).

I know that the error is due to the fact that there is a variable which doesen't holds the gradient during optimisation which in turn is cutting the graph but I am unable to pick it out.

I am using already trained VGG-net 16 model parameters and the FLICKR-30 image dataset having corresponding annotations.

This is the code:

def get_data(annotation_path, feature_path):
 annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'])
 return np.load(feature_path,'r'), annotations['caption'].values

def preProBuildWordVocab(sentence_iterator, word_count_threshold=30): # function from Andre Karpathy's NeuralTalk
print('preprocessing %d word vocab' % (word_count_threshold, ))
word_counts = {}
nsents = 0
for sent in sentence_iterator:
  nsents += 1
  for w in sent.lower().split(' '):
    word_counts[w] = word_counts.get(w, 0) + 1
vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))


ixtoword = {}
ixtoword[0] = '.'  
wordtoix = {}
wordtoix['#START#'] = 0 
ix = 1
for w in vocab:
  wordtoix[w] = ix
  ixtoword[ix] = w
  ix += 1

word_counts['.'] = nsents
bias_init_vector = np.array([1.0*word_counts[ixtoword[i]] for i in ixtoword])
bias_init_vector /= np.sum(bias_init_vector) 
bias_init_vector = np.log(bias_init_vector)
bias_init_vector -= np.max(bias_init_vector) 
return wordtoix, ixtoword, bias_init_vector.astype(np.float32)

class Caption_Generator():
def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b):

    self.dim_in = dim_in
    self.dim_embed = dim_embed
    self.dim_hidden = dim_hidden
    self.batch_size = batch_size
    self.n_lstm_steps = n_lstm_steps
    self.n_words = n_words

    # declare the variables to be used for our word embeddings
    with tf.device("/cpu:0"):
        self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')

    self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')

    # declare the LSTM itself
    self.lstm = tf.contrib.rnn.BasicLSTMCell(dim_hidden)

    # declare the variables to be used to embed the image feature embedding to the word embedding space
    self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_hidden], -0.1, 0.1), name='img_embedding')
    self.img_embedding_bias = tf.Variable(tf.zeros([dim_hidden]), name='img_embedding_bias')

    # declare the variables to go from an LSTM output to a word encoding output
    self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')
    # initialize this bias variable from the preProBuildWordVocab output
    self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')

def build_model(self):
    # declaring the placeholders for our extracted image feature vectors, our caption, and our mask
    # (describes how long our caption is with an array of 0/1 values of length `maxlen`  
    img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])
    caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
    mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

    # getting an initial LSTM embedding from our image_imbedding
    image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias

    # setting initial state of our LSTM
    state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)

    total_loss = 0.0
    with tf.variable_scope("RNN"):
        for i in range(self.n_lstm_steps): 
            if i > 0:
               #if this isn’t the first iteration of our LSTM we need to get the word_embedding corresponding
               # to the (i-1)th word in our caption 
                with tf.device("/cpu:0"):
                    current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias
            else:
                 #if this is the first iteration of our LSTM we utilize the embedded image as our input 
                current_embedding = image_embedding
                if i > 0: 
                # allows us to reuse the LSTM tensor variable on each iteration
                    tf.get_variable_scope().reuse_variables()

                    out, state = self.lstm(current_embedding, state)
                    #out, state = self.tf.nn.dynamic_rnn(current_embedding, state)


                if i > 0:
                #get the one-hot representation of the next word in our caption 
                    labels = tf.expand_dims(caption_placeholder[:, i], 1)
                    ix_range=tf.range(0, self.batch_size, 1)
                    ixs = tf.expand_dims(ix_range, 1)
                    concat = tf.concat([ixs, labels],1)
                    onehot = tf.sparse_to_dense(
                        concat, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)


                #perform a softmax classification to generate the next word in the caption
                    logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                    xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=onehot)
                    xentropy = xentropy * mask[:,i]

                    loss = tf.reduce_sum(xentropy)
                    total_loss += loss

        total_loss = total_loss / tf.reduce_sum(mask[:,1:])
        return total_loss, img,  caption_placeholder, mask


### Parameters ###
dim_embed = 256
dim_hidden = 256
dim_in = 4096
batch_size = 128
momentum = 0.9
n_epochs = 150

def train(learning_rate=0.001, continue_training=False, transfer=True):

tf.reset_default_graph()

feats, captions = get_data(annotation_path, feature_path)
wordtoix, ixtoword, init_b = preProBuildWordVocab(captions)

np.save('data/ixtoword', ixtoword)

index = (np.arange(len(feats)).astype(int))
np.random.shuffle(index)


sess = tf.InteractiveSession()
n_words = len(wordtoix)
maxlen = np.max( [x for x in map(lambda x: len(x.split(' ')), captions) ] )
caption_generator = Caption_Generator(dim_in, dim_hidden, dim_embed, batch_size, maxlen+2, n_words, init_b)

loss, image, sentence, mask = caption_generator.build_model()

saver = tf.train.Saver(max_to_keep=100)
global_step=tf.Variable(0,trainable=False)
learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                   int(len(index)/batch_size), 0.95)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
tf.global_variables_initializer().run()

if continue_training:
    if not transfer:
        saver.restore(sess,tf.train.latest_checkpoint(model_path))
    else:
        saver.restore(sess,tf.train.latest_checkpoint(model_path_transfer))
losses=[]
for epoch in range(n_epochs):
    for start, end in zip( range(0, len(index), batch_size), range(batch_size, len(index), batch_size)):

        current_feats = feats[index[start:end]]
        current_captions = captions[index[start:end]]
        current_caption_ind = [x for x in map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ')[:-1] if word in wordtoix], current_captions)]

        current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=maxlen+1)
        current_caption_matrix = np.hstack( [np.full( (len(current_caption_matrix),1), 0), current_caption_matrix] )

        current_mask_matrix = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
        nonzeros = np.array([x for x in map(lambda x: (x != 0).sum()+2, current_caption_matrix )])

        for ind, row in enumerate(current_mask_matrix):
            row[:nonzeros[ind]] = 1

        _, loss_value = sess.run([train_op, loss], feed_dict={
            image: current_feats.astype(np.float32),
            sentence : current_caption_matrix.astype(np.int32),
            mask : current_mask_matrix.astype(np.float32)
            })

        print("Current Cost: ", loss_value, "\t Epoch {}/{}".format(epoch, n_epochs), "\t Iter {}/{}".format(start,len(feats)))
    print("Saving the model from epoch: ", epoch)
    saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
1

1 Answers

0
votes

Branching in the loss building routine is invalid.

with tf.variable_scope("RNN"):
        for i in range(self.n_lstm_steps): 
            if i > 0:
                [...]
            else:
                [...]
                if i > 0: 
                    [...]
                if i > 0:
                    [...]

Note, that last two ifs will never run, as they are in the else clause, meaning that i <= 0. Consequently your loss is actually a constant, equal 0, and thus TF do not see how to optimise it wrt. variables.