For an image classification task, I want to change the activation function used in the output layer of the network from softmax to sigmoid, but after making this change I get an error (the error message is below; no error is reported when using the softmax function).

Softmax code before the change:

def build_graph(top_k):
    # with tf.device('/cpu:0'):
    keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name='keep_prob')
    images = tf.placeholder(dtype=tf.float32, shape=[None, 64, 64, 1], name='image_batch')
    labels = tf.placeholder(dtype=tf.int64, shape=[None], name='label_batch')

    conv_1 = slim.conv2d(images, 64, [3, 3], 1, padding='SAME', scope='conv1')
    max_pool_1 = slim.max_pool2d(conv_1, [2, 2], [2, 2], padding='SAME')
    conv_2 = slim.conv2d(max_pool_1, 128, [3, 3], padding='SAME', scope='conv2')
    max_pool_2 = slim.max_pool2d(conv_2, [2, 2], [2, 2], padding='SAME')
    conv_3 = slim.conv2d(max_pool_2, 256, [3, 3], padding='SAME', scope='conv3')
    max_pool_3 = slim.max_pool2d(conv_3, [2, 2], [2, 2], padding='SAME')

    flatten = slim.flatten(max_pool_3)
    fc1 = slim.fully_connected(slim.dropout(flatten, keep_prob), 1024, activation_fn=tf.nn.tanh, scope='fc1')
    logits = slim.fully_connected(slim.dropout(fc1, keep_prob), FLAGS.charset_size, activation_fn=None, scope='fc2')
        # logits = slim.fully_connected(flatten, FLAGS.charset_size, activation_fn=None, reuse=reuse, scope='fc')
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits, 1), labels), tf.float32))

    global_step = tf.get_variable("step", [], initializer=tf.constant_initializer(0.0), trainable=False)
    rate = tf.train.exponential_decay(2e-4, global_step, decay_steps=2000, decay_rate=0.97, staircase=True)
    train_op = tf.train.AdamOptimizer(learning_rate=rate).minimize(loss, global_step=global_step)
    probabilities = tf.nn.softmax(logits)

Sigmoid code after the change:

def build_graph(top_k):
    # with tf.device('/cpu:0'):
    keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name='keep_prob')
    images = tf.placeholder(dtype=tf.float32, shape=[None, 64, 64, 1], name='image_batch')
    labels = tf.placeholder(dtype=tf.int64, shape=[None], name='label_batch')

    conv_1 = slim.conv2d(images, 64, [3, 3], 1, padding='SAME', scope='conv1')
    max_pool_1 = slim.max_pool2d(conv_1, [2, 2], [2, 2], padding='SAME')
    conv_2 = slim.conv2d(max_pool_1, 128, [3, 3], padding='SAME', scope='conv2')
    max_pool_2 = slim.max_pool2d(conv_2, [2, 2], [2, 2], padding='SAME')
    conv_3 = slim.conv2d(max_pool_2, 256, [3, 3], padding='SAME', scope='conv3')
    max_pool_3 = slim.max_pool2d(conv_3, [2, 2], [2, 2], padding='SAME')

    flatten = slim.flatten(max_pool_3)
    fc1 = slim.fully_connected(slim.dropout(flatten, keep_prob), 1024, activation_fn=tf.nn.tanh, scope='fc1')
    logits = slim.fully_connected(slim.dropout(fc1, keep_prob), FLAGS.charset_size, activation_fn=None, scope='fc2')
        # logits = slim.fully_connected(flatten, FLAGS.charset_size, activation_fn=None, reuse=reuse, scope='fc')
    # loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits, 1), labels), tf.float32))

    global_step = tf.get_variable("step", [], initializer=tf.constant_initializer(0.0), trainable=False)
    rate = tf.train.exponential_decay(2e-4, global_step, decay_steps=2000, decay_rate=0.97, staircase=True)
    train_op = tf.train.AdamOptimizer(learning_rate=rate).minimize(loss, global_step=global_step)
    # probabilities = tf.nn.softmax(logits)
    probabilities = tf.sigmoid(logits)

Error message:

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Traceback (most recent call last):
  File "/usr/local/python3/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py", line 846, in merge_with
    self.assert_same_rank(other)
  File "/usr/local/python3/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py", line 891, in assert_same_rank
    other))
ValueError: Shapes (?,) and (?, 600) must have the same rank

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/python3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py", line 164, in sigmoid_cross_entropy_with_logits
    labels.get_shape().merge_with(logits.get_shape())
  File "/usr/local/python3/lib/python3.6/site-packages/tensorflow/python/framework/tensor_shape.py", line 852, in merge_with
    raise ValueError("Shapes %s and %s are not compatible" % (self, other))
ValueError: Shapes (?,) and (?, 600) are not compatible

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "chinese_rec_my_sigmoid.py", line 379, in <module>
    tf.app.run()
  File "/usr/local/python3/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 125, in run
    _sys.exit(main(argv))
  File "chinese_rec_my_sigmoid.py", line 346, in main
    train()
  File "chinese_rec_my_sigmoid.py", line 160, in train
    graph = build_graph(top_k=1)
  File "chinese_rec_my_sigmoid.py", line 116, in build_graph
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
  File "/usr/local/python3/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py", line 167, in sigmoid_cross_entropy_with_logits
    (logits.get_shape(), labels.get_shape()))
ValueError: logits and labels must have the same shape ((?, 600) vs (?,))

Note: (1) No error is reported when the softmax function is used. (2) 600 is the number of image classification categories.

1 Answer

The problem is in the line that calculates the loss with sigmoid_cross_entropy_with_logits. Unlike sparse_softmax_cross_entropy_with_logits, sigmoid_cross_entropy_with_logits expects the logits tensor and the labels tensor to have the same shape and type. In your case the logits have shape (BatchSize, 600), but the labels tensor has shape (BatchSize,).
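To illustrate the difference between the two APIs (a minimal standalone sketch; the tensor names and batch size here are made up, not taken from your code):

import tensorflow as tf

logits = tf.zeros([4, 600])                    # (batch, num_classes), float32
sparse_labels = tf.zeros([4], dtype=tf.int64)  # integer class indices, shape (batch,)
dense_labels = tf.zeros([4, 600])              # one target per class, shape (batch, num_classes)

# sparse_softmax_cross_entropy_with_logits takes integer class indices:
softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=sparse_labels)

# sigmoid_cross_entropy_with_logits needs labels with the same shape and dtype as logits:
sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=dense_labels)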

I think you should one-hot encode the labels into a (BatchSize, 600) tensor, if your label values are 1-600 (or 0-599), before using the sigmoid loss; one possible way is sketched below.
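For example, reusing the labels, logits, and FLAGS.charset_size names from your build_graph (an untested sketch, assuming the labels are integer class indices in the range 0 to charset_size - 1):

# Convert integer class indices (shape [BatchSize]) into one-hot vectors
# (shape [BatchSize, charset_size]) so they match the shape and dtype of logits.
one_hot_labels = tf.one_hot(labels, depth=FLAGS.charset_size)  # float32 by default
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=one_hot_labels))

# Accuracy can still be computed from the original integer labels:
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits, 1), labels), tf.float32))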

However, I believe softmax will give you better results than sigmoid for multi-class classification.