I have implemented a simple neural network with a single sigmoid hidden layer and a choice of either a sigmoid output layer with squared error loss or a softmax output layer with cross entropy loss. After much research on the softmax activation function, the cross entropy loss, and their derivatives (and after following this blog), I believe that my implementation is correct.
When learning the simple XOR function, the network with the sigmoid output quickly reaches a very small loss when using single binary targets of 0 and 1. However, when I change the labels to one-hot encodings, with [1, 0] = 0 and [0, 1] = 1, the softmax implementation does not work. The loss consistently increases as the network's output converges to exactly [0, 1] for every input, even though the labels in the data set are perfectly balanced between [0, 1] and [1, 0].
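For context, the softmax / cross entropy math I am trying to implement, written out as a minimal standalone sketch (the function names here are just for illustration, separate from my class below), is:

import numpy as np

def softmax(logits):
    # Subtract the max for numerical stability; the probabilities are unchanged
    shifted = logits - np.max(logits)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

def cross_entropy(probs, one_hot_label):
    # L = -sum_k y_k * log(p_k)
    return -np.sum(one_hot_label * np.log(probs))

logits = np.array([2.0, -1.0])
label = np.array([1.0, 0.0])  # one-hot target

probs = softmax(logits)
loss = cross_entropy(probs, label)
grad_logits = probs - label   # combined softmax + cross entropy gradient w.r.t. the logits

print(probs, loss, grad_logits)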
My code is below; the output activation (sigmoid or softmax) can be selected by uncommenting the corresponding two lines near the bottom of the code. I cannot figure out why the softmax implementation is not working.
import numpy as np


class MLP:
    def __init__(self, numInputs, numHidden, numOutputs, activation):
        self.numInputs = numInputs
        self.numHidden = numHidden
        self.numOutputs = numOutputs
        self.activation = activation.upper()

        self.IH_weights = np.random.rand(numInputs, numHidden)   # Input -> Hidden
        self.HO_weights = np.random.rand(numHidden, numOutputs)  # Hidden -> Output
        self.IH_bias = np.zeros((1, numHidden))
        self.HO_bias = np.zeros((1, numOutputs))

        # Gradients corresponding to weight matrices computed during backprop
        self.IH_w_gradients = np.zeros_like(self.IH_weights)
        self.HO_w_gradients = np.zeros_like(self.HO_weights)

        # Gradients corresponding to biases computed during backprop
        self.IH_b_gradients = np.zeros_like(self.IH_bias)
        self.HO_b_gradients = np.zeros_like(self.HO_bias)

        # Input, hidden and output layer neuron values
        self.I = np.zeros(numInputs)   # Inputs
        self.L = np.zeros(numOutputs)  # Labels
        self.H = np.zeros(numHidden)   # Hidden
        self.O = np.zeros(numOutputs)  # Output
    # ##########################################################################
    # ACTIVATION FUNCTIONS
    # ##########################################################################

    def sigmoid(self, x, derivative=False):
        # The derivative assumes x is already the sigmoid activation value
        if derivative:
            return x * (1 - x)
        return 1 / (1 + np.exp(-x))

    def softmax(self, prediction, label=None, derivative=False):
        # The derivative is the combined softmax + cross entropy gradient w.r.t. the logits
        if derivative:
            return prediction - label
        return np.exp(prediction) / np.sum(np.exp(prediction))
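    # Note: np.exp(prediction) can overflow for large logits. A numerically stable
    # variant (just a sketch of an alternative, not what the network above uses)
    # would shift by the maximum first, which leaves the result unchanged:
    #   shifted = prediction - np.max(prediction)
    #   return np.exp(shifted) / np.sum(np.exp(shifted))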
    # ##########################################################################
    # LOSS FUNCTIONS
    # ##########################################################################

    def squaredError(self, prediction, label, derivative=False):
        if derivative:
            return (-2 * prediction) + (2 * label)
        return (prediction - label) ** 2

    def crossEntropy(self, prediction, label, derivative=False):
        if derivative:
            return [-(y / x) for x, y in zip(prediction, label)]  # NOT NEEDED
        return -np.sum([y * np.log(x) for x, y in zip(prediction, label)])
    # ##########################################################################

    def forward(self, inputs):
        self.I = np.array(inputs).reshape(1, self.numInputs)  # [numInputs, ] -> [1, numInputs]
        self.H = self.I.dot(self.IH_weights) + self.IH_bias
        self.H = self.sigmoid(self.H)
        self.O = self.H.dot(self.HO_weights) + self.HO_bias
        if self.activation == 'SIGMOID':
            self.O = self.sigmoid(self.O)
        elif self.activation == 'SOFTMAX':
            self.O = self.softmax(self.O) + 1e-10  # prevents log(0)
        return self.O
    def backward(self, labels):
        self.L = np.array(labels).reshape(1, self.numOutputs)  # [numOutputs, ] -> [1, numOutputs]

        if self.activation == 'SIGMOID':
            self.O_error = self.squaredError(self.O, self.L)
            self.O_delta = self.squaredError(self.O, self.L, derivative=True) * self.sigmoid(self.O, derivative=True)
        elif self.activation == 'SOFTMAX':
            self.O_error = self.crossEntropy(self.O, self.L)
            self.O_delta = self.softmax(self.O, self.L, derivative=True)

        self.H_error = self.O_delta.dot(self.HO_weights.T)
        self.H_delta = self.H_error * self.sigmoid(self.H, derivative=True)

        self.IH_w_gradients += self.I.T.dot(self.H_delta)
        self.HO_w_gradients += self.H.T.dot(self.O_delta)
        self.IH_b_gradients += self.H_delta
        self.HO_b_gradients += self.O_delta

        return self.O_error
    def updateWeights(self, learningRate):
        self.IH_weights += learningRate * self.IH_w_gradients
        self.HO_weights += learningRate * self.HO_w_gradients
        self.IH_bias += learningRate * self.IH_b_gradients
        self.HO_bias += learningRate * self.HO_b_gradients

        # Reset accumulated gradients after each update
        self.IH_w_gradients = np.zeros_like(self.IH_weights)
        self.HO_w_gradients = np.zeros_like(self.HO_weights)
        self.IH_b_gradients = np.zeros_like(self.IH_bias)
        self.HO_b_gradients = np.zeros_like(self.HO_bias)

sigmoidData = [
    [[0, 0], 0],
    [[0, 1], 1],
    [[1, 0], 1],
    [[1, 1], 0]
]

softmaxData = [
    [[0, 0], [1, 0]],
    [[0, 1], [0, 1]],
    [[1, 0], [0, 1]],
    [[1, 1], [1, 0]]
]

sigmoidMLP = MLP(2, 10, 1, 'SIGMOID')
softmaxMLP = MLP(2, 10, 2, 'SOFTMAX')

# SIGMOID #######################
# data = sigmoidData
# mlp = sigmoidMLP
# ###############################

# SOFTMAX #######################
data = softmaxData
mlp = softmaxMLP
# ###############################
numEpochs = 5000
for epoch in range(numEpochs):
    losses = []

    for i in range(len(data)):
        print(mlp.forward(data[i][0]))  # Print outputs
        # mlp.forward(data[i][0])       # Don't print outputs
        loss = mlp.backward(data[i][1])
        losses.append(loss)

    mlp.updateWeights(0.001)

    # if epoch % 1000 == 0 or epoch == numEpochs - 1:  # Print loss every 1000 epochs
    print(np.mean(losses))  # Print loss every epoch