I have implemented a simple neural network with a single sigmoid hidden layer and a choice of either a sigmoid output layer with squared error loss or a softmax output layer with cross entropy loss. After much research on the softmax activation function, the cross entropy loss, and their derivatives (and after following this blog), I believe that my implementation is correct.
When learning the simple XOR function, the network with the sigmoid output quickly reaches a very small loss when using single binary targets of 0 and 1. However, when I change the labels to one-hot encodings, with [1, 0] = 0 and [0, 1] = 1, the softmax implementation does not work. The loss consistently increases as the network's output converges to exactly [0, 1] for every input, even though the labels in the data set are perfectly balanced between [0, 1] and [1, 0].
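For context, the softmax / cross entropy math I am trying to implement, written out as a minimal standalone sketch (the function names here are just for illustration, separate from my class below), is:

import numpy as np

def softmax(logits):
    # Subtract the max for numerical stability; the probabilities are unchanged
    shifted = logits - np.max(logits)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

def cross_entropy(probs, one_hot_label):
    # L = -sum_k y_k * log(p_k)
    return -np.sum(one_hot_label * np.log(probs))

logits = np.array([2.0, -1.0])
label = np.array([1.0, 0.0])  # one-hot target

probs = softmax(logits)
loss = cross_entropy(probs, label)
grad_logits = probs - label   # combined softmax + cross entropy gradient w.r.t. the logits

print(probs, loss, grad_logits)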
My code is below; the output activation (sigmoid or softmax) can be selected by uncommenting the corresponding two lines near the bottom of the code. I cannot figure out why the softmax implementation is not working.
import numpy as np


class MLP:
    def __init__(self, numInputs, numHidden, numOutputs, activation):
        self.numInputs = numInputs
        self.numHidden = numHidden
        self.numOutputs = numOutputs
        self.activation = activation.upper()

        self.IH_weights = np.random.rand(numInputs, numHidden)   # Input -> Hidden
        self.HO_weights = np.random.rand(numHidden, numOutputs)  # Hidden -> Output
        self.IH_bias = np.zeros((1, numHidden))
        self.HO_bias = np.zeros((1, numOutputs))

        # Gradients corresponding to weight matrices computed during backprop
        self.IH_w_gradients = np.zeros_like(self.IH_weights)
        self.HO_w_gradients = np.zeros_like(self.HO_weights)

        # Gradients corresponding to biases computed during backprop
        self.IH_b_gradients = np.zeros_like(self.IH_bias)
        self.HO_b_gradients = np.zeros_like(self.HO_bias)

        # Input, hidden and output layer neuron values
        self.I = np.zeros(numInputs)   # Inputs
        self.L = np.zeros(numOutputs)  # Labels
        self.H = np.zeros(numHidden)   # Hidden
        self.O = np.zeros(numOutputs)  # Output
    # ##########################################################################
    # ACTIVATION FUNCTIONS
    # ##########################################################################

    def sigmoid(self, x, derivative=False):
        # The derivative assumes x is already the sigmoid activation value
        if derivative:
            return x * (1 - x)
        return 1 / (1 + np.exp(-x))

    def softmax(self, prediction, label=None, derivative=False):
        # The derivative is the combined softmax + cross entropy gradient w.r.t. the logits
        if derivative:
            return prediction - label
        return np.exp(prediction) / np.sum(np.exp(prediction))
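    # Note: np.exp(prediction) can overflow for large logits. A numerically stable
    # variant (just a sketch of an alternative, not what the network above uses)
    # would shift by the maximum first, which leaves the result unchanged:
    #   shifted = prediction - np.max(prediction)
    #   return np.exp(shifted) / np.sum(np.exp(shifted))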
    # ##########################################################################
    # LOSS FUNCTIONS
    # ##########################################################################

    def squaredError(self, prediction, label, derivative=False):
        if derivative:
            return (-2 * prediction) + (2 * label)
        return (prediction - label) ** 2

    def crossEntropy(self, prediction, label, derivative=False):
        if derivative:
            return [-(y / x) for x, y in zip(prediction, label)]  # NOT NEEDED
        return -np.sum([y * np.log(x) for x, y in zip(prediction, label)])
    # ##########################################################################

    def forward(self, inputs):
        self.I = np.array(inputs).reshape(1, self.numInputs)  # [numInputs, ] -> [1, numInputs]
        self.H = self.I.dot(self.IH_weights) + self.IH_bias
        self.H = self.sigmoid(self.H)
        self.O = self.H.dot(self.HO_weights) + self.HO_bias
        if self.activation == 'SIGMOID':
            self.O = self.sigmoid(self.O)
        elif self.activation == 'SOFTMAX':
            self.O = self.softmax(self.O) + 1e-10  # prevents log(0)
        return self.O
    def backward(self, labels):
        self.L = np.array(labels).reshape(1, self.numOutputs)  # [numOutputs, ] -> [1, numOutputs]

        if self.activation == 'SIGMOID':
            self.O_error = self.squaredError(self.O, self.L)
            self.O_delta = self.squaredError(self.O, self.L, derivative=True) * self.sigmoid(self.O, derivative=True)
        elif self.activation == 'SOFTMAX':
            self.O_error = self.crossEntropy(self.O, self.L)
            self.O_delta = self.softmax(self.O, self.L, derivative=True)

        self.H_error = self.O_delta.dot(self.HO_weights.T)
        self.H_delta = self.H_error * self.sigmoid(self.H, derivative=True)

        self.IH_w_gradients += self.I.T.dot(self.H_delta)
        self.HO_w_gradients += self.H.T.dot(self.O_delta)
        self.IH_b_gradients += self.H_delta
        self.HO_b_gradients += self.O_delta

        return self.O_error
    def updateWeights(self, learningRate):
        self.IH_weights += learningRate * self.IH_w_gradients
        self.HO_weights += learningRate * self.HO_w_gradients
        self.IH_bias += learningRate * self.IH_b_gradients
        self.HO_bias += learningRate * self.HO_b_gradients

        # Reset accumulated gradients after each update
        self.IH_w_gradients = np.zeros_like(self.IH_weights)
        self.HO_w_gradients = np.zeros_like(self.HO_weights)
        self.IH_b_gradients = np.zeros_like(self.IH_bias)
        self.HO_b_gradients = np.zeros_like(self.HO_bias)

sigmoidData = [
    [[0, 0], 0],
    [[0, 1], 1],
    [[1, 0], 1],
    [[1, 1], 0]
]

softmaxData = [
    [[0, 0], [1, 0]],
    [[0, 1], [0, 1]],
    [[1, 0], [0, 1]],
    [[1, 1], [1, 0]]
]

sigmoidMLP = MLP(2, 10, 1, 'SIGMOID')
softmaxMLP = MLP(2, 10, 2, 'SOFTMAX')

# SIGMOID #######################
# data = sigmoidData
# mlp = sigmoidMLP
# ###############################

# SOFTMAX #######################
data = softmaxData
mlp = softmaxMLP
# ###############################
numEpochs = 5000
for epoch in range(numEpochs):
    losses = []

    for i in range(len(data)):
        print(mlp.forward(data[i][0]))  # Print outputs
        # mlp.forward(data[i][0])       # Don't print outputs
        loss = mlp.backward(data[i][1])
        losses.append(loss)

    mlp.updateWeights(0.001)

    # if epoch % 1000 == 0 or epoch == numEpochs - 1:  # Print loss every 1000 epochs
    print(np.mean(losses))  # Print loss every epoch