Neural network backpropagation algorithm not working in Python

Question

I am writing a neural network in Python, following the example here. It seems that the backpropagation algorithm isn't working, given that the neural network fails to produce the right value (within a margin of error) after being trained 10 thousand times. Specifically, I am training it to compute the sine function in the following example:

import numpy as np

class Neuralnet:
    def __init__(self, neurons):
        self.weights = []
        self.inputs = []
        self.outputs = []
        self.errors = []
        self.rate = .1
        for layer in range(len(neurons)):
            self.inputs.append(np.empty(neurons[layer]))
            self.outputs.append(np.empty(neurons[layer]))
            self.errors.append(np.empty(neurons[layer]))
        for layer in range(len(neurons)-1):
            self.weights.append(
                np.random.normal(
                    scale=1/np.sqrt(neurons[layer]), 
                    size=[neurons[layer], neurons[layer + 1]]
                    )
                )

    def feedforward(self, inputs):
        self.inputs[0] = inputs
        for layer in range(len(self.weights)):
            self.outputs[layer] = np.tanh(self.inputs[layer])
            self.inputs[layer + 1] = np.dot(self.weights[layer].T, self.outputs[layer])
        self.outputs[-1] = np.tanh(self.inputs[-1])

    def backpropagate(self, targets):
        gradient = 1 - self.outputs[-1] * self.outputs[-1]
        self.errors[-1] = gradient * (self.outputs[-1] - targets)
        for layer in reversed(range(len(self.errors) - 1)):
            gradient = 1 - self.outputs[layer] * self.outputs[layer]
            self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
        for layer in range(len(self.weights)):
            self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])

def xor_example():
    net = Neuralnet([2, 2, 1])
    for step in range(100000):
        net.feedforward([0, 0])
        net.backpropagate([-1])
        net.feedforward([0, 1])
        net.backpropagate([1])
        net.feedforward([1, 0])
        net.backpropagate([1])
        net.feedforward([1, 1])
        net.backpropagate([-1])
    net.feedforward([1, 1])
    print(net.outputs[-1])

def identity_example():
    net = Neuralnet([1, 3, 1])
    for step in range(100000):
        x = np.random.normal()
        net.feedforward([x])
        net.backpropagate([np.tanh(x)])
    net.feedforward([-2])
    print(net.outputs[-1])

def sine_example():
    net = Neuralnet([1, 6, 1])
    for step in range(100000):
        x = np.random.normal()
        net.feedforward([x])
        net.backpropagate([np.tanh(np.sin(x))])
    net.feedforward([3])
    print(net.outputs[-1])

sine_example()

The output fails to be close to tanh(sin(3)) = 0.140190616. I suspected a mistake involving wrong indices or alignment, but Numpy isn't raising any errors like these. Any tips on where I went wrong?

EDIT: I forgot to add the bias neurons. Here is the updated code:

import numpy as np

class Neuralnet:
    def __init__(self, neurons):
        self.weights = []
        self.outputs = []
        self.inputs = []
        self.errors = []
        self.offsets = []
        self.rate = .01
        for layer in range(len(neurons)-1):
            self.weights.append(
                np.random.normal(
                    scale=1/np.sqrt(neurons[layer]), 
                    size=[neurons[layer], neurons[layer + 1]]
                    )
                )
            self.outputs.append(np.empty(neurons[layer]))
            self.inputs.append(np.empty(neurons[layer]))
            self.errors.append(np.empty(neurons[layer]))
            self.offsets.append(np.random.normal(scale=1/np.sqrt(neurons[layer]), size=neurons[layer + 1]))
        self.inputs.append(np.empty(neurons[-1]))
        self.errors.append(np.empty(neurons[-1]))

    def feedforward(self, inputs):
        self.inputs[0] = inputs
        for layer in range(len(self.weights)):
            self.outputs[layer] = np.tanh(self.inputs[layer])
            self.inputs[layer + 1] = self.offsets[layer] + np.dot(self.weights[layer].T, self.outputs[layer])

    def backpropagate(self, targets):
        self.errors[-1] = self.inputs[-1] - targets
        for layer in reversed(range(len(self.errors) - 1)):
            gradient = 1 - self.outputs[layer] * self.outputs[layer]
            self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
        for layer in range(len(self.weights)):
            self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
            self.offsets[layer] -= self.rate * self.errors[layer + 1]

def sine_example():
    net = Neuralnet([1, 5, 1])
    for step in range(10000):
        x = np.random.uniform(-5, 5)
        net.feedforward([x])
        net.backpropagate([np.sin(x)])
    net.feedforward([np.pi])
    print(net.inputs[-1])

def xor_example():
    net = Neuralnet([2, 2, 1])
    for step in range(10000):
        net.feedforward([0, 0])
        net.backpropagate([-1])
        net.feedforward([0, 1])
        net.backpropagate([1])
        net.feedforward([1, 0])
        net.backpropagate([1])
        net.feedforward([1, 1])
        net.backpropagate([-1])
    net.feedforward([1, 1])
    print(net.outputs[-1])

def identity_example():
    net = Neuralnet([1, 3, 1])
    for step in range(10000):
        x = np.random.normal()
        net.feedforward([x])
        net.backpropagate([x])
    net.feedforward([-2])
    print(net.outputs[-1])

identity_example()

I have read the docs you referred but not able to find the corresponding algorithm for sin(2). Would you mind to further elaborate? — White
@White sin(2) is just the standard sine function, if that is your question. User1667423, at a first glance I don't see any errors. Have you tried a simpler function like logic functions (AND, OR, ...) of two inputs or even just the identity function? And try using only 1 or no hidden layer. — Robin Spiess
Why don't you use an existing framework like theano / lasagne or TensorFlow? — Martin Thoma
It's better to try out the existing framework, then come up something new. — caot

Anton Anton · Accepted Answer · 2016-01-09T03:34:30

I think you train the NN in the wrong way. You have a loop over 10000 iterations and feed a new sample in each cycle. The NN will never get trained in this case.

(the statement is wrong! See the update! )

What you need to do is to generate a large array of true samples Y = sin(X), give it to your network ONCE and iterate over the training set forwards and backwards, in order to minimize the cost function. To check the algorithm you may need to plot the cost function depending on the iteration number and make sure the cost goes down.

Another important point is the initialization of the weights. Your numbers are pretty large and the network will take a lot of time to converge, especially when using low rates. It's a good practice to generate the initial weights in some small range [-eps .. eps] uniformly.

In my code I implemented two different activation functions: sigmoid() and tanh(). You need to scale your inputs depending on the selected function: [0 .. 1] and [-1 .. 1] respectively.

Here are some images which show the cost function and the resulting predictions for sigmoid() and tanh() activation functions:

As you can see the sigmoid() activation gives a little bit better results, than the tanh().

Also I got much better predictions when using a network [1, 6, 1], compared to a bigger network with 4 layers [1, 6, 4, 1]. So the size of the NN is not always the crucial factor. Here is the prediction for the mentioned network with 4 layers:

Here is my code with some comments. I tried to use your notations where it was possible.

import numpy as np
import math
import matplotlib.pyplot as plt

class Neuralnet:
    def __init__(self, neurons, activation):
        self.weights = []
        self.inputs = []
        self.outputs = []
        self.errors = []
        self.rate = 0.5
        self.activation = activation    #sigmoid or tanh

        self.neurons = neurons
        self.L = len(self.neurons)      #number of layers

        eps = 0.12;    # range for uniform distribution   -eps..+eps              
        for layer in range(len(neurons)-1):
            self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))            


    ###################################################################################################    
    def train(self, X, Y, iter_count):

        m = X.shape[0];

        for layer in range(self.L):
            self.inputs.append(np.empty([m, self.neurons[layer]]))        
            self.errors.append(np.empty([m, self.neurons[layer]]))

            if (layer < self.L -1):
                self.outputs.append(np.empty([m, self.neurons[layer]+1]))
            else:
                self.outputs.append(np.empty([m, self.neurons[layer]]))

        #accumulate the cost function
        J_history = np.zeros([iter_count, 1])


        for i in range(iter_count):

            self.feedforward(X)

            J = self.cost(Y, self.outputs[self.L-1])
            J_history[i, 0] = J

            self.backpropagate(Y)


        #plot the cost function to check the descent
        plt.plot(J_history)
        plt.show()


    ###################################################################################################    
    def cost(self, Y, H):     
        J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
        return J

    ###################################################################################################
    def feedforward(self, X):

        m = X.shape[0];

        self.outputs[0] = np.concatenate(  (np.ones([m, 1]),   X),   axis=1)

        for i in range(1, self.L):
            self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T  )

            if (self.activation == 'sigmoid'):
                output_temp = self.sigmoid(self.inputs[i])
            elif (self.activation == 'tanh'):
                output_temp = np.tanh(self.inputs[i])


            if (i < self.L - 1):
                self.outputs[i] = np.concatenate(  (np.ones([m, 1]),   output_temp),   axis=1)
            else:
                self.outputs[i] = output_temp

    ###################################################################################################
    def backpropagate(self, Y):

        self.errors[self.L-1] = self.outputs[self.L-1] - Y

        for i in range(self.L - 2, 0, -1):

            if (self.activation == 'sigmoid'):
                self.errors[i] = np.dot(  self.errors[i+1],   self.weights[i][:, 1:]  ) *  self.sigmoid_prime(self.inputs[i])
            elif (self.activation == 'tanh'):
                self.errors[i] = np.dot(  self.errors[i+1],   self.weights[i][:, 1:]  ) *  (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])

        for i in range(0, self.L-1):
            grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
            self.weights[i] = self.weights[i] - self.rate*grad

    ###################################################################################################
    def sigmoid(self, z):
        s = 1.0/(1.0 + np.exp(-z))
        return s

    ###################################################################################################
    def sigmoid_prime(self, z):
        s = self.sigmoid(z)*(1 - self.sigmoid(z))
        return s    

    ###################################################################################################
    def predict(self, X, weights):

        m = X.shape[0];

        self.inputs = []
        self.outputs = []
        self.weights = weights

        for layer in range(self.L):
            self.inputs.append(np.empty([m, self.neurons[layer]]))        

            if (layer < self.L -1):
                self.outputs.append(np.empty([m, self.neurons[layer]+1]))
            else:
                self.outputs.append(np.empty([m, self.neurons[layer]]))

        self.feedforward(X)

        return self.outputs[self.L-1]


###################################################################################################
#                MAIN PART

activation1 = 'sigmoid'     # the input should be scaled into [ 0..1]
activation2 = 'tanh'        # the input should be scaled into [-1..1]

activation = activation1

net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function


##########################################################################################
#                TRAINING

m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); # input training set


Y = np.sin(X) # target

kx = 0.1 # noise parameter
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise # noisy target

# scaling of the target depending on the activation function
if (activation == 'sigmoid'):
    Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
    Y_scaled = Y/(1+kx)


# number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training

# gained weights
trained_weights = net.weights

##########################################################################################
#                 PREDICTION

m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);

Y_new = net.predict(X_new, trained_weights) # prediction

#rescaling of the result 
if (activation == 'sigmoid'):
    Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
    Y_new = Y_new * (1+kx)

# visualization
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
plt.show()

raw_input('press any key to exit')

UPDATE

I would like to take back the statement regarding the training method used in your code. The network can be indeed trained using only one sample per iteration. I got interesting results in online-training using both sigmoid and tanh activation functions:

Online-training using Sigmoid (cost function and prediction)

Online-training using Tanh (cost function and prediction)

As can be seen the choice of Sigmoid as activation function gives better performance. The cost function looks not that good as during the offline-training, but at least it tends to go down.

I plotted the cost function in your implementation, it looks pretty jerky as well:

May be it is a good idea to try your code with the sigmoid or even the ReLU function.

Here is the updated source code. To switch between online and offline training modes just change the method variable.

import numpy as np
import math
import matplotlib.pyplot as plt

class Neuralnet:
    def __init__(self, neurons, activation):
        self.weights = []
        self.inputs = []
        self.outputs = []
        self.errors = []
        self.rate = 0.2
        self.activation = activation    #sigmoid or tanh

        self.neurons = neurons
        self.L = len(self.neurons)      #number of layers

        eps = 0.12;    #range for uniform distribution   -eps..+eps              
        for layer in range(len(neurons)-1):
            self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))            


    ###################################################################################################    
    def train(self, X, Y, iter_count):

        m = X.shape[0];

        for layer in range(self.L):
            self.inputs.append(np.empty([m, self.neurons[layer]]))        
            self.errors.append(np.empty([m, self.neurons[layer]]))

            if (layer < self.L -1):
                self.outputs.append(np.empty([m, self.neurons[layer]+1]))
            else:
                self.outputs.append(np.empty([m, self.neurons[layer]]))

        #accumulate the cost function
        J_history = np.zeros([iter_count, 1])


        for i in range(iter_count):

            self.feedforward(X)

            J = self.cost(Y, self.outputs[self.L-1])
            J_history[i, 0] = J

            self.backpropagate(Y)


        #plot the cost function to check the descent
        #plt.plot(J_history)
        #plt.show()


    ###################################################################################################    
    def cost(self, Y, H):     
        J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
        return J


    ###################################################################################################
    def cost_online(self, min_x, max_x, iter_number):
        h_arr = np.zeros([iter_number, 1])
        y_arr = np.zeros([iter_number, 1])

        for step in range(iter_number):
            x = np.random.uniform(min_x, max_x, 1).reshape(1, 1)

            self.feedforward(x)
            h_arr[step, 0] = self.outputs[-1]
            y_arr[step, 0] = np.sin(x)



        J = np.sum(np.sum(np.power((y_arr - h_arr), 2), axis=0))/(2*iter_number)
        return J

    ###################################################################################################
    def feedforward(self, X):

        m = X.shape[0];

        self.outputs[0] = np.concatenate(  (np.ones([m, 1]),   X),   axis=1)

        for i in range(1, self.L):
            self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T  )

            if (self.activation == 'sigmoid'):
                output_temp = self.sigmoid(self.inputs[i])
            elif (self.activation == 'tanh'):
                output_temp = np.tanh(self.inputs[i])


            if (i < self.L - 1):
                self.outputs[i] = np.concatenate(  (np.ones([m, 1]),   output_temp),   axis=1)
            else:
                self.outputs[i] = output_temp

    ###################################################################################################
    def backpropagate(self, Y):

        self.errors[self.L-1] = self.outputs[self.L-1] - Y

        for i in range(self.L - 2, 0, -1):

            if (self.activation == 'sigmoid'):
                self.errors[i] = np.dot(  self.errors[i+1],   self.weights[i][:, 1:]  ) *  self.sigmoid_prime(self.inputs[i])
            elif (self.activation == 'tanh'):
                self.errors[i] = np.dot(  self.errors[i+1],   self.weights[i][:, 1:]  ) *  (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])

        for i in range(0, self.L-1):
            grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
            self.weights[i] = self.weights[i] - self.rate*grad


    ###################################################################################################
    def sigmoid(self, z):
        s = 1.0/(1.0 + np.exp(-z))
        return s

    ###################################################################################################
    def sigmoid_prime(self, z):
        s = self.sigmoid(z)*(1 - self.sigmoid(z))
        return s    

    ###################################################################################################
    def predict(self, X, weights):

        m = X.shape[0];

        self.inputs = []
        self.outputs = []
        self.weights = weights

        for layer in range(self.L):
            self.inputs.append(np.empty([m, self.neurons[layer]]))        

            if (layer < self.L -1):
                self.outputs.append(np.empty([m, self.neurons[layer]+1]))
            else:
                self.outputs.append(np.empty([m, self.neurons[layer]]))

        self.feedforward(X)

        return self.outputs[self.L-1]


###################################################################################################
#                MAIN PART

activation1 = 'sigmoid'     #the input should be scaled into [0..1]
activation2 = 'tanh'        #the input should be scaled into [-1..1]

activation = activation1

net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function


method1 = 'online'
method2 = 'offline'

method = method1

kx = 0.1 #noise parameter

###################################################################################################
#                TRAINING

if (method == 'offline'):

    m = 1000 #size of the training set
    X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); #input training set


    Y = np.sin(X) #target


    noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
    Y = Y + noise #noisy target

    #scaling of the target depending on the activation function
    if (activation == 'sigmoid'):
        Y_scaled = (Y/(1+kx) + 1)/2.0
    elif (activation == 'tanh'):
        Y_scaled = Y/(1+kx)


    #number of the iteration for the training stage
    iter_count = 20000
    net.train(X, Y_scaled, iter_count) #training

elif (method == 'online'):

    sampling_count = 100000 # number of samplings during the training stage


    m = 1 #batch size

    iter_count = sampling_count/m

    for layer in range(net.L):
        net.inputs.append(np.empty([m, net.neurons[layer]]))        
        net.errors.append(np.empty([m, net.neurons[layer]]))

        if (layer < net.L -1):
            net.outputs.append(np.empty([m, net.neurons[layer]+1]))
        else:
            net.outputs.append(np.empty([m, net.neurons[layer]]))    

    J_history = []
    step_history = []

    for i in range(iter_count):
        X = np.random.uniform(0, 4*math.pi, m).reshape(m, 1)

        Y = np.sin(X) #target
        noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
        Y = Y + noise #noisy target

        #scaling of the target depending on the activation function
        if (activation == 'sigmoid'):
            Y_scaled = (Y/(1+kx) + 1)/2.0
        elif (activation == 'tanh'):
            Y_scaled = Y/(1+kx)

        net.feedforward(X)
        net.backpropagate(Y_scaled)


        if (np.remainder(i, 1000) == 0):
            J = net.cost_online(0, 4*math.pi, 1000)
            J_history.append(J)
            step_history.append(i)

    plt.plot(step_history, J_history)
    plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
    #plt.ylim([0, 0.1])

    plt.show()

#gained weights
trained_weights = net.weights

##########################################################################################
#                 PREDICTION

m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);

Y_new = net.predict(X_new, trained_weights) #prediction

#rescaling of the result 
if (activation == 'sigmoid'):
    Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
    Y_new = Y_new * (1+kx)

#visualization

#fake sine curve to show the ideal signal
if (method == 'online'):
    X = np.linspace(0, 4*math.pi, num = 100)
    Y = np.sin(X)

plt.plot(X, Y)

plt.plot(X_new, Y_new, 'ro')
if (method == 'online'):
    plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
plt.ylim([-1.5, 1.5])
plt.show()

raw_input('press any key to exit')

Now I have some remarks to your current code:

Your sine function looks like this:

def sine_example():
    net = Neuralnet([1, 6, 1])
    for step in range(100000):
        x = np.random.normal()
        net.feedforward([x])
        net.backpropagate([np.tanh(np.sin(x))])
    net.feedforward([3])
    print(net.outputs[-1])

I don't know why you use tanh in your target input. If you really want to use tanh of sine as target, you need to scale it to [-1..1], because tanh(sin(x)) returns values in range [-0.76..0.76].

The next thing is the range of your training set. You use x = np.random.normal() to generate the samples. Here is the distribution of such an input:

After it you want your network to predict the sine of 3, but the network has almost never seen this number during the training stage. I would use the uniform distribution in a wider range for sample generation instead.

Neural network backpropagation algorithm not working in Python

1 Answers