2 votes

I made a fully-connected neural network with numpy, based on the Welch Labs videos, but when I try to train it I seem to get exploding gradients right from the start, which is weird. I'll put down the whole code, which is testable in Python 3+. Only costFunctionPrime seems to break the gradient descent, but I have no idea what is happening. Can someone smarter than me help?

EDIT: the trng_input and trng_output below are not the ones I actually use; I train on a much bigger dataset.

import numpy as np
import random

trng_input = [[random.random() for _ in range(7)] for _ in range(100)]
trng_output = [[random.random() for _ in range(2)] for _ in range(100)]

def relu(x):
    return x * (x > 0)

def reluprime(x):
    return (x>0).astype(x.dtype)


class Neural_Net():
    def __init__(self, data_input, data_output):
        self.data_input = data_input
        self.trng_output = trng_output
        self.bias = 0
        self.nodes = np.array([7, 2])
        self.LR = 0.01
        self.weightinit()
        self.training(1000, self.LR)

    def randomweight(self, n):
        output = []
        for i in range(n):
            output.append(random.uniform(-1,1))
        return output

    def weightinit(self):
        self.weights = []
        for n in range(len(self.nodes)-1):
            temp = []
            for _ in range(self.nodes[n]+self.bias):
                temp.append(self.randomweight(self.nodes[n+1]))
            self.weights.append(temp)
        self.weights = [np.array(tuple(self.weights[i])) for i in range(len(self.weights))]


    def forward(self, data):
        self.Z = []
        self.A = [np.array(data)]

        for layer in range(len(self.weights)):
            self.Z.append(np.dot(self.A[layer], self.weights[layer]))
            self.A.append(relu(self.Z[layer]))

        self.output = self.A[-1]
        return self.output

    def costFunction(self):
        self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
        return self.totalcost

    def costFunctionPrime(self):
        self.forward(self.data_input)
        self.delta = [[] for x in range(len(self.weights))]
        self.DcostDw = [[] for x in range(len(self.weights))]

        for layer in reversed(range(len(self.weights))):
            Zprime = reluprime(self.Z[layer])
            if layer == len(self.weights)-1:
                self.delta[layer] = np.multiply(-(self.trng_output-self.output), Zprime)
            else:
                self.delta[layer] = np.dot(self.delta[layer+1], self.weights[layer+1].T) * Zprime
            self.DcostDw[layer] = np.dot(self.A[layer].T, self.delta[layer])

        return self.DcostDw

    def backprop(self, LR):
        self.DcostDw = (np.array(self.DcostDw)*LR).tolist()
        self.weights = (np.array(self.weights) - np.array(self.DcostDw)).tolist()

    def training(self, iteration, LR):
        for i in range(iteration):
            self.costFunctionPrime()
            self.backprop(LR)
            if i % 1000 == 0:
                print(self.costFunction())
        print(sum(self.costFunction())/len(self.costFunction()))

NN = Neural_Net(trng_input, trng_output)

As asked, this is the expected result (the result I got using the sigmoid activation function):

As you can see, the numbers go down, so the network is training.

This is the result using the ReLU activation function:

Here the network is stuck and isn't getting trained; it never trains with the ReLU activation function, and I would like to understand why.

What are your actual and expected results? – David Dale
see the edited post – Tissuebox

2 Answers

2 votes

If your cost doesn't decrease with ReLU activation, it seems like your network is stuck in the region where the input to the ReLU is negative, so its output is a constant zero and no gradient flows back: the neuron is dead.
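
For intuition, here is a tiny check that reuses the question's relu and reluprime (repeated here so the snippet runs on its own) on a made-up pre-activation matrix; the specific numbers are just an illustration. Once a unit's pre-activation is negative for every sample, both its output and its local gradient are zero, so the weights feeding it never get updated:

import numpy as np

def relu(x):
    return x * (x > 0)

def reluprime(x):
    return (x > 0).astype(x.dtype)

# pre-activations Z for one layer: two samples x two neurons;
# the second neuron is negative for every sample, i.e. "dead"
Z = np.array([[0.3, -1.2],
              [0.7, -0.5]])

print(relu(Z))       # second column is all zeros -> constant zero output
print(reluprime(Z))  # second column is all zeros -> delta (and hence DcostDw) for that neuron is zero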

You can tackle this problem by using leaky ReLU instead of plain ReLU, and by training the biases as well. With ReLU it is recommended to initialize biases with small positive values, to avoid this dead-neuron problem.

For some problems it would also help to decrease the learning rate and make the network deeper. You might also want to make the learning rate adaptive, e.g. if the cost does not decrease, multiply LR by 0.5 (a short sketch of this follows the code below).

With leaky ReLU, trainable biases, and some refactoring, your model could look like this:

import numpy as np
trng_input = np.random.uniform(size=(1000, 7))
trng_output = np.column_stack([np.sin(trng_input).sum(axis=1), np.cos(trng_input).sum(axis=1)])

LEAK = 0.0001

def relu(x):
    # leaky ReLU: identity for positive inputs, a small slope (LEAK) for negative ones
    return x * (x > 0) + LEAK * x * (x < 0)

def reluprime(x):
    # its derivative: 1 for positive inputs, LEAK for negative inputs
    return (x > 0).astype(x.dtype) + LEAK * (x < 0).astype(x.dtype)


class Neural_Net():
    def __init__(self, data_input, data_output):
        self.data_input = data_input
        self.trng_output = data_output
        self.nodes = np.array([7, 10, 2])
        self.LR = 0.00001
        self.weightinit()
        self.training(2000, self.LR)

    def weightinit(self):
        self.weights = [np.random.uniform(-1, 1, size=self.nodes[i:(i+2)]) for i in range(len(self.nodes) - 1)]
        # biases start at positive values so the ReLU units begin in their active region
        self.biases = [np.random.uniform(0, 1, size=self.nodes[i+1]) for i in range(len(self.nodes) - 1)]

    def forward(self, data):
        self.Z = []
        self.A = [np.array(data)]
        for layer in range(len(self.weights)):
            self.Z.append(np.dot(self.A[layer], self.weights[layer]) + self.biases[layer])
            self.A.append(relu(self.Z[layer]))
        self.output = self.A[-1]
        return self.output

    def costFunction(self):
        self.totalcost = 0.5*np.sum((self.trng_output-self.output)**2, axis=0)
        return self.totalcost

    def costFunctionPrime(self):
        self.forward(self.data_input)
        self.delta = [[] for x in range(len(self.weights))]
        self.DcostDw = [[] for x in range(len(self.weights))]
        self.DcostDb = [[] for x in range(len(self.weights))]
        for layer in reversed(range(len(self.weights))):
            Zprime = reluprime(self.Z[layer])
            if layer == len(self.weights)-1:
                self.delta[layer] = np.multiply(-(self.trng_output-self.output), Zprime)
            else:
                self.delta[layer] = np.dot(self.delta[layer+1], self.weights[layer+1].T) * Zprime
            self.DcostDw[layer] = np.dot(self.A[layer].T, self.delta[layer])
            self.DcostDb[layer] = np.sum(self.delta[layer], axis=0)

    def backprop(self, LR):
        for layer in range(len(self.weights)):
            self.weights[layer] -= self.DcostDw[layer] * LR
            self.biases[layer] -= self.DcostDb[layer] * LR

    def training(self, iteration, LR):
        for i in range(iteration):
            self.costFunctionPrime()
            self.backprop(LR)
            if i % 100 == 0:  # print the cost every 100 iterations
                print(self.costFunction())
        print(sum(self.costFunction())/len(self.costFunction()))

NN = Neural_Net(trng_input, trng_output)
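
If you want to try the adjustable learning rate mentioned above, here is a minimal sketch on top of the NN instance created above, reusing its costFunctionPrime, backprop and costFunction methods; the loop count and the halving rule are just examples:

# keep training NN, halving the learning rate whenever the cost stops improving
LR = 0.00001
prev_cost = float('inf')
for i in range(1000):
    NN.costFunctionPrime()          # forward pass + gradient computation
    NN.backprop(LR)                 # weight/bias update with the current LR
    cost = NN.costFunction().sum()  # total cost over both output units
    if cost >= prev_cost:
        LR *= 0.5                   # no improvement -> take smaller steps
    prev_cost = cost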

2 votes

I think the problem lies in your cost function.

def costFunction(self):
    self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
    return self.totalcost

Specifically this line,

self.totalcost = 0.5*sum((self.trng_output-self.output)**2)

You have calculated the cost by summing the error over every sample. Since you mentioned that you use a very large dataset, self.totalcost will turn out to be very large, and the gradients, which are summed over the samples in the same way, will also be very large, so each weight update overshoots.

Try using stochastic gradient descent, or take the mean instead of the sum, like so:

self.totalcost = 0.5 * np.mean((self.trng_output-self.output)**2)