I am trying to build my own neural network from scratch (without libraries such as Keras or TensorFlow) to better understand machine learning and neural networks. I have run into a problem: with certain layer configurations, gradient descent does not work properly. My understanding is that the values stored in each layer act as the derivatives with respect to that layer's weights, and that multiplying those derivatives together is what chains the weights nearer the input through to the output; however, simply multiplying the layer values together does not seem to work. For example, in a network with 2 input neurons, 3 hidden neurons and 1 output neuron, the derivative relating the cost to the set of weights that link the input and hidden layers must surely include a product of those per-layer derivatives (the values stored in each layer) in order to chain those weights through to the output.
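To make the 2-3-1 example concrete, here is a small standalone sketch of the chain-rule factors I mean, written in plain NumPy. This is separate from my actual code below; the names (x, W1, b1, W2, b2, target) and the random values are made up purely for illustration, and I am not certain this picture is right, which is partly why I am asking.

import numpy as np

def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_p(x):
    return sigmoid(x)*(1 - sigmoid(x))

# Made-up example: 2 inputs -> 3 hidden (sigmoid) -> 1 output (sigmoid)
rng = np.random.default_rng(0)
x = rng.standard_normal(2)            # input, shape (2,)
W1 = rng.standard_normal((2, 3))      # input -> hidden weights
b1 = rng.standard_normal(3)
W2 = rng.standard_normal((3, 1))      # hidden -> output weights
b2 = rng.standard_normal(1)
target = np.array([1.0])

# Forward pass
z1 = x @ W1 + b1                      # hidden pre-activation
a1 = sigmoid(z1)                      # hidden layer values
z2 = a1 @ W2 + b2                     # output pre-activation
pred = sigmoid(z2)

# Backward pass via the chain rule, with cost = (pred - target)**2
dcost_dpred = 2 * (pred - target)
dpred_dz2 = sigmoid_p(z2)
delta2 = dcost_dpred * dpred_dz2      # error at the output pre-activation
dcost_dW2 = np.outer(a1, delta2)      # dz2/dW2 is the hidden layer values a1

# To reach W1, the chain also passes back through W2 as well as through
# the hidden layer's own activation derivative:
delta1 = (delta2 @ W2.T) * sigmoid_p(z1)
dcost_dW1 = np.outer(x, delta1)       # dz1/dW1 is the input x

My actual implementation below tries to do the same thing for an arbitrary list of layers, and that is where it seems to break down.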
Here is the full code (to reproduce the error, try setting the variable network to [[3,1,None],[2,None,None],[1,None,None]]):
import numpy as np
import random
from matplotlib import pyplot as plt
def sigmoid(x):
    return 1/(1+np.exp(-x))
def sigmoid_p(x):
    return sigmoid(x)*(1 - sigmoid(x))
def network_propagation(weights,biases,activations,input_data):
    pre_funcs = []
    outputs = []
    input_layer = input_data
    for i in range(len(network)):
        pre_func = np.dot(input_layer,weights[i]) + biases[i]
        pre_funcs.append(pre_func)
        if activations[i]:
            output = activations[i](pre_func)
        else:
            output = pre_func
        outputs.append(output)
        input_layer = output
    return pre_funcs,outputs
def initialize_network(network):
    weights = []
    biases = []
    activations = []
    for layer in network:
        layer_weights = []
        layer_size = layer[0]
        input_size = layer[1]
        activation = layer[2]
        if input_size == None:
            input_size = network[network.index(layer)-1][0]
        activations.append(activation)
        biases.append(np.random.randn())
        for i in range(layer_size*input_size):
            layer_weights.append(np.random.randn())
        weights.append(np.reshape(np.array(layer_weights),(input_size,layer_size)))
    return weights,biases,activations
def train(data,answers,network,weights,biases,activations):
    learning_rate = 0.2
    loss_history = []
    learning_rate_history = []
    epochs = 20000
    threshold_value = 100
    threshold = False
    lowest_c = np.inf
    schedule = True
    best_weights = weights
    best_biases = biases
    for i in range(epochs):
        if threshold == False:
            ri = np.random.randint(len(data))
            point = data[ri]
            target = answers[ri]
            pre_funcs,outputs = network_propagation(weights,biases,activations,point)
            pred = outputs[-1]
            cost = np.square(pred - target)
            if i % 100 == 0:
                c = 0
                for j in range(len(data)):
                    p = data[j]
                    target = answers[j]
                    pre_funcs,outputs = network_propagation(weights,biases,activations,p)
                    p_pred = outputs[-1]
                    c += np.square(p_pred - target)
                loss_history.append(c)
            dcost_dpred = 2 * (pred - target)
            dpred_dz = sigmoid_p(pre_funcs[-1])
            #Changes start here
            dz_dweights = [[]] * len(weights)
            dz_dweights[0] = point
            # if activations[-1]:
            #     dz_dweights[0] = sigmoid_p(np.array(point))
            for i in range(0,len(pre_funcs[:-1])):
                if activations[i]:
                    dz_dweights[i+1] = sigmoid_p(pre_funcs[:-1][i])
                else:
                    dz_dweights[i+1] = pre_funcs[:-1][i]
                for j in range(len(dz_dweights)):
                    if np.array(dz_dweights[i-j]).tolist() and i-j > 0:
                        dz_dweights[i+1] *= dz_dweights[i-j]
            dz_dbias = 1
            dcost_dz = dcost_dpred*dpred_dz
            dcost_dweights = [[]] * len(weights)
            for i in range(len(dcost_dweights)):
                dcost_dweights[i] = np.dot(dcost_dz,[dz_dweights[i]])
            dcost_dbias = dcost_dz*dz_dbias
            for i in range(len(weights)):
                weights[i] -= learning_rate*dcost_dweights[i][0]
            for i in range(len(biases)):
                biases[i] -= learning_rate*np.array(dcost_dbias)
            acc = (1-c)*100
            if c < lowest_c:
                lowest_c = c
                best_weights = weights
                best_biases = biases
            if round(acc[0]) >= threshold_value:
                threshold = True
    return best_weights,best_biases,loss_history
def training_stats(loss_history,weights,biases,activations,data,answers):
    plt.plot(loss_history)
    pre_funcs,outputs = network_propagation(weights,biases,activations,data)
    answers = np.reshape(answers,outputs[-1].shape)
    loss = (outputs[-1] - answers) ** 2
    min_loss = sum(loss)[0]
    first_loss = loss_history[0]
    improvement = round(((first_loss[0] - min_loss)/first_loss[0]),0)
    max_acc = (1-min_loss)*100
    print('Minimum Loss:',round(min_loss,2))
    print('Improvement:',str(improvement*100)+'%'+' (From '+str(round(first_loss[0],2))+')')
    print('Highest Accuracy:',round(max_acc,2))
    print('Best Weights:',weights)
    print('Best Biases:',biases)
def normalize_data(data):
    data = np.array(data)
    data_shape = data.shape
    flatten = lambda l: [item for sublist in l for item in sublist]
    data = flatten(data)
    min_val = min(data)
    max_val = max(data)
    norm_data = []
    for term in data:
        term = (term-min_val)/(max_val-min_val)
        norm_data.append(term)
    norm_data = np.reshape(np.array(norm_data),data_shape)
    return norm_data
def prediction(pred_data,weights,biases,activations):
    pre_funcs,outputs = network_propagation(weights,biases,activations,pred_data)
    return outputs[-1]
# Each layer entry is [layer_size, input_size, activation]
network = [[1,1,None],[1,None,None]]
data = [[1],[2],[3],[4],[5]]
answers = [2,4,6,8,10]
weights,biases,activations = initialize_network(network)
weights,biases,loss_history = train(data,answers,network,weights,biases,activations)
training_stats(loss_history,weights,biases,activations,data,answers)
pred = prediction(data,weights,biases,activations)
Is my understanding wrong, or is my code faulty?