I am trying to build my own neural network from scratch (without libraries such as Keras or TensorFlow) to better understand machine learning and neural networks. I have run into a problem: with certain layer configurations, gradient descent does not work properly. My understanding is that the values stored in each layer act as the derivatives with respect to that layer's weights, and that multiplying those derivatives together is what chains the weights nearer the input through to the output; however, simply multiplying the layer values together does not seem to work. For example, in a network with 2 input neurons, 3 hidden neurons and 1 output neuron, the derivative relating the cost to the set of weights that link the input and hidden layers must surely include a product of those per-layer derivatives (the values stored in each layer) in order to chain those weights through to the output.
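To make the 2-3-1 example concrete, here is a small standalone sketch of the chain-rule factors I mean, written in plain NumPy. This is separate from my actual code below; the names (x, W1, b1, W2, b2, target) and the random values are made up purely for illustration, and I am not certain this picture is right, which is partly why I am asking.

import numpy as np

def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_p(x):
    return sigmoid(x)*(1 - sigmoid(x))

# Made-up example: 2 inputs -> 3 hidden (sigmoid) -> 1 output (sigmoid)
rng = np.random.default_rng(0)
x = rng.standard_normal(2)            # input, shape (2,)
W1 = rng.standard_normal((2, 3))      # input -> hidden weights
b1 = rng.standard_normal(3)
W2 = rng.standard_normal((3, 1))      # hidden -> output weights
b2 = rng.standard_normal(1)
target = np.array([1.0])

# Forward pass
z1 = x @ W1 + b1                      # hidden pre-activation
a1 = sigmoid(z1)                      # hidden layer values
z2 = a1 @ W2 + b2                     # output pre-activation
pred = sigmoid(z2)

# Backward pass via the chain rule, with cost = (pred - target)**2
dcost_dpred = 2 * (pred - target)
dpred_dz2 = sigmoid_p(z2)
delta2 = dcost_dpred * dpred_dz2      # error at the output pre-activation
dcost_dW2 = np.outer(a1, delta2)      # dz2/dW2 is the hidden layer values a1

# To reach W1, the chain also passes back through W2 as well as through
# the hidden layer's own activation derivative:
delta1 = (delta2 @ W2.T) * sigmoid_p(z1)
dcost_dW1 = np.outer(x, delta1)       # dz1/dW1 is the input x

My actual implementation below tries to do the same thing for an arbitrary list of layers, and that is where it seems to break down.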
Here is the full code (to reproduce the error, try setting the variable network to [[3,1,None],[2,None,None],[1,None,None]]):
import numpy as np
import random
from matplotlib import pyplot as plt
def sigmoid(x):
    return 1/(1+np.exp(-x))
def sigmoid_p(x):
    return sigmoid(x)*(1 - sigmoid(x))
def network_propagation(weights,biases,activations,input_data):
    pre_funcs = []
    outputs = []
    input_layer = input_data
    for i in range(len(network)):
        pre_func = np.dot(input_layer,weights[i]) + biases[i]
        pre_funcs.append(pre_func)
        if activations[i]:
            output = activations[i](pre_func)
        else:
            output = pre_func
        outputs.append(output)
        input_layer = output
    return pre_funcs,outputs
def initialize_network(network):
    weights = []
    biases = []
    activations = []
    for layer in network:
        layer_weights = []
        layer_size = layer[0]
        input_size = layer[1]
        activation = layer[2]
        if input_size == None:
            input_size = network[network.index(layer)-1][0]
        activations.append(activation)
        biases.append(np.random.randn())
        for i in range(layer_size*input_size):
            layer_weights.append(np.random.randn())
        weights.append(np.reshape(np.array(layer_weights),(input_size,layer_size)))
    return weights,biases,activations
def train(data,answers,network,weights,biases,activations):
    learning_rate = 0.2
    loss_history = []
    learning_rate_history = []
    epochs = 20000
    threshold_value = 100
    threshold = False
    lowest_c = np.inf
    schedule = True
    best_weights = weights
    best_biases = biases
    for i in range(epochs):
        if threshold == False:
            ri = np.random.randint(len(data))
            point = data[ri]
            target = answers[ri]
            pre_funcs,outputs = network_propagation(weights,biases,activations,point)
            pred = outputs[-1]
            cost = np.square(pred - target)
            if i % 100 == 0:
                c = 0
                for j in range(len(data)):
                    p = data[j]
                    target = answers[j]
                    pre_funcs,outputs = network_propagation(weights,biases,activations,p)
                    p_pred = outputs[-1]
                    c += np.square(p_pred - target)
                loss_history.append(c)
            dcost_dpred = 2 * (pred - target)
            dpred_dz = sigmoid_p(pre_funcs[-1])
            #Changes start here
            dz_dweights = [[]] * len(weights)
            dz_dweights[0] = point
            # if activations[-1]:
            #     dz_dweights[0] = sigmoid_p(np.array(point))
            for i in range(0,len(pre_funcs[:-1])):
                if activations[i]:
                    dz_dweights[i+1] = sigmoid_p(pre_funcs[:-1][i])
                else:
                    dz_dweights[i+1] = pre_funcs[:-1][i]
                for j in range(len(dz_dweights)):
                    if np.array(dz_dweights[i-j]).tolist() and i-j > 0:
                        dz_dweights[i+1] *= dz_dweights[i-j]
            dz_dbias = 1
            dcost_dz = dcost_dpred*dpred_dz
            dcost_dweights = [[]] * len(weights)
            for i in range(len(dcost_dweights)):
                dcost_dweights[i] = np.dot(dcost_dz,[dz_dweights[i]])
            dcost_dbias = dcost_dz*dz_dbias
            for i in range(len(weights)):
                weights[i] -= learning_rate*dcost_dweights[i][0]
            for i in range(len(biases)):
                biases[i] -= learning_rate*np.array(dcost_dbias)
            acc = (1-c)*100
            if c < lowest_c:
                lowest_c = c
                best_weights = weights
                best_biases = biases
            if round(acc[0]) >= threshold_value:
                threshold = True
    return best_weights,best_biases,loss_history
def training_stats(loss_history,weights,biases,activations,data,answers):
    plt.plot(loss_history)
    pre_funcs,outputs = network_propagation(weights,biases,activations,data)
    answers = np.reshape(answers,outputs[-1].shape)
    loss = (outputs[-1] - answers) ** 2
    min_loss = sum(loss)[0]
    first_loss = loss_history[0]
    improvement = round(((first_loss[0] - min_loss)/first_loss[0]),0)
    max_acc = (1-min_loss)*100
    print('Minimum Loss:',round(min_loss,2))
    print('Improvement:',str(improvement*100)+'%'+' (From '+str(round(first_loss[0],2))+')')
    print('Highest Accuracy:',round(max_acc,2))
    print('Best Weights:',weights)
    print('Best Biases:',biases)
def normalize_data(data):
    data = np.array(data)
    data_shape = data.shape
    flatten = lambda l: [item for sublist in l for item in sublist]
    data = flatten(data)
    min_val = min(data)
    max_val = max(data)
    norm_data = []
    for term in data:
        term = (term-min_val)/(max_val-min_val)
        norm_data.append(term)
    norm_data = np.reshape(np.array(norm_data),data_shape)
    return norm_data
def prediction(pred_data,weights,biases,activations):
    pre_funcs,outputs = network_propagation(weights,biases,activations,pred_data)
    return outputs[-1]
# Each layer entry is [layer_size, input_size, activation]
network = [[1,1,None],[1,None,None]]
data = [[1],[2],[3],[4],[5]]
answers = [2,4,6,8,10]
weights,biases,activations = initialize_network(network)
weights,biases,loss_history = train(data,answers,network,weights,biases,activations)
training_stats(loss_history,weights,biases,activations,data,answers)
pred = prediction(data,weights,biases,activations)
Is my understanding wrong, or is my code faulty?