In theory, a one-hidden-layer neural network with m hidden nodes, where m >= n, can be trained by gradient descent to fit n data points with zero training error.
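In one dimension the existence part of this claim is easy to check constructively; here is a minimal sketch (illustrative only, separate from my training code below) that interpolates n points exactly with n - 1 ReLU units whose kinks sit at the sorted inputs:

import numpy as np

# Explicit ReLU interpolation: f(t) = y[0] + sum_i a[i] * relu(t - x[i]),
# with a[i] chosen so the piecewise-linear slope matches between consecutive points.
rng = np.random.default_rng(0)
n = 100
x = np.sort(rng.uniform(-3, 5, size=n))  # distinct sorted inputs
y = rng.normal(size=n)                   # arbitrary (random) targets

slopes = np.diff(y) / np.diff(x)                    # slope on each interval
a = np.concatenate(([slopes[0]], np.diff(slopes)))  # ReLU output weights

def interp(t):
    # relu(t - x_i) for each kink, weighted by a
    return y[0] + np.maximum(t[:, None] - x[None, :-1], 0.0) @ a

assert np.allclose(interp(x), y)  # zero training error with n - 1 hidden units

So the representational capacity is there; my question is about why gradient descent does not find such a solution.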
I have 100 data points (x, y), with x in R and y in R, following no specific pattern, just random noise. I fit those points with a one-hidden-layer network with 1000/2000/10000/... hidden nodes (trained with stochastic gradient descent and ReLU activations).
But I can't reach zero training error. Any idea what the problem is?
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation
from keras.optimizers import SGD
from keras import initializers
## initializing x_train and y_train randomly ##
def f1(x):
    if x < 3:
        return np.abs(x - 1)
    else:
        return -np.abs(x - 1) + 4
n = 100
x_train = np.random.uniform(-4+1, 4+1, size = n)
e = np.random.normal(0, 0.5, size = n)
y_train = np.vectorize(f1)(x_train) + e
np.random.shuffle(y_train)
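# note: shuffling y_train breaks the x-y relationship, so the targets are pure noise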
k = 10000 # number of hidden nodes
ep = 5
loss = []
model = Sequential()
model.add(Dense(k, kernel_initializer = 'random_normal', input_shape = (1,), use_bias=True))
model.add(Activation('relu'))
model.add(Dense(1, kernel_initializer = 'random_normal', use_bias=True))
#sgd = SGD(lr=0.00005, decay=1e-6, momentum=0.9)
sgd = SGD(lr=0.00008)
model.compile(loss='mse', optimizer=sgd, metrics = ['mse'])
for i in range(5000):
    H = model.fit(x_train, y_train, epochs=ep, verbose=False)
    wt = model.get_weights()  # snapshot of current weights (not used below)
    temp = H.history['mean_squared_error'][-1]  # key may be 'mse' in newer Keras versions
    print(temp)
    loss.append(temp)
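For reference, this is how I inspect the recorded loss afterwards (matplotlib assumed; not part of the training itself):

import matplotlib.pyplot as plt

plt.plot(loss)            # final MSE of each 5-epoch fit() call
plt.xlabel('outer iteration')
plt.ylabel('training MSE')
plt.yscale('log')         # makes (lack of) progress toward 0 visible
plt.show()

The curve plateaus well above zero no matter how many hidden nodes I use.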