I am trying to implement full (batch) gradient descent in Keras, meaning that each epoch performs a single weight update computed on the entire dataset. This is why the batch size is set to the length of the training set.
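For reference, this is what I mean by "full gradient descent" (a minimal sketch, assuming model is an already-compiled Keras model and x, y are the training arrays):

model.fit(x, y, batch_size=len(x), epochs=500, verbose=0)  # whole dataset = one batch per epoch

Here is the full code: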
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD,Adam
from keras import regularizers
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import seed
def xrange(start_point, end_point, N, base):
    temp = np.logspace(0.1, 1, N, base=base, endpoint=False)
    temp = temp - temp.min()
    temp = temp / temp.max()  # normalized to [0, 1]
    return (end_point - start_point) * temp + start_point  # rescaled to [start_point, end_point]
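# Side note: xrange returns a log-spaced grid, so the training points are dense
# near start_point and get sparser toward end_point. An illustrative check
# (my own addition, not part of the original script):
#   grid = xrange(-5.25, 5.25, 1024, 500)
#   print(np.diff(grid).min(), np.diff(grid).max())  # smallest vs. largest gap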
def train_model(x_train, y_train, x_test):
    #seed(1)
    model = Sequential()
    num_units = 100
    act = 'relu'
    model.add(Dense(num_units, input_shape=(1,), activation=act))
    model.add(Dense(num_units, activation=act))
    model.add(Dense(num_units, activation=act))
    model.add(Dense(num_units, activation=act))
    model.add(Dense(1, activation='tanh'))  # output layer: 1 unit, activation='tanh'
    model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mse'])
    # full-batch training: batch_size equals the training set size
    history = model.fit(x_train, y_train, batch_size=len(x_train), epochs=500, verbose=0, validation_split=0.2)
    fit = model.predict(x_test)
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    return fit
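# Side note: loss and val_loss above capture the per-epoch training curves but
# are currently unused; returning them as well would allow comparing the two
# runs, e.g.:
#   fit, loss, val_loss = train_model(...)
#   plt.plot(loss, label='train'); plt.plot(val_loss, label='validation'); plt.legend()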
N = 1024
start_point = -5.25
end_point = 5.25
base = 500  # base of the log used for the training grid
train_step = 0.0007
x_test = np.arange(start_point, end_point, train_step + 0.05)
x_train = xrange(start_point, end_point, N, base)
#random.shuffle(x_train)
function_y = np.sin(3 * x_train) / 2
noise = np.random.uniform(-0.2, 0.2, len(function_y))
y_train = function_y + noise
fit = train_model(x_train, y_train, x_test)
plt.scatter(x_train, y_train, facecolors='none', edgecolors='g')  # training data (green)
plt.scatter(x_test, fit, facecolors='none', edgecolors='b')       # model predictions (blue)
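For the shuffled run I only uncomment the random.shuffle(x_train) line. Note that the shuffle happens before function_y and y_train are computed, so every (x, y) pair is still consistent; only the order of the points changes. An equivalent numpy version with a fixed seed (my substitution for the stdlib call, useful for making the permutation reproducible):

rng = np.random.RandomState(0)                    # fixed seed: same permutation every run
x_train = x_train[rng.permutation(len(x_train))]  # reorder inputs before y_train is built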
However, when I uncomment the random.shuffle(x_train) line in order to shuffle the training inputs, I get a different plot:
I don't understand why the plots differ (the green circles are the training data and the blue circles are what the model learned): in both cases the batch contains the ENTIRE dataset, so the shuffle shouldn't change anything.
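To double-check my assumption that each epoch really runs as one single batch, the batches can be counted with a callback (a sketch using keras.callbacks.LambdaCallback; the batch_indices list is my own addition for illustration):

from keras.callbacks import LambdaCallback
batch_indices = []
count_batches = LambdaCallback(on_batch_end=lambda batch, logs: batch_indices.append(batch))
# pass callbacks=[count_batches] to model.fit(...); afterwards:
print(set(batch_indices))  # {0} means every epoch consisted of exactly one batch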
Thank you.
Ariel