I am a bit stuck trying to train a fairly standard MLP model using Theano. My model code looks like this:
import numpy as np
import theano
import theano.tensor as T


class Layer(object):
    def __init__(self, inputs, n_in, n_out, activation=T.nnet.softmax):
        def weights(shape):
            return np.array(np.random.uniform(size=shape), dtype='float64')

        def biases(size):
            return np.zeros((size), dtype='float64')

        self.W = theano.shared(value=weights((n_in, n_out)), name='weights', borrow=True)
        self.b = theano.shared(value=biases(n_out), name='biases', borrow=True)
        self.output = activation(T.dot(inputs, self.W) + self.b)
        self.pred = T.argmax(self.output, axis=1)
        self.params = [self.W, self.b]


class MLP(object):
    def __init__(self, inputs, n_in, n_hidden, n_out):
        """For now, let's go with one hidden layer."""
        self._hidden = Layer(inputs, n_in, n_hidden, activation=T.tanh)
        self._output = Layer(self._hidden.output, n_hidden, n_out)  # softmax by default

    def loss(self, one_hot):
        return T.mean(T.sqr(one_hot - self._output.output))

    def accuracy(self, y):
        return T.mean(T.eq(self._output.pred, y))

    def updates(self, loss, rate=0.01):
        # plain SGD updates for every parameter of both layers
        updates = []
        updates.append((self._hidden.W, self._hidden.W - rate * T.grad(cost=loss, wrt=self._hidden.W)))
        updates.append((self._hidden.b, self._hidden.b - rate * T.grad(cost=loss, wrt=self._hidden.b)))
        updates.append((self._output.W, self._output.W - rate * T.grad(cost=loss, wrt=self._output.W)))
        updates.append((self._output.b, self._output.b - rate * T.grad(cost=loss, wrt=self._output.b)))
        return updates
Then I attempt to train it like this:
x = T.matrix('x', dtype='float64')
y = T.vector('y', dtype='int32')

# basic logistic model
# model = Layer(x, 784, 10, activation=T.nnet.softmax)
# basic multi-layer perceptron
model = MLP(x, 784, 128, 10)

labels = T.extra_ops.to_one_hot(y, 10)

# loss function
# loss = T.mean(T.sqr(labels - model.output))
loss = model.loss(labels)

# average number of correct predictions over a batch
# accuracy = T.mean(T.eq(model.pred, y))
accuracy = model.accuracy(y)

# updates
# rate = 0.05
# g_W = T.grad(cost=loss, wrt=model.W)
# g_b = T.grad(cost=loss, wrt=model.b)
# updates = [(model.W, model.W - rate * g_W),
#            (model.b, model.b - rate * g_b)]
updates = model.updates(loss, rate=0.3)

# batch index
index = T.scalar('batch index', dtype='int32')
size = T.scalar('batch size', dtype='int32')

train = theano.function([index, size],
                        [loss, accuracy],
                        updates=updates,
                        givens={x: train_set[0][index * size: (index + 1) * size],
                                y: train_set[1][index * size: (index + 1) * size]})

valid = theano.function([index, size],
                        [loss, accuracy],
                        givens={x: valid_set[0][index * size: (index + 1) * size],
                                y: valid_set[1][index * size: (index + 1) * size]})

test = theano.function([index, size],
                       [accuracy],
                       givens={x: test_set[0][index * size: (index + 1) * size],
                               y: test_set[1][index * size: (index + 1) * size]})

n_epochs = 10
batch_size = 500
# number of items in training dataset / batch size
batches_in_epoch = datasets[0][0].shape[0] // batch_size

losses = np.empty(0)
errors = np.empty(0)

for epoch in range(1, n_epochs + 1):
    epoch_losses = np.empty(0)
    epoch_errors = np.empty(0)

    for batch_n in range(batches_in_epoch):
        l, e = train(batch_n, batch_size)
        epoch_losses = np.append(epoch_losses, l)
        epoch_errors = np.append(epoch_errors, e)
        print('[%s]' % time.ctime(), 'epoch: ', epoch, 'batch: ', batch_n,
              'loss: ', np.round(l, 4), 'accuracy: ', np.round(e, 4))

    # shuffle train set every epoch
    shuffle = np.arange(datasets[0][1].shape[0])
    np.random.shuffle(shuffle)
    train_set[0] = train_set[0][shuffle]
    train_set[1] = train_set[1][shuffle]

    losses = np.concatenate([losses, epoch_losses])
    errors = np.concatenate([errors, epoch_errors])

    valid_l, valid_e = valid(0, datasets[1][0].shape[0])
    print('[%s]' % time.ctime(), 'epoch: ', epoch,
          'validation loss: ', valid_l, 'validation accuracy: ', valid_e)

acc = test(0, datasets[2][0].shape[0])
print()
print('Final accuracy: ', np.round(acc, 4)[0])
Now, if you look at the comments: I tried this with a basic logistic regression model and it worked; I got around 80% accuracy. But when I replace it with my MLP model, it doesn't work anymore. It doesn't converge to anything, and the accuracy stays at about 10%, which is no better than random guessing. What am I doing wrong? The data I am using is the MNIST dataset, loaded into shared variables the way the Theano tutorials do it.
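For completeness, this is roughly the loading pattern I mean. It is only a sketch following the Theano MNIST tutorial's `shared_dataset` helper; the file name `mnist.pkl.gz` and the `datasets`/`train_set`/`valid_set`/`test_set` names are just what the training code above expects, nothing special about my setup:

import gzip
import pickle

import numpy as np
import theano
import theano.tensor as T


def shared_dataset(data_xy, borrow=True):
    """Put a (features, labels) pair into Theano shared variables.

    Keeping each split in shared variables lets Theano copy it to the GPU once.
    Labels are stored as floats and cast back to int32 for use as indices.
    """
    data_x, data_y = data_xy
    # float64 here to match the model code above; the tutorial uses theano.config.floatX
    shared_x = theano.shared(np.asarray(data_x, dtype='float64'), borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype='float64'), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')


# the standard pickled MNIST file from the Theano tutorial:
# a (train, valid, test) tuple of (features, labels) numpy arrays
with gzip.open('mnist.pkl.gz', 'rb') as f:
    datasets = pickle.load(f, encoding='latin1')

train_set = list(shared_dataset(datasets[0]))
valid_set = list(shared_dataset(datasets[1]))
test_set = list(shared_dataset(datasets[2]))

With this layout, `datasets` keeps the raw numpy arrays (which is what the shape lookups in the training loop use), while `train_set[1]` ends up as a symbolic int32 cast of the shared labels, which is why `y` is declared with dtype 'int32' above.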