I have the following simple LSTM network:
class LSTMModel(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: Number of features in each time step of the input.
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers (passed to ``nn.LSTM`` as
            its third positional argument). This is NOT the sequence
            length; the sequence length is taken from the input itself.
        output_dim: Number of features produced by the final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Not read anywhere in this class; kept so existing attribute
        # access on instances keeps working.
        self.batch_size = None
        self.hidden = None

    def forward(self, x):
        """Run the LSTM over ``x`` and project the last time step.

        Args:
            x: Batch-first input; the first axis is the batch and the
                second axis is the time axis (``batch_first=True``).

        Returns:
            The linear layer applied to the LSTM output at the final time
            step, one row per batch element.
        """
        h0, c0 = self.init_hidden(x)
        out, _ = self.rnn(x, (h0, c0))
        # Only the output of the last time step feeds the linear layer.
        return self.fc(out[:, -1, :])

    def init_hidden(self, x):
        """Return zero-filled initial hidden and cell states for ``x``.

        The states are created with the dtype and device of ``x`` so the
        model keeps working after ``.to(device)`` / ``.double()``.

        Returns:
            A two-element list ``[h0, c0]``, each of shape
            ``(layer_dim, x.size(0), hidden_dim)``.
        """
        shape = (self.layer_dim, x.size(0), self.hidden_dim)
        h0 = torch.zeros(shape, dtype=x.dtype, device=x.device)
        c0 = torch.zeros(shape, dtype=x.dtype, device=x.device)
        return [h0, c0]
I am initialising this model as:
# Positional arguments are (input_dim, hidden_dim, layer_dim, output_dim).
# NOTE(review): the third argument is layer_dim, i.e. the number of stacked
# LSTM layers, not the number of time steps — confirm 6 layers is intended.
model = LSTMModel(28, 10, 6, 1)
That is, each input instance has 6 time steps, each time step has dimension 28, and the hidden dimension is 10. The inputs are mapped to an output dimension of 1.
The training data is being prepared in batches of size 16, meaning that the data passed in the training loop has the shape:
torch.Size([16, 6, 28])
With labels of shape:
batches[1][0].size()
An example of the input is:
tensor([[-0.3674, 0.0347, -0.2169, -0.0821, -0.3673, -0.1773, 1.1840, -0.2669,
-0.4202, -0.1473, -0.1132, -0.4756, -0.3565, 0.5010, 0.1274, -0.1147,
0.2783, 0.0836, -1.3251, -0.8067, -0.6447, -0.7396, -0.3241, 1.3329,
1.3801, 0.8198, 0.6098, 0.0697],
[-0.2710, 0.1596, -0.2524, -0.0821, -0.3673, -0.1773, 0.0302, -0.2099,
-0.4550, 0.1451, -0.4561, -0.5207, -0.5657, -0.5287, -0.2690, -0.1147,
-0.0346, -0.1043, -0.7515, -0.8392, -0.4745, -0.7396, -0.3924, 0.8122,
-0.1624, -1.2198, 0.0326, -0.9306],
[-0.1746, 0.0972, -0.2702, -0.0821, -0.3673, -0.1773, -0.0468, -1.1225,
-0.4480, -0.4397, 0.4011, -1.1073, -1.0536, -0.1855, -0.7502, -0.1147,
-0.0146, -0.1545, -0.1919, -0.1674, 0.0930, -0.7396, 0.8106, 1.1594,
0.4546, -1.2198, -0.5446, -1.2640],
[-0.2710, 0.0660, -0.2524, -0.0821, -0.4210, -0.1773, 1.8251, -0.5236,
-0.4410, -0.7321, 0.4011, -0.6110, -0.2171, 1.1875, -0.2973, -0.1147,
-0.1278, 0.7728, -0.9334, -0.5141, -2.1202, 1.3521, -0.9393, 0.5085,
-0.4709, 0.8198, -1.1218, 0.0697],
[-0.3674, -0.0277, -0.2347, -0.0821, -0.0448, -0.1773, 0.2866, -0.1386,
-0.4271, 0.4375, -0.2847, -0.1146, -0.4262, -0.3571, -0.0425, -0.1147,
-0.4207, -0.4552, -0.5277, -0.9584, -0.4177, -0.7396, -0.2967, 0.5085,
0.4546, -1.2198, -0.3522, -1.2640],
[-0.3674, -0.1447, -0.1991, -0.0821, 0.1701, -0.1773, 0.0430, 0.1324,
-0.4271, 0.7299, -0.4561, 0.2915, -0.5657, -0.1855, -0.2123, -0.1147,
-0.0413, -0.8311, -0.6396, -1.0451, -0.4177, -0.7396, -0.2967, -0.4028,
0.7631, -1.2198, -0.3522, -1.2640]])
When I train the model as:
# Train for a fixed number of epochs and report test-set accuracy after each.
Epochs = 10
# NOTE(review): batch_size is not used below, and the surrounding text
# describes batches of 16 — confirm which value is actually in effect.
batch_size = 32
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

for epoch in range(Epochs):
    print(f"Epoch {epoch + 1}")

    # Training pass over all batches.
    model.train()
    for X, y in batches:
        optimizer.zero_grad()
        y_pred = model(X)
        # The shape of y is not shown here. If it is (batch,) while y_pred
        # is (batch, 1), MSELoss would broadcast the two against each other
        # and average over a (batch, batch) grid instead of comparing
        # element-wise, so align the target with the prediction first.
        loss = criterion(y_pred, y.reshape(y_pred.shape))
        loss.backward()
        optimizer.step()

    # Evaluation pass: one test instance at a time, no gradient tracking.
    model.eval()
    accurate = 0
    with torch.no_grad():
        for X_instance, y_instance in zip(test_X, test_y):
            # Reshape a single instance to a batch of one: (1, 6, 28).
            prediction = model(X_instance.view(-1, 6, 28)).item()
            # A prediction counts as correct when it rounds to the label.
            if y_instance == round(prediction):
                accurate += 1
    print(f"Accuracy test set: {accurate/len(test_X)}")
The accuracy does not converge:
Epoch 1
Accuracy test set: 0.23169107856191745
Sample params:
tensor([-0.3356, -0.0105, -0.3405, -0.0049, 0.0037, 0.1707, 0.2685, -0.3893,
-0.4707, -0.2872, -0.1544, -0.1455, 0.0393, 0.0774, -0.4194, 0.0780,
-0.2177, -0.3829, -0.4679, 0.0370, -0.0794, 0.0455, -0.1331, -0.0169,
-0.1551, -0.0348, 0.1746, -0.5163], grad_fn=<SelectBackward>)
tensor([ 0.2137, -0.2558, 0.1509, -0.0975, 0.5591, 0.0907, -0.1249, 0.3095,
0.2112, 0.3134, -0.1581, -0.3051, -0.3559, -0.0177, 0.1485, 0.4397,
-0.1441, 0.1705, 0.3230, -0.3236, 0.0692, 0.0920, -0.2691, -0.3695,
-0.0692, 0.3747, 0.0149, 0.5216], grad_fn=<SelectBackward>)
Epoch 2
Accuracy test set: 0.23049267643142476
Sample params:
tensor([-0.3483, -0.0144, -0.3512, 0.0213, -0.0081, 0.1777, 0.2674, -0.4031,
-0.4628, -0.3041, -0.1651, -0.1511, 0.0216, 0.0513, -0.4320, 0.0839,
-0.2602, -0.3629, -0.4541, 0.0398, -0.0768, 0.0432, -0.1150, -0.0160,
-0.1346, -0.0727, 0.1801, -0.5253], grad_fn=<SelectBackward>)
tensor([ 0.1879, -0.2534, 0.1461, -0.1141, 0.5735, 0.0872, -0.1286, 0.3273,
0.2084, 0.3037, -0.1535, -0.2934, -0.3870, -0.0252, 0.1492, 0.4752,
-0.1709, 0.1776, 0.3390, -0.3318, 0.0734, 0.1077, -0.2790, -0.3777,
-0.0518, 0.3726, 0.0228, 0.5404], grad_fn=<SelectBackward>)
Epoch 3
Accuracy test set: 0.22982689747003995
Sample params:
tensor([-0.3725, -0.0069, -0.3623, 0.0393, -0.0167, 0.1748, 0.2577, -0.4183,
-0.4681, -0.3196, -0.1657, -0.1613, 0.0122, 0.0268, -0.4361, 0.0838,
-0.2962, -0.3566, -0.4344, 0.0366, -0.0822, 0.0486, -0.1150, -0.0295,
-0.1080, -0.1094, 0.1841, -0.5336], grad_fn=<SelectBackward>)
tensor([ 0.1664, -0.2456, 0.1477, -0.1332, 0.5820, 0.0819, -0.1228, 0.3426,
0.2066, 0.2985, -0.1464, -0.2824, -0.4199, -0.0323, 0.1530, 0.5057,
-0.1991, 0.1856, 0.3407, -0.3347, 0.0800, 0.1203, -0.2791, -0.3863,
-0.0426, 0.3760, 0.0327, 0.5641], grad_fn=<SelectBackward>)
Epoch 4
Accuracy test set: 0.23249001331557922
Sample params:
tensor([-0.3945, 0.0032, -0.3765, 0.0600, -0.0248, 0.1713, 0.2442, -0.4297,
-0.4741, -0.3311, -0.1653, -0.1667, 0.0029, 0.0066, -0.4373, 0.0738,
-0.3320, -0.3530, -0.4136, 0.0390, -0.0731, 0.0552, -0.1117, -0.0517,
-0.0871, -0.1455, 0.1841, -0.5359], grad_fn=<SelectBackward>)
tensor([ 0.1495, -0.2292, 0.1524, -0.1473, 0.5938, 0.0661, -0.1157, 0.3626,
0.2013, 0.2927, -0.1350, -0.2661, -0.4558, -0.0411, 0.1562, 0.5381,
-0.2279, 0.1927, 0.3319, -0.3431, 0.0852, 0.1402, -0.2747, -0.4026,
-0.0297, 0.3757, 0.0396, 0.5856], grad_fn=<SelectBackward>)
Have I made a mistake in the model definition?
model.parameters()
– zwep