
I am building a sequence-to-label model in PyTorch for the SNLI dataset: given two sentences, I classify whether or not they are entailed. I concatenate the two 50-word sentences (padded where necessary) into a single vector of length 100, then feed minibatches through word embeddings -> LSTM -> linear layer. I am using cross-entropy loss, but CrossEntropyLoss expects an input of shape [mini_batch, C], whereas my output still carries the 100 word positions: [mini_batch, 100, C].
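For reference, here is a minimal sketch (made-up shapes, not my actual data) of the contract nn.CrossEntropyLoss expects: a 2D score tensor of size [mini_batch, C] and a 1D LongTensor of class indices of size [mini_batch]:

import torch
import torch.nn as nn
from torch.autograd import Variable

mini_batch, C = 3, 3                            # e.g. 3 examples, 3 entailment classes (made-up numbers)
criterion = nn.CrossEntropyLoss()

scores = Variable(torch.randn(mini_batch, C))   # [mini_batch, C] unnormalized class scores
labels = Variable(torch.LongTensor([0, 2, 1]))  # [mini_batch] class indices in 0..C-1
loss = criterion(scores, labels)                # this shape combination is what the criterion wants

# My actual output is [mini_batch, 100, C], one score vector per word,
# which is what triggers the dimension error further down.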

Here is my model:

class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256):
        super(myLSTM, self).__init__()
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(mlp_d, 1024)

        # Set static embedding vectors
        self.embedding.weight.requires_grad = False

        #self.sm = nn.CrossEntropyLoss()

    def display(self):
        for param in self.parameters():
            print(param.data.size())

    def filter_params(self):
        # Might not be compatible with python 3
        #self.parameters = filter(lambda p: p.requires_grad, self.parameters())
        pass

    def init_hidden(self):
        # Need to init hidden weights in LSTM
        pass

    def forward(self, sentence):
        print(sentence.size())
        embeds = self.embedding(sentence)
        print(embeds.size())
        out, _ = self.lstm(embeds)
        print(out.size())
        out = self.mlp(out)
        return out

My training loop, with its output:

from torch import optim
from torch.autograd import Variable

batch_size = 3
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, weight_decay=1e-4)
ADM_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)

criterion = nn.CrossEntropyLoss()
num_epochs = 50

for epoch in range(num_epochs):
    print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, float(epoch)/num_epochs))
    for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):

        # Convert minibatch to numpy
        s1, s2, y = convert_to_numpy(n_data[start:end])

        # Convert numpy to Tensor
        res = np.concatenate((s1,s2), axis=1) # Attach two sentences into 1 input vector
        input_tensor = torch.from_numpy(res).type(torch.LongTensor)
        target_tensor = torch.from_numpy(y).type(torch.FloatTensor)
        data, target = Variable(input_tensor), Variable(target_tensor)

        # Zero gradients
        SGD_optimizer.zero_grad()

        # Forward Pass
        output = model.forward(data) 
        print("Output size: ")
        print(output.size())
        print("Target size: ")
        print(target.size())
        # Calculate loss with respect to training labels
        loss = criterion(output, target)

        # Backpropagate and update optimizer
        loss.backward()
        SGD_optimizer.step()
        #ADM_optimizer.step()

output:

Epoch 0/50: 0.0%
torch.Size([3, 100])
torch.Size([3, 100, 300])
torch.Size([3, 100, 256])
Output size: 
torch.Size([3, 100, 1024])
Target size: 
torch.Size([3])

error:

ValueError: Expected 2 or 4 dimensions (got 3)

EDITED -------------------------------------------------------------------

I have now got my model training, but the accuracy is low. Is there an issue with the way my LSTM hidden states are concatenated and then condensed into a smaller tensor before going through my linear layer? (See the shape summary after the model below.)

New Model:

class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256, num_classes=3, lstm_layers=1):
        super(myLSTM, self).__init__()
        self.num_layers = lstm_layers
        self.hidden_size = h_size
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=lstm_layers, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(2 * h_size * 2, num_classes)

        # Set static embedding vectors
        self.embedding.weight.requires_grad = False

    def forward(self, s1, s2):
        # Set initial states
        #h0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda() # 2 for bidirection 
        #c0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda()

        batch_size = s1.size()[0]
        embeds_1 = self.embedding(s1)
        embeds_2 = self.embedding(s2)
        _, (h_1_last, _) = self.lstm(embeds_1)#, (h0, c0)) #note the change here. Last hidden state is taken
        _, (h_2_last, _) = self.lstm(embeds_2)#, (h0, c0))
        concat = torch.cat( (h_1_last, h_2_last), dim=2) #double check the dimension
        concat = concat.view(batch_size, -1)
        scores = self.mlp(concat)
        return scores
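To spell out what the concatenation and reshape in forward do to the tensor shapes (this just summarizes the sizes printed further down, with batch_size 64, 50-word sentences, h_size 128):

# embeds_1, embeds_2                     : [64, 50, 300]
# h_1_last, h_2_last                     : [2, 64, 128]   (num_layers * num_directions = 2)
# torch.cat((h_1_last, h_2_last), dim=2) : [2, 64, 256]
# concat.view(batch_size, -1)            : [64, 512]      (= 2 * h_size * 2, the Linear input size)
# scores                                 : [64, 3]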

New Training:

from torch import optim
from torch.autograd import Variable

batch_size = 64
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, weight_decay=1e-4)

criterion = nn.CrossEntropyLoss()
num_epochs = 10
model.train()

if cuda:
    model = model.cuda()
    criterion = criterion.cuda()

epoch_losses = []

for epoch in range(num_epochs):
    print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, 100*float(epoch)/num_epochs))

    # Batch loss aggregator
    losses = []

    for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
        # Convert minibatch to numpy
        s1, s2, y = convert_to_numpy(n_data[start:end])

        # Convert numpy to Tensor
        s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
        s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
        target_tensor = torch.from_numpy(y).type(torch.LongTensor)

        s1 = Variable(s1_tensor)
        s2 = Variable(s2_tensor)
        target = Variable(target_tensor)

        if cuda:
            s1 = s1.cuda()
            s2 = s2.cuda()
            target = target.cuda()

        # Zero gradients
        SGD_optimizer.zero_grad()

        # Forward Pass
        output = model.forward(s1,s2) 

        # Calculate loss with respect to training labels
        loss = criterion(output, target)
        losses.append(loss.data[0])

        # Backpropagate and update optimizer
        loss.backward()
        SGD_optimizer.step()

    # Accumulate batch losses into the epoch losses
    epoch_losses += losses

Training output with tensor sizes printed:

Epoch 0/10: 0.0%
Batch size: 64
Sentences
torch.Size([64, 50])
torch.Size([64, 50])
torch.Size([64, 50, 300])
torch.Size([64, 50, 300])
Hidden states
torch.Size([2, 64, 128])
torch.Size([2, 64, 128])
Concatenated hidden states
torch.Size([2, 64, 256])
Reshaped tensors for linear layer
torch.Size([64, 512])
Linear propagation
torch.Size([64, 3])

Evaluation

def eval_model(model, mode='dev'):
    file_name = 'snli_1.0/snli_1.0_dev.jsonl' if mode == 'dev' else 'snli_1.0/snli_1.0_test.jsonl'

    dev_data, _ = obtain_data(file_name)
    dev_n_data = vocab.process_data(dev_data)

    print("Length of data: {}".format(len(dev_n_data)))

    eval_batch_size = 1024
    model.eval()

    total = len(dev_n_data)
    hit = 0
    correct = 0

    # Batch dev eval
    for start, end in batch_index_gen(eval_batch_size, len(dev_n_data)):

        s1, s2, y = convert_to_numpy(dev_n_data[start:end])

        s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
        s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
        target_tensor = torch.from_numpy(y).type(torch.LongTensor)

        s1 = Variable(s1_tensor, volatile=True)
        s2 = Variable(s2_tensor, volatile=True)
        target = Variable(target_tensor, volatile=True)

        if cuda:
            s1 = s1.cuda()
            s2 = s2.cuda()
            target = target.cuda()

        output = model.forward(s1,s2)
        loss = criterion(output, target)

        #print("output size: {}".format(output.size()))
        #print("target size: {}".format(target.size()))
        pred = output.data.max(1)[1] # get the index of the max log-probability
        #print(pred[:5])
        #print(output[:])
        correct += pred.eq(target.data).cpu().sum()

    return correct / float(total)

eval_model(model)
Well, your output has size: [3, 100, 1024] while CrossEntropyLoss says: "The input is expected to contain scores for each class. input has to be a 2D Tensor of size batch x n. This criterion expects a class index (0 to nClasses-1) as the target for each value of a 1D tensor of size n." – Manuel Lagunas

1 Answer


I think there is an issue with the way you are trying to solve the entailment problem.

Maybe you can do it this way:

  1. Design your module to accept two sentences as input.
  2. Embed both of them with your embeddings.
  3. Encode them using the LSTM module.
  4. Now you have two fixed-length vector representations of the two sentences. The simplest thing to do is to just concatenate them together.
  5. Add a linear layer on top to produce scores for each entailment class (3, I suppose).
  6. Apply softmax to get a proper probability distribution.

So your model can look like this (double check the dimensions):

import torch
import torch.nn as nn
import torch.nn.functional as F

class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, num_classes=3):
        super(myLSTM, self).__init__()
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(2*h_size*2, num_classes) #<- change here

    def forward(self, sentence1, sentence2):
        embeds_1 = self.embedding(sentence1)
        embeds_2 = self.embedding(sentence2)
        _, (h_1_last, _) = self.lstm(embeds_1) # note the change here: the last hidden states are taken
        _, (h_2_last, _) = self.lstm(embeds_2)
        concat = torch.cat((h_1_last, h_2_last), dim=2) # [2, batch, 2*h_size]; double check the dimension
        concat = concat.view(sentence1.size(0), -1)     # [batch, 2*h_size*2]
        scores = self.mlp(concat)
        probas = F.softmax(scores)  # from torch.nn.functional; useful for predictions
        return scores               # CrossEntropyLoss expects the unnormalized scores
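A quick usage sketch (hypothetical vocabulary size, sequence length and batch size; note that the raw scores, not the softmax output, go into the loss):

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

model = myLSTM(h_size=128, v_size=10000, embed_d=300, num_classes=3)  # hypothetical sizes
criterion = nn.CrossEntropyLoss()

s1 = Variable(torch.LongTensor(4, 50).random_(0, 10000))  # [batch, seq_len] word indices, sentence 1
s2 = Variable(torch.LongTensor(4, 50).random_(0, 10000))  # [batch, seq_len] word indices, sentence 2
y  = Variable(torch.LongTensor(4).random_(0, 3))          # [batch] class indices 0..2

scores = model(s1, s2)                                    # [4, 3] unnormalized class scores
loss = criterion(scores, y)                               # CrossEntropyLoss applies log-softmax internally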

Then you can play around with adding more hidden layers, or with combining the two sentences in a more intelligent way (attention, etc.). Double check what CrossEntropyLoss accepts as input and target and adjust accordingly (does it take unnormalized class scores or a probability distribution?). Check http://pytorch.org/docs/master/nn.html#lstm for the LSTM module documentation to clarify what LSTM returns (do you need the hidden states for every word, or just the representation after the last one?).
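For that last point, a small sketch of what nn.LSTM returns with batch_first=True and a single bidirectional layer, as in your model (hypothetical sizes):

import torch
import torch.nn as nn
from torch.autograd import Variable

lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=1,
               bidirectional=True, batch_first=True)

x = Variable(torch.randn(4, 50, 300))  # [batch, seq_len, embed_d]
output, (h_n, c_n) = lstm(x)

print(output.size())  # [4, 50, 256] -> hidden states for every word, both directions concatenated
print(h_n.size())     # [2, 4, 128]  -> final hidden state per direction; a fixed-length sentence encoding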