Huggingface GPT2 and T5 model APIs for sentence classification?

Question

I've successfully used the Huggingface Transformers BERT model to do sentence classification using the BertForSequenceClassification class and API. I've used it for both 1-sentence sentiment analysis and 2-sentence NLI.

I can see that other models have analogous classes, e.g. XLNetForSequenceClassification and RobertaForSequenceClassification. This type of sentence classification usually involves placing a classifier layer on top of a dense vector representing the entirety of the sentence.

Now I'm trying to use the GPT2 and T5 models. However, when I look at the available classes and API for each one, there is no equivalent "ForSequenceClassification" class. For example, for GPT2 there are GPT2Model, GPT2LMHeadModel, and GPT2DoubleHeadsModel classes. Perhaps I'm not familiar enough with the research for GPT2 and T5, but I'm certain that both models are capable of sentence classification.

So my questions are:

What Huggingface classes for GPT2 and T5 should I use for 1-sentence classification?
What classes should I use for 2-sentence (sentence pair) classification (like natural language inference)?

Thank you for any help.

Regarding GPT2, you can have a look at this GitHub notebook. It is important that you place the CLS token at the end of your sentence, because GPT2 uses only the left context (unlike BERT, which is bidirectional). — cronoik
Just a shot in the dark regarding T5: Can't you simply train it with model(input_ids=sentence_ids, lm_labels=[class_id, eos_id]) where class_id is a new token added to the vocabulary? — cronoik

Pablo Marino · Accepted Answer · 2020-07-01T21:06:43

You need to use the GPT2Model class to generate the sentence embeddings of the text. Once you have the embeddings, feed them to a linear layer followed by a softmax function to obtain the class probabilities. Below is a component for text classification using GPT2 that I'm working on (still a work in progress, so I'm open to suggestions); it follows the logic I just described:

from torch_model_base import TorchModelBase
import torch
import torch.nn as nn
import torch.utils.data
from transformers import GPT2Tokenizer, GPT2Model
import random
from spacy.util import minibatch, compounding
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
from typing import List, Tuple


def mean_across_all_tokens(hidden_states):
    """Average the last entry of ``hidden_states`` over dimension 1.

    :param hidden_states: indexable collection of tensors; only the final
        entry is read
    :return: the final entry reduced with a mean along dimension 1
    """
    final_layer = hidden_states[-1]
    return final_layer.mean(dim=1)

def sum_all_tokens(hidden_states):
    """Sum the last entry of ``hidden_states`` over dimension 1.

    :param hidden_states: indexable collection of tensors; only the final
        entry is read
    :return: the final entry reduced with a sum along dimension 1
    """
    final_layer = hidden_states[-1]
    return final_layer.sum(dim=1)

def concat_all_tokens(hidden_states):
    """Flatten dimensions 1 and 2 of the last entry of ``hidden_states``.

    :param hidden_states: indexable collection of tensors; the final entry
        must be 3-dimensional
    :return: the final entry reshaped to (dim0, dim1 * dim2)
    """
    final_layer = hidden_states[-1]
    n_rows, n_tokens, n_features = final_layer.shape
    flat_width = n_tokens * n_features
    return final_layer.reshape(n_rows, flat_width)



class GPT2SequenceClassifierModel(nn.Module):
    """Pretrained GPT-2 feature extractor followed by a linear classification head.

    Raw texts are tokenized, run through ``GPT2Model`` without gradient
    tracking, optionally reduced by ``embedding_func``, and projected by the
    linear layer ``fc1``. Only ``fc1`` receives gradients, because the GPT-2
    pass in ``forward`` runs under ``torch.no_grad()``.
    """

    def __init__(
            self,
            hidden_size: int,
            num_classes: int,
            gpt_model_name: str,
            max_seq_length: int = 280,
            embedding_func=mean_across_all_tokens,
            combine_sentence_tokens=True
    ):
        """
        :param hidden_size: input width of the linear head; ``forward``
            asserts that the extracted features have this width
        :param num_classes: output width of the linear head
        :param gpt_model_name: name or path passed to ``from_pretrained`` for
            both the model and the tokenizer
        :param max_seq_length: ``max_length`` handed to the tokenizer
        :param embedding_func: callable applied to the tuple of hidden states
            when ``combine_sentence_tokens`` is true
        :param combine_sentence_tokens: if false, the last hidden state is
            returned unreduced by ``_tokenize_and_predict``
        """
        super(GPT2SequenceClassifierModel, self).__init__()
        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(hidden_size, num_classes)
        self.model = GPT2Model.from_pretrained(
            gpt_model_name,
            output_hidden_states=True
        )
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt_model_name)
        # Register the extra tokens and grow the embedding matrix once, here,
        # instead of repeating the work on every tokenization call.
        # NOTE(review): '[CLS]' is registered but this class never inserts it
        # into the input text - confirm whether that is intended.
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.combine_sentence_tokens = combine_sentence_tokens
        self.embedding_func = embedding_func
        self.model.eval()
        self.max_length = max_seq_length

    def train(self, mode: bool = True):
        """Switch train/eval mode for the head while keeping GPT-2 in eval mode.

        ``nn.Module.train`` recurses into every child module, which would
        undo the ``self.model.eval()`` done in ``__init__`` and re-enable
        dropout inside the backbone that is only used as a fixed feature
        extractor.

        :param mode: True for training mode, False for evaluation mode
        :return: self
        """
        super().train(mode)
        self.model.eval()
        return self

    def _tokenize(self, text_list: List[str]) -> torch.Tensor:
        """Encode a batch of texts into a tensor of token ids.

        :param text_list: batch of raw texts
        :return: ``torch.LongTensor`` built from the tokenizer's ``input_ids``
        """
        # NOTE(review): newer transformers releases document
        # padding="max_length" / truncation=True in place of
        # pad_to_max_length - confirm against the installed version.
        # NOTE(review): no attention mask is requested or passed on, so padded
        # positions are included in whatever embedding_func reduces - confirm.
        input_ids = self.tokenizer.batch_encode_plus(text_list,
                                                     add_special_tokens=True,
                                                     max_length=self.max_length,
                                                     pad_to_max_length=True
                                                     )["input_ids"]

        return torch.LongTensor(input_ids)

    def _tokenize_and_predict(self, text_list: List[str]) -> torch.Tensor:
        """Run GPT-2 on a batch of texts and return the extracted features.

        :param text_list: batch of raw texts
        :return: ``embedding_func(hidden_states)`` when
            ``combine_sentence_tokens`` is true, otherwise the last element of
            the hidden-states tuple
        """
        input_ids_tensor = self._tokenize(text_list)
        out = self.model(input_ids=input_ids_tensor)
        # Index 2 is read as the tuple of hidden states (the model is loaded
        # with output_hidden_states=True).
        hidden_states = out[2]
        if self.combine_sentence_tokens:
            return self.embedding_func(hidden_states)
        return hidden_states[-1]

    def forward(self, text_list: List[str]):
        """
        :param text_list: batch of raw texts; a list, tuple, pandas Series or
            numpy array of strings
        :return: tensor of shape (batch_size, num_classes) holding softmax
            class probabilities (NOT raw logits)
        """
        if isinstance(text_list, (pd.Series, np.ndarray)):
            text_list = text_list.tolist()
        elif isinstance(text_list, tuple):
            text_list = list(text_list)
        with torch.no_grad():
            # fine tuning GPT2 model is too expensive, so won't do it
            gpt_out = self._tokenize_and_predict(text_list)
        batch_size = len(text_list)
        assert gpt_out.shape == (batch_size, self.hidden_size)
        prediction_vector = self.fc1(gpt_out)  # (batch_size, num_classes)
        probabilities = torch.softmax(prediction_vector, dim=1)
        return probabilities


class GPT2Classifier(TorchModelBase):
    """GPT2 + NN head for classification problems.
    The network will work for any kind of classification task.

    Parameters
    ----------
    model_name: name of the pretrained GPT2 checkpoint passed on to
        GPT2SequenceClassifierModel
    embed_dim: dimension of byte-pair/token embeddings generated by the model, check the model card (n_embd prop), since each model is compatible with only one number of dimensions
    max_seq_length: max tokens in a sequence (n_positions param in hugging face model config); if a sequence is shorter it will get padded
    **kwargs: forwarded unchanged to TorchModelBase.__init__
    """
    def __init__(self,
                 model_name="distilgpt2",
                 embed_dim=768,
                 max_seq_length=1024,
                 **kwargs
                 ):
        self.model_name = model_name
        self.embed_dim = embed_dim
        self.max_seq_length = max_seq_length
        self.model = None  # call fit() to set this
        self.tokenizer = None  # never assigned anywhere else in this class
        self.classes = None  # call fit() to set this
        super(GPT2Classifier, self).__init__(**kwargs)
        self.params += ['model_name']

    def fit(self, X, y):
        """Standard `fit` method.

        Builds a fresh GPT2SequenceClassifierModel and trains it for
        `self.max_iter` epochs on shuffled mini-batches of size
        `self.batch_size` (`max_iter`, `batch_size` and `optimizer` are not
        set in this class; presumably TorchModelBase provides them).

        Parameters
        ----------
        X : np.array
            The texts to train on.
        y : array-like
            One label per text. Labels may be any hashable, sortable values;
            they are mapped to integer indices internally.

        Returns
        -------
        self

        """
        # A sorted class list gives a deterministic label <-> index mapping,
        # so the argmax index used in `predict` always refers to the same
        # label that was used as the training target.
        self.classes = sorted(set(y))
        class_to_index = {label: idx for idx, label in enumerate(self.classes)}
        self.model = GPT2SequenceClassifierModel(
            hidden_size=self.embed_dim,
            num_classes=len(self.classes),
            gpt_model_name=self.model_name,
            max_seq_length=self.max_seq_length
        )
        self.opt = self.optimizer(
            self.model.parameters()
        )
        self.model.train()
        # The model already returns softmax probabilities, and
        # CrossEntropyLoss would apply log-softmax a second time. Taking the
        # log of the probabilities and using NLLLoss is the matching loss.
        loss = nn.NLLLoss()
        training_pairs = [
            (text, class_to_index[label]) for text, label in zip(X, y)
        ]
        print("Training... max iters: ", self.max_iter)
        for epoch in range(self.max_iter):
            print("ephoc no: ", epoch)
            random.shuffle(training_pairs)
            batches = minibatch(training_pairs, size=self.batch_size)
            for batch in batches:
                X_batch, y_batch = zip(*batch)
                batch_probs = self.model(X_batch)
                targets = torch.tensor(y_batch, dtype=torch.long)
                # clamp avoids log(0) = -inf for saturated probabilities
                err = loss(torch.log(batch_probs.clamp_min(1e-12)), targets)
                # Backprop:
                self.opt.zero_grad()
                err.backward()
                self.opt.step()
        return self

    def predict_proba(self, X):
        """Predicted probabilities for the examples in `X`.

        Parameters
        ----------
        X : np.array

        Returns
        -------
        np.array with shape (len(X), len(self.classes))

        """
        self.model.eval()
        with torch.no_grad():
            preds = self.model(X)
            return preds.numpy()

    def predict(self, X):
        """Predicted labels for the examples in `X`. These are converted
        from the integers that PyTorch needs back to their original
        values in `self.classes`.

        Parameters
        ----------
        X : np.array

        Returns
        -------
        list of length len(X)

        """
        probs = self.predict_proba(X)
        return [self.classes[i] for i in probs.argmax(axis=1)]

Huggingface GPT2 and T5 model APIs for sentence classification?

2 Answers