In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sequence-classification checkpoint; the tokenizer and the model are both loaded
# from this same name so that their vocabularies match.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Put the model in evaluation mode; the trailing `;` suppresses the cell's output.
model.eval();

 from .autonotebook import tqdm as notebook_tqdm


# Try a forward pass on a single example

In [2]:
sequence = "We need more quality doctors, engineers and lawyers in our nation."

# Step 1: split the text into word pieces.
# Step 2: map the pieces to vocabulary ids and wrap them in a 1-D tensor
# (no [CLS]/[SEP] ids are added on this path and there is no batch dimension).
word_pieces = tokenizer.tokenize(sequence)
piece_ids = tokenizer.convert_tokens_to_ids(word_pieces)
token_ids = torch.tensor(piece_ids)
token_ids

tensor([2057, 2342, 2062, 3737, 7435, 1010, 6145, 1998, 9559, 1999, 2256, 3842,
 1012])

In [3]:
# Deliberately feed the 1-D tensor (no batch dimension) to demonstrate the failure.
# The error is caught and displayed so the notebook still survives
# "Restart Kernel -> Run All" instead of stopping at this cell.
try:
    with torch.no_grad():
        model(token_ids)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: The size of tensor a (13) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
# Re-display the ids: a flat 1-D tensor, i.e. there is no batch dimension.
token_ids

tensor([2057, 2342, 2062, 3737, 7435, 1010, 6145, 1998, 9559, 1999, 2256, 3842,
 1012])

As seen above, our input does not have a batch dimension, which is why we are seeing this error. Let's add a batch dimension and then pass our sequence through the model.

In [None]:
# Add a leading batch axis so the model receives a batch containing one sequence.
single_batch = token_ids.unsqueeze(0)

with torch.no_grad():
    out = model(single_batch)
out

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.2781, -1.0656]]), hidden_states=None, attentions=None)

Let's duplicate the input and check whether we get the same logits for both rows.

In [None]:
# Build a batch of two identical rows; identical inputs should give identical logits.
inp = torch.stack([token_ids, token_ids], dim=0)

with torch.no_grad():
    out = model(inp)
out.logits

tensor([[ 1.2781, -1.0656],
 [ 1.2781, -1.0656]])

# Input padding

In [None]:
# Stand-in id used to pad the shorter row.
# NOTE(review): the tokenizer itself pads with id 0 (see the padded `input_ids`
# further down); 100 is kept here so the recorded logits stay reproducible.
padding_id = 100

batched_ids = [
    [200, 200, 200],
    [200, 200, padding_id],
]

# Inference only: disable autograd, as the other forward passes in this notebook do.
# No attention mask is passed on purpose, so the padded second row is expected to
# differ from the unpadded two-token sequence.
with torch.no_grad():
    print(model(torch.tensor([batched_ids[0]])).logits)
    print(model(torch.tensor([batched_ids[1][:2]])).logits)
    print(model(torch.tensor(batched_ids)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=)
tensor([[ 0.5803, -0.4125]], grad_fn=)
tensor([[ 1.5694, -1.3895],
 [ 0.9907, -0.9139]], grad_fn=)


There’s something wrong with the logits in our batched predictions: the second row should be the same as the logits for the second sentence, but we’ve got completely different values!

This is because, when we add padding, we need to nullify its impact during the attention computation. This is why we need an attention mask: it explicitly excludes the padding tokens from the attention calculation.

# Cross-checking how attention masks work

In [None]:
# NOTE(review): in the part of the notebook visible here, `tokens` is only assigned
# in the next code cell (the `tokenizer(sentences, ...)` call), so this cell depends
# on that cell having been run first.
tokens

{'input_ids': tensor([[ 101, 1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172,
 2607, 2026, 2878, 2166, 1012, 102],
 [ 101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0,
 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [None]:
sentences = [
    "I’ve been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# padding=True pads the shorter sentence up to the longest one in the batch; the
# tokenizer returns `input_ids` together with the matching `attention_mask`.
tokens = tokenizer(sentences, padding=True, return_tensors="pt")

# Pass both tensors explicitly so it is visible that the mask goes into the model.
with torch.no_grad():
    out = model(
        input_ids=tokens["input_ids"],
        attention_mask=tokens["attention_mask"],
    )

In [None]:
# Show the padded batch: `input_ids` plus the `attention_mask` that marks real
# tokens with 1 and padding positions with 0.
print(tokens)

{'input_ids': tensor([[ 101, 1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172,
 2607, 2026, 2878, 2166, 1012, 102],
 [ 101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0,
 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
# Logits from the batched forward pass that used the attention mask; one row per sentence.
out.logits

tensor([[-1.5979, 1.6390],
 [ 4.1692, -3.3464]])

In [None]:
# Do the entire forward pass manually for sentence 1

# Tokenize the sentence and get the token ids (no special tokens are added on this path)
body_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0]))

# Add the special tokens CLS and SEP at the start and end of the sequence respectively.
# The ids are read from the tokenizer rather than hard-coded as 101 / 102, and
# `token_ids` is built from `body_ids` so that re-running this cell is idempotent.
token_ids = [tokenizer.cls_token_id] + body_ids + [tokenizer.sep_token_id]

# Perform the forward pass and print the logits; this is the longest sentence in
# the batch, so it needs no padding and therefore no attention mask.
with torch.no_grad():
    print(model(torch.tensor([token_ids])).logits)

tensor([[-1.5979, 1.6390]])


In [None]:
# Do the entire forward pass manually for sentence 2

# Tokenize both sentences and get their token ids; sentence 1 is only needed to
# work out how many padding positions sentence 2 requires.
s0_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0]))
s1_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[1]))
s1_tokens = len(s1_ids)
additional_ids = len(s0_ids) - s1_tokens

# Add the special tokens CLS and SEP at the start and end of the sequence respectively,
# then pad up to the length of sentence 1. The special-token ids are read from the
# tokenizer instead of being hard-coded as 101 / 102 / 0.
token_ids = (
    [tokenizer.cls_token_id]
    + s1_ids
    + [tokenizer.sep_token_id]
    + [tokenizer.pad_token_id] * additional_ids
)

# Attention mask: 1 for CLS + real tokens + SEP, 0 for every padding position, so
# that attention does not take the padding tokens into account.
attention_mask = [1] * (s1_tokens + 2) + [0] * additional_ids

# Perform the forward pass and print the logits
with torch.no_grad():
    print(
        model(
            input_ids=torch.tensor([token_ids]),
            attention_mask=torch.tensor([attention_mask]),
        ).logits
    )

tensor([[ 4.1692, -3.3464]])
