import torch | |
import numpy as np | |
import random | |
def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
    """Return attention-mask-weighted mean-pooled embeddings for ``texts``.

    The texts are processed in chunks of ``batch_size``. Each chunk is
    tokenized with padding and truncation, run through ``model`` under
    ``torch.no_grad()``, and the model's ``last_hidden_state`` is averaged
    over the sequence axis, counting only positions whose attention mask
    is non-zero.

    Args:
        texts: Sliceable sequence of inputs accepted by ``tokenizer``.
        model: Callable accepting the tokenizer output as keyword arguments
            (plus ``output_hidden_states``) and returning an object with a
            ``last_hidden_state`` tensor. Presumably a Hugging Face
            encoder; confirm against the caller.
        tokenizer: Callable returning a mapping of tensors that includes
            an ``'attention_mask'`` entry.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A NumPy array with one pooled vector per input text, or an empty
        array when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Models that expose a ``device`` attribute get their inputs moved
    # there; otherwise the tensors are used exactly as the tokenizer
    # produced them.
    device = getattr(model, "device", None)

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch_texts = list(texts[start:start + batch_size])
        inputs = tokenizer(
            batch_texts, return_tensors="pt", padding=True, truncation=True
        )
        if device is not None:
            inputs = {
                key: value.to(device) if hasattr(value, "to") else value
                for key, value in inputs.items()
            }

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=False)

        last_hidden_states = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension so padded positions
        # contribute nothing to the sum.
        input_mask_expanded = (
            inputs["attention_mask"]
            .unsqueeze(-1)
            .expand(last_hidden_states.size())
            .float()
        )
        sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
        # Clamp the token count so a fully masked row yields zeros, not NaN.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_pooled = sum_embeddings / sum_mask
        pooled_batches.append(mean_pooled.cpu().detach().numpy())

    if not pooled_batches:
        return np.array([])
    return np.concatenate(pooled_batches, axis=0)