Memory leak (memory increasing slowly with each inference on CPU)
Hello,
I am using the intfloat/multilingual-e5-large model through the transformers library, with the same code that is shared on its model card. I dockerized it and started using it. Memory increases with each inference and eventually exceeds my memory limit.
Here is my code:
import torch.nn.functional as F
from torch import Tensor, no_grad, cuda, device
from transformers import AutoTokenizer, AutoModel
import gc

class Model():
    def __init__(self, path='resources/intfloat_multilingual-e5-large'):
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModel.from_pretrained(path)
        dvc = device('cpu')
        self.model.to(dvc)
        self.model.eval()

    def average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def inference(self, texts):
        with no_grad():
            batch_dict = self.tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
            # print(batch_dict)
            # print(self.model.config)
            outputs = self.model(**batch_dict)
            embeddings = self.average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            del outputs
            embeddings = F.normalize(embeddings, p=2, dim=1)
            embeddings = embeddings.numpy().tolist()
            gc.collect()
            cuda.empty_cache()
            return embeddings

model = Model()
Here are my docker stats. The container initially uses around 2.6 GB of RAM, but memory grows slowly with each iteration.
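To quantify the growth, this is a minimal sketch of how the per-call memory can be watched from inside the container; psutil and the example texts are illustrative assumptions, and the Model class is the one above:

import psutil

proc = psutil.Process()  # current process
texts = ["query: some example sentence"] * 8  # illustrative payload, not the real data

for i in range(100):
    model.inference(texts)
    rss_mb = proc.memory_info().rss / 1024 ** 2  # resident set size in MB
    print(f"iteration {i}: rss = {rss_mb:.1f} MB")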
Please let me know how I can clear this memory, or any other way I can stop this memory leak.
Thanks
This looks strange; Python and PyTorch should garbage-collect automatically. Is it possible that you are storing too many embedding vectors, and that is what causes the OOM issue?
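For scale, the stored embeddings alone should not account for much; a rough back-of-the-envelope sketch with illustrative numbers (1024 is the hidden size of multilingual-e5-large, and .numpy().tolist() stores every value as a Python float object):

num_texts = 1_000_000   # illustrative corpus size
dim = 1024              # hidden size of multilingual-e5-large
print(num_texts * dim * 4 / 1024 ** 3)         # float32 tensor: ~3.8 GB
print(num_texts * dim * (24 + 8) / 1024 ** 3)  # list of Python floats (~24 B each + ~8 B pointer): ~30 GB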
I can confirm this issue - my input_texts are only about 100 MB in memory, but computing embeddings for them reaches 200 GB (which is where my server crashes).
I have this issue too
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large', cache_dir=CACHE_DIR)
model = AutoModel.from_pretrained(
    'intfloat/multilingual-e5-large',
    cache_dir=CACHE_DIR,
    device_map=DEVICE,
    torch_dtype=torch.bfloat16
)
model.eval()

# Tokenize the input texts
batch_size = 128
hidden_size = model.config.hidden_size
all_embeddings = torch.zeros(len(input_texts), hidden_size, device="cpu")
for batch_start in trange(0, len(input_texts), batch_size):
    print_gpu_memory()
    batch_end = batch_start + batch_size
    batch_dict = tokenizer(input_texts[batch_start:batch_end], max_length=512, padding=True, truncation=True, return_tensors='pt')
    batch_dict['input_ids'] = batch_dict['input_ids'].to(DEVICE)
    batch_dict['attention_mask'] = batch_dict['attention_mask'].to(DEVICE)
    outputs = model(**batch_dict)
    # outputs.last_hidden_state.shape: (batch_size, sequence_length, hidden_size), hidden activations of the last layer
    batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])  # average over the sequence length
    # normalize embeddings
    batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
    batch_embeddings = batch_embeddings.to('cpu')
    all_embeddings[batch_start:batch_end] = batch_embeddings
    all_embeddings = all_embeddings.to("cpu")
    del outputs.last_hidden_state
    del outputs
    del batch_embeddings, batch_dict
    torch.cuda.empty_cache()
    gc.collect()
Actually, using the with torch.no_grad() context did the trick and resolved the OOM issues.
@intfloat why is the no_grad context necessary?
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large', cache_dir=CACHE_DIR)
model = AutoModel.from_pretrained(
    'intfloat/multilingual-e5-large',
    cache_dir=CACHE_DIR,
    device_map=DEVICE,
    torch_dtype=torch.bfloat16
)
#%%
# Tokenize the input texts
batch_size = 128
hidden_size = model.config.hidden_size
all_embeddings = torch.zeros(len(input_texts), hidden_size, device="cpu")
with torch.no_grad():
    for batch_start in trange(0, len(input_texts), batch_size):
        print_gpu_memory()
        batch_end = batch_start + batch_size
        batch_dict = tokenizer(input_texts[batch_start:batch_end], max_length=512, padding=True, truncation=True, return_tensors='pt')
        batch_dict['input_ids'] = batch_dict['input_ids'].to(DEVICE)
        batch_dict['attention_mask'] = batch_dict['attention_mask'].to(DEVICE)
        outputs = model(**batch_dict)
        # outputs.last_hidden_state.shape: (batch_size, sequence_length, hidden_size), hidden activations of the last layer
        batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])  # average over the sequence length
        # normalize embeddings
        batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
        batch_embeddings = batch_embeddings.to('cpu')
        all_embeddings[batch_start:batch_end] = batch_embeddings
        all_embeddings = all_embeddings.to("cpu")
        del outputs.last_hidden_state
        del outputs
        del batch_embeddings, batch_dict
        torch.cuda.empty_cache()
        gc.collect()
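My understanding, not an official answer: without no_grad, every forward pass records an autograd graph, and any tensor derived from the outputs that you keep around (for example, the rows written into all_embeddings) keeps that graph, and with it the intermediate activations of the layers, alive, so memory grows with every batch instead of being freed. A minimal sketch of the difference, assuming the tokenizer, model, and DEVICE are set up as above:

batch_dict = tokenizer(['query: hello world'], return_tensors='pt').to(DEVICE)

# Default mode: the output tensor is attached to an autograd graph that
# holds references to the activations of the transformer layers.
outputs = model(**batch_dict)
print(outputs.last_hidden_state.grad_fn is not None)  # True

# Inside no_grad (torch.inference_mode() works too): no graph is recorded,
# so the activations can be freed as soon as the forward pass returns.
with torch.no_grad():
    outputs = model(**batch_dict)
print(outputs.last_hidden_state.grad_fn is None)  # True

# If wrapping the call is not an option, detaching before storing also breaks the link,
# e.g. all_embeddings[batch_start:batch_end] = batch_embeddings.detach()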