Memory leak (memory increasing slowly on each inference on CPU)

#19
by scancet - opened

Hello,
I am using the transformers library with the intfloat/multilingual-e5-large model, following the same code shared in its model card. I dockerized it and started using it. Memory increases with each inference and eventually exceeds my memory limit.

Here is my code:

import torch.nn.functional as F
from torch import Tensor, no_grad, cuda, device
from transformers import AutoTokenizer, AutoModel
import gc

class Model():

    def __init__(self, path='resources/intfloat_multilingual-e5-large'):
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModel.from_pretrained(path)
        dvc = device('cpu')
        self.model.to(dvc)
        self.model.eval()

    def average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def inference(self, texts):
        with no_grad():
            batch_dict = self.tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
            #print(batch_dict)
            #print(self.model.config)
            outputs = self.model(**batch_dict)
            embeddings = self.average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            del outputs
            embeddings = F.normalize(embeddings, p=2, dim=1)
            embeddings = embeddings.numpy().tolist()
            gc.collect()
            cuda.empty_cache()
        return embeddings

model = Model()

Here are my docker stats. The container initially uses around 2.6 GB of RAM, but memory grows slowly with each inference.
Please let me know how I can clear this memory, or stop this memory leak in any other way.

[Screenshot: docker stats showing the container's memory usage growing over time]

Thanks

This looks strange; Python and PyTorch should do GC automatically. Is it possible that you are storing too many embedding vectors, and that is what causes the OOM issue?
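
One way to narrow this down is to log the process RSS between calls and check whether it still grows when the returned embeddings are thrown away. A minimal sketch (assuming psutil is installed and reusing the Model class from the first post; the loop and input text are just placeholders):

import os
import psutil

proc = psutil.Process(os.getpid())
m = Model()

for i in range(100):
    _ = m.inference(["query: some example text"])   # result discarded on purpose
    rss_mb = proc.memory_info().rss / 1024 ** 2      # resident memory of this process in MB
    print(f"iteration {i}: {rss_mb:.1f} MB")

If RSS keeps climbing even though nothing is kept, the growth is inside the tokenize/forward path rather than in accumulated result vectors.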

I can confirm this issue - my input_texts are only about 100 MB in memory, but computing embeddings for them reaches 200 GB (which is where my server crashes).

I have this issue too

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large', cache_dir=CACHE_DIR)
model = AutoModel.from_pretrained(
    'intfloat/multilingual-e5-large', 
    cache_dir=CACHE_DIR, 
    device_map=DEVICE,
    torch_dtype=torch.bfloat16
)

model.eval()

# Tokenize the input texts

batch_size = 128
hidden_size = model.config.hidden_size

all_embeddings = torch.zeros(len(input_texts), hidden_size, device="cpu")

for batch_start in trange(0, len(input_texts), batch_size):
    print_gpu_memory()
    batch_end = batch_start + batch_size
    batch_dict = tokenizer(input_texts[batch_start:batch_end], max_length=512, padding=True, truncation=True, return_tensors='pt')
    batch_dict['input_ids'] = batch_dict['input_ids'].to(DEVICE)
    batch_dict['attention_mask'] = batch_dict['attention_mask'].to(DEVICE)

    outputs = model(**batch_dict)

    # outputs.last_hidden_state.shape (batch_size, sequence_length, hidden_size) hidden activations of the last layer
    batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) # average over the sequence length

    # normalize embeddings
    batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
    batch_embeddings = batch_embeddings.to('cpu')
    
    all_embeddings[batch_start:batch_end] = batch_embeddings
    all_embeddings = all_embeddings.to("cpu")
    del outputs.last_hidden_state
    del outputs
    del batch_embeddings, batch_dict
    torch.cuda.empty_cache()
    gc.collect()

Actually, using the with torch.no_grad() context did the trick and resolved the OOM issues.
@intfloat why is the no_grad context necessary?

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large', cache_dir=CACHE_DIR)
model = AutoModel.from_pretrained(
    'intfloat/multilingual-e5-large', 
    cache_dir=CACHE_DIR, 
    device_map=DEVICE,
    torch_dtype=torch.bfloat16
)
#%%
# Tokenize the input texts

batch_size = 128
hidden_size = model.config.hidden_size

all_embeddings = torch.zeros(len(input_texts), hidden_size, device="cpu")

with torch.no_grad():
    for batch_start in trange(0, len(input_texts), batch_size):
        print_gpu_memory()
        batch_end = batch_start + batch_size
        batch_dict = tokenizer(input_texts[batch_start:batch_end], max_length=512, padding=True, truncation=True, return_tensors='pt')
        batch_dict['input_ids'] = batch_dict['input_ids'].to(DEVICE)
        batch_dict['attention_mask'] = batch_dict['attention_mask'].to(DEVICE)

        outputs = model(**batch_dict)

        # outputs.last_hidden_state.shape (batch_size, sequence_length, hidden_size) hidden activations of the last layer
        batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) # average over the sequence length

        # normalize embeddings
        batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
        batch_embeddings = batch_embeddings.to('cpu')

        all_embeddings[batch_start:batch_end] = batch_embeddings
        all_embeddings = all_embeddings.to("cpu")
        del outputs.last_hidden_state
        del outputs
        del batch_embeddings, batch_dict
        torch.cuda.empty_cache()
        gc.collect()

@canrager Thanks for letting us know! By default, PyTorch saves intermediate activations for the backward pass, so memory keeps increasing without torch.no_grad().
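
A minimal sketch of the difference (same model and tokenizer as in this thread): outside no_grad the output tensors carry a grad_fn, meaning the autograd graph and its saved activations are kept alive; inside no_grad they do not.

import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').eval()
batch_dict = tokenizer(["query: hello world"], return_tensors='pt')

outputs = model(**batch_dict)                 # autograd records the graph
print(outputs.last_hidden_state.grad_fn)      # a backward node -> intermediate activations are retained

with torch.no_grad():
    outputs = model(**batch_dict)             # no graph is recorded
print(outputs.last_hidden_state.grad_fn)      # None -> activations can be freed right after the forward pass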
