Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
|
7 |
# Set environment variables for GPU usage and memory allocation
|
8 |
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
9 |
torch.cuda.empty_cache()
|
10 |
-
torch.cuda.set_per_process_memory_fraction(0.
|
11 |
|
12 |
# Define device
|
13 |
device = "cuda" # The device to load the model onto
|
@@ -18,9 +18,7 @@ system_message = ""
|
|
18 |
# Load the model and tokenizer
|
19 |
def hermes_model():
|
20 |
tokenizer = AutoTokenizer.from_pretrained("TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ")
|
21 |
-
model = AutoModelForCausalLM.from_pretrained(
|
22 |
-
"TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ", low_cpu_mem_usage=True, device_map="auto"
|
23 |
-
)
|
24 |
return model, tokenizer
|
25 |
|
26 |
model, tokenizer = hermes_model()
|
|
|
7 |
# Set environment variables for GPU usage and memory allocation
|
8 |
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
9 |
torch.cuda.empty_cache()
|
10 |
+
torch.cuda.set_per_process_memory_fraction(0.5) # Adjust the fraction as needed
|
11 |
|
12 |
# Define device
|
13 |
device = "cuda" # The device to load the model onto
|
|
|
# Hugging Face model id — defined once so the tokenizer and the model
# are always loaded from the same checkpoint.
MODEL_ID = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"


# Load the model and tokenizer
def hermes_model():
    """Load the CapybaraHermes-2.5 AWQ model and its tokenizer from the HF Hub.

    Returns:
        tuple: ``(model, tokenizer)`` — the causal-LM model (weights placed
        automatically across available devices via ``device_map="auto"``)
        and its matching tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # low_cpu_mem_usage avoids materializing a full extra copy of the
    # weights in host RAM during load; device_map="auto" lets accelerate
    # place/shard the weights across whatever devices are available.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, low_cpu_mem_usage=True, device_map="auto"
    )
    return model, tokenizer


model, tokenizer = hermes_model()
|