Spaces:
Runtime error
Runtime error
mounseflit
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import torch
|
2 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
3 |
from peft import PeftModel
|
4 |
import gradio as gr
|
5 |
|
@@ -10,10 +10,17 @@ fine_tuned_model = "mounseflit/falcon-7b-marrakech"
|
|
10 |
# Load tokenizer
|
11 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
12 |
|
13 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
base_model = AutoModelForCausalLM.from_pretrained(
|
15 |
model_name,
|
16 |
-
|
17 |
device_map="auto", # Auto device map for offloading
|
18 |
offload_folder="offload", # Offload large parts of the model to disk
|
19 |
offload_state_dict=True # Enable state dict offloading to reduce memory usage
|
|
|
1 |
import torch
|
2 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
3 |
from peft import PeftModel
|
4 |
import gradio as gr
|
5 |
|
|
|
10 |
# Load tokenizer
|
11 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
12 |
|
13 |
+
# Set up 8-bit quantization config
|
14 |
+
quantization_config = BitsAndBytesConfig(
|
15 |
+
load_in_8bit=True, # Use 8-bit quantization
|
16 |
+
llm_int8_threshold=6.0, # Adjust for better precision
|
17 |
+
llm_int8_has_fp16_weight=False # More compatible for CPU usage
|
18 |
+
)
|
19 |
+
|
20 |
+
# Load base model with 8-bit quantization and CPU offloading
|
21 |
base_model = AutoModelForCausalLM.from_pretrained(
|
22 |
model_name,
|
23 |
+
quantization_config=quantization_config, # Pass quantization config
|
24 |
device_map="auto", # Auto device map for offloading
|
25 |
offload_folder="offload", # Offload large parts of the model to disk
|
26 |
offload_state_dict=True # Enable state dict offloading to reduce memory usage
|