Spaces:
Runtime error
Runtime error
mounseflit
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import torch
|
2 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
3 |
from peft import PeftModel
|
4 |
import gradio as gr
|
5 |
|
@@ -10,23 +10,15 @@ fine_tuned_model = "mounseflit/falcon-7b-marrakech"
|
|
10 |
# Load tokenizer
|
11 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
12 |
|
13 |
-
#
|
14 |
-
quantization_config = BitsAndBytesConfig(
|
15 |
-
load_in_8bit=True, # Use 8-bit quantization
|
16 |
-
llm_int8_threshold=6.0, # Adjust for better precision
|
17 |
-
llm_int8_has_fp16_weight=False # More compatible for CPU usage
|
18 |
-
)
|
19 |
-
|
20 |
-
# Load base model with 8-bit quantization and CPU offloading
|
21 |
base_model = AutoModelForCausalLM.from_pretrained(
|
22 |
model_name,
|
23 |
-
|
24 |
-
|
25 |
-
offload_folder="offload", # Offload large parts of the model to disk
|
26 |
offload_state_dict=True # Enable state dict offloading to reduce memory usage
|
27 |
)
|
28 |
|
29 |
-
# Load the fine-tuned LoRA model on top of the
|
30 |
model = PeftModel.from_pretrained(base_model, fine_tuned_model)
|
31 |
|
32 |
# Ensure the model is in evaluation mode
|
@@ -40,7 +32,7 @@ def generate_text(prompt):
|
|
40 |
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
41 |
|
42 |
# Create Gradio interface
|
43 |
-
iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite")
|
44 |
|
45 |
# Launch the app
|
46 |
iface.launch()
|
|
|
1 |
import torch
|
2 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
3 |
from peft import PeftModel
|
4 |
import gradio as gr
|
5 |
|
|
|
10 |
# Load tokenizer
|
11 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
12 |
|
13 |
+
# Load base model (CPU-only, no quantization)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
base_model = AutoModelForCausalLM.from_pretrained(
|
15 |
model_name,
|
16 |
+
device_map="auto", # Auto device map for CPU
|
17 |
+
offload_folder="offload", # Offload large parts of the model to disk to save memory
|
|
|
18 |
offload_state_dict=True # Enable state dict offloading to reduce memory usage
|
19 |
)
|
20 |
|
21 |
+
# Load the fine-tuned LoRA model on top of the base model
|
22 |
model = PeftModel.from_pretrained(base_model, fine_tuned_model)
|
23 |
|
24 |
# Ensure the model is in evaluation mode
|
|
|
32 |
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
33 |
|
34 |
# Create Gradio interface
|
35 |
+
iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite (CPU)")
|
36 |
|
37 |
# Launch the app
|
38 |
iface.launch()
|