mounseflit committed
Commit ffbc1c3 · verified · 1 Parent(s): 52673bb

Update app.py

Files changed (1): app.py (+6 -14)
app.py CHANGED

@@ -1,5 +1,5 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 import gradio as gr
 
@@ -10,23 +10,15 @@ fine_tuned_model = "mounseflit/falcon-7b-marrakech"
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-# Set up 8-bit quantization config
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True,              # Use 8-bit quantization
-    llm_int8_threshold=6.0,         # Adjust for better precision
-    llm_int8_has_fp16_weight=False  # More compatible for CPU usage
-)
-
-# Load base model with 8-bit quantization and CPU offloading
+# Load base model (CPU-only, no quantization)
 base_model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    quantization_config=quantization_config,  # Pass quantization config
-    device_map="auto",         # Auto device map for offloading
-    offload_folder="offload",  # Offload large parts of the model to disk
+    device_map="auto",         # Auto device map for CPU
+    offload_folder="offload",  # Offload large parts of the model to disk to save memory
     offload_state_dict=True    # Enable state dict offloading to reduce memory usage
 )
 
-# Load the fine-tuned LoRA model on top of the quantized model
+# Load the fine-tuned LoRA model on top of the base model
 model = PeftModel.from_pretrained(base_model, fine_tuned_model)
 
 # Ensure the model is in evaluation mode
@@ -40,7 +32,7 @@ def generate_text(prompt):
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # Create Gradio interface
-iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite")
+iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite (CPU)")
 
 # Launch the app
 iface.launch()
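
For context, a minimal sketch of app.py as it would stand after this commit. Only the lines visible in the diff above are confirmed; the model_name value (guessed as "tiiuae/falcon-7b" from the adapter's name) and the body of generate_text (assumed to be a typical tokenize/generate/decode sequence ending in the decode call shown in the diff) are assumptions, not part of the commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr

# Assumption: the base checkpoint is not shown in the diff; "tiiuae/falcon-7b"
# is a guess based on the adapter repo name.
model_name = "tiiuae/falcon-7b"
fine_tuned_model = "mounseflit/falcon-7b-marrakech"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load base model (CPU-only, no quantization)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",         # Auto device map for CPU
    offload_folder="offload",  # Offload large parts of the model to disk to save memory
    offload_state_dict=True    # Enable state dict offloading to reduce memory usage
)

# Load the fine-tuned LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, fine_tuned_model)

# Ensure the model is in evaluation mode
model.eval()

def generate_text(prompt):
    # Assumption: the function body is not visible in the diff; this is a
    # standard tokenize -> generate -> decode loop.
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Create Gradio interface
iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite (CPU)")

# Launch the app
iface.launch()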