mounseflit committed
Commit 52673bb · verified · 1 Parent(s): ab52c4d

Update app.py

Files changed (1): app.py (+10, -3)
app.py CHANGED

@@ -1,5 +1,5 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel
 import gradio as gr
 
@@ -10,10 +10,17 @@ fine_tuned_model = "mounseflit/falcon-7b-marrakech"
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-# Load base model with 8-bit precision and offload to CPU
+# Set up 8-bit quantization config
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,               # Use 8-bit quantization
+    llm_int8_threshold=6.0,          # Adjust for better precision
+    llm_int8_has_fp16_weight=False   # More compatible for CPU usage
+)
+
+# Load base model with 8-bit quantization and CPU offloading
 base_model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    load_in_8bit=True,               # Quantization to 8-bit
+    quantization_config=quantization_config,  # Pass quantization config
    device_map="auto",               # Auto device map for offloading
    offload_folder="offload",        # Offload large parts of the model to disk
    offload_state_dict=True          # Enable state dict offloading to reduce memory usage
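Note on the change: recent transformers releases deprecate passing load_in_8bit directly to from_pretrained in favor of a quantization_config=BitsAndBytesConfig(...) argument, which is exactly the migration this commit performs. The hunk ends inside the from_pretrained(...) call, so the closing parenthesis, the adapter attachment, and the Gradio wiring are outside the diff. A minimal sketch of how the rest of app.py plausibly continues, assuming the usual PEFT-plus-Gradio pattern; the generate() helper, max_new_tokens value, and interface settings below are illustrative assumptions, not part of the commit:

# Assumed continuation of app.py (not shown in the diff):
# attach the LoRA adapter and expose the model through a Gradio text interface.
model = PeftModel.from_pretrained(base_model, fine_tuned_model)
model.eval()

def generate(prompt):
    # Tokenize the prompt and move it to the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200)  # illustrative setting
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

gr.Interface(fn=generate, inputs="text", outputs="text").launch()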