mounseflit committed
Commit ffbc1c3 · verified · 1 Parent(s): 52673bb

Update app.py

Files changed (1): app.py (+6 -14)
app.py CHANGED

@@ -1,5 +1,5 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 import gradio as gr
 
@@ -10,23 +10,15 @@ fine_tuned_model = "mounseflit/falcon-7b-marrakech"
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-# Set up 8-bit quantization config
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True,              # Use 8-bit quantization
-    llm_int8_threshold=6.0,         # Adjust for better precision
-    llm_int8_has_fp16_weight=False  # More compatible for CPU usage
-)
-
-# Load base model with 8-bit quantization and CPU offloading
+# Load base model (CPU-only, no quantization)
 base_model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    quantization_config=quantization_config,  # Pass quantization config
-    device_map="auto",         # Auto device map for offloading
-    offload_folder="offload",  # Offload large parts of the model to disk
+    device_map="auto",         # Auto device map for CPU
+    offload_folder="offload",  # Offload large parts of the model to disk to save memory
     offload_state_dict=True    # Enable state dict offloading to reduce memory usage
 )
 
-# Load the fine-tuned LoRA model on top of the quantized model
+# Load the fine-tuned LoRA model on top of the base model
 model = PeftModel.from_pretrained(base_model, fine_tuned_model)
 
 # Ensure the model is in evaluation mode
@@ -40,7 +32,7 @@ def generate_text(prompt):
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # Create Gradio interface
-iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite")
+iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite (CPU)")
 
 # Launch the app
 iface.launch()
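
For context, a minimal sketch of app.py as it would stand after this commit. Only the lines visible in the diff above are confirmed; the model_name value (guessed as "tiiuae/falcon-7b" from the adapter's name) and the body of generate_text (assumed to be a typical tokenize/generate/decode sequence ending in the decode call shown in the diff) are assumptions, not part of the commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr

# Assumption: the base checkpoint is not shown in the diff; "tiiuae/falcon-7b"
# is a guess based on the adapter repo name.
model_name = "tiiuae/falcon-7b"
fine_tuned_model = "mounseflit/falcon-7b-marrakech"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load base model (CPU-only, no quantization)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",         # Auto device map for CPU
    offload_folder="offload",  # Offload large parts of the model to disk to save memory
    offload_state_dict=True    # Enable state dict offloading to reduce memory usage
)

# Load the fine-tuned LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, fine_tuned_model)

# Ensure the model is in evaluation mode
model.eval()

def generate_text(prompt):
    # Assumption: the function body is not visible in the diff; this is a
    # standard tokenize -> generate -> decode loop.
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Create Gradio interface
iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite (CPU)")

# Launch the app
iface.launch()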