Ozaii committed on
Commit 562853a · verified · 1 Parent(s): 696e8bc

Update app.py

Files changed (1):
  1. app.py +28 -29
app.py CHANGED
@@ -1,7 +1,6 @@
 import torch
-from unsloth import FastLanguageModel
-from peft import PeftConfig
-from transformers import AutoTokenizer, TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from peft import PeftConfig, PeftModel
 from threading import Thread
 import gradio as gr
 import spaces
@@ -10,38 +9,38 @@ MODEL_PATH = "Ozaii/zephyr-bae"
 BASE_MODEL = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
 max_seq_length = 2048
 
-print("Attempting to load Zephyr... Cross your fingers! 🤞")
+print("Zephyr is getting ready to charm! 🌟")
+
+model = None
+tokenizer = None
 
 @spaces.GPU
 def load_model():
-    try:
-        peft_config = PeftConfig.from_pretrained(MODEL_PATH)
-
-        model, tokenizer = FastLanguageModel.from_pretrained(
-            model_name=BASE_MODEL,
-            max_seq_length=max_seq_length,
-            dtype=None,  # Auto-detect
-            load_in_4bit=True,
-        )
-
-        model = FastLanguageModel.get_peft_model(
-            model,
-            peft_config=peft_config,
-            adapter_name="default",
-            use_gradient_checkpointing=True,
-        )
-
-        FastLanguageModel.for_inference(model)
-        print("Zephyr loaded successfully! Time to charm!")
-        return model, tokenizer
-    except Exception as e:
-        print(f"Oops! Zephyr seems to be playing hide and seek. Error: {str(e)}")
-        raise
-
-model, tokenizer = load_model()
+    global model, tokenizer
+    if model is None:
+        try:
+            peft_config = PeftConfig.from_pretrained(MODEL_PATH)
+
+            base_model = AutoModelForCausalLM.from_pretrained(
+                BASE_MODEL,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                load_in_4bit=True
+            )
+
+            model = PeftModel.from_pretrained(base_model, MODEL_PATH)
+            tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+            tokenizer.pad_token = tokenizer.eos_token
+
+            print("Zephyr loaded successfully! Time to charm!")
+        except Exception as e:
+            print(f"Oops! Zephyr seems to be playing hide and seek. Error: {str(e)}")
+            raise
+    return model, tokenizer
 
 @spaces.GPU
 def generate_response(prompt, max_new_tokens=128):
+    model, tokenizer = load_model()
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(
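
Note: the hunk ends mid-statement at generation_kwargs = dict(, so the rest of
generate_response is not visible in this diff. As a minimal sketch only, the
standard TextIteratorStreamer pattern is usually completed along these lines as a
continuation of the function body; the sampling parameters below are illustrative
assumptions, not values taken from this repository:

    # Sketch of the usual continuation (assumed, not from this commit).
    generation_kwargs = dict(
        **inputs,                          # tensors from the tokenizer call above
        streamer=streamer,                 # streams decoded text as it is generated
        max_new_tokens=max_new_tokens,
        do_sample=True,                    # assumed: sampling decode
        temperature=0.7,                   # assumed value
        pad_token_id=tokenizer.eos_token_id,
    )
    # model.generate blocks until done, so it runs in the already-imported Thread
    # while the streamer is consumed here for incremental output.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    response = ""
    for new_text in streamer:              # yields text chunks as they arrive
        response += new_text
        yield response                     # incremental updates for a Gradio UI
    thread.join()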