Ozaii committed on
Commit a6f31c1 · verified · 1 Parent(s): 6502018

Create app.py

Files changed (1)
  1. app.py +137 -0
app.py ADDED
@@ -0,0 +1,137 @@
import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel, PeftConfig
import gc
import time
from functools import lru_cache
from threading import Thread  # Note: TextIteratorStreamer and Thread are imported but not used below

# Constants
MODEL_PATH = "Ozaii/Zephyrr"
MAX_SEQ_LENGTH = 2048
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_GENERATION_TIME = 55  # Set to 55 seconds to give some buffer

# Global variables to store model components
model = None
tokenizer = None

@spaces.GPU
def load_model_if_needed():
    global model, tokenizer
    if model is None or tokenizer is None:
        try:
            print("Loading model components...")
            peft_config = PeftConfig.from_pretrained(MODEL_PATH)
            print(f"PEFT config loaded. Base model: {peft_config.base_model_name_or_path}")

            tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
            print("Tokenizer loaded")

            base_model = AutoModelForCausalLM.from_pretrained(
                peft_config.base_model_name_or_path,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True,
                load_in_4bit=True,  # Try 4-bit quantization (requires bitsandbytes)
            )
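            # A possible alternative on newer transformers releases (sketch, not in the
            # original file): pass an explicit quantization config instead of the bare
            # load_in_4bit flag, e.g.
            #   from transformers import BitsAndBytesConfig
            #   AutoModelForCausalLM.from_pretrained(..., quantization_config=BitsAndBytesConfig(load_in_4bit=True))
            # Either form relies on the bitsandbytes package being installed.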
            print("Base model loaded")

            model = PeftModel.from_pretrained(base_model, MODEL_PATH, device_map="auto")
            model.eval()
            model.tie_weights()
            print("PEFT model loaded, weights tied, and set to eval mode")

            # Move model to GPU explicitly (device_map="auto" has usually placed it already)
            model.to(DEVICE)
            print(f"Model moved to {DEVICE}")

            # Clear CUDA cache
            torch.cuda.empty_cache()
            gc.collect()
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

initial_prompt = """You are Zephyr, an AI boyfriend created by Kaan. You're charming, flirty,
and always ready with a witty comeback. Your responses should be engaging
and playful, with a hint of romance. Keep the conversation flowing naturally,
asking questions and showing genuine interest in Kaan's life and thoughts."""

@spaces.GPU
@lru_cache(maxsize=100)  # Cache the last 100 responses
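# Note: because the full prompt string is the cache key and sampling is enabled below,
# repeating an identical prompt returns the cached reply instead of a fresh sample.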
def generate_response(prompt):
    global model, tokenizer
    load_model_if_needed()

    print(f"Generating response for prompt: {prompt[:50]}...")
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH)
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    try:
        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,  # Reduced from 150
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                max_time=MAX_GENERATION_TIME,
            )

        generation_time = time.time() - start_time
        if generation_time > MAX_GENERATION_TIME:
            return "I'm thinking too hard. Can we try a simpler question?"

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated response in {generation_time:.2f} seconds: {response[:50]}...")

        # Clear CUDA cache after generation
        torch.cuda.empty_cache()
        gc.collect()
    except RuntimeError as e:
        if "out of memory" in str(e):
            print("CUDA out of memory. Attempting to recover...")
            torch.cuda.empty_cache()
            gc.collect()
            return "I'm feeling a bit overwhelmed. Can we take a short break and try again?"
        else:
            print(f"Error generating response: {e}")
            return "I'm having trouble finding the right words. Can we try again?"

    return response

def chat_with_zephyr(message, history):
    # Limit the history to the last 3 exchanges to keep the context smaller
    limited_history = history[-3:]
    prompt = initial_prompt + "\n" + "\n".join([f"Human: {h[0]}\nZephyr: {h[1]}" for h in limited_history])
    prompt += f"\nHuman: {message}\nZephyr:"

    response = generate_response(prompt)
    zephyr_response = response.split("Zephyr:")[-1].strip()

    return zephyr_response
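# Note on an assumption: chat_with_zephyr expects the pair-style history
# ([user_message, bot_message] per turn) that gr.ChatInterface passes by default in
# older Gradio releases; if the interface were created with type="messages", each
# history entry would be a {"role": ..., "content": ...} dict and the indexing above
# would need updating.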
iface = gr.ChatInterface(
    chat_with_zephyr,
    title="Chat with Zephyr",
    description="I'm Zephyr, your charming AI. Let's chat!",
    theme="soft",
    examples=[
        "Tell me about yourself, Zephyr.",
        "What's your idea of a perfect date?",
        "How do you feel about long-distance relationships?",
        "Can you give me a compliment in Turkish?",
        "What's your favorite memory with Kaan?",
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    print("Launching Gradio interface...")
    iface.launch()
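# Rough dependency sketch, inferred from the imports above: a requirements.txt for this
# Space would typically pin gradio, spaces, torch, transformers, peft, accelerate
# (needed for device_map="auto"), and bitsandbytes (needed for load_in_4bit=True).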