Update app.py
app.py
CHANGED
@@ -9,9 +9,11 @@ from threading import Thread
 
 print(f"Starting to load the model to memory")
 m = AutoModelForCausalLM.from_pretrained(
-    "stabilityai/stablelm-2-zephyr-1_6b", torch_dtype=torch.
+    "stabilityai/stablelm-2-zephyr-1_6b", torch_dtype=torch.float16, trust_remote_code=True)
 tok = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b", trust_remote_code=True)
-
+# using CUDA for an optimal experience
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+m = m.to(device)
 print(f"Successfully loaded the model to memory")
 
 
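This hunk casts the weights to float16 and, with the added lines, moves the model onto the GPU when one is available. An alternative worth noting: with the accelerate package installed, from_pretrained can place the weights at load time via device_map, avoiding a full CPU copy first. A minimal sketch of that variant (device_map="auto" is an assumption, not part of this commit):

import torch
from transformers import AutoModelForCausalLM

# Hypothetical alternative to the explicit m.to(device) call in this commit;
# requires the accelerate package (`pip install accelerate`).
m = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-2-zephyr-1_6b",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",  # lets accelerate place the weights across GPU/CPU
)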
@@ -31,7 +33,7 @@ def chat(message, history):
     chat.append({"role": "user", "content": message})
     messages = tok.apply_chat_template(chat, tokenize=False)
     # Tokenize the messages string
-    model_inputs = tok([messages], return_tensors="pt")
+    model_inputs = tok([messages], return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(
         tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
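This hunk moves the tokenized inputs onto the same device as the model; generate raises a RuntimeError if inputs and weights live on different devices. The diff stops at generate_kwargs = dict(, but the TextIteratorStreamer here and the from threading import Thread context in the first hunk imply the usual streaming pattern, sketched below with assumed generation parameters (max_new_tokens, do_sample, and temperature are illustrative, not from this commit):

from threading import Thread

# Generation runs in a background thread so the streamer can be consumed
# as tokens arrive; the sampling values below are assumptions.
generate_kwargs = dict(
    model_inputs,              # BatchEncoding: input_ids + attention_mask
    streamer=streamer,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
)
t = Thread(target=m.generate, kwargs=generate_kwargs)
t.start()

partial_text = ""
for new_text in streamer:      # blocks until the next chunk, up to timeout=10.
    partial_text += new_text
    yield partial_text         # inside chat(), so each partial reply streams out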