File size: 2,226 Bytes
bd0305e
 
015885c
bd0305e
 
 
d14c800
015885c
 
1019a35
d14c800
00572fe
6f60281
040b554
 
 
1019a35
 
26dd269
8a546d4
c58f313
 
14e5da3
c58f313
 
 
e4f8042
8a546d4
 
 
 
9b522e7
e4f8042
d9b8972
14e5da3
040b554
c58f313
 
015885c
 
 
 
 
 
 
8a546d4
015885c
 
 
 
 
14e5da3
 
015885c
c58f313
14e5da3
 
d0ede55
1019a35
 
14e5da3
a1cf708
b373db2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import time
import numpy as np
from torch.nn import functional as F
import os
from threading import Thread

print(f"Starting to load the model to memory")
m = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-2-zephyr-1_6b", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, trust_remote_code=True)
tok = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b", trust_remote_code=True)
# using CUDA for an optimal experience
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
m = m.to(device)
print(f"Sucessfully loaded the model to the memory")


start_message = ""

def user(message, history):
    # Append the user's message to the conversation history
    return "", history + [[message, ""]]


def chat(message, history):
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": message})
    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # Tokenize the messages string
    model_inputs = tok([messages], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=0.75,
        num_beams=1,
    )
    t = Thread(target=m.generate, kwargs=generate_kwargs)
    t.start()

    # Initialize an empty string to store the generated text
    partial_text = ""
    for new_text in streamer:
        # print(new_text)
        partial_text += new_text
        # Yield an empty string to cleanup the message textbox and the updated conversation history
        yield partial_text



demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Stable LM 2 Zephyr 1.6b")
demo.launch()