import gradio as gr

import os, sys
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline
from transformers import LlamaTokenizer
import torch
import spaces
import psutil

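# Gradio Space that serves an experimental KPhi-3 checkpoint through a streaming ChatInterface.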
# Define the model repository
REPO_NAME = 'schuler/experimental-JP47D20'
# REPO_NAME = 'schuler/experimental-JP47D21-KPhi-3-micro-4k-instruct'

# TODO: work out how to cache the loaded model between GPU allocations.
@spaces.GPU()
def load_model(repo_name):
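    # Download (or reuse the locally cached copy of) the tokenizer, generation config and model weights.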
    # tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
    tokenizer = LlamaTokenizer.from_pretrained(repo_name, trust_remote_code=True)
    generator_conf = GenerationConfig.from_pretrained(repo_name)
    model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True, torch_dtype=torch.bfloat16, attn_implementation="eager")
    # model.to('cuda')
    return tokenizer, generator_conf, model

# tokenizer, generator_conf, model, generator = False, False, False, False 
# with gr.Blocks() as main_block:

tokenizer, generator_conf, model = load_model(REPO_NAME)
global_error = ''
try:
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
except Exception as e:
  global_error = f"Failed to create the text-generation pipeline: {str(e)}"

@spaces.GPU()
def local_generate(
    prompt,
    generation_config,
    max_new_tokens,
    do_sample=True,
    top_p=0.25,
    repetition_penalty=1.2,
    temperature=1.0
):
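    # Run the text-generation pipeline and return only the text produced after the prompt.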
    response_output = generator(
        prompt,
        generation_config=generation_config,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        temperature=temperature
    )        
    generated_text = response_output[0]['generated_text']
    # Extract the assistant's response
    result = generated_text[len(prompt):]    
    return result

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
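    # Rebuild the whole conversation as a single <|user|>/<|assistant|> prompt and stream the reply in chunks.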
    result = 'none'
    try:
        # Build the conversation prompt
        prompt = ''
        messages = []
        if len(system_message) > 0:
            prompt = f"<|assistant|>{system_message}<|end|>\n"
        for val in history:
            if val[0]:
                messages.append({"role": "user", "content": val[0]})
            if val[1]:
                messages.append({"role": "assistant", "content": val[1]})
    
        messages.append({"role": "user", "content": message})
    
        for hmessage in messages:
            role = "<|assistant|>" if hmessage['role'] == 'assistant' else "<|user|>"
            prompt += f"{role}{hmessage['content']}<|end|>"        
        prompt += f"<|assistant|>"

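        # Generate in increments of tokens_inc new tokens, yielding partial results so the UI streams;
        # stop once max_tokens is reached or the model returns an empty continuation.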
        tokens_cnt = 0
        tokens_inc = 64
        last_token_len = 1
        full_result = ''
        while (tokens_cnt < max_tokens) and (last_token_len > 0):
            # Generate the response
            result = local_generate(
                prompt,
                generation_config=generator_conf,
                max_new_tokens=tokens_inc,
                do_sample=True,
                top_p=top_p,
                repetition_penalty=1.2,
                temperature=temperature
            )
            full_result = full_result + result
            prompt = prompt + result
            tokens_cnt = tokens_cnt + tokens_inc
            last_token_len = len(result)
            yield full_result
        
    except Exception as error:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # exc_type is a class and tb_lineno is an int, so format them instead of concatenating strings.
        result = f"{error}:{exc_type.__name__}:{fname}:{exc_tb.tb_lineno}"
        yield result

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
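# Assumption: the *2 factor treats the output projection as the same size as the input embedding.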
embed_params = sum(p.numel() for p in model.model.embed_tokens.parameters())*2
non_embed_params = (trainable_params - embed_params) / 1e6
cpu_usage = psutil.cpu_percent(interval=1)
status_text = \
  f"This chat uses the {REPO_NAME} model with {model.get_memory_footprint() / 1e6:.2f} MB memory footprint. " + \
  f"Current CPU usage is {cpu_usage:.2f}% . '" + \
  f"Total number of non embedding trainable parameters: {non_embed_params:.2f} million. " + \
  f"You may ask questions such as 'What is biology?' or 'What is the human body?'"

# """
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="" + global_error, label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.25,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    description=status_text
)
"""
with gr.Blocks() as demo:
    # Display the status text at the top
    gr.Markdown(status_text)
    # Create the ChatInterface
    chat = gr.ChatInterface(
        respond,
        additional_inputs=[
            gr.Textbox(value="" + global_error, label="System message"),
            gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens"),
            gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Temperature"),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.25,
                step=0.05,
                label="Top-p (nucleus sampling)",
            ),
        ],
    )
"""

if __name__ == "__main__":
    demo.launch()