File size: 6,408 Bytes
a9f41ef
 
 
 
 
 
 
 
 
 
abc8209
23c4eea
4b7c7df
abc8209
 
a9f41ef
 
 
 
 
 
4b7c7df
a9f41ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab25562
5275314
a9f41ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b056bd3
 
 
 
 
 
 
a9f41ef
 
 
b056bd3
 
 
 
 
 
 
 
 
 
 
a9f41ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab25562
a9f41ef
 
 
 
 
b056bd3
a9f41ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import gradio as gr

import copy
import random
import os
import requests
import time
import sys

from huggingface_hub import snapshot_download

os.system("pip install --upgrade pip")
os.system('''CMAKE_ARGS="-DLLAMA_AVX512=ON -DLLAMA_AVX512_VBMI=ON -DLLAMA_AVX512_VNNI=ON -DLLAMA_FP16_VA=ON" pip install llama-cpp-python''')


from llama_cpp import Llama


SYSTEM_PROMPT = '''You are a helpful, respectful and honest INTP-T AI Assistant named "Shi-Ci" in English or "兮辞" in Chinese.
You are good at speaking English and Chinese.
You are talking to a human User. If the question is meaningless, please explain the reason and don't share false information.
You are based on SEA-CausalLM model, not related to GPT, LLaMA, Meta, Mistral or OpenAI.
Let's work this out in a step by step way to be sure we have the right answer.\n\n'''
SYSTEM_TOKEN = 1587
USER_TOKEN = 2188
BOT_TOKEN = 12435
LINEBREAK_TOKEN = 13


ROLE_TOKENS = {
    "user": USER_TOKEN,
    "bot": BOT_TOKEN,
    "system": SYSTEM_TOKEN
}


def get_message_tokens(model, role, content):
    message_tokens = model.tokenize(content.encode("utf-8"))
    message_tokens.insert(1, ROLE_TOKENS[role])
    message_tokens.insert(2, LINEBREAK_TOKEN)
    message_tokens.append(model.token_eos())
    return message_tokens


def get_system_tokens(model):
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    return get_message_tokens(model, **system_message)


repo_name = "TheBloke/CausalLM-14B-GGUF"
model_name = "causallm_14b.Q4_1.gguf"

snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)

model = Llama(
    model_path=model_name,
    n_ctx=2000,
    n_parts=1,
)

max_new_tokens = 1500

def user(message, history):
    new_history = history + [[message, None]]
    return "", new_history


def bot(
    history,
    system_prompt,
    top_p,
    top_k,
    temp
):
    tokens = get_system_tokens(model)[:]
    tokens.append(LINEBREAK_TOKEN)

    for user_message, bot_message in history[:-1]:
        message_tokens = get_message_tokens(model=model, role="user", content=user_message)
        tokens.extend(message_tokens)
        if bot_message:
            message_tokens = get_message_tokens(model=model, role="bot", content=bot_message)
            tokens.extend(message_tokens)

    last_user_message = history[-1][0]
    message_tokens = get_message_tokens(model=model, role="user", content=last_user_message)
    tokens.extend(message_tokens)

    role_tokens = [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]
    tokens.extend(role_tokens)
    generator = model.generate(
        tokens,
        top_k=top_k,
        top_p=top_p,
        temp=temp
    )

    partial_text = ""
    for i, token in enumerate(generator):
        if token == model.token_eos() or (max_new_tokens is not None and i >= max_new_tokens):
            break
        partial_text += model.detokenize([token]).decode("utf-8", "ignore")
        history[-1][1] = partial_text
        yield history

with gr.Blocks(
    theme=gr.themes.Soft()
) as demo:
    gr.Markdown(f"""<h1><center>上师附外-兮辞·CausalLM-人工智能助理</center></h1>""")
    gr.Markdown(value="""欢迎使用!
        这里是一个ChatBot。这是 CausalLM/14B 的部署,具有 140亿 个参数,正在 CPU 上运行。
        CausalLM/14B 是一种会话语言模型,在多种类型的语料库上进行训练。
        本节目由 JWorld & 上海师范大学附属外国语中学 NLPark 赞助播出
        特别鸣谢 CausalLM 团队提供的如此优秀的模型,以及 TheBloke 提供的量化""")
    
    with gr.Row():
        with gr.Column(scale=5):
            chatbot = gr.Chatbot(label="兮辞如是说").style(height=400)
    with gr.Row():
        with gr.Column():
            msg = gr.Textbox(
                label="来问问兮辞吧……",
                placeholder="兮辞折寿中……",
                show_label=True,
            ).style(container=True)
            submit = gr.Button("Submit / 开凹!")
            stop = gr.Button("Stop / 全局时空断裂")
            clear = gr.Button("Clear / 打扫群内垃圾")
    with gr.Row():
        with gr.Column(min_width=80, scale=1):
            with gr.Tab(label="设置参数"):
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    interactive=True,
                    label="Top-p",
                )
                top_k = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=30,
                    step=5,
                    interactive=True,
                    label="Top-k",
                )
                temp = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    value=0.2,
                    step=0.01,
                    interactive=True,
                    label="情感温度"
                )
        with gr.Column():
            system_prompt = gr.Textbox(label="系统提示词", placeholder="", value=SYSTEM_PROMPT, interactive=False)
    with gr.Row():
        gr.Markdown(
            """警告:该模型可能会生成事实上或道德上不正确的文本。NLPark和兮辞对此不承担任何责任。"""
        )

    # Pressing Enter
    submit_event = msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    ).success(
        fn=bot,
        inputs=[
            chatbot,
            system_prompt,
            top_p,
            top_k,
            temp
        ],
        outputs=chatbot,
        queue=True,
    )

    # Pressing the button
    submit_click_event = submit.click(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    ).success(
        fn=bot,
        inputs=[
            chatbot,
            system_prompt,
            top_p,
            top_k,
            temp
        ],
        outputs=chatbot,
        queue=True,
    )

    # Stop generation
    stop.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[submit_event, submit_click_event],
        queue=False,
    )

    # Clear history
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue(max_size=128, concurrency_count=1)
demo.launch()