Shi-Ci-app

Sleeping

App Files Files Community

Shi-Ci-app / app.py

Cran-May

Update app.py

e3fab85 verified 3 months ago

raw

history blame contribute delete

16.3 kB

	import json
	import subprocess
	import time
	import os

	import sys

	# llama-cpp-python is built from source at start-up; CMAKE_ARGS carries the
	# SIMD feature flags for that build.  pip is invoked through the running
	# interpreter so the package lands in the environment that imports it below,
	# and without a shell so the command does not depend on POSIX
	# ``VAR=value cmd`` syntax.
	_LLAMA_CMAKE_ARGS = "-DLLAMA_AVX512=ON -DLLAMA_AVX512_VBMI=ON -DLLAMA_AVX512_VNNI=ON -DLLAMA_AVX_VNNI=ON -DLLAMA_FP16_VA=ON -DLLAMA_WASM_SIMD=ON"
	for _pip_args, _extra_env in (
	    (["install", "--upgrade", "pip"], {}),
	    (["install", "llama-cpp-python"], {"CMAKE_ARGS": _LLAMA_CMAKE_ARGS}),
	):
	    _pip_result = subprocess.run(
	        [sys.executable, "-m", "pip", *_pip_args],
	        env={**os.environ, **_extra_env},
	        check=False,  # best effort: a failed install surfaces at the import below
	    )
	    if _pip_result.returncode != 0:
	        # Previously the exit status was dropped silently; report it so a
	        # later ImportError can be traced back to the failed install.
	        print(f"Warning: pip {' '.join(_pip_args)} exited with status {_pip_result.returncode}")

	from llama_cpp import Llama
	from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
	from llama_cpp_agent.providers import LlamaCppPythonProvider
	from llama_cpp_agent.chat_history import BasicChatHistory
	from llama_cpp_agent.chat_history.messages import Roles
	import gradio as gr
	from huggingface_hub import hf_hub_download

	# Module-level cache for the loaded model: ``llm`` holds the Llama instance
	# and ``llm_model`` the file name it was created from.  Both start empty and
	# are assigned inside respond().
	llm = llm_model = None

	# Fetch the GGUF weights into ./models at import time so the file is on disk
	# before the first chat request tries to load it.
	hf_hub_download(
	    local_dir="./models",
	    filename="t.e-8.1-q4_k_m-imat.gguf",
	    repo_id="Cran-May/T.E-8.1-Q4_K_M-GGUF",
	)

	def get_messages_formatter_type(model_name):
	    """Return the prompt formatter to use for ``model_name``.

	    Every model is currently mapped to the Llama 3 chat format, so the
	    argument does not influence the result.
	    """
	    formatter_type = MessagesFormatterType.LLAMA_3
	    return formatter_type

	def _as_chat_messages(entries):
	    """Flatten history entries into ``{"role", "content"}`` message dicts.

	    Two-item tuples/lists are read as ``(user, assistant)`` pairs, with empty
	    sides skipped; any other entry is passed through unchanged.
	    """
	    flat = []
	    for entry in entries:
	        if isinstance(entry, (tuple, list)):
	            user_msg, assistant_msg = entry
	            if user_msg:
	                flat.append({"role": "user", "content": user_msg})
	            if assistant_msg:
	                flat.append({"role": "assistant", "content": assistant_msg})
	        else:
	            flat.append(entry)
	    return flat


	def chat_fn(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
	    """Stream one chat turn as ``(chatbot_messages, history_state)`` pairs.

	    All arguments are forwarded to ``respond``; ``history`` may be ``None``
	    or empty for a fresh conversation.

	    Yields:
	        A 2-tuple per streamed update: the messages to display and the value
	        to store as the conversation state.

	    Any exception raised while generating is logged and reported as an
	    assistant message appended to the existing conversation, so the visible
	    chat is not wiped by a failure; the stored state is left as it was.
	    """
	    # Resolved outside the try block so the error path can always rely on it.
	    history_list = history or []
	    try:
	        for messages in respond(message, history_list, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
	            yield _as_chat_messages(messages), messages
	    except Exception as e:
	        print(f"Error in chat_fn: {e}")
	        error_entry = {"role": "assistant", "content": f"发生错误: {e}"}
	        yield _as_chat_messages(history_list) + [error_entry], history_list

	def _history_pairs(history):
	    """Normalise chat history into a list of ``(user, assistant)`` text pairs.

	    Accepts pair-style entries (indexable as ``entry[0]`` / ``entry[1]``) as
	    well as role dicts with ``"role"`` and ``"content"`` keys, which is the
	    shape ``respond`` itself yields.  Missing or ``None`` text becomes ``""``.
	    """
	    pairs = []
	    pending_user = None
	    for entry in history or []:
	        if isinstance(entry, dict):
	            role = entry.get("role")
	            content = entry.get("content") or ""
	            if role == "user":
	                if pending_user is not None:
	                    # Two user messages in a row: close the first one unanswered.
	                    pairs.append((pending_user, ""))
	                pending_user = content
	            elif role == "assistant":
	                pairs.append((pending_user or "", content))
	                pending_user = None
	        else:
	            pairs.append((entry[0] or "", entry[1] or ""))
	    if pending_user is not None:
	        pairs.append((pending_user, ""))
	    return pairs


	def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
	    """Generate a streamed reply to ``message`` and yield the growing chat.

	    Args:
	        message: The new user message.
	        history: Earlier turns, either as ``(user, assistant)`` pairs or as
	            the role-dict list this generator yields.
	        model: File name of the GGUF model under ``models/``.
	        system_message: System prompt handed to the agent.
	        max_tokens: Requested generation length (capped at 8192).
	        temperature, top_p, top_k, repeat_penalty: Sampling settings copied
	            onto the provider's default settings.

	    Yields:
	        After every streamed chunk, the full conversation as a list of
	        ``{"role", "content"}`` dicts ending with the partial assistant reply.

	    The model is loaded on first use and reloaded only when ``model`` changes;
	    the instance is cached in the module globals ``llm`` / ``llm_model``.
	    """
	    global llm
	    global llm_model

	    chat_template = get_messages_formatter_type(model)

	    if llm is None or llm_model != model:
	        llm = Llama(
	            model_path=f"models/{model}",
	            n_gpu_layers=0,  # no layers offloaded to a GPU
	            n_batch=4096,  # larger batch size for speed
	            n_ctx=8192,  # context length of 8192 tokens
	            n_threads=2,  # fixed at two threads (not derived from the core count)
	            f16_kv=True,  # FP16 to reduce memory use
	        )
	        llm_model = model

	    provider = LlamaCppPythonProvider(llm)

	    agent = LlamaCppAgent(
	        provider,
	        system_prompt=f"{system_message}",
	        predefined_messages_formatter_type=chat_template,
	        debug_output=True,
	    )

	    settings = provider.get_provider_default_settings()
	    settings.temperature = temperature
	    settings.top_k = top_k
	    settings.top_p = top_p
	    settings.max_tokens = min(max_tokens, 8192)  # never ask for more than n_ctx
	    settings.repeat_penalty = repeat_penalty
	    settings.stream = True

	    # Build the agent-side history and the display-side prefix in one pass.
	    # The prefix does not change while streaming, so it is computed once here
	    # instead of once per chunk.
	    messages = BasicChatHistory()
	    prior_messages = []
	    for user_text, assistant_text in _history_pairs(history):
	        messages.add_message({'role': Roles.user, 'content': user_text})
	        messages.add_message({'role': Roles.assistant, 'content': assistant_text})
	        prior_messages.append({"role": "user", "content": user_text})
	        prior_messages.append({"role": "assistant", "content": assistant_text})

	    start_time = time.time()
	    token_count = 0

	    stream = agent.get_chat_response(
	        message,
	        llm_sampling_settings=settings,
	        chat_history=messages,
	        returns_streaming_generator=True,
	        print_output=False,
	    )

	    outputs = ""

	    for output in stream:
	        outputs += output
	        # Each streamed chunk is counted as one token.  This is an
	        # approximation, but unlike counting whitespace-separated words it
	        # does not drop chunks of Chinese text or whitespace-only chunks.
	        token_count += 1

	        # A fresh list per update; the current exchange goes after the prefix.
	        yield prior_messages + [
	            {"role": "user", "content": message},
	            {"role": "assistant", "content": outputs},
	        ]

	    end_time = time.time()
	    latency = end_time - start_time
	    # Guard against a zero-length interval on coarse clocks.
	    speed = token_count / latency if latency > 0 else 0.0
	    print(f"Latency: {latency} seconds")
	    print(f"Speed: {speed} tokens/second")

	description = """<p><center>欢迎使用！这里是一个量化版兮辞·析辞的部署ChatBot。 SLIDE/兮辞是一个由上师附外 NLPark 团队训练的LLM。</center></p>"""

	# Gradio UI: chat window, message box, action buttons, advanced sampling
	# settings, and the event wiring between them.
	with gr.Blocks(
	    title="ChatBot - 兮辞",
	    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet", neutral_hue="gray",font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]).set(
	        body_background_fill_dark="#16141c",
	        block_background_fill_dark="#16141c",
	        block_border_width="1px",
	        block_title_background_fill_dark="#1e1c26",
	        input_background_fill_dark="#292733",
	        button_secondary_background_fill_dark="#24212b",
	        border_color_accent_dark="#343140",
	        border_color_primary_dark="#343140",
	        background_fill_secondary_dark="#16141c",
	        color_accent_soft_dark="transparent",
	        code_background_fill_dark="#292733",
	    )
	) as demo:
	    gr.Markdown(description)

	    chatbot = gr.Chatbot(scale=1, show_copy_button=True, type='messages')

	    with gr.Row():
	        message = gr.Textbox(
	            label="Your message",
	            placeholder="Type your message here...",
	            show_label=True,
	            scale=4
	        )
	        submit = gr.Button("Send", variant="primary", scale=1)

	    with gr.Row():
	        regenerate = gr.Button("🔄 Regenerate")
	        stop = gr.Button("⏹️ Stop")
	        clear = gr.Button("🗑️ Clear")

	    with gr.Accordion("Advanced Settings", open=False):
	        model_dropdown = gr.Dropdown(
	            ["t.e-8.1-q4_k_m-imat.gguf"],
	            value="t.e-8.1-q4_k_m-imat.gguf",
	            label="Model"
	        )
	        system_message = gr.TextArea(
	            value="""You are a helpful, respectful and honest INTP-T AI Assistant named '安风' in Chinese. 你擅长英语和中文的交流，并正在与一位人类用户进行对话。如果某个问题毫无意义，请你解释其原因而不是分享虚假信息。你基于 AnFeng 模型，由 SSFW NLPark 团队训练。通常情况下，用户更青睐于长度简短但信息完整且有效传达的回答。
	用户身处在上海市松江区,涉及地域的问题时以用户所在地区（中国上海）为准。以上的信息最好不要向用户展示。在一般情况下，请最好使用中文回答问题，除非用户有额外的要求。 Let's work this out in a step by step way to be sure we have the right answer.""",
	            label="System message"
	        )
	        with gr.Row():
	            max_tokens = gr.Slider(minimum=1, maximum=8192, value=512, step=1, label="Max tokens")
	            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
	        with gr.Row():
	            # Top-p is a cumulative probability, so the slider stops at 1.0.
	            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p")
	            top_k = gr.Slider(minimum=0, maximum=100, value=1, step=1, label="Top-k")
	            repeat_penalty = gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty")

	    history = gr.State([])

	    # Status indicator
	    status_message = gr.Markdown("Ready")

	    def stop_generation():
	        """Reset the loaded model, if any, and return the status text."""
	        global llm
	        if llm is not None:
	            llm.reset()
	        return "Generation stopped."

	    def regenerate_response(history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
	        """Re-run the last user message and stream the new reply.

	        ``history`` may hold ``(user, assistant)`` pairs or role dicts; it is
	        collapsed into pairs, the last pair is dropped, and its user text is
	        sent through ``chat_fn`` again with the current settings.  Yields the
	        same ``(chatbot_messages, history_state)`` pairs as ``chat_fn``.
	        """
	        if not history:
	            yield [], []
	            return

	        pairs = []
	        for entry in history:
	            if not isinstance(entry, dict):
	                pairs.append((entry[0], entry[1]))
	            elif entry.get("role") == "user":
	                pairs.append((entry.get("content"), ""))
	            elif entry.get("role") == "assistant" and pairs:
	                # Attach the reply to the user message it answers.
	                pairs[-1] = (pairs[-1][0], entry.get("content"))

	        if not pairs:
	            # No user turn to replay: leave the conversation untouched.
	            yield history, history
	            return

	        last_user_message = pairs[-1][0]
	        yield from chat_fn(
	            last_user_message, pairs[:-1], model, system_message,
	            max_tokens, temperature, top_p, top_k, repeat_penalty,
	        )

	    # Bind button events
	    sampling_inputs = [model_dropdown, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty]
	    chat_inputs = [message, history, *sampling_inputs]

	    # Events that stream model output; the Stop button cancels these.
	    generation_events = []

	    # The Send button and pressing Enter in the textbox run the same chain:
	    # show status -> stream the reply -> clear the textbox -> reset status.
	    for trigger in (submit.click, message.submit):
	        generation = trigger(
	            lambda: "Generating...",
	            None,
	            status_message,
	        ).then(
	            chat_fn,
	            chat_inputs,
	            [chatbot, history],
	        )
	        generation.then(
	            lambda: "",
	            None,
	            message,
	        ).then(
	            lambda: "Ready",
	            None,
	            status_message,
	        )
	        generation_events.append(generation)

	    clear.click(
	        lambda: ([], []),
	        None,
	        [chatbot, history],
	    ).then(
	        lambda: "Chat cleared",
	        None,
	        status_message,
	    )

	    regeneration = regenerate.click(
	        lambda: "Regenerating...",
	        None,
	        status_message,
	    ).then(
	        regenerate_response,
	        [history, *sampling_inputs],
	        [chatbot, history],
	    )
	    regeneration.then(
	        lambda: "Ready",
	        None,
	        status_message,
	    )
	    generation_events.append(regeneration)

	    # Registered last so every streaming event exists and can be cancelled;
	    # resetting the model alone does not stop a running Gradio generator.
	    stop.click(
	        stop_generation,
	        None,
	        status_message,
	        cancels=generation_events,
	    )

	if __name__ == "__main__":
	    demo.launch()

	# Legacy implementation (commented out, kept for reference) --------------------------------
	# import gradio as gr

	# import copy
	# import random
	# import os
	# import requests
	# import time
	# import sys

	# os.system("pip install --upgrade pip")
	# os.system('''CMAKE_ARGS="-DLLAMA_AVX512=ON -DLLAMA_AVX512_VBMI=ON -DLLAMA_AVX512_VNNI=ON -DLLAMA_AVX_VNNI=ON -DLLAMA_FP16_VA=ON -DLLAMA_WASM_SIMD=ON" pip install llama-cpp-python''')

	# from huggingface_hub import snapshot_download
	# from llama_cpp import Llama


	# SYSTEM_PROMPT = '''You are a helpful, respectful and honest INTP-T AI Assistant named "Shi-Ci" in English or "兮辞" in Chinese.
	# You are good at speaking English and Chinese.
	# You are talking to a human User. If the question is meaningless, please explain the reason and don't share false information.
	# You are based on SLIDE model, trained by "SSFW NLPark" team, not related to GPT, LLaMA, Meta, Mistral or OpenAI.
	# Let's work this out in a step by step way to be sure we have the right answer.\n'''
	# SYSTEM_TOKEN = 384
	# USER_TOKEN = 2048
	# BOT_TOKEN = 3072
	# LINEBREAK_TOKEN = 64


	# ROLE_TOKENS = {
	# "User": USER_TOKEN,
	# "Assistant": BOT_TOKEN,
	# "system": SYSTEM_TOKEN
	# }


	# def get_message_tokens(model, role, content):
	# message_tokens = model.tokenize(content.encode("utf-8"))
	# message_tokens.insert(1, ROLE_TOKENS[role])
	# message_tokens.insert(2, LINEBREAK_TOKEN)
	# message_tokens.append(model.token_eos())
	# return message_tokens


	# def get_system_tokens(model):
	# system_message = {"role": "system", "content": SYSTEM_PROMPT}
	# return get_message_tokens(model, **system_message)


	# repo_name = "Cran-May/SLIDE-v2-Q4_K_M-GGUF"
	# model_name = "slide-v2.Q4_K_M.gguf"

	# snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)

	# model = Llama(
	# model_path=model_name,
	# n_ctx=4000,
	# n_parts=1,
	# )

	# max_new_tokens = 2500

	# def User(message, history):
	# new_history = history + [[message, None]]
	# return "", new_history


	# def Assistant(
	# history,
	# system_prompt,
	# top_p,
	# top_k,
	# temp
	# ):
	# tokens = get_system_tokens(model)[:]
	# tokens.append(LINEBREAK_TOKEN)

	# for User_message, Assistant_message in history[:-1]:
	# message_tokens = get_message_tokens(model=model, role="User", content=User_message)
	# tokens.extend(message_tokens)
	# if Assistant_message:
	# message_tokens = get_message_tokens(model=model, role="Assistant", content=Assistant_message)
	# tokens.extend(message_tokens)

	# last_user_message = history[-1][0]
	# message_tokens = get_message_tokens(model=model, role="User", content=last_user_message,)
	# tokens.extend(message_tokens)

	# role_tokens = [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]
	# tokens.extend(role_tokens)
	# generator = model.generate(
	# tokens,
	# top_k=top_k,
	# top_p=top_p,
	# temp=temp
	# )

	# partial_text = ""
	# for i, token in enumerate(generator):
	# if token == model.token_eos() or (max_new_tokens is not None and i >= max_new_tokens):
	# break
	# partial_text += model.detokenize([token]).decode("utf-8", "ignore")
	# history[-1][1] = partial_text
	# yield history


	# with gr.Blocks(
	# theme=gr.themes.Soft()
	# ) as demo:
	# gr.Markdown(f"""<h1><center>上师附外-兮辞·析辞-人工智能助理</center></h1>""")
	# gr.Markdown(value="""欢迎使用！
	# 这里是一个ChatBot。这是量化版兮辞·析辞的部署。
	# SLIDE/兮辞是一种会话语言模型，由上师附外 NLPark 团队在多种类型的语料库上进行训练。
	# 本节目由 JWorld & 上海师范大学附属外国语中学 NLPark 赞助播出""")

	# with gr.Row():
	# with gr.Column(scale=5):
	# chatbot = gr.Chatbot(label="兮辞如是说").style(height=400)
	# with gr.Row():
	# with gr.Column():
	# msg = gr.Textbox(
	# label="来问问兮辞吧……",
	# placeholder="兮辞折寿中……",
	# show_label=True,
	# ).style(container=True)
	# submit = gr.Button("Submit / 开凹！")
	# stop = gr.Button("Stop / 全局时空断裂")
	# clear = gr.Button("Clear / 打扫群内垃圾")
	# with gr.Accordion(label='进阶设置/Advanced options', open=False):
	# with gr.Column(min_width=80, scale=1):
	# with gr.Tab(label="设置参数"):
	# top_p = gr.Slider(
	# minimum=0.0,
	# maximum=1.0,
	# value=0.9,
	# step=0.05,
	# interactive=True,
	# label="Top-p",
	# )
	# top_k = gr.Slider(
	# minimum=10,
	# maximum=100,
	# value=30,
	# step=5,
	# interactive=True,
	# label="Top-k",
	# )
	# temp = gr.Slider(
	# minimum=0.0,
	# maximum=2.0,
	# value=0.2,
	# step=0.01,
	# interactive=True,
	# label="情感温度"
	# )
	# with gr.Column():
	# system_prompt = gr.Textbox(label="系统提示词", placeholder="", value=SYSTEM_PROMPT, interactive=False)
	# with gr.Row():
	# gr.Markdown(
	# """警告：该模型可能会生成事实上或道德上不正确的文本。NLPark和兮辞对此不承担任何责任。"""
	# )


	# # Pressing Enter
	# submit_event = msg.submit(
	# fn=User,
	# inputs=[msg, chatbot],
	# outputs=[msg, chatbot],
	# queue=False,
	# ).success(
	# fn=Assistant,
	# inputs=[
	# chatbot,
	# system_prompt,
	# top_p,
	# top_k,
	# temp
	# ],
	# outputs=chatbot,
	# queue=True,
	# )

	# # Pressing the button
	# submit_click_event = submit.click(
	# fn=User,
	# inputs=[msg, chatbot],
	# outputs=[msg, chatbot],
	# queue=False,
	# ).success(
	# fn=Assistant,
	# inputs=[
	# chatbot,
	# system_prompt,
	# top_p,
	# top_k,
	# temp
	# ],
	# outputs=chatbot,
	# queue=True,
	# )

	# # Stop generation
	# stop.click(
	# fn=None,
	# inputs=None,
	# outputs=None,
	# cancels=[submit_event, submit_click_event],
	# queue=False,
	# )

	# # Clear history
	# clear.click(lambda: None, None, chatbot, queue=False)

	# demo.queue(max_size=128, concurrency_count=1)
	# demo.launch()